From d14f60dd2599877be2ca0077301e67c824ef6f83 Mon Sep 17 00:00:00 2001 From: Packit Service <user-cont-team+packit-service@redhat.com> Date: Dec 10 2020 05:26:29 +0000 Subject: rdma-core-29.0 base --- diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..dda8d88 --- /dev/null +++ b/.clang-format @@ -0,0 +1,134 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# clang-format configuration file. Intended for clang-format >= 4. +# +# For more information, see: +# +# Documentation/process/clang-format.rst +# https://clang.llvm.org/docs/ClangFormat.html +# https://clang.llvm.org/docs/ClangFormatStyleOptions.html +# +--- +AccessModifierOffset: -4 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +#AlignEscapedNewlines: Left # Unknown to clang-format-4.0 +AlignOperands: true +AlignTrailingComments: false +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: None +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: false +BinPackArguments: true +BinPackParameters: true +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: true + AfterNamespace: true + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + #AfterExternBlock: false # Unknown to clang-format-5.0 + BeforeCatch: false + BeforeElse: false + IndentBraces: false + #SplitEmptyFunction: true # Unknown to clang-format-4.0 + #SplitEmptyRecord: true # Unknown to clang-format-4.0 + #SplitEmptyNamespace: true # Unknown to clang-format-4.0 +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Custom +#BreakBeforeInheritanceComma: false # Unknown to clang-format-4.0 +BreakBeforeTernaryOperators: false +BreakConstructorInitializersBeforeComma: false +#BreakConstructorInitializers: BeforeComma # Unknown to clang-format-4.0 +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: false +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +#CompactNamespaces: false # Unknown to clang-format-4.0 +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 8 +ContinuationIndentWidth: 8 +Cpp11BracedListStyle: false +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +#FixNamespaceComments: false # Unknown to clang-format-4.0 + +# Taken from: +# grep -Rh '^#define [^[:space:]]*for_each[^[:space:]]*(' build/include/ \ +# | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \ +# | sort | uniq +ForEachMacros: + - 'list_for_each' + - 'list_for_each_off' + - 'list_for_each_off_dir_' + - 'list_for_each_rev' + - 'list_for_each_rev_off' + - 'list_for_each_rev_safe' + - 'list_for_each_rev_safe_off' + - 'list_for_each_safe' + - 'list_for_each_safe_off' + - 'list_for_each_safe_off_dir_' + +#IncludeBlocks: Preserve # Unknown to clang-format-5.0 +IncludeCategories: + - Regex: '.*' + Priority: 1 +IncludeIsMainRegex: '(Test)?$' +IndentCaseLabels: false +#IndentPPDirectives: None # Unknown to clang-format-5.0 +IndentWidth: 8 +IndentWrappedFunctionNames: false +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: Inner 
+#ObjCBinPackProtocolList: Auto # Unknown to clang-format-5.0 +ObjCBlockIndentWidth: 8 +ObjCSpaceAfterProperty: true +ObjCSpaceBeforeProtocolList: true + +# Taken from git's rules +#PenaltyBreakAssignment: 10 # Unknown to clang-format-4.0 +PenaltyBreakBeforeFirstCallParameter: 30 +PenaltyBreakComment: 10 +PenaltyBreakFirstLessLess: 0 +PenaltyBreakString: 10 +PenaltyExcessCharacter: 100 +PenaltyReturnTypeOnItsOwnLine: 60 + +PointerAlignment: Right +ReflowComments: false +SortIncludes: false +#SortUsingDeclarations: false # Unknown to clang-format-4.0 +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +#SpaceBeforeCtorInitializerColon: true # Unknown to clang-format-5.0 +#SpaceBeforeInheritanceColon: true # Unknown to clang-format-5.0 +SpaceBeforeParens: ControlStatements +#SpaceBeforeRangeBasedForLoopColon: true # Unknown to clang-format-5.0 +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: false +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp03 +TabWidth: 8 +UseTab: Always +... diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..76227d0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,74 @@ +# -*- mode: gitignore; -*- +# CMake +cmake_install.cmake +CMakeFiles +CMakeCache.txt +lib*.a +/bin/** +/lib/** +/include/** +/.ninja* +*.ninja +Makefile + +# Tags +TAGS +.TAGS +!TAGS/ +tags +.tags +!tags/ +gtags.files +GTAGS +GRTAGS +GPATH + +# cscope +cscope.files +cscope.out +cscope.in.out +cscope.po.out + +# Emacs +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +*.elc +auto-save-list +tramp +.\#* + +# Org-mode +.org-id-locations +*_archive + +# flymake-mode +*_flymake.* + +# eshell files +/eshell/history +/eshell/lastdir + +# elpa packages +/elpa/ + +# reftex files +*.rel + +# AUCTeX auto folder +/auto/ + +# cask packages +.cask/ + +# vim +[._]*.s[a-w][a-z] +[._]s[a-w][a-z] +*.un~ +Session.vim +.netrwhist +*~ + +# python +*.pyc diff --git a/.mailmap b/.mailmap new file mode 100644 index 0000000..b6efd1e --- /dev/null +++ b/.mailmap @@ -0,0 +1,10 @@ +# +# This list is used by git-shortlog to fix a few botched name translations +# in the git archive, either because the author's full name was messed up +# and/or not always written the same way, making contributions from the +# same person appearing not to be so or badly displayed. +# +Jason Gunthorpe <jgg@ziepe.ca> <jgg@mellanox.com> +Jason Gunthorpe <jgg@ziepe.ca> <jgunthorpe@obsidianresearch.com> +Steve Wise <larrystevenwise@gmail.com> <swise@chelsio.com> +Steve Wise <larrystevenwise@gmail.com> <swise@opengridcomputing.com> diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..fcc50e8 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,748 @@ +# COPYRIGHT (c) 2016 Obsidian Research Corporation. See COPYING file +# Run cmake as: +# mkdir build +# cmake -GNinja .. +# ninja +# +# Common options passed to cmake are: +# -DIN_PLACE=1 +# Configure the build to be run from the build directory, this results in something +# that is not installable. 
+# -DCMAKE_EXPORT_COMPILE_COMMANDS=1
+# Write a compile_commands.json file for clang tooling
+# -DCMAKE_BUILD_TYPE=RelWithDebInfo
+# Change the optimization level, Debug disables optimization,
+# Release is for packagers
+# -DENABLE_VALGRIND=0 (default enabled)
+# Disable valgrind annotations, this has a tiny positive performance impact
+# -DENABLE_RESOLVE_NEIGH=0 (default enabled)
+# Do not link to libnl and do not resolve neighbours internally for Ethernet,
+# and do not build iwpmd.
+# -DENABLE_STATIC=1 (default disabled)
+# Produce static libraries along with the usual shared libraries.
+# -DVERBS_PROVIDER_DIR='' (default /usr/lib.../libibverbs)
+# Use the historical search path for providers, in the standard system library.
+# -DNO_COMPAT_SYMS=1 (default disabled)
+# Do not generate backwards compatibility symbols in the shared
+# libraries. This may be necessary if using a dynamic linker that does
+# not support symbol versions, such as uclibc.
+# -DIOCTL_MODE=write (default both)
+# Disable new kABI ioctl() support and support only the legacy write
+# path. May also be 'ioctl' to disable fallback to write.
+# -DIBACM_SERVER_MODE_DEFAULT (default unix)
+# Selects how clients can connect to this server:
+# open) Allow incoming connections from any TCP client (internal or external).
+# loop) Limit incoming connections for server_port to 127.0.0.1.
+# unix) Use unix-domain sockets, hence limiting service to the same machine.
+# -DIBACM_ACME_PLUS_KERNEL_ONLY_DEFAULT (default 0)
+# If non-zero, limit incoming requests to the kernel or the ib_acme utility
+# (i.e. do not serve librdmacm requests)
+# -DPYTHON_EXECUTABLE
+# Override automatic detection of python to use a certain
+# executable. This can be used to force the build to use python2 on a
+# system that has python3 installed. Otherwise the build automatically
+# prefers python3 if available.
+# -DNO_PYVERBS=1 (default disabled, so pyverbs is built)
+# Do not invoke cython and do not build pyverbs.
+# -DENABLE_IBDIAGS_COMPAT=True (default False)
+# Include obsolete scripts. These scripts have been replaced by C programs
+# with a different interface.
+
+cmake_minimum_required(VERSION 2.8.11 FATAL_ERROR)
+project(rdma-core C)
+
+# CMake likes to use -rdynamic too much, they fixed it in 3.4.
+if(POLICY CMP0065)
+  cmake_policy(SET CMP0065 NEW)
+else()
+  # .. but we really do want to opt out.
+ string(REPLACE "-rdynamic" "" CMAKE_SHARED_LIBRARY_LINK_C_FLAGS "${CMAKE_SHARED_LIBRARY_LINK_C_FLAGS}") +endif() + +# Make RDMA_CHECK_C_LINKER_FLAG work better +if(POLICY CMP0056) + cmake_policy(SET CMP0056 NEW) +endif() + +set(PACKAGE_NAME "RDMA") + +# See Documentation/versioning.md +set(PACKAGE_VERSION "29.0") +# When this is changed the values in these files need changing too: +# debian/control +# debian/libibverbs1.symbols +set(IBVERBS_PABI_VERSION "25") +set(IBVERBS_PROVIDER_SUFFIX "-rdmav${IBVERBS_PABI_VERSION}.so") + +#------------------------- +# Basic standard paths + +# Override the CMAKE_INSTALL_ dirs to be under the build/ directory +if (IN_PLACE) + set(CMAKE_INSTALL_SYSCONFDIR "${CMAKE_BINARY_DIR}/etc") + set(CMAKE_INSTALL_BINDIR "${CMAKE_BINARY_DIR}/bin") + set(CMAKE_INSTALL_SBINDIR "${CMAKE_BINARY_DIR}/bin") + set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}") + set(CMAKE_INSTALL_LIBDIR "lib") + set(CMAKE_INSTALL_INCLUDEDIR "include") +endif() + +include(GNUInstallDirs) +# C include root +set(BUILD_INCLUDE ${CMAKE_BINARY_DIR}/include) +# Executables +set(BUILD_BIN ${CMAKE_BINARY_DIR}/bin) +# Libraries +set(BUILD_LIB ${CMAKE_BINARY_DIR}/lib) +# Static library pre-processing +set(BUILD_STATIC_LIB ${CMAKE_BINARY_DIR}/lib/statics) +# Used for IN_PLACE configuration +set(BUILD_ETC ${CMAKE_BINARY_DIR}/etc) +set(BUILD_PYTHON ${CMAKE_BINARY_DIR}/python) + +set(IBDIAG_CONFIG_PATH "${CMAKE_INSTALL_FULL_SYSCONFDIR}/infiniband-diags") +set(IBDIAG_NODENAME_MAP_PATH "${CMAKE_INSTALL_FULL_SYSCONFDIR}/rdma/ib-node-name-map") + +set(CMAKE_INSTALL_INITDDIR "${CMAKE_INSTALL_SYSCONFDIR}/init.d" + CACHE PATH "Location for init.d files") +set(CMAKE_INSTALL_SYSTEMD_SERVICEDIR "${CMAKE_INSTALL_PREFIX}/lib/systemd/system" + CACHE PATH "Location for systemd service files") +set(CMAKE_INSTALL_SYSTEMD_BINDIR "/lib/systemd" + CACHE PATH "Location for systemd extra binaries") + +set(ACM_PROVIDER_DIR "${CMAKE_INSTALL_FULL_LIBDIR}/ibacm" + CACHE PATH "Location for ibacm provider plugin shared library files.") +# Location to find the provider plugin shared library files +set(VERBS_PROVIDER_DIR "${CMAKE_INSTALL_FULL_LIBDIR}/libibverbs" + CACHE PATH "Location for provider plugin shared library files. 
If set to empty the system search path is used.") + +# Allow the 'run' dir to be configurable, this historically has been /var/run, but +# some systems now use /run/ +set(CMAKE_INSTALL_RUNDIR "var/run" + CACHE PATH "Location for runtime information, typically /var/run, or /run") +if(NOT IS_ABSOLUTE ${CMAKE_INSTALL_RUNDIR}) + set(CMAKE_INSTALL_FULL_RUNDIR "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_RUNDIR}") +else() + set(CMAKE_INSTALL_FULL_RUNDIR "${CMAKE_INSTALL_RUNDIR}") +endif() + +# Allow the udev rules.d dir to be configurable, this has historically been +# /lib/udev/rules.d/, but some systems now prefix /usr/ +set(CMAKE_INSTALL_UDEV_RULESDIR "lib/udev/rules.d" + CACHE PATH "Location for system udev rules, typically /lib/udev/rules.d or /usr/lib/udev/rules.d") +if(NOT IS_ABSOLUTE ${CMAKE_INSTALL_UDEV_RULESDIR}) + set(CMAKE_INSTALL_FULL_UDEV_RULESDIR "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_UDEV_RULESDIR}") +else() + set(CMAKE_INSTALL_FULL_UDEV_RULESDIR "${CMAKE_INSTALL_UDEV_RULESDIR}") +endif() + +# Allow the perl library dir to be configurable +set(CMAKE_INSTALL_PERLDIR "share/perl5" + CACHE PATH "Location for system perl library, typically /usr/share/perl5") +if(NOT IS_ABSOLUTE ${CMAKE_INSTALL_PERLDIR}) + set(CMAKE_INSTALL_FULL_PERLDIR "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_PERLDIR}") +else() + set(CMAKE_INSTALL_FULL_PERLDIR "${CMAKE_INSTALL_PERLDIR}") +endif() + +# Location to place provider .driver files +if (IN_PLACE) + set(CONFIG_DIR "${BUILD_ETC}/libibverbs.d") + set(VERBS_PROVIDER_DIR "${BUILD_LIB}") + set(ACM_PROVIDER_DIR "${BUILD_LIB}/ibacm") +else() + set(CONFIG_DIR "${CMAKE_INSTALL_FULL_SYSCONFDIR}/libibverbs.d") +endif() + +set(DISTRO_FLAVOUR "None" CACHE + STRING "Flavour of distribution to install for. This primarily impacts the init.d scripts installed.") + +#------------------------- +# Load CMake components +set(BUILDLIB "${CMAKE_SOURCE_DIR}/buildlib") +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${BUILDLIB}") + +include(CMakeParseArguments) +include(CheckCCompilerFlag) +include(CheckIncludeFile) +include(CheckTypeSize) +include(RDMA_EnableCStd) +include(RDMA_Sparse) +include(RDMA_BuildType) +include(RDMA_DoFixup) +include(publish_headers) +include(rdma_functions) +include(pyverbs_functions) +include(rdma_man) + +if (NOT DEFINED ENABLE_STATIC) + set(ENABLE_STATIC "OFF" CACHE BOOL "Produce static linking libraries as well as shared libraries.") +endif() + +#------------------------- +# Setup the basic C compiler +RDMA_BuildType() +include_directories(${BUILD_INCLUDE}) + +# Look for Python. We prefer some variant of python 3 if the system has it. +FIND_PACKAGE(PythonInterp 3 QUIET) +if (PythonInterp_FOUND) + # pyverbs can only use python3: + if (NO_PYVERBS) + set(CYTHON_EXECUTABLE "") + else() + FIND_PACKAGE(cython) + endif() +else() + # But we still must have python (be it 2) for the build process: + FIND_PACKAGE(PythonInterp REQUIRED) + set(CYTHON_EXECUTABLE "") +endif() + +RDMA_CheckSparse() + +# Require GNU99 mode +RDMA_EnableCStd() + +# Extra warnings. Turn on -Wextra to keep aware of interesting developments from gcc, +# but turn off some that are not terribly useful for this source. +# FIXME: I wonder how many of the signed compares are bugs? 
+RDMA_AddOptCFlag(CMAKE_C_FLAGS HAVE_C_WARNINGS + "-Wall -Wextra -Wno-sign-compare -Wno-unused-parameter") +RDMA_AddOptCFlag(CMAKE_C_FLAGS HAVE_C_WMISSING_PROTOTYPES "-Wmissing-prototypes") +RDMA_AddOptCFlag(CMAKE_C_FLAGS HAVE_C_WMISSING_DECLARATIONS "-Wmissing-declarations") +RDMA_AddOptCFlag(CMAKE_C_FLAGS HAVE_C_WWRITE_STRINGS "-Wwrite-strings") +RDMA_AddOptCFlag(CMAKE_C_FLAGS HAVE_C_WFORMAT_2 "-Wformat=2") +RDMA_AddOptCFlag(CMAKE_C_FLAGS HAVE_C_WCAST_FUNCTION "-Wcast-function-type") +RDMA_AddOptCFlag(CMAKE_C_FLAGS HAVE_C_WFORMAT_NONLITERAL "-Wformat-nonliteral") +RDMA_AddOptCFlag(CMAKE_C_FLAGS HAVE_C_WDATE_TIME "-Wdate-time") +RDMA_AddOptCFlag(CMAKE_C_FLAGS HAVE_C_WNESTED_EXTERNS "-Wnested-externs") + +# At some point after 4.4 gcc fixed shadow to ignore function vs variable +# conflicts +set(SAFE_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}") + set(CMAKE_REQUIRED_FLAGS "-Wshadow") +CHECK_C_SOURCE_COMPILES(" + #include <unistd.h> + int main(int argc,const char *argv[]) { int access = 1; return access; }" + HAVE_C_WORKING_SHADOW + FAIL_REGEX "warning") +if (HAVE_C_WORKING_SHADOW) + RDMA_AddOptCFlag(CMAKE_C_FLAGS HAVE_C_WORKING_SHADOW "-Wshadow") +endif() +set(CMAKE_REQUIRED_FLAGS "${SAFE_CMAKE_REQUIRED_FLAGS}") + +# At some point around 5.4 gcc fixed missing-field-initializers to ignore this +# common idiom we use extensively. Since this is a useful warning for +# developers try and leave it on if the compiler supports it. +CHECK_C_SOURCE_COMPILES(" + struct foo { int a; int b; }; + int main(int argc,const char *argv[]) { struct foo tmp = {}; return tmp.a; }" + HAVE_C_WORKING_MISSING_FIELD_INITIALIZERS + FAIL_REGEX "warning") +if (NOT HAVE_C_WORKING_MISSING_FIELD_INITIALIZERS) + RDMA_AddOptCFlag(CMAKE_C_FLAGS HAVE_C_WNO_MISSING_FIELD_INITIALIZERS "-Wno-missing-field-initializers") +endif() + +# Check that the compiler supports -fno-strict-aliasing. +# The use of this flag in the source is discouraged +set(NO_STRICT_ALIASING_FLAGS "") +RDMA_AddOptCFlag(NO_STRICT_ALIASING_FLAGS HAVE_NO_STRICT_ALIASING + "-fno-strict-aliasing") + +# pyverbs has a problem with var-tracking warnings, turn it off if we can. +set(NO_VAR_TRACKING_FLAGS "") +RDMA_AddOptCFlag(NO_VAR_TRACKING_FLAGS HAVE_NO_VAR_TRACKING_ASSIGNMENTS + "-fno-var-tracking-assignments") + +CHECK_C_SOURCE_COMPILES(" + #include <unistd.h> + + void entry(void); + + static void do_entry(void) {} + void entry(void) __attribute__((ifunc(\"resolve_entry\"))); + typedef void (*fn_t)(void); + static fn_t resolve_entry(void) {return &do_entry;} + + int main(int argc,const char *argv[]) { entry(); }" + HAVE_FUNC_ATTRIBUTE_IFUNC + FAIL_REGEX "warning") + +# The code does not do the racy fcntl if the various CLOEXEC's are not +# supported so it really doesn't work right if this isn't available. Thus hard +# require it. 
+CHECK_C_SOURCE_COMPILES(" + #include <sys/types.h> + #include <sys/stat.h> + #include <sys/socket.h> + #include <fcntl.h> + int main(int argc,const char *argv[]) { + open(\".\",O_RDONLY | O_CLOEXEC); + socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, 0); + return 0; + }" HAS_CLOEXEC) + +if (NOT HAS_CLOEXEC) +# At least uclibc wrongly hides this POSIX constant behind _GNU_SOURCE +CHECK_C_SOURCE_COMPILES(" + #define _GNU_SOURCE + #include <sys/types.h> + #include <sys/stat.h> + #include <sys/socket.h> + #include <fcntl.h> + int main(int argc,const char *argv[]) { + open(\".\",O_RDONLY | O_CLOEXEC); + socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, 0); + return 0; + }" HAS_CLOEXEC_GNU_SOURCE) + if (HAS_CLOEXEC_GNU_SOURCE) + set(HAS_CLOEXEC 1) + add_definitions("-D_GNU_SOURCE=") + endif() +endif() + +if (NOT HAS_CLOEXEC) + message(FATAL_ERROR "O_CLOEXEC/SOCK_CLOEXEC/fopen(..,\"e\") support is required but not found") +endif() + +# always_inline is supported +CHECK_C_SOURCE_COMPILES(" + int foo(void); + inline __attribute__((always_inline)) int foo(void) {return 0;} + int main(int argc,const char *argv[]) { return foo(); }" + HAVE_FUNC_ATTRIBUTE_ALWAYS_INLINE + FAIL_REGEX "warning") + +# Linux __u64 is an unsigned long long +CHECK_C_SOURCE_COMPILES(" +#include <linux/types.h> + int main(int argc,const char *argv[]) { __u64 tmp = 0; unsigned long long *tmp2 = &tmp; return *tmp2; }" + HAVE_LONG_LONG_U64 + FAIL_REGEX "warning") + +if (NOT HAVE_LONG_LONG_U64) + # Modern Linux has switched to use ull in all cases, but to avoid disturbing + # userspace some platforms continued to use unsigned long by default. This + # define will cause kernel headers to consistently use unsigned long long + add_definitions("-D__SANE_USERSPACE_TYPES__") +endif() + +# glibc and kernel uapi headers can co-exist +CHECK_C_SOURCE_COMPILES(" + #include <sys/socket.h> + #include <netinet/in.h> + #include <linux/in.h> + #include <linux/in6.h> + int main(int argc,const char *argv[]) { return 0; }" + HAVE_GLIBC_UAPI_COMPAT) +RDMA_DoFixup("${HAVE_GLIBC_UAPI_COMPAT}" "linux/in.h") +RDMA_DoFixup("${HAVE_GLIBC_UAPI_COMPAT}" "linux/in6.h") + +# The compiler has working -fstrict-aliasing support, old gcc's do not. If +# broken then globally disable strict aliasing. +RDMA_Check_Aliasing(HAVE_WORKING_STRICT_ALIASING) +if (NOT HAVE_WORKING_STRICT_ALIASING) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${NO_STRICT_ALIASING_FLAGS}") +endif() + +# Check if off_t is 64 bits, eg large file support is enabled +CHECK_C_SOURCE_COMPILES(" +#include <sys/types.h> + #define BUILD_ASSERT_OR_ZERO(cond) (sizeof(char [1 - 2*!(cond)]) - 1) + int main(int argc,const char *argv[]) { return BUILD_ASSERT_OR_ZERO(sizeof(off_t) >= 8); }" + HAVE_LARGE_FILES) + +if (NOT HAVE_LARGE_FILES) + CHECK_C_SOURCE_COMPILES(" +#define _FILE_OFFSET_BITS 64 +#include <sys/types.h> + #define BUILD_ASSERT_OR_ZERO(cond) (sizeof(char [1 - 2*!(cond)]) - 1) + int main(int argc,const char *argv[]) { return BUILD_ASSERT_OR_ZERO(sizeof(off_t) >= 8); }" + HAVE_LARGE_FILES2) + if (NOT HAVE_LARGE_FILES2) + message(FATAL_ERROR "Could not enable large file support") + endif() + add_definitions("-D_FILE_OFFSET_BITS=64") +endif() + +# Provide a shim if C11 stdatomic.h is not supported. 
+if (NOT HAVE_SPARSE)
+  CHECK_INCLUDE_FILE("stdatomic.h" HAVE_STDATOMIC)
+  RDMA_DoFixup("${HAVE_STDATOMIC}" "stdatomic.h")
+endif()
+
+RDMA_Check_SSE(HAVE_TARGET_SSE)
+
+# Enable development support features
+# Prune unneeded shared libraries during linking
+RDMA_AddOptLDFlag(CMAKE_EXE_LINKER_FLAGS SUPPORTS_AS_NEEDED "-Wl,--as-needed")
+RDMA_AddOptLDFlag(CMAKE_SHARED_LINKER_FLAGS SUPPORTS_AS_NEEDED "-Wl,--as-needed")
+RDMA_AddOptLDFlag(CMAKE_MODULE_LINKER_FLAGS SUPPORTS_AS_NEEDED "-Wl,--as-needed")
+
+# Ensure all shared ELFs have fully described linking
+RDMA_AddOptLDFlag(CMAKE_EXE_LINKER_FLAGS SUPPORTS_NO_UNDEFINED "-Wl,--no-undefined")
+RDMA_AddOptLDFlag(CMAKE_SHARED_LINKER_FLAGS SUPPORTS_NO_UNDEFINED "-Wl,--no-undefined")
+
+# Enable gold linker - gold has different linking checks
+#RDMA_AddOptLDFlag(CMAKE_EXE_LINKER_FLAGS SUPPORTS_NO_UNDEFINED "-fuse-ld=gold")
+#RDMA_AddOptLDFlag(CMAKE_SHARED_LINKER_FLAGS SUPPORTS_NO_UNDEFINED "-fuse-ld=gold")
+#RDMA_AddOptLDFlag(CMAKE_MODULE_LINKER_FLAGS SUPPORTS_NO_UNDEFINED "-fuse-ld=gold")
+
+# Verify that GNU --version-script and asm(".symver") work
+find_package(LDSymVer REQUIRED)
+if (NO_COMPAT_SYMS)
+  set(HAVE_LIMITED_SYMBOL_VERSIONS 1)
+else()
+  set(HAVE_FULL_SYMBOL_VERSIONS 1)
+endif()
+
+# A cython & python-devel installation that matches our selected interpreter.
+
+if (CYTHON_EXECUTABLE)
+  # cmake has really bad logic here, if PythonInterp has been run it tries to
+  # find a matching -devel installation but will happily return a non-matching
+  # one too. We need them both to match exactly to guarantee cython does the
+  # right thing.
+  FIND_PACKAGE(PythonLibs ${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}
+    EXACT REQUIRED)
+
+  # Get a default installation path
+  execute_process(COMMAND "${PYTHON_EXECUTABLE}" -c
+    "from distutils.sysconfig import get_python_lib; print(get_python_lib(True, False, '${CMAKE_INSTALL_PREFIX}'))"
+    OUTPUT_VARIABLE py_path)
+  string(STRIP ${py_path} py_path)
+  set(CMAKE_INSTALL_PYTHON_ARCH_LIB "${py_path}"
+    CACHE PATH "Location for architecture specific python libraries")
+
+  # See PEP3149
+  execute_process(COMMAND "${PYTHON_EXECUTABLE}" -c
+    "import sysconfig; x = sysconfig.get_config_var(\"EXT_SUFFIX\"); print(x if x else '.so')"
+    OUTPUT_VARIABLE py_path)
+  string(STRIP ${py_path} CMAKE_PYTHON_SO_SUFFIX)
+endif()
+
+# Look for pandoc and rst2man for making manual pages
+FIND_PACKAGE(pandoc)
+FIND_PACKAGE(rst2man)
+
+#-------------------------
+# Find libraries
+# pthread
+FIND_PACKAGE(Threads REQUIRED)
+
+FIND_PACKAGE(PkgConfig REQUIRED)
+
+# libnl
+if (NOT DEFINED ENABLE_RESOLVE_NEIGH)
+  set(ENABLE_RESOLVE_NEIGH "ON" CACHE BOOL "Enable internal resolution of neighbours for Ethernet")
+endif()
+if (ENABLE_RESOLVE_NEIGH)
+  # FIXME use of pkgconfig is discouraged
+  pkg_check_modules(NL libnl-3.0 libnl-route-3.0 REQUIRED)
+  include_directories(${NL_INCLUDE_DIRS})
+  set(NL_KIND 3)
+else()
+  set(NL_KIND 0)
+  set(NL_LIBRARIES "")
+  RDMA_DoFixup(0 "netlink/attr.h")
+  RDMA_DoFixup(0 "netlink/msg.h")
+  RDMA_DoFixup(0 "netlink/netlink.h")
+  RDMA_DoFixup(0 "netlink/object-api.h")
+  RDMA_DoFixup(0 "netlink/route/link.h")
+  RDMA_DoFixup(0 "netlink/route/link/vlan.h")
+  RDMA_DoFixup(0 "netlink/route/neighbour.h")
+  RDMA_DoFixup(0 "netlink/route/route.h")
+  RDMA_DoFixup(0 "netlink/route/rtnl.h")
+endif()
+
+# Older stuff blows up if these headers are included together
+if (NOT NL_KIND EQUAL 0)
+  set(SAFE_CMAKE_REQUIRED_INCLUDES "${CMAKE_REQUIRED_INCLUDES}")
+  set(CMAKE_REQUIRED_INCLUDES "${NL_INCLUDE_DIRS}")
+  CHECK_C_SOURCE_COMPILES("
+#include <netlink/route/link.h>
+#include <net/if.h>
+ int main(int argc,const char *argv[]) {return 0;}"
+    HAVE_WORKING_IF_H)
+  set(CMAKE_REQUIRED_INCLUDES "${SAFE_CMAKE_REQUIRED_INCLUDES}")
+endif()
+
+# udev
+find_package(UDev)
+include_directories(${UDEV_INCLUDE_DIRS})
+
+# Statically determine sizeof(long), this is largely unnecessary, no new code
+# should rely on this.
+check_type_size("long" SIZEOF_LONG BUILTIN_TYPES_ONLY LANGUAGE C)
+
+# Determine if this arch supports cache coherent DMA. This isn't really an
+# arch specific property, but for our purposes arches that do not support it
+# also do not define wmb/etc which breaks our compile.
+CHECK_C_SOURCE_COMPILES("
+#include \"${CMAKE_CURRENT_SOURCE_DIR}/util/udma_barrier.h\"
+ int main(int argc,const char *argv[]) {return 0;}"
+  HAVE_COHERENT_DMA)
+
+find_package(Systemd)
+include_directories(${SYSTEMD_INCLUDE_DIRS})
+RDMA_DoFixup("${SYSTEMD_FOUND}" "systemd/sd-daemon.h")
+
+#-------------------------
+# Apply fixups
+
+# We prefer to build with valgrind memcheck.h present, but if not, or the user
+# requested valgrind disabled, then replace it with our dummy stub.
+if (NOT DEFINED ENABLE_VALGRIND)
+  set(ENABLE_VALGRIND "ON" CACHE BOOL "Enable use of valgrind annotations")
+endif()
+if (ENABLE_VALGRIND)
+  CHECK_INCLUDE_FILE("valgrind/memcheck.h" HAVE_VALGRIND_MEMCHECK)
+  CHECK_INCLUDE_FILE("valgrind/drd.h" HAVE_VALGRIND_DRD)
+else()
+  set(HAVE_VALGRIND_MEMCHECK 0)
+  set(HAVE_VALGRIND_DRD 0)
+endif()
+RDMA_DoFixup("${HAVE_VALGRIND_MEMCHECK}" "valgrind/memcheck.h")
+RDMA_DoFixup("${HAVE_VALGRIND_DRD}" "valgrind/drd.h")
+
+# Older glibc does not include librt
+CHECK_C_SOURCE_COMPILES("
+#include <time.h>
+int main(int argc,const char *argv[]) {
+  clock_gettime(CLOCK_MONOTONIC,0);
+  clock_nanosleep(CLOCK_MONOTONIC,0,0,0);
+  return 0;
+};" LIBC_HAS_LIBRT)
+if (NOT LIBC_HAS_LIBRT)
+  set(RT_LIBRARIES "rt")
+endif()
+
+# Check for static_assert
+CHECK_C_SOURCE_COMPILES("
+#include <assert.h>
+static_assert(1, \"failed\");
+int main(int argc,const char *argv[]) {
+  static_assert(1, \"failed\");
+  return 0;
+};" HAVE_STATIC_ASSERT)
+RDMA_DoFixup("${HAVE_STATIC_ASSERT}" "assert.h")
+
+#-------------------------
+# Final warning flags
+
+# Old versions of cmake used 'main(){..}' as their test program, which breaks
+# with -Werror. So set this flag last.
+RDMA_AddOptCFlag(CMAKE_C_FLAGS HAVE_C_WSTRICT_PROTOTYPES "-Wstrict-prototypes")
+RDMA_AddOptCFlag(CMAKE_C_FLAGS HAVE_C_WOLD_STYLE_DEFINITION "-Wold-style-definition")
+
+if (ENABLE_WERROR)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror")
+  message(STATUS "Enabled -Werror")
+endif()
+
+# Old versions of libnl have a duplicated rtnl_route_put, disable the warning
+# on those systems
+if (NOT NL_KIND EQUAL 0)
+  set(SAFE_CMAKE_REQUIRED_INCLUDES "${CMAKE_REQUIRED_INCLUDES}")
+  set(SAFE_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
+  set(CMAKE_REQUIRED_INCLUDES "${NL_INCLUDE_DIRS}")
+  set(CMAKE_REQUIRED_FLAGS "-Wredundant-decls")
+  CHECK_C_SOURCE_COMPILES("
+  #include <netlink/route/route.h>
+  int main(int argc,const char *argv[]) { return 0; }"
+    HAVE_C_WREDUNDANT_DECLS
+    FAIL_REGEX "warning")
+  set(CMAKE_REQUIRED_INCLUDES "${SAFE_CMAKE_REQUIRED_INCLUDES}")
+  set(CMAKE_REQUIRED_FLAGS "${SAFE_CMAKE_REQUIRED_FLAGS}")
+endif()
+RDMA_AddOptCFlag(CMAKE_C_FLAGS HAVE_C_WREDUNDANT_DECLS "-Wredundant-decls")
+
+#-------------------------
+# Build Prep
+# Write out a git ignore file to the build directory if it isn't the source
+# directory.
For developer convenience +if (NOT ${CMAKE_CURRENT_BINARY_DIR} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}) + file(WRITE ${CMAKE_BINARY_DIR}/.gitignore "*") +endif() + +if ("${IOCTL_MODE}" STREQUAL "both") + set(IOCTL_MODE_NUM 3) +elseif ("${IOCTL_MODE}" STREQUAL "write") + set(IOCTL_MODE_NUM 2) +elseif ("${IOCTL_MODE}" STREQUAL "ioctl") + set(IOCTL_MODE_NUM 1) +elseif ("${IOCTL_MODE}" STREQUAL "") + set(IOCTL_MODE_NUM 3) +else() + message(FATAL_ERROR "-DIOCTL_MODE=${IOCTL_MODE} is not a valid choice") +endif() + +# Configuration defaults + +if ("${IBACM_SERVER_MODE_DEFAULT}" STREQUAL "open") + set(IBACM_SERVER_MODE_DEFAULT "IBACM_SERVER_MODE_OPEN") +elseif ("${IBACM_SERVER_MODE_DEFAULT}" STREQUAL "loop") + set(IBACM_SERVER_MODE_DEFAULT "IBACM_SERVER_MODE_LOOP") +else() + set(IBACM_SERVER_MODE_DEFAULT "IBACM_SERVER_MODE_UNIX") +endif() + +if (IBACM_ACME_PLUS_KERNEL_ONLY_DEFAULT) + set(IBACM_ACME_PLUS_KERNEL_ONLY_DEFAULT 1) +else() + set(IBACM_ACME_PLUS_KERNEL_ONLY_DEFAULT 0) +endif() + +configure_file("${BUILDLIB}/config.h.in" "${BUILD_INCLUDE}/config.h" ESCAPE_QUOTES @ONLY) + +#------------------------- +# Sub-directories +add_subdirectory(ccan) +add_subdirectory(util) +add_subdirectory(Documentation) +add_subdirectory(kernel-boot) +add_subdirectory(kernel-headers) +# Libraries +add_subdirectory(libibumad) +add_subdirectory(libibumad/man) +add_subdirectory(libibverbs) +add_subdirectory(libibverbs/man) +add_subdirectory(librdmacm) +add_subdirectory(librdmacm/man) + +# Providers +if (HAVE_COHERENT_DMA) +add_subdirectory(providers/bnxt_re) +add_subdirectory(providers/cxgb4) # NO SPARSE +add_subdirectory(providers/efa) +add_subdirectory(providers/efa/man) +add_subdirectory(providers/hns) +add_subdirectory(providers/i40iw) # NO SPARSE +add_subdirectory(providers/mlx4) +add_subdirectory(providers/mlx4/man) +add_subdirectory(providers/mlx5) +add_subdirectory(providers/mlx5/man) +add_subdirectory(providers/mthca) +add_subdirectory(providers/ocrdma) +add_subdirectory(providers/qedr) +add_subdirectory(providers/vmw_pvrdma) +endif() + +add_subdirectory(providers/hfi1verbs) +add_subdirectory(providers/ipathverbs) +add_subdirectory(providers/rxe) +add_subdirectory(providers/rxe/man) +add_subdirectory(providers/siw) + +add_subdirectory(libibmad) +add_subdirectory(libibnetdisc) +add_subdirectory(libibnetdisc/man) +add_subdirectory(infiniband-diags) +add_subdirectory(infiniband-diags/scripts) +add_subdirectory(infiniband-diags/man) + +if (CYTHON_EXECUTABLE) + add_subdirectory(pyverbs) + add_subdirectory(tests) +endif() + +# Binaries +if (NOT NL_KIND EQUAL 0) + add_subdirectory(ibacm) # NO SPARSE +endif() + +if (NOT NL_KIND EQUAL 0) + add_subdirectory(iwpmd) +endif() +add_subdirectory(libibumad/tests) +add_subdirectory(libibverbs/examples) +add_subdirectory(librdmacm/examples) +if (UDEV_FOUND) + add_subdirectory(rdma-ndd) +endif() +add_subdirectory(srp_daemon) + +ibverbs_finalize() +rdma_finalize_libs() + +#------------------------- +# Display a summary +# Only report things that are non-ideal. 
+message(STATUS "Missing Optional Items:") +if (NOT HAVE_FUNC_ATTRIBUTE_ALWAYS_INLINE) + message(STATUS " Compiler attribute always_inline NOT supported") +endif() +if (NOT HAVE_FUNC_ATTRIBUTE_IFUNC) + message(STATUS " Compiler attribute ifunc NOT supported") +endif() +if (NOT HAVE_COHERENT_DMA) + message(STATUS " Architecture NOT able to do coherent DMA (check util/udma_barrier.h) some providers disabled!") +endif() +if (NOT HAVE_STDATOMIC) + message(STATUS " C11 stdatomic.h NOT available (old compiler)") +endif() +if (NOT HAVE_STATIC_ASSERT) + message(STATUS " C11 static_assert NOT available (old compiler)") +endif() +if (NOT HAVE_WORKING_STRICT_ALIASING) + message(STATUS " Compiler cannot do strict aliasing") +endif() +if (NOT HAVE_VALGRIND_MEMCHECK) + message(STATUS " Valgrind memcheck.h NOT enabled") +endif() +if (NOT HAVE_VALGRIND_DRD) + message(STATUS " Valgrind drd.h NOT enabled") +endif() +if (NL_KIND EQUAL 0) + message(STATUS " neighbour resolution NOT enabled") +else() + if (NOT HAVE_WORKING_IF_H) + message(STATUS " netlink/route/link.h and net/if.h NOT co-includable (old headers)") + endif() +endif() +if (NOT PANDOC_FOUND) + if (NOT EXISTS "${CMAKE_SOURCE_DIR}/buildlib/pandoc-prebuilt") + message(STATUS " pandoc NOT found and NO prebuilt man pages. 'install' disabled") + else() + message(STATUS " pandoc NOT found (using prebuilt man pages)") + endif() +endif() +if (NOT RST2MAN_FOUND) + if (NOT EXISTS "${CMAKE_SOURCE_DIR}/buildlib/pandoc-prebuilt") + message(STATUS " rst2man NOT found and NO prebuilt man pages. 'install' disabled") + else() + message(STATUS " rst2man NOT found (using prebuilt man pages)") + endif() +endif() +if (NOT CYTHON_EXECUTABLE) + message(STATUS " cython NOT found (disabling pyverbs)") +endif() +if (NOT SYSTEMD_FOUND) + message(STATUS " libsystemd NOT found (disabling features)") +endif() +if (NOT UDEV_FOUND) + message(STATUS " libudev NOT found (disabling features)") +endif() +if (NOT HAVE_C_WARNINGS) + message(STATUS " extended C warnings NOT supported") +endif() +if (NOT HAVE_NO_STRICT_ALIASING) + message(STATUS " -fno-strict-aliasing NOT supported") +endif() +if (NOT HAVE_C_WORKING_MISSING_FIELD_INITIALIZERS) + message(STATUS " -Wmissing-field-initializers does NOT work") +endif() +if (NOT HAVE_C_WORKING_SHADOW) + message(STATUS " -Wshadow does NOT work") +endif() +if (NOT HAVE_C_WREDUNDANT_DECLS) + message(STATUS " -Wredundant-decls does NOT work") +endif() +if (NOT HAVE_GLIBC_UAPI_COMPAT) + message(STATUS " libc netinet/in.h and linux/in.h do NOT coexist") +endif() +if (NOT HAVE_TARGET_SSE) + message(STATUS " attribute(target(\"sse\")) does NOT work") +endif() diff --git a/COPYING.BSD_FB b/COPYING.BSD_FB new file mode 100644 index 0000000..4423761 --- /dev/null +++ b/COPYING.BSD_FB @@ -0,0 +1,22 @@ + OpenIB.org BSD license (FreeBSD Variant) + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + - Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/COPYING.BSD_MIT b/COPYING.BSD_MIT new file mode 100644 index 0000000..a1432b6 --- /dev/null +++ b/COPYING.BSD_MIT @@ -0,0 +1,20 @@ + OpenIB.org BSD license (MIT variant) + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + - Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff --git a/COPYING.GPL2 b/COPYING.GPL2 new file mode 100644 index 0000000..d159169 --- /dev/null +++ b/COPYING.GPL2 @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. 
+These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. 
+ + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. 
However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. 
Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+
+                    END OF TERMS AND CONDITIONS
+
+        How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year> <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+    Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+    `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+    <signature of Ty Coon>, 1 April 1989
+    Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/COPYING.md b/COPYING.md
new file mode 100644
index 0000000..91f2fdd
--- /dev/null
+++ b/COPYING.md
@@ -0,0 +1,65 @@
+# Default Dual License
+
+Unless otherwise stated this software is available to you under a choice of
+one of two licenses. You may choose to be licensed under the terms of the
+OpenIB.org BSD (MIT variant) license (see COPYING.BSD_MIT) or the GNU General
+Public License (GPL) Version 2 (see COPYING.GPL2), both included in this
+package.
+
+Files marked 'See COPYING file' are licensed under the above Dual License.
+
+# Other Options
+
+Individual source files may use a license different from the above Default
+Dual License.
+If a license is declared in the file then it supersedes the Default License.
+
+If a directory contains a COPYING file then the License from that file becomes
+the Default License for files in that directory and below.
+
+# Copyright Holders
+
+Refer to individual files for information on the copyright holders.
+
+# License Catalog (Informative, Non Binding)
+
+## Utilities
+
+Utility source code that may be linked into any binary is available under
+several licenses:
+
+ - MIT license (see ccan/LICENSE.MIT)
+ - Creative Commons CC0 1.0 Universal License (see ccan/LICENSE.CC0)
+
+## Providers
+
+The following providers use a different license than the Default Dual
+License. Refer to files in each directory for details.
+
+hfi1verbs
+: Dual License: GPLv2 or Intel 3 clause BSD license
+
+ipathverbs
+: Dual License: GPLv2 or PathScale BSD Patent license
+
+ocrdma
+: Dual License: GPLv2 or OpenIB.org BSD (FreeBSD variant), see COPYING.BSD_FB
+
+## Libraries
+
+All library compilable source code (.c and .h files) is available under the
+Default Dual License.
+
+Unmarked ancillary files may be available under a Dual License: GPLv2 or
+OpenIB.org BSD (FreeBSD variant).
+
+## Tools (iwpmd, srp_daemon, ibacm)
+
+All compilable source code (.c and .h files) is available under the Default
+Dual License.
+
+Unmarked ancillary files may be available under a Dual License: GPLv2 or
+OpenIB.org BSD (FreeBSD variant).
+
+srp_daemon/srp_daemon/srp_daemon.sh: Any one of the GPLv2, a 2 clause BSD
+license, or the CPLv1.
diff --git a/Documentation/CMakeLists.txt b/Documentation/CMakeLists.txt
new file mode 100644
index 0000000..bda16fa
--- /dev/null
+++ b/Documentation/CMakeLists.txt
@@ -0,0 +1,11 @@
+install(FILES
+  ibacm.md
+  ibsrpdm.md
+  libibverbs.md
+  librdmacm.md
+  rxe.md
+  udev.md
+  tag_matching.md
+  ../README.md
+  ../MAINTAINERS
+  DESTINATION "${CMAKE_INSTALL_DOCDIR}")
diff --git a/Documentation/azure-pipelines.md b/Documentation/azure-pipelines.md
new file mode 100644
index 0000000..4d95313
--- /dev/null
+++ b/Documentation/azure-pipelines.md
@@ -0,0 +1,102 @@
+# Azure Pipelines Continuous Integration
+
+rdma-core uses Azure Pipelines to run a variety of compile tests on every pull
+request. These tests are intended to run through a variety of distribution
+configurations, with the goal of having rdma-core build and work on a wide
+range of distributions.
+
+The system consists of several components:
+ - An Azure Container Registry
+ - The script buildlib/cbuild to produce the container images representing the
+   test scenarios
+ - The instructions in buildlib/azure-pipelines.yml and related support scripts
+ - An Azure Pipelines account linked to the rdma-core GitHub
+ - A GitHub Check
+
+Things are arranged so that the cbuild script can run the same commands in the
+same containers on the local docker system; it does not rely on any special or
+unique capabilities of Azure Pipelines.
+
+# The Containers
+
+Containers are built with the cbuild script. Internally it generates a
+Dockerfile and builds a docker container.
+
+```sh
+$ buildlib/cbuild build-images centos7
+```
+
+cbuild has definitions for a wide range of platforms that are interesting to
+test.
+
+## Uploading Containers
+
+Containers that are used by Azure Pipelines are prefixed with
+ucfconsort.azurecr.io/rdma-core/ to indicate they are served from that docker
+registry (which is implemented as an Azure Container Registry service).
+
+Once built, the container should be uploaded with:
+
+```sh
+# Needed one time
+$ az login
+
+$ sudo az acr login --name ucfconsort
+$ sudo docker push ucfconsort.azurecr.io/rdma-core/centos7:latest
+```
+
+The user will need to be authorized to access the private registry.
+
+## Testing containers locally
+
+cbuild has several modes for doing local testing on the container.
+
+The fastest is to use 'cbuild make' as a replacement for Ninja. It will run
+cmake and ninja commands inside the container, but using the local source
+tree unmodified. This is useful to test and resolve compilation problems.
+
+```sh
+$ buildlib/cbuild make centos7
+```
+
+Using 'make --run-shell' will perform all container setup, but instead of
+running Ninja it will open a bash shell inside the same container
+environment. This is useful to test and debug the container contents.
+
+Package builds can be tested using 'cbuild pkg'. This automatically generates
+a source .tar.gz and then runs rpmbuild/etc within the container. This is
+useful for testing the package building scripts. Note that any changes must be
+checked in or they will not be included.
+
+Package builds are among the tests that Azure Pipelines runs.
+
+# Azure Pipelines
+
+The actions are controlled by the content of buildlib/azure-pipelines.yml. The
+process is fairly straightforward and consists of both running distribution
+package builds and a series of different compilers and analysis checks.
+
+The compiler checks are run in a special 'azure-pipelines' container that has
+several compilers, ARM64 cross compilation, and other things.
+
+cbuild is able to run an emulation of the pipelines commands using
+'buildlib/cbuild pkg azp'.
+
+## Azure Pipelines Security
+
+Microsoft has a strange security model - by default they do not send any login
+secrets to the VM if the VM is triggered from a GitHub Pull Request. This is
+required as the VM runs code from the PR, and a hostile PR could exfiltrate
+the secret data.
+
+However, since fetching the containers requires a security token, it means PRs
+cannot get the containers, and are basically entirely useless. The only option
+Azure Pipelines has is to inject *all* security tokens, including the GitHub
+token, which is madness.
+
+The compromise is that when a non-team member proposes a Pull Request, a
+team member must review it and add "/azp run" to the comments to acknowledge
+that the PR content is not hostile.
+
+See
+
+https://developercommunity.visualstudio.com/content/idea/392281/granular-permissions-on-secrets-for-github-fork-pu.html
diff --git a/Documentation/ibacm.md b/Documentation/ibacm.md
new file mode 100644
index 0000000..8ed293d
--- /dev/null
+++ b/Documentation/ibacm.md
@@ -0,0 +1,109 @@
+# The Assistant for InfiniBand Communication Management (IB ACM)
+
+The IB ACM library implements and provides a framework for name, address, and
+route resolution services over InfiniBand. The IB ACM provides information
+needed to establish a connection, but does not implement the CM protocol.
+
+IB ACM services are used by librdmacm to implement the rdma_resolve_addr,
+rdma_resolve_route, and rdma_getaddrinfo routines.
+
+The IB ACM is focused on being scalable and efficient. The current
+implementation limits network traffic, SA interactions, and centralized
+services. ACM supports multiple resolution protocols in order to handle
+different fabric topologies.
+
+This release is limited in its handling of dynamic changes.
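+
+As an illustration of how applications consume these services indirectly, the
+hedged sketch below resolves an address through librdmacm's rdma_getaddrinfo(),
+one of the routines listed above. It is a minimal sketch, not part of ibacm
+itself; the host name and port are placeholder values and error handling is
+minimal:
+
+```c
+#include <stdio.h>
+#include <rdma/rdma_cma.h>
+
+int main(void)
+{
+	/* Ask for a reliable, connection-oriented port space */
+	struct rdma_addrinfo hints = { .ai_port_space = RDMA_PS_TCP };
+	struct rdma_addrinfo *res;
+
+	/* "node-1" and "7471" are placeholder values */
+	if (rdma_getaddrinfo("node-1", "7471", &hints, &res)) {
+		perror("rdma_getaddrinfo");
+		return 1;
+	}
+	/* res->ai_src_addr / res->ai_dst_addr feed connection setup */
+	rdma_freeaddrinfo(res);
+	return 0;
+}
+```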
+
+The IB ACM package is comprised of two components: the ibacm service
+and a test/configuration utility - ib_acme.
+
+# Details
+
+### ib_acme
+
+The ib_acme program serves a dual role. It acts as a utility to test
+ibacm operation and helps verify whether the ibacm service and selected
+protocol are usable for a given cluster configuration. Additionally,
+it automatically generates ibacm configuration files to assist with
+or eliminate manual setup.
+
+
+### acm configuration files
+
+The ibacm service relies on two configuration files.
+
+The acm_addr.cfg file contains name and address mappings for each IB
+<device, port, pkey> endpoint. Although the names in the acm_addr.cfg
+file can be anything, ib_acme maps the host name and IP addresses to
+the IB endpoints.
+
+The acm_opts.cfg file provides a set of configurable options for the
+ibacm service, such as timeout, number of retries, logging level, etc.
+ib_acme generates the acm_opts.cfg file using static information. A
+future enhancement would adjust options based on the current system
+and cluster size.
+
+### ibacm
+
+The ibacm service is responsible for resolving names and addresses to
+InfiniBand path information and caching such data. It is implemented as a
+daemon that executes with administrative privileges.
+
+The ibacm service implements a client interface over TCP sockets, which is
+abstracted by the librdmacm library. One or more back-end protocols are
+used by the ibacm service to satisfy user requests. Although
+ibacm supports standard SA path record queries on the back-end, it
+provides an experimental multicast resolution protocol in the hope of
+achieving greater scalability. The latter is not usable on all fabric
+topologies, specifically ones that may not have reversible paths.
+Users should use the ib_acme utility to verify that the multicast protocol
+is usable before running other applications.
+
+Conceptually, the ibacm service implements an ARP-like protocol and either
+uses IB multicast records to construct path record data or queries the
+SA directly, depending on the selected route protocol. By default, the
+ibacm service uses and caches SA path record queries.
+
+Specifically, all IB endpoints join a number of multicast groups.
+Multicast groups differ based on rates, mtu, sl, etc., and are prioritized.
+All participating endpoints must be able to communicate on the lowest
+priority multicast group. The ibacm assigns one or more names/addresses
+to each IB endpoint using the acm_addr.cfg file. Clients provide source
+and destination names or addresses as input to the service, and receive
+as output path record data.
+
+The service maps a client's source name/address to a local IB endpoint.
+If a client does not provide a source address, then the ibacm service
+will select one based on the destination and local routing tables. If the
+destination name/address is not cached locally, it sends a multicast
+request out on the lowest priority multicast group on the local endpoint.
+The request carries a list of multicast groups that the sender can use.
+The recipient of the request selects the highest priority multicast group
+that it can use as well and returns that information directly to the sender.
+The request data is cached by all endpoints that receive the multicast
+request message. The source endpoint also caches the response and uses
+the multicast group that was selected to construct or obtain path record
+data, which is returned to the client.
+ +The current implementation of the IB ACM has several additional restrictions: +- The ibacm is limited in its handling of dynamic changes; + the ibacm should be stopped and restarted if a cluster is reconfigured. +- Support for IPv6 has not been verified. +- The number of addresses that can be assigned to a single endpoint is + limited to 4. +- The number of multicast groups that an endpoint can support is limited to 2. + +The ibacm contains several internal caches. These include caches for +GID and LID destination addresses. These caches can be optionally +preloaded. ibacm supports the OpenSM dump_pr plugin "full" PathRecord +format which is used to preload these caches. The file format is specified +in the ibacm_opts.cfg file via the route_preload setting which should +be set to opensm_full_v1 for this file format. Default format is +none which does not preload these caches. See dump_pr.notes.txt in dump_pr +for more information on the opensm_full_v1 file format and how to configure +OpenSM to generate this file. + +Additionally, the name, IPv4, and IPv6 caches can be be preloaded by using +the addr_preload option. The default is none which does not preload these +caches. To preload these caches, set this option to acm_hosts and +configure the addr_data_file appropriately. diff --git a/Documentation/ibsrpdm.md b/Documentation/ibsrpdm.md new file mode 100644 index 0000000..0fc544d --- /dev/null +++ b/Documentation/ibsrpdm.md @@ -0,0 +1,41 @@ +# Using ibsrpdm + +ibsrpdm is used for discovering and connecting to SRP SCSI targets on +InfiniBand fabrics. These targets can be accessed with the InfiniBand SRP +initiator module, "ib_srp," included in Linux kernels 2.6.15 and newer. + +To run ibsrpdm, the ib_umad module must be loaded, as well as an appropriate +low-level driver for the installed IB hardware. + +With no command line parameters, ibsrpdm displays information about +SRP targets in human-readable form: + + # ibsrpdm + IO Unit Info: + port LID: 0009 + port GID: fe800000000000000005ad00000013e9 + change ID: 73b0 + max controllers: 0x01 + + controller[ 1] + GUID: 0005ad00000013e7 + vendor ID: 0005ad + device ID: 0005ad + IO class : 0100 + ID: Topspin SRP/FC TCA + service entries: 2 + service[ 0]: 0000000000000066 / SRP.T10:20030003BA27CC7A + service[ 1]: 0000000000000066 / SRP.T10:20030003BA27CF53 + +With the "-c" flag, ibsrpdm displays information in a form that can be +written to the kernel SRP initiators add_target file to connect to the +SRP targets. For example: + + # ibsrpdm -c + id_ext=20030003BA27CC7A,ioc_guid=0005ad00000013e7,dgid=fe800000000000000005ad00000013e9,pkey=ffff,service_id=0000000000000066 + id_ext=20030003BA27CF53,ioc_guid=0005ad00000013e7,dgid=fe800000000000000005ad00000013e9,pkey=ffff,service_id=0000000000000066 + +Given this, the command below will connect to the first target +discovered from the first port of the local HCA device "mthca0": + + # echo -n id_ext=20030003BA27CC7A,ioc_guid=0005ad00000013e7,dgid=fe800000000000000005ad00000013e9,pkey=ffff,service_id=0000000000000066 > /sys/class/infiniband_srp/srp-mthca0-1/add_target diff --git a/Documentation/libibverbs.md b/Documentation/libibverbs.md new file mode 100644 index 0000000..cbe076e --- /dev/null +++ b/Documentation/libibverbs.md @@ -0,0 +1,58 @@ +# Introduction + +libibverbs is a library that allows programs to use RDMA "verbs" for +direct access to RDMA (currently InfiniBand and iWARP) hardware from +userspace. For more information on RDMA verbs, see the InfiniBand +Architecture Specification vol. 
1, especially chapter 11, and the RDMA
+Consortium's RDMA Protocol Verbs Specification.
+
+# Using libibverbs
+
+### Device nodes
+
+The verbs library expects special character device files named
+/dev/infiniband/uverbsN to be created. When you load the kernel
+modules, including both the low-level driver for your IB hardware as
+well as the ib_uverbs module, you should see one or more uverbsN
+entries in /sys/class/infiniband_verbs in addition to the
+/dev/infiniband/uverbsN character device files.
+
+To create the appropriate character device files automatically with
+udev, a rule like
+
+    KERNEL="uverbs*", NAME="infiniband/%k"
+
+can be used. This will create device nodes named
+
+    /dev/infiniband/uverbs0
+
+and so on. Since the RDMA userspace verbs should be safe for use by
+non-privileged users, you may want to add an appropriate MODE or GROUP
+to your udev rule.
+
+### Permissions
+
+To use IB verbs from userspace, a process must be able to access the
+appropriate /dev/infiniband/uverbsN special device file. You can
+check the permissions on this file with the command
+
+    ls -l /dev/infiniband/uverbs*
+
+Make sure that the permissions on these files are such that the
+user/group that your verbs program runs as can access the device file.
+
+To use IB verbs from userspace, a process must also have permission to
+tell the kernel to lock sufficient memory for all of your registered
+memory regions as well as the memory used internally by IB resources
+such as queue pairs (QPs) and completion queues (CQs). To check your
+resource limits, use the command
+
+    ulimit -l
+
+(or "limit memorylocked" for csh-like shells).
+
+If you see a small number such as 32 (the units are KB) then you will
+need to increase this limit. This is usually done for ordinary users
+via the file /etc/security/limits.conf. More configuration may be
+necessary if you are logging in via OpenSSH and your sshd is
+configured to use privilege separation.
diff --git a/Documentation/librdmacm.md b/Documentation/librdmacm.md
new file mode 100644
index 0000000..7b8bc45
--- /dev/null
+++ b/Documentation/librdmacm.md
@@ -0,0 +1,46 @@
+# Device files
+
+The userspace CMA uses a single device file regardless of the number
+of adapters or ports present.
+
+To create the appropriate character device file automatically with
+udev, a rule like
+
+    KERNEL="rdma_cm", NAME="infiniband/%k", MODE="0666"
+
+can be used. This will create the device node named
+
+    /dev/infiniband/rdma_cm
+
+or you can create it manually
+
+    mknod /dev/infiniband/rdma_cm c 231 255
+
+
+# Common issues
+
+Using multiple interfaces
+: The librdmacm does support multiple interfaces. To make use
+  of multiple interfaces, however, you need to instruct Linux
+  to only send ARP replies on the interface targeted in the ARP
+  request. This can be done using a command similar to the
+  following:
+
+      sysctl -w net.ipv4.conf.all.arp_ignore=2
+
+  Without this change, it's possible for Linux to respond to ARP
+  requests on a different interface (IP address) than the IP
+  address carried in the ARP request. This causes the RDMA stack
+  to incorrectly map the remote IP address to the wrong RDMA
+  device.
+
+Using loopback
+: The librdmacm relies on ARP to resolve IP addresses to RDMA
+  addresses. To support loopback connections between different
+  ports on the same system, ARP must be enabled for local
+  resolution:
+
+      sysctl net.ipv4.conf.all.accept_local=1
+
+  Without this setting, loopback connections may time out
+  during address resolution.
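+
+Both sysctl settings shown above take effect immediately but are not
+persistent across reboots. Assuming a standard sysctl setup, they can be made
+persistent by adding the following lines to /etc/sysctl.conf (or a file under
+/etc/sysctl.d/):
+
+    net.ipv4.conf.all.arp_ignore=2
+    net.ipv4.conf.all.accept_local=1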
diff --git a/Documentation/pyverbs.md b/Documentation/pyverbs.md new file mode 100755 index 0000000..3577171 --- /dev/null +++ b/Documentation/pyverbs.md @@ -0,0 +1,613 @@ +# Pyverbs + +Pyverbs provides a Python API over rdma-core, the Linux userspace C API for +the RDMA stack. + +## Goals + +1. Provide easier access to RDMA: RDMA has a steep learning curve as is and + the C interface requires the user to initialize multiple structs before + having usable objects. Pyverbs attempts to remove much of this overhead and + provide a smoother user experience. +2. Improve our code by providing a test suite for rdma-core. This means that + new features will be tested before merge, and it also means that users and + distros will have tests for new and existing features, as well as the means + to create them quickly. +3. Stay up-to-date with rdma-core - cover new features during development and + provide a test / unit-test alongside the feature. + +## Limitations + +Python handles memory for users. As a result, memory is allocated by Pyverbs +when needed (e.g. user buffer for memory region). The memory will be accessible +to the users, but not allocated or freed by them. + +## Usage Examples +Note that all examples use a hard-coded device name ('mlx5_0'). +##### Open an IB device + +Import the device module and open a device by name: + +```python +import pyverbs.device as d +ctx = d.Context(name='mlx5_0') +``` + +'ctx' is Pyverbs' equivalent to rdma-core's ibv_context. At this point, the IB +device is already open and ready to use. + +##### Query a device +```python +import pyverbs.device as d +ctx = d.Context(name='mlx5_0') +attr = ctx.query_device() +print(attr) +FW version : 16.24.0185 +Node guid : 9803:9b03:0000:e4c6 +Sys image GUID : 9803:9b03:0000:e4c6 +Max MR size : 0xffffffffffffffff +Page size cap : 0xfffffffffffff000 +Vendor ID : 0x2c9 +Vendor part ID : 4119 +HW version : 0 +Max QP : 262144 +Max QP WR : 32768 +Device cap flags : 3983678518 +Max SGE : 30 +Max SGE RD : 30 +MAX CQ : 16777216 +Max CQE : 4194303 +Max MR : 16777216 +Max PD : 16777216 +Max QP RD atom : 16 +Max EE RD atom : 0 +Max res RD atom : 4194304 +Max QP init RD atom : 16 +Max EE init RD atom : 0 +Atomic caps : 1 +Max EE : 0 +Max RDD : 0 +Max MW : 16777216 +Max raw IPv6 QPs : 0 +Max raw ethy QP : 0 +Max mcast group : 2097152 +Max mcast QP attach : 240 +Max AH : 2147483647 +Max FMR : 0 +Max map per FMR : 2147483647 +Max SRQ : 8388608 +Max SRQ WR : 32767 +Max SRQ SGE : 31 +Max PKeys : 128 +local CA ack delay : 16 +Phys port count : 1 +``` + +'attr' is Pyverbs' equivalent to ibv_device_attr. Pyverbs will provide it to +the user upon completion of the call to ibv_query_device. + +##### Query GID + +```python +import pyverbs.device as d +ctx = d.Context(name='mlx5_0') +gid = ctx.query_gid(port_num=1, index=3) +print(gid) +0000:0000:0000:0000:0000:ffff:0b87:3c08 +``` + +'gid' is Pyverbs' equivalent to ibv_gid, provided to the user by Pyverbs. + +##### Query port +The following code snippet provides an example of pyverbs' equivalent of +querying a port. Context's query_port() command wraps ibv_query_port(). +The example below queries the first port of the device. 
+```python +import pyverbs.device as d +ctx=d.Context(name='mlx5_0') +port_attr = ctx.query_port(1) +print(port_attr) +Port state : Active (4) +Max MTU : 4096 (5) +Active MTU : 1024 (3) +SM lid : 0 +Port lid : 0 +lmc : 0x0 +Link layer : Ethernet +Max message size : 0x40000000 +Port cap flags : IBV_PORT_CM_SUP IBV_PORT_IP_BASED_GIDS +Port cap flags 2 : +max VL num : 0 +Bad Pkey counter : 0 +Qkey violations counter : 0 +Gid table len : 256 +Pkey table len : 1 +SM sl : 0 +Subnet timeout : 0 +Init type reply : 0 +Active width : 4X (2) +Ative speed : 25.0 Gbps (32) +Phys state : Link up (5) +Flags : 1 +``` + +##### Extended query device +The example below shows how to open a device using pyverbs and query the +extended device's attributes. +Context's query_device_ex() command wraps ibv_query_device_ex(). +```python +import pyverbs.device as d + +ctx = d.Context(name='mlx5_0') +attr = ctx.query_device_ex() +attr.max_dm_size +131072 +attr.rss_caps.max_rwq_indirection_table_size +2048 +``` + +#### Create RDMA objects +##### PD +The following example shows how to open a device and use its context to create +a PD. +```python +import pyverbs.device as d +from pyverbs.pd import PD + +with d.Context(name='mlx5_0') as ctx: + pd = PD(ctx) +``` +##### MR +The example below shows how to create a MR using pyverbs. Similar to C, a +device must be opened prior to creation and a PD has to be allocated. +```python +import pyverbs.device as d +from pyverbs.pd import PD +from pyverbs.mr import MR +import pyverbs.enums as e + +with d.Context(name='mlx5_0') as ctx: + with PD(ctx) as pd: + mr_len = 1000 + flags = e.IBV_ACCESS_LOCAL_WRITE + mr = MR(pd, mr_len, flags) +``` +##### Memory window +The following example shows the equivalent of creating a type 1 memory window. +It includes opening a device and allocating the necessary PD. +```python +import pyverbs.device as d +from pyverbs.pd import PD +from pyverbs.mr import MW +import pyverbs.enums as e + +with d.Context(name='mlx5_0') as ctx: + with PD(ctx) as pd: + mw = MW(pd, e.IBV_MW_TYPE_1) +``` +##### Device memory +The following snippet shows how to allocate a DM - a direct memory object, +using the device's memory. +```python +import random + +from pyverbs.device import DM, AllocDmAttr +import pyverbs.device as d + +with d.Context(name='mlx5_0') as ctx: + attr = ctx.query_device_ex() + if attr.max_dm_size != 0: + dm_len = random.randint(4, attr.max_dm_size) + dm_attrs = AllocDmAttr(dm_len) + dm = DM(ctx, dm_attrs) +``` + +##### DM MR +The example below shows how to open a DMMR - device memory MR, using the +device's own memory rather than a user-allocated buffer. +```python +import random + +from pyverbs.device import DM, AllocDmAttr +from pyverbs.mr import DMMR +import pyverbs.device as d +from pyverbs.pd import PD +import pyverbs.enums as e + +with d.Context(name='mlx5_0') as ctx: + attr = ctx.query_device_ex() + if attr.max_dm_size != 0: + dm_len = random.randint(4, attr.max_dm_size) + dm_attrs = AllocDmAttr(dm_len) + dm_mr_len = random.randint(4, dm_len) + with DM(ctx, dm_attrs) as dm: + with PD(ctx) as pd: + dm_mr = DMMR(pd, dm_mr_len, e.IBV_ACCESS_ZERO_BASED, dm=dm, + offset=0) +``` + +##### CQ +The following snippets show how to create CQs using pyverbs. Pyverbs supports +both CQ and extended CQ (CQEX). +As in C, a completion queue can be created with or without a completion +channel, the snippets show that. +CQ's 3rd parameter is cq_context, a user-defined context. We're using None in +our snippets. 
+```python +import random + +from pyverbs.cq import CompChannel, CQ +import pyverbs.device as d + +with d.Context(name='mlx5_0') as ctx: + num_cqes = random.randint(0, 200) # Just arbitrary values. Max value can be + # found in device attributes + comp_vector = 0 # An arbitrary value. comp_vector is limited by the + # context's num_comp_vectors + if random.choice([True, False]): + with CompChannel(ctx) as cc: + cq = CQ(ctx, num_cqes, None, cc, comp_vector) + else: + cq = CQ(ctx, num_cqes, None, None, comp_vector) + print(cq) +CQ +Handle : 0 +CQEs : 63 +``` + +```python +import random + +from pyverbs.cq import CqInitAttrEx, CQEX +import pyverbs.device as d +import pyverbs.enums as e + +with d.Context(name='mlx5_0') as ctx: + num_cqe = random.randint(0, 200) + wc_flags = e.IBV_WC_EX_WITH_CVLAN + comp_mask = 0 # Not using flags in this example + # completion channel is not used in this example + attrs = CqInitAttrEx(cqe=num_cqe, wc_flags=wc_flags, comp_mask=comp_mask, + flags=0) + print(attrs) + cq_ex = CQEX(ctx, attrs) + print(cq_ex) + Number of CQEs : 10 +WC flags : IBV_WC_EX_WITH_CVLAN +comp mask : 0 +flags : 0 + +Extended CQ: +Handle : 0 +CQEs : 15 +``` + +##### Addressing related objects +The following code demonstrates creation of GlobalRoute, AHAttr and AH objects. +The example creates a global AH so it can also run on RoCE without +modifications. +```python + +from pyverbs.addr import GlobalRoute, AHAttr, AH +import pyverbs.device as d +from pyverbs.pd import PD + +with d.Context(name='mlx5_0') as ctx: + port_number = 1 + gid_index = 0 # GID index 0 always exists and valid + gid = ctx.query_gid(port_number, gid_index) + gr = GlobalRoute(dgid=gid, sgid_index=gid_index) + ah_attr = AHAttr(gr=gr, is_global=1, port_num=port_number) + print(ah_attr) + with PD(ctx) as pd: + ah = AH(pd, attr=ah_attr) +DGID : fe80:0000:0000:0000:9a03:9bff:fe00:e4bf +flow label : 0 +sgid index : 0 +hop limit : 1 +traffic class : 0 +``` + +##### QP +The following snippets will demonstrate creation of a QP and a simple post_send +operation. For more complex examples, please see pyverbs/examples section. +```python +from pyverbs.qp import QPCap, QPInitAttr, QPAttr, QP +from pyverbs.addr import GlobalRoute +from pyverbs.addr import AH, AHAttr +import pyverbs.device as d +import pyverbs.enums as e +from pyverbs.pd import PD +from pyverbs.cq import CQ +import pyverbs.wr as pwr + + +ctx = d.Context(name='mlx5_0') +pd = PD(ctx) +cq = CQ(ctx, 100, None, None, 0) +cap = QPCap(100, 10, 1, 1, 0) +qia = QPInitAttr(cap=cap, qp_type = e.IBV_QPT_UD, scq=cq, rcq=cq) +# A UD QP will be in RTS if a QPAttr object is provided +udqp = QP(pd, qia, QPAttr()) +port_num = 1 +gid_index = 3 # Hard-coded for RoCE v2 interface +gid = ctx.query_gid(port_num, gid_index) +gr = GlobalRoute(dgid=gid, sgid_index=gid_index) +ah_attr = AHAttr(gr=gr, is_global=1, port_num=port_num) +ah=AH(pd, ah_attr) +wr = pwr.SendWR() +wr.set_wr_ud(ah, 0x1101, 0) # in real life, use real values +udqp.post_send(wr) +``` +###### Extended QP +An extended QP exposes a new set of QP send operations to the user - +extensibility for new send opcodes, vendor specific send opcodes and even vendor +specific QP types. +Pyverbs now exposes the needed interface to create such a QP. +Note that the IBV_QP_INIT_ATTR_SEND_OPS_FLAGS in the `comp_mask` is mandatory +when using the extended QP's new post send mechanism. 
+```python
+from pyverbs.qp import QPCap, QPInitAttrEx, QPAttr, QPEx
+import pyverbs.device as d
+import pyverbs.enums as e
+from pyverbs.pd import PD
+from pyverbs.cq import CQ
+
+
+ctx = d.Context(name='mlx5_0')
+pd = PD(ctx)
+cq = CQ(ctx, 100)
+cap = QPCap(100, 10, 1, 1, 0)
+qia = QPInitAttrEx(qp_type=e.IBV_QPT_UD, scq=cq, rcq=cq, cap=cap, pd=pd,
+                   comp_mask=e.IBV_QP_INIT_ATTR_SEND_OPS_FLAGS | \
+                   e.IBV_QP_INIT_ATTR_PD)
+qp = QPEx(ctx, qia)
+```
+
+##### XRCD
+The following code demonstrates creation of an XRCD object.
+```python
+from pyverbs.xrcd import XRCD, XRCDInitAttr
+import pyverbs.device as d
+import pyverbs.enums as e
+import stat
+import os
+
+
+ctx = d.Context(name='ibp0s8f0')
+xrcd_fd = os.open('/tmp/xrcd', os.O_RDONLY | os.O_CREAT,
+                  stat.S_IRUSR | stat.S_IRGRP)
+init = XRCDInitAttr(e.IBV_XRCD_INIT_ATTR_FD | e.IBV_XRCD_INIT_ATTR_OFLAGS,
+                    os.O_CREAT, xrcd_fd)
+xrcd = XRCD(ctx, init)
+```
+
+##### SRQ
+The following code snippet will demonstrate creation of an XRC SRQ object.
+For more complex examples, please see pyverbs/tests/test_odp.
+```python
+from pyverbs.xrcd import XRCD, XRCDInitAttr
+from pyverbs.srq import SRQ, SrqInitAttrEx
+import pyverbs.device as d
+import pyverbs.enums as e
+from pyverbs.cq import CQ
+from pyverbs.pd import PD
+import stat
+import os
+
+
+ctx = d.Context(name='ibp0s8f0')
+pd = PD(ctx)
+cq = CQ(ctx, 100, None, None, 0)
+xrcd_fd = os.open('/tmp/xrcd', os.O_RDONLY | os.O_CREAT,
+                  stat.S_IRUSR | stat.S_IRGRP)
+init = XRCDInitAttr(e.IBV_XRCD_INIT_ATTR_FD | e.IBV_XRCD_INIT_ATTR_OFLAGS,
+                    os.O_CREAT, xrcd_fd)
+xrcd = XRCD(ctx, init)
+
+srq_attr = SrqInitAttrEx(max_wr=10)
+srq_attr.srq_type = e.IBV_SRQT_XRC
+srq_attr.pd = pd
+srq_attr.xrcd = xrcd
+srq_attr.cq = cq
+srq_attr.comp_mask = e.IBV_SRQ_INIT_ATTR_TYPE | e.IBV_SRQ_INIT_ATTR_PD | \
+                     e.IBV_SRQ_INIT_ATTR_CQ | e.IBV_SRQ_INIT_ATTR_XRCD
+srq = SRQ(ctx, srq_attr)
+```
+
+##### Open an mlx5 provider
+A provider is essentially a Context with driver-specific extra features. As
+such, it inherits from Context. In the legacy flow, Context iterates over the
+IB devices and opens the one that matches the name given by the user (name=
+argument). When provider attributes are also given (attr=), the Context will
+assign the relevant ib_device to its device member, so that the provider will
+be able to open the device in its specific way, as demonstrated below:
+
+```python
+import pyverbs.providers.mlx5.mlx5dv as m
+from pyverbs.pd import PD
+attr = m.Mlx5DVContextAttr()  # Default values are fine
+ctx = m.Mlx5Context(attr=attr, name='rocep0s8f0')
+# The provider context can be used as a regular Context, e.g.:
+pd = PD(ctx)  # Success
+```
+
+##### Query an mlx5 provider
+After opening an mlx5 provider, users can use the device-specific query for
+non-legacy attributes. The following snippet demonstrates how to do that.
+```python +import pyverbs.providers.mlx5.mlx5dv as m +ctx = m.Mlx5Context(attr=m.Mlx5DVContextAttr(), name='ibp0s8f0') +mlx5_attrs = ctx.query_mlx5_device() +print(mlx5_attrs) +Version : 0 +Flags : CQE v1, Support CQE 128B compression, Support CQE 128B padding, Support packet based credit mode (in RC QP) +comp mask : CQE compression, SW parsing, Striding RQ, Tunnel offloads, Dynamic BF regs, Clock info update, Flow action flags +CQE compression caps: + max num : 64 + supported formats : with hash, with RX checksum CSUM, with stride index +SW parsing caps: + SW parsing offloads : + supported QP types : +Striding RQ caps: + min single stride log num of bytes: 6 + max single stride log num of bytes: 13 + min single wqe log num of strides: 9 + max single wqe log num of strides: 16 + supported QP types : Raw Packet +Tunnel offloads caps: +Max dynamic BF registers: 1024 +Max clock info update [nsec]: 1099511 +Flow action flags : 0 +``` + +##### Create an mlx5 QP +Using an Mlx5Context object, one can create either a legacy QP (creation +process is the same) or an mlx5 QP. An mlx5 QP is a QP by inheritance but its +constructor receives a keyword argument named `dv_init_attr`. If the user +provides it, the QP will be created using `mlx5dv_create_qp` rather than +`ibv_create_qp_ex`. The following snippet demonstrates how to create both a DC +(dynamically connected) QP and a Raw Packet QP which uses mlx5-specific +capabilities, unavailable using the legacy interface. Currently, pyverbs +supports only creation of a DCI. DCT support will be added in one of the +following PRs. +```python +from pyverbs.providers.mlx5.mlx5dv import Mlx5Context, Mlx5DVContextAttr +from pyverbs.providers.mlx5.mlx5dv import Mlx5DVQPInitAttr, Mlx5QP +import pyverbs.providers.mlx5.mlx5_enums as me +from pyverbs.qp import QPInitAttrEx, QPCap +import pyverbs.enums as e +from pyverbs.cq import CQ +from pyverbs.pd import PD + +with Mlx5Context(name='rocep0s8f0', attr=Mlx5DVContextAttr()) as ctx: + with PD(ctx) as pd: + with CQ(ctx, 100) as cq: + cap = QPCap(100, 0, 1, 0) + # Create a DC QP of type DCI + qia = QPInitAttrEx(cap=cap, pd=pd, scq=cq, qp_type=e.IBV_QPT_DRIVER, + comp_mask=e.IBV_QP_INIT_ATTR_PD, rcq=cq) + attr = Mlx5DVQPInitAttr(comp_mask=me.MLX5DV_QP_INIT_ATTR_MASK_DC) + attr.dc_type = me.MLX5DV_DCTYPE_DCI + + dci = Mlx5QP(ctx, qia, dv_init_attr=attr) + + # Create a Raw Packet QP using mlx5-specific capabilities + qia.qp_type = e.IBV_QPT_RAW_PACKET + attr.comp_mask = me.MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS + attr.create_flags = me.MLX5DV_QP_CREATE_ALLOW_SCATTER_TO_CQE |\ + me.MLX5DV_QP_CREATE_TIR_ALLOW_SELF_LOOPBACK_UC |\ + me.MLX5DV_QP_CREATE_TUNNEL_OFFLOADS + qp = Mlx5QP(ctx, qia, dv_init_attr=attr) +``` + +##### Create an mlx5 CQ +Mlx5Context also allows users to create an mlx5 specific CQ. The Mlx5CQ inherits +from CQEX, but its constructor receives 3 parameters instead of 2. The 3rd +parameter is a keyword argument named `dv_init_attr`. If provided by the user, +the CQ will be created using `mlx5dv_create_cq`. +The following snippet shows this simple creation process. 
+```python +from pyverbs.providers.mlx5.mlx5dv import Mlx5Context, Mlx5DVContextAttr +from pyverbs.providers.mlx5.mlx5dv import Mlx5DVCQInitAttr, Mlx5CQ +import pyverbs.providers.mlx5.mlx5_enums as me +from pyverbs.cq import CqInitAttrEx + +with Mlx5Context(name='rocep0s8f0', attr=Mlx5DVContextAttr()) as ctx: + cqia = CqInitAttrEx() + mlx5_cqia = Mlx5DVCQInitAttr(comp_mask=me.MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE, + cqe_comp_res_format=me.MLX5DV_CQE_RES_FORMAT_CSUM) + cq = Mlx5CQ(ctx, cqia, dv_init_attr=mlx5_cqia) +``` + +##### CMID +The following code snippet will demonstrate creation of a CMID object, which +represents rdma_cm_id C struct, and establish connection between two peers. +Currently only synchronous control path is supported (rdma_create_ep). +For more complex examples, please see tests/test_rdmacm. +```python +from pyverbs.qp import QPInitAttr, QPCap +from pyverbs.cmid import CMID, AddrInfo +import pyverbs.cm_enums as ce + + +cap = QPCap(max_recv_wr=1) +qp_init_attr = QPInitAttr(cap=cap) +server = '11.137.14.124' +port = '7471' + +# Passive side + +sai = AddrInfo(server, port, ce.RDMA_PS_TCP, ce.RAI_PASSIVE) +sid = CMID(creator=sai, qp_init_attr=qp_init_attr) +sid.listen() # listen for incoming connection requests +new_id = sid.get_request() # check if there are any connection requests +new_id.accept() # new_id is connected to remote peer and ready to communicate + +# Active side + +cai = AddrInfo(server, port, ce.RDMA_PS_TCP) +cid = CMID(creator=cai, qp_init_attr=qp_init_attr) +cid.connect() # send connection request to server +``` + +##### ParentDomain +The following code demonstrates the creation of Parent Domain object. +In this example, a simple Python allocator is defined. It uses MemAlloc class to +allocate aligned memory using a C style aligned_alloc. +```python +from pyverbs.pd import PD, ParentDomainInitAttr, ParentDomain, \ + ParentDomainContext +from pyverbs.device import Context +import pyverbs.mem_alloc as mem + + +def alloc_p_func(pd, context, size, alignment, resource_type): + p = mem.posix_memalign(size, alignment) + return p + + +def free_p_func(pd, context, ptr, resource_type): + mem.free(ptr) + + +ctx = Context(name='rocep0s8f0') +pd = PD(ctx) +pd_ctx = ParentDomainContext(pd, alloc_p_func, free_p_func) +pd_attr = ParentDomainInitAttr(pd=pd, pd_context=pd_ctx) +parent_domain = ParentDomain(ctx, attr=pd_attr) +``` + +##### MLX5 VAR +The following code snippet demonstrates how to allocate an mlx5dv_var then using +it for memory address mapping, then freeing the VAR. +```python +from pyverbs.providers.mlx5.mlx5dv import Mlx5VAR +from pyverbs.device import Context +import mmap + +ctx = Context(name='rocep0s8f0') +var = Mlx5VAR(ctx) +var_map = mmap.mmap(fileno=ctx.cmd_fd, length=var.length, offset=var.mmap_off) +# There is no munmap method in mmap Python module, but by closing the mmap +# instance the memory is unmapped. +var_map.close() +var.close() +``` + +##### MLX5 PP +Packet Pacing (PP) entry can be used for some device commands over the DEVX +interface. It allows a rate-limited flow configuration on SQs. +The following code snippet demonstrates how to allocate an mlx5dv_pp with rate +limit value of 5, then frees the entry. 
+```python
+from pyverbs.providers.mlx5.mlx5dv import Mlx5Context, Mlx5DVContextAttr, Mlx5PP
+import pyverbs.providers.mlx5.mlx5_enums as e
+
+# The device must be opened as DEVX context
+mlx5dv_attr = Mlx5DVContextAttr(e.MLX5DV_CONTEXT_FLAGS_DEVX)
+ctx = Mlx5Context(attr=mlx5dv_attr, name='rocep0s8f0')
+rate_limit_inbox = (5).to_bytes(length=4, byteorder='big', signed=True)
+pp = Mlx5PP(ctx, rate_limit_inbox)
+pp.close()
+```
diff --git a/Documentation/release.md b/Documentation/release.md
new file mode 100644
index 0000000..2a40ef7
--- /dev/null
+++ b/Documentation/release.md
@@ -0,0 +1,99 @@
+# Release Process
+
+The release process of the rdma-core library consists of the following stages:
+
+1. Change the library version, according to the [Overall Package Version](versioning.md) guide.
+2. Push the change above to the master branch and ensure that Travis CI reports a successful build.
+3. Create a local annotated, signed tag vX.X.X (`git tag vX.X.X -a -s`).
+4. Issue the `git release` command, which will push the tag and trigger Travis CI to
+   upload the release tar.gz file and create the release notes from the tag's annotation.
+
+## git release
+
+There are many implementations of different `git release` commands. We recommend using
+the command from [this](https://github.com/mpalmer/github-release) repository due to its simplicity.
+
+---
+Copy&Paste from relevant [README](https://github.com/mpalmer/github-release/blob/master/README.md)
+
+---
+
+This very simple gem provides a `git release` command, which will
+automatically fill out any and all "release tags" into fully-blown "Github
+Releases", complete with release notes, a heading, and all the other good
+things in life.
+
+Using this gem, you can turn the following tag annotation:
+
+    First Release
+
+    It is with much fanfare and blowing of horns that I bequeath the
+    awesomeness of `git release` upon the world.
+
+    Features in this release include:
+
+    * Ability to create a release from a tag annotation or commit message;
+    * Automatically generates an OAuth token if needed;
+    * Feeds your cat while you're hacking(*)
+
+    You should install it now! `gem install github-release`
+
+Into [this](https://github.com/mpalmer/github-release/releases/tag/v0.1.0)
+simply by running
+
+    git release
+
+### Installation
+
+Simply install the gem:
+
+    gem install github-release
+
+
+### Usage
+
+Using `git release` is very simple. Just make sure that your `origin`
+remote points to your Github repo, and then run `git release`. All tags
+that look like a "version tag" (see "Configuration", below) will be created
+as Github releases (if they don't already exist) and the message from the
+tag will be used as the release notes.
+
+The format of the release notes is quite straightforward -- the first line
+of the message associated with the commit will be used as the "name" of the
+release, with the rest of the message used as the "body" of the release.
+The body will be interpreted as Github-flavoured markdown, so if you'd like
+to get fancy, go for your life.
+
+The message associated with the "release tag" is either the tag's annotation
+message (if it is an annotated tag) or else the commit log of the commit on
+which the tag is placed. I *strongly* recommend annotated tags (but then
+again, [I'm biased...](http://theshed.hezmatt.org/git-version-bump))
+
+The first time you use `git release`, it will ask you for your Github
+username and password. This is used to request an OAuth token to talk to
+the Github API, which is then stored in your global git config. 
Hence you *shouldn't* be asked for your credentials every time you use `git release`.
+If you need to use multiple github accounts for different repos, you can
+override the `release.api-token` config parameter in your repo configuration
+(but you'll have to get your own OAuth token).
+
+
+### Configuration
+
+There are a few things you can configure to make `git release` work slightly
+differently. None of them should be required for normal, sane use.
+
+ * `release.remote` (default `origin`) -- The name of the remote which is
+   used to determine what github repository to send release notes to.
+
+ * `release.api-token` (default is runtime generated) -- The OAuth token
+   to use to authenticate access to the Github API. When you first run `git
+   release`, you'll be prompted for a username and password to use to
+   generate an initial token; if you need to override it on a per-repo
+   basis, this is the key you'll use.
+
+ * `release.tag-regex` (default `v\d+\.\d+(\.\d+)?$`) -- The regular
+   expression to filter which tags denote releases, as opposed to other tags
+   you might have decided to make. Only tags which match this regular
+   expression will be pushed up by `git release`, and only those tags will
+   be marked as releases.
diff --git a/Documentation/rxe.md b/Documentation/rxe.md
new file mode 100644
index 0000000..ea11fc4
--- /dev/null
+++ b/Documentation/rxe.md
@@ -0,0 +1,14 @@
+# Configure Soft-RoCE (RXE):
+
+Create an RXE device over a network interface (e.g. eth0):
+
+    # rdma link add rxe_eth0 type rxe netdev eth0
+
+Use the status command to display the current configuration:
+
+    # rdma link
+
+If you are using a Mellanox HCA, make sure that the mlx4_ib/mlx5_ib kernel
+module is not loaded (modprobe -rv mlx4_ib) in the soft-RoCE machine. Now you
+have an InfiniBand device called "rxe_eth0" that can be used to run any RoCE
+app.
diff --git a/Documentation/stable.md b/Documentation/stable.md
new file mode 100644
index 0000000..c12b276
--- /dev/null
+++ b/Documentation/stable.md
@@ -0,0 +1,89 @@
+# Stable Branch Release
+
+
+## General
+
+Current Maintainer: Nicolas Morey-Chaisemartin <NMoreyChaisemartin@suse.de>
+
+Upstream rdma-core is considered stable after each mainline release.
+Branched stable releases, off a mainline release, are created on an as-needed basis and are limited to bug fixes only.
+
+All bug fixes are to be backported from mainline and applied by the stable branch maintainer.
+
+Branched stable releases will append an additional release number (e.g. 15.1) and will ensure that Azure Pipelines CI reports a successful build.
+
+Regular stable releases will be generated at the same time as mainline releases.
+Additional stable releases can be generated if the need arises (e.g. when needed by distributions or OFED).
+
+## Patch Rules
+
+ * It must be obviously correct and tested.
+ * It cannot be bigger than 100 lines, with context.
+ * It must fix only one thing.
+ * It must fix a real bug that bothers people (not a "This could be a problem..." type thing).
+ * ABI must NOT be changed by the fix.
+
+## Submitting to the stable branch
+
+Submissions to the stable branch follow the same process as [kernel-stable](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/stable-kernel-rules.rst).
+
+### Option 1
+
+Patches sent to master should add the tag:
+
+    `Cc: stable@linux-rdma.org`
+
+in the sign-off area. Once the patch is merged, it will be applied to the stable tree
+without anything else needing to be done by the author or subsystem maintainer.
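+
+For example, the tail of a commit message using this tag could look like the
+following (the name and address below are hypothetical):
+
+    Cc: stable@linux-rdma.org
+    Signed-off-by: Jane Developer <jane@example.com>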
+
+If the patch should be applied to more than one release, add the version
+information as follows:
+
+    `Cc: stable@linux-rdma.org # v15.1 v14`
+
+
+### Option 2
+
+After the patch has been merged to master, send an email to
+stable@linux-rdma.org containing the subject of the patch, the commit ID,
+why you think it should be applied, and what rdma-core version you wish it to
+be applied to.
+
+### Option 3
+
+Send the patch, after verifying that it follows the above rules, to stable@linux-rdma.org.
+You must note the upstream commit ID in the changelog of your submission,
+as well as the rdma-core version you wish it to be applied to.
+
+Option 1 is strongly preferred; it is the easiest and most common.
+Option 2 and Option 3 are more useful if the patch isn't deemed worthy at the time it is applied to a public git tree (for instance, because it deserves more regression testing first).
+Option 3 is especially useful if the patch needs some special handling to apply to an older version.
+
+Note that for Option 3, if the patch deviates from the original upstream patch (for example because it had to be backported) this must be very clearly documented and justified in the patch description.
+
+## Versioning
+
+See versioning.md for setting the package version on a stable branch.
+
+
+## Creating a stable branch
+
+A stable branch should be created from a release tag of the master branch.
+The first thing to do on the new stable branch is to commit the mainline release ABI info
+so that later patches/fixes can be checked against this reference.
+
+To do that, the creator of the branch should run
+```
+./buildlib/cbuild build-images azp
+mkdir ABI
+touch ABI/.gitignore
+git add ABI/.gitignore
+git commit -m "ABI Files"
+./buildlib/cbuild pkg azp
+git add ABI/*
+git commit --amend
+```
+
+'cbuild pkg azp' will fail, as the ABI verification step fails, but it will
+produce the ABI reference files.
+
+Note that the ABI directory must NOT be committed at any point in the master branch.
diff --git a/Documentation/tag_matching.md b/Documentation/tag_matching.md
new file mode 100644
index 0000000..81d4dd8
--- /dev/null
+++ b/Documentation/tag_matching.md
@@ -0,0 +1,341 @@
+# Hardware tag matching
+
+## Introduction
+
+The MPI standard defines a set of rules, known as tag-matching, for matching
+source send operations to destination receives according to the following
+attributes:
+
+* Communicator
+* User tag - wild card may be specified by the receiver
+* Source rank - wild card may be specified by the receiver
+* Destination rank - wild card may be specified by the receiver
+
+These matching attributes are specified by all Send and Receive operations.
+Send operations from a given source to a given destination are processed in
+the order in which the Sends were posted. Receive operations are associated
+with the earliest send operation (from any source) that matches the
+attributes, in the order in which the Receives were posted. Note that Receive
+tags are not necessarily consumed in the order they are created, e.g., a later
+generated tag may be consumed if earlier tags do not satisfy the matching
+rules.
+
+When a message arrives at the receiver, MPI implementations often classify it
+as either 'expected' or 'unexpected' according to whether a Receive operation
+with a matching tag has already been posted by the application. In the
+expected case, the message may be processed immediately. 
In the unexpected +case, the message is saved in an unexpected message queue, and will be +processed when a matching Receive operation is posted. + +To bound the amount of memory to hold unexpected messages, MPI implementations +use 2 data transfer protocols. The 'eager' protocol is used for small +messages. Eager messages are sent without any prior synchronization and +processed/buffered at the receiver. Typically, with RDMA, a single RDMA-Send +operation is used to transfer the data. + +The 'rendezvous' protocol is used for large messages. Initially, only the +message tag is sent along with some meta-data. Only when the tag is matched to +a Receive operation, will the receiver initiate the corresponding data +transfer. A common RDMA implementation is to send the message tag with an +RDMA-Send, and transfer the data with an RDMA-Read issued by the receiver. +When the transfer is complete, the receiver will notify the sender that its +buffer may be freed using an RDMA-Send. + +## RDMA tag-matching offload + +Tag-matching offload satisfies the following principals: +- Tag-matching is viewed as an RDMA application, and thus does not affect the + RDMA transport in any way [(*)](#m1) +- Tag-matching processing will be split between HW and SW. + * HW will hold a bounded prefix of Receive tags +- HW will process and transfer any expected message that matches a tag held + in HW. + * In case the message uses the rendezvous protocol, HW will also initiate + the RDMA-Read data transfer and send a notification message when the + data transfer completes. +- SW will handle any message that is either unexpected or whose tag is not + held in HW. + +<a name="m1">(*)</a> +This concept can apply to additional application-specific offloads in the +future. + +Tag-matching is initially defined for RC transport. Tag-matching messages are +encapsulated in RDMA-Send messages and contain the following headers: + +``` + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + Tag Matching Header (TMH): + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Operation | reserved | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | User data (optional) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Tag | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Tag | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + Rendezvous Header (RVH): + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Virtual Address | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Virtual Address | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Remote Key | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +``` + +Tag-matching messages always contain a TMH. An RHV is added for Rendezvous +request messages. The following message formats are defined: +- Eager request: TMH | payload +- Rendezvous request: TMH | RHV | optional meta-data [(**)](#m2) +- Rendezvous response: TMH + +Note that rendezvous data transfers are standard RDMA-Reads + +<a name="m2">(**)</a> +Rendezvous request messages may also arrive unexpected; in this case, the +message is handled in SW, optionally leveraging additional meta-data passed by +the sender. + +As tag-matching messages are standard RDMA-Sends, no special HW support is +needed at the sender. 
At the receiver, we introduce a new SRQ type - a +Tag-Matching SRQ (TM-SRQ). The TM-SRQ forms the serialization point for +matching messages coming from any of the associated RC connections, and reports +all tag matching completions and events to a dedicated CQ. +2 kinds of buffers may be posted to the TM-SRQ: +- Buffers associated with tags (tagged-buffers), which are used when a match + is made by HW +- Standard SRQ buffers, which are used for unexpected messages (from HW's + perspective) +When a message is matched by HW, the payload is transferred directly to the +application buffer (both in the eager and the rendezvous case), while skipping +any TM headers. Otherwise, the entire message, including any TM headers, is +scattered to the SRQ buffer. + +Since unexpected messages are handled in SW, there exists an inherent race +between the arrival of messages from the wire and posting of new tagged +buffers. For example, consider 2 incoming messages m1 and m2 and matching +buffers b1 and b2 that are posted asynchronously. If b1 is posted after m1 +arrives but before m2, m1 would be delivered as an unexpected message while m2 +would match b1, violating the ordering rules. + +Consequently, whenever HW deems a message unexpected, tag matching must be +disabled for new tags until SW and HW synchronize. This synchronization is +achieved by reporting to HW the number of unexpected messages handled by SW +(with respect to the current posted tags). When the SW and HW are in synch, tag +matching resumes normally. + +## Tag Matching Verbs + +### Capabilities + +Tag matching capabilities are queried by ibv_query_device_ex(), and report the +following attributes: + +* **max_rndv_hdr_size** - Max size of rendezvous request header +* **max_num_tags** - Max number of tagged buffers in a TM-SRQ matching list +* **max_ops** - Max number of outstanding tag matching list operations +* **max_sge** - Max number of SGEs in a tagged buffer +* **flags** - the following flags are currently defined: + - IBV_TM_CAP_RC - Support tag matching on RC transport + + +### TM-SRQ creation + +TM-SRQs are created by the ibv_create_srq_ex() Verb, which accepts the +following new attributes: +* **srq_type** - set to **IBV_SRQT_TM** +* **comp_mask** - set the **IBV_SRQ_INIT_ATTR_TM** flag +* **tm_cap** - TM properties for this TM-SRQ; defined as follows: + +```h +struct ibv_tm_cap { + uint32_t max_num_tags; /* Matching list size */ + uint32_t max_ops; /* Number of outstanding TM operations */ +} +``` +Similarly to XRC SRQs, a TM-SRQ has a dedicated CQ. + +RC QPs are associated with the TM-SRQ just like standard SRQs. However, the +ownership of the QP's Send Queue is passed to the TM-SRQ, which uses it to +initiate rendezvous RDMA-Reads. Receive completions are reported to the +TM-SRQ's CQ. + + +### Managing TM receive buffers + +Untagged (unexpected) buffers are posted using the standard +**ibv_post_srq_recv**() Verb. 
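+
+For reference, the following is a minimal sketch of posting one such untagged
+buffer. It assumes an already-created TM-SRQ `srq` and a registered buffer
+described by `buf`, `len` and `lkey`; error handling is elided:
+
+```c
+#include <infiniband/verbs.h>
+
+/* Post a single untagged buffer to the TM-SRQ's standard receive queue */
+static void post_untagged(struct ibv_srq *srq, void *buf, uint32_t len,
+			  uint32_t lkey)
+{
+	struct ibv_sge sge = {
+		.addr = (uintptr_t)buf,
+		.length = len,
+		.lkey = lkey,
+	};
+	struct ibv_recv_wr wr = {
+		.wr_id = 1,	/* user-chosen ID reported in the completion */
+		.sg_list = &sge,
+		.num_sge = 1,
+	};
+	struct ibv_recv_wr *bad_wr;
+
+	ibv_post_srq_recv(srq, &wr, &bad_wr);
+}
+```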
+ +Tagged buffers are manipulated by a new **ibv_post_srq_ops**() Verb: + +```h +int ibv_post_srq_ops(struct ibv_srq *srq, struct ibv_ops_wr *wr, + struct ibv_ops_wr **bad_wr); +``` +```h +struct ibv_ops_wr { + uint64_t wr_id; /* User defined WR ID */ + /* Pointer to next WR in list, NULL if last WR */ + struct ibv_ops_wr *next; + enum ibv_ops_wr_opcode opcode; /* From enum ibv_ops_wr_opcode */ + int flags; /* From enum ibv_ops_flags */ + struct { + /* Number of unexpected messages + * handled by SW */ + uint32_t unexpected_cnt; + /* Input parameter for the DEL opcode + * and output parameter for the ADD opcode */ + uint32_t handle; + struct { + /* WR ID for TM_RECV */ + uint64_t recv_wr_id; + struct ibv_sge *sg_list; + int num_sge; + uint64_t tag; + uint64_t mask; + } add; + } tm; +}; +``` + +The following opcodes are defined: + +Opcode **IBV_WR_TAG_ADD** - add a tagged buffer entry to the tag matching list. +The input consists of an SGE list, a tag, a mask (matching parameters), and the +latest unexpected message count. A handle that uniquely identifies the entry is +returned upon success. + +Opcode **IBV_WR_TAG_DEL** - delete a tag entry. +The input is an entry handle returned from a previous **IBV_WR_TAG_ADD** +operation, and the latest unexpected message count. + +Note that the operation may fail if the associated tag was consumed by an +incoming message. In this case **IBV_WC_TM_ERR** status will be returned in WC. + +Opcode **IBV_WR_TAG_SYNC** - report the number of unexpected messages handled by +the SW. +The input comprises only the unexpected message count. To reduce explicit +synchronization to a minimum, all completions indicate when synchronization is +necessary by setting the **IBV_WC_TM_SYNC_REQ** flag. + +**ibv_post_srq_ops**() operations are non-signaled by default. To request an +explicit completion for a given operation, the standard **IBV_OPS_SIGNALED** +flag must be set. The number of outstanding tag-manipulation operations must +not exceed the **max_ops** capability. + +While **wr_id** identifies the tag manipulation operation itself, the +**recv_wr_id** field is used to identify the tagged buffer in receive +completions. + + +### Sending TM messages + +TM messages are sent using standard RC Send operations. A TM message comprises +a Tag-Matching Header (TMH), an optional Rendezvous Header (RVH), and +a payload. + +TMH and RVH are defined in infiniband/tm_types.h: + +```h +struct ibv_tmh { + uint8_t opcode; + uint8_t reserved[3]; + __be32 app_ctx; + __be64 tag; +}; +``` +```h +struct ibv_rvh { + __be64 va; + __be32 rkey; + __be32 len; +}; +``` + +The following opcodes are defined: + +* **IBV_TM_NO_TAG** - Send a message without a tag. +Such a message will always be treated as unexpected by the receiver TM-SRQ. +Any data following the opcode is ignored by the tag matching logic, and the +message is delivered in its entirety (including the opcode) to the standard +SRQ buffer. + +* **IBV_TM_OP_EAGER** - Send an eager tagged message. +The message consists of a TMH followed by payload. + +* **IBV_TM_OP_RNDV** - Send a tagged rendezvous request. +The message consists of a TMH, an RVH, and optional additional data (which may +be inspected by receiver SW if the message is deemed unexpected). The RVH must +refer to a registered buffer containing the rendezvous payload. The total +rendezvous message size must not exceed the **max_rndv_hdr_size** capability. 
+The Sender must consider the operation outstanding until a TM message with the +**IBV_TM_OP_FIN** opcode is received, after which the buffer may be deregistered +and freed. + +* **IBV_TM_OP_FIN** - Send a rendezvous completion indication. +The message consists of a copy of the original TMH and RVH of the rendezvous +request, apart the opcode. This message is sent after the receiver has +completed the transfer of the rendezvous payload by an RDMA-read operation. It +may be sent either by HW or SW, depending on whether the rendezvous request +was handled as expected or unexpected by the TM-SRQ. + + +### TM completion processing + +There are 2 types of TM completions: tag-manipulation and receive completions. + +Tag-manipulation operations generate the following completion opcodes: +* **IBV_WC_TM_ADD** - completion of a tag addition operation +* **IBV_WC_TM_DEL** - completion of a tag removal operation +* **IBV_WC_TM_SYNC** - completion of a synchronization operation + +These completions are complemented by the **IBV_WC_TM_SYNC_REQ** flag, which +indicates whether further HW synchronization is needed. + +TM receive completions generate the following completion codes: +* **IBV_WC_RECV** - standard SRQ completion; used for unexpected messages +* **IBV_WC_TM_NO_TAG** - completion of a message sent with the + **IBV_TM_NO_TAG** opcode. +* **IBV_WC_TM_RECV** - completion of a tag-matching operation + +The **IBV_WC_TM_RECV** completion is complemented by the following completion +flags: +- **IBV_WC_TM_MATCH** - a match was performed +- **IBV_WC_TM_DATA_VALID** - all data of the matched message has been + delivered to memory + +In single-packet eager messages, both flags are set. When larger messages or +rendezvous transfers are involved, matching and data transfer completion are +distinct events that generate 2 completion events for the same **recv_wr_id**. +While data transfer completions may be arbitrarily delayed depending on +message size, matching completion is reported immediately and is always +serialized with respect to other matches and the completion of unexpected +messages. + +In addition, **IBV_WC_TM_RECV** completions provide further information about +the matched message. This information is obtained using extended CQ processing +via the following extractor function: + +```h +static inline void ibv_wc_read_tm_info(struct ibv_cq_ex *cq, + struct ibv_wc_tm_info *tm_info); +``` +```h +struct ibv_wc_tm_info { + uint64_t tag; /* tag from TMH */ + uint32_t priv; /* opaque user data from TMH */ +}; +``` + +Finally, when a posted tagged buffer is insufficient to hold the data of a +rendezvous request, the HW completes the buffer with an +IBV_WC_TM_RNDV_INCOMPLETE status. In this case, the TMH and RVH headers are +scattered into the tagged buffer (tag-matching has still been completed!), and +message handling is resumed by SW. + diff --git a/Documentation/testing.md b/Documentation/testing.md new file mode 100644 index 0000000..54e6c35 --- /dev/null +++ b/Documentation/testing.md @@ -0,0 +1,154 @@ +# Testing in rdma-core + +rdma-core now offers an infrastructure for quick and easy additions of feature- +specific tests. + +## Design +### Resources Management +`BaseResources` class is the basic objects aggregator available. It includes a +Context and a PD. +Inheriting from it is `TrafficResources` class, which also holds a MR, CQ and +QP, making it enough to support loopback traffic testing. It exposes methods for +creation of these objects which can be overridden by inheriting classes. 
Inheriting from `TrafficResources` are currently three classes:
- `RCResources`
- `UDResources`
- `XRCResources`

The above subclasses add traffic-specific constants. For example, `UDResources`
overrides create_mr() and adds the size of the GRH header to the message size.
`RCResources` exposes a wrapper to modify the QP to RTS.

### Tests-related Classes
`unittest.TestCase` is a logical test unit in Python's unittest module.
`RDMATestCase` inherits from it and adds the option to accept parameters
(an example will follow below) or use a random set of valid parameters:
- If no device was provided, it iterates over the existing devices; for each
  port of each device, it checks which GID indexes are valid (in RoCE, only
  IPv4- and IPv6-based GIDs are used). Each <dev, port, gid> combination is
  added to an array and one entry is selected.
- If a device was provided, the same process is done for all ports of this
  device, and so on.

### Traffic Utilities
tests/utils.py offers a few wrappers for common traffic operations, further
shortening test code by providing default values. Those traffic utilities
accept an aggregation object as their first parameter and rely on that object
to have valid RDMA resources in order to function properly.
- get_[send, recv]_wr() creates a [Send, Recv]WR object with a single SGE. It
  also sets the MR content to 'c's for the client side or 's's for the server
  side (this is validated later).
- post_send() posts a single send request to the aggregation object's QP. If
  the QP is a UD QP, an address vector will be added to the send WR.
- post_recv() posts the given RecvWR <num> times, so it can be used to fill the
  RQ prior to traffic as well as during traffic.
- poll_cq() polls <num> completions from the CQ and raises an exception on a
  non-success status.
- validate() verifies that the data in the MR is as expected ('c's for the
  server, 's's for the client).
- traffic() runs <num> iterations of send/recv between two players.

## How to run rdma-core's tests
#### Developers
The tests can be executed from ./build/bin:
```
./build.sh
./build/bin/run_tests.py
```
#### Users
The tests are not a Python package; as such, they can be found under
/usr/share/doc/rdma-core-{version}/tests.
In order to run all tests:
```
python /usr/share/doc/rdma-core-<version>/tests/run_tests.py
```
#### Execution output
Output will be something like:
```
$ ./build/bin/run_tests.py
..........................................ss...............
----------------------------------------------------------------------
Ran 59 tests in 13.268s

OK (skipped=2)
```
A dot represents a passing test. 's' means a skipped test. 'E' means a test
that failed.

Tests can also be executed in verbose mode:
```
$ python3 /usr/share/doc/rdma-core-26.0/tests/run_tests.py -v
test_create_ah (test_addr.AHTest) ... ok
test_create_ah_roce (test_addr.AHTest) ... ok
test_destroy_ah (test_addr.AHTest) ... ok
test_create_comp_channel (test_cq.CCTest) ... ok
< many more lines here>
test_odp_rc_traffic (test_odp.OdpTestCase) ... skipped 'No port is up, can't run traffic'
test_odp_ud_traffic (test_odp.OdpTestCase) ... skipped 'No port is up, can't run traffic'
<more lines>

----------------------------------------------------------------------
Ran 59 tests in 12.857s

OK (skipped=2)
```
Verbose mode provides the reason for skipping the test (if one was provided by
the test developer).

### Customized Execution
tests/__init__.py defines a `_load_tests` function that returns an array with
the tests that will be executed.
The default implementation collects all test_* methods from all the classes
that inherit from `unittest.TestCase` (or `RDMATestCase`) and are located in
files under the tests directory whose names start with test_.
Users can execute a subset of the tests by adding `-k` to the run_tests.py
command. The following example executes only the test cases in files whose
names start with `test_device`, rather than every `test_*` file:

```
$ build/bin/run_tests.py -v -k test_device
test_create_dm (tests.test_device.DMTest) ... ok
test_create_dm_bad_flow (tests.test_device.DMTest) ... ok
test_destroy_dm (tests.test_device.DMTest) ... ok
test_destroy_dm_bad_flow (tests.test_device.DMTest) ... ok
test_dm_read (tests.test_device.DMTest) ... ok
test_dm_write (tests.test_device.DMTest) ... ok
test_dm_write_bad_flow (tests.test_device.DMTest) ... ok
test_dev_list (tests.test_device.DeviceTest) ... ok
test_open_dev (tests.test_device.DeviceTest) ... ok
test_query_device (tests.test_device.DeviceTest) ... ok
test_query_device_ex (tests.test_device.DeviceTest) ... ok
test_query_gid (tests.test_device.DeviceTest) ... ok
test_query_port (tests.test_device.DeviceTest) ... ok
test_query_port_bad_flow (tests.test_device.DeviceTest) ... ok

----------------------------------------------------------------------
Ran 14 tests in 0.152s

OK
```
Internally we're using 'parametrize', as it instantiates the TestCase for us.
'parametrize' can accept arguments as well (device name, IB port, GID index and
PKey index):
```
suite = unittest.TestSuite()
suite.addTest(RDMATestCase.parametrize(YourTestCase, dev_name='devname'))
```

## Writing Tests
The following section explains how to add a new test, using tests/test_odp.py
as an example. It's a simple test that runs ping-pong over a few different
traffic types.

ODP requires a capability check, so a decorator was added to tests/utils.py.
The first change for ODP execution is when registering a memory region (the
ON_DEMAND access flag needs to be set), so we do as follows:
1. Create the players by inheriting from `RCResources` (for RC traffic).
2. In the player, override create_mr() and add the decorator to it. It will run
   before the actual call to ibv_reg_mr, and if the ODP caps are off, the test
   will be skipped.
3. Create the `OdpTestCase` by inheriting from `RDMATestCase`.
4. In the test case, add a method starting with test_, to let the unittest
   infrastructure know that this is a test.
5. In the test method, create the players (which already check the ODP caps)
   and call the traffic() function, providing it the two players.
diff --git a/Documentation/udev.md b/Documentation/udev.md new file mode 100644 index 0000000..cf94a4f --- /dev/null +++ b/Documentation/udev.md @@ -0,0 +1,200 @@
# Kernel Module Loading

The RDMA subsystem relies on the kernel, udev and systemd to load modules on
demand when RDMA hardware is present. The RDMA subsystem is unique since it
does not load the optional RDMA hardware modules unless the system has the
rdma-core package installed.

This avoids enabling RDMA on systems that are not using it, for instance when
a system has a multi-protocol ethernet adapter but is only using the net stack
interface.

## Boot ordering with systemd

systemd assumes everything is hot pluggable and runs in an event driven
manner.
This creates a chain of hot plug events as each part of the system
autoloads based on earlier parts. The first step in the process is udev
loading the physical hardware driver.

This can happen in several spots along the bootup:

 - From the initrd or built into the kernel. If hardware modules are present
   in the initrd then they are loaded into the kernel before booting the
   system. This is done largely synchronously with the boot process.

 - From udev when it auto-detects PCI hardware or otherwise.
   This happens asynchronously in the boot process; systemd does not wait for
   udev to finish loading modules before it continues on.

   This path makes it very likely the system will experience an RDMA 'hot plug'
   scenario.

 - From systemd's fixed module loader systemd-modules-load.service, e.g. from
   the list in /etc/modules-load.d/. In this case the module loading happens
   synchronously within systemd, and it will hold off sysinit.target until the
   modules are loaded.

Once the hardware module is loaded it may be necessary to load a protocol
module, e.g. to enable RDMA support on an ethernet device.

This is triggered automatically by udev rules that match the master devices
and load the protocol module with udev's module loader. This happens
asynchronously to the rest of the systemd startup.

Once an RDMA device is created by the kernel, udev will cause systemd to
schedule ULP module loading services (e.g. rdma-load-modules@.service) specific
to the plugged hardware. If sysinit.target has not yet been passed then these
loaders will defer sysinit.target until they complete; otherwise this is a hot
plug event and things will load asynchronously to the boot up process.

Finally, udev will cause systemd to start RDMA-specific daemons like
srp_daemon, rdma-ndd and iwpmd. These starts are linked to the detection of
the first RDMA hardware, and the daemons internally handle hot plug events for
other hardware.

## Hot Plug compatible services

Services using RDMA need to have device-specific systemd dependencies in their
unit files, either created by hand by the admin or by using udev rules.

For instance, a service that uses /dev/infiniband/umad0 requires:

```
After=dev-infiniband-umad0.device
BindsTo=dev-infiniband-umad0.device
```

This ensures the service will not run until the required umad device appears,
and that it will be stopped if the umad device is unplugged.

This is similar to how systemd handles mounting filesystems and configuring
ethernet devices.

## Interaction with legacy non-hotplug services

Services that cannot handle hot plug must be ordered after
systemd-udev-settle.service, which will wait for udev to complete loading
modules and scheduling systemd services. This ensures that all RDMA hardware
present at boot is set up before proceeding to run the legacy service, as the
unit sketch below shows.

Admins using legacy services can also place their RDMA hardware modules
(e.g. mlx4_ib) directly in /etc/modules-load.d/ or in their initrd, which will
cause systemd to defer reaching sysinit.target until all RDMA hardware is
set up; this is usually sufficient for legacy services. This is probably the
default behavior in many configurations.
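As a sketch, a hypothetical legacy service unit (the unit name is illustrative,
not something shipped by rdma-core) would carry:

```
[Unit]
# Wait for udev to finish loading RDMA modules and scheduling services
Requires=systemd-udev-settle.service
After=systemd-udev-settle.service
```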

# Systemd Ordering

Within rdma-core we have a series of units which run in the pre `basic.target`
world to set up kernel services:

 - `iwpmd`
 - `rdma-ndd`
 - `rdma-load-modules@.service`
 - `ibacmd.socket`

These special units use DefaultDependencies=no and order before any other unit
that uses DefaultDependencies=yes. This will happen even in the case of
hotplug.

Units for normal rdma-using daemons should use DefaultDependencies=yes, and
either this pattern for 'any RDMA device':

```
[Unit]
# Order after rdma-hw.target has become active and set up the kernel services
Requires=rdma-hw.target
After=rdma-hw.target

[Install]
# Autostart when RDMA hardware is present
WantedBy=rdma-hw.target
```

Or this pattern for a specific RDMA device:

```
[Unit]
# Order after RDMA services are set up
After=rdma-hw.target
# Run only while a specific umad device is present
After=dev-infiniband-umad0.device
BindsTo=dev-infiniband-umad0.device

[Install]
# Schedule the unit to be runnable when RDMA hardware is present, but
# it will only start once the requested device actually appears.
WantedBy=rdma-hw.target
```

Note that the above explicitly references `After=rdma-hw.target` even though
all the current constituents of that target order before
`sysinit.target`. This is to provide greater flexibility in the future.

## rdma-hw.target

This target is Wanted automatically by udev as soon as any RDMA hardware is
plugged in or becomes available at boot.

This may be used to pull in RDMA management daemons dynamically when RDMA
hardware is found. Such daemons should use:

```
[Install]
WantedBy=rdma-hw.target
```

in their unit files.

`rdma-hw.target` is also a synchronization point that orders after the
low-level, pre-`sysinit.target` RDMA-related units have been started.

# Stable names

The library provides a general utility and udev rule to automatically perform
stable IB device name assignment, so users will always see names based on
topology/GUID information. Such a naming scheme has the big advantage that the
names are fully automatic and fully predictable: they stay fixed even if
hardware is added or removed (i.e. no reenumeration takes place), and broken
hardware can be replaced seamlessly.

The name is a combination of the link type (InfiniBand, RoCE, iWARP, OPA or
USNIC) and the chosen naming policy, like NAME_KERNEL, NAME_PCI, NAME_GUID,
NAME_ONBOARD or NAME_FALLBACK. Those naming policies are controlled by a udev
rule and can be overridden by placing your own rename-policy udev rules into
the /etc/udev/rules.d/ directory (an example rule is sketched below).

 * NAME_KERNEL - don't change names and rely on kernel assignment. This
   will keep RDMA names as before. Example: "mlx5_0".
 * NAME_PCI - read the PCI location and topology as a source for stable names,
   which won't change in any software event (reset, PCI probe, etc.).
   Example: "ibp0s12f4".
 * NAME_GUID - read node GUID information in a similar manner to the
   netdev MAC naming policy. Example: "rocex525400c0fe123455".
 * NAME_ONBOARD - read the firmware/BIOS-provided index numbers for on-board
   devices. Example: "ibo3".
 * NAME_FALLBACK - automatic fallback: NAME_ONBOARD->NAME_PCI->NAME_KERNEL

No doubt the new names are harder to read than the "mlx5_0" everybody is used
to, but being consistent in scripts is much more important.

There is a distinction between real devices and virtual ones like RXE or SIW.
For real devices, the naming policy is NAME_FALLBACK, while virtual devices
keep their kernel name.
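For illustration, an admin who prefers GUID-based names could override the
shipped policy with a rule along these lines (a sketch only; it assumes the
rdma_rename helper shipped with rdma-core, and should be compared against the
rule file the package actually installs before use):

```
ACTION=="add", SUBSYSTEM=="infiniband", PROGRAM="rdma_rename %k NAME_GUID"
```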

In a similar way to netdev, the NAME_GUID scheme does not participate in the
fallback mechanism and needs to be enabled explicitly by the user.

Types of names:

 * o<index> - on-board device index number
 * s<slot>[f<function>] - hotplug slot index number
 * x<GUID> - Node GUID
 * [P<domain>]p<bus>s<slot>[f<function>] - PCI geographical location

Notes:

 * All multi-function PCI devices will carry the [f<function>] number in the
   device name, including the function 0 device.
 * When using PCI geography, the PCI domain is only prepended when it is not 0.
 * SR-IOV virtual devices are named based on the name of the parent interface,
   with a suffix of "v<N>", where <N> is the virtual device number.
diff --git a/Documentation/versioning.md b/Documentation/versioning.md new file mode 100644 index 0000000..c40e2e9 --- /dev/null +++ b/Documentation/versioning.md @@ -0,0 +1,193 @@
# Overall Package Version

This version number is set in the top-level CMakeLists.txt:

```sh
set(PACKAGE_VERSION "11")
```

For upstream releases this is a single integer showing the release
ordering. We do not attempt to encode any 'ABI' information in this version.

Branched stable releases can append an additional counter, eg `11.2`.

Unofficial releases should include a distributor tag, eg '11.vendor2'.

When the PACKAGE_VERSION is changed, the packaging files should be updated:

```diff
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a2464ec5..cf237904 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,7 +44,7 @@ endif()
 set(PACKAGE_NAME "RDMA")
 
 # See Documentation/versioning.md
-set(PACKAGE_VERSION "15")
+set(PACKAGE_VERSION "16")
 # When this is changed the values in these files need changing too:
 # debian/libibverbs1.symbols
 # libibverbs/libibverbs.map
diff --git a/buildlib/centos6.spec b/buildlib/centos6.spec
index 8631a3544c9a..c537cc54e8be 100644
--- a/buildlib/centos6.spec
+++ b/buildlib/centos6.spec
@@ -1,5 +1,5 @@
 Name: rdma-core
-Version: 15.0
+Version: 16.0
 Release: 1%{?dist}
 Summary: RDMA core userspace libraries and daemons
diff --git a/debian/changelog b/debian/changelog
index 86b402f4..9ee7fe16 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,4 +1,4 @@
-rdma-core (15-1) unstable; urgency=low
+rdma-core (16-1) unstable; urgency=low
 
   * New version.
   * Adding debian/copyright.
diff --git a/redhat/rdma-core.spec b/redhat/rdma-core.spec
index cc0c3ba0..62334730 100644
--- a/redhat/rdma-core.spec
+++ b/redhat/rdma-core.spec
@@ -1,5 +1,5 @@
 Name: rdma-core
-Version: 15
+Version: 16
 Release: 1%{?dist}
 Summary: RDMA core userspace libraries and daemons
 
diff --git a/suse/rdma-core.spec b/suse/rdma-core.spec
index 76ca7286..a19f9e01 100644
--- a/suse/rdma-core.spec
+++ b/suse/rdma-core.spec
@@ -19,7 +19,7 @@
 %bcond_without systemd
 %define git_ver %{nil}
 Name: rdma-core
-Version: 15
+Version: 16
 Release: 0
 Summary: RDMA core userspace libraries and daemons
 License: GPL-2.0 or BSD-2-Clause
 
```

# Shared Library Versions

The shared libraries use the typical semantic versioning scheme, eg
*libibumad* has a version like `3.1.11`.

The version number is broken up into three fields:
- '3' is called the SONAME and is embedded into the ELF:
  ```sh
  $ readelf -ds build/lib/libibumad.so.3.1.11
  0x000000000000000e (SONAME) Library soname: [libibumad.so.3]
  ```

  We do not expect this value to ever change for our libraries.
It indicates
  the overall ABI; changing it means the library will not dynamically link
  to old programs anymore.

- '1' is called the ABI level and is used within the ELF as the last component
  of the symbol version tag. This version must be changed every time a new
  symbol is introduced. It allows the user to see what version of the ABI the
  library provides.

- '11' is the overall release number and is copied from `PACKAGE_VERSION`. This
  version increases with every package release, even if the library code did
  not change. It allows the user to see what upstream source was used to build
  the library.

This version is encoded into the filename `build/lib/libibumad.so.3.1.11` and
a symlink from `libibumad.so.3` to `build/lib/libibumad.so.3.1.11` is created.

## Shared Library Symbol Versions

Symbol versions are a linker technique that lets the library author provide
two symbols with different ABIs that have the same API name. The linker
differentiates the two cases internally. This allows the library author to
change the ABI that the API uses. This project typically does not make use of
this feature.

As a secondary feature, the symbol version is also used by package managers
like RPM to manage the ABI level. To make this work properly, the ABI level
must be correctly encoded into the symbol version.

## Adding a new symbol

First, increase the ABI level of the library. It is safe to re-use the ABI
level for multiple new functions within a single release, but once a release
is tagged the ABI level becomes *immutable*. The maintainer can provide
guidance on what ABI level to use for each series.

```diff
 rdma_library(ibumad libibumad.map
   # See Documentation/versioning.md
-  3 3.1.${PACKAGE_VERSION}
+  3 3.2.${PACKAGE_VERSION}
```

Next, add your new symbol to the symbol version file:

```diff
+IBUMAD_3.2 {
+	global:
+		umad_new_symbol;
+} IBUMAD_1.0;
```

NOTE: Once a release is made the stanzas in the map file are *immutable* and
cannot be changed. Do not add your new symbol to old stanzas.

The new symbol should appear in the ELF:

```sh
$ readelf -s build/lib/libibumad.so.3.1.11
  35: 00000000000031e0 450 FUNC GLOBAL DEFAULT 12 umad_new_symbol@@IBUMAD_3.2
```

Finally, update the `debian/libibumad3.symbols` file.

## Private symbols in libibverbs

Many symbols in libibverbs are private to rdma-core; they are marked in the
map file using the IBVERBS_PRIVATE_ prefix.

For simplicity, there is only one version of the private symbol version
stanza, and it is bumped whenever any change (add/remove/modify) is made to
any part of the private ABI. This makes it very clear if an incompatible
provider is being used with libibverbs.

Due to this, there is no reason to provide compat symbol versions for the
private ABI.
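For illustration, the private stanza in libibverbs.map has roughly this shape
(the symbol names are real private helpers, but the version number shown is
only an example of the single, periodically bumped stanza):

```
IBVERBS_PRIVATE_25 {
	global:
		ibv_cmd_alloc_pd;
		ibv_cmd_create_cq;
};
```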
+ +When the private symbol version is bumped, the packaging files should be updated: + +```diff +diff --git a/debian/control b/debian/control +index 642a715e..8def05c9 100644 +--- a/debian/control ++++ b/debian/control +@@ -138,7 +138,7 @@ Section: libs + Pre-Depends: ${misc:Pre-Depends} + Depends: adduser, ${misc:Depends}, ${shlibs:Depends} + Recommends: ibverbs-providers +-Breaks: ibverbs-providers (<< 16~) ++Breaks: ibverbs-providers (<< 17~) + Description: Library for direct userspace use of RDMA (InfiniBand/iWARP) + libibverbs is a library that allows userspace processes to use RDMA + "verbs" as described in the InfiniBand Architecture Specification and +``` + +### Use of private symbols between component packages + +A distribution packaging system still must have the correct dependencies +between libraries within rdma-core that may use these private symbols. + +For this reason the private symbols can only be used by provider libraries and +the distribution must ensure that a matched set of provider libraries and +libibverbs are installed. diff --git a/MAINTAINERS b/MAINTAINERS new file mode 100644 index 0000000..948c3a7 --- /dev/null +++ b/MAINTAINERS @@ -0,0 +1,184 @@ + List of maintainers + +Generally patches should be submitted to the main development mailing list: + +linux-rdma@vger.kernel.org + +Descriptions of section entries: + F: Files and directories with wildcard patterns. + A trailing slash includes all files and subdirectory files. + F: providers/mlx4/ all files in and below providers/mlx4/ + F: providers/* all files in providers, but not below + F: */net/* all files in "any top level directory"/net + One pattern per line. Multiple F: lines acceptable. + H: Historical authors + L: Mailing list that is relevant to this area + M: Designated reviewer: FullName <address@domain> + These reviewers should be CCed on patches. + S: Status, one of the following: + Supported: Someone is actually paid to look after this. + Maintained: Someone actually looks after it. + Odd Fixes: It has a maintainer but they don't have time to do + much other than throw the odd patch in. See below.. + Orphan: No current maintainer [but maybe you could take the + role as you write your new code]. + Obsolete: Old code. Something tagged obsolete generally means + it has been replaced by a better system and you + should be using that. 

 -----------------------------------

* OVERALL PACKAGE
M: Doug Ledford <dledford@redhat.com>
M: Leon Romanovsky <leon@kernel.org>
M: Jason Gunthorpe <jgg@mellanox.com>
S: Supported

BUILD SYSTEM
M: Jason Gunthorpe <jgg@mellanox.com>
S: Supported
F: */CMakeLists.txt
F: */lib*.map
F: buildlib/

DEBIAN PACKAGING
M: Benjamin Drung <benjamin.drung@cloud.ionos.com>
S: Supported
F: debian/

BNXT_RE USERSPACE PROVIDER (for bnxt_re.ko)
M: Devesh Sharma <Devesh.sharma@broadcom.com>
S: Supported
F: providers/bnxt_re/

CXGB4 USERSPACE PROVIDER (for iw_cxgb4.ko)
M: Steve Wise <swise@opengridcomputing.com>
S: Supported
F: providers/cxgb4/

EFA USERSPACE PROVIDER (for efa.ko)
M: Gal Pressman <galpress@amazon.com>
S: Supported
F: providers/efa/

HFI1 USERSPACE PROVIDER (for hfi1.ko)
M: Mike Marciniszyn <mike.marciniszyn@intel.com>
M: Dennis Dalessandro <dennis.dalessandro@intel.com>
S: Supported
L: intel-opa@lists.01.org (moderated for non-subscribers)
F: providers/hfi1verbs/

HNS USERSPACE PROVIDER (for hns-roce.ko)
M: Lijun Ou <oulijun@huawei.com>
M: Wei Hu(Xavier) <xavier.huwei@huawei.com>
S: Supported
F: providers/hns/

I40IW USERSPACE PROVIDER (for i40iw.ko)
M: Tatyana Nikolova <Tatyana.E.Nikolova@intel.com>
S: Supported
F: providers/i40iw/

RDMA Communication Manager Assistant (for librdmacm.so)
M: Haakon Bugge <haakon.bugge@oracle.com>
M: Mark Haywood <mark.haywood@oracle.com>
S: Supported
F: ibacm/*

IPATH/QIB USERSPACE PROVIDER (for ib_qib.ko)
M: Mike Marciniszyn <mike.marciniszyn@intel.com>
M: Dennis Dalessandro <dennis.dalessandro@intel.com>
L: infinipath@intel.com
S: Supported
F: providers/ipathverbs/

IWARP PORT MAPPER DAEMON (for iwarp kernel providers)
M: Tatyana Nikolova <Tatyana.E.Nikolova@intel.com>
M: Steve Wise <swise@opengridcomputing.com>
H: Robert Sharp <robert.o.sharp@intel.com>
S: Supported
F: iwpmd/

LIBIBUMAD USERSPACE LIBRARY FOR SMP AND GMP MAD PROCESSING (/dev/infiniband/umadX)
M: Daniel Klein <danielk@mellanox.com>
H: Hal Rosenstock <hal@dev.mellanox.co.il>
H: Sasha Khapyorsky <sashak@voltaire.com>
H: Shahar Frank <shahar@voltaire.com>
S: Supported
F: libibumad/

LIBIBVERBS USERSPACE LIBRARY FOR RDMA VERBS (/dev/infiniband/uverbsX)
M: Doug Ledford <dledford@redhat.com>
M: Yishai Hadas <yishaih@dev.mellanox.co.il>
H: Michael S. Tsirkin <mst@mellanox.co.il>
H: Sean Hefty <sean.hefty@intel.com>
H: Dotan Barak <dotanba@gmail.com>
H: Roland Dreier <roland@topspin.com>
S: Supported
F: libibverbs/

LIBRDMACM USERSPACE LIBRARY FOR RDMA CONNECTION MANAGEMENT (/dev/infiniband/rdma_cm)
M: Sean Hefty <sean.hefty@intel.com>
S: Supported
F: librdmacm/

MLX4 USERSPACE PROVIDER (for mlx4_ib.ko)
M: Yishai Hadas <yishaih@mellanox.com>
H: Roland Dreier <rolandd@cisco.com>
S: Supported
F: providers/mlx4/

MLX5 USERSPACE PROVIDER (for mlx5_ib.ko)
M: Yishai Hadas <yishaih@mellanox.com>
H: Eli Cohen <eli@mellanox.com>
S: Supported
F: providers/mlx5/

MTHCA USERSPACE PROVIDER (for ib_mthca.ko)
M: Vladimir Sokolovsky <vlad@mellanox.com>
H: Michael S.
Tsirkin <mst@mellanox.co.il>
H: Roland Dreier <roland@topspin.com>
S: Supported
F: providers/mthca/

OCRDMA USERSPACE PROVIDER (for ocrdma.ko)
M: Devesh Sharma <Devesh.sharma@broadcom.com>
S: Supported
F: providers/ocrdma/

QEDR USERSPACE PROVIDER (for qedr.ko)
M: Michal Kalderon <michal.kalderon@marvell.com>
M: Ariel Elior <ariel.elior@marvell.com>
S: Supported
F: providers/qedr/

RXE SOFT ROCE USERSPACE PROVIDER (for rdma_rxe.ko)
M: Moni Shoua <monis@mellanox.com>
S: Supported
F: providers/rxe/

SIW SOFT IWARP USERSPACE PROVIDER (for siw.ko)
M: Bernard Metzler <bmt@zurich.ibm.com>
S: Supported
F: providers/siw/

SRP DAEMON (for ib_srp.ko)
M: Bart Van Assche <bvanassche@acm.org>
S: Supported
F: srp_daemon/

SUSE PACKAGING
M: Nicolas Morey-Chaisemartin <nmoreychaisemartin@suse.de>
S: Supported
F: suse/

VMWARE PVRDMA USERSPACE PROVIDER (for vmw_pvrdma.ko)
M: Adit Ranadive <aditr@vmware.com>
L: pv-drivers@vmware.com
S: Supported
F: providers/vmw_pvrdma/

PYVERBS
M: Edward Srouji <edwards@mellanox.com>
S: Supported
F: pyverbs/
diff --git a/README.md b/README.md new file mode 100644 index 0000000..b649c6f --- /dev/null +++ b/README.md @@ -0,0 +1,155 @@
[](https://dev.azure.com/ucfconsort/rdma-core/_build/latest?definitionId=2&branchName=master)

# RDMA Core Userspace Libraries and Daemons

These are the userspace components for the Linux Kernel's drivers/infiniband
subsystem. Specifically this contains the userspace libraries for the
following device nodes:

 - /dev/infiniband/uverbsX (libibverbs)
 - /dev/infiniband/rdma_cm (librdmacm)
 - /dev/infiniband/umadX (libibumad)

The userspace components of the libibverbs RDMA kernel drivers are included
under the providers/ directory. Support for the following Kernel RDMA drivers
is included:

 - efa.ko
 - iw_cxgb4.ko
 - hfi1.ko
 - hns-roce.ko
 - i40iw.ko
 - ib_qib.ko
 - mlx4_ib.ko
 - mlx5_ib.ko
 - ib_mthca.ko
 - ocrdma.ko
 - qedr.ko
 - rdma_rxe.ko
 - siw.ko
 - vmw_pvrdma.ko

Additional service daemons are provided for:
 - srp_daemon (ib_srp.ko)
 - iwpmd (for iwarp kernel providers)
 - ibacm (the InfiniBand communication management assistant)

# Building

This project uses a cmake based build system. Quick start:

```sh
$ bash build.sh
```

*build/bin* will contain the sample programs and *build/lib* will contain the
shared libraries. The build is configured to run all the programs 'in-place'
and cannot be installed.

NOTE: It is not currently easy to run from the build directory; the plugins
only load from the system path.

### Debian Derived

```sh
$ apt-get install build-essential cmake gcc libudev-dev libnl-3-dev libnl-route-3-dev ninja-build pkg-config valgrind python3-dev cython3 python3-docutils pandoc
```

### Fedora

```sh
$ dnf install cmake gcc libnl3-devel libudev-devel pkgconfig valgrind-devel ninja-build python3-devel python3-Cython python3-docutils pandoc
```

NOTE: Fedora uses the name 'ninja-build' for the 'ninja' command.

### openSUSE

```sh
$ zypper install cmake gcc libnl3-devel libudev-devel ninja pkg-config valgrind-devel python3-devel python3-Cython python3-docutils pandoc
```

## Building on CentOS 6/7, Amazon Linux 1/2

Install required packages:

```sh
$ yum install cmake gcc libnl3-devel libudev-devel make pkgconfig valgrind-devel
```

Developers on CentOS 7 or Amazon Linux 2 are encouraged to install more modern
tooling for the best experience.

CentOS 7:

```sh
$ yum install epel-release
$ yum install cmake3 ninja-build pandoc
```

Amazon Linux 2:

```sh
$ amazon-linux-extras install epel
$ yum install cmake3 ninja-build pandoc
```

NOTE: EPEL uses the name 'ninja-build' for the 'ninja' command, and 'cmake3'
for the 'cmake' command.

# Usage

To set up software RDMA on an existing interface with either of the available
drivers, use the following commands, substituting `<DRIVER>` with the name of
the driver of your choice (`rdma_rxe` or `siw`) and `<TYPE>` with the type
corresponding to the driver (`rxe` or `siw`).

```
# modprobe <DRIVER>
# rdma link add <NAME> type <TYPE> netdev <DEVICE>
```

Please note that a sufficiently recent version of `iproute2` is required for
the command above to work.

You can use either `ibv_devices` or `rdma link` to verify that the device was
successfully added.

# Reporting bugs

Bugs should be reported to the <linux-rdma@vger.kernel.org> mailing list.
In your bug report, please include:

 * Information about your system:
   - Linux distribution and version
   - Linux kernel and version
   - InfiniBand hardware and firmware version
   - ... any other relevant information

 * How to reproduce the bug.

 * If the bug is a crash, the exact output printed out when the crash
   occurred, including any kernel messages produced.

# Submitting patches

Patches should also be submitted to the <linux-rdma@vger.kernel.org>
mailing list. Please use unified diff form (the -u option to GNU diff),
and include a good description of what your patch does and why it should
be applied. If your patch fixes a bug, please make sure to describe the
bug and how your fix works.

Make sure that your contribution can be licensed under the same
license as the original code you are patching, and that you have all
necessary permissions to release your work.

## Azure Pipelines CI

Submitted patches must pass the Azure Pipelines CI automatic builds without
warnings. A build similar to AZP can be run locally using docker and the
'buildlib/cbuild' script.

```sh
$ buildlib/cbuild build-images azp
$ buildlib/cbuild pkg azp
```
diff --git a/build.sh b/build.sh new file mode 100755 index 0000000..a6b1e8f --- /dev/null +++ b/build.sh @@ -0,0 +1,31 @@
#!/bin/bash
set -e

SRCDIR=`dirname $0`
BUILDDIR="$SRCDIR/build"

mkdir -p "$BUILDDIR"

if hash cmake3 2>/dev/null; then
    # CentOS users are encouraged to install cmake3 from EPEL
    CMAKE=cmake3
else
    CMAKE=cmake
fi

if hash ninja-build 2>/dev/null; then
    # Fedora uses this name
    NINJA=ninja-build
elif hash ninja 2>/dev/null; then
    NINJA=ninja
fi

cd "$BUILDDIR"

if [ "x$NINJA" == "x" ]; then
    $CMAKE -DIN_PLACE=1 ${EXTRA_CMAKE_FLAGS:-} ..
    make
else
    $CMAKE -DIN_PLACE=1 -GNinja ${EXTRA_CMAKE_FLAGS:-} ..
    $NINJA
fi
diff --git a/buildlib/FindLDSymVer.cmake b/buildlib/FindLDSymVer.cmake new file mode 100644 index 0000000..48238f2 --- /dev/null +++ b/buildlib/FindLDSymVer.cmake @@ -0,0 +1,52 @@
# COPYRIGHT (c) 2016 Obsidian Research Corporation.
# Licensed under BSD (MIT variant) or GPLv2. See COPYING.
# find_package helper to detect symbol version support in the compiler and
# linker.
If supported then LDSYMVER_MODE will be set to GNU + +# Basic sample GNU style map file +file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/test.map" " +IBVERBS_1.0 { + global: + ibv_get_device_list; + local: *; +}; + +IBVERBS_1.1 { + global: + ibv_get_device_list; +} IBVERBS_1.0; +") + +# See RDMA_CHECK_C_LINKER_FLAG +set(SAFE_CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") +set(SAFE_CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") +if (POLICY CMP0056) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--version-script=${CMAKE_CURRENT_BINARY_DIR}/test.map") +else() + set(CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES} -Wl,--version-script=${CMAKE_CURRENT_BINARY_DIR}/test.map") +endif() + +# And matching source, this also checks that .symver asm works +check_c_source_compiles(" +void ibv_get_device_list_1(void); +void ibv_get_device_list_1(void){} +asm(\".symver ibv_get_device_list_1, ibv_get_device_list@IBVERBS_1.1\"); +void ibv_get_device_list_0(void); +void ibv_get_device_list_0(void){} +asm(\".symver ibv_get_device_list_0, ibv_get_device_list@@IBVERBS_1.0\"); + +int main(int argc,const char *argv[]){return 0;}" _LDSYMVER_SUCCESS) + +file(REMOVE "${CMAKE_CURRENT_BINARY_DIR}/test.map") +set(CMAKE_EXE_LINKER_FLAGS "${SAFE_CMAKE_EXE_LINKER_FLAGS}") +set(CMAKE_REQUIRED_LIBRARIES "${SAFE_CMAKE_REQUIRED_LIBRARIES}") + +if (_LDSYMVER_SUCCESS) + set(LDSYMVER_MODE "GNU" CACHE INTERNAL "How to set symbol versions on shared libraries") +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + LDSymVer + REQUIRED_VARS LDSYMVER_MODE + ) diff --git a/buildlib/FindSystemd.cmake b/buildlib/FindSystemd.cmake new file mode 100644 index 0000000..fbced40 --- /dev/null +++ b/buildlib/FindSystemd.cmake @@ -0,0 +1,30 @@ +# COPYRIGHT (c) 2015 Obsidian Research Corporation. +# Licensed under BSD (MIT variant) or GPLv2. See COPYING. + +find_path(LIBSYSTEMD_INCLUDE_DIRS "systemd/sd-journal.h") + +if (LIBSYSTEMD_INCLUDE_DIRS) + set(SYSTEMD_INCLUDE_DIRS ${LIBSYSTEMD_INCLUDE_DIRS}) + find_library(LIBSYSTEMD_LIBRARY NAMES systemd libsystemd) + # Older systemd uses a split library + if (NOT LIBSYSTEMD_LIBRARY) + find_library(LIBSYSTEMD_JOURNAL_LIBRARY NAMES systemd-journal libsystemd-journal) + find_library(LIBSYSTEMD_ID128_LIBRARY NAMES systemd-id128 libsystemd-id128) + find_library(LIBSYSTEMD_DAEMON_LIBRARY NAMES systemd-daemon libsystemd-daemon) + + if (LIBSYSTEMD_JOURNAL_LIBRARY AND LIBSYSTEMD_ID128_LIBRARY AND LIBSYSTEMD_DAEMON_LIBRARY) + set(SYSTEMD_LIBRARIES + ${LIBSYSTEMD_JOURNAL_LIBRARY} + ${LIBSYSTEMD_ID128_LIBRARY} + ${LIBSYSTEMD_DAEMON_LIBRARY}) + endif() + else() + set(SYSTEMD_LIBRARIES ${LIBSYSTEMD_LIBRARY}) + endif() + set(SYSTEMD_INCLUDE_DIRS) +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(Systemd REQUIRED_VARS SYSTEMD_LIBRARIES LIBSYSTEMD_INCLUDE_DIRS) + +mark_as_advanced(LIBSYSTEMD_LIBRARY LIBSYSTEMD_JOURNAL_LIBRARY LIBSYSTEMD_ID128_LIBRARY LIBSYSTEMD_DAEMON_LIBRARY) diff --git a/buildlib/FindUDev.cmake b/buildlib/FindUDev.cmake new file mode 100644 index 0000000..3a26943 --- /dev/null +++ b/buildlib/FindUDev.cmake @@ -0,0 +1,11 @@ +# COPYRIGHT (c) 2016 Obsidian Research Corporation. +# Licensed under BSD (MIT variant) or GPLv2. See COPYING. 
+ +find_library(LIBUDEV_LIBRARY NAMES udev libudev) + +set(UDEV_LIBRARIES ${LIBUDEV_LIBRARY}) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(UDev REQUIRED_VARS LIBUDEV_LIBRARY) + +mark_as_advanced(LIBUDEV_LIBRARY) diff --git a/buildlib/Findcython.cmake b/buildlib/Findcython.cmake new file mode 100644 index 0000000..ac1610c --- /dev/null +++ b/buildlib/Findcython.cmake @@ -0,0 +1,37 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2018, Mellanox Technologies. All rights reserved. See COPYING file + +execute_process(COMMAND "${PYTHON_EXECUTABLE}" -c + "from Cython.Compiler.Main import main; import Cython; print(Cython.__version__);" + OUTPUT_VARIABLE _VERSION + RESULT_VARIABLE _VERSION_RESULT + ERROR_QUIET) + +if(NOT _VERSION_RESULT) + # We make our own cython script because it is very hard to figure out which + # cython exectuable wrapper is appropriately matched to the python + # interpreter we want to use. Cython must use the matching version of python + # or things will go wrong. + string(STRIP "${_VERSION}" CYTHON_VERSION_STRING) + set(CYTHON_EXECUTABLE "${BUILD_PYTHON}/cython") + file(WRITE "${CYTHON_EXECUTABLE}" "#!${PYTHON_EXECUTABLE} +from Cython.Compiler.Main import main +main(command_line = 1)") + execute_process(COMMAND "chmod" "a+x" "${CYTHON_EXECUTABLE}") + + # Dockers with older Cython versions fail to build pyverbs. Until we get to + # the bottom of this, disable pyverbs for older Cython versions. + if (CYTHON_VERSION_STRING VERSION_LESS "0.25") + message("Cython version < 0.25, disabling") + unset(CYTHON_EXECUTABLE) + endif() + +endif() +unset(_VERSION_RESULT) +unset(_VERSION) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(cython + REQUIRED_VARS CYTHON_EXECUTABLE CYTHON_VERSION_STRING + VERSION_VAR CYTHON_VERSION_STRING) +mark_as_advanced(CYTHON_EXECUTABLE) diff --git a/buildlib/Findpandoc.cmake b/buildlib/Findpandoc.cmake new file mode 100644 index 0000000..ca1694a --- /dev/null +++ b/buildlib/Findpandoc.cmake @@ -0,0 +1,21 @@ +# COPYRIGHT (c) 2017 Mellanox Technologies Ltd +# Licensed under BSD (MIT variant) or GPLv2. See COPYING. +find_program(PANDOC_EXECUTABLE NAMES pandoc) + +if(PANDOC_EXECUTABLE) + execute_process(COMMAND "${PANDOC_EXECUTABLE}" -v + OUTPUT_VARIABLE _VERSION + RESULT_VARIABLE _VERSION_RESULT + ERROR_QUIET) + + if(NOT _VERSION_RESULT) + string(REGEX REPLACE "^pandoc ([^\n]+)\n.*" "\\1" PANDOC_VERSION_STRING "${_VERSION}") + endif() + unset(_VERSION_RESULT) + unset(_VERSION) +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(pandoc REQUIRED_VARS PANDOC_EXECUTABLE PANDOC_VERSION_STRING VERSION_VAR PANDOC_VERSION_STRING) + +mark_as_advanced(PANDOC_EXECUTABLE) diff --git a/buildlib/Findrst2man.cmake b/buildlib/Findrst2man.cmake new file mode 100644 index 0000000..a723660 --- /dev/null +++ b/buildlib/Findrst2man.cmake @@ -0,0 +1,21 @@ +# COPYRIGHT (c) 2019 Mellanox Technologies Ltd +# Licensed under BSD (MIT variant) or GPLv2. See COPYING. 
+find_program(RST2MAN_EXECUTABLE NAMES rst2man) + +if(RST2MAN_EXECUTABLE) + execute_process(COMMAND "${RST2MAN_EXECUTABLE}" --version + OUTPUT_VARIABLE _VERSION + RESULT_VARIABLE _VERSION_RESULT + ERROR_QUIET) + + if(NOT _VERSION_RESULT) + string(REGEX REPLACE "^rst2man \\(Docutils ([^,]+), .*" "\\1" RST2MAN_VERSION_STRING "${_VERSION}") + endif() + unset(_VERSION_RESULT) + unset(_VERSION) +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(rst2man REQUIRED_VARS RST2MAN_EXECUTABLE RST2MAN_VERSION_STRING VERSION_VAR RST2MAN_VERSION_STRING) + +mark_as_advanced(RST2MAN_EXECUTABLE) diff --git a/buildlib/RDMA_BuildType.cmake b/buildlib/RDMA_BuildType.cmake new file mode 100644 index 0000000..17206f5 --- /dev/null +++ b/buildlib/RDMA_BuildType.cmake @@ -0,0 +1,42 @@ +# COPYRIGHT (c) 2015 Obsidian Research Corporation. +# Licensed under BSD (MIT variant) or GPLv2. See COPYING. + +function(RDMA_BuildType) + set(build_types Debug Release RelWithDebInfo MinSizeRel) + + # Set the default build type to RelWithDebInfo. Since RDMA is typically used + # in performance contexts it doesn't make much sense to have the default build + # turn off the optimizer. + if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING + "Options are ${build_types}" + FORCE + ) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS ${build_types}) + endif() + + # Release should be used by packagers, it is the same as the default RelWithDebInfo, + # this means it uses -O2 and -DNDEBUG (not -O3) + foreach (language CXX C) + set(VAR_TO_MODIFY "CMAKE_${language}_FLAGS_RELEASE") + if ("${${VAR_TO_MODIFY}}" STREQUAL "${${VAR_TO_MODIFY}_INIT}") + set(${VAR_TO_MODIFY} "${CMAKE_${language}_FLAGS_RELWITHDEBINFO_INIT}" + CACHE STRING "Default flags for Release configuration" FORCE) + endif() + endforeach() + + # RelWithDebInfo should be used by developers, it is the same as Release but + # with the -DNDEBUG removed + foreach (language CXX C) + set(VAR_TO_MODIFY "CMAKE_${language}_FLAGS_RELWITHDEBINFO") + if (${${VAR_TO_MODIFY}} STREQUAL ${${VAR_TO_MODIFY}_INIT}) + string(REGEX REPLACE "(^| )[/-]D *NDEBUG($| )" + " " + replacement + "${${VAR_TO_MODIFY}}" + ) + set(${VAR_TO_MODIFY} "${replacement}" + CACHE STRING "Default flags for RelWithDebInfo configuration" FORCE) + endif() + endforeach() +endfunction() diff --git a/buildlib/RDMA_DoFixup.cmake b/buildlib/RDMA_DoFixup.cmake new file mode 100644 index 0000000..cd7d3b2 --- /dev/null +++ b/buildlib/RDMA_DoFixup.cmake @@ -0,0 +1,38 @@ +# COPYRIGHT (c) 2016 Obsidian Research Corporation. +# Licensed under BSD (MIT variant) or GPLv2. See COPYING. + +# Execute a header fixup based on NOT_NEEDED for HEADER + +# The buildlib includes alternate header file shims for several scenarios, if +# the build system detects a feature is present then it should call RDMA_DoFixup +# with the test as true. If false then the shim header will be installed. + +# Typically the shim header will replace a missing header with stubs, or it +# will augment an existing header with include_next. +function(RDMA_DoFixup not_needed header) + cmake_parse_arguments(ARGS "NO_SHIM" "" "" ${ARGN}) + string(REPLACE / - header-bl ${header}) + + if (NOT EXISTS "${BUILDLIB}/fixup-include/${header-bl}") + # NO_SHIM lets cmake succeed if the header exists in the system but no + # shim is provided, but this will always fail if the shim is needed but + # does not exist. 
+ if (NOT ARGS_NO_SHIM OR NOT "${not_needed}") + message(FATAL_ERROR "Fixup header ${BUILDLIB}/fixup-include/${header-bl} is not present") + endif() + endif() + + set(DEST "${BUILD_INCLUDE}/${header}") + if (NOT "${not_needed}") + if(CMAKE_VERSION VERSION_LESS "2.8.12") + get_filename_component(DIR ${DEST} PATH) + else() + get_filename_component(DIR ${DEST} DIRECTORY) + endif() + file(MAKE_DIRECTORY "${DIR}") + + rdma_create_symlink("${BUILDLIB}/fixup-include/${header-bl}" "${DEST}") + else() + file(REMOVE ${DEST}) + endif() +endfunction() diff --git a/buildlib/RDMA_EnableCStd.cmake b/buildlib/RDMA_EnableCStd.cmake new file mode 100644 index 0000000..16dfe55 --- /dev/null +++ b/buildlib/RDMA_EnableCStd.cmake @@ -0,0 +1,132 @@ +# COPYRIGHT (c) 2016 Obsidian Research Corporation. +# Licensed under BSD (MIT variant) or GPLv2. See COPYING. + +# cmake does not have way to do this even slightly sanely until CMP0056 +function(RDMA_CHECK_C_LINKER_FLAG FLAG CACHE_VAR) + set(SAFE_CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") + set(SAFE_CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") + + if (POLICY CMP0056) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${FLAG}") + else() + set(CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES} ${FLAG}") + endif() + + CHECK_C_COMPILER_FLAG("" ${CACHE_VAR}) + + set(CMAKE_EXE_LINKER_FLAGS "${SAFE_CMAKE_EXE_LINKER_FLAGS}") + set(CMAKE_REQUIRED_LIBRARIES "${SAFE_CMAKE_REQUIRED_LIBRARIES}") +endfunction() + +# Test if the CC compiler supports the linker flag and if so add it to TO_VAR +function(RDMA_AddOptLDFlag TO_VAR CACHE_VAR FLAG) + RDMA_CHECK_C_LINKER_FLAG("${FLAG}" ${CACHE_VAR}) + if (${CACHE_VAR}) + SET(${TO_VAR} "${${TO_VAR}} ${FLAG}" PARENT_SCOPE) + endif() +endfunction() + +# Test if the CC compiler supports the flag and if so add it to TO_VAR +function(RDMA_AddOptCFlag TO_VAR CACHE_VAR FLAG) + CHECK_C_COMPILER_FLAG("${FLAG}" ${CACHE_VAR}) + if (${CACHE_VAR}) + SET(${TO_VAR} "${${TO_VAR}} ${FLAG}" PARENT_SCOPE) + endif() +endfunction() + +# Enable the minimum required gnu11 standard in the compiler +# This was introduced in GCC 4.7 +function(RDMA_EnableCStd) + if (HAVE_SPARSE) + # Sparse doesn't support gnu11, but doesn't fail if the option is present, + # force gnu99 instead. 
+ SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=gnu99" PARENT_SCOPE) + return() + endif() + + if (CMAKE_VERSION VERSION_LESS "3.1") + # Check for support of the usual flag + CHECK_C_COMPILER_FLAG("-std=gnu11" SUPPORTS_GNU11) + if (SUPPORTS_GNU11) + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=gnu11" PARENT_SCOPE) + else() + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=gnu99" PARENT_SCOPE) + endif() + else() + # Newer cmake can do this internally + set(CMAKE_C_STANDARD 11 PARENT_SCOPE) + endif() +endfunction() + +function(RDMA_Check_Aliasing TO_VAR) + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2") + CHECK_C_SOURCE_COMPILES(" +struct in6_addr {unsigned int u6_addr32[4];}; +struct iphdr {unsigned int daddr;}; +union ibv_gid {unsigned char raw[16];}; + +static void map_ipv4_addr_to_ipv6(struct in6_addr *ipv6) {ipv6->u6_addr32[0] = 0;} +static int set_ah_attr_by_ipv4(struct iphdr *ip4h) +{ + union ibv_gid sgid = {}; + map_ipv4_addr_to_ipv6((struct in6_addr *)&sgid); + return 0; +} + +int main(int argc, char *argv[]) +{ + struct in6_addr a; + struct iphdr h = {}; + map_ipv4_addr_to_ipv6(&a); + return set_ah_attr_by_ipv4(&h); +}" + HAVE_WORKING_STRICT_ALIASING + FAIL_REGEX "warning") + + set(${TO_VAR} "${HAVE_WORKING_STRICT_ALIASING}" PARENT_SCOPE) +endfunction() + +function(RDMA_Check_SSE TO_VAR) + set(SSE_CHECK_PROGRAM " +#if defined(__i386__) +#include <string.h> +#include <xmmintrin.h> +int __attribute__((target(\"sse\"))) main(int argc, char *argv[]) +{ + __m128 tmp = {}; + + tmp = _mm_loadl_pi(tmp, (__m64 *)&main); + _mm_storel_pi((__m64 *)&main, tmp); + return memchr(&tmp, 0, sizeof(tmp)) == &tmp; +} +#else +int main(int argc, char *argv[]) +{ + return 0; +} +#endif +") + + CHECK_C_SOURCE_COMPILES( + "${SSE_CHECK_PROGRAM}" + HAVE_TARGET_SSE + FAIL_REGEX "warning") + + if(NOT HAVE_TARGET_SSE) + # Older compiler, we can work around this by adding -msse instead of + # relying on the function attribute. + set(CMAKE_REQUIRED_FLAGS "-msse") + CHECK_C_SOURCE_COMPILES( + "${SSE_CHECK_PROGRAM}" + NEED_MSSE_FLAG + FAIL_REGEX "warning") + set(CMAKE_REQUIRED_FLAGS) + + if(NEED_MSSE_FLAG) + set(SSE_FLAGS "-msse" PARENT_SCOPE) + else() + message(FATAL_ERROR "Can not figure out how to turn on sse instructions for i386") + endif() + endif() + set(${TO_VAR} "${HAVE_TARGET_SSE}" PARENT_SCOPE) +endFunction() diff --git a/buildlib/RDMA_Sparse.cmake b/buildlib/RDMA_Sparse.cmake new file mode 100644 index 0000000..72581fe --- /dev/null +++ b/buildlib/RDMA_Sparse.cmake @@ -0,0 +1,35 @@ +# COPYRIGHT (c) 2017 Obsidian Research Corporation. +# Licensed under BSD (MIT variant) or GPLv2. See COPYING. + +function(RDMA_CheckSparse) + # Sparse defines __CHECKER__, but only for the 'sparse pass', which has no + # way to fail the compiler. + CHECK_C_SOURCE_COMPILES(" +#if __CHECKER__ +#warning \"SPARSE DETECTED\" +#endif +int main(int argc,const char *argv[]) {return 0;} +" + HAVE_NO_SPARSE + FAIL_REGEX "SPARSE DETECTED") + + if (HAVE_NO_SPARSE) + set(HAVE_SPARSE FALSE PARENT_SCOPE) + else() + set(HAVE_SPARSE TRUE PARENT_SCOPE) + + # Replace various glibc headers with our own versions that have embedded sparse annotations. + execute_process(COMMAND "${PYTHON_EXECUTABLE}" "${BUILDLIB}/gen-sparse.py" + "--out" "${BUILD_INCLUDE}/" + "--src" "${CMAKE_SOURCE_DIR}/" + "--cc" "${CMAKE_C_COMPILER}" + RESULT_VARIABLE retcode) + if(NOT "${retcode}" STREQUAL "0") + message(FATAL_ERROR "glibc header file patching for sparse failed. 
Review include/*.rej and fix the rejects, then do " + "${BUILDLIB}/gen-sparse.py -out ${BUILD_INCLUDE}/ --src ${CMAKE_SOURCE_DIR}/ --save") + endif() + + # Enable endian analysis in sparse + add_definitions("-D__CHECK_ENDIAN__") + endif() +endfunction() diff --git a/buildlib/azp-checkpatch b/buildlib/azp-checkpatch new file mode 100755 index 0000000..d7149a5 --- /dev/null +++ b/buildlib/azp-checkpatch @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +import subprocess +import urllib.request +import os +import re +import tempfile +import collections +import sys + +base = os.environ["SYSTEM_PULLREQUEST_TARGETBRANCH"] +if not re.match("^[0-9a-fA-F]{40}$", base): + base = "refs/remotes/origin/" + base + +with tempfile.TemporaryDirectory() as dfn: + patches = subprocess.check_output( + [ + "git", "format-patch", + "--output-directory", dfn, + os.environ["SYSTEM_PULLREQUEST_SOURCECOMMITID"], "^" + base + ], + universal_newlines=True).splitlines() + if len(patches) == 0: + sys.exit(0) + + ckp = os.path.join(dfn, "checkpatch.pl") + urllib.request.urlretrieve( + "https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/plain/scripts/checkpatch.pl", + ckp) + urllib.request.urlretrieve( + "https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/plain/scripts/spelling.txt", + os.path.join(dfn, "spelling.txt")) + os.symlink( + os.path.join(os.getcwd(), "buildlib/const_structs.checkpatch"), + os.path.join(dfn, "const_structs.checkpatch")) + checkpatch = [ + "perl", ckp, "--no-tree", "--ignore", + "PREFER_KERNEL_TYPES,FILE_PATH_CHANGES,EXECUTE_PERMISSIONS,USE_NEGATIVE_ERRNO,CONST_STRUCT", + "--emacs", "--mailback", "--quiet", "--no-summary" + ] + + failed = False + for fn in patches: + proc = subprocess.run( + checkpatch + [os.path.basename(fn)], + cwd=dfn, + stdout=subprocess.PIPE, + universal_newlines=True, + stderr=subprocess.STDOUT) + if proc.returncode == 0: + assert (not proc.stdout) + continue + sys.stdout.write(proc.stdout) + + failed = True + for g in re.finditer( + r"^\d+-.*:\d+: (\S+): (.*)(?:\n#(\d+): (?:FILE: (.*):(\d+):)?)?$", + proc.stdout, + flags=re.MULTILINE): + itms = {} + if g.group(1) == "WARNING": + itms["type"] = "warning" + else: + itms["type"] = "error" + if g.group(4): + itms["sourcepath"] = g.group(4) + itms["linenumber"] = g.group(5) + print("##vso[task.logissue %s]%s" % (";".join( + "%s=%s" % (k, v) + for k, v in sorted(itms.items())), g.group(2))) + +if failed: + print("##vso[task.complete result=SucceededWithIssues]]azp-checkpatch") diff --git a/buildlib/azure-pipelines-release.yml b/buildlib/azure-pipelines-release.yml new file mode 100644 index 0000000..fd5e4a1 --- /dev/null +++ b/buildlib/azure-pipelines-release.yml @@ -0,0 +1,48 @@ +# See https://aka.ms/yaml +# This pipeline runs to produce GitHub releases when tags are pushed. The +# pipeline is never run from a PR and has access to all the build secrets, +# including write permission to GitHub. + +trigger: + tags: + include: + - v* + +resources: + containers: + - container: azp + image: ucfconsort.azurecr.io/rdma-core/azure_pipelines:25.0 + endpoint: ucfconsort_registry + +stages: + - stage: Release + jobs: + - job: SrcPrep + displayName: Build Source Tar + pool: + vmImage: 'Ubuntu-16.04' + container: azp + steps: + - checkout: self + fetchDepth: 1 + + - bash: | + set -e + mkdir build-pandoc artifacts + cd build-pandoc + CC=gcc-9 cmake -GNinja .. + ninja docs + + cd .. 
+ python3 buildlib/cbuild make-dist-tar build-pandoc + displayName: Prebuild Documentation + + - task: GithubRelease@0 + displayName: 'Create GitHub Release' + inputs: + githubConnection: github_release + repositoryName: linux-rdma/rdma-core + assets: ./*.tar.gz + action: create + isDraft: true + addChangeLog: true diff --git a/buildlib/azure-pipelines.yml b/buildlib/azure-pipelines.yml new file mode 100644 index 0000000..697d21d --- /dev/null +++ b/buildlib/azure-pipelines.yml @@ -0,0 +1,238 @@ +# See https://aka.ms/yaml + +trigger: + - master + - stable-v4* + - stable-v3* + - stable-v29 + - stable-v28 + - stable-v27 + - stable-v26 + - stable-v25 + - dev/stable-v4*/* + - dev/stable-v3*/* + - dev/stable-v29/* + - dev/stable-v28/* + - dev/stable-v27/* + - dev/stable-v26/* + - dev/stable-v25/* +pr: + - master + +resources: + containers: + - container: azp + image: ucfconsort.azurecr.io/rdma-core/azure_pipelines:28.0 + endpoint: ucfconsort_registry + - container: centos6 + image: ucfconsort.azurecr.io/rdma-core/centos6:25.0 + endpoint: ucfconsort_registry + - container: centos7 + image: ucfconsort.azurecr.io/rdma-core/centos7:25.0 + endpoint: ucfconsort_registry + - container: centos8 + image: ucfconsort.azurecr.io/rdma-core/centos8:25.0 + endpoint: ucfconsort_registry + - container: fedora + image: ucfconsort.azurecr.io/rdma-core/fc31:25.0 + endpoint: ucfconsort_registry + - container: xenial + image: ucfconsort.azurecr.io/rdma-core/ubuntu-16.04:28.0 + endpoint: ucfconsort_registry + - container: leap + image: ucfconsort.azurecr.io/rdma-core/opensuse-15.0:25.0 + endpoint: ucfconsort_registry + +stages: + - stage: Build + jobs: + - job: Compile + displayName: Compile Tests + pool: + vmImage: 'ubuntu-latest' + container: azp + steps: + - task: PythonScript@0 + displayName: checkpatch + condition: eq(variables['Build.Reason'], 'PullRequest') + inputs: + scriptPath: buildlib/azp-checkpatch + pythonInterpreter: /usr/bin/python3 + + - bash: | + set -e + mkdir build-gcc9 + cd build-gcc9 + CC=gcc-9 cmake -GNinja .. -DIOCTL_MODE=both -DENABLE_STATIC=1 -DENABLE_WERROR=1 + ninja + displayName: gcc 9.1 Compile + + - task: PythonScript@0 + displayName: Check Build Script + inputs: + scriptPath: buildlib/check-build + arguments: --src .. --cc gcc-9 + workingDirectory: build-gcc9 + pythonInterpreter: /usr/bin/python3 + + # Run sparse on the subdirectories which are sparse clean + - bash: | + set -e + mkdir build-sparse + mv CMakeLists.txt CMakeLists-orig.txt + grep -v "# NO SPARSE" CMakeLists-orig.txt > CMakeLists.txt + cd build-sparse + CC=cgcc cmake -GNinja .. -DIOCTL_MODE=both -DNO_PYVERBS=1 -DENABLE_WERROR=1 + ninja | grep -v '^\[' | tee out + # sparse does not fail gcc on messages + if [ -s out ]; then + false + fi + mv ../CMakeLists-orig.txt ../CMakeLists.txt + displayName: sparse Analysis + + - bash: | + set -e + mkdir build-clang + cd build-clang + CC=clang-9 CFLAGS="-m32" cmake -GNinja .. -DIOCTL_MODE=both -DNO_PYVERBS=1 -DENABLE_WERROR=1 + ninja + displayName: clang 9.0 32-bit Compile + + - bash: | + set -e + mv util/udma_barrier.h util/udma_barrier.h.old + echo "#error Fail" >> util/udma_barrier.h + cd build-gcc9 + rm CMakeCache.txt + CC=gcc-9 cmake -GNinja .. -DIOCTL_MODE=both -DENABLE_WERROR=1 + ninja + mv ../util/udma_barrier.h.old ../util/udma_barrier.h + displayName: Simulate non-coherent DMA Platform Compile + + - bash: | + set -e + mkdir build-arm64 + cd build-arm64 + CC=aarch64-linux-gnu-gcc-8 cmake -GNinja .. 
-DIOCTL_MODE=both -DNO_PYVERBS=1 -DENABLE_WERROR=1 + ninja + displayName: gcc 8.3 ARM64 Compile + + - bash: | + set -e + mkdir build-ppc64el + cd build-ppc64el + CC=powerpc64le-linux-gnu-gcc-8 cmake -GNinja .. -DIOCTL_MODE=both -DNO_PYVERBS=1 -DENABLE_WERROR=1 + ninja + displayName: gcc 8.3 PPC64EL Compile + + - bash: | + set -e + sed -i -e 's/ninja \(.*\)-v/ninja \1/g' debian/rules + debian/rules CC=clang-9 EXTRA_CMAKE_FLAGS="-DCMAKE_BUILD_TYPE=Debug -DENABLE_WERROR=1" build + displayName: clang 9.0 Bionic Build + - bash: | + set -e + fakeroot debian/rules binary + displayName: clang 9.0 Bionic .deb Build + - bash: | + set -e + lintian ../*.deb + displayName: Debian Lintian for .deb packages + + - job: SrcPrep + displayName: Build Source Tar + pool: + vmImage: 'ubuntu-latest' + container: azp + steps: + - checkout: self + fetchDepth: 1 + + - bash: | + set -e + mkdir build-pandoc artifacts + cd build-pandoc + CC=gcc-9 cmake -GNinja .. + ninja docs + cd ../artifacts + # FIXME: Check Build.SourceBranch for tag consistency + python3 ../buildlib/cbuild make-dist-tar ../build-pandoc + displayName: Prebuild Documentation + + - task: PublishPipelineArtifact@0 + inputs: + # Contains a rdma-core-XX.tar.gz file + artifactName: source_tar + targetPath: artifacts + + - job: RPM_Distros + displayName: Test Build RPMs for + dependsOn: SrcPrep + pool: + vmImage: 'ubuntu-latest' + strategy: + matrix: + centos6: + CONTAINER: centos6 + SPEC: buildlib/centos6.spec + RPMBUILD_OPTS: + centos7: + CONTAINER: centos7 + SPEC: redhat/rdma-core.spec + RPMBUILD_OPTS: --define 'EXTRA_CMAKE_FLAGS -DCMAKE_BUILD_TYPE=Debug -DENABLE_WERROR=1' + centos8: + CONTAINER: centos8 + SPEC: redhat/rdma-core.spec + RPMBUILD_OPTS: --define 'EXTRA_CMAKE_FLAGS -DCMAKE_BUILD_TYPE=Debug -DENABLE_WERROR=1' + fedora31: + CONTAINER: fedora + SPEC: redhat/rdma-core.spec + RPMBUILD_OPTS: --define 'EXTRA_CMAKE_FLAGS -DCMAKE_BUILD_TYPE=Debug -DENABLE_WERROR=1' + leap: + CONTAINER: leap + SPEC: suse/rdma-core.spec + RPMBUILD_OPTS: --define 'EXTRA_CMAKE_FLAGS -DCMAKE_BUILD_TYPE=Debug -DENABLE_WERROR=1' --without=curlmini + container: $[ variables['CONTAINER'] ] + steps: + - checkout: none + + - task: DownloadPipelineArtifact@0 + inputs: + artifactName: source_tar + targetPath: . + + - bash: | + set -e + mkdir SOURCES tmp + tar --wildcards -xzf rdma-core*.tar.gz */$(SPEC) --strip-components=2 + RPM_SRC=$((rpmspec -P *.spec || grep ^Source: *.spec) | awk '/^Source:/{split($0,a,"[ \t]+");print(a[2])}') + (cd SOURCES && ln -sf ../rdma-core*.tar.gz "$RPM_SRC") + rpmbuild --define '_tmppath '$(pwd)'/tmp' --define '_topdir '$(pwd) -bb *.spec $(RPMBUILD_OPTS) + displayName: Perform Package Build + + - job: DEB_Distros + displayName: Test Build DEBs for + dependsOn: SrcPrep + pool: + vmImage: 'ubuntu-latest' + strategy: + matrix: + xenial: + CONTAINER: xenial + container: $[ variables['CONTAINER'] ] + steps: + - checkout: none + + - task: DownloadPipelineArtifact@0 + inputs: + artifactName: source_tar + targetPath: . + + - bash: | + set -e + mv *.tar.gz src.tar.gz + tar -xzf src.tar.gz + cd rdma-core*/ + dpkg-buildpackage -b -d + displayName: Perform Package Build diff --git a/buildlib/cbuild b/buildlib/cbuild new file mode 100755 index 0000000..62f83a6 --- /dev/null +++ b/buildlib/cbuild @@ -0,0 +1,1138 @@ +#!/usr/bin/env python3 +# Copyright 2015-2016 Obsidian Research Corp. +# Licensed under BSD (MIT variant) or GPLv2. See COPYING. 
+# PYTHON_ARGCOMPLETE_OK +"""cbuild - Build in a docker container + +This script helps using docker containers to run software builds. This allows +building for a wide range of distributions without having to install them. + +Each target distribution has a base docker image and a set of packages to +install. The first step is to build the customized docker container: + + $ buildlib/cbuild build-images fedora + +This will download the base image and customize it with the required packages. + +Next, a build can be performed 'in place'. This is useful to do edit/compile +cycles with an alternate distribution. + + $ buildlib/cbuild make fedora + +The build output will be placed in build-fcXX, where XX is latest fedora release. + +Finally, a full package build can be performed inside the container. Note this +mode actually creates a source tree inside the container based on the current +git HEAD commit, so any uncommitted edits will be lost. + + $ buildlib/cbuild pkg fedora + +In this case only the final package results are copied outside the container +(to ..) and everything else is discarded. + +In all cases the containers that are spun up are deleted after they are +finished, only the base container created during 'build-images' is kept. The +'--run-shell' option can be used to setup the container to the point of +running the build command and instead run an interactive bash shell. This is +useful for debugging certain kinds of build problems.""" + +from __future__ import print_function +import argparse +import collections +import filecmp +import grp +import inspect +import json +import multiprocessing +import os +import pipes +import pwd +import re +import shutil +import subprocess +import sys +import tempfile +import yaml +from contextlib import contextmanager; + +project = "rdma-core"; + +def get_version(): + """Return the version string for the project, this gets automatically written + into the packaging files.""" + with open("CMakeLists.txt","r") as F: + for ln in F: + g = re.match(r'^set\(PACKAGE_VERSION "(.+)"\)',ln) + if g is None: + continue; + return g.group(1); + raise RuntimeError("Could not find version"); + +class DockerFile(object): + def __init__(self,src): + self.lines = ["FROM %s"%(src)]; + +class Environment(object): + azp_images = None; + pandoc = True; + python_cmd = "python3"; + aliases = set(); + use_make = False; + proxy = True; + build_pyverbs = True; + + to_azp = False; + + def _get_azp_names(self): + if Environment.azp_images: + return Environment.azp_images; + + with open("buildlib/azure-pipelines.yml") as F: + azp = yaml.safe_load(F) + Environment.azp_images = set(I["image"] for I in azp["resources"]["containers"]) + return Environment.azp_images; + + def image_name(self): + if self.to_azp: + # Get the version number of the container out of the azp file. 
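+            # A sketch of the lookup done below, using the fc31 entry that
+            # the pipeline file actually carries:
+            #   azp image: "ucfconsort.azurecr.io/rdma-core/fc31:25.0"
+            #   prefix:    "ucfconsort.azurecr.io/rdma-core/fc31:"
+            # The startswith() match ties this environment to the container
+            # version recorded in buildlib/azure-pipelines.yml.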
+ prefix = "ucfconsort.azurecr.io/%s/%s:"%(project, self.name); + for I in self._get_azp_names(): + if I.startswith(prefix): + return I; + raise ValueError("Image is not used in buildlib/azure-pipelines.yml") + return "build-%s/%s"%(project,self.name); + +# ------------------------------------------------------------------------- + +class YumEnvironment(Environment): + is_rpm = True; + def get_docker_file(self,tmpdir): + res = DockerFile(self.docker_parent); + res.lines.append("RUN yum install -y %s && yum clean all"%( + " ".join(sorted(self.pkgs)))); + return res; + +class centos6(YumEnvironment): + docker_parent = "centos:6"; + pkgs = { + 'cmake', + 'gcc', + 'libnl3-devel', + 'libudev-devel', + 'make', + 'pkgconfig', + 'python', + 'python-argparse', + 'python-docutils', + 'rpm-build', + 'valgrind-devel', + }; + name = "centos6"; + use_make = True; + pandoc = False; + build_pyverbs = False; + specfile = "buildlib/centos6.spec"; + python_cmd = "python"; + to_azp = True; + +class centos7(YumEnvironment): + docker_parent = "centos:7"; + pkgs = centos6.pkgs | {'systemd-devel'}; + name = "centos7"; + use_make = True; + pandoc = False; + build_pyverbs = False; + specfile = "redhat/rdma-core.spec"; + python_cmd = "python"; + to_azp = True; + +class centos7_epel(centos7): + pkgs = (centos7.pkgs - {"cmake","make"}) | { + "cmake3", + "ninja-build", + "pandoc", + "python34-setuptools", + 'python34-Cython', + 'python34-devel', + }; + name = "centos7_epel"; + build_pyverbs = True; + use_make = False; + pandoc = True; + ninja_cmd = "ninja-build"; + # Our spec file does not know how to cope with cmake3 + is_rpm = False; + to_azp = False; + + def get_docker_file(self,tmpdir): + res = YumEnvironment.get_docker_file(self,tmpdir); + res.lines.insert(1,"RUN yum install -y epel-release"); + res.lines.append("RUN ln -s /usr/bin/cmake3 /usr/local/bin/cmake && ln -sf /usr/bin/python3.4 /usr/bin/python3"); + return res; + +class amazonlinux1(YumEnvironment): + docker_parent = "amazonlinux:1"; + pkgs = { + 'cmake', + 'gcc', + 'libnl3-devel', + 'libudev-devel', + 'make', + 'pkgconfig', + 'python', + 'python-argparse', + 'python27-docutils', + 'rpm-build', + 'valgrind-devel', + }; + name = "amazonlinux1"; + use_make = True; + pandoc = False; + build_pyverbs = False; + specfile = "buildlib/centos6.spec"; + python_cmd = "python"; + to_azp = False; + +class amazonlinux2(YumEnvironment): + docker_parent = "amazonlinux:2"; + pkgs = centos7.pkgs; + name = "amazonlinux2"; + use_make = True; + pandoc = False; + build_pyverbs = False; + specfile = "redhat/rdma-core.spec"; + python_cmd = "python"; + to_azp = False; + +class centos8(Environment): + docker_parent = "centos:8"; + pkgs = { + "pandoc", + "perl-generators", + "python3-Cython", + "python3-devel", + "python3-docutils", + 'cmake', + 'gcc', + 'libnl3-devel', + 'libudev-devel', + 'ninja-build', + 'pkgconfig', + 'rpm-build', + 'systemd-devel', + 'valgrind-devel', + }; + name = "centos8"; + specfile = "redhat/rdma-core.spec"; + is_rpm = True; + to_azp = True; + proxy = False; + + def get_docker_file(self,tmpdir): + res = DockerFile(self.docker_parent); + res.lines.append("RUN dnf install --enablerepo=PowerTools -y %s && dnf clean all"%( + " ".join(sorted(self.pkgs)))); + return res; + +class fc31(Environment): + docker_parent = "fedora:31"; + pkgs = centos8.pkgs + name = "fc31"; + specfile = "redhat/rdma-core.spec"; + ninja_cmd = "ninja-build"; + is_rpm = True; + aliases = {"fedora"}; + to_azp = True; + + def get_docker_file(self,tmpdir): + res = 
DockerFile(self.docker_parent); + res.lines.append("RUN dnf install -y %s && dnf clean all"%( + " ".join(sorted(self.pkgs)))); + return res; + +# ------------------------------------------------------------------------- + +class APTEnvironment(Environment): + is_deb = True; + build_python = True; + def get_docker_file(self,tmpdir): + res = DockerFile(self.docker_parent); + res.lines.append("RUN apt-get update; apt-get install -y --no-install-recommends %s && apt-get clean && rm -rf /usr/share/doc/ /usr/lib/debug /var/lib/apt/lists/"%( + " ".join(sorted(self.pkgs)))); + return res; + + def add_source_list(self,tmpdir,name,content): + sld = os.path.join(tmpdir,"etc","apt","sources.list.d"); + if not os.path.isdir(sld): + os.makedirs(sld); + with open(os.path.join(sld,name),"w") as F: + F.write(content + "\n"); + + def add_ppa(self,tmpdir,srcline,keyid): + gpgd = os.path.join(tmpdir,"etc","apt","trusted.gpg.d"); + if not os.path.isdir(gpgd): + os.makedirs(gpgd); + + # The container does not have gpg or other stuff to get the signing + # key for the toolchain ppa. Fetch it in the host and just import the + # gpg data directly into the trusted keyring. + kb = os.path.join(tmpdir,"%s.kb.gpg"%(keyid)); + env = {k:v for k,v in os.environ.items()}; + env["HOME"] = tmpdir; + os.makedirs(os.path.join(tmpdir,".gnupg"), exist_ok=True) + subprocess.check_call(["gpg","--keyserver", "keyserver.ubuntu.com", "--no-default-keyring","--keyring",kb,"--always-trust", + "--recv-key",keyid],env=env); + kr = os.path.join(gpgd,"%s.gpg"%(keyid)); + with open(kr,"wb") as F: + F.write(subprocess.check_output(["gpg","--no-default-keyring", + "--keyring",kb, + "--export",keyid],env=env)); + os.unlink(kb); + + self.add_source_list(tmpdir,keyid + ".list",srcline); + +class xenial(APTEnvironment): + docker_parent = "ubuntu:16.04" + pkgs = { + 'build-essential', + 'cmake', + 'debhelper', + 'dh-systemd', + 'fakeroot', # for AZP + 'gcc', + 'libnl-3-dev', + 'libnl-route-3-dev', + 'libsystemd-dev', + 'libudev-dev', + 'make', + 'ninja-build', + 'pandoc', + 'pkg-config', + 'python3', + 'python3-docutils', + 'valgrind', + }; + name = "ubuntu-16.04"; + aliases = {"xenial"}; + to_azp = True; + +class bionic(APTEnvironment): + docker_parent = "ubuntu:18.04" + pkgs = xenial.pkgs | { + 'cython3', + 'python3-dev', + }; + name = "ubuntu-18.04"; + aliases = {"bionic", "ubuntu"}; + +class jessie(APTEnvironment): + docker_parent = "debian:8" + pkgs = xenial.pkgs; + name = "debian-8"; + aliases = {"jessie"}; + build_pyverbs = False; + +class stretch(APTEnvironment): + docker_parent = "debian:9" + pkgs = bionic.pkgs; + name = "debian-9"; + aliases = {"stretch"}; + +class debian_experimental(APTEnvironment): + docker_parent = "debian:experimental" + pkgs = (stretch.pkgs ^ {"gcc"}) | {"gcc-9"}; + name = "debian-experimental"; + + def get_docker_file(self,tmpdir): + res = DockerFile(self.docker_parent); + res.lines.append("RUN apt-get update && apt-get -t experimental install -y --no-install-recommends %s && apt-get clean"%( + " ".join(sorted(self.pkgs)))); + return res; + +# ------------------------------------------------------------------------- + +class ZypperEnvironment(Environment): + proxy = False; + is_rpm = True; + def get_docker_file(self,tmpdir): + res = DockerFile(self.docker_parent); + res.lines.append("RUN zypper --non-interactive refresh"); + res.lines.append("RUN zypper --non-interactive dist-upgrade"); + res.lines.append("RUN zypper --non-interactive install %s"%( + " ".join(sorted(self.pkgs)))); + return res; + +class 
leap(ZypperEnvironment): + docker_parent = "opensuse/leap:15.0"; + specfile = "suse/rdma-core.spec"; + pkgs = { + 'cmake', + 'gcc', + 'libnl3-devel', + 'libudev-devel', + 'udev', + 'make', + 'ninja', + 'pandoc', + 'pkg-config', + 'python3', + 'rpm-build', + 'systemd-devel', + 'valgrind-devel', + 'python3-Cython', + 'python3-devel', + 'python3-docutils', + }; + rpmbuild_options = [ "--without=curlmini" ]; + to_azp = True; + name = "opensuse-15.0"; + aliases = {"leap"}; + +class tumbleweed(ZypperEnvironment): + docker_parent = "opensuse/tumbleweed:latest"; + pkgs = (leap.pkgs ^ {"valgrind-devel"}) | {"valgrind-client-headers"}; + name = "tumbleweed"; + specfile = "suse/rdma-core.spec"; + rpmbuild_options = [ "--without=curlmini" ]; + +# ------------------------------------------------------------------------- + +class azure_pipelines(APTEnvironment): + docker_parent = "ubuntu:18.04" + pkgs = { + "abi-compliance-checker", + "abi-dumper", + "ca-certificates", + "clang-9", + "cmake", + "cython3", + "debhelper", + "dh-systemd", + "dpkg-dev", + "fakeroot", + "gcc-9", + "git", + "python2.7", + "libc6-dev", + "libnl-3-dev", + "libnl-route-3-dev", + "libsystemd-dev", + "libudev-dev", + "lintian", + "make", + "ninja-build", + "pandoc", + "pkg-config", + "python3-docutils", + "python3", + "python3-dev", + "python3-docutils", + "python3-yaml", + "sparse", + "valgrind", + } | { + # 32 bit build support + "libgcc-9-dev:i386", + "libc6-dev:i386", + "libnl-3-dev:i386", + "libnl-route-3-dev:i386", + "libsystemd-dev:i386", + "libudev-dev:i386", + } | { + # ARM 64 cross compiler + "gcc-8-aarch64-linux-gnu", + "libgcc-8-dev:arm64", + "libc6-dev:arm64", + "libnl-3-dev:arm64", + "libnl-route-3-dev:arm64", + "libsystemd-dev:arm64", + "libudev-dev:arm64", + } | { + # PPC 64 cross compiler + "gcc-8-powerpc64le-linux-gnu", + "libgcc-8-dev:ppc64el", + "libc6-dev:ppc64el", + "libnl-3-dev:ppc64el", + "libnl-route-3-dev:ppc64el", + "libsystemd-dev:ppc64el", + "libudev-dev:ppc64el", + } + to_azp = True; + name = "azure_pipelines"; + aliases = {"azp"} + + def get_docker_file(self,tmpdir): + res = bionic.get_docker_file(self,tmpdir); + self.add_ppa(tmpdir, + "deb http://ppa.launchpad.net/ubuntu-toolchain-r/test/ubuntu bionic main", + "60C317803A41BA51845E371A1E9377A2BA9EF27F"); + self.add_ppa(tmpdir, + "deb [arch=amd64] http://apt.llvm.org/bionic/ llvm-toolchain-bionic-8 main", + "15CF4D18AF4F7421"); + self.add_source_list(tmpdir,"arm64.list", + """deb [arch=arm64,ppc64el] http://ports.ubuntu.com/ bionic main universe +deb [arch=arm64,ppc64el] http://ports.ubuntu.com/ bionic-security main universe +deb [arch=arm64,ppc64el] http://ports.ubuntu.com/ bionic-updates main universe"""); + + res.lines.insert(1,"ADD etc/ /etc/"); + res.lines.insert(1,"RUN dpkg --add-architecture i386 &&" + "dpkg --add-architecture ppc64el &&" + "dpkg --add-architecture arm64 &&" + "sed -i -e 's/^deb /deb [arch=amd64,i386] /g' /etc/apt/sources.list"); + return res; + +# ------------------------------------------------------------------------- + +environments = [centos6(), + centos7(), + centos7_epel(), + centos8(), + amazonlinux1(), + amazonlinux2(), + xenial(), + bionic(), + jessie(), + stretch(), + fc31(), + leap(), + tumbleweed(), + debian_experimental(), + azure_pipelines(), +]; + +class ToEnvActionPkg(argparse.Action): + """argparse helper to parse environment lists into environment classes""" + def __call__(self, parser, namespace, values, option_string=None): + if not isinstance(values,list): + values = [values]; + + res = set(); + for I in 
values: + if I == "all": + for env in environments: + if env.name != "centos6" and env.name != "centos7_epel": + res.add(env); + else: + for env in environments: + if env.name == I or I in env.aliases: + res.add(env); + setattr(namespace, self.dest, sorted(res,key=lambda x:x.name)) + + +class ToEnvAction(argparse.Action): + """argparse helper to parse environment lists into environment classes""" + def __call__(self, parser, namespace, values, option_string=None): + if not isinstance(values,list): + values = [values]; + + res = set(); + for I in values: + if I == "all": + res.update(environments); + else: + for env in environments: + if env.name == I or I in env.aliases: + res.add(env); + setattr(namespace, self.dest, sorted(res,key=lambda x:x.name)) + +def env_choices_pkg(): + """All the names that can be used with ToEnvAction""" + envs = set(("all",)); + for I in environments: + if getattr(I,"is_deb",False) or getattr(I,"is_rpm",False): + envs.add(I.name); + envs.update(I.aliases); + return envs; + +def env_choices(): + """All the names that can be used with ToEnvAction""" + envs = set(("all",)); + for I in environments: + envs.add(I.name); + envs.update(I.aliases); + return envs; + +def docker_cmd(env,*cmd): + """Invoke docker""" + cmd = list(cmd); + if env.sudo: + return subprocess.check_call(["sudo","docker"] + cmd); + return subprocess.check_call(["docker"] + cmd); + +def docker_cmd_str(env,*cmd): + """Invoke docker""" + cmd = list(cmd); + if env.sudo: + return subprocess.check_output(["sudo","docker"] + cmd).decode(); + return subprocess.check_output(["docker"] + cmd).decode(); + +@contextmanager +def private_tmp(args): + """Simple version of Python 3's tempfile.TemporaryDirectory""" + dfn = tempfile.mkdtemp(); + try: + yield dfn; + finally: + try: + shutil.rmtree(dfn); + except: + # The debian builds result in root owned files because we don't use fakeroot + subprocess.check_call(['sudo','rm','-rf',dfn]); + +@contextmanager +def inDirectory(dir): + cdir = os.getcwd(); + try: + os.chdir(dir); + yield True; + finally: + os.chdir(cdir); + +def map_git_args(src_root,to): + """Return a list of docker arguments that will map the .git directory into the + container""" + srcd = os.path.join(src_root,".git"); + res = ["-v","%s:%s:ro"%(srcd, + os.path.join(to,".git"))]; + + alternates = os.path.join(srcd,"objects/info/alternates"); + if os.path.exists(alternates): + with open(alternates) as F: + for I in F.readlines(): + I = I.strip(); + res.extend(["-v","%s:%s:ro"%(I,I)]); + + return res; + +def get_image_id(args,image_name): + img = json.loads(docker_cmd_str(args,"inspect",image_name)); + image_id = img[0]["Id"]; + # Newer dockers put a prefix + if ":" in image_id: + image_id = image_id.partition(':')[2]; + return image_id; + +# ------------------------------------------------------------------------- + +def get_tar_file(args,tarfn,pandoc_prebuilt=False): + """Create a tar file that matches what buildlib/github-release would do if it + was a tagged release""" + prefix = "%s-%s/"%(project,get_version()); + if not pandoc_prebuilt: + subprocess.check_call(["git","archive", + # This must match the prefix generated buildlib/github-release + "--prefix",prefix, + "--output",tarfn, + "HEAD"]); + return; + + # When the OS does not support pandoc we got through the extra step to + # build pandoc output in the azp container and include it in the + # tar. 
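+    # Rough shape of that path (a sketch; the real calls follow below):
+    #   buildlib/cbuild make azure_pipelines docs  # fill build-azure_pipelines/pandoc-prebuilt/
+    #   cmd_make_dist_tar(BUILD="build-azure_pipelines")  # fold it into the tar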
+ if not args.use_prebuilt_pandoc: + subprocess.check_call(["buildlib/cbuild","make","azure_pipelines","docs"]); + + cmd_make_dist_tar(argparse.Namespace(BUILD="build-azure_pipelines",tarfn=tarfn, + script_pwd="",tag=None)); + +def run_rpm_build(args,spec_file,env): + with open(spec_file,"r") as F: + for ln in F: + if ln.startswith("Version:"): + ver = ln.strip().partition(' ')[2].strip(); + assert(ver == get_version()); + + if ln.startswith("Source:"): + tarfn = ln.strip().partition(' ')[2].strip(); + + image_id = get_image_id(args,env.image_name()); + with private_tmp(args) as tmpdir: + os.mkdir(os.path.join(tmpdir,"SOURCES")); + os.mkdir(os.path.join(tmpdir,"tmp")); + + get_tar_file(args,os.path.join(tmpdir,"SOURCES",tarfn), + pandoc_prebuilt=not env.pandoc); + + with open(spec_file,"r") as inF: + spec = list(inF); + tspec_file = os.path.basename(spec_file); + with open(os.path.join(tmpdir,tspec_file),"w") as outF: + outF.write("".join(spec)); + + home = os.path.join(os.path.sep,"home",os.getenv("LOGNAME")); + vdir = os.path.join(home,"rpmbuild"); + + opts = [ + "run", + "--rm=true", + "-v","%s:%s"%(tmpdir,vdir), + "-w",vdir, + "-h","builder-%s"%(image_id[:12]), + "-e","HOME=%s"%(home), + "-e","TMPDIR=%s"%(os.path.join(vdir,"tmp")), + ]; + + # rpmbuild complains if we do not have an entry in passwd and group + # for the user we are going to use to do the build. + with open(os.path.join(tmpdir,"go.py"),"w") as F: + print(""" +import os,subprocess; +with open("/etc/passwd","a") as F: + F.write({passwd!r} + "\\n"); +with open("/etc/group","a") as F: + F.write({group!r} + "\\n"); +os.setgid({gid:d}); +os.setuid({uid:d}); + +# For Centos6 +if "check_output" in dir(subprocess): + # Get RPM to tell us the expected tar filename. + for ln in subprocess.check_output(["rpmspec","-P",{tspec_file!r}]).splitlines(): + if ln.startswith(b"Source:"): + tarfn = ln.strip().partition(b' ')[2].strip(); + if tarfn != {tarfn!r}: + os.symlink({tarfn!r},os.path.join(b"SOURCES",tarfn)); +""".format(passwd=":".join(str(I) for I in pwd.getpwuid(os.getuid())), + group=":".join(str(I) for I in grp.getgrgid(os.getgid())), + uid=os.getuid(), + gid=os.getgid(), + tarfn=tarfn, + tspec_file=tspec_file), file=F); + + extra_opts = getattr(env,"rpmbuild_options", []) + bopts = ["-bb",tspec_file] + extra_opts; + for arg in args.with_flags: + bopts.extend(["--with", arg]); + for arg in args.without_flags: + bopts.extend(["--without", arg]); + if "pyverbs" not in args.with_flags + args.without_flags: + if env.build_pyverbs: + bopts.extend(["--with", "pyverbs"]); + + print('os.execlp("rpmbuild","rpmbuild",%s)'%( + ",".join(repr(I) for I in bopts)), file=F); + + if args.run_shell: + opts.append("-ti"); + opts.append(env.image_name()); + + if args.run_shell: + opts.append("/bin/bash"); + else: + opts.extend([env.python_cmd,"go.py"]); + + docker_cmd(args,*opts) + + print() + for path,jnk,files in os.walk(os.path.join(tmpdir,"RPMS")): + for I in files: + print("Final RPM: ",os.path.join("..",I)); + shutil.move(os.path.join(path,I), + os.path.join("..",I)); + +def run_deb_build(args,env): + image_id = get_image_id(args,env.image_name()); + with private_tmp(args) as tmpdir: + os.mkdir(os.path.join(tmpdir,"src")); + os.mkdir(os.path.join(tmpdir,"tmp")); + + opwd = os.getcwd(); + with inDirectory(os.path.join(tmpdir,"src")): + subprocess.check_call(["git", + "--git-dir",os.path.join(opwd,".git"), + "reset","--hard","HEAD"]); + + home = os.path.join(os.path.sep,"home",os.getenv("LOGNAME")); + + opts = [ + "run", + "--read-only", + 
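+            # The -v mount below exposes the throwaway tmpdir as $HOME inside
+            # the container; debian/rules drops the .deb files there and they
+            # are moved out to .. once docker exits.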
"--rm=true", + "-v","%s:%s"%(tmpdir,home), + "-w",os.path.join(home,"src"), + "-h","builder-%s"%(image_id[:12]), + "-e","HOME=%s"%(home), + "-e","TMPDIR=%s"%(os.path.join(home,"tmp")), + "-e","DEB_BUILD_OPTIONS=parallel=%u"%(multiprocessing.cpu_count()), + ]; + + # Create a go.py that will let us run the compilation as the user and + # then switch to root only for the packaging step. + with open(os.path.join(tmpdir,"go.py"),"w") as F: + print(""" +import subprocess,os; +def to_user(): + os.setgid({gid:d}); + os.setuid({uid:d}); +subprocess.check_call(["debian/rules","debian/rules","build"], + preexec_fn=to_user); +subprocess.check_call(["debian/rules","debian/rules","binary"]); +""".format(uid=os.getuid(), + gid=os.getgid()), file=F); + + if args.run_shell: + opts.append("-ti"); + opts.append(env.image_name()); + + if args.run_shell: + opts.append("/bin/bash"); + else: + opts.extend(["python3",os.path.join(home,"go.py")]); + + docker_cmd(args,*opts); + + print() + for I in os.listdir(tmpdir): + if I.endswith(".deb"): + print("Final DEB: ",os.path.join("..",I)); + shutil.move(os.path.join(tmpdir,I), + os.path.join("..",I)); + +def copy_abi_files(src): + """Retrieve the current ABI files and place them in the source tree.""" + if not os.path.isdir(src): + return; + + for path,jnk,files in os.walk(src): + for I in files: + if not I.startswith("current-"): + continue; + + ref_fn = os.path.join("ABI",I[8:]); + cur_fn = os.path.join(src, path, I); + + if os.path.isfile(ref_fn) and filecmp.cmp(ref_fn,cur_fn,False): + continue; + + print("Changed ABI File: ", ref_fn); + shutil.copy(cur_fn, ref_fn); + +def run_azp_build(args,env): + # Load the commands from the pipelines file + with open("buildlib/azure-pipelines.yml") as F: + azp = yaml.safe_load(F); + for bst in azp["stages"]: + if bst["stage"] == "Build": + break; + else: + raise ValueError("No Build stage found"); + for job in bst["jobs"]: + if job["job"] == "Compile": + break; + else: + raise ValueError("No Compile job found"); + + script = ["#!/bin/bash"] + workdir = "/__w/1" + srcdir = os.path.join(workdir,"s"); + for I in job["steps"]: + script.append("echo ==================================="); + script.append("echo %s"%(I["displayName"])); + script.append("cd %s"%(srcdir)); + if "bash" in I: + script.append(I["bash"]); + elif I.get("task") == "PythonScript@0": + script.append("set -e"); + if "workingDirectory" in I["inputs"]: + script.append("cd %s"%(os.path.join(srcdir,I["inputs"]["workingDirectory"]))); + script.append("%s %s %s"%(I["inputs"]["pythonInterpreter"], + os.path.join(srcdir,I["inputs"]["scriptPath"]), + I["inputs"].get("arguments",""))); + else: + raise ValueError("Unknown stanza %r"%(I)); + + with private_tmp(args) as tmpdir: + os.mkdir(os.path.join(tmpdir,"s")); + os.mkdir(os.path.join(tmpdir,"tmp")); + + opwd = os.getcwd(); + with inDirectory(os.path.join(tmpdir,"s")): + subprocess.check_call(["git", + "--git-dir",os.path.join(opwd,".git"), + "reset","--hard","HEAD"]); + subprocess.check_call(["git", + "--git-dir",os.path.join(opwd,".git"), + "fetch", + "--no-tags", + "https://github.com/linux-rdma/rdma-core.git","HEAD", + "master"]); + base = subprocess.check_output(["git", + "--git-dir",os.path.join(opwd,".git"), + "merge-base", + "HEAD","FETCH_HEAD"]).decode().strip(); + + opts = [ + "run", + "--read-only", + "--rm=true", + "-v","%s:%s"%(tmpdir, workdir), + "-w",srcdir, + "-u",str(os.getuid()), + "-e","SYSTEM_PULLREQUEST_SOURCECOMMITID=HEAD", + # azp puts the branch name 'master' here, we need to put a commit ID.. 
+ "-e","SYSTEM_PULLREQUEST_TARGETBRANCH=%s"%(base), + "-e","HOME=%s"%(workdir), + "-e","TMPDIR=%s"%(os.path.join(workdir,"tmp")), + ] + map_git_args(opwd,srcdir); + + if args.run_shell: + opts.append("-ti"); + opts.append(env.image_name()); + + with open(os.path.join(tmpdir,"go.sh"),"w") as F: + F.write("\n".join(script)) + + if args.run_shell: + opts.append("/bin/bash"); + else: + opts.extend(["/bin/bash",os.path.join(workdir,"go.sh")]); + + try: + docker_cmd(args,*opts); + except subprocess.CalledProcessError as e: + copy_abi_files(os.path.join(tmpdir, "s/ABI")); + raise; + copy_abi_files(os.path.join(tmpdir, "s/ABI")); + +def args_pkg(parser): + parser.add_argument("ENV",action=ToEnvActionPkg,choices=env_choices_pkg()); + parser.add_argument("--run-shell",default=False,action="store_true", + help="Instead of running the build, enter a shell"); + parser.add_argument("--use-prebuilt-pandoc",default=False,action="store_true", + help="Do not rebuild the pandoc cache in build-azure_pipelines/pandoc-prebuilt/"); + parser.add_argument("--with", default=[],action="append", dest="with_flags", + help="Enable specified feature in RPM builds"); + parser.add_argument("--without", default=[],action="append", dest="without_flags", + help="Disable specified feature in RPM builds"); +def cmd_pkg(args): + """Build a package in the given environment.""" + for env in args.ENV: + if env.name == "azure_pipelines": + run_azp_build(args,env); + elif getattr(env,"is_deb",False): + run_deb_build(args,env); + elif getattr(env,"is_rpm",False): + run_rpm_build(args, + getattr(env,"specfile","%s.spec"%(project)), + env); + else: + print("%s does not support packaging"%(env.name)); + +# ------------------------------------------------------------------------- + +def args_make(parser): + parser.add_argument("--run-shell",default=False,action="store_true", + help="Instead of running the build, enter a shell"); + parser.add_argument("ENV",action=ToEnvAction,choices=env_choices()); + parser.add_argument('ARGS', nargs=argparse.REMAINDER); +def cmd_make(args): + """Run cmake and ninja within a docker container. If cmake has not yet been + run then this runs it with the given environment variables, then invokes ninja. 
+ Otherwise ninja is invoked without calling cmake.""" + SRC = os.getcwd(); + + for env in args.ENV: + BUILD = "build-%s"%(env.name) + if not os.path.exists(BUILD): + os.mkdir(BUILD); + + home = os.path.join(os.path.sep,"home",os.getenv("LOGNAME")); + + dirs = [os.getcwd(),"/tmp"]; + # Import the symlink target too if BUILD is a symlink + BUILD_r = os.path.realpath(BUILD); + if not BUILD_r.startswith(os.path.realpath(SRC)): + dirs.append(BUILD_r); + + cmake_args = [] + if not env.build_pyverbs: + cmake_args.extend(["-DNO_PYVERBS=1"]); + + cmake_envs = [] + ninja_args = [] + for I in args.ARGS: + if I.startswith("-D"): + cmake_args.append(I); + elif I.find('=') != -1: + cmake_envs.append(I); + else: + ninja_args.append(I); + if env.use_make: + need_cmake = not os.path.exists(os.path.join(BUILD_r,"Makefile")); + else: + need_cmake = not os.path.exists(os.path.join(BUILD_r,"build.ninja")); + opts = ["run", + "--read-only", + "--rm=true", + "-ti", + "-u",str(os.getuid()), + "-e","HOME=%s"%(home), + "-w",BUILD_r, + ]; + for I in dirs: + opts.append("-v"); + opts.append("%s:%s"%(I,I)); + for I in cmake_envs: + opts.append("-e"); + opts.append(I); + if args.run_shell: + opts.append("-ti"); + opts.append(env.image_name()); + + if args.run_shell: + os.execlp("sudo","sudo","docker",*(opts + ["/bin/bash"])); + + if need_cmake: + if env.use_make: + prog_args = ["cmake",SRC] + cmake_args; + else: + prog_args = ["cmake","-GNinja",SRC] + cmake_args; + docker_cmd(args,*(opts + prog_args)); + + if env.use_make: + prog_args = ["make","-C",BUILD_r] + ninja_args; + else: + prog_args = [getattr(env,"ninja_cmd","ninja"), + "-C",BUILD_r] + ninja_args; + + if len(args.ENV) <= 1: + os.execlp("sudo","sudo","docker",*(opts + prog_args)); + else: + docker_cmd(args,*(opts + prog_args)); + +# ------------------------------------------------------------------------- + +def get_build_args(args,env): + """Return extra docker arguments for building. This is the system APT proxy.""" + res = []; + if args.pull: + res.append("--pull"); + + if env.proxy and os.path.exists("/etc/apt/apt.conf.d/01proxy"): + # The line in this file must be 'Acquire::http { Proxy "http://xxxx:3142"; };' + with open("/etc/apt/apt.conf.d/01proxy") as F: + proxy = F.read().strip().split('"')[1]; + res.append("--build-arg"); + res.append('http_proxy=%s'%(proxy)); + return res; + +def args_build_images(parser): + parser.add_argument("ENV",nargs="+",action=ToEnvAction,choices=env_choices()); + parser.add_argument("--no-pull",default=True,action="store_false", + dest="pull", + help="Instead of running the build, enter a shell"); +def cmd_build_images(args): + """Run from the top level source directory to make the docker images that are + needed for building. This only needs to be run once.""" + # Docker copies the permissions from the local host and we need this umask + # to be 022 or the container breaks + os.umask(0o22) + for env in args.ENV: + with private_tmp(args) as tmpdir: + df = env.get_docker_file(tmpdir); + fn = os.path.join(tmpdir,"Dockerfile"); + with open(fn,"wt") as F: + for ln in df.lines: + print(ln, file=F); + opts = (["build"] + + get_build_args(args,env) + + ["-f",fn, + "-t",env.image_name(), + tmpdir]); + docker_cmd(args,*opts); + +# ------------------------------------------------------------------------- + +def args_push_azp_images(args): + pass +def cmd_push_azp_images(args): + """Push the images required for Azure Pipelines to the container + registry. 
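+    The generated ninja file is a single 'docker push $img' rule applied to
+    every ucfconsort.azurecr.io image, so the pushes can run in parallel.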
Must have done 'az login' first""" + subprocess.check_call(["sudo","az","acr","login","--name","ucfconsort"]); + with private_tmp(args) as tmpdir: + nfn = os.path.join(tmpdir,"build.ninja"); + with open(nfn,"w") as F: + F.write("""rule push + command = docker push $img + description=Push $img\n"""); + + for env in environments: + name = env.image_name() + if "ucfconsort.azurecr.io" not in name: + continue + F.write("build push_%s : push\n img = %s\n"%(env.name,env.image_name())); + F.write("default push_%s\n"%(env.name)); + subprocess.check_call(["sudo","ninja"],cwd=tmpdir); + +# ------------------------------------------------------------------------- +def args_make_dist_tar(parser): + parser.add_argument("BUILD",help="Path to the build directory") + parser.add_argument("--tarfn",help="Output TAR filename") + parser.add_argument("--tag",help="git tag to sanity check against") +def cmd_make_dist_tar(args): + """Make the standard distribution tar. The BUILD argument must point to a build + output directory that has pandoc-prebuilt""" + ver = get_version(); + + if not args.tarfn: + args.tarfn = "%s-%s.tar.gz"%(project,ver) + + # The tag name and the cmake file must match. + if args.tag: + assert args.tag == "v" + ver; + + os.umask(0o22) + with private_tmp(args) as tmpdir: + tmp_tarfn = os.path.join(tmpdir,"tmp.tar"); + + prefix = "%s-%s/"%(project,get_version()); + subprocess.check_call(["git","archive", + "--prefix",prefix, + "--output",tmp_tarfn, + "HEAD"]); + + # Mangle the paths and append the prebuilt stuff to the tar file + if args.BUILD: + subprocess.check_call([ + "tar", + "-C",os.path.join(args.script_pwd,args.BUILD,"pandoc-prebuilt"), + "-rf",tmp_tarfn, + "./", + "--xform",r"s|^\.|%sbuildlib/pandoc-prebuilt|g"%(prefix)]); + + assert args.tarfn.endswith(".gz") or args.tarfn.endswith(".tgz"); + with open(os.path.join(args.script_pwd,args.tarfn),"w") as F: + subprocess.check_call(["gzip","-9c",tmp_tarfn],stdout=F); + +# ------------------------------------------------------------------------- + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Operate docker for building this package') + subparsers = parser.add_subparsers(title="Sub Commands",dest="command"); + subparsers.required = True; + + funcs = globals(); + for k,v in list(funcs.items()): + if k.startswith("cmd_") and inspect.isfunction(v): + sparser = subparsers.add_parser(k[4:].replace('_','-'), + help=v.__doc__); + sparser.required = True; + funcs["args_" + k[4:]](sparser); + sparser.set_defaults(func=v); + + try: + import argcomplete; + argcomplete.autocomplete(parser); + except ImportError: + pass; + + args = parser.parse_args(); + args.sudo = True; + + # This script must always run from the top of the git tree, and a git + # checkout is mandatory. 
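+    # For example, for a checkout at /home/user/rdma-core (an assumed path)
+    # run from a build subdirectory:
+    #   git rev-parse --show-toplevel  ->  /home/user/rdma-core
+    # script_pwd remembers where the user really ran the command so relative
+    # paths such as --tarfn still resolve after the chdir() below.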
+ git_top = subprocess.check_output(["git","rev-parse","--show-toplevel"]).strip(); + args.script_pwd = os.getcwd(); + os.chdir(git_top); + + args.func(args); diff --git a/buildlib/centos6.spec b/buildlib/centos6.spec new file mode 100644 index 0000000..199930b --- /dev/null +++ b/buildlib/centos6.spec @@ -0,0 +1,109 @@ +Name: rdma-core +Version: 29.0 +Release: 1%{?dist} +Summary: RDMA core userspace libraries and daemons + +# Almost everything is licensed under the OFA dual GPLv2, 2 Clause BSD license +# providers/ipathverbs/ Dual licensed using a BSD license with an extra patent clause +# providers/rxe/ Incorporates code from ipathverbs and contains the patent clause +# providers/hfi1verbs Uses the 3 Clause BSD license +License: (GPLv2 or BSD) and (GPLv2 or PathScale-BSD) +Url: https://github.com/linux-rdma/rdma-core +Source: rdma-core.tgz + +BuildRequires: binutils +BuildRequires: cmake >= 2.8.11 +BuildRequires: gcc +BuildRequires: libudev-devel +BuildRequires: pkgconfig +BuildRequires: pkgconfig(libnl-3.0) +BuildRequires: pkgconfig(libnl-route-3.0) +BuildRequires: valgrind-devel +BuildRequires: python + +%define CMAKE_FLAGS %{nil} +BuildRequires: make + +%description +Temporary packaging + +This is a simple example without the split sub packages to get things started. + +%prep +%setup + +%build + +%define my_unitdir /tmp/ + +# New RPM defines _rundir, usually as /run +%if 0%{?_rundir:1} +%else +%define _rundir /var/run +%endif + +# New RPM defines _udevrulesdir, usually as /usr/lib/udev/rules.d +%if 0%{?_udevrulesdir:1} +%else +# This is the old path (eg for C6) +%define _udevrulesdir /lib/udev/rules.d +%endif + +# Pass all of the rpm paths directly to GNUInstallDirs and our other defines. +%cmake %{CMAKE_FLAGS} \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_BINDIR:PATH=%{_bindir} \ + -DCMAKE_INSTALL_SBINDIR:PATH=%{_sbindir} \ + -DCMAKE_INSTALL_LIBDIR:PATH=%{_libdir} \ + -DCMAKE_INSTALL_LIBEXECDIR:PATH=%{_libexecdir} \ + -DCMAKE_INSTALL_LOCALSTATEDIR:PATH=%{_localstatedir} \ + -DCMAKE_INSTALL_SHAREDSTATEDIR:PATH=%{_sharedstatedir} \ + -DCMAKE_INSTALL_INCLUDEDIR:PATH=%{_includedir} \ + -DCMAKE_INSTALL_INFODIR:PATH=%{_infodir} \ + -DCMAKE_INSTALL_MANDIR:PATH=%{_mandir} \ + -DCMAKE_INSTALL_SYSCONFDIR:PATH=%{_sysconfdir} \ + -DCMAKE_INSTALL_SYSTEMD_SERVICEDIR:PATH=%{my_unitdir} \ + -DCMAKE_INSTALL_INITDDIR:PATH=%{_initrddir} \ + -DCMAKE_INSTALL_RUNDIR:PATH=%{_rundir} \ + -DCMAKE_INSTALL_DOCDIR:PATH=%{_docdir}/%{name}-%{version} \ + -DCMAKE_INSTALL_UDEV_RULESDIR:PATH=%{_udevrulesdir} \ + -DCMAKE_INSTALL_PERLDIR:PATH=%{perl_vendorlib} +make -s %{?_smp_mflags} + +%install +DESTDIR=%{buildroot} make install + +%if 0%{?_unitdir:1} +rm -rf %{buildroot}/%{_initrddir}/ +%else +rm -rf %{buildroot}/%{my_unitdir}/ +%endif + +%files +%doc %{_mandir}/man*/* +%{_bindir}/* +%{_includedir}/* +%{_libdir}/lib*.so* +%{_libdir}/libibverbs/* +%{_libdir}/ibacm/* +%{_libdir}/rsocket/* +%{_libdir}/pkgconfig/*.pc +%{_sbindir}/* +%{_libexecdir}/* +%{_udevrulesdir}/* +%{_udevrulesdir}/../rdma_rename +%doc %{_docdir}/%{name}-%{version}/* +%if 0%{?_unitdir:1} +%{_unitdir}/* +%else +%config %{_initrddir}/* +%endif +%config %{_sysconfdir}/iwpmd.conf +%config %{_sysconfdir}/srp_daemon.conf +%config %{_sysconfdir}/libibverbs.d/* +%config %{_sysconfdir}/rdma/modules/* +%{perl_vendorlib}/IBswcountlimits.pm +%config(noreplace) %{_sysconfdir}/udev/rules.d/* +%config(noreplace) %{_sysconfdir}/infiniband-diags/error_thresholds +%config(noreplace) %{_sysconfdir}/infiniband-diags/ibdiag.conf +%{_sysconfdir}/modprobe.d/* diff 
--git a/buildlib/check-build b/buildlib/check-build new file mode 100755 index 0000000..ab8524e --- /dev/null +++ b/buildlib/check-build @@ -0,0 +1,511 @@ +#!/usr/bin/env python3 +# Copyright 2017 Obsidian Research Corp. +# Licensed under BSD (MIT variant) or GPLv2. See COPYING. +"""check-build - Run static checks on a build""" +from __future__ import print_function +import argparse +import inspect +import os +import re +import shutil +import subprocess +import tempfile +import sys +import copy +import shlex +import pipes +from contextlib import contextmanager; +from distutils.version import LooseVersion; + +def get_src_dir(): + """Get the source directory using git""" + git_top = subprocess.check_output(["git","rev-parse","--git-dir"]).decode().strip(); + if git_top == ".git": + return "."; + return os.path.dirname(git_top); + +def get_package_version(args): + """Return PACKAGE_VERSION from CMake""" + with open(os.path.join(args.SRC,"CMakeLists.txt")) as F: + for ln in F: + g = re.match(r'^set\(PACKAGE_VERSION "(.+)"\)',ln) + if g is None: + continue; + return g.group(1); + raise RuntimeError("Could not find version"); + +@contextmanager +def inDirectory(dir): + cdir = os.getcwd(); + try: + os.chdir(dir); + yield True; + finally: + os.chdir(cdir); + +@contextmanager +def private_tmp(): + """Simple version of Python 3's tempfile.TemporaryDirectory""" + dfn = tempfile.mkdtemp(); + try: + yield dfn; + finally: + shutil.rmtree(dfn); + +# ------------------------------------------------------------------------- + +def get_symbol_vers(fn,exported=True): + """Return the symbol version suffixes from the ELF file, eg IB_VERBS_1.0, etc""" + syms = subprocess.check_output(["readelf","--wide","-s",fn]).decode(); + + go = False; + res = set(); + for I in syms.splitlines(): + if I.startswith("Symbol table '.dynsym'"): + go = True; + continue; + + if I.startswith(" ") and go: + itms = I.split(); + if exported: + if (len(itms) == 8 and itms[3] == "OBJECT" and + itms[4] == "GLOBAL" and itms[6] == "ABS"): + res.add(itms[7]); + else: + if (len(itms) >= 8 and itms[3] == "FUNC" and + itms[4] == "GLOBAL" and itms[6] == "UND"): + res.add(itms[7]); + else: + go = False; + if not res: + raise ValueError("Failed to read ELF symbol versions from %r"%(fn)); + return res; + +def check_lib_symver(args,fn): + g = re.match(r"lib([^.]+)\.so\.(\d+)\.(\d+)\.(.*)",fn); + if g.group(4) != args.PACKAGE_VERSION: + raise ValueError("Shared Library filename %r does not have the package version %r (%r)%"( + fn,args.PACKAGE_VERSION,g.groups())); + + # umad/etc used the wrong symbol version name when they moved to soname 3.0 + if g.group(1) == "ibumad": + newest_symver = "%s_%s.%s"%(g.group(1).upper(),'1',g.group(3)); + elif g.group(1) == "ibmad": + newest_symver = "%s_%s.%s"%(g.group(1).upper(),'1',g.group(3)); + elif g.group(1) == "ibnetdisc": + newest_symver = "%s_%s.%s"%(g.group(1).upper(),'1',g.group(3)); + else: + newest_symver = "%s_%s.%s"%(g.group(1).upper(),g.group(2),g.group(3)); + + syms = get_symbol_vers(fn); + if newest_symver not in syms: + raise ValueError("Symbol version %r implied by filename %r not in ELF (%r)"%( + newest_symver,fn,syms)); + + # The private symbol tag should also be older than the package version + private = set(I for I in syms if "PRIVATE" in I) + if len(private) > 1: + raise ValueError("Too many private symbol versions in ELF %r (%r)"%(fn,private)); + if private: + private_rel = list(private)[0].split('_')[-1]; + if private_rel > args.PACKAGE_VERSION: + raise ValueError("Private Symbol Version 
%r is newer than the package version %r"%( + private,args.PACKAGE_VERSION)); + + syms = list(syms - private); + syms.sort(key=LooseVersion) + if newest_symver != syms[-1]: + raise ValueError("Symbol version %r implied by filename %r not the newest in ELF (%r)"%( + newest_symver,fn,syms)); + +def test_lib_names(args): + """Check that the library filename matches the symbol versions""" + libd = os.path.join(args.BUILD,"lib"); + + # List of shlibs that follow the ABI guidelines + libs = {}; + with inDirectory(libd): + for fn in os.listdir("."): + if os.path.islink(fn): + lfn = os.readlink(fn); + if not os.path.islink(lfn): + check_lib_symver(args,lfn); +# ------------------------------------------------------------------------- + +def check_verbs_abi(args,fn): + g = re.match(r"lib([^-]+)-rdmav(\d+).so",fn); + if g is None: + raise ValueError("Provider library has unknown file name format %r"%(fn)); + + private_ver = int(g.group(2)); + syms = get_symbol_vers(fn,exported=False); + syms = {I.partition("@")[2] for I in syms}; + assert "IBVERBS_PRIVATE_%u"%(private_ver) in syms; + assert len([I for I in syms if I.startswith("IBVERBS_PRIVATE")]) == 1; + +def test_verbs_private(args): + """Check that the IBVERBS_PRIVATE symbols match the library name, eg that the + map file and the cmake stuff are in sync.""" + libd = os.path.join(args.BUILD,"lib"); + with inDirectory(libd): + for fn in os.listdir("."): + if not os.path.islink(fn) and "rdmav" in fn and fn.endswith(".so"): + check_verbs_abi(args,fn); + +# ------------------------------------------------------------------------- + +def check_abi(args,fn): + g1 = re.match(r"lib([^.]+).so\.(.+)\.(.+)",fn); + g2 = re.match(r"lib([^.]+).so\.(.+\..+)",fn); + if g1 is None or g2 is None: + raise ValueError("Library has unknown file name format %r"%(fn)); + + ref_fn = os.path.join(args.SRC,"ABI",g1.group(1) + ".dump"); + cur_fn = os.path.join(args.SRC,"ABI","current-" + g1.group(1) + ".dump"); + subprocess.check_call(["abi-dumper", + "-lver",g2.group(1), + fn, + "-o",cur_fn]); + + if not os.path.exists(ref_fn): + print("ABI file does not exist for %r"%(ref_fn), file=sys.stderr); + return False; + + subprocess.check_call(["abi-compliance-checker", + "-l",g1.group(1), + "-old",ref_fn, + "-new",cur_fn]); + + return True; + +def test_verbs_uapi(args): + """Compare the ABI output from 'abi-dumper' between what is present in git and + what was built in this tree. 
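+    For libibverbs, say, the reference dump is ABI/ibverbs.dump and the fresh
+    one is written to ABI/current-ibverbs.dump (names per the regexes in
+    check_abi() above).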
This allows us to detect changes in ABI on + the -stable branch.""" + + # User must provide the ABI dir in the source tree + if not os.path.isdir(os.path.join(args.SRC,"ABI")): + print("ABI check skipped, no ABI/ directory."); + return; + + libd = os.path.join(args.BUILD,"lib"); + success = True; + with inDirectory(libd): + for fn in os.listdir("."): + if not os.path.islink(fn) and re.match(r"lib.+\.so\..+\..+",fn): + success = success & check_abi(args,fn); + + assert success == True; + +# ------------------------------------------------------------------------- + +def is_obsolete(fn): + """True if the header is obsolete and should not be compiled anyhow.""" + with open(fn) as F: + for ln in F.readlines(): + if re.search(r"#warning.*This header is obsolete",ln): + return True; + return False; + +def is_fixup(fn): + """True if this is a fixup header, fixup headers are exempted because they + required includes are not the same for kernel headers (eg netinet/in.h)""" + if os.path.islink(fn): + return "buildlib/fixup-include/" in os.readlink(fn); + return False; + +def get_headers(incdir): + includes = set(); + for root,dirs,files in os.walk(incdir): + for I in files: + if I.endswith(".h"): + includes.add(os.path.join(root,I)); + return includes; + +def compile_test_headers(tmpd,incdir,includes,with_cxx=False): + cppflags = subprocess.check_output(["pkg-config","libnl-3.0","--cflags-only-I"]).decode().strip(); + cppflags = "-I %s %s"%(incdir,cppflags) + with open(os.path.join(tmpd,"build.ninja"),"wt") as F: + print("rule comp", file=F); + print(" command = %s -Werror -c %s $in -o $out"%(args.CC,cppflags), file=F); + print(" description=Header check for $in", file=F); + print("rule comp_cxx", file=F); + print(" command = %s -Werror -c %s $in -o $out"%(args.CXX,cppflags), file=F); + print(" description=Header C++ check for $in", file=F); + count = 0; + for I in sorted(includes): + if is_obsolete(I) or is_fixup(I): + continue; + print("build %s : comp %s"%("out%d.o"%(count),I), file=F); + print("default %s"%("out%d.o"%(count)), file=F); + print("build %s : comp_cxx %s"%("outxx%d.o"%(count),I), file=F); + if with_cxx: + print("default %s"%("outxx%d.o"%(count)), file=F); + count = count + 1; + subprocess.check_call(["ninja"],cwd=tmpd); + +def test_published_headers(args): + """Test that every header file can be included on its own, and has no obvious + implicit dependencies. 
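+    Each header is compiled on its own, roughly as
+        cc -Werror -c -I <incdir> <header> -o outN.o
+    via the small ninja file that compile_test_headers() writes.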
This is intended as a first pass check of the public + installed API headers""" + incdir = os.path.abspath(os.path.join(args.BUILD,"include")); + includes = get_headers(incdir); + + # Make a little ninja file to compile each header + with private_tmp() as tmpd: + compile_test_headers(tmpd,incdir,includes); + +# ------------------------------------------------------------------------- + +allowed_uapi_headers = { + # This header is installed in all supported distributions + "rdma/ib_user_sa.h", + "rdma/ib_user_verbs.h", +} + +non_cxx_headers = { + "infiniband/arch.h", + "infiniband/ib.h", + "infiniband/ib_user_ioctl_verbs.h", + "infiniband/ibnetdisc_osd.h", + "infiniband/mad_osd.h", + "infiniband/mlx5_api.h", + "infiniband/mlx5_user_ioctl_verbs.h", + "infiniband/opcode.h", + "infiniband/sa-kern-abi.h", + "infiniband/sa.h", + "infiniband/verbs_api.h", + "rdma/rdma_cma_abi.h", +} + +def test_installed_headers(args): + """This test also checks that the public headers can be compiled on their own, + but goes further and confirms that the public headers do not depend on any + internal headers, or kernel kAPI headers.""" + with private_tmp() as tmpd: + env = copy.deepcopy(os.environ); + env["DESTDIR"] = tmpd; + subprocess.check_output(["ninja","install"],env=env,cwd=args.BUILD); + + includes = get_headers(tmpd); + incdir = os.path.commonprefix(list(includes)); + rincludes = {I[len(incdir):] for I in includes}; + + bincdir = os.path.abspath(os.path.join(args.BUILD,"include")); + all_includes = set(); + for I in get_headers(bincdir): + if not is_fixup(I) and not is_obsolete(I): + all_includes.add(I[len(bincdir)+1:]); + + # Drop error includes for any include file that is internal, this way + # when we compile the public headers any include of an internal header + # will fail. + for I in sorted(all_includes - rincludes): + if I in allowed_uapi_headers: + continue; + + I = os.path.join(incdir,I) + dfn = os.path.dirname(I); + if not os.path.isdir(dfn): + os.makedirs(dfn); + assert not os.path.exists(I); + with open(I,"w") as F: + print('#error "Private internal header"', file=F); + + # Roughly check that the headers have the extern "C" for C++ + # compilation. 
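+        # i.e. every public header (minus the non_cxx_headers exemptions) is
+        # expected to carry the usual guard:
+        #   #ifdef __cplusplus
+        #   extern "C" {
+        #   #endif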
+ for I in sorted(rincludes - non_cxx_headers): + with open(os.path.join(incdir,I)) as F: + if 'extern "C" {' not in F.read(): + raise ValueError("No extern C in %r"%(I)); + + compile_test_headers(tmpd,incdir,includes,with_cxx=True); + +# ------------------------------------------------------------------------- + + +def get_symbol_names(fn): + """Return the defined, public, symbols from a ELF shlib""" + syms = subprocess.check_output(["readelf", "--wide", "-s", fn]).decode() + go = False + res = set() + for I in syms.splitlines(): + if I.startswith("Symbol table '.dynsym'"): + go = True + continue + + if I.startswith(" ") and go: + g = re.match( + r"\s+\d+:\s+[0-9a-f]+\s+\d+.*(?:FUNC|OBJECT)\s+GLOBAL\s+DEFAULT\s+\d+\s+(\S+)@@(\S+)$", + I) + if not g or "PRIVATE" in g.group(2): + continue + res.add(g.group(1)) + else: + go = False + + return res + + +def get_cc_args_from_pkgconfig(args, name, static): + """Get the compile arguments from pkg-config for the named librarary""" + os.environ["PKG_CONFIG_PATH"] = os.path.join(args.BUILD, "lib", + "pkgconfig") + flags = ["pkg-config", "--errors-to-stdout", "--cflags", "--libs"] + if static: + flags.append("--static") + opts = subprocess.check_output(flags + ["lib" + name]).decode() + opts = shlex.split(opts) + + opts.insert(0, "-Wall") + opts.insert(0, "-Werror") + opts.insert(0, "-L%s" % (os.path.join(args.BUILD, "lib"))) + opts.insert(1, "-I%s" % (os.path.join(args.BUILD, "include"))) + if not static: + return opts + + # Only static link the pkg-config stuff, otherwise we get warnings about + # static linking portions of glibc that need NSS. + opts.insert(0, "-Wl,-Bstatic") + opts.append("-Wl,-Bdynamic") + + # We need this extra libpthread/m because libnl's pkgconfig file is + # broken and doesn't include the private libraries it requires. 
:( + if "-lnl-3" in opts: + opts.append("-lm") + opts.append("-lpthread") + + # Put glibc associated libraries out side the static link section, + if "-lpthread" in opts: + while "-lpthread" in opts: + opts.remove("-lpthread") + opts.append("-lpthread") + if "-lm" in opts: + while "-lm" in opts: + opts.remove("-lm") + opts.append("-lm") + return opts + + +def compile_ninja(args, Fninja, name, cfn, opts): + print(""" +rule comp_{name} + command = {CC} -Wall -o $out $in {opts} + description = Compile and link $out +build {name} : comp_{name} {cfn} +default {name}""".format( + name=name, + CC=args.CC, + cfn=cfn, + opts=" ".join(pipes.quote(I) for I in opts)), file=Fninja) + + +def get_providers(args): + """Return a list of provider names""" + return set( + I for I in os.listdir(os.path.join(args.SRC, "providers")) + if not I.startswith(".")) + + +def check_static_lib(args, tmpd, Fninja, static_lib, shared_lib, name): + syms = get_symbol_names(shared_lib) + if not syms: + return + + cfn = os.path.join(tmpd, "%s-test.c" % (name)) + with open(cfn, "wt") as F: + F.write("#include <stdio.h>\n") + for I in syms: + F.write("extern void %s(void);\n" % (I)) + F.write("int main(int argc,const char *argv[]) {\n") + for I in syms: + F.write('printf("%%p",&%s);\n' % (I)) + F.write("return 0; }\n") + + compile_ninja(args, Fninja, "%s-static-out" % (name), cfn, + get_cc_args_from_pkgconfig(args, name, static=True)) + compile_ninja(args, Fninja, "%s-shared-out" % (name), cfn, + get_cc_args_from_pkgconfig(args, name, static=False)) + + +def check_static_providers(args, tmpd, Fninja): + """Test that expected values for RDMA_STATIC_PROVIDERS are accepted and the + link works""" + cfn = os.path.join(tmpd, "provider-test.c") + with open(cfn, "wt") as F: + F.write("#include <infiniband/verbs.h>\n") + F.write("int main(int argc,const char *argv[]) {\n") + F.write('ibv_get_device_list(NULL);\n') + F.write("return 0; }\n") + + opts = get_cc_args_from_pkgconfig( + args, "ibverbs", static=True) + + providers = get_providers(args) + for I in sorted(providers | { + "none", + "all", + }): + compile_ninja(args, Fninja, "providers-%s-static-out" % (I), cfn, + ["-DRDMA_STATIC_PROVIDERS=%s" % (I)] + opts) + + compile_ninja( + args, Fninja, "providers-static-out", cfn, + ["-DRDMA_STATIC_PROVIDERS=%s" % (",".join(providers))] + opts) + + +def test_static_libs(args): + """Compile then link statically and dynamically a dummy program that touches + every symbol in the libraries using pkgconfig output to guide the link + options. 
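+    The probe that check_static_lib() writes just takes the address of every
+    exported symbol (shown here for a made-up symbol name):
+        extern void ibv_foo(void);  printf("%p", &ibv_foo);
+    and is compiled once with the --static flags and once without.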
This tests that pkgconfig is setup properly and that all the + magic with incorporating the internal libraries for static linking has + done its job.""" + libd = os.path.join(args.BUILD, "lib") + success = True + libs = [] + with inDirectory(libd): + fns = set(fn for fn in os.listdir(".") if not os.path.islink(fn)) + for static_lib in fns: + g = re.match(r"lib(.+)\.a$", static_lib) + if g: + for shared_lib in fns: + if re.match(r"lib%s.*\.so" % (g.group(1)), shared_lib): + libs.append((os.path.join(libd, static_lib), + os.path.join(libd, shared_lib), + g.group(1))) + break + else: + raise ValueError( + "Failed to find matching shared library for %r" % + (static_lib)) + + with private_tmp() as tmpd: + with open(os.path.join(tmpd, "build.ninja"), "wt") as Fninja: + for I in libs: + check_static_lib(args, tmpd, Fninja, I[0], I[1], I[2]) + check_static_providers(args, tmpd, Fninja) + subprocess.check_call(["ninja"], cwd=tmpd) + + +# ------------------------------------------------------------------------- + +parser = argparse.ArgumentParser(description='Run build time tests') +parser.add_argument("--build",default=os.getcwd(),dest="BUILD", + help="Build directory to inpsect"); +parser.add_argument("--src",default=None,dest="SRC", + help="Top of the source tree"); +parser.add_argument("--cc",default="cc",dest="CC", + help="C compiler to use"); +parser.add_argument("--cxx",default="c++",dest="CXX", + help="C++ compiler to use"); +args = parser.parse_args(); + +if args.SRC is None: + args.SRC = get_src_dir(); +args.SRC = os.path.abspath(args.SRC); +args.PACKAGE_VERSION = get_package_version(args); + +funcs = globals(); +for k,v in list(funcs.items()): + if k.startswith("test_") and inspect.isfunction(v): + v(args); diff --git a/buildlib/config.h.in b/buildlib/config.h.in new file mode 100644 index 0000000..5f42d65 --- /dev/null +++ b/buildlib/config.h.in @@ -0,0 +1,77 @@ +#ifndef CONFIG_H_IN +#define CONFIG_H_IN + +#define HAVE_STATEMENT_EXPR 1 +#define HAVE_BUILTIN_TYPES_COMPATIBLE_P 1 +#define HAVE_TYPEOF 1 +#define HAVE_ISBLANK 1 +#define HAVE_BUILTIN_CLZ 1 +#define HAVE_BUILTIN_CLZL 1 + +#define PACKAGE_VERSION "@PACKAGE_VERSION@" + +// FIXME: Remove this, The cmake version hard-requires new style CLOEXEC support +#define STREAM_CLOEXEC "e" + +#define RDMA_CDEV_DIR "/dev/infiniband" + +#define IBV_CONFIG_DIR "@CONFIG_DIR@" +#define RS_CONF_DIR "@CMAKE_INSTALL_FULL_SYSCONFDIR@/rdma/rsocket" +#define IWPM_CONFIG_FILE "@CMAKE_INSTALL_FULL_SYSCONFDIR@/iwpmd.conf" + +#define SRP_DEAMON_CONFIG_FILE "@CMAKE_INSTALL_FULL_SYSCONFDIR@/srp_daemon.conf" +#define SRP_DEAMON_LOCK_PREFIX "@CMAKE_INSTALL_FULL_RUNDIR@/srp_daemon" + +#define ACM_CONF_DIR "@CMAKE_INSTALL_FULL_SYSCONFDIR@/rdma" +#define IBACM_LIB_PATH "@ACM_PROVIDER_DIR@" +#define IBACM_BIN_PATH "@CMAKE_INSTALL_FULL_BINDIR@" +#define IBACM_PID_FILE "@CMAKE_INSTALL_FULL_RUNDIR@/ibacm.pid" +#define IBACM_PORT_BASE "ibacm-tcp.port" +#define IBACM_IBACME_PORT_FILE "@CMAKE_INSTALL_FULL_RUNDIR@/" IBACM_PORT_BASE +#define IBACM_PORT_FILE "@CMAKE_INSTALL_FULL_RUNDIR@/ibacm.port" +#define IBACM_LOG_FILE "@CMAKE_INSTALL_FULL_LOCALSTATEDIR@/log/ibacm.log" +#define IBACM_SERVER_BASE "ibacm-unix.sock" +#define IBACM_IBACME_SERVER_PATH "@CMAKE_INSTALL_FULL_RUNDIR@/" IBACM_SERVER_BASE +#define IBACM_SERVER_PATH "@CMAKE_INSTALL_FULL_RUNDIR@/ibacm.sock" + +#define IBDIAG_CONFIG_PATH "@IBDIAG_CONFIG_PATH@" +#define IBDIAG_NODENAME_MAP_PATH "@IBDIAG_NODENAME_MAP_PATH@" + +#define VERBS_PROVIDER_DIR "@VERBS_PROVIDER_DIR@" +#define VERBS_PROVIDER_SUFFIX 
"@IBVERBS_PROVIDER_SUFFIX@" +#define IBVERBS_PABI_VERSION @IBVERBS_PABI_VERSION@ + +// FIXME This has been supported in compilers forever, we should just fail to build on such old systems. +#cmakedefine HAVE_FUNC_ATTRIBUTE_ALWAYS_INLINE 1 + +#cmakedefine HAVE_FUNC_ATTRIBUTE_IFUNC 1 + +#cmakedefine HAVE_WORKING_IF_H 1 + +// Operating mode for symbol versions +#cmakedefine HAVE_FULL_SYMBOL_VERSIONS 1 +#cmakedefine HAVE_LIMITED_SYMBOL_VERSIONS 1 + +@SIZEOF_LONG_CODE@ + +#if @IOCTL_MODE_NUM@ == 1 +# define VERBS_IOCTL_ONLY 1 +# define VERBS_WRITE_ONLY 0 +#elif @IOCTL_MODE_NUM@ == 2 +# define VERBS_IOCTL_ONLY 0 +# define VERBS_WRITE_ONLY 1 +#elif @IOCTL_MODE_NUM@ == 3 +# define VERBS_IOCTL_ONLY 0 +# define VERBS_WRITE_ONLY 0 +#endif + +// Configuration defaults + +#define IBACM_SERVER_MODE_UNIX 0 +#define IBACM_SERVER_MODE_LOOP 1 +#define IBACM_SERVER_MODE_OPEN 2 +#define IBACM_SERVER_MODE_DEFAULT @IBACM_SERVER_MODE_DEFAULT@ + +#define IBACM_ACME_PLUS_KERNEL_ONLY_DEFAULT @IBACM_ACME_PLUS_KERNEL_ONLY_DEFAULT@ + +#endif diff --git a/buildlib/const_structs.checkpatch b/buildlib/const_structs.checkpatch new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/buildlib/const_structs.checkpatch diff --git a/buildlib/fixup-include/assert.h b/buildlib/fixup-include/assert.h new file mode 100644 index 0000000..848c931 --- /dev/null +++ b/buildlib/fixup-include/assert.h @@ -0,0 +1,10 @@ +#ifndef _FIXUP_ASSERT_H +#define _FIXUP_ASSERT_H + +#include_next <assert.h> + +/* Without C11 compiler support it is not possible to implement static_assert */ +#undef static_assert +#define static_assert(_cond, msg) + +#endif diff --git a/buildlib/fixup-include/linux-in.h b/buildlib/fixup-include/linux-in.h new file mode 100644 index 0000000..07fdbda --- /dev/null +++ b/buildlib/fixup-include/linux-in.h @@ -0,0 +1,2 @@ +/* if in.h can't be included just leave it empty */ +#include <netinet/in.h> diff --git a/buildlib/fixup-include/linux-in6.h b/buildlib/fixup-include/linux-in6.h new file mode 100644 index 0000000..87965b3 --- /dev/null +++ b/buildlib/fixup-include/linux-in6.h @@ -0,0 +1,2 @@ +/* if in6.h can't be included just leave it empty */ +#include <netinet/in.h> diff --git a/buildlib/fixup-include/netlink-attr.h b/buildlib/fixup-include/netlink-attr.h new file mode 100644 index 0000000..8c071b9 --- /dev/null +++ b/buildlib/fixup-include/netlink-attr.h @@ -0,0 +1,267 @@ +#ifndef _FIXUP_NETLINK_ATTR_H +#define _FIXUP_NETLINK_ATTR_H + +#include <linux/netlink.h> + +#include <netdb.h> +#include <stdint.h> +#include <stdlib.h> + +struct nlmsghdr; +struct nl_msg; +struct nl_sock; +struct nlattr; +struct nl_cb; +struct sockaddr_nl; +struct nlmsgerr; +struct nl_addr; +struct nl_cache; +struct nl_object; + +typedef int (*nl_recvmsg_msg_cb_t)(struct nl_msg *msg, void *arg); +typedef int (*nl_recvmsg_err_cb_t)(struct sockaddr_nl *nla, + struct nlmsgerr *nlerr, void *arg); + +struct nla_policy { + int type; +}; + +enum { + NLA_U8, + NLA_U32, + NLA_U64, + NL_AUTO_PORT, + NL_AUTO_SEQ, + NL_STOP, + NL_OK, + NL_CB_DEFAULT, + NL_CB_VALID, + NL_CB_CUSTOM, + NLE_PARSE_ERR, + NLE_NOMEM, +}; + +static inline struct nl_sock *nl_socket_alloc(void) +{ + return NULL; +} + +static inline int nl_connect(struct nl_sock *sk, int kind) +{ + return -1; +} + +static inline void nl_socket_free(struct nl_sock *sk) +{ +} + +static inline void nl_socket_disable_auto_ack(struct nl_sock *sk) +{ +} + +static inline void nl_socket_disable_msg_peek(struct nl_sock *sk) +{ +} + +static inline void nl_socket_disable_seq_check(struct nl_sock *sk) +{ 
+} + +static inline int nl_socket_get_fd(struct nl_sock *sk) +{ + return -1; +} + +static inline int nl_socket_add_membership(struct nl_sock *sk, int group) +{ + return -1; +} + +static inline struct nlmsghdr *nlmsg_put(struct nl_msg *msg, uint32_t pid, + uint32_t seq, int type, int payload, + int flags) +{ + return NULL; +} + +static inline struct nl_msg *nlmsg_alloc(void) +{ + return NULL; +} + +static inline struct nl_msg *nlmsg_alloc_simple(int nlmsgtype, int flags) + +{ + return NULL; +} + +static inline void nlmsg_free(struct nl_msg *msg) +{ +} + +static inline int nl_send_auto(struct nl_sock *sk, struct nl_msg *msg) +{ + return -1; +} + +static inline struct nlmsghdr *nlmsg_hdr(struct nl_msg *msg) +{ + return NULL; +} + +static inline int nlmsg_parse(struct nlmsghdr *nlh, int hdrlen, + struct nlattr *tb[], int maxtype, + struct nla_policy *policy) +{ + return -1; +} + +static inline int nl_msg_parse(struct nl_msg *msg, + void (*cb)(struct nl_object *, void *), + void *arg) +{ + return -1; +} + +static inline int nlmsg_append(struct nl_msg *n, void *data, size_t len, + int pad) +{ + return -1; +} + +static inline int nl_send_simple(struct nl_sock *sk, int type, int flags, + void *buf, size_t size) +{ + return -1; +} + +static inline int nl_recvmsgs(struct nl_sock *sk, struct nl_cb *cb) +{ + return -1; +} + +static inline int nl_recvmsgs_default(struct nl_sock *sk) +{ + return -1; +} + +static inline struct nl_cb *nl_cb_alloc(int kind) +{ + return NULL; +} + +static inline int nl_cb_set(struct nl_cb *cb, int type, int kind, + nl_recvmsg_msg_cb_t func, void *arg) +{ + return -1; +} + +static inline int nl_socket_modify_err_cb(struct nl_sock *sk, int kind, + nl_recvmsg_err_cb_t func, void *arg) +{ + return -1; +} + +static inline int nl_socket_modify_cb(struct nl_sock *sk, int type, int kind, + nl_recvmsg_msg_cb_t func, void *arg) +{ + return -1; +} + +#define NLA_PUT_U32(msg, attrtype, value) ({ goto nla_put_failure; }) +#define NLA_PUT_STRING(msg, attrtype, value) ({ goto nla_put_failure; }) +#define NLA_PUT_ADDR(msg, attrtype, value) ({ goto nla_put_failure; }) + +static inline const char *nla_get_string(struct nlattr *tb) +{ + return NULL; +} + +static inline uint8_t nla_get_u8(struct nlattr *tb) +{ + return 0; +} + +static inline uint32_t nla_get_u32(struct nlattr *tb) +{ + return 0; +} + +static inline uint64_t nla_get_u64(struct nlattr *tb) +{ + return 0; +} + +static inline struct nl_addr *nl_addr_clone(struct nl_addr *src) +{ + return NULL; +} + +static inline int nl_addr_info(struct nl_addr *addr, struct addrinfo **result) +{ + return -1; +} + +static inline struct nl_addr *nl_addr_build(int family, void *buf, size_t size) +{ + return NULL; +} + +static inline unsigned int nl_addr_get_len(struct nl_addr *addr) +{ + return 0; +} + +static inline void *nl_addr_get_binary_addr(struct nl_addr *addr) +{ + return NULL; +} + +static inline int nl_addr_get_family(struct nl_addr *addr) +{ + return -1; +} + +static inline int nl_addr_get_prefixlen(struct nl_addr *addr) +{ + return -1; +} + +static inline int nl_addr_fill_sockaddr(struct nl_addr *addr, + struct sockaddr *sa, socklen_t *salen) +{ + return -1; +} + +static inline void nl_addr_put(struct nl_addr *addr) +{ +} + +static inline void nl_addr_set_prefixlen(struct nl_addr *addr, int prefixlen) +{ +} + +static inline void nl_cache_mngt_unprovide(struct nl_cache *cache) +{ +} + +static inline void nl_cache_free(struct nl_cache *cache) +{ +} + +static inline int nl_object_match_filter(struct nl_object *obj, + struct nl_object 
*filter)
+{
+	return -1;
+}
+
+static inline int nl_cache_refill(struct nl_sock *sk, struct nl_cache *cache)
+{
+	return -1;
+}
+
+static inline void nl_cache_mngt_provide(struct nl_cache *cache)
+{
+}
+
+#endif
diff --git a/buildlib/fixup-include/netlink-msg.h b/buildlib/fixup-include/netlink-msg.h
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/buildlib/fixup-include/netlink-msg.h
diff --git a/buildlib/fixup-include/netlink-netlink.h b/buildlib/fixup-include/netlink-netlink.h
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/buildlib/fixup-include/netlink-netlink.h
diff --git a/buildlib/fixup-include/netlink-object-api.h b/buildlib/fixup-include/netlink-object-api.h
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/buildlib/fixup-include/netlink-object-api.h
diff --git a/buildlib/fixup-include/netlink-route-link-vlan.h b/buildlib/fixup-include/netlink-route-link-vlan.h
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/buildlib/fixup-include/netlink-route-link-vlan.h
diff --git a/buildlib/fixup-include/netlink-route-link.h b/buildlib/fixup-include/netlink-route-link.h
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/buildlib/fixup-include/netlink-route-link.h
diff --git a/buildlib/fixup-include/netlink-route-neighbour.h b/buildlib/fixup-include/netlink-route-neighbour.h
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/buildlib/fixup-include/netlink-route-neighbour.h
diff --git a/buildlib/fixup-include/netlink-route-route.h b/buildlib/fixup-include/netlink-route-route.h
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/buildlib/fixup-include/netlink-route-route.h
diff --git a/buildlib/fixup-include/netlink-route-rtnl.h b/buildlib/fixup-include/netlink-route-rtnl.h
new file mode 100644
index 0000000..e594d1a
--- /dev/null
+++ b/buildlib/fixup-include/netlink-route-rtnl.h
@@ -0,0 +1,114 @@
+#ifndef _FIXUP_NETLINK_ROUTE_RTNL_H
+#define _FIXUP_NETLINK_ROUTE_RTNL_H
+
+#include <netlink/attr.h>
+
+struct rtnl_addr;
+struct rtnl_neigh;
+struct rtnl_route;
+struct rtnl_nexthop;
+
+static inline struct rtnl_neigh *
+rtnl_neigh_get(struct nl_cache *cache, int ifindex, struct nl_addr *dst)
+{
+	return NULL;
+}
+
+static inline struct rtnl_link *rtnl_link_get(struct nl_cache *cache,
+					      int ifindex)
+{
+	return NULL;
+}
+
+static inline void rtnl_neigh_put(struct rtnl_neigh *neigh)
+{
+}
+
+static inline int rtnl_addr_get_family(struct rtnl_addr *addr)
+{
+	return -1;
+}
+
+static inline struct nl_addr *rtnl_neigh_get_lladdr(struct rtnl_neigh *neigh)
+{
+	return NULL;
+}
+
+static inline struct rtnl_neigh *rtnl_neigh_alloc(void)
+{
+	return NULL;
+}
+
+static inline void rtnl_neigh_set_ifindex(struct rtnl_neigh *neigh, int ifindex)
+{
+}
+static inline int rtnl_neigh_set_dst(struct rtnl_neigh *neigh,
+				     struct nl_addr *addr)
+{
+	return -1;
+}
+
+static inline uint8_t rtnl_route_get_type(struct rtnl_route *route)
+{
+	return 0;
+}
+
+static inline struct nl_addr *rtnl_route_get_pref_src(struct rtnl_route *route)
+{
+	return NULL;
+}
+
+static inline struct rtnl_nexthop *rtnl_route_nexthop_n(struct rtnl_route *r,
+							int n)
+{
+	return NULL;
+}
+
+static inline int rtnl_route_nh_get_ifindex(struct rtnl_nexthop *nh)
+{
+	return -1;
+}
+
+static inline struct nl_addr *rtnl_route_nh_get_gateway(struct rtnl_nexthop *nh)
+{
+	return NULL;
+}
+
+static inline int rtnl_link_alloc_cache(struct nl_sock *sk, int family,
+					struct nl_cache **result)
+{
+	return -1;
+}
+
+static inline struct nl_addr *rtnl_link_get_addr(struct 
rtnl_link *link) +{ + return NULL; +} + +static inline int rtnl_link_vlan_get_id(struct rtnl_link *link) +{ + return -1; +} + +static inline void rtnl_link_put(struct rtnl_link *link) +{ +} + +static inline int rtnl_link_is_vlan(struct rtnl_link *link) +{ + return -1; +} + +static inline int rtnl_route_alloc_cache(struct nl_sock *sk, int family, + int flags, struct nl_cache **result) +{ + return -1; +} + +static inline int rtnl_neigh_alloc_cache(struct nl_sock *sock, + struct nl_cache **result) +{ + return -1; +} + +#endif diff --git a/buildlib/fixup-include/stdatomic.h b/buildlib/fixup-include/stdatomic.h new file mode 100644 index 0000000..6af810f --- /dev/null +++ b/buildlib/fixup-include/stdatomic.h @@ -0,0 +1,369 @@ +/* + * An implementation of C11 stdatomic.h directly borrowed from FreeBSD + * (original copyright follows), with minor modifications for + * portability to other systems. Works for recent Clang (that + * implement the feature c_atomic) and GCC 4.7+; includes + * compatibility for GCC below 4.7 but I wouldn't recommend it. + * + * Caveats and limitations: + * - Only the ``_Atomic parentheses'' notation is implemented, while + * the ``_Atomic space'' one is not. + * - _Atomic types must be typedef'ed, or programs using them will + * not type check correctly (incompatible anonymous structure + * types). + * - Non-scalar _Atomic types would require runtime support for + * runtime locking, which, as far as I know, is not currently + * available on any system. + */ + +/*- + * Copyright (c) 2011 Ed Schouten <ed@FreeBSD.org> + * David Chisnall <theraven@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: src/include/stdatomic.h,v 1.10.2.2 2012/05/30 19:21:54 theraven Exp $ + */ + +#ifndef _STDATOMIC_H_ +#define _STDATOMIC_H_ + +#include <stddef.h> +#include <stdint.h> + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif +#if !defined(__has_builtin) +#define __has_builtin(x) 0 +#endif +#if !defined(__GNUC_PREREQ__) +#if defined(__GNUC__) && defined(__GNUC_MINOR__) +#define __GNUC_PREREQ__(maj, min) \ + ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min)) +#else +#define __GNUC_PREREQ__(maj, min) 0 +#endif +#endif + +#if !defined(__CLANG_ATOMICS) && !defined(__GNUC_ATOMICS) +#if __has_feature(c_atomic) +#define __CLANG_ATOMICS +#elif __GNUC_PREREQ__(4, 7) +#define __GNUC_ATOMICS +#elif !defined(__GNUC__) +#error "stdatomic.h does not support your compiler" +#endif +#endif + +#if !defined(__CLANG_ATOMICS) +#define _Atomic(T) struct { volatile __typeof__(T) __val; } +#endif + +/* + * 7.17.2 Initialization. + */ + +#if defined(__CLANG_ATOMICS) +#define ATOMIC_VAR_INIT(value) (value) +#define atomic_init(obj, value) __c11_atomic_init(obj, value) +#else +#define ATOMIC_VAR_INIT(value) { .__val = (value) } +#define atomic_init(obj, value) do { \ + (obj)->__val = (value); \ +} while (0) +#endif + +/* + * Clang and recent GCC both provide predefined macros for the memory + * orderings. If we are using a compiler that doesn't define them, use the + * clang values - these will be ignored in the fallback path. + */ + +#ifndef __ATOMIC_RELAXED +#define __ATOMIC_RELAXED 0 +#endif +#ifndef __ATOMIC_CONSUME +#define __ATOMIC_CONSUME 1 +#endif +#ifndef __ATOMIC_ACQUIRE +#define __ATOMIC_ACQUIRE 2 +#endif +#ifndef __ATOMIC_RELEASE +#define __ATOMIC_RELEASE 3 +#endif +#ifndef __ATOMIC_ACQ_REL +#define __ATOMIC_ACQ_REL 4 +#endif +#ifndef __ATOMIC_SEQ_CST +#define __ATOMIC_SEQ_CST 5 +#endif + +/* + * 7.17.3 Order and consistency. + * + * The memory_order_* constants that denote the barrier behaviour of the + * atomic operations. + */ + +enum memory_order { + memory_order_relaxed = __ATOMIC_RELAXED, + memory_order_consume = __ATOMIC_CONSUME, + memory_order_acquire = __ATOMIC_ACQUIRE, + memory_order_release = __ATOMIC_RELEASE, + memory_order_acq_rel = __ATOMIC_ACQ_REL, + memory_order_seq_cst = __ATOMIC_SEQ_CST +}; + +typedef enum memory_order memory_order; + +/* + * 7.17.4 Fences. + */ + +#ifdef __CLANG_ATOMICS +#define atomic_thread_fence(order) __c11_atomic_thread_fence(order) +#define atomic_signal_fence(order) __c11_atomic_signal_fence(order) +#elif defined(__GNUC_ATOMICS) +#define atomic_thread_fence(order) __atomic_thread_fence(order) +#define atomic_signal_fence(order) __atomic_signal_fence(order) +#else +#define atomic_thread_fence(order) __sync_synchronize() +#define atomic_signal_fence(order) __asm volatile ("" : : : "memory") +#endif + +/* + * 7.17.5 Lock-free property. + */ + +#if defined(__CLANG_ATOMICS) +#define atomic_is_lock_free(obj) \ + __c11_atomic_is_lock_free(sizeof(obj)) +#elif defined(__GNUC_ATOMICS) +#define atomic_is_lock_free(obj) \ + __atomic_is_lock_free(sizeof((obj)->__val)) +#else +#define atomic_is_lock_free(obj) \ + (sizeof((obj)->__val) <= sizeof(void *)) +#endif + +/* + * 7.17.6 Atomic integer types. 
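+ *
+ * For illustration (an addition to the borrowed header, not FreeBSD's):
+ * a typical use of the types below together with the operations from
+ * 7.17.7, here a reference count:
+ *
+ *	static atomic_int refcount = ATOMIC_VAR_INIT(1);
+ *
+ *	atomic_fetch_add_explicit(&refcount, 1, memory_order_relaxed);
+ *	if (atomic_fetch_sub_explicit(&refcount, 1, memory_order_acq_rel) == 1)
+ *		destroy_object();	(assumed destructor, example only)
+ *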
+ */ + +typedef _Atomic(_Bool) atomic_bool; +typedef _Atomic(char) atomic_char; +typedef _Atomic(signed char) atomic_schar; +typedef _Atomic(unsigned char) atomic_uchar; +typedef _Atomic(short) atomic_short; +typedef _Atomic(unsigned short) atomic_ushort; +typedef _Atomic(int) atomic_int; +typedef _Atomic(unsigned int) atomic_uint; +typedef _Atomic(long) atomic_long; +typedef _Atomic(unsigned long) atomic_ulong; +typedef _Atomic(long long) atomic_llong; +typedef _Atomic(unsigned long long) atomic_ullong; +#if 0 +typedef _Atomic(char16_t) atomic_char16_t; +typedef _Atomic(char32_t) atomic_char32_t; +#endif +typedef _Atomic(wchar_t) atomic_wchar_t; +typedef _Atomic(int_least8_t) atomic_int_least8_t; +typedef _Atomic(uint_least8_t) atomic_uint_least8_t; +typedef _Atomic(int_least16_t) atomic_int_least16_t; +typedef _Atomic(uint_least16_t) atomic_uint_least16_t; +typedef _Atomic(int_least32_t) atomic_int_least32_t; +typedef _Atomic(uint_least32_t) atomic_uint_least32_t; +typedef _Atomic(int_least64_t) atomic_int_least64_t; +typedef _Atomic(uint_least64_t) atomic_uint_least64_t; +typedef _Atomic(int_fast8_t) atomic_int_fast8_t; +typedef _Atomic(uint_fast8_t) atomic_uint_fast8_t; +typedef _Atomic(int_fast16_t) atomic_int_fast16_t; +typedef _Atomic(uint_fast16_t) atomic_uint_fast16_t; +typedef _Atomic(int_fast32_t) atomic_int_fast32_t; +typedef _Atomic(uint_fast32_t) atomic_uint_fast32_t; +typedef _Atomic(int_fast64_t) atomic_int_fast64_t; +typedef _Atomic(uint_fast64_t) atomic_uint_fast64_t; +typedef _Atomic(intptr_t) atomic_intptr_t; +typedef _Atomic(uintptr_t) atomic_uintptr_t; +typedef _Atomic(size_t) atomic_size_t; +typedef _Atomic(ptrdiff_t) atomic_ptrdiff_t; +typedef _Atomic(intmax_t) atomic_intmax_t; +typedef _Atomic(uintmax_t) atomic_uintmax_t; + +/* + * 7.17.7 Operations on atomic types. + */ + +/* + * Compiler-specific operations. 
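+ *
+ * Three interchangeable implementations follow: the __c11_atomic_*
+ * builtins for Clang, the __atomic_* builtins for GCC 4.7+, and a
+ * legacy fallback built on the older __sync_* primitives. The fallback
+ * ignores the memory_order argument and issues full barriers instead.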
+ */ + +#if defined(__CLANG_ATOMICS) +#define atomic_compare_exchange_strong_explicit(object, expected, \ + desired, success, failure) \ + __c11_atomic_compare_exchange_strong(object, expected, desired, \ + success, failure) +#define atomic_compare_exchange_weak_explicit(object, expected, \ + desired, success, failure) \ + __c11_atomic_compare_exchange_weak(object, expected, desired, \ + success, failure) +#define atomic_exchange_explicit(object, desired, order) \ + __c11_atomic_exchange(object, desired, order) +#define atomic_fetch_add_explicit(object, operand, order) \ + __c11_atomic_fetch_add(object, operand, order) +#define atomic_fetch_and_explicit(object, operand, order) \ + __c11_atomic_fetch_and(object, operand, order) +#define atomic_fetch_or_explicit(object, operand, order) \ + __c11_atomic_fetch_or(object, operand, order) +#define atomic_fetch_sub_explicit(object, operand, order) \ + __c11_atomic_fetch_sub(object, operand, order) +#define atomic_fetch_xor_explicit(object, operand, order) \ + __c11_atomic_fetch_xor(object, operand, order) +#define atomic_load_explicit(object, order) \ + __c11_atomic_load(object, order) +#define atomic_store_explicit(object, desired, order) \ + __c11_atomic_store(object, desired, order) +#elif defined(__GNUC_ATOMICS) +#define atomic_compare_exchange_strong_explicit(object, expected, \ + desired, success, failure) \ + __atomic_compare_exchange_n(&(object)->__val, expected, \ + desired, 0, success, failure) +#define atomic_compare_exchange_weak_explicit(object, expected, \ + desired, success, failure) \ + __atomic_compare_exchange_n(&(object)->__val, expected, \ + desired, 1, success, failure) +#define atomic_exchange_explicit(object, desired, order) \ + __atomic_exchange_n(&(object)->__val, desired, order) +#define atomic_fetch_add_explicit(object, operand, order) \ + __atomic_fetch_add(&(object)->__val, operand, order) +#define atomic_fetch_and_explicit(object, operand, order) \ + __atomic_fetch_and(&(object)->__val, operand, order) +#define atomic_fetch_or_explicit(object, operand, order) \ + __atomic_fetch_or(&(object)->__val, operand, order) +#define atomic_fetch_sub_explicit(object, operand, order) \ + __atomic_fetch_sub(&(object)->__val, operand, order) +#define atomic_fetch_xor_explicit(object, operand, order) \ + __atomic_fetch_xor(&(object)->__val, operand, order) +#define atomic_load_explicit(object, order) \ + __atomic_load_n(&(object)->__val, order) +#define atomic_store_explicit(object, desired, order) \ + __atomic_store_n(&(object)->__val, desired, order) +#else +#define atomic_compare_exchange_strong_explicit(object, expected, \ + desired, success, failure) ({ \ + __typeof__((object)->__val) __v; \ + _Bool __r; \ + __v = __sync_val_compare_and_swap(&(object)->__val, \ + *(expected), desired); \ + __r = *(expected) == __v; \ + *(expected) = __v; \ + __r; \ +}) + +#define atomic_compare_exchange_weak_explicit(object, expected, \ + desired, success, failure) \ + atomic_compare_exchange_strong_explicit(object, expected, \ + desired, success, failure) +#if __has_builtin(__sync_swap) +/* Clang provides a full-barrier atomic exchange - use it if available. */ +#define atomic_exchange_explicit(object, desired, order) \ + __sync_swap(&(object)->__val, desired) +#else +/* + * __sync_lock_test_and_set() is only an acquire barrier in theory (although in + * practice it is usually a full barrier) so we need an explicit barrier after + * it. 
+ */ +#define atomic_exchange_explicit(object, desired, order) ({ \ + __typeof__((object)->__val) __v; \ + __v = __sync_lock_test_and_set(&(object)->__val, desired); \ + __sync_synchronize(); \ + __v; \ +}) +#endif +#define atomic_fetch_add_explicit(object, operand, order) \ + __sync_fetch_and_add(&(object)->__val, operand) +#define atomic_fetch_and_explicit(object, operand, order) \ + __sync_fetch_and_and(&(object)->__val, operand) +#define atomic_fetch_or_explicit(object, operand, order) \ + __sync_fetch_and_or(&(object)->__val, operand) +#define atomic_fetch_sub_explicit(object, operand, order) \ + __sync_fetch_and_sub(&(object)->__val, operand) +#define atomic_fetch_xor_explicit(object, operand, order) \ + __sync_fetch_and_xor(&(object)->__val, operand) +#define atomic_load_explicit(object, order) \ + __sync_fetch_and_add(&(object)->__val, 0) +#define atomic_store_explicit(object, desired, order) do { \ + __sync_synchronize(); \ + (object)->__val = (desired); \ + __sync_synchronize(); \ +} while (0) +#endif + +/* + * Convenience functions. + */ + +#define atomic_compare_exchange_strong(object, expected, desired) \ + atomic_compare_exchange_strong_explicit(object, expected, \ + desired, memory_order_seq_cst, memory_order_seq_cst) +#define atomic_compare_exchange_weak(object, expected, desired) \ + atomic_compare_exchange_weak_explicit(object, expected, \ + desired, memory_order_seq_cst, memory_order_seq_cst) +#define atomic_exchange(object, desired) \ + atomic_exchange_explicit(object, desired, memory_order_seq_cst) +#define atomic_fetch_add(object, operand) \ + atomic_fetch_add_explicit(object, operand, memory_order_seq_cst) +#define atomic_fetch_and(object, operand) \ + atomic_fetch_and_explicit(object, operand, memory_order_seq_cst) +#define atomic_fetch_or(object, operand) \ + atomic_fetch_or_explicit(object, operand, memory_order_seq_cst) +#define atomic_fetch_sub(object, operand) \ + atomic_fetch_sub_explicit(object, operand, memory_order_seq_cst) +#define atomic_fetch_xor(object, operand) \ + atomic_fetch_xor_explicit(object, operand, memory_order_seq_cst) +#define atomic_load(object) \ + atomic_load_explicit(object, memory_order_seq_cst) +#define atomic_store(object, desired) \ + atomic_store_explicit(object, desired, memory_order_seq_cst) + +/* + * 7.17.8 Atomic flag type and operations. 
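+ *
+ * For illustration (again an addition, not in the FreeBSD original):
+ * a minimal spinlock sketch built on atomic_flag:
+ *
+ *	static atomic_flag lock = ATOMIC_FLAG_INIT;
+ *
+ *	while (atomic_flag_test_and_set(&lock))
+ *		;			(spin while the flag was already set)
+ *	... critical section ...
+ *	atomic_flag_clear(&lock);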
+ */
+
+typedef atomic_bool atomic_flag;
+
+#define ATOMIC_FLAG_INIT ATOMIC_VAR_INIT(0)
+
+#define atomic_flag_clear_explicit(object, order) \
+	atomic_store_explicit(object, 0, order)
+/* Implemented via exchange; the compare-exchange form would require a
+ * pointer for the expected value, so a literal 0 cannot be passed there. */
+#define atomic_flag_test_and_set_explicit(object, order) \
+	atomic_exchange_explicit(object, 1, order)
+
+#define atomic_flag_clear(object) \
+	atomic_flag_clear_explicit(object, memory_order_seq_cst)
+#define atomic_flag_test_and_set(object) \
+	atomic_flag_test_and_set_explicit(object, memory_order_seq_cst)
+
+#endif /* !_STDATOMIC_H_ */
diff --git a/buildlib/fixup-include/systemd-sd-daemon.h b/buildlib/fixup-include/systemd-sd-daemon.h
new file mode 100644
index 0000000..29cb70c
--- /dev/null
+++ b/buildlib/fixup-include/systemd-sd-daemon.h
@@ -0,0 +1,16 @@
+#define SD_LISTEN_FDS_START 3
+
+static inline int sd_listen_fds(int unset_environment)
+{
+	return 0;
+}
+
+static inline int sd_is_socket(int fd, int family, int type, int listening)
+{
+	return 0;
+}
+
+static inline int sd_notify(int unset_environment, const char *state)
+{
+	return 0;
+}
diff --git a/buildlib/fixup-include/valgrind-drd.h b/buildlib/fixup-include/valgrind-drd.h
new file mode 100644
index 0000000..9e491fc
--- /dev/null
+++ b/buildlib/fixup-include/valgrind-drd.h
@@ -0,0 +1,3 @@
+static inline void ANNOTATE_BENIGN_RACE_SIZED(const void *mem,size_t len,const char *desc) {}
+#define ANNOTATE_BENIGN_RACE_SIZED ANNOTATE_BENIGN_RACE_SIZED
+
diff --git a/buildlib/fixup-include/valgrind-memcheck.h b/buildlib/fixup-include/valgrind-memcheck.h
new file mode 100644
index 0000000..6457a5a
--- /dev/null
+++ b/buildlib/fixup-include/valgrind-memcheck.h
@@ -0,0 +1,5 @@
+static inline void VALGRIND_MAKE_MEM_DEFINED(const void *mem,size_t len) {}
+#define VALGRIND_MAKE_MEM_DEFINED VALGRIND_MAKE_MEM_DEFINED
+
+static inline void VALGRIND_MAKE_MEM_UNDEFINED(const void *mem,size_t len) {}
+#define VALGRIND_MAKE_MEM_UNDEFINED VALGRIND_MAKE_MEM_UNDEFINED
diff --git a/buildlib/gen-sparse.py b/buildlib/gen-sparse.py
new file mode 100755
index 0000000..3b8c77e
--- /dev/null
+++ b/buildlib/gen-sparse.py
@@ -0,0 +1,187 @@
+#!/usr/bin/env python3
+# Copyright 2015-2017 Obsidian Research Corp.
+# Licensed under BSD (MIT variant) or GPLv2. See COPYING.
+import argparse
+import subprocess
+import os
+import collections
+import re
+import itertools
+
+headers = {
+    "bits/sysmacros.h",
+    "endian.h",
+    "netinet/in.h",
+    "pthread.h",
+    "stdatomic.h",
+    "stdlib.h",
+    "sys/socket.h",
+    };
+
+def norm_header(fn):
+    for I in headers:
+        flat = I.replace("/","-");
+        if fn.endswith(flat):
+            return I;
+        if fn.endswith(flat + ".diff"):
+            return I;
+    return None;
+
+def find_system_header(args,hdr):
+    """/usr/include is not always where the include files are, particularly if we
+    are running full multi-arch as the azure_pipeline container does. Get gcc
+    to tell us where /usr/include is"""
+    if "incpath" not in args:
+        cpp = subprocess.check_output([args.cc, "-print-prog-name=cpp"],universal_newlines=True).strip()
+        data = subprocess.check_output([cpp, "-v"],universal_newlines=True,stdin=subprocess.DEVNULL,
+                                       stderr=subprocess.STDOUT)
+        args.incpath = [];
+        for incdir in re.finditer(r"^ (/\S+)$", data, re.MULTILINE):
+            incdir = incdir.group(1)
+            if "fixed" in incdir:
+                continue;
+            args.incpath.append(incdir)
+
+    for incdir in args.incpath:
+        fn = os.path.join(incdir,hdr)
+        if os.path.exists(fn):
+            return fn
+    return None;
+
+def get_buildlib_patches(dfn):
+    """Within the buildlib directory we store patches for the glibc headers. 
Each
+    patch is in a numbered subdirectory that indicates the order to try; the
+    number should match the glibc version used to make the diff."""
+    ver_hdrs = [];
+    all_hdrs = []
+    for d,_,files in os.walk(dfn):
+        for I in files:
+            if d != dfn:
+                bn = int(os.path.basename(d));
+            else:
+                bn = 0;
+
+            if bn == 0:
+                all_hdrs.append(os.path.join(d,I));
+            else:
+                ver_hdrs.append((bn,os.path.join(d,I)));
+    ver_hdrs.sort(reverse=True);
+
+    def add_to_dict(d,lst):
+        for I in lst:
+            nh = norm_header(I)
+            assert nh not in d
+            d[nh] = (I, find_system_header(args,nh))
+
+    ret = []
+    for k,g in itertools.groupby(ver_hdrs,key=lambda x:x[0]):
+        dd = {}
+        ret.append(dd)
+        add_to_dict(dd,(I for _,I in g))
+        add_to_dict(dd,all_hdrs)
+    return ret;
+
+def is_patch(fn):
+    with open(fn) as F:
+        return F.read(10).startswith("--- /");
+
+def apply_patch(src,patch,dest):
+    """Patch a single system header. The output goes into our include search path
+    and takes precedence over the system version."""
+    if src is None:
+        return False
+
+    dfn = os.path.dirname(dest);
+    if not os.path.isdir(dfn):
+        os.makedirs(dfn);
+
+    if not patch.endswith(".diff"):
+        if not os.path.exists(dest):
+            os.symlink(patch,dest);
+        return True;
+
+    try:
+        if os.path.exists(dest + ".rej"):
+            os.unlink(dest + ".rej");
+
+        subprocess.check_output(["patch","-f","--follow-symlinks","-V","never","-i",patch,"-o",dest,src]);
+
+        if os.path.exists(dest + ".rej"):
+            print("Patch from %r failed"%(patch));
+            return False;
+    except subprocess.CalledProcessError:
+        print("Patch from %r failed"%(patch));
+        return False;
+    return True;
+
+def replace_headers(suite):
+    # The local system does not have the reference system header, so this
+    # suite is not supported.
+    for fn,pfn in suite.items():
+        if pfn[1] is None:
+            return False;
+
+    for fn,pfn in suite.items():
+        if not apply_patch(pfn[1],pfn[0],os.path.join(args.INCLUDE,fn)):
+            break;
+    else:
+        return True;
+
+    for fn,_ in suite.items():
+        try:
+            os.unlink(os.path.join(args.INCLUDE,fn))
+        except OSError:
+            continue;
+    return False;
+
+def save(fn,outdir):
+    """Diff the header file in our include directory against the system header and
+    store the diff into buildlib. 
This makes it fairly easy to maintain the
+    replacement headers."""
+    if os.path.islink(os.path.join(args.INCLUDE,fn)):
+        return;
+
+    flatfn = fn.replace("/","-") + ".diff";
+    flatfn = os.path.join(outdir,flatfn);
+
+    with open(flatfn,"wt") as F:
+        try:
+            subprocess.check_call(["diff","-u",
+                                   find_system_header(args,fn),
+                                   os.path.join(args.INCLUDE,fn)],
+                                  stdout=F);
+        except subprocess.CalledProcessError as ex:
+            if ex.returncode == 1:
+                return;
+            raise;
+
+parser = argparse.ArgumentParser(description='Produce sparse shim header files')
+parser.add_argument("--out",dest="INCLUDE",required=True,
+                    help="Directory to write header files to");
+parser.add_argument("--src",dest="SRC",required=True,
+                    help="Top of the source tree");
+parser.add_argument("--cc",default="gcc",
+                    help="System compiler to use to locate the default system headers");
+parser.add_argument("--save",action="store_true",default=False,
+                    help="Save mode will write the current content of the headers to buildlib as a diff.");
+args = parser.parse_args();

+if args.save:
+    # Get the glibc version string
+    ver = subprocess.check_output(["ldd","--version"]).decode()
+    ver = ver.splitlines()[0].split(' ')[-1];
+    ver = ver.partition(".")[-1];
+    outdir = os.path.join(args.SRC,"buildlib","sparse-include",ver);
+    if not os.path.isdir(outdir):
+        os.makedirs(outdir);
+
+    for I in headers:
+        save(I,outdir);
+else:
+    failed = False;
+    suites = get_buildlib_patches(os.path.join(args.SRC,"buildlib","sparse-include"));
+    for I in suites:
+        if replace_headers(I):
+            break;
+    else:
+        raise ValueError("Patch applications failed");
diff --git a/buildlib/make_abi_structs.py b/buildlib/make_abi_structs.py
new file mode 100644
index 0000000..0817735
--- /dev/null
+++ b/buildlib/make_abi_structs.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+"""This script transforms the structs inside the kernel ABI headers into a define
+of an anonymous struct. 
+
+eg
+    struct abc {int foo;};
+becomes
+    #define _STRUCT_abc struct {int foo;};
+
+This allows the exact same struct to be included in the provider wrapper struct:
+
+struct abc_resp {
+    struct ibv_abc ibv_resp;
+    _STRUCT_abc;
+};
+
+Which duplicates the struct layout and naming we have historically used, but
+sources the data directly from the kernel headers instead of manually copying."""
+import re;
+import functools;
+import sys;
+
+def in_struct(ln,FO,nesting=0):
+    """Copy a top level structure over to the #define output, keeping track of
+    nested structures."""
+    if nesting == 0:
+        if re.match(r"(}.*);",ln):
+            FO.write(ln[:-1] + "\n\n");
+            return find_struct;
+
+    FO.write(ln + " \\\n");
+
+    if ln == "struct {" or ln == "union {":
+        return functools.partial(in_struct,nesting=nesting+1);
+
+    if re.match(r"}.*;",ln):
+        return functools.partial(in_struct,nesting=nesting-1);
+    return functools.partial(in_struct,nesting=nesting);
+
+def find_struct(ln,FO):
+    """Look for the start of a top level structure"""
+    if ln.startswith("struct ") or ln.startswith("union "):
+        g = re.match(r"(struct|union)\s+(\S+)\s+{",ln);
+        FO.write("#define _STRUCT_%s %s { \\\n"%(g.group(2),g.group(1)));
+        return in_struct;
+    return find_struct;
+
+with open(sys.argv[1]) as FI:
+    with open(sys.argv[2],"w") as FO:
+        state = find_struct;
+        for ln in FI:
+            # Drop obvious comments
+            ln = ln.strip();
+            ln = re.sub(r"/\*.*\*/","",ln);
+            ln = re.sub(r"//.*$","",ln);
+            state = state(ln,FO);
diff --git a/buildlib/pandoc-prebuilt.py b/buildlib/pandoc-prebuilt.py
new file mode 100644
index 0000000..afba326
--- /dev/null
+++ b/buildlib/pandoc-prebuilt.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+import os
+import shutil
+import subprocess
+import sys
+import hashlib
+import re
+
+def hash_rst_includes(incdir,txt):
+    h = ""
+    for fn in re.findall(br"^..\s+include::\s+(.*)$", txt, flags=re.MULTILINE):
+        with open(os.path.join(incdir,fn.decode()),"rb") as F:
+            h = h + hashlib.sha1(F.read()).hexdigest();
+    return h.encode();
+
+def get_id(SRC):
+    """Return a unique ID for the SRC file. For simplicity and robustness we just
+    content hash it"""
+    incdir = os.path.dirname(SRC)
+    with open(SRC,"rb") as F:
+        txt = F.read();
+    if SRC.endswith(".rst"):
+        txt = txt + hash_rst_includes(incdir,txt);
+    return hashlib.sha1(txt).hexdigest();
+
+def do_retrieve(src_root,SRC):
+    """Retrieve the file from the prebuilt cache and write its path to stdout"""
+    prebuilt = os.path.join(src_root,"buildlib","pandoc-prebuilt",get_id(SRC))
+    sys.stdout.write(prebuilt);
+
+def do_build_pandoc(build_root,pandoc,SRC,DEST):
+    """Build the markdown into a man page with pandoc and then keep a copy of the
+    output under build/pandoc-prebuilt"""
+    try:
+        subprocess.check_call([pandoc,"-s","-t","man",SRC,"-o",DEST]);
+    except subprocess.CalledProcessError:
+        sys.exit(100);
+    shutil.copy(DEST,os.path.join(build_root,"pandoc-prebuilt",get_id(SRC)));
+
+def do_build_rst2man(build_root,rst2man,SRC,DEST):
+    """Build the reStructuredText into a man page with rst2man and then keep a
+    copy of the output under build/pandoc-prebuilt"""
+    try:
+        subprocess.check_call([rst2man,SRC,DEST]);
+    except subprocess.CalledProcessError:
+        sys.exit(100);
+    shutil.copy(DEST,os.path.join(build_root,"pandoc-prebuilt",get_id(SRC)));
+
+# We support python 2.6 so argparse is not available. 
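+#
+# Expected invocations (argument names here are illustrative, inferred from
+# the dispatch below):
+#   pandoc-prebuilt.py --retrieve <src_root> <src_file>
+#   pandoc-prebuilt.py --build <build_root> --pandoc <pandoc> <src_file> <dest>
+#   pandoc-prebuilt.py --build <build_root> --rst <rst2man> <src_file> <dest>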
+if len(sys.argv) == 4: + assert(sys.argv[1] == "--retrieve"); + do_retrieve(sys.argv[2],sys.argv[3]); +elif len(sys.argv) == 7: + assert(sys.argv[1] == "--build"); + if sys.argv[3] == "--pandoc": + do_build_pandoc(sys.argv[2],sys.argv[4],sys.argv[5],sys.argv[6]); + elif sys.argv[3] == "--rst": + do_build_rst2man(sys.argv[2],sys.argv[4],sys.argv[5],sys.argv[6]); + else: + raise ValueError("Bad sys.argv[3]"); +else: + raise ValueError("Must provide --build or --retrieve"); diff --git a/buildlib/pandoc-prebuilt/00b1d0691cdea71ca370160f85854622bfef1e92 b/buildlib/pandoc-prebuilt/00b1d0691cdea71ca370160f85854622bfef1e92 new file mode 100644 index 0000000..9378a25 --- /dev/null +++ b/buildlib/pandoc-prebuilt/00b1d0691cdea71ca370160f85854622bfef1e92 @@ -0,0 +1,423 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "ibv_flow_action_esp" "3" "" "" "" +.hy +.SH NAME +.PP +ibv_flow_action_esp \- Flow action esp for verbs +.SH SYNOPSIS +.IP +.nf +\f[C] +#include\ <infiniband/verbs.h> + +struct\ ibv_flow_action\ * +ibv_create_flow_action_esp(struct\ ibv_context\ *ctx, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ ibv_flow_action_esp\ *esp); +int +ibv_modify_flow_action_esp(struct\ ibv_flow_action\ *action, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ ibv_flow_action_esp\ *esp); + +int\ ibv_destroy_flow_action(struct\ ibv_flow_action\ *action); +\f[] +.fi +.SH DESCRIPTION +.PP +An IPSEC ESP flow steering action allows a flow steering rule to decrypt +or encrypt a packet after matching. +Each action contains the necessary information for this operation in the +\f[I]params\f[] argument. +.PP +After the crypto operation the packet will continue to be processed by +flow steering rules until it reaches a final action of discard or +delivery. +.PP +After the action is created, then it should be associated with a +\f[I]struct ibv_flow_attr\f[] using \f[I]struct +ibv_flow_spec_action_handle\f[] flow specification. +Each action can be associated with multiple flows, and +\f[I]ibv_modify_flow_action_esp\f[] will alter all associated flows +simultaneously. +.SH ARGUMENTS +.TP +.B \f[I]ctx\f[] +RDMA device context to create the action on. +.RS +.RE +.TP +.B \f[I]esp\f[] +ESP parameters and key material for the action. +.RS +.RE +.TP +.B \f[I]action\f[] +Existing action to modify ESP parameters. +.RS +.RE +.SS \f[I]action\f[] Argument +.IP +.nf +\f[C] +struct\ ibv_flow_action_esp\ { +\ \ \ \ struct\ ibv_flow_action_esp_attr\ *esp_attr; + +\ \ \ \ /*\ See\ Key\ Material\ */ +\ \ \ \ uint16_t\ \ \ \ \ \ \ \ keymat_proto; +\ \ \ \ uint16_t\ \ \ \ \ \ \ \ keymat_len; +\ \ \ \ void\ \ \ \ \ \ \ \ \ \ \ *keymat_ptr; + +\ \ \ \ /*\ See\ Replay\ Protection\ */ +\ \ \ \ uint16_t\ \ \ \ \ \ \ \ replay_proto; +\ \ \ \ uint16_t\ \ \ \ \ \ \ \ replay_len; +\ \ \ \ void\ \ \ \ \ \ \ \ \ \ \ *replay_ptr; + +\ \ \ \ struct\ ibv_flow_action_esp_encap\ *esp_encap; + +\ \ \ \ uint32_t\ \ \ \ \ \ \ \ comp_mask; +\ \ \ \ uint32_t\ \ \ \ \ \ \ \ esn; +}; +\f[] +.fi +.TP +.B \f[I]comp_mask\f[] +Bitmask specifying what fields in the structure are valid. +.RS +.RE +.TP +.B \f[I]esn\f[] +The starting value of the ESP extended sequence number. +Valid only if \f[I]IBV_FLOW_ACTION_ESP_MASK_ESN\f[] is set in +\f[I]comp_mask\f[]. +.RS +.PP +The 32 bits of \f[I]esn\f[] will be used to compute the full 64 bit ESN +required for the AAD construction. +.PP +When in \f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_INLINE_CRYPTO\f[] mode, the +implementation will automatically track rollover of the lower 32 bits of +the ESN. 
+However, an update of the window is required once every 2^31 sequences.
+.PP
+When in \f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD\f[] mode this
+value is automatically incremented and it is also used for anti\-replay
+checks.
+.RE
+.TP
+.B \f[I]esp_attr\f[]
+See \f[I]ESP Attributes\f[].
+May be NULL on modify.
+.RS
+.RE
+.TP
+.B \f[I]keymat_proto\f[], \f[I]keymat_len\f[], \f[I]keymat_ptr\f[]
+Describe the key material and encryption standard to use.
+May be NULL on modify.
+.RS
+.RE
+.TP
+.B \f[I]replay_proto\f[], \f[I]replay_len\f[], \f[I]replay_ptr\f[]
+Describe the replay protection scheme used to manage sequence numbers
+and prevent replay attacks.
+This field is only valid in full offload mode.
+May be NULL on modify.
+.RS
+.RE
+.TP
+.B \f[I]esp_encap\f[]
+Describe the encapsulation of ESP packets such as the IP tunnel and/or
+UDP encapsulation.
+This field is only valid in full offload mode.
+May be NULL on modify.
+.RS
+.RE
+.SS ESP attributes
+.IP
+.nf
+\f[C]
+struct\ ibv_flow_action_esp_attr\ {
+\ \ \ \ uint32_t\ \ \ spi;
+\ \ \ \ uint32_t\ \ \ seq;
+\ \ \ \ uint32_t\ \ \ tfc_pad;
+\ \ \ \ uint32_t\ \ \ flags;
+\ \ \ \ uint64_t\ \ \ hard_limit_pkts;
+};
+\f[]
+.fi
+.TP
+.B \f[I]flags\f[]
+A bitwise OR of the various \f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS\f[]
+described below.
+.RS
+.TP
+.B \f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_DECRYPT\f[], \f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT\f[]
+The action will decrypt or encrypt a packet using the provided keying
+material.
+.RS
+.PP
+The implementation may require that encrypt is only used with an egress
+flow steering rule, and that decrypt is only used with an ingress flow
+steering rule.
+.RE
+.RE
+.SS Full Offload Mode
+.PP
+When \f[I]esp_attr\f[] flag
+\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD\f[] is set the ESP
+header and trailer are added and removed automatically during the cipher
+operation.
+In this case the \f[I]esn\f[] and \f[I]spi\f[] are used to populate and
+check the ESP header, and any information from the \f[I]keymat\f[] (e.g. an
+IV) is placed in the headers and otherwise handled automatically.
+.PP
+For decrypt the hardware will perform anti\-replay.
+.PP
+Decryption failure will cause the packet to be dropped.
+.PP
+This action must be combined with the flow steering that identifies the
+packets protected by the SA defined in this action.
+.PP
+The following members of the esp_attr are used only in full offload
+mode:
+.TP
+.B \f[I]spi\f[]
+The value for the ESP Security Parameters Index.
+It is only used for
+\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD\f[].
+.RS
+.RE
+.TP
+.B \f[I]seq\f[]
+The initial lower 32 bits of the sequence number.
+This is the value of the ESP sequence number.
+It is only used for
+\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD\f[].
+.RS
+.RE
+.TP
+.B \f[I]tfc_pad\f[]
+The length of Traffic Flow Confidentiality Padding as specified by
+RFC4303.
+If it is set to zero no additional padding is added.
+It is only used for
+\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD\f[].
+.RS
+.RE
+.TP
+.B \f[I]hard_limit_pkts\f[]
+The hard lifetime of the SA measured in number of packets.
+As specified by RFC4301.
+After this limit is reached the action will drop future packets to
+prevent breaking the crypto.
+It is only used for
+\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD\f[].
+.RS
+.RE
+.SS Inline Crypto Mode
+.PP
+When \f[I]esp_attr\f[] flag
+\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_INLINE_CRYPTO\f[] is set the user
+must provide packets with additional headers. 
+.PP
+For encrypt the packet must contain a fully populated IPSEC packet
+except that the data payload is left unencrypted and there is no IPsec
+trailer.
+If the IV must be unpredictable, then a flag should indicate the
+transformation such as \f[I]IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ\f[].
+.PP
+\f[I]IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ\f[] means that the IV is
+incremented sequentially.
+If the IV algorithm is supported by HW, then it could provide support
+for LSO offload with ESP inline crypto.
+.PP
+Finally, the IV used to encrypt the packet replaces the IV field
+provided, the payload is encrypted and authenticated, a trailer with
+padding is added and the ICV is added as well.
+.PP
+For decrypt the packet is authenticated and decrypted in\-place,
+resulting in a decrypted IPSEC packet with no trailer.
+The result of decryption and authentication can be retrieved from an
+extended CQ via the \f[I]ibv_wc_read_XXX(3)\f[] function.
+.PP
+This mode must be combined with the flow steering including
+\f[I]IBV_FLOW_SPEC_IPV4\f[] and \f[I]IBV_FLOW_SPEC_ESP\f[] to match the
+outer packet headers to ensure that the action is only applied to IPSEC
+packets with the correct identifiers.
+.PP
+For inline crypto, we have some special requirements to maintain a
+stateless ESN while maintaining the same parameters as software.
+The system supports offloading a portion of the IPSEC flow, enabling a
+single flow to be split between multiple NICs.
+.SS Determining the ESN for Ingress Packets
+.PP
+We require a "modify" command once every 2^31 packets.
+This modify command allows the implementation in HW to be stateless, as
+follows:
+.IP
+.nf
+\f[C]
+\ \ \ \ \ \ \ \ \ \ \ ESN\ 1\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ ESN\ 2\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ ESN\ 3
+|\-\-\-\-\-\-\-\-\-\-\-\-\-*\-\-\-\-\-\-\-\-\-\-\-\-\-|\-\-\-\-\-\-\-\-\-\-\-\-\-*\-\-\-\-\-\-\-\-\-\-\-\-\-|\-\-\-\-\-\-\-\-\-\-\-\-\-*
+^\ \ \ \ \ \ \ \ \ \ \ \ \ ^\ \ \ \ \ \ \ \ \ \ \ \ \ ^\ \ \ \ \ \ \ \ \ \ \ \ \ ^\ \ \ \ \ \ \ \ \ \ \ \ \ ^\ \ \ \ \ \ \ \ \ \ \ \ \ ^
+\f[]
+.fi
+.PP
+^ \- marks where the command is invoked to update the SA ESN state machine.
+.PD 0
+.P
+.PD
+| \- marks the start of the ESN scope (0\-2^32\-1).
+At this point move SA ESN "new_window" bit to zero and increment ESN.
+.PD 0
+.P
+.PD
+* \- marks the middle of the ESN scope (2^31).
+At this point move SA ESN "new_window" bit to one.
+.PP
+For decryption the implementation uses the following state machine to
+determine ESN:
+.IP
+.nf
+\f[C]
+if\ (!overlap)\ {
+\ \ \ \ use\ esn\ //\ regardless\ of\ packet.seq
+}\ else\ {\ //\ new_window
+\ \ \ \ if\ (packet.seq\ >=\ 2^31)
+\ \ \ \ \ \ \ \ use\ esn
+\ \ \ \ else\ //\ packet.seq\ <\ 2^31
+\ \ \ \ \ \ \ \ use\ esn+1
+}
+\f[]
+.fi
+.PP
+This mechanism is controlled by the \f[I]esp_attr\f[] flag:
+.TP
+.B \f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW\f[]
+This flag is only used to provide stateless ESN support for inline
+crypto.
+It is used only for
+\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_INLINE_CRYPTO\f[] and
+\f[I]IBV_FLOW_ACTION_ESP_MASK_ESN\f[].
+.RS
+.PP
+Setting this flag indicates that the bottom of the replay window is
+between 2^31 and 2^32.
+.RE
+.SS Key Material for AES GCM (\f[I]IBV_ACTION_ESP_KEYMAT_AES_GCM\f[])
+.PP
+The AES GCM crypto algorithm as defined by RFC4106.
+This struct is to be provided in \f[I]keymat_ptr\f[] when
+\f[I]keymat_proto\f[] is set to \f[I]IBV_ACTION_ESP_KEYMAT_AES_GCM\f[]. 
+.IP
+.nf
+\f[C]
+struct\ ibv_flow_action_esp_aes_keymat_aes_gcm\ {
+\ \ \ \ uint64_t\ \ \ iv;
+\ \ \ \ uint32_t\ \ \ iv_algo;\ /*\ Use\ enum\ ib_uverbs_flow_action_esp_aes_gcm_keymat_iv_algo\ */
+
+\ \ \ \ uint32_t\ \ \ salt;
+\ \ \ \ uint32_t\ \ \ icv_len;
+
+\ \ \ \ uint32_t\ \ \ key_len;
+\ \ \ \ uint32_t\ \ \ aes_key[256\ /\ 32];
+};
+\f[]
+.fi
+.TP
+.B \f[I]iv\f[]
+The starting value for the initialization vector used only with
+\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD\f[] encryption as
+defined in RFC4106.
+This field is ignored for
+\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_INLINE_CRYPTO\f[].
+.RS
+.PP
+For a given key, the IV MUST NOT be reused.
+.RE
+.TP
+.B \f[I]iv_algo\f[]
+The algorithm used to transform/generate new IVs with
+\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD\f[] encryption.
+.RS
+.PP
+The only supported value is \f[I]IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ\f[]
+to generate sequential IVs.
+.RE
+.TP
+.B \f[I]salt\f[]
+The salt as defined by RFC4106.
+.RS
+.RE
+.TP
+.B \f[I]icv_len\f[]
+The length of the Integrity Check Value in bytes as defined by RFC4106.
+.RS
+.RE
+.TP
+.B \f[I]aes_key\f[], \f[I]key_len\f[]
+The cipher key data.
+It must be either 16, 24 or 32 bytes as defined by RFC4106.
+.RS
+.RE
+.SS Bitmap Replay Protection (\f[I]IBV_FLOW_ACTION_ESP_REPLAY_BMP\f[])
+.PP
+A shifting bitmap is used to identify which packets have already been
+received.
+Each bit in the bitmap represents a packet; it is set if a packet with
+this ESP sequence number has been received and it passed authentication.
+If a packet with the same sequence is received, then the bit is already
+set, causing replay protection to drop the packet.
+The bitmap represents a window of \f[I]size\f[] sequence numbers.
+If a newer sequence number is received, then the bitmap will shift to
+represent this as in RFC6479.
+The replay window cannot shift more than 2^31 sequence numbers forward.
+.PP
+This struct is to be provided in \f[I]replay_ptr\f[] when
+\f[I]replay_proto\f[] is set to \f[I]IBV_FLOW_ACTION_ESP_REPLAY_BMP\f[].
+In this mode replay_ptr and replay_len should point to a struct
+ibv_flow_action_esp_replay_bmp containing: \f[I]size\f[] : The size of
+the bitmap.
+.SS ESP Encapsulation
+.PP
+An \f[I]esp_encap\f[] specification is required when \f[I]esp_attr\f[]
+flags \f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_TUNNEL\f[] is set.
+It is used to provide the fields for the encapsulation header that is
+added/removed to/from packets.
+Tunnel and Transport mode are defined as in RFC4301.
+UDP encapsulation of ESP can be specified by providing the appropriate
+UDP header.
+.PP
+This setting is only used in
+\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD\f[] mode.
+.IP
+.nf
+\f[C]
+struct\ ibv_flow_action_esp_encap\ {
+\ \ \ \ void\ \ \ \ \ \ \ \ *val;\ \ \ \ \ \ \ \ /*\ pointer\ to\ struct\ ibv_flow_xxxx_filter\ */
+\ \ \ \ struct\ ibv_flow_action_esp_encap\ \ \ *next_ptr;
+\ \ \ \ uint16_t\ \ \ \ len;\ \ \ \ \ \ \ \ \ \ \ \ /*\ Len\ of\ mask\ and\ pointer\ (separately)\ */
+\ \ \ \ uint16_t\ \ \ \ type;\ \ \ \ \ \ \ \ \ \ \ /*\ Use\ flow_spec\ enum\ */
+};
+\f[]
+.fi
+.PP
+Each link in the list specifies a network header in the same manner as
+the flow steering API.
+The header should be selected from a supported header in \[aq]enum
+ibv_flow_spec_type\[aq].
+.SH RETURN VALUE
+.PP
+Upon success \f[I]ibv_create_flow_action_esp\f[] will return a new
+\f[I]struct ibv_flow_action\f[] object; on error NULL will be returned
+and errno will be set. 
+.PP
+Upon success \f[I]ibv_modify_flow_action_esp\f[] will return 0.
+On error the value of errno will be returned.
+If \f[I]ibv_modify_flow_action_esp\f[] fails, it is guaranteed that the
+previous action state still holds.
+If it succeeds, there is a point in time before which all packets are
+processed with the old action parameters and after which all packets are
+processed with the new ones.
+.SH SEE ALSO
+.PP
+\f[I]ibv_create_flow(3)\f[], \f[I]ibv_destroy_flow_action(3)\f[], \f[I]RFC
+4106\f[]
diff --git a/buildlib/pandoc-prebuilt/07882ad3161d9480324a783f52621167a7a5c7ca b/buildlib/pandoc-prebuilt/07882ad3161d9480324a783f52621167a7a5c7ca
new file mode 100644
index 0000000..b8333eb
--- /dev/null
+++ b/buildlib/pandoc-prebuilt/07882ad3161d9480324a783f52621167a7a5c7ca
@@ -0,0 +1,129 @@
+.\" Automatically generated by Pandoc 1.19.2.4
+.\"
+.TH "mlx5dv_create_qp" "3" "2018\-9\-1" "mlx5" "mlx5 Programmer\[aq]s Manual"
+.hy
+.SH NAME
+.PP
+mlx5dv_create_qp \- creates a queue pair (QP)
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include\ <infiniband/mlx5dv.h>
+
+struct\ ibv_qp\ *mlx5dv_create_qp(struct\ ibv_context\ \ \ \ \ \ \ \ \ *context,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ ibv_qp_init_attr_ex\ *qp_attr,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ mlx5dv_qp_init_attr\ *mlx5_qp_attr)
+\f[]
+.fi
+.SH DESCRIPTION
+.PP
+\f[B]mlx5dv_create_qp()\f[] creates a queue pair (QP) with specific
+driver properties.
+.SH ARGUMENTS
+.PP
+Please see \f[I]ibv_create_qp_ex(3)\f[] man page for \f[I]context\f[]
+and \f[I]qp_attr\f[].
+.SS mlx5_qp_attr
+.IP
+.nf
+\f[C]
+struct\ mlx5dv_qp_init_attr\ {
+\ \ \ \ uint64_t\ comp_mask;
+\ \ \ \ uint32_t\ create_flags;
+\ \ \ \ struct\ mlx5dv_dc_init_attr\ \ dc_init_attr;
+\ \ \ \ uint64_t\ send_ops_flags;
+};
+\f[]
+.fi
+.TP
+.B \f[I]comp_mask\f[]
+Bitmask specifying what fields in the structure are valid:
+MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS: valid values in
+\f[I]create_flags\f[] MLX5DV_QP_INIT_ATTR_MASK_DC: valid values in
+\f[I]dc_init_attr\f[] MLX5DV_QP_INIT_ATTR_MASK_SEND_OPS_FLAGS: valid
+values in \f[I]send_ops_flags\f[]
+.RS
+.RE
+.TP
+.B \f[I]create_flags\f[]
+A bitwise OR of the various values described below.
+.RS
+.PP
+MLX5DV_QP_CREATE_TUNNEL_OFFLOADS: Enable offloading such as checksum and
+LRO for incoming tunneling traffic.
+.PP
+MLX5DV_QP_CREATE_TIR_ALLOW_SELF_LOOPBACK_UC: Allow receiving loopback
+unicast traffic.
+.PP
+MLX5DV_QP_CREATE_TIR_ALLOW_SELF_LOOPBACK_MC: Allow receiving loopback
+multicast traffic.
+.PP
+MLX5DV_QP_CREATE_DISABLE_SCATTER_TO_CQE: Disable scatter to CQE feature
+which is enabled by default.
+.PP
+MLX5DV_QP_CREATE_ALLOW_SCATTER_TO_CQE: Allow scatter to CQE for
+requester even if the QP was not configured to signal all WRs.
+.PP
+MLX5DV_QP_CREATE_PACKET_BASED_CREDIT_MODE: Set QP to work in
+end\-to\-end packet\-based credit, instead of the default message\-based
+credits (IB spec.
+section 9.7.7.2).
+.PD 0
+.P
+.PD
+It is the application\[aq]s responsibility to make sure that the peer QP is
+configured with the same mode.
+.RE
+.TP
+.B \f[I]dc_init_attr\f[]
+DC init attributes.
+.RS
+.RE
+.SS \f[I]dc_init_attr\f[]
+.IP
+.nf
+\f[C]
+struct\ mlx5dv_dc_init_attr\ {
+\ \ \ \ enum\ mlx5dv_dc_type\ dc_type;
+\ \ \ \ uint64_t\ dct_access_key;
+};
+\f[]
+.fi
+.TP
+.B \f[I]dc_type\f[]
+MLX5DV_DCTYPE_DCT QP type: Target DC.
+MLX5DV_DCTYPE_DCI QP type: Initiator DC.
+.RS
+.RE
+.TP
+.B \f[I]dct_access_key\f[]
+Used to create a DCT QP. 
+.RS
+.RE
+.TP
+.B \f[I]send_ops_flags\f[]
+A bitwise OR of the various values described below.
+.RS
+.PP
+MLX5DV_QP_EX_WITH_MR_INTERLEAVED: Enables the mlx5dv_wr_mr_interleaved()
+work request on this QP.
+.PP
+MLX5DV_QP_EX_WITH_MR_LIST: Enables the mlx5dv_wr_mr_list() work request
+on this QP.
+.RE
+.SH NOTES
+.PP
+\f[B]mlx5dv_qp_ex_from_ibv_qp_ex()\f[] is used to get \f[I]struct
+mlx5dv_qp_ex\f[] for accessing the send ops interfaces when
+IBV_QP_INIT_ATTR_SEND_OPS_FLAGS is used.
+.SH RETURN VALUE
+.PP
+\f[B]mlx5dv_create_qp()\f[] returns a pointer to the created QP; on
+error NULL will be returned and errno will be set.
+.SH SEE ALSO
+.PP
+\f[B]ibv_query_device_ex\f[](3), \f[B]ibv_create_qp_ex\f[](3)
+.SH AUTHOR
+.PP
+Yonatan Cohen <yonatanc@mellanox.com>
diff --git a/buildlib/pandoc-prebuilt/08751a05c4d0d02d49ed48223246ac6c6b98f8ee b/buildlib/pandoc-prebuilt/08751a05c4d0d02d49ed48223246ac6c6b98f8ee
new file mode 100644
index 0000000..bf35766
--- /dev/null
+++ b/buildlib/pandoc-prebuilt/08751a05c4d0d02d49ed48223246ac6c6b98f8ee
@@ -0,0 +1,214 @@
+.\" Man page generated from reStructuredText.
+.
+.TH SMINFO 8 "2017-08-21" "" "Open IB Diagnostics"
+.SH NAME
+sminfo \- query InfiniBand SMInfo attribute
+.
+.nr rst2man-indent-level 0
+.
+.de1 rstReportMargin
+\\$1 \\n[an-margin]
+level \\n[rst2man-indent-level]
+level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
+-
+\\n[rst2man-indent0]
+\\n[rst2man-indent1]
+\\n[rst2man-indent2]
+..
+.de1 INDENT
+.\" .rstReportMargin pre:
+. RS \\$1
+. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
+. nr rst2man-indent-level +1
+.\" .rstReportMargin post:
+..
+.de UNINDENT
+. RE
+.\" indent \\n[an-margin]
+.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.nr rst2man-indent-level -1
+.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
+..
+.SH SYNOPSIS
+.sp
+sminfo [options] sm_lid | sm_dr_path [modifier]
+.SH DESCRIPTION
+.sp
+Optionally set and display the output of an sminfo query in human readable
+format. The target SM is the one listed in the local port info, or the SM
+specified by the optional SM lid or by the SM direct routed path.
+.sp
+Note: using sminfo for any purpose other than a simple query may be very
+dangerous, and may result in a malfunction of the target SM.
+.SH OPTIONS
+.INDENT 0.0
+.TP
+.B \fB\-s, \-\-state <state>\fP set SM state
+0 not active
+.sp
+1 discovering
+.sp
+2 standby
+.sp
+3 master
+.UNINDENT
+.sp
+\fB\-p, \-\-priority <priority>\fP set priority (0\-15)
+.sp
+\fB\-a, \-\-activity <val>\fP set activity count
+.SS Addressing Flags
+.\" Define the common option -D for Directed routes
+.
+.sp
+\fB\-D, \-\-Direct\fP The address specified is a directed route
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+Examples:
+   [options] \-D [options] "0" # self port
+   [options] \-D [options] "0,1,2,1,4" # out via port 1, then 2, ...
+
+   (Note the second number in the path specified must match the port being
+   used. This can be specified using the port selection flag \(aq\-P\(aq or the
+   port found through the automatic selection process.)
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.\" Define the common option -G
+.
+.sp
+\fB\-G, \-\-Guid\fP The address specified is a Port GUID
+.\" Define the common option -L
+.
+.sp
+\fB\-L, \-\-Lid\fP The address specified is a LID
+.SS Port Selection flags
+.\" Define the common option -C
+.
+.sp
+\fB\-C, \-\-Ca <ca_name>\fP use the specified ca_name.
+.\" Define the common option -P
+. 
+.sp +\fB\-P, \-\-Port <ca_port>\fP use the specified ca_port. +.\" Explanation of local port selection +. +.SS Local port Selection +.sp +Multiple port/Multiple CA support: when no IB device or port is specified +(see the "local umad parameters" below), the libibumad library +selects the port to use by the following criteria: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.IP 1. 3 +the first port that is ACTIVE. +.IP 2. 3 +if not found, the first port that is UP (physical link up). +.UNINDENT +.sp +If a port and/or CA name is specified, the libibumad library attempts +to fulfill the user request, and will fail if it is not possible. +.sp +For example: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +ibaddr # use the first port (criteria #1 above) +ibaddr \-C mthca1 # pick the best port from "mthca1" only. +ibaddr \-P 2 # use the second (active/up) port from the first available IB device. +ibaddr \-C mthca0 \-P 2 # use the specified port only. +.ft P +.fi +.UNINDENT +.UNINDENT +.UNINDENT +.UNINDENT +.SS Debugging flags +.\" Define the common option -d +. +.INDENT 0.0 +.TP +.B \-d +raise the IB debugging level. +May be used several times (\-ddd or \-d \-d \-d). +.UNINDENT +.\" Define the common option -e +. +.INDENT 0.0 +.TP +.B \-e +show send and receive errors (timeouts and others) +.UNINDENT +.\" Define the common option -h +. +.sp +\fB\-h, \-\-help\fP show the usage message +.\" Define the common option -v +. +.INDENT 0.0 +.TP +.B \fB\-v, \-\-verbose\fP +increase the application verbosity level. +May be used several times (\-vv or \-v \-v \-v) +.UNINDENT +.\" Define the common option -V +. +.sp +\fB\-V, \-\-version\fP show the version info. +.SS Configuration flags +.\" Define the common option -t +. +.sp +\fB\-t, \-\-timeout <timeout_ms>\fP override the default timeout for the solicited mads. +.\" Define the common option -y +. +.INDENT 0.0 +.TP +.B \fB\-y, \-\-m_key <key>\fP +use the specified M_key for requests. If non\-numeric value (like \(aqx\(aq) +is specified then a value will be prompted for. +.UNINDENT +.\" Define the common option -z +. +.sp +\fB\-\-config, \-z <config_file>\fP Specify alternate config file. +.INDENT 0.0 +.INDENT 3.5 +Default: /usr/local/etc/infiniband\-diags/ibdiag.conf +.UNINDENT +.UNINDENT +.SH FILES +.\" Common text for the config file +. +.SS CONFIG FILE +.sp +/usr/local/etc/infiniband\-diags/ibdiag.conf +.sp +A global config file is provided to set some of the common options for all +tools. See supplied config file for details. +.SH EXAMPLES +.INDENT 0.0 +.TP +.B :: +sminfo # local port\(aqs sminfo +sminfo 32 # show sminfo of lid 32 +sminfo \-G 0x8f1040023 # same but using guid address +.UNINDENT +.SH SEE ALSO +.sp +smpdump (8) +.SH AUTHOR +.INDENT 0.0 +.TP +.B Hal Rosenstock +< \fI\%halr@voltaire.com\fP > +.UNINDENT +.\" Generated by docutils manpage writer. +. 
diff --git a/buildlib/pandoc-prebuilt/0ac7ce8cf9db88f5ac3aa4e5afdeae555017e1fc b/buildlib/pandoc-prebuilt/0ac7ce8cf9db88f5ac3aa4e5afdeae555017e1fc
new file mode 100644
index 0000000..c906920
--- /dev/null
+++ b/buildlib/pandoc-prebuilt/0ac7ce8cf9db88f5ac3aa4e5afdeae555017e1fc
@@ -0,0 +1,108 @@
+.\" Automatically generated by Pandoc 1.19.2.4
+.\"
+.TH "mlx5dv_devx_qp[/cq/srq/wq/ind_tbl]_modify / query" "3" "" "" ""
+.hy
+.SH NAME
+.PP
+mlx5dv_devx_qp_modify \- Modifies a verbs QP via DEVX
+.PP
+mlx5dv_devx_qp_query \- Queries a verbs QP via DEVX
+.PP
+mlx5dv_devx_cq_modify \- Modifies a verbs CQ via DEVX
+.PP
+mlx5dv_devx_cq_query \- Queries a verbs CQ via DEVX
+.PP
+mlx5dv_devx_srq_modify \- Modifies a verbs SRQ via DEVX
+.PP
+mlx5dv_devx_srq_query \- Queries a verbs SRQ via DEVX
+.PP
+mlx5dv_devx_wq_modify \- Modifies a verbs WQ via DEVX
+.PP
+mlx5dv_devx_wq_query \- Queries a verbs WQ via DEVX
+.PP
+mlx5dv_devx_ind_tbl_modify \- Modifies a verbs indirection table via
+DEVX
+.PP
+mlx5dv_devx_ind_tbl_query \- Queries a verbs indirection table via DEVX
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include\ <infiniband/mlx5dv.h>
+int\ mlx5dv_devx_qp_modify(struct\ ibv_qp\ *qp,\ const\ void\ *in,\ size_t\ inlen,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *out,\ size_t\ outlen);
+int\ mlx5dv_devx_qp_query(struct\ ibv_qp\ *qp,\ const\ void\ *in,\ size_t\ inlen,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *out,\ size_t\ outlen);
+int\ mlx5dv_devx_cq_modify(struct\ ibv_cq\ *cq,\ const\ void\ *in,\ size_t\ inlen,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *out,\ size_t\ outlen);
+int\ mlx5dv_devx_cq_query(struct\ ibv_cq\ *cq,\ const\ void\ *in,\ size_t\ inlen,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *out,\ size_t\ outlen);
+int\ mlx5dv_devx_srq_modify(struct\ ibv_srq\ *srq,\ const\ void\ *in,\ size_t\ inlen,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *out,\ size_t\ outlen);
+int\ mlx5dv_devx_srq_query(struct\ ibv_srq\ *srq,\ const\ void\ *in,\ size_t\ inlen,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *out,\ size_t\ outlen);
+int\ mlx5dv_devx_wq_modify(struct\ ibv_wq\ *wq,\ const\ void\ *in,\ size_t\ inlen,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *out,\ size_t\ outlen);
+int\ mlx5dv_devx_wq_query(struct\ ibv_wq\ *wq,\ const\ void\ *in,\ size_t\ inlen,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *out,\ size_t\ outlen);
+int\ mlx5dv_devx_ind_tbl_modify(struct\ ibv_rwq_ind_table\ *ind_tbl,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ const\ void\ *in,\ size_t\ inlen,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *out,\ size_t\ outlen);
+int\ mlx5dv_devx_ind_tbl_query(struct\ ibv_rwq_ind_table\ *ind_tbl,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ const\ void\ *in,\ size_t\ inlen,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *out,\ size_t\ outlen);
+\f[]
+.fi
+.SH DESCRIPTION
+.PP
+Modify / query a verbs object over the DEVX interface.
+.PP
+The DEVX API enables direct access from user space to the mlx5
+device driver by using the KABI mechanism.
+The main purpose is to make the user space driver as independent as
+possible from the kernel so that future device functionality and
+commands can be activated with minimal or no kernel changes.
+.PP
+The above APIs enable modifying/querying a verbs object via the DEVX
+interface.
+This enables interoperability between verbs and DEVX. 
+As such an application can use the create method from verbs (e.g. +ibv_create_qp) and modify and query the created object via DEVX (e.g. +mlx5dv_devx_qp_modify). +.SH ARGUMENTS +.TP +.B \f[I]qp/cq/wq/srq/ind_tbl\f[] +The ibv_xxx object to issue the action on. +.RS +.RE +.TP +.B \f[I]in\f[] +A buffer which contains the command\[aq]s input data provided in a +device specification format. +.RS +.RE +.TP +.B \f[I]inlen\f[] +The size of \f[I]in\f[] buffer in bytes. +.RS +.RE +.TP +.B \f[I]out\f[] +A buffer which contains the command\[aq]s output data according to the +device specification format. +.RS +.RE +.TP +.B \f[I]outlen\f[] +The size of \f[I]out\f[] buffer in bytes. +.RS +.RE +.SH RETURN VALUE +.PP +Upon success 0 is returned or the value of errno on a failure. +.SH SEE ALSO +.PP +\f[B]mlx5dv_open_device\f[], \f[B]mlx5dv_devx_obj_create\f[] +.SH AUTHOR +.PP +Yishai Hadas <yishaih@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/1088fd600b498b50bbfa1dd8bf792ca7afd656d4 b/buildlib/pandoc-prebuilt/1088fd600b498b50bbfa1dd8bf792ca7afd656d4 new file mode 100644 index 0000000..bad07ed --- /dev/null +++ b/buildlib/pandoc-prebuilt/1088fd600b498b50bbfa1dd8bf792ca7afd656d4 @@ -0,0 +1,195 @@ +.\" Man page generated from reStructuredText. +. +.TH IBCCCONFIG 8 "2012-05-31" "" "OpenIB Diagnostics" +.SH NAME +IBCCCONFIG \- configure congestion control settings +. +.nr rst2man-indent-level 0 +. +.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. +.SH SYNOPSIS +.sp +ibccconfig [common_options] [\-c cckey] <op> <lid|guid> [port] +.SH DESCRIPTION +.sp +\fBibccconfig\fP +supports the configuration of congestion control settings on switches +and HCAs. +.sp +\fBWARNING \-\- You should understand what you are doing before using this tool. +Misuse of this tool could result in a broken fabric.\fP +.SH OPTIONS +.INDENT 0.0 +.TP +.B Current supported operations and their parameters: +CongestionKeyInfo (CK) <lid|guid> <cckey> <cckeyprotectbit> <cckeyleaseperiod> <cckeyviolations> +SwitchCongestionSetting (SS) <lid|guid> <controlmap> <victimmask> <creditmask> <threshold> <packetsize> <csthreshold> <csreturndelay> <markingrate> +SwitchPortCongestionSetting (SP) <lid|guid> <portnum> <valid> <control_type> <threshold> <packet_size> <cong_parm_marking_rate> +CACongestionSetting (CS) <lid|guid> <port_control> <control_map> <ccti_timer> <ccti_increase> <trigger_threshold> <ccti_min> +CongestionControlTable (CT) <lid|guid> <cctilimit> <index> <cctentry> <cctentry> ... +.UNINDENT +.sp +\fB\-\-cckey, \-c, <cckey>\fP +Specify a congestion control (CC) key. If none is specified, a key of 0 is used. +.SS Debugging flags +.\" Define the common option -d +. +.INDENT 0.0 +.TP +.B \-d +raise the IB debugging level. +May be used several times (\-ddd or \-d \-d \-d). +.UNINDENT +.\" Define the common option -e +. +.INDENT 0.0 +.TP +.B \-e +show send and receive errors (timeouts and others) +.UNINDENT +.\" Define the common option -h +. 
+.sp +\fB\-h, \-\-help\fP show the usage message +.\" Define the common option -v +. +.INDENT 0.0 +.TP +.B \fB\-v, \-\-verbose\fP +increase the application verbosity level. +May be used several times (\-vv or \-v \-v \-v) +.UNINDENT +.\" Define the common option -V +. +.sp +\fB\-V, \-\-version\fP show the version info. +.SS Addressing Flags +.\" Define the common option -G +. +.sp +\fB\-G, \-\-Guid\fP The address specified is a Port GUID +.\" Define the common option -L +. +.sp +\fB\-L, \-\-Lid\fP The address specified is a LID +.\" Define the common option -s +. +.sp +\fB\-s, \-\-sm_port <smlid>\fP use \(aqsmlid\(aq as the target lid for SA queries. +.SS Port Selection flags +.\" Define the common option -C +. +.sp +\fB\-C, \-\-Ca <ca_name>\fP use the specified ca_name. +.\" Define the common option -P +. +.sp +\fB\-P, \-\-Port <ca_port>\fP use the specified ca_port. +.\" Explanation of local port selection +. +.SS Local port Selection +.sp +Multiple port/Multiple CA support: when no IB device or port is specified +(see the "local umad parameters" below), the libibumad library +selects the port to use by the following criteria: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.IP 1. 3 +the first port that is ACTIVE. +.IP 2. 3 +if not found, the first port that is UP (physical link up). +.UNINDENT +.sp +If a port and/or CA name is specified, the libibumad library attempts +to fulfill the user request, and will fail if it is not possible. +.sp +For example: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +ibaddr # use the first port (criteria #1 above) +ibaddr \-C mthca1 # pick the best port from "mthca1" only. +ibaddr \-P 2 # use the second (active/up) port from the first available IB device. +ibaddr \-C mthca0 \-P 2 # use the specified port only. +.ft P +.fi +.UNINDENT +.UNINDENT +.UNINDENT +.UNINDENT +.SS Configuration flags +.\" Define the common option -y +. +.INDENT 0.0 +.TP +.B \fB\-y, \-\-m_key <key>\fP +use the specified M_key for requests. If non\-numeric value (like \(aqx\(aq) +is specified then a value will be prompted for. +.UNINDENT +.\" Define the common option -z +. +.sp +\fB\-\-config, \-z <config_file>\fP Specify alternate config file. +.INDENT 0.0 +.INDENT 3.5 +Default: /usr/local/etc/infiniband\-diags/ibdiag.conf +.UNINDENT +.UNINDENT +.SH EXAMPLES +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +ibccconfig SwitchCongestionSetting 2 0x1F 0x1FFFFFFFFF 0x0 0xF 8 0 0:0 1 # Configure Switch Congestion Settings +ibccconfig CACongestionSetting 1 0 0x3 150 1 0 0 # Configure CA Congestion Settings to SL 0 and SL 1 +ibccconfig CACongestionSetting 1 0 0x4 200 1 0 0 # Configure CA Congestion Settings to SL 2 +ibccconfig CongestionControlTable 1 63 0 0:0 0:1 ... # Configure first block of Congestion Control Table +ibccconfig CongestionControlTable 1 127 0 0:64 0:65 ... # Configure second block of Congestion Control Table +.ft P +.fi +.UNINDENT +.UNINDENT +.SH FILES +.\" Common text for the config file +. +.SS CONFIG FILE +.sp +/usr/local/etc/infiniband\-diags/ibdiag.conf +.sp +A global config file is provided to set some of the common options for all +tools. See supplied config file for details. +.SH AUTHOR +.INDENT 0.0 +.TP +.B Albert Chu +< \fI\%chu11@llnl.gov\fP > +.UNINDENT +.\" Generated by docutils manpage writer. +. 
diff --git a/buildlib/pandoc-prebuilt/15509ed914ee358ac026220db5abf5f9fe1737de b/buildlib/pandoc-prebuilt/15509ed914ee358ac026220db5abf5f9fe1737de new file mode 100644 index 0000000..7a629ad --- /dev/null +++ b/buildlib/pandoc-prebuilt/15509ed914ee358ac026220db5abf5f9fe1737de @@ -0,0 +1,31 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "IBV_GET_DEVICE_NAME" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[aq]s Manual" +.hy +.SH NAME +.PP +ibv_get_device_name \- get an RDMA device\[aq]s name +.SH SYNOPSIS +.IP +.nf +\f[C] +#include\ <infiniband/verbs.h> + +const\ char\ *ibv_get_device_name(struct\ ibv_device\ *device); +\f[] +.fi +.SH DESCRIPTION +.PP +\f[B]ibv_get_device_name()\f[] returns a human\-readable name associated +with the RDMA device \f[I]device\f[]. +.SH RETURN VALUE +.PP +\f[B]ibv_get_device_name()\f[] returns a pointer to the device name, or +NULL if the request fails. +.SH SEE ALSO +.PP +\f[B]ibv_get_device_guid\f[](3), \f[B]ibv_get_device_list\f[](3), +\f[B]ibv_open_device\f[](3) +.SH AUTHOR +.PP +Dotan Barak <dotanba@gmail.com> diff --git a/buildlib/pandoc-prebuilt/1c3f51131206bb1a7ed34fbdc897910d313df687 b/buildlib/pandoc-prebuilt/1c3f51131206bb1a7ed34fbdc897910d313df687 new file mode 100644 index 0000000..98d7d66 --- /dev/null +++ b/buildlib/pandoc-prebuilt/1c3f51131206bb1a7ed34fbdc897910d313df687 @@ -0,0 +1,80 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "mlx5dv_flow_action_esp" "3" "" "" "" +.hy +.SH NAME +.PP +mlx5dv_flow_action_esp \- Flow action esp for mlx5 provider +.SH SYNOPSIS +.IP +.nf +\f[C] +#include\ <infiniband/mlx5/mlx5dv.h> + +struct\ ibv_flow_action\ * +mlx5dv_create_flow_action_esp(struct\ ibv_context\ *ctx, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ ibv_flow_action_esp_attr\ *esp, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ mlx5dv_flow_action_esp\ *mlx5_attr); +\f[] +.fi +.SH DESCRIPTION +.PP +Create an IPSEC ESP flow steering action. +.PD 0 +.P +.PD +This verb is identical to \f[I]ibv_create_flow_action_esp\f[] verb, but +allows mlx5 specific flags. +.SH ARGUMENTS +.PP +Please see \f[I]ibv_flow_action_esp(3)\f[] man page for \f[I]ctx\f[] and +\f[I]esp\f[]. +.SS \f[I]mlx5_attr\f[] argument +.IP +.nf +\f[C] +struct\ mlx5dv_flow_action_esp\ { +\ \ \ \ uint64_t\ comp_mask;\ \ /*\ Use\ enum\ mlx5dv_flow_action_esp_mask\ */ +\ \ \ \ uint32_t\ action_flags;\ /*\ Use\ enum\ mlx5dv_flow_action_flags\ */ +}; +\f[] +.fi +.TP +.B \f[I]comp_mask\f[] +Bitmask specifying what fields in the structure are valid (\f[I]enum +mlx5dv_flow_action_esp_mask\f[]). +.RS +.RE +.TP +.B \f[I]action_flags\f[] +A bitwise OR of the various values described below. +.RS +.PP +\f[I]MLX5DV_FLOW_ACTION_FLAGS_REQUIRE_METADATA\f[]: +.PD 0 +.P +.PD +Each received and transmitted packet using offload is expected to carry +metadata in the form of a L2 header +.PD 0 +.P +.PD +with ethernet type 0x8CE4, followed by 6 bytes of data and the original +packet ethertype. +.RE +.SH NOTE +.PP +The ESN is expected to be placed in the IV field for egress packets. +.PD 0 +.P +.PD +The 64 bit sequence number is written in big\-endian over the 64 bit IV +field. +.PD 0 +.P +.PD +There is no need to call modify to update the ESN window on egress when +this DV is used. 
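+.SH EXAMPLE
+.PP
+A minimal sketch of the call is shown below; it assumes an already
+opened device context \f[I]ctx\f[], and the zeroed \f[I]esp\f[]
+attribute is only a placeholder that a real caller must fill as
+described in \f[I]ibv_flow_action_esp(3)\f[].
+.IP
+.nf
+\f[C]
+/* Placeholder ESP attribute; fill per ibv_flow_action_esp(3). */
+struct ibv_flow_action_esp_attr esp = { 0 };
+
+/* Request the metadata L2 header behavior described above. */
+struct mlx5dv_flow_action_esp mlx5_attr = {
+    .comp_mask = MLX5DV_FLOW_ACTION_ESP_MASK_FLAGS,
+    .action_flags = MLX5DV_FLOW_ACTION_FLAGS_REQUIRE_METADATA,
+};
+
+struct ibv_flow_action *action =
+    mlx5dv_create_flow_action_esp(ctx, &esp, &mlx5_attr);
+if (!action)
+    /* errno indicates why the action could not be created */
+    return NULL;
+\f[]
+.fi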
+.SH SEE ALSO +.PP +\f[I]ibv_flow_action_esp(3)\f[], \f[I]RFC 4106\f[] diff --git a/buildlib/pandoc-prebuilt/2082c9e75706a10a0c0c9925f5108736249d8368 b/buildlib/pandoc-prebuilt/2082c9e75706a10a0c0c9925f5108736249d8368 new file mode 100644 index 0000000..d896f39 --- /dev/null +++ b/buildlib/pandoc-prebuilt/2082c9e75706a10a0c0c9925f5108736249d8368 @@ -0,0 +1,101 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "ibv_create_counters" "3" "2018\-04\-02" "libibverbs" "Libibverbs Programmer\[aq]s Manual" +.hy +.SH NAME +.PP +\f[B]ibv_create_counters\f[], \f[B]ibv_destroy_counters\f[] \- Create or +destroy a counters handle +.SH SYNOPSIS +.IP +.nf +\f[C] +#include\ <infiniband/verbs.h> + +struct\ ibv_counters\ * +ibv_create_counters(struct\ ibv_context\ *context, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ ibv_counters_init_attr\ *init_attr); + +int\ ibv_destroy_counters(struct\ ibv_counters\ *counters); +\f[] +.fi +.SH DESCRIPTION +.PP +\f[B]ibv_create_counters\f[]() creates a new counters handle for the +RDMA device context. +.PP +An ibv_counters handle can be attached to a verbs resource (e.g.: QP, +WQ, Flow) statically when these are created. +.PP +For example attach an ibv_counters statically to a Flow (struct +ibv_flow) during creation of a new Flow by calling +\f[B]ibv_create_flow()\f[]. +.PP +Counters are cleared upon creation and values will be monotonically +increasing. +.PP +\f[B]ibv_destroy_counters\f[]() releases the counters handle, user +should detach the counters object before destroying it. +.SH ARGUMENTS +.TP +.B \f[I]context\f[] +RDMA device context to create the counters on. +.RS +.RE +.TP +.B \f[I]init_attr\f[] +Is an ibv_counters_init_attr struct, as defined in verbs.h. +.RS +.RE +.SS \f[I]init_attr\f[] Argument +.IP +.nf +\f[C] +struct\ ibv_counters_init_attr\ { +\ \ \ \ int\ comp_mask; +}; +\f[] +.fi +.TP +.B \f[I]comp_mask\f[] +Bitmask specifying what fields in the structure are valid. +.RS +.RE +.SH RETURN VALUE +.PP +\f[B]ibv_create_counters\f[]() returns a pointer to the allocated +ibv_counters object, or NULL if the request fails (and sets errno to +indicate the failure reason) +.PP +\f[B]ibv_destroy_counters\f[]() returns 0 on success, or the value of +errno on failure (which indicates the failure reason) +.SH ERRORS +.TP +.B EOPNOTSUPP +\f[B]ibv_create_counters\f[]() is not currently supported on this device +(ENOSYS may sometimes be returned by old versions of libibverbs). 
+.RS +.RE +.TP +.B ENOMEM +\f[B]ibv_create_counters\f[]() could not create ibv_counters object, not +enough memory +.RS +.RE +.TP +.B EINVAL +invalid parameter supplied \f[B]ibv_destroy_counters\f[]() +.RS +.RE +.SH EXAMPLE +.PP +An example of use of ibv_counters is shown in \f[B]ibv_read_counters\f[] +.SH SEE ALSO +.PP +\f[B]ibv_attach_counters_point_flow\f[], \f[B]ibv_read_counters\f[], +\f[B]ibv_create_flow\f[] +.SH AUTHORS +.PP +Raed Salem <raeds@mellanox.com> +.PP +Alex Rosenbaum <alexr@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/23046225aae54879fdd2d044ba307096e412d64c b/buildlib/pandoc-prebuilt/23046225aae54879fdd2d044ba307096e412d64c new file mode 100644 index 0000000..0f691a3 --- /dev/null +++ b/buildlib/pandoc-prebuilt/23046225aae54879fdd2d044ba307096e412d64c @@ -0,0 +1,79 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "mlx5dv_alloc_var / mlx5dv_free_var" "3" "" "" "" +.hy +.SH NAME +.PP +mlx5dv_alloc_var \- Allocates a VAR +.PP +mlx5dv_free_var \- Frees a VAR +.SH SYNOPSIS +.IP +.nf +\f[C] +#include\ <infiniband/mlx5dv.h> + +struct\ mlx5dv_var\ * +mlx5dv_alloc_var(struct\ ibv_context\ *context,\ uint32_t\ flags); + +void\ mlx5dv_free_var(struct\ mlx5dv_var\ *dv_var); +\f[] +.fi +.SH DESCRIPTION +.PP +Create / free a VAR which can be used for some device commands over the +DEVX interface. +.PP +The DEVX API enables direct access from the user space area to the mlx5 +device driver, the VAR information is needed for few commands related to +Virtio. +.SH ARGUMENTS +.TP +.B \f[I]context\f[] +RDMA device context to work on. +.RS +.RE +.TP +.B \f[I]flags\f[] +Allocation flags for the UAR. +.RS +.RE +.SS dv_var +.IP +.nf +\f[C] +struct\ mlx5dv_var\ { +\ \ \ \ uint32_t\ page_id; +\ \ \ \ uint32_t\ length; +\ \ \ \ off_t\ mmap_off; +\ \ \ \ uint64_t\ comp_mask; +}; +\f[] +.fi +.TP +.B \f[I]page_id\f[] +The device page id to be used. +.RS +.RE +.TP +.B \f[I]length\f[] +The mmap length parameter to be used for mapping a VA to the allocated +VAR entry. +.RS +.RE +.TP +.B \f[I]mmap_off\f[] +The mmap offset parameter to be used for mapping a VA to the allocated +VAR entry. +.RS +.RE +.SH RETURN VALUE +.PP +Upon success \f[I]mlx5dv_alloc_var\f[] returns a pointer to the created +VAR ,on error NULL will be returned and errno will be set. +.SH SEE ALSO +.PP +\f[B]mlx5dv_open_device\f[], \f[B]mlx5dv_devx_obj_create\f[] +.SH AUTHOR +.PP +Yishai Hadas <yishaih@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/23660644c7d16519530ca5d9fe12f0f800e1f1c0 b/buildlib/pandoc-prebuilt/23660644c7d16519530ca5d9fe12f0f800e1f1c0 new file mode 100644 index 0000000..8fa3384 --- /dev/null +++ b/buildlib/pandoc-prebuilt/23660644c7d16519530ca5d9fe12f0f800e1f1c0 @@ -0,0 +1,69 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "mlx5dv_open_device" "3" "" "" "" +.hy +.SH NAME +.PP +mlx5dv_open_device \- Open an RDMA device context for the mlx5 provider +.SH SYNOPSIS +.IP +.nf +\f[C] +#include\ <infiniband/mlx5dv.h> + +struct\ ibv_context\ * +mlx5dv_open_device(struct\ ibv_device\ *device,\ struct\ mlx5dv_context_attr\ *attr); +\f[] +.fi +.SH DESCRIPTION +.PP +Open an RDMA device context with specific mlx5 provider attributes. +.SH ARGUMENTS +.TP +.B \f[I]device\f[] +RDMA device to open. +.RS +.RE +.SS \f[I]attr\f[] argument +.IP +.nf +\f[C] +struct\ mlx5dv_context_attr\ { +\ \ \ \ \ \ \ \ uint32_t\ flags; +\ \ \ \ \ \ \ \ uint64_t\ comp_mask; +}; +\f[] +.fi +.TP +.B \f[I]flags\f[] +.IP +.nf +\f[C] +A\ bitwise\ OR\ of\ the\ various\ values\ described\ below. 
+ +*MLX5DV_CONTEXT_FLAGS_DEVX*: +Allocate\ a\ DEVX\ context +\f[] +.fi +.RS +.RE +.TP +.B \f[I]comp_mask\f[] +.IP +.nf +\f[C] +Bitmask\ specifying\ what\ fields\ in\ the\ structure\ are\ valid +\f[] +.fi +.RS +.RE +.SH RETURN VALUE +.PP +Returns a pointer to the allocated device context, or NULL if the +request fails. +.SH SEE ALSO +.PP +\f[I]ibv_open_device(3)\f[] +.SH AUTHOR +.PP +Yishai Hadas <yishaih@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/241312b7f23c00b7c2e6311643a22e00e6eedaac b/buildlib/pandoc-prebuilt/241312b7f23c00b7c2e6311643a22e00e6eedaac new file mode 100644 index 0000000..52080c0 --- /dev/null +++ b/buildlib/pandoc-prebuilt/241312b7f23c00b7c2e6311643a22e00e6eedaac @@ -0,0 +1,174 @@ +.\" Man page generated from reStructuredText. +. +.TH IBPING 8 "2012-05-14" "" "Open IB Diagnostics" +.SH NAME +IBPING \- ping an InfiniBand address +. +.nr rst2man-indent-level 0 +. +.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. +.SH SYNOPSIS +.sp +ibping [options] <dest lid | guid> +.SH DESCRIPTION +.sp +ibping uses vendor mads to validate connectivity between IB nodes. +On exit, (IP) ping like output is show. ibping is run as client/server. +Default is to run as client. Note also that a default ping server is +implemented within the kernel. +.SH OPTIONS +.sp +\fB\-c, \-\-count\fP +stop after count packets +.sp +\fB\-f, \-\-flood\fP +flood destination: send packets back to back without delay +.sp +\fB\-o, \-\-oui\fP +use specified OUI number to multiplex vendor mads +.sp +\fB\-S, \-\-Server\fP +start in server mode (do not return) +.SS Addressing Flags +.\" Define the common option -L +. +.sp +\fB\-L, \-\-Lid\fP The address specified is a LID +.\" Define the common option -G +. +.sp +\fB\-G, \-\-Guid\fP The address specified is a Port GUID +.\" Define the common option -s +. +.sp +\fB\-s, \-\-sm_port <smlid>\fP use \(aqsmlid\(aq as the target lid for SA queries. +.SS Port Selection flags +.\" Define the common option -C +. +.sp +\fB\-C, \-\-Ca <ca_name>\fP use the specified ca_name. +.\" Define the common option -P +. +.sp +\fB\-P, \-\-Port <ca_port>\fP use the specified ca_port. +.\" Explanation of local port selection +. +.SS Local port Selection +.sp +Multiple port/Multiple CA support: when no IB device or port is specified +(see the "local umad parameters" below), the libibumad library +selects the port to use by the following criteria: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.IP 1. 3 +the first port that is ACTIVE. +.IP 2. 3 +if not found, the first port that is UP (physical link up). +.UNINDENT +.sp +If a port and/or CA name is specified, the libibumad library attempts +to fulfill the user request, and will fail if it is not possible. +.sp +For example: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +ibaddr # use the first port (criteria #1 above) +ibaddr \-C mthca1 # pick the best port from "mthca1" only. +ibaddr \-P 2 # use the second (active/up) port from the first available IB device. 
+ibaddr \-C mthca0 \-P 2 # use the specified port only. +.ft P +.fi +.UNINDENT +.UNINDENT +.UNINDENT +.UNINDENT +.SS Configuration flags +.\" Define the common option -z +. +.sp +\fB\-\-config, \-z <config_file>\fP Specify alternate config file. +.INDENT 0.0 +.INDENT 3.5 +Default: /usr/local/etc/infiniband\-diags/ibdiag.conf +.UNINDENT +.UNINDENT +.\" Define the common option -t +. +.sp +\fB\-t, \-\-timeout <timeout_ms>\fP override the default timeout for the solicited mads. +.SS Debugging flags +.\" Define the common option -h +. +.sp +\fB\-h, \-\-help\fP show the usage message +.\" Define the common option -d +. +.INDENT 0.0 +.TP +.B \-d +raise the IB debugging level. +May be used several times (\-ddd or \-d \-d \-d). +.UNINDENT +.\" Define the common option -e +. +.INDENT 0.0 +.TP +.B \-e +show send and receive errors (timeouts and others) +.UNINDENT +.\" Define the common option -v +. +.INDENT 0.0 +.TP +.B \fB\-v, \-\-verbose\fP +increase the application verbosity level. +May be used several times (\-vv or \-v \-v \-v) +.UNINDENT +.\" Define the common option -V +. +.sp +\fB\-V, \-\-version\fP show the version info. +.SH FILES +.\" Common text for the config file +. +.SS CONFIG FILE +.sp +/usr/local/etc/infiniband\-diags/ibdiag.conf +.sp +A global config file is provided to set some of the common options for all +tools. See supplied config file for details. +.SH AUTHOR +.INDENT 0.0 +.TP +.B Hal Rosenstock +< \fI\%halr@voltaire.com\fP > +.UNINDENT +.\" Generated by docutils manpage writer. +. diff --git a/buildlib/pandoc-prebuilt/2a9899c3a62b0c9164f7f76f08930a7e80ea9e51 b/buildlib/pandoc-prebuilt/2a9899c3a62b0c9164f7f76f08930a7e80ea9e51 new file mode 100644 index 0000000..9618af6 --- /dev/null +++ b/buildlib/pandoc-prebuilt/2a9899c3a62b0c9164f7f76f08930a7e80ea9e51 @@ -0,0 +1,82 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "mlx5dv_devx_get_event" "3" "" "" "" +.hy +.SH NAME +.PP +mlx5dv_devx_get_event \- Get an asynchronous event. +.SH SYNOPSIS +.IP +.nf +\f[C] +#include\ <infiniband/mlx5dv.h> + +struct\ mlx5dv_devx_async_event_hdr\ { +\ \ \ \ uint64_t\ \ \ \ cookie; +\ \ \ \ uint8_t\ \ \ \ \ out_data[]; +}; + +ssize_t\ mlx5dv_devx_get_event(struct\ mlx5dv_devx_event_channel\ *event_channel, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ mlx5dv_devx_async_event_hdr\ *event_data, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ size_t\ event_resp_len) +\f[] +.fi +.SH DESCRIPTION +.PP +Get a device event on the given \f[I]event_channel\f[]. +Post a successful subscription over the event channel by calling to +mlx5dv_devx_subscribe_devx_event() the application should use this API +to get the response once an event has occurred. +.PP +Upon response the \f[I]cookie\f[] that was supplied upon the +subscription is returned and the \f[I]out_data\f[] includes the data +itself. +The \f[I]out_data\f[] may be omitted in case the channel was created +with the omit data flag. +.PP +The application must supply a large enough buffer to hold the event +according to the device specification, the buffer size is given by the +input \f[I]event_resp_len\f[] parameter. +.SH ARGUMENTS +.TP +.B \f[I]event_channel\f[] +.IP +.nf +\f[C] +The\ channel\ to\ get\ the\ event\ over. +\f[] +.fi +.RS +.RE +.TP +.B \f[I]event_data\f[] +The output data from the asynchronous event. +.RS +.RE +.TP +.B \f[I]event_resp_len\f[] +The output buffer size to hold the response. 
+.RS
+.RE
+.SH RETURN VALUE
+.PP
+Upon success \f[I]mlx5dv_devx_get_event\f[] will return the number of
+bytes read, otherwise \-1 will be returned and errno will be set.
+.SH NOTES
+.PP
+In case the \f[I]event_channel\f[] was created with the omit data flag,
+events having the same type may be combined per subscription and be
+reported once with the matching \f[I]cookie\f[].
+In that mode of operation, ordering is not preserved between these
+events and other events on this channel.
+.PP
+On the other hand, when each event holds the device data, ordering is
+preserved; however, events might be lost due to a lack of kernel
+memory, in which case EOVERFLOW will be reported.
+.SH SEE ALSO
+.PP
+\f[I]mlx5dv_open_device(3)\f[],
+\f[I]mlx5dv_devx_subscribe_devx_event(3)\f[]
+.SH AUTHOR
+.PP
+Yishai Hadas <yishaih@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/2b0acd4321378a4260fb5b442a2d4e8b4834c12d b/buildlib/pandoc-prebuilt/2b0acd4321378a4260fb5b442a2d4e8b4834c12d new file mode 100644 index 0000000..277f93d --- /dev/null +++ b/buildlib/pandoc-prebuilt/2b0acd4321378a4260fb5b442a2d4e8b4834c12d @@ -0,0 +1,79 @@ +.\" Man page generated from reStructuredText.
+.
+.TH IBCACHEEDIT 8 "2017-08-21" "" "Open IB Diagnostics"
+.SH NAME
+ibcacheedit \- edit an ibnetdiscover cache
+.
+.nr rst2man-indent-level 0
+.
+.de1 rstReportMargin
+\\$1 \\n[an-margin]
+level \\n[rst2man-indent-level]
+level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
+-
+\\n[rst2man-indent0]
+\\n[rst2man-indent1]
+\\n[rst2man-indent2]
+..
+.de1 INDENT
+.\" .rstReportMargin pre:
+. RS \\$1
+. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
+. nr rst2man-indent-level +1
+.\" .rstReportMargin post:
+..
+.de UNINDENT
+. RE
+.\" indent \\n[an-margin]
+.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.nr rst2man-indent-level -1
+.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
+..
+.SH SYNOPSIS
+.sp
+ibcacheedit [options] <orig.cache> <new.cache>
+.SH DESCRIPTION
+.sp
+ibcacheedit allows users to edit an ibnetdiscover cache created through the
+\fB\-\-cache\fP option in \fBibnetdiscover(8)\fP .
+.SH OPTIONS
+.INDENT 0.0
+.TP
+.B \fB\-\-switchguid BEFOREGUID:AFTERGUID\fP
+Specify a switchguid that should be changed. The before and after guid
+should be separated by a colon. On switches, port guids are identical
+to the switch guid, so port guids will be adjusted as well on switches.
+.TP
+.B \fB\-\-caguid BEFOREGUID:AFTERGUID\fP
+Specify a caguid that should be changed. The before and after guid
+should be separated by a colon.
+.TP
+.B \fB\-\-sysimgguid BEFOREGUID:AFTERGUID\fP
+Specify a sysimgguid that should be changed. The before and after guid
+should be separated by a colon.
+.TP
+.B \fB\-\-portguid NODEGUID:BEFOREGUID:AFTERGUID\fP
+Specify a portguid that should be changed. The nodeguid of the port
+(e.g. switchguid or caguid) should be specified first, followed by a
+colon, the before port guid, another colon, then the after port guid.
+On switches, port guids are identical to the switch guid, so the switch
+guid will be adjusted as well on switches.
+.UNINDENT
+.SS Debugging flags
+.\" Define the common option -h
+.
+.sp
+\fB\-h, \-\-help\fP show the usage message
+.\" Define the common option -V
+.
+.sp
+\fB\-V, \-\-version\fP show the version info.
+.SH AUTHORS
+.INDENT 0.0
+.TP
+.B Albert Chu
+< \fI\%chu11@llnl.gov\fP >
+.UNINDENT
+.\" Generated by docutils manpage writer.
+.
diff --git a/buildlib/pandoc-prebuilt/2b17e4fb06589e6a6911da4c72f7903f110168e8 b/buildlib/pandoc-prebuilt/2b17e4fb06589e6a6911da4c72f7903f110168e8 new file mode 100644 index 0000000..fcdca74 --- /dev/null +++ b/buildlib/pandoc-prebuilt/2b17e4fb06589e6a6911da4c72f7903f110168e8 @@ -0,0 +1,184 @@ +.\" Man page generated from reStructuredText. +. +.TH IBROUTERS 8 "2016-12-20" "" "OpenIB Diagnostics" +.SH NAME +IBROUTERS \- show InfiniBand router nodes in topology +. +.nr rst2man-indent-level 0 +. +.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. +.SH SYNOPSIS +.sp +ibrouters [options] [<topology\-file>] +.SH DESCRIPTION +.sp +ibrouters is a script which either walks the IB subnet topology or uses an +already saved topology file and extracts the router nodes. +.SH OPTIONS +.\" Define the common option -C +. +.sp +\fB\-C, \-\-Ca <ca_name>\fP use the specified ca_name. +.\" Define the common option -P +. +.sp +\fB\-P, \-\-Port <ca_port>\fP use the specified ca_port. +.\" Define the common option -t +. +.sp +\fB\-t, \-\-timeout <timeout_ms>\fP override the default timeout for the solicited mads. +.\" Define the common option -y +. +.INDENT 0.0 +.TP +.B \fB\-y, \-\-m_key <key>\fP +use the specified M_key for requests. If non\-numeric value (like \(aqx\(aq) +is specified then a value will be prompted for. +.UNINDENT +.\" Define the common option -h +. +.sp +\fB\-h, \-\-help\fP show the usage message +.\" Define the common option -z +. +.sp +\fB\-\-config, \-z <config_file>\fP Specify alternate config file. +.INDENT 0.0 +.INDENT 3.5 +Default: /usr/local/etc/infiniband\-diags/ibdiag.conf +.UNINDENT +.UNINDENT +.\" Explanation of local port selection +. +.SS Local port Selection +.sp +Multiple port/Multiple CA support: when no IB device or port is specified +(see the "local umad parameters" below), the libibumad library +selects the port to use by the following criteria: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.IP 1. 3 +the first port that is ACTIVE. +.IP 2. 3 +if not found, the first port that is UP (physical link up). +.UNINDENT +.sp +If a port and/or CA name is specified, the libibumad library attempts +to fulfill the user request, and will fail if it is not possible. +.sp +For example: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +ibaddr # use the first port (criteria #1 above) +ibaddr \-C mthca1 # pick the best port from "mthca1" only. +ibaddr \-P 2 # use the second (active/up) port from the first available IB device. +ibaddr \-C mthca0 \-P 2 # use the specified port only. +.ft P +.fi +.UNINDENT +.UNINDENT +.UNINDENT +.UNINDENT +.SH FILES +.\" Common text for the config file +. +.SS CONFIG FILE +.sp +/usr/local/etc/infiniband\-diags/ibdiag.conf +.sp +A global config file is provided to set some of the common options for all +tools. See supplied config file for details. +.\" Common text to describe the node name map file. +. 
+.SS NODE NAME MAP FILE FORMAT +.sp +The node name map is used to specify user friendly names for nodes in the +output. GUIDs are used to perform the lookup. +.sp +This functionality is provided by the opensm\-libs package. See \fBopensm(8)\fP +for the file location for your installation. +.sp +\fBGenerically:\fP +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +# comment +<guid> "<name>" +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +\fBExample:\fP +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +# IB1 +# Line cards +0x0008f104003f125c "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB\-24D" +0x0008f104003f125d "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB\-24D" +0x0008f104003f10d2 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB\-24D" +0x0008f104003f10d3 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB\-24D" +0x0008f104003f10bf "IB1 (Rack 11 slot 12 ) ISR9288/ISR9096 Voltaire sLB\-24D" + +# Spines +0x0008f10400400e2d "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D" +0x0008f10400400e2e "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D" +0x0008f10400400e2f "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D" +0x0008f10400400e31 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB\-12D" +0x0008f10400400e32 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB\-12D" + +# GUID Node Name +0x0008f10400411a08 "SW1 (Rack 3) ISR9024 Voltaire 9024D" +0x0008f10400411a28 "SW2 (Rack 3) ISR9024 Voltaire 9024D" +0x0008f10400411a34 "SW3 (Rack 3) ISR9024 Voltaire 9024D" +0x0008f104004119d0 "SW4 (Rack 3) ISR9024 Voltaire 9024D" +.ft P +.fi +.UNINDENT +.UNINDENT +.SH SEE ALSO +.sp +ibnetdiscover(8) +.SH DEPENDENCIES +.sp +ibnetdiscover, ibnetdiscover format +.SH AUTHOR +.INDENT 0.0 +.TP +.B Hal Rosenstock +< \fI\%halr@voltaire.com\fP > +.UNINDENT +.\" Generated by docutils manpage writer. +. diff --git a/buildlib/pandoc-prebuilt/2cd4402b920e0a57d92dcf281f2091ee6e4ac141 b/buildlib/pandoc-prebuilt/2cd4402b920e0a57d92dcf281f2091ee6e4ac141 new file mode 100644 index 0000000..9713cfd --- /dev/null +++ b/buildlib/pandoc-prebuilt/2cd4402b920e0a57d92dcf281f2091ee6e4ac141 @@ -0,0 +1,41 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "IBV_RESIZE_CQ" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[aq]s Manual" +.hy +.SH NAME +.PP +ibv_resize_cq \- resize a completion queue (CQ) +.SH SYNOPSIS +.IP +.nf +\f[C] +#include\ <infiniband/verbs.h> + +int\ ibv_resize_cq(struct\ ibv_cq\ *cq,\ int\ cqe); +\f[] +.fi +.SH DESCRIPTION +.PP +\f[B]ibv_resize_cq()\f[] resizes the completion queue (CQ) \f[I]cq\f[] +to have at least \f[I]cqe\f[] entries. +\f[I]cqe\f[] must be at least the number of unpolled entries in the CQ +\f[I]cq\f[]. +If \f[I]cqe\f[] is a valid value less than the current CQ size, +\f[B]ibv_resize_cq()\f[] may not do anything, since this function is +only guaranteed to resize the CQ to a size at least as big as the +requested size. +.SH RETURN VALUE +.PP +\f[B]ibv_resize_cq()\f[] returns 0 on success, or the value of errno on +failure (which indicates the failure reason). +.SH NOTES +.PP +\f[B]ibv_resize_cq()\f[] may assign a CQ size greater than or equal to +the requested size. +The cqe member of \f[I]cq\f[] will be updated to the actual size. 
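+.SH EXAMPLE
+.PP
+A brief sketch, assuming \f[I]cq\f[] was obtained earlier from
+\f[B]ibv_create_cq()\f[], for a caller that needs room for at least 128
+entries.
+.IP
+.nf
+\f[C]
+/* Grow (or keep) the CQ so it can hold at least 128 entries. */
+int err = ibv_resize_cq(cq, 128);
+if (err)
+    return err; /* err is the errno value describing the failure */
+
+/* On success cq->cqe reflects the actual, possibly larger, size. */
+\f[]
+.fi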
+.SH SEE ALSO +.PP +\f[B]ibv_create_cq\f[](3), \f[B]ibv_destroy_cq\f[](3) +.SH AUTHOR +.PP +Dotan Barak <dotanba@gmail.com> diff --git a/buildlib/pandoc-prebuilt/2d8bf0753443ec6498bc7a90d728d90110707533 b/buildlib/pandoc-prebuilt/2d8bf0753443ec6498bc7a90d728d90110707533 new file mode 100644 index 0000000..7db2a37 --- /dev/null +++ b/buildlib/pandoc-prebuilt/2d8bf0753443ec6498bc7a90d728d90110707533 @@ -0,0 +1,80 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "mlx5dv_devx_subscribe_devx_event, mlx5dv_devx_subscribe_devx_event_fd" "3" "" "" "" +.hy +.SH NAME +.PP +mlx5dv_devx_subscribe_devx_event \- Subscribe over an event channel for +device events. +.PP +mlx5dv_devx_subscribe_devx_event_fd \- Subscribe over an event channel +for device events to signal eventfd. +.SH SYNOPSIS +.IP +.nf +\f[C] +#include\ <infiniband/mlx5dv.h> + +int\ mlx5dv_devx_subscribe_devx_event(struct\ mlx5dv_devx_event_channel\ *dv_event_channel, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ mlx5dv_devx_obj\ *obj, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint16_t\ events_sz, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint16_t\ events_num[], +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint64_t\ cookie) + +int\ mlx5dv_devx_subscribe_devx_event_fd(struct\ mlx5dv_devx_event_channel\ *dv_event_channel, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ int\ fd, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ mlx5dv_devx_obj\ *obj, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint16_t\ event_num) +\f[] +.fi +.SH DESCRIPTION +.PP +Subscribe over a DEVX event channel for device events. +.SH ARGUMENTS +.TP +.B \f[I]dv_event_channel\f[] +Event channel to subscribe over. +.RS +.RE +.TP +.B \f[I]fd\f[] +A file descriptor that previously was opened by the eventfd() system +call. +.RS +.RE +.TP +.B \f[I]obj\f[] +DEVX object that \f[I]events_num\f[] relates to, can be NULL for +unaffiliated events. +.RS +.RE +.TP +.B \f[I]events_sz\f[] +Size of the \f[I]events_num\f[] buffer that holds the events to +subscribe for. +.RS +.RE +.TP +.B \f[I]events_num\f[] +Holds the required event numbers to subscribe for, numbers are according +to the device specification. +.RS +.RE +.TP +.B \f[I]cookie\f[] +The value to be returned back when reading the event, can be used as an +ID for application use. +.RS +.RE +.SH NOTES +.PP +When mlx5dv_devx_subscribe_devx_event_fd will be used the \f[I]fd\f[] +will be signaled once an event has occurred. +.SH SEE ALSO +.PP +\f[I]mlx5dv_open_device(3)\f[], +\f[I]mlx5dv_devx_create_event_channel(3)\f[], +\f[I]mlx5dv_devx_get_event(3)\f[] +.SH AUTHOR +.PP +Yishai Hadas <yishaih@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/34cf0e59f60dd9af279902148ab5180325339afc b/buildlib/pandoc-prebuilt/34cf0e59f60dd9af279902148ab5180325339afc new file mode 100644 index 0000000..6d544c5 --- /dev/null +++ b/buildlib/pandoc-prebuilt/34cf0e59f60dd9af279902148ab5180325339afc @@ -0,0 +1,32 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "IBV_INC_RKEY" "3" "2015\-01\-29" "libibverbs" "Libibverbs Programmer\[aq]s Manual" +.hy +.SH NAME +.PP +ibv_inc_rkey \- creates a new rkey from the given one +.SH SYNOPSIS +.IP +.nf +\f[C] +#include\ <infiniband/verbs.h> + +uint32_t\ ibv_inc_rkey(uint32_t\ rkey); +\f[] +.fi +.SH DESCRIPTION +.PP +\f[B]ibv_inc_rkey()\f[] Increases the 8 LSB of \f[I]rkey\f[] and returns +the new value. +.SH RETURN VALUE +.PP +\f[B]ibv_inc_rkey()\f[] returns the new rkey. 
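+.PP
+As a short sketch, the call is typically applied to an existing key; a
+memory window \f[I]mw\f[] is assumed here for illustration.
+.IP
+.nf
+\f[C]
+/* Derive a fresh rkey: only the 8 LSB tag bits change, the index
+ * bits (0xffffff00) are preserved. */
+uint32_t new_rkey = ibv_inc_rkey(mw->rkey);
+\f[]
+.fi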
+.SH NOTES +.PP +The verb generates a new rkey that is different from the previous one on +its tag part but has the same index (bits 0xffffff00). +A use case for this verb can be to create a new rkey from a Memory +window\[aq]s rkey when binding it to a Memory region. +.SH AUTHORS +.PP +Majd Dibbiny <majd@mellanox.com>, Yishai Hadas <yishaih@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/35aa3d62439669ddf4e47543a8194b7cd203a278 b/buildlib/pandoc-prebuilt/35aa3d62439669ddf4e47543a8194b7cd203a278 new file mode 100644 index 0000000..27eb376 --- /dev/null +++ b/buildlib/pandoc-prebuilt/35aa3d62439669ddf4e47543a8194b7cd203a278 @@ -0,0 +1,314 @@ +.\" Man page generated from reStructuredText. +. +.TH IBTRACERT 8 "2018-04-02" "" "Open IB Diagnostics" +.SH NAME +ibtracert \- trace InfiniBand path +. +.nr rst2man-indent-level 0 +. +.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. +.SH SYNOPSIS +.sp +ibtracert [options] [<lid|guid> [<startlid> [<endlid>]]] +.SH DESCRIPTION +.sp +ibtracert uses SMPs to trace the path from a source GID/LID to a +destination GID/LID. Each hop along the path is displayed until +the destination is reached or a hop does not respond. By using +the \-m option, multicast path tracing can be performed between source +and destination nodes. +.SH OPTIONS +.INDENT 0.0 +.TP +.B \fB\-n, \-\-no_info\fP +simple format; don\(aqt show additional information +.TP +.B \fB\-m\fP +show the multicast trace of the specified mlid +.TP +.B \fB\-f, \-\-force\fP +force route to destination port +.UNINDENT +.SS Addressing Flags +.\" Define the common option -G +. +.sp +\fB\-G, \-\-Guid\fP The address specified is a Port GUID +.\" Define the common option -L +. +.sp +\fB\-L, \-\-Lid\fP The address specified is a LID +.\" Define the common option -s +. +.sp +\fB\-s, \-\-sm_port <smlid>\fP use \(aqsmlid\(aq as the target lid for SA queries. +.\" Define the common option --ports-file +. +.sp +\fB\-\-ports\-file <ports\-file>\fP Specify a ports file. +.INDENT 0.0 +.INDENT 3.5 +This file contains multiple source and destination lid or guid pairs. See FILES section. +.UNINDENT +.UNINDENT +.SS Port Selection flags +.\" Define the common option -C +. +.sp +\fB\-C, \-\-Ca <ca_name>\fP use the specified ca_name. +.\" Define the common option -P +. +.sp +\fB\-P, \-\-Port <ca_port>\fP use the specified ca_port. +.\" Explanation of local port selection +. +.SS Local port Selection +.sp +Multiple port/Multiple CA support: when no IB device or port is specified +(see the "local umad parameters" below), the libibumad library +selects the port to use by the following criteria: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.IP 1. 3 +the first port that is ACTIVE. +.IP 2. 3 +if not found, the first port that is UP (physical link up). +.UNINDENT +.sp +If a port and/or CA name is specified, the libibumad library attempts +to fulfill the user request, and will fail if it is not possible. 
+.sp +For example: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +ibaddr # use the first port (criteria #1 above) +ibaddr \-C mthca1 # pick the best port from "mthca1" only. +ibaddr \-P 2 # use the second (active/up) port from the first available IB device. +ibaddr \-C mthca0 \-P 2 # use the specified port only. +.ft P +.fi +.UNINDENT +.UNINDENT +.UNINDENT +.UNINDENT +.SS Debugging flags +.\" Define the common option -d +. +.INDENT 0.0 +.TP +.B \-d +raise the IB debugging level. +May be used several times (\-ddd or \-d \-d \-d). +.UNINDENT +.\" Define the common option -e +. +.INDENT 0.0 +.TP +.B \-e +show send and receive errors (timeouts and others) +.UNINDENT +.\" Define the common option -h +. +.sp +\fB\-h, \-\-help\fP show the usage message +.\" Define the common option -v +. +.INDENT 0.0 +.TP +.B \fB\-v, \-\-verbose\fP +increase the application verbosity level. +May be used several times (\-vv or \-v \-v \-v) +.UNINDENT +.\" Define the common option -V +. +.sp +\fB\-V, \-\-version\fP show the version info. +.SS Configuration flags +.\" Define the common option -t +. +.sp +\fB\-t, \-\-timeout <timeout_ms>\fP override the default timeout for the solicited mads. +.\" Define the common option --node-name-map +. +.sp +\fB\-\-node\-name\-map <node\-name\-map>\fP Specify a node name map. +.INDENT 0.0 +.INDENT 3.5 +This file maps GUIDs to more user friendly names. See FILES section. +.UNINDENT +.UNINDENT +.\" Define the common option -y +. +.INDENT 0.0 +.TP +.B \fB\-y, \-\-m_key <key>\fP +use the specified M_key for requests. If non\-numeric value (like \(aqx\(aq) +is specified then a value will be prompted for. +.UNINDENT +.\" Define the common option -z +. +.sp +\fB\-\-config, \-z <config_file>\fP Specify alternate config file. +.INDENT 0.0 +.INDENT 3.5 +Default: /usr/local/etc/infiniband\-diags/ibdiag.conf +.UNINDENT +.UNINDENT +.SH FILES +.\" Common text for the config file +. +.SS CONFIG FILE +.sp +/usr/local/etc/infiniband\-diags/ibdiag.conf +.sp +A global config file is provided to set some of the common options for all +tools. See supplied config file for details. +.\" Common text to describe the node name map file. +. +.SS NODE NAME MAP FILE FORMAT +.sp +The node name map is used to specify user friendly names for nodes in the +output. GUIDs are used to perform the lookup. +.sp +This functionality is provided by the opensm\-libs package. See \fBopensm(8)\fP +for the file location for your installation. 
+.sp +\fBGenerically:\fP +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +# comment +<guid> "<name>" +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +\fBExample:\fP +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +# IB1 +# Line cards +0x0008f104003f125c "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB\-24D" +0x0008f104003f125d "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB\-24D" +0x0008f104003f10d2 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB\-24D" +0x0008f104003f10d3 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB\-24D" +0x0008f104003f10bf "IB1 (Rack 11 slot 12 ) ISR9288/ISR9096 Voltaire sLB\-24D" + +# Spines +0x0008f10400400e2d "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D" +0x0008f10400400e2e "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D" +0x0008f10400400e2f "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D" +0x0008f10400400e31 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB\-12D" +0x0008f10400400e32 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB\-12D" + +# GUID Node Name +0x0008f10400411a08 "SW1 (Rack 3) ISR9024 Voltaire 9024D" +0x0008f10400411a28 "SW2 (Rack 3) ISR9024 Voltaire 9024D" +0x0008f10400411a34 "SW3 (Rack 3) ISR9024 Voltaire 9024D" +0x0008f104004119d0 "SW4 (Rack 3) ISR9024 Voltaire 9024D" +.ft P +.fi +.UNINDENT +.UNINDENT +.\" Common text to describe the port file. +. +.SS PORTS FILE FORMAT +.sp +The ports file can be used to specify multiple source and destination pairs. They can be lids or guids. If guids, use the \-G option to indicate that. +.sp +\fBGenerically:\fP +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +# comment +<src> <dst> +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +\fBExample:\fP +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +73 207 +203 657 +531 101 + +> OR < + +0x0008f104003f125c 0x0008f104003f133d +0x0008f1040011ab07 0x0008f104004265c0 +0x0008f104007c5510 0x0008f1040099bb08 +.ft P +.fi +.UNINDENT +.UNINDENT +.SH EXAMPLES +.sp +Unicast examples +.INDENT 0.0 +.TP +.B :: +ibtracert 4 16 # show path between lids 4 and 16 +ibtracert \-n 4 16 # same, but using simple output format +ibtracert \-G 0x8f1040396522d 0x002c9000100d051 # use guid addresses +.UNINDENT +.sp +Multicast example +.INDENT 0.0 +.TP +.B :: +ibtracert \-m 0xc000 4 16 # show multicast path of mlid 0xc000 between lids 4 and 16 +.UNINDENT +.SH SEE ALSO +.sp +ibroute (8) +.SH AUTHOR +.INDENT 0.0 +.TP +.B Hal Rosenstock +<\fI\%hal.rosenstock@gmail.com\fP> +.TP +.B Ira Weiny +< \fI\%ira.weiny@intel.com\fP > +.UNINDENT +.\" Generated by docutils manpage writer. +. diff --git a/buildlib/pandoc-prebuilt/35de5f25ab929eed324046bc74a2a953f3b8a47b b/buildlib/pandoc-prebuilt/35de5f25ab929eed324046bc74a2a953f3b8a47b new file mode 100644 index 0000000..bb41e29 --- /dev/null +++ b/buildlib/pandoc-prebuilt/35de5f25ab929eed324046bc74a2a953f3b8a47b @@ -0,0 +1,49 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "IBV_ATTACH_MCAST" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[aq]s Manual" +.hy +.SH NAME +.PP +ibv_attach_mcast, ibv_detach_mcast \- attach and detach a queue pair +(QPs) to/from a multicast group +.SH SYNOPSIS +.IP +.nf +\f[C] +#include\ <infiniband/verbs.h> + +int\ ibv_attach_mcast(struct\ ibv_qp\ *qp,\ const\ union\ ibv_gid\ *gid,\ uint16_t\ lid); + +int\ ibv_detach_mcast(struct\ ibv_qp\ *qp,\ const\ union\ ibv_gid\ *gid,\ uint16_t\ lid); +\f[] +.fi +.SH DESCRIPTION +.PP +\f[B]ibv_attach_mcast()\f[] attaches the QP \f[I]qp\f[] to the multicast +group having MGID \f[I]gid\f[] and MLID \f[I]lid\f[]. 
+.PP +\f[B]ibv_detach_mcast()\f[] detaches the QP \f[I]qp\f[] to the multicast +group having MGID \f[I]gid\f[] and MLID \f[I]lid\f[]. +.SH RETURN VALUE +.PP +\f[B]ibv_attach_mcast()\f[] and \f[B]ibv_detach_mcast()\f[] returns 0 on +success, or the value of errno on failure (which indicates the failure +reason). +.SH NOTES +.PP +Only QPs of Transport Service Type \f[B]IBV_QPT_UD\f[] may be attached +to multicast groups. +.PP +If a QP is attached to the same multicast group multiple times, the QP +will still receive a single copy of a multicast message. +.PP +In order to receive multicast messages, a join request for the multicast +group must be sent to the subnet administrator (SA), so that the +fabric\[aq]s multicast routing is configured to deliver messages to the +local port. +.SH SEE ALSO +.PP +\f[B]ibv_create_qp\f[](3) +.SH AUTHOR +.PP +Dotan Barak <dotanba@gmail.com> diff --git a/buildlib/pandoc-prebuilt/38456c16c465b34868aad3b267abb2ce131e2e58 b/buildlib/pandoc-prebuilt/38456c16c465b34868aad3b267abb2ce131e2e58 new file mode 100644 index 0000000..888e48a --- /dev/null +++ b/buildlib/pandoc-prebuilt/38456c16c465b34868aad3b267abb2ce131e2e58 @@ -0,0 +1,314 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "MLX5DV_DR API" "3" "2019\-03\-28" "mlx5" "mlx5 Programmer\[aq]s Manual" +.hy +.SH NAME +.PP +mlx5dv_dr_domain_create, mlx5dv_dr_domain_sync, mlx5dv_dr_domain_destroy +\- Manage flow domains +.PP +mlx5dv_dr_table_create, mlx5dv_dr_table_destroy \- Manage flow tables +.PP +mlx5dv_dr_matcher_create, mlx5dv_dr_matcher_destroy \- Manage flow +matchers +.PP +mlx5dv_dr_rule_create, mlx5dv_dr_rule_destroy \- Manage flow rules +.PP +mlx5dv_dr_action_create_drop \- Create drop action +.PP +mlx5dv_dr_action_create_tag \- Create tag actions +.PP +mlx5dv_dr_action_create_dest_ibv_qp, mlx5dv_dr_action_create_dest_table, +mlx5dv_dr_action_create_dest_vport \- Create packet destination actions +.PP +mlx5dv_dr_action_create_packet_reformat \- Create packet reformat +actions +.PP +mlx5dv_dr_action_create_modify_header \- Create modify header actions +.PP +mlx5dv_dr_action_create_flow_counter \- Create devx flow counter actions +.PP +mlx5dv_dr_action_create_flow_meter, mlx5dv_dr_action_modify_flow_meter +\- Create and modify meter action +.PP +mlx5dv_dr_action_destroy \- Destroy actions +.SH SYNOPSIS +.IP +.nf +\f[C] +#include\ <infiniband/mlx5dv.h> + +struct\ mlx5dv_dr_domain\ *mlx5dv_dr_domain_create( +\ \ \ \ \ \ \ \ struct\ ibv_context\ *ctx, +\ \ \ \ \ \ \ \ enum\ mlx5dv_dr_domain_type\ type); + +int\ mlx5dv_dr_domain_sync( +\ \ \ \ \ \ \ \ struct\ mlx5dv_dr_domain\ *domain, +\ \ \ \ \ \ \ \ uint32_t\ flags); + +int\ mlx5dv_dr_domain_destroy(struct\ mlx5dv_dr_domain\ *domain); + +struct\ mlx5dv_dr_table\ *mlx5dv_dr_table_create( +\ \ \ \ \ \ \ \ struct\ mlx5dv_dr_domain\ *domain, +\ \ \ \ \ \ \ \ uint32_t\ level); + +int\ mlx5dv_dr_table_destroy(struct\ mlx5dv_dr_table\ *table); + +struct\ mlx5dv_dr_matcher\ *mlx5dv_dr_matcher_create( +\ \ \ \ \ \ \ \ struct\ mlx5dv_dr_table\ *table, +\ \ \ \ \ \ \ \ uint16_t\ priority, +\ \ \ \ \ \ \ \ uint8_t\ match_criteria_enable, +\ \ \ \ \ \ \ \ struct\ mlx5dv_flow_match_parameters\ *mask); + +int\ mlx5dv_dr_matcher_destroy(struct\ mlx5dv_dr_matcher\ *matcher); + +struct\ mlx5dv_dr_rule\ *mlx5dv_dr_rule_create( +\ \ \ \ \ \ \ \ struct\ mlx5dv_dr_matcher\ *matcher, +\ \ \ \ \ \ \ \ struct\ mlx5dv_flow_match_parameters\ *value, +\ \ \ \ \ \ \ \ size_t\ num_actions, +\ \ \ \ \ \ \ \ struct\ mlx5dv_dr_action\ *actions[]); + +void\ 
mlx5dv_dr_rule_destroy(struct\ mlx5dv_dr_rule\ *rule);
+
+struct\ mlx5dv_dr_action\ *mlx5dv_dr_action_create_drop(void);
+
+struct\ mlx5dv_dr_action\ *mlx5dv_dr_action_create_tag(
+\ \ \ \ \ \ \ \ uint32_t\ tag_value);
+
+struct\ mlx5dv_dr_action\ *mlx5dv_dr_action_create_dest_ibv_qp(
+\ \ \ \ \ \ \ \ struct\ ibv_qp\ *ibqp);
+
+struct\ mlx5dv_dr_action\ *mlx5dv_dr_action_create_dest_table(
+\ \ \ \ \ \ \ \ struct\ mlx5dv_dr_table\ *table);
+
+struct\ mlx5dv_dr_action\ *mlx5dv_dr_action_create_dest_vport(
+\ \ \ \ \ \ \ \ struct\ mlx5dv_dr_domain\ *domain,
+\ \ \ \ \ \ \ \ uint32_t\ vport);
+
+struct\ mlx5dv_dr_action\ *mlx5dv_dr_action_create_packet_reformat(
+\ \ \ \ \ \ \ \ struct\ mlx5dv_dr_domain\ *domain,
+\ \ \ \ \ \ \ \ uint32_t\ flags,
+\ \ \ \ \ \ \ \ enum\ mlx5dv_flow_action_packet_reformat_type\ reformat_type,
+\ \ \ \ \ \ \ \ size_t\ data_sz,\ void\ *data);
+
+struct\ mlx5dv_dr_action\ *mlx5dv_dr_action_create_modify_header(
+\ \ \ \ \ \ \ \ struct\ mlx5dv_dr_domain\ *domain,
+\ \ \ \ \ \ \ \ uint32_t\ flags,
+\ \ \ \ \ \ \ \ size_t\ actions_sz,
+\ \ \ \ \ \ \ \ __be64\ actions[]);
+
+struct\ mlx5dv_dr_action\ *mlx5dv_dr_action_create_flow_counter(
+\ \ \ \ \ \ \ \ struct\ mlx5dv_devx_obj\ *devx_obj,
+\ \ \ \ \ \ \ \ uint32_t\ offset);
+
+struct\ mlx5dv_dr_action\ *
+mlx5dv_dr_action_create_flow_meter(struct\ mlx5dv_dr_flow_meter_attr\ *attr);
+
+int\ mlx5dv_dr_action_modify_flow_meter(struct\ mlx5dv_dr_action\ *action,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ mlx5dv_dr_flow_meter_attr\ *attr,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ __be64\ modify_field_select);
+
+int\ mlx5dv_dr_action_destroy(struct\ mlx5dv_dr_action\ *action);
+\f[]
+.fi
+.SH DESCRIPTION
+.PP
+The Direct Rule API (mlx5dv_dr_*) allows a verbs application complete
+access to the device\[aq]s packet steering functionality.
+.PP
+Steering flow rules are the combination of attributes with a match
+pattern and a list of actions.
+Rules can have several distinct actions (such as counting,
+encapsulating, decapsulating before redirecting packets to a particular
+queue or port, etc.).
+To manage the rule execution order for HW packet matching and
+processing, multiple flow tables are defined in an ordered chain, along
+with multiple flow matchers sorted by priority.
+.SS Domain
+.PP
+\f[I]mlx5dv_dr_domain_create()\f[] creates a DR domain object to be used
+with \f[I]mlx5dv_dr_table_create()\f[] and
+\f[I]mlx5dv_dr_action_create_*()\f[].
+.PP
+A domain should be destroyed by calling
+\f[I]mlx5dv_dr_domain_destroy()\f[] once all dependent resources are
+released.
+.PP
+The device supports the following domain types:
+.PP
+\f[B]MLX5DV_DR_DOMAIN_TYPE_NIC_RX\f[] Manage ethernet packets received
+on the NIC.
+Packets in this domain can be dropped, dispatched to QPs, modified or
+redirected to additional tables inside the domain.
+Default behavior: Drop packet.
+.PP
+\f[B]MLX5DV_DR_DOMAIN_TYPE_NIC_TX\f[] Manage ethernet packets
+transmitted on the NIC.
+Packets in this domain can be dropped, modified or redirected to
+additional tables inside the domain.
+Default behavior: Forward packet to NIC vport (to eSwitch or wire).
+.PP
+\f[B]MLX5DV_DR_DOMAIN_TYPE_FDB\f[] Manage ethernet packets in the
+eSwitch Forwarding Data Base for packets received from wire or from any
+other vport.
+Packets in this domain can be dropped, dispatched to vport, modified or
+redirected to additional tables inside the domain.
+Default behavior: Forward packet to eSwitch manager vport.
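+.PP
+As a minimal sketch, assuming \f[I]ctx\f[] is a device context opened
+via \f[I]mlx5dv_open_device(3)\f[], an NIC RX domain is created and
+destroyed as follows.
+.IP
+.nf
+\f[C]
+/* Open a steering domain for packets received on the NIC. */
+struct mlx5dv_dr_domain *dmn =
+    mlx5dv_dr_domain_create(ctx, MLX5DV_DR_DOMAIN_TYPE_NIC_RX);
+if (!dmn)
+    return errno; /* domain creation failed */
+
+/* ... create tables, matchers, rules and actions here ... */
+
+/* Destroy only after all dependent resources were released. */
+mlx5dv_dr_domain_destroy(dmn);
+\f[]
+.fi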
+.PP
+\f[I]mlx5dv_dr_domain_sync()\f[] is used to flush the rule submission
+queue.
+By default, rules in a domain are updated in HW asynchronously.
+\f[B]flags\f[] should be a set of type \f[I]enum
+mlx5dv_dr_domain_sync_flags\f[]:
+.PP
+\f[B]MLX5DV_DR_DOMAIN_SYNC_FLAGS_SW\f[]: block until completion of all
+software queued tasks.
+.PP
+\f[B]MLX5DV_DR_DOMAIN_SYNC_FLAGS_HW\f[]: clear the steering HW cache to
+enforce that the next packet hits the latest rules, in addition to the
+SW SYNC handling.
+.SS Table
+.PP
+\f[I]mlx5dv_dr_table_create()\f[] creates a DR table in the
+\f[B]domain\f[], at the appropriate \f[B]level\f[], and can be used with
+\f[I]mlx5dv_dr_matcher_create()\f[] and
+\f[I]mlx5dv_dr_action_create_dest_table()\f[].
+All packets start traversing the steering domain tree at table
+\f[B]level\f[] zero (0).
+Using rules and actions, packets can be redirected to other tables in
+the domain.
+.PP
+A table should be destroyed by calling
+\f[I]mlx5dv_dr_table_destroy()\f[] once all dependent resources are
+released.
+.SS Matcher
+.PP
+\f[I]mlx5dv_dr_matcher_create()\f[] creates a matcher object in
+\f[B]table\f[], at sorted \f[B]priority\f[] (a lower value is checked
+first).
+A matcher can hold multiple rules, all with an identical \f[B]mask\f[]
+of type \f[I]struct mlx5dv_flow_match_parameters\f[], which represents
+the exact attributes to be compared by HW steering.
+The \f[B]match_criteria_enable\f[] and \f[B]mask\f[] are defined in a
+device spec format.
+Only the fields that were masked in the \f[I]matcher\f[] should be
+filled by the rule in \f[I]mlx5dv_dr_rule_create()\f[].
+.PP
+A matcher should be destroyed by calling
+\f[I]mlx5dv_dr_matcher_destroy()\f[] once all dependent resources are
+released.
+.SS Actions
+.PP
+A set of action creation APIs is defined by
+\f[I]mlx5dv_dr_action_create_*()\f[].
+All actions are created as \f[I]struct mlx5dv_dr_action\f[].
+An action should be destroyed by calling
+\f[I]mlx5dv_dr_action_destroy()\f[] once all dependent rules are
+destroyed.
+.PP
+When an action handle is reused for multiple rules, the same action will
+be executed.
+e.g.: action \[aq]count\[aq] will count multiple flow rules on the same
+HW flow counter context.
+action \[aq]drop\[aq] will drop packets of different rules from any
+matcher.
+.PP
+Action: Drop \f[I]mlx5dv_dr_action_create_drop\f[] creates a terminating
+action which drops packets.
+Cannot be mixed with Destination actions.
+.PP
+Action: Tag \f[I]mlx5dv_dr_action_create_tag\f[] creates a
+non\-terminating action which tags packets with \f[B]tag_value\f[].
+The \f[B]tag_value\f[] is available in the CQE of the packet received.
+Valid only on domain type NIC_RX.
+.PP
+Action: Destination \f[I]mlx5dv_dr_action_create_dest_ibv_qp\f[] creates
+a terminating action delivering the packet to a QP, defined by
+\f[B]ibqp\f[].
+Valid only on domain type NIC_RX.
+\f[I]mlx5dv_dr_action_create_dest_table\f[] creates a forwarding action
+to another flow table, defined by \f[B]table\f[].
+The destination \f[B]table\f[] must be from the same domain with a level
+higher than zero.
+\f[I]mlx5dv_dr_action_create_dest_vport\f[] creates a forwarding action
+to a \f[B]vport\f[] on the same \f[B]domain\f[].
+Valid only on domain type FDB.
+.PP
+Action: Packet Reformat \f[I]mlx5dv_dr_action_create_packet_reformat\f[]
+creates a packet reformat context and action in the \f[B]domain\f[].
+The \f[B]reformat_type\f[], \f[B]data_sz\f[] and \f[B]data\f[] are
+defined in \f[I]man mlx5dv_create_flow_action_packet_reformat\f[].
+.PP
+Action: Modify Header \f[I]mlx5dv_dr_action_create_modify_header\f[]
+creates a modify header context and action in the \f[B]domain\f[].
+The \f[B]actions_sz\f[] and \f[B]actions\f[] are defined in \f[I]man
+mlx5dv_create_flow_action_modify_header\f[].
+.PP
+Action: Flow Count \f[I]mlx5dv_dr_action_create_flow_counter\f[] creates
+a flow counter action from a DEVX flow counter object, based on
+\f[B]devx_obj\f[] and a specific counter index from \f[B]offset\f[] in
+the counter bulk.
+.PP
+Action: Meter \f[I]mlx5dv_dr_action_create_flow_meter\f[] creates a
+meter action based on the flow meter parameters.
+The parameters are according to the device specification.
+\f[I]mlx5dv_dr_action_modify_flow_meter\f[] modifies an existing flow
+meter \f[B]action\f[] based on \f[B]modify_field_select\f[].
+\f[B]modify_field_select\f[] is according to the device specification.
+.PP
+Action Flags: action \f[B]flags\f[] can be set to one of the types of
+\f[I]enum mlx5dv_dr_action_flags\f[]:
+.PP
+\f[B]MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL\f[]: is used to indicate that the
+action is targeted for a flow table at level=0 (ROOT) of the specific
+domain.
+.SS Rule
+.PP
+\f[I]mlx5dv_dr_rule_create()\f[] creates a HW steering rule entry in
+\f[B]matcher\f[].
+The \f[B]value\f[] of type \f[I]struct mlx5dv_flow_match_parameters\f[]
+holds the exact attribute values of the steering rule to be matched, in
+a device spec format.
+Only the fields that were masked in the \f[I]matcher\f[] should be
+filled.
+HW will perform the set of \f[B]num_actions\f[] from the \f[B]action\f[]
+array of type \f[I]struct mlx5dv_dr_action\f[], once a packet matches
+the exact \f[B]value\f[] of the rule (referred to as a \[aq]hit\[aq]).
+.PP
+\f[I]mlx5dv_dr_rule_destroy()\f[] destroys the rule.
+.SH RETURN VALUE
+.PP
+The create API calls will return a pointer to the relevant object:
+table, matcher, action, rule.
+On failure, NULL will be returned and errno will be set.
+.PP
+The destroy API calls will return 0 on success, or the value of errno
+on failure (which indicates the failure reason).
+.SH LIMITATIONS
+.PP
+An application can verify whether a feature is supported by \f[I]trial
+and error\f[].
+No capabilities are exposed, as the combinations of all the exposed
+options are far too numerous to define.
+.PP
+Tables have no fixed size by definition.
+They are expected to grow and shrink to accommodate all rules,
+according to driver capabilities.
+Once a limit is reached, an error is returned.
+.PP
+Matchers with the same priority, in the same table, will have an
+undefined order.
+.PP
+A rule with a value pattern identical to another rule on a given
+matcher is rejected.
+.PP
+The IP version in the matcher mask and rule should be equal and set to
+4, 6 or 0.
+.SH SEE ALSO
+.PP
+\f[B]mlx5dv_open_device(3)\f[],
+\f[B]mlx5dv_create_flow_action_packet_reformat(3)\f[],
+\f[B]mlx5dv_create_flow_action_modify_header(3)\f[].
+.SH AUTHOR
+.PP
+Alex Rosenbaum <alexr@mellanox.com> Alex Vesker <valex@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/3e78d98dac48566e3ad826ace68e4f9c3440a347 b/buildlib/pandoc-prebuilt/3e78d98dac48566e3ad826ace68e4f9c3440a347 new file mode 100644 index 0000000..25021af --- /dev/null +++ b/buildlib/pandoc-prebuilt/3e78d98dac48566e3ad826ace68e4f9c3440a347 @@ -0,0 +1,214 @@ +.\" Man page generated from reStructuredText.
+.
+.TH IBADDR 8 "2013-10-11" "" "OpenIB Diagnostics"
+.SH NAME
+IBADDR \- query InfiniBand address(es)
+.
+.nr rst2man-indent-level 0
+.
+.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. +.SH SYNOPSIS +.sp +ibaddr [options] +.SH DESCRIPTION +.sp +Display the lid (and range) as well as the GID address of the +port specified (by DR path, lid, or GUID) or the local port by default. +.sp +Note: this utility can be used as simple address resolver. +.SH OPTIONS +.sp +\fB\-\-gid_show, \-g\fP +show gid address only +.sp +\fB\-\-lid_show, \-l\fP +show lid range only +.sp +\fB\-\-Lid_show, \-L\fP +show lid range (in decimal) only +.SS Addressing Flags +.\" Define the common option -D for Directed routes +. +.sp +\fB\-D, \-\-Direct\fP The address specified is a directed route +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +Examples: + [options] \-D [options] "0" # self port + [options] \-D [options] "0,1,2,1,4" # out via port 1, then 2, ... + + (Note the second number in the path specified must match the port being + used. This can be specified using the port selection flag \(aq\-P\(aq or the + port found through the automatic selection process.) +.ft P +.fi +.UNINDENT +.UNINDENT +.\" Define the common option -G +. +.sp +\fB\-G, \-\-Guid\fP The address specified is a Port GUID +.\" Define the common option -s +. +.sp +\fB\-s, \-\-sm_port <smlid>\fP use \(aqsmlid\(aq as the target lid for SA queries. +.SS Debugging flags +.\" Define the common option -d +. +.INDENT 0.0 +.TP +.B \-d +raise the IB debugging level. +May be used several times (\-ddd or \-d \-d \-d). +.UNINDENT +.\" Define the common option -e +. +.INDENT 0.0 +.TP +.B \-e +show send and receive errors (timeouts and others) +.UNINDENT +.\" Define the common option -h +. +.sp +\fB\-h, \-\-help\fP show the usage message +.\" Define the common option -v +. +.INDENT 0.0 +.TP +.B \fB\-v, \-\-verbose\fP +increase the application verbosity level. +May be used several times (\-vv or \-v \-v \-v) +.UNINDENT +.\" Define the common option -V +. +.sp +\fB\-V, \-\-version\fP show the version info. +.SS Port Selection flags +.\" Define the common option -C +. +.sp +\fB\-C, \-\-Ca <ca_name>\fP use the specified ca_name. +.\" Define the common option -P +. +.sp +\fB\-P, \-\-Port <ca_port>\fP use the specified ca_port. +.\" Explanation of local port selection +. +.SS Local port Selection +.sp +Multiple port/Multiple CA support: when no IB device or port is specified +(see the "local umad parameters" below), the libibumad library +selects the port to use by the following criteria: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.IP 1. 3 +the first port that is ACTIVE. +.IP 2. 3 +if not found, the first port that is UP (physical link up). +.UNINDENT +.sp +If a port and/or CA name is specified, the libibumad library attempts +to fulfill the user request, and will fail if it is not possible. +.sp +For example: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +ibaddr # use the first port (criteria #1 above) +ibaddr \-C mthca1 # pick the best port from "mthca1" only. +ibaddr \-P 2 # use the second (active/up) port from the first available IB device. 
+ibaddr \-C mthca0 \-P 2 # use the specified port only. +.ft P +.fi +.UNINDENT +.UNINDENT +.UNINDENT +.UNINDENT +.SS Configuration flags +.\" Define the common option -y +. +.INDENT 0.0 +.TP +.B \fB\-y, \-\-m_key <key>\fP +use the specified M_key for requests. If non\-numeric value (like \(aqx\(aq) +is specified then a value will be prompted for. +.UNINDENT +.\" Define the common option -t +. +.sp +\fB\-t, \-\-timeout <timeout_ms>\fP override the default timeout for the solicited mads. +.\" Define the common option -z +. +.sp +\fB\-\-config, \-z <config_file>\fP Specify alternate config file. +.INDENT 0.0 +.INDENT 3.5 +Default: /usr/local/etc/infiniband\-diags/ibdiag.conf +.UNINDENT +.UNINDENT +.SH FILES +.\" Common text for the config file +. +.SS CONFIG FILE +.sp +/usr/local/etc/infiniband\-diags/ibdiag.conf +.sp +A global config file is provided to set some of the common options for all +tools. See supplied config file for details. +.SH EXAMPLES +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +ibaddr # local port\e\(aqs address +ibaddr 32 # show lid range and gid of lid 32 +ibaddr \-G 0x8f1040023 # same but using guid address +ibaddr \-l 32 # show lid range only +ibaddr \-L 32 # show decimal lid range only +ibaddr \-g 32 # show gid address only +.ft P +.fi +.UNINDENT +.UNINDENT +.SH SEE ALSO +.sp +\fBibroute (8), ibtracert (8)\fP +.SH AUTHOR +.INDENT 0.0 +.TP +.B Hal Rosenstock +< \fI\%halr@voltaire.com\fP > +.UNINDENT +.\" Generated by docutils manpage writer. +. diff --git a/buildlib/pandoc-prebuilt/41bbb0bed7a781be59e8c0dcd8b7278af2ce6882 b/buildlib/pandoc-prebuilt/41bbb0bed7a781be59e8c0dcd8b7278af2ce6882 new file mode 100644 index 0000000..f313d1f --- /dev/null +++ b/buildlib/pandoc-prebuilt/41bbb0bed7a781be59e8c0dcd8b7278af2ce6882 @@ -0,0 +1,39 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "UMAD_INIT" "3" "May 21, 2007" "OpenIB" "OpenIB Programmer\[aq]s Manual" +.hy +.SH NAME +.PP +umad_init, umad_done \- perform library initialization and finalization +.SH SYNOPSIS +.IP +.nf +\f[C] +#include\ <infiniband/umad.h> + +int\ umad_init(void); + +int\ umad_done(void); +\f[] +.fi +.SH DESCRIPTION +.PP +\f[B]umad_init()\f[] and \f[B]umad_done()\f[] do nothing. +.SH RETURN VALUE +.PP +Always 0. +.SH COMPATIBILITY +.PP +Versions prior to release 18 of the library require \f[B]umad_init()\f[] +to be called prior to using any other library functions. +Old versions could return a failure code of \-1 from +\f[B]umad_init()\f[]. +.PP +For compatibility, applications should continue to call +\f[B]umad_init()\f[], and check the return code, prior to calling other +\f[B]umad_\f[] functions. +If \f[B]umad_init()\f[] returns an error, then no further use of the +umad library should be attempted. 
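+.PP
+For example, a minimal sketch following the recommendation above (the
+error path shown is an assumption; on current libraries the call always
+succeeds):
+.IP
+.nf
+\f[C]
+#include <infiniband/umad.h>
+
+int main(void)
+{
+    /* Always 0 on current libraries, but older releases could
+     * return -1, so the return code is still checked. */
+    if (umad_init() < 0)
+        return 1;
+
+    /* ... use other umad_ functions here ... */
+
+    umad_done();
+    return 0;
+}
+\f[]
+.fi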
+.SH AUTHORS +.PP +Dotan Barak <dotanb@mellanox.co.il>, Hal Rosenstock <halr@voltaire.com> diff --git a/buildlib/pandoc-prebuilt/42f038a5f87713ae1079c61615b27d0a41336faa b/buildlib/pandoc-prebuilt/42f038a5f87713ae1079c61615b27d0a41336faa new file mode 100644 index 0000000..de933fe --- /dev/null +++ b/buildlib/pandoc-prebuilt/42f038a5f87713ae1079c61615b27d0a41336faa @@ -0,0 +1,486 @@ +.\"t +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "IBV_WR API" "3" "2018\-11\-27" "libibverbs" "Libibverbs Programmer\[aq]s Manual" +.hy +.SH NAME +.PP +ibv_wr_abort, ibv_wr_complete, ibv_wr_start \- Manage regions allowed to +post work +.PP +ibv_wr_atomic_cmp_swp, ibv_wr_atomic_fetch_add \- Post remote atomic +operation work requests +.PP +ibv_wr_bind_mw, ibv_wr_local_inv \- Post work requests for memory +windows +.PP +ibv_wr_rdma_read, ibv_wr_rdma_write, ibv_wr_rdma_write_imm \- Post RDMA +work requests +.PP +ibv_wr_send, ibv_wr_send_imm, ibv_wr_send_inv \- Post send work requests +.PP +ibv_wr_send_tso \- Post segmentation offload work requests +.PP +ibv_wr_set_inline_data, ibv_wr_set_inline_data_list \- Attach inline +data to the last work request +.PP +ibv_wr_set_sge, ibv_wr_set_sge_list \- Attach data to the last work +request +.PP +ibv_wr_set_ud_addr \- Attach UD addressing info to the last work request +.PP +ibv_wr_set_xrc_srqn \- Attach an XRC SRQN to the last work request +.SH SYNOPSIS +.IP +.nf +\f[C] +#include\ <infiniband/verbs.h> + +void\ ibv_wr_abort(struct\ ibv_qp_ex\ *qp); +int\ ibv_wr_complete(struct\ ibv_qp_ex\ *qp); +void\ ibv_wr_start(struct\ ibv_qp_ex\ *qp); + +void\ ibv_wr_atomic_cmp_swp(struct\ ibv_qp_ex\ *qp,\ uint32_t\ rkey, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint64_t\ remote_addr,\ uint64_t\ compare, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint64_t\ swap); +void\ ibv_wr_atomic_fetch_add(struct\ ibv_qp_ex\ *qp,\ uint32_t\ rkey, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint64_t\ remote_addr,\ uint64_t\ add); + +void\ ibv_wr_bind_mw(struct\ ibv_qp_ex\ *qp,\ struct\ ibv_mw\ *mw,\ uint32_t\ rkey, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ const\ struct\ ibv_mw_bind_info\ *bind_info); +void\ ibv_wr_local_inv(struct\ ibv_qp_ex\ *qp,\ uint32_t\ invalidate_rkey); + +void\ ibv_wr_rdma_read(struct\ ibv_qp_ex\ *qp,\ uint32_t\ rkey, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint64_t\ remote_addr); +void\ ibv_wr_rdma_write(struct\ ibv_qp_ex\ *qp,\ uint32_t\ rkey, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint64_t\ remote_addr); +void\ ibv_wr_rdma_write_imm(struct\ ibv_qp_ex\ *qp,\ uint32_t\ rkey, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint64_t\ remote_addr,\ __be32\ imm_data); + +void\ ibv_wr_send(struct\ ibv_qp_ex\ *qp); +void\ ibv_wr_send_imm(struct\ ibv_qp_ex\ *qp,\ __be32\ imm_data); +void\ ibv_wr_send_inv(struct\ ibv_qp_ex\ *qp,\ uint32_t\ invalidate_rkey); +void\ ibv_wr_send_tso(struct\ ibv_qp_ex\ *qp,\ void\ *hdr,\ uint16_t\ hdr_sz, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint16_t\ mss); + +void\ ibv_wr_set_inline_data(struct\ ibv_qp_ex\ *qp,\ void\ *addr,\ size_t\ length); +void\ ibv_wr_set_inline_data_list(struct\ ibv_qp_ex\ *qp,\ size_t\ num_buf, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ const\ struct\ ibv_data_buf\ *buf_list); +void\ ibv_wr_set_sge(struct\ ibv_qp_ex\ *qp,\ uint32_t\ lkey,\ uint64_t\ addr, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint32_t\ length); +void\ ibv_wr_set_sge_list(struct\ ibv_qp_ex\ *qp,\ size_t\ num_sge, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ 
\ \ \ \ const\ struct\ ibv_sge\ *sg_list);
+
+void\ ibv_wr_set_ud_addr(struct\ ibv_qp_ex\ *qp,\ struct\ ibv_ah\ *ah,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint32_t\ remote_qpn,\ uint32_t\ remote_qkey);
+void\ ibv_wr_set_xrc_srqn(struct\ ibv_qp_ex\ *qp,\ uint32_t\ remote_srqn);
+\f[]
+.fi
+.SH DESCRIPTION
+.PP
+The verbs work request API (ibv_wr_*) allows efficient posting of work
+to a send queue using function calls instead of the struct based
+\f[I]ibv_post_send()\f[] scheme.
+This approach is designed to minimize CPU branching and locking during
+the posting process.
+.PP
+This API is intended to be used to access additional functionality
+beyond what is provided by \f[I]ibv_post_send()\f[].
+.PP
+WR batches of \f[I]ibv_post_send()\f[] and WR batches of this API can
+interleave only if they are not posted within each other\[aq]s critical
+region.
+(A critical region in this API is formed by \f[I]ibv_wr_start()\f[] and
+\f[I]ibv_wr_complete()\f[]/\f[I]ibv_wr_abort()\f[].)
+.SH USAGE
+.PP
+To use these APIs the QP must be created using ibv_create_qp_ex() which
+allows setting the \f[B]IBV_QP_INIT_ATTR_SEND_OPS_FLAGS\f[] in
+\f[I]comp_mask\f[].
+The \f[I]send_ops_flags\f[] should be set to the OR of the work request
+types that will be posted to the QP.
+.PP
+If the QP does not support all the requested work request types then QP
+creation will fail.
+.PP
+Posting work requests to the QP is done within the critical region
+formed by \f[I]ibv_wr_start()\f[] and
+\f[I]ibv_wr_complete()\f[]/\f[I]ibv_wr_abort()\f[] (see CONCURRENCY
+below).
+.PP
+Each work request is created by calling a WR builder function (see the
+table column WR builder below) to start creating the work request,
+followed by allowed/required setter functions described below.
+.PP
+The WR builder and setter combination can be called multiple times to
+efficiently post multiple work requests within a single critical region.
+.PP
+Each WR builder will use the \f[I]wr_id\f[] member of \f[I]struct
+ibv_qp_ex\f[] to set the value to be returned in the completion.
+Some operations will also use the \f[I]wr_flags\f[] member to influence
+operation (see Flags below).
+These values should be set before invoking the WR builder function.
+.PP
+For example a simple send could be formed as follows:
+.IP
+.nf
+\f[C]
+qpx\->wr_id\ =\ 1;
+ibv_wr_send(qpx);
+ibv_wr_set_sge(qpx,\ lkey,\ &data,\ sizeof(data));
+\f[]
+.fi
+.PP
+The section WORK REQUESTS describes the various WR builders and setters
+in detail.
+.PP
+Posting work is completed by calling \f[I]ibv_wr_complete()\f[] or
+\f[I]ibv_wr_abort()\f[].
+No work is submitted to the queue until \f[I]ibv_wr_complete()\f[]
+returns success.
+\f[I]ibv_wr_abort()\f[] will discard all work prepared since
+\f[I]ibv_wr_start()\f[].
+.SH WORK REQUESTS
+.PP
+Many of the operations match the opcodes available for
+\f[I]ibv_post_send()\f[].
+Each operation has a WR builder function, a list of allowed setters, and
+a flag bit to request the operation with \f[I]send_ops_flags\f[] in
+\f[I]struct ibv_qp_init_attr_ex\f[] (see the EXAMPLE below).
+.PP
+.TS
+tab(@);
+l l l l.
+T{
+Operation
+T}@T{
+WR builder
+T}@T{
+QP Type Supported
+T}@T{
+setters
+T}
+_
+T{
+ATOMIC_CMP_AND_SWP
+T}@T{
+ibv_wr_atomic_cmp_swp()
+T}@T{
+RC, XRC_SEND
+T}@T{
+DATA, QP
+T}
+T{
+ATOMIC_FETCH_AND_ADD
+T}@T{
+ibv_wr_atomic_fetch_add()
+T}@T{
+RC, XRC_SEND
+T}@T{
+DATA, QP
+T}
+T{
+BIND_MW
+T}@T{
+ibv_wr_bind_mw()
+T}@T{
+UC, RC, XRC_SEND
+T}@T{
+NONE
+T}
+T{
+LOCAL_INV
+T}@T{
+ibv_wr_local_inv()
+T}@T{
+UC, RC, XRC_SEND
+T}@T{
+NONE
+T}
+T{
+RDMA_READ
+T}@T{
+ibv_wr_rdma_read()
+T}@T{
+RC, XRC_SEND
+T}@T{
+DATA, QP
+T}
+T{
+RDMA_WRITE
+T}@T{
+ibv_wr_rdma_write()
+T}@T{
+UC, RC, XRC_SEND
+T}@T{
+DATA, QP
+T}
+T{
+RDMA_WRITE_WITH_IMM
+T}@T{
+ibv_wr_rdma_write_imm()
+T}@T{
+UC, RC, XRC_SEND
+T}@T{
+DATA, QP
+T}
+T{
+SEND
+T}@T{
+ibv_wr_send()
+T}@T{
+UD, UC, RC, XRC_SEND, RAW_PACKET
+T}@T{
+DATA, QP
+T}
+T{
+SEND_WITH_IMM
+T}@T{
+ibv_wr_send_imm()
+T}@T{
+UD, UC, RC, XRC_SEND
+T}@T{
+DATA, QP
+T}
+T{
+SEND_WITH_INV
+T}@T{
+ibv_wr_send_inv()
+T}@T{
+UC, RC, XRC_SEND
+T}@T{
+DATA, QP
+T}
+T{
+TSO
+T}@T{
+ibv_wr_send_tso()
+T}@T{
+UD, RAW_PACKET
+T}@T{
+DATA, QP
+T}
+.TE
+.SS Atomic operations
+.PP
+Atomic operations are only atomic so long as all writes to memory go
+only through the same RDMA hardware.
+They are not atomic with respect to writes performed by the CPU, or by
+other RDMA hardware in the system.
+.TP
+.B \f[I]ibv_wr_atomic_cmp_swp()\f[]
+If the remote 64 bit memory location specified by \f[I]rkey\f[] and
+\f[I]remote_addr\f[] equals \f[I]compare\f[] then set it to
+\f[I]swap\f[].
+.RS
+.RE
+.TP
+.B \f[I]ibv_wr_atomic_fetch_add()\f[]
+Add \f[I]add\f[] to the 64 bit memory location specified by
+\f[I]rkey\f[] and \f[I]remote_addr\f[].
+.RS
+.RE
+.SS Memory Windows
+.PP
+Memory window type 2 operations (see the man page for ibv_alloc_mw).
+.TP
+.B \f[I]ibv_wr_bind_mw()\f[]
+Bind a MW type 2 specified by \f[B]mw\f[], set a new \f[B]rkey\f[] and
+set its properties by \f[B]bind_info\f[].
+.RS
+.RE
+.TP
+.B \f[I]ibv_wr_local_inv()\f[]
+Invalidate a MW type 2 which is associated with \f[B]rkey\f[].
+.RS
+.RE
+.SS RDMA
+.TP
+.B \f[I]ibv_wr_rdma_read()\f[]
+Read from the remote memory location specified by \f[I]rkey\f[] and
+\f[I]remote_addr\f[].
+The number of bytes to read, and the local location to store the data,
+are determined by the DATA buffers set after this call.
+.RS
+.RE
+.TP
+.B \f[I]ibv_wr_rdma_write()\f[], \f[I]ibv_wr_rdma_write_imm()\f[]
+Write to the remote memory location specified by \f[I]rkey\f[] and
+\f[I]remote_addr\f[].
+The number of bytes to write, and the local location to get the data,
+are determined by the DATA buffers set after this call.
+.RS
+.PP
+The _imm version causes the remote side to get an
+IBV_WC_RECV_RDMA_WITH_IMM completion containing the 32 bits of immediate
+data.
+.RE
+.SS Message Send
+.TP
+.B \f[I]ibv_wr_send()\f[], \f[I]ibv_wr_send_imm()\f[]
+Send a message.
+The number of bytes to send, and the local location to get the data,
+are determined by the DATA buffers set after this call.
+.RS
+.PP
+The _imm version causes the remote side to get an IBV_WC_RECV
+completion containing the 32 bits of immediate data.
+.RE
+.TP
+.B \f[I]ibv_wr_send_inv()\f[]
+The data transfer is the same as for \f[I]ibv_wr_send()\f[], however the
+remote side will invalidate the MR specified by \f[I]invalidate_rkey\f[]
+before delivering a completion.
+.RS
+.RE
+.TP
+.B \f[I]ibv_wr_send_tso()\f[]
+Produce multiple SEND messages using TCP Segmentation Offload.
+The SGE points to a TCP stream buffer which will be segmented into MSS
+size SENDs.
+The hdr includes all the network headers, up to and including the TCP
+header, and is prefixed before each segment.
+.RS
+.RE
+.SS QP Specific setters
+.PP
+Certain QP types require each post to be accompanied by additional
+setters; these setters are mandatory for any operation listing a QP
+setter in the above table.
+.TP
+.B \f[I]UD\f[] QPs
+\f[I]ibv_wr_set_ud_addr()\f[] must be called to set the destination
+address of the work.
+.RS
+.RE
+.TP
+.B \f[I]XRC_SEND\f[] QPs
+\f[I]ibv_wr_set_xrc_srqn()\f[] must be called to set the destination
+SRQN field.
+.RS
+.RE
+.SS DATA transfer setters
+.PP
+For work that requires a data transfer, one of the following setters
+should be called once after the WR builder:
+.TP
+.B \f[I]ibv_wr_set_sge()\f[]
+Transfer data to/from a single buffer given by the lkey, addr and
+length.
+This is equivalent to \f[I]ibv_wr_set_sge_list()\f[] with a single
+element.
+.RS
+.RE
+.TP
+.B \f[I]ibv_wr_set_sge_list()\f[]
+Transfer data to/from a list of buffers, logically concatenated
+together.
+Each buffer is specified by an element in an array of \f[I]struct
+ibv_sge\f[].
+.RS
+.RE
+.PP
+Inline setters will copy the send data during the setter and allow the
+caller to immediately re\-use the buffer.
+This behavior is identical to the IBV_SEND_INLINE flag.
+Generally this copy is done in a way that optimizes SEND latency and is
+suitable for small messages.
+The provider will limit the amount of data it can support in a single
+operation.
+This limit is requested in the \f[I]max_inline_data\f[] member of
+\f[I]struct ibv_qp_init_attr\f[].
+Valid only for SEND and RDMA_WRITE.
+.TP
+.B \f[I]ibv_wr_set_inline_data()\f[]
+Copy send data from a single buffer given by the addr and length.
+This is equivalent to \f[I]ibv_wr_set_inline_data_list()\f[] with a
+single element.
+.RS
+.RE
+.TP
+.B \f[I]ibv_wr_set_inline_data_list()\f[]
+Copy send data from a list of buffers, logically concatenated together.
+Each buffer is specified by an element in an array of \f[I]struct
+ibv_data_buf\f[].
+.RS
+.RE
+.SS Flags
+.PP
+A bit mask of flags may be specified in \f[I]wr_flags\f[] to control the
+behavior of the work request.
+.TP
+.B \f[B]IBV_SEND_FENCE\f[]
+Do not start this work request until prior work has completed.
+.RS
+.RE
+.TP
+.B \f[B]IBV_SEND_IP_CSUM\f[]
+Offload the IPv4 and TCP/UDP checksum calculation.
+.RS
+.RE
+.TP
+.B \f[B]IBV_SEND_SIGNALED\f[]
+A completion will be generated in the completion queue for the
+operation.
+.RS
+.RE
+.TP
+.B \f[B]IBV_SEND_SOLICITED\f[]
+Set the solicited bit in the RDMA packet.
+This informs the other side to generate a completion event upon
+receiving the RDMA operation.
+.RS
+.RE
+.SH CONCURRENCY
+.PP
+The provider provides locking to ensure that \f[I]ibv_wr_start()\f[]
+and \f[I]ibv_wr_complete()/abort()\f[] form a per\-QP critical section
+where no other threads can enter.
+.PP
+If an \f[I]ibv_td\f[] is provided during QP creation then no locking
+will be performed and it is up to the caller to ensure that only one
+thread can be within the critical region at a time.
+.SH RETURN VALUE
+.PP
+Applications should use this API in a way that does not create failures.
+The individual APIs do not return a failure indication to avoid
+branching.
+.PP
+If a failure is detected during operation, for instance due to an
+invalid argument, then \f[I]ibv_wr_complete()\f[] will return failure
+and the entire posting will be aborted.
+.SH EXAMPLE
+.IP
+.nf
+\f[C]
+/*\ create\ RC\ QP\ type\ and\ specify\ the\ required\ send\ opcodes\ */
+qp_init_attr_ex.qp_type\ =\ IBV_QPT_RC;
+qp_init_attr_ex.comp_mask\ |=\ IBV_QP_INIT_ATTR_SEND_OPS_FLAGS;
+qp_init_attr_ex.send_ops_flags\ |=\ IBV_QP_EX_WITH_RDMA_WRITE;
+qp_init_attr_ex.send_ops_flags\ |=\ IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM;
+
+struct\ ibv_qp\ *qp\ =\ ibv_create_qp_ex(ctx,\ &qp_init_attr_ex);
+struct\ ibv_qp_ex\ *qpx\ =\ ibv_qp_to_qp_ex(qp);
+
+ibv_wr_start(qpx);
+
+/*\ create\ 1st\ WRITE\ WR\ entry\ */
+qpx\->wr_id\ =\ my_wr_id_1;
+ibv_wr_rdma_write(qpx,\ rkey,\ remote_addr_1);
+ibv_wr_set_sge(qpx,\ lkey,\ local_addr_1,\ length_1);
+
+/*\ create\ 2nd\ WRITE_WITH_IMM\ WR\ entry\ */
+qpx\->wr_id\ =\ my_wr_id_2;
+qpx\->wr_flags\ =\ IBV_SEND_SIGNALED;
+ibv_wr_rdma_write_imm(qpx,\ rkey,\ remote_addr_2,\ htonl(0x1234));
+ibv_wr_set_sge(qpx,\ lkey,\ local_addr_2,\ length_2);
+
+/*\ Begin\ processing\ WRs\ */
+ret\ =\ ibv_wr_complete(qpx);
+\f[]
+.fi
+.SH SEE ALSO
+.PP
+\f[B]ibv_post_send\f[](3), \f[B]ibv_create_qp_ex\f[](3).
+.SH AUTHOR
+.PP
+Jason Gunthorpe <jgg@mellanox.com> Guy Levi <guyle@mellanox.com>
diff --git a/buildlib/pandoc-prebuilt/4331ceea9cb78a19ecc0e8ba10b2b1dde00bc30d b/buildlib/pandoc-prebuilt/4331ceea9cb78a19ecc0e8ba10b2b1dde00bc30d
new file mode 100644
index 0000000..d7c8249
--- /dev/null
+++ b/buildlib/pandoc-prebuilt/4331ceea9cb78a19ecc0e8ba10b2b1dde00bc30d
@@ -0,0 +1,451 @@
+.\" Man page generated from reStructuredText.
+.
+.TH INFINIBAND-DIAGS 8 "2017-08-21" "" "Open IB Diagnostics"
+.SH NAME
+infiniband-diags \- Diagnostics for InfiniBand Fabrics
+.
+.nr rst2man-indent-level 0
+.
+.de1 rstReportMargin
+\\$1 \\n[an-margin]
+level \\n[rst2man-indent-level]
+level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
+-
+\\n[rst2man-indent0]
+\\n[rst2man-indent1]
+\\n[rst2man-indent2]
+..
+.de1 INDENT
+.\" .rstReportMargin pre:
+. RS \\$1
+. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
+. nr rst2man-indent-level +1
+.\" .rstReportMargin post:
+..
+.de UNINDENT
+. RE
+.\" indent \\n[an-margin]
+.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.nr rst2man-indent-level -1
+.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
+..
+.SH DESCRIPTION
+.sp
+infiniband\-diags is a set of utilities designed to help configure, debug, and
+maintain infiniband fabrics. Many tools and utilities are provided, some with
+similar functionality.
+.sp
+The base utilities use directed route MADs to perform their operations. They
+may therefore work even in unconfigured subnets. Other, higher level
+utilities require LID routed MADs and, to some extent, SA/SM access.
+.SH THE USE OF SMPS (QP0)
+.sp
+Many of the tools in this package rely on the use of SMPs via QP0 to acquire
+data directly from the SMA. While this mode of operation is not technically in
+compliance with the InfiniBand specification, practical experience has found
+that this level of diagnostics is valuable when working with a fabric which is
+broken or only partially configured. For this reason, many of these tools may
+require the use of an MKey, or operation from Virtual Machines may be
+restricted, for security reasons.
+.SH COMMON OPTIONS
+.sp
+Most OpenIB diagnostics take some of the following common flags. The exact list
+of supported flags per utility can be found in the documentation for those
+commands.
+.SS Addressing Flags
+.sp
+The \-D and \-G options have two forms:
+.\" Define the common option -D for Directed routes
+.
+.sp +\fB\-D, \-\-Direct\fP The address specified is a directed route +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +Examples: + [options] \-D [options] "0" # self port + [options] \-D [options] "0,1,2,1,4" # out via port 1, then 2, ... + + (Note the second number in the path specified must match the port being + used. This can be specified using the port selection flag \(aq\-P\(aq or the + port found through the automatic selection process.) +.ft P +.fi +.UNINDENT +.UNINDENT +.\" Define the common option -D for Directed routes +. +.sp +\fB\-D, \-\-Direct <dr_path>\fP The address specified is a directed route +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +Examples: + \-D "0" # self port + \-D "0,1,2,1,4" # out via port 1, then 2, ... + + (Note the second number in the path specified must match the port being + used. This can be specified using the port selection flag \(aq\-P\(aq or the + port found through the automatic selection process.) +.ft P +.fi +.UNINDENT +.UNINDENT +.\" Define the common option -G +. +.sp +\fB\-G, \-\-Guid\fP The address specified is a Port GUID +.\" Define the common option -G +. +.sp +\fB\-\-port\-guid, \-G <port_guid>\fP Specify a port_guid +.\" Define the common option -L +. +.sp +\fB\-L, \-\-Lid\fP The address specified is a LID +.\" Define the common option -s +. +.sp +\fB\-s, \-\-sm_port <smlid>\fP use \(aqsmlid\(aq as the target lid for SA queries. +.SS Port Selection flags +.\" Define the common option -C +. +.sp +\fB\-C, \-\-Ca <ca_name>\fP use the specified ca_name. +.\" Define the common option -P +. +.sp +\fB\-P, \-\-Port <ca_port>\fP use the specified ca_port. +.\" Explanation of local port selection +. +.SS Local port Selection +.sp +Multiple port/Multiple CA support: when no IB device or port is specified +(see the "local umad parameters" below), the libibumad library +selects the port to use by the following criteria: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.IP 1. 3 +the first port that is ACTIVE. +.IP 2. 3 +if not found, the first port that is UP (physical link up). +.UNINDENT +.sp +If a port and/or CA name is specified, the libibumad library attempts +to fulfill the user request, and will fail if it is not possible. +.sp +For example: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +ibaddr # use the first port (criteria #1 above) +ibaddr \-C mthca1 # pick the best port from "mthca1" only. +ibaddr \-P 2 # use the second (active/up) port from the first available IB device. +ibaddr \-C mthca0 \-P 2 # use the specified port only. +.ft P +.fi +.UNINDENT +.UNINDENT +.UNINDENT +.UNINDENT +.SS Debugging flags +.\" Define the common option -d +. +.INDENT 0.0 +.TP +.B \-d +raise the IB debugging level. +May be used several times (\-ddd or \-d \-d \-d). +.UNINDENT +.\" Define the common option -e +. +.INDENT 0.0 +.TP +.B \-e +show send and receive errors (timeouts and others) +.UNINDENT +.\" Define the common option -h +. +.sp +\fB\-h, \-\-help\fP show the usage message +.\" Define the common option -v +. +.INDENT 0.0 +.TP +.B \fB\-v, \-\-verbose\fP +increase the application verbosity level. +May be used several times (\-vv or \-v \-v \-v) +.UNINDENT +.\" Define the common option -V +. +.sp +\fB\-V, \-\-version\fP show the version info. +.SS Configuration flags +.\" Define the common option -t +. +.sp +\fB\-t, \-\-timeout <timeout_ms>\fP override the default timeout for the solicited mads. +.\" Define the common option -z +. 
+.INDENT 0.0
+.TP
+.B \fB\-\-outstanding_smps, \-o <val>\fP
+Specify the number of outstanding SMPs which should be issued during the scan
+.sp
+Default: 2
+.UNINDENT
+.\" Define the common option --node-name-map
+.
+.sp
+\fB\-\-node\-name\-map <node\-name\-map>\fP Specify a node name map.
+.INDENT 0.0
+.INDENT 3.5
+This file maps GUIDs to more user friendly names. See FILES section.
+.UNINDENT
+.UNINDENT
+.\" Define the common option -z
+.
+.sp
+\fB\-\-config, \-z <config_file>\fP Specify alternate config file.
+.INDENT 0.0
+.INDENT 3.5
+Default: /usr/local/etc/infiniband\-diags/ibdiag.conf
+.UNINDENT
+.UNINDENT
+.SH COMMON FILES
+.sp
+The following config files are common amongst many of the utilities.
+.\" Common text for the config file
+.
+.SS CONFIG FILE
+.sp
+/usr/local/etc/infiniband\-diags/ibdiag.conf
+.sp
+A global config file is provided to set some of the common options for all
+tools. See supplied config file for details.
+.\" Common text to describe the node name map file.
+.
+.SS NODE NAME MAP FILE FORMAT
+.sp
+The node name map is used to specify user friendly names for nodes in the
+output. GUIDs are used to perform the lookup.
+.sp
+This functionality is provided by the opensm\-libs package. See \fBopensm(8)\fP
+for the file location for your installation.
+.sp
+\fBGenerically:\fP
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+# comment
+<guid> "<name>"
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+\fBExample:\fP
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+# IB1
+# Line cards
+0x0008f104003f125c "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB\-24D"
+0x0008f104003f125d "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB\-24D"
+0x0008f104003f10d2 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB\-24D"
+0x0008f104003f10d3 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB\-24D"
+0x0008f104003f10bf "IB1 (Rack 11 slot 12 ) ISR9288/ISR9096 Voltaire sLB\-24D"
+
+# Spines
+0x0008f10400400e2d "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D"
+0x0008f10400400e2e "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D"
+0x0008f10400400e2f "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D"
+0x0008f10400400e31 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB\-12D"
+0x0008f10400400e32 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB\-12D"
+
+# GUID Node Name
+0x0008f10400411a08 "SW1 (Rack 3) ISR9024 Voltaire 9024D"
+0x0008f10400411a28 "SW2 (Rack 3) ISR9024 Voltaire 9024D"
+0x0008f10400411a34 "SW3 (Rack 3) ISR9024 Voltaire 9024D"
+0x0008f104004119d0 "SW4 (Rack 3) ISR9024 Voltaire 9024D"
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.\" Common text to describe the Topology file.
+.
+.SS TOPOLOGY FILE FORMAT
+.sp
+The topology file format is human readable and largely intuitive.
+Most identifiers are given textual names like vendor ID (vendid), device ID
+(devid), GUIDs of various types (sysimgguid, caguid, switchguid, etc.).
+PortGUIDs are shown in parentheses (). For switches, this is shown on the
+switchguid line. For CA and router ports, it is shown on the connectivity
+lines. The IB node is identified, followed by the number of ports and a quoted
+node GUID. On the right of this line is a comment (#) followed by the
+NodeDescription in quotes. If the node is a switch, this line also contains
+whether switch port 0 is base or enhanced, and the LID and LMC of port 0.
+Subsequent lines pertaining to this node show the connectivity. On the
+left is the port number of the current node. On the right is the peer node
+(node at other end of link). It is identified in quotes with nodetype
+followed by \- followed by NodeGUID with the port number in square brackets.
+Further on the right is a comment (#). What follows the comment is
+dependent on the node type. If it is a switch node, it is followed by
+the NodeDescription in quotes and the LID of the peer node. If it is a
+CA or router node, it is followed by the local LID and LMC and then
+followed by the NodeDescription in quotes and the LID of the peer node.
+The active link width and speed are then appended to the end of this
+output line.
+.sp
+An example of this is:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+#
+# Topology file: generated on Tue Jun 5 14:15:10 2007
+#
+# Max of 3 hops discovered
+# Initiated from node 0008f10403960558 port 0008f10403960559
+
+Non\-Chassis Nodes
+
+vendid=0x8f1
+devid=0x5a06
+sysimgguid=0x5442ba00003000
+switchguid=0x5442ba00003080(5442ba00003080)
+Switch 24 "S\-005442ba00003080" # "ISR9024 Voltaire" base port 0 lid 6 lmc 0
+[22] "H\-0008f10403961354"[1](8f10403961355) # "MT23108 InfiniHost Mellanox Technologies" lid 4 4xSDR
+[10] "S\-0008f10400410015"[1] # "SW\-6IB4 Voltaire" lid 3 4xSDR
+[8] "H\-0008f10403960558"[2](8f1040396055a) # "MT23108 InfiniHost Mellanox Technologies" lid 14 4xSDR
+[6] "S\-0008f10400410015"[3] # "SW\-6IB4 Voltaire" lid 3 4xSDR
+[12] "H\-0008f10403960558"[1](8f10403960559) # "MT23108 InfiniHost Mellanox Technologies" lid 10 4xSDR
+
+vendid=0x8f1
+devid=0x5a05
+switchguid=0x8f10400410015(8f10400410015)
+Switch 8 "S\-0008f10400410015" # "SW\-6IB4 Voltaire" base port 0 lid 3 lmc 0
+[6] "H\-0008f10403960984"[1](8f10403960985) # "MT23108 InfiniHost Mellanox Technologies" lid 16 4xSDR
+[4] "H\-005442b100004900"[1](5442b100004901) # "MT23108 InfiniHost Mellanox Technologies" lid 12 4xSDR
+[1] "S\-005442ba00003080"[10] # "ISR9024 Voltaire" lid 6 1xSDR
+[3] "S\-005442ba00003080"[6] # "ISR9024 Voltaire" lid 6 4xSDR
+
+vendid=0x2c9
+devid=0x5a44
+caguid=0x8f10403960984
+Ca 2 "H\-0008f10403960984" # "MT23108 InfiniHost Mellanox Technologies"
+[1](8f10403960985) "S\-0008f10400410015"[6] # lid 16 lmc 1 "SW\-6IB4 Voltaire" lid 3 4xSDR
+
+vendid=0x2c9
+devid=0x5a44
+caguid=0x5442b100004900
+Ca 2 "H\-005442b100004900" # "MT23108 InfiniHost Mellanox Technologies"
+[1](5442b100004901) "S\-0008f10400410015"[4] # lid 12 lmc 1 "SW\-6IB4 Voltaire" lid 3 4xSDR
+
+vendid=0x2c9
+devid=0x5a44
+caguid=0x8f10403961354
+Ca 2 "H\-0008f10403961354" # "MT23108 InfiniHost Mellanox Technologies"
+[1](8f10403961355) "S\-005442ba00003080"[22] # lid 4 lmc 1 "ISR9024 Voltaire" lid 6 4xSDR
+
+vendid=0x2c9
+devid=0x5a44
+caguid=0x8f10403960558
+Ca 2 "H\-0008f10403960558" # "MT23108 InfiniHost Mellanox Technologies"
+[2](8f1040396055a) "S\-005442ba00003080"[8] # lid 14 lmc 1 "ISR9024 Voltaire" lid 6 4xSDR
+[1](8f10403960559) "S\-005442ba00003080"[12] # lid 10 lmc 1 "ISR9024 Voltaire" lid 6 1xSDR
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+When grouping is used, IB nodes are organized into chassis which are
+numbered. Nodes which cannot be determined to be in a chassis are
+displayed as "Non\-Chassis Nodes". External ports are also shown on the
+connectivity lines.
+.SH UTILITIES LIST +.SS Basic fabric connectivity +.INDENT 0.0 +.INDENT 3.5 +See: ibnetdiscover, iblinkinfo +.UNINDENT +.UNINDENT +.SS Node information +.INDENT 0.0 +.INDENT 3.5 +See: ibnodes, ibswitches, ibhosts, ibrouters +.UNINDENT +.UNINDENT +.SS Port information +.INDENT 0.0 +.INDENT 3.5 +See: ibportstate, ibaddr +.UNINDENT +.UNINDENT +.SS Switch Forwarding Table info +.INDENT 0.0 +.INDENT 3.5 +See: ibtracert, ibroute, dump_lfts, dump_mfts, check_lft_balance, ibfindnodesusing +.UNINDENT +.UNINDENT +.SS Performance counters +.INDENT 0.0 +.INDENT 3.5 +See: ibqueryerrors, perfquery +.UNINDENT +.UNINDENT +.SS Local HCA info +.INDENT 0.0 +.INDENT 3.5 +See: ibstat, ibstatus +.UNINDENT +.UNINDENT +.SS Connectivity check +.INDENT 0.0 +.INDENT 3.5 +See: ibping, ibsysstat +.UNINDENT +.UNINDENT +.SS Low level query tools +.INDENT 0.0 +.INDENT 3.5 +See: smpquery, smpdump, saquery, sminfo +.UNINDENT +.UNINDENT +.SS Fabric verification tools +.INDENT 0.0 +.INDENT 3.5 +See: ibidsverify +.UNINDENT +.UNINDENT +.SH BACKWARDS COMPATIBILITY SCRIPTS +.sp +The following scripts have been identified as redundant and/or lower performing +as compared to the above scripts. They are provided as legacy scripts when +\-\-enable\-compat\-utils is specified at build time. +.sp +ibcheckerrors, ibclearcounters, ibclearerrors, ibdatacounters +ibchecknet, ibchecknode, ibcheckport, ibcheckportstate, +ibcheckportwidth, ibcheckstate, ibcheckwidth, ibswportwatch, +ibprintca, ibprintrt, ibprintswitch, set_nodedesc.sh +.SH AUTHORS +.INDENT 0.0 +.TP +.B Ira Weiny +< \fI\%ira.weiny@intel.com\fP > +.UNINDENT +.\" Generated by docutils manpage writer. +. diff --git a/buildlib/pandoc-prebuilt/4777e0f7fc83bcb92ce35f8ef344140f79bd1cce b/buildlib/pandoc-prebuilt/4777e0f7fc83bcb92ce35f8ef344140f79bd1cce new file mode 100644 index 0000000..95af5f5 --- /dev/null +++ b/buildlib/pandoc-prebuilt/4777e0f7fc83bcb92ce35f8ef344140f79bd1cce @@ -0,0 +1,206 @@ +.\" Man page generated from reStructuredText. +. +.TH SMPDUMP 8 "2017-08-21" "" "Open IB Diagnostics" +.SH NAME +smpdump \- dump InfiniBand subnet management attributes +. +.nr rst2man-indent-level 0 +. +.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. +.SH SYNOPSIS +.sp +smpdump [options] <dlid|dr_path> <attribute> [attribute_modifier] +.SH DESCRIPTION +.sp +smpdump is a general purpose SMP utility which gets SM attributes from a +specified SMA. The result is dumped in hex by default. +.SH OPTIONS +.INDENT 0.0 +.TP +.B \fBdlid|drpath\fP +LID or DR path to SMA +.TP +.B \fBattribute\fP +IBA attribute ID for SM attribute +.TP +.B \fBattribute_modifier\fP +IBA modifier for SM attribute +.TP +.B \fB\-s, \-\-string\fP +Print strings in packet if possible +.UNINDENT +.SS Addressing Flags +.\" Define the common option -D for Directed routes +. 
+.sp +\fB\-D, \-\-Direct\fP The address specified is a directed route +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +Examples: + [options] \-D [options] "0" # self port + [options] \-D [options] "0,1,2,1,4" # out via port 1, then 2, ... + + (Note the second number in the path specified must match the port being + used. This can be specified using the port selection flag \(aq\-P\(aq or the + port found through the automatic selection process.) +.ft P +.fi +.UNINDENT +.UNINDENT +.\" Define the common option -L +. +.sp +\fB\-L, \-\-Lid\fP The address specified is a LID +.SS Port Selection flags +.\" Define the common option -C +. +.sp +\fB\-C, \-\-Ca <ca_name>\fP use the specified ca_name. +.\" Define the common option -P +. +.sp +\fB\-P, \-\-Port <ca_port>\fP use the specified ca_port. +.\" Explanation of local port selection +. +.SS Local port Selection +.sp +Multiple port/Multiple CA support: when no IB device or port is specified +(see the "local umad parameters" below), the libibumad library +selects the port to use by the following criteria: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.IP 1. 3 +the first port that is ACTIVE. +.IP 2. 3 +if not found, the first port that is UP (physical link up). +.UNINDENT +.sp +If a port and/or CA name is specified, the libibumad library attempts +to fulfill the user request, and will fail if it is not possible. +.sp +For example: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +ibaddr # use the first port (criteria #1 above) +ibaddr \-C mthca1 # pick the best port from "mthca1" only. +ibaddr \-P 2 # use the second (active/up) port from the first available IB device. +ibaddr \-C mthca0 \-P 2 # use the specified port only. +.ft P +.fi +.UNINDENT +.UNINDENT +.UNINDENT +.UNINDENT +.SS Debugging flags +.\" Define the common option -d +. +.INDENT 0.0 +.TP +.B \-d +raise the IB debugging level. +May be used several times (\-ddd or \-d \-d \-d). +.UNINDENT +.\" Define the common option -e +. +.INDENT 0.0 +.TP +.B \-e +show send and receive errors (timeouts and others) +.UNINDENT +.\" Define the common option -h +. +.sp +\fB\-h, \-\-help\fP show the usage message +.\" Define the common option -v +. +.INDENT 0.0 +.TP +.B \fB\-v, \-\-verbose\fP +increase the application verbosity level. +May be used several times (\-vv or \-v \-v \-v) +.UNINDENT +.\" Define the common option -V +. +.sp +\fB\-V, \-\-version\fP show the version info. +.SS Configuration flags +.\" Define the common option -t +. +.sp +\fB\-t, \-\-timeout <timeout_ms>\fP override the default timeout for the solicited mads. +.\" Define the common option -z +. +.sp +\fB\-\-config, \-z <config_file>\fP Specify alternate config file. +.INDENT 0.0 +.INDENT 3.5 +Default: /usr/local/etc/infiniband\-diags/ibdiag.conf +.UNINDENT +.UNINDENT +.SH FILES +.\" Common text for the config file +. +.SS CONFIG FILE +.sp +/usr/local/etc/infiniband\-diags/ibdiag.conf +.sp +A global config file is provided to set some of the common options for all +tools. See supplied config file for details. +.SH EXAMPLES +.sp +Direct Routed Examples +.INDENT 0.0 +.TP +.B :: +smpdump \-D 0,1,2,3,5 16 # NODE DESC +smpdump \-D 0,1,2 0x15 2 # PORT INFO, port 2 +.UNINDENT +.sp +LID Routed Examples +.INDENT 0.0 +.TP +.B :: +smpdump 3 0x15 2 # PORT INFO, lid 3 port 2 +smpdump 0xa0 0x11 # NODE INFO, lid 0xa0 +.UNINDENT +.SH SEE ALSO +.sp +smpquery (8) +.SH AUTHOR +.INDENT 0.0 +.TP +.B Hal Rosenstock +< \fI\%halr@voltaire.com\fP > +.UNINDENT +.\" Generated by docutils manpage writer. +. 
diff --git a/buildlib/pandoc-prebuilt/4a2fda3e7e3b15e84396f81e6aae0bde38dcfb98 b/buildlib/pandoc-prebuilt/4a2fda3e7e3b15e84396f81e6aae0bde38dcfb98
new file mode 100644
index 0000000..2f20af5
--- /dev/null
+++ b/buildlib/pandoc-prebuilt/4a2fda3e7e3b15e84396f81e6aae0bde38dcfb98
@@ -0,0 +1,34 @@
+.\" Automatically generated by Pandoc 1.19.2.4
+.\"
+.TH "IBV_QUERY_GID" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[aq]s Manual"
+.hy
+.SH NAME
+.PP
+ibv_query_gid \- query an InfiniBand port\[aq]s GID table
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include\ <infiniband/verbs.h>
+
+int\ ibv_query_gid(struct\ ibv_context\ *context,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint8_t\ port_num,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ int\ index,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ union\ ibv_gid\ *gid);
+\f[]
+.fi
+.SH DESCRIPTION
+.PP
+\f[B]ibv_query_gid()\f[] returns the GID value in entry \f[I]index\f[]
+of port \f[I]port_num\f[] for device context \f[I]context\f[] through
+the pointer \f[I]gid\f[].
+.SH RETURN VALUE
+.PP
+\f[B]ibv_query_gid()\f[] returns 0 on success, and \-1 on error.
+.SH SEE ALSO
+.PP
+\f[B]ibv_open_device\f[](3), \f[B]ibv_query_device\f[](3),
+\f[B]ibv_query_pkey\f[](3), \f[B]ibv_query_port\f[](3)
+.SH AUTHOR
+.PP
+Dotan Barak <dotanba@gmail.com>
diff --git a/buildlib/pandoc-prebuilt/4aefe6ec699efe9cbeab3f78569c8ed5da970a2e b/buildlib/pandoc-prebuilt/4aefe6ec699efe9cbeab3f78569c8ed5da970a2e
new file mode 100644
index 0000000..f5f2c77
--- /dev/null
+++ b/buildlib/pandoc-prebuilt/4aefe6ec699efe9cbeab3f78569c8ed5da970a2e
@@ -0,0 +1,101 @@
+.\" Automatically generated by Pandoc 1.19.2.4
+.\"
+.TH "mlx5dv_create_flow_action_packet_reformat" "3" "" "" ""
+.hy
+.SH NAME
+.PP
+mlx5dv_create_flow_action_packet_reformat \- Flow action reformat packet
+for mlx5 provider
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include\ <infiniband/mlx5dv.h>
+
+struct\ ibv_flow_action\ *
+mlx5dv_create_flow_action_packet_reformat(struct\ ibv_context\ *ctx,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ size_t\ data_sz,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *data,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ enum\ mlx5dv_flow_action_packet_reformat_type\ reformat_type,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ enum\ mlx5dv_flow_table_type\ ft_type)
+\f[]
+.fi
+.SH DESCRIPTION
+.PP
+Create a packet reformat flow steering action.
+It allows adding/removing packet headers.
+.SH ARGUMENTS
+.TP
+.B \f[I]ctx\f[]
+.IP
+.nf
+\f[C]
+RDMA\ device\ context\ to\ create\ the\ action\ on.
+\f[]
+.fi
+.RS
+.RE
+.TP
+.B \f[I]data_sz\f[]
+.IP
+.nf
+\f[C]
+The\ size\ of\ the\ *data*\ buffer.
+\f[]
+.fi
+.RS
+.RE
+.TP
+.B \f[I]data\f[]
+.IP
+.nf
+\f[C]
+A\ buffer\ which\ contains\ headers\ in\ case\ the\ action\ requires\ them.
+\f[]
+.fi
+.RS
+.RE
+.TP
+.B \f[I]reformat_type\f[]
+.IP
+.nf
+\f[C]
+The\ reformat\ type\ to\ be\ created.\ Use\ enum\ mlx5dv_flow_action_packet_reformat_type.
+\f[]
+.fi
+.RS
+.PP
+MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2: Decap a generic
+L2 tunneled packet up to inner L2.
+.PP
+MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL: Generic encap,
+\f[I]data\f[] should contain the encapsulating headers.
+.PP
+MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2: Will do decap
+where the inner packet starts from L3.
+\f[I]data\f[] should be MAC or MAC + vlan (14 or 18 bytes) to be
+appended to the packet after the decap action.
+.PP
+MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL: Will do encap
+where the L2 of the original packet will not be included.
+\f[I]data\f[] should be the encapsulating header.
+.RE
+.TP
+.B \f[I]ft_type\f[]
+.IP
+.nf
+\f[C]
+It\ defines\ the\ flow\ table\ type\ to\ which\ the\ packet\ reformat\ action
+will\ be\ attached.
+\f[]
+.fi
+.RS
+.RE
+.SH RETURN VALUE
+.PP
+Upon success \f[I]mlx5dv_create_flow_action_packet_reformat\f[] will
+return a new \f[I]struct ibv_flow_action\f[] object, on error NULL will
+be returned and errno will be set.
+.SH SEE ALSO
+.PP
+\f[I]ibv_create_flow(3)\f[], \f[I]ibv_create_flow_action(3)\f[]
diff --git a/buildlib/pandoc-prebuilt/4c131186395bc5abd8dae77341e6a5fecc2aa827 b/buildlib/pandoc-prebuilt/4c131186395bc5abd8dae77341e6a5fecc2aa827
new file mode 100644
index 0000000..25056d5
--- /dev/null
+++ b/buildlib/pandoc-prebuilt/4c131186395bc5abd8dae77341e6a5fecc2aa827
@@ -0,0 +1,103 @@
+.\" Automatically generated by Pandoc 1.19.2.4
+.\"
+.TH "mlx5dv_create_flow" "3" "2018\-9\-19" "mlx5" "mlx5 Programmer\[aq]s Manual"
+.hy
+.SH NAME
+.PP
+mlx5dv_create_flow \- creates a steering flow rule
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include\ <infiniband/mlx5dv.h>
+
+struct\ ibv_flow\ *
+mlx5dv_create_flow(struct\ mlx5dv_flow_matcher\ *flow_matcher,
+\ \ \ \ \ \ \ \ \ \ \ struct\ mlx5dv_flow_match_parameters\ *match_value,
+\ \ \ \ \ \ \ \ \ \ \ size_t\ num_actions,
+\ \ \ \ \ \ \ \ \ \ \ struct\ mlx5dv_flow_action_attr\ actions_attr[])
+\f[]
+.fi
+.SH DESCRIPTION
+.PP
+\f[B]mlx5dv_create_flow()\f[] creates a steering flow rule with the
+ability to specify driver\-specific properties.
+.SH ARGUMENTS
+.PP
+Please see \f[I]mlx5dv_create_flow_matcher(3)\f[] for
+\f[I]flow_matcher\f[] and \f[I]match_value\f[].
+.TP
+.B \f[I]num_actions\f[]
+Specifies how many actions are passed in \f[I]actions_attr\f[]
+.RS
+.RE
+.SS \f[I]actions_attr\f[]
+.IP
+.nf
+\f[C]
+struct\ mlx5dv_flow_action_attr\ {
+\ \ \ \ enum\ mlx5dv_flow_action_type\ type;
+\ \ \ \ union\ {
+\ \ \ \ \ \ \ \ struct\ ibv_qp\ *qp;
+\ \ \ \ \ \ \ \ struct\ ibv_counters\ *counter;
+\ \ \ \ \ \ \ \ struct\ ibv_flow_action\ *action;
+\ \ \ \ \ \ \ \ uint32_t\ tag_value;
+\ \ \ \ \ \ \ \ struct\ mlx5dv_devx_obj\ *obj;
+\ \ \ \ };
+};
+\f[]
+.fi
+.TP
+.B \f[I]type\f[]
+MLX5DV_FLOW_ACTION_DEST_IBV_QP The QP passed will receive the matched
+packets.
+MLX5DV_FLOW_ACTION_IBV_FLOW_ACTION The flow action to be applied.
+MLX5DV_FLOW_ACTION_TAG Flow tag to be provided in work completion.
+MLX5DV_FLOW_ACTION_DEST_DEVX The DEVX destination object for the matched
+packets.
+MLX5DV_FLOW_ACTION_COUNTERS_DEVX The DEVX counter object for the matched
+packets.
+.RS
+.RE
+.TP
+.B \f[I]qp\f[]
+QP passed, to be used with \f[I]type\f[]
+\f[I]MLX5DV_FLOW_ACTION_DEST_IBV_QP\f[].
+.RS
+.RE
+.TP
+.B \f[I]action\f[]
+Flow action, to be used with \f[I]type\f[]
+\f[I]MLX5DV_FLOW_ACTION_IBV_FLOW_ACTION\f[] see
+\f[I]mlx5dv_create_flow_action_modify_header(3)\f[] and
+\f[I]mlx5dv_create_flow_action_packet_reformat(3)\f[].
+.RS
+.RE
+.TP
+.B \f[I]tag_value\f[]
+tag value to be passed in the work completion, to be used with
+\f[I]type\f[] \f[I]MLX5DV_FLOW_ACTION_TAG\f[] see
+\f[I]ibv_create_cq_ex(3)\f[].
+.RS
+.RE
+.TP
+.B \f[I]obj\f[]
+DEVX object, to be used with \f[I]type\f[]
+\f[I]MLX5DV_FLOW_ACTION_DEST_DEVX\f[] or by
+\f[I]MLX5DV_FLOW_ACTION_COUNTERS_DEVX\f[].
+.RS
+.RE
+.SH RETURN VALUE
+.PP
+\f[B]mlx5dv_create_flow\f[] returns a pointer to the created flow rule,
+on error NULL will be returned and errno will be set.
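+.PP
+For illustration only, a hedged sketch of a single\-action rule steering
+matched packets to a QP (\f[I]matcher\f[] and \f[I]value\f[] are assumed
+to come from \f[I]mlx5dv_create_flow_matcher(3)\f[], and \f[I]qp\f[] is
+an assumed existing QP; error handling is elided):
+.IP
+.nf
+\f[C]
+struct mlx5dv_flow_action_attr attr = {
+    .type = MLX5DV_FLOW_ACTION_DEST_IBV_QP,
+    .qp = qp,   /* matched packets are delivered to this QP */
+};
+
+struct ibv_flow *flow =
+    mlx5dv_create_flow(matcher, value, 1, &attr);
+if (!flow)
+    /* errno holds the failure reason */
+    return;
+\f[]
+.fi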
+.SH SEE ALSO
+.PP
+\f[I]mlx5dv_create_flow_action_modify_header(3)\f[],
+\f[I]mlx5dv_create_flow_action_packet_reformat(3)\f[],
+\f[I]mlx5dv_create_flow_matcher(3)\f[], \f[I]mlx5dv_create_qp(3)\f[],
+\f[I]ibv_create_qp_ex(3)\f[], \f[I]ibv_create_cq_ex(3)\f[],
+\f[I]ibv_create_counters(3)\f[]
+.SH AUTHOR
+.PP
+Mark Bloch <marb@mellanox.com>
diff --git a/buildlib/pandoc-prebuilt/515c4ddf52e644f7f347c40deac43d5ddb5bb19d b/buildlib/pandoc-prebuilt/515c4ddf52e644f7f347c40deac43d5ddb5bb19d
new file mode 100644
index 0000000..b857381
--- /dev/null
+++ b/buildlib/pandoc-prebuilt/515c4ddf52e644f7f347c40deac43d5ddb5bb19d
@@ -0,0 +1,44 @@
+.\" Automatically generated by Pandoc 1.19.2.4
+.\"
+.TH "EFADV_CREATE_DRIVER_QP" "3" "2019\-01\-23" "efa" "EFA Direct Verbs Manual"
+.hy
+.SH NAME
+.PP
+efadv_create_driver_qp \- Create EFA specific Queue Pair
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include\ <infiniband/efadv.h>
+
+struct\ ibv_qp\ *efadv_create_driver_qp(struct\ ibv_pd\ *ibvpd,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ ibv_qp_init_attr\ *attr,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint32_t\ driver_qp_type);
+\f[]
+.fi
+.SH DESCRIPTION
+.PP
+\f[B]efadv_create_driver_qp()\f[] creates device\-specific Queue Pairs.
+.PP
+Scalable Reliable Datagram (SRD) transport provides reliable
+out\-of\-order delivery, transparently utilizing multiple network paths
+to reduce network tail latency.
+Its interface is similar to UD; in particular, it supports message sizes
+up to MTU, with error handling extended to support reliable
+communication.
+.TP
+.B \f[I]driver_qp_type\f[]
+The type of QP to be created:
+.RS
+.PP
+EFADV_QP_DRIVER_TYPE_SRD: Create an SRD QP.
+.RE
+.SH RETURN VALUE
+.PP
+efadv_create_driver_qp() returns a pointer to the created QP, or NULL if
+the request fails.
+.SH SEE ALSO
+.PP
+\f[B]efadv\f[](7)
+.SH AUTHORS
+.PP
+Gal Pressman <galpress@amazon.com>
diff --git a/buildlib/pandoc-prebuilt/561c21785df0cfbff916d5860a43a2e301875e90 b/buildlib/pandoc-prebuilt/561c21785df0cfbff916d5860a43a2e301875e90
new file mode 100644
index 0000000..5d35760
--- /dev/null
+++ b/buildlib/pandoc-prebuilt/561c21785df0cfbff916d5860a43a2e301875e90
@@ -0,0 +1,126 @@
+.\" Man page generated from reStructuredText.
+.
+.TH IBFINDNODESUSING 8 "2017-08-21" "" "Open IB Diagnostics"
+.SH NAME
+ibfindnodesusing \- find a list of end nodes which are routed through the specified switch and port
+.
+.nr rst2man-indent-level 0
+.
+.de1 rstReportMargin
+\\$1 \\n[an-margin]
+level \\n[rst2man-indent-level]
+level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
+-
+\\n[rst2man-indent0]
+\\n[rst2man-indent1]
+\\n[rst2man-indent2]
+..
+.de1 INDENT
+.\" .rstReportMargin pre:
+. RS \\$1
+. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
+. nr rst2man-indent-level +1
+.\" .rstReportMargin post:
+..
+.de UNINDENT
+. RE
+.\" indent \\n[an-margin]
+.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.nr rst2man-indent-level -1
+.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
+..
+.SH SYNOPSIS
+.sp
+ibfindnodesusing.pl [options] <switch_guid|switch_name> <port>
+.SH DESCRIPTION
+.sp
+ibfindnodesusing.pl uses ibroute and detects the current nodes which are routed
+through both directions of the link specified. The link is specified by one
+switch port end; the script finds the remote end automatically.
+.SH OPTIONS +.INDENT 0.0 +.TP +.B \fB\-h\fP +show help +.TP +.B \fB\-R\fP +Recalculate the ibnetdiscover information, ie do not use the cached +information. This option is slower but should be used if the diag +tools have not been used for some time or if there are other reasons to +believe that the fabric has changed. +.UNINDENT +.sp +\fB\-C <ca_name>\fP use the specified ca_name. +.sp +\fB\-P <ca_port>\fP use the specified ca_port. +.SH FILES +.\" Common text for the config file +. +.SS CONFIG FILE +.sp +/usr/local/etc/infiniband\-diags/ibdiag.conf +.sp +A global config file is provided to set some of the common options for all +tools. See supplied config file for details. +.\" Common text to describe the node name map file. +. +.SS NODE NAME MAP FILE FORMAT +.sp +The node name map is used to specify user friendly names for nodes in the +output. GUIDs are used to perform the lookup. +.sp +This functionality is provided by the opensm\-libs package. See \fBopensm(8)\fP +for the file location for your installation. +.sp +\fBGenerically:\fP +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +# comment +<guid> "<name>" +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +\fBExample:\fP +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +# IB1 +# Line cards +0x0008f104003f125c "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB\-24D" +0x0008f104003f125d "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB\-24D" +0x0008f104003f10d2 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB\-24D" +0x0008f104003f10d3 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB\-24D" +0x0008f104003f10bf "IB1 (Rack 11 slot 12 ) ISR9288/ISR9096 Voltaire sLB\-24D" + +# Spines +0x0008f10400400e2d "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D" +0x0008f10400400e2e "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D" +0x0008f10400400e2f "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D" +0x0008f10400400e31 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB\-12D" +0x0008f10400400e32 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB\-12D" + +# GUID Node Name +0x0008f10400411a08 "SW1 (Rack 3) ISR9024 Voltaire 9024D" +0x0008f10400411a28 "SW2 (Rack 3) ISR9024 Voltaire 9024D" +0x0008f10400411a34 "SW3 (Rack 3) ISR9024 Voltaire 9024D" +0x0008f104004119d0 "SW4 (Rack 3) ISR9024 Voltaire 9024D" +.ft P +.fi +.UNINDENT +.UNINDENT +.SH AUTHOR +.INDENT 0.0 +.TP +.B Ira Weiny +< \fI\%ira.weiny@intel.com\fP > +.UNINDENT +.\" Generated by docutils manpage writer. +. diff --git a/buildlib/pandoc-prebuilt/561ea56de897d453a681e70cd2be7d0c0335e784 b/buildlib/pandoc-prebuilt/561ea56de897d453a681e70cd2be7d0c0335e784 new file mode 100644 index 0000000..95140f7 --- /dev/null +++ b/buildlib/pandoc-prebuilt/561ea56de897d453a681e70cd2be7d0c0335e784 @@ -0,0 +1,61 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "mlx5dv_create_flow_action_modify_header" "3" "" "" "" +.hy +.SH NAME +.PP +mlx5dv_create_flow_action_modify_header \- Flow action modify header for +mlx5 provider +.SH SYNOPSIS +.IP +.nf +\f[C] +#include\ <infiniband/mlx5dv.h> + +struct\ ibv_flow_action\ * +mlx5dv_create_flow_action_modify_header(struct\ ibv_context\ *ctx, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ size_t\ actions_sz, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint64_t\ actions[], +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ enum\ mlx5dv_flow_table_type\ ft_type) +\f[] +.fi +.SH DESCRIPTION +.PP +Create a modify header flow steering action, it allows mutating a packet +header. +.SH ARGUMENTS +.TP +.B \f[I]ctx\f[] +RDMA device context to create the action on. 
+.RS
+.RE
+.TP
+.B \f[I]actions_sz\f[]
+The size of \f[I]actions\f[] buffer in bytes.
+.RS
+.RE
+.TP
+.B \f[I]actions\f[]
+A buffer which contains modify actions provided in device spec format
+(i.e.
+be64).
+.RS
+.RE
+.TP
+.B \f[I]ft_type\f[]
+Defines the flow table type to which the modify header action will be
+attached.
+.RS
+.PP
+MLX5DV_FLOW_TABLE_TYPE_NIC_RX: RX FLOW TABLE
+.PP
+MLX5DV_FLOW_TABLE_TYPE_NIC_TX: TX FLOW TABLE
+.RE
+.SH RETURN VALUE
+.PP
+Upon success \f[I]mlx5dv_create_flow_action_modify_header\f[] will
+return a new \f[I]struct ibv_flow_action\f[] object, on error NULL will
+be returned and errno will be set.
+.SH SEE ALSO
+.PP
+\f[I]ibv_create_flow(3)\f[], \f[I]ibv_create_flow_action(3)\f[]
diff --git a/buildlib/pandoc-prebuilt/58748d44e47709c08982c6349ef8fc8891398ef3 b/buildlib/pandoc-prebuilt/58748d44e47709c08982c6349ef8fc8891398ef3
new file mode 100644
index 0000000..dbc26ee
--- /dev/null
+++ b/buildlib/pandoc-prebuilt/58748d44e47709c08982c6349ef8fc8891398ef3
@@ -0,0 +1,94 @@
+.\" Automatically generated by Pandoc 1.19.2.4
+.\"
+.TH "mlx5dv_devx_create_cmd_comp, mlx5dv_devx_destroy_cmd_comp, get_async" "3" "" "" ""
+.hy
+.SH NAME
+.PP
+mlx5dv_devx_create_cmd_comp \- Create a command completion to be used
+for DEVX asynchronous commands.
+.PP
+mlx5dv_devx_destroy_cmd_comp \- Destroy a devx command completion.
+.PP
+mlx5dv_devx_get_async_cmd_comp \- Get an asynchronous command
+completion.
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include\ <infiniband/mlx5dv.h>
+
+struct\ mlx5dv_devx_cmd_comp\ {
+\ \ \ \ int\ fd;
+};
+
+struct\ mlx5dv_devx_cmd_comp\ *
+mlx5dv_devx_create_cmd_comp(struct\ ibv_context\ *context)
+
+void\ mlx5dv_devx_destroy_cmd_comp(struct\ mlx5dv_devx_cmd_comp\ *cmd_comp)
+
+struct\ mlx5dv_devx_async_cmd_hdr\ {
+\ \ \ \ uint64_t\ \ \ \ wr_id;
+\ \ \ \ uint8_t\ \ \ \ \ out_data[];
+};
+
+int\ mlx5dv_devx_get_async_cmd_comp(struct\ mlx5dv_devx_cmd_comp\ *cmd_comp,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ mlx5dv_devx_async_cmd_hdr\ *cmd_resp,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ size_t\ cmd_resp_len)
+\f[]
+.fi
+.SH DESCRIPTION
+.PP
+Create or destroy a command completion to be used for DEVX asynchronous
+commands.
+.PP
+The create verb exposes an mlx5dv_devx_cmd_comp object that can be used
+as part of asynchronous DEVX commands.
+This lets an application run asynchronously without blocking; once the
+response is ready, it can be read from this object.
+.PP
+The response can be read by the mlx5dv_devx_get_async_cmd_comp() API;
+upon response, the \f[I]wr_id\f[] that was supplied with the asynchronous
+command is returned and the \f[I]out_data\f[] includes the data itself.
+The application must supply a large enough buffer to match any command
+that was issued on the \f[I]cmd_comp\f[]; its size is given by the input
+\f[I]cmd_resp_len\f[] parameter.
+.SH ARGUMENTS
+.TP
+.B \f[I]context\f[]
+.IP
+.nf
+\f[C]
+RDMA\ device\ context\ to\ create\ the\ command\ completion\ on.
+\f[]
+.fi
+.RS
+.RE
+.TP
+.B \f[I]cmd_comp\f[]
+The command completion object.
+.RS
+.RE
+.TP
+.B \f[I]cmd_resp\f[]
+The output data from the asynchronous command.
+.RS
+.RE
+.TP
+.B \f[I]cmd_resp_len\f[]
+The output buffer size to hold the response.
+.RS
+.RE
+.SH RETURN VALUE
+.PP
+Upon success \f[I]mlx5dv_devx_create_cmd_comp\f[] will return a new
+\f[I]struct mlx5dv_devx_cmd_comp\f[] object, on error NULL will be
+returned and errno will be set.
+.PP
+Upon success \f[I]mlx5dv_devx_get_async_cmd_comp\f[] will return 0,
+otherwise errno will be returned.
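+.PP
+A hedged illustration of draining one completion (it assumes an
+asynchronous command was already issued against \f[I]cmd_comp\f[], e.g.
+with \f[I]mlx5dv_devx_obj_query_async()\f[], that \f[I]out_len\f[] is
+large enough for any issued command, and that \f[I]process()\f[] is a
+hypothetical application handler; error handling is elided):
+.IP
+.nf
+\f[C]
+struct mlx5dv_devx_cmd_comp *cmd_comp =
+    mlx5dv_devx_create_cmd_comp(context);
+
+/* ... issue asynchronous commands carrying a wr_id ... */
+
+/* cmd_comp->fd may be waited on with poll()/epoll before reading */
+struct mlx5dv_devx_async_cmd_hdr *resp =
+    malloc(sizeof(*resp) + out_len);
+
+if (!mlx5dv_devx_get_async_cmd_comp(cmd_comp, resp,
+                                    sizeof(*resp) + out_len))
+    process(resp->wr_id, resp->out_data);
+
+free(resp);
+mlx5dv_devx_destroy_cmd_comp(cmd_comp);
+\f[]
+.fi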
+.SH SEE ALSO +.PP +\f[I]mlx5dv_open_device(3)\f[], \f[I]mlx5dv_devx_obj_create(3)\f[] +.SH AUTHOR +.PP +Yishai Hadas <yishaih@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/59234b57ac865b4965d1158c7bbcad075f57cb70 b/buildlib/pandoc-prebuilt/59234b57ac865b4965d1158c7bbcad075f57cb70 new file mode 100644 index 0000000..b6798a9 --- /dev/null +++ b/buildlib/pandoc-prebuilt/59234b57ac865b4965d1158c7bbcad075f57cb70 @@ -0,0 +1,176 @@ +.\" Man page generated from reStructuredText. +. +.TH IBNODES 8 "2012-05-14" "" "OpenIB Diagnostics" +.SH NAME +IBNODES \- show InfiniBand nodes in topology +. +.nr rst2man-indent-level 0 +. +.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. +.SH SYNOPSIS +.sp +ibnodes [options] [<topology\-file>] +.SH DESCRIPTION +.sp +ibnodes is a script which either walks the IB subnet topology or uses an +already saved topology file and extracts the IB nodes (CAs and switches). +.SH OPTIONS +.\" Define the common option -C +. +.sp +\fB\-C, \-\-Ca <ca_name>\fP use the specified ca_name. +.\" Define the common option -P +. +.sp +\fB\-P, \-\-Port <ca_port>\fP use the specified ca_port. +.\" Define the common option -t +. +.sp +\fB\-t, \-\-timeout <timeout_ms>\fP override the default timeout for the solicited mads. +.\" Define the common option -h +. +.sp +\fB\-h, \-\-help\fP show the usage message +.\" Define the common option -z +. +.sp +\fB\-\-config, \-z <config_file>\fP Specify alternate config file. +.INDENT 0.0 +.INDENT 3.5 +Default: /usr/local/etc/infiniband\-diags/ibdiag.conf +.UNINDENT +.UNINDENT +.\" Explanation of local port selection +. +.SS Local port Selection +.sp +Multiple port/Multiple CA support: when no IB device or port is specified +(see the "local umad parameters" below), the libibumad library +selects the port to use by the following criteria: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.IP 1. 3 +the first port that is ACTIVE. +.IP 2. 3 +if not found, the first port that is UP (physical link up). +.UNINDENT +.sp +If a port and/or CA name is specified, the libibumad library attempts +to fulfill the user request, and will fail if it is not possible. +.sp +For example: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +ibaddr # use the first port (criteria #1 above) +ibaddr \-C mthca1 # pick the best port from "mthca1" only. +ibaddr \-P 2 # use the second (active/up) port from the first available IB device. +ibaddr \-C mthca0 \-P 2 # use the specified port only. +.ft P +.fi +.UNINDENT +.UNINDENT +.UNINDENT +.UNINDENT +.SH FILES +.\" Common text for the config file +. +.SS CONFIG FILE +.sp +/usr/local/etc/infiniband\-diags/ibdiag.conf +.sp +A global config file is provided to set some of the common options for all +tools. See supplied config file for details. +.\" Common text to describe the node name map file. +. +.SS NODE NAME MAP FILE FORMAT +.sp +The node name map is used to specify user friendly names for nodes in the +output. GUIDs are used to perform the lookup. 
+.sp +This functionality is provided by the opensm\-libs package. See \fBopensm(8)\fP +for the file location for your installation. +.sp +\fBGenerically:\fP +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +# comment +<guid> "<name>" +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +\fBExample:\fP +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +# IB1 +# Line cards +0x0008f104003f125c "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB\-24D" +0x0008f104003f125d "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB\-24D" +0x0008f104003f10d2 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB\-24D" +0x0008f104003f10d3 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB\-24D" +0x0008f104003f10bf "IB1 (Rack 11 slot 12 ) ISR9288/ISR9096 Voltaire sLB\-24D" + +# Spines +0x0008f10400400e2d "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D" +0x0008f10400400e2e "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D" +0x0008f10400400e2f "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D" +0x0008f10400400e31 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB\-12D" +0x0008f10400400e32 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB\-12D" + +# GUID Node Name +0x0008f10400411a08 "SW1 (Rack 3) ISR9024 Voltaire 9024D" +0x0008f10400411a28 "SW2 (Rack 3) ISR9024 Voltaire 9024D" +0x0008f10400411a34 "SW3 (Rack 3) ISR9024 Voltaire 9024D" +0x0008f104004119d0 "SW4 (Rack 3) ISR9024 Voltaire 9024D" +.ft P +.fi +.UNINDENT +.UNINDENT +.SH SEE ALSO +.sp +ibnetdiscover(8) +.SH DEPENDENCIES +.sp +ibnetdiscover, ibnetdiscover format +.SH AUTHOR +.INDENT 0.0 +.TP +.B Hal Rosenstock +< \fI\%halr@voltaire.com\fP > +.UNINDENT +.\" Generated by docutils manpage writer. +. diff --git a/buildlib/pandoc-prebuilt/5b31ff63cab89c3d9f686c913c8ea8fb013fb5c6 b/buildlib/pandoc-prebuilt/5b31ff63cab89c3d9f686c913c8ea8fb013fb5c6 new file mode 100644 index 0000000..d37553d --- /dev/null +++ b/buildlib/pandoc-prebuilt/5b31ff63cab89c3d9f686c913c8ea8fb013fb5c6 @@ -0,0 +1,117 @@ +.\" Man page generated from reStructuredText. +. +.TH IBSTAT 8 "2017-08-21" "" "Open IB Diagnostics" +.SH NAME +ibstat \- query basic status of InfiniBand device(s) +. +.nr rst2man-indent-level 0 +. +.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. +.SH SYNOPSIS +.sp +ibstat [options] <ca_name> [portnum] +.SH DESCRIPTION +.sp +ibstat is a binary which displays basic information obtained from the local +IB driver. Output includes LID, SMLID, port state, link width active, and port +physical state. +.sp +It is similar to the ibstatus utility but implemented as a binary rather +than a script. It has options to list CAs and/or ports and displays more +information than ibstatus. +.SH OPTIONS +.INDENT 0.0 +.TP +.B \fB\-l, \-\-list_of_cas\fP +list all IB devices +.TP +.B \fB\-s, \-\-short\fP +short output +.TP +.B \fB\-p, \-\-port_list\fP +show port list +.TP +.B \fBca_name\fP +InfiniBand device name +.TP +.B \fBportnum\fP +port number of InfiniBand device +.UNINDENT +.SS Debugging flags +.\" Define the common option -d +. 
+.INDENT 0.0
+.TP
+.B \-d
+raise the IB debugging level.
+May be used several times (\-ddd or \-d \-d \-d).
+.UNINDENT
+.\" Define the common option -h
+.
+.sp
+\fB\-h, \-\-help\fP show the usage message
+.\" Define the common option -v
+.
+.INDENT 0.0
+.TP
+.B \fB\-v, \-\-verbose\fP
+increase the application verbosity level.
+May be used several times (\-vv or \-v \-v \-v)
+.UNINDENT
+.\" Define the common option -V
+.
+.sp
+\fB\-V, \-\-version\fP show the version info.
+.SS Configuration flags
+.\" Define the common option -z
+.
+.sp
+\fB\-\-config, \-z <config_file>\fP Specify alternate config file.
+.INDENT 0.0
+.INDENT 3.5
+Default: /usr/local/etc/infiniband\-diags/ibdiag.conf
+.UNINDENT
+.UNINDENT
+.SH EXAMPLES
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ibstat # display status of all ports on all IB devices
+ibstat \-l # list all IB devices
+ibstat \-p # show port guids
+ibstat mthca0 2 # show status of port 2 of \(aqmthca0\(aq
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.SH SEE ALSO
+.sp
+ibstatus (8)
+.SH AUTHOR
+.INDENT 0.0
+.TP
+.B Hal Rosenstock
+< \fI\%halr@voltaire.com\fP >
+.UNINDENT
+.\" Generated by docutils manpage writer.
+. diff --git a/buildlib/pandoc-prebuilt/5d0581b1521f84755910847649fa7df349461762 b/buildlib/pandoc-prebuilt/5d0581b1521f84755910847649fa7df349461762 new file mode 100644 index 0000000..e45da89 --- /dev/null +++ b/buildlib/pandoc-prebuilt/5d0581b1521f84755910847649fa7df349461762 @@ -0,0 +1,31 @@
+.\" Automatically generated by Pandoc 1.19.2.4
+.\"
+.TH "IBV_GET_DEVICE_GUID" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[aq]s Manual"
+.hy
+.SH NAME
+.PP
+ibv_get_device_guid \- get an RDMA device\[aq]s GUID
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include\ <infiniband/verbs.h>
+
+uint64_t\ ibv_get_device_guid(struct\ ibv_device\ *device);
+\f[]
+.fi
+.SH DESCRIPTION
+.PP
+\f[B]ibv_get_device_guid()\f[] returns the Global Unique IDentifier
+(GUID) of the RDMA device \f[I]device\f[].
+.SH RETURN VALUE
+.PP
+\f[B]ibv_get_device_guid()\f[] returns the GUID of the device in network
+byte order.
+.SH SEE ALSO
+.PP
+\f[B]ibv_get_device_list\f[](3), \f[B]ibv_get_device_name\f[](3),
+\f[B]ibv_open_device\f[](3)
+.SH AUTHOR
+.PP
+Dotan Barak <dotanba@gmail.com> diff --git a/buildlib/pandoc-prebuilt/5f21cbf573fd8a93285ae0383d13fc994ea9a79a b/buildlib/pandoc-prebuilt/5f21cbf573fd8a93285ae0383d13fc994ea9a79a new file mode 100644 index 0000000..4df57ff --- /dev/null +++ b/buildlib/pandoc-prebuilt/5f21cbf573fd8a93285ae0383d13fc994ea9a79a @@ -0,0 +1,309 @@
+.\" Man page generated from reStructuredText.
+.
+.TH SMPQUERY 8 "2017-08-21" "" "Open IB Diagnostics"
+.SH NAME
+smpquery \- query InfiniBand subnet management attributes
+.
+.nr rst2man-indent-level 0
+.
+.de1 rstReportMargin
+\\$1 \\n[an-margin]
+level \\n[rst2man-indent-level]
+level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
+-
+\\n[rst2man-indent0]
+\\n[rst2man-indent1]
+\\n[rst2man-indent2]
+..
+.de1 INDENT
+.\" .rstReportMargin pre:
+. RS \\$1
+. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
+. nr rst2man-indent-level +1
+.\" .rstReportMargin post:
+..
+.de UNINDENT
+. RE
+.\" indent \\n[an-margin]
+.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.nr rst2man-indent-level -1
+.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
+..
+.SH SYNOPSIS
+.sp
+smpquery [options] <op> <dest dr_path|lid|guid> [op params]
+.SH DESCRIPTION
+.sp
+smpquery allows a basic subset of standard SMP queries including the following:
+node info, node description, switch info, port info.
Fields are displayed in +human readable format. +.SH OPTIONS +.sp +Current supported operations (case insensitive) and their parameters: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +Nodeinfo (NI) <addr> + +Nodedesc (ND) <addr> + +Portinfo (PI) <addr> [<portnum>] # default port is zero + +PortInfoExtended (PIE) <addr> [<portnum>] + +Switchinfo (SI) <addr> + +PKeyTable (PKeys) <addr> [<portnum>] + +SL2VLTable (SL2VL) <addr> [<portnum>] + +VLArbitration (VLArb) <addr> [<portnum>] + +GUIDInfo (GI) <addr> + +MlnxExtPortInfo (MEPI) <addr> [<portnum>] # default port is zero +.ft P +.fi +.UNINDENT +.UNINDENT +.INDENT 0.0 +.TP +.B \fB\-c, \-\-combined\fP +Use Combined route address argument \fB<lid> <DR_Path>\fP +.TP +.B \fB\-x, \-\-extended\fP +Set SMSupportsExtendedSpeeds bit 31 in AttributeModifier +(only impacts PortInfo queries). +.UNINDENT +.\" Define the common option -K +. +.INDENT 0.0 +.TP +.B \fB\-K, \-\-show_keys\fP +show security keys (mkey, smkey, etc.) associated with the request. +.UNINDENT +.SS Addressing Flags +.\" Define the common option -D for Directed routes +. +.sp +\fB\-D, \-\-Direct\fP The address specified is a directed route +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +Examples: + [options] \-D [options] "0" # self port + [options] \-D [options] "0,1,2,1,4" # out via port 1, then 2, ... + + (Note the second number in the path specified must match the port being + used. This can be specified using the port selection flag \(aq\-P\(aq or the + port found through the automatic selection process.) +.ft P +.fi +.UNINDENT +.UNINDENT +.\" Define the common option -G +. +.sp +\fB\-G, \-\-Guid\fP The address specified is a Port GUID +.\" Define the common option -L +. +.sp +\fB\-L, \-\-Lid\fP The address specified is a LID +.\" Define the common option -s +. +.sp +\fB\-s, \-\-sm_port <smlid>\fP use \(aqsmlid\(aq as the target lid for SA queries. +.SS Port Selection flags +.\" Define the common option -C +. +.sp +\fB\-C, \-\-Ca <ca_name>\fP use the specified ca_name. +.\" Define the common option -P +. +.sp +\fB\-P, \-\-Port <ca_port>\fP use the specified ca_port. +.\" Explanation of local port selection +. +.SS Local port Selection +.sp +Multiple port/Multiple CA support: when no IB device or port is specified +(see the "local umad parameters" below), the libibumad library +selects the port to use by the following criteria: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.IP 1. 3 +the first port that is ACTIVE. +.IP 2. 3 +if not found, the first port that is UP (physical link up). +.UNINDENT +.sp +If a port and/or CA name is specified, the libibumad library attempts +to fulfill the user request, and will fail if it is not possible. +.sp +For example: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +ibaddr # use the first port (criteria #1 above) +ibaddr \-C mthca1 # pick the best port from "mthca1" only. +ibaddr \-P 2 # use the second (active/up) port from the first available IB device. +ibaddr \-C mthca0 \-P 2 # use the specified port only. +.ft P +.fi +.UNINDENT +.UNINDENT +.UNINDENT +.UNINDENT +.SS Debugging flags +.\" Define the common option -d +. +.INDENT 0.0 +.TP +.B \-d +raise the IB debugging level. +May be used several times (\-ddd or \-d \-d \-d). +.UNINDENT +.\" Define the common option -e +. +.INDENT 0.0 +.TP +.B \-e +show send and receive errors (timeouts and others) +.UNINDENT +.\" Define the common option -h +. +.sp +\fB\-h, \-\-help\fP show the usage message +.\" Define the common option -v +. +.INDENT 0.0 +.TP +.B \fB\-v, \-\-verbose\fP +increase the application verbosity level. 
+May be used several times (\-vv or \-v \-v \-v)
+.UNINDENT
+.\" Define the common option -V
+.
+.sp
+\fB\-V, \-\-version\fP show the version info.
+.SS Configuration flags
+.\" Define the common option -t
+.
+.sp
+\fB\-t, \-\-timeout <timeout_ms>\fP override the default timeout for the solicited mads.
+.\" Define the common option --node-name-map
+.
+.sp
+\fB\-\-node\-name\-map <node\-name\-map>\fP Specify a node name map.
+.INDENT 0.0
+.INDENT 3.5
+This file maps GUIDs to more user friendly names. See FILES section.
+.UNINDENT
+.UNINDENT
+.\" Define the common option -y
+.
+.INDENT 0.0
+.TP
+.B \fB\-y, \-\-m_key <key>\fP
+use the specified M_key for requests. If non\-numeric value (like \(aqx\(aq)
+is specified then a value will be prompted for.
+.UNINDENT
+.\" Define the common option -z
+.
+.sp
+\fB\-\-config, \-z <config_file>\fP Specify alternate config file.
+.INDENT 0.0
+.INDENT 3.5
+Default: /usr/local/etc/infiniband\-diags/ibdiag.conf
+.UNINDENT
+.UNINDENT
+.SH FILES
+.\" Common text for the config file
+.
+.SS CONFIG FILE
+.sp
+/usr/local/etc/infiniband\-diags/ibdiag.conf
+.sp
+A global config file is provided to set some of the common options for all
+tools. See supplied config file for details.
+.\" Common text to describe the node name map file.
+.
+.SS NODE NAME MAP FILE FORMAT
+.sp
+The node name map is used to specify user friendly names for nodes in the
+output. GUIDs are used to perform the lookup.
+.sp
+This functionality is provided by the opensm\-libs package. See \fBopensm(8)\fP
+for the file location for your installation.
+.sp
+\fBGenerically:\fP
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+# comment
+<guid> "<name>"
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+\fBExample:\fP
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+# IB1
+# Line cards
+0x0008f104003f125c "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB\-24D"
+0x0008f104003f125d "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB\-24D"
+0x0008f104003f10d2 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB\-24D"
+0x0008f104003f10d3 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB\-24D"
+0x0008f104003f10bf "IB1 (Rack 11 slot 12 ) ISR9288/ISR9096 Voltaire sLB\-24D"
+
+# Spines
+0x0008f10400400e2d "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D"
+0x0008f10400400e2e "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D"
+0x0008f10400400e2f "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D"
+0x0008f10400400e31 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB\-12D"
+0x0008f10400400e32 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB\-12D"
+
+# GUID Node Name
+0x0008f10400411a08 "SW1 (Rack 3) ISR9024 Voltaire 9024D"
+0x0008f10400411a28 "SW2 (Rack 3) ISR9024 Voltaire 9024D"
+0x0008f10400411a34 "SW3 (Rack 3) ISR9024 Voltaire 9024D"
+0x0008f104004119d0 "SW4 (Rack 3) ISR9024 Voltaire 9024D"
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.SH EXAMPLES
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+smpquery portinfo 3 1 # portinfo by lid, with port modifier
+smpquery \-G switchinfo 0x2C9000100D051 1 # switchinfo by guid
+smpquery \-D nodeinfo 0 # nodeinfo by direct route
+smpquery \-c nodeinfo 6 0,12 # nodeinfo by combined route
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.SH SEE ALSO
+.sp
+smpdump (8)
+.SH AUTHOR
+.INDENT 0.0
+.TP
+.B Hal Rosenstock
+< \fI\%hal@mellanox.com\fP >
+.UNINDENT
+.\" Generated by docutils manpage writer.
+.
diff --git a/buildlib/pandoc-prebuilt/5f388dcd4b7814975cdfb7bf0ebab9733c72f1e6 b/buildlib/pandoc-prebuilt/5f388dcd4b7814975cdfb7bf0ebab9733c72f1e6 new file mode 100644 index 0000000..a5f0b7e --- /dev/null +++ b/buildlib/pandoc-prebuilt/5f388dcd4b7814975cdfb7bf0ebab9733c72f1e6 @@ -0,0 +1,374 @@ +.\" Man page generated from reStructuredText. +. +.TH SAQUERY 8 "2017-08-21" "" "Open IB Diagnostics" +.SH NAME +saquery \- query InfiniBand subnet administration attributes +. +.nr rst2man-indent-level 0 +. +.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. +.SH SYNOPSIS +.sp +saquery [options] [<name> | <lid> | <guid>] +.SH DESCRIPTION +.sp +saquery issues the selected SA query. Node records are queried by default. +.SH OPTIONS +.INDENT 0.0 +.TP +.B \fB\-p\fP +get PathRecord info +.TP +.B \fB\-N\fP +get NodeRecord info +.TP +.B \fB\-D, \-\-list\fP +get NodeDescriptions of CAs only +.TP +.B \fB\-S\fP +get ServiceRecord info +.TP +.B \fB\-I\fP +get InformInfoRecord (subscription) info +.TP +.B \fB\-L\fP +return the Lids of the name specified +.TP +.B \fB\-l\fP +return the unique Lid of the name specified +.TP +.B \fB\-G\fP +return the Guids of the name specified +.TP +.B \fB\-O\fP +return the name for the Lid specified +.TP +.B \fB\-U\fP +return the name for the Guid specified +.TP +.B \fB\-c\fP +get the SA\(aqs class port info +.TP +.B \fB\-s\fP +return the PortInfoRecords with isSM or isSMdisabled capability mask bit on +.TP +.B \fB\-g\fP +get multicast group info +.TP +.B \fB\-m\fP +get multicast member info. If a group is specified, limit the output +to the group specified and print one line containing only the GUID and +node description for each entry. Example: saquery \-m 0xc000 +.TP +.B \fB\-x\fP +get LinkRecord info +.TP +.B \fB\-\-src\-to\-dst <src:dst>\fP +get a PathRecord for <src:dst> +where src and dst are either node names or LIDs +.TP +.B \fB\-\-sgid\-to\-dgid <sgid:dgid>\fP +get a PathRecord for \fBsgid\fP to \fBdgid\fP +where both GIDs are in an IPv6 format acceptable to \fBinet_pton (3)\fP +.TP +.B \fB\-\-smkey <val>\fP +use SM_Key value for the query. Will be used only with "trusted" +queries. If non\-numeric value (like \(aqx\(aq) is specified then saquery +will prompt for a value. +Default (when not specified here or in +/usr/local/etc/infiniband\-diags/ibdiag.conf) is to use SM_Key == 0 (or +"untrusted") +.UNINDENT +.\" Define the common option -K +. +.INDENT 0.0 +.TP +.B \fB\-K, \-\-show_keys\fP +show security keys (mkey, smkey, etc.) associated with the request. 
+.UNINDENT +.sp +\fB\-\-slid <lid>\fP Source LID (PathRecord) +.sp +\fB\-\-dlid <lid>\fP Destination LID (PathRecord) +.sp +\fB\-\-mlid <lid>\fP Multicast LID (MCMemberRecord) +.sp +\fB\-\-sgid <gid>\fP Source GID (IPv6 format) (PathRecord) +.sp +\fB\-\-dgid <gid>\fP Destination GID (IPv6 format) (PathRecord) +.sp +\fB\-\-gid <gid>\fP Port GID (MCMemberRecord) +.sp +\fB\-\-mgid <gid>\fP Multicast GID (MCMemberRecord) +.sp +\fB\-\-reversible\fP Reversible path (PathRecord) +.sp +\fB\-\-numb_path\fP Number of paths (PathRecord) +.INDENT 0.0 +.TP +.B \fB\-\-pkey\fP P_Key (PathRecord, MCMemberRecord). If non\-numeric value (like \(aqx\(aq) +is specified then saquery will prompt for a value +.UNINDENT +.sp +\fB\-\-qos_class\fP QoS Class (PathRecord) +.sp +\fB\-\-sl\fP Service level (PathRecord, MCMemberRecord) +.sp +\fB\-\-mtu\fP MTU and selector (PathRecord, MCMemberRecord) +.sp +\fB\-\-rate\fP Rate and selector (PathRecord, MCMemberRecord) +.sp +\fB\-\-pkt_lifetime\fP Packet lifetime and selector (PathRecord, MCMemberRecord) +.INDENT 0.0 +.TP +.B \fB\-\-qkey\fP Q_Key (MCMemberRecord). If non\-numeric value (like \(aqx\(aq) is specified +then saquery will prompt for a value +.UNINDENT +.sp +\fB\-\-tclass\fP Traffic Class (PathRecord, MCMemberRecord) +.sp +\fB\-\-flow_label\fP Flow Label (PathRecord, MCMemberRecord) +.sp +\fB\-\-hop_limit\fP Hop limit (PathRecord, MCMemberRecord) +.sp +\fB\-\-scope\fP Scope (MCMemberRecord) +.sp +\fB\-\-join_state\fP Join state (MCMemberRecord) +.sp +\fB\-\-proxy_join\fP Proxy join (MCMemberRecord) +.sp +\fB\-\-service_id\fP ServiceID (PathRecord) +.sp +Supported query names (and aliases): +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +ClassPortInfo (CPI) +NodeRecord (NR) [lid] +PortInfoRecord (PIR) [[lid]/[port]/[options]] +SL2VLTableRecord (SL2VL) [[lid]/[in_port]/[out_port]] +PKeyTableRecord (PKTR) [[lid]/[port]/[block]] +VLArbitrationTableRecord (VLAR) [[lid]/[port]/[block]] +InformInfoRecord (IIR) +LinkRecord (LR) [[from_lid]/[from_port]] [[to_lid]/[to_port]] +ServiceRecord (SR) +PathRecord (PR) +MCMemberRecord (MCMR) +LFTRecord (LFTR) [[lid]/[block]] +MFTRecord (MFTR) [[mlid]/[position]/[block]] +GUIDInfoRecord (GIR) [[lid]/[block]] +SwitchInfoRecord (SWIR) [lid] +SMInfoRecord (SMIR) [lid] +.ft P +.fi +.UNINDENT +.UNINDENT +.SS Port Selection flags +.\" Define the common option -C +. +.sp +\fB\-C, \-\-Ca <ca_name>\fP use the specified ca_name. +.\" Define the common option -P +. +.sp +\fB\-P, \-\-Port <ca_port>\fP use the specified ca_port. +.\" Explanation of local port selection +. +.SS Local port Selection +.sp +Multiple port/Multiple CA support: when no IB device or port is specified +(see the "local umad parameters" below), the libibumad library +selects the port to use by the following criteria: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.IP 1. 3 +the first port that is ACTIVE. +.IP 2. 3 +if not found, the first port that is UP (physical link up). +.UNINDENT +.sp +If a port and/or CA name is specified, the libibumad library attempts +to fulfill the user request, and will fail if it is not possible. +.sp +For example: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +ibaddr # use the first port (criteria #1 above) +ibaddr \-C mthca1 # pick the best port from "mthca1" only. +ibaddr \-P 2 # use the second (active/up) port from the first available IB device. +ibaddr \-C mthca0 \-P 2 # use the specified port only. +.ft P +.fi +.UNINDENT +.UNINDENT +.UNINDENT +.UNINDENT +.SS Debugging flags +.\" Define the common option -d +. 
+.INDENT 0.0 +.TP +.B \-d +raise the IB debugging level. +May be used several times (\-ddd or \-d \-d \-d). +.UNINDENT +.\" Define the common option -e +. +.INDENT 0.0 +.TP +.B \-e +show send and receive errors (timeouts and others) +.UNINDENT +.\" Define the common option -h +. +.sp +\fB\-h, \-\-help\fP show the usage message +.\" Define the common option -v +. +.INDENT 0.0 +.TP +.B \fB\-v, \-\-verbose\fP +increase the application verbosity level. +May be used several times (\-vv or \-v \-v \-v) +.UNINDENT +.\" Define the common option -V +. +.sp +\fB\-V, \-\-version\fP show the version info. +.SS Configuration flags +.\" Define the common option -t +. +.sp +\fB\-t, \-\-timeout <timeout_ms>\fP override the default timeout for the solicited mads. +.\" Define the common option -z +. +.INDENT 0.0 +.TP +.B \fB\-\-outstanding_smps, \-o <val>\fP +Specify the number of outstanding SMP\(aqs which should be issued during the scan +.sp +Default: 2 +.UNINDENT +.\" Define the common option --node-name-map +. +.sp +\fB\-\-node\-name\-map <node\-name\-map>\fP Specify a node name map. +.INDENT 0.0 +.INDENT 3.5 +This file maps GUIDs to more user friendly names. See FILES section. +.UNINDENT +.UNINDENT +.\" Define the common option -z +. +.sp +\fB\-\-config, \-z <config_file>\fP Specify alternate config file. +.INDENT 0.0 +.INDENT 3.5 +Default: /usr/local/etc/infiniband\-diags/ibdiag.conf +.UNINDENT +.UNINDENT +.SH COMMON FILES +.\" Common text for the config file +. +.SS CONFIG FILE +.sp +/usr/local/etc/infiniband\-diags/ibdiag.conf +.sp +A global config file is provided to set some of the common options for all +tools. See supplied config file for details. +.\" Common text to describe the node name map file. +. +.SS NODE NAME MAP FILE FORMAT +.sp +The node name map is used to specify user friendly names for nodes in the +output. GUIDs are used to perform the lookup. +.sp +This functionality is provided by the opensm\-libs package. See \fBopensm(8)\fP +for the file location for your installation. +.sp +\fBGenerically:\fP +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +# comment +<guid> "<name>" +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +\fBExample:\fP +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +# IB1 +# Line cards +0x0008f104003f125c "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB\-24D" +0x0008f104003f125d "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB\-24D" +0x0008f104003f10d2 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB\-24D" +0x0008f104003f10d3 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB\-24D" +0x0008f104003f10bf "IB1 (Rack 11 slot 12 ) ISR9288/ISR9096 Voltaire sLB\-24D" + +# Spines +0x0008f10400400e2d "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D" +0x0008f10400400e2e "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D" +0x0008f10400400e2f "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D" +0x0008f10400400e31 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB\-12D" +0x0008f10400400e32 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB\-12D" + +# GUID Node Name +0x0008f10400411a08 "SW1 (Rack 3) ISR9024 Voltaire 9024D" +0x0008f10400411a28 "SW2 (Rack 3) ISR9024 Voltaire 9024D" +0x0008f10400411a34 "SW3 (Rack 3) ISR9024 Voltaire 9024D" +0x0008f104004119d0 "SW4 (Rack 3) ISR9024 Voltaire 9024D" +.ft P +.fi +.UNINDENT +.UNINDENT +.SH DEPENDENCIES +.sp +OpenSM (or other running SM/SA), libosmcomp, libibumad, libibmad +.SH AUTHORS +.INDENT 0.0 +.TP +.B Ira Weiny +< \fI\%ira.weiny@intel.com\fP > +.TP +.B Hal Rosenstock +< \fI\%halr@mellanox.com\fP > +.UNINDENT +.\" Generated by docutils manpage writer. +. 
diff --git a/buildlib/pandoc-prebuilt/624de381c4dd90a5061dfb899e33d1aff4f8af1c b/buildlib/pandoc-prebuilt/624de381c4dd90a5061dfb899e33d1aff4f8af1c new file mode 100644 index 0000000..174184c --- /dev/null +++ b/buildlib/pandoc-prebuilt/624de381c4dd90a5061dfb899e33d1aff4f8af1c @@ -0,0 +1,36 @@
+.\" Automatically generated by Pandoc 1.19.2.4
+.\"
+.TH "mlx5dv_is_supported" "3" "" "" ""
+.hy
+.SH NAME
+.PP
+mlx5dv_is_supported \- Check whether an RDMA device is implemented by
+the mlx5 provider
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include\ <infiniband/mlx5dv.h>
+
+bool\ mlx5dv_is_supported(struct\ ibv_device\ *device);
+\f[]
+.fi
+.SH DESCRIPTION
+.PP
+mlx5dv functions may be called only if this function returns true for
+the RDMA device.
+.SH ARGUMENTS
+.TP
+.B \f[I]device\f[]
+RDMA device to check.
+.RS
+.RE
+.SH RETURN VALUE
+.PP
+Returns true if the device is implemented by the mlx5 provider.
+.SH SEE ALSO
+.PP
+\f[I]mlx5dv(7)\f[]
+.SH AUTHOR
+.PP
+Artemy Kovalyov <artemyko@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/6465ebc6a2cda4fc52fa1badb65b07e7effaca6b b/buildlib/pandoc-prebuilt/6465ebc6a2cda4fc52fa1badb65b07e7effaca6b new file mode 100644 index 0000000..07353f7 --- /dev/null +++ b/buildlib/pandoc-prebuilt/6465ebc6a2cda4fc52fa1badb65b07e7effaca6b @@ -0,0 +1,78 @@
+.\" Automatically generated by Pandoc 1.19.2.4
+.\"
+.TH "mlx5dv_create_mkey / mlx5dv_destroy_mkey" "3" "" "" ""
+.hy
+.SH NAME
+.PP
+mlx5dv_create_mkey \- Creates an indirect mkey
+.PP
+mlx5dv_destroy_mkey \- Destroys an indirect mkey
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include\ <infiniband/mlx5dv.h>
+
+struct\ mlx5dv_mkey_init_attr\ {
+\ \ \ \ struct\ ibv_pd\ \ \ *pd;
+\ \ \ \ uint32_t\ \ \ \ create_flags;
+\ \ \ \ uint16_t\ \ \ \ max_entries;
+};
+
+struct\ mlx5dv_mkey\ {
+\ \ \ \ uint32_t\ \ \ \ lkey;
+\ \ \ \ uint32_t\ \ \ \ rkey;
+};
+
+struct\ mlx5dv_mkey\ *
+mlx5dv_create_mkey(struct\ mlx5dv_mkey_init_attr\ *mkey_init_attr);
+
+int\ mlx5dv_destroy_mkey(struct\ mlx5dv_mkey\ *mkey);
+\f[]
+.fi
+.SH DESCRIPTION
+.PP
+Create / destroy an indirect mkey.
+.PP
+Create an indirect mkey to enable an application to use device\-specific
+functionality.
+.SH ARGUMENTS
+.SS mkey_init_attr
+.TP
+.B \f[I]pd\f[]
+ibv protection domain.
+.RS
+.RE
+.TP
+.B \f[I]create_flags\f[]
+MLX5DV_MKEY_INIT_ATTR_FLAGS_INDIRECT: Indirect mkey is being created.
+.RS
+.RE
+.TP
+.B \f[I]max_entries\f[]
+Requested maximum number of entries pointed to by this indirect mkey.
+The function will update \f[I]mkey_init_attr\->max_entries\f[] with
+the actual value for the created mkey; it will be greater than or equal
+to the value requested.
+.RS
+.RE
+.SH RETURN VALUE
+.PP
+Upon success \f[I]mlx5dv_create_mkey\f[] will return a new \f[I]struct
+mlx5dv_mkey\f[]; on error NULL will be returned and errno will be set.
+.PP
+Upon success \f[I]mlx5dv_destroy_mkey\f[] returns 0; on failure the
+value of errno is returned.
+.SH Notes
+.PP
+For this functionality to work, a DEVX context must be opened by using
+\f[I]mlx5dv_open_device\f[].
+.PP
+The created indirect mkey cannot work with the scatter\-to\-CQE feature;
+consider \f[I]mlx5dv_create_qp()\f[] with
+MLX5DV_QP_CREATE_DISABLE_SCATTER_TO_CQE for small messages.
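+.SH EXAMPLE
+.PP
+A minimal sketch, not part of the original page, showing creation and
+destruction of an indirect mkey.
+It assumes \f[I]pd\f[] belongs to a context opened with DEVX support,
+as the notes above require; the request for 8 entries is purely
+illustrative.
+.IP
+.nf
+\f[C]
+#include <errno.h>
+#include <infiniband/mlx5dv.h>
+
+static int mkey_example(struct ibv_pd *pd)
+{
+    struct mlx5dv_mkey_init_attr attr = {
+        .pd = pd,
+        .create_flags = MLX5DV_MKEY_INIT_ATTR_FLAGS_INDIRECT,
+        .max_entries = 8,       /* the driver may round this up */
+    };
+    struct mlx5dv_mkey *mkey;
+
+    mkey = mlx5dv_create_mkey(&attr);
+    if (!mkey)
+        return errno;           /* errno holds the failure reason */
+
+    /* attr.max_entries now holds the actual number of entries;
+       mkey->lkey and mkey->rkey are ready for use */
+
+    return mlx5dv_destroy_mkey(mkey);
+}
+\f[]
+.fi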
+.SH SEE ALSO
+.PP
+\f[B]mlx5dv_open_device\f[](3), \f[B]mlx5dv_create_qp\f[](3)
+.SH AUTHOR
+.PP
+Yishai Hadas <yishaih@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/6962baf519ab44a4635fd03f70c3033b30b7467e b/buildlib/pandoc-prebuilt/6962baf519ab44a4635fd03f70c3033b30b7467e new file mode 100644 index 0000000..6737fcc --- /dev/null +++ b/buildlib/pandoc-prebuilt/6962baf519ab44a4635fd03f70c3033b30b7467e @@ -0,0 +1,91 @@
+.\" Automatically generated by Pandoc 1.19.2.4
+.\"
+.TH "mlx4dv_set_context_attr" "3" "" "" ""
+.hy
+.SH NAME
+.PP
+mlx4dv_set_context_attr \- Set context attributes
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include\ <infiniband/mlx4dv.h>
+
+int\ mlx4dv_set_context_attr(struct\ ibv_context\ *context,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ enum\ mlx4dv_set_ctx_attr_type\ attr_type,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *attr);
+\f[]
+.fi
+.SH DESCRIPTION
+.PP
+mlx4dv_set_context_attr gives the ability to set vendor specific
+attributes on the RDMA context.
+.SH ARGUMENTS
+.TP
+.B \f[I]context\f[]
+RDMA device context to work on.
+.RS
+.RE
+.TP
+.B \f[I]attr_type\f[]
+The type of the provided attribute.
+.RS
+.RE
+.TP
+.B \f[I]attr\f[]
+Pointer to the attribute to be set.
+.RS
+.RE
+.SS attr_type
+.IP
+.nf
+\f[C]
+enum\ mlx4dv_set_ctx_attr_type\ {
+\ \ \ \ /*\ Attribute\ type\ uint8_t\ */
+\ \ \ \ MLX4DV_SET_CTX_ATTR_LOG_WQS_RANGE_SZ\ \ \ \ =\ 0,
+\ \ \ \ MLX4DV_SET_CTX_ATTR_BUF_ALLOCATORS\ \ =\ 1,
+};
+\f[]
+.fi
+.TP
+.B \f[I]MLX4DV_SET_CTX_ATTR_LOG_WQS_RANGE_SZ\f[]
+Change the LOG WQs Range size for RSS
+.RS
+.RE
+.TP
+.B \f[I]MLX4DV_SET_CTX_ATTR_BUF_ALLOCATORS\f[]
+Provide an external buffer allocator
+.RS
+.RE
+.IP
+.nf
+\f[C]
+struct\ mlx4dv_ctx_allocators\ {
+\ \ \ \ void\ *(*alloc)(size_t\ size,\ void\ *priv_data);
+\ \ \ \ void\ (*free)(void\ *ptr,\ void\ *priv_data);
+\ \ \ \ void\ *data;
+};
+\f[]
+.fi
+.TP
+.B \f[I]alloc\f[]
+Function used for buffer allocation instead of libmlx4 internal method
+.RS
+.RE
+.TP
+.B \f[I]free\f[]
+Function used to free buffers allocated by alloc function
+.RS
+.RE
+.TP
+.B \f[I]data\f[]
+Metadata that can be used by alloc and free functions
+.RS
+.RE
+.SH RETURN VALUE
+.PP
+Returns 0 on success, or the value of errno on failure (which indicates
+the failure reason).
+.SH AUTHOR
+.PP
+Majd Dibbiny <majd@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/6a82b0bc695f8fd980a86aefaf5890804a010761 b/buildlib/pandoc-prebuilt/6a82b0bc695f8fd980a86aefaf5890804a010761 new file mode 100644 index 0000000..9868ffe --- /dev/null +++ b/buildlib/pandoc-prebuilt/6a82b0bc695f8fd980a86aefaf5890804a010761 @@ -0,0 +1,47 @@
+.\" Automatically generated by Pandoc 1.19.2.4
+.\"
+.TH "ibv_alloc_null_mr" "3" "2018\-6\-1" "libibverbs" "Libibverbs Programmer\[aq]s Manual"
+.hy
+.SH NAME
+.PP
+ibv_alloc_null_mr \- allocate a null memory region (MR)
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include\ <infiniband/verbs.h>
+
+struct\ ibv_mr\ *ibv_alloc_null_mr(struct\ ibv_pd\ *pd);
+\f[]
+.fi
+.SH DESCRIPTION
+.PP
+\f[B]ibv_alloc_null_mr()\f[] allocates a null memory region (MR) that is
+associated with the protection domain \f[I]pd\f[].
+.PP
+A null MR discards all data written to it, and always returns 0 on read.
+It has the maximum length; only the lkey is valid, and the MR is not
+exposed as an rkey.
+.PP
+A device should implement the null MR in a way that bypasses PCI
+transfers, internally discarding or sourcing 0 data.
+This provides a way to avoid PCI bus transfers by using a scatter/gather
+list in commands if applications do not intend to access the data, or
+need data to be 0 filled.
+.PP
+Specifically, upon \f[B]ibv_post_send()\f[] the device skips PCI read
+cycles, and upon \f[B]ibv_post_recv()\f[] it skips PCI write cycles,
+which improves performance.
+.PP
+\f[B]ibv_dereg_mr()\f[] deregisters the MR.
+The use of ibv_rereg_mr() or ibv_bind_mw() with this MR is invalid.
+.SH RETURN VALUE
+.PP
+\f[B]ibv_alloc_null_mr()\f[] returns a pointer to the allocated MR, or
+NULL if the request fails.
+.SH SEE ALSO
+.PP
+\f[B]ibv_reg_mr\f[](3), \f[B]ibv_dereg_mr\f[](3)
+.SH AUTHOR
+.PP
+Yonatan Cohen <yonatanc@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/6ae20730dd330526b00e52b24456be5bc4e07a9a b/buildlib/pandoc-prebuilt/6ae20730dd330526b00e52b24456be5bc4e07a9a new file mode 100644 index 0000000..9f960a9 --- /dev/null +++ b/buildlib/pandoc-prebuilt/6ae20730dd330526b00e52b24456be5bc4e07a9a @@ -0,0 +1,73 @@
+.\" Automatically generated by Pandoc 1.19.2.4
+.\"
+.TH "mlx5dv_alloc_dm" "3" "2018\-9\-1" "mlx5" "mlx5 Programmer\[aq]s Manual"
+.hy
+.SH NAME
+.PP
+mlx5dv_alloc_dm \- allocates device memory (DM)
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include\ <infiniband/mlx5dv.h>
+
+struct\ ibv_dm\ *mlx5dv_alloc_dm(struct\ ibv_context\ *context,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ ibv_alloc_dm_attr\ *dm_attr,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ mlx5dv_alloc_dm_attr\ *mlx5_dm_attr)
+\f[]
+.fi
+.SH DESCRIPTION
+.PP
+\f[B]mlx5dv_alloc_dm()\f[] allocates device memory (DM) with specific
+driver properties.
+.SH ARGUMENTS
+.PP
+Please see \f[I]ibv_alloc_dm(3)\f[] man page for \f[I]context\f[] and
+\f[I]dm_attr\f[].
+.SS mlx5_dm_attr
+.IP
+.nf
+\f[C]
+struct\ mlx5dv_alloc_dm_attr\ {
+\ \ \ \ enum\ mlx5dv_alloc_dm_type\ type;
+\ \ \ \ uint64_t\ comp_mask;
+};
+\f[]
+.fi
+.TP
+.B \f[I]type\f[]
+The device memory type the user wishes to allocate:
+.RS
+.PP
+MLX5DV_DM_TYPE_MEMIC Device memory of type MEMIC \- On\-Chip memory that
+can be allocated and used as a memory region for transmitting/receiving
+packets directly from/to the memory on the chip.
+.PP
+MLX5DV_DM_TYPE_STEERING_SW_ICM Device memory of type STEERING SW ICM \-
+This memory is used by the device to store the packet steering tables
+and rules.
+Can be used for direct table and steering rules creation when allocated
+by a privileged user.
+.PP
+MLX5DV_DM_TYPE_HEADER_MODIFY_SW_ICM Device memory of type HEADER MODIFY
+SW ICM \- This memory is used by the device to store the packet header
+modification tables and rules.
+Can be used for direct table and header modification rules creation when
+allocated by a privileged user.
+.RE
+.TP
+.B \f[I]comp_mask\f[]
+Bitmask specifying what fields in the structure are valid: Currently
+reserved and should be set to 0.
+.RS
+.RE
+.SH RETURN VALUE
+.PP
+\f[B]mlx5dv_alloc_dm()\f[] returns a pointer to the created DM; on error
+NULL will be returned and errno will be set.
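+.SH EXAMPLE
+.PP
+A minimal MEMIC allocation sketch, not part of the original page; the
+64\-byte length is an arbitrary illustration and must fit within the
+device memory the hardware actually exposes.
+.IP
+.nf
+\f[C]
+#include <infiniband/mlx5dv.h>
+
+static struct ibv_dm *alloc_memic(struct ibv_context *ctx)
+{
+    struct ibv_alloc_dm_attr dm_attr = {
+        .length = 64,           /* illustrative size */
+        .log_align_req = 0,
+        .comp_mask = 0,
+    };
+    struct mlx5dv_alloc_dm_attr mlx5_attr = {
+        .type = MLX5DV_DM_TYPE_MEMIC,
+        .comp_mask = 0,
+    };
+
+    /* on failure NULL is returned and errno is set */
+    return mlx5dv_alloc_dm(ctx, &dm_attr, &mlx5_attr);
+}
+\f[]
+.fi
+.PP
+The returned \f[I]struct ibv_dm\f[] is released with
+\f[I]ibv_free_dm(3)\f[], as for any device memory allocation.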
+.SH SEE ALSO
+.PP
+\f[B]ibv_alloc_dm\f[](3)
+.SH AUTHOR
+.PP
+Ariel Levkovich <lariel@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/6de8298d2452a2503f893112a0955baf560008c1 b/buildlib/pandoc-prebuilt/6de8298d2452a2503f893112a0955baf560008c1 new file mode 100644 index 0000000..a09b93e --- /dev/null +++ b/buildlib/pandoc-prebuilt/6de8298d2452a2503f893112a0955baf560008c1 @@ -0,0 +1,46 @@
+.\" Automatically generated by Pandoc 1.19.2.4
+.\"
+.TH "IBV_RATE_TO_MULT" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[aq]s Manual"
+.hy
+.SH NAME
+.PP
+ibv_rate_to_mult \- convert IB rate enumeration to multiplier of 2.5
+Gbit/sec
+.PP
+mult_to_ibv_rate \- convert multiplier of 2.5 Gbit/sec to an IB rate
+enumeration
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include\ <infiniband/verbs.h>
+
+int\ ibv_rate_to_mult(enum\ ibv_rate\ rate);
+
+enum\ ibv_rate\ mult_to_ibv_rate(int\ mult);
+\f[]
+.fi
+.SH DESCRIPTION
+.PP
+\f[B]ibv_rate_to_mult()\f[] converts the IB transmission rate
+enumeration \f[I]rate\f[] to a multiple of 2.5 Gbit/sec (the base rate).
+For example, if \f[I]rate\f[] is \f[B]IBV_RATE_5_GBPS\f[], the value 2
+will be returned (5 Gbit/sec = 2 * 2.5 Gbit/sec).
+.PP
+\f[B]mult_to_ibv_rate()\f[] converts the multiplier value (of 2.5
+Gbit/sec) \f[I]mult\f[] to an IB transmission rate enumeration.
+For example, if \f[I]mult\f[] is 2, the rate enumeration
+\f[B]IBV_RATE_5_GBPS\f[] will be returned.
+.SH RETURN VALUE
+.PP
+\f[B]ibv_rate_to_mult()\f[] returns the multiplier of the base rate 2.5
+Gbit/sec.
+.PP
+\f[B]mult_to_ibv_rate()\f[] returns the enumeration representing the IB
+transmission rate.
+.SH SEE ALSO
+.PP
+\f[B]ibv_query_port\f[](3)
+.SH AUTHOR
+.PP
+Dotan Barak <dotanba@gmail.com> diff --git a/buildlib/pandoc-prebuilt/6f156ab04b00e1def09a7f620cbf79717fa1d367 b/buildlib/pandoc-prebuilt/6f156ab04b00e1def09a7f620cbf79717fa1d367 new file mode 100644 index 0000000..b80b4d1 --- /dev/null +++ b/buildlib/pandoc-prebuilt/6f156ab04b00e1def09a7f620cbf79717fa1d367 @@ -0,0 +1,287 @@
+.\" Man page generated from reStructuredText.
+.
+.TH PERFQUERY 8 "2017-08-21" "" "Open IB Diagnostics"
+.SH NAME
+perfquery \- query InfiniBand port counters on a single port
+.
+.nr rst2man-indent-level 0
+.
+.de1 rstReportMargin
+\\$1 \\n[an-margin]
+level \\n[rst2man-indent-level]
+level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
+-
+\\n[rst2man-indent0]
+\\n[rst2man-indent1]
+\\n[rst2man-indent2]
+..
+.de1 INDENT
+.\" .rstReportMargin pre:
+. RS \\$1
+. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
+. nr rst2man-indent-level +1
+.\" .rstReportMargin post:
+..
+.de UNINDENT
+. RE
+.\" indent \\n[an-margin]
+.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.nr rst2man-indent-level -1
+.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
+..
+.SH SYNOPSIS
+.sp
+perfquery [options] [<lid|guid> [[port(s)] [reset_mask]]]
+.SH DESCRIPTION
+.sp
+perfquery uses PerfMgt GMPs to obtain the PortCounters (basic performance and
+error counters), PortExtendedCounters, PortXmitDataSL, PortRcvDataSL,
+PortRcvErrorDetails, PortXmitDiscardDetails, PortExtendedSpeedsCounters, or
+PortSamplesControl from the PMA at the node/port specified. Optionally shows
+aggregated counters for all ports of a node. Finally, it can reset counters
+after reading them, or just reset them.
+.sp
+Note: In PortCounters, PortCountersExtended, PortXmitDataSL, and PortRcvDataSL,
+components that represent Data (e.g.
PortXmitData and PortRcvData) indicate
+octets divided by 4 rather than just octets.
+.sp
+Note: Inputting a port of 255 indicates that the operation is to be performed
+on all ports.
+.sp
+Note: For PortCounters, ExtendedCounters, and resets, multiple ports can be
+specified by either a comma separated list or a port range. See examples below.
+.SH OPTIONS
+.INDENT 0.0
+.TP
+.B \fB\-x, \-\-extended\fP
+show extended port counters rather than (basic) port counters.
+Note that the extended port counters attribute is optional.
+.TP
+.B \fB\-X, \-\-xmtsl\fP
+show transmit data SL counter. This is an optional counter for QoS.
+.TP
+.B \fB\-S, \-\-rcvsl\fP
+show receive data SL counter. This is an optional counter for QoS.
+.TP
+.B \fB\-D, \-\-xmtdisc\fP
+show transmit discard details. This is an optional counter.
+.TP
+.B \fB\-E, \-\-rcverr\fP
+show receive error details. This is an optional counter.
+.TP
+.B \fB\-T, \-\-extended_speeds\fP
+show extended speeds port counters. This is an optional counter.
+.TP
+.B \fB\-\-oprcvcounters\fP
+show Rcv Counters per Op code. This is an optional counter.
+.TP
+.B \fB\-\-flowctlcounters\fP
+show flow control counters. This is an optional counter.
+.TP
+.B \fB\-\-vloppackets\fP
+show packets received per Op code per VL. This is an optional counter.
+.TP
+.B \fB\-\-vlopdata\fP
+show data received per Op code per VL. This is an optional counter.
+.TP
+.B \fB\-\-vlxmitflowctlerrors\fP
+show flow control update errors per VL. This is an optional counter.
+.TP
+.B \fB\-\-vlxmitcounters\fP
+show ticks waiting to transmit counters per VL. This is an optional counter.
+.TP
+.B \fB\-\-swportvlcong\fP
+show sw port VL congestion. This is an optional counter.
+.TP
+.B \fB\-\-rcvcc\fP
+show Rcv congestion control counters. This is an optional counter.
+.TP
+.B \fB\-\-slrcvfecn\fP
+show SL Rcv FECN counters. This is an optional counter.
+.TP
+.B \fB\-\-slrcvbecn\fP
+show SL Rcv BECN counters. This is an optional counter.
+.TP
+.B \fB\-\-xmitcc\fP
+show Xmit congestion control counters. This is an optional counter.
+.TP
+.B \fB\-\-vlxmittimecc\fP
+show VL Xmit Time congestion control counters. This is an optional counter.
+.TP
+.B \fB\-c, \-\-smplctl\fP
+show port samples control.
+.TP
+.B \fB\-a, \-\-all_ports\fP
+show aggregated counters for all ports of the destination lid, reset
+all counters for all ports, or if multiple ports are specified, aggregate
+the counters of the specified ports. If the destination lid does not support
+the AllPortSelect flag, all ports will be iterated through to emulate
+AllPortSelect behavior.
+.TP
+.B \fB\-l, \-\-loop_ports\fP
+If all ports are selected by the user (either through the \fB\-a\fP option
+or port 255) or multiple ports are specified, iterate through each port rather
+than performing an aggregate operation.
+.TP
+.B \fB\-r, \-\-reset_after_read\fP
+reset counters after read
+.TP
+.B \fB\-R, \-\-Reset_only\fP
+only reset counters
+.UNINDENT
+.SS Addressing Flags
+.\" Define the common option -G
+.
+.sp
+\fB\-G, \-\-Guid\fP The address specified is a Port GUID
+.\" Define the common option -L
+.
+.sp
+\fB\-L, \-\-Lid\fP The address specified is a LID
+.\" Define the common option -s
+.
+.sp
+\fB\-s, \-\-sm_port <smlid>\fP use \(aqsmlid\(aq as the target lid for SA queries.
+.SS Port Selection flags
+.\" Define the common option -C
+.
+.sp
+\fB\-C, \-\-Ca <ca_name>\fP use the specified ca_name.
+.\" Define the common option -P
+.
+.sp +\fB\-P, \-\-Port <ca_port>\fP use the specified ca_port. +.\" Explanation of local port selection +. +.SS Local port Selection +.sp +Multiple port/Multiple CA support: when no IB device or port is specified +(see the "local umad parameters" below), the libibumad library +selects the port to use by the following criteria: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.IP 1. 3 +the first port that is ACTIVE. +.IP 2. 3 +if not found, the first port that is UP (physical link up). +.UNINDENT +.sp +If a port and/or CA name is specified, the libibumad library attempts +to fulfill the user request, and will fail if it is not possible. +.sp +For example: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +ibaddr # use the first port (criteria #1 above) +ibaddr \-C mthca1 # pick the best port from "mthca1" only. +ibaddr \-P 2 # use the second (active/up) port from the first available IB device. +ibaddr \-C mthca0 \-P 2 # use the specified port only. +.ft P +.fi +.UNINDENT +.UNINDENT +.UNINDENT +.UNINDENT +.SS Debugging flags +.\" Define the common option -d +. +.INDENT 0.0 +.TP +.B \-d +raise the IB debugging level. +May be used several times (\-ddd or \-d \-d \-d). +.UNINDENT +.\" Define the common option -e +. +.INDENT 0.0 +.TP +.B \-e +show send and receive errors (timeouts and others) +.UNINDENT +.\" Define the common option -h +. +.sp +\fB\-h, \-\-help\fP show the usage message +.\" Define the common option -v +. +.INDENT 0.0 +.TP +.B \fB\-v, \-\-verbose\fP +increase the application verbosity level. +May be used several times (\-vv or \-v \-v \-v) +.UNINDENT +.\" Define the common option -V +. +.sp +\fB\-V, \-\-version\fP show the version info. +.SS Configuration flags +.\" Define the common option -t +. +.sp +\fB\-t, \-\-timeout <timeout_ms>\fP override the default timeout for the solicited mads. +.\" Define the common option -y +. +.INDENT 0.0 +.TP +.B \fB\-y, \-\-m_key <key>\fP +use the specified M_key for requests. If non\-numeric value (like \(aqx\(aq) +is specified then a value will be prompted for. +.UNINDENT +.\" Define the common option -z +. +.sp +\fB\-\-config, \-z <config_file>\fP Specify alternate config file. +.INDENT 0.0 +.INDENT 3.5 +Default: /usr/local/etc/infiniband\-diags/ibdiag.conf +.UNINDENT +.UNINDENT +.SH FILES +.\" Common text for the config file +. +.SS CONFIG FILE +.sp +/usr/local/etc/infiniband\-diags/ibdiag.conf +.sp +A global config file is provided to set some of the common options for all +tools. See supplied config file for details. 
+.SH EXAMPLES +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +perfquery # read local port performance counters +perfquery 32 1 # read performance counters from lid 32, port 1 +perfquery \-x 32 1 # read extended performance counters from lid 32, port 1 +perfquery \-a 32 # read perf counters from lid 32, all ports +perfquery \-r 32 1 # read performance counters and reset +perfquery \-x \-r 32 1 # read extended performance counters and reset +perfquery \-R 0x20 1 # reset performance counters of port 1 only +perfquery \-x \-R 0x20 1 # reset extended performance counters of port 1 only +perfquery \-R \-a 32 # reset performance counters of all ports +perfquery \-R 32 2 0x0fff # reset only error counters of port 2 +perfquery \-R 32 2 0xf000 # reset only non\-error counters of port 2 +perfquery \-a 32 1\-10 # read performance counters from lid 32, port 1\-10, aggregate output +perfquery \-l 32 1\-10 # read performance counters from lid 32, port 1\-10, output each port +perfquery \-a 32 1,4,8 # read performance counters from lid 32, port 1, 4, and 8, aggregate output +perfquery \-l 32 1,4,8 # read performance counters from lid 32, port 1, 4, and 8, output each port +.ft P +.fi +.UNINDENT +.UNINDENT +.SH AUTHOR +.INDENT 0.0 +.TP +.B Hal Rosenstock +< \fI\%hal.rosenstock@gmail.com\fP > +.UNINDENT +.\" Generated by docutils manpage writer. +. diff --git a/buildlib/pandoc-prebuilt/71b9f30576194f743a340f6eaef13c674b4019d5 b/buildlib/pandoc-prebuilt/71b9f30576194f743a340f6eaef13c674b4019d5 new file mode 100644 index 0000000..6788c04 --- /dev/null +++ b/buildlib/pandoc-prebuilt/71b9f30576194f743a340f6eaef13c674b4019d5 @@ -0,0 +1,34 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "IBV_QUERY_PKEY" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[aq]s Manual" +.hy +.SH NAME +.PP +ibv_query_pkey \- query an InfiniBand port\[aq]s P_Key table +.SH SYNOPSIS +.IP +.nf +\f[C] +#include\ <infiniband/verbs.h> + +int\ ibv_query_pkey(struct\ ibv_context\ *context, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint8_t\ port_num, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ int\ index, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint16_t\ *pkey); +\f[] +.fi +.SH DESCRIPTION +.PP +\f[B]ibv_query_pkey()\f[] returns the P_Key value (in network byte +order) in entry \f[I]index\f[] of port \f[I]port_num\f[] for device +context \f[I]context\f[] through the pointer \f[I]pkey\f[]. +.SH RETURN VALUE +.PP +\f[B]ibv_query_pkey()\f[] returns 0 on success, and \-1 on error. 
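+.SH EXAMPLE
+.PP
+A short sketch, not part of the original page, that walks the whole
+P_Key table of one port; the table length is taken from
+\f[B]ibv_query_port\f[](3).
+.IP
+.nf
+\f[C]
+#include <infiniband/verbs.h>
+
+static int walk_pkey_table(struct ibv_context *ctx, uint8_t port_num)
+{
+    struct ibv_port_attr port_attr;
+    uint16_t pkey;
+    int i;
+
+    if (ibv_query_port(ctx, port_num, &port_attr))
+        return -1;
+
+    for (i = 0; i < port_attr.pkey_tbl_len; i++) {
+        if (ibv_query_pkey(ctx, port_num, i, &pkey))
+            return -1;
+        /* pkey is in network byte order; convert with ntohs() if
+           host byte order is needed */
+    }
+    return 0;
+}
+\f[]
+.fi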
+.SH SEE ALSO +.PP +\f[B]ibv_open_device\f[](3), \f[B]ibv_query_device\f[](3), +\f[B]ibv_query_gid\f[](3), \f[B]ibv_query_port\f[](3) +.SH AUTHOR +.PP +Dotan Barak <dotanba@gmail.com> diff --git a/buildlib/pandoc-prebuilt/771e81c03946e49b29d803afc6498a1c2c346ce8 b/buildlib/pandoc-prebuilt/771e81c03946e49b29d803afc6498a1c2c346ce8 new file mode 100644 index 0000000..196e922 --- /dev/null +++ b/buildlib/pandoc-prebuilt/771e81c03946e49b29d803afc6498a1c2c346ce8 @@ -0,0 +1,52 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "MLX5DV_DUMP API" "3" "2019\-11\-18" "mlx5" "mlx5 Programmer\[aq]s Manual" +.hy +.SH NAME +.PP +mlx5dv_dump_dr_domain \- Dump DR Domain +.PP +mlx5dv_dump_dr_table \- Dump DR Table +.PP +mlx5dv_dump_dr_matcher \- Dump DR Matcher +.PP +mlx5dv_dump_dr_rule \- Dump DR Rule +.SH SYNOPSIS +.IP +.nf +\f[C] +#include\ <infiniband/mlx5dv.h> + +int\ mlx5dv_dump_dr_domain(FILE\ *fout,\ struct\ mlx5dv_dr_domain\ *domain); +int\ mlx5dv_dump_dr_table(FILE\ *fout,\ struct\ mlx5dv_dr_table\ *table); +int\ mlx5dv_dump_dr_matcher(FILE\ *fout,\ struct\ mlx5dv_dr_matcher\ *matcher); +int\ mlx5dv_dump_dr_rule(FILE\ *fout,\ struct\ mlx5dv_dr_rule\ *rule); +\f[] +.fi +.SH DESCRIPTION +.PP +The Dump API (mlx5dv_dump_*) allows the dumping of the existing +rdma\-core resources to the provided file. +The output file format is vendor specific. +.PP +\f[I]mlx5dv_dump_dr_domain()\f[] dumps a DR Domain object properties to +a specified file. +.PP +\f[I]mlx5dv_dump_dr_table()\f[] dumps a DR Table object properties to a +specified file. +.PP +\f[I]mlx5dv_dump_dr_matcher()\f[] dumps a DR Matcher object properties +to a specified file. +.PP +\f[I]mlx5dv_dump_dr_rule()\f[] dumps a DR Rule object properties to a +specified file. +.SH RETURN VALUE +.PP +The API calls returns 0 on success, or the value of errno on failure +(which indicates the failure reason). +The calls are blocking \- function returns only when all related +resources info is written to the file. +.SH AUTHOR +.PP +Yevgeny Kliteynik <kliteyn@mellanox.com> Muhammad Sammar +<muhammads@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/77296d8207743f01a23a4bd7b5c59c7b84c454e7 b/buildlib/pandoc-prebuilt/77296d8207743f01a23a4bd7b5c59c7b84c454e7 new file mode 100644 index 0000000..9076a2d --- /dev/null +++ b/buildlib/pandoc-prebuilt/77296d8207743f01a23a4bd7b5c59c7b84c454e7 @@ -0,0 +1,87 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "mlx5dv_devx_alloc_uar / mlx5dv_devx_free_uar" "3" "" "" "" +.hy +.SH NAME +.PP +mlx5dv_devx_alloc_uar \- Allocates a DEVX UAR +.PP +mlx5dv_devx_free_uar \- Frees a DEVX UAR +.SH SYNOPSIS +.IP +.nf +\f[C] +#include\ <infiniband/mlx5dv.h> + +struct\ mlx5dv_devx_uar\ *mlx5dv_devx_alloc_uar(struct\ ibv_context\ *context, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint32_t\ flags); + +void\ mlx5dv_devx_free_uar(struct\ mlx5dv_devx_uar\ *devx_uar); +\f[] +.fi +.SH DESCRIPTION +.PP +Create / free a DEVX UAR which is needed for other device commands over +the DEVX interface. +.PP +The DEVX API enables direct access from the user space area to the mlx5 +device driver, the UAR information is needed for few commands as of QP +creation. +.SH ARGUMENTS +.TP +.B \f[I]context\f[] +RDMA device context to work on. +.RS +.RE +.TP +.B \f[I]flags\f[] +Allocation flags for the UAR. +MLX5DV_UAR_ALLOC_TYPE_BF: Allocate UAR with Blueflame properties. +MLX5DV_UAR_ALLOC_TYPE_NC: Allocate UAR with non\-cache properties. 
+.RS +.RE +.SS devx_uar +.IP +.nf +\f[C] +struct\ mlx5dv_devx_uar\ { +\ \ \ \ void\ *reg_addr; +\ \ \ \ void\ *base_addr; +\ \ \ \ uint32_t\ page_id; +\ \ \ \ off_t\ mmap_off; +\ \ \ \ uint64_t\ comp_mask; +}; +\f[] +.fi +.TP +.B \f[I]reg_addr\f[] +The write address of DB/BF. +.RS +.RE +.TP +.B \f[I]base_addr\f[] +The base address of the UAR. +.RS +.RE +.TP +.B \f[I]page_id\f[] +The device page id to be used. +.RS +.RE +.TP +.B \f[I]mmap_off\f[] +The mmap offset parameter to be used for re\-mapping, to be used by a +secondary process. +.RS +.RE +.SH RETURN VALUE +.PP +Upon success \f[I]mlx5dv_devx_alloc_uar\f[] will return a new +\f[I]struct mlx5dv_devx_uar\f[], on error NULL will be returned and +errno will be set. +.SH SEE ALSO +.PP +\f[B]mlx5dv_open_device\f[], \f[B]mlx5dv_devx_obj_create\f[] +.SH AUTHOR +.PP +Yishai Hadas <yishaih@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/77e091fce9252614b7c6136f15917606746eac44 b/buildlib/pandoc-prebuilt/77e091fce9252614b7c6136f15917606746eac44 new file mode 100644 index 0000000..7b8a484 --- /dev/null +++ b/buildlib/pandoc-prebuilt/77e091fce9252614b7c6136f15917606746eac44 @@ -0,0 +1,51 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "mlx5dv_devx_query_eqn" "3" "" "" "" +.hy +.SH NAME +.PP +mlx5dv_devx_query_eqn \- Query EQN for a given vector id. +.SH SYNOPSIS +.IP +.nf +\f[C] +#include\ <infiniband/mlx5dv.h> + +int\ mlx5dv_devx_query_eqn(struct\ ibv_context\ *context,\ uint32_t\ vector, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint32_t\ *eqn); +\f[] +.fi +.SH DESCRIPTION +.PP +Query EQN for a given input vector, the EQN is needed for other device +commands over the DEVX interface. +.PP +The DEVX API enables direct access from the user space area to the mlx5 +device driver, the EQN information is needed for few commands such as CQ +creation. +.SH ARGUMENTS +.TP +.B \f[I]context\f[] +RDMA device context to work on. +.RS +.RE +.TP +.B \f[I]vector\f[] +Completion vector number. +.RS +.RE +.TP +.B \f[I]eqn\f[] +The device EQ number which relates to the given input vector. +.RS +.RE +.SH RETURN VALUE +.PP +returns 0 on success, or the value of errno on failure (which indicates +the failure reason). +.SH SEE ALSO +.PP +\f[B]mlx5dv_open_device\f[], \f[B]mlx5dv_devx_obj_create\f[] +.SH AUTHOR +.PP +Yishai Hadas <yishaih@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/7fbac8884b21e9bc3bbb20607d56da0b48f2a156 b/buildlib/pandoc-prebuilt/7fbac8884b21e9bc3bbb20607d56da0b48f2a156 new file mode 100644 index 0000000..a0a0d06 --- /dev/null +++ b/buildlib/pandoc-prebuilt/7fbac8884b21e9bc3bbb20607d56da0b48f2a156 @@ -0,0 +1,184 @@ +.\" Man page generated from reStructuredText. +. +.TH IBSWITCHES 8 "2016-12-20" "" "OpenIB Diagnostics" +.SH NAME +IBSWITCHES \- show InfiniBand switch nodes in topology +. +.nr rst2man-indent-level 0 +. +.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. 
+.SH SYNOPSIS +.sp +ibswitches [options] [<topology\-file>] +.SH DESCRIPTION +.sp +ibswitches is a script which either walks the IB subnet topology or uses an +already saved topology file and extracts the switch nodes. +.SH OPTIONS +.\" Define the common option -C +. +.sp +\fB\-C, \-\-Ca <ca_name>\fP use the specified ca_name. +.\" Define the common option -P +. +.sp +\fB\-P, \-\-Port <ca_port>\fP use the specified ca_port. +.\" Define the common option -t +. +.sp +\fB\-t, \-\-timeout <timeout_ms>\fP override the default timeout for the solicited mads. +.\" Define the common option -y +. +.INDENT 0.0 +.TP +.B \fB\-y, \-\-m_key <key>\fP +use the specified M_key for requests. If non\-numeric value (like \(aqx\(aq) +is specified then a value will be prompted for. +.UNINDENT +.\" Define the common option -h +. +.sp +\fB\-h, \-\-help\fP show the usage message +.\" Define the common option -z +. +.sp +\fB\-\-config, \-z <config_file>\fP Specify alternate config file. +.INDENT 0.0 +.INDENT 3.5 +Default: /usr/local/etc/infiniband\-diags/ibdiag.conf +.UNINDENT +.UNINDENT +.\" Explanation of local port selection +. +.SS Local port Selection +.sp +Multiple port/Multiple CA support: when no IB device or port is specified +(see the "local umad parameters" below), the libibumad library +selects the port to use by the following criteria: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.IP 1. 3 +the first port that is ACTIVE. +.IP 2. 3 +if not found, the first port that is UP (physical link up). +.UNINDENT +.sp +If a port and/or CA name is specified, the libibumad library attempts +to fulfill the user request, and will fail if it is not possible. +.sp +For example: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +ibaddr # use the first port (criteria #1 above) +ibaddr \-C mthca1 # pick the best port from "mthca1" only. +ibaddr \-P 2 # use the second (active/up) port from the first available IB device. +ibaddr \-C mthca0 \-P 2 # use the specified port only. +.ft P +.fi +.UNINDENT +.UNINDENT +.UNINDENT +.UNINDENT +.SH FILES +.\" Common text for the config file +. +.SS CONFIG FILE +.sp +/usr/local/etc/infiniband\-diags/ibdiag.conf +.sp +A global config file is provided to set some of the common options for all +tools. See supplied config file for details. +.\" Common text to describe the node name map file. +. +.SS NODE NAME MAP FILE FORMAT +.sp +The node name map is used to specify user friendly names for nodes in the +output. GUIDs are used to perform the lookup. +.sp +This functionality is provided by the opensm\-libs package. See \fBopensm(8)\fP +for the file location for your installation. 
+.sp
+\fBGenerically:\fP
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+# comment
+<guid> "<name>"
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+\fBExample:\fP
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+# IB1
+# Line cards
+0x0008f104003f125c "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB\-24D"
+0x0008f104003f125d "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB\-24D"
+0x0008f104003f10d2 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB\-24D"
+0x0008f104003f10d3 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB\-24D"
+0x0008f104003f10bf "IB1 (Rack 11 slot 12 ) ISR9288/ISR9096 Voltaire sLB\-24D"
+
+# Spines
+0x0008f10400400e2d "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D"
+0x0008f10400400e2e "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D"
+0x0008f10400400e2f "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D"
+0x0008f10400400e31 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB\-12D"
+0x0008f10400400e32 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB\-12D"
+
+# GUID Node Name
+0x0008f10400411a08 "SW1 (Rack 3) ISR9024 Voltaire 9024D"
+0x0008f10400411a28 "SW2 (Rack 3) ISR9024 Voltaire 9024D"
+0x0008f10400411a34 "SW3 (Rack 3) ISR9024 Voltaire 9024D"
+0x0008f104004119d0 "SW4 (Rack 3) ISR9024 Voltaire 9024D"
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.SH SEE ALSO
+.sp
+ibnetdiscover(8)
+.SH DEPENDENCIES
+.sp
+ibnetdiscover, ibnetdiscover format
+.SH AUTHOR
+.INDENT 0.0
+.TP
+.B Hal Rosenstock
+< \fI\%halr@voltaire.com\fP >
+.UNINDENT
+.\" Generated by docutils manpage writer.
+. diff --git a/buildlib/pandoc-prebuilt/8b642eff1f99e733d9048cc9ad091fca9296f615 b/buildlib/pandoc-prebuilt/8b642eff1f99e733d9048cc9ad091fca9296f615 new file mode 100644 index 0000000..50339be --- /dev/null +++ b/buildlib/pandoc-prebuilt/8b642eff1f99e733d9048cc9ad091fca9296f615 @@ -0,0 +1,137 @@
+.\" Automatically generated by Pandoc 1.19.2.4
+.\"
+.TH "IBV_ADVISE_MR" "3" "2018\-10\-19" "libibverbs" "Libibverbs Programmer\[aq]s Manual"
+.hy
+.SH NAME
+.PP
+ibv_advise_mr \- Gives advice or directions to the kernel about an
+address range belonging to a memory region (MR).
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include\ <infiniband/verbs.h>
+
+int\ ibv_advise_mr(struct\ ibv_pd\ *pd,
+\ \ \ \ \ \ \ \ \ \ enum\ ibv_advise_mr_advice\ advice,
+\ \ \ \ \ \ \ \ \ \ uint32_t\ flags,
+\ \ \ \ \ \ \ \ \ \ struct\ ibv_sge\ *sg_list,
+\ \ \ \ \ \ \ \ \ \ uint32_t\ num_sge)
+\f[]
+.fi
+.SH DESCRIPTION
+.PP
+\f[B]ibv_advise_mr()\f[] gives advice or directions to the kernel about
+an address range belonging to a memory region (MR).
+Applications that are aware of future access patterns can use this verb
+to leverage this knowledge and improve system or application
+performance.
+.PP
+\f[B]Conventional advice values\f[]
+.TP
+.B \f[I]IBV_ADVISE_MR_ADVICE_PREFETCH\f[]
+Pre\-fetch a range of an on\-demand paging MR.
+Make pages present with read\-only permission before the actual IO is
+conducted.
+This provides a way to reduce latency by overlapping paging\-in with
+either compute time or IO to other ranges.
+.RS
+.RE
+.TP
+.B \f[I]IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE\f[]
+Like IBV_ADVISE_MR_ADVICE_PREFETCH but with read\-access and
+write\-access permission to the fetched memory.
+.RS
+.RE
+.SH ARGUMENTS
+.TP
+.B \f[I]pd\f[]
+The protection domain (PD) associated with the MR.
+.RS
+.RE
+.TP
+.B \f[I]advice\f[]
+The requested advice value (as listed above).
+.RS
+.RE
+.TP
+.B \f[I]flags\f[]
+Describes the properties of the advise operation.
+The following flag is supported: \f[I]IBV_ADVISE_MR_FLAG_FLUSH\f[] \-
+request a synchronized operation; return to the caller only after the
+operation is completed.
+.RS
+.RE
+.TP
+.B \f[I]sg_list\f[]
+Pointer to the s/g array.
+When using the \f[B]IBV_ADVISE_MR_ADVICE_PREFETCH\f[] advice value,
+all the lkeys of all the scatter gather elements (SGEs) must be
+associated with ODP MRs (MRs that were registered with
+IBV_ACCESS_ON_DEMAND).
+.RS
+.RE
+.TP
+.B \f[I]num_sge\f[]
+Number of elements in the s/g array.
+.RS
+.RE
+.SH RETURN VALUE
+.PP
+\f[B]ibv_advise_mr()\f[] returns 0 when the call was successful, or the
+value of errno on failure (which indicates the failure reason).
+.TP
+.B \f[I]EOPNOTSUPP\f[]
+libibverbs or provider driver doesn\[aq]t support the ibv_advise_mr()
+verb (ENOSYS may sometimes be returned by old versions of libibverbs).
+.RS
+.RE
+.TP
+.B \f[I]ENOTSUP\f[]
+The advise operation isn\[aq]t supported.
+.RS
+.RE
+.TP
+.B \f[I]EFAULT\f[]
+In one of the following: o When the range requested is out of the MR
+bounds, or when parts of it are not part of the process address space.
+o One of the lkeys provided in the scatter gather list is invalid or
+lacks the required write access.
+.RS
+.RE
+.TP
+.B \f[I]EINVAL\f[]
+In one of the following: o The PD is invalid.
+o The flags are invalid.
+.RS
+.RE
+.SH NOTES
+.PP
+An application may pre\-fetch any address range within an ODP MR when
+using the \f[B]IBV_ADVISE_MR_ADVICE_PREFETCH\f[] or
+\f[B]IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE\f[] advice.
+Semantically, this operation is best\-effort.
+That means the kernel does not guarantee that the underlying pages are
+updated in the HCA or that the pre\-fetched pages will remain resident.
+.PP
+When using \f[B]IBV_ADVISE_MR_ADVICE_PREFETCH\f[] or
+\f[B]IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE\f[] advice, the operation will
+be done in the following stages: o Page in the user pages to memory
+(pages aren\[aq]t pinned).
+o Get the dma mapping of these user pages.
+o Post the underlying page translations to the HCA.
+.PP
+If \f[B]IBV_ADVISE_MR_FLAG_FLUSH\f[] is specified then the underlying
+pages are guaranteed to be updated in the HCA before returning SUCCESS.
+Otherwise the driver can choose to postpone the posting of the new
+translations to the HCA.
+When performing a local RDMA access operation it is recommended to use
+the IBV_ADVISE_MR_FLAG_FLUSH flag with one of the pre\-fetch advice
+values to increase the probability that the page translations are valid
+in the HCA and to avoid future page faults.
+.SH SEE ALSO
+.PP
+\f[B]ibv_reg_mr\f[](3), \f[B]ibv_rereg_mr\f[](3),
+\f[B]ibv_dereg_mr\f[](3)
+.SH AUTHOR
+.PP
+Aviad Yehezkel <aviadye@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/8ffcb0db55efb46a50559f39000fd7b8b82d57cc b/buildlib/pandoc-prebuilt/8ffcb0db55efb46a50559f39000fd7b8b82d57cc new file mode 100644 index 0000000..292d3a7 --- /dev/null +++ b/buildlib/pandoc-prebuilt/8ffcb0db55efb46a50559f39000fd7b8b82d57cc @@ -0,0 +1,52 @@
+.\" Automatically generated by Pandoc 1.19.2.4
+.\"
+.TH "RDMA_INIT_QP_ATTR" "3" "2018\-12\-31" "librdmacm" "Librdmacm Programmer\[aq]s Manual"
+.hy
+.SH NAME
+.PP
+rdma_init_qp_attr \- Returns qp attributes of an rdma_cm_id.
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include\ <rdma/rdma_cma.h>
+
+int\ rdma_init_qp_attr(struct\ rdma_cm_id\ *id,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ ibv_qp_attr\ *qp_attr,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ int\ *qp_attr_mask);
+\f[]
+.fi
+.SH DESCRIPTION
+.PP
+\f[B]rdma_init_qp_attr()\f[] returns qp attributes of an rdma_cm_id.
+.PP
+Information about qp attributes and qp attributes mask is returned
+through the \f[I]qp_attr\f[] and \f[I]qp_attr_mask\f[] parameters.
+.PP
+For details on the qp_attr structure, see ibv_modify_qp.
+.SH ARGUMENTS
+.TP
+.B \f[I]id\f[]
+RDMA identifier.
+.RS
+.RE
+.TP
+.B \f[I]qp_attr\f[]
+A reference to a qp attributes struct containing response information.
+.RS
+.RE
+.TP
+.B \f[I]qp_attr_mask\f[]
+A reference to a qp attributes mask containing response information.
+.RS
+.RE
+.SH RETURN VALUE
+.PP
+\f[B]rdma_init_qp_attr()\f[] returns 0 on success, or \-1 on error.
+If an error occurs, errno will be set to indicate the failure reason.
+.SH SEE ALSO
+.PP
+\f[B]rdma_cm\f[](7), \f[B]ibv_modify_qp\f[](3)
+.SH AUTHOR
+.PP
+Danit Goldberg <danitg@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/971674ea9c99ebc02210ea2412f59a09a2432784 b/buildlib/pandoc-prebuilt/971674ea9c99ebc02210ea2412f59a09a2432784 new file mode 100644 index 0000000..b08f44b --- /dev/null +++ b/buildlib/pandoc-prebuilt/971674ea9c99ebc02210ea2412f59a09a2432784 @@ -0,0 +1,79 @@
+.\" Man page generated from reStructuredText.
+.
+.TH IBIDSVERIFY 8 "2017-08-21" "" "Open IB Diagnostics"
+.SH NAME
+ibidsverify \- validate IB identifiers in subnet and report errors
+.
+.nr rst2man-indent-level 0
+.
+.de1 rstReportMargin
+\\$1 \\n[an-margin]
+level \\n[rst2man-indent-level]
+level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
+-
+\\n[rst2man-indent0]
+\\n[rst2man-indent1]
+\\n[rst2man-indent2]
+..
+.de1 INDENT
+.\" .rstReportMargin pre:
+. RS \\$1
+. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
+. nr rst2man-indent-level +1
+.\" .rstReportMargin post:
+..
+.de UNINDENT
+. RE
+.\" indent \\n[an-margin]
+.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.nr rst2man-indent-level -1
+.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
+..
+.SH SYNOPSIS
+.sp
+ibidsverify.pl [\-h] [\-R]
+.SH DESCRIPTION
+.sp
+ibidsverify.pl is a perl script which uses a full topology file that was
+created by ibnetdiscover and scans the network to validate the LIDs and
+GUIDs in the subnet. The validation consists of checking that there are no
+zero or duplicate identifiers.
+.sp
+ibidsverify.pl will also reuse the cached ibnetdiscover output from
+some of the other diag tools, which makes it a bit faster than running
+ibnetdiscover from scratch.
+.SH OPTIONS
+.sp
+\fB\-R\fP
+Recalculate the ibnetdiscover information, i.e., do not use the cached
+information. This option is slower but should be used if the diag tools have
+not been used for some time or if there are other reasons to believe the
+fabric has changed.
+.sp
+\fB\-C <ca_name>\fP use the specified ca_name.
+.sp
+\fB\-P <ca_port>\fP use the specified ca_port.
+.SH EXIT STATUS
+.sp
+Exit status is 1 if errors are found, 0 otherwise.
+.SH FILES
+.\" Common text for the config file
+.
+.SS CONFIG FILE
+.sp
+/usr/local/etc/infiniband\-diags/ibdiag.conf
+.sp
+A global config file is provided to set some of the common options for all
+tools. See supplied config file for details.
+.SH SEE ALSO
+.sp
+\fBibnetdiscover(8)\fP
+.SH AUTHOR
+.INDENT 0.0
+.TP
+.B Hal Rosenstock
+< \fI\%halr@voltaire.com\fP >
+.UNINDENT
+.\" Generated by docutils manpage writer.
+. diff --git a/buildlib/pandoc-prebuilt/9773cb8b25ced5224270d00b9cc23819649350c2 b/buildlib/pandoc-prebuilt/9773cb8b25ced5224270d00b9cc23819649350c2 new file mode 100644 index 0000000..996d0d1 --- /dev/null +++ b/buildlib/pandoc-prebuilt/9773cb8b25ced5224270d00b9cc23819649350c2 @@ -0,0 +1,192 @@
+.\" Man page generated from reStructuredText.
+.
+.TH IBCCQUERY 8 "2012-05-31" "" "OpenIB Diagnostics"
+.SH NAME
+IBCCQUERY \- query congestion control settings/info
+.
+.nr rst2man-indent-level 0
+.
+.de1 rstReportMargin
+\\$1 \\n[an-margin]
+level \\n[rst2man-indent-level]
+level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
+-
+\\n[rst2man-indent0]
+\\n[rst2man-indent1]
+\\n[rst2man-indent2]
+..
+.de1 INDENT
+.\" .rstReportMargin pre:
+. RS \\$1
+. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
+. nr rst2man-indent-level +1
+.\" .rstReportMargin post:
+..
+.de UNINDENT
+. RE
+.\" indent \\n[an-margin]
+.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.nr rst2man-indent-level -1
+.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
+..
+.SH SYNOPSIS
+.sp
+ibccquery [common_options] [\-c cckey] <op> <lid|guid> [port]
+.SH DESCRIPTION
+.sp
+ibccquery supports the querying of settings and other information related
+to congestion control.
+.SH OPTIONS
+.INDENT 0.0
+.TP
+.B Currently supported operations and their parameters:
+CongestionInfo (CI) <addr>
+CongestionKeyInfo (CK) <addr>
+CongestionLog (CL) <addr>
+SwitchCongestionSetting (SS) <addr>
+SwitchPortCongestionSetting (SP) <addr> [<portnum>]
+CACongestionSetting (CS) <addr>
+CongestionControlTable (CT) <addr>
+Timestamp (TI) <addr>
+.UNINDENT
+.sp
+\fB\-\-cckey, \-c <cckey>\fP
+Specify a congestion control (CC) key. If none is specified, a key of 0 is used.
+.SS Debugging flags
+.\" Define the common option -d
+.
+.INDENT 0.0
+.TP
+.B \-d
+raise the IB debugging level.
+May be used several times (\-ddd or \-d \-d \-d).
+.UNINDENT
+.\" Define the common option -e
+.
+.INDENT 0.0
+.TP
+.B \-e
+show send and receive errors (timeouts and others)
+.UNINDENT
+.\" Define the common option -h
+.
+.sp
+\fB\-h, \-\-help\fP show the usage message
+.\" Define the common option -v
+.
+.INDENT 0.0
+.TP
+.B \fB\-v, \-\-verbose\fP
+increase the application verbosity level.
+May be used several times (\-vv or \-v \-v \-v)
+.UNINDENT
+.\" Define the common option -V
+.
+.sp
+\fB\-V, \-\-version\fP show the version info.
+.SS Addressing Flags
+.\" Define the common option -G
+.
+.sp
+\fB\-G, \-\-Guid\fP The address specified is a Port GUID
+.\" Define the common option -L
+.
+.sp
+\fB\-L, \-\-Lid\fP The address specified is a LID
+.\" Define the common option -s
+.
+.sp
+\fB\-s, \-\-sm_port <smlid>\fP use \(aqsmlid\(aq as the target lid for SA queries.
+.SS Port Selection flags
+.\" Define the common option -C
+.
+.sp
+\fB\-C, \-\-Ca <ca_name>\fP use the specified ca_name.
+.\" Define the common option -P
+.
+.sp
+\fB\-P, \-\-Port <ca_port>\fP use the specified ca_port.
+.\" Explanation of local port selection
+.
+.SS Local port Selection
+.sp
+Multiple port/Multiple CA support: when no IB device or port is specified
+(see the "local umad parameters" below), the libibumad library
+selects the port to use by the following criteria:
+.INDENT 0.0
+.INDENT 3.5
+.INDENT 0.0
+.IP 1. 3
+the first port that is ACTIVE.
+.IP 2. 3
+if not found, the first port that is UP (physical link up).
+.UNINDENT
+.sp
+If a port and/or CA name is specified, the libibumad library attempts
+to fulfill the user request, and will fail if it is not possible.
+.sp
+For example:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ibaddr # use the first port (criteria #1 above)
+ibaddr \-C mthca1 # pick the best port from "mthca1" only.
+ibaddr \-P 2 # use the second (active/up) port from the first available IB device.
+ibaddr \-C mthca0 \-P 2 # use the specified port only.
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.UNINDENT
+.UNINDENT
+.SS Configuration flags
+.\" Define the common option -y
+.
+.INDENT 0.0
+.TP
+.B \fB\-y, \-\-m_key <key>\fP
+use the specified M_key for requests. If non\-numeric value (like \(aqx\(aq)
+is specified then a value will be prompted for.
+.UNINDENT
+.\" Define the common option -z
+.
+.sp
+\fB\-\-config, \-z <config_file>\fP Specify alternate config file.
+.INDENT 0.0
+.INDENT 3.5
+Default: /usr/local/etc/infiniband\-diags/ibdiag.conf
+.UNINDENT
+.UNINDENT
+.SH FILES
+.\" Common text for the config file
+.
+.SS CONFIG FILE
+.sp
+/usr/local/etc/infiniband\-diags/ibdiag.conf
+.sp
+A global config file is provided to set some of the common options for all
+tools. See supplied config file for details.
+.SH EXAMPLES
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ibccquery CongestionInfo 3 # Congestion Info by lid
+ibccquery SwitchPortCongestionSetting 3 # Query all Switch Port Congestion Settings
+ibccquery SwitchPortCongestionSetting 3 1 # Query Switch Port Congestion Setting for port 1
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.SH AUTHOR
+.INDENT 0.0
+.TP
+.B Albert Chu
+< \fI\%chu11@llnl.gov\fP >
+.UNINDENT
+.\" Generated by docutils manpage writer.
+. diff --git a/buildlib/pandoc-prebuilt/983dc82fa7ae24ca010e5d6e9d76e86725041150 b/buildlib/pandoc-prebuilt/983dc82fa7ae24ca010e5d6e9d76e86725041150 new file mode 100644 index 0000000..bb86789 --- /dev/null +++ b/buildlib/pandoc-prebuilt/983dc82fa7ae24ca010e5d6e9d76e86725041150 @@ -0,0 +1,47 @@
+.\" Automatically generated by Pandoc 1.19.2.4
+.\"
+.TH "IBV_REQ_NOTIFY_CQ" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[aq]s Manual"
+.hy
+.SH NAME
+.PP
+ibv_req_notify_cq \- request completion notification on a completion
+queue (CQ)
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include\ <infiniband/verbs.h>
+
+int\ ibv_req_notify_cq(struct\ ibv_cq\ *cq,\ int\ solicited_only);
+\f[]
+.fi
+.SH DESCRIPTION
+.PP
+\f[B]ibv_req_notify_cq()\f[] requests a completion notification on the
+completion queue (CQ) \f[I]cq\f[].
+.PP
+Upon the addition of a new CQ entry (CQE) to \f[I]cq\f[], a completion
+event will be added to the completion channel associated with the CQ.
+If the argument \f[I]solicited_only\f[] is zero, a completion event is
+generated for any new CQE.
+If \f[I]solicited_only\f[] is non\-zero, an event is only generated for
+a new CQE that is considered "solicited." A CQE is solicited if it
+is a receive completion for a message with the Solicited Event header
+bit set, or if the status is not successful.
+All other successful receive completions and all successful send
+completions are unsolicited.
+.SH RETURN VALUE
+.PP
+\f[B]ibv_req_notify_cq()\f[] returns 0 on success, or the value of errno
+on failure (which indicates the failure reason).
+.SH NOTES
+.PP
+The request for notification is "one shot." Only one completion event
+will be generated for each call to \f[B]ibv_req_notify_cq()\f[].
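+.SH EXAMPLE
+.PP
+A minimal sketch of the typical notification loop; it assumes \f[I]cq\f[]
+and its completion channel \f[I]channel\f[] were created earlier with
+\f[B]ibv_create_cq\f[]() and \f[B]ibv_create_comp_channel\f[](), and
+error handling is abbreviated:
+.IP
+.nf
+\f[C]
+struct ibv_cq *ev_cq;
+void *ev_ctx;
+struct ibv_wc wc;
+
+/* Arm the CQ: request an event for the next CQE (not only solicited). */
+if (ibv_req_notify_cq(cq, 0))
+    return \-1;
+
+/* Block until the completion channel reports an event, then ack it. */
+if (ibv_get_cq_event(channel, &ev_cq, &ev_ctx))
+    return \-1;
+ibv_ack_cq_events(ev_cq, 1);
+
+/* Re\-arm before polling so a CQE arriving in between is not missed. */
+if (ibv_req_notify_cq(ev_cq, 0))
+    return \-1;
+
+/* Drain all completions already in the CQ. */
+while (ibv_poll_cq(ev_cq, 1, &wc) > 0) {
+    /* ... process wc ... */
+}
+\f[]
+.fi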
+.SH SEE ALSO
+.PP
+\f[B]ibv_create_comp_channel\f[](3), \f[B]ibv_create_cq\f[](3),
+\f[B]ibv_get_cq_event\f[](3)
+.SH AUTHOR
+.PP
+Dotan Barak <dotanba@gmail.com> diff --git a/buildlib/pandoc-prebuilt/9fcf3f165ed59609a3e94199a5b963ba461c423a b/buildlib/pandoc-prebuilt/9fcf3f165ed59609a3e94199a5b963ba461c423a new file mode 100644 index 0000000..cc85e4f --- /dev/null +++ b/buildlib/pandoc-prebuilt/9fcf3f165ed59609a3e94199a5b963ba461c423a @@ -0,0 +1,91 @@
+.\" Automatically generated by Pandoc 1.19.2.4
+.\"
+.TH "IBV_GET_DEVICE_LIST" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[aq]s Manual"
+.hy
+.SH NAME
+.PP
+ibv_get_device_list, ibv_free_device_list \- get and release list of
+available RDMA devices
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include\ <infiniband/verbs.h>
+
+struct\ ibv_device\ **ibv_get_device_list(int\ *num_devices);
+
+void\ ibv_free_device_list(struct\ ibv_device\ **list);
+\f[]
+.fi
+.SH DESCRIPTION
+.PP
+\f[B]ibv_get_device_list()\f[] returns a NULL\-terminated array of RDMA
+devices currently available.
+The argument \f[I]num_devices\f[] is optional; if not NULL, it is set to
+the number of devices returned in the array.
+.PP
+\f[B]ibv_free_device_list()\f[] frees the array of devices \f[I]list\f[]
+returned by \f[B]ibv_get_device_list()\f[].
+.SH RETURN VALUE
+.PP
+\f[B]ibv_get_device_list()\f[] returns the array of available RDMA
+devices, or sets \f[I]errno\f[] and returns NULL if the request fails.
+If no devices are found then \f[I]num_devices\f[] is set to 0, and
+non\-NULL is returned.
+.PP
+\f[B]ibv_free_device_list()\f[] returns no value.
+.SH ERRORS
+.TP
+.B \f[B]EPERM\f[]
+Permission denied.
+.RS
+.RE
+.TP
+.B \f[B]ENOSYS\f[]
+No kernel support for RDMA.
+.RS
+.RE
+.TP
+.B \f[B]ENOMEM\f[]
+Insufficient memory to complete the operation.
+.RS
+.RE
+.SH NOTES
+.PP
+Client code should open all the devices it intends to use with
+\f[B]ibv_open_device()\f[] before calling
+\f[B]ibv_free_device_list()\f[].
+Once it frees the array with \f[B]ibv_free_device_list()\f[], it will be
+able to use only the open devices; pointers to unopened devices will no
+longer be valid.
+.PP
+Setting the environment variable \f[B]IBV_SHOW_WARNINGS\f[] will cause
+warnings to be emitted to stderr if a kernel verbs device is discovered,
+but no corresponding userspace driver can be found for it.
+.SH STATIC LINKING
+.PP
+If \f[B]libibverbs\f[] is statically linked to the application then all
+provider drivers must also be statically linked.
+The library will not load dynamic providers when static linking is used.
+.PP
+To link the providers, set the \f[B]RDMA_STATIC_PROVIDERS\f[] define to
+the comma\-separated list of desired providers when compiling the
+application.
+The special keyword \[aq]all\[aq] will statically link all supported
+\f[B]libibverbs\f[] providers.
+.PP
+This is intended to be used along with \f[B]pkg\-config(1)\f[] to set up
+the proper flags for \f[B]libibverbs\f[] linking.
+.PP
+If this is not done then \f[B]ibv_get_device_list\f[] will always return
+an empty list.
+.PP
+Using only dynamic linking for \f[B]libibverbs\f[] applications is
+strongly recommended.
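+.SH EXAMPLE
+.PP
+A short sketch of the usual enumeration pattern (error handling is
+abbreviated and the printing is illustrative only):
+.IP
+.nf
+\f[C]
+#include <infiniband/verbs.h>
+#include <stdio.h>
+
+int list_devices(void)
+{
+    int num, i;
+    struct ibv_device **list = ibv_get_device_list(&num);
+
+    if (!list)
+        return \-1;    /* errno describes the failure */
+
+    /* Print the kernel device name of each RDMA device found. */
+    for (i = 0; i < num; i++)
+        puts(ibv_get_device_name(list[i]));
+
+    ibv_free_device_list(list);
+    return 0;
+}
+\f[]
+.fi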
+.SH SEE ALSO
+.PP
+\f[B]ibv_fork_init\f[](3), \f[B]ibv_get_device_guid\f[](3),
+\f[B]ibv_get_device_name\f[](3), \f[B]ibv_open_device\f[](3)
+.SH AUTHOR
+.PP
+Dotan Barak <dotanba@gmail.com> diff --git a/buildlib/pandoc-prebuilt/a19d89862b00778baf547f3fd1be6f8362c33642 b/buildlib/pandoc-prebuilt/a19d89862b00778baf547f3fd1be6f8362c33642 new file mode 100644 index 0000000..0648f1d --- /dev/null +++ b/buildlib/pandoc-prebuilt/a19d89862b00778baf547f3fd1be6f8362c33642 @@ -0,0 +1,35 @@
+.\" Automatically generated by Pandoc 1.19.2.4
+.\"
+.TH "IBV_GET_SRQ_NUM" "3" "2013\-06\-26" "libibverbs" "Libibverbs Programmer\[aq]s Manual"
+.hy
+.SH NAME
+.PP
+ibv_get_srq_num \- return the srq number associated with the given
+shared receive queue (SRQ)
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include\ <infiniband/verbs.h>
+
+int\ ibv_get_srq_num(struct\ ibv_srq\ *srq,\ uint32_t\ *srq_num);
+\f[]
+.fi
+.SH DESCRIPTION
+.PP
+\f[B]ibv_get_srq_num()\f[] returns the srq number associated with the
+given shared receive queue.
+The argument \f[I]srq\f[] is an ibv_srq struct, as defined in
+<infiniband/verbs.h>.
+\f[I]srq_num\f[] is an output parameter that holds the returned srq
+number.
+.SH RETURN VALUE
+.PP
+\f[B]ibv_get_srq_num()\f[] returns 0 on success, or the value of errno
+on failure (which indicates the failure reason).
+.SH SEE ALSO
+.PP
+\f[B]ibv_alloc_pd\f[](3), \f[B]ibv_create_srq_ex\f[](3),
+\f[B]ibv_modify_srq\f[](3)
+.SH AUTHOR
+.PP
+Yishai Hadas <yishaih@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/a87ace1daaff861a7854e9773f09a3467c40f02e b/buildlib/pandoc-prebuilt/a87ace1daaff861a7854e9773f09a3467c40f02e new file mode 100644 index 0000000..4c02006 --- /dev/null +++ b/buildlib/pandoc-prebuilt/a87ace1daaff861a7854e9773f09a3467c40f02e @@ -0,0 +1,154 @@
+.\" Automatically generated by Pandoc 1.19.2.4
+.\"
+.TH "mlx5dv_devx_obj_create / destroy / modify / query / general" "3" "" "" ""
+.hy
+.SH NAME
+.PP
+mlx5dv_devx_obj_create \- Creates a devx object
+.PP
+mlx5dv_devx_obj_destroy \- Destroys a devx object
+.PP
+mlx5dv_devx_obj_modify \- Modifies a devx object
+.PP
+mlx5dv_devx_obj_query \- Queries a devx object
+.PP
+mlx5dv_devx_obj_query_async \- Queries a devx object in an asynchronous
+mode
+.PP
+mlx5dv_devx_general_cmd \- Issues a general command over the devx
+interface
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include\ <infiniband/mlx5dv.h>
+
+struct\ mlx5dv_devx_obj\ *
+mlx5dv_devx_obj_create(struct\ ibv_context\ *context,\ const\ void\ *in,\ size_t\ inlen,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *out,\ size_t\ outlen);
+int\ mlx5dv_devx_obj_query(struct\ mlx5dv_devx_obj\ *obj,\ const\ void\ *in,\ size_t\ inlen,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *out,\ size_t\ outlen);
+int\ mlx5dv_devx_obj_query_async(struct\ mlx5dv_devx_obj\ *obj,\ const\ void\ *in,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ size_t\ inlen,\ size_t\ outlen,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint64_t\ wr_id,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ mlx5dv_devx_cmd_comp\ *cmd_comp);
+int\ mlx5dv_devx_obj_modify(struct\ mlx5dv_devx_obj\ *obj,\ const\ void\ *in,\ size_t\ inlen,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *out,\ size_t\ outlen);
+int\ mlx5dv_devx_obj_destroy(struct\ mlx5dv_devx_obj\ *obj);
+int\ mlx5dv_devx_general_cmd(struct\ ibv_context\ *context,\ const\ void\ *in,\ size_t\ inlen,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *out,\ size_t\ outlen);
+\f[]
+.fi
+.SH DESCRIPTION
+.PP
+Create / destroy / modify / query a devx object, or issue a general
+command over the devx interface.
+.PP
+The DEVX API enables direct access from the user space area to the mlx5
+device driver by using the KABI mechanism.
+The main purpose is to make the user space driver as independent as
+possible from the kernel so that future device functionality and
+commands can be activated with minimal or no kernel changes.
+.PP
+A DEVX object represents some underlying firmware object; the input
+command to create it is raw data given by the user application, which
+should match the device specification.
+Upon successful creation the output buffer includes the raw data from
+the device according to its specification; this data can be used as part
+of related firmware commands for this object.
+.PP
+Once the DEVX object is created, it can be queried/modified/destroyed by
+the matching mlx5dv_devx_obj_xxx() API.
+Both the input and the output for those APIs need to match the device
+specification as well.
+.PP
+The mlx5dv_devx_general_cmd() API enables issuing a general command
+which is not related to an object, such as querying device capabilities.
+.PP
+The mlx5dv_devx_obj_query_async() API is similar to the query object
+API; however, it runs asynchronously without blocking.
+The input includes an mlx5dv_devx_cmd_comp object and an identifier
+named \[aq]wr_id\[aq] for this command.
+The response should be read upon success with the
+mlx5dv_devx_get_async_cmd_comp() API.
+The \[aq]wr_id\[aq] that was supplied as an input is returned as part of
+the response to let the application know which command the response
+relates to.
+.PP
+An application can gradually migrate to use DEVX according to its needs;
+it is not all or nothing.
+For example, it can create an ibv_cq via the ibv_create_cq() verb and
+then use the returned cqn to create a DEVX QP object with the
+mlx5dv_devx_obj_create() API, which needs that cqn.
+.PP
+The above example can enable an application to create a QP with some
+driver\-specific attributes that are not exposed in the ibv_create_qp()
+API; in that case no user or kernel change may be needed at all, as the
+command input reaches the firmware directly.
+.PP
+The expected users for the DEVX APIs are applications that use the mlx5
+DV APIs and are familiar with the device specification in both control
+and data path.
+.PP
+To successfully create a DEVX object and work on it, a DEVX context must
+be created; this is done by the mlx5dv_open_device() API with the
+\f[I]MLX5DV_CONTEXT_FLAGS_DEVX\f[] flag.
+.SH ARGUMENTS
+.TP
+.B \f[I]context\f[]
+RDMA device context to create the object on.
+.RS
+.RE
+.TP
+.B \f[I]in\f[]
+A buffer which contains the command\[aq]s input data provided in a
+device specification format.
+.RS
+.RE
+.TP
+.B \f[I]inlen\f[]
+The size of the \f[I]in\f[] buffer in bytes.
+.RS
+.RE
+.TP
+.B \f[I]out\f[]
+A buffer which contains the command\[aq]s output data according to the
+device specification format.
+.RS
+.RE
+.TP
+.B \f[I]outlen\f[]
+The size of the \f[I]out\f[] buffer in bytes.
+.RS
+.RE
+.TP
+.B \f[I]obj\f[]
+For query, modify, destroy: the devx object to work on.
+.RS
+.RE
+.TP
+.B \f[I]wr_id\f[]
+The command identifier when working in asynchronous mode.
+.RS
+.RE
+.TP
+.B \f[I]cmd_comp\f[]
+The command completion object to read the response from in asynchronous
+mode.
+.RS
+.RE
+.SH RETURN VALUE
+.PP
+Upon success \f[I]mlx5dv_devx_obj_create\f[] will return a new
+\f[I]struct mlx5dv_devx_obj\f[]; on error, NULL will be returned and
+errno will be set.
+.PP
+For the query, modify, destroy and general commands, 0 is returned on
+success, or the value of errno on failure.
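+.SH EXAMPLE
+.PP
+A rough sketch of the calling convention only: the in/out buffers are
+opaque, PRM\-formatted blobs, so the buffer sizes and the opcode field
+below are placeholders that must be taken from the device specification,
+not working values; \f[I]device\f[] is assumed to come from
+ibv_get_device_list().
+.IP
+.nf
+\f[C]
+#include <infiniband/mlx5dv.h>
+
+/* Open a DEVX\-enabled context first. */
+struct mlx5dv_context_attr dv_attr = {
+    .flags = MLX5DV_CONTEXT_FLAGS_DEVX,
+};
+struct ibv_context *ctx = mlx5dv_open_device(device, &dv_attr);
+
+/* Raw command buffers; layout and sizes are device\-specific. */
+uint32_t in[4] = {0}, out[16] = {0};
+/* Fill in[] with the opcode and fields per the device spec here. */
+
+if (!ctx || mlx5dv_devx_general_cmd(ctx, in, sizeof(in),
+                                    out, sizeof(out)))
+    return \-1;    /* errno describes the failure */
+\f[]
+.fi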
+.SH SEE ALSO +.PP +\f[B]mlx5dv_open_device\f[], \f[B]mlx5dv_devx_create_cmd_comp\f[], +\f[B]mlx5dv_devx_get_async_cmd_comp\f[] +.SH AUTHOR +.PP +Yishai Hadas <yishaih@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/a8ffa02582b1604aa43cc72efd3bcde41e65b4cd b/buildlib/pandoc-prebuilt/a8ffa02582b1604aa43cc72efd3bcde41e65b4cd new file mode 100644 index 0000000..fadb209 --- /dev/null +++ b/buildlib/pandoc-prebuilt/a8ffa02582b1604aa43cc72efd3bcde41e65b4cd @@ -0,0 +1,60 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "IBV_FORK_INIT" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[aq]s Manual" +.hy +.SH NAME +.PP +ibv_fork_init \- initialize libibverbs to support fork() +.SH SYNOPSIS +.IP +.nf +\f[C] +#include\ <infiniband/verbs.h> + +int\ ibv_fork_init(void); +\f[] +.fi +.SH DESCRIPTION +.PP +\f[B]ibv_fork_init()\f[] initializes libibverbs\[aq]s data structures to +handle \f[B]fork()\f[] function calls correctly and avoid data +corruption, whether \f[B]fork()\f[] is called explicitly or implicitly +(such as in \f[B]system()\f[]). +.PP +It is not necessary to use this function if all parent process threads +are always blocked until all child processes end or change address +spaces via an \f[B]exec()\f[] operation. +.SH RETURN VALUE +.PP +\f[B]ibv_fork_init()\f[] returns 0 on success, or the value of errno on +failure (which indicates the failure reason). +.SH NOTES +.PP +\f[B]ibv_fork_init()\f[] works on Linux kernels supporting the +\f[B]MADV_DONTFORK\f[] flag for \f[B]madvise()\f[] (2.6.17 and higher). +.PP +Setting the environment variable \f[B]RDMAV_FORK_SAFE\f[] or +\f[B]IBV_FORK_SAFE\f[] has the same effect as calling +\f[B]ibv_fork_init()\f[]. +.PP +Setting the environment variable \f[B]RDMAV_HUGEPAGES_SAFE\f[] tells the +library to check the underlying page size used by the kernel for memory +regions. +This is required if an application uses huge pages either directly or +indirectly via a library such as libhugetlbfs. +.PP +Calling \f[B]ibv_fork_init()\f[] will reduce performance due to an extra +system call for every memory registration, and the additional memory +allocated to track memory regions. +The precise performance impact depends on the workload and usually will +not be significant. +.PP +Setting \f[B]RDMAV_HUGEPAGES_SAFE\f[] adds further overhead to all +memory registrations. +.SH SEE ALSO +.PP +\f[B]exec\f[](3), \f[B]fork\f[](2), \f[B]ibv_get_device_list\f[](3), +\f[B]system\f[](3), \f[B]wait\f[](2) +.SH AUTHOR +.PP +Dotan Barak <dotanba@gmail.com> diff --git a/buildlib/pandoc-prebuilt/a91b9346f932b9f38e4ce3ec5ee815fd39fa0a91 b/buildlib/pandoc-prebuilt/a91b9346f932b9f38e4ce3ec5ee815fd39fa0a91 new file mode 100644 index 0000000..9850df6 --- /dev/null +++ b/buildlib/pandoc-prebuilt/a91b9346f932b9f38e4ce3ec5ee815fd39fa0a91 @@ -0,0 +1,63 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "mlx5dv_devx_create_event_channel, mlx5dv_devx_destroy_event_channel" "3" "" "" "" +.hy +.SH NAME +.PP +mlx5dv_devx_create_event_channel \- Create an event channel to be used +for DEVX asynchronous events. +.PP +mlx5dv_devx_destroy_event_channel \- Destroy a DEVX event channel. 
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include\ <infiniband/mlx5dv.h>
+
+struct\ mlx5dv_devx_event_channel\ {
+\ \ \ \ int\ fd;
+};
+
+struct\ mlx5dv_devx_event_channel\ *
+mlx5dv_devx_create_event_channel(struct\ ibv_context\ *context,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ enum\ mlx5dv_devx_create_event_channel_flags\ flags)
+
+void\ mlx5dv_devx_destroy_event_channel(struct\ mlx5dv_devx_event_channel\ *event_channel)
+\f[]
+.fi
+.SH DESCRIPTION
+.PP
+Create or destroy a channel to be used for DEVX asynchronous events.
+.PP
+The create verb exposes an mlx5dv_devx_event_channel object that can be
+used to read asynchronous DEVX events.
+This lets an application subscribe to device events and, once an event
+occurs, read it from this object.
+.SH ARGUMENTS
+.TP
+.B \f[I]context\f[]
+RDMA device context to create the channel on.
+.RS
+.RE
+.TP
+.B \f[I]flags\f[]
+MLX5DV_DEVX_CREATE_EVENT_CHANNEL_FLAGS_OMIT_EV_DATA: omit the event data
+on this channel.
+.RS
+.RE
+.SH RETURN VALUE
+.PP
+Upon success \f[I]mlx5dv_devx_create_event_channel\f[] will return a new
+\f[I]struct mlx5dv_devx_event_channel\f[] object; on error, NULL will be
+returned and errno will be set.
+.SH SEE ALSO
+.PP
+\f[I]mlx5dv_open_device(3)\f[], \f[I]mlx5dv_devx_obj_create(3)\f[]
+.SH AUTHOR
+.PP
+Yishai Hadas <yishaih@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/b003d15c599b5ef09af22508eeec09d65fc91a4e b/buildlib/pandoc-prebuilt/b003d15c599b5ef09af22508eeec09d65fc91a4e new file mode 100644 index 0000000..f56ef55 --- /dev/null +++ b/buildlib/pandoc-prebuilt/b003d15c599b5ef09af22508eeec09d65fc91a4e @@ -0,0 +1,235 @@
+.\" Man page generated from reStructuredText.
+.
+.TH DUMP_FTS 8 "2013-03-26" "" "OpenIB Diagnostics"
+.SH NAME
+DUMP_FTS \- dump InfiniBand forwarding tables
+.
+.nr rst2man-indent-level 0
+.
+.de1 rstReportMargin
+\\$1 \\n[an-margin]
+level \\n[rst2man-indent-level]
+level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
+-
+\\n[rst2man-indent0]
+\\n[rst2man-indent1]
+\\n[rst2man-indent2]
+..
+.de1 INDENT
+.\" .rstReportMargin pre:
+. RS \\$1
+. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
+. nr rst2man-indent-level +1
+.\" .rstReportMargin post:
+..
+.de UNINDENT
+. RE
+.\" indent \\n[an-margin]
+.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.nr rst2man-indent-level -1
+.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
+..
+.SH SYNOPSIS
+.sp
+dump_fts [options] [<startlid> [<endlid>]]
+.SH DESCRIPTION
+.sp
+dump_fts is similar to ibroute but dumps tables for every switch found in an
+ibnetdiscover scan of the subnet.
+.sp
+The dump file format is compatible with loading into OpenSM using
+the \-R file \-U /path/to/dump\-file syntax.
+.SH OPTIONS
+.INDENT 0.0
+.TP
+.B \fB\-a, \-\-all\fP
+show all lids in range, even invalid entries
+.TP
+.B \fB\-n, \-\-no_dests\fP
+do not try to resolve destinations
+.TP
+.B \fB\-M, \-\-Multicast\fP
+show multicast forwarding tables.
+In this case, the range parameters specify the mlid range.
+.UNINDENT
+.SS Port Selection flags
+.\" Define the common option -C
+.
+.sp
+\fB\-C, \-\-Ca <ca_name>\fP use the specified ca_name.
+.\" Define the common option -P
+.
+.sp
+\fB\-P, \-\-Port <ca_port>\fP use the specified ca_port.
+.\" Explanation of local port selection
+.
+.SS Local port Selection +.sp +Multiple port/Multiple CA support: when no IB device or port is specified +(see the "local umad parameters" below), the libibumad library +selects the port to use by the following criteria: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.IP 1. 3 +the first port that is ACTIVE. +.IP 2. 3 +if not found, the first port that is UP (physical link up). +.UNINDENT +.sp +If a port and/or CA name is specified, the libibumad library attempts +to fulfill the user request, and will fail if it is not possible. +.sp +For example: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +ibaddr # use the first port (criteria #1 above) +ibaddr \-C mthca1 # pick the best port from "mthca1" only. +ibaddr \-P 2 # use the second (active/up) port from the first available IB device. +ibaddr \-C mthca0 \-P 2 # use the specified port only. +.ft P +.fi +.UNINDENT +.UNINDENT +.UNINDENT +.UNINDENT +.SS Debugging flags +.\" Define the common option -d +. +.INDENT 0.0 +.TP +.B \-d +raise the IB debugging level. +May be used several times (\-ddd or \-d \-d \-d). +.UNINDENT +.\" Define the common option -e +. +.INDENT 0.0 +.TP +.B \-e +show send and receive errors (timeouts and others) +.UNINDENT +.\" Define the common option -h +. +.sp +\fB\-h, \-\-help\fP show the usage message +.\" Define the common option -v +. +.INDENT 0.0 +.TP +.B \fB\-v, \-\-verbose\fP +increase the application verbosity level. +May be used several times (\-vv or \-v \-v \-v) +.UNINDENT +.\" Define the common option -V +. +.sp +\fB\-V, \-\-version\fP show the version info. +.SS Configuration flags +.\" Define the common option -t +. +.sp +\fB\-t, \-\-timeout <timeout_ms>\fP override the default timeout for the solicited mads. +.\" Define the common option -y +. +.INDENT 0.0 +.TP +.B \fB\-y, \-\-m_key <key>\fP +use the specified M_key for requests. If non\-numeric value (like \(aqx\(aq) +is specified then a value will be prompted for. +.UNINDENT +.\" Define the common option --node-name-map +. +.sp +\fB\-\-node\-name\-map <node\-name\-map>\fP Specify a node name map. +.INDENT 0.0 +.INDENT 3.5 +This file maps GUIDs to more user friendly names. See FILES section. +.UNINDENT +.UNINDENT +.\" Define the common option -z +. +.sp +\fB\-\-config, \-z <config_file>\fP Specify alternate config file. +.INDENT 0.0 +.INDENT 3.5 +Default: /usr/local/etc/infiniband\-diags/ibdiag.conf +.UNINDENT +.UNINDENT +.SH FILES +.\" Common text for the config file +. +.SS CONFIG FILE +.sp +/usr/local/etc/infiniband\-diags/ibdiag.conf +.sp +A global config file is provided to set some of the common options for all +tools. See supplied config file for details. +.\" Common text to describe the node name map file. +. +.SS NODE NAME MAP FILE FORMAT +.sp +The node name map is used to specify user friendly names for nodes in the +output. GUIDs are used to perform the lookup. +.sp +This functionality is provided by the opensm\-libs package. See \fBopensm(8)\fP +for the file location for your installation. 
+.sp
+\fBGenerically:\fP
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+# comment
+<guid> "<name>"
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+\fBExample:\fP
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+# IB1
+# Line cards
+0x0008f104003f125c "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB\-24D"
+0x0008f104003f125d "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB\-24D"
+0x0008f104003f10d2 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB\-24D"
+0x0008f104003f10d3 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB\-24D"
+0x0008f104003f10bf "IB1 (Rack 11 slot 12 ) ISR9288/ISR9096 Voltaire sLB\-24D"
+
+# Spines
+0x0008f10400400e2d "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D"
+0x0008f10400400e2e "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D"
+0x0008f10400400e2f "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D"
+0x0008f10400400e31 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB\-12D"
+0x0008f10400400e32 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB\-12D"
+
+# GUID Node Name
+0x0008f10400411a08 "SW1 (Rack 3) ISR9024 Voltaire 9024D"
+0x0008f10400411a28 "SW2 (Rack 3) ISR9024 Voltaire 9024D"
+0x0008f10400411a34 "SW3 (Rack 3) ISR9024 Voltaire 9024D"
+0x0008f104004119d0 "SW4 (Rack 3) ISR9024 Voltaire 9024D"
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.SH SEE ALSO
+.sp
+\fBdump_lfts(8), dump_mfts(8), ibroute(8), ibswitches(8), opensm(8)\fP
+.SH AUTHORS
+.INDENT 0.0
+.TP
+.B Ira Weiny
+< \fI\%ira.weiny@intel.com\fP >
+.UNINDENT
+.\" Generated by docutils manpage writer.
+. diff --git a/buildlib/pandoc-prebuilt/b4a6bc6bbb2f05ddc2593766851a6aaf9fd4d306 b/buildlib/pandoc-prebuilt/b4a6bc6bbb2f05ddc2593766851a6aaf9fd4d306 new file mode 100644 index 0000000..1055bb0 --- /dev/null +++ b/buildlib/pandoc-prebuilt/b4a6bc6bbb2f05ddc2593766851a6aaf9fd4d306 @@ -0,0 +1,79 @@
+.\" Automatically generated by Pandoc 1.19.2.4
+.\"
+.TH "mlx5dv_pp_alloc / mlx5dv_pp_free" "3" "" "" ""
+.hy
+.SH NAME
+.PP
+mlx5dv_pp_alloc \- Allocates a packet pacing entry
+.PP
+mlx5dv_pp_free \- Frees a packet pacing entry
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include\ <infiniband/mlx5dv.h>
+
+struct\ mlx5dv_pp\ *
+mlx5dv_pp_alloc(struct\ ibv_context\ *context,
+\ \ \ \ \ \ \ \ size_t\ pp_context_sz,
+\ \ \ \ \ \ \ \ const\ void\ *pp_context,
+\ \ \ \ \ \ \ \ uint32_t\ flags);
+
+void\ mlx5dv_pp_free(struct\ mlx5dv_pp\ *dv_pp);
+\f[]
+.fi
+.SH DESCRIPTION
+.PP
+Create / free a packet pacing entry which can be used for some device
+commands over the DEVX interface.
+.PP
+The DEVX API enables direct access from the user space area to the mlx5
+device driver; packet pacing information is needed for a few commands
+that require a packet pacing index.
+.SH ARGUMENTS
+.TP
+.B \f[I]context\f[]
+RDMA device context to work on; it must be opened with DEVX support
+using mlx5dv_open_device().
+.RS
+.RE
+.TP
+.B \f[I]pp_context_sz\f[]
+Length of \f[I]pp_context\f[] input buffer.
+.RS
+.RE
+.TP
+.B \f[I]pp_context\f[]
+Packet pacing context according to the device specification.
+.RS
+.RE
+.TP
+.B \f[I]flags\f[]
+MLX5DV_PP_ALLOC_FLAGS_DEDICATED_INDEX: allocate a dedicated index.
+.RS
+.RE
+.SS dv_pp
+.IP
+.nf
+\f[C]
+struct\ mlx5dv_pp\ {
+\ \ \ \ uint16_t\ index;
+};
+\f[]
+.fi
+.TP
+.B \f[I]index\f[]
+The device index to be used.
+.RS
+.RE
+.SH RETURN VALUE
+.PP
+Upon success \f[I]mlx5dv_pp_alloc\f[] returns a pointer to the created
+packet pacing object; on error, NULL will be returned and errno will be
+set.
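+.SH EXAMPLE
+.PP
+A brief sketch of the calling convention: the packet pacing context is
+an opaque, PRM\-formatted blob, so its size and contents below are
+placeholders, and \f[I]ctx\f[] is assumed to be a DEVX\-enabled context
+returned by mlx5dv_open_device().
+.IP
+.nf
+\f[C]
+/* Device\-specific, PRM\-formatted packet pacing context. */
+uint32_t pp_ctx[4] = {0};
+
+struct mlx5dv_pp *pp = mlx5dv_pp_alloc(ctx, sizeof(pp_ctx), pp_ctx,
+                        MLX5DV_PP_ALLOC_FLAGS_DEDICATED_INDEX);
+if (!pp)
+    return \-1;    /* errno describes the failure */
+
+/* pp\->index can now be used in DEVX commands taking a pacing index. */
+mlx5dv_pp_free(pp);
+\f[]
+.fi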
+.SH SEE ALSO +.PP +\f[B]mlx5dv_open_device\f[], \f[B]mlx5dv_devx_obj_create\f[] +.SH AUTHOR +.PP +Yishai Hadas <yishaih@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/ba65d77d17660b85d41abd686123bb873bfe53d2 b/buildlib/pandoc-prebuilt/ba65d77d17660b85d41abd686123bb873bfe53d2 new file mode 100644 index 0000000..f10cb17 --- /dev/null +++ b/buildlib/pandoc-prebuilt/ba65d77d17660b85d41abd686123bb873bfe53d2 @@ -0,0 +1,290 @@ +.\" Man page generated from reStructuredText. +. +.TH IBROUTE 8 "2017-08-21" "" "Open IB Diagnostics" +.SH NAME +ibroute \- query InfiniBand switch forwarding tables +. +.nr rst2man-indent-level 0 +. +.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. +.SH SYNOPSIS +.sp +ibroute [options] [<dest dr_path|lid|guid> [<startlid> [<endlid>]]] +.SH DESCRIPTION +.sp +ibroute uses SMPs to display the forwarding tables (unicast +(LinearForwardingTable or LFT) or multicast (MulticastForwardingTable or MFT)) +for the specified switch LID and the optional lid (mlid) range. +The default range is all valid entries in the range 1...FDBTop. +.SH OPTIONS +.INDENT 0.0 +.TP +.B \fB\-a, \-\-all\fP +show all lids in range, even invalid entries +.TP +.B \fB\-n, \-\-no_dests\fP +do not try to resolve destinations +.TP +.B \fB\-M, \-\-Multicast\fP +show multicast forwarding tables +In this case, the range parameters are specifying the mlid range. +.UNINDENT +.SS Addressing Flags +.\" Define the common option -D for Directed routes +. +.sp +\fB\-D, \-\-Direct\fP The address specified is a directed route +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +Examples: + [options] \-D [options] "0" # self port + [options] \-D [options] "0,1,2,1,4" # out via port 1, then 2, ... + + (Note the second number in the path specified must match the port being + used. This can be specified using the port selection flag \(aq\-P\(aq or the + port found through the automatic selection process.) +.ft P +.fi +.UNINDENT +.UNINDENT +.\" Define the common option -G +. +.sp +\fB\-G, \-\-Guid\fP The address specified is a Port GUID +.\" Define the common option -L +. +.sp +\fB\-L, \-\-Lid\fP The address specified is a LID +.\" Define the common option -s +. +.sp +\fB\-s, \-\-sm_port <smlid>\fP use \(aqsmlid\(aq as the target lid for SA queries. +.SS Port Selection flags +.\" Define the common option -C +. +.sp +\fB\-C, \-\-Ca <ca_name>\fP use the specified ca_name. +.\" Define the common option -P +. +.sp +\fB\-P, \-\-Port <ca_port>\fP use the specified ca_port. +.\" Explanation of local port selection +. +.SS Local port Selection +.sp +Multiple port/Multiple CA support: when no IB device or port is specified +(see the "local umad parameters" below), the libibumad library +selects the port to use by the following criteria: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.IP 1. 3 +the first port that is ACTIVE. +.IP 2. 3 +if not found, the first port that is UP (physical link up). 
+.UNINDENT +.sp +If a port and/or CA name is specified, the libibumad library attempts +to fulfill the user request, and will fail if it is not possible. +.sp +For example: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +ibaddr # use the first port (criteria #1 above) +ibaddr \-C mthca1 # pick the best port from "mthca1" only. +ibaddr \-P 2 # use the second (active/up) port from the first available IB device. +ibaddr \-C mthca0 \-P 2 # use the specified port only. +.ft P +.fi +.UNINDENT +.UNINDENT +.UNINDENT +.UNINDENT +.SS Debugging flags +.\" Define the common option -d +. +.INDENT 0.0 +.TP +.B \-d +raise the IB debugging level. +May be used several times (\-ddd or \-d \-d \-d). +.UNINDENT +.\" Define the common option -e +. +.INDENT 0.0 +.TP +.B \-e +show send and receive errors (timeouts and others) +.UNINDENT +.\" Define the common option -h +. +.sp +\fB\-h, \-\-help\fP show the usage message +.\" Define the common option -v +. +.INDENT 0.0 +.TP +.B \fB\-v, \-\-verbose\fP +increase the application verbosity level. +May be used several times (\-vv or \-v \-v \-v) +.UNINDENT +.\" Define the common option -V +. +.sp +\fB\-V, \-\-version\fP show the version info. +.SS Configuration flags +.\" Define the common option -t +. +.sp +\fB\-t, \-\-timeout <timeout_ms>\fP override the default timeout for the solicited mads. +.\" Define the common option -y +. +.INDENT 0.0 +.TP +.B \fB\-y, \-\-m_key <key>\fP +use the specified M_key for requests. If non\-numeric value (like \(aqx\(aq) +is specified then a value will be prompted for. +.UNINDENT +.\" Define the common option --node-name-map +. +.sp +\fB\-\-node\-name\-map <node\-name\-map>\fP Specify a node name map. +.INDENT 0.0 +.INDENT 3.5 +This file maps GUIDs to more user friendly names. See FILES section. +.UNINDENT +.UNINDENT +.\" Define the common option -z +. +.sp +\fB\-\-config, \-z <config_file>\fP Specify alternate config file. +.INDENT 0.0 +.INDENT 3.5 +Default: /usr/local/etc/infiniband\-diags/ibdiag.conf +.UNINDENT +.UNINDENT +.SH FILES +.\" Common text for the config file +. +.SS CONFIG FILE +.sp +/usr/local/etc/infiniband\-diags/ibdiag.conf +.sp +A global config file is provided to set some of the common options for all +tools. See supplied config file for details. +.\" Common text to describe the node name map file. +. +.SS NODE NAME MAP FILE FORMAT +.sp +The node name map is used to specify user friendly names for nodes in the +output. GUIDs are used to perform the lookup. +.sp +This functionality is provided by the opensm\-libs package. See \fBopensm(8)\fP +for the file location for your installation. 
+.sp
+\fBGenerically:\fP
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+# comment
+<guid> "<name>"
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+\fBExample:\fP
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+# IB1
+# Line cards
+0x0008f104003f125c "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB\-24D"
+0x0008f104003f125d "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB\-24D"
+0x0008f104003f10d2 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB\-24D"
+0x0008f104003f10d3 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB\-24D"
+0x0008f104003f10bf "IB1 (Rack 11 slot 12 ) ISR9288/ISR9096 Voltaire sLB\-24D"
+
+# Spines
+0x0008f10400400e2d "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D"
+0x0008f10400400e2e "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D"
+0x0008f10400400e2f "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D"
+0x0008f10400400e31 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB\-12D"
+0x0008f10400400e32 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB\-12D"
+
+# GUID Node Name
+0x0008f10400411a08 "SW1 (Rack 3) ISR9024 Voltaire 9024D"
+0x0008f10400411a28 "SW2 (Rack 3) ISR9024 Voltaire 9024D"
+0x0008f10400411a34 "SW3 (Rack 3) ISR9024 Voltaire 9024D"
+0x0008f104004119d0 "SW4 (Rack 3) ISR9024 Voltaire 9024D"
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.SH EXAMPLES
+.sp
+Unicast examples
+.INDENT 0.0
+.TP
+.B ::
+ibroute 4 # dump all lids with valid out ports of switch with lid 4
+ibroute \-a 4 # same, but dump all lids, even with invalid out ports
+ibroute \-n 4 # simple dump format \- no destination resolution
+ibroute 4 10 # dump lids starting from 10 (up to FDBTop)
+ibroute 4 0x10 0x20 # dump lid range
+ibroute \-G 0x08f1040023 # resolve switch by GUID
+ibroute \-D 0,1 # resolve switch by direct path
+.UNINDENT
+.sp
+Multicast examples
+.INDENT 0.0
+.TP
+.B ::
+ibroute \-M 4 # dump all non empty mlids of switch with lid 4
+ibroute \-M 4 0xc010 0xc020 # same, but with range
+ibroute \-M \-n 4 # simple dump format
+.UNINDENT
+.SH SEE ALSO
+.sp
+ibtracert (8)
+.SH AUTHOR
+.INDENT 0.0
+.TP
+.B Hal Rosenstock
+< \fI\%halr@voltaire.com\fP >
+.UNINDENT
+.\" Generated by docutils manpage writer.
+. diff --git a/buildlib/pandoc-prebuilt/bc330f50986a4c202ab66bc12d82b6904bff909f b/buildlib/pandoc-prebuilt/bc330f50986a4c202ab66bc12d82b6904bff909f new file mode 100644 index 0000000..7b980ff --- /dev/null +++ b/buildlib/pandoc-prebuilt/bc330f50986a4c202ab66bc12d82b6904bff909f @@ -0,0 +1,31 @@
+.\" Automatically generated by Pandoc 1.19.2.4
+.\"
+.TH "EFADV" "7" "2019\-01\-19" "efa" "EFA Direct Verbs Manual"
+.hy
+.SH NAME
+.PP
+efadv \- Direct verbs for efa devices
+.PP
+This provides low\-level access to efa devices to perform direct
+operations, without general branching performed by libibverbs.
+.SH DESCRIPTION
+.PP
+The libibverbs API is an abstract one.
+It is agnostic to any underlying provider\-specific implementation.
+While this abstraction has the advantage of application portability, it
+has a performance penalty.
+For some applications, optimizing performance is more important than
+portability.
+.PP
+The efa direct verbs API is intended for such applications.
+It exposes efa\-specific low\-level operations, allowing the application
+to bypass the libibverbs API.
+.PP
+Directly including efadv.h, together with linking against the efa
+library, allows usage of this interface.
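+.SH EXAMPLE
+.PP
+A minimal sketch of calling into the direct\-verbs interface;
+\f[B]efadv_query_device\f[]() is used here only as an illustrative entry
+point (see efadv.h for the full set of calls), and \f[I]ctx\f[] is
+assumed to be an open context on an efa device:
+.IP
+.nf
+\f[C]
+#include <infiniband/efadv.h>
+
+/* Query efa device attributes through the direct\-verbs API. */
+struct efadv_device_attr attr = {};
+
+if (efadv_query_device(ctx, &attr, sizeof(attr)))
+    return \-1;    /* not an efa device, or the query failed */
+\f[]
+.fi
+.PP
+Link against the efa library (\f[I]\-lefa\f[]) in addition to
+libibverbs.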
+.SH SEE ALSO
+.PP
+\f[B]verbs\f[](7)
+.SH AUTHORS
+.PP
+Gal Pressman <galpress@amazon.com> diff --git a/buildlib/pandoc-prebuilt/bde0f0fb11d80958e182842cb166935bb5be3347 b/buildlib/pandoc-prebuilt/bde0f0fb11d80958e182842cb166935bb5be3347 new file mode 100644 index 0000000..024e42c --- /dev/null +++ b/buildlib/pandoc-prebuilt/bde0f0fb11d80958e182842cb166935bb5be3347 @@ -0,0 +1,40 @@
+.\" Automatically generated by Pandoc 1.19.2.4
+.\"
+.TH "IBV_GET_PKEY_INDEX" "3" "2018\-07\-16" "libibverbs" "Libibverbs Programmer\[aq]s Manual"
+.hy
+.SH NAME
+.PP
+ibv_get_pkey_index \- obtain the index in the P_Key table of a P_Key
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include\ <infiniband/verbs.h>
+
+int\ ibv_get_pkey_index(struct\ ibv_context\ *context,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint8_t\ port_num,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ __be16\ pkey);
+\f[]
+.fi
+.SH DESCRIPTION
+.PP
+Every InfiniBand HCA maintains a P_Key table for each of its ports that
+is indexed by an integer, with a P_Key in each element.
+Certain InfiniBand data structures that work with P_Keys expect a P_Key
+index, e.g.
+\f[B]struct ibv_qp_attr\f[] and \f[B]struct ib_mad_addr\f[].
+Hence the function \f[B]ibv_get_pkey_index()\f[], which accepts a P_Key
+in network byte order and returns its index in the P_Key table.
+.SH RETURN VALUE
+.PP
+\f[B]ibv_get_pkey_index()\f[] returns the P_Key index on success, and
+\-1 on error.
+.SH SEE ALSO
+.PP
+\f[B]ibv_open_device\f[](3), \f[B]ibv_query_device\f[](3),
+\f[B]ibv_query_gid\f[](3), \f[B]ibv_query_pkey\f[](3),
+\f[B]ibv_query_port\f[](3)
+.SH AUTHOR
+.PP
+Bart Van Assche <bvanassche@acm.org> diff --git a/buildlib/pandoc-prebuilt/c04cb3485c93cab9966ca5313a020b7b83bfbd9a b/buildlib/pandoc-prebuilt/c04cb3485c93cab9966ca5313a020b7b83bfbd9a new file mode 100644 index 0000000..0e7d19f --- /dev/null +++ b/buildlib/pandoc-prebuilt/c04cb3485c93cab9966ca5313a020b7b83bfbd9a @@ -0,0 +1,339 @@
+.\" Man page generated from reStructuredText.
+.
+.TH IBQUERYERRORS 8 "2016-09-26" "" "OpenIB Diagnostics"
+.SH NAME
+IBQUERYERRORS \- query and report IB port counters
+.
+.nr rst2man-indent-level 0
+.
+.de1 rstReportMargin
+\\$1 \\n[an-margin]
+level \\n[rst2man-indent-level]
+level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
+-
+\\n[rst2man-indent0]
+\\n[rst2man-indent1]
+\\n[rst2man-indent2]
+..
+.de1 INDENT
+.\" .rstReportMargin pre:
+. RS \\$1
+. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
+. nr rst2man-indent-level +1
+.\" .rstReportMargin post:
+..
+.de UNINDENT
+. RE
+.\" indent \\n[an-margin]
+.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.nr rst2man-indent-level -1
+.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
+..
+.SH SYNOPSIS
+.sp
+ibqueryerrors [options]
+.SH DESCRIPTION
+.sp
+The default behavior is to report the port error counters which exceed a
+threshold for each port in the fabric. The default threshold is zero (0).
+Error fields can also be suppressed entirely.
+.sp
+In addition to reporting errors on every port, ibqueryerrors can report the
+port transmit and receive data as well as report full link information to the
+remote port if available.
+.SH OPTIONS
+.sp
+\fB\-s, \-\-suppress <err1,err2,...>\fP
+Suppress the errors listed in the comma separated list provided.
+.sp
+\fB\-c, \-\-suppress\-common\fP
+Suppress some of the common "side effect" counters. These counters usually do
+not indicate an error condition and can usually be safely ignored.
+.sp
+\fB\-r, \-\-report\-port\fP
+Report the port information. This includes LID, port, external port (if
+applicable), link speed setting, remote GUID, remote port, remote external port
+(if applicable), and remote node description information.
+.sp
+\fB\-\-data\fP
+Include the optional transmit and receive data counters.
+.sp
+\fB\-\-threshold\-file <filename>\fP
+Specify an alternate threshold file. The default is /usr/local/etc/infiniband\-diags/error_thresholds
+.sp
+\fB\-\-switch\fP print data for switches only
+.sp
+\fB\-\-ca\fP print data for CAs only
+.sp
+\fB\-\-skip\-sl\fP Use the default sl for queries. This is not recommended when
+using a QoS aware routing engine as it can cause a credit deadlock.
+.sp
+\fB\-\-router\fP print data for routers only
+.sp
+\fB\-\-clear\-errors \-k\fP Clear error counters after read.
+.sp
+\fB\-\-clear\-counts \-K\fP Clear data counters after read.
+.sp
+\fBCAUTION\fP clearing data or error counters will occur regardless of whether
+they are printed or not. See \fB\-\-counters\fP and \fB\-\-data\fP for details
+on controlling which counters are printed.
+.sp
+\fB\-\-details\fP include receive error and transmit discard details
+.sp
+\fB\-\-counters\fP print data counters only
+.SS Partial Scan flags
+.sp
+The node to start a partial scan can be specified with the following addresses.
+.\" Define the common option -G
+.
+.sp
+\fB\-\-port\-guid, \-G <port_guid>\fP Specify a port_guid
+.\" Define the common option -D for Directed routes
+.
+.sp
+\fB\-D, \-\-Direct <dr_path>\fP The address specified is a directed route
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+Examples:
+ \-D "0" # self port
+ \-D "0,1,2,1,4" # out via port 1, then 2, ...
+
+ (Note the second number in the path specified must match the port being
+ used. This can be specified using the port selection flag \(aq\-P\(aq or the
+ port found through the automatic selection process.)
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+\fBNote:\fP For switches, results are printed for all ports, not just switch port 0.
+.sp
+\fB\-S <port_guid>\fP same as "\-G". (provided only for backward compatibility)
+.SS Cache File flags
+.\" Define the common option load-cache
+.
+.sp
+\fB\-\-load\-cache <filename>\fP
+Load and use the cached ibnetdiscover data stored in the specified
+filename. May be useful for outputting and learning about other
+fabrics or a previous state of a fabric.
+.SS Port Selection flags
+.\" Define the common option -C
+.
+.sp
+\fB\-C, \-\-Ca <ca_name>\fP use the specified ca_name.
+.\" Define the common option -P
+.
+.sp
+\fB\-P, \-\-Port <ca_port>\fP use the specified ca_port.
+.\" Explanation of local port selection
+.
+.SS Local port Selection
+.sp
+Multiple port/Multiple CA support: when no IB device or port is specified
+(see the "local umad parameters" below), the libibumad library
+selects the port to use by the following criteria:
+.INDENT 0.0
+.INDENT 3.5
+.INDENT 0.0
+.IP 1. 3
+the first port that is ACTIVE.
+.IP 2. 3
+if not found, the first port that is UP (physical link up).
+.UNINDENT
+.sp
+If a port and/or CA name is specified, the libibumad library attempts
+to fulfill the user request, and will fail if it is not possible.
+.sp
+For example:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ibaddr # use the first port (criteria #1 above)
+ibaddr \-C mthca1 # pick the best port from "mthca1" only.
+ibaddr \-P 2 # use the second (active/up) port from the first available IB device.
+ibaddr \-C mthca0 \-P 2 # use the specified port only.
+.ft P +.fi +.UNINDENT +.UNINDENT +.UNINDENT +.UNINDENT +.SS Configuration flags +.\" Define the common option -z +. +.sp +\fB\-\-config, \-z <config_file>\fP Specify alternate config file. +.INDENT 0.0 +.INDENT 3.5 +Default: /usr/local/etc/infiniband\-diags/ibdiag.conf +.UNINDENT +.UNINDENT +.\" Define the common option -z +. +.INDENT 0.0 +.TP +.B \fB\-\-outstanding_smps, \-o <val>\fP +Specify the number of outstanding SMP\(aqs which should be issued during the scan +.sp +Default: 2 +.UNINDENT +.\" Define the common option --node-name-map +. +.sp +\fB\-\-node\-name\-map <node\-name\-map>\fP Specify a node name map. +.INDENT 0.0 +.INDENT 3.5 +This file maps GUIDs to more user friendly names. See FILES section. +.UNINDENT +.UNINDENT +.\" Define the common option -t +. +.sp +\fB\-t, \-\-timeout <timeout_ms>\fP override the default timeout for the solicited mads. +.\" Define the common option -y +. +.INDENT 0.0 +.TP +.B \fB\-y, \-\-m_key <key>\fP +use the specified M_key for requests. If non\-numeric value (like \(aqx\(aq) +is specified then a value will be prompted for. +.UNINDENT +.SS Debugging flags +.\" Define the common option -d +. +.INDENT 0.0 +.TP +.B \-d +raise the IB debugging level. +May be used several times (\-ddd or \-d \-d \-d). +.UNINDENT +.\" Define the common option -e +. +.INDENT 0.0 +.TP +.B \-e +show send and receive errors (timeouts and others) +.UNINDENT +.\" Define the common option -h +. +.sp +\fB\-h, \-\-help\fP show the usage message +.\" Define the common option -v +. +.INDENT 0.0 +.TP +.B \fB\-v, \-\-verbose\fP +increase the application verbosity level. +May be used several times (\-vv or \-v \-v \-v) +.UNINDENT +.\" Define the common option -V +. +.sp +\fB\-V, \-\-version\fP show the version info. +.sp +\fB\-R\fP (This option is obsolete and does nothing) +.SH EXIT STATUS +.sp +\fB\-1\fP if scan fails. +.sp +\fB0\fP if scan succeeds without errors beyond thresholds +.sp +\fB1\fP if errors are found beyond thresholds or inconsistencies are found in check mode. +.SH FILES +.SS ERROR THRESHOLD +.sp +/usr/local/etc/infiniband\-diags/error_thresholds +.sp +Define threshold values for errors. File format is simple "name=val". +Comments begin with \(aq#\(aq +.sp +\fBExample:\fP +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +# Define thresholds for error counters +SymbolErrorCounter=10 +LinkErrorRecoveryCounter=10 +VL15Dropped=100 +.ft P +.fi +.UNINDENT +.UNINDENT +.\" Common text for the config file +. +.SS CONFIG FILE +.sp +/usr/local/etc/infiniband\-diags/ibdiag.conf +.sp +A global config file is provided to set some of the common options for all +tools. See supplied config file for details. +.\" Common text to describe the node name map file. +. +.SS NODE NAME MAP FILE FORMAT +.sp +The node name map is used to specify user friendly names for nodes in the +output. GUIDs are used to perform the lookup. +.sp +This functionality is provided by the opensm\-libs package. See \fBopensm(8)\fP +for the file location for your installation. 
+.sp
\fBGenerically:\fP
.INDENT 0.0
.INDENT 3.5
.sp
.nf
.ft C
# comment
<guid> "<name>"
.ft P
.fi
.UNINDENT
.UNINDENT
.sp
\fBExample:\fP
.INDENT 0.0
.INDENT 3.5
.sp
.nf
.ft C
# IB1
# Line cards
0x0008f104003f125c "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB\-24D"
0x0008f104003f125d "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB\-24D"
0x0008f104003f10d2 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB\-24D"
0x0008f104003f10d3 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB\-24D"
0x0008f104003f10bf "IB1 (Rack 11 slot 12 ) ISR9288/ISR9096 Voltaire sLB\-24D"

# Spines
0x0008f10400400e2d "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D"
0x0008f10400400e2e "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D"
0x0008f10400400e2f "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D"
0x0008f10400400e31 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB\-12D"
0x0008f10400400e32 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB\-12D"

# GUID Node Name
0x0008f10400411a08 "SW1 (Rack 3) ISR9024 Voltaire 9024D"
0x0008f10400411a28 "SW2 (Rack 3) ISR9024 Voltaire 9024D"
0x0008f10400411a34 "SW3 (Rack 3) ISR9024 Voltaire 9024D"
0x0008f104004119d0 "SW4 (Rack 3) ISR9024 Voltaire 9024D"
.ft P
.fi
.UNINDENT
.UNINDENT
.SH AUTHOR
.INDENT 0.0
.TP
.B Ira Weiny
< \fI\%ira.weiny@intel.com\fP >
.UNINDENT
.\" Generated by docutils manpage writer.
. diff --git a/buildlib/pandoc-prebuilt/c0c239f1fb706358d4ee439f21164f7fb0662c86 b/buildlib/pandoc-prebuilt/c0c239f1fb706358d4ee439f21164f7fb0662c86 new file mode 100644 index 0000000..d03ec65 --- /dev/null +++ b/buildlib/pandoc-prebuilt/c0c239f1fb706358d4ee439f21164f7fb0662c86 @@ -0,0 +1,43 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "IBV_RATE_TO_MBPS" "3" "2012\-03\-31" "libibverbs" "Libibverbs Programmer\[aq]s Manual" +.hy +.SH NAME
.PP
ibv_rate_to_mbps \- convert IB rate enumeration to Mbit/sec
.PP
mbps_to_ibv_rate \- convert Mbit/sec to an IB rate enumeration
.SH SYNOPSIS
.IP
.nf
\f[C]
#include\ <infiniband/verbs.h>

int\ ibv_rate_to_mbps(enum\ ibv_rate\ rate);

enum\ ibv_rate\ mbps_to_ibv_rate(int\ mbps);
\f[]
.fi
.SH DESCRIPTION
.PP
\f[B]ibv_rate_to_mbps()\f[] converts the IB transmission rate
enumeration \f[I]rate\f[] to a number of Mbit/sec.
For example, if \f[I]rate\f[] is \f[B]IBV_RATE_5_GBPS\f[], the value
5000 will be returned (5 Gbit/sec = 5000 Mbit/sec).
.PP
\f[B]mbps_to_ibv_rate()\f[] converts the number of Mbit/sec
\f[I]mbps\f[] to an IB transmission rate enumeration.
For example, if \f[I]mbps\f[] is 5000, the rate enumeration
\f[B]IBV_RATE_5_GBPS\f[] will be returned.
.SH RETURN VALUE
.PP
\f[B]ibv_rate_to_mbps()\f[] returns the number of Mbit/sec.
.PP
\f[B]mbps_to_ibv_rate()\f[] returns the enumeration representing the IB
transmission rate.
.SH SEE ALSO
.PP
\f[B]ibv_query_port\f[](3)
.SH AUTHOR
.PP
Dotan Barak <dotanb@dev.mellanox.co.il> diff --git a/buildlib/pandoc-prebuilt/c10b498742b7bd02b349331d8ab6ed7a2951bbc5 b/buildlib/pandoc-prebuilt/c10b498742b7bd02b349331d8ab6ed7a2951bbc5 new file mode 100644 index 0000000..39c281a --- /dev/null +++ b/buildlib/pandoc-prebuilt/c10b498742b7bd02b349331d8ab6ed7a2951bbc5 @@ -0,0 +1,51 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "RDMA_ESTABLISH" "3" "2019\-01\-16" "librdmacm" "Librdmacm Programmer\[aq]s Manual" +.hy +.SH NAME
.PP
rdma_establish \- Complete an active connection request.
+.SH SYNOPSIS
.IP
.nf
\f[C]
#include\ <rdma/rdma_cma.h>

int\ rdma_establish(struct\ rdma_cm_id\ *id);
\f[]
.fi
.SH DESCRIPTION
.PP
\f[B]rdma_establish()\f[] acknowledges an incoming connection response
event and completes the connection establishment.
.PP
Notes:
.PP
If a QP has not been created on the rdma_cm_id, this function should be
called by the active side to complete the connection after the connect
response event has been received.
.PP
This will trigger a connection established event on the passive side.
.PP
This function should not be used on an rdma_cm_id on which a QP has been
created.
.SH ARGUMENTS
.TP
.B \f[I]id\f[]
RDMA identifier.
.RS
.RE
.SH RETURN VALUE
.PP
\f[B]rdma_establish()\f[] returns 0 on success, or \-1 on error.
If an error occurs, errno will be set to indicate the failure reason.
.SH SEE ALSO
.PP
\f[B]rdma_connect\f[](3), \f[B]rdma_disconnect\f[](3)
\f[B]rdma_get_cm_event\f[](3)
.SH AUTHORS
.PP
Danit Goldberg <danitg@mellanox.com>
.PP
Yossi Itigin <yosefe@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/c6cf51c33703f96d23549f640ab1e80205143daf b/buildlib/pandoc-prebuilt/c6cf51c33703f96d23549f640ab1e80205143daf new file mode 100644 index 0000000..6199fb3 --- /dev/null +++ b/buildlib/pandoc-prebuilt/c6cf51c33703f96d23549f640ab1e80205143daf @@ -0,0 +1,150 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "ibv_attach_counters_point_flow" "3" "2018\-04\-02" "libibverbs" "Libibverbs Programmer\[aq]s Manual" +.hy +.SH NAME
.PP
\f[B]ibv_attach_counters_point_flow\f[] \- attach individual counter
definition to a flow object
.SH SYNOPSIS
.IP
.nf
\f[C]
#include\ <infiniband/verbs.h>

int\ ibv_attach_counters_point_flow(struct\ ibv_counters\ *counters,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ ibv_counter_attach_attr\ *counter_attach_attr,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ ibv_flow\ *flow);
\f[]
.fi
.SH DESCRIPTION
.PP
The attach counters point calls are a family of APIs that attach an
individual counter description definition to a verbs object at a
specific index location.
.PP
A counters object will start collecting values after it is bound to the
verbs object resource.
.PP
A static attach can be created when NULL is provided instead of the
reference to the verbs object (e.g.: in case of flow providing NULL
instead of \f[I]flow\f[]).
In this case, this counters object will only start collecting values
after it is bound to the verbs resource; for a flow this is when the
counters handle is referenced while creating the flow with
\f[B]ibv_create_flow\f[]().
.PP
Once an ibv_counters is bound statically to a verbs resource, no
additional attach is allowed until the counters object is no longer
bound to any verbs object.
.PP
The argument counter_desc specifies which counter value should be
collected.
It is defined in verbs.h as one of the enum ibv_counter_description
options.
.PP
Support for specific counter_desc values per verbs object can be tested
by checking the return value for success or an ENOTSUP errno.
.PP
Attaching a counters handle to multiple objects of the same type will
accumulate the values into a single index.
+For example, creating several ibv_flow(s) with the same ibv_counters
handle will collect the values from all relevant flows into the relevant
index location when reading the values from \f[B]ibv_read_counters\f[]().
Setting the index more than once, with the same or a different
counter_desc, will aggregate the values from all relevant counters into
the relevant index location.
.PP
The runtime values of counters can be read from the hardware by calling
\f[B]ibv_read_counters\f[]().
.SH ARGUMENTS
.TP
.B \f[I]counters\f[]
Existing counters to attach new counter point on.
.RS
.RE
.TP
.B \f[I]counter_attach_attr\f[]
An ibv_counter_attach_attr struct, as defined in verbs.h.
.RS
.RE
.TP
.B \f[I]flow\f[]
Existing flow to attach a new counters point on (in static mode it must
be NULL).
.RS
.RE
.SS \f[I]counter_attach_attr\f[] Argument
.IP
.nf
\f[C]
struct\ ibv_counter_attach_attr\ {
\ \ \ \ enum\ ibv_counter_description\ counter_desc;
\ \ \ \ uint32_t\ index;
\ \ \ \ uint32_t\ comp_mask;
};
\f[]
.fi
.SS \f[I]counter_desc\f[] Argument
.IP
.nf
\f[C]
enum\ ibv_counter_description\ {
\ \ \ \ IBV_COUNTER_PACKETS,
\ \ \ \ IBV_COUNTER_BYTES,
};
\f[]
.fi
.TP
.B \f[I]index\f[]
Desired location of the specific counter at the counters object.
.RS
.RE
.TP
.B \f[I]comp_mask\f[]
Bitmask specifying what fields in the structure are valid.
.RS
.RE
.SH RETURN VALUE
.PP
\f[B]ibv_attach_counters_point_flow\f[]() returns 0 on success, or the
value of errno on failure (which indicates the failure reason)
.SH ERRORS
.TP
.B EINVAL
invalid argument(s) passed
.RS
.RE
.TP
.B ENOTSUP
\f[I]counter_desc\f[] is not supported on the requested object
.RS
.RE
.TP
.B EBUSY
the counters object is already bound to a flow; additional attach calls
are not allowed (valid for static attach only)
.RS
.RE
.TP
.B ENOMEM
not enough memory
.RS
.RE
.SH NOTES
.PP
Counter values in each index location are cleared upon creation when
calling \f[B]ibv_create_counters\f[]().
Attaching counters points will only increase these values accordingly.
.SH EXAMPLE
.PP
An example of the use of \f[B]ibv_attach_counters_point_flow\f[]() is
shown in \f[B]ibv_read_counters\f[](3).
.SH SEE ALSO
.PP
\f[B]ibv_create_counters\f[], \f[B]ibv_destroy_counters\f[],
\f[B]ibv_read_counters\f[], \f[B]ibv_create_flow\f[]
.SH AUTHORS
.PP
Raed Salem <raeds@mellanox.com>
.PP
Alex Rosenbaum <alexr@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/cf4e4cd11a7895e2b33c4b3e1625393ebf105452 b/buildlib/pandoc-prebuilt/cf4e4cd11a7895e2b33c4b3e1625393ebf105452 new file mode 100644 index 0000000..25b2a6c --- /dev/null +++ b/buildlib/pandoc-prebuilt/cf4e4cd11a7895e2b33c4b3e1625393ebf105452 @@ -0,0 +1,85 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "mlx5dv_create_cq" "3" "2018\-9\-1" "mlx5" "mlx5 Programmer\[aq]s Manual" +.hy +.SH NAME
.PP
mlx5dv_create_cq \- creates a completion queue (CQ)
.SH SYNOPSIS
.IP
.nf
\f[C]
#include\ <infiniband/mlx5dv.h>

struct\ ibv_cq_ex\ *mlx5dv_create_cq(struct\ ibv_context\ *context,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ ibv_cq_init_attr_ex\ *cq_attr,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ mlx5dv_cq_init_attr\ *mlx5_cq_attr);
\f[]
.fi
.SH DESCRIPTION
.PP
\f[B]mlx5dv_create_cq()\f[] creates a completion queue (CQ) with
specific driver properties.
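.PP
As a minimal sketch (\f[I]ctx\f[] is assumed to be an already\-opened
device context; error handling is omitted), a CQ that enables CQE
compression with the hash response format could be requested as follows:
.IP
.nf
\f[C]
/*\ ask\ for\ 256\ CQEs\ and\ compressed\ CQEs\ in\ hash\ format\ */
struct\ ibv_cq_init_attr_ex\ cq_attr\ =\ {\ .cqe\ =\ 256\ };
struct\ mlx5dv_cq_init_attr\ dv_attr\ =\ {
\ \ \ \ .comp_mask\ =\ MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE,
\ \ \ \ .cqe_comp_res_format\ =\ MLX5DV_CQE_RES_FORMAT_HASH,
};

struct\ ibv_cq_ex\ *cq\ =\ mlx5dv_create_cq(ctx,\ &cq_attr,\ &dv_attr);
\f[]
.fi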
+.SH ARGUMENTS
.PP
Please see the \f[B]ibv_create_cq_ex(3)\f[] man page for \f[B]context\f[]
and \f[B]cq_attr\f[].
.SS mlx5_cq_attr
.IP
.nf
\f[C]
struct\ mlx5dv_cq_init_attr\ {
\ \ \ \ uint64_t\ comp_mask;
\ \ \ \ uint8_t\ \ cqe_comp_res_format;
\ \ \ \ uint32_t\ flags;
\ \ \ \ uint16_t\ cqe_size;
};
\f[]
.fi
.TP
.B \f[I]comp_mask\f[]
Bitmask specifying what fields in the structure are valid:
.RS
.PP
MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE enables creating a CQ in a mode
in which several CQEs may be compressed into a single CQE; valid values
are given in \f[I]cqe_comp_res_format\f[]
.PP
MLX5DV_CQ_INIT_ATTR_MASK_FLAGS valid values in \f[I]flags\f[]
.PP
MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE valid values in \f[I]cqe_size\f[]
.RE
.TP
.B \f[I]cqe_comp_res_format\f[]
A bitwise OR of the various CQE response formats of the responder side:
.RS
.PP
MLX5DV_CQE_RES_FORMAT_HASH CQE compression with hash
.PP
MLX5DV_CQE_RES_FORMAT_CSUM CQE compression with RX checksum
.PP
MLX5DV_CQE_RES_FORMAT_CSUM_STRIDX CQE compression with stride index
.RE
.TP
.B \f[I]flags\f[]
A bitwise OR of the various values described below:
.RS
.PP
MLX5DV_CQ_INIT_ATTR_FLAGS_CQE_PAD create a padded 128B CQE
.RE
.TP
.B \f[I]cqe_size\f[]
configures the CQE size to be 64 or 128 bytes; other values will cause
mlx5dv_create_cq to fail.
.RS
.RE
.SH RETURN VALUE
.PP
\f[B]mlx5dv_create_cq()\f[] returns a pointer to the created CQ, or NULL
if the request fails, in which case errno is set.
.SH SEE ALSO
.PP
\f[B]ibv_create_cq_ex\f[](3)
.SH AUTHOR
.PP
Yonatan Cohen <yonatanc@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/d18ccc546cea3fc523f100e7823d52180cfdaa76 b/buildlib/pandoc-prebuilt/d18ccc546cea3fc523f100e7823d52180cfdaa76 new file mode 100644 index 0000000..54f11ea --- /dev/null +++ b/buildlib/pandoc-prebuilt/d18ccc546cea3fc523f100e7823d52180cfdaa76 @@ -0,0 +1,183 @@ +.\" Man page generated from reStructuredText.
.
.TH IBSYSSTAT 8 "2017-08-21" "" "Open IB Diagnostics"
.SH NAME
ibsysstat \- system status on an InfiniBand address
.
.nr rst2man-indent-level 0
.
.de1 rstReportMargin
\\$1 \\n[an-margin]
level \\n[rst2man-indent-level]
level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
-
\\n[rst2man-indent0]
\\n[rst2man-indent1]
\\n[rst2man-indent2]
..
.de1 INDENT
.\" .rstReportMargin pre:
. RS \\$1
. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
. nr rst2man-indent-level +1
.\" .rstReportMargin post:
..
.de UNINDENT
. RE
.\" indent \\n[an-margin]
.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
.nr rst2man-indent-level -1
.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
..
.SH SYNOPSIS
.sp
ibsysstat [options] <dest lid | guid> [<op>]
.SH DESCRIPTION
.sp
ibsysstat uses vendor mads to validate connectivity between IB nodes
and obtain other information about the IB node. ibsysstat is run as
client/server. The default is to run as a client.
.SH OPTIONS
.sp
Currently supported operations:
.INDENT 0.0
.INDENT 3.5
.sp
.nf
.ft C
ping \e\- verify connectivity to server (default)
host \e\- obtain host information from server
cpu \e\- obtain cpu information from server
.ft P
.fi
.UNINDENT
.UNINDENT
.INDENT 0.0
.TP
.B \fB\-o, \-\-oui\fP
use specified OUI number to multiplex vendor mads
.TP
.B \fB\-S, \-\-Server\fP
start in server mode (do not return)
.UNINDENT
.SS Addressing Flags
.\" Define the common option -G
.
+.sp +\fB\-G, \-\-Guid\fP The address specified is a Port GUID +.\" Define the common option -L +. +.sp +\fB\-L, \-\-Lid\fP The address specified is a LID +.\" Define the common option -s +. +.sp +\fB\-s, \-\-sm_port <smlid>\fP use \(aqsmlid\(aq as the target lid for SA queries. +.SS Port Selection flags +.\" Define the common option -C +. +.sp +\fB\-C, \-\-Ca <ca_name>\fP use the specified ca_name. +.\" Define the common option -P +. +.sp +\fB\-P, \-\-Port <ca_port>\fP use the specified ca_port. +.\" Explanation of local port selection +. +.SS Local port Selection +.sp +Multiple port/Multiple CA support: when no IB device or port is specified +(see the "local umad parameters" below), the libibumad library +selects the port to use by the following criteria: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.IP 1. 3 +the first port that is ACTIVE. +.IP 2. 3 +if not found, the first port that is UP (physical link up). +.UNINDENT +.sp +If a port and/or CA name is specified, the libibumad library attempts +to fulfill the user request, and will fail if it is not possible. +.sp +For example: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +ibaddr # use the first port (criteria #1 above) +ibaddr \-C mthca1 # pick the best port from "mthca1" only. +ibaddr \-P 2 # use the second (active/up) port from the first available IB device. +ibaddr \-C mthca0 \-P 2 # use the specified port only. +.ft P +.fi +.UNINDENT +.UNINDENT +.UNINDENT +.UNINDENT +.SS Debugging flags +.\" Define the common option -d +. +.INDENT 0.0 +.TP +.B \-d +raise the IB debugging level. +May be used several times (\-ddd or \-d \-d \-d). +.UNINDENT +.\" Define the common option -e +. +.INDENT 0.0 +.TP +.B \-e +show send and receive errors (timeouts and others) +.UNINDENT +.\" Define the common option -h +. +.sp +\fB\-h, \-\-help\fP show the usage message +.\" Define the common option -v +. +.INDENT 0.0 +.TP +.B \fB\-v, \-\-verbose\fP +increase the application verbosity level. +May be used several times (\-vv or \-v \-v \-v) +.UNINDENT +.\" Define the common option -V +. +.sp +\fB\-V, \-\-version\fP show the version info. +.SS Configuration flags +.\" Define the common option -t +. +.sp +\fB\-t, \-\-timeout <timeout_ms>\fP override the default timeout for the solicited mads. +.\" Define the common option -z +. +.sp +\fB\-\-config, \-z <config_file>\fP Specify alternate config file. +.INDENT 0.0 +.INDENT 3.5 +Default: /usr/local/etc/infiniband\-diags/ibdiag.conf +.UNINDENT +.UNINDENT +.SH FILES +.\" Common text for the config file +. +.SS CONFIG FILE +.sp +/usr/local/etc/infiniband\-diags/ibdiag.conf +.sp +A global config file is provided to set some of the common options for all +tools. See supplied config file for details. +.SH AUTHOR +.INDENT 0.0 +.TP +.B Hal Rosenstock +< \fI\%halr@voltaire.com\fP > +.UNINDENT +.\" Generated by docutils manpage writer. +. 
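.SH EXAMPLES
.sp
For instance (the lid value 8 below is illustrative only):
.INDENT 0.0
.INDENT 3.5
.sp
.nf
.ft C
ibsysstat \-S      # on the remote node: start in server mode
ibsysstat 8       # ping the server at lid 8 (default op)
ibsysstat 8 cpu   # obtain cpu information from the server at lid 8
.ft P
.fi
.UNINDENT
.UNINDENT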
diff --git a/buildlib/pandoc-prebuilt/d5c7e7b0425b7c207ee41b58a93e749b88d7afee b/buildlib/pandoc-prebuilt/d5c7e7b0425b7c207ee41b58a93e749b88d7afee new file mode 100644 index 0000000..1a3a154 --- /dev/null +++ b/buildlib/pandoc-prebuilt/d5c7e7b0425b7c207ee41b58a93e749b88d7afee @@ -0,0 +1,115 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "mlx5dv_create_flow_matcher" "3" "2018\-9\-19" "mlx5" "mlx5 Programmer\[aq]s Manual" +.hy +.SH NAME
.PP
mlx5dv_create_flow_matcher \- creates a matcher to be used with
\f[I]mlx5dv_create_flow(3)\f[]
.SH SYNOPSIS
.IP
.nf
\f[C]
#include\ <infiniband/mlx5dv.h>

struct\ mlx5dv_flow_matcher\ *
mlx5dv_create_flow_matcher(struct\ ibv_context\ *context,
\ \ \ \ \ \ \ \ \ \ \ struct\ mlx5dv_flow_matcher_attr\ *attr)
\f[]
.fi
.SH DESCRIPTION
.PP
\f[B]mlx5dv_create_flow_matcher()\f[] creates a flow matcher (mask) to
be used with \f[I]mlx5dv_create_flow(3)\f[].
.SH ARGUMENTS
.PP
Please see \f[I]ibv_open_device(3)\f[] for \f[I]context\f[].
.SS \f[I]attr\f[]
.IP
.nf
\f[C]
struct\ mlx5dv_flow_matcher_attr\ {
\ \ \ \ enum\ ibv_flow_attr_type\ type;
\ \ \ \ uint32_t\ flags;\ /*\ From\ enum\ ibv_flow_flags\ */
\ \ \ \ uint16_t\ priority;
\ \ \ \ uint8_t\ match_criteria_enable;\ /*\ Device\ spec\ format\ */
\ \ \ \ struct\ mlx5dv_flow_match_parameters\ *match_mask;
\ \ \ \ uint64_t\ comp_mask;
\ \ \ \ enum\ mlx5dv_flow_table_type\ ft_type;
};
\f[]
.fi
.TP
.B \f[I]type\f[]
Type of matcher to be created: IBV_FLOW_ATTR_NORMAL: Normal rule
according to specification.
.RS
.RE
.TP
.B \f[I]flags\f[]
Special flags to control the rule: a zero value means the matcher will
store ingress flow rules.
IBV_FLOW_ATTR_FLAGS_EGRESS: Specifies that this matcher will store
egress flow rules.
.RS
.RE
.TP
.B \f[I]priority\f[]
See \f[I]ibv_create_flow(3)\f[].
.RS
.RE
.TP
.B \f[I]match_criteria_enable\f[]
Which match criteria are configured in \f[I]match_mask\f[], passed in
device spec format.
.RS
.RE
.SS \f[I]match_mask\f[]
.IP
.nf
\f[C]
struct\ mlx5dv_flow_match_parameters\ {
\ \ \ \ size_t\ match_sz;
\ \ \ \ uint64_t\ match_buf[];\ /*\ Device\ spec\ format\ */
};
\f[]
.fi
.TP
.B \f[I]match_sz\f[]
Size in bytes of \f[I]match_buf\f[].
.RS
.RE
.TP
.B \f[I]match_buf\f[]
Set which mask to be used, passed in device spec format.
.RS
.RE
.TP
.B \f[I]comp_mask\f[]
MLX5DV_FLOW_MATCHER_MASK_FT_TYPE for \f[I]ft_type\f[]
.RS
.RE
.SS \f[I]ft_type\f[]
.PP
Specifies the flow table type in which the matcher will store the flow
rules: MLX5DV_FLOW_TABLE_TYPE_NIC_RX: Specifies that this matcher will
store ingress flow rules.
MLX5DV_FLOW_TABLE_TYPE_NIC_TX: Specifies that this matcher will store
egress flow rules.
MLX5DV_FLOW_TABLE_TYPE_FDB: Specifies that this matcher will store FDB
rules.
MLX5DV_FLOW_TABLE_TYPE_RDMA_RX: Specifies that this matcher will store
ingress RDMA flow rules.
MLX5DV_FLOW_TABLE_TYPE_RDMA_TX: Specifies that this matcher will store
egress RDMA flow rules.
.SH RETURN VALUE
.PP
\f[B]mlx5dv_create_flow_matcher\f[] returns a pointer to
\f[I]mlx5dv_flow_matcher\f[]; on error NULL is returned and errno is
set.
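.SH EXAMPLE
.PP
A minimal sketch (\f[I]ctx\f[] is an already\-opened device context; the
64 byte mask size and the match_criteria_enable value are illustrative
only, since both depend on the device spec format; error handling is
omitted):
.IP
.nf
\f[C]
size_t\ buf_sz\ =\ 64;\ /*\ illustrative\ mask\ buffer\ size\ */
struct\ mlx5dv_flow_match_parameters\ *mask;

mask\ =\ calloc(1,\ sizeof(*mask)\ +\ buf_sz);
mask\->match_sz\ =\ buf_sz;
/*\ fill\ mask\->match_buf\ here,\ in\ device\ spec\ format\ */

struct\ mlx5dv_flow_matcher_attr\ attr\ =\ {
\ \ \ \ .type\ =\ IBV_FLOW_ATTR_NORMAL,
\ \ \ \ .match_criteria_enable\ =\ 1,\ /*\ device\ spec\ format\ */
\ \ \ \ .match_mask\ =\ mask,
};

struct\ mlx5dv_flow_matcher\ *matcher\ =
\ \ \ \ mlx5dv_create_flow_matcher(ctx,\ &attr);
\f[]
.fi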
+.SH SEE ALSO +.PP +\f[I]ibv_open_device(3)\f[], \f[I]ibv_create_flow(3)\f[] +.SH AUTHOR +.PP +Mark Bloch <markb@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/dbbc4d103d1e8637d22d6bb9c313c0cf978f0007 b/buildlib/pandoc-prebuilt/dbbc4d103d1e8637d22d6bb9c313c0cf978f0007 new file mode 100644 index 0000000..0b9cd73 --- /dev/null +++ b/buildlib/pandoc-prebuilt/dbbc4d103d1e8637d22d6bb9c313c0cf978f0007 @@ -0,0 +1,259 @@ +.\" Man page generated from reStructuredText. +. +.TH IBPORTSTATE 8 "2013-03-26" "" "Open IB Diagnostics" +.SH NAME +IBPORTSTATE \- handle port (physical) state and link speed of an InfiniBand port +. +.nr rst2man-indent-level 0 +. +.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. +.SH SYNOPSIS +.sp +ibportstate [options] <dest dr_path|lid|guid> <portnum> [<op>] +.SH DESCRIPTION +.sp +ibportstate allows the port state and port physical state of an IB port +to be queried (in addition to link width and speed being validated +relative to the peer port when the port queried is a switch port), +or a switch port to be disabled, enabled, or reset. It +also allows the link speed/width enabled on any IB port to be adjusted. +.SH OPTIONS +.INDENT 0.0 +.TP +.B \fB<op>\fP +.INDENT 7.0 +.TP +.B Supported ops: enable, disable, reset, speed, espeed, fdr10, width, query, +on, off, down, arm, active, vls, mtu, lid, smlid, lmc, +mkey, mkeylease, mkeyprot +(Default is query) +.UNINDENT +.sp +\fBenable, disable, and reset\fP are only allowed on switch ports (An +error is indicated if attempted on CA or router ports) +.sp +\fBoff\fP change the port state to disable. +.sp +\fBon\fP change the port state to enable(only when the current state is disable). +.sp +\fBspeed and width\fP are allowed on any port +.sp +\fBspeed\fP values are the legal values for PortInfo:LinkSpeedEnabled (An +error is indicated if PortInfo:LinkSpeedSupported does not support this +setting) +.sp +\fBespeed\fP is allowed on any port supporting extended link speeds +.sp +\fBfdr10\fP is allowed on any port supporting fdr10 (An error is +indicated if port\(aqs capability mask indicates extended link speeds are +not supported or if PortInfo:LinkSpeedExtSupported does not support +this setting) +.sp +\fBwidth\fP values are legal values for PortInfo:LinkWidthEnabled (An +error is indicated if PortInfo:LinkWidthSupported does not support this +setting) (NOTE: Speed and width changes are not effected until the port +goes through link renegotiation) +.sp +\fBquery\fP also validates port characteristics (link width, speed, +espeed, and fdr10) based on the peer port. This checking is done when +the port queried is a switch port as it relies on combined routing (an +initial LID route with directed routing to the peer) which can only be +done on a switch. This peer port validation feature of query op +requires LID routing to be functioning in the subnet. 
+.sp +\fBmkey, mkeylease, and mkeyprot\fP are only allowed on CAs, routers, or +switch port 0 (An error is generated if attempted on external switch +ports). Hexadecimal and octal mkeys may be specified by prepending the +key with \(aq0x\(aq or \(aq0\(aq, respectively. If a non\-numeric value (like \(aqx\(aq) +is specified for the mkey, then ibportstate will prompt for a value. +.UNINDENT +.SS Addressing Flags +.\" Define the common option -L +. +.sp +\fB\-L, \-\-Lid\fP The address specified is a LID +.\" Define the common option -G +. +.sp +\fB\-G, \-\-Guid\fP The address specified is a Port GUID +.\" Define the common option -D for Directed routes +. +.sp +\fB\-D, \-\-Direct\fP The address specified is a directed route +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +Examples: + [options] \-D [options] "0" # self port + [options] \-D [options] "0,1,2,1,4" # out via port 1, then 2, ... + + (Note the second number in the path specified must match the port being + used. This can be specified using the port selection flag \(aq\-P\(aq or the + port found through the automatic selection process.) +.ft P +.fi +.UNINDENT +.UNINDENT +.\" Define the common option -s +. +.sp +\fB\-s, \-\-sm_port <smlid>\fP use \(aqsmlid\(aq as the target lid for SA queries. +.SS Port Selection flags +.\" Define the common option -C +. +.sp +\fB\-C, \-\-Ca <ca_name>\fP use the specified ca_name. +.\" Define the common option -P +. +.sp +\fB\-P, \-\-Port <ca_port>\fP use the specified ca_port. +.\" Explanation of local port selection +. +.SS Local port Selection +.sp +Multiple port/Multiple CA support: when no IB device or port is specified +(see the "local umad parameters" below), the libibumad library +selects the port to use by the following criteria: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.IP 1. 3 +the first port that is ACTIVE. +.IP 2. 3 +if not found, the first port that is UP (physical link up). +.UNINDENT +.sp +If a port and/or CA name is specified, the libibumad library attempts +to fulfill the user request, and will fail if it is not possible. +.sp +For example: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +ibaddr # use the first port (criteria #1 above) +ibaddr \-C mthca1 # pick the best port from "mthca1" only. +ibaddr \-P 2 # use the second (active/up) port from the first available IB device. +ibaddr \-C mthca0 \-P 2 # use the specified port only. +.ft P +.fi +.UNINDENT +.UNINDENT +.UNINDENT +.UNINDENT +.SS Configuration flags +.\" Define the common option -z +. +.sp +\fB\-\-config, \-z <config_file>\fP Specify alternate config file. +.INDENT 0.0 +.INDENT 3.5 +Default: /usr/local/etc/infiniband\-diags/ibdiag.conf +.UNINDENT +.UNINDENT +.\" Define the common option -t +. +.sp +\fB\-t, \-\-timeout <timeout_ms>\fP override the default timeout for the solicited mads. +.\" Define the common option -y +. +.INDENT 0.0 +.TP +.B \fB\-y, \-\-m_key <key>\fP +use the specified M_key for requests. If non\-numeric value (like \(aqx\(aq) +is specified then a value will be prompted for. +.UNINDENT +.SS Debugging flags +.\" Define the common option -h +. +.sp +\fB\-h, \-\-help\fP show the usage message +.\" Define the common option -d +. +.INDENT 0.0 +.TP +.B \-d +raise the IB debugging level. +May be used several times (\-ddd or \-d \-d \-d). +.UNINDENT +.\" Define the common option -e +. +.INDENT 0.0 +.TP +.B \-e +show send and receive errors (timeouts and others) +.UNINDENT +.\" Define the common option -K +. +.INDENT 0.0 +.TP +.B \fB\-K, \-\-show_keys\fP +show security keys (mkey, smkey, etc.) associated with the request. 
+.UNINDENT +.\" Define the common option -v +. +.INDENT 0.0 +.TP +.B \fB\-v, \-\-verbose\fP +increase the application verbosity level. +May be used several times (\-vv or \-v \-v \-v) +.UNINDENT +.\" Define the common option -V +. +.sp +\fB\-V, \-\-version\fP show the version info. +.SH FILES +.\" Common text for the config file +. +.SS CONFIG FILE +.sp +/usr/local/etc/infiniband\-diags/ibdiag.conf +.sp +A global config file is provided to set some of the common options for all +tools. See supplied config file for details. +.SH EXAMPLES +.INDENT 0.0 +.TP +.B :: +ibportstate 3 1 disable # by lid +ibportstate \-G 0x2C9000100D051 1 enable # by guid +ibportstate \-D 0 1 # (query) by direct route +ibportstate 3 1 reset # by lid +ibportstate 3 1 speed 1 # by lid +ibportstate 3 1 width 1 # by lid +ibportstate \-D 0 1 lid 0x1234 arm # by direct route +.UNINDENT +.SH AUTHOR +.INDENT 0.0 +.TP +.B Hal Rosenstock +< \fI\%hal.rosenstock@gmail.com\fP > +.UNINDENT +.\" Generated by docutils manpage writer. +. diff --git a/buildlib/pandoc-prebuilt/e3173caa05e72ddab52839bf6377dc30c0d34b0f b/buildlib/pandoc-prebuilt/e3173caa05e72ddab52839bf6377dc30c0d34b0f new file mode 100644 index 0000000..8c7f795 --- /dev/null +++ b/buildlib/pandoc-prebuilt/e3173caa05e72ddab52839bf6377dc30c0d34b0f @@ -0,0 +1,399 @@ +.\" Man page generated from reStructuredText. +. +.TH IBNETDISCOVER 8 "2013-06-22" "" "Open IB Diagnostics" +.SH NAME +IBNETDISCOVER \- discover InfiniBand topology +. +.nr rst2man-indent-level 0 +. +.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. +.SH SYNOPSIS +.sp +ibnetdiscover [options] [<topology\-file>] +.SH DESCRIPTION +.sp +ibnetdiscover performs IB subnet discovery and outputs a human readable +topology file. GUIDs, node types, and port numbers are displayed +as well as port LIDs and NodeDescriptions. All nodes (and links) are displayed +(full topology). Optionally, this utility can be used to list the current +connected nodes by nodetype. The output is printed to standard output +unless a topology file is specified. +.SH OPTIONS +.sp +\fB\-l, \-\-list\fP +List of connected nodes +.sp +\fB\-g, \-\-grouping\fP +Show grouping. Grouping correlates IB nodes by different vendor specific +schemes. It may also show the switch external ports correspondence. +.sp +\fB\-H, \-\-Hca_list\fP +List of connected CAs +.sp +\fB\-S, \-\-Switch_list\fP +List of connected switches +.sp +\fB\-R, \-\-Router_list\fP +List of connected routers +.sp +\fB\-s, \-\-show\fP +Show progress information during discovery. +.sp +\fB\-f, \-\-full\fP +Show full information (ports\(aq speed and width, vlcap) +.sp +\fB\-p, \-\-ports\fP +Obtain a ports report which is a +list of connected ports with relevant information (like LID, portnum, +GUID, width, speed, and NodeDescription). +.sp +\fB\-m, \-\-max_hops\fP +Report max hops discovered. +.\" Define the common option -z +. 
+.INDENT 0.0 +.TP +.B \fB\-\-outstanding_smps, \-o <val>\fP +Specify the number of outstanding SMP\(aqs which should be issued during the scan +.sp +Default: 2 +.UNINDENT +.SS Cache File flags +.\" Define the common option cache +. +.sp +\fB\-\-cache <filename>\fP +Cache the ibnetdiscover network data in the specified filename. This +cache may be used by other tools for later analysis. +.\" Define the common option load-cache +. +.sp +\fB\-\-load\-cache <filename>\fP +Load and use the cached ibnetdiscover data stored in the specified +filename. May be useful for outputting and learning about other +fabrics or a previous state of a fabric. +.\" Define the common option diff +. +.sp +\fB\-\-diff <filename>\fP +Load cached ibnetdiscover data and do a diff comparison to the current +network or another cache. A special diff output for ibnetdiscover +output will be displayed showing differences between the old and current +fabric. By default, the following are compared for differences: switches, +channel adapters, routers, and port connections. +.\" Define the common option diffcheck +. +.sp +\fB\-\-diffcheck <key(s)>\fP +Specify what diff checks should be done in the \fB\-\-diff\fP option above. +Comma separate multiple diff check key(s). The available diff checks +are: \fBsw = switches\fP, \fBca = channel adapters\fP, \fBrouter\fP = routers, +\fBport\fP = port connections, \fBlid\fP = lids, \fBnodedesc\fP = node +descriptions. Note that \fBport\fP, \fBlid\fP, and \fBnodedesc\fP are +checked only for the node types that are specified (e.g. \fBsw\fP, +\fBca\fP, \fBrouter\fP). If \fBport\fP is specified alongside \fBlid\fP +or \fBnodedesc\fP, remote port lids and node descriptions will also be compared. +.SS Port Selection flags +.\" Define the common option -C +. +.sp +\fB\-C, \-\-Ca <ca_name>\fP use the specified ca_name. +.\" Define the common option -P +. +.sp +\fB\-P, \-\-Port <ca_port>\fP use the specified ca_port. +.\" Explanation of local port selection +. +.SS Local port Selection +.sp +Multiple port/Multiple CA support: when no IB device or port is specified +(see the "local umad parameters" below), the libibumad library +selects the port to use by the following criteria: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.IP 1. 3 +the first port that is ACTIVE. +.IP 2. 3 +if not found, the first port that is UP (physical link up). +.UNINDENT +.sp +If a port and/or CA name is specified, the libibumad library attempts +to fulfill the user request, and will fail if it is not possible. +.sp +For example: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +ibaddr # use the first port (criteria #1 above) +ibaddr \-C mthca1 # pick the best port from "mthca1" only. +ibaddr \-P 2 # use the second (active/up) port from the first available IB device. +ibaddr \-C mthca0 \-P 2 # use the specified port only. +.ft P +.fi +.UNINDENT +.UNINDENT +.UNINDENT +.UNINDENT +.SS Configuration flags +.\" Define the common option -z +. +.sp +\fB\-\-config, \-z <config_file>\fP Specify alternate config file. +.INDENT 0.0 +.INDENT 3.5 +Default: /usr/local/etc/infiniband\-diags/ibdiag.conf +.UNINDENT +.UNINDENT +.\" Define the common option -z +. +.INDENT 0.0 +.TP +.B \fB\-\-outstanding_smps, \-o <val>\fP +Specify the number of outstanding SMP\(aqs which should be issued during the scan +.sp +Default: 2 +.UNINDENT +.\" Define the common option --node-name-map +. +.sp +\fB\-\-node\-name\-map <node\-name\-map>\fP Specify a node name map. +.INDENT 0.0 +.INDENT 3.5 +This file maps GUIDs to more user friendly names. See FILES section. 
+.UNINDENT
.UNINDENT
.\" Define the common option -t
.
.sp
\fB\-t, \-\-timeout <timeout_ms>\fP override the default timeout for the solicited mads.
.\" Define the common option -y
.
.INDENT 0.0
.TP
.B \fB\-y, \-\-m_key <key>\fP
use the specified M_key for requests. If non\-numeric value (like \(aqx\(aq)
is specified then a value will be prompted for.
.UNINDENT
.SS Debugging flags
.\" Define the common option -d
.
.INDENT 0.0
.TP
.B \-d
raise the IB debugging level.
May be used several times (\-ddd or \-d \-d \-d).
.UNINDENT
.\" Define the common option -e
.
.INDENT 0.0
.TP
.B \-e
show send and receive errors (timeouts and others)
.UNINDENT
.\" Define the common option -h
.
.sp
\fB\-h, \-\-help\fP show the usage message
.\" Define the common option -v
.
.INDENT 0.0
.TP
.B \fB\-v, \-\-verbose\fP
increase the application verbosity level.
May be used several times (\-vv or \-v \-v \-v)
.UNINDENT
.\" Define the common option -V
.
.sp
\fB\-V, \-\-version\fP show the version info.
.SH FILES
.\" Common text for the config file
.
.SS CONFIG FILE
.sp
/usr/local/etc/infiniband\-diags/ibdiag.conf
.sp
A global config file is provided to set some of the common options for all
tools. See supplied config file for details.
.\" Common text to describe the node name map file.
.
.SS NODE NAME MAP FILE FORMAT
.sp
The node name map is used to specify user friendly names for nodes in the
output. GUIDs are used to perform the lookup.
.sp
This functionality is provided by the opensm\-libs package. See \fBopensm(8)\fP
for the file location for your installation.
.sp
\fBGenerically:\fP
.INDENT 0.0
.INDENT 3.5
.sp
.nf
.ft C
# comment
<guid> "<name>"
.ft P
.fi
.UNINDENT
.UNINDENT
.sp
\fBExample:\fP
.INDENT 0.0
.INDENT 3.5
.sp
.nf
.ft C
# IB1
# Line cards
0x0008f104003f125c "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB\-24D"
0x0008f104003f125d "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB\-24D"
0x0008f104003f10d2 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB\-24D"
0x0008f104003f10d3 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB\-24D"
0x0008f104003f10bf "IB1 (Rack 11 slot 12 ) ISR9288/ISR9096 Voltaire sLB\-24D"

# Spines
0x0008f10400400e2d "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D"
0x0008f10400400e2e "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D"
0x0008f10400400e2f "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D"
0x0008f10400400e31 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB\-12D"
0x0008f10400400e32 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB\-12D"

# GUID Node Name
0x0008f10400411a08 "SW1 (Rack 3) ISR9024 Voltaire 9024D"
0x0008f10400411a28 "SW2 (Rack 3) ISR9024 Voltaire 9024D"
0x0008f10400411a34 "SW3 (Rack 3) ISR9024 Voltaire 9024D"
0x0008f104004119d0 "SW4 (Rack 3) ISR9024 Voltaire 9024D"
.ft P
.fi
.UNINDENT
.UNINDENT
.\" Common text to describe the Topology file.
.
.SS TOPOLOGY FILE FORMAT
.sp
The topology file format is human readable and largely intuitive.
Most identifiers are given textual names like vendor ID (vendid), device ID
(devid), GUIDs of various types (sysimgguid, caguid, switchguid, etc.).
PortGUIDs are shown in parentheses (). For switches, this is shown on the
switchguid line. For CA and router ports, it is shown on the connectivity
lines. The IB node is identified, followed by the number of ports and a quoted
node GUID. On the right of this line is a comment (#) followed by the
NodeDescription in quotes.
If the node is a switch, this line also contains
whether switch port 0 is base or enhanced, and the LID and LMC of port 0.
Subsequent lines pertaining to this node show the connectivity. On the
left is the port number of the current node. On the right is the peer node
(node at other end of link). It is identified in quotes by nodetype,
followed by \-, followed by NodeGUID, with the port number in square brackets.
Further on the right is a comment (#). What follows the comment is
dependent on the node type. If it is a switch node, it is followed by
the NodeDescription in quotes and the LID of the peer node. If it is a
CA or router node, it is followed by the local LID and LMC and then by
the NodeDescription in quotes and the LID of the peer node.
The active link width and speed are then appended to the end of this
output line.
.sp
An example of this is:
.INDENT 0.0
.INDENT 3.5
.sp
.nf
.ft C
#
# Topology file: generated on Tue Jun 5 14:15:10 2007
#
# Max of 3 hops discovered
# Initiated from node 0008f10403960558 port 0008f10403960559

Non\-Chassis Nodes

vendid=0x8f1
devid=0x5a06
sysimgguid=0x5442ba00003000
switchguid=0x5442ba00003080(5442ba00003080)
Switch 24 "S\-005442ba00003080" # "ISR9024 Voltaire" base port 0 lid 6 lmc 0
[22] "H\-0008f10403961354"[1](8f10403961355) # "MT23108 InfiniHost Mellanox Technologies" lid 4 4xSDR
[10] "S\-0008f10400410015"[1] # "SW\-6IB4 Voltaire" lid 3 4xSDR
[8] "H\-0008f10403960558"[2](8f1040396055a) # "MT23108 InfiniHost Mellanox Technologies" lid 14 4xSDR
[6] "S\-0008f10400410015"[3] # "SW\-6IB4 Voltaire" lid 3 4xSDR
[12] "H\-0008f10403960558"[1](8f10403960559) # "MT23108 InfiniHost Mellanox Technologies" lid 10 4xSDR

vendid=0x8f1
devid=0x5a05
switchguid=0x8f10400410015(8f10400410015)
Switch 8 "S\-0008f10400410015" # "SW\-6IB4 Voltaire" base port 0 lid 3 lmc 0
[6] "H\-0008f10403960984"[1](8f10403960985) # "MT23108 InfiniHost Mellanox Technologies" lid 16 4xSDR
[4] "H\-005442b100004900"[1](5442b100004901) # "MT23108 InfiniHost Mellanox Technologies" lid 12 4xSDR
[1] "S\-005442ba00003080"[10] # "ISR9024 Voltaire" lid 6 1xSDR
[3] "S\-005442ba00003080"[6] # "ISR9024 Voltaire" lid 6 4xSDR

vendid=0x2c9
devid=0x5a44
caguid=0x8f10403960984
Ca 2 "H\-0008f10403960984" # "MT23108 InfiniHost Mellanox Technologies"
[1](8f10403960985) "S\-0008f10400410015"[6] # lid 16 lmc 1 "SW\-6IB4 Voltaire" lid 3 4xSDR

vendid=0x2c9
devid=0x5a44
caguid=0x5442b100004900
Ca 2 "H\-005442b100004900" # "MT23108 InfiniHost Mellanox Technologies"
[1](5442b100004901) "S\-0008f10400410015"[4] # lid 12 lmc 1 "SW\-6IB4 Voltaire" lid 3 4xSDR

vendid=0x2c9
devid=0x5a44
caguid=0x8f10403961354
Ca 2 "H\-0008f10403961354" # "MT23108 InfiniHost Mellanox Technologies"
[1](8f10403961355) "S\-005442ba00003080"[22] # lid 4 lmc 1 "ISR9024 Voltaire" lid 6 4xSDR

vendid=0x2c9
devid=0x5a44
caguid=0x8f10403960558
Ca 2 "H\-0008f10403960558" # "MT23108 InfiniHost Mellanox Technologies"
[2](8f1040396055a) "S\-005442ba00003080"[8] # lid 14 lmc 1 "ISR9024 Voltaire" lid 6 4xSDR
[1](8f10403960559) "S\-005442ba00003080"[12] # lid 10 lmc 1 "ISR9024 Voltaire" lid 6 1xSDR
.ft P
.fi
.UNINDENT
.UNINDENT
.sp
When grouping is used, IB nodes are organized into chassis which are
numbered. Nodes which cannot be determined to be in a chassis are
displayed as "Non\-Chassis Nodes". External ports are also shown on the
connectivity lines.
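.SH EXAMPLES
.sp
Typical invocations (the cache file name is an arbitrary example):
.INDENT 0.0
.INDENT 3.5
.sp
.nf
.ft C
ibnetdiscover                     # discover the subnet and print the full topology
ibnetdiscover \-p                  # print a report of connected ports
ibnetdiscover \-\-cache /tmp/ibnet  # cache the discovery data for later analysis
ibnetdiscover \-\-diff /tmp/ibnet   # compare the current fabric to the cached data
.ft P
.fi
.UNINDENT
.UNINDENT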
+.SH AUTHORS
.INDENT 0.0
.TP
.B Hal Rosenstock
< \fI\%halr@voltaire.com\fP >
.TP
.B Ira Weiny
< \fI\%ira.weiny@intel.com\fP >
.UNINDENT
.\" Generated by docutils manpage writer.
. diff --git a/buildlib/pandoc-prebuilt/e44e94a238c3c63d976a79adb52e34fb24140a85 b/buildlib/pandoc-prebuilt/e44e94a238c3c63d976a79adb52e34fb24140a85 new file mode 100644 index 0000000..f1b5250 --- /dev/null +++ b/buildlib/pandoc-prebuilt/e44e94a238c3c63d976a79adb52e34fb24140a85 @@ -0,0 +1,87 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "IBV_REREG_MR" "3" "2016\-03\-13" "libibverbs" "Libibverbs Programmer\[aq]s Manual" +.hy +.SH NAME
.PP
ibv_rereg_mr \- re\-register a memory region (MR)
.SH SYNOPSIS
.IP
.nf
\f[C]
#include\ <infiniband/verbs.h>

int\ ibv_rereg_mr(struct\ ibv_mr\ *mr,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ int\ flags,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ ibv_pd\ *pd,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *addr,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ size_t\ length,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ int\ access);
\f[]
.fi
.SH DESCRIPTION
.PP
\f[B]ibv_rereg_mr()\f[] modifies the attributes of an existing memory
region (MR) \f[I]mr\f[].
Conceptually, this call performs a deregister memory region operation
followed by a register memory region operation.
Where possible, resources are reused instead of deallocated and
reallocated.
.PP
\f[I]flags\f[] is a bit\-mask used to indicate which of the following
properties of the memory region are being modified.
Flags should be a combination (bit field) of:
.TP
.B \f[B]IBV_REREG_MR_CHANGE_TRANSLATION \f[]
Change translation (location and length)
.RS
.RE
.TP
.B \f[B]IBV_REREG_MR_CHANGE_PD \f[]
Change protection domain
.RS
.RE
.TP
.B \f[B]IBV_REREG_MR_CHANGE_ACCESS \f[]
Change access flags
.RS
.RE
.PP
When \f[B]IBV_REREG_MR_CHANGE_PD\f[] is used, \f[I]pd\f[] represents the
new PD this MR should be registered to.
.PP
When \f[B]IBV_REREG_MR_CHANGE_TRANSLATION\f[] is used, \f[I]addr\f[]
represents the virtual address (user\-space pointer) of the new MR,
while \f[I]length\f[] represents its length.
.PP
The access and other flags are represented in the field \f[I]access\f[].
This field describes the desired memory protection attributes; it is
either 0 or the bitwise OR of one or more of ibv_access_flags.
.SH RETURN VALUE
.PP
\f[B]ibv_rereg_mr()\f[] returns 0 on success; otherwise an error has
occurred and \f[I]enum ibv_rereg_mr_err_code\f[] describes the error, as
listed below.
.PP
IBV_REREG_MR_ERR_INPUT \- Old MR is valid, an input error was detected
by libibverbs.
.PP
IBV_REREG_MR_ERR_DONT_FORK_NEW \- Old MR is valid, failed via don\[aq]t
fork on new address range.
.PP
IBV_REREG_MR_ERR_DO_FORK_OLD \- New MR is valid, failed via do fork on
old address range.
.PP
IBV_REREG_MR_ERR_CMD \- MR shouldn\[aq]t be used, command error.
.PP
IBV_REREG_MR_ERR_CMD_AND_DO_FORK_NEW \- MR shouldn\[aq]t be used,
command error, invalid fork state on new address range.
.SH NOTES
.PP
Even on a failure, the user still needs to call ibv_dereg_mr on this MR.
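.SH EXAMPLE
.PP
A minimal sketch (assuming \f[I]mr\f[] came from an earlier
\f[B]ibv_reg_mr\f[]() call and \f[I]new_buf\f[]/\f[I]len\f[] describe an
already\-allocated replacement buffer; error handling is reduced to one
check):
.IP
.nf
\f[C]
/*\ move\ the\ MR\ to\ a\ new\ buffer,\ keeping\ its\ PD\ and\ access\ flags\ */
int\ ret\ =\ ibv_rereg_mr(mr,\ IBV_REREG_MR_CHANGE_TRANSLATION,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ NULL,\ new_buf,\ len,\ 0);
if\ (ret)
\ \ \ \ fprintf(stderr,\ "ibv_rereg_mr\ failed:\ %d\\n",\ ret);
\f[]
.fi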
+.SH SEE ALSO
.PP
\f[B]ibv_dereg_mr\f[](3), \f[B]ibv_reg_mr\f[](3)
.SH AUTHORS
.PP
Matan Barak <matanb@mellanox.com>, Yishai Hadas <yishaih@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/e4d776d0b6f839435f0db61df3122af0280416e6 b/buildlib/pandoc-prebuilt/e4d776d0b6f839435f0db61df3122af0280416e6 new file mode 100644 index 0000000..0a0b430 --- /dev/null +++ b/buildlib/pandoc-prebuilt/e4d776d0b6f839435f0db61df3122af0280416e6 @@ -0,0 +1,184 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "ibv_read_counters" "3" "2018\-04\-02" "libibverbs" "Libibverbs Programmer\[aq]s Manual" +.hy +.SH NAME
.PP
\f[B]ibv_read_counters\f[] \- Read counter values
.SH SYNOPSIS
.IP
.nf
\f[C]
#include\ <infiniband/verbs.h>

int\ ibv_read_counters(struct\ ibv_counters\ *counters,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint64_t\ *counters_value,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint32_t\ ncounters,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint32_t\ flags);
\f[]
.fi
.SH DESCRIPTION
.PP
\f[B]ibv_read_counters\f[]() returns the values of the chosen counters
into the \f[I]counters_value\f[] array, which can hold up to
\f[I]ncounters\f[] values.
The values are filled according to the configuration defined by the user
in the \f[B]ibv_attach_counters_point_xxx\f[] functions.
.SH ARGUMENTS
.TP
.B \f[I]counters\f[]
Counters object to read.
.RS
.RE
.TP
.B \f[I]counters_value\f[]
Input buffer to hold the read result.
.RS
.RE
.TP
.B \f[I]ncounters\f[]
Number of counters to fill.
.RS
.RE
.TP
.B \f[I]flags\f[]
Use enum ibv_read_counters_flags.
.RS
.RE
.SS \f[I]flags\f[] Argument
.TP
.B IBV_READ_COUNTERS_ATTR_PREFER_CACHED
Prefer reading the values from the driver cache; otherwise a volatile
hardware access is performed, which is the default.
.RS
.RE
.SH RETURN VALUE
.PP
\f[B]ibv_read_counters\f[]() returns 0 on success, or the value of errno
on failure (which indicates the failure reason)
.SH EXAMPLE
.PP
Example: Statically attach counters to a new flow
.PP
This example demonstrates the use of counters which are attached
statically with the creation of a new flow.
The counters are read from hardware periodically, and finally all
resources are released.
+.IP
.nf
\f[C]
/*\ create\ counters\ object\ and\ define\ its\ counters\ points\ \ \ \ \ \ \ \ */
/*\ create\ simple\ L2\ flow\ with\ hardcoded\ MAC,\ and\ a\ count\ action\ */
/*\ read\ counters\ periodically,\ every\ 1sec,\ until\ loop\ ends\ \ \ \ \ \ */
/*\ assumes\ user\ prepared\ a\ RAW_PACKET\ QP\ as\ input\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ */
/*\ only\ limited\ error\ checking\ in\ run\ time\ for\ code\ simplicity\ \ */

#include\ <errno.h>
#include\ <inttypes.h>
#include\ <stdio.h>
#include\ <stdlib.h>
#include\ <unistd.h>
#include\ <infiniband/verbs.h>

/*\ the\ below\ MAC\ should\ be\ replaced\ by\ user\ */
#define\ FLOW_SPEC_ETH_MAC_VAL\ {\ \\
\ \ \ \ .dst_mac\ =\ {\ 0x00,\ 0x01,\ 0x02,\ 0x03,\ 0x04,\ 0x05},\ \\
\ \ \ \ .src_mac\ =\ {\ 0x00,\ 0x00,\ 0x00,\ 0x00,\ 0x00,\ 0x00},\ \\
\ \ \ \ .ether_type\ =\ 0,\ .vlan_tag\ =\ 0,\ }
#define\ FLOW_SPEC_ETH_MAC_MASK\ {\ \\
\ \ \ \ .dst_mac\ =\ {\ 0xFF,\ 0xFF,\ 0xFF,\ 0xFF,\ 0xFF,\ 0xFF},\ \\
\ \ \ \ .src_mac\ =\ {\ 0xFF,\ 0xFF,\ 0xFF,\ 0xFF,\ 0xFF,\ 0xFF},\ \\
\ \ \ \ .ether_type\ =\ 0,\ .vlan_tag\ =\ 0,\ }

void\ example_create_flow_with_counters_on_raw_qp(struct\ ibv_qp\ *qp)\ {
\ \ \ \ int\ idx\ =\ 0;
\ \ \ \ int\ loop\ =\ 10;
\ \ \ \ int\ ret;
\ \ \ \ struct\ ibv_flow\ *flow\ =\ NULL;
\ \ \ \ struct\ ibv_counters\ *counters\ =\ NULL;
\ \ \ \ struct\ ibv_counters_init_attr\ init_attr\ =\ {0};
\ \ \ \ struct\ ibv_counter_attach_attr\ attach_attr\ =\ {0};

\ \ \ \ /*\ create\ single\ counters\ handle\ */
\ \ \ \ counters\ =\ ibv_create_counters(qp\->context,\ &init_attr);

\ \ \ \ /*\ define\ counters\ points\ */
\ \ \ \ attach_attr.counter_desc\ =\ IBV_COUNTER_PACKETS;
\ \ \ \ attach_attr.index\ =\ idx++;
\ \ \ \ ret\ =\ ibv_attach_counters_point_flow(counters,\ &attach_attr,\ NULL);
\ \ \ \ if\ (ret\ ==\ ENOTSUP)\ {
\ \ \ \ \ \ \ \ fprintf(stderr,\ "Attaching\ IBV_COUNTER_PACKETS\ to\ flow\ is\ not\ \\
supported");
\ \ \ \ \ \ \ \ exit(1);
\ \ \ \ }
\ \ \ \ attach_attr.counter_desc\ =\ IBV_COUNTER_BYTES;
\ \ \ \ attach_attr.index\ =\ idx++;
\ \ \ \ ret\ =\ ibv_attach_counters_point_flow(counters,\ &attach_attr,\ NULL);
\ \ \ \ if\ (ret\ ==\ ENOTSUP)\ {
\ \ \ \ \ \ \ \ fprintf(stderr,\ "Attaching\ IBV_COUNTER_BYTES\ to\ flow\ is\ not\ \\
supported");
\ \ \ \ \ \ \ \ exit(1);
\ \ \ \ }

\ \ \ \ /*\ define\ a\ new\ flow\ attr\ that\ includes\ the\ counters\ handle\ */
\ \ \ \ struct\ raw_eth_flow_attr\ {
\ \ \ \ \ \ \ \ struct\ ibv_flow_attr\ \ \ \ \ \ \ \ \ \ \ \ \ \ attr;
\ \ \ \ \ \ \ \ struct\ ibv_flow_spec_eth\ \ \ \ \ \ \ \ \ \ spec_eth;
\ \ \ \ \ \ \ \ struct\ ibv_flow_spec_counter_action\ spec_count;
\ \ \ \ }\ flow_attr\ =\ {
\ \ \ \ \ \ \ \ .attr\ =\ {
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ .comp_mask\ \ =\ 0,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ .type\ \ \ \ \ \ \ =\ IBV_FLOW_ATTR_NORMAL,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ .size\ \ \ \ \ \ \ =\ sizeof(flow_attr),
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ .priority\ \ \ =\ 0,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ .num_of_specs\ =\ 2,\ /*\ ETH\ +\ COUNT\ */
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ .port\ \ \ \ \ \ \ =\ 1,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ .flags\ \ \ \ \ \ =\ 0,
\ \ \ \ \ \ \ \ \ \ \ \ },
\ \ \ \ \ \ \ \ .spec_eth\ =\ {
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ .type\ =\ IBV_FLOW_SPEC_ETH,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ .size\ =\ sizeof(struct\ ibv_flow_spec_eth),
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ .val\ \ =\ FLOW_SPEC_ETH_MAC_VAL,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ .mask\ =\ FLOW_SPEC_ETH_MAC_MASK,
\ \ \ \ \ \ \ \ \ \ \ \ },
\ \ \ \ \ \ \ \ .spec_count\ =\ {
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ .type\ \ \ =\ IBV_FLOW_SPEC_ACTION_COUNT,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ .size\ \ \ =\ sizeof(struct\ ibv_flow_spec_counter_action),
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ .counters\ =\ counters,\ /*\ attach\ this\ counters\ handle
to\ the\ newly\ created\ ibv_flow\ */\ }\ };

\ \ \ \ /*\ create\ the\ flow\ */
\ \ \ \ flow\ =\ ibv_create_flow(qp,\ &flow_attr.attr);

\ \ \ \ /*\ allocate\ array\ for\ counters\ value\ reading\ */
\ \ \ \ uint64_t\ *counters_value\ =\ malloc(sizeof(uint64_t)\ *\ idx);

\ \ \ \ /*\ periodical\ read\ and\ print\ of\ flow\ counters\ */
\ \ \ \ while\ (\-\-loop)\ {
\ \ \ \ \ \ \ \ sleep(1);

\ \ \ \ \ \ \ \ /*\ read\ hardware\ counters\ values\ */
\ \ \ \ \ \ \ \ ibv_read_counters(counters,\ counters_value,\ idx,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ IBV_READ_COUNTERS_ATTR_PREFER_CACHED);

\ \ \ \ \ \ \ \ printf("PACKETS\ =\ %"\ PRIu64\ ",\ BYTES\ =\ %"\ PRIu64\ "\\n",
\ \ \ \ \ \ \ \ \ \ \ \ counters_value[0],\ counters_value[1]);
\ \ \ \ }

\ \ \ \ /*\ all\ done,\ release\ all\ */
\ \ \ \ free(counters_value);

\ \ \ \ /*\ destroy\ flow\ and\ detach\ counters\ */
\ \ \ \ ibv_destroy_flow(flow);

\ \ \ \ /*\ destroy\ counters\ handle\ */
\ \ \ \ ibv_destroy_counters(counters);

\ \ \ \ return;
}
\f[]
.fi
.SH SEE ALSO
.PP
\f[B]ibv_create_counters\f[], \f[B]ibv_destroy_counters\f[],
\f[B]ibv_attach_counters_point_flow\f[], \f[B]ibv_create_flow\f[]
.SH AUTHORS
.PP
Raed Salem <raeds@mellanox.com>
.PP
Alex Rosenbaum <alexr@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/edcb345e0afc5fdd0f2beadfd7bbbb5ec6c13043 b/buildlib/pandoc-prebuilt/edcb345e0afc5fdd0f2beadfd7bbbb5ec6c13043 new file mode 100644 index 0000000..e8c05f8 --- /dev/null +++ b/buildlib/pandoc-prebuilt/edcb345e0afc5fdd0f2beadfd7bbbb5ec6c13043 @@ -0,0 +1,184 @@ +.\" Man page generated from reStructuredText.
.
.TH IBHOSTS 8 "2016-12-20" "" "OpenIB Diagnostics"
.SH NAME
IBHOSTS \- show InfiniBand host nodes in topology
.
.nr rst2man-indent-level 0
.
.de1 rstReportMargin
\\$1 \\n[an-margin]
level \\n[rst2man-indent-level]
level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
-
\\n[rst2man-indent0]
\\n[rst2man-indent1]
\\n[rst2man-indent2]
..
.de1 INDENT
.\" .rstReportMargin pre:
. RS \\$1
. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
. nr rst2man-indent-level +1
.\" .rstReportMargin post:
..
.de UNINDENT
. RE
.\" indent \\n[an-margin]
.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
.nr rst2man-indent-level -1
.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
..
.SH SYNOPSIS
.sp
ibhosts [options] [<topology\-file>]
.SH DESCRIPTION
.sp
ibhosts is a script which either walks the IB subnet topology or uses an
already saved topology file and extracts the CA nodes.
.SH OPTIONS
.\" Define the common option -C
.
.sp
\fB\-C, \-\-Ca <ca_name>\fP use the specified ca_name.
.\" Define the common option -P
.
.sp
\fB\-P, \-\-Port <ca_port>\fP use the specified ca_port.
.\" Define the common option -t
.
.sp
\fB\-t, \-\-timeout <timeout_ms>\fP override the default timeout for the solicited mads.
.\" Define the common option -y
.
.INDENT 0.0
.TP
.B \fB\-y, \-\-m_key <key>\fP
use the specified M_key for requests. If non\-numeric value (like \(aqx\(aq)
is specified then a value will be prompted for.
.UNINDENT
.\" Define the common option -h
.
.sp
\fB\-h, \-\-help\fP show the usage message
.\" Define the common option -z
.
.sp
\fB\-\-config, \-z <config_file>\fP Specify alternate config file.
+.INDENT 0.0 +.INDENT 3.5 +Default: /usr/local/etc/infiniband\-diags/ibdiag.conf +.UNINDENT +.UNINDENT +.\" Explanation of local port selection +. +.SS Local port Selection +.sp +Multiple port/Multiple CA support: when no IB device or port is specified +(see the "local umad parameters" below), the libibumad library +selects the port to use by the following criteria: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.IP 1. 3 +the first port that is ACTIVE. +.IP 2. 3 +if not found, the first port that is UP (physical link up). +.UNINDENT +.sp +If a port and/or CA name is specified, the libibumad library attempts +to fulfill the user request, and will fail if it is not possible. +.sp +For example: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +ibaddr # use the first port (criteria #1 above) +ibaddr \-C mthca1 # pick the best port from "mthca1" only. +ibaddr \-P 2 # use the second (active/up) port from the first available IB device. +ibaddr \-C mthca0 \-P 2 # use the specified port only. +.ft P +.fi +.UNINDENT +.UNINDENT +.UNINDENT +.UNINDENT +.SH FILES +.\" Common text for the config file +. +.SS CONFIG FILE +.sp +/usr/local/etc/infiniband\-diags/ibdiag.conf +.sp +A global config file is provided to set some of the common options for all +tools. See supplied config file for details. +.\" Common text to describe the node name map file. +. +.SS NODE NAME MAP FILE FORMAT +.sp +The node name map is used to specify user friendly names for nodes in the +output. GUIDs are used to perform the lookup. +.sp +This functionality is provided by the opensm\-libs package. See \fBopensm(8)\fP +for the file location for your installation. +.sp +\fBGenerically:\fP +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +# comment +<guid> "<name>" +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +\fBExample:\fP +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +# IB1 +# Line cards +0x0008f104003f125c "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB\-24D" +0x0008f104003f125d "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB\-24D" +0x0008f104003f10d2 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB\-24D" +0x0008f104003f10d3 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB\-24D" +0x0008f104003f10bf "IB1 (Rack 11 slot 12 ) ISR9288/ISR9096 Voltaire sLB\-24D" + +# Spines +0x0008f10400400e2d "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D" +0x0008f10400400e2e "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D" +0x0008f10400400e2f "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D" +0x0008f10400400e31 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB\-12D" +0x0008f10400400e32 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB\-12D" + +# GUID Node Name +0x0008f10400411a08 "SW1 (Rack 3) ISR9024 Voltaire 9024D" +0x0008f10400411a28 "SW2 (Rack 3) ISR9024 Voltaire 9024D" +0x0008f10400411a34 "SW3 (Rack 3) ISR9024 Voltaire 9024D" +0x0008f104004119d0 "SW4 (Rack 3) ISR9024 Voltaire 9024D" +.ft P +.fi +.UNINDENT +.UNINDENT +.SH SEE ALSO +.sp +ibnetdiscover(8) +.SH DEPENDENCIES +.sp +ibnetdiscover, ibnetdiscover format +.SH AUTHOR +.INDENT 0.0 +.TP +.B Hal Rosenstock +< \fI\%halr@voltaire.com\fP > +.UNINDENT +.\" Generated by docutils manpage writer. +. 
diff --git a/buildlib/pandoc-prebuilt/f34fcba7aaa68b2aa752241370d19d79cfdc2cb4 b/buildlib/pandoc-prebuilt/f34fcba7aaa68b2aa752241370d19d79cfdc2cb4 new file mode 100644 index 0000000..4810a00 --- /dev/null +++ b/buildlib/pandoc-prebuilt/f34fcba7aaa68b2aa752241370d19d79cfdc2cb4 @@ -0,0 +1,177 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "MLX5DV_WR" "3" "2019\-02\-24" "mlx5" "mlx5 Programmer\[aq]s Manual" +.hy +.SH NAME
.PP
mlx5dv_wr_set_dc_addr \- Attach DC info to the last work request
.SH SYNOPSIS
.IP
.nf
\f[C]
#include\ <infiniband/mlx5dv.h>

static\ inline\ void\ mlx5dv_wr_set_dc_addr(struct\ mlx5dv_qp_ex\ *mqp,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ ibv_ah\ *ah,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint32_t\ remote_dctn,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint64_t\ remote_dc_key);

struct\ mlx5dv_mr_interleaved\ {
\ \ \ \ uint64_t\ \ \ \ \ \ \ \ addr;
\ \ \ \ uint32_t\ \ \ \ \ \ \ \ bytes_count;
\ \ \ \ uint32_t\ \ \ \ \ \ \ \ bytes_skip;
\ \ \ \ uint32_t\ \ \ \ \ \ \ \ lkey;
};

static\ inline\ void\ mlx5dv_wr_mr_interleaved(struct\ mlx5dv_qp_ex\ *mqp,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ mlx5dv_mkey\ *mkey,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint32_t\ access_flags,\ /*\ use\ enum\ ibv_access_flags\ */
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint32_t\ repeat_count,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint16_t\ num_interleaved,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ mlx5dv_mr_interleaved\ *data);

static\ inline\ void\ mlx5dv_wr_mr_list(struct\ mlx5dv_qp_ex\ *mqp,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ mlx5dv_mkey\ *mkey,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint32_t\ access_flags,\ /*\ use\ enum\ ibv_access_flags\ */
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint16_t\ num_sges,
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ ibv_sge\ *sge);
\f[]
.fi
.SH DESCRIPTION
.PP
The MLX5DV work request APIs (mlx5dv_wr_*) are an extension of the IBV
work request API (ibv_wr_*) with mlx5 specific features for send work
requests.
They may be used together with or without ibv_wr_* calls.
.SH USAGE
.PP
To use these APIs a QP must be created using mlx5dv_create_qp() with
\f[I]send_ops_flags\f[] of struct ibv_qp_init_attr_ex set.
.PP
If the QP does not support all the requested work request types then QP
creation will fail.
.PP
The mlx5dv_qp_ex is extracted from the ibv_qp by ibv_qp_to_qp_ex() and
mlx5dv_qp_ex_from_ibv_qp_ex().
This should be used to apply the mlx5 specific features on the posted
WR.
.PP
Creating a work request requires using the ibv_qp_ex as described in the
man page for ibv_wr_post, and the mlx5dv_qp with its available builders
and setters.
.SS QP Specific builders
.TP
.B \f[I]RC\f[] QPs
\f[I]mlx5dv_wr_mr_interleaved()\f[]
.RS
.PP
registers an interleaved memory layout by using an indirect mkey and
some interleaved data.
The layout of the memory pointed to by the mkey after its registration
will be the \f[I]data\f[] representation for the
\f[I]num_interleaved\f[] entries.
This single layout representation is repeated by \f[I]repeat_count\f[].
.PP
The \f[I]data\f[] as described by struct mlx5dv_mr_interleaved will hold
real data defined by \f[I]bytes_count\f[] and then a padding of
\f[I]bytes_skip\f[].
After a successful registration, RDMA operations can use this
\f[I]mkey\f[].
+The hardware will scatter the data according to the pattern.
+The \f[I]mkey\f[] should be used in a zero\-based mode.
+The \f[I]addr\f[] field in its \f[I]ibv_sge\f[] is an offset in the
+total data.
+To create this \f[I]mkey\f[] mlx5dv_create_mkey() should be used.
+.PP
+The current implementation requires the IBV_SEND_INLINE option to be on
+in the \f[I]ibv_qp_ex\->wr_flags\f[] field.
+To be able to have more than 3 \f[I]num_interleaved\f[] entries, the QP
+should be created with a larger WQE size that can fit it.
+This should be done using the \f[I]max_inline_data\f[] attribute of
+\f[I]struct ibv_qp_cap\f[] upon its creation.
+.PP
+As one entry will be consumed for the strided header, the \f[I]mkey\f[]
+should be created with one more entry than the required
+\f[I]num_interleaved\f[].
+.PP
+If \f[I]ibv_qp_ex\->wr_flags\f[] turns on IBV_SEND_SIGNALED, the
+reported WC opcode will be MLX5DV_WC_UMR.
+Unregistering the \f[I]mkey\f[] to enable another pattern registration
+should be done via ibv_post_send with the IBV_WR_LOCAL_INV opcode.
+.RE
+\f[I]mlx5dv_wr_mr_list()\f[]
+.RS
+.PP
+registers a memory layout based on a list of ibv_sge.
+The layout of the memory pointed to by the \f[I]mkey\f[] after its
+registration will be based on the list of \f[I]sge\f[] counted by
+\f[I]num_sges\f[].
+After a successful registration, RDMA operations can use this
+\f[I]mkey\f[]; the hardware will scatter the data according to the
+pattern.
+The \f[I]mkey\f[] should be used in a zero\-based mode; the
+\f[I]addr\f[] field in its \f[I]ibv_sge\f[] is an offset in the total
+data.
+.PP
+The current implementation requires the IBV_SEND_INLINE option to be on
+in the \f[I]ibv_qp_ex\->wr_flags\f[] field.
+To be able to have more than 4 \f[I]num_sge\f[] entries, the QP should
+be created with a larger WQE size that can fit it.
+This should be done using the \f[I]max_inline_data\f[] attribute of
+\f[I]struct ibv_qp_cap\f[] upon its creation.
+.PP
+If \f[I]ibv_qp_ex\->wr_flags\f[] turns on IBV_SEND_SIGNALED, the
+reported WC opcode will be MLX5DV_WC_UMR.
+Unregistering the \f[I]mkey\f[] to enable another pattern registration
+should be done via ibv_post_send with the IBV_WR_LOCAL_INV opcode.
+.RE
+.SS QP Specific setters
+.TP
+.B \f[I]DCI\f[] QPs
+\f[I]mlx5dv_wr_set_dc_addr()\f[] must be called to set the DCI WR
+properties.
+The destination address of the work is specified by \f[I]ah\f[], the
+remote DCT number by \f[I]remote_dctn\f[], and the DC key by
+\f[I]remote_dc_key\f[].
+This setter is available when the QP transport is DCI and
+send_ops_flags in struct ibv_qp_init_attr_ex is set.
+The available builders and setters for DCI QPs are the same as for RC
+QPs.
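+.PP
+The following is a minimal sketch of an interleaved registration on an
+RC QP, assuming a QP whose \f[I]send_ops_flags\f[] enable the MR
+builders, an indirect \f[I]mkey\f[] created by mlx5dv_create_mkey()
+with num_interleaved + 1 entries, and qpx/mqpx obtained as in the
+EXAMPLE below.
+The buffers, MRs and sizes (buf_a, mr_a, etc.) are illustrative
+placeholders, and error handling is omitted.
+.IP
+.nf
+\f[C]
+struct\ mlx5dv_mr_interleaved\ data[2]\ =\ {
+\ \ \ \ {\ .addr\ =\ (uintptr_t)buf_a,\ .bytes_count\ =\ 512,
+\ \ \ \ \ \ .bytes_skip\ =\ 0,\ .lkey\ =\ mr_a\->lkey\ },
+\ \ \ \ {\ .addr\ =\ (uintptr_t)buf_b,\ .bytes_count\ =\ 512,
+\ \ \ \ \ \ .bytes_skip\ =\ 256,\ .lkey\ =\ mr_b\->lkey\ },
+};
+
+ibv_wr_start(qpx);
+qpx\->wr_id\ =\ my_wr_id_1;
+/*\ IBV_SEND_INLINE\ is\ required\ by\ the\ current\ implementation\ */
+qpx\->wr_flags\ =\ IBV_SEND_INLINE\ |\ IBV_SEND_SIGNALED;
+/*\ repeat\ the\ 2\-entry\ pattern\ 100\ times\ */
+mlx5dv_wr_mr_interleaved(mqpx,\ mkey,\ IBV_ACCESS_LOCAL_WRITE,\ 100,\ 2,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ data);
+ret\ =\ ibv_wr_complete(qpx);
+\f[]
+.fi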
+.RS
+.RE
+.SH EXAMPLE
+.IP
+.nf
+\f[C]
+/*\ create\ DC\ QP\ type\ and\ specify\ the\ required\ send\ opcodes\ */
+attr_ex.qp_type\ =\ IBV_QPT_DRIVER;
+attr_ex.comp_mask\ |=\ IBV_QP_INIT_ATTR_SEND_OPS_FLAGS;
+attr_ex.send_ops_flags\ |=\ IBV_QP_EX_WITH_RDMA_WRITE;
+
+attr_dv.comp_mask\ |=\ MLX5DV_QP_INIT_ATTR_MASK_DC;
+attr_dv.dc_init_attr.dc_type\ =\ MLX5DV_DCTYPE_DCI;
+
+struct\ ibv_qp\ *qp\ =\ mlx5dv_create_qp(ctx,\ &attr_ex,\ &attr_dv);
+struct\ ibv_qp_ex\ *qpx\ =\ ibv_qp_to_qp_ex(qp);
+struct\ mlx5dv_qp_ex\ *mqpx\ =\ mlx5dv_qp_ex_from_ibv_qp_ex(qpx);
+
+ibv_wr_start(qpx);
+
+/*\ Use\ ibv_qp_ex\ object\ to\ set\ WR\ generic\ attributes\ */
+qpx\->wr_id\ =\ my_wr_id_1;
+qpx\->wr_flags\ =\ IBV_SEND_SIGNALED;
+ibv_wr_rdma_write(qpx,\ rkey,\ remote_addr_1);
+ibv_wr_set_sge(qpx,\ lkey,\ local_addr_1,\ length_1);
+
+/*\ Use\ mlx5\ DC\ setter\ using\ mlx5dv_qp_ex\ object\ */
+mlx5dv_wr_set_dc_addr(mqpx,\ ah,\ remote_dctn,\ remote_dc_key);
+
+ret\ =\ ibv_wr_complete(qpx);
+\f[]
+.fi
+.SH SEE ALSO
+.PP
+\f[B]ibv_post_send\f[](3), \f[B]ibv_create_qp_ex(3)\f[],
+\f[B]ibv_wr_post(3)\f[], \f[B]mlx5dv_create_mkey(3)\f[].
+.SH AUTHOR
+.PP
+Guy Levi <guyle@mellanox.com>
diff --git a/buildlib/pandoc-prebuilt/f48a8d31ddfa68fad6c3badbc768ac703976c43f b/buildlib/pandoc-prebuilt/f48a8d31ddfa68fad6c3badbc768ac703976c43f
new file mode 100644
index 0000000..22fa0ec
--- /dev/null
+++ b/buildlib/pandoc-prebuilt/f48a8d31ddfa68fad6c3badbc768ac703976c43f
@@ -0,0 +1,67 @@
+.\" Man page generated from reStructuredText.
+.
+.TH CHECK_LFT_BALANCE 8 "2017-08-21" "" "Open IB Diagnostics"
+.SH NAME
+check_lft_balance \- check InfiniBand unicast forwarding tables balance
+.
+.nr rst2man-indent-level 0
+.
+.de1 rstReportMargin
+\\$1 \\n[an-margin]
+level \\n[rst2man-indent-level]
+level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
+-
+\\n[rst2man-indent0]
+\\n[rst2man-indent1]
+\\n[rst2man-indent2]
+..
+.de1 INDENT
+.\" .rstReportMargin pre:
+. RS \\$1
+. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin]
+. nr rst2man-indent-level +1
+.\" .rstReportMargin post:
+..
+.de UNINDENT
+. RE
+.\" indent \\n[an-margin]
+.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.nr rst2man-indent-level -1
+.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]]
+.in \\n[rst2man-indent\\n[rst2man-indent-level]]u
+..
+.SH SYNOPSIS
+.sp
+check_lft_balance.sh [\-hRv]
+.SH DESCRIPTION
+.sp
+check_lft_balance.sh is a script which checks for balancing in InfiniBand
+unicast forwarding tables. It analyzes the output of
+\fBdump_lfts(8)\fP and \fBiblinkinfo(8)\fP.
+.SH OPTIONS
+.INDENT 0.0
+.TP
+.B \fB\-h\fP
+show help
+.TP
+.B \fB\-R\fP
+Recalculate dump_lfts information, i.e., do not use the cached
+information. This option is slower but should be used if the diag
+tools have not been used for some time or if there are other reasons to
+believe that the fabric has changed.
+.TP
+.B \fB\-v\fP
+verbose output
+.UNINDENT
+.SH SEE ALSO
+.sp
+\fBdump_lfts(8)\fP
+\fBiblinkinfo(8)\fP
+.SH AUTHORS
+.INDENT 0.0
+.TP
+.B Albert Chu
+< \fI\%chu11@llnl.gov\fP >
+.UNINDENT
+.\" Generated by docutils manpage writer.
+.
diff --git a/buildlib/pandoc-prebuilt/fa29d88a48409a61841ea15857c81feb01fd166d b/buildlib/pandoc-prebuilt/fa29d88a48409a61841ea15857c81feb01fd166d new file mode 100644 index 0000000..6da35cf --- /dev/null +++ b/buildlib/pandoc-prebuilt/fa29d88a48409a61841ea15857c81feb01fd166d @@ -0,0 +1,88 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "mlx5dv_devx_umem_reg, mlx5dv_devx_umem_dereg" "3" "" "" "" +.hy +.SH NAME +.PP +mlx5dv_devx_umem_reg \- Register a user memory to be used by the devx +interface +.PP +mlx5dv_devx_umem_dereg \- Deregister a devx umem object +.SH SYNOPSIS +.IP +.nf +\f[C] +#include\ <infiniband/mlx5dv.h> + +struct\ mlx5dv_devx_umem\ { +\ \ \ \ uint32_t\ umem_id; +}; + +struct\ mlx5dv_devx_umem\ * +mlx5dv_devx_umem_reg(struct\ ibv_context\ *context,\ void\ *addr,\ size_t\ size, +\ \ \ \ \ \ \ \ \ \ \ \ \ uint32_t\ access) + +int\ mlx5dv_devx_umem_dereg(struct\ mlx5dv_devx_umem\ *dv_devx_umem) +\f[] +.fi +.SH DESCRIPTION +.PP +Register or deregister a user memory to be used by the devx interface. +.PP +The register verb exposes a UMEM DEVX object for user memory +registration for DMA. +The API to register the user memory gets as input the user address, +length and access flags, and provides to the user as output an object +which holds the UMEM ID returned by the firmware to this registered +memory. +.PP +The user will use that UMEM ID in device direct commands that use this +memory instead of the physical addresses list, for example upon +\f[I]mlx5dv_devx_obj_create\f[] to create a QP. +.SH ARGUMENTS +.TP +.B \f[I]context\f[] +.IP +.nf +\f[C] +RDMA\ device\ context\ to\ create\ the\ action\ on. +\f[] +.fi +.RS +.RE +.TP +.B \f[I]addr\f[] +The memory start address to register. +.RS +.RE +.TP +.B \f[I]size\f[] +.IP +.nf +\f[C] +The\ size\ of\ *addr*\ buffer. +\f[] +.fi +.RS +.RE +.TP +.B \f[I]access\f[] +The desired memory protection attributes; it is either 0 or the bitwise +OR of one or more of \f[I]enum ibv_access_flags\f[]. +.RS +.RE +.SH RETURN VALUE +.PP +Upon success \f[I]mlx5dv_devx_umem_reg\f[] will return a new \f[I]struct +mlx5dv_devx_umem\f[] object, on error NULL will be returned and errno +will be set. +.PP +\f[I]mlx5dv_devx_umem_dereg\f[] returns 0 on success, or the value of +errno on failure (which indicates the failure reason). +.SH SEE ALSO +.PP +\f[I]mlx5dv_open_device(3)\f[], \f[I]ibv_reg_mr(3)\f[], +\f[I]mlx5dv_devx_obj_create(3)\f[] +.SH AUTHOR +.PP +Yishai Hadas <yishaih@mellanox.com> diff --git a/buildlib/pandoc-prebuilt/fbb031d1e2e9c235f240d562903bbe11d25f29ed b/buildlib/pandoc-prebuilt/fbb031d1e2e9c235f240d562903bbe11d25f29ed new file mode 100644 index 0000000..3be12a4 --- /dev/null +++ b/buildlib/pandoc-prebuilt/fbb031d1e2e9c235f240d562903bbe11d25f29ed @@ -0,0 +1,316 @@ +.\" Man page generated from reStructuredText. +. +.TH IBLINKINFO 8 "2018-07-09" "" "OpenIB Diagnostics" +.SH NAME +IBLINKINFO \- report link info for all links in the fabric +. +.nr rst2man-indent-level 0 +. +.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. 
RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. +.SH SYNOPSIS +.sp +iblinkinfo <options> +.SH DESCRIPTION +.sp +iblinkinfo reports link info for each port in an IB fabric, node by node. +Optionally, iblinkinfo can do partial scans and limit its output to parts of a +fabric. +.SH OPTIONS +.sp +\fB\-\-down, \-d\fP +Print only nodes which have a port in the "Down" state. +.sp +\fB\-\-line, \-l\fP +Print all information for each link on one line. Default is to print a header +with the node information and then a list for each port (useful for +grep\(aqing output). +.sp +\fB\-\-additional, \-p\fP +Print additional port settings (<LifeTime>,<HoqLife>,<VLStallCount>) +.sp +\fB\-\-switches\-only\fP +Show only switches in output. +.sp +\fB\-\-cas\-only\fP +Show only CAs in output. +.SS Partial Scan flags +.sp +The node to start a partial scan can be specified with the following addresses. +.\" Define the common option -G +. +.sp +\fB\-\-port\-guid, \-G <port_guid>\fP Specify a port_guid +.\" Define the common option -D for Directed routes +. +.sp +\fB\-D, \-\-Direct <dr_path>\fP The address specified is a directed route +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +Examples: + \-D "0" # self port + \-D "0,1,2,1,4" # out via port 1, then 2, ... + + (Note the second number in the path specified must match the port being + used. This can be specified using the port selection flag \(aq\-P\(aq or the + port found through the automatic selection process.) +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +\fBNote:\fP For switches results are printed for all ports not just switch port 0. +.sp +\fB\-\-switch, \-S <port_guid>\fP same as "\-G". (provided only for backward compatibility) +.sp +How much of the scan to be printed can be controlled with the following. +.sp +\fB\-\-all, \-a\fP +Print all nodes found in a partial fabric scan. Normally a +partial fabric scan will return only the node specified. This option will +print the other nodes found as well. +.sp +\fB\-\-hops, \-n <hops>\fP +Specify the number of hops away from a specified node to scan. This is useful +to expand a partial fabric scan beyond the node specified. +.SS Cache File flags +.\" Define the common option load-cache +. +.sp +\fB\-\-load\-cache <filename>\fP +Load and use the cached ibnetdiscover data stored in the specified +filename. May be useful for outputting and learning about other +fabrics or a previous state of a fabric. +.\" Define the common option diff +. +.sp +\fB\-\-diff <filename>\fP +Load cached ibnetdiscover data and do a diff comparison to the current +network or another cache. A special diff output for ibnetdiscover +output will be displayed showing differences between the old and current +fabric. By default, the following are compared for differences: switches, +channel adapters, routers, and port connections. +.sp +\fB\-\-diffcheck <key(s)>\fP +Specify what diff checks should be done in the \fB\-\-diff\fP option above. Comma +separate multiple diff check key(s). The available diff checks are: \fBport\fP = +port connections, \fBstate\fP = port state, \fBlid\fP = lids, \fBnodedesc\fP = node +descriptions. Note that \fBport\fP, \fBlid\fP, and \fBnodedesc\fP are checked only +for the node types that are specified (e.g. \fBswitches\-only\fP, \fBcas\-only\fP). 
+If \fBport\fP is specified alongside \fBlid\fP or \fBnodedesc\fP, remote port lids +and node descriptions will also be compared. +.sp +\fB\-\-filterdownports <filename>\fP +Filter downports indicated in a ibnetdiscover cache. If a port was previously +indicated as down in the specified cache, and is still down, do not output it in the +resulting output. This option may be particularly useful for environments +where switches are not fully populated, thus much of the default iblinkinfo +info is considered useless. See \fBibnetdiscover\fP for information on caching +ibnetdiscover output. +.SS Port Selection flags +.\" Define the common option -C +. +.sp +\fB\-C, \-\-Ca <ca_name>\fP use the specified ca_name. +.\" Define the common option -P +. +.sp +\fB\-P, \-\-Port <ca_port>\fP use the specified ca_port. +.\" Explanation of local port selection +. +.SS Local port Selection +.sp +Multiple port/Multiple CA support: when no IB device or port is specified +(see the "local umad parameters" below), the libibumad library +selects the port to use by the following criteria: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.IP 1. 3 +the first port that is ACTIVE. +.IP 2. 3 +if not found, the first port that is UP (physical link up). +.UNINDENT +.sp +If a port and/or CA name is specified, the libibumad library attempts +to fulfill the user request, and will fail if it is not possible. +.sp +For example: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +ibaddr # use the first port (criteria #1 above) +ibaddr \-C mthca1 # pick the best port from "mthca1" only. +ibaddr \-P 2 # use the second (active/up) port from the first available IB device. +ibaddr \-C mthca0 \-P 2 # use the specified port only. +.ft P +.fi +.UNINDENT +.UNINDENT +.UNINDENT +.UNINDENT +.SS Configuration flags +.\" Define the common option -z +. +.sp +\fB\-\-config, \-z <config_file>\fP Specify alternate config file. +.INDENT 0.0 +.INDENT 3.5 +Default: /usr/local/etc/infiniband\-diags/ibdiag.conf +.UNINDENT +.UNINDENT +.\" Define the common option -z +. +.INDENT 0.0 +.TP +.B \fB\-\-outstanding_smps, \-o <val>\fP +Specify the number of outstanding SMP\(aqs which should be issued during the scan +.sp +Default: 2 +.UNINDENT +.\" Define the common option --node-name-map +. +.sp +\fB\-\-node\-name\-map <node\-name\-map>\fP Specify a node name map. +.INDENT 0.0 +.INDENT 3.5 +This file maps GUIDs to more user friendly names. See FILES section. +.UNINDENT +.UNINDENT +.\" Define the common option -t +. +.sp +\fB\-t, \-\-timeout <timeout_ms>\fP override the default timeout for the solicited mads. +.\" Define the common option -y +. +.INDENT 0.0 +.TP +.B \fB\-y, \-\-m_key <key>\fP +use the specified M_key for requests. If non\-numeric value (like \(aqx\(aq) +is specified then a value will be prompted for. +.UNINDENT +.SS Debugging flags +.\" Define the common option -e +. +.INDENT 0.0 +.TP +.B \-e +show send and receive errors (timeouts and others) +.UNINDENT +.\" Define the common option -h +. +.sp +\fB\-h, \-\-help\fP show the usage message +.\" Define the common option -v +. +.INDENT 0.0 +.TP +.B \fB\-v, \-\-verbose\fP +increase the application verbosity level. +May be used several times (\-vv or \-v \-v \-v) +.UNINDENT +.\" Define the common option -V +. +.sp +\fB\-V, \-\-version\fP show the version info. +.SH EXIT STATUS +.sp +0 on success, \-1 on failure to scan the fabric, 1 if check mode is used and +inconsistencies are found. +.SH FILES +.\" Common text for the config file +. 
+.SS CONFIG FILE +.sp +/usr/local/etc/infiniband\-diags/ibdiag.conf +.sp +A global config file is provided to set some of the common options for all +tools. See supplied config file for details. +.\" Common text to describe the node name map file. +. +.SS NODE NAME MAP FILE FORMAT +.sp +The node name map is used to specify user friendly names for nodes in the +output. GUIDs are used to perform the lookup. +.sp +This functionality is provided by the opensm\-libs package. See \fBopensm(8)\fP +for the file location for your installation. +.sp +\fBGenerically:\fP +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +# comment +<guid> "<name>" +.ft P +.fi +.UNINDENT +.UNINDENT +.sp +\fBExample:\fP +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +# IB1 +# Line cards +0x0008f104003f125c "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB\-24D" +0x0008f104003f125d "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB\-24D" +0x0008f104003f10d2 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB\-24D" +0x0008f104003f10d3 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB\-24D" +0x0008f104003f10bf "IB1 (Rack 11 slot 12 ) ISR9288/ISR9096 Voltaire sLB\-24D" + +# Spines +0x0008f10400400e2d "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D" +0x0008f10400400e2e "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D" +0x0008f10400400e2f "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB\-12D" +0x0008f10400400e31 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB\-12D" +0x0008f10400400e32 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB\-12D" + +# GUID Node Name +0x0008f10400411a08 "SW1 (Rack 3) ISR9024 Voltaire 9024D" +0x0008f10400411a28 "SW2 (Rack 3) ISR9024 Voltaire 9024D" +0x0008f10400411a34 "SW3 (Rack 3) ISR9024 Voltaire 9024D" +0x0008f104004119d0 "SW4 (Rack 3) ISR9024 Voltaire 9024D" +.ft P +.fi +.UNINDENT +.UNINDENT +.SH AUTHOR +.INDENT 0.0 +.TP +.B Ira Weiny +< \fI\%ira.weiny@intel.com\fP > +.UNINDENT +.\" Generated by docutils manpage writer. +. diff --git a/buildlib/pandoc-prebuilt/fc060326b1cf67bbf2a3ea45651805d6311baf77 b/buildlib/pandoc-prebuilt/fc060326b1cf67bbf2a3ea45651805d6311baf77 new file mode 100644 index 0000000..e1e6b19 --- /dev/null +++ b/buildlib/pandoc-prebuilt/fc060326b1cf67bbf2a3ea45651805d6311baf77 @@ -0,0 +1,217 @@ +.\" Man page generated from reStructuredText. +. +.TH VENDSTAT 8 "2017-08-21" "" "Open IB Diagnostics" +.SH NAME +vendstat \- query InfiniBand vendor specific functions +. +.nr rst2man-indent-level 0 +. +.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. +.SH SYNOPSIS +.sp +vendstat [options] <lid|guid> +.SH DESCRIPTION +.sp +vendstat uses vendor specific MADs to access beyond the IB spec +vendor specific functionality. Currently, there is support for +Mellanox InfiniSwitch\-III (IS3) and InfiniSwitch\-IV (IS4). +.SH OPTIONS +.INDENT 0.0 +.TP +.B \fB\-N\fP +show IS3 or IS4 general information. +.TP +.B \fB\-w\fP +show IS3 port xmit wait counters. +.TP +.B \fB\-i\fP +show IS4 counter group info. 
+.TP +.B \fB\-c <num,num>\fP +configure IS4 counter groups. +.sp +Configure IS4 counter groups 0 and 1. Such configuration is not +persistent across IS4 reboot. First number is for counter group 0 and +second is for counter group 1. +.sp +Group 0 counter config values: +.UNINDENT +.INDENT 0.0 +.TP +.B :: +.INDENT 7.0 +.INDENT 3.5 +0 \- PortXmitDataSL0\-7 +1 \- PortXmitDataSL8\-15 +2 \- PortRcvDataSL0\-7 +.UNINDENT +.UNINDENT +.sp +Group 1 counter config values: +.UNINDENT +.INDENT 0.0 +.TP +.B :: +1 \- PortXmitDataSL8\-15 +2 \- PortRcvDataSL0\-7 +8 \- PortRcvDataSL8\-15 +.TP +.B \fB\-R, \-\-Read <addr,mask>\fP +Read configuration space record at addr +.TP +.B \fB\-W, \-\-Write <addr,val,mask>\fP +Write configuration space record at addr +.UNINDENT +.SS Addressing Flags +.\" Define the common option -G +. +.sp +\fB\-G, \-\-Guid\fP The address specified is a Port GUID +.\" Define the common option -L +. +.sp +\fB\-L, \-\-Lid\fP The address specified is a LID +.\" Define the common option -s +. +.sp +\fB\-s, \-\-sm_port <smlid>\fP use \(aqsmlid\(aq as the target lid for SA queries. +.SS Port Selection flags +.\" Define the common option -C +. +.sp +\fB\-C, \-\-Ca <ca_name>\fP use the specified ca_name. +.\" Define the common option -P +. +.sp +\fB\-P, \-\-Port <ca_port>\fP use the specified ca_port. +.\" Explanation of local port selection +. +.SS Local port Selection +.sp +Multiple port/Multiple CA support: when no IB device or port is specified +(see the "local umad parameters" below), the libibumad library +selects the port to use by the following criteria: +.INDENT 0.0 +.INDENT 3.5 +.INDENT 0.0 +.IP 1. 3 +the first port that is ACTIVE. +.IP 2. 3 +if not found, the first port that is UP (physical link up). +.UNINDENT +.sp +If a port and/or CA name is specified, the libibumad library attempts +to fulfill the user request, and will fail if it is not possible. +.sp +For example: +.INDENT 0.0 +.INDENT 3.5 +.sp +.nf +.ft C +ibaddr # use the first port (criteria #1 above) +ibaddr \-C mthca1 # pick the best port from "mthca1" only. +ibaddr \-P 2 # use the second (active/up) port from the first available IB device. +ibaddr \-C mthca0 \-P 2 # use the specified port only. +.ft P +.fi +.UNINDENT +.UNINDENT +.UNINDENT +.UNINDENT +.SS Debugging flags +.\" Define the common option -d +. +.INDENT 0.0 +.TP +.B \-d +raise the IB debugging level. +May be used several times (\-ddd or \-d \-d \-d). +.UNINDENT +.\" Define the common option -e +. +.INDENT 0.0 +.TP +.B \-e +show send and receive errors (timeouts and others) +.UNINDENT +.\" Define the common option -h +. +.sp +\fB\-h, \-\-help\fP show the usage message +.\" Define the common option -v +. +.INDENT 0.0 +.TP +.B \fB\-v, \-\-verbose\fP +increase the application verbosity level. +May be used several times (\-vv or \-v \-v \-v) +.UNINDENT +.\" Define the common option -V +. +.sp +\fB\-V, \-\-version\fP show the version info. +.SS Configuration flags +.\" Define the common option -t +. +.sp +\fB\-t, \-\-timeout <timeout_ms>\fP override the default timeout for the solicited mads. +.\" Define the common option -z +. +.sp +\fB\-\-config, \-z <config_file>\fP Specify alternate config file. +.INDENT 0.0 +.INDENT 3.5 +Default: /usr/local/etc/infiniband\-diags/ibdiag.conf +.UNINDENT +.UNINDENT +.SH FILES +.\" Common text for the config file +. +.SS CONFIG FILE +.sp +/usr/local/etc/infiniband\-diags/ibdiag.conf +.sp +A global config file is provided to set some of the common options for all +tools. See supplied config file for details. 
+.SH EXAMPLES +.INDENT 0.0 +.TP +.B :: +vendstat \-N 6 # read IS3 or IS4 general information +vendstat \-w 6 # read IS3 port xmit wait counters +vendstat \-i 6 12 # read IS4 port 12 counter group info +vendstat \-c 0,1 6 12 # configure IS4 port 12 counter groups for PortXmitDataSL +vendstat \-c 2,8 6 12 # configure IS4 port 12 counter groups for PortRcvDataSL +.UNINDENT +.SH AUTHOR +.INDENT 0.0 +.TP +.B Hal Rosenstock +< \fI\%hal.rosenstock@gmail.com\fP > +.UNINDENT +.\" Generated by docutils manpage writer. +. diff --git a/buildlib/pandoc-prebuilt/fc30617d889e83a4c77a329249b2ecc3ce5b227f b/buildlib/pandoc-prebuilt/fc30617d889e83a4c77a329249b2ecc3ce5b227f new file mode 100644 index 0000000..974767b --- /dev/null +++ b/buildlib/pandoc-prebuilt/fc30617d889e83a4c77a329249b2ecc3ce5b227f @@ -0,0 +1,72 @@ +.\" Man page generated from reStructuredText. +. +.TH IBSTATUS 8 "2017-08-21" "" "Open IB Diagnostics" +.SH NAME +ibstatus \- query basic status of InfiniBand device(s) +. +.nr rst2man-indent-level 0 +. +.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. +.SH SYNOPSIS +.sp +ibstatus [\-h] [devname[:port]]... +.SH DESCRIPTION +.sp +ibstatus is a script which displays basic information obtained from the local +IB driver. Output includes LID, SMLID, port state, link width active, and port +physical state. +.SH OPTIONS +.\" Define the common option -h +. +.sp +\fB\-h, \-\-help\fP show the usage message +.INDENT 0.0 +.TP +.B \fBdevname\fP +InfiniBand device name +.TP +.B \fBportnum\fP +port number of InfiniBand device +.UNINDENT +.SH EXAMPLES +.INDENT 0.0 +.TP +.B :: +ibstatus # display status of all IB ports +ibstatus mthca1 # status of mthca1 ports +ibstatus mthca1:1 mthca0:2 # show status of specified ports +.UNINDENT +.SH SEE ALSO +.sp +\fBibstat (8)\fP +.SH AUTHOR +.INDENT 0.0 +.TP +.B Hal Rosenstock +< \fI\%halr@voltaire.com\fP > +.UNINDENT +.\" Generated by docutils manpage writer. +. 
diff --git a/buildlib/pandoc-prebuilt/fe1de88695b9f8551b1f861987b4188fdd592002 b/buildlib/pandoc-prebuilt/fe1de88695b9f8551b1f861987b4188fdd592002 new file mode 100644 index 0000000..79185e0 --- /dev/null +++ b/buildlib/pandoc-prebuilt/fe1de88695b9f8551b1f861987b4188fdd592002 @@ -0,0 +1,41 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "IBV_EVENT_TYPE_STR" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[aq]s Manual" +.hy +.SH NAME +.PP +ibv_event_type_str \- Return string describing event_type enum value +.PP +ibv_node_type_str \- Return string describing node_type enum value +.PP +ibv_port_state_str \- Return string describing port_state enum value +.SH SYNOPSIS +.IP +.nf +\f[C] +#include\ <infiniband/verbs.h> + +const\ char\ *ibv_event_type_str(enum\ ibv_event_type\ event_type); + +const\ char\ *ibv_node_type_str(enum\ ibv_node_type\ node_type); + +const\ char\ *ibv_port_state_str(enum\ ibv_port_state\ port_state); +\f[] +.fi +.SH DESCRIPTION +.PP +\f[B]ibv_node_type_str()\f[] returns a string describing the node type +enum value \f[I]node_type\f[]. +.PP +\f[B]ibv_port_state_str()\f[] returns a string describing the port state +enum value \f[I]port_state\f[]. +.PP +\f[B]ibv_event_type_str()\f[] returns a string describing the event type +enum value \f[I]event_type\f[]. +.SH RETURN VALUE +.PP +These functions return a constant string that describes the enum value +passed as their argument. +.SH AUTHOR +.PP +Roland Dreier <rolandd@cisco.com> diff --git a/buildlib/provider.map b/buildlib/provider.map new file mode 100644 index 0000000..e985a6f --- /dev/null +++ b/buildlib/provider.map @@ -0,0 +1,6 @@ +/* The providers do not export any symbols at all. Instead they rely on + attribute(constructor) to cause their init function to run at dlopen + time. */ +{ + local: *; +}; diff --git a/buildlib/publish_headers.cmake b/buildlib/publish_headers.cmake new file mode 100644 index 0000000..5b30986 --- /dev/null +++ b/buildlib/publish_headers.cmake @@ -0,0 +1,30 @@ +# COPYRIGHT (c) 2016 Obsidian Research Corporation. +# Licensed under BSD (MIT variant) or GPLv2. See COPYING. + +# Same as publish_headers but does not install them during the install phase +function(publish_internal_headers DEST) + if(NOT ARGN) + message(SEND_ERROR "Error: publish_internal_headers called without any files") + return() + endif() + + set(DDIR "${BUILD_INCLUDE}/${DEST}") + file(MAKE_DIRECTORY "${DDIR}") + + foreach(SFIL ${ARGN}) + get_filename_component(FIL ${SFIL} NAME) + rdma_create_symlink("${CMAKE_CURRENT_SOURCE_DIR}/${SFIL}" "${DDIR}/${FIL}") + endforeach() +endfunction() + +# Copy headers from the source directory to the proper place in the +# build/include directory. This also installs them into /usr/include/xx during +# the install phase +function(publish_headers DEST) + publish_internal_headers("${DEST}" ${ARGN}) + + foreach(SFIL ${ARGN}) + get_filename_component(FIL ${SFIL} NAME) + install(FILES "${SFIL}" DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${DEST}/" RENAME "${FIL}") + endforeach() +endfunction() diff --git a/buildlib/pyverbs_functions.cmake b/buildlib/pyverbs_functions.cmake new file mode 100644 index 0000000..ca41fbb --- /dev/null +++ b/buildlib/pyverbs_functions.cmake @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2018, Mellanox Technologies. All rights reserved. 
See COPYING file + +function(rdma_cython_module PY_MODULE LINKER_FLAGS) + foreach(PYX_FILE ${ARGN}) + get_filename_component(FILENAME ${PYX_FILE} NAME_WE) + get_filename_component(DIR ${PYX_FILE} DIRECTORY) + if (DIR) + set(PYX "${CMAKE_CURRENT_SOURCE_DIR}/${DIR}/${FILENAME}.pyx") + else() + set(PYX "${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}.pyx") + endif() + set(CFILE "${CMAKE_CURRENT_BINARY_DIR}/${FILENAME}.c") + include_directories(${PYTHON_INCLUDE_DIRS}) + add_custom_command( + OUTPUT "${CFILE}" + MAIN_DEPENDENCY "${PYX}" + COMMAND ${CYTHON_EXECUTABLE} "${PYX}" -o "${CFILE}" + "-I${PYTHON_INCLUDE_DIRS}" + COMMENT "Cythonizing ${PYX}" + ) + + string(REGEX REPLACE "\\.so$" "" SONAME "${FILENAME}${CMAKE_PYTHON_SO_SUFFIX}") + add_library(${SONAME} SHARED ${CFILE}) + set_target_properties(${SONAME} PROPERTIES + COMPILE_FLAGS "${CMAKE_C_FLAGS} -fPIC -fno-strict-aliasing -Wno-unused-function -Wno-redundant-decls -Wno-shadow -Wno-cast-function-type -Wno-implicit-fallthrough -Wno-unknown-warning -Wno-unknown-warning-option ${NO_VAR_TRACKING_FLAGS}" + LIBRARY_OUTPUT_DIRECTORY "${BUILD_PYTHON}/${PY_MODULE}" + PREFIX "") + target_link_libraries(${SONAME} LINK_PRIVATE ${PYTHON_LIBRARIES} ibverbs rdmacm ${LINKER_FLAGS}) + install(TARGETS ${SONAME} + DESTINATION ${CMAKE_INSTALL_PYTHON_ARCH_LIB}/${PY_MODULE}) + endforeach() +endfunction() + +function(rdma_python_module PY_MODULE) + foreach(PY_FILE ${ARGN}) + get_filename_component(LINK "${CMAKE_CURRENT_SOURCE_DIR}/${PY_FILE}" ABSOLUTE) + rdma_create_symlink("${LINK}" "${BUILD_PYTHON}/${PY_MODULE}/${PY_FILE}") + install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/${PY_FILE} + DESTINATION ${CMAKE_INSTALL_PYTHON_ARCH_LIB}/${PY_MODULE}) + endforeach() +endfunction() + +function(rdma_python_test PY_MODULE) + foreach(PY_FILE ${ARGN}) + install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/${PY_FILE} + DESTINATION ${CMAKE_INSTALL_DOCDIR}/${PY_MODULE}) + endforeach() +endfunction() + +# Make a python script runnable from the build/bin directory with all the +# correct paths filled in +function(rdma_internal_binary) + foreach(PY_FILE ${ARGN}) + get_filename_component(ABS "${CMAKE_CURRENT_SOURCE_DIR}/${PY_FILE}" ABSOLUTE) + get_filename_component(FN "${CMAKE_CURRENT_SOURCE_DIR}/${PY_FILE}" NAME) + set(BIN_FN "${BUILD_BIN}/${FN}") + + file(WRITE "${BIN_FN}" "#!/bin/sh +PYTHONPATH='${BUILD_PYTHON}' exec '${PYTHON_EXECUTABLE}' '${ABS}' \"$@\" +") + execute_process(COMMAND "chmod" "a+x" "${BIN_FN}") + endforeach() +endfunction() diff --git a/buildlib/rdma_functions.cmake b/buildlib/rdma_functions.cmake new file mode 100644 index 0000000..fa3fed3 --- /dev/null +++ b/buildlib/rdma_functions.cmake @@ -0,0 +1,312 @@ +# COPYRIGHT (c) 2016 Obsidian Research Corporation. +# Licensed under BSD (MIT variant) or GPLv2. See COPYING. + +# Helper functions for use in the sub CMakeLists files to make them simpler +# and more uniform. 
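+#
+# Illustrative (hypothetical) calls from a sub CMakeLists.txt, using the
+# helpers defined below; the target and file names are placeholders:
+#   rdma_library(example libexample.map 1 1.0.${PACKAGE_VERSION} a.c b.c)
+#   rdma_provider(example_prov prov.c)
+#   rdma_executable(example_tool tool.c)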
+ +# Global list of tuples of (SHARED STATIC MAP) library target names +set(RDMA_STATIC_LIBS "" CACHE INTERNAL "Doc" FORCE) + +# Global list of tuples of (PROVIDER_NAME LIB_NAME) +set(RDMA_PROVIDER_LIST "" CACHE INTERNAL "Doc" FORCE) + +set(COMMON_LIBS_PIC ccan_pic rdma_util_pic) +set(COMMON_LIBS ccan rdma_util) + +function(rdma_public_static_lib SHLIB STATICLIB VERSION_SCRIPT) + if (NOT IS_ABSOLUTE ${VERSION_SCRIPT}) + set(VERSION_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/${VERSION_SCRIPT}") + endif() + + set_target_properties(${STATICLIB} PROPERTIES + OUTPUT_NAME ${SHLIB} + ARCHIVE_OUTPUT_DIRECTORY "${BUILD_STATIC_LIB}") + target_compile_definitions(${STATICLIB} PRIVATE _STATIC_LIBRARY_BUILD_=1) + + list(APPEND RDMA_STATIC_LIBS ${SHLIB} ${STATICLIB} ${VERSION_SCRIPT}) + set(RDMA_STATIC_LIBS "${RDMA_STATIC_LIBS}" CACHE INTERNAL "") +endfunction() + +function(rdma_make_dir DDIR) + if(NOT EXISTS "${DDIR}/") + execute_process(COMMAND "${CMAKE_COMMAND}" "-E" "make_directory" + "${DDIR}" RESULT_VARIABLE retcode) + if(NOT "${retcode}" STREQUAL "0") + message(FATAL_ERROR "Failed to create directory ${DDIR}") + endif() + endif() +endfunction() + +# Create a symlink at filename DEST +# If the directory containing DEST does not exist then it is created +# automatically. +function(rdma_create_symlink LINK_CONTENT DEST) + if(NOT LINK_CONTENT) + message(FATAL_ERROR "Failed to provide LINK_CONTENT") + endif() + + # Make sure the directory exists, cmake doesn't create target DESTINATION + # directories until everything is finished, do it manually here if necessary + if(CMAKE_VERSION VERSION_LESS "2.8.12") + get_filename_component(DDIR "${DEST}" PATH) + else() + get_filename_component(DDIR "${DEST}" DIRECTORY) + endif() + + rdma_make_dir("${DDIR}") + + # Newer versions of cmake can use "${CMAKE_COMMAND}" "-E" "create_symlink" + # however it is broken weirdly on older versions. + execute_process(COMMAND "ln" "-Tsf" + "${LINK_CONTENT}" "${DEST}" RESULT_VARIABLE retcode) + if(NOT "${retcode}" STREQUAL "0") + message(FATAL_ERROR "Failed to create symlink in ${DEST}") + endif() +endfunction() + +# Install a symlink during 'make install' +function(rdma_install_symlink LINK_CONTENT DEST) + # Create a link in the build tree with the right content + get_filename_component(FN "${DEST}" NAME) + rdma_create_symlink("${LINK_CONTENT}" "${CMAKE_CURRENT_BINARY_DIR}/${FN}") + + # Have cmake install it. Doing it this way lets cpack work if we ever wish + # to use that. + get_filename_component(DIR "${DEST}" PATH) + install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${FN}" + DESTINATION "${DIR}") +endfunction() + +# Wrapper for install() that runs the single file through configure_file first. +# This only works with the basic single file install(FILE file ARGS..) 
pattern
+function(rdma_subst_install ARG1 file)
+  if (NOT "${ARG1}" STREQUAL "FILES")
+    message(FATAL_ERROR "Bad use of rdma_subst_install")
+  endif()
+  configure_file("${file}" "${CMAKE_CURRENT_BINARY_DIR}/${file}" @ONLY)
+  install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${file}" ${ARGN})
+endfunction()
+
+# Modify shared library target DEST to use VERSION_SCRIPT as the linker map file
+function(rdma_set_library_map DEST VERSION_SCRIPT)
+  if (NOT IS_ABSOLUTE ${VERSION_SCRIPT})
+    set(VERSION_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/${VERSION_SCRIPT}")
+  endif()
+  set_property(TARGET ${DEST} APPEND_STRING PROPERTY
+    LINK_FLAGS " -Wl,--version-script,${VERSION_SCRIPT}")
+
+  # NOTE: This won't work with ninja prior to cmake 3.4
+  set_property(TARGET ${DEST} APPEND_STRING PROPERTY
+    LINK_DEPENDS ${VERSION_SCRIPT})
+endfunction()
+
+# Basic function to produce a standard library with a GNU LD version script.
+function(rdma_library DEST VERSION_SCRIPT SOVERSION VERSION)
+  # Create a static library
+  if (ENABLE_STATIC)
+    add_library(${DEST}-static STATIC ${ARGN})
+    target_link_libraries(${DEST}-static LINK ${COMMON_LIBS})
+    rdma_public_static_lib(${DEST} ${DEST}-static ${VERSION_SCRIPT})
+  endif()
+
+  # Create a shared library
+  add_library(${DEST} SHARED ${ARGN})
+  rdma_set_library_map(${DEST} ${VERSION_SCRIPT})
+  target_link_libraries(${DEST} LINK_PRIVATE ${COMMON_LIBS_PIC})
+  set_target_properties(${DEST} PROPERTIES
+    SOVERSION ${SOVERSION}
+    VERSION ${VERSION}
+    LIBRARY_OUTPUT_DIRECTORY "${BUILD_LIB}")
+  install(TARGETS ${DEST} DESTINATION "${CMAKE_INSTALL_LIBDIR}")
+endfunction()
+
+# Create a special provider with exported symbols in it. The shared provider
+# exists as a normal system library with the normal shared library SONAME and
+# other conventions. The system library is symlinked into the
+# VERBS_PROVIDER_DIR so it can be dlopened as a provider as well.
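+#
+# An illustrative (hypothetical) call, matching the signature below:
+#   rdma_shared_provider(example libexample.map 1 1.0.${PACKAGE_VERSION} ex.c)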
+function(rdma_shared_provider DEST VERSION_SCRIPT SOVERSION VERSION) + # Installed driver file + file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/${DEST}.driver" "driver ${DEST}\n") + install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${DEST}.driver" DESTINATION "${CONFIG_DIR}") + + # Uninstalled driver file + file(MAKE_DIRECTORY "${BUILD_ETC}/libibverbs.d/") + file(WRITE "${BUILD_ETC}/libibverbs.d/${DEST}.driver" "driver ${BUILD_LIB}/lib${DEST}\n") + + list(APPEND RDMA_PROVIDER_LIST ${DEST} ${DEST}) + set(RDMA_PROVIDER_LIST "${RDMA_PROVIDER_LIST}" CACHE INTERNAL "") + + # Create a static provider library + if (ENABLE_STATIC) + add_library(${DEST}-static STATIC ${ARGN}) + rdma_public_static_lib(${DEST} ${DEST}-static ${VERSION_SCRIPT}) + endif() + + # Create the plugin shared library + add_library(${DEST} SHARED ${ARGN}) + rdma_set_library_map(${DEST} ${VERSION_SCRIPT}) + + target_link_libraries(${DEST} LINK_PRIVATE ${COMMON_LIBS_PIC}) + target_link_libraries(${DEST} LINK_PRIVATE ibverbs) + target_link_libraries(${DEST} LINK_PRIVATE ${CMAKE_THREAD_LIBS_INIT}) + set_target_properties(${DEST} PROPERTIES + SOVERSION ${SOVERSION} + VERSION ${VERSION} + LIBRARY_OUTPUT_DIRECTORY "${BUILD_LIB}") + install(TARGETS ${DEST} DESTINATION "${CMAKE_INSTALL_LIBDIR}") + + # Compute a relative symlink from VERBS_PROVIDER_DIR to LIBDIR + execute_process(COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/buildlib/relpath + "${CMAKE_INSTALL_FULL_LIBDIR}/lib${DEST}.so.${VERSION}" + "${VERBS_PROVIDER_DIR}" + OUTPUT_VARIABLE DEST_LINK_PATH OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE retcode) + if(NOT "${retcode}" STREQUAL "0") + message(FATAL_ERROR "Unable to run buildlib/relpath, do you have python?") + endif() + + rdma_install_symlink("${DEST_LINK_PATH}" "${VERBS_PROVIDER_DIR}/lib${DEST}${IBVERBS_PROVIDER_SUFFIX}") + rdma_create_symlink("lib${DEST}.so.${VERSION}" "${BUILD_LIB}/lib${DEST}${IBVERBS_PROVIDER_SUFFIX}") +endfunction() + +# Create a provider shared library for libibverbs +function(rdma_provider DEST) + # Installed driver file + file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/${DEST}.driver" "driver ${DEST}\n") + install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${DEST}.driver" DESTINATION "${CONFIG_DIR}") + + # Uninstalled driver file + file(MAKE_DIRECTORY "${BUILD_ETC}/libibverbs.d/") + file(WRITE "${BUILD_ETC}/libibverbs.d/${DEST}.driver" "driver ${BUILD_LIB}/lib${DEST}\n") + + list(APPEND RDMA_PROVIDER_LIST ${DEST} "${DEST}-rdmav${IBVERBS_PABI_VERSION}") + set(RDMA_PROVIDER_LIST "${RDMA_PROVIDER_LIST}" CACHE INTERNAL "") + + # Create a static provider library + if (ENABLE_STATIC) + add_library(${DEST} STATIC ${ARGN}) + rdma_public_static_lib("${DEST}-rdmav${IBVERBS_PABI_VERSION}" ${DEST} ${BUILDLIB}/provider.map) + endif() + + # Create the plugin shared library + set(DEST "${DEST}-rdmav${IBVERBS_PABI_VERSION}") + add_library(${DEST} MODULE ${ARGN}) + # Even though these are modules we still want to use Wl,--no-undefined + set_target_properties(${DEST} PROPERTIES LINK_FLAGS ${CMAKE_SHARED_LINKER_FLAGS}) + rdma_set_library_map(${DEST} ${BUILDLIB}/provider.map) + target_link_libraries(${DEST} LINK_PRIVATE ${COMMON_LIBS_PIC}) + target_link_libraries(${DEST} LINK_PRIVATE ibverbs) + target_link_libraries(${DEST} LINK_PRIVATE ${CMAKE_THREAD_LIBS_INIT}) + set_target_properties(${DEST} PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${BUILD_LIB}") + # Provider Plugins do not use SONAME versioning, there is no reason to + # create the usual symlinks. 
+ + if (VERBS_PROVIDER_DIR) + install(TARGETS ${DEST} DESTINATION "${VERBS_PROVIDER_DIR}") + else() + install(TARGETS ${DEST} DESTINATION "${CMAKE_INSTALL_LIBDIR}") + + # FIXME: This symlink is provided for compat with the old build, but it + # never should have existed in the first place, nothing should use this + # name, we can probably remove it. + rdma_install_symlink("lib${DEST}${IBVERBS_PROVIDER_SUFFIX}" "${CMAKE_INSTALL_LIBDIR}/lib${DEST}.so") + endif() +endfunction() + + # Create an installed executable +function(rdma_executable EXEC) + add_executable(${EXEC} ${ARGN}) + target_link_libraries(${EXEC} LINK_PRIVATE ${COMMON_LIBS}) + set_target_properties(${EXEC} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_BIN}") + install(TARGETS ${EXEC} DESTINATION "${CMAKE_INSTALL_BINDIR}") +endfunction() + + # Create an installed executable (under sbin) +function(rdma_sbin_executable EXEC) + add_executable(${EXEC} ${ARGN}) + target_link_libraries(${EXEC} LINK_PRIVATE ${COMMON_LIBS}) + set_target_properties(${EXEC} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_BIN}") + install(TARGETS ${EXEC} DESTINATION "${CMAKE_INSTALL_SBINDIR}") +endfunction() + +# Create an test executable (not-installed) +function(rdma_test_executable EXEC) + add_executable(${EXEC} ${ARGN}) + target_link_libraries(${EXEC} LINK_PRIVATE ${COMMON_LIBS}) + set_target_properties(${EXEC} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_BIN}") +endfunction() + +# Finalize the setup of the static libraries by copying the meta information +# from the shared to static and setting up the static builder +function(rdma_finalize_libs) + list(LENGTH RDMA_STATIC_LIBS LEN) + if (LEN LESS 3) + return() + endif() + + math(EXPR LEN ${LEN}-1) + foreach(I RANGE 0 ${LEN} 3) + list(GET RDMA_STATIC_LIBS ${I} SHARED) + math(EXPR I ${I}+1) + list(GET RDMA_STATIC_LIBS ${I} STATIC) + math(EXPR I ${I}+1) + list(GET RDMA_STATIC_LIBS ${I} MAP) + + # PUBLIC libraries + set(LIBS "") + get_property(TMP TARGET ${SHARED} PROPERTY INTERFACE_LINK_LIBRARIES SET) + if (TMP) + get_target_property(TMP ${SHARED} INTERFACE_LINK_LIBRARIES) + set_target_properties(${STATIC} PROPERTIES INTERFACE_LINK_LIBRARIES "${TMP}") + set(LIBS "${TMP}") + endif() + + # PRIVATE libraries + get_property(TMP TARGET ${SHARED} PROPERTY LINK_LIBRARIES SET) + if (TMP) + get_target_property(TMP ${SHARED} LINK_LIBRARIES) + set_target_properties(${STATIC} PROPERTIES LINK_LIBRARIES "${TMP}") + list(APPEND LIBS "${TMP}") + endif() + + set(ARGS ${ARGS} --map "${MAP}" --lib "$<TARGET_FILE:${STATIC}>") + set(DEPENDS ${DEPENDS} ${STATIC} ${MAP}) + + get_target_property(TMP ${STATIC} OUTPUT_NAME) + set(OUTPUTS ${OUTPUTS} "${BUILD_LIB}/lib${TMP}.a") + install(FILES "${BUILD_LIB}/lib${TMP}.a" DESTINATION "${CMAKE_INSTALL_LIBDIR}") + endforeach() + + foreach(STATIC ${COMMON_LIBS}) + set(ARGS ${ARGS} --internal_lib "$<TARGET_FILE:${STATIC}>") + set(DEPENDS ${DEPENDS} ${STATIC}) + endforeach() + + add_custom_command( + OUTPUT ${OUTPUTS} + COMMAND "${PYTHON_EXECUTABLE}" "${CMAKE_SOURCE_DIR}/buildlib/sanitize_static_lib.py" + --version ${PACKAGE_VERSION} + --ar "${CMAKE_AR}" --nm "${CMAKE_NM}" --objcopy "${CMAKE_OBJCOPY}" ${ARGS} + DEPENDS ${DEPENDS} "${CMAKE_SOURCE_DIR}/buildlib/sanitize_static_lib.py" + COMMENT "Building distributable static libraries" + VERBATIM) + add_custom_target("make_static" ALL DEPENDS ${OUTPUTS}) +endfunction() + +# Generate a pkg-config file +function(rdma_pkg_config PC_LIB_NAME PC_REQUIRES_PRIVATE PC_LIB_PRIVATE) + set(PC_LIB_NAME "${PC_LIB_NAME}") + set(PC_LIB_PRIVATE 
"${PC_LIB_PRIVATE}") + set(PC_REQUIRES_PRIVATE "${PC_REQUIRES_PRIVATE}") + get_target_property(PC_VERSION ${PC_LIB_NAME} VERSION) + + # With IN_PLACE=1 the install step is not run, so generate the file in the build dir + if (IN_PLACE) + set(PC_RPATH "-Wl,-rpath,\${libdir}") + endif() + + configure_file(${BUILDLIB}/template.pc.in ${BUILD_LIB}/pkgconfig/lib${PC_LIB_NAME}.pc @ONLY) + if (NOT IN_PLACE) + install(FILES ${BUILD_LIB}/pkgconfig/lib${PC_LIB_NAME}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) + endif() +endfunction() diff --git a/buildlib/rdma_man.cmake b/buildlib/rdma_man.cmake new file mode 100644 index 0000000..f8f43c9 --- /dev/null +++ b/buildlib/rdma_man.cmake @@ -0,0 +1,118 @@ +# COPYRIGHT (c) 2017-2018 Mellanox Technologies Ltd +# Licensed under BSD (MIT variant) or GPLv2. See COPYING. + +rdma_make_dir("${CMAKE_BINARY_DIR}/pandoc-prebuilt") +add_custom_target("docs" ALL DEPENDS "${OBJ}") + +function(rdma_man_get_prebuilt SRC OUT) + # If rst2man is not installed then we install the man page from the + # pre-built cache directory under buildlib. When the release tar file is + # made the man pages are pre-built and included. This is done via install + # so that ./build.sh never depends on pandoc, only 'ninja install'. + execute_process( + COMMAND "${PYTHON_EXECUTABLE}" "${CMAKE_SOURCE_DIR}/buildlib/pandoc-prebuilt.py" --retrieve "${CMAKE_SOURCE_DIR}" "${SRC}" + WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" + OUTPUT_VARIABLE OBJ + RESULT_VARIABLE retcode) + if(NOT "${retcode}" STREQUAL "0") + message(FATAL_ERROR "Failed to load prebuilt pandoc output") + endif() + set(${OUT} "${OBJ}" PARENT_SCOPE) +endfunction() + +function(rdma_md_man_page SRC MAN_SECT MANFN) + set(OBJ "${CMAKE_CURRENT_BINARY_DIR}/${MANFN}") + + if (PANDOC_EXECUTABLE) + add_custom_command( + OUTPUT "${OBJ}" + COMMAND "${PYTHON_EXECUTABLE}" "${CMAKE_SOURCE_DIR}/buildlib/pandoc-prebuilt.py" --build "${CMAKE_BINARY_DIR}" --pandoc "${PANDOC_EXECUTABLE}" "${SRC}" "${OBJ}" + MAIN_DEPENDENCY "${SRC}" + WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" + COMMENT "Creating man page ${MANFN}" + VERBATIM) + add_custom_target("man-${MANFN}" ALL DEPENDS "${OBJ}") + add_dependencies("docs" "man-${MANFN}") + else() + rdma_man_get_prebuilt(${SRC} OBJ) + endif() + + install(FILES "${OBJ}" + RENAME "${MANFN}" + DESTINATION "${CMAKE_INSTALL_MANDIR}/man${MAN_SECT}/") +endfunction() + +function(rdma_rst_man_page SRC MAN_SECT MANFN) + set(OBJ "${CMAKE_CURRENT_BINARY_DIR}/${MANFN}") + + if (RST2MAN_EXECUTABLE) + add_custom_command( + OUTPUT "${OBJ}" + COMMAND "${PYTHON_EXECUTABLE}" "${CMAKE_SOURCE_DIR}/buildlib/pandoc-prebuilt.py" --build "${CMAKE_BINARY_DIR}" --rst "${RST2MAN_EXECUTABLE}" "${SRC}" "${OBJ}" + MAIN_DEPENDENCY "${SRC}" + WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" + COMMENT "Creating man page ${MANFN}" + VERBATIM) + add_custom_target("man-${MANFN}" ALL DEPENDS "${OBJ}") + add_dependencies("docs" "man-${MANFN}") + else() + rdma_man_get_prebuilt(${SRC} OBJ) + endif() + + install(FILES "${OBJ}" + RENAME "${MANFN}" + DESTINATION "${CMAKE_INSTALL_MANDIR}/man${MAN_SECT}/") +endfunction() + +# Install man pages. 
This deduces the section from the trailing integer in the +# filename +function(rdma_man_pages) + foreach(I ${ARGN}) + if ("${I}" MATCHES "\\.md$") + string(REGEX REPLACE "^.+[.](.+)\\.md$" "\\1" MAN_SECT "${I}") + string(REGEX REPLACE "^(.+)\\.md$" "\\1" BASE_NAME "${I}") + get_filename_component(BASE_NAME "${BASE_NAME}" NAME) + + rdma_md_man_page( + "${I}" + "${MAN_SECT}" + "${BASE_NAME}") + elseif ("${I}" MATCHES "\\.in\\.rst$") + string(REGEX REPLACE "^.+[.](.+)\\.in\\.rst$" "\\1" MAN_SECT "${I}") + string(REGEX REPLACE "^(.+)\\.in\\.rst$" "\\1" BASE_NAME "${I}") + get_filename_component(BASE_NAME "${BASE_NAME}" NAME) + + configure_file("${I}" "${CMAKE_CURRENT_BINARY_DIR}/${BASE_NAME}.rst" @ONLY) + + rdma_rst_man_page( + "${CMAKE_CURRENT_BINARY_DIR}/${BASE_NAME}.rst" + "${MAN_SECT}" + "${BASE_NAME}") + elseif ("${I}" MATCHES "\\.in$") + string(REGEX REPLACE "^.+[.](.+)\\.in$" "\\1" MAN_SECT "${I}") + string(REGEX REPLACE "^(.+)\\.in$" "\\1" BASE_NAME "${I}") + get_filename_component(BASE_NAME "${BASE_NAME}" NAME) + rdma_subst_install(FILES "${I}" + DESTINATION "${CMAKE_INSTALL_MANDIR}/man${MAN_SECT}/" + RENAME "${BASE_NAME}") + else() + string(REGEX REPLACE "^.+[.](.+)$" "\\1" MAN_SECT "${I}") + install(FILES "${I}" DESTINATION "${CMAKE_INSTALL_MANDIR}/man${MAN_SECT}/") + endif() + endforeach() +endfunction() + +# Create an alias for a man page, using a symlink. +# Input is a list of pairs of names (MAN_PAGE ALIAS) +# NOTE: The section must currently be the same for both. +function(rdma_alias_man_pages) + list(LENGTH ARGN LEN) + math(EXPR LEN ${LEN}-1) + foreach(I RANGE 0 ${LEN} 2) + list(GET ARGN ${I} FROM) + math(EXPR I ${I}+1) + list(GET ARGN ${I} TO) + string(REGEX REPLACE "^.+[.](.+)$" "\\1" MAN_SECT ${FROM}) + rdma_install_symlink("${FROM}" "${CMAKE_INSTALL_MANDIR}/man${MAN_SECT}/${TO}") + endforeach() +endfunction() diff --git a/buildlib/relpath b/buildlib/relpath new file mode 100644 index 0000000..965ce37 --- /dev/null +++ b/buildlib/relpath @@ -0,0 +1,8 @@ +#!/usr/bin/env python +# Copyright 2017 Mellanox Technologies, Inc. +# Licensed under BSD (MIT variant) or GPLv2. See COPYING. + +import os +import sys + +print(os.path.relpath(sys.argv[1], sys.argv[2])) diff --git a/buildlib/sanitize_static_lib.py b/buildlib/sanitize_static_lib.py new file mode 100644 index 0000000..55b6926 --- /dev/null +++ b/buildlib/sanitize_static_lib.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python +# Copyright (c) 2018 Mellanox Technologies, Ltd. All rights reserved. +# Licensed under BSD (MIT variant) or GPLv2. See COPYING. +"""This tool is used to create installable versions of the static libraries in rdma-core. + +This is complicated because rdma-core was not designed with static libraries +in mind and relies on the dynamic linker to hide a variety of internal +details. + +The build uses several internal utility libraries across the providers and the +libraries. When building statically these libraries have to become inlined +into the various main libraries. This script figures out which static +libraries should include which internal libraries and inlines them +appropriately. + +rdma-core is not careful to use globally unique names throughout all the +libraries and all the providers. Normally the map file in the dynamic linker +will hide these external symbols. This script does something similar for static +linking by analyzing the libraries and map files then renaming internal +symbols with a globally unique prefix. 
+
+This is far too complicated to handle internally with cmake, so we have cmake
+produce the nearly completed libraries, then process them here using binutils,
+and finally produce the final installation-ready libraries."""
+
+import collections
+import subprocess
+import argparse
+import tempfile
+import itertools
+import sys
+import os
+import re
+
+SymVer = collections.namedtuple(
+    "SymVer", ["version", "prior_version", "globals", "locals"])
+
+try:
+    from tempfile import TemporaryDirectory
+except ImportError:
+    import shutil
+    import tempfile
+
+    # From /usr/lib/python3/dist-packages/setuptools/py31compat.py
+    class TemporaryDirectory(object):
+        def __init__(self):
+            self.name = None
+            self.name = tempfile.mkdtemp()
+
+        def __enter__(self):
+            return self.name
+
+        def __exit__(self, exctype, excvalue, exctrace):
+            try:
+                shutil.rmtree(self.name, True)
+            except OSError:
+                pass
+            self.name = None
+
+
+try:
+    from subprocess import check_output
+except ImportError:
+    # From /usr/lib/python2.7/subprocess.py
+    def check_output(*popenargs, **kwargs):
+        if 'stdout' in kwargs:
+            raise ValueError(
+                'stdout argument not allowed, it will be overridden.')
+        process = subprocess.Popen(
+            stdout=subprocess.PIPE, *popenargs, **kwargs)
+        output, unused_err = process.communicate()
+        retcode = process.poll()
+        if retcode:
+            cmd = kwargs.get("args")
+            if cmd is None:
+                cmd = popenargs[0]
+            raise subprocess.CalledProcessError(retcode, cmd, output=output)
+        return output
+
+    subprocess.check_output = check_output
+
+
+def parse_stanza(version, prior_version, lines):
+    gbl = []
+    local = []
+    cur = None
+
+    cur = 0
+    for I in re.finditer(
+            r"\s*(?:(global:)|(local:)(\s*\*\s*;)|(?:(\w+)\s*;))",
+            lines,
+            flags=re.DOTALL | re.MULTILINE):
+        if I.group(1):  # global
+            lst = gbl
+        if I.group(2):  # local
+            lst = local
+        if I.group(3):  # wildcard
+            lst.append("*")
+            assert (lst is not gbl)
+        if I.group(4):  # symbol name
+            lst.append(I.group(4))
+
+        assert cur == I.start()
+        cur = I.end()
+    assert cur == len(lines)
+
+    return SymVer(version or "", prior_version or "", gbl, local)
+
+
+def load_map(fn):
+    """This is a lame regex based parser for GNU linker map files. It asserts if
+    the map file is invalid.
It returns a list of the global symbols""" + with open(fn, "rt") as F: + lines = F.read() + p = re.compile(r"/\*.*?\*/", flags=re.DOTALL) + lines = re.sub(p, "", lines) + lines = lines.strip() + + # Extract each stanza + res = [] + cur = 0 + for I in re.finditer( + r"\s*(?:(\S+)\s+)?{(.*?)\s*}(\s*\S+)?\s*;", + lines, + flags=re.DOTALL | re.MULTILINE): + assert cur == I.start() + res.append(parse_stanza(I.group(1), I.group(3), I.group(2))) + cur = I.end() + assert cur == len(lines) + + return res + + +class Lib(object): + def __init__(self, libfn, tmpdir): + self.libfn = os.path.basename(libfn) + self.objdir = os.path.join(tmpdir, self.libfn) + self.final_objdir = os.path.join(tmpdir, "r-" + self.libfn) + self.final_lib = os.path.join(os.path.dirname(libfn), "..", self.libfn) + self.needs = set() + self.needed = set() + + os.makedirs(self.objdir) + os.makedirs(self.final_objdir) + + subprocess.check_call([args.ar, "x", libfn], cwd=self.objdir) + self.objects = [I for I in os.listdir(self.objdir)] + self.get_syms() + + def get_syms(self): + """Read the definedsymbols from each object file""" + self.syms = set() + self.needed_syms = set() + for I in self.objects: + I = os.path.join(self.objdir, I) + syms = subprocess.check_output([args.nm, "--defined-only", I]) + for ln in syms.decode().splitlines(): + ln = ln.split() + if ln[1].isupper(): + self.syms.add(ln[2]) + + syms = subprocess.check_output([args.nm, "--undefined-only", I]) + for ln in syms.decode().splitlines(): + ln = ln.split() + if ln[0].isupper(): + if not ln[1].startswith("verbs_provider_"): + self.needed_syms.add(ln[1]) + + def rename_syms(self, rename_fn): + """Invoke objcopy on all the objects to rename their symbols""" + for I in self.objects: + subprocess.check_call([ + args.objcopy, + "--redefine-syms=%s" % (rename_fn), + os.path.join(self.objdir, I), + os.path.join(self.final_objdir, I) + ]) + + def incorporate_internal(self, internal_libs): + """If this library requires an internal library then we want to inline it into + this lib when we reconstruct it.""" + for lib in self.needs.intersection(internal_libs): + self.objects.extend( + os.path.join(lib.final_objdir, I) for I in lib.objects) + + def finalize(self): + """Write out the now modified library""" + try: + os.unlink(self.final_lib) + except OSError: + pass + subprocess.check_call( + [args.ar, "qsc", self.final_lib] + + [os.path.join(self.final_objdir, I) for I in self.objects]) + + +def compute_graph(libs): + """Look at the symbols each library provides vs the symbols each library needs + and organize the libraries into a graph.""" + for a, b in itertools.permutations(libs, 2): + if not a.syms.isdisjoint(b.needed_syms): + b.needs.add(a) + a.needed.add(b) + + # Use transitivity to prune the needs list + def prune(cur_lib, to_prune): + for I in cur_lib.needed: + I.needs.discard(to_prune) + to_prune.needed.discard(I) + prune(I, to_prune) + + for cur_lib in libs: + for I in list(cur_lib.needed): + prune(I, cur_lib) + + +parser = argparse.ArgumentParser( + description='Generate static libraries for distribution') +parser.add_argument( + "--map", + dest="maps", + action="append", + help="List of map files defining all the public symbols", + default=[]) +parser.add_argument( + "--lib", dest="libs", action="append", help="The input static libraries") +parser.add_argument( + "--internal_lib", + dest="internal_libs", + action="append", + help= + "The internal static libraries, these will be merged into other libraries") +parser.add_argument( + "--version", action="store", 
help="Package version number", required=True) +parser.add_argument("--ar", action="store", help="ar tool", required=True) +parser.add_argument("--nm", action="store", help="nm tool", required=True) +parser.add_argument( + "--objcopy", action="store", help="objcopy tool", required=True) +args = parser.parse_args() + +global_syms = set() +for fn in sorted(set(args.maps)): + for I in load_map(fn): + # Private symbols in libibverbs are also mangled for maximum safety. + if "PRIVATE" not in I.version: + global_syms.update(I.globals) + +with TemporaryDirectory() as tmpdir: + libs = set(Lib(fn, tmpdir) for fn in args.libs) + internal_libs = set(Lib(fn, tmpdir) for fn in args.internal_libs) + all_libs = libs | internal_libs + + all_syms = set() + for I in all_libs: + all_syms.update(I.syms) + compute_graph(all_libs) + + # To support the ibv_static_providers() machinery these are made global + # too, even though they are not in map files. We only want to expose them + # for the static linking case. + global_syms.add("ibv_static_providers") + for I in all_syms: + if I.startswith("verbs_provider_"): + global_syms.add(I) + + # Generate a redefine file for objcopy that will sanitize the internal names + prefix = re.sub(r"\W", "_", args.version) + redefine_fn = os.path.join(tmpdir, "redefine") + with open(redefine_fn, "wt") as F: + for I in sorted(all_syms - global_syms): + F.write("%s rdmacore%s_%s\n" % (I, prefix, I)) + + for I in all_libs: + I.rename_syms(redefine_fn) + + for I in libs: + I.incorporate_internal(internal_libs) + I.finalize() diff --git a/buildlib/sparse-include/19/netinet-in.h.diff b/buildlib/sparse-include/19/netinet-in.h.diff new file mode 100644 index 0000000..6dd7645 --- /dev/null +++ b/buildlib/sparse-include/19/netinet-in.h.diff @@ -0,0 +1,121 @@ +--- /usr/include/netinet/in.h 2016-05-26 10:27:23.000000000 +0000 ++++ build-sparse/include/netinet/in.h 2017-03-15 21:50:20.436860311 +0000 +@@ -22,12 +22,12 @@ + #include <stdint.h> + #include <sys/socket.h> + #include <bits/types.h> +- ++#include <linux/types.h> + + __BEGIN_DECLS + + /* Internet address. */ +-typedef uint32_t in_addr_t; ++typedef __be32 in_addr_t; + struct in_addr + { + in_addr_t s_addr; +@@ -114,7 +114,7 @@ + #endif /* !__USE_KERNEL_IPV6_DEFS */ + + /* Type to represent a port. */ +-typedef uint16_t in_port_t; ++typedef __be16 in_port_t; + + /* Standard well-known ports. */ + enum +@@ -173,36 +173,36 @@ + #define IN_CLASSB_HOST (0xffffffff & ~IN_CLASSB_NET) + #define IN_CLASSB_MAX 65536 + +-#define IN_CLASSC(a) ((((in_addr_t)(a)) & 0xe0000000) == 0xc0000000) ++#define IN_CLASSC(a) ((((uint32_t)(a)) & 0xe0000000) == 0xc0000000) + #define IN_CLASSC_NET 0xffffff00 + #define IN_CLASSC_NSHIFT 8 + #define IN_CLASSC_HOST (0xffffffff & ~IN_CLASSC_NET) + +-#define IN_CLASSD(a) ((((in_addr_t)(a)) & 0xf0000000) == 0xe0000000) ++#define IN_CLASSD(a) ((((uint32_t)(a)) & 0xf0000000) == 0xe0000000) + #define IN_MULTICAST(a) IN_CLASSD(a) + +-#define IN_EXPERIMENTAL(a) ((((in_addr_t)(a)) & 0xe0000000) == 0xe0000000) +-#define IN_BADCLASS(a) ((((in_addr_t)(a)) & 0xf0000000) == 0xf0000000) ++#define IN_EXPERIMENTAL(a) ((((uint32_t)(a)) & 0xe0000000) == 0xe0000000) ++#define IN_BADCLASS(a) ((((uint32_t)(a)) & 0xf0000000) == 0xf0000000) + + /* Address to accept any incoming messages. */ +-#define INADDR_ANY ((in_addr_t) 0x00000000) ++#define INADDR_ANY ((uint32_t) 0x00000000) + /* Address to send to all hosts. 
*/ +-#define INADDR_BROADCAST ((in_addr_t) 0xffffffff) ++#define INADDR_BROADCAST ((uint32_t) 0xffffffff) + /* Address indicating an error return. */ +-#define INADDR_NONE ((in_addr_t) 0xffffffff) ++#define INADDR_NONE ((uint32_t) 0xffffffff) + + /* Network number for local host loopback. */ + #define IN_LOOPBACKNET 127 + /* Address to loopback in software to local host. */ + #ifndef INADDR_LOOPBACK +-# define INADDR_LOOPBACK ((in_addr_t) 0x7f000001) /* Inet 127.0.0.1. */ ++# define INADDR_LOOPBACK ((uint32_t) 0x7f000001) /* Inet 127.0.0.1. */ + #endif + + /* Defines for Multicast INADDR. */ +-#define INADDR_UNSPEC_GROUP ((in_addr_t) 0xe0000000) /* 224.0.0.0 */ +-#define INADDR_ALLHOSTS_GROUP ((in_addr_t) 0xe0000001) /* 224.0.0.1 */ +-#define INADDR_ALLRTRS_GROUP ((in_addr_t) 0xe0000002) /* 224.0.0.2 */ +-#define INADDR_MAX_LOCAL_GROUP ((in_addr_t) 0xe00000ff) /* 224.0.0.255 */ ++#define INADDR_UNSPEC_GROUP ((uint32_t) 0xe0000000) /* 224.0.0.0 */ ++#define INADDR_ALLHOSTS_GROUP ((uint32_t) 0xe0000001) /* 224.0.0.1 */ ++#define INADDR_ALLRTRS_GROUP ((uint32_t) 0xe0000002) /* 224.0.0.2 */ ++#define INADDR_MAX_LOCAL_GROUP ((uint32_t) 0xe00000ff) /* 224.0.0.255 */ + + #ifndef __USE_KERNEL_IPV6_DEFS + /* IPv6 address */ +@@ -212,8 +212,8 @@ + { + uint8_t __u6_addr8[16]; + #if defined __USE_MISC || defined __USE_GNU +- uint16_t __u6_addr16[8]; +- uint32_t __u6_addr32[4]; ++ __be16 __u6_addr16[8]; ++ __be32 __u6_addr32[4]; + #endif + } __in6_u; + #define s6_addr __in6_u.__u6_addr8 +@@ -253,7 +253,7 @@ + { + __SOCKADDR_COMMON (sin6_); + in_port_t sin6_port; /* Transport layer port # */ +- uint32_t sin6_flowinfo; /* IPv6 flow information */ ++ __be32 sin6_flowinfo; /* IPv6 flow information */ + struct in6_addr sin6_addr; /* IPv6 address */ + uint32_t sin6_scope_id; /* IPv6 scope-id */ + }; +@@ -371,12 +371,12 @@ + this was a short-sighted decision since on different systems the types + may have different representations but the values are always the same. */ + +-extern uint32_t ntohl (uint32_t __netlong) __THROW __attribute__ ((__const__)); +-extern uint16_t ntohs (uint16_t __netshort) ++extern uint32_t ntohl (__be32 __netlong) __THROW __attribute__ ((__const__)); ++extern uint16_t ntohs (__be16 __netshort) + __THROW __attribute__ ((__const__)); +-extern uint32_t htonl (uint32_t __hostlong) ++extern __be32 htonl (uint32_t __hostlong) + __THROW __attribute__ ((__const__)); +-extern uint16_t htons (uint16_t __hostshort) ++extern __be16 htons (uint16_t __hostshort) + __THROW __attribute__ ((__const__)); + + #include <endian.h> +@@ -384,7 +384,7 @@ + /* Get machine dependent optimized versions of byte swapping functions. */ + #include <bits/byteswap.h> + +-#ifdef __OPTIMIZE__ ++#ifdef __disabled_OPTIMIZE__ + /* We can optimize calls to the conversion functions. Either nothing has + to be done or we are using directly the byte-swapping functions which + often can be inlined. */ diff --git a/buildlib/sparse-include/23/netinet-in.h.diff b/buildlib/sparse-include/23/netinet-in.h.diff new file mode 100644 index 0000000..6156a96 --- /dev/null +++ b/buildlib/sparse-include/23/netinet-in.h.diff @@ -0,0 +1,121 @@ +--- /usr/include/netinet/in.h 2016-11-16 15:44:03.000000000 -0700 ++++ build-sparse/include/netinet/in.h 2017-03-15 13:55:43.865288477 -0600 +@@ -22,12 +22,12 @@ + #include <stdint.h> + #include <sys/socket.h> + #include <bits/types.h> +- ++#include <linux/types.h> + + __BEGIN_DECLS + + /* Internet address. 
*/ +-typedef uint32_t in_addr_t; ++typedef __be32 in_addr_t; + struct in_addr + { + in_addr_t s_addr; +@@ -116,7 +116,7 @@ + #endif /* !__USE_KERNEL_IPV6_DEFS */ + + /* Type to represent a port. */ +-typedef uint16_t in_port_t; ++typedef __be16 in_port_t; + + /* Standard well-known ports. */ + enum +@@ -175,36 +175,36 @@ + #define IN_CLASSB_HOST (0xffffffff & ~IN_CLASSB_NET) + #define IN_CLASSB_MAX 65536 + +-#define IN_CLASSC(a) ((((in_addr_t)(a)) & 0xe0000000) == 0xc0000000) ++#define IN_CLASSC(a) ((((uint32_t)(a)) & 0xe0000000) == 0xc0000000) + #define IN_CLASSC_NET 0xffffff00 + #define IN_CLASSC_NSHIFT 8 + #define IN_CLASSC_HOST (0xffffffff & ~IN_CLASSC_NET) + +-#define IN_CLASSD(a) ((((in_addr_t)(a)) & 0xf0000000) == 0xe0000000) ++#define IN_CLASSD(a) ((((uint32_t)(a)) & 0xf0000000) == 0xe0000000) + #define IN_MULTICAST(a) IN_CLASSD(a) + +-#define IN_EXPERIMENTAL(a) ((((in_addr_t)(a)) & 0xe0000000) == 0xe0000000) +-#define IN_BADCLASS(a) ((((in_addr_t)(a)) & 0xf0000000) == 0xf0000000) ++#define IN_EXPERIMENTAL(a) ((((uint32_t)(a)) & 0xe0000000) == 0xe0000000) ++#define IN_BADCLASS(a) ((((uint32_t)(a)) & 0xf0000000) == 0xf0000000) + + /* Address to accept any incoming messages. */ +-#define INADDR_ANY ((in_addr_t) 0x00000000) ++#define INADDR_ANY ((uint32_t) 0x00000000) + /* Address to send to all hosts. */ +-#define INADDR_BROADCAST ((in_addr_t) 0xffffffff) ++#define INADDR_BROADCAST ((uint32_t) 0xffffffff) + /* Address indicating an error return. */ +-#define INADDR_NONE ((in_addr_t) 0xffffffff) ++#define INADDR_NONE ((uint32_t) 0xffffffff) + + /* Network number for local host loopback. */ + #define IN_LOOPBACKNET 127 + /* Address to loopback in software to local host. */ + #ifndef INADDR_LOOPBACK +-# define INADDR_LOOPBACK ((in_addr_t) 0x7f000001) /* Inet 127.0.0.1. */ ++# define INADDR_LOOPBACK ((uint32_t) 0x7f000001) /* Inet 127.0.0.1. */ + #endif + + /* Defines for Multicast INADDR. */ +-#define INADDR_UNSPEC_GROUP ((in_addr_t) 0xe0000000) /* 224.0.0.0 */ +-#define INADDR_ALLHOSTS_GROUP ((in_addr_t) 0xe0000001) /* 224.0.0.1 */ +-#define INADDR_ALLRTRS_GROUP ((in_addr_t) 0xe0000002) /* 224.0.0.2 */ +-#define INADDR_MAX_LOCAL_GROUP ((in_addr_t) 0xe00000ff) /* 224.0.0.255 */ ++#define INADDR_UNSPEC_GROUP ((uint32_t) 0xe0000000) /* 224.0.0.0 */ ++#define INADDR_ALLHOSTS_GROUP ((uint32_t) 0xe0000001) /* 224.0.0.1 */ ++#define INADDR_ALLRTRS_GROUP ((uint32_t) 0xe0000002) /* 224.0.0.2 */ ++#define INADDR_MAX_LOCAL_GROUP ((uint32_t) 0xe00000ff) /* 224.0.0.255 */ + + #ifndef __USE_KERNEL_IPV6_DEFS + /* IPv6 address */ +@@ -214,8 +214,8 @@ + { + uint8_t __u6_addr8[16]; + #ifdef __USE_MISC +- uint16_t __u6_addr16[8]; +- uint32_t __u6_addr32[4]; ++ __be16 __u6_addr16[8]; ++ __be32 __u6_addr32[4]; + #endif + } __in6_u; + #define s6_addr __in6_u.__u6_addr8 +@@ -255,7 +255,7 @@ + { + __SOCKADDR_COMMON (sin6_); + in_port_t sin6_port; /* Transport layer port # */ +- uint32_t sin6_flowinfo; /* IPv6 flow information */ ++ __be32 sin6_flowinfo; /* IPv6 flow information */ + struct in6_addr sin6_addr; /* IPv6 address */ + uint32_t sin6_scope_id; /* IPv6 scope-id */ + }; +@@ -373,12 +373,12 @@ + this was a short-sighted decision since on different systems the types + may have different representations but the values are always the same. 
*/ + +-extern uint32_t ntohl (uint32_t __netlong) __THROW __attribute__ ((__const__)); +-extern uint16_t ntohs (uint16_t __netshort) ++extern uint32_t ntohl (__be32 __netlong) __THROW __attribute__ ((__const__)); ++extern uint16_t ntohs (__be16 __netshort) + __THROW __attribute__ ((__const__)); +-extern uint32_t htonl (uint32_t __hostlong) ++extern __be32 htonl (uint32_t __hostlong) + __THROW __attribute__ ((__const__)); +-extern uint16_t htons (uint16_t __hostshort) ++extern __be16 htons (uint16_t __hostshort) + __THROW __attribute__ ((__const__)); + + #include <endian.h> +@@ -386,7 +386,7 @@ + /* Get machine dependent optimized versions of byte swapping functions. */ + #include <bits/byteswap.h> + +-#ifdef __OPTIMIZE__ ++#ifdef __disabled_OPTIMIZE__ + /* We can optimize calls to the conversion functions. Either nothing has + to be done or we are using directly the byte-swapping functions which + often can be inlined. */ diff --git a/buildlib/sparse-include/23/sys-socket.h.diff b/buildlib/sparse-include/23/sys-socket.h.diff new file mode 100644 index 0000000..09cbea5 --- /dev/null +++ b/buildlib/sparse-include/23/sys-socket.h.diff @@ -0,0 +1,11 @@ +--- /usr/include/sys/socket.h 2016-11-16 15:43:53.000000000 -0700 ++++ build-sparse/include/sys/socket.h 2017-03-15 12:43:28.736376893 -0600 +@@ -65,7 +65,7 @@ + uses with any of the listed types to be allowed without complaint. + G++ 2.7 does not support transparent unions so there we want the + old-style declaration, too. */ +-#if defined __cplusplus || !__GNUC_PREREQ (2, 7) || !defined __USE_GNU ++#if 1 + # define __SOCKADDR_ARG struct sockaddr *__restrict + # define __CONST_SOCKADDR_ARG const struct sockaddr * + #else diff --git a/buildlib/sparse-include/25/netinet-in.h.diff b/buildlib/sparse-include/25/netinet-in.h.diff new file mode 100644 index 0000000..4238087 --- /dev/null +++ b/buildlib/sparse-include/25/netinet-in.h.diff @@ -0,0 +1,121 @@ +--- /usr/include/netinet/in.h 2017-03-09 00:51:29.000000000 +0000 ++++ build-tumbleweed/include/netinet/in.h 2017-03-21 18:13:51.951339197 +0000 +@@ -22,12 +22,12 @@ + #include <stdint.h> + #include <sys/socket.h> + #include <bits/types.h> +- ++#include <linux/types.h> + + __BEGIN_DECLS + + /* Internet address. */ +-typedef uint32_t in_addr_t; ++typedef __be32 in_addr_t; + struct in_addr + { + in_addr_t s_addr; +@@ -116,7 +116,7 @@ + #endif /* !__USE_KERNEL_IPV6_DEFS */ + + /* Type to represent a port. */ +-typedef uint16_t in_port_t; ++typedef __be16 in_port_t; + + /* Standard well-known ports. */ + enum +@@ -175,36 +175,36 @@ + #define IN_CLASSB_HOST (0xffffffff & ~IN_CLASSB_NET) + #define IN_CLASSB_MAX 65536 + +-#define IN_CLASSC(a) ((((in_addr_t)(a)) & 0xe0000000) == 0xc0000000) ++#define IN_CLASSC(a) ((((uint32_t)(a)) & 0xe0000000) == 0xc0000000) + #define IN_CLASSC_NET 0xffffff00 + #define IN_CLASSC_NSHIFT 8 + #define IN_CLASSC_HOST (0xffffffff & ~IN_CLASSC_NET) + +-#define IN_CLASSD(a) ((((in_addr_t)(a)) & 0xf0000000) == 0xe0000000) ++#define IN_CLASSD(a) ((((uint32_t)(a)) & 0xf0000000) == 0xe0000000) + #define IN_MULTICAST(a) IN_CLASSD(a) + +-#define IN_EXPERIMENTAL(a) ((((in_addr_t)(a)) & 0xe0000000) == 0xe0000000) +-#define IN_BADCLASS(a) ((((in_addr_t)(a)) & 0xf0000000) == 0xf0000000) ++#define IN_EXPERIMENTAL(a) ((((uint32_t)(a)) & 0xe0000000) == 0xe0000000) ++#define IN_BADCLASS(a) ((((uint32_t)(a)) & 0xf0000000) == 0xf0000000) + + /* Address to accept any incoming messages. 
*/ +-#define INADDR_ANY ((in_addr_t) 0x00000000) ++#define INADDR_ANY ((uint32_t) 0x00000000) + /* Address to send to all hosts. */ +-#define INADDR_BROADCAST ((in_addr_t) 0xffffffff) ++#define INADDR_BROADCAST ((uint32_t) 0xffffffff) + /* Address indicating an error return. */ +-#define INADDR_NONE ((in_addr_t) 0xffffffff) ++#define INADDR_NONE ((uint32_t) 0xffffffff) + + /* Network number for local host loopback. */ + #define IN_LOOPBACKNET 127 + /* Address to loopback in software to local host. */ + #ifndef INADDR_LOOPBACK +-# define INADDR_LOOPBACK ((in_addr_t) 0x7f000001) /* Inet 127.0.0.1. */ ++# define INADDR_LOOPBACK ((uint32_t) 0x7f000001) /* Inet 127.0.0.1. */ + #endif + + /* Defines for Multicast INADDR. */ +-#define INADDR_UNSPEC_GROUP ((in_addr_t) 0xe0000000) /* 224.0.0.0 */ +-#define INADDR_ALLHOSTS_GROUP ((in_addr_t) 0xe0000001) /* 224.0.0.1 */ +-#define INADDR_ALLRTRS_GROUP ((in_addr_t) 0xe0000002) /* 224.0.0.2 */ +-#define INADDR_MAX_LOCAL_GROUP ((in_addr_t) 0xe00000ff) /* 224.0.0.255 */ ++#define INADDR_UNSPEC_GROUP ((uint32_t) 0xe0000000) /* 224.0.0.0 */ ++#define INADDR_ALLHOSTS_GROUP ((uint32_t) 0xe0000001) /* 224.0.0.1 */ ++#define INADDR_ALLRTRS_GROUP ((uint32_t) 0xe0000002) /* 224.0.0.2 */ ++#define INADDR_MAX_LOCAL_GROUP ((uint32_t) 0xe00000ff) /* 224.0.0.255 */ + + #if !__USE_KERNEL_IPV6_DEFS + /* IPv6 address */ +@@ -213,8 +213,8 @@ + union + { + uint8_t __u6_addr8[16]; +- uint16_t __u6_addr16[8]; +- uint32_t __u6_addr32[4]; ++ __be16 __u6_addr16[8]; ++ __be32 __u6_addr32[4]; + } __in6_u; + #define s6_addr __in6_u.__u6_addr8 + #ifdef __USE_MISC +@@ -253,7 +253,7 @@ + { + __SOCKADDR_COMMON (sin6_); + in_port_t sin6_port; /* Transport layer port # */ +- uint32_t sin6_flowinfo; /* IPv6 flow information */ ++ __be32 sin6_flowinfo; /* IPv6 flow information */ + struct in6_addr sin6_addr; /* IPv6 address */ + uint32_t sin6_scope_id; /* IPv6 scope-id */ + }; +@@ -371,12 +371,12 @@ + this was a short-sighted decision since on different systems the types + may have different representations but the values are always the same. */ + +-extern uint32_t ntohl (uint32_t __netlong) __THROW __attribute__ ((__const__)); +-extern uint16_t ntohs (uint16_t __netshort) ++extern uint32_t ntohl (__be32 __netlong) __THROW __attribute__ ((__const__)); ++extern uint16_t ntohs (__be16 __netshort) + __THROW __attribute__ ((__const__)); +-extern uint32_t htonl (uint32_t __hostlong) ++extern __be32 htonl (uint32_t __hostlong) + __THROW __attribute__ ((__const__)); +-extern uint16_t htons (uint16_t __hostshort) ++extern __be16 htons (uint16_t __hostshort) + __THROW __attribute__ ((__const__)); + + #include <endian.h> +@@ -385,7 +385,7 @@ + #include <bits/byteswap.h> + #include <bits/uintn-identity.h> + +-#ifdef __OPTIMIZE__ ++#ifdef __disabled_OPTIMIZE__ + /* We can optimize calls to the conversion functions. Either nothing has + to be done or we are using directly the byte-swapping functions which + often can be inlined. 
*/ diff --git a/buildlib/sparse-include/27/bits-sysmacros.h.diff b/buildlib/sparse-include/27/bits-sysmacros.h.diff new file mode 100644 index 0000000..4ec5864 --- /dev/null +++ b/buildlib/sparse-include/27/bits-sysmacros.h.diff @@ -0,0 +1,24 @@ +--- /usr/include/bits/sysmacros.h 2018-04-16 20:14:20.000000000 +0000 ++++ include/bits/sysmacros.h 2019-05-16 19:30:02.096174695 +0000 +@@ -40,8 +40,8 @@ + __SYSMACROS_DECLARE_MAJOR (DECL_TEMPL) \ + { \ + unsigned int __major; \ +- __major = ((__dev & (__dev_t) 0x00000000000fff00u) >> 8); \ +- __major |= ((__dev & (__dev_t) 0xfffff00000000000u) >> 32); \ ++ __major = ((__dev & (__dev_t) 0x00000000000fff00ul) >> 8); \ ++ __major |= ((__dev & (__dev_t) 0xfffff00000000000ul) >> 32); \ + return __major; \ + } + +@@ -52,8 +52,8 @@ + __SYSMACROS_DECLARE_MINOR (DECL_TEMPL) \ + { \ + unsigned int __minor; \ +- __minor = ((__dev & (__dev_t) 0x00000000000000ffu) >> 0); \ +- __minor |= ((__dev & (__dev_t) 0x00000ffffff00000u) >> 12); \ ++ __minor = ((__dev & (__dev_t) 0x00000000000000fful) >> 0); \ ++ __minor |= ((__dev & (__dev_t) 0x00000ffffff00000ul) >> 12); \ + return __minor; \ + } + diff --git a/buildlib/sparse-include/27/netinet-in.h.diff b/buildlib/sparse-include/27/netinet-in.h.diff new file mode 100644 index 0000000..685f23f --- /dev/null +++ b/buildlib/sparse-include/27/netinet-in.h.diff @@ -0,0 +1,121 @@ +--- /usr/include/netinet/in.h 2018-04-16 20:14:20.000000000 +0000 ++++ include/netinet/in.h 2019-05-16 19:22:42.725853784 +0000 +@@ -22,12 +22,12 @@ + #include <bits/stdint-uintn.h> + #include <sys/socket.h> + #include <bits/types.h> +- ++#include <linux/types.h> + + __BEGIN_DECLS + + /* Internet address. */ +-typedef uint32_t in_addr_t; ++typedef __be32 in_addr_t; + struct in_addr + { + in_addr_t s_addr; +@@ -116,7 +116,7 @@ + #endif /* !__USE_KERNEL_IPV6_DEFS */ + + /* Type to represent a port. */ +-typedef uint16_t in_port_t; ++typedef __be16 in_port_t; + + /* Standard well-known ports. */ + enum +@@ -175,36 +175,36 @@ + #define IN_CLASSB_HOST (0xffffffff & ~IN_CLASSB_NET) + #define IN_CLASSB_MAX 65536 + +-#define IN_CLASSC(a) ((((in_addr_t)(a)) & 0xe0000000) == 0xc0000000) ++#define IN_CLASSC(a) ((((uint32_t)(a)) & 0xe0000000) == 0xc0000000) + #define IN_CLASSC_NET 0xffffff00 + #define IN_CLASSC_NSHIFT 8 + #define IN_CLASSC_HOST (0xffffffff & ~IN_CLASSC_NET) + +-#define IN_CLASSD(a) ((((in_addr_t)(a)) & 0xf0000000) == 0xe0000000) ++#define IN_CLASSD(a) ((((uint32_t)(a)) & 0xf0000000) == 0xe0000000) + #define IN_MULTICAST(a) IN_CLASSD(a) + +-#define IN_EXPERIMENTAL(a) ((((in_addr_t)(a)) & 0xe0000000) == 0xe0000000) +-#define IN_BADCLASS(a) ((((in_addr_t)(a)) & 0xf0000000) == 0xf0000000) ++#define IN_EXPERIMENTAL(a) ((((uint32_t)(a)) & 0xe0000000) == 0xe0000000) ++#define IN_BADCLASS(a) ((((uint32_t)(a)) & 0xf0000000) == 0xf0000000) + + /* Address to accept any incoming messages. */ +-#define INADDR_ANY ((in_addr_t) 0x00000000) ++#define INADDR_ANY ((uint32_t) 0x00000000) + /* Address to send to all hosts. */ +-#define INADDR_BROADCAST ((in_addr_t) 0xffffffff) ++#define INADDR_BROADCAST ((uint32_t) 0xffffffff) + /* Address indicating an error return. */ +-#define INADDR_NONE ((in_addr_t) 0xffffffff) ++#define INADDR_NONE ((uint32_t) 0xffffffff) + + /* Network number for local host loopback. */ + #define IN_LOOPBACKNET 127 + /* Address to loopback in software to local host. */ + #ifndef INADDR_LOOPBACK +-# define INADDR_LOOPBACK ((in_addr_t) 0x7f000001) /* Inet 127.0.0.1. */ ++# define INADDR_LOOPBACK ((uint32_t) 0x7f000001) /* Inet 127.0.0.1. 
*/ + #endif + + /* Defines for Multicast INADDR. */ +-#define INADDR_UNSPEC_GROUP ((in_addr_t) 0xe0000000) /* 224.0.0.0 */ +-#define INADDR_ALLHOSTS_GROUP ((in_addr_t) 0xe0000001) /* 224.0.0.1 */ +-#define INADDR_ALLRTRS_GROUP ((in_addr_t) 0xe0000002) /* 224.0.0.2 */ +-#define INADDR_MAX_LOCAL_GROUP ((in_addr_t) 0xe00000ff) /* 224.0.0.255 */ ++#define INADDR_UNSPEC_GROUP ((uint32_t) 0xe0000000) /* 224.0.0.0 */ ++#define INADDR_ALLHOSTS_GROUP ((uint32_t) 0xe0000001) /* 224.0.0.1 */ ++#define INADDR_ALLRTRS_GROUP ((uint32_t) 0xe0000002) /* 224.0.0.2 */ ++#define INADDR_MAX_LOCAL_GROUP ((uint32_t) 0xe00000ff) /* 224.0.0.255 */ + + #if !__USE_KERNEL_IPV6_DEFS + /* IPv6 address */ +@@ -213,8 +213,8 @@ + union + { + uint8_t __u6_addr8[16]; +- uint16_t __u6_addr16[8]; +- uint32_t __u6_addr32[4]; ++ __be16 __u6_addr16[8]; ++ __be32 __u6_addr32[4]; + } __in6_u; + #define s6_addr __in6_u.__u6_addr8 + #ifdef __USE_MISC +@@ -253,7 +253,7 @@ + { + __SOCKADDR_COMMON (sin6_); + in_port_t sin6_port; /* Transport layer port # */ +- uint32_t sin6_flowinfo; /* IPv6 flow information */ ++ __be32 sin6_flowinfo; /* IPv6 flow information */ + struct in6_addr sin6_addr; /* IPv6 address */ + uint32_t sin6_scope_id; /* IPv6 scope-id */ + }; +@@ -371,12 +371,12 @@ + this was a short-sighted decision since on different systems the types + may have different representations but the values are always the same. */ + +-extern uint32_t ntohl (uint32_t __netlong) __THROW __attribute__ ((__const__)); +-extern uint16_t ntohs (uint16_t __netshort) ++extern uint32_t ntohl (__be32 __netlong) __THROW __attribute__ ((__const__)); ++extern uint16_t ntohs (__be16 __netshort) + __THROW __attribute__ ((__const__)); +-extern uint32_t htonl (uint32_t __hostlong) ++extern __be32 htonl (uint32_t __hostlong) + __THROW __attribute__ ((__const__)); +-extern uint16_t htons (uint16_t __hostshort) ++extern __be16 htons (uint16_t __hostshort) + __THROW __attribute__ ((__const__)); + + #include <endian.h> +@@ -385,7 +385,7 @@ + #include <bits/byteswap.h> + #include <bits/uintn-identity.h> + +-#ifdef __OPTIMIZE__ ++#ifdef __disabled_OPTIMIZE__ + /* We can optimize calls to the conversion functions. Either nothing has + to be done or we are using directly the byte-swapping functions which + often can be inlined. */ diff --git a/buildlib/sparse-include/27/stdlib.h.diff b/buildlib/sparse-include/27/stdlib.h.diff new file mode 100644 index 0000000..5ddced1 --- /dev/null +++ b/buildlib/sparse-include/27/stdlib.h.diff @@ -0,0 +1,23 @@ +--- /usr/include/stdlib.h 2018-04-16 20:14:20.000000000 +0000 ++++ include/stdlib.h 2019-05-16 19:38:38.071615242 +0000 +@@ -130,6 +130,20 @@ + + /* Likewise for '_FloatN' and '_FloatNx'. 
*/ + ++/* For whatever reason our sparse does not understand these new compiler types */ ++#undef __GLIBC_USE_IEC_60559_TYPES_EXT ++#define __GLIBC_USE_IEC_60559_TYPES_EXT 0 ++#undef __HAVE_FLOAT32 ++#define __HAVE_FLOAT32 0 ++#undef __HAVE_FLOAT32X ++#define __HAVE_FLOAT32X 0 ++#undef __HAVE_FLOAT64 ++#define __HAVE_FLOAT64 0 ++#undef __HAVE_FLOAT64X ++#define __HAVE_FLOAT64X 0 ++#undef __HAVE_FLOAT128 ++#define __HAVE_FLOAT128 0 ++ + #if __HAVE_FLOAT16 && __GLIBC_USE (IEC_60559_TYPES_EXT) + extern _Float16 strtof16 (const char *__restrict __nptr, + char **__restrict __endptr) diff --git a/buildlib/sparse-include/27/sys-socket.h.diff b/buildlib/sparse-include/27/sys-socket.h.diff new file mode 100644 index 0000000..92ee3bf --- /dev/null +++ b/buildlib/sparse-include/27/sys-socket.h.diff @@ -0,0 +1,11 @@ +--- /usr/include/sys/socket.h 2018-04-16 20:14:20.000000000 +0000 ++++ include/sys/socket.h 2019-05-16 19:22:42.721853727 +0000 +@@ -54,7 +54,7 @@ + uses with any of the listed types to be allowed without complaint. + G++ 2.7 does not support transparent unions so there we want the + old-style declaration, too. */ +-#if defined __cplusplus || !__GNUC_PREREQ (2, 7) || !defined __USE_GNU ++#if 1 + # define __SOCKADDR_ARG struct sockaddr *__restrict + # define __CONST_SOCKADDR_ARG const struct sockaddr * + #else diff --git a/buildlib/sparse-include/endian.h b/buildlib/sparse-include/endian.h new file mode 100644 index 0000000..26c317b --- /dev/null +++ b/buildlib/sparse-include/endian.h @@ -0,0 +1,44 @@ +/* COPYRIGHT (c) 2017 Obsidian Research Corporation. + Licensed under BSD (MIT variant) or GPLv2. See COPYING. */ + +#ifndef _SPARSE_ENDIAN_H_ +#define _SPARSE_ENDIAN_H_ + +#include_next <endian.h> + +#include <util/compiler.h> + +#undef htobe16 +#undef htole16 +#undef be16toh +#undef le16toh + +#undef htobe32 +#undef htole32 +#undef be32toh +#undef le32toh + +#undef htobe64 +#undef htole64 +#undef be64toh +#undef le64toh + +/* These do not actually work, but this trivially ensures that sparse sees all + * the types. */ + +#define htobe16(x) ((__force __be16)__builtin_bswap16(x)) +#define htole16(x) ((__force __le16)__builtin_bswap16(x)) +#define be16toh(x) ((uint16_t)__builtin_bswap16((__force uint16_t)(__be16)(x))) +#define le16toh(x) ((uint16_t)__builtin_bswap16((__force uint16_t)(__le16)(x))) + +#define htobe32(x) ((__force __be32)__builtin_bswap32(x)) +#define htole32(x) ((__force __le32)__builtin_bswap32(x)) +#define be32toh(x) ((uint32_t)__builtin_bswap32((__force uint32_t)(__be32)(x))) +#define le32toh(x) ((uint32_t)__builtin_bswap32((__force uint32_t)(__le32)(x))) + +#define htobe64(x) ((__force __be64)__builtin_bswap64(x)) +#define htole64(x) ((__force __le64)__builtin_bswap64(x)) +#define be64toh(x) ((uint64_t)__builtin_bswap64((__force uint64_t)(__be64)(x))) +#define le64toh(x) ((uint64_t)__builtin_bswap64((__force uint64_t)(__le64)(x))) + +#endif diff --git a/buildlib/sparse-include/pthread.h b/buildlib/sparse-include/pthread.h new file mode 100644 index 0000000..bd38b60 --- /dev/null +++ b/buildlib/sparse-include/pthread.h @@ -0,0 +1,13 @@ +/* COPYRIGHT (c) 2017 Obsidian Research Corporation. + Licensed under BSD (MIT variant) or GPLv2. See COPYING. 
*/
+
+#ifndef _SPARSE_PTHREAD_H_
+#define _SPARSE_PTHREAD_H_
+
+#include_next <pthread.h>
+
+/* Sparse complains that the glibc version of this has 0 instead of NULL */
+#undef PTHREAD_MUTEX_INITIALIZER
+#define PTHREAD_MUTEX_INITIALIZER {}
+
+#endif
diff --git a/buildlib/sparse-include/stdatomic.h b/buildlib/sparse-include/stdatomic.h
new file mode 100644
index 0000000..20fde14
--- /dev/null
+++ b/buildlib/sparse-include/stdatomic.h
@@ -0,0 +1,176 @@
+/* COPYRIGHT (c) 2017 Obsidian Research Corporation.
+ * Licensed under BSD (MIT variant) or GPLv2. See COPYING.
+ *
+ * A version of C11 stdatomic.h that doesn't make sparse angry. This doesn't
+ * actually work.
+ */
+
+#ifndef _SPARSE_STDATOMIC_H_
+#define _SPARSE_STDATOMIC_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define _Atomic(T) struct {volatile __typeof__(T) __val; }
+
+#define ATOMIC_VAR_INIT(value) \
+	{ \
+		.__val = (value) \
+	}
+#define atomic_init(obj, value) \
+	do { \
+		(obj)->__val = (value); \
+	} while (0)
+
+enum memory_order {
+	memory_order_relaxed,
+	memory_order_consume,
+	memory_order_acquire,
+	memory_order_release,
+	memory_order_acq_rel,
+	memory_order_seq_cst,
+};
+
+typedef enum memory_order memory_order;
+
+#define atomic_thread_fence(order) __asm volatile("" : : : "memory")
+#define atomic_signal_fence(order) __asm volatile("" : : : "memory")
+
+#define atomic_is_lock_free(obj) (sizeof((obj)->__val) <= sizeof(void *))
+
+typedef _Atomic(_Bool) atomic_bool;
+typedef _Atomic(char) atomic_char;
+typedef _Atomic(signed char) atomic_schar;
+typedef _Atomic(unsigned char) atomic_uchar;
+typedef _Atomic(short) atomic_short;
+typedef _Atomic(unsigned short) atomic_ushort;
+typedef _Atomic(int) atomic_int;
+typedef _Atomic(unsigned int) atomic_uint;
+typedef _Atomic(long) atomic_long;
+typedef _Atomic(unsigned long) atomic_ulong;
+typedef _Atomic(long long) atomic_llong;
+typedef _Atomic(unsigned long long) atomic_ullong;
+typedef _Atomic(wchar_t) atomic_wchar_t;
+typedef _Atomic(int_least8_t) atomic_int_least8_t;
+typedef _Atomic(uint_least8_t) atomic_uint_least8_t;
+typedef _Atomic(int_least16_t) atomic_int_least16_t;
+typedef _Atomic(uint_least16_t) atomic_uint_least16_t;
+typedef _Atomic(int_least32_t) atomic_int_least32_t;
+typedef _Atomic(uint_least32_t) atomic_uint_least32_t;
+typedef _Atomic(int_least64_t) atomic_int_least64_t;
+typedef _Atomic(uint_least64_t) atomic_uint_least64_t;
+typedef _Atomic(int_fast8_t) atomic_int_fast8_t;
+typedef _Atomic(uint_fast8_t) atomic_uint_fast8_t;
+typedef _Atomic(int_fast16_t) atomic_int_fast16_t;
+typedef _Atomic(uint_fast16_t) atomic_uint_fast16_t;
+typedef _Atomic(int_fast32_t) atomic_int_fast32_t;
+typedef _Atomic(uint_fast32_t) atomic_uint_fast32_t;
+typedef _Atomic(int_fast64_t) atomic_int_fast64_t;
+typedef _Atomic(uint_fast64_t) atomic_uint_fast64_t;
+typedef _Atomic(intptr_t) atomic_intptr_t;
+typedef _Atomic(uintptr_t) atomic_uintptr_t;
+typedef _Atomic(size_t) atomic_size_t;
+typedef _Atomic(ptrdiff_t) atomic_ptrdiff_t;
+typedef _Atomic(intmax_t) atomic_intmax_t;
+typedef _Atomic(uintmax_t) atomic_uintmax_t;
+
+#define atomic_compare_exchange_strong_explicit(object, expected, desired, \
+						success, failure) \
+	({ \
+		__typeof__((object)->__val) __v = (object)->__val; \
+		bool __r; \
+		if (__v == *(expected)) { \
+			__r = true; \
+			(object)->__val = (desired); \
+		} else { \
+			__r = false; \
+			*(expected) = __v; \
+		} \
+		__r; \
+	})
+
+#define atomic_compare_exchange_weak_explicit(object, expected, desired, \
+					      success, failure) \
+	atomic_compare_exchange_strong_explicit(object, expected, desired, \
+						success, failure)
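+
+/*
+ * Editor's note, an illustrative sketch that is not part of the upstream
+ * header: callers use this shim exactly like the real <stdatomic.h>, which
+ * is all sparse needs in order to type-check them; the expansions above are
+ * plain, non-atomic loads and stores:
+ *
+ *	atomic_int cnt = ATOMIC_VAR_INIT(0);
+ *	int expected = 0;
+ *
+ *	if (atomic_compare_exchange_strong(&cnt, &expected, 1))
+ *		...;	// cnt was 0 and is now 1; otherwise expected holds cnt
+ */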
+
+#define atomic_exchange_explicit(object, desired, order) \
+	({ \
+		__typeof__((object)->__val) __v = (object)->__val; \
+		(object)->__val = (desired); \
+		__v; \
+	})
+#define atomic_fetch_add_explicit(object, operand, order) \
+	({ \
+		__typeof__((object)->__val) __v = (object)->__val; \
+		(object)->__val += (operand); \
+		__v; \
+	})
+#define atomic_fetch_and_explicit(object, operand, order) \
+	({ \
+		__typeof__((object)->__val) __v = (object)->__val; \
+		(object)->__val &= (operand); \
+		__v; \
+	})
+#define atomic_fetch_or_explicit(object, operand, order) \
+	({ \
+		__typeof__((object)->__val) __v = (object)->__val; \
+		(object)->__val |= (operand); \
+		__v; \
+	})
+#define atomic_fetch_sub_explicit(object, operand, order) \
+	({ \
+		__typeof__((object)->__val) __v = (object)->__val; \
+		(object)->__val -= (operand); \
+		__v; \
+	})
+#define atomic_fetch_xor_explicit(object, operand, order) \
+	({ \
+		__typeof__((object)->__val) __v = (object)->__val; \
+		(object)->__val ^= (operand); \
+		__v; \
+	})
+
+#define atomic_load_explicit(object, order) ((object)->__val)
+#define atomic_store_explicit(object, desired, order) \
+	({ (object)->__val = (desired); })
+
+#define atomic_compare_exchange_strong(object, expected, desired) \
+	atomic_compare_exchange_strong_explicit(object, expected, desired, \
+						memory_order_seq_cst, \
+						memory_order_seq_cst)
+#define atomic_compare_exchange_weak(object, expected, desired) \
+	atomic_compare_exchange_weak_explicit(object, expected, desired, \
+					      memory_order_seq_cst, \
+					      memory_order_seq_cst)
+#define atomic_exchange(object, desired) \
+	atomic_exchange_explicit(object, desired, memory_order_seq_cst)
+#define atomic_fetch_add(object, operand) \
+	atomic_fetch_add_explicit(object, operand, memory_order_seq_cst)
+#define atomic_fetch_and(object, operand) \
+	atomic_fetch_and_explicit(object, operand, memory_order_seq_cst)
+#define atomic_fetch_or(object, operand) \
+	atomic_fetch_or_explicit(object, operand, memory_order_seq_cst)
+#define atomic_fetch_sub(object, operand) \
+	atomic_fetch_sub_explicit(object, operand, memory_order_seq_cst)
+#define atomic_fetch_xor(object, operand) \
+	atomic_fetch_xor_explicit(object, operand, memory_order_seq_cst)
+#define atomic_load(object) atomic_load_explicit(object, memory_order_seq_cst)
+#define atomic_store(object, desired) \
+	atomic_store_explicit(object, desired, memory_order_seq_cst)
+
+typedef atomic_bool atomic_flag;
+
+#define ATOMIC_FLAG_INIT ATOMIC_VAR_INIT(0)
+
+#define atomic_flag_clear_explicit(object, order) \
+	atomic_store_explicit(object, 0, order)
+#define atomic_flag_test_and_set_explicit(object, order) \
+	atomic_compare_exchange_strong_explicit(object, 0, 1, order, order)
+
+#define atomic_flag_clear(object) \
+	atomic_flag_clear_explicit(object, memory_order_seq_cst)
+#define atomic_flag_test_and_set(object) \
+	atomic_flag_test_and_set_explicit(object, memory_order_seq_cst)
+
+#endif
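[Editor's note: an illustrative sketch, not part of the patch. After CMake
substitutes the @...@ placeholders in the template added below, the installed
pkg-config file for a hypothetical libexample would look roughly like:

  prefix=/usr
  exec_prefix=${prefix}
  libdir=${prefix}/lib64
  includedir=${prefix}/include

  Name: libexample
  Description: RDMA Core Userspace Library
  URL: https://github.com/linux-rdma/rdma-core
  Version: 29.0
  Libs: -L${libdir} -lexample
  Cflags: -I${includedir}]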
diff --git a/buildlib/template.pc.in b/buildlib/template.pc.in
new file mode 100644
index 0000000..618840c
--- /dev/null
+++ b/buildlib/template.pc.in
@@ -0,0 +1,13 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}
+libdir=${prefix}/@CMAKE_INSTALL_LIBDIR@
+includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
+
+Name: lib@PC_LIB_NAME@
+Description: RDMA Core Userspace Library
+URL: https://github.com/linux-rdma/rdma-core
+Version: @PC_VERSION@
+Libs: -L${libdir} -l@PC_LIB_NAME@ @PC_RPATH@
+Libs.private: @PC_LIB_PRIVATE@
+Requires.private: @PC_REQUIRES_PRIVATE@
+Cflags: -I${includedir}
diff --git a/ccan/CMakeLists.txt b/ccan/CMakeLists.txt
new file mode 100644
index 0000000..5c5c6a2
--- /dev/null
+++ b/ccan/CMakeLists.txt
@@ -0,0 +1,23 @@
+publish_internal_headers(ccan
+  array_size.h
+  bitmap.h
+  build_assert.h
+  check_type.h
+  compiler.h
+  container_of.h
+  ilog.h
+  list.h
+  minmax.h
+  str.h
+  str_debug.h
+  )
+
+set(C_FILES
+  bitmap.c
+  ilog.c
+  list.c
+  str.c
+  )
+add_library(ccan STATIC ${C_FILES})
+add_library(ccan_pic STATIC ${C_FILES})
+set_property(TARGET ccan_pic PROPERTY POSITION_INDEPENDENT_CODE TRUE)
diff --git a/ccan/LICENSE.CC0 b/ccan/LICENSE.CC0
new file mode 100644
index 0000000..57f2f1b
--- /dev/null
+++ b/ccan/LICENSE.CC0
@@ -0,0 +1,97 @@
+Statement of Purpose
+
+The laws of most jurisdictions throughout the world automatically confer
+exclusive Copyright and Related Rights (defined below) upon the creator and
+subsequent owner(s) (each and all, an "owner") of an original work of
+authorship and/or a database (each, a "Work").
+
+Certain owners wish to permanently relinquish those rights to a Work for the
+purpose of contributing to a commons of creative, cultural and scientific works
+("Commons") that the public can reliably and without fear of later claims of
+infringement build upon, modify, incorporate in other works, reuse and
+redistribute as freely as possible in any form whatsoever and for any purposes,
+including without limitation commercial purposes. These owners may contribute
+to the Commons to promote the ideal of a free culture and the further
+production of creative, cultural and scientific works, or to gain reputation or
+greater distribution for their Work in part through the use and efforts of
+others.
+
+For these and/or other purposes and motivations, and without any expectation of
+additional consideration or compensation, the person associating CC0 with a
+Work (the "Affirmer"), to the extent that he or she is an owner of Copyright
+and Related Rights in the Work, voluntarily elects to apply CC0 to the Work and
+publicly distribute the Work under its terms, with knowledge of his or her
+Copyright and Related Rights in the Work and the meaning and intended legal
+effect of CC0 on those rights.
+
+1. Copyright and Related Rights. A Work made available under CC0 may be
+protected by copyright and related or neighboring rights ("Copyright and
+Related Rights"). Copyright and Related Rights include, but are not limited to,
+the following:
+
+  the right to reproduce, adapt, distribute, perform, display, communicate,
+and translate a Work; moral rights retained by the original author(s) and/or
+performer(s); publicity and privacy rights pertaining to a person's image or
+likeness depicted in a Work; rights protecting against unfair competition in
+regards to a Work, subject to the limitations in paragraph 4(a), below; rights
+protecting the extraction, dissemination, use and reuse of data in a Work;
+database rights (such as those arising under Directive 96/9/EC of the European
+Parliament and of the Council of 11 March 1996 on the legal protection of
+databases, and under any national implementation thereof, including any amended
+or successor version of such directive); and other similar, equivalent or
+corresponding rights throughout the world based on applicable law or treaty,
+and any national implementations thereof.
+
+2. Waiver. 
To the greatest extent permitted by, but not in contravention of, +applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and +unconditionally waives, abandons, and surrenders all of Affirmer's Copyright +and Related Rights and associated claims and causes of action, whether now +known or unknown (including existing as well as future claims and causes of +action), in the Work (i) in all territories worldwide, (ii) for the maximum +duration provided by applicable law or treaty (including future time +extensions), (iii) in any current or future medium and for any number of +copies, and (iv) for any purpose whatsoever, including without limitation +commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes +the Waiver for the benefit of each member of the public at large and to the +detriment of Affirmer's heirs and successors, fully intending that such Waiver +shall not be subject to revocation, rescission, cancellation, termination, or +any other legal or equitable action to disrupt the quiet enjoyment of the Work +by the public as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason be +judged legally invalid or ineffective under applicable law, then the Waiver +shall be preserved to the maximum extent permitted taking into account +Affirmer's express Statement of Purpose. In addition, to the extent the Waiver +is so judged Affirmer hereby grants to each affected person a royalty-free, non +transferable, non sublicensable, non exclusive, irrevocable and unconditional +license to exercise Affirmer's Copyright and Related Rights in the Work (i) in +all territories worldwide, (ii) for the maximum duration provided by applicable +law or treaty (including future time extensions), (iii) in any current or +future medium and for any number of copies, and (iv) for any purpose +whatsoever, including without limitation commercial, advertising or promotional +purposes (the "License"). The License shall be deemed effective as of the date +CC0 was applied by Affirmer to the Work. Should any part of the License for any +reason be judged legally invalid or ineffective under applicable law, such +partial invalidity or ineffectiveness shall not invalidate the remainder of the +License, and in such case Affirmer hereby affirms that he or she will not (i) +exercise any of his or her remaining Copyright and Related Rights in the Work +or (ii) assert any associated claims and causes of action with respect to the +Work, in either case contrary to Affirmer's express Statement of Purpose. + +4. Limitations and Disclaimers. + + No trademark or patent rights held by Affirmer are waived, abandoned, +surrendered, licensed or otherwise affected by this document. Affirmer offers +the Work as-is and makes no representations or warranties of any kind +concerning the Work, express, implied, statutory or otherwise, including +without limitation warranties of title, merchantability, fitness for a +particular purpose, non infringement, or the absence of latent or other +defects, accuracy, or the present or absence of errors, whether or not +discoverable, all to the greatest extent permissible under applicable law. +Affirmer disclaims responsibility for clearing rights of other persons that may +apply to the Work or any use thereof, including without limitation any person's +Copyright and Related Rights in the Work. 
Further, Affirmer disclaims +responsibility for obtaining any necessary consents, permissions or other +rights required for any use of the Work. Affirmer understands and acknowledges +that Creative Commons is not a party to this document and has no duty or +obligation with respect to this CC0 or use of the Work. diff --git a/ccan/LICENSE.MIT b/ccan/LICENSE.MIT new file mode 100644 index 0000000..89de354 --- /dev/null +++ b/ccan/LICENSE.MIT @@ -0,0 +1,17 @@ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/ccan/array_size.h b/ccan/array_size.h new file mode 100644 index 0000000..37b200f --- /dev/null +++ b/ccan/array_size.h @@ -0,0 +1,26 @@ +/* CC0 (Public domain) - see LICENSE file for details */ +#ifndef CCAN_ARRAY_SIZE_H +#define CCAN_ARRAY_SIZE_H +#include "config.h" +#include <ccan/build_assert.h> + +/** + * ARRAY_SIZE - get the number of elements in a visible array + * @arr: the array whose size you want. + * + * This does not work on pointers, or arrays declared as [], or + * function parameters. With correct compiler support, such usage + * will cause a build error (see build_assert). + */ +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + _array_size_chk(arr)) + +#if HAVE_BUILTIN_TYPES_COMPATIBLE_P && HAVE_TYPEOF +/* Two gcc extensions. 
+ * &a[0] degrades to a pointer: a different type from an array */
+#define _array_size_chk(arr) \
+	BUILD_ASSERT_OR_ZERO(!__builtin_types_compatible_p(typeof(arr), \
+							typeof(&(arr)[0])))
+#else
+#define _array_size_chk(arr) 0
+#endif
+#endif /* CCAN_ARRAY_SIZE_H */
diff --git a/ccan/bitmap.c b/ccan/bitmap.c
new file mode 100644
index 0000000..ea5531c
--- /dev/null
+++ b/ccan/bitmap.c
@@ -0,0 +1,125 @@
+/* Licensed under LGPLv2.1+ - see LICENSE file for details */
+
+#include "config.h"
+
+#include <ccan/bitmap.h>
+
+#include <assert.h>
+
+#define BIT_ALIGN_DOWN(n)	((n) & ~(BITMAP_WORD_BITS - 1))
+#define BIT_ALIGN_UP(n)		BIT_ALIGN_DOWN((n) + BITMAP_WORD_BITS - 1)
+
+void bitmap_zero_range(bitmap *bmap, unsigned long n, unsigned long m)
+{
+	unsigned long an = BIT_ALIGN_UP(n);
+	unsigned long am = BIT_ALIGN_DOWN(m);
+	bitmap_word headmask = -1ULL >> (n % BITMAP_WORD_BITS);
+	bitmap_word tailmask = ~(-1ULL >> (m % BITMAP_WORD_BITS));
+
+	assert(m >= n);
+
+	if (am < an) {
+		BITMAP_WORD(bmap, n) &= ~bitmap_bswap(headmask & tailmask);
+		return;
+	}
+
+	if (an > n)
+		BITMAP_WORD(bmap, n) &= ~bitmap_bswap(headmask);
+
+	if (am > an)
+		memset(&BITMAP_WORD(bmap, an), 0,
+		       (am - an) / BITMAP_WORD_BITS * sizeof(bitmap_word));
+
+	if (m > am)
+		BITMAP_WORD(bmap, m) &= ~bitmap_bswap(tailmask);
+}
+
+void bitmap_fill_range(bitmap *bmap, unsigned long n, unsigned long m)
+{
+	unsigned long an = BIT_ALIGN_UP(n);
+	unsigned long am = BIT_ALIGN_DOWN(m);
+	bitmap_word headmask = -1ULL >> (n % BITMAP_WORD_BITS);
+	bitmap_word tailmask = ~(-1ULL >> (m % BITMAP_WORD_BITS));
+
+	assert(m >= n);
+
+	if (am < an) {
+		BITMAP_WORD(bmap, n) |= bitmap_bswap(headmask & tailmask);
+		return;
+	}
+
+	if (an > n)
+		BITMAP_WORD(bmap, n) |= bitmap_bswap(headmask);
+
+	if (am > an)
+		memset(&BITMAP_WORD(bmap, an), 0xff,
+		       (am - an) / BITMAP_WORD_BITS * sizeof(bitmap_word));
+
+	if (m > am)
+		BITMAP_WORD(bmap, m) |= bitmap_bswap(tailmask);
+}
+
+static int bitmap_clz(bitmap_word w)
+{
+#if HAVE_BUILTIN_CLZL
+	return __builtin_clzl(w);
+#else
+	int lz = 0;
+	bitmap_word mask = 1UL << (BITMAP_WORD_BITS - 1);
+
+	while (!(w & mask)) {
+		lz++;
+		mask >>= 1;
+	}
+
+	return lz;
+#endif
+}
+
+unsigned long bitmap_ffs(const bitmap *bmap,
+			 unsigned long n, unsigned long m)
+{
+	unsigned long an = BIT_ALIGN_UP(n);
+	unsigned long am = BIT_ALIGN_DOWN(m);
+	bitmap_word headmask = -1ULL >> (n % BITMAP_WORD_BITS);
+	bitmap_word tailmask = ~(-1ULL >> (m % BITMAP_WORD_BITS));
+
+	assert(m >= n);
+
+	if (am < an) {
+		bitmap_word w = bitmap_bswap(BITMAP_WORD(bmap, n));
+
+		w &= (headmask & tailmask);
+
+		return w ? 
am + bitmap_clz(w) : m; + } + + if (an > n) { + bitmap_word w = bitmap_bswap(BITMAP_WORD(bmap, n)); + + w &= headmask; + + if (w) + return BIT_ALIGN_DOWN(n) + bitmap_clz(w); + } + + while (an < am) { + bitmap_word w = bitmap_bswap(BITMAP_WORD(bmap, an)); + + if (w) + return an + bitmap_clz(w); + + an += BITMAP_WORD_BITS; + } + + if (m > am) { + bitmap_word w = bitmap_bswap(BITMAP_WORD(bmap, m)); + + w &= tailmask; + + if (w) + return am + bitmap_clz(w); + } + + return m; +} diff --git a/ccan/bitmap.h b/ccan/bitmap.h new file mode 100644 index 0000000..ff0b8c8 --- /dev/null +++ b/ccan/bitmap.h @@ -0,0 +1,239 @@ +/* Licensed under LGPLv2+ - see LICENSE file for details */ +#ifndef CCAN_BITMAP_H_ +#define CCAN_BITMAP_H_ + +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> +#include <limits.h> + +typedef unsigned long bitmap_word; + +#define BITMAP_WORD_BITS (sizeof(bitmap_word) * CHAR_BIT) +#define BITMAP_NWORDS(_n) \ + (((_n) + BITMAP_WORD_BITS - 1) / BITMAP_WORD_BITS) + +/* + * We wrap each word in a structure for type checking. + */ +typedef struct { + bitmap_word w; +} bitmap; + +#define BITMAP_DECLARE(_name, _nbits) \ + bitmap (_name)[BITMAP_NWORDS(_nbits)] + +static inline size_t bitmap_sizeof(unsigned long nbits) +{ + return BITMAP_NWORDS(nbits) * sizeof(bitmap_word); +} + +static inline bitmap_word bitmap_bswap(bitmap_word w) +{ + /* We do not need to have the bitmap in any specific endianness */ + return w; +} + +#define BITMAP_WORD(_bm, _n) ((_bm)[(_n) / BITMAP_WORD_BITS].w) +#define BITMAP_WORDBIT(_n) \ + (bitmap_bswap(1UL << (BITMAP_WORD_BITS - ((_n) % BITMAP_WORD_BITS) - 1))) + +#define BITMAP_HEADWORDS(_nbits) \ + ((_nbits) / BITMAP_WORD_BITS) +#define BITMAP_HEADBYTES(_nbits) \ + (BITMAP_HEADWORDS(_nbits) * sizeof(bitmap_word)) + +#define BITMAP_TAILWORD(_bm, _nbits) \ + ((_bm)[BITMAP_HEADWORDS(_nbits)].w) +#define BITMAP_HASTAIL(_nbits) (((_nbits) % BITMAP_WORD_BITS) != 0) +#define BITMAP_TAILBITS(_nbits) \ + (bitmap_bswap(~(-1UL >> ((_nbits) % BITMAP_WORD_BITS)))) +#define BITMAP_TAIL(_bm, _nbits) \ + (BITMAP_TAILWORD(_bm, _nbits) & BITMAP_TAILBITS(_nbits)) + +static inline void bitmap_set_bit(bitmap *bmap, unsigned long n) +{ + BITMAP_WORD(bmap, n) |= BITMAP_WORDBIT(n); +} + +static inline void bitmap_clear_bit(bitmap *bmap, unsigned long n) +{ + BITMAP_WORD(bmap, n) &= ~BITMAP_WORDBIT(n); +} + +static inline void bitmap_change_bit(bitmap *bmap, unsigned long n) +{ + BITMAP_WORD(bmap, n) ^= BITMAP_WORDBIT(n); +} + +static inline bool bitmap_test_bit(const bitmap *bmap, unsigned long n) +{ + return !!(BITMAP_WORD(bmap, n) & BITMAP_WORDBIT(n)); +} + +void bitmap_zero_range(bitmap *bmap, unsigned long n, unsigned long m); +void bitmap_fill_range(bitmap *bmap, unsigned long n, unsigned long m); + +static inline void bitmap_zero(bitmap *bmap, unsigned long nbits) +{ + memset(bmap, 0, bitmap_sizeof(nbits)); +} + +static inline void bitmap_fill(bitmap *bmap, unsigned long nbits) +{ + memset(bmap, 0xff, bitmap_sizeof(nbits)); +} + +static inline void bitmap_copy(bitmap *dst, const bitmap *src, + unsigned long nbits) +{ + memcpy(dst, src, bitmap_sizeof(nbits)); +} + +#define BITMAP_DEF_BINOP(_name, _op) \ + static inline void bitmap_##_name(bitmap *dst, bitmap *src1, bitmap *src2, \ + unsigned long nbits) \ + { \ + unsigned long i = 0; \ + for (i = 0; i < BITMAP_NWORDS(nbits); i++) { \ + dst[i].w = src1[i].w _op src2[i].w; \ + } \ + } + +BITMAP_DEF_BINOP(and, &) +BITMAP_DEF_BINOP(or, |) +BITMAP_DEF_BINOP(xor, ^) +BITMAP_DEF_BINOP(andnot, & ~) + +#undef BITMAP_DEF_BINOP + 
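+/*
+ * Editor's note, an illustrative sketch that is not part of the upstream
+ * header: BITMAP_DEF_BINOP above expands to bitmap_and(), bitmap_or(),
+ * bitmap_xor() and bitmap_andnot(), each combining two bitmaps word by word:
+ *
+ *	BITMAP_DECLARE(a, 128);
+ *	BITMAP_DECLARE(b, 128);
+ *	BITMAP_DECLARE(dst, 128);
+ *
+ *	bitmap_zero(a, 128);
+ *	bitmap_set_bit(a, 5);
+ *	bitmap_fill(b, 128);
+ *	bitmap_and(dst, a, b, 128);	// only bit 5 of dst is set
+ */
+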
+static inline void bitmap_complement(bitmap *dst, const bitmap *src, + unsigned long nbits) +{ + unsigned long i; + + for (i = 0; i < BITMAP_NWORDS(nbits); i++) + dst[i].w = ~src[i].w; +} + +static inline bool bitmap_equal(const bitmap *src1, const bitmap *src2, + unsigned long nbits) +{ + return (memcmp(src1, src2, BITMAP_HEADBYTES(nbits)) == 0) + && (!BITMAP_HASTAIL(nbits) + || (BITMAP_TAIL(src1, nbits) == BITMAP_TAIL(src2, nbits))); +} + +static inline bool bitmap_intersects(const bitmap *src1, const bitmap *src2, + unsigned long nbits) +{ + unsigned long i; + + for (i = 0; i < BITMAP_HEADWORDS(nbits); i++) { + if (src1[i].w & src2[i].w) + return true; + } + if (BITMAP_HASTAIL(nbits) && + (BITMAP_TAIL(src1, nbits) & BITMAP_TAIL(src2, nbits))) + return true; + return false; +} + +static inline bool bitmap_subset(const bitmap *src1, const bitmap *src2, + unsigned long nbits) +{ + unsigned long i; + + for (i = 0; i < BITMAP_HEADWORDS(nbits); i++) { + if (src1[i].w & ~src2[i].w) + return false; + } + if (BITMAP_HASTAIL(nbits) && + (BITMAP_TAIL(src1, nbits) & ~BITMAP_TAIL(src2, nbits))) + return false; + return true; +} + +static inline bool bitmap_full(const bitmap *bmap, unsigned long nbits) +{ + unsigned long i; + + for (i = 0; i < BITMAP_HEADWORDS(nbits); i++) { + if (bmap[i].w != -1UL) + return false; + } + if (BITMAP_HASTAIL(nbits) && + (BITMAP_TAIL(bmap, nbits) != BITMAP_TAILBITS(nbits))) + return false; + + return true; +} + +static inline bool bitmap_empty(const bitmap *bmap, unsigned long nbits) +{ + unsigned long i; + + for (i = 0; i < BITMAP_HEADWORDS(nbits); i++) { + if (bmap[i].w != 0) + return false; + } + if (BITMAP_HASTAIL(nbits) && (BITMAP_TAIL(bmap, nbits) != 0)) + return false; + + return true; +} + +unsigned long bitmap_ffs(const bitmap *bmap, + unsigned long n, unsigned long m); + +/* + * Allocation functions + */ +static inline bitmap *bitmap_alloc(unsigned long nbits) +{ + return malloc(bitmap_sizeof(nbits)); +} + +static inline bitmap *bitmap_alloc0(unsigned long nbits) +{ + bitmap *bmap; + + bmap = bitmap_alloc(nbits); + if (bmap) + bitmap_zero(bmap, nbits); + return bmap; +} + +static inline bitmap *bitmap_alloc1(unsigned long nbits) +{ + bitmap *bmap; + + bmap = bitmap_alloc(nbits); + if (bmap) + bitmap_fill(bmap, nbits); + return bmap; +} + +static inline bitmap *bitmap_realloc0(bitmap *bmap, unsigned long obits, + unsigned long nbits) +{ + bmap = realloc(bmap, bitmap_sizeof(nbits)); + + if ((nbits > obits) && bmap) + bitmap_zero_range(bmap, obits, nbits); + + return bmap; +} + +static inline bitmap *bitmap_realloc1(bitmap *bmap, unsigned long obits, + unsigned long nbits) +{ + bmap = realloc(bmap, bitmap_sizeof(nbits)); + + if ((nbits > obits) && bmap) + bitmap_fill_range(bmap, obits, nbits); + + return bmap; +} + +#endif /* CCAN_BITMAP_H_ */ diff --git a/ccan/build_assert.h b/ccan/build_assert.h new file mode 100644 index 0000000..0ecd7ff --- /dev/null +++ b/ccan/build_assert.h @@ -0,0 +1,40 @@ +/* CC0 (Public domain) - see LICENSE.CC0 file for details */ +#ifndef CCAN_BUILD_ASSERT_H +#define CCAN_BUILD_ASSERT_H + +/** + * BUILD_ASSERT - assert a build-time dependency. + * @cond: the compile-time condition which must be true. + * + * Your compile will fail if the condition isn't true, or can't be evaluated + * by the compiler. This can only be used within a function. + * + * Example: + * #include <stddef.h> + * ... + * static char *foo_to_char(struct foo *foo) + * { + * // This code needs string to be at start of foo. 
+ *	BUILD_ASSERT(offsetof(struct foo, string) == 0);
+ *	return (char *)foo;
+ * }
+ */
+#define BUILD_ASSERT(cond) \
+	do { (void) sizeof(char [1 - 2*!(cond)]); } while(0)
+
+/**
+ * BUILD_ASSERT_OR_ZERO - assert a build-time dependency, as an expression.
+ * @cond: the compile-time condition which must be true.
+ *
+ * Your compile will fail if the condition isn't true, or can't be evaluated
+ * by the compiler. This can be used in an expression: its value is "0".
+ *
+ * Example:
+ *	#define foo_to_char(foo) \
+ *		((char *)(foo) \
+ *		 + BUILD_ASSERT_OR_ZERO(offsetof(struct foo, string) == 0))
+ */
+#define BUILD_ASSERT_OR_ZERO(cond) \
+	(sizeof(char [1 - 2*!(cond)]) - 1)
+
+#endif /* CCAN_BUILD_ASSERT_H */
diff --git a/ccan/check_type.h b/ccan/check_type.h
new file mode 100644
index 0000000..a576a50
--- /dev/null
+++ b/ccan/check_type.h
@@ -0,0 +1,64 @@
+/* CC0 (Public domain) - see LICENSE.CC0 file for details */
+#ifndef CCAN_CHECK_TYPE_H
+#define CCAN_CHECK_TYPE_H
+#include "config.h"
+
+/**
+ * check_type - issue a warning or build failure if type is not correct.
+ * @expr: the expression whose type we should check (not evaluated).
+ * @type: the exact type we expect the expression to be.
+ *
+ * This macro is usually used within other macros to try to ensure that a macro
+ * argument is of the expected type. No type promotion of the expression is
+ * done: an unsigned int is not the same as an int!
+ *
+ * check_type() always evaluates to 0.
+ *
+ * If your compiler does not support typeof, then the best we can do is fail
+ * to compile if the sizes of the types are unequal (a less complete check).
+ *
+ * Example:
+ *	// They should always pass a 64-bit value to _set_some_value!
+ *	#define set_some_value(expr) \
+ *		_set_some_value((check_type((expr), uint64_t), (expr)))
+ */
+
+/**
+ * check_types_match - issue a warning or build failure if types are not same.
+ * @expr1: the first expression (not evaluated).
+ * @expr2: the second expression (not evaluated).
+ *
+ * This macro is usually used within other macros to try to ensure that
+ * arguments are of identical types. No type promotion of the expressions is
+ * done: an unsigned int is not the same as an int!
+ *
+ * check_types_match() always evaluates to 0.
+ *
+ * If your compiler does not support typeof, then the best we can do is fail
+ * to compile if the sizes of the types are unequal (a less complete check).
+ *
+ * Example:
+ *	// Do subtraction to get to enclosing type, but make sure that
+ *	// pointer is of correct type for that member.
+ *	#define container_of(mbr_ptr, encl_type, mbr) \
+ *		(check_types_match((mbr_ptr), &((encl_type *)0)->mbr), \
+ *		 ((encl_type *) \
+ *		  ((char *)(mbr_ptr) - offsetof(encl_type, mbr))))
+ */
+#if HAVE_TYPEOF
+#define check_type(expr, type) \
+	((typeof(expr) *)0 != (type *)0)
+
+#define check_types_match(expr1, expr2) \
+	((typeof(expr1) *)0 != (typeof(expr2) *)0)
+#else
+#include <ccan/build_assert.h>
+/* Without typeof, we can only test the sizes. */
+#define check_type(expr, type) \
+	BUILD_ASSERT_OR_ZERO(sizeof(expr) == sizeof(type))
+
+#define check_types_match(expr1, expr2) \
+	BUILD_ASSERT_OR_ZERO(sizeof(expr1) == sizeof(expr2))
+#endif /* HAVE_TYPEOF */
+
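+/*
+ * Editor's note, an illustrative sketch that is not part of the upstream
+ * header: either implementation turns a type mismatch into a build failure
+ * rather than a run-time surprise:
+ *
+ *	unsigned int u = 1;
+ *	(void)check_type(u, unsigned int);	// OK, evaluates to 0
+ *	(void)check_type(u, int);		// breaks the build with typeof
+ */
+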
+#endif /* CCAN_CHECK_TYPE_H */
diff --git a/ccan/compiler.h b/ccan/compiler.h
new file mode 100644
index 0000000..cc0d4d1
--- /dev/null
+++ b/ccan/compiler.h
@@ -0,0 +1,230 @@
+/* CC0 (Public domain) - see LICENSE file for details */
+#ifndef CCAN_COMPILER_H
+#define CCAN_COMPILER_H
+#include "config.h"
+
+#ifndef COLD
+/**
+ * COLD - a function is unlikely to be called.
+ *
+ * Used to mark an unlikely code path and optimize appropriately.
+ * It is usually used on logging or error routines.
+ *
+ * Example:
+ *	static void COLD moan(const char *reason)
+ *	{
+ *		fprintf(stderr, "Error: %s (%s)\n", reason, strerror(errno));
+ *	}
+ */
+#define COLD __attribute__((__cold__))
+#endif
+
+#ifndef NORETURN
+/**
+ * NORETURN - a function does not return
+ *
+ * Used to mark a function which exits; useful for suppressing warnings.
+ *
+ * Example:
+ *	static void NORETURN fail(const char *reason)
+ *	{
+ *		fprintf(stderr, "Error: %s (%s)\n", reason, strerror(errno));
+ *		exit(1);
+ *	}
+ */
+#define NORETURN __attribute__((__noreturn__))
+#endif
+
+#ifndef PRINTF_FMT
+/**
+ * PRINTF_FMT - a function takes printf-style arguments
+ * @nfmt: the 1-based number of the function's format argument.
+ * @narg: the 1-based number of the function's first variable argument.
+ *
+ * This allows the compiler to check your parameters as it does for printf().
+ *
+ * Example:
+ *	void PRINTF_FMT(2,3) my_printf(const char *prefix, const char *fmt, ...);
+ */
+#define PRINTF_FMT(nfmt, narg) \
+	__attribute__((format(__printf__, nfmt, narg)))
+#endif
+
+#ifndef CONST_FUNCTION
+/**
+ * CONST_FUNCTION - a function's return depends only on its argument
+ *
+ * This allows the compiler to assume that the function will return the exact
+ * same value for the exact same arguments. This implies that the function
+ * must not use global variables, or dereference pointer arguments.
+ */
+#define CONST_FUNCTION __attribute__((__const__))
+#endif
+
+#ifndef PURE_FUNCTION
+/**
+ * PURE_FUNCTION - a function is pure
+ *
+ * A pure function is one that has no side effects other than its return value
+ * and uses no inputs other than its arguments and global variables.
+ */
+#define PURE_FUNCTION __attribute__((__pure__))
+#endif
+
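+/*
+ * Editor's note, an illustrative sketch that is not part of the upstream
+ * header: a const function depends only on its arguments, while a pure
+ * function may additionally read (but not write) global state:
+ *
+ *	int CONST_FUNCTION square(int x) { return x * x; }
+ *
+ *	static int scale;
+ *	int PURE_FUNCTION scaled(int x) { return x * scale; }
+ */
+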
+#ifndef UNNEEDED
+/**
+ * UNNEEDED - a variable/function may not be needed
+ *
+ * This suppresses warnings about unused variables or functions, but tells
+ * the compiler that if it is unused it need not emit it into the object file.
+ *
+ * Example:
+ *	// With some preprocessor options, this is unnecessary.
+ *	static UNNEEDED int counter;
+ *
+ *	// With some preprocessor options, this is unnecessary.
+ *	static UNNEEDED void add_to_counter(int add)
+ *	{
+ *		counter += add;
+ *	}
+ */
+#define UNNEEDED __attribute__((__unused__))
+#endif
+
+#ifndef NEEDED
+/**
+ * NEEDED - a variable/function is needed
+ *
+ * This suppresses warnings about unused variables or functions, but tells
+ * the compiler that it must exist even if it (seems) unused.
+ *
+ * Example:
+ *	// Even if this is unused, these are vital for debugging.
+ *	static NEEDED int counter;
+ *	static NEEDED void dump_counter(void)
+ *	{
+ *		printf("Counter is %i\n", counter);
+ *	}
+ */
+#define NEEDED __attribute__((__used__))
+#endif
+
+#ifndef UNUSED
+/**
+ * UNUSED - a parameter is unused
+ *
+ * Some compilers (eg. gcc with -W or -Wunused) warn about unused
+ * function parameters. This suppresses such warnings and indicates
+ * to the reader that it's deliberate.
+ *
+ * Example:
+ *	// This is used as a callback, so needs to have this prototype.
+ *	static int some_callback(void *unused UNUSED)
+ *	{
+ *		return 0;
+ *	}
+ */
+#define UNUSED __attribute__((__unused__))
+#endif
+
+#ifndef IS_COMPILE_CONSTANT
+/**
+ * IS_COMPILE_CONSTANT - does the compiler know the value of this expression?
+ * @expr: the expression to evaluate
+ *
+ * When an expression manipulation is complicated, it is usually better to
+ * implement it in a function. However, if the expression being manipulated is
+ * known at compile time, it is better to have the compiler see the entire
+ * expression so it can simply substitute the result.
+ *
+ * This can be done using the IS_COMPILE_CONSTANT() macro.
+ *
+ * Example:
+ *	enum greek { ALPHA, BETA, GAMMA, DELTA, EPSILON };
+ *
+ *	// Out-of-line version.
+ *	const char *greek_name(enum greek greek);
+ *
+ *	// Inline version.
+ *	static inline const char *_greek_name(enum greek greek)
+ *	{
+ *		switch (greek) {
+ *		case ALPHA: return "alpha";
+ *		case BETA: return "beta";
+ *		case GAMMA: return "gamma";
+ *		case DELTA: return "delta";
+ *		case EPSILON: return "epsilon";
+ *		default: return "**INVALID**";
+ *		}
+ *	}
+ *
+ *	// Use inline if compiler knows answer. Otherwise call function
+ *	// to avoid copies of the same code everywhere.
+ *	#define greek_name(g) \
+ *		(IS_COMPILE_CONSTANT(g) ? _greek_name(g) : greek_name(g))
+ */
+#define IS_COMPILE_CONSTANT(expr) __builtin_constant_p(expr)
+#endif
+
+#ifndef WARN_UNUSED_RESULT
+/**
+ * WARN_UNUSED_RESULT - warn if a function return value is unused.
+ *
+ * Used to mark a function where it is extremely unlikely that the caller
+ * can ignore the result, eg realloc().
+ *
+ * Example:
+ *	// buf param may be freed by this; need return value!
+ *	static char *WARN_UNUSED_RESULT enlarge(char *buf, unsigned *size)
+ *	{
+ *		return realloc(buf, (*size) *= 2);
+ *	}
+ */
+#define WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
+#endif
+
+
+/**
+ * WARN_DEPRECATED - warn that a function/type/variable is deprecated when used.
+ *
+ * Used to mark a function, type or variable that should not be used.
+ *
+ * Example:
+ *	WARN_DEPRECATED char *oldfunc(char *buf);
+ */
+#define WARN_DEPRECATED __attribute__((__deprecated__))
+
+
+/**
+ * NO_NULL_ARGS - specify that no arguments to this function can be NULL.
+ *
+ * The compiler will warn if any pointer args are NULL.
+ *
+ * Example:
+ *	NO_NULL_ARGS char *my_copy(char *buf);
+ */
+#define NO_NULL_ARGS __attribute__((__nonnull__))
+
+/**
+ * NON_NULL_ARGS - specify that some arguments to this function can't be NULL.
+ * @...: 1-based argument numbers for which args can't be NULL.
+ *
+ * The compiler will warn if any of the specified pointer args are NULL.
+ *
+ * Example:
+ *	char *my_copy2(char *buf, char *maybenull) NON_NULL_ARGS(1);
+ */
+#define NON_NULL_ARGS(...) __attribute__((__nonnull__(__VA_ARGS__)))
+
+
+/**
+ * LAST_ARG_NULL - specify the last argument of a variadic function must be NULL.
+ *
+ * The compiler will warn if the last argument isn't NULL.
+ * + * Example: + * char *join_string(char *buf, ...) LAST_ARG_NULL; + */ +#define LAST_ARG_NULL __attribute__((__sentinel__)) + +#endif /* CCAN_COMPILER_H */ diff --git a/ccan/container_of.h b/ccan/container_of.h new file mode 100644 index 0000000..9180f37 --- /dev/null +++ b/ccan/container_of.h @@ -0,0 +1,146 @@ +/* CC0 (Public domain) - see LICENSE.CC0 file for details */ +#ifndef CCAN_CONTAINER_OF_H +#define CCAN_CONTAINER_OF_H +#include <stddef.h> + +#include "config.h" +#include <ccan/check_type.h> + +/** + * container_of - get pointer to enclosing structure + * @member_ptr: pointer to the structure member + * @containing_type: the type this member is within + * @member: the name of this member within the structure. + * + * Given a pointer to a member of a structure, this macro does pointer + * subtraction to return the pointer to the enclosing type. + * + * Example: + * struct foo { + * int fielda, fieldb; + * // ... + * }; + * struct info { + * int some_other_field; + * struct foo my_foo; + * }; + * + * static struct info *foo_to_info(struct foo *foo) + * { + * return container_of(foo, struct info, my_foo); + * } + */ +#ifndef container_of +#define container_of(member_ptr, containing_type, member) \ + ((containing_type *) \ + ((char *)(member_ptr) \ + - container_off(containing_type, member)) \ + + check_types_match(*(member_ptr), ((containing_type *)0)->member)) +#endif + +/** + * container_of_or_null - get pointer to enclosing structure, or NULL + * @member_ptr: pointer to the structure member + * @containing_type: the type this member is within + * @member: the name of this member within the structure. + * + * Given a pointer to a member of a structure, this macro does pointer + * subtraction to return the pointer to the enclosing type, unless it + * is given NULL, in which case it also returns NULL. + * + * Example: + * struct foo { + * int fielda, fieldb; + * // ... + * }; + * struct info { + * int some_other_field; + * struct foo my_foo; + * }; + * + * static struct info *foo_to_info_allowing_null(struct foo *foo) + * { + * return container_of_or_null(foo, struct info, my_foo); + * } + */ +static inline char *container_of_or_null_(void *member_ptr, size_t offset) +{ + return member_ptr ? (char *)member_ptr - offset : NULL; +} +#define container_of_or_null(member_ptr, containing_type, member) \ + ((containing_type *) \ + container_of_or_null_(member_ptr, \ + container_off(containing_type, member)) \ + + check_types_match(*(member_ptr), ((containing_type *)0)->member)) + +/** + * container_off - get offset to enclosing structure + * @containing_type: the type this member is within + * @member: the name of this member within the structure. + * + * Given a pointer to a member of a structure, this macro does + * typechecking and figures out the offset to the enclosing type. + * + * Example: + * struct foo { + * int fielda, fieldb; + * // ... + * }; + * struct info { + * int some_other_field; + * struct foo my_foo; + * }; + * + * static struct info *foo_to_info(struct foo *foo) + * { + * size_t off = container_off(struct info, my_foo); + * return (void *)((char *)foo - off); + * } + */ +#define container_off(containing_type, member) \ + offsetof(containing_type, member) + +/** + * container_of_var - get pointer to enclosing structure using a variable + * @member_ptr: pointer to the structure member + * @container_var: a pointer of same type as this member's container + * @member: the name of this member within the structure. 
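+ * (@container_var is used only for its type; without HAVE_TYPEOF this
+ * degrades to an unchecked offset calculation, as below.)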
+ *
+ * Given a pointer to a member of a structure, this macro does pointer
+ * subtraction to return the pointer to the enclosing type.
+ *
+ * Example:
+ * static struct info *foo_to_i(struct foo *foo)
+ * {
+ *	struct info *i = container_of_var(foo, i, my_foo);
+ *	return i;
+ * }
+ */
+#if HAVE_TYPEOF
+#define container_of_var(member_ptr, container_var, member) \
+	container_of(member_ptr, typeof(*container_var), member)
+#else
+#define container_of_var(member_ptr, container_var, member)	\
+	((void *)((char *)(member_ptr) -			\
+		  container_off_var(container_var, member)))
+#endif
+
+/**
+ * container_off_var - get offset of a field in enclosing structure
+ * @container_var: a pointer to a container structure
+ * @member: the name of a member within the structure.
+ *
+ * Given (any) pointer to a structure and its member name, this
+ * macro does pointer subtraction to return the offset of the member
+ * within the structure's memory layout.
+ *
+ */
+#if HAVE_TYPEOF
+#define container_off_var(var, member)		\
+	container_off(typeof(*var), member)
+#else
+#define container_off_var(var, member)			\
+	((const char *)&(var)->member - (const char *)(var))
+#endif
+
+#endif /* CCAN_CONTAINER_OF_H */
diff --git a/ccan/ilog.c b/ccan/ilog.c
new file mode 100644
index 0000000..369fb70
--- /dev/null
+++ b/ccan/ilog.c
@@ -0,0 +1,141 @@
+/*(C) Timothy B. Terriberry (tterribe@xiph.org) 2001-2009 CC0 (Public domain).
+ * See LICENSE file for details. */
+#include "ilog.h"
+#include <limits.h>
+
+/*The fastest fallback strategy for platforms with fast multiplication appears
+  to be based on de Bruijn sequences~\cite{LP98}.
+  Tests confirmed this to be true even on an ARM11, where it is actually faster
+  than using the native clz instruction.
+  Define ILOG_NODEBRUIJN to use a simpler fallback on platforms where
+  multiplication or table lookups are too expensive.
+
+  @UNPUBLISHED{LP98,
+    author="Charles E. Leiserson and Harald Prokop",
+    title="Using de {Bruijn} Sequences to Index a 1 in a Computer Word",
+    month=Jun,
+    year=1998,
+    note="\url{http://supertech.csail.mit.edu/papers/debruijn.pdf}"
+  }*/
+static UNNEEDED const unsigned char DEBRUIJN_IDX32[32]={
+   0, 1,28, 2,29,14,24, 3,30,22,20,15,25,17, 4, 8,
+  31,27,13,23,21,19,16, 7,26,12,18, 6,11, 5,10, 9
+};
+
+/* We always compile these in, in case someone takes address of function.
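+   The #undefs below strip the macro wrappers defined in ilog.h so the
+   out-of-line versions can be defined here.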
*/ +#undef ilog32_nz +#undef ilog32 +#undef ilog64_nz +#undef ilog64 + +int ilog32(uint32_t _v){ +/*On a Pentium M, this branchless version tested as the fastest version without + multiplications on 1,000,000,000 random 32-bit integers, edging out a + similar version with branches, and a 256-entry LUT version.*/ +# if defined(ILOG_NODEBRUIJN) + int ret; + int m; + ret=_v>0; + m=(_v>0xFFFFU)<<4; + _v>>=m; + ret|=m; + m=(_v>0xFFU)<<3; + _v>>=m; + ret|=m; + m=(_v>0xFU)<<2; + _v>>=m; + ret|=m; + m=(_v>3)<<1; + _v>>=m; + ret|=m; + ret+=_v>1; + return ret; +/*This de Bruijn sequence version is faster if you have a fast multiplier.*/ +# else + int ret; + ret=_v>0; + _v|=_v>>1; + _v|=_v>>2; + _v|=_v>>4; + _v|=_v>>8; + _v|=_v>>16; + _v=(_v>>1)+1; + ret+=DEBRUIJN_IDX32[_v*0x77CB531U>>27&0x1F]; + return ret; +# endif +} + +int ilog32_nz(uint32_t _v) +{ + return ilog32(_v); +} + +int ilog64(uint64_t _v){ +# if defined(ILOG_NODEBRUIJN) + uint32_t v; + int ret; + int m; + ret=_v>0; + m=(_v>0xFFFFFFFFU)<<5; + v=(uint32_t)(_v>>m); + ret|=m; + m=(v>0xFFFFU)<<4; + v>>=m; + ret|=m; + m=(v>0xFFU)<<3; + v>>=m; + ret|=m; + m=(v>0xFU)<<2; + v>>=m; + ret|=m; + m=(v>3)<<1; + v>>=m; + ret|=m; + ret+=v>1; + return ret; +# else +/*If we don't have a 64-bit word, split it into two 32-bit halves.*/ +# if LONG_MAX<9223372036854775807LL + uint32_t v; + int ret; + int m; + ret=_v>0; + m=(_v>0xFFFFFFFFU)<<5; + v=(uint32_t)(_v>>m); + ret|=m; + v|=v>>1; + v|=v>>2; + v|=v>>4; + v|=v>>8; + v|=v>>16; + v=(v>>1)+1; + ret+=DEBRUIJN_IDX32[v*0x77CB531U>>27&0x1F]; + return ret; +/*Otherwise do it in one 64-bit operation.*/ +# else + static const unsigned char DEBRUIJN_IDX64[64]={ + 0, 1, 2, 7, 3,13, 8,19, 4,25,14,28, 9,34,20,40, + 5,17,26,38,15,46,29,48,10,31,35,54,21,50,41,57, + 63, 6,12,18,24,27,33,39,16,37,45,47,30,53,49,56, + 62,11,23,32,36,44,52,55,61,22,43,51,60,42,59,58 + }; + int ret; + ret=_v>0; + _v|=_v>>1; + _v|=_v>>2; + _v|=_v>>4; + _v|=_v>>8; + _v|=_v>>16; + _v|=_v>>32; + _v=(_v>>1)+1; + ret+=DEBRUIJN_IDX64[_v*0x218A392CD3D5DBFULL>>58&0x3F]; + return ret; +# endif +# endif +} + +int ilog64_nz(uint64_t _v) +{ + return ilog64(_v); +} + diff --git a/ccan/ilog.h b/ccan/ilog.h new file mode 100644 index 0000000..2793a70 --- /dev/null +++ b/ccan/ilog.h @@ -0,0 +1,151 @@ +/* CC0 (Public domain) - see LICENSE file for details */ +#if !defined(_ilog_H) +# define _ilog_H (1) +# include "config.h" +# include <stdint.h> +# include <limits.h> +# include <ccan/compiler.h> + +/** + * ilog32 - Integer binary logarithm of a 32-bit value. + * @_v: A 32-bit value. + * Returns floor(log2(_v))+1, or 0 if _v==0. + * This is the number of bits that would be required to represent _v in two's + * complement notation with all of the leading zeros stripped. + * Note that many uses will resolve to the fast macro version instead. + * + * See Also: + * ilog32_nz(), ilog64() + * + * Example: + * // Rounds up to next power of 2 (if not a power of 2). + * static uint32_t round_up32(uint32_t i) + * { + * assert(i != 0); + * return 1U << ilog32(i-1); + * } + */ +int ilog32(uint32_t _v); + +/** + * ilog32_nz - Integer binary logarithm of a non-zero 32-bit value. + * @_v: A 32-bit value. + * Returns floor(log2(_v))+1, or undefined if _v==0. + * This is the number of bits that would be required to represent _v in two's + * complement notation with all of the leading zeros stripped. + * Note that many uses will resolve to the fast macro version instead. + * See Also: + * ilog32(), ilog64_nz() + * Example: + * // Find Last Set (ie. highest bit set, 0 to 31). 
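+ * // (Roughly 31 - __builtin_clz(i) where that builtin is available.)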
+ * static uint32_t fls32(uint32_t i)
+ * {
+ *	assert(i != 0);
+ *	return ilog32_nz(i) - 1;
+ * }
+ */
+int ilog32_nz(uint32_t _v);
+
+/**
+ * ilog64 - Integer binary logarithm of a 64-bit value.
+ * @_v: A 64-bit value.
+ * Returns floor(log2(_v))+1, or 0 if _v==0.
+ * This is the number of bits that would be required to represent _v in two's
+ * complement notation with all of the leading zeros stripped.
+ * Note that many uses will resolve to the fast macro version instead.
+ * See Also:
+ *	ilog64_nz(), ilog32()
+ */
+int ilog64(uint64_t _v);
+
+/**
+ * ilog64_nz - Integer binary logarithm of a non-zero 64-bit value.
+ * @_v: A 64-bit value.
+ * Returns floor(log2(_v))+1, or undefined if _v==0.
+ * This is the number of bits that would be required to represent _v in two's
+ * complement notation with all of the leading zeros stripped.
+ * Note that many uses will resolve to the fast macro version instead.
+ * See Also:
+ *	ilog64(), ilog32_nz()
+ */
+int ilog64_nz(uint64_t _v);
+
+/**
+ * STATIC_ILOG_32 - The integer logarithm of an (unsigned, 32-bit) constant.
+ * @_v: A non-negative 32-bit constant.
+ * Returns floor(log2(_v))+1, or 0 if _v==0.
+ * This is the number of bits that would be required to represent _v in two's
+ * complement notation with all of the leading zeros stripped.
+ * This macro should only be used when you need a compile-time constant,
+ * otherwise ilog32 or ilog32_nz are just as fast and more flexible.
+ *
+ * Example:
+ * #define MY_PAGE_SIZE 4096
+ * #define MY_PAGE_BITS (STATIC_ILOG_32(MY_PAGE_SIZE) - 1)
+ */
+#define STATIC_ILOG_32(_v) (STATIC_ILOG5((uint32_t)(_v)))
+
+/**
+ * STATIC_ILOG_64 - The integer logarithm of an (unsigned, 64-bit) constant.
+ * @_v: A non-negative 64-bit constant.
+ * Returns floor(log2(_v))+1, or 0 if _v==0.
+ * This is the number of bits that would be required to represent _v in two's
+ * complement notation with all of the leading zeros stripped.
+ * This macro should only be used when you need a compile-time constant,
+ * otherwise ilog64 or ilog64_nz are just as fast and more flexible.
+ */
+#define STATIC_ILOG_64(_v) (STATIC_ILOG6((uint64_t)(_v)))
+
+/* Private implementation details */
+
+/*Note the casts to (int) below: this prevents "upgrading"
+  the type of an entire expression to an (unsigned) size_t.*/
+#if INT_MAX>=2147483647 && HAVE_BUILTIN_CLZ
+#define builtin_ilog32_nz(v) \
+	(((int)sizeof(unsigned)*CHAR_BIT) - __builtin_clz(v))
+#elif LONG_MAX>=2147483647L && HAVE_BUILTIN_CLZL
+#define builtin_ilog32_nz(v) \
+	(((int)sizeof(unsigned)*CHAR_BIT) - __builtin_clzl(v))
+#endif
+
+#if INT_MAX>=9223372036854775807LL && HAVE_BUILTIN_CLZ
+#define builtin_ilog64_nz(v) \
+	(((int)sizeof(unsigned)*CHAR_BIT) - __builtin_clz(v))
+#elif LONG_MAX>=9223372036854775807LL && HAVE_BUILTIN_CLZL
+#define builtin_ilog64_nz(v) \
+	(((int)sizeof(unsigned long)*CHAR_BIT) - __builtin_clzl(v))
+#elif HAVE_BUILTIN_CLZLL
+#define builtin_ilog64_nz(v) \
+	(((int)sizeof(unsigned long long)*CHAR_BIT) - __builtin_clzll(v))
+#endif
+
+#ifdef builtin_ilog32_nz
+#define ilog32(_v) (builtin_ilog32_nz(_v)&-!!(_v))
+#define ilog32_nz(_v) builtin_ilog32_nz(_v)
+#else
+#define ilog32_nz(_v) ilog32(_v)
+#define ilog32(_v) (IS_COMPILE_CONSTANT(_v) ? STATIC_ILOG_32(_v) : ilog32(_v))
+#endif /* builtin_ilog32_nz */
+
+#ifdef builtin_ilog64_nz
+#define ilog64(_v) (builtin_ilog64_nz(_v)&-!!(_v))
+#define ilog64_nz(_v) builtin_ilog64_nz(_v)
+#else
+#define ilog64_nz(_v) ilog64(_v)
+#define ilog64(_v) (IS_COMPILE_CONSTANT(_v) ?
STATIC_ILOG_64(_v) : ilog64(_v)) +#endif /* builtin_ilog64_nz */ + +/* Macros for evaluating compile-time constant ilog. */ +# define STATIC_ILOG0(_v) (!!(_v)) +# define STATIC_ILOG1(_v) (((_v)&0x2)?2:STATIC_ILOG0(_v)) +# define STATIC_ILOG2(_v) (((_v)&0xC)?2+STATIC_ILOG1((_v)>>2):STATIC_ILOG1(_v)) +# define STATIC_ILOG3(_v) \ + (((_v)&0xF0)?4+STATIC_ILOG2((_v)>>4):STATIC_ILOG2(_v)) +# define STATIC_ILOG4(_v) \ + (((_v)&0xFF00)?8+STATIC_ILOG3((_v)>>8):STATIC_ILOG3(_v)) +# define STATIC_ILOG5(_v) \ + (((_v)&0xFFFF0000)?16+STATIC_ILOG4((_v)>>16):STATIC_ILOG4(_v)) +# define STATIC_ILOG6(_v) \ + (((_v)&0xFFFFFFFF00000000ULL)?32+STATIC_ILOG5((_v)>>32):STATIC_ILOG5(_v)) + +#endif /* _ilog_H */ diff --git a/ccan/list.c b/ccan/list.c new file mode 100644 index 0000000..b49e6f6 --- /dev/null +++ b/ccan/list.c @@ -0,0 +1,43 @@ +/* Licensed under MIT - see LICENSE.MIT file for details */ +#include <stdio.h> +#include <stdlib.h> +#include "list.h" + +static void *corrupt(const char *abortstr, + const struct list_node *head, + const struct list_node *node, + unsigned int count) +{ + if (abortstr) { + fprintf(stderr, + "%s: prev corrupt in node %p (%u) of %p\n", + abortstr, node, count, head); + abort(); + } + return NULL; +} + +struct list_node *list_check_node(const struct list_node *node, + const char *abortstr) +{ + const struct list_node *p, *n; + int count = 0; + + for (p = node, n = node->next; n != node; p = n, n = n->next) { + count++; + if (n->prev != p) + return corrupt(abortstr, node, n, count); + } + /* Check prev on head node. */ + if (node->prev != p) + return corrupt(abortstr, node, node, 0); + + return (struct list_node *)node; +} + +struct list_head *list_check(const struct list_head *h, const char *abortstr) +{ + if (!list_check_node(&h->n, abortstr)) + return NULL; + return (struct list_head *)h; +} diff --git a/ccan/list.h b/ccan/list.h new file mode 100644 index 0000000..f400666 --- /dev/null +++ b/ccan/list.h @@ -0,0 +1,842 @@ +/* Licensed under MIT - see LICENSE.MIT file for details */ +#ifndef CCAN_LIST_H +#define CCAN_LIST_H +//#define CCAN_LIST_DEBUG 1 +#include <stdbool.h> +#include <assert.h> +#include <ccan/str.h> +#include <ccan/container_of.h> +#include <ccan/check_type.h> + +/** + * struct list_node - an entry in a doubly-linked list + * @next: next entry (self if empty) + * @prev: previous entry (self if empty) + * + * This is used as an entry in a linked list. + * Example: + * struct child { + * const char *name; + * // Linked list of all us children. + * struct list_node list; + * }; + */ +struct list_node +{ + struct list_node *next, *prev; +}; + +/** + * struct list_head - the head of a doubly-linked list + * @h: the list_head (containing next and prev pointers) + * + * This is used as the head of a linked list. + * Example: + * struct parent { + * const char *name; + * struct list_head children; + * unsigned int num_children; + * }; + */ +struct list_head +{ + struct list_node n; +}; + +/** + * list_check - check head of a list for consistency + * @h: the list_head + * @abortstr: the location to print on aborting, or NULL. + * + * Because list_nodes have redundant information, consistency checking between + * the back and forward links can be done. This is useful as a debugging check. + * If @abortstr is non-NULL, that will be printed in a diagnostic if the list + * is inconsistent, and the function will abort. + * + * Returns the list head if the list is consistent, NULL if not (it + * can never return NULL if @abortstr is set). 
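+ * (When CCAN_LIST_DEBUG is defined, the list_debug() wrappers below run
+ * this check on most list operations.)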
+ * + * See also: list_check_node() + * + * Example: + * static void dump_parent(struct parent *p) + * { + * struct child *c; + * + * printf("%s (%u children):\n", p->name, p->num_children); + * list_check(&p->children, "bad child list"); + * list_for_each(&p->children, c, list) + * printf(" -> %s\n", c->name); + * } + */ +struct list_head *list_check(const struct list_head *h, const char *abortstr); + +/** + * list_check_node - check node of a list for consistency + * @n: the list_node + * @abortstr: the location to print on aborting, or NULL. + * + * Check consistency of the list node is in (it must be in one). + * + * See also: list_check() + * + * Example: + * static void dump_child(const struct child *c) + * { + * list_check_node(&c->list, "bad child list"); + * printf("%s\n", c->name); + * } + */ +struct list_node *list_check_node(const struct list_node *n, + const char *abortstr); + +#define LIST_LOC __FILE__ ":" stringify(__LINE__) +#ifdef CCAN_LIST_DEBUG +#define list_debug(h, loc) list_check((h), loc) +#define list_debug_node(n, loc) list_check_node((n), loc) +#else +#define list_debug(h, loc) ((void)loc, h) +#define list_debug_node(n, loc) ((void)loc, n) +#endif + +/** + * LIST_HEAD_INIT - initializer for an empty list_head + * @name: the name of the list. + * + * Explicit initializer for an empty list. + * + * See also: + * LIST_HEAD, list_head_init() + * + * Example: + * static struct list_head my_list = LIST_HEAD_INIT(my_list); + */ +#define LIST_HEAD_INIT(name) { { &(name).n, &(name).n } } + +/** + * LIST_HEAD - define and initialize an empty list_head + * @name: the name of the list. + * + * The LIST_HEAD macro defines a list_head and initializes it to an empty + * list. It can be prepended by "static" to define a static list_head. + * + * See also: + * LIST_HEAD_INIT, list_head_init() + * + * Example: + * static LIST_HEAD(my_global_list); + */ +#define LIST_HEAD(name) \ + struct list_head name = LIST_HEAD_INIT(name) + +/** + * list_head_init - initialize a list_head + * @h: the list_head to set to the empty list + * + * Example: + * ... + * struct parent *parent = malloc(sizeof(*parent)); + * + * list_head_init(&parent->children); + * parent->num_children = 0; + */ +static inline void list_head_init(struct list_head *h) +{ + h->n.next = h->n.prev = &h->n; +} + +/** + * list_node_init - initialize a list_node + * @n: the list_node to link to itself. + * + * You don't need to use this normally! But it lets you list_del(@n) + * safely. + */ +static inline void list_node_init(struct list_node *n) +{ + n->next = n->prev = n; +} + +/** + * list_add_after - add an entry after an existing node in a linked list + * @h: the list_head to add the node to (for debugging) + * @p: the existing list_node to add the node after + * @n: the new list_node to add to the list. + * + * The existing list_node must already be a member of the list. + * The new list_node does not need to be initialized; it will be overwritten. + * + * Example: + * struct child c1, c2, c3; + * LIST_HEAD(h); + * + * list_add_tail(&h, &c1.list); + * list_add_tail(&h, &c3.list); + * list_add_after(&h, &c1.list, &c2.list); + */ +#define list_add_after(h, p, n) list_add_after_(h, p, n, LIST_LOC) +static inline void list_add_after_(struct list_head *h, + struct list_node *p, + struct list_node *n, + const char *abortstr) +{ + n->next = p->next; + n->prev = p; + p->next->prev = n; + p->next = n; + (void)list_debug(h, abortstr); +} + +/** + * list_add - add an entry at the start of a linked list. 
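+ * (Implemented as list_add_after() with the head node as the anchor.)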
+ * @h: the list_head to add the node to + * @n: the list_node to add to the list. + * + * The list_node does not need to be initialized; it will be overwritten. + * Example: + * struct child *child = malloc(sizeof(*child)); + * + * child->name = "marvin"; + * list_add(&parent->children, &child->list); + * parent->num_children++; + */ +#define list_add(h, n) list_add_(h, n, LIST_LOC) +static inline void list_add_(struct list_head *h, + struct list_node *n, + const char *abortstr) +{ + list_add_after_(h, &h->n, n, abortstr); +} + +/** + * list_add_before - add an entry before an existing node in a linked list + * @h: the list_head to add the node to (for debugging) + * @p: the existing list_node to add the node before + * @n: the new list_node to add to the list. + * + * The existing list_node must already be a member of the list. + * The new list_node does not need to be initialized; it will be overwritten. + * + * Example: + * list_head_init(&h); + * list_add_tail(&h, &c1.list); + * list_add_tail(&h, &c3.list); + * list_add_before(&h, &c3.list, &c2.list); + */ +#define list_add_before(h, p, n) list_add_before_(h, p, n, LIST_LOC) +static inline void list_add_before_(struct list_head *h, + struct list_node *p, + struct list_node *n, + const char *abortstr) +{ + n->next = p; + n->prev = p->prev; + p->prev->next = n; + p->prev = n; + (void)list_debug(h, abortstr); +} + +/** + * list_add_tail - add an entry at the end of a linked list. + * @h: the list_head to add the node to + * @n: the list_node to add to the list. + * + * The list_node does not need to be initialized; it will be overwritten. + * Example: + * list_add_tail(&parent->children, &child->list); + * parent->num_children++; + */ +#define list_add_tail(h, n) list_add_tail_(h, n, LIST_LOC) +static inline void list_add_tail_(struct list_head *h, + struct list_node *n, + const char *abortstr) +{ + list_add_before_(h, &h->n, n, abortstr); +} + +/** + * list_empty - is a list empty? + * @h: the list_head + * + * If the list is empty, returns true. + * + * Example: + * assert(list_empty(&parent->children) == (parent->num_children == 0)); + */ +#define list_empty(h) list_empty_(h, LIST_LOC) +static inline bool list_empty_(const struct list_head *h, const char* abortstr) +{ + (void)list_debug(h, abortstr); + return h->n.next == &h->n; +} + +/** + * list_empty_nodebug - is a list empty (and don't perform debug checks)? + * @h: the list_head + * + * If the list is empty, returns true. + * This differs from list_empty() in that if CCAN_LIST_DEBUG is set it + * will NOT perform debug checks. Only use this function if you REALLY + * know what you're doing. + * + * Example: + * assert(list_empty_nodebug(&parent->children) == (parent->num_children == 0)); + */ +#ifndef CCAN_LIST_DEBUG +#define list_empty_nodebug(h) list_empty(h) +#else +static inline bool list_empty_nodebug(const struct list_head *h) +{ + return h->n.next == &h->n; +} +#endif + +/** + * list_empty_nocheck - is a list empty? + * @h: the list_head + * + * If the list is empty, returns true. This doesn't perform any + * debug check for list consistency, so it can be called without + * locks, racing with the list being modified. This is ok for + * checks where an incorrect result is not an issue (optimized + * bail out path for example). + */ +static inline bool list_empty_nocheck(const struct list_head *h) +{ + return h->n.next == &h->n; +} + +/** + * list_del - delete an entry from an (unknown) linked list. + * @n: the list_node to delete from the list. 
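+ * (No list_head argument is needed; both neighbours are reachable from @n.)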
+ * + * Note that this leaves @n in an undefined state; it can be added to + * another list, but not deleted again. + * + * See also: + * list_del_from(), list_del_init() + * + * Example: + * list_del(&child->list); + * parent->num_children--; + */ +#define list_del(n) list_del_(n, LIST_LOC) +static inline void list_del_(struct list_node *n, const char* abortstr) +{ + (void)list_debug_node(n, abortstr); + n->next->prev = n->prev; + n->prev->next = n->next; +#ifdef CCAN_LIST_DEBUG + /* Catch use-after-del. */ + n->next = n->prev = NULL; +#endif +} + +/** + * list_del_init - delete a node, and reset it so it can be deleted again. + * @n: the list_node to be deleted. + * + * list_del(@n) or list_del_init() again after this will be safe, + * which can be useful in some cases. + * + * See also: + * list_del_from(), list_del() + * + * Example: + * list_del_init(&child->list); + * parent->num_children--; + */ +#define list_del_init(n) list_del_init_(n, LIST_LOC) +static inline void list_del_init_(struct list_node *n, const char *abortstr) +{ + list_del_(n, abortstr); + list_node_init(n); +} + +/** + * list_del_from - delete an entry from a known linked list. + * @h: the list_head the node is in. + * @n: the list_node to delete from the list. + * + * This explicitly indicates which list a node is expected to be in, + * which is better documentation and can catch more bugs. + * + * See also: list_del() + * + * Example: + * list_del_from(&parent->children, &child->list); + * parent->num_children--; + */ +static inline void list_del_from(struct list_head *h, struct list_node *n) +{ +#ifdef CCAN_LIST_DEBUG + { + /* Thorough check: make sure it was in list! */ + struct list_node *i; + for (i = h->n.next; i != n; i = i->next) + assert(i != &h->n); + } +#endif /* CCAN_LIST_DEBUG */ + + /* Quick test that catches a surprising number of bugs. */ + assert(!list_empty(h)); + list_del(n); +} + +/** + * list_swap - swap out an entry from an (unknown) linked list for a new one. + * @o: the list_node to replace from the list. + * @n: the list_node to insert in place of the old one. + * + * Note that this leaves @o in an undefined state; it can be added to + * another list, but not deleted/swapped again. + * + * See also: + * list_del() + * + * Example: + * struct child x1, x2; + * LIST_HEAD(xh); + * + * list_add(&xh, &x1.list); + * list_swap(&x1.list, &x2.list); + */ +#define list_swap(o, n) list_swap_(o, n, LIST_LOC) +static inline void list_swap_(struct list_node *o, + struct list_node *n, + const char* abortstr) +{ + (void)list_debug_node(o, abortstr); + *n = *o; + n->next->prev = n; + n->prev->next = n; +#ifdef CCAN_LIST_DEBUG + /* Catch use-after-del. */ + o->next = o->prev = NULL; +#endif +} + +/** + * list_entry - convert a list_node back into the structure containing it. + * @n: the list_node + * @type: the type of the entry + * @member: the list_node member of the type + * + * Example: + * // First list entry is children.next; convert back to child. + * child = list_entry(parent->children.n.next, struct child, list); + * + * See Also: + * list_top(), list_for_each() + */ +#define list_entry(n, type, member) container_of(n, type, member) + +/** + * list_top - get the first entry in a list + * @h: the list_head + * @type: the type of the entry + * @member: the list_node member of the type + * + * If the list is empty, returns NULL. 
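+ * (The entry is not removed from the list; see list_pop() below for that.)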
+ * + * Example: + * struct child *first; + * first = list_top(&parent->children, struct child, list); + * if (!first) + * printf("Empty list!\n"); + */ +#define list_top(h, type, member) \ + ((type *)list_top_((h), list_off_(type, member))) + +static inline const void *list_top_(const struct list_head *h, size_t off) +{ + if (list_empty(h)) + return NULL; + return (const char *)h->n.next - off; +} + +/** + * list_pop - remove the first entry in a list + * @h: the list_head + * @type: the type of the entry + * @member: the list_node member of the type + * + * If the list is empty, returns NULL. + * + * Example: + * struct child *one; + * one = list_pop(&parent->children, struct child, list); + * if (!one) + * printf("Empty list!\n"); + */ +#define list_pop(h, type, member) \ + ((type *)list_pop_((h), list_off_(type, member))) + +static inline const void *list_pop_(const struct list_head *h, size_t off) +{ + struct list_node *n; + + if (list_empty(h)) + return NULL; + n = h->n.next; + list_del(n); + return (const char *)n - off; +} + +/** + * list_tail - get the last entry in a list + * @h: the list_head + * @type: the type of the entry + * @member: the list_node member of the type + * + * If the list is empty, returns NULL. + * + * Example: + * struct child *last; + * last = list_tail(&parent->children, struct child, list); + * if (!last) + * printf("Empty list!\n"); + */ +#define list_tail(h, type, member) \ + ((type *)list_tail_((h), list_off_(type, member))) + +static inline const void *list_tail_(const struct list_head *h, size_t off) +{ + if (list_empty(h)) + return NULL; + return (const char *)h->n.prev - off; +} + +/** + * list_for_each - iterate through a list. + * @h: the list_head (warning: evaluated multiple times!) + * @i: the structure containing the list_node + * @member: the list_node member of the structure + * + * This is a convenient wrapper to iterate @i over the entire list. It's + * a for loop, so you can break and continue as normal. + * + * Example: + * list_for_each(&parent->children, child, list) + * printf("Name: %s\n", child->name); + */ +#define list_for_each(h, i, member) \ + list_for_each_off(h, i, list_off_var_(i, member)) + +/** + * list_for_each_rev - iterate through a list backwards. + * @h: the list_head + * @i: the structure containing the list_node + * @member: the list_node member of the structure + * + * This is a convenient wrapper to iterate @i over the entire list. It's + * a for loop, so you can break and continue as normal. + * + * Example: + * list_for_each_rev(&parent->children, child, list) + * printf("Name: %s\n", child->name); + */ +#define list_for_each_rev(h, i, member) \ + list_for_each_rev_off(h, i, list_off_var_(i, member)) + +/** + * list_for_each_rev_safe - iterate through a list backwards, + * maybe during deletion + * @h: the list_head + * @i: the structure containing the list_node + * @nxt: the structure containing the list_node + * @member: the list_node member of the structure + * + * This is a convenient wrapper to iterate @i over the entire list backwards. + * It's a for loop, so you can break and continue as normal. The extra + * variable * @nxt is used to hold the next element, so you can delete @i + * from the list. 
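+ * (Deleting entries other than @i, in particular @nxt, is still unsafe.)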
+ * + * Example: + * struct child *next; + * list_for_each_rev_safe(&parent->children, child, next, list) { + * printf("Name: %s\n", child->name); + * } + */ +#define list_for_each_rev_safe(h, i, nxt, member) \ + list_for_each_rev_safe_off(h, i, nxt, list_off_var_(i, member)) + +/** + * list_for_each_safe - iterate through a list, maybe during deletion + * @h: the list_head + * @i: the structure containing the list_node + * @nxt: the structure containing the list_node + * @member: the list_node member of the structure + * + * This is a convenient wrapper to iterate @i over the entire list. It's + * a for loop, so you can break and continue as normal. The extra variable + * @nxt is used to hold the next element, so you can delete @i from the list. + * + * Example: + * list_for_each_safe(&parent->children, child, next, list) { + * list_del(&child->list); + * parent->num_children--; + * } + */ +#define list_for_each_safe(h, i, nxt, member) \ + list_for_each_safe_off(h, i, nxt, list_off_var_(i, member)) + +/** + * list_next - get the next entry in a list + * @h: the list_head + * @i: a pointer to an entry in the list. + * @member: the list_node member of the structure + * + * If @i was the last entry in the list, returns NULL. + * + * Example: + * struct child *second; + * second = list_next(&parent->children, first, list); + * if (!second) + * printf("No second child!\n"); + */ +#define list_next(h, i, member) \ + ((list_typeof(i))list_entry_or_null(list_debug(h, \ + __FILE__ ":" stringify(__LINE__)), \ + (i)->member.next, \ + list_off_var_((i), member))) + +/** + * list_prev - get the previous entry in a list + * @h: the list_head + * @i: a pointer to an entry in the list. + * @member: the list_node member of the structure + * + * If @i was the first entry in the list, returns NULL. + * + * Example: + * first = list_prev(&parent->children, second, list); + * if (!first) + * printf("Can't go back to first child?!\n"); + */ +#define list_prev(h, i, member) \ + ((list_typeof(i))list_entry_or_null(list_debug(h, \ + __FILE__ ":" stringify(__LINE__)), \ + (i)->member.prev, \ + list_off_var_((i), member))) + +/** + * list_append_list - empty one list onto the end of another. + * @to: the list to append into + * @from: the list to empty. + * + * This takes the entire contents of @from and moves it to the end of + * @to. After this @from will be empty. + * + * Example: + * struct list_head adopter; + * + * list_append_list(&adopter, &parent->children); + * assert(list_empty(&parent->children)); + * parent->num_children = 0; + */ +#define list_append_list(t, f) list_append_list_(t, f, \ + __FILE__ ":" stringify(__LINE__)) +static inline void list_append_list_(struct list_head *to, + struct list_head *from, + const char *abortstr) +{ + struct list_node *from_tail = list_debug(from, abortstr)->n.prev; + struct list_node *to_tail = list_debug(to, abortstr)->n.prev; + + /* Sew in head and entire list. */ + to->n.prev = from_tail; + from_tail->next = &to->n; + to_tail->next = &from->n; + from->n.prev = to_tail; + + /* Now remove head. */ + list_del(&from->n); + list_head_init(from); +} + +/** + * list_prepend_list - empty one list into the start of another. + * @to: the list to prepend into + * @from: the list to empty. + * + * This takes the entire contents of @from and moves it to the start + * of @to. After this @from will be empty. 
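+ * (Like list_append_list(), this is O(1): only the boundary links change.)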
+ *
+ * Example:
+ * list_prepend_list(&adopter, &parent->children);
+ * assert(list_empty(&parent->children));
+ * parent->num_children = 0;
+ */
+#define list_prepend_list(t, f) list_prepend_list_(t, f, LIST_LOC)
+static inline void list_prepend_list_(struct list_head *to,
+				      struct list_head *from,
+				      const char *abortstr)
+{
+	struct list_node *from_tail = list_debug(from, abortstr)->n.prev;
+	struct list_node *to_head = list_debug(to, abortstr)->n.next;
+
+	/* Sew in head and entire list. */
+	to->n.next = &from->n;
+	from->n.prev = &to->n;
+	to_head->prev = from_tail;
+	from_tail->next = to_head;
+
+	/* Now remove head. */
+	list_del(&from->n);
+	list_head_init(from);
+}
+
+/* internal macros, do not use directly */
+#define list_for_each_off_dir_(h, i, off, dir)				\
+	for (i = list_node_to_off_(list_debug(h, LIST_LOC)->n.dir,	\
+				   (off));				\
+	     list_node_from_off_((void *)i, (off)) != &(h)->n;		\
+	     i = list_node_to_off_(list_node_from_off_((void *)i, (off))->dir, \
+				   (off)))
+
+#define list_for_each_safe_off_dir_(h, i, nxt, off, dir)		\
+	for (i = list_node_to_off_(list_debug(h, LIST_LOC)->n.dir,	\
+				   (off)),				\
+	     nxt = list_node_to_off_(list_node_from_off_(i, (off))->dir, \
+				     (off));				\
+	     list_node_from_off_(i, (off)) != &(h)->n;			\
+	     i = nxt,							\
+	     nxt = list_node_to_off_(list_node_from_off_(i, (off))->dir, \
+				     (off)))
+
+/**
+ * list_for_each_off - iterate through a list of memory regions.
+ * @h: the list_head
+ * @i: the pointer to a memory region which contains list node data.
+ * @off: offset (relative to @i) at which list node data resides.
+ *
+ * This is a low-level wrapper to iterate @i over the entire list, used to
+ * implement all other, more high-level, for-each constructs.  It's a for
+ * loop, so you can break and continue as normal.
+ *
+ * WARNING! Being the low-level macro that it is, this wrapper doesn't know
+ * nor care about the type of @i.  The only assumption made is that @i points
+ * to a chunk of memory that at some @off, relative to @i, contains a
+ * properly filled `struct list_node' which in turn contains pointers to
+ * memory chunks and it's turtles all the way down.  With all that in mind
+ * remember that given the wrong pointer/offset couple this macro will
+ * happily churn through all your memory until a SEGFAULT stops it; in
+ * other words, caveat emptor.
+ *
+ * It is worth mentioning that one of the legitimate use-cases for this
+ * wrapper is operation on opaque types with a known offset for the
+ * `struct list_node' member (preferably 0), because it allows you not to
+ * disclose the type of @i.
+ *
+ * Example:
+ * list_for_each_off(&parent->children, child,
+ *		     offsetof(struct child, list))
+ *	printf("Name: %s\n", child->name);
+ */
+#define list_for_each_off(h, i, off)		\
+	list_for_each_off_dir_((h),(i),(off),next)
+
+/**
+ * list_for_each_rev_off - iterate through a list of memory regions backwards
+ * @h: the list_head
+ * @i: the pointer to a memory region which contains list node data.
+ * @off: offset (relative to @i) at which list node data resides.
+ *
+ * See list_for_each_off for details
+ */
+#define list_for_each_rev_off(h, i, off)	\
+	list_for_each_off_dir_((h),(i),(off),prev)
+
+/**
+ * list_for_each_safe_off - iterate through a list of memory regions, maybe
+ * during deletion
+ * @h: the list_head
+ * @i: the pointer to a memory region which contains list node data.
+ * @nxt: the structure containing the list_node
+ * @off: offset (relative to @i) at which list node data resides.
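+ * (@nxt is advanced before the loop body runs, so deleting @i is safe.)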
+ *
+ * For details see `list_for_each_off' and `list_for_each_safe'
+ * descriptions.
+ *
+ * Example:
+ * list_for_each_safe_off(&parent->children, child,
+ *			  next, offsetof(struct child, list))
+ *	printf("Name: %s\n", child->name);
+ */
+#define list_for_each_safe_off(h, i, nxt, off)	\
+	list_for_each_safe_off_dir_((h),(i),(nxt),(off),next)
+
+/**
+ * list_for_each_rev_safe_off - iterate backwards through a list of
+ * memory regions, maybe during deletion
+ * @h: the list_head
+ * @i: the pointer to a memory region which contains list node data.
+ * @nxt: the structure containing the list_node
+ * @off: offset (relative to @i) at which list node data resides.
+ *
+ * For details see `list_for_each_rev_off' and `list_for_each_rev_safe'
+ * descriptions.
+ *
+ * Example:
+ * list_for_each_rev_safe_off(&parent->children, child,
+ *			      next, offsetof(struct child, list))
+ *	printf("Name: %s\n", child->name);
+ */
+#define list_for_each_rev_safe_off(h, i, nxt, off)	\
+	list_for_each_safe_off_dir_((h),(i),(nxt),(off),prev)
+
+/* Other -off variants. */
+#define list_entry_off(n, type, off)		\
+	((type *)list_node_from_off_((n), (off)))
+
+#define list_head_off(h, type, off)		\
+	((type *)list_top_((h), (off)))
+
+#define list_tail_off(h, type, off)		\
+	((type *)list_tail_((h), (off)))
+
+#define list_add_off(h, n, off)			\
+	list_add((h), list_node_from_off_((n), (off)))
+
+#define list_del_off(n, off)			\
+	list_del(list_node_from_off_((n), (off)))
+
+#define list_del_from_off(h, n, off)		\
+	list_del_from(h, list_node_from_off_((n), (off)))
+
+/* Offset helper functions so we only single-evaluate. */
+static inline void *list_node_to_off_(struct list_node *node, size_t off)
+{
+	return (void *)((char *)node - off);
+}
+static inline struct list_node *list_node_from_off_(void *ptr, size_t off)
+{
+	return (struct list_node *)((char *)ptr + off);
+}
+
+/* Get the offset of the member, but make sure it's a list_node. */
+#define list_off_(type, member)					\
+	(container_off(type, member) +				\
+	 check_type(((type *)0)->member, struct list_node))
+
+#define list_off_var_(var, member)			\
+	(container_off_var(var, member) +		\
+	 check_type(var->member, struct list_node))
+
+#if HAVE_TYPEOF
+#define list_typeof(var) typeof(var)
+#else
+#define list_typeof(var) void *
+#endif
+
+/* Returns member, or NULL if at end of list. */
+static inline void *list_entry_or_null(const struct list_head *h,
+				       const struct list_node *n,
+				       size_t off)
+{
+	if (n == &h->n)
+		return NULL;
+	return (char *)n - off;
+}
+#endif /* CCAN_LIST_H */
diff --git a/ccan/minmax.h b/ccan/minmax.h
new file mode 100644
index 0000000..ab6c554
--- /dev/null
+++ b/ccan/minmax.h
@@ -0,0 +1,65 @@
+/* CC0 (Public domain) - see LICENSE.CC0 file for details */
+#ifndef CCAN_MINMAX_H
+#define CCAN_MINMAX_H
+
+#include "config.h"
+
+#include <ccan/build_assert.h>
+
+#if !HAVE_STATEMENT_EXPR || !HAVE_TYPEOF
+/*
+ * Without these, there's no way to avoid unsafe double evaluation of
+ * the arguments
+ */
+#error Sorry, minmax module requires statement expressions and typeof
+#endif
+
+#if HAVE_BUILTIN_TYPES_COMPATIBLE_P
+#define MINMAX_ASSERT_COMPATIBLE(a, b) \
+	BUILD_ASSERT(__builtin_types_compatible_p(a, b))
+#else
+#define MINMAX_ASSERT_COMPATIBLE(a, b) \
+	do { } while (0)
+#endif
+
+#define min(a, b) \
+	({ \
+		typeof(a) _a = (a); \
+		typeof(b) _b = (b); \
+		MINMAX_ASSERT_COMPATIBLE(typeof(_a), typeof(_b)); \
+		_a < _b ?
_a : _b; \
+	})
+
+#define max(a, b) \
+	({ \
+		typeof(a) _a = (a); \
+		typeof(b) _b = (b); \
+		MINMAX_ASSERT_COMPATIBLE(typeof(_a), typeof(_b)); \
+		_a > _b ? _a : _b; \
+	})
+
+#define clamp(v, f, c) (max(min((v), (c)), (f)))
+
+
+#define min_t(t, a, b) \
+	({ \
+		t _ta = (a); \
+		t _tb = (b); \
+		min(_ta, _tb); \
+	})
+#define max_t(t, a, b) \
+	({ \
+		t _ta = (a); \
+		t _tb = (b); \
+		max(_ta, _tb); \
+	})
+
+#define clamp_t(t, v, f, c) \
+	({ \
+		t _tv = (v); \
+		t _tf = (f); \
+		t _tc = (c); \
+		clamp(_tv, _tf, _tc); \
+	})
+
+#endif /* CCAN_MINMAX_H */
diff --git a/ccan/str.c b/ccan/str.c
new file mode 100644
index 0000000..3da90f4
--- /dev/null
+++ b/ccan/str.c
@@ -0,0 +1,13 @@
+/* CC0 (Public domain) - see LICENSE.CC0 file for details */
+#include <ccan/str.h>
+
+size_t strcount(const char *haystack, const char *needle)
+{
+	size_t i = 0, nlen = strlen(needle);
+
+	while ((haystack = strstr(haystack, needle)) != NULL) {
+		i++;
+		haystack += nlen;
+	}
+	return i;
+}
diff --git a/ccan/str.h b/ccan/str.h
new file mode 100644
index 0000000..68c8a51
--- /dev/null
+++ b/ccan/str.h
@@ -0,0 +1,228 @@
+/* CC0 (Public domain) - see LICENSE.CC0 file for details */
+#ifndef CCAN_STR_H
+#define CCAN_STR_H
+#include "config.h"
+#include <string.h>
+#include <stdbool.h>
+#include <limits.h>
+#include <ctype.h>
+
+/**
+ * streq - Are two strings equal?
+ * @a: first string
+ * @b: second string
+ *
+ * This macro is arguably more readable than "!strcmp(a, b)".
+ *
+ * Example:
+ * if (streq(somestring, ""))
+ *	printf("String is empty!\n");
+ */
+#define streq(a,b) (strcmp((a),(b)) == 0)
+
+/**
+ * strstarts - Does this string start with this prefix?
+ * @str: string to test
+ * @prefix: prefix to look for at start of str
+ *
+ * Example:
+ * if (strstarts(somestring, "foo"))
+ *	printf("String %s begins with 'foo'!\n", somestring);
+ */
+#define strstarts(str,prefix) (strncmp((str),(prefix),strlen(prefix)) == 0)
+
+/**
+ * strends - Does this string end with this postfix?
+ * @str: string to test
+ * @postfix: postfix to look for at end of str
+ *
+ * Example:
+ * if (strends(somestring, "foo"))
+ *	printf("String %s ends with 'foo'!\n", somestring);
+ */
+static inline bool strends(const char *str, const char *postfix)
+{
+	if (strlen(str) < strlen(postfix))
+		return false;
+
+	return streq(str + strlen(str) - strlen(postfix), postfix);
+}
+
+/**
+ * stringify - Turn expression into a string literal
+ * @expr: any C expression
+ *
+ * Example:
+ * #define PRINT_COND_IF_FALSE(cond) \
+ *	((cond) || printf("%s is false!", stringify(cond)))
+ */
+#define stringify(expr) stringify_1(expr)
+/* Double-indirection required to stringify expansions */
+#define stringify_1(expr) #expr
+
+/**
+ * strcount - Count number of (non-overlapping) occurrences of a substring.
+ * @haystack: a C string
+ * @needle: a substring
+ *
+ * Example:
+ * assert(strcount("aaa aaa", "a") == 6);
+ * assert(strcount("aaa aaa", "ab") == 0);
+ * assert(strcount("aaa aaa", "aa") == 2);
+ */
+size_t strcount(const char *haystack, const char *needle);
+
+/**
+ * STR_MAX_CHARS - Maximum possible size of numeric string for this type.
+ * @type_or_expr: a pointer or integer type or expression.
+ *
+ * This provides enough space for a nul-terminated string which represents the
+ * largest possible value for the type or expression.
+ *
+ * Note: The implementation adds extra space so hex values or negative
+ * values will fit (eg. sprintf(... "%p"). 
) + * + * Example: + * char str[STR_MAX_CHARS(int)]; + * + * sprintf(str, "%i", 7); + */ +#define STR_MAX_CHARS(type_or_expr) \ + ((sizeof(type_or_expr) * CHAR_BIT + 8) / 9 * 3 + 2 \ + + STR_MAX_CHARS_TCHECK_(type_or_expr)) + +#if HAVE_TYPEOF +/* Only a simple type can have 0 assigned, so test that. */ +#define STR_MAX_CHARS_TCHECK_(type_or_expr) \ + ({ typeof(type_or_expr) x = 0; (void)x; 0; }) +#else +#define STR_MAX_CHARS_TCHECK_(type_or_expr) 0 +#endif + +/** + * cisalnum - isalnum() which takes a char (and doesn't accept EOF) + * @c: a character + * + * Surprisingly, the standard ctype.h isalnum() takes an int, which + * must have the value of EOF (-1) or an unsigned char. This variant + * takes a real char, and doesn't accept EOF. + */ +static inline bool cisalnum(char c) +{ + return isalnum((unsigned char)c); +} +static inline bool cisalpha(char c) +{ + return isalpha((unsigned char)c); +} +static inline bool cisascii(char c) +{ + return isascii((unsigned char)c); +} +#if HAVE_ISBLANK +static inline bool cisblank(char c) +{ + return isblank((unsigned char)c); +} +#endif +static inline bool ciscntrl(char c) +{ + return iscntrl((unsigned char)c); +} +static inline bool cisdigit(char c) +{ + return isdigit((unsigned char)c); +} +static inline bool cisgraph(char c) +{ + return isgraph((unsigned char)c); +} +static inline bool cislower(char c) +{ + return islower((unsigned char)c); +} +static inline bool cisprint(char c) +{ + return isprint((unsigned char)c); +} +static inline bool cispunct(char c) +{ + return ispunct((unsigned char)c); +} +static inline bool cisspace(char c) +{ + return isspace((unsigned char)c); +} +static inline bool cisupper(char c) +{ + return isupper((unsigned char)c); +} +static inline bool cisxdigit(char c) +{ + return isxdigit((unsigned char)c); +} + +#include <ccan/str_debug.h> + +/* These checks force things out of line, hence they are under DEBUG. */ +#ifdef CCAN_STR_DEBUG +#include <ccan/build_assert.h> + +/* These are commonly misused: they take -1 or an *unsigned* char value. */ +#undef isalnum +#undef isalpha +#undef isascii +#undef isblank +#undef iscntrl +#undef isdigit +#undef isgraph +#undef islower +#undef isprint +#undef ispunct +#undef isspace +#undef isupper +#undef isxdigit + +/* You can use a char if char is unsigned. */ +#if HAVE_BUILTIN_TYPES_COMPATIBLE_P && HAVE_TYPEOF +#define str_check_arg_(i) \ + ((i) + BUILD_ASSERT_OR_ZERO(!__builtin_types_compatible_p(typeof(i), \ + char) \ + || (char)255 > 0)) +#else +#define str_check_arg_(i) (i) +#endif + +#define isalnum(i) str_isalnum(str_check_arg_(i)) +#define isalpha(i) str_isalpha(str_check_arg_(i)) +#define isascii(i) str_isascii(str_check_arg_(i)) +#if HAVE_ISBLANK +#define isblank(i) str_isblank(str_check_arg_(i)) +#endif +#define iscntrl(i) str_iscntrl(str_check_arg_(i)) +#define isdigit(i) str_isdigit(str_check_arg_(i)) +#define isgraph(i) str_isgraph(str_check_arg_(i)) +#define islower(i) str_islower(str_check_arg_(i)) +#define isprint(i) str_isprint(str_check_arg_(i)) +#define ispunct(i) str_ispunct(str_check_arg_(i)) +#define isspace(i) str_isspace(str_check_arg_(i)) +#define isupper(i) str_isupper(str_check_arg_(i)) +#define isxdigit(i) str_isxdigit(str_check_arg_(i)) + +#if HAVE_TYPEOF +/* With GNU magic, we can make const-respecting standard string functions. */ +#undef strstr +#undef strchr +#undef strrchr + +/* + 0 is needed to decay array into pointer. 
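+   (Otherwise, for an array argument, typeof(haystack) would be an array
+   type, and a cast to an array type is invalid.)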
*/ +#define strstr(haystack, needle) \ + ((typeof((haystack) + 0))str_strstr((haystack), (needle))) +#define strchr(haystack, c) \ + ((typeof((haystack) + 0))str_strchr((haystack), (c))) +#define strrchr(haystack, c) \ + ((typeof((haystack) + 0))str_strrchr((haystack), (c))) +#endif +#endif /* CCAN_STR_DEBUG */ + +#endif /* CCAN_STR_H */ diff --git a/ccan/str_debug.h b/ccan/str_debug.h new file mode 100644 index 0000000..7a33438 --- /dev/null +++ b/ccan/str_debug.h @@ -0,0 +1,30 @@ +/* CC0 (Public domain) - see LICENSE.CC0 file for details */ +#ifndef CCAN_STR_DEBUG_H +#define CCAN_STR_DEBUG_H + +/* #define CCAN_STR_DEBUG 1 */ + +#ifdef CCAN_STR_DEBUG +/* Because we mug the real ones with macros, we need our own wrappers. */ +int str_isalnum(int i); +int str_isalpha(int i); +int str_isascii(int i); +#if HAVE_ISBLANK +int str_isblank(int i); +#endif +int str_iscntrl(int i); +int str_isdigit(int i); +int str_isgraph(int i); +int str_islower(int i); +int str_isprint(int i); +int str_ispunct(int i); +int str_isspace(int i); +int str_isupper(int i); +int str_isxdigit(int i); + +char *str_strstr(const char *haystack, const char *needle); +char *str_strchr(const char *s, int c); +char *str_strrchr(const char *s, int c); +#endif /* CCAN_STR_DEBUG */ + +#endif /* CCAN_STR_DEBUG_H */ diff --git a/debian/changelog b/debian/changelog new file mode 100644 index 0000000..8120d7b --- /dev/null +++ b/debian/changelog @@ -0,0 +1,209 @@ +rdma-core (29.0-1) unstable; urgency=medium + + * New upstream release. + + -- Jason Gunthorpe <jgg@obsidianresearch.com> Mon, 23 Dec 2019 13:36:56 +0100 + +rdma-core (27.0-1) unstable; urgency=medium + + * New upstream release + - libcxgb3: Remove libcxgb3 from rdma-core + - libnes: Remove libnes from rdma-core + * Add missing build dependency dh-python + + -- Benjamin Drung <benjamin.drung@cloud.ionos.com> Mon, 23 Dec 2019 13:22:46 +0100 + +rdma-core (26.0-2) unstable; urgency=medium + + * Improve/extent description of python3-pyverbs + * Bump Standards-Version to 4.4.1 (no changes required) + * Add Rules-Requires-Root: no + + -- Benjamin Drung <benjamin.drung@cloud.ionos.com> Tue, 29 Oct 2019 13:22:15 +0100 + +rdma-core (26.0-1) unstable; urgency=medium + + * New upstream release. + - Include infiniband-diags source package producing infiniband-diags, + libibmad5, libibmad-dev, libibnetdisc5, and libibnetdisc-dev. + * Update private libibverbs symbols + * Specify Build-Depends-Package for libibmad5 and libibnetdisc5 + + -- Benjamin Drung <benjamin.drung@cloud.ionos.com> Thu, 24 Oct 2019 11:27:45 +0200 + +rdma-core (24.0-2) unstable; urgency=medium + + * Skip installing efa if the architecture lacks coherent DMA support + + -- Benjamin Drung <benjamin.drung@cloud.ionos.com> Thu, 11 Jul 2019 12:34:23 +0200 + +rdma-core (24.0-1) unstable; urgency=medium + + * New upstream release. + * Drop pyverbs-Add-shebang-to-ib_devices.py-example.patch (applied upstream) + * Bump Standards-Version to 4.4.0 (no changes needed) + * Switch to debhelper 12 + * Add Pre-Depends on ${misc:Pre-Depends} + * Drop debug symbol migration + + -- Benjamin Drung <benjamin.drung@cloud.ionos.com> Wed, 10 Jul 2019 12:39:27 +0200 + +rdma-core (22.1-1) unstable; urgency=medium + + * New upstream bugfix release. + + -- Benjamin Drung <benjamin.drung@cloud.ionos.com> Wed, 06 Feb 2019 15:58:48 +0100 + +rdma-core (22.0-1) unstable; urgency=medium + + * New upstream release. 
+ - mlx5: Add DEVX APIs for interop with verbs objects + - Add pyverbs Python binding + * Update private libibverbs symbols + * Bump Standards-Version to 4.3.0 (no changes required) + + -- Benjamin Drung <benjamin.drung@cloud.ionos.com> Tue, 22 Jan 2019 13:27:29 +0100 + +rdma-core (21.0-1) unstable; urgency=medium + + * New upstream release. + - Drop ibacm sysV init script to avoid issues with the sysV to systemd + wrapper starting the service instead of the socket (LP: #1794825) + - Include static libraries in the build + * Update private libibverbs symbols + * Specify Build-Depends-Package in symbols + + -- Benjamin Drung <benjamin.drung@cloud.ionos.com> Tue, 20 Nov 2018 11:49:25 +0100 + +rdma-core (20.0-1) unstable; urgency=medium + + * New upstream release. + - Switch from net-tools to iproute2 for rxe_cfg + - Install pkg-config files + * Update libibverbs symbols and let libibverbs1 break ibverbs-providers < 20~ + * Drop all patches (accepted upstream) + * Bump Standards-Version to 4.2.1 (no changes needed) + + -- Benjamin Drung <benjamin.drung@profitbricks.com> Mon, 10 Sep 2018 11:23:11 +0200 + +rdma-core (19.0-1) unstable; urgency=medium + + * New upstream release. + * Switch to debhelper 11 + * Add patch to fix bad whatis entries in man pages + + -- Benjamin Drung <benjamin.drung@profitbricks.com> Thu, 28 Jun 2018 15:01:27 +0200 + +rdma-core (18.1-1) unstable; urgency=medium + + * New upstream bugfix release. + * Drop all patches (applied upstream) + + -- Benjamin Drung <benjamin.drung@profitbricks.com> Tue, 12 Jun 2018 11:53:44 +0200 + +rdma-core (18.0-1) unstable; urgency=medium + + * New upstream release. + * Update private libibverbs symbols and let libibverbs1 break + ibverbs-providers < 18~ + * Fix bad whatis entries in man pages + * Fix spelling mistakes in ibv_create_flow_action.3 man page + * Use versioned Breaks & Replaces for ibverbs-providers to make it + multi-arch coinstallable (Closes: #898055) + + -- Benjamin Drung <benjamin.drung@profitbricks.com> Mon, 07 May 2018 13:40:40 +0200 + +rdma-core (17.1-2) unstable; urgency=medium + + * Support for new architecture riscv64 (Closes: #894995) by + - Whitelist (instead of blacklist) architectures that support valgrind + - Whitelist (instead of blacklist) coherent DMA supporting architectures + * Bump Standards-Version to 4.1.4 (no changes needed) + + -- Benjamin Drung <benjamin.drung@profitbricks.com> Mon, 30 Apr 2018 19:01:44 +0200 + +rdma-core (17.1-1) unstable; urgency=medium + + * New upstream bugfix release. + + -- Benjamin Drung <benjamin.drung@profitbricks.com> Mon, 19 Mar 2018 13:32:31 +0100 + +rdma-core (17.0-1) unstable; urgency=medium + + * New upstream release + - Remove the obsolete libibcm library + * Update private libibverbs symbols and let libibverbs1 break + ibverbs-providers < 17~ + * Update copyright for kernel-headers directory + + -- Benjamin Drung <benjamin.drung@profitbricks.com> Mon, 19 Feb 2018 12:47:42 +0100 + +rdma-core (16.2-1) unstable; urgency=medium + + * New upstream bugfix release + * Guard udevadm call again + * Override intentional systemd WantedBy= relationship lintian warning + + -- Benjamin Drung <benjamin.drung@profitbricks.com> Thu, 15 Feb 2018 11:41:14 +0100 + +rdma-core (16.1-2) unstable; urgency=medium + + * Do not require valgrind on ia64 (Closes: #887511) + + -- Benjamin Drung <benjamin.drung@profitbricks.com> Fri, 19 Jan 2018 12:37:05 +0100 + +rdma-core (16.1-1) unstable; urgency=medium + + * New upstream bugfix release. 
+ * Bump Standards-Version to 4.1.3 (no changes needed) + * Add udev dependency to rdma-core and srptools + + -- Benjamin Drung <benjamin.drung@profitbricks.com> Thu, 04 Jan 2018 14:42:26 +0100 + +rdma-core (16.0-1) unstable; urgency=medium + + * New upstream release. + * Update private libibverbs symbols + * Bump Standards-Version to 4.1.2 (no changes needed) + + -- Benjamin Drung <benjamin.drung@profitbricks.com> Tue, 12 Dec 2017 11:01:38 +0100 + +rdma-core (15.1-1) unstable; urgency=medium + + * New upstream release. + * Add m68k as non-coherent DMA architecture + * Mark libraries as Multi-Arch: same + + -- Benjamin Drung <benjamin.drung@profitbricks.com> Thu, 30 Nov 2017 12:08:26 +0100 + +rdma-core (15-3) unstable; urgency=medium + + * debian/rules: Include architecture.mk for DEB_HOST_ARCH definition + * Add alpha, hppa, sh4 as non-coherent DMA archs + * Do not require valgrind on x32 (not available there due to build failure) + + -- Benjamin Drung <benjamin.drung@profitbricks.com> Thu, 16 Nov 2017 17:33:48 +0100 + +rdma-core (15-2) unstable; urgency=medium + + * Do not build ibacm for non-Linux architectures + * Do not require valgrind if not available + * Let libibverbs1 15 break ibverbs-providers 14 + * Drop dh-systemd build dependency + * Bump Standards-Version to 4.1.1 (no changes needed) + * Drop lintian overrides for false positives + * Set myself as maintainer (instead of linux-rdma) + * Do not try to install disabled ibverbs providers on architectures that do + not provide cache coherent DMA (Closes: #881731) + * Explicitly list private libibverbs symbols + + -- Benjamin Drung <benjamin.drung@profitbricks.com> Thu, 16 Nov 2017 12:55:28 +0100 + +rdma-core (15-1) unstable; urgency=medium + + * New upstream version. ibverbs-providers combines the source packages + libcxgb3, libipathverbs, libmlx4, libmlx5, libmthca, and libnes. 
+ rdma-core also combines the source packages ibacm, libibcm, libibumad,
+ libibverbs, librdmacm, and srptools (Closes: #848971)
+
+ -- Benjamin Drung <benjamin.drung@profitbricks.com>  Mon, 18 Sep 2017 11:00:39 +0200
diff --git a/debian/compat b/debian/compat
new file mode 100644
index 0000000..ec63514
--- /dev/null
+++ b/debian/compat
@@ -0,0 +1 @@
+9
diff --git a/debian/control b/debian/control
new file mode 100644
index 0000000..a97b2f0
--- /dev/null
+++ b/debian/control
@@ -0,0 +1,447 @@
+Source: rdma-core
+Maintainer: Linux RDMA Mailing List <linux-rdma@vger.kernel.org>
+Uploaders: Benjamin Drung <benjamin.drung@cloud.ionos.com>,
+           Talat Batheesh <talatb@mellanox.com>
+Section: net
+Priority: optional
+Build-Depends: cmake (>= 2.8.11),
+               cython3,
+               debhelper (>= 9),
+               debhelper (>= 9.20160709) | dh-systemd,
+               dh-python,
+               dpkg-dev (>= 1.17),
+               libnl-3-dev,
+               libnl-route-3-dev,
+               libsystemd-dev,
+               libudev-dev,
+               ninja-build,
+               pandoc,
+               pkg-config,
+               python3-dev,
+               python3-docutils,
+               valgrind [amd64 arm64 armhf i386 mips mips64el mipsel powerpc ppc64 ppc64el s390x]
+Rules-Requires-Root: no
+Standards-Version: 4.4.1
+Vcs-Git: https://github.com/linux-rdma/rdma-core.git
+Vcs-Browser: https://github.com/linux-rdma/rdma-core
+Homepage: https://github.com/linux-rdma/rdma-core
+
+Package: rdma-core
+Architecture: linux-any
+Depends: lsb-base (>= 3.2-14~),
+         udev,
+         ${misc:Depends},
+         ${perl:Depends},
+         ${shlibs:Depends}
+Pre-Depends: ${misc:Pre-Depends}
+Recommends: dmidecode, ethtool, iproute2
+Breaks: infiniband-diags (<< 2.0.0)
+Replaces: infiniband-diags (<< 2.0.0)
+Description: RDMA core userspace infrastructure and documentation
+ This package provides the basic boot time support for systems that use the
+ Linux kernel's remote direct memory access (RDMA) subsystem, which includes
+ InfiniBand, iWARP, and RDMA over Converged Ethernet (RoCE).
+ .
+ Several kernel RDMA support daemons are included:
+  - The rdma-ndd daemon which watches for RDMA device changes and/or hostname
+    changes and updates the Node Description of the RDMA devices based on
+    those changes.
+  - The iWARP Port Mapper Daemon (iwpmd) which provides a kernel support
+    service in userspace for iWARP drivers to claim TCP ports through the
+    standard socket interface.
+
+Package: ibacm
+Architecture: linux-any
+Depends: lsb-base (>= 3.2-14~),
+         rdma-core (>= 15),
+         ${misc:Depends},
+         ${shlibs:Depends}
+Description: InfiniBand Communication Manager Assistant (ACM)
+ The IB ACM implements and provides a framework for name, address, and
+ route (path) resolution services over InfiniBand.
+ It is intended to address connection setup scalability issues running
+ MPI applications on large clusters. The IB ACM provides information
+ needed to establish a connection, but does not implement the CM protocol.
+ A primary user of the ibacm service is the librdmacm library.
+
+Package: ibverbs-providers
+Architecture: linux-any
+Multi-Arch: same
+Depends: ${misc:Depends}, ${shlibs:Depends}
+Provides: libefa1, libipathverbs1, libmlx4-1, libmlx5-1, libmthca1
+Replaces: libipathverbs1 (<< 15),
+          libmlx4-1 (<< 15),
+          libmlx5-1 (<< 15),
+          libmthca1 (<< 15)
+Breaks: libipathverbs1 (<< 15),
+        libmlx4-1 (<< 15),
+        libmlx5-1 (<< 15),
+        libmthca1 (<< 15)
+Description: User space provider drivers for libibverbs
+ libibverbs is a library that allows userspace processes to use RDMA
+ "verbs" as described in the InfiniBand Architecture Specification and
+ the RDMA Protocol Verbs Specification. 
iWARP ethernet NICs support
+ RDMA over hardware-offloaded TCP/IP, while InfiniBand is a
+ high-throughput, low-latency networking technology. InfiniBand host
+ channel adapters (HCAs) and iWARP NICs commonly support direct
+ hardware access from userspace (kernel bypass), and libibverbs
+ supports this when available.
+ .
+ An RDMA driver consists of a kernel portion and a user space portion.
+ This package contains the user space verbs drivers:
+ .
+ - bnxt_re: Broadcom NetXtreme-E RoCE HCAs
+ - cxgb4: Chelsio T4 iWARP HCAs
+ - efa: Amazon Elastic Fabric Adapter
+ - hfi1verbs: Intel Omni-Path HFI
+ - hns: HiSilicon Hip06 SoC
+ - i40iw: Intel Ethernet Connection X722 RDMA
+ - ipathverbs: QLogic InfiniPath HCAs
+ - mlx4: Mellanox ConnectX-3 InfiniBand HCAs
+ - mlx5: Mellanox Connect-IB/X-4+ InfiniBand HCAs
+ - mthca: Mellanox InfiniBand HCAs
+ - ocrdma: Emulex OneConnect RDMA/RoCE device
+ - qedr: QLogic QL4xxx RoCE HCAs
+ - rxe: A software implementation of the RoCE protocol
+ - siw: A software implementation of the iWARP protocol
+ - vmw_pvrdma: VMware paravirtual RDMA device
+
+Package: ibverbs-utils
+Architecture: linux-any
+Depends: ${misc:Depends}, ${shlibs:Depends}
+Description: Examples for the libibverbs library
+ libibverbs is a library that allows userspace processes to use RDMA
+ "verbs" as described in the InfiniBand Architecture Specification and
+ the RDMA Protocol Verbs Specification. iWARP ethernet NICs support
+ RDMA over hardware-offloaded TCP/IP, while InfiniBand is a
+ high-throughput, low-latency networking technology. InfiniBand host
+ channel adapters (HCAs) and iWARP NICs commonly support direct
+ hardware access from userspace (kernel bypass), and libibverbs
+ supports this when available.
+ .
+ This package contains useful libibverbs1 example programs such as
+ ibv_devinfo, which displays information about InfiniBand devices.
+
+Package: libibverbs-dev
+Section: libdevel
+Architecture: linux-any
+Multi-Arch: same
+Depends: ibverbs-providers (= ${binary:Version}),
+ libibverbs1 (= ${binary:Version}),
+ libnl-3-dev,
+ libnl-route-3-dev,
+ ${misc:Depends}
+Description: Development files for the libibverbs library
+ libibverbs is a library that allows userspace processes to use RDMA
+ "verbs" as described in the InfiniBand Architecture Specification and
+ the RDMA Protocol Verbs Specification. iWARP ethernet NICs support
+ RDMA over hardware-offloaded TCP/IP, while InfiniBand is a
+ high-throughput, low-latency networking technology. InfiniBand host
+ channel adapters (HCAs) and iWARP NICs commonly support direct
+ hardware access from userspace (kernel bypass), and libibverbs
+ supports this when available.
+ .
+ This package is needed to compile programs against libibverbs1.
+ It contains the header files and static libraries (optionally)
+ needed for compiling.
+
+Package: libibverbs1
+Architecture: linux-any
+Multi-Arch: same
+Section: libs
+Pre-Depends: ${misc:Pre-Depends}
+Depends: adduser, ${misc:Depends}, ${shlibs:Depends}
+Recommends: ibverbs-providers
+Breaks: ibverbs-providers (<< 25~)
+Description: Library for direct userspace use of RDMA (InfiniBand/iWARP)
+ libibverbs is a library that allows userspace processes to use RDMA
+ "verbs" as described in the InfiniBand Architecture Specification and
+ the RDMA Protocol Verbs Specification. iWARP ethernet NICs support
+ RDMA over hardware-offloaded TCP/IP, while InfiniBand is a
+ high-throughput, low-latency networking technology.
InfiniBand host
+ channel adapters (HCAs) and iWARP NICs commonly support direct
+ hardware access from userspace (kernel bypass), and libibverbs
+ supports this when available.
+ .
+ For this library to be useful, a device-specific plug-in module
+ should also be installed.
+ .
+ This package contains the shared library.
+
+Package: libibverbs1-dbg
+Section: debug
+Architecture: linux-any
+Multi-Arch: same
+Depends: libibverbs1 (= ${binary:Version}), ${misc:Depends}
+Description: Debug symbols for the libibverbs library
+ libibverbs is a library that allows userspace processes to use RDMA
+ "verbs" as described in the InfiniBand Architecture Specification and
+ the RDMA Protocol Verbs Specification. iWARP ethernet NICs support
+ RDMA over hardware-offloaded TCP/IP, while InfiniBand is a
+ high-throughput, low-latency networking technology. InfiniBand host
+ channel adapters (HCAs) and iWARP NICs commonly support direct
+ hardware access from userspace (kernel bypass), and libibverbs
+ supports this when available.
+ .
+ This package contains the debug symbols associated with
+ libibverbs1. They will automatically be used by gdb for debugging
+ libibverbs-related issues.
+
+Package: libibumad-dev
+Section: libdevel
+Architecture: linux-any
+Multi-Arch: same
+Depends: libibumad3 (= ${binary:Version}), ${misc:Depends}
+Description: Development files for libibumad
+ libibumad provides userspace InfiniBand Management Datagram (uMAD)
+ functions which sit on top of the uMAD modules in the kernel.
+ These are used by InfiniBand diagnostic and management tools.
+ .
+ This package is needed to compile programs against libibumad.
+ It contains the header files and static libraries (optionally)
+ needed for compiling.
+
+Package: libibumad3
+Architecture: linux-any
+Multi-Arch: same
+Section: libs
+Pre-Depends: ${misc:Pre-Depends}
+Depends: ${misc:Depends}, ${shlibs:Depends}
+Description: InfiniBand Userspace Management Datagram (uMAD) library
+ libibumad provides userspace InfiniBand Management Datagram (uMAD)
+ functions which sit on top of the uMAD modules in the kernel.
+ These are used by InfiniBand diagnostic and management tools.
+ .
+ This package contains the shared library.
+
+Package: libibumad3-dbg
+Section: debug
+Architecture: linux-any
+Depends: libibumad3 (= ${binary:Version}), ${misc:Depends}
+Description: Debug symbols for the libibumad3 library
+ libibumad provides userspace InfiniBand Management Datagram (uMAD)
+ functions which sit on top of the uMAD modules in the kernel.
+ These are used by InfiniBand diagnostic and management tools.
+ .
+ This package contains the debug symbols associated with
+ libibumad3. They will automatically be used by gdb for debugging
+ libibumad-related issues.
+
+Package: librdmacm-dev
+Section: libdevel
+Architecture: linux-any
+Multi-Arch: same
+Depends: libibverbs-dev, librdmacm1 (= ${binary:Version}), ${misc:Depends}
+Description: Development files for the librdmacm library
+ librdmacm is a library that allows applications to set up reliable
+ connected and unreliable datagram transfers when using RDMA adapters.
+ It provides a transport-neutral interface in the sense that the same
+ code can be used for both InfiniBand and iWARP adapters. The
+ interface is based on sockets, but adapted for queue pair (QP) based
+ semantics: communication must use a specific RDMA device, and data
+ transfers are message-based.
+ .
+ librdmacm only provides communication management (connection setup + and tear-down) and works in conjunction with the verbs interface + provided by libibverbs, which provides the interface used to actually + transfer data. + . + This package is needed to compile programs against librdmacm1. + It contains the header files and static libraries (optionally) + needed for compiling. + +Package: librdmacm1 +Architecture: linux-any +Multi-Arch: same +Section: libs +Pre-Depends: ${misc:Pre-Depends} +Depends: ${misc:Depends}, ${shlibs:Depends} +Description: Library for managing RDMA connections + librdmacm is a library that allows applications to set up reliable + connected and unreliable datagram transfers when using RDMA adapters. + It provides a transport-neutral interface in the sense that the same + code can be used for both InfiniBand and iWARP adapters. The + interface is based on sockets, but adapted for queue pair (QP) based + semantics: communication must use a specific RDMA device, and data + transfers are message-based. + . + librdmacm only provides communication management (connection setup + and tear-down) and works in conjunction with the verbs interface + provided by libibverbs, which provides the interface used to actually + transfer data. + . + This package contains the shared library. + +Package: librdmacm1-dbg +Section: debug +Architecture: linux-any +Depends: librdmacm1 (= ${binary:Version}), ${misc:Depends} +Description: Debug symbols for the librdmacm library + librdmacm is a library that allows applications to set up reliable + connected and unreliable datagram transfers when using RDMA adapters. + It provides a transport-neutral interface in the sense that the same + code can be used for both InfiniBand and iWARP adapters. The + interface is based on sockets, but adapted for queue pair (QP) based + semantics: communication must use a specific RDMA device, and data + transfers are message-based. + . + librdmacm only provides communication management (connection setup + and tear-down) and works in conjunction with the verbs interface + provided by libibverbs, which provides the interface used to actually + transfer data. + . + This package contains the debug symbols associated with + librdmacm1. They will automatically be used by gdb for debugging + librdmacm-related issues. + +Package: rdmacm-utils +Architecture: linux-any +Depends: ${misc:Depends}, ${shlibs:Depends} +Description: Examples for the librdmacm library + librdmacm is a library that allows applications to set up reliable + connected and unreliable datagram transfers when using RDMA adapters. + It provides a transport-neutral interface in the sense that the same + code can be used for both InfiniBand and iWARP adapters. The + interface is based on sockets, but adapted for queue pair (QP) based + semantics: communication must use a specific RDMA device, and data + transfers are message-based. + . + librdmacm only provides communication management (connection setup + and tear-down) and works in conjunction with the verbs interface + provided by libibverbs, which provides the interface used to actually + transfer data. + . + This package contains useful librdmacm1 example programs such as + rping and udaddy. 
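
To make the queue pair (QP) semantics described above concrete, a minimal librdmacm client might look like the sketch below. It is an illustration, not part of the packaged sources: the peer address "192.168.0.10" and port "7471" are placeholders, error handling is abbreviated, and the program would be linked with -lrdmacm -libverbs.

#include <rdma/rdma_cma.h>
#include <rdma/rdma_verbs.h>

int main(void)
{
	struct rdma_addrinfo hints = { .ai_port_space = RDMA_PS_TCP };
	struct rdma_addrinfo *res;
	struct ibv_qp_init_attr attr = {
		.cap = { .max_send_wr = 1, .max_recv_wr = 1,
			 .max_send_sge = 1, .max_recv_sge = 1 },
		.sq_sig_all = 1,
	};
	struct rdma_cm_id *id;
	struct ibv_mr *mr;
	struct ibv_wc wc;
	char buf[32] = "hello";

	/* Resolve the (placeholder) peer, then create an endpoint; with a
	 * QP attribute block this also allocates the queue pair. */
	if (rdma_getaddrinfo("192.168.0.10", "7471", &hints, &res))
		return 1;
	if (rdma_create_ep(&id, res, NULL, &attr))
		return 1;
	rdma_freeaddrinfo(res);

	/* Unlike plain sockets, buffers must be registered before use. */
	mr = rdma_reg_msgs(id, buf, sizeof(buf));
	if (!mr || rdma_connect(id, NULL))
		return 1;

	/* Transfers are message-based: post a send work request, then
	 * reap its completion from the send queue. */
	if (rdma_post_send(id, NULL, buf, sizeof(buf), mr, 0))
		return 1;
	if (rdma_get_send_comp(id, &wc) <= 0 || wc.status != IBV_WC_SUCCESS)
		return 1;

	rdma_disconnect(id);
	rdma_dereg_mr(mr);
	rdma_destroy_ep(id);
	return 0;
}

Where a TCP client would simply connect() and send(), the registration call and the explicit completion reap are the QP-based, message-oriented semantics that distinguish this interface from plain sockets.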
+
+Package: srptools
+Architecture: linux-any
+Depends: lsb-base (>= 3.2-14~),
+ rdma-core (>= 15),
+ udev,
+ ${misc:Depends},
+ ${shlibs:Depends}
+Pre-Depends: ${misc:Pre-Depends}
+Description: Tools for InfiniBand attached storage (SRP)
+ In conjunction with the kernel ib_srp driver, srptools allows you to
+ discover and use InfiniBand attached storage devices which use the
+ SCSI RDMA Protocol (SRP).
+
+Package: python3-pyverbs
+Section: python
+Architecture: linux-any
+Depends: rdma-core (>= 21),
+ ${misc:Depends},
+ ${python3:Depends},
+ ${shlibs:Depends}
+Provides: ${python3:Provides}
+Description: Python bindings for rdma-core
+ Pyverbs provides a Python API over rdma-core, the Linux userspace C API for
+ the remote direct memory access (RDMA) stack.
+ .
+ One goal is to provide easier access to RDMA: RDMA has a steep learning
+ curve as it is, and the C interface requires the user to initialize multiple
+ structs before having usable objects. Pyverbs attempts to remove much of
+ this overhead and provide a smoother user experience.
+
+Package: infiniband-diags
+Architecture: linux-any
+Depends: libibnetdisc5 (= ${binary:Version}),
+ ${misc:Depends},
+ ${perl:Depends},
+ ${shlibs:Depends}
+Description: InfiniBand diagnostic programs
+ InfiniBand is a switched fabric communications link used in
+ high-performance computing and enterprise data centers. Its features
+ include high throughput, low latency, quality of service and
+ failover, and it is designed to be scalable.
+ .
+ This package provides diagnostic programs and scripts needed to
+ diagnose an InfiniBand subnet.
+
+Package: libibmad5
+Section: libs
+Architecture: linux-any
+Pre-Depends: ${misc:Pre-Depends}
+Depends: ${misc:Depends}, ${shlibs:Depends}
+Description: InfiniBand Management Datagram (MAD) library
+ libibmad provides low-layer InfiniBand functions for use by the
+ InfiniBand diagnostic and management programs. These include
+ Management Datagrams (MAD), Subnet Administration (SA), Subnet
+ Management Packets (SMP) and other basic functions.
+ .
+ This package contains the shared library.
+
+Package: libibmad5-dbg
+Section: debug
+Architecture: linux-any
+Pre-Depends: ${misc:Pre-Depends}
+Depends: libibmad5 (= ${binary:Version}), ${misc:Depends}
+Description: Debug symbols for InfiniBand Management Datagram (MAD) library
+ libibmad provides low-layer InfiniBand functions for use by the
+ InfiniBand diagnostic and management programs. These include
+ Management Datagrams (MAD), Subnet Administration (SA), Subnet
+ Management Packets (SMP) and other basic functions.
+ .
+ This package contains the debug symbols associated with
+ libibmad5. They will automatically be used by gdb for debugging
+ libibmad-related issues.
+
+Package: libibmad-dev
+Section: libdevel
+Architecture: linux-any
+Depends: libibmad5 (= ${binary:Version}), ${misc:Depends}
+Description: Development files for libibmad
+ libibmad provides low-layer InfiniBand functions for use by the
+ InfiniBand diagnostic and management programs. These include
+ Management Datagrams (MAD), Subnet Administration (SA), Subnet
+ Management Packets (SMP) and other basic functions.
+ .
+ This package is needed to compile programs against libibmad5.
+ It contains the header files and static libraries (optionally)
+ needed for compiling.
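
The low-layer helpers that libibmad exports (they appear in the libibmad5 symbols file later in this patch) compose roughly as in the following sketch, which issues a single SMP query for the local port's NodeInfo, much as the packaged smpquery and ibstat tools do. It is an illustrative sketch under simplified assumptions (first usable HCA and port, abbreviated error handling), not part of the patch; it would be linked with -libmad -libumad.

#include <inttypes.h>
#include <stdio.h>
#include <infiniband/mad.h>

int main(void)
{
	int classes[3] = { IB_SMI_CLASS, IB_SMI_DIRECT_CLASS, IB_SA_CLASS };
	uint8_t ni[IB_SMP_DATA_SIZE] = { 0 };
	ib_portid_t portid = { 0 };
	struct ibmad_port *port;
	uint64_t guid = 0;

	/* A NULL device name and port 0 mean "first usable CA and port". */
	port = mad_rpc_open_port(NULL, 0, classes, 3);
	if (!port)
		return 1;
	/* Address the local port itself ... */
	if (ib_resolve_self_via(&portid, NULL, NULL, port) < 0)
		return 1;
	/* ... and issue one SMP GET of the NodeInfo attribute. */
	if (!smp_query_via(ni, &portid, IB_ATTR_NODE_INFO, 0, 0, port))
		return 1;
	mad_decode_field(ni, IB_NODE_GUID_F, &guid);
	printf("node GUID: 0x%016" PRIx64 "\n", guid);
	mad_rpc_close_port(port);
	return 0;
}

The same open-port/resolve/query pattern underlies most of the diagnostic programs shipped in the infiniband-diags package above.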
+
+Package: libibnetdisc5
+Section: libs
+Architecture: linux-any
+Pre-Depends: ${misc:Pre-Depends}
+Depends: ${misc:Depends}, ${shlibs:Depends}
+Description: InfiniBand diagnostics library
+ InfiniBand is a switched fabric communications link used in
+ high-performance computing and enterprise data centers. Its features
+ include high throughput, low latency, quality of service and
+ failover, and it is designed to be scalable.
+ .
+ This package provides libraries required by the InfiniBand
+ diagnostic programs.
+
+Package: libibnetdisc5-dbg
+Section: debug
+Architecture: linux-any
+Multi-Arch: same
+Depends: libibnetdisc5 (= ${binary:Version}), ${misc:Depends}
+Description: Debug symbols for the libibnetdisc library
+ InfiniBand is a switched fabric communications link used in
+ high-performance computing and enterprise data centers. Its features
+ include high throughput, low latency, quality of service and
+ failover, and it is designed to be scalable.
+ .
+ This package contains the debug symbols associated with
+ libibnetdisc5. They will automatically be used by gdb for debugging
+ libibnetdisc-related issues.
+
+Package: libibnetdisc-dev
+Section: libdevel
+Architecture: linux-any
+Depends: libibnetdisc5 (= ${binary:Version}), ${misc:Depends}
+Breaks: infiniband-diags (<< 2.0.0)
+Replaces: infiniband-diags (<< 2.0.0)
+Description: InfiniBand diagnostics library headers
+ InfiniBand is a switched fabric communications link used in
+ high-performance computing and enterprise data centers. Its features
+ include high throughput, low latency, quality of service and
+ failover, and it is designed to be scalable.
+ .
+ This package provides development files required to build
+ applications against the libibnetdisc5 InfiniBand diagnostic
+ libraries.
diff --git a/debian/copyright b/debian/copyright
new file mode 100644
index 0000000..3bf582b
--- /dev/null
+++ b/debian/copyright
@@ -0,0 +1,666 @@
+Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
+Upstream-Name: rdma-core
+Upstream-Contact: Doug Ledford <dledford@redhat.com>,
+ Leon Romanovsky <Leon@kernel.org>
+Source: https://github.com/linux-rdma/rdma-core
+
+Files: *
+Copyright: disclaimed
+License: BSD-MIT or GPL-2
+
+Files: debian/*
+Copyright: 2008, Genome Research Ltd
+ 2014, Ana Beatriz Guerrero Lopez <ana@debian.org>
+ 2015-2016, Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
+ 2016-2018, Benjamin Drung <benjamin.drung@cloud.ionos.com>
+ 2016-2017, Talat Batheesh <talatb@mellanox.com>
+License: GPL-2+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ .
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+ .
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ .
+ On Debian systems, the full text of the GNU General Public License
+ version 2 can be found in the file `/usr/share/common-licenses/GPL-2'.
+
+Files: CMakeLists.txt
+Copyright: 2015-2017, Obsidian Research Corporation.
+License: BSD-MIT or GPL-2 + +Files: buildlib/* +Copyright: 2015-2017, Obsidian Research Corporation. + 2016-2017 Mellanox Technologies, Inc +License: BSD-MIT or GPL-2 + +Files: buildlib/fixup-include/stdatomic.h +Copyright: 2011 Ed Schouten <ed@FreeBSD.org> + David Chisnall <theraven@FreeBSD.org> +License: BSD-2-clause + +Files: ccan/* +Copyright: unspecified +License: CC0 + +Files: ccan/list.* +Copyright: unspecified +License: MIT + +Files: ibacm/* +Copyright: 2009-2014, Intel Corporation. + 2013, Mellanox Technologies LTD. +License: BSD-MIT + +Files: ibacm/man/* + ibacm/ibacm.init.in +Copyright: disclaimed +License: BSD-2-clause + +Files: ibacm/CMakeLists.txt + ibacm/ibacm_hosts.data +Copyright: disclaimed +License: BSD-MIT or GPL-2 + +Files: iwpmd/* +Copyright: 2013-2016, Intel Corporation. +License: BSD-MIT or GPL-2 + +Files: kernel-headers/* +Copyright: disclaimed +License: GPL-2 or BSD-2-clause + +Files: kernel-headers/rdma/rdma_netlink.h +Copyright: disclaimed +License: GPL-2 + +Files: kernel-headers/rdma/hfi/* +Copyright: disclaimed +License: GPL-2 or BSD-3-clause + +Files: libibumad/* +Copyright: 2004-2017, Mellanox Technologies Ltd. + 2004, Infinicon Corporation. + 2004-2014, Intel Corporation. + 2004, Topspin Corporation. + 2004-2009, Voltaire Inc. + 2013 Lawrence Livermore National Security + 2013, Oracle and/or its affiliates. +License: BSD-MIT or GPL-2 + +Files: libibumad/man/* +Copyright: disclaimed +License: BSD-2-clause + +Files: libibverbs/* +Copyright: 2004-2012, Intel Corporation. + 2004-2005, Topspin Communications. + 2005-2007, Cisco Systems, Inc. + 2005, PathScale, Inc. + 2005, Mellanox Technologies Ltd. + 2005, Voltaire, Inc. + 2008, Lawrence Livermore National Laboratory. +License: BSD-MIT or GPL-2 + +Files: libibverbs/man/* + libibverbs/neigh.h + libibverbs/neigh.c +Copyright: disclaimed +License: BSD-2-clause + +Files: librdmacm/* +Copyright: 2005-2014, Intel Corporation. + 2005, Ammasso, Inc. + 2005, Voltaire Inc. + 2006, Open Grid Computing, Inc. + 2014-2015, Mellanox Technologies LTD. +License: BSD-MIT or GPL-2 + +Files: librdmacm/examples/cmtime.c + librdmacm/examples/rcopy.c + librdmacm/examples/rdma_client.c + librdmacm/examples/rdma_server.c + librdmacm/examples/rdma_xclient.c + librdmacm/examples/rdma_xserver.c + librdmacm/examples/riostream.c + librdmacm/examples/rstream.c + librdmacm/examples/udpong.c +Copyright: 2005-2014, Intel Corporation. + 2014-2015, Mellanox Technologies LTD. +License: BSD-MIT + +Files: librdmacm/docs/rsocket +Copyright: disclaimed +License: BSD-2-clause + +Files: librdmacm/man/* +Copyright: disclaimed +License: BSD-2-clause + +Files: providers/bnxt_re/* +Copyright: 2015-2017, Broadcom Limited and/or its subsidiaries +License: BSD-2-clause or GPL-2 + +Files: providers/cxgb4/* +Copyright: 2003-2016, Chelsio Communications, Inc. +License: BSD-MIT or GPL-2 + +Files: providers/efa/* +Copyright: 2019 Amazon.com, Inc. or its affiliates. +License: BSD-2-clause or GPL-2 + +Files: providers/hfi1verbs/* +Copyright: 2005 PathScale, Inc. + 2006-2009 QLogic Corporation + 2015 Intel Corporation +License: BSD-3-clause or GPL-2 + +Files: providers/hns/* +Copyright: 2016, Hisilicon Limited. +License: BSD-MIT or GPL-2 + +Files: providers/i40iw/* +Copyright: 2015-2016, Intel Corporation. +License: BSD-MIT or GPL-2 + +Files: providers/ipathverbs/* +Copyright: 2006-2010, QLogic Corp. + 2005, PathScale, Inc. + 2013, Intel Corporation +License: BSD-MIT or GPL-2 + +Files: providers/mlx4/* +Copyright: 2004-2005, Topspin Communications. 
+ 2005-2007, Cisco, Inc. + 2005-2017, Mellanox Technologies Ltd. +License: BSD-MIT or GPL-2 + +Files: providers/mlx5/* +Copyright: 2010-2017, Mellanox Technologies, Inc. +License: BSD-MIT or GPL-2 + +Files: providers/mlx5/man/*.3 + providers/mlx5/man/*.7 +Copyright: disclaimed +License: BSD-MIT + +Files: providers/mthca/* +Copyright: 2004-2005, Topspin Communications. + 2005-2006, Cisco Systems. + 2005, Mellanox Technologies Ltd. +License: BSD-MIT or GPL-2 + +Files: providers/ocrdma/* +Copyright: 2008-2013, Emulex. +License: BSD-2-clause or GPL-2 + +Files: providers/qedr/* +Copyright: 2015-2016, QLogic Corporation. +License: BSD-MIT or GPL-2 + +Files: providers/rxe/* +Copyright: 2009-2011, System Fabric Works, Inc. + 2009-2011, Mellanox Technologies Ltd. + 2006-2007, QLogic Corporation. + 2005, PathScale, Inc. +License: BSD-MIT or GPL-2 + +Files: providers/siw/* +Copyright: 2008-2019, IBM Corporation. +License: BSD-3-clause or GPL-2 + +Files: providers/vmw_pvrdma/* +Copyright: 2012-2016 VMware, Inc. +License: BSD-2-clause or GPL-2 + +Files: rdma-ndd/* +Copyright: 2004-2016, Intel Corporation. +License: BSD-MIT or GPL-2 + +Files: redhat/* +Copyright: 1996-2013, Red Hat, Inc. +License: GPL-2 + +Files: srp_daemon/* +Copyright: 2005, Topspin Communications. + 2006, Cisco Systems, Inc. + 2006, Mellanox Technologies Ltd. +License: BSD-MIT or GPL-2 + +Files: srp_daemon/srp_daemon.8.in +Copyright: 2006 Mellanox Technologies. +License: CPL-1.0 or BSD-2-clause or GPL-2 + +Files: srp_daemon/srpd.in + srp_daemon/ibsrpdm.8 +Copyright: disclaimed +License: BSD-2-clause + +Files: util/udma_barrier.h +Copyright: 2005 Topspin Communications. +License: BSD-MIT or GPL-2 + +License: BSD-MIT + OpenIB.org BSD license (MIT variant) + . + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + . + - Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + . + - Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + . + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + +License: BSD-2-clause + OpenIB.org BSD license (FreeBSD Variant) + . + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + . + - Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + . + - Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + . + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +License: BSD-3-clause + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + . + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + . + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +License: GPL-2 + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + . + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + . + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + . + On Debian systems, the full text of the GNU General Public License + version 2 can be found in the file `/usr/share/common-licenses/GPL-2'. + +License: CC0 + The laws of most jurisdictions throughout the world automatically confer + exclusive Copyright and Related Rights (defined below) upon the creator and + subsequent owner(s) (each and all, an "owner") of an original work of + authorship and/or a database (each, a "Work"). + . 
+ Certain owners wish to permanently relinquish those rights to a Work for the + purpose of contributing to a commons of creative, cultural and scientific works + ("Commons") that the public can reliably and without fear of later claims of + infringement build upon, modify, incorporate in other works, reuse and + redistribute as freely as possible in any form whatsoever and for any purposes, + including without limitation commercial purposes. These owners may contribute + to the Commons to promote the ideal of a free culture and the further + production of creative, cultural and scientific works, or to gain reputation or + greater distribution for their Work in part through the use and efforts of + others. + . + For these and/or other purposes and motivations, and without any expectation of + additional consideration or compensation, the person associating CC0 with a + Work (the "Affirmer"), to the extent that he or she is an owner of Copyright + and Related Rights in the Work, voluntarily elects to apply CC0 to the Work and + publicly distribute the Work under its terms, with knowledge of his or her + Copyright and Related Rights in the Work and the meaning and intended legal + effect of CC0 on those rights. + . + 1. Copyright and Related Rights. A Work made available under CC0 may be + protected by copyright and related or neighboring rights ("Copyright and + Related Rights"). Copyright and Related Rights include, but are not limited to, + the following: + . + the right to reproduce, adapt, distribute, perform, display, communicate, + and translate a Work; moral rights retained by the original author(s) and/or + performer(s); publicity and privacy rights pertaining to a person's image or + likeness depicted in a Work; rights protecting against unfair competition in + regards to a Work, subject to the limitations in paragraph 4(a), below; rights + protecting the extraction, dissemination, use and reuse of data in a Work; + database rights (such as those arising under Directive 96/9/EC of the European + Parliament and of the Council of 11 March 1996 on the legal protection of + databases, and under any national implementation thereof, including any amended + or successor version of such directive); and other similar, equivalent or + corresponding rights throughout the world based on applicable law or treaty, + and any national implementations thereof. + . + 2. Waiver. To the greatest extent permitted by, but not in contravention of, + applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and + unconditionally waives, abandons, and surrenders all of Affirmer's Copyright + and Related Rights and associated claims and causes of action, whether now + known or unknown (including existing as well as future claims and causes of + action), in the Work (i) in all territories worldwide, (ii) for the maximum + duration provided by applicable law or treaty (including future time + extensions), (iii) in any current or future medium and for any number of + copies, and (iv) for any purpose whatsoever, including without limitation + commercial, advertising or promotional purposes (the "Waiver"). 
Affirmer makes + the Waiver for the benefit of each member of the public at large and to the + detriment of Affirmer's heirs and successors, fully intending that such Waiver + shall not be subject to revocation, rescission, cancellation, termination, or + any other legal or equitable action to disrupt the quiet enjoyment of the Work + by the public as contemplated by Affirmer's express Statement of Purpose. + . + 3. Public License Fallback. Should any part of the Waiver for any reason be + judged legally invalid or ineffective under applicable law, then the Waiver + shall be preserved to the maximum extent permitted taking into account + Affirmer's express Statement of Purpose. In addition, to the extent the Waiver + is so judged Affirmer hereby grants to each affected person a royalty-free, non + transferable, non sublicensable, non exclusive, irrevocable and unconditional + license to exercise Affirmer's Copyright and Related Rights in the Work (i) in + all territories worldwide, (ii) for the maximum duration provided by applicable + law or treaty (including future time extensions), (iii) in any current or + future medium and for any number of copies, and (iv) for any purpose + whatsoever, including without limitation commercial, advertising or promotional + purposes (the "License"). The License shall be deemed effective as of the date + CC0 was applied by Affirmer to the Work. Should any part of the License for any + reason be judged legally invalid or ineffective under applicable law, such + partial invalidity or ineffectiveness shall not invalidate the remainder of the + License, and in such case Affirmer hereby affirms that he or she will not (i) + exercise any of his or her remaining Copyright and Related Rights in the Work + or (ii) assert any associated claims and causes of action with respect to the + Work, in either case contrary to Affirmer's express Statement of Purpose. + . + 4. Limitations and Disclaimers. + . + No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. Affirmer offers + the Work as-is and makes no representations or warranties of any kind + concerning the Work, express, implied, statutory or otherwise, including + without limitation warranties of title, merchantability, fitness for a + particular purpose, non infringement, or the absence of latent or other + defects, accuracy, or the present or absence of errors, whether or not + discoverable, all to the greatest extent permissible under applicable law. + Affirmer disclaims responsibility for clearing rights of other persons that may + apply to the Work or any use thereof, including without limitation any person's + Copyright and Related Rights in the Work. Further, Affirmer disclaims + responsibility for obtaining any necessary consents, permissions or other + rights required for any use of the Work. Affirmer understands and acknowledges + that Creative Commons is not a party to this document and has no duty or + obligation with respect to this CC0 or use of the Work. + +License: MIT + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + . 
+ The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + . + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + +License: CPL-1.0 + THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS COMMON PUBLIC + LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM + CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. + . + 1. DEFINITIONS + . + "Contribution" means: + . + a) in the case of the initial Contributor, the initial code and + documentation distributed under this Agreement, and + . + b) in the case of each subsequent Contributor: + . + i) changes to the Program, and + . + ii) additions to the Program; + . + where such changes and/or additions to the Program originate from and are + distributed by that particular Contributor. A Contribution 'originates' from a + Contributor if it was added to the Program by such Contributor itself or anyone + acting on such Contributor's behalf. Contributions do not include additions to + the Program which: (i) are separate modules of software distributed in + conjunction with the Program under their own license agreement, and (ii) are not + derivative works of the Program. + . + "Contributor" means any person or entity that distributes the Program. + . + "Licensed Patents " mean patent claims licensable by a Contributor which are + necessarily infringed by the use or sale of its Contribution alone or when + combined with the Program. + . + "Program" means the Contributions distributed in accordance with this Agreement. + . + "Recipient" means anyone who receives the Program under this Agreement, + including all Contributors. + . + 2. GRANT OF RIGHTS + . + a) Subject to the terms of this Agreement, each Contributor hereby grants + Recipient a non-exclusive, worldwide, royalty-free copyright license to + reproduce, prepare derivative works of, publicly display, publicly perform, + distribute and sublicense the Contribution of such Contributor, if any, and such + derivative works, in source code and object code form. + . + b) Subject to the terms of this Agreement, each Contributor hereby grants + Recipient a non-exclusive, worldwide, royalty-free patent license under Licensed + Patents to make, use, sell, offer to sell, import and otherwise transfer the + Contribution of such Contributor, if any, in source code and object code form. + This patent license shall apply to the combination of the Contribution and the + Program if, at the time the Contribution is added by the Contributor, such + addition of the Contribution causes such combination to be covered by the + Licensed Patents. The patent license shall not apply to any other combinations + which include the Contribution. No hardware per se is licensed hereunder. + . + c) Recipient understands that although each Contributor grants the licenses + to its Contributions set forth herein, no assurances are provided by any + Contributor that the Program does not infringe the patent or other intellectual + property rights of any other entity. 
Each Contributor disclaims any liability to + Recipient for claims brought by any other entity based on infringement of + intellectual property rights or otherwise. As a condition to exercising the + rights and licenses granted hereunder, each Recipient hereby assumes sole + responsibility to secure any other intellectual property rights needed, if any. + For example, if a third party patent license is required to allow Recipient to + distribute the Program, it is Recipient's responsibility to acquire that license + before distributing the Program. + . + d) Each Contributor represents that to its knowledge it has sufficient + copyright rights in its Contribution, if any, to grant the copyright license set + forth in this Agreement. + . + 3. REQUIREMENTS + . + A Contributor may choose to distribute the Program in object code form under its + own license agreement, provided that: + . + a) it complies with the terms and conditions of this Agreement; and + . + b) its license agreement: + . + i) effectively disclaims on behalf of all Contributors all warranties and + conditions, express and implied, including warranties or conditions of title and + non-infringement, and implied warranties or conditions of merchantability and + fitness for a particular purpose; + . + ii) effectively excludes on behalf of all Contributors all liability for + damages, including direct, indirect, special, incidental and consequential + damages, such as lost profits; + . + iii) states that any provisions which differ from this Agreement are offered + by that Contributor alone and not by any other party; and + . + iv) states that source code for the Program is available from such + Contributor, and informs licensees how to obtain it in a reasonable manner on or + through a medium customarily used for software exchange. + . + When the Program is made available in source code form: + . + a) it must be made available under this Agreement; and + . + b) a copy of this Agreement must be included with each copy of the Program. + . + Contributors may not remove or alter any copyright notices contained within the + Program. + . + Each Contributor must identify itself as the originator of its Contribution, if + any, in a manner that reasonably allows subsequent Recipients to identify the + originator of the Contribution. + . + 4. COMMERCIAL DISTRIBUTION + . + Commercial distributors of software may accept certain responsibilities with + respect to end users, business partners and the like. While this license is + intended to facilitate the commercial use of the Program, the Contributor who + includes the Program in a commercial product offering should do so in a manner + which does not create potential liability for other Contributors. Therefore, if + a Contributor includes the Program in a commercial product offering, such + Contributor ("Commercial Contributor") hereby agrees to defend and indemnify + every other Contributor ("Indemnified Contributor") against any losses, damages + and costs (collectively "Losses") arising from claims, lawsuits and other legal + actions brought by a third party against the Indemnified Contributor to the + extent caused by the acts or omissions of such Commercial Contributor in + connection with its distribution of the Program in a commercial product + offering. The obligations in this section do not apply to any claims or Losses + relating to any actual or alleged intellectual property infringement. 
In order + to qualify, an Indemnified Contributor must: a) promptly notify the Commercial + Contributor in writing of such claim, and b) allow the Commercial Contributor to + control, and cooperate with the Commercial Contributor in, the defense and any + related settlement negotiations. The Indemnified Contributor may participate in + any such claim at its own expense. + . + For example, a Contributor might include the Program in a commercial product + offering, Product X. That Contributor is then a Commercial Contributor. If that + Commercial Contributor then makes performance claims, or offers warranties + related to Product X, those performance claims and warranties are such + Commercial Contributor's responsibility alone. Under this section, the + Commercial Contributor would have to defend claims against the other + Contributors related to those performance claims and warranties, and if a court + requires any other Contributor to pay any damages as a result, the Commercial + Contributor must pay those damages. + . + 5. NO WARRANTY + . + EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON AN + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR + IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF TITLE, + NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. Each + Recipient is solely responsible for determining the appropriateness of using and + distributing the Program and assumes all risks associated with its exercise of + rights under this Agreement, including but not limited to the risks and costs of + program errors, compliance with applicable laws, damage to or loss of data, + programs or equipment, and unavailability or interruption of operations. + . + 6. DISCLAIMER OF LIABILITY + . + EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY + CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION LOST + PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE EXERCISE OF ANY RIGHTS + GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + . + 7. GENERAL + . + If any provision of this Agreement is invalid or unenforceable under applicable + law, it shall not affect the validity or enforceability of the remainder of the + terms of this Agreement, and without further action by the parties hereto, such + provision shall be reformed to the minimum extent necessary to make such + provision valid and enforceable. + . + If Recipient institutes patent litigation against a Contributor with respect to + a patent applicable to software (including a cross-claim or counterclaim in a + lawsuit), then any patent licenses granted by that Contributor to such Recipient + under this Agreement shall terminate as of the date such litigation is filed. In + addition, if Recipient institutes patent litigation against any entity + (including a cross-claim or counterclaim in a lawsuit) alleging that the Program + itself (excluding combinations of the Program with other software or hardware) + infringes such Recipient's patent(s), then such Recipient's rights granted under + Section 2(b) shall terminate as of the date such litigation is filed. + . 
+ All Recipient's rights under this Agreement shall terminate if it fails to + comply with any of the material terms or conditions of this Agreement and does + not cure such failure in a reasonable period of time after becoming aware of + such noncompliance. If all Recipient's rights under this Agreement terminate, + Recipient agrees to cease use and distribution of the Program as soon as + reasonably practicable. However, Recipient's obligations under this Agreement + and any licenses granted by Recipient relating to the Program shall continue and + survive. + . + Everyone is permitted to copy and distribute copies of this Agreement, but in + order to avoid inconsistency the Agreement is copyrighted and may only be + modified in the following manner. The Agreement Steward reserves the right to + publish new versions (including revisions) of this Agreement from time to time. + No one other than the Agreement Steward has the right to modify this Agreement. + IBM is the initial Agreement Steward. IBM may assign the responsibility to serve + as the Agreement Steward to a suitable separate entity. Each new version of the + Agreement will be given a distinguishing version number. The Program (including + Contributions) may always be distributed subject to the version of the Agreement + under which it was received. In addition, after a new version of the Agreement + is published, Contributor may elect to distribute the Program (including its + Contributions) under the new version. Except as expressly stated in Sections + 2(a) and 2(b) above, Recipient receives no rights or licenses to the + intellectual property of any Contributor under this Agreement, whether + expressly, by implication, estoppel or otherwise. All rights in the Program not + expressly granted under this Agreement are reserved. + . + This Agreement is governed by the laws of the State of New York and the + intellectual property laws of the United States of America. No party to this + Agreement will bring a legal action under this Agreement more than one year + after the cause of action arose. Each party waives its rights to a jury trial in + any resulting litigation. 
diff --git a/debian/ibacm.install b/debian/ibacm.install
new file mode 100644
index 0000000..4cf6a1b
--- /dev/null
+++ b/debian/ibacm.install
@@ -0,0 +1,12 @@
+lib/systemd/system/ibacm.service
+lib/systemd/system/ibacm.socket
+usr/bin/ib_acme
+usr/include/infiniband/acm.h
+usr/include/infiniband/acm_prov.h
+usr/lib/*/ibacm/libibacmp.so
+usr/sbin/ibacm
+usr/share/doc/rdma-core/ibacm.md usr/share/doc/ibacm/
+usr/share/man/man1/ib_acme.1
+usr/share/man/man7/ibacm.7
+usr/share/man/man7/ibacm_prov.7
+usr/share/man/man8/ibacm.8
diff --git a/debian/ibacm.lintian-overrides b/debian/ibacm.lintian-overrides
new file mode 100644
index 0000000..66fffd9
--- /dev/null
+++ b/debian/ibacm.lintian-overrides
@@ -0,0 +1,2 @@
+# The wantedby target rdma-hw.target is intentional (see rdma-core)
+ibacm: systemd-service-file-refers-to-unusual-wantedby-target lib/systemd/system/ibacm.service rdma-hw.target
diff --git a/debian/ibacm.maintscript b/debian/ibacm.maintscript
new file mode 100644
index 0000000..c79d4bd
--- /dev/null
+++ b/debian/ibacm.maintscript
@@ -0,0 +1 @@
+rm_conffile /etc/init.d/ibacm 19.0-1ubuntu1~
diff --git a/debian/ibverbs-providers.install b/debian/ibverbs-providers.install
new file mode 100644
index 0000000..4f971fb
--- /dev/null
+++ b/debian/ibverbs-providers.install
@@ -0,0 +1,5 @@
+etc/libibverbs.d/
+usr/lib/*/libefa.so.*
+usr/lib/*/libibverbs/lib*-rdmav*.so
+usr/lib/*/libmlx4.so.*
+usr/lib/*/libmlx5.so.*
diff --git a/debian/ibverbs-providers.lintian-overrides b/debian/ibverbs-providers.lintian-overrides
new file mode 100644
index 0000000..8a44d54
--- /dev/null
+++ b/debian/ibverbs-providers.lintian-overrides
@@ -0,0 +1,2 @@
+# libefa, libmlx4 and libmlx5 are ibverbs providers that provide more functions.
+ibverbs-providers: package-name-doesnt-match-sonames libefa1 libmlx4-1 libmlx5-1
diff --git a/debian/ibverbs-providers.maintscript b/debian/ibverbs-providers.maintscript
new file mode 100644
index 0000000..4ca22c5
--- /dev/null
+++ b/debian/ibverbs-providers.maintscript
@@ -0,0 +1,2 @@
+rm_conffile /etc/libibverbs.d/cxgb3.driver 27.0-2~
+rm_conffile /etc/libibverbs.d/nes.driver 27.0-2~
diff --git a/debian/ibverbs-providers.symbols b/debian/ibverbs-providers.symbols
new file mode 100644
index 0000000..34decae
--- /dev/null
+++ b/debian/ibverbs-providers.symbols
@@ -0,0 +1,108 @@
+libmlx4.so.1 ibverbs-providers #MINVER#
+* Build-Depends-Package: libibverbs-dev
+ MLX4_1.0@MLX4_1.0 15
+ mlx4dv_init_obj@MLX4_1.0 15
+ mlx4dv_query_device@MLX4_1.0 15
+ mlx4dv_create_qp@MLX4_1.0 15
+ mlx4dv_set_context_attr@MLX4_1.0 15
+libmlx5.so.1 ibverbs-providers #MINVER#
+* Build-Depends-Package: libibverbs-dev
+ MLX5_1.0@MLX5_1.0 13
+ MLX5_1.1@MLX5_1.1 14
+ MLX5_1.2@MLX5_1.2 15
+ MLX5_1.3@MLX5_1.3 16
+ MLX5_1.4@MLX5_1.4 17
+ MLX5_1.5@MLX5_1.5 18
+ MLX5_1.6@MLX5_1.6 20
+ MLX5_1.7@MLX5_1.7 21
+ MLX5_1.8@MLX5_1.8 22
+ MLX5_1.9@MLX5_1.9 23
+ MLX5_1.10@MLX5_1.10 24
+ MLX5_1.11@MLX5_1.11 25
+ MLX5_1.12@MLX5_1.12 28
+ MLX5_1.13@MLX5_1.13 29
+ mlx5dv_init_obj@MLX5_1.0 13
+ mlx5dv_init_obj@MLX5_1.2 15
+ mlx5dv_query_device@MLX5_1.0 13
+ mlx5dv_create_cq@MLX5_1.1 14
+ mlx5dv_set_context_attr@MLX5_1.2 15
+ mlx5dv_create_qp@MLX5_1.3 16
+ mlx5dv_create_wq@MLX5_1.3 16
+ mlx5dv_get_clock_info@MLX5_1.4 17
+ mlx5dv_create_flow_action_esp@MLX5_1.5 18
+ mlx5dv_create_flow_matcher@MLX5_1.6 20
+ mlx5dv_destroy_flow_matcher@MLX5_1.6 20
+ mlx5dv_create_flow@MLX5_1.6 20
+ mlx5dv_create_flow_action_modify_header@MLX5_1.7 21
+ mlx5dv_create_flow_action_packet_reformat@MLX5_1.7 21
+ mlx5dv_devx_alloc_uar@MLX5_1.7 21
+ 
mlx5dv_devx_free_uar@MLX5_1.7 21 + mlx5dv_devx_general_cmd@MLX5_1.7 21 + mlx5dv_devx_obj_create@MLX5_1.7 21 + mlx5dv_devx_obj_destroy@MLX5_1.7 21 + mlx5dv_devx_obj_modify@MLX5_1.7 21 + mlx5dv_devx_obj_query@MLX5_1.7 21 + mlx5dv_devx_query_eqn@MLX5_1.7 21 + mlx5dv_devx_umem_dereg@MLX5_1.7 21 + mlx5dv_devx_umem_reg@MLX5_1.7 21 + mlx5dv_open_device@MLX5_1.7 21 + mlx5dv_devx_cq_modify@MLX5_1.8 22 + mlx5dv_devx_cq_query@MLX5_1.8 22 + mlx5dv_devx_ind_tbl_modify@MLX5_1.8 22 + mlx5dv_devx_ind_tbl_query@MLX5_1.8 22 + mlx5dv_devx_qp_modify@MLX5_1.8 22 + mlx5dv_devx_qp_query@MLX5_1.8 22 + mlx5dv_devx_srq_modify@MLX5_1.8 22 + mlx5dv_devx_srq_query@MLX5_1.8 22 + mlx5dv_devx_wq_modify@MLX5_1.8 22 + mlx5dv_devx_wq_query@MLX5_1.8 22 + mlx5dv_is_supported@MLX5_1.8 22 + mlx5dv_devx_create_cmd_comp@MLX5_1.9 23 + mlx5dv_devx_destroy_cmd_comp@MLX5_1.9 23 + mlx5dv_devx_get_async_cmd_comp@MLX5_1.9 23 + mlx5dv_devx_obj_query_async@MLX5_1.9 23 + mlx5dv_alloc_dm@MLX5_1.10 24 + mlx5dv_create_mkey@MLX5_1.10 24 + mlx5dv_destroy_mkey@MLX5_1.10 24 + mlx5dv_dr_action_create_dest_table@MLX5_1.10 24 + mlx5dv_dr_action_create_dest_ibv_qp@MLX5_1.10 24 + mlx5dv_dr_action_create_dest_vport@MLX5_1.10 24 + mlx5dv_dr_action_create_flow_counter@MLX5_1.10 24 + mlx5dv_dr_action_create_drop@MLX5_1.10 24 + mlx5dv_dr_action_create_modify_header@MLX5_1.10 24 + mlx5dv_dr_action_create_packet_reformat@MLX5_1.10 24 + mlx5dv_dr_action_create_tag@MLX5_1.10 24 + mlx5dv_dr_action_destroy@MLX5_1.10 24 + mlx5dv_dr_domain_create@MLX5_1.10 24 + mlx5dv_dr_domain_destroy@MLX5_1.10 24 + mlx5dv_dr_domain_sync@MLX5_1.10 24 + mlx5dv_dr_matcher_create@MLX5_1.10 24 + mlx5dv_dr_matcher_destroy@MLX5_1.10 24 + mlx5dv_dr_rule_create@MLX5_1.10 24 + mlx5dv_dr_rule_destroy@MLX5_1.10 24 + mlx5dv_dr_table_create@MLX5_1.10 24 + mlx5dv_dr_table_destroy@MLX5_1.10 24 + mlx5dv_qp_ex_from_ibv_qp_ex@MLX5_1.10 24 + mlx5dv_devx_create_event_channel@MLX5_1.11 25 + mlx5dv_devx_destroy_event_channel@MLX5_1.11 25 + mlx5dv_devx_get_event@MLX5_1.11 25 + mlx5dv_devx_subscribe_devx_event@MLX5_1.11 25 + mlx5dv_devx_subscribe_devx_event_fd@MLX5_1.11 25 + mlx5dv_alloc_var@MLX5_1.12 28 + mlx5dv_dr_action_create_flow_meter@MLX5_1.12 28 + mlx5dv_dr_action_modify_flow_meter@MLX5_1.12 28 + mlx5dv_dump_dr_domain@MLX5_1.12 28 + mlx5dv_dump_dr_matcher@MLX5_1.12 28 + mlx5dv_dump_dr_rule@MLX5_1.12 28 + mlx5dv_dump_dr_table@MLX5_1.12 28 + mlx5dv_free_var@MLX5_1.12 28 + mlx5dv_pp_alloc@MLX5_1.13 29 + mlx5dv_pp_free@MLX5_1.13 29 +libefa.so.1 ibverbs-providers #MINVER# +* Build-Depends-Package: libibverbs-dev + EFA_1.0@EFA_1.0 24 + EFA_1.1@EFA_1.1 26 + efadv_create_driver_qp@EFA_1.0 24 + efadv_create_qp_ex@EFA_1.1 26 + efadv_query_device@EFA_1.1 26 + efadv_query_ah@EFA_1.1 26 diff --git a/debian/ibverbs-utils.install b/debian/ibverbs-utils.install new file mode 100644 index 0000000..170b8d2 --- /dev/null +++ b/debian/ibverbs-utils.install @@ -0,0 +1,16 @@ +usr/bin/ibv_asyncwatch +usr/bin/ibv_devices +usr/bin/ibv_devinfo +usr/bin/ibv_rc_pingpong +usr/bin/ibv_srq_pingpong +usr/bin/ibv_uc_pingpong +usr/bin/ibv_ud_pingpong +usr/bin/ibv_xsrq_pingpong +usr/share/man/man1/ibv_asyncwatch.1 +usr/share/man/man1/ibv_devices.1 +usr/share/man/man1/ibv_devinfo.1 +usr/share/man/man1/ibv_rc_pingpong.1 +usr/share/man/man1/ibv_srq_pingpong.1 +usr/share/man/man1/ibv_uc_pingpong.1 +usr/share/man/man1/ibv_ud_pingpong.1 +usr/share/man/man1/ibv_xsrq_pingpong.1 diff --git a/debian/infiniband-diags.install b/debian/infiniband-diags.install new file mode 100644 index 0000000..5cbda3a --- /dev/null +++ 
b/debian/infiniband-diags.install @@ -0,0 +1,64 @@ +etc/infiniband-diags/error_thresholds +etc/infiniband-diags/ibdiag.conf +usr/sbin/check_lft_balance +usr/sbin/dump_fts +usr/sbin/dump_lfts +usr/sbin/dump_mfts +usr/sbin/ibaddr +usr/sbin/ibcacheedit +usr/sbin/ibccconfig +usr/sbin/ibccquery +usr/sbin/ibfindnodesusing +usr/sbin/ibhosts +usr/sbin/ibidsverify +usr/sbin/iblinkinfo +usr/sbin/ibnetdiscover +usr/sbin/ibnodes +usr/sbin/ibping +usr/sbin/ibportstate +usr/sbin/ibqueryerrors +usr/sbin/ibroute +usr/sbin/ibrouters +usr/sbin/ibstat +usr/sbin/ibstatus +usr/sbin/ibswitches +usr/sbin/ibsysstat +usr/sbin/ibtracert +usr/sbin/perfquery +usr/sbin/saquery +usr/sbin/sminfo +usr/sbin/smpdump +usr/sbin/smpquery +usr/sbin/vendstat +usr/share/man/man8/check_lft_balance.8 +usr/share/man/man8/dump_fts.8 +usr/share/man/man8/dump_lfts.8 +usr/share/man/man8/dump_mfts.8 +usr/share/man/man8/ibaddr.8 +usr/share/man/man8/ibcacheedit.8 +usr/share/man/man8/ibccconfig.8 +usr/share/man/man8/ibccquery.8 +usr/share/man/man8/ibfindnodesusing.8 +usr/share/man/man8/ibhosts.8 +usr/share/man/man8/ibidsverify.8 +usr/share/man/man8/iblinkinfo.8 +usr/share/man/man8/ibnetdiscover.8 +usr/share/man/man8/ibnodes.8 +usr/share/man/man8/ibping.8 +usr/share/man/man8/ibportstate.8 +usr/share/man/man8/ibqueryerrors.8 +usr/share/man/man8/ibroute.8 +usr/share/man/man8/ibrouters.8 +usr/share/man/man8/ibstat.8 +usr/share/man/man8/ibstatus.8 +usr/share/man/man8/ibswitches.8 +usr/share/man/man8/ibsysstat.8 +usr/share/man/man8/ibtracert.8 +usr/share/man/man8/infiniband-diags.8 +usr/share/man/man8/perfquery.8 +usr/share/man/man8/saquery.8 +usr/share/man/man8/sminfo.8 +usr/share/man/man8/smpdump.8 +usr/share/man/man8/smpquery.8 +usr/share/man/man8/vendstat.8 +usr/share/perl5/IBswcountlimits.pm diff --git a/debian/libibmad-dev.install b/debian/libibmad-dev.install new file mode 100644 index 0000000..0083650 --- /dev/null +++ b/debian/libibmad-dev.install @@ -0,0 +1,5 @@ +usr/include/infiniband/mad.h +usr/include/infiniband/mad_osd.h +usr/lib/*/libibmad*.a +usr/lib/*/libibmad*.so +usr/lib/*/pkgconfig/libibmad.pc diff --git a/debian/libibmad5.install b/debian/libibmad5.install new file mode 100644 index 0000000..d89b393 --- /dev/null +++ b/debian/libibmad5.install @@ -0,0 +1 @@ +usr/lib/*/libibmad*.so.* diff --git a/debian/libibmad5.symbols b/debian/libibmad5.symbols new file mode 100644 index 0000000..851b6db --- /dev/null +++ b/debian/libibmad5.symbols @@ -0,0 +1,155 @@ +libibmad.so.5 libibmad5 #MINVER# +* Build-Depends-Package: libibmad-dev + IBMAD_1.3@IBMAD_1.3 1.3.11 + bm_call_via@IBMAD_1.3 1.3.11 + cc_config_status_via@IBMAD_1.3 1.3.11 + cc_query_status_via@IBMAD_1.3 1.3.11 + drpath2str@IBMAD_1.3 1.3.11 + ib_node_query_via@IBMAD_1.3 1.3.11 + ib_path_query@IBMAD_1.3 1.3.11 + ib_path_query_via@IBMAD_1.3 1.3.11 + ib_resolve_gid_via@IBMAD_1.3 1.3.11 + ib_resolve_guid_via@IBMAD_1.3 1.3.11 + ib_resolve_portid_str@IBMAD_1.3 1.3.11 + ib_resolve_portid_str_via@IBMAD_1.3 1.3.11 + ib_resolve_self@IBMAD_1.3 1.3.11 + ib_resolve_self_via@IBMAD_1.3 1.3.11 + ib_resolve_smlid@IBMAD_1.3 1.3.11 + ib_resolve_smlid_via@IBMAD_1.3 1.3.11 + ib_vendor_call@IBMAD_1.3 1.3.11 + ib_vendor_call_via@IBMAD_1.3 1.3.11 + ibdebug@IBMAD_1.3 1.3.11 + mad_alloc@IBMAD_1.3 1.3.11 + mad_build_pkt@IBMAD_1.3 1.3.11 + mad_class_agent@IBMAD_1.3 1.3.11 + mad_decode_field@IBMAD_1.3 1.3.11 + mad_dump_array@IBMAD_1.3 1.3.11 + mad_dump_bitfield@IBMAD_1.3 1.3.11 + mad_dump_cc_cacongestionentry@IBMAD_1.3 1.3.11 + mad_dump_cc_cacongestionsetting@IBMAD_1.3 1.3.11 + 
mad_dump_cc_congestioncontroltable@IBMAD_1.3 1.3.11 + mad_dump_cc_congestioncontroltableentry@IBMAD_1.3 1.3.11 + mad_dump_cc_congestioninfo@IBMAD_1.3 1.3.11 + mad_dump_cc_congestionkeyinfo@IBMAD_1.3 1.3.11 + mad_dump_cc_congestionlog@IBMAD_1.3 1.3.11 + mad_dump_cc_congestionlogca@IBMAD_1.3 1.3.11 + mad_dump_cc_congestionlogentryca@IBMAD_1.3 1.3.11 + mad_dump_cc_congestionlogentryswitch@IBMAD_1.3 1.3.11 + mad_dump_cc_congestionlogswitch@IBMAD_1.3 1.3.11 + mad_dump_cc_switchcongestionsetting@IBMAD_1.3 1.3.11 + mad_dump_cc_switchportcongestionsettingelement@IBMAD_1.3 1.3.11 + mad_dump_cc_timestamp@IBMAD_1.3 1.3.11 + mad_dump_classportinfo@IBMAD_1.3 1.3.11 + mad_dump_field@IBMAD_1.3 1.3.11 + mad_dump_fields@IBMAD_1.3 1.3.11 + mad_dump_hex@IBMAD_1.3 1.3.11 + mad_dump_int@IBMAD_1.3 1.3.11 + mad_dump_linkdowndefstate@IBMAD_1.3 1.3.11 + mad_dump_linkspeed@IBMAD_1.3 1.3.11 + mad_dump_linkspeeden@IBMAD_1.3 1.3.11 + mad_dump_linkspeedext@IBMAD_1.3 1.3.11 + mad_dump_linkspeedexten@IBMAD_1.3 1.3.11 + mad_dump_linkspeedextsup@IBMAD_1.3 1.3.11 + mad_dump_linkspeedsup@IBMAD_1.3 1.3.11 + mad_dump_linkwidth@IBMAD_1.3 1.3.11 + mad_dump_linkwidthen@IBMAD_1.3 1.3.11 + mad_dump_linkwidthsup@IBMAD_1.3 1.3.11 + mad_dump_mlnx_ext_port_info@IBMAD_1.3 1.3.11 + mad_dump_mtu@IBMAD_1.3 1.3.11 + mad_dump_node_type@IBMAD_1.3 1.3.11 + mad_dump_nodedesc@IBMAD_1.3 1.3.11 + mad_dump_nodeinfo@IBMAD_1.3 1.3.11 + mad_dump_opervls@IBMAD_1.3 1.3.11 + mad_dump_perfcounters@IBMAD_1.3 1.3.11 + mad_dump_perfcounters_ext@IBMAD_1.3 1.3.11 + mad_dump_perfcounters_port_flow_ctl_counters@IBMAD_1.3 1.3.11 + mad_dump_perfcounters_port_op_rcv_counters@IBMAD_1.3 1.3.11 + mad_dump_perfcounters_port_vl_op_data@IBMAD_1.3 1.3.11 + mad_dump_perfcounters_port_vl_op_packet@IBMAD_1.3 1.3.11 + mad_dump_perfcounters_port_vl_xmit_flow_ctl_update_errors@IBMAD_1.3 1.3.11 + mad_dump_perfcounters_port_vl_xmit_wait_counters@IBMAD_1.3 1.3.11 + mad_dump_perfcounters_rcv_con_ctrl@IBMAD_1.3 1.3.11 + mad_dump_perfcounters_rcv_err@IBMAD_1.3 1.3.11 + mad_dump_perfcounters_rcv_sl@IBMAD_1.3 1.3.11 + mad_dump_perfcounters_sl_rcv_becn@IBMAD_1.3 1.3.11 + mad_dump_perfcounters_sl_rcv_fecn@IBMAD_1.3 1.3.11 + mad_dump_perfcounters_sw_port_vl_congestion@IBMAD_1.3 1.3.11 + mad_dump_perfcounters_vl_xmit_time_cong@IBMAD_1.3 1.3.11 + mad_dump_perfcounters_xmit_con_ctrl@IBMAD_1.3 1.3.11 + mad_dump_perfcounters_xmt_disc@IBMAD_1.3 1.3.11 + mad_dump_perfcounters_xmt_sl@IBMAD_1.3 1.3.11 + mad_dump_physportstate@IBMAD_1.3 1.3.11 + mad_dump_port_ext_speeds_counters@IBMAD_1.3 1.3.11 + mad_dump_port_ext_speeds_counters_rsfec_active@IBMAD_1.3 1.3.12 + mad_dump_portcapmask2@IBMAD_1.3 2.1.0 + mad_dump_portcapmask@IBMAD_1.3 1.3.11 + mad_dump_portinfo@IBMAD_1.3 1.3.11 + mad_dump_portinfo_ext@IBMAD_1.3 1.3.12 + mad_dump_portsamples_control@IBMAD_1.3 1.3.11 + mad_dump_portsamples_result@IBMAD_1.3 1.3.11 + mad_dump_portstate@IBMAD_1.3 1.3.11 + mad_dump_portstates@IBMAD_1.3 1.3.11 + mad_dump_rhex@IBMAD_1.3 1.3.11 + mad_dump_sltovl@IBMAD_1.3 1.3.11 + mad_dump_string@IBMAD_1.3 1.3.11 + mad_dump_switchinfo@IBMAD_1.3 1.3.11 + mad_dump_uint@IBMAD_1.3 1.3.11 + mad_dump_val@IBMAD_1.3 1.3.11 + mad_dump_vlarbitration@IBMAD_1.3 1.3.11 + mad_dump_vlcap@IBMAD_1.3 1.3.11 + mad_encode@IBMAD_1.3 1.3.11 + mad_encode_field@IBMAD_1.3 1.3.11 + mad_field_name@IBMAD_1.3 1.3.11 + mad_free@IBMAD_1.3 1.3.11 + mad_get_array@IBMAD_1.3 1.3.11 + mad_get_field64@IBMAD_1.3 1.3.11 + mad_get_field@IBMAD_1.3 1.3.11 + mad_get_retries@IBMAD_1.3 1.3.11 + mad_get_timeout@IBMAD_1.3 1.3.11 + mad_print_field@IBMAD_1.3 1.3.11 + 
mad_receive@IBMAD_1.3 1.3.11 + mad_receive_via@IBMAD_1.3 1.3.11 + mad_register_client@IBMAD_1.3 1.3.11 + mad_register_client_via@IBMAD_1.3 1.3.11 + mad_register_server@IBMAD_1.3 1.3.11 + mad_register_server_via@IBMAD_1.3 1.3.11 + mad_respond@IBMAD_1.3 1.3.11 + mad_respond_via@IBMAD_1.3 1.3.11 + mad_rpc@IBMAD_1.3 1.3.11 + mad_rpc_class_agent@IBMAD_1.3 1.3.11 + mad_rpc_close_port@IBMAD_1.3 1.3.11 + mad_rpc_open_port@IBMAD_1.3 1.3.11 + mad_rpc_portid@IBMAD_1.3 1.3.11 + mad_rpc_rmpp@IBMAD_1.3 1.3.11 + mad_rpc_set_retries@IBMAD_1.3 1.3.11 + mad_rpc_set_timeout@IBMAD_1.3 1.3.11 + mad_send@IBMAD_1.3 1.3.11 + mad_send_via@IBMAD_1.3 1.3.11 + mad_set_array@IBMAD_1.3 1.3.11 + mad_set_field64@IBMAD_1.3 1.3.11 + mad_set_field@IBMAD_1.3 1.3.11 + mad_trid@IBMAD_1.3 1.3.11 + madrpc@IBMAD_1.3 1.3.11 + madrpc_init@IBMAD_1.3 1.3.11 + madrpc_portid@IBMAD_1.3 1.3.11 + madrpc_rmpp@IBMAD_1.3 1.3.11 + madrpc_save_mad@IBMAD_1.3 1.3.11 + madrpc_set_retries@IBMAD_1.3 1.3.11 + madrpc_set_timeout@IBMAD_1.3 1.3.11 + madrpc_show_errors@IBMAD_1.3 1.3.11 + performance_reset_via@IBMAD_1.3 1.3.11 + pma_query_via@IBMAD_1.3 1.3.11 + portid2portnum@IBMAD_1.3 1.3.11 + portid2str@IBMAD_1.3 1.3.11 + sa_call@IBMAD_1.3 1.3.11 + sa_rpc_call@IBMAD_1.3 1.3.11 + smp_mkey_get@IBMAD_1.3 1.3.11 + smp_mkey_set@IBMAD_1.3 1.3.11 + smp_query@IBMAD_1.3 1.3.11 + smp_query_status_via@IBMAD_1.3 1.3.11 + smp_query_via@IBMAD_1.3 1.3.11 + smp_set@IBMAD_1.3 1.3.11 + smp_set_status_via@IBMAD_1.3 1.3.11 + smp_set_via@IBMAD_1.3 1.3.11 + str2drpath@IBMAD_1.3 1.3.11 + xdump@IBMAD_1.3 1.3.11 diff --git a/debian/libibnetdisc-dev.install b/debian/libibnetdisc-dev.install new file mode 100644 index 0000000..bd5f4c7 --- /dev/null +++ b/debian/libibnetdisc-dev.install @@ -0,0 +1,13 @@ +usr/include/infiniband/ibnetdisc* +usr/lib/*/libibnetdisc*.a +usr/lib/*/libibnetdisc*.so +usr/lib/*/pkgconfig/libibnetdisc.pc +usr/share/man/man3/ibnd_debug.3 +usr/share/man/man3/ibnd_destroy_fabric.3 +usr/share/man/man3/ibnd_discover_fabric.3 +usr/share/man/man3/ibnd_find_node_dr.3 +usr/share/man/man3/ibnd_find_node_guid.3 +usr/share/man/man3/ibnd_iter_nodes.3 +usr/share/man/man3/ibnd_iter_nodes_type.3 +usr/share/man/man3/ibnd_set_max_smps_on_wire.3 +usr/share/man/man3/ibnd_show_progress.3 diff --git a/debian/libibnetdisc5.install b/debian/libibnetdisc5.install new file mode 100644 index 0000000..54684fd --- /dev/null +++ b/debian/libibnetdisc5.install @@ -0,0 +1 @@ +usr/lib/*/libibnetdisc*.so.* diff --git a/debian/libibnetdisc5.symbols b/debian/libibnetdisc5.symbols new file mode 100644 index 0000000..fe004b3 --- /dev/null +++ b/debian/libibnetdisc5.symbols @@ -0,0 +1,21 @@ +libibnetdisc.so.5 libibnetdisc5 #MINVER# +* Build-Depends-Package: libibnetdisc-dev + IBNETDISC_1.0@IBNETDISC_1.0 1.6.1 + ibnd_cache_fabric@IBNETDISC_1.0 1.6.1 + ibnd_destroy_fabric@IBNETDISC_1.0 1.6.1 + ibnd_discover_fabric@IBNETDISC_1.0 1.6.1 + ibnd_find_node_dr@IBNETDISC_1.0 1.6.1 + ibnd_find_node_guid@IBNETDISC_1.0 1.6.1 + ibnd_find_port_dr@IBNETDISC_1.0 1.6.1 + ibnd_find_port_guid@IBNETDISC_1.0 1.6.1 + ibnd_find_port_lid@IBNETDISC_1.0 1.6.4 + ibnd_get_chassis_guid@IBNETDISC_1.0 1.6.1 + ibnd_get_chassis_slot_str@IBNETDISC_1.0 1.6.1 + ibnd_get_chassis_type@IBNETDISC_1.0 1.6.1 + ibnd_is_xsigo_guid@IBNETDISC_1.0 1.6.1 + ibnd_is_xsigo_hca@IBNETDISC_1.0 1.6.1 + ibnd_is_xsigo_tca@IBNETDISC_1.0 1.6.1 + ibnd_iter_nodes@IBNETDISC_1.0 1.6.1 + ibnd_iter_nodes_type@IBNETDISC_1.0 1.6.1 + ibnd_iter_ports@IBNETDISC_1.0 1.6.1 + ibnd_load_fabric@IBNETDISC_1.0 1.6.1 diff --git a/debian/libibumad-dev.install 
b/debian/libibumad-dev.install new file mode 100644 index 0000000..bcc4fc8 --- /dev/null +++ b/debian/libibumad-dev.install @@ -0,0 +1,5 @@ +usr/include/infiniband/umad*.h +usr/lib/*/libibumad*.so +usr/lib/*/libibumad.a +usr/lib/*/pkgconfig/libibumad.pc +usr/share/man/man3/umad_* diff --git a/debian/libibumad3.install b/debian/libibumad3.install new file mode 100644 index 0000000..f6c298d --- /dev/null +++ b/debian/libibumad3.install @@ -0,0 +1 @@ +usr/lib/*/libibumad*.so.* diff --git a/debian/libibumad3.symbols b/debian/libibumad3.symbols new file mode 100644 index 0000000..f399f6c --- /dev/null +++ b/debian/libibumad3.symbols @@ -0,0 +1,42 @@ +libibumad.so.3 libibumad3 #MINVER# +* Build-Depends-Package: libibumad-dev + IBUMAD_1.0@IBUMAD_1.0 1.3.9 + IBUMAD_1.1@IBUMAD_1.1 3.1.26 + umad_addr_dump@IBUMAD_1.0 1.3.9 + umad_attribute_str@IBUMAD_1.0 1.3.10.2 + umad_class_str@IBUMAD_1.0 1.3.10.2 + umad_close_port@IBUMAD_1.0 1.3.9 + umad_common_mad_status_str@IBUMAD_1.0 1.3.10.2 + umad_debug@IBUMAD_1.0 1.3.9 + umad_done@IBUMAD_1.0 1.3.9 + umad_dump@IBUMAD_1.0 1.3.9 + umad_free_ca_device_list@IBUMAD_1.1 3.1.26 + umad_get_ca@IBUMAD_1.0 1.3.9 + umad_get_ca_device_list@IBUMAD_1.1 3.1.26 + umad_get_ca_portguids@IBUMAD_1.0 1.3.9 + umad_get_cas_names@IBUMAD_1.0 1.3.9 + umad_get_fd@IBUMAD_1.0 1.3.9 + umad_get_issm_path@IBUMAD_1.0 1.3.9 + umad_get_mad@IBUMAD_1.0 1.3.9 + umad_get_mad_addr@IBUMAD_1.0 1.3.9 + umad_get_pkey@IBUMAD_1.0 1.3.9 + umad_get_port@IBUMAD_1.0 1.3.9 + umad_init@IBUMAD_1.0 1.3.9 + umad_method_str@IBUMAD_1.0 1.3.10.2 + umad_open_port@IBUMAD_1.0 1.3.9 + umad_poll@IBUMAD_1.0 1.3.9 + umad_recv@IBUMAD_1.0 1.3.9 + umad_register2@IBUMAD_1.0 1.3.10.2 + umad_register@IBUMAD_1.0 1.3.9 + umad_register_oui@IBUMAD_1.0 1.3.9 + umad_release_ca@IBUMAD_1.0 1.3.9 + umad_release_port@IBUMAD_1.0 1.3.9 + umad_sa_mad_status_str@IBUMAD_1.0 1.3.10.2 + umad_send@IBUMAD_1.0 1.3.9 + umad_set_addr@IBUMAD_1.0 1.3.9 + umad_set_addr_net@IBUMAD_1.0 1.3.9 + umad_set_grh@IBUMAD_1.0 1.3.9 + umad_set_pkey@IBUMAD_1.0 1.3.9 + umad_size@IBUMAD_1.0 1.3.9 + umad_status@IBUMAD_1.0 1.3.9 + umad_unregister@IBUMAD_1.0 1.3.9 diff --git a/debian/libibverbs-dev.install b/debian/libibverbs-dev.install new file mode 100644 index 0000000..bc8caa5 --- /dev/null +++ b/debian/libibverbs-dev.install @@ -0,0 +1,35 @@ +usr/include/infiniband/arch.h +usr/include/infiniband/efadv.h +usr/include/infiniband/ib_user_ioctl_verbs.h +usr/include/infiniband/mlx4dv.h +usr/include/infiniband/mlx5_api.h +usr/include/infiniband/mlx5_user_ioctl_verbs.h +usr/include/infiniband/mlx5dv.h +usr/include/infiniband/opcode.h +usr/include/infiniband/sa-kern-abi.h +usr/include/infiniband/sa.h +usr/include/infiniband/tm_types.h +usr/include/infiniband/verbs.h +usr/include/infiniband/verbs_api.h +usr/lib/*/lib*-rdmav*.a +usr/lib/*/libefa.a +usr/lib/*/libefa.so +usr/lib/*/libibverbs*.so +usr/lib/*/libibverbs.a +usr/lib/*/libmlx4.a +usr/lib/*/libmlx4.so +usr/lib/*/libmlx5.a +usr/lib/*/libmlx5.so +usr/lib/*/pkgconfig/libefa.pc +usr/lib/*/pkgconfig/libibverbs.pc +usr/lib/*/pkgconfig/libmlx4.pc +usr/lib/*/pkgconfig/libmlx5.pc +usr/share/man/man3/efadv_*.3 +usr/share/man/man3/ibv_* +usr/share/man/man3/mbps_to_ibv_rate.3 +usr/share/man/man3/mlx4dv_*.3 +usr/share/man/man3/mlx5dv_*.3 +usr/share/man/man3/mult_to_ibv_rate.3 +usr/share/man/man7/efadv.7 +usr/share/man/man7/mlx4dv.7 +usr/share/man/man7/mlx5dv.7 diff --git a/debian/libibverbs1.install b/debian/libibverbs1.install new file mode 100644 index 0000000..83bdd80 --- /dev/null +++ b/debian/libibverbs1.install @@ -0,0 +1,2 
@@ +usr/lib/*/libibverbs*.so.* +usr/share/doc/rdma-core/libibverbs.md usr/share/doc/libibverbs1/ diff --git a/debian/libibverbs1.postinst b/debian/libibverbs1.postinst new file mode 100644 index 0000000..784458a --- /dev/null +++ b/debian/libibverbs1.postinst @@ -0,0 +1,10 @@ +#!/bin/sh +# postinst script for libibverbs1 + +set -e + +if [ "$1" = configure ]; then + getent group rdma > /dev/null 2>&1 || addgroup --system --quiet rdma +fi + +#DEBHELPER# diff --git a/debian/libibverbs1.symbols b/debian/libibverbs1.symbols new file mode 100644 index 0000000..ec40b29 --- /dev/null +++ b/debian/libibverbs1.symbols @@ -0,0 +1,103 @@ +libibverbs.so.1 libibverbs1 #MINVER# +* Build-Depends-Package: libibverbs-dev + IBVERBS_1.0@IBVERBS_1.0 1.1.6 + IBVERBS_1.1@IBVERBS_1.1 1.1.6 + IBVERBS_1.5@IBVERBS_1.5 20 + IBVERBS_1.6@IBVERBS_1.6 24 + IBVERBS_1.7@IBVERBS_1.7 25 + IBVERBS_1.8@IBVERBS_1.8 28 + (symver)IBVERBS_PRIVATE_25 25 + ibv_ack_async_event@IBVERBS_1.0 1.1.6 + ibv_ack_async_event@IBVERBS_1.1 1.1.6 + ibv_ack_cq_events@IBVERBS_1.0 1.1.6 + ibv_ack_cq_events@IBVERBS_1.1 1.1.6 + ibv_alloc_pd@IBVERBS_1.0 1.1.6 + ibv_alloc_pd@IBVERBS_1.1 1.1.6 + ibv_attach_mcast@IBVERBS_1.0 1.1.6 + ibv_attach_mcast@IBVERBS_1.1 1.1.6 + ibv_close_device@IBVERBS_1.0 1.1.6 + ibv_close_device@IBVERBS_1.1 1.1.6 + ibv_copy_ah_attr_from_kern@IBVERBS_1.1 1.1.6 + ibv_copy_path_rec_from_kern@IBVERBS_1.0 1.1.6 + ibv_copy_path_rec_to_kern@IBVERBS_1.0 1.1.6 + ibv_copy_qp_attr_from_kern@IBVERBS_1.0 1.1.6 + ibv_create_ah@IBVERBS_1.0 1.1.6 + ibv_create_ah@IBVERBS_1.1 1.1.6 + ibv_create_ah_from_wc@IBVERBS_1.1 1.1.6 + ibv_create_comp_channel@IBVERBS_1.0 1.1.6 + ibv_create_cq@IBVERBS_1.0 1.1.6 + ibv_create_cq@IBVERBS_1.1 1.1.6 + ibv_create_qp@IBVERBS_1.0 1.1.6 + ibv_create_qp@IBVERBS_1.1 1.1.6 + ibv_create_srq@IBVERBS_1.0 1.1.6 + ibv_create_srq@IBVERBS_1.1 1.1.6 + ibv_dealloc_pd@IBVERBS_1.0 1.1.6 + ibv_dealloc_pd@IBVERBS_1.1 1.1.6 + ibv_dereg_mr@IBVERBS_1.0 1.1.6 + ibv_dereg_mr@IBVERBS_1.1 1.1.6 + ibv_destroy_ah@IBVERBS_1.0 1.1.6 + ibv_destroy_ah@IBVERBS_1.1 1.1.6 + ibv_destroy_comp_channel@IBVERBS_1.0 1.1.6 + ibv_destroy_cq@IBVERBS_1.0 1.1.6 + ibv_destroy_cq@IBVERBS_1.1 1.1.6 + ibv_destroy_qp@IBVERBS_1.0 1.1.6 + ibv_destroy_qp@IBVERBS_1.1 1.1.6 + ibv_destroy_srq@IBVERBS_1.0 1.1.6 + ibv_destroy_srq@IBVERBS_1.1 1.1.6 + ibv_detach_mcast@IBVERBS_1.0 1.1.6 + ibv_detach_mcast@IBVERBS_1.1 1.1.6 + ibv_dofork_range@IBVERBS_1.1 1.1.6 + ibv_dontfork_range@IBVERBS_1.1 1.1.6 + ibv_event_type_str@IBVERBS_1.1 1.1.6 + ibv_fork_init@IBVERBS_1.1 1.1.6 + ibv_free_device_list@IBVERBS_1.0 1.1.6 + ibv_free_device_list@IBVERBS_1.1 1.1.6 + ibv_get_async_event@IBVERBS_1.0 1.1.6 + ibv_get_async_event@IBVERBS_1.1 1.1.6 + ibv_get_cq_event@IBVERBS_1.0 1.1.6 + ibv_get_cq_event@IBVERBS_1.1 1.1.6 + ibv_get_device_guid@IBVERBS_1.0 1.1.6 + ibv_get_device_guid@IBVERBS_1.1 1.1.6 + ibv_get_device_list@IBVERBS_1.0 1.1.6 + ibv_get_device_list@IBVERBS_1.1 1.1.6 + ibv_get_device_name@IBVERBS_1.0 1.1.6 + ibv_get_device_name@IBVERBS_1.1 1.1.6 + ibv_get_pkey_index@IBVERBS_1.5 20 + ibv_get_sysfs_path@IBVERBS_1.0 1.1.6 + ibv_init_ah_from_wc@IBVERBS_1.1 1.1.6 + ibv_modify_qp@IBVERBS_1.0 1.1.6 + ibv_modify_qp@IBVERBS_1.1 1.1.6 + ibv_modify_srq@IBVERBS_1.0 1.1.6 + ibv_modify_srq@IBVERBS_1.1 1.1.6 + ibv_node_type_str@IBVERBS_1.1 1.1.6 + ibv_open_device@IBVERBS_1.0 1.1.6 + ibv_open_device@IBVERBS_1.1 1.1.6 + ibv_port_state_str@IBVERBS_1.1 1.1.6 + ibv_qp_to_qp_ex@IBVERBS_1.6 24 + ibv_query_device@IBVERBS_1.0 1.1.6 + ibv_query_device@IBVERBS_1.1 1.1.6 + ibv_query_gid@IBVERBS_1.0 1.1.6 + 
ibv_query_gid@IBVERBS_1.1 1.1.6 + ibv_query_pkey@IBVERBS_1.0 1.1.6 + ibv_query_pkey@IBVERBS_1.1 1.1.6 + ibv_query_port@IBVERBS_1.0 1.1.6 + ibv_query_port@IBVERBS_1.1 1.1.6 + ibv_query_qp@IBVERBS_1.0 1.1.6 + ibv_query_qp@IBVERBS_1.1 1.1.6 + ibv_query_srq@IBVERBS_1.0 1.1.6 + ibv_query_srq@IBVERBS_1.1 1.1.6 + ibv_rate_to_mbps@IBVERBS_1.1 1.1.8 + ibv_rate_to_mult@IBVERBS_1.0 1.1.6 + ibv_read_sysfs_file@IBVERBS_1.0 1.1.6 + ibv_reg_mr@IBVERBS_1.0 1.1.6 + ibv_reg_mr@IBVERBS_1.1 1.1.6 + ibv_reg_mr_iova@IBVERBS_1.7 25 + ibv_reg_mr_iova2@IBVERBS_1.8 28 + ibv_register_driver@IBVERBS_1.1 1.1.6 + ibv_rereg_mr@IBVERBS_1.1 1.2.1 + ibv_resize_cq@IBVERBS_1.0 1.1.6 + ibv_resize_cq@IBVERBS_1.1 1.1.6 + ibv_resolve_eth_l2_from_gid@IBVERBS_1.1 1.2.0 + ibv_wc_status_str@IBVERBS_1.1 1.1.6 + mbps_to_ibv_rate@IBVERBS_1.1 1.1.8 + mult_to_ibv_rate@IBVERBS_1.0 1.1.6 diff --git a/debian/librdmacm-dev.install b/debian/librdmacm-dev.install new file mode 100644 index 0000000..e12c300 --- /dev/null +++ b/debian/librdmacm-dev.install @@ -0,0 +1,62 @@ +usr/include/infiniband/ib.h +usr/include/rdma/rdma_cma.h +usr/include/rdma/rdma_cma_abi.h +usr/include/rdma/rdma_verbs.h +usr/include/rdma/rsocket.h +usr/lib/*/librdmacm*.so +usr/lib/*/librdmacm.a +usr/lib/*/pkgconfig/librdmacm.pc +usr/share/man/man3/rdma_accept.3 +usr/share/man/man3/rdma_ack_cm_event.3 +usr/share/man/man3/rdma_bind_addr.3 +usr/share/man/man3/rdma_connect.3 +usr/share/man/man3/rdma_create_ep.3 +usr/share/man/man3/rdma_create_event_channel.3 +usr/share/man/man3/rdma_create_id.3 +usr/share/man/man3/rdma_create_qp.3 +usr/share/man/man3/rdma_create_srq.3 +usr/share/man/man3/rdma_dereg_mr.3 +usr/share/man/man3/rdma_destroy_ep.3 +usr/share/man/man3/rdma_destroy_event_channel.3 +usr/share/man/man3/rdma_destroy_id.3 +usr/share/man/man3/rdma_destroy_qp.3 +usr/share/man/man3/rdma_destroy_srq.3 +usr/share/man/man3/rdma_disconnect.3 +usr/share/man/man3/rdma_establish.3 +usr/share/man/man3/rdma_event_str.3 +usr/share/man/man3/rdma_free_devices.3 +usr/share/man/man3/rdma_get_cm_event.3 +usr/share/man/man3/rdma_get_devices.3 +usr/share/man/man3/rdma_get_dst_port.3 +usr/share/man/man3/rdma_get_local_addr.3 +usr/share/man/man3/rdma_get_peer_addr.3 +usr/share/man/man3/rdma_get_recv_comp.3 +usr/share/man/man3/rdma_get_request.3 +usr/share/man/man3/rdma_get_send_comp.3 +usr/share/man/man3/rdma_get_src_port.3 +usr/share/man/man3/rdma_getaddrinfo.3 +usr/share/man/man3/rdma_init_qp_attr.3 +usr/share/man/man3/rdma_join_multicast.3 +usr/share/man/man3/rdma_join_multicast_ex.3 +usr/share/man/man3/rdma_leave_multicast.3 +usr/share/man/man3/rdma_listen.3 +usr/share/man/man3/rdma_migrate_id.3 +usr/share/man/man3/rdma_notify.3 +usr/share/man/man3/rdma_post_read.3 +usr/share/man/man3/rdma_post_readv.3 +usr/share/man/man3/rdma_post_recv.3 +usr/share/man/man3/rdma_post_recvv.3 +usr/share/man/man3/rdma_post_send.3 +usr/share/man/man3/rdma_post_sendv.3 +usr/share/man/man3/rdma_post_ud_send.3 +usr/share/man/man3/rdma_post_write.3 +usr/share/man/man3/rdma_post_writev.3 +usr/share/man/man3/rdma_reg_msgs.3 +usr/share/man/man3/rdma_reg_read.3 +usr/share/man/man3/rdma_reg_write.3 +usr/share/man/man3/rdma_reject.3 +usr/share/man/man3/rdma_resolve_addr.3 +usr/share/man/man3/rdma_resolve_route.3 +usr/share/man/man3/rdma_set_option.3 +usr/share/man/man7/rdma_cm.7 +usr/share/man/man7/rsocket.7 diff --git a/debian/librdmacm1.install b/debian/librdmacm1.install new file mode 100644 index 0000000..09140ab --- /dev/null +++ b/debian/librdmacm1.install @@ -0,0 +1,3 @@ +usr/lib/*/librdmacm*.so.* 
+usr/lib/*/rsocket/librspreload*.so* +usr/share/doc/rdma-core/librdmacm.md usr/share/doc/librdmacm1/ diff --git a/debian/librdmacm1.symbols b/debian/librdmacm1.symbols new file mode 100644 index 0000000..996122f --- /dev/null +++ b/debian/librdmacm1.symbols @@ -0,0 +1,70 @@ +librdmacm.so.1 librdmacm1 #MINVER# +* Build-Depends-Package: librdmacm-dev + RDMACM_1.0@RDMACM_1.0 1.0.15 + RDMACM_1.1@RDMACM_1.1 16 + RDMACM_1.2@RDMACM_1.2 23 + raccept@RDMACM_1.0 1.0.16 + rbind@RDMACM_1.0 1.0.16 + rclose@RDMACM_1.0 1.0.16 + rconnect@RDMACM_1.0 1.0.16 + rdma_accept@RDMACM_1.0 1.0.15 + rdma_ack_cm_event@RDMACM_1.0 1.0.15 + rdma_bind_addr@RDMACM_1.0 1.0.15 + rdma_connect@RDMACM_1.0 1.0.15 + rdma_create_ep@RDMACM_1.0 1.0.15 + rdma_create_event_channel@RDMACM_1.0 1.0.15 + rdma_create_id@RDMACM_1.0 1.0.15 + rdma_create_qp@RDMACM_1.0 1.0.15 + rdma_create_qp_ex@RDMACM_1.0 1.0.19 + rdma_create_srq@RDMACM_1.0 1.0.15 + rdma_create_srq_ex@RDMACM_1.0 1.0.19 + rdma_destroy_ep@RDMACM_1.0 1.0.15 + rdma_destroy_event_channel@RDMACM_1.0 1.0.15 + rdma_destroy_id@RDMACM_1.0 1.0.15 + rdma_destroy_qp@RDMACM_1.0 1.0.15 + rdma_destroy_srq@RDMACM_1.0 1.0.15 + rdma_disconnect@RDMACM_1.0 1.0.15 + rdma_event_str@RDMACM_1.0 1.0.15 + rdma_establish@RDMACM_1.2 23 + rdma_free_devices@RDMACM_1.0 1.0.15 + rdma_freeaddrinfo@RDMACM_1.0 1.0.15 + rdma_get_cm_event@RDMACM_1.0 1.0.15 + rdma_get_devices@RDMACM_1.0 1.0.15 + rdma_get_dst_port@RDMACM_1.0 1.0.19 + rdma_get_request@RDMACM_1.0 1.0.15 + rdma_get_src_port@RDMACM_1.0 1.0.19 + rdma_getaddrinfo@RDMACM_1.0 1.0.15 + rdma_init_qp_attr@RDMACM_1.2 23 + rdma_join_multicast@RDMACM_1.0 1.0.15 + rdma_join_multicast_ex@RDMACM_1.1 16 + rdma_leave_multicast@RDMACM_1.0 1.0.15 + rdma_listen@RDMACM_1.0 1.0.15 + rdma_migrate_id@RDMACM_1.0 1.0.15 + rdma_notify@RDMACM_1.0 1.0.15 + rdma_reject@RDMACM_1.0 1.0.15 + rdma_resolve_addr@RDMACM_1.0 1.0.15 + rdma_resolve_route@RDMACM_1.0 1.0.15 + rdma_set_option@RDMACM_1.0 1.0.15 + rfcntl@RDMACM_1.0 1.0.16 + rgetpeername@RDMACM_1.0 1.0.16 + rgetsockname@RDMACM_1.0 1.0.16 + rgetsockopt@RDMACM_1.0 1.0.16 + riomap@RDMACM_1.0 1.0.19 + riounmap@RDMACM_1.0 1.0.19 + riowrite@RDMACM_1.0 1.0.19 + rlisten@RDMACM_1.0 1.0.16 + rpoll@RDMACM_1.0 1.0.16 + rread@RDMACM_1.0 1.0.16 + rreadv@RDMACM_1.0 1.0.16 + rrecv@RDMACM_1.0 1.0.16 + rrecvfrom@RDMACM_1.0 1.0.16 + rrecvmsg@RDMACM_1.0 1.0.16 + rselect@RDMACM_1.0 1.0.16 + rsend@RDMACM_1.0 1.0.16 + rsendmsg@RDMACM_1.0 1.0.16 + rsendto@RDMACM_1.0 1.0.16 + rsetsockopt@RDMACM_1.0 1.0.16 + rshutdown@RDMACM_1.0 1.0.16 + rsocket@RDMACM_1.0 1.0.16 + rwrite@RDMACM_1.0 1.0.16 + rwritev@RDMACM_1.0 1.0.16 diff --git a/debian/python3-pyverbs.examples b/debian/python3-pyverbs.examples new file mode 100644 index 0000000..08e586f --- /dev/null +++ b/debian/python3-pyverbs.examples @@ -0,0 +1 @@ +pyverbs/examples/ib_devices.py diff --git a/debian/python3-pyverbs.install b/debian/python3-pyverbs.install new file mode 100644 index 0000000..784342a --- /dev/null +++ b/debian/python3-pyverbs.install @@ -0,0 +1,2 @@ +usr/lib/python3/dist-packages/pyverbs +usr/share/doc/rdma-core/tests diff --git a/debian/rdma-core.install b/debian/rdma-core.install new file mode 100644 index 0000000..564d4a1 --- /dev/null +++ b/debian/rdma-core.install @@ -0,0 +1,35 @@ +etc/init.d/iwpmd +etc/iwpmd.conf +etc/modprobe.d/mlx4.conf +etc/modprobe.d/truescale.conf +etc/rdma/modules/infiniband.conf +etc/rdma/modules/iwarp.conf +etc/rdma/modules/iwpmd.conf +etc/rdma/modules/opa.conf +etc/rdma/modules/rdma.conf +etc/rdma/modules/roce.conf 
+etc/udev/rules.d/70-persistent-ipoib.rules +lib/systemd/system/iwpmd.service +lib/systemd/system/rdma-hw.target +lib/systemd/system/rdma-load-modules@.service +lib/systemd/system/rdma-ndd.service +lib/udev/rdma_rename +lib/udev/rules.d/60-rdma-ndd.rules +lib/udev/rules.d/60-rdma-persistent-naming.rules +lib/udev/rules.d/75-rdma-description.rules +lib/udev/rules.d/90-iwpmd.rules +lib/udev/rules.d/90-rdma-hw-modules.rules +lib/udev/rules.d/90-rdma-ulp-modules.rules +lib/udev/rules.d/90-rdma-umad.rules +usr/lib/truescale-serdes.cmds +usr/sbin/iwpmd +usr/sbin/rdma-ndd +usr/share/doc/rdma-core/MAINTAINERS +usr/share/doc/rdma-core/README.md +usr/share/doc/rdma-core/rxe.md +usr/share/doc/rdma-core/tag_matching.md +usr/share/doc/rdma-core/udev.md +usr/share/man/man5/iwpmd.conf.5 +usr/share/man/man7/rxe.7 +usr/share/man/man8/iwpmd.8 +usr/share/man/man8/rdma-ndd.8 diff --git a/debian/rdma-core.lintian-overrides b/debian/rdma-core.lintian-overrides new file mode 100644 index 0000000..01dea12 --- /dev/null +++ b/debian/rdma-core.lintian-overrides @@ -0,0 +1,5 @@ +# The iwpmd and rdma-ndd services are started by udev. +rdma-core: systemd-service-file-missing-install-key lib/systemd/system/iwpmd.service +rdma-core: systemd-service-file-missing-install-key lib/systemd/system/rdma-ndd.service +# Example/documentary udev rules file +rdma-core: udev-rule-in-etc etc/udev/rules.d/70-persistent-ipoib.rules diff --git a/debian/rdma-core.postinst b/debian/rdma-core.postinst new file mode 100644 index 0000000..6486f62 --- /dev/null +++ b/debian/rdma-core.postinst @@ -0,0 +1,12 @@ +#!/bin/sh +set -e + +#DEBHELPER# + +if [ "$1" = "configure" ]; then + # we ship udev rules, so trigger an update. This has to be done after + # DEBHELPER restarts systemd to get our new service files loaded. + udevadm trigger --subsystem-match=infiniband --action=change || true + udevadm trigger --subsystem-match=net --action=change || true + udevadm trigger --subsystem-match=infiniband_mad --action=change || true +fi diff --git a/debian/rdmacm-utils.install b/debian/rdmacm-utils.install new file mode 100644 index 0000000..74506d6 --- /dev/null +++ b/debian/rdmacm-utils.install @@ -0,0 +1,26 @@ +usr/bin/cmtime +usr/bin/mckey +usr/bin/rcopy +usr/bin/rdma_client +usr/bin/rdma_server +usr/bin/rdma_xclient +usr/bin/rdma_xserver +usr/bin/riostream +usr/bin/rping +usr/bin/rstream +usr/bin/ucmatose +usr/bin/udaddy +usr/bin/udpong +usr/share/man/man1/cmtime.1 +usr/share/man/man1/mckey.1 +usr/share/man/man1/rcopy.1 +usr/share/man/man1/rdma_client.1 +usr/share/man/man1/rdma_server.1 +usr/share/man/man1/rdma_xclient.1 +usr/share/man/man1/rdma_xserver.1 +usr/share/man/man1/riostream.1 +usr/share/man/man1/rping.1 +usr/share/man/man1/rstream.1 +usr/share/man/man1/ucmatose.1 +usr/share/man/man1/udaddy.1 +usr/share/man/man1/udpong.1 diff --git a/debian/rules b/debian/rules new file mode 100755 index 0000000..2b132bf --- /dev/null +++ b/debian/rules @@ -0,0 +1,113 @@ +#!/usr/bin/make -f + +include /usr/share/dpkg/architecture.mk + +export DEB_BUILD_MAINT_OPTIONS=hardening=+all + +COHERENT_DMA_ARCHS = amd64 arm64 i386 ia64 powerpc powerpcspe ppc64 ppc64el s390x sparc64 x32 + +dh_params = --with python3,systemd --builddirectory=build-deb + +%: + dh $@ $(dh_params) + +override_dh_auto_clean: + dh_auto_clean + rm -rf build-deb + for package in ibverbs-providers libibverbs-dev rdma-core; do \ + test !
-e debian/$$package.install.backup || mv debian/$$package.install.backup debian/$$package.install; \ + done + +# Upstream wishes to use CMAKE_BUILD_TYPE=Release, and ensures that it has a +# sensible basis of options (e.g. no -O3, including -g). Debian-specific options +# come from CFLAGS as usual. +# +# Upstream encourages the use of Ninja to build the source, so convince dh to use +# it until someone writes native support for dh+cmake+ninja. +DH_AUTO_CONFIGURE := "--" \ + "-GNinja" \ + "-DDISTRO_FLAVOUR=Debian" \ + "-DCMAKE_BUILD_TYPE=Release" \ + "-DCMAKE_INSTALL_SYSCONFDIR:PATH=/etc" \ + "-DCMAKE_INSTALL_SYSTEMD_SERVICEDIR:PATH=/lib/systemd/system" \ + "-DCMAKE_INSTALL_INITDDIR:PATH=/etc/init.d" \ + "-DCMAKE_INSTALL_LIBEXECDIR:PATH=/usr/lib" \ + "-DCMAKE_INSTALL_SHAREDSTATEDIR:PATH=/var/lib" \ + "-DCMAKE_INSTALL_RUNDIR:PATH=/run" \ + "-DCMAKE_INSTALL_UDEV_RULESDIR:PATH=/lib/udev/rules.d" \ + "-DCMAKE_INSTALL_PERLDIR:PATH=/usr/share/perl5" \ + "-DENABLE_STATIC=1" \ + $(EXTRA_CMAKE_FLAGS) + +override_dh_auto_configure: + if [ -e /usr/bin/python3 ]; then \ + dh_auto_configure $(DH_AUTO_CONFIGURE) \ + -DPYTHON_EXECUTABLE:PATH=/usr/bin/python3 \ + -DCMAKE_INSTALL_PYTHON_ARCH_LIB:PATH=/usr/lib/python3/dist-packages; \ + else \ + dh_auto_configure $(DH_AUTO_CONFIGURE) \ + -DNO_PYVERBS=1; \ + fi + + +override_dh_auto_build: + ninja -C build-deb -v + +# upstream does not ship test cases +override_dh_auto_test: + +override_dh_auto_install: +# Some providers are disabled on architectures that are not able to do coherent DMA +ifneq (,$(filter-out $(COHERENT_DMA_ARCHS),$(DEB_HOST_ARCH))) + for package in ibverbs-providers libibverbs-dev rdma-core; do \ + test -e debian/$$package.install.backup || cp debian/$$package.install debian/$$package.install.backup; \ + done + sed -i '/efa\|mlx[45]/d' debian/ibverbs-providers.install debian/libibverbs-dev.install debian/rdma-core.install +endif + DESTDIR=$(CURDIR)/debian/tmp ninja -C build-deb install + +# The following files are not used on Debian (we ship our own sysvinit script) +INST_EXCLUDE := "etc/init.d/srpd" \ + "etc/init.d/ibacm" \ + "usr/sbin/run_srp_daemon" \ + "usr/sbin/srp_daemon.sh" +INST_EXCLUDE := $(addprefix -X,$(INST_EXCLUDE)) +override_dh_install: + if [ -e build-deb/python/pyverbs/__init__.py ]; then \ + dh_install --fail-missing $(INST_EXCLUDE); \ + else \ + dh_install -Npython3-pyverbs --fail-missing $(INST_EXCLUDE) --remaining-packages; \ + fi + +# cmake installs the correct init scripts in the correct place, just set up the +# pre/postrms +override_dh_installinit: + dh_installinit -prdma-core --onlyscripts --name=iwpmd + dh_installinit --remaining-packages + +override_dh_installsystemd: + dh_installsystemd -pibacm --no-start ibacm.service + dh_installsystemd -pibacm ibacm.socket + dh_installsystemd --remaining-packages + +# Provider plugin libraries are not shared libraries and do not belong in the +# shlibs file.
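+# For illustration, the $(addprefix ...) defined below expands the override +# to, in effect: +# dh_makeshlibs --exclude=/libibverbs/ --exclude=librspreload --exclude=/ibacm/ +# so objects installed under a libibverbs/ or ibacm/ plugin directory, or +# named librspreload, are skipped when the shlibs control file is generated.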
+# librspreload is a LD_PRELOAD library and does not belong in the shlib files +SHLIBS_EXCLUDE = "/libibverbs/" "librspreload" "/ibacm/" +SHLIBS_EXCLUDE := $(addprefix --exclude=,$(SHLIBS_EXCLUDE)) +override_dh_makeshlibs: + dh_makeshlibs $(SHLIBS_EXCLUDE) + +override_dh_strip: + dh_strip -plibibmad5 --dbg-package=libibmad5-dbg + dh_strip -plibibnetdisc5 --dbg-package=libibnetdisc5-dbg + dh_strip -plibibumad3 --dbg-package=libibumad3-dbg + dh_strip -plibibverbs1 --dbg-package=libibverbs1-dbg + dh_strip -plibrdmacm1 --dbg-package=librdmacm1-dbg + dh_strip --remaining-packages + +# Upstream encourages the use of 'build' as the developer build output +# directory; allow that directory to be present and still allow dh to work. +.PHONY: build +build: + dh $@ $(dh_params) diff --git a/debian/source/format b/debian/source/format new file mode 100644 index 0000000..163aaf8 --- /dev/null +++ b/debian/source/format @@ -0,0 +1 @@ +3.0 (quilt) diff --git a/debian/source/lintian-overrides b/debian/source/lintian-overrides new file mode 100644 index 0000000..65e5635 --- /dev/null +++ b/debian/source/lintian-overrides @@ -0,0 +1,4 @@ +# The libibverbs examples are compiled and put in ibverbs-utils. +rdma-core source: package-does-not-install-examples libibverbs/examples/ +# The librdmacm examples are compiled and put in rdmacm-utils. +rdma-core source: package-does-not-install-examples librdmacm/examples/ diff --git a/debian/srptools.default b/debian/srptools.default new file mode 100644 index 0000000..77ee1a5 --- /dev/null +++ b/debian/srptools.default @@ -0,0 +1,14 @@ +# How often should srp_daemon rescan the fabric (seconds). +RETRIES=60 + +# Where should srp_daemon log to. +LOG=/var/log/srp_daemon.log + +# What ports should srp_daemon be started on. +# Format is CA:port +# ALL or NONE will run on all ports or none, +# respectively. + +PORTS=NONE +#PORTS=ALL +#PORTS="mthca0:1 mlx4_0:2" diff --git a/debian/srptools.init b/debian/srptools.init new file mode 100644 index 0000000..82eaf1b --- /dev/null +++ b/debian/srptools.init @@ -0,0 +1,124 @@ +#!/bin/bash +### BEGIN INIT INFO +# Provides: srptools +# Required-Start: $remote_fs $syslog +# Required-Stop: $remote_fs $syslog +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: Discovers SRP SCSI targets. +# Description: Discovers SRP SCSI over InfiniBand targets. +### END INIT INFO + +DAEMON=/usr/sbin/srp_daemon +IBDIR=/sys/class/infiniband + +PORTS="" +RETRIES="" +RETRIES_DEFAULT=60 +LOG="" +LOG_DEFAULT=/var/log/srp_daemon.log + +[ -x $DAEMON ] || exit 0 + +. /lib/lsb/init-functions + +[ -f /etc/default/srptools ] && . /etc/default/srptools + +max() { + echo $(($1 > $2 ? $1 : $2)) +} + +run_daemon() { + # srp_daemon does not background itself; using the start-stop-daemon background + # function causes us to lose stdout, which is where it logs to + nohup start-stop-daemon --start --quiet -m \ + --pidfile "/var/run/srp_daemon.${HCA_ID}.${PORT}" \ + --exec $DAEMON -- -e -c -n \ + -i "${HCA_ID}" -p "${PORT}" -R "${RETRIES:-${RETRIES_DEFAULT}}" \ + >> "${LOG:-${LOG_DEFAULT}}" 2>&1 & + RETVAL=$(max "$RETVAL" $?)
+} + +# Evaluate shell command $1 for every port in $PORTS +for_all_ports() { + local cmd=$1 p + + if [ "$PORTS" = "ALL" ]; then + for p in ${IBDIR}/*/ports/*; do + [ -e "$p" ] || continue + PORT=$(basename "$p") + HCA_ID=$(basename "$(dirname "$(dirname "$p")")") + eval "$cmd" + done + else + for ADAPTER in $PORTS; do + HCA_ID=${ADAPTER%%:*} + PORT=${ADAPTER#${HCA_ID}:} + [ -n "$HCA_ID" ] && [ -n "$PORT" ] && eval "$cmd" + done + fi +} + +start_daemon() { + local RETVAL=0 + + if [ "$PORTS" = "NONE" ] ; then + echo "srptools disabled." + exit 0 + fi + + for_all_ports run_daemon + case $RETVAL in + 0) log_success_msg "started $DAEMON";; + *) log_failure_msg "failed to start $DAEMON";; + esac + return $RETVAL +} + +stop_daemon() { + local RETVAL=0 PORTS=ALL + + for_all_ports 'start-stop-daemon --stop --quiet --oknodo -m --pidfile "/var/run/srp_daemon.${HCA_ID}.${PORT}"; RETVAL=$(max $RETVAL $?)' + case $RETVAL in + 0) log_success_msg "stopped $DAEMON";; + *) log_failure_msg "failed to stop $DAEMON";; + esac + return $RETVAL +} + +check_status() { + local pidfile=$1 pid + + [ -e "$pidfile" ] || return 3 # not running + pid=$(<"$pidfile") + [ -n "$pid" ] || return 3 # not running + [ -d "/proc/$pid" ] || return 1 # not running and pid file exists + return 0 # running +} + +daemon_status() { + local RETVAL=0 + + for_all_ports 'check_status /var/run/srp_daemon.${HCA_ID}.${PORT} $DAEMON; RETVAL=$(max $RETVAL $?)' + case $RETVAL in + 0) log_success_msg "$DAEMON is running";; + *) log_failure_msg "$DAEMON is not running";; + esac + return $RETVAL +} + +case "$1" in + start) + start_daemon + ;; + stop) + stop_daemon + ;; + status) + daemon_status + ;; + restart | reload | force-reload ) + stop_daemon + start_daemon + ;; +esac diff --git a/debian/srptools.install b/debian/srptools.install new file mode 100644 index 0000000..6670922 --- /dev/null +++ b/debian/srptools.install @@ -0,0 +1,13 @@ +etc/rdma/modules/srp_daemon.conf +etc/srp_daemon.conf +lib/systemd/system/srp_daemon.service +lib/systemd/system/srp_daemon_port@.service +lib/udev/rules.d/60-srp_daemon.rules +usr/lib/srp_daemon/start_on_all_ports +usr/sbin/ibsrpdm +usr/sbin/srp_daemon +usr/share/doc/rdma-core/ibsrpdm.md usr/share/doc/srptools/ +usr/share/man/man5/srp_daemon.service.5 +usr/share/man/man5/srp_daemon_port@.service.5 +usr/share/man/man8/ibsrpdm.8 +usr/share/man/man8/srp_daemon.8 diff --git a/debian/srptools.links b/debian/srptools.links new file mode 100644 index 0000000..d0bfc7b --- /dev/null +++ b/debian/srptools.links @@ -0,0 +1 @@ +/lib/systemd/system/srp_daemon.service /lib/systemd/system/srptools.service diff --git a/debian/srptools.lintian-overrides b/debian/srptools.lintian-overrides new file mode 100644 index 0000000..8678899 --- /dev/null +++ b/debian/srptools.lintian-overrides @@ -0,0 +1,2 @@ +# The wantedby target remote-fs-pre.target is intentional +srptools: systemd-service-file-refers-to-unusual-wantedby-target lib/systemd/system/*.service remote-fs-pre.target diff --git a/debian/srptools.postinst b/debian/srptools.postinst new file mode 100644 index 0000000..398534d --- /dev/null +++ b/debian/srptools.postinst @@ -0,0 +1,10 @@ +#!/bin/sh +set -e + +#DEBHELPER# + +if [ "$1" = "configure" ]; then + # we ship udev rules, so trigger an update. This has to be done after + # DEBHELPER restarts systemd to get our new service files loaded. 
+ udevadm trigger --subsystem-match=infiniband_mad --action=change || true +fi diff --git a/debian/upstream/metadata b/debian/upstream/metadata new file mode 100644 index 0000000..9aa7356 --- /dev/null +++ b/debian/upstream/metadata @@ -0,0 +1,2 @@ +Repository: https://github.com/linux-rdma/rdma-core.git +Repository-Browse: https://github.com/linux-rdma/rdma-core diff --git a/debian/watch b/debian/watch new file mode 100644 index 0000000..fd2043a --- /dev/null +++ b/debian/watch @@ -0,0 +1,2 @@ +version=3 +https://github.com/linux-rdma/rdma-core/releases (?:.*?/)?(?:rdma-core-|v)?(\d[\d.]*)\.tar\.gz diff --git a/ibacm/CMakeLists.txt b/ibacm/CMakeLists.txt new file mode 100644 index 0000000..4702650 --- /dev/null +++ b/ibacm/CMakeLists.txt @@ -0,0 +1,76 @@ +publish_headers(infiniband + include/infiniband/acm_prov.h + ) + +# FIXME: Fixup the include scheme to not require all these -Is +include_directories("include") +include_directories("src") +include_directories("linux") +include_directories(${NL_INCLUDE_DIRS}) + +# NOTE: ibacm exports symbols from its own binary for use by ibacm providers +rdma_sbin_executable(ibacm + src/acm.c + src/acm_util.c + ) +target_link_libraries(ibacm LINK_PRIVATE + ibverbs + ibumad + ${NL_LIBRARIES} + ${SYSTEMD_LIBRARIES} + ${CMAKE_THREAD_LIBS_INIT} + ${CMAKE_DL_LIBS} + ) +# FIXME: We should probably list the symbols we want to export.. +set_target_properties(ibacm PROPERTIES ENABLE_EXPORTS TRUE) + +# This is a plugin module that dynamically links to ibacm +add_library(ibacmp MODULE + prov/acmp/src/acmp.c + ) +rdma_set_library_map(ibacmp "prov/acmp/src/libibacmp.map") +target_link_libraries(ibacmp LINK_PRIVATE + ibacm + ibverbs + ibumad + ${CMAKE_THREAD_LIBS_INIT} + ) +set_target_properties(ibacmp PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${BUILD_LIB}") +install(TARGETS ibacmp DESTINATION "${ACM_PROVIDER_DIR}") +# ACM providers are linked into a subdir so that IN_PLACE can work.
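+# For illustration (a sketch assuming the default BUILD_LIB location), the +# two commands below leave the build tree looking like: +# ${BUILD_LIB}/libibacmp.so - the provider module itself +# ${BUILD_LIB}/ibacm/libibacmp.so -> ../libibacmp.so +# so an in-place, uninstalled ibacm finds its provider under an ibacm/ +# subdirectory, mirroring the installed ${ACM_PROVIDER_DIR} layout.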
+file(MAKE_DIRECTORY "${BUILD_LIB}/ibacm/") +rdma_create_symlink("../libibacmp.so" "${BUILD_LIB}/ibacm/libibacmp.so") + +rdma_executable(ib_acme + src/acme.c + src/libacm.c + src/parse.c + ) +target_link_libraries(ib_acme LINK_PRIVATE + ibverbs + ) +target_compile_definitions(ib_acme PRIVATE "-DACME_PRINTS") + +rdma_man_pages( + man/ib_acme.1 + man/ibacm.7 + man/ibacm.8 + man/ibacm_prov.7.in + ) + +# FIXME: update the .init.in +rdma_subst_install(FILES "ibacm.init.in" + DESTINATION "${CMAKE_INSTALL_INITDDIR}" + RENAME "ibacm" + PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ OWNER_EXECUTE GROUP_EXECUTE WORLD_EXECUTE) + +rdma_subst_install(FILES "ibacm.service.in" + DESTINATION "${CMAKE_INSTALL_SYSTEMD_SERVICEDIR}" + RENAME ibacm.service + PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ) + +install(FILES "ibacm.socket" + DESTINATION "${CMAKE_INSTALL_SYSTEMD_SERVICEDIR}" + RENAME ibacm.socket + PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ) diff --git a/ibacm/ibacm.init.in b/ibacm/ibacm.init.in new file mode 100644 index 0000000..eba0107 --- /dev/null +++ b/ibacm/ibacm.init.in @@ -0,0 +1,139 @@ +#!/bin/bash +# Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +# +# Bring up/down the ibacm daemon +# +# chkconfig: 2345 25 75 +# description: Starts/Stops InfiniBand ACM service +# +### BEGIN INIT INFO +# Provides: ibacm +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Required-Start: $network $remote_fs +# Required-Stop: $network $remote_fs +# Should-Start: +# Should-Stop: +# Short-Description: Starts and stops the InfiniBand ACM service +# Description: The InfiniBand ACM service provides a user space implementation +# of something resembling an ARP cache for InfiniBand SA queries and +# host route lookups. +### END INIT INFO + +pidfile=@CMAKE_INSTALL_FULL_RUNDIR@/ibacm.pid +subsys=/var/lock/subsys/ibacm + +daemon() { /sbin/daemon ${1+"$@"}; } + +if [ -s /etc/init.d/functions ]; then + # RHEL / CentOS / SL / Fedora + . /etc/init.d/functions + _daemon() { daemon ${1+"$@"}; } + _checkpid() { checkpid `cat $pidfile`; } + _success() { success; echo; } + _failure() { failure; echo; } +elif [ -s /lib/lsb/init-functions ]; then + # SLES / OpenSuSE / Debian + . /lib/lsb/init-functions + _daemon() { start_daemon "$@"; } + _checkpid() { checkproc -p $pidfile @CMAKE_INSTALL_FULL_SBINDIR@/ibacm; } + _success() { log_success_msg; } + _failure() { log_failure_msg; } +elif [ -s /etc/rc.status ]; then + # Older SuSE + . /etc/rc.status + _daemon() { /sbin/start_daemon ${1+"$@"}; } + _checkpid() { checkproc -p $pidfile @CMAKE_INSTALL_FULL_SBINDIR@/ibacm; } + _success() { rc_status -v; } + _failure() { rc_status -v; } +fi + +start() +{ + echo -n "Starting ibacm daemon:" + _daemon @CMAKE_INSTALL_FULL_SBINDIR@/ibacm + RETVAL=$? # capture the start status for the check below + if [[ $RETVAL -eq 0 ]]; then + _success + else + _failure + fi +} + +stop() +{ + echo -n "Stopping ibacm daemon:" + killproc -p $pidfile ibacm + RETVAL=$? # capture the kill status for the check below + if [[ $RETVAL -eq 0 ]]; then + _success + else + _failure + fi + rm -f $subsys +} + +status() +{ + echo -n "Checking for ibacm service " + if [ ! -f $subsys -a ! -f $pidfile ]; then + RETVAL=3 + elif [ -f $pidfile ]; then + _checkpid + RETVAL=$?
+ elif [ -f $subsys ]; then + RETVAL=2 + else + RETVAL=0 + fi + if [[ $RETVAL -eq 0 ]]; then + _success + else + _failure + fi +} + +restart () +{ + stop + start +} + +condrestart () +{ + [ -e $subsys ] && restart || return 0 +} + +usage () +{ + echo + echo "Usage: `basename $0` {start|stop|restart|condrestart|try-restart|force-reload|status}" + echo + return 2 +} + +case $1 in + start|stop|restart|condrestart|try-restart|force-reload) + [ `id -u` != "0" ] && exit 4 ;; +esac + +case $1 in + start) + start + ;; + stop) + stop + ;; + restart | reload) + restart + ;; + condrestart | try-restart | force-reload) + condrestart + ;; + status) + status + ;; + *) + usage + ;; +esac + +exit $RETVAL diff --git a/ibacm/ibacm.service.in b/ibacm/ibacm.service.in new file mode 100644 index 0000000..23d4525 --- /dev/null +++ b/ibacm/ibacm.service.in @@ -0,0 +1,24 @@ +[Unit] +Description=InfiniBand Address Cache Manager Daemon +Documentation=man:ibacm file:@CMAKE_INSTALL_FULL_SYSCONFDIR@/rdma/ibacm_opts.cfg +# Cause systemd to always start the socket, which means the parameters in +# ibacm.socket always configure the listening socket, even if the daemon is +# started directly. +Wants=ibacm.socket +# Ensure required kernel modules are loaded before starting +Wants=rdma-load-modules@rdma.service +After=rdma-load-modules@rdma.service +# Order ibacm startup after basic RDMA hw setup. +After=rdma-hw.target + +# Implicitly after basic.target, note that ibacm writes to /var/log directly +# and thus needs writable filesystems set up. + +[Service] +Type=notify +ExecStart=@CMAKE_INSTALL_FULL_SBINDIR@/ibacm --systemd + +[Install] +Also=ibacm.socket +# Only want ibacm if RDMA hardware is present (or the socket is touched) +WantedBy=rdma-hw.target diff --git a/ibacm/ibacm.socket b/ibacm/ibacm.socket new file mode 100644 index 0000000..abc706b --- /dev/null +++ b/ibacm/ibacm.socket @@ -0,0 +1,37 @@ +# Please copy this file to /etc/systemd/system/ibacm.socket +# before modification, if not done already. +# +# When using socket-based activation of the 'ibacm' service, +# ibacm's configuration option 'acme_plus_kernel_only' is ignored +# (i.e. an implicit 'acme_plus_kernel_only no'). +# +# In order to get the equivalent behavior of the +# configuration 'acme_plus_kernel_only yes', +# please add a comment (i.e. a '#' character) in front +# of the 'Symlinks' line below, and ensure that the +# file '/run/ibacm.sock' does not exist: +# e.g. by using "rm -f /run/ibacm.sock" after modifying +# the copy of this file that lives in /etc/systemd/system. +# +# Please also remember to reload the systemd configuration by running: +# % systemctl --system daemon-reload + +[Unit] +Description=Socket for InfiniBand Address Cache Manager Daemon +Documentation=man:ibacm +# Ensure that anything ordered after rdma-hw.target will see the socket, even +# if that thing is not ordered after socket.target/basic.target.
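+# For example, a hypothetical consumer unit declaring only +# [Unit] +# After=rdma-hw.target +# can rely on the listening socket already existing when it starts (assuming +# both units are part of the same transaction), because of the Before= +# ordering on the next line.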
+Before=rdma-hw.target +# ibacm.socket always starts + +[Socket] +ListenStream=/run/ibacm-unix.sock +Symlinks=/run/ibacm.sock + +# Bind to PF_NETLINK, NETLINK_RDMA, RDMA_NL_GROUP_LS +# Supported in systemd > 234 +ListenNetlink=rdma 4 + +[Install] +# Standard for all sockets +WantedBy=sockets.target diff --git a/ibacm/ibacm_hosts.data b/ibacm/ibacm_hosts.data new file mode 100644 index 0000000..78b978d --- /dev/null +++ b/ibacm/ibacm_hosts.data @@ -0,0 +1,15 @@ +# InfiniBand Communication Management Assistant for clusters hosts file +# +# Entry format is: +# address IB GID +# +# The address may be one of the following: +# host_name - ASCII character string, up to 31 characters +# address - IPv4 or IPv6 formatted address +# +# There can be multiple entries for a single IB GID +# +# Samples: +# luna3 fe80::8:f104:39a:169 +# 192.168.1.3 fe80::8:f104:39a:169 +# fe80::208:f104:39a:169 fe80::8:f104:39a:169 diff --git a/ibacm/include/acm_mad.h b/ibacm/include/acm_mad.h new file mode 100644 index 0000000..53979c7 --- /dev/null +++ b/ibacm/include/acm_mad.h @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2009 Intel Corporation. All rights reserved. + * + * This software is available to you under the OpenFabrics.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + +#if !defined(ACM_MAD_H) +#define ACM_MAD_H + +#include <endian.h> +#include <infiniband/verbs.h> +#include <infiniband/acm.h> + +#define ACM_SEND_SIZE 256 +#define ACM_RECV_SIZE (ACM_SEND_SIZE + sizeof(struct ibv_grh)) + +#define IB_METHOD_GET 0x01 +#define IB_METHOD_SET 0x02 +#define IB_METHOD_SEND 0x03 +#define IB_METHOD_GET_TABLE 0x12 +#define IB_METHOD_DELETE 0x15 +#define IB_METHOD_RESP 0x80 + +#define ACM_MGMT_CLASS 0x2C + +#define ACM_CTRL_ACK htobe16(0x8000) +#define ACM_CTRL_RESOLVE htobe16(0x0001) + +#define IB_PKEY_FULL_MEMBER 0x8000 + +struct acm_mad { + uint8_t base_version; + uint8_t mgmt_class; + uint8_t class_version; + uint8_t method; + __be16 status; + __be16 control; + __be64 tid; + + uint8_t data[240]; +}; + +#define acm_class_status(status) ((uint8_t) (be16toh(status) >> 8)) + +#define ACM_QKEY 0x80010000 + +/* Map to ACM_EP_INFO_* */ +#define ACM_ADDRESS_INVALID 0x00 +#define ACM_ADDRESS_NAME 0x01 +#define ACM_ADDRESS_IP 0x02 +#define ACM_ADDRESS_IP6 0x03 +#define ACM_ADDRESS_GID 0x04 +#define ACM_ADDRESS_LID 0x05 +#define ACM_ADDRESS_RESERVED 0x06 /* start of reserved range */ + +#define ACM_MAX_GID_COUNT 10 + +struct acm_resolve_rec { + uint8_t dest_type; + uint8_t dest_length; + uint8_t src_type; + uint8_t src_length; + uint8_t gid_cnt; + uint8_t resp_resources; + uint8_t init_depth; + uint8_t reserved; + uint8_t dest[ACM_MAX_ADDRESS]; + uint8_t src[ACM_MAX_ADDRESS]; + union ibv_gid gid[ACM_MAX_GID_COUNT]; +}; + +#define IB_MGMT_CLASS_SA 0x03 + +struct ib_sa_mad { + uint8_t base_version; + uint8_t mgmt_class; + uint8_t class_version; + uint8_t method; + __be16 status; + __be16 reserved1; + __be64 tid; + __be16 attr_id; + __be16 reserved2; + __be32 attr_mod; + + uint8_t rmpp_version; + uint8_t rmpp_type; + uint8_t rmpp_flags; + uint8_t rmpp_status; + __be32 seg_num; + __be32 paylen_newwin; + + __be32 sm_key[2]; + __be16 attr_offset; + __be16 reserved3; + __be64 comp_mask; + + uint8_t data[200]; +}; + +#define IB_SA_ATTR_PATH_REC htobe16(0x0035) + +#define IB_COMP_MASK_PR_SERVICE_ID (htobe64(1 << 0) | \ + htobe64(1 << 1)) +#define IB_COMP_MASK_PR_DGID htobe64(1 << 2) +#define IB_COMP_MASK_PR_SGID htobe64(1 << 3) +#define IB_COMP_MASK_PR_DLID htobe64(1 << 4) +#define IB_COMP_MASK_PR_SLID htobe64(1 << 5) +#define IB_COMP_MASK_PR_RAW_TRAFFIC htobe64(1 << 6) +/* RESERVED htobe64(1 << 7) */ +#define IB_COMP_MASK_PR_FLOW_LABEL htobe64(1 << 8) +#define IB_COMP_MASK_PR_HOP_LIMIT htobe64(1 << 9) +#define IB_COMP_MASK_PR_TCLASS htobe64(1 << 10) +#define IB_COMP_MASK_PR_REVERSIBLE htobe64(1 << 11) +#define IB_COMP_MASK_PR_NUM_PATH htobe64(1 << 12) +#define IB_COMP_MASK_PR_PKEY htobe64(1 << 13) +#define IB_COMP_MASK_PR_QOS_CLASS htobe64(1 << 14) +#define IB_COMP_MASK_PR_SL htobe64(1 << 15) +#define IB_COMP_MASK_PR_MTU_SELECTOR htobe64(1 << 16) +#define IB_COMP_MASK_PR_MTU htobe64(1 << 17) +#define IB_COMP_MASK_PR_RATE_SELECTOR htobe64(1 << 18) +#define IB_COMP_MASK_PR_RATE htobe64(1 << 19) +#define IB_COMP_MASK_PR_PACKET_LIFETIME_SELECTOR htobe64(1 << 20) +#define IB_COMP_MASK_PR_PACKET_LIFETIME htobe64(1 << 21) +#define IB_COMP_MASK_PR_PREFERENCE htobe64(1 << 22) +/* RESERVED htobe64(1 << 23) */ + +#define IB_MC_QPN 0xffffff +#define IB_SA_ATTR_MC_MEMBER_REC htobe16(0x0038) + +#define IB_COMP_MASK_MC_MGID htobe64(1 << 0) +#define IB_COMP_MASK_MC_PORT_GID htobe64(1 << 1) +#define IB_COMP_MASK_MC_QKEY htobe64(1 << 2) +#define IB_COMP_MASK_MC_MLID htobe64(1 << 3) +#define IB_COMP_MASK_MC_MTU_SEL htobe64(1 << 4) +#define IB_COMP_MASK_MC_MTU htobe64(1 << 5) +#define 
IB_COMP_MASK_MC_TCLASS htobe64(1 << 6) +#define IB_COMP_MASK_MC_PKEY htobe64(1 << 7) +#define IB_COMP_MASK_MC_RATE_SEL htobe64(1 << 8) +#define IB_COMP_MASK_MC_RATE htobe64(1 << 9) +#define IB_COMP_MASK_MC_PACKET_LIFETIME_SEL htobe64(1 << 10) +#define IB_COMP_MASK_MC_PACKET_LIFETIME htobe64(1 << 11) +#define IB_COMP_MASK_MC_SL htobe64(1 << 12) +#define IB_COMP_MASK_MC_FLOW htobe64(1 << 13) +#define IB_COMP_MASK_MC_HOP htobe64(1 << 14) +#define IB_COMP_MASK_MC_SCOPE htobe64(1 << 15) +#define IB_COMP_MASK_MC_JOIN_STATE htobe64(1 << 16) +#define IB_COMP_MASK_MC_PROXY_JOIN htobe64(1 << 17) + +struct ib_mc_member_rec { + union ibv_gid mgid; + union ibv_gid port_gid; + __be32 qkey; + __be16 mlid; + uint8_t mtu; + uint8_t tclass; + __be16 pkey; + uint8_t rate; + uint8_t packet_lifetime; + __be32 sl_flow_hop; + uint8_t scope_state; + uint8_t proxy_join; + uint8_t reserved[2]; + uint8_t pad[4]; +}; + +#endif /* ACM_MAD_H */ diff --git a/ibacm/include/infiniband/acm_prov.h b/ibacm/include/infiniband/acm_prov.h new file mode 100644 index 0000000..691a00e --- /dev/null +++ b/ibacm/include/infiniband/acm_prov.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under the OpenFabrics.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + +#if !defined(ACM_PROV_H) +#define ACM_PROV_H + +#include <infiniband/acm.h> +#include <infiniband/umad.h> +#include <infiniband/umad_sa.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define ACM_PROV_VERSION 1 + +struct acm_device { + struct ibv_context *verbs; + __be64 dev_guid; +}; + +struct acm_port { + struct acm_device *dev; + uint8_t port_num; +}; + +struct acm_endpoint { + struct acm_port *port; + uint16_t pkey; +}; + +struct acm_address { + struct acm_endpoint *endpoint; + union acm_ep_info info; + char *id_string; + uint16_t type; +}; + +struct acm_provider { + size_t size; + uint32_t version; + const char *name; + int (*open_device)(const struct acm_device *device, + void **dev_context); + void (*close_device)(void *dev_context); + int (*open_port)(const struct acm_port *port, + void *dev_context, void **port_context); + void (*close_port)(void *port_context); + int (*open_endpoint)(const struct acm_endpoint *endpoint, + void *port_context, void **ep_context); + void (*close_endpoint)(void *ep_context); + int (*add_address)(const struct acm_address *addr, void *ep_context, + void **addr_context); + void (*remove_address)(void *addr_context); + int (*resolve)(void *addr_context, struct acm_msg *msg, uint64_t id); + int (*query)(void *addr_context, struct acm_msg *msg, uint64_t id); + int (*handle_event)(void *port_context, enum ibv_event_type type); + void (*query_perf)(void *ep_context, uint64_t *values, uint8_t *cnt); +}; + +int provider_query(struct acm_provider **info, uint32_t *version); + +/* Functions exported from core */ +#define acm_log(level, format, ...) \ + acm_write(level, "%s: "format, __func__, ## __VA_ARGS__) +extern void acm_write(int level, const char *format, ...) + __attribute__((format(printf, 2, 3))); +extern void acm_format_name(int level, char *name, size_t name_size, + uint8_t addr_type, const uint8_t *addr, size_t addr_size); + +extern int ib_any_gid(union ibv_gid *gid); +extern uint8_t acm_gid_index(struct acm_port *port, union ibv_gid *gid); +extern int acm_get_gid(struct acm_port *port, int index, union ibv_gid *gid); +extern __be64 acm_path_comp_mask(struct ibv_path_record *path); + +extern int acm_resolve_response(uint64_t id, struct acm_msg *msg); +extern int acm_query_response(uint64_t id, struct acm_msg *msg); + +extern enum ibv_rate acm_get_rate(uint8_t width, uint8_t speed); +extern enum ibv_mtu acm_convert_mtu(int mtu); +extern enum ibv_rate acm_convert_rate(int rate); + +struct acm_sa_mad { + void *context; + struct ib_user_mad umad; + struct umad_sa_packet sa_mad; /* must follow umad and be 64-bit aligned */ +}; + +extern struct acm_sa_mad * +acm_alloc_sa_mad(const struct acm_endpoint *endpoint, void *context, + void (*handler)(struct acm_sa_mad *)); +extern void acm_free_sa_mad(struct acm_sa_mad *mad); +extern int acm_send_sa_mad(struct acm_sa_mad *mad); + +extern const char *acm_get_opts_file(void); +extern void acm_increment_counter(int type); + +#ifdef __cplusplus +} +#endif + +#endif /* ACM_PROV_H */ diff --git a/ibacm/linux/osd.h b/ibacm/linux/osd.h new file mode 100644 index 0000000..e799041 --- /dev/null +++ b/ibacm/linux/osd.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2009 Intel Corporation. All rights reserved. + * Copyright (c) 2013 Mellanox Technologies LTD. All rights reserved. 
+ * + * This software is available to you under the OpenFabrics.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(OSD_H) +#define OSD_H + +#include <config.h> +#include <endian.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <unistd.h> +#include <errno.h> +#include <pthread.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/time.h> + +#include <ccan/minmax.h> + +#define ACM_ADDR_FILE "ibacm_addr.cfg" +#define ACM_OPTS_FILE "ibacm_opts.cfg" + +#if DEFINE_ATOMICS +typedef struct { pthread_mutex_t mut; int val; } atomic_t; +static inline int atomic_inc(atomic_t *atomic) +{ + int v; + + pthread_mutex_lock(&atomic->mut); + v = ++(atomic->val); + pthread_mutex_unlock(&atomic->mut); + return v; +} +static inline int atomic_dec(atomic_t *atomic) +{ + int v; + + pthread_mutex_lock(&atomic->mut); + v = --(atomic->val); + pthread_mutex_unlock(&atomic->mut); + return v; +} +static inline void atomic_init(atomic_t *atomic) +{ + pthread_mutex_init(&atomic->mut, NULL); + atomic->val = 0; +} +#else +typedef struct { volatile int val; } atomic_t; +#define atomic_inc(v) (__sync_add_and_fetch(&(v)->val, 1)) +#define atomic_dec(v) (__sync_sub_and_fetch(&(v)->val, 1)) +#define atomic_init(v) ((v)->val = 0) +#endif +#define atomic_get(v) ((v)->val) +#define atomic_set(v, s) ((v)->val = s) + +typedef struct { pthread_cond_t cond; pthread_mutex_t mutex; } event_t; +static inline void event_init(event_t *e) +{ + pthread_condattr_t attr; + + pthread_condattr_init(&attr); + pthread_condattr_setclock(&attr, CLOCK_MONOTONIC); + pthread_cond_init(&e->cond, &attr); + pthread_mutex_init(&e->mutex, NULL); +} +#define event_signal(e) pthread_cond_signal(&(e)->cond) +#define ONE_SEC_IN_NSEC 1000000000ULL +static inline int event_wait(event_t *e, unsigned int timeout) +{ + struct timespec wait; + int ret; + + clock_gettime(CLOCK_MONOTONIC, &wait); + wait.tv_sec = wait.tv_sec + timeout / 1000; + wait.tv_nsec = wait.tv_nsec + (timeout % 1000) * 1000000; + /* use >= so tv_nsec is never left equal to a full second, which + * pthread_cond_timedwait would reject as EINVAL */ + if (wait.tv_nsec >= ONE_SEC_IN_NSEC) { + wait.tv_sec++; + wait.tv_nsec -= ONE_SEC_IN_NSEC; + } + pthread_mutex_lock(&e->mutex); + ret = pthread_cond_timedwait(&e->cond, &e->mutex, &wait); + pthread_mutex_unlock(&e->mutex); + return ret; +} + +static inline uint64_t time_stamp_us(void) +{ + struct timespec t; + clock_gettime(CLOCK_MONOTONIC, &t); + return (t.tv_sec * ONE_SEC_IN_NSEC + t.tv_nsec) / 1000; +} + +#define time_stamp_ms() (time_stamp_us() / (uint64_t) 1000) +#define
+#define time_stamp_min() (time_stamp_sec() / (uint64_t) 60)
+
+#endif /* OSD_H */
diff --git a/ibacm/man/ib_acme.1 b/ibacm/man/ib_acme.1
new file mode 100644
index 0000000..c0bb578
--- /dev/null
+++ b/ibacm/man/ib_acme.1
@@ -0,0 +1,101 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH "ib_acme" 1 "2014-06-16" "ib_acme" "ib_acme" ib_acme
+.SH NAME
+ib_acme \- test and configuration utility for the IB ACM
+.SH SYNOPSIS
+.sp
+.nf
+\fIib_acme\fR [-f addr_format] [-s src_addr] -d dest_addr [-v] [-c] [-e] [-P] [-S svc_addr] [-C repetitions]
+.fi
+.nf
+\fIib_acme\fR [-A [addr_file]] [-O [opt_file]] [-D dest_dir] [-V]
+.fi
+.SH "DESCRIPTION"
+ib_acme provides assistance configuring and testing the ibacm service.
+The first usage will test that the ibacm service is running
+and operating correctly. The second usage will automatically
+create address and configuration files for the ibacm service.
+.SH "OPTIONS"
+.TP
+\-f addr_format
+Specifies the format of the src_addr and dest_addr parameters. Valid
+address formats are: 'i' IP address, 'n' host name, 'l' LID, 'g' GID,
+and 'u' unspecified. If the -f option is omitted,
+an unspecified address format is assumed. ib_acme will use getaddrinfo or
+other mechanisms to determine which format the address uses.
+.TP
+\-s src_addr
+Specifies the local source address of the path to resolve. The source
+address can be an IP address, system network name, or LID, as indicated by
+the addr_format option.
+.TP
+\-d dest_addr
+Specifies the destination address of the path to resolve. The destination
+address can be an IP address, system network name, or LID, as indicated by
+the addr_format option.
+.TP
+\-v
+Indicates that the resolved path information should be verified with the
+active IB SA. Use of the -v option provides a sanity check that
+resolved path information is usable given the current cluster configuration.
+.TP
+\-c
+Instructs the ACM service to return only information that currently resides
+in its local cache.
+.TP
+\-e [N]
+Displays one (N = 1, 2, ...) or all endpoints (N = 0 or not present).
+.TP
+\-P [opt]
+Queries performance data from the destination service. Valid options are:
+"col" for outputting combined data in column format, "N" (N = 1, 2, ...) for
+outputting data for a specific endpoint N, "all" for outputting data for all
+endpoints, and "s" for outputting data for a specific endpoint with the address
+given by the -s option.
+.TP
+\-S svc_addr
+Hostname, IPv4 address, or Unix-domain socket of the ACM service; default: /run/ibacm.sock
+.TP
+\-C repetitions
+Number of repetitions to perform resolution. Used to measure
+performance of ACM cache lookups. Defaults to 1.
+.TP
+\-A [addr_file]
+With this option, the ib_acme utility automatically generates the address
+configuration file ibacm_addr.cfg. The generated file is
+constructed using the system host name.
+.TP
+\-O [opt_file]
+With this option, the ib_acme utility automatically generates the option
+configuration file ibacm_opts.cfg. The generated file currently contains
+static information.
+.TP
+\-D dest_dir
+Specify the destination directory for the output files.
+.TP
+\-V
+Enables verbose output. When combined with the -A or -O options, ib_acme will
+display additional details, such as generated address information saved
+to the ibacm_addr.cfg file.
+.SH "NOTES"
+The ib_acme utility performs two main functions.
With the -A and -O options,
+it automatically generates address or options configuration files. The
+generated files are text based and may be edited. These options are intended
+to provide a simple way to configure address and option information on all
+nodes on a cluster.
+.P
+The other function of the ib_acme utility is to test the ibacm service,
+including helping to verify that the service is usable given the current
+cluster configuration. The ib_acme utility can resolve IP addresses,
+network names, or IB LIDs into a path record. It can then compare that
+path record against one obtained by the SA. When used to test the
+ibacm service, the ib_acme utility has the side effect of loading the
+ibacm caches.
+.P
+Multiple numerical destinations can be specified by adding brackets [] to
+the end of a base destination name or address. Users may specify a list of
+numerical ranges inside the brackets using the following example as a
+guide: node[1-3,5,7-8]. This will result in testing node1, node2, node3,
+node5, node7, and node8.
+.SH "SEE ALSO"
+ibacm(7), ibacm(8)
diff --git a/ibacm/man/ibacm.7 b/ibacm/man/ibacm.7
new file mode 100644
index 0000000..694c706
--- /dev/null
+++ b/ibacm/man/ibacm.7
@@ -0,0 +1,32 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH "IBACM" 7 "2014-06-16" "IBACM" "IB ACM User Guide" IBACM
+.SH NAME
+ibacm \- InfiniBand communication management assistant
+.SH SYNOPSIS
+.B "#include <infiniband/acm.h>"
+.SH "DESCRIPTION"
+ibacm is used to resolve remote endpoint information before establishing
+communications over InfiniBand.
+.SH "NOTES"
+The IB ACM provides scalable address and route resolution services over
+InfiniBand. It resolves system network names and IP addresses to InfiniBand
+path record data using efficient mechanisms, including caching of data.
+.P
+The IB ACM provides information needed to establish a connection, but does
+not implement the communication management protocol. It provides services
+similar to rdma_getaddrinfo, rdma_resolve_addr, and rdma_resolve_route using
+IB multicast.
+The IB ACM does not require IPoIB or use standard naming services, such as
+DNS, and limits network communication, especially with the IB SA.
+The ib_acme utility assists in verifying what options of the ibacm service
+may be usable for the current fabric topology.
+.P
+Client interactions with the ibacm service are done over sockets through
+a standard TCP connection. The librdmacm abstracts this interaction.
+.SH "RETURN CODES"
+.IP "== 0"
+success
+.IP "!= 0"
+error
+.SH "SEE ALSO"
+ib_acme(1), ibacm(8)
diff --git a/ibacm/man/ibacm.8 b/ibacm/man/ibacm.8
new file mode 100644
index 0000000..3b94f1e
--- /dev/null
+++ b/ibacm/man/ibacm.8
@@ -0,0 +1,169 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH "ibacm" 8 "2014-06-16" "ibacm" "ibacm" ibacm
+.SH NAME
+ibacm \- address and route resolution services for InfiniBand
+.SH SYNOPSIS
+.sp
+.nf
+\fIibacm\fR [-D] [-P] [-A addr_file] [-O option_file]
+.fi
+.SH "DESCRIPTION"
+The IB ACM implements and provides a framework for name,
+address, and route (path) resolution services over InfiniBand.
+It is intended to address connection setup scalability issues when running
+MPI applications on large clusters. The IB ACM provides information
+needed to establish a connection, but does not implement the CM protocol.
+.P
+A primary user of the ibacm service is the librdmacm library.
This
+enables applications to make use of the ibacm service without code
+changes or needing to be aware that the service is in use.
+librdmacm versions 1.0.12 - 1.0.15 can invoke IB ACM services when built using
+the --with-ib_acm option. Version 1.0.16 and newer of librdmacm will automatically
+use the IB ACM if it is installed. The IB ACM services tie in under the
+rdma_resolve_addr, rdma_resolve_route, and rdma_getaddrinfo routines.
+For maximum benefit, the rdma_getaddrinfo routine should be used;
+however, existing applications should still see significant connection
+scaling benefits using the calls
+available in librdmacm 1.0.11 and previous releases.
+.P
+The IB ACM is focused on being scalable, efficient, and extensible. It implements
+a plugin architecture that allows a vendor to supply its proprietary provider in
+addition to the default provider. The current default provider implementation,
+ibacmp, limits network traffic, SA interactions, and the use of centralized
+services. Ibacmp supports multiple resolution protocols in order to handle
+different fabric topologies.
+.P
+The IB ACM package comprises three components: the ibacm core service,
+the default provider ibacmp shared library, and a test/configuration
+utility, ib_acme. All three are userspace components and are available for Linux.
+Additional details are given below.
+.SH "OPTIONS"
+.TP
+\-D
+run in daemon mode (default)
+.TP
+\-P
+run as standard process
+.TP
+\-A addr_file
+address configuration file
+.TP
+\-O option_file
+option configuration file
+.TP
+\--systemd
+Enable systemd integration. This includes optional socket activation of the daemon's
+listening socket.
+.SH "QUICK START GUIDE"
+1. Prerequisites: libibverbs and libibumad must be installed.
+The IB stack should be running with IPoIB configured.
+These steps assume that the user has administrative privileges.
+.P
+2. Install the IB ACM package. This installs ibacm, ibacmp, ib_acme, and init.d scripts.
+.P
+3. Run 'ibacm' as administrator to start the ibacm daemon.
+.P
+4. Optionally, run 'ib_acme -d <dest_ip> -v' to verify that
+the ibacm service is running.
+.P
+5. Install librdmacm, using the build option --with-ib_acm if needed.
+This build option is not needed with librdmacm 1.0.17 or newer.
+The librdmacm will automatically use the ibacm service.
+On failures, the librdmacm will fall back to normal resolution.
+.P
+6. You can use ib_acme -P to gather performance statistics from the local ibacm
+daemon to see if the service is working correctly. Similarly, the command
+ib_acme -e can be used to enumerate all endpoints created by the local ibacm
+service.
+.SH "NOTES"
+ib_acme:
+.P
+The ib_acme program serves a dual role. It acts as a utility to test
+ibacm operation and help verify if the ibacm service and selected
+protocol are usable for a given cluster configuration. Additionally,
+it automatically generates ibacm configuration files to assist with
+or eliminate manual setup.
+.P
+ibacm configuration files:
+.P
+The ibacm service relies on two configuration files.
+.P
+The ibacm_addr.cfg file contains name and address mappings for each IB
+<device, port, pkey> endpoint. Although the names in the ibacm_addr.cfg
+file can be anything, ib_acme maps the host name to the IB endpoints. IP
+addresses, on the other hand, are assigned dynamically. If the address file
+cannot be found, the ibacm service will attempt to create one using default
+values.
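+.P
+For illustration, a generated address file entry follows the pattern shown
+below; the host name, device name, and port number are placeholders for
+site-specific values:
+.P
+.nf
+# address      device       port  pkey
+node1          ibv_device0  1     default
+.fi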
+.P
+The ibacm_opts.cfg file provides a set of configurable options for the
+ibacm core service and default provider, such as timeout, number of retries,
+logging level, etc. ib_acme generates the ibacm_opts.cfg file using static
+information. If an option file cannot be found, ibacm will use default values.
+.P
+ibacm:
+.P
+The ibacm service is responsible for resolving names and addresses to
+InfiniBand path information and caching such data. It
+should execute with administrative privileges.
+.P
+The ibacm service implements a client interface over TCP sockets, which is
+abstracted by the librdmacm library. One or more providers can be loaded
+by the core service, depending on the configuration. In the default provider
+ibacmp, one or more back-end protocols are used to satisfy user requests.
+Although ibacmp supports standard SA path record queries on the back-end, it
+also supports a resolution protocol based on multicast traffic.
+The latter is not usable on all fabric topologies, specifically
+ones that may not have reversible paths or that use torus routing.
+Users should use the ib_acme utility to verify that the multicast protocol
+is usable before running other applications.
+.P
+Conceptually, the default provider ibacmp implements an ARP-like protocol and either
+uses IB multicast records to construct path record data or queries the
+SA directly, depending on the selected route protocol. By default, the
+ibacmp provider uses and caches SA path record queries.
+.P
+Specifically, all IB endpoints join a number of multicast groups.
+Multicast groups differ based on rates, MTU, SL, etc., and are prioritized.
+All participating endpoints must be able to communicate on the lowest
+priority multicast group. The ibacmp assigns one or more names/addresses
+to each IB endpoint using the ibacm_addr.cfg file. Clients provide source
+and destination names or addresses as input to the service, and receive
+as output path record data.
+.P
+The service maps a client's source name/address to a local IB endpoint.
+If the destination name/address is not cached locally in the default provider,
+it sends a multicast request out on the lowest priority multicast group on the
+local endpoint. The request carries a list of multicast groups that the sender can use.
+The recipient of the request selects the highest priority multicast group
+that it can also use and returns that information directly to the sender.
+The request data is cached by all endpoints that receive the multicast
+request message. The source endpoint also caches the response and uses
+the multicast group that was selected to construct or obtain path record
+data, which is returned to the client.
+.P
+The current implementation of the provider ibacmp has several additional restrictions:
+.P
+- The ibacmp is limited in its handling of dynamic changes.
+ibacm must be stopped and restarted if a cluster is reconfigured.
+.P
+- Support for IPv6 has not been verified.
+.P
+- The number of multicast groups that an endpoint can support is limited to 2.
+.P
+The ibacmp contains several internal caches. These include caches for GID
+and LID destination addresses. These caches can be optionally
+preloaded. ibacm supports the OpenSM dump_pr plugin "full" PathRecord
+format, which is used to preload these caches.
+The file format is selected in the ibacm_opts.cfg file via the
+route_preload setting, which should be set to full_opensm_v1 for this
+format. The default is none, which does not preload these caches.
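+.P
+As a hypothetical example (the exact file path is site specific), enabling
+the preload in ibacm_opts.cfg could look like this, using the route_preload
+and route_data_file options named in this page and in the ibacmp sources:
+.P
+.nf
+route_preload    full_opensm_v1
+route_data_file  /etc/rdma/ibacm_route.data
+.fi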
+.P
+See dump_pr.notes.txt in dump_pr for more information on the
+full_opensm_v1 file format and how to configure OpenSM to
+generate this file.
+.P
+Additionally, the name, IPv4, and IPv6 caches can be preloaded by using
+the addr_preload option. The default is none, which does not preload these
+caches. To preload these caches, set this option to acm_hosts and
+configure the addr_data_file appropriately.
+.SH "SEE ALSO"
+ibacm(7), ib_acme(1), rdma_cm(7)
diff --git a/ibacm/man/ibacm_prov.7.in b/ibacm/man/ibacm_prov.7.in
new file mode 100644
index 0000000..37a81e7
--- /dev/null
+++ b/ibacm/man/ibacm_prov.7.in
@@ -0,0 +1,81 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH "IBACM_PROV" 7 "2014-06-16" "IBACM_PROV" "IB ACM Provider Guide" IBACM_PROV
+.SH NAME
+ibacm_prov \- InfiniBand communication management assistant provider interface
+.SH SYNOPSIS
+.B "#include <infiniband/acm_prov.h>"
+.SH "DESCRIPTION"
+The ibacm provider interface is a plugin interface that allows a vendor
+to implement proprietary solutions to support scalable address and route
+resolution services over InfiniBand.
+.P
+To add a provider to the ibacm core service, the provider must
+.TP
+1. be implemented as a shared library;
+.TP
+2. be installed under a configured directory, e.g., @ACM_PROVIDER_DIR@;
+.TP
+3. export a function provider_query() that returns a pointer to its provider
+info and its version info.
+.P
+The prototype of the provider_query() function is shown below:
+.P
+.nf
+int provider_query(struct acm_provider **info, uint32_t *version);
+.fi
+.P
+This function should return a pointer to its provider structure:
+.P
+.nf
+struct acm_provider {
+	size_t size;
+	uint32_t version;
+	const char *name;
+	int (*open_device)(const struct acm_device *device,
+		void **dev_context);
+	void (*close_device)(void *dev_context);
+	int (*open_port)(const struct acm_port *port,
+		void *dev_context, void **port_context);
+	void (*close_port)(void *port_context);
+	int (*open_endpoint)(const struct acm_endpoint *endpoint,
+		void *port_context, void **ep_context);
+	void (*close_endpoint)(void *ep_context);
+	int (*add_address)(const struct acm_address *addr, void *ep_context,
+		void **addr_context);
+	void (*remove_address)(void *addr_context);
+	int (*resolve)(void *addr_context, struct acm_msg *msg, uint64_t id);
+	int (*query)(void *addr_context, struct acm_msg *msg, uint64_t id);
+	int (*handle_event)(void *port_context, enum ibv_event_type type);
+	void (*query_perf)(void *ep_context, uint64_t *values, uint8_t *cnt);
+};
+.fi
+.P
+The size and version fields provide a way to detect version compatibility.
+When a port is assigned to the provider, the ibacm core will call the
+open/add_address functions; similarly, when a port is down or re-assigned to
+another provider, the close/remove_address functions will be invoked to release
+resources. The ibacm core will centralize the management of events for each
+device, and events not handled by the ibacm core will be forwarded to the
+relevant port through the handle_event() function. The resolve() function will
+be called to resolve a destination name into a path record. The performance of
+the provider for each endpoint can be queried by calling query_perf().
+.P
+To share a configuration file, the path for the ibacm configuration file is
+exported through acm_get_opts_file(). Each loaded provider can open this
+configuration file and parse the contents related to its own operation.
+Unrelated sections should be ignored.
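+.P
+As an illustration only (not part of the installed package), a minimal
+provider skeleton built against this interface could look as follows. The
+provider name "examplep", the stub bodies, and the zero return values are
+placeholder assumptions, not requirements of the interface:
+.P
+.nf
+#include <infiniband/acm_prov.h>
+
+/* Stub callback: accept the device without keeping per-device state. */
+static int examplep_open_device(const struct acm_device *device,
+				void **dev_context)
+{
+	*dev_context = NULL;
+	return 0;
+}
+
+static void examplep_close_device(void *dev_context)
+{
+}
+
+static struct acm_provider examplep_prov = {
+	.size = sizeof(struct acm_provider),
+	.version = ACM_PROV_VERSION,	/* value at implementation time */
+	.name = "examplep",
+	.open_device = examplep_open_device,
+	.close_device = examplep_close_device,
+	/* remaining callbacks omitted from this sketch */
+};
+
+int provider_query(struct acm_provider **info, uint32_t *version)
+{
+	*info = &examplep_prov;
+	*version = ACM_PROV_VERSION;
+	return 0;
+}
+.fi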
+.P
+Some helper functions are also exported by the ibacm core. For example, the
+acm_log macro (or the acm_write() function) can be used to log messages into
+ibacm's log file (default @CMAKE_INSTALL_FULL_LOCALSTATEDIR@/log/ibacm.log). For details, refer to
+the acm_prov.h file.
+.SH "NOTES"
+A provider should set the version field in its provider info structure to the
+value that the define ACM_PROV_VERSION has at the time the provider is
+implemented. Never set the field to ACM_PROV_VERSION itself, as that define
+may change when the provider interface changes, unless the provider is
+maintained in the ibacm source tree. This avoids version mismatches when an
+old provider implementation is built against a newer acm_prov.h file. The
+ibacm core always checks the version of the provider at loading time.
+.SH "SEE ALSO"
+ib_acme(1), ibacm(7), ibacm(8)
diff --git a/ibacm/prov/acmp/src/acmp.c b/ibacm/prov/acmp/src/acmp.c
new file mode 100644
index 0000000..2fee103
--- /dev/null
+++ b/ibacm/prov/acmp/src/acmp.c
@@ -0,0 +1,3051 @@
+/*
+ * Copyright (c) 2009-2014 Intel Corporation. All rights reserved.
+ * Copyright (c) 2013 Mellanox Technologies LTD. All rights reserved.
+ *
+ * This software is available to you under the OpenIB.org BSD license
+ * below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ + +#include <config.h> + +#include <endian.h> +#include <stdio.h> +#include <stdarg.h> +#include <string.h> +#include <osd.h> +#include <arpa/inet.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <fcntl.h> +#include <dirent.h> +#include <infiniband/acm.h> +#include <infiniband/acm_prov.h> +#include <infiniband/umad.h> +#include <infiniband/verbs.h> +#include <infiniband/umad_sa.h> +#include <infiniband/umad_sa_mcm.h> +#include <ifaddrs.h> +#include <dlfcn.h> +#include <search.h> +#include <netdb.h> +#include <net/if.h> +#include <sys/ioctl.h> +#include <net/if_arp.h> +#include <netinet/in.h> +#include <linux/netlink.h> +#include <linux/rtnetlink.h> +#include <inttypes.h> +#include <ccan/list.h> +#include "acm_util.h" +#include "acm_mad.h" + +#define IB_LID_MCAST_START 0xc000 + +#define MAX_EP_ADDR 4 +#define MAX_EP_MC 2 + +enum acmp_state { + ACMP_INIT, + ACMP_QUERY_ADDR, + ACMP_ADDR_RESOLVED, + ACMP_QUERY_ROUTE, + ACMP_READY +}; + +enum acmp_addr_prot { + ACMP_ADDR_PROT_ACM +}; + +enum acmp_route_prot { + ACMP_ROUTE_PROT_ACM, + ACMP_ROUTE_PROT_SA +}; + +enum acmp_loopback_prot { + ACMP_LOOPBACK_PROT_NONE, + ACMP_LOOPBACK_PROT_LOCAL +}; + +enum acmp_route_preload { + ACMP_ROUTE_PRELOAD_NONE, + ACMP_ROUTE_PRELOAD_OSM_FULL_V1 +}; + +enum acmp_addr_preload { + ACMP_ADDR_PRELOAD_NONE, + ACMP_ADDR_PRELOAD_HOSTS +}; + +/* + * Nested locking order: dest -> ep, dest -> port + */ +struct acmp_ep; + +struct acmp_dest { + uint8_t address[ACM_MAX_ADDRESS]; /* keep first */ + char name[ACM_MAX_ADDRESS]; + struct ibv_ah *ah; + struct ibv_ah_attr av; + struct ibv_path_record path; + union ibv_gid mgid; + __be64 req_id; + struct list_head req_queue; + uint32_t remote_qpn; + pthread_mutex_t lock; + enum acmp_state state; + atomic_t refcnt; + uint64_t addr_timeout; + uint64_t route_timeout; + uint8_t addr_type; + struct acmp_ep *ep; +}; + +struct acmp_device; + +struct acmp_port { + struct acmp_device *dev; + const struct acm_port *port; + struct list_head ep_list; + pthread_mutex_t lock; + struct acmp_dest sa_dest; + enum ibv_port_state state; + enum ibv_mtu mtu; + enum ibv_rate rate; + int subnet_timeout; + uint16_t default_pkey_ix; + uint16_t lid; + uint16_t lid_mask; + uint8_t port_num; +}; + +struct acmp_device { + struct ibv_context *verbs; + const struct acm_device *device; + struct ibv_comp_channel *channel; + struct ibv_pd *pd; + __be64 guid; + struct list_node entry; + pthread_t comp_thread_id; + int port_cnt; + struct acmp_port port[0]; +}; + +/* Maintain separate virtual send queues to avoid deadlock */ +struct acmp_send_queue { + int credits; + struct list_head pending; +}; + +struct acmp_addr { + uint16_t type; + union acm_ep_info info; + struct acm_address addr; + struct acmp_ep *ep; +}; + +struct acmp_addr_ctx { + struct acmp_ep *ep; + int addr_inx; +}; + +struct acmp_ep { + struct acmp_port *port; + struct ibv_cq *cq; + struct ibv_qp *qp; + struct ibv_mr *mr; + uint8_t *recv_bufs; + struct list_node entry; + char id_string[IBV_SYSFS_NAME_MAX + 11]; + void *dest_map[ACM_ADDRESS_RESERVED - 1]; + struct acmp_dest mc_dest[MAX_EP_MC]; + int mc_cnt; + uint16_t pkey_index; + uint16_t pkey; + const struct acm_endpoint *endpoint; + pthread_mutex_t lock; + struct acmp_send_queue resolve_queue; + struct acmp_send_queue resp_queue; + struct list_head active_queue; + struct list_head wait_queue; + enum acmp_state state; + /* This lock protects nmbr_ep_addrs and addr_info */ + pthread_rwlock_t rwlock; + int nmbr_ep_addrs; + struct acmp_addr *addr_info; + atomic_t 
counters[ACM_MAX_COUNTER]; +}; + +struct acmp_send_msg { + struct list_node entry; + struct acmp_ep *ep; + struct acmp_dest *dest; + struct ibv_ah *ah; + void *context; + void (*resp_handler)(struct acmp_send_msg *req, + struct ibv_wc *wc, struct acm_mad *resp); + struct acmp_send_queue *req_queue; + struct ibv_mr *mr; + struct ibv_send_wr wr; + struct ibv_sge sge; + uint64_t expires; + int tries; + uint8_t data[ACM_SEND_SIZE]; +}; + +struct acmp_request { + uint64_t id; + struct list_node entry; + struct acm_msg msg; + struct acmp_ep *ep; +}; + +static int acmp_open_dev(const struct acm_device *device, void **dev_context); +static void acmp_close_dev(void *dev_context); +static int acmp_open_port(const struct acm_port *port, void *dev_context, + void **port_context); +static void acmp_close_port(void *port_context); +static int acmp_open_endpoint(const struct acm_endpoint *endpoint, + void *port_context, void **ep_context); +static void acmp_close_endpoint(void *ep_context); +static int acmp_add_addr(const struct acm_address *addr, void *ep_context, + void **addr_context); +static void acmp_remove_addr(void *addr_context); +static int acmp_resolve(void *addr_context, struct acm_msg *msg, uint64_t id); +static int acmp_query(void *addr_context, struct acm_msg *msg, uint64_t id); +static int acmp_handle_event(void *port_context, enum ibv_event_type type); +static void acmp_query_perf(void *ep_context, uint64_t *values, uint8_t *cnt); + +static struct acm_provider def_prov = { + .size = sizeof(struct acm_provider), + .version = ACM_PROV_VERSION, + .name = "ibacmp", + .open_device = acmp_open_dev, + .close_device = acmp_close_dev, + .open_port = acmp_open_port, + .close_port = acmp_close_port, + .open_endpoint = acmp_open_endpoint, + .close_endpoint = acmp_close_endpoint, + .add_address = acmp_add_addr, + .remove_address = acmp_remove_addr, + .resolve = acmp_resolve, + .query = acmp_query, + .handle_event = acmp_handle_event, + .query_perf = acmp_query_perf, +}; + +static LIST_HEAD(acmp_dev_list); +static pthread_mutex_t acmp_dev_lock; + +static atomic_t g_tid; +static LIST_HEAD(timeout_list); +static event_t timeout_event; +static atomic_t wait_cnt; +static pthread_t retry_thread_id; +static int retry_thread_started = 0; + +static __thread char log_data[ACM_MAX_ADDRESS]; + +/* + * Service options - may be set through ibacm_opts.cfg file. 
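+ *
+ * The initializers below are compile-time defaults that apply when the
+ * option file does not override them. addr_timeout and route_timeout are
+ * expressed in minutes (they are added to time_stamp_min() when a record
+ * is cached); timeout is the per-request MAD timeout in milliseconds,
+ * used together with time_stamp_ms() by the wait queue.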
+ */ +static char route_data_file[128] = ACM_CONF_DIR "/ibacm_route.data"; +static char addr_data_file[128] = ACM_CONF_DIR "/ibacm_hosts.data"; +static enum acmp_addr_prot addr_prot = ACMP_ADDR_PROT_ACM; +static int addr_timeout = 1440; +static enum acmp_route_prot route_prot = ACMP_ROUTE_PROT_SA; +static int route_timeout = -1; +static enum acmp_loopback_prot loopback_prot = ACMP_LOOPBACK_PROT_LOCAL; +static int timeout = 2000; +static int retries = 2; +static int resolve_depth = 1; +static int send_depth = 1; +static int recv_depth = 1024; +static uint8_t min_mtu = IBV_MTU_2048; +static uint8_t min_rate = IBV_RATE_10_GBPS; +static enum acmp_route_preload route_preload; +static enum acmp_addr_preload addr_preload; + +static int acmp_initialized = 0; + +static int acmp_compare_dest(const void *dest1, const void *dest2) +{ + return memcmp(dest1, dest2, ACM_MAX_ADDRESS); +} + +static void +acmp_set_dest_addr(struct acmp_dest *dest, uint8_t addr_type, + const uint8_t *addr, size_t size) +{ + memcpy(dest->address, addr, size); + dest->addr_type = addr_type; + acm_format_name(0, dest->name, sizeof dest->name, addr_type, addr, size); +} + +static void +acmp_init_dest(struct acmp_dest *dest, uint8_t addr_type, + const uint8_t *addr, size_t size) +{ + list_head_init(&dest->req_queue); + atomic_init(&dest->refcnt); + atomic_set(&dest->refcnt, 1); + pthread_mutex_init(&dest->lock, NULL); + if (size) + acmp_set_dest_addr(dest, addr_type, addr, size); + dest->state = ACMP_INIT; +} + +static struct acmp_dest * +acmp_alloc_dest(uint8_t addr_type, const uint8_t *addr) +{ + struct acmp_dest *dest; + + dest = calloc(1, sizeof *dest); + if (!dest) { + acm_log(0, "ERROR - unable to allocate dest\n"); + return NULL; + } + + acmp_init_dest(dest, addr_type, addr, ACM_MAX_ADDRESS); + acm_log(1, "%s\n", dest->name); + return dest; +} + +/* Caller must hold ep lock. */ +static struct acmp_dest * +acmp_get_dest(struct acmp_ep *ep, uint8_t addr_type, const uint8_t *addr) +{ + struct acmp_dest *dest, **tdest; + + tdest = tfind(addr, &ep->dest_map[addr_type - 1], acmp_compare_dest); + if (tdest) { + dest = *tdest; + (void) atomic_inc(&dest->refcnt); + acm_log(2, "%s\n", dest->name); + } else { + dest = NULL; + acm_format_name(2, log_data, sizeof log_data, + addr_type, addr, ACM_MAX_ADDRESS); + acm_log(2, "%s not found\n", log_data); + } + return dest; +} + +static void +acmp_put_dest(struct acmp_dest *dest) +{ + acm_log(2, "%s\n", dest->name); + if (atomic_dec(&dest->refcnt) == 0) { + free(dest); + } +} + +/* Caller must hold ep lock. 
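+ * The dest_map trees compare entries on the raw address bytes (see
+ * acmp_compare_dest); struct acmp_dest keeps its address field first so
+ * that a dest pointer and a bare address can be used interchangeably as
+ * tree keys.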
*/ +static void +acmp_remove_dest(struct acmp_ep *ep, struct acmp_dest *dest) +{ + acm_log(2, "%s\n", dest->name); + if (!tdelete(dest->address, &ep->dest_map[dest->addr_type - 1], + acmp_compare_dest)) + acm_log(0, "ERROR: %s not found!!\n", dest->name); + + acmp_put_dest(dest); +} + +static struct acmp_dest * +acmp_acquire_dest(struct acmp_ep *ep, uint8_t addr_type, const uint8_t *addr) +{ + struct acmp_dest *dest; + int64_t rec_expr_minutes; + + acm_format_name(2, log_data, sizeof log_data, + addr_type, addr, ACM_MAX_ADDRESS); + acm_log(2, "%s\n", log_data); + pthread_mutex_lock(&ep->lock); + dest = acmp_get_dest(ep, addr_type, addr); + if (dest && dest->state == ACMP_READY && + dest->addr_timeout != (uint64_t)~0ULL) { + rec_expr_minutes = dest->addr_timeout - time_stamp_min(); + if (rec_expr_minutes <= 0) { + acm_log(2, "Record expired\n"); + acmp_remove_dest(ep, dest); + dest = NULL; + } else { + acm_log(2, "Record valid for the next %" PRId64 " minute(s)\n", + rec_expr_minutes); + } + } + if (!dest) { + dest = acmp_alloc_dest(addr_type, addr); + if (dest) { + dest->ep = ep; + tsearch(dest, &ep->dest_map[addr_type - 1], acmp_compare_dest); + (void) atomic_inc(&dest->refcnt); + } + } + pthread_mutex_unlock(&ep->lock); + return dest; +} + +static struct acmp_request *acmp_alloc_req(uint64_t id, struct acm_msg *msg) +{ + struct acmp_request *req; + + req = calloc(1, sizeof *req); + if (!req) { + acm_log(0, "ERROR - unable to alloc client request\n"); + return NULL; + } + + req->id = id; + memcpy(&req->msg, msg, sizeof(req->msg)); + acm_log(2, "id %" PRIu64 ", req %p\n", id, req); + return req; +} + +static void acmp_free_req(struct acmp_request *req) +{ + acm_log(2, "%p\n", req); + free(req); +} + +static struct acmp_send_msg * +acmp_alloc_send(struct acmp_ep *ep, struct acmp_dest *dest, size_t size) +{ + struct acmp_send_msg *msg; + + msg = (struct acmp_send_msg *) calloc(1, sizeof *msg); + if (!msg) { + acm_log(0, "ERROR - unable to allocate send buffer\n"); + return NULL; + } + + msg->ep = ep; + msg->mr = ibv_reg_mr(ep->port->dev->pd, msg->data, size, 0); + if (!msg->mr) { + acm_log(0, "ERROR - failed to register send buffer\n"); + goto err1; + } + + if (!dest->ah) { + msg->ah = ibv_create_ah(ep->port->dev->pd, &dest->av); + if (!msg->ah) { + acm_log(0, "ERROR - unable to create ah\n"); + goto err2; + } + msg->wr.wr.ud.ah = msg->ah; + } else { + msg->wr.wr.ud.ah = dest->ah; + } + + acm_log(2, "get dest %s\n", dest->name); + (void) atomic_inc(&dest->refcnt); + msg->dest = dest; + + msg->wr.next = NULL; + msg->wr.sg_list = &msg->sge; + msg->wr.num_sge = 1; + msg->wr.opcode = IBV_WR_SEND; + msg->wr.send_flags = IBV_SEND_SIGNALED; + msg->wr.wr_id = (uintptr_t) msg; + msg->wr.wr.ud.remote_qpn = dest->remote_qpn; + msg->wr.wr.ud.remote_qkey = ACM_QKEY; + + msg->sge.length = size; + msg->sge.lkey = msg->mr->lkey; + msg->sge.addr = (uintptr_t) msg->data; + acm_log(2, "%p\n", msg); + return msg; + +err2: + ibv_dereg_mr(msg->mr); +err1: + free(msg); + return NULL; +} + +static void +acmp_init_send_req(struct acmp_send_msg *msg, void *context, + void (*resp_handler)(struct acmp_send_msg *req, + struct ibv_wc *wc, struct acm_mad *resp)) +{ + acm_log(2, "%p\n", msg); + msg->tries = retries + 1; + msg->context = context; + msg->resp_handler = resp_handler; +} + +static void acmp_free_send(struct acmp_send_msg *msg) +{ + acm_log(2, "%p\n", msg); + if (msg->ah) + ibv_destroy_ah(msg->ah); + ibv_dereg_mr(msg->mr); + acmp_put_dest(msg->dest); + free(msg); +} + +static void acmp_post_send(struct 
acmp_send_queue *queue, struct acmp_send_msg *msg) +{ + struct acmp_ep *ep = msg->ep; + struct ibv_send_wr *bad_wr; + + msg->req_queue = queue; + pthread_mutex_lock(&ep->lock); + if (queue->credits) { + acm_log(2, "posting send to QP\n"); + queue->credits--; + list_add_tail(&ep->active_queue, &msg->entry); + ibv_post_send(ep->qp, &msg->wr, &bad_wr); + } else { + acm_log(2, "no sends available, queuing message\n"); + list_add_tail(&queue->pending, &msg->entry); + } + pthread_mutex_unlock(&ep->lock); +} + +static void acmp_post_recv(struct acmp_ep *ep, uint64_t address) +{ + struct ibv_recv_wr wr, *bad_wr; + struct ibv_sge sge; + + wr.next = NULL; + wr.sg_list = &sge; + wr.num_sge = 1; + wr.wr_id = address; + + sge.length = ACM_RECV_SIZE; + sge.lkey = ep->mr->lkey; + sge.addr = address; + + ibv_post_recv(ep->qp, &wr, &bad_wr); +} + +/* Caller must hold ep lock */ +static void acmp_send_available(struct acmp_ep *ep, struct acmp_send_queue *queue) +{ + struct acmp_send_msg *msg; + struct ibv_send_wr *bad_wr; + + msg = list_pop(&queue->pending, struct acmp_send_msg, entry); + if (msg) { + acm_log(2, "posting queued send message\n"); + list_add_tail(&ep->active_queue, &msg->entry); + ibv_post_send(ep->qp, &msg->wr, &bad_wr); + } else { + queue->credits++; + } +} + +static void acmp_complete_send(struct acmp_send_msg *msg) +{ + struct acmp_ep *ep = msg->ep; + + pthread_mutex_lock(&ep->lock); + list_del(&msg->entry); + if (msg->tries) { + acm_log(2, "waiting for response\n"); + msg->expires = time_stamp_ms() + ep->port->subnet_timeout + timeout; + list_add_tail(&ep->wait_queue, &msg->entry); + if (atomic_inc(&wait_cnt) == 1) + event_signal(&timeout_event); + } else { + acm_log(2, "freeing\n"); + acmp_send_available(ep, msg->req_queue); + acmp_free_send(msg); + } + pthread_mutex_unlock(&ep->lock); +} + +static struct acmp_send_msg *acmp_get_request(struct acmp_ep *ep, __be64 tid, int *free) +{ + struct acmp_send_msg *msg, *next, *req = NULL; + struct acm_mad *mad; + + acm_log(2, "\n"); + pthread_mutex_lock(&ep->lock); + list_for_each_safe(&ep->wait_queue, msg, next, entry) { + mad = (struct acm_mad *) msg->data; + if (mad->tid == tid) { + acm_log(2, "match found in wait queue\n"); + req = msg; + list_del(&msg->entry); + (void) atomic_dec(&wait_cnt); + acmp_send_available(ep, msg->req_queue); + *free = 1; + goto unlock; + } + } + + list_for_each(&ep->active_queue, msg, entry) { + mad = (struct acm_mad *) msg->data; + if (mad->tid == tid && msg->tries) { + acm_log(2, "match found in active queue\n"); + req = msg; + req->tries = 0; + *free = 0; + break; + } + } +unlock: + pthread_mutex_unlock(&ep->lock); + return req; +} + +static int acmp_mc_index(struct acmp_ep *ep, union ibv_gid *gid) +{ + int i; + + for (i = 0; i < ep->mc_cnt; i++) { + if (!memcmp(&ep->mc_dest[i].address, gid, sizeof(*gid))) + return i; + } + return -1; +} + +/* Multicast groups are ordered lowest to highest preference. 
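+ * acmp_best_mc_index() therefore scans the requester's GID list from the
+ * end, returning the first (most preferred) group that this endpoint has
+ * also joined.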
*/ +static int acmp_best_mc_index(struct acmp_ep *ep, struct acm_resolve_rec *rec) +{ + int i, index; + + for (i = min_t(int, rec->gid_cnt, ACM_MAX_GID_COUNT) - 1; i >= 0; i--) { + index = acmp_mc_index(ep, &rec->gid[i]); + if (index >= 0) { + return index; + } + } + return -1; +} + +static void +acmp_record_mc_av(struct acmp_port *port, struct ib_mc_member_rec *mc_rec, + struct acmp_dest *dest) +{ + uint32_t sl_flow_hop; + + sl_flow_hop = be32toh(mc_rec->sl_flow_hop); + + dest->av.dlid = be16toh(mc_rec->mlid); + dest->av.sl = (uint8_t) (sl_flow_hop >> 28); + dest->av.src_path_bits = port->sa_dest.av.src_path_bits; + dest->av.static_rate = mc_rec->rate & 0x3F; + dest->av.port_num = port->port_num; + + dest->av.is_global = 1; + dest->av.grh.dgid = mc_rec->mgid; + dest->av.grh.flow_label = (sl_flow_hop >> 8) & 0xFFFFF; + dest->av.grh.sgid_index = acm_gid_index((struct acm_port *) port->port, + &mc_rec->port_gid); + dest->av.grh.hop_limit = (uint8_t) sl_flow_hop; + dest->av.grh.traffic_class = mc_rec->tclass; + + dest->path.dgid = mc_rec->mgid; + dest->path.sgid = mc_rec->port_gid; + dest->path.dlid = mc_rec->mlid; + dest->path.slid = htobe16(port->lid | port->sa_dest.av.src_path_bits); + dest->path.flowlabel_hoplimit = htobe32(sl_flow_hop & 0xFFFFFFF); + dest->path.tclass = mc_rec->tclass; + dest->path.reversible_numpath = IBV_PATH_RECORD_REVERSIBLE | 1; + dest->path.pkey = mc_rec->pkey; + dest->path.qosclass_sl = htobe16((uint16_t) (sl_flow_hop >> 28)); + dest->path.mtu = mc_rec->mtu; + dest->path.rate = mc_rec->rate; + dest->path.packetlifetime = mc_rec->packet_lifetime; +} + +/* Always send the GRH to transfer GID data to remote side */ +static void +acmp_init_path_av(struct acmp_port *port, struct acmp_dest *dest) +{ + uint32_t flow_hop; + + dest->av.dlid = be16toh(dest->path.dlid); + dest->av.sl = be16toh(dest->path.qosclass_sl) & 0xF; + dest->av.src_path_bits = be16toh(dest->path.slid) & 0x7F; + dest->av.static_rate = dest->path.rate & 0x3F; + dest->av.port_num = port->port_num; + + flow_hop = be32toh(dest->path.flowlabel_hoplimit); + dest->av.is_global = 1; + dest->av.grh.flow_label = (flow_hop >> 8) & 0xFFFFF; + pthread_mutex_lock(&port->lock); + if (port->port) + dest->av.grh.sgid_index = acm_gid_index( + (struct acm_port *) port->port, &dest->path.sgid); + else + dest->av.grh.sgid_index = 0; + pthread_mutex_unlock(&port->lock); + dest->av.grh.hop_limit = (uint8_t) flow_hop; + dest->av.grh.traffic_class = dest->path.tclass; +} + +static void acmp_process_join_resp(struct acm_sa_mad *sa_mad) +{ + struct acmp_dest *dest; + struct ib_mc_member_rec *mc_rec; + struct ib_sa_mad *mad; + int index, ret; + struct acmp_ep *ep = sa_mad->context; + + mad = (struct ib_sa_mad *) &sa_mad->sa_mad; + acm_log(1, "response status: 0x%x, mad status: 0x%x\n", + sa_mad->umad.status, mad->status); + pthread_mutex_lock(&ep->lock); + if (sa_mad->umad.status) { + acm_log(0, "ERROR - send join failed 0x%x\n", sa_mad->umad.status); + goto out; + } + if (mad->status) { + acm_log(0, "ERROR - join response status 0x%x\n", mad->status); + goto out; + } + + mc_rec = (struct ib_mc_member_rec *) mad->data; + index = acmp_mc_index(ep, &mc_rec->mgid); + if (index < 0) { + acm_log(0, "ERROR - MGID in join response not found\n"); + goto out; + } + + dest = &ep->mc_dest[index]; + dest->remote_qpn = IB_MC_QPN; + dest->mgid = mc_rec->mgid; + acmp_record_mc_av(ep->port, mc_rec, dest); + + if (index == 0) { + dest->ah = ibv_create_ah(ep->port->dev->pd, &dest->av); + if (!dest->ah) { + acm_log(0, "ERROR - unable to create 
ah\n"); + goto out; + } + ret = ibv_attach_mcast(ep->qp, &dest->mgid, dest->av.dlid); + if (ret) { + acm_log(0, "ERROR - unable to attach QP to multicast group\n"); + ibv_destroy_ah(dest->ah); + dest->ah = NULL; + goto out; + } + ep->state = ACMP_READY; + } + + atomic_set(&dest->refcnt, 1); + dest->state = ACMP_READY; + acm_log(1, "join successful\n"); +out: + acm_free_sa_mad(sa_mad); + pthread_mutex_unlock(&ep->lock); +} + +static uint8_t +acmp_record_acm_route(struct acmp_ep *ep, struct acmp_dest *dest) +{ + int i; + + acm_log(2, "\n"); + for (i = 0; i < MAX_EP_MC; i++) { + if (!memcmp(&dest->mgid, &ep->mc_dest[i].mgid, sizeof dest->mgid)) + break; + } + if (i == MAX_EP_MC) { + acm_log(0, "ERROR - cannot match mgid\n"); + return ACM_STATUS_EINVAL; + } + + dest->path = ep->mc_dest[i].path; + dest->path.dgid = dest->av.grh.dgid; + dest->path.dlid = htobe16(dest->av.dlid); + dest->addr_timeout = time_stamp_min() + (unsigned) addr_timeout; + dest->route_timeout = time_stamp_min() + (unsigned) route_timeout; + dest->state = ACMP_READY; + return ACM_STATUS_SUCCESS; +} + +static void acmp_init_path_query(struct ib_sa_mad *mad) +{ + acm_log(2, "\n"); + mad->base_version = 1; + mad->mgmt_class = IB_MGMT_CLASS_SA; + mad->class_version = 2; + mad->method = IB_METHOD_GET; + mad->tid = htobe64((uint64_t) atomic_inc(&g_tid)); + mad->attr_id = IB_SA_ATTR_PATH_REC; +} + +/* Caller must hold dest lock */ +static uint8_t acmp_resolve_path_sa(struct acmp_ep *ep, struct acmp_dest *dest, + void (*handler)(struct acm_sa_mad *)) +{ + struct ib_sa_mad *mad; + uint8_t ret; + struct acm_sa_mad *sa_mad; + + acm_log(2, "%s\n", dest->name); + + sa_mad = acm_alloc_sa_mad(ep->endpoint, dest, handler); + if (!sa_mad) { + acm_log(0, "Error - failed to allocate sa_mad\n"); + ret = ACM_STATUS_ENOMEM; + goto err; + } + + mad = (struct ib_sa_mad *) &sa_mad->sa_mad; + acmp_init_path_query(mad); + + memcpy(mad->data, &dest->path, sizeof(dest->path)); + mad->comp_mask = acm_path_comp_mask(&dest->path); + + acm_increment_counter(ACM_CNTR_ROUTE_QUERY); + atomic_inc(&ep->counters[ACM_CNTR_ROUTE_QUERY]); + dest->state = ACMP_QUERY_ROUTE; + if (acm_send_sa_mad(sa_mad)) { + acm_log(0, "Error - Failed to send sa mad\n"); + ret = ACM_STATUS_ENODATA; + goto free_mad; + } + return ACM_STATUS_SUCCESS; +free_mad: + acm_free_sa_mad(sa_mad); +err: + dest->state = ACMP_INIT; + return ret; +} + +static uint8_t +acmp_record_acm_addr(struct acmp_ep *ep, struct acmp_dest *dest, struct ibv_wc *wc, + struct acm_resolve_rec *rec) +{ + int index; + + acm_log(2, "%s\n", dest->name); + index = acmp_best_mc_index(ep, rec); + if (index < 0) { + acm_log(0, "ERROR - no shared multicast groups\n"); + dest->state = ACMP_INIT; + return ACM_STATUS_ENODATA; + } + + acm_log(2, "selecting MC group at index %d\n", index); + dest->av = ep->mc_dest[index].av; + dest->av.dlid = wc->slid; + dest->av.src_path_bits = wc->dlid_path_bits; + dest->av.grh.dgid = ((struct ibv_grh *) (uintptr_t) wc->wr_id)->sgid; + + dest->mgid = ep->mc_dest[index].mgid; + dest->path.sgid = ep->mc_dest[index].path.sgid; + dest->path.dgid = dest->av.grh.dgid; + dest->path.tclass = ep->mc_dest[index].path.tclass; + dest->path.pkey = ep->mc_dest[index].path.pkey; + dest->remote_qpn = wc->src_qp; + + dest->state = ACMP_ADDR_RESOLVED; + return ACM_STATUS_SUCCESS; +} + +static void +acmp_record_path_addr(struct acmp_ep *ep, struct acmp_dest *dest, + struct ibv_path_record *path) +{ + acm_log(2, "%s\n", dest->name); + dest->path.pkey = htobe16(ep->pkey); + dest->path.dgid = path->dgid; + if 
(path->slid) { + dest->path.slid = path->slid; + } else { + dest->path.slid = htobe16(ep->port->lid); + } + if (!ib_any_gid(&path->sgid)) { + dest->path.sgid = path->sgid; + } else { + dest->path.sgid = ep->mc_dest[0].path.sgid; + } + dest->path.dlid = path->dlid; + dest->state = ACMP_ADDR_RESOLVED; +} + +static uint8_t acmp_validate_addr_req(struct acm_mad *mad) +{ + struct acm_resolve_rec *rec; + + if (mad->method != IB_METHOD_GET) { + acm_log(0, "ERROR - invalid method 0x%x\n", mad->method); + return ACM_STATUS_EINVAL; + } + + rec = (struct acm_resolve_rec *) mad->data; + if (!rec->src_type || rec->src_type >= ACM_ADDRESS_RESERVED) { + acm_log(0, "ERROR - unknown src type 0x%x\n", rec->src_type); + return ACM_STATUS_EINVAL; + } + + return ACM_STATUS_SUCCESS; +} + +static void +acmp_send_addr_resp(struct acmp_ep *ep, struct acmp_dest *dest) +{ + struct acm_resolve_rec *rec; + struct acmp_send_msg *msg; + struct acm_mad *mad; + + acm_log(2, "%s\n", dest->name); + msg = acmp_alloc_send(ep, dest, sizeof (*mad)); + if (!msg) { + acm_log(0, "ERROR - failed to allocate message\n"); + return; + } + + mad = (struct acm_mad *) msg->data; + rec = (struct acm_resolve_rec *) mad->data; + + mad->base_version = 1; + mad->mgmt_class = ACM_MGMT_CLASS; + mad->class_version = 1; + mad->method = IB_METHOD_GET | IB_METHOD_RESP; + mad->status = ACM_STATUS_SUCCESS; + mad->control = ACM_CTRL_RESOLVE; + mad->tid = dest->req_id; + rec->gid_cnt = 1; + memcpy(rec->gid, dest->mgid.raw, sizeof(union ibv_gid)); + + acmp_post_send(&ep->resp_queue, msg); +} + +static int +acmp_resolve_response(uint64_t id, struct acm_msg *req_msg, + struct acmp_dest *dest, uint8_t status) +{ + struct acm_msg msg; + + acm_log(2, "client %" PRIu64 ", status 0x%x\n", id, status); + memset(&msg, 0, sizeof msg); + + if (dest) { + if (status == ACM_STATUS_ENODATA) + atomic_inc(&dest->ep->counters[ACM_CNTR_NODATA]); + else if (status) + atomic_inc(&dest->ep->counters[ACM_CNTR_ERROR]); + } + msg.hdr = req_msg->hdr; + msg.hdr.status = status; + msg.hdr.length = ACM_MSG_HDR_LENGTH; + memset(msg.hdr.data, 0, sizeof(msg.hdr.data)); + + if (status == ACM_STATUS_SUCCESS) { + msg.hdr.length += ACM_MSG_EP_LENGTH; + msg.resolve_data[0].flags = IBV_PATH_FLAG_GMP | + IBV_PATH_FLAG_PRIMARY | IBV_PATH_FLAG_BIDIRECTIONAL; + msg.resolve_data[0].type = ACM_EP_INFO_PATH; + msg.resolve_data[0].info.path = dest->path; + + if (req_msg->hdr.src_out) { + msg.hdr.length += ACM_MSG_EP_LENGTH; + memcpy(&msg.resolve_data[1], + &req_msg->resolve_data[req_msg->hdr.src_index], + ACM_MSG_EP_LENGTH); + } + } + + return acm_resolve_response(id, &msg); +} + +static void +acmp_complete_queued_req(struct acmp_dest *dest, uint8_t status) +{ + struct acmp_request *req; + + acm_log(2, "status %d\n", status); + pthread_mutex_lock(&dest->lock); + while ((req = list_pop(&dest->req_queue, struct acmp_request, entry))) { + pthread_mutex_unlock(&dest->lock); + + acm_log(2, "completing request, client %" PRIu64 "\n", req->id); + acmp_resolve_response(req->id, &req->msg, dest, status); + acmp_free_req(req); + + pthread_mutex_lock(&dest->lock); + } + pthread_mutex_unlock(&dest->lock); +} + +static void +acmp_dest_sa_resp(struct acm_sa_mad *mad) +{ + struct acmp_dest *dest = (struct acmp_dest *) mad->context; + struct ib_sa_mad *sa_mad = (struct ib_sa_mad *) &mad->sa_mad; + uint8_t status; + + if (!mad->umad.status) { + status = (uint8_t) (be16toh(sa_mad->status) >> 8); + } else { + status = ACM_STATUS_ETIMEDOUT; + } + acm_log(2, "%s status=0x%x\n", dest->name, status); + + 
pthread_mutex_lock(&dest->lock); + if (dest->state != ACMP_QUERY_ROUTE) { + acm_log(1, "notice - discarding SA response\n"); + pthread_mutex_unlock(&dest->lock); + goto out; + } + + if (!status) { + memcpy(&dest->path, sa_mad->data, sizeof(dest->path)); + acmp_init_path_av(dest->ep->port, dest); + dest->addr_timeout = time_stamp_min() + (unsigned) addr_timeout; + dest->route_timeout = time_stamp_min() + (unsigned) route_timeout; + acm_log(2, "timeout addr %" PRIu64 " route %" PRIu64 "\n", + dest->addr_timeout, dest->route_timeout); + dest->state = ACMP_READY; + } else { + dest->state = ACMP_INIT; + } + pthread_mutex_unlock(&dest->lock); + + acmp_complete_queued_req(dest, status); +out: + acm_free_sa_mad(mad); +} + +static void +acmp_resolve_sa_resp(struct acm_sa_mad *mad) +{ + struct acmp_dest *dest = (struct acmp_dest *) mad->context; + int send_resp; + + acm_log(2, "\n"); + acmp_dest_sa_resp(mad); + + pthread_mutex_lock(&dest->lock); + send_resp = (dest->state == ACMP_READY); + pthread_mutex_unlock(&dest->lock); + + if (send_resp) + acmp_send_addr_resp(dest->ep, dest); +} + +static struct acmp_addr * +acmp_addr_lookup(struct acmp_ep *ep, uint8_t *addr, uint16_t type) +{ + struct acmp_addr *ret = NULL; + int i; + + pthread_rwlock_rdlock(&ep->rwlock); + for (i = 0; i < ep->nmbr_ep_addrs; i++) { + if (ep->addr_info[i].type != type) + continue; + + if ((type == ACM_ADDRESS_NAME && + !strncasecmp((char *) ep->addr_info[i].info.name, + (char *) addr, ACM_MAX_ADDRESS)) || + !memcmp(ep->addr_info[i].info.addr, addr, + ACM_MAX_ADDRESS)) { + ret = ep->addr_info + i; + break; + } + } + pthread_rwlock_unlock(&ep->rwlock); + + return ret; +} + +static void +acmp_process_addr_req(struct acmp_ep *ep, struct ibv_wc *wc, struct acm_mad *mad) +{ + struct acm_resolve_rec *rec; + struct acmp_dest *dest; + uint8_t status; + struct acmp_addr *addr; + + acm_log(2, "\n"); + if ((status = acmp_validate_addr_req(mad))) { + acm_log(0, "ERROR - invalid request\n"); + return; + } + + rec = (struct acm_resolve_rec *) mad->data; + dest = acmp_acquire_dest(ep, rec->src_type, rec->src); + if (!dest) { + acm_log(0, "ERROR - unable to add source\n"); + return; + } + + addr = acmp_addr_lookup(ep, rec->dest, rec->dest_type); + if (addr) + dest->req_id = mad->tid; + + pthread_mutex_lock(&dest->lock); + acm_log(2, "dest state %d\n", dest->state); + switch (dest->state) { + case ACMP_READY: + if (dest->remote_qpn == wc->src_qp) + break; + + acm_log(2, "src service has new qp, resetting\n"); + /* fall through */ + case ACMP_INIT: + case ACMP_QUERY_ADDR: + status = acmp_record_acm_addr(ep, dest, wc, rec); + if (status) + break; + /* fall through */ + case ACMP_ADDR_RESOLVED: + if (route_prot == ACMP_ROUTE_PROT_ACM) { + status = acmp_record_acm_route(ep, dest); + break; + } + if (addr || !list_empty(&dest->req_queue)) { + status = acmp_resolve_path_sa(ep, dest, acmp_resolve_sa_resp); + if (status) + break; + } + /* fall through */ + default: + pthread_mutex_unlock(&dest->lock); + acmp_put_dest(dest); + return; + } + pthread_mutex_unlock(&dest->lock); + acmp_complete_queued_req(dest, status); + + if (addr && !status) { + acmp_send_addr_resp(ep, dest); + } + acmp_put_dest(dest); +} + +static void +acmp_process_addr_resp(struct acmp_send_msg *msg, struct ibv_wc *wc, struct acm_mad *mad) +{ + struct acm_resolve_rec *resp_rec; + struct acmp_dest *dest = (struct acmp_dest *) msg->context; + uint8_t status; + + if (mad) { + status = acm_class_status(mad->status); + resp_rec = (struct acm_resolve_rec *) mad->data; + } else { + status = 
ACM_STATUS_ETIMEDOUT; + resp_rec = NULL; + } + acm_log(2, "resp status 0x%x\n", status); + + pthread_mutex_lock(&dest->lock); + if (dest->state != ACMP_QUERY_ADDR) { + pthread_mutex_unlock(&dest->lock); + goto put; + } + + if (!status) { + status = acmp_record_acm_addr(msg->ep, dest, wc, resp_rec); + if (!status) { + if (route_prot == ACMP_ROUTE_PROT_ACM) { + status = acmp_record_acm_route(msg->ep, dest); + } else { + status = acmp_resolve_path_sa(msg->ep, dest, acmp_dest_sa_resp); + if (!status) { + pthread_mutex_unlock(&dest->lock); + goto put; + } + } + } + } else { + dest->state = ACMP_INIT; + } + pthread_mutex_unlock(&dest->lock); + + acmp_complete_queued_req(dest, status); +put: + acmp_put_dest(dest); +} + +static void acmp_process_acm_recv(struct acmp_ep *ep, struct ibv_wc *wc, struct acm_mad *mad) +{ + struct acmp_send_msg *req; + struct acm_resolve_rec *rec; + int free; + + acm_log(2, "\n"); + if (mad->base_version != 1 || mad->class_version != 1) { + acm_log(0, "ERROR - invalid version %d %d\n", + mad->base_version, mad->class_version); + return; + } + + if (mad->control != ACM_CTRL_RESOLVE) { + acm_log(0, "ERROR - invalid control 0x%x\n", mad->control); + return; + } + + rec = (struct acm_resolve_rec *) mad->data; + acm_format_name(2, log_data, sizeof log_data, + rec->src_type, rec->src, sizeof rec->src); + acm_log(2, "src %s\n", log_data); + acm_format_name(2, log_data, sizeof log_data, + rec->dest_type, rec->dest, sizeof rec->dest); + acm_log(2, "dest %s\n", log_data); + if (mad->method & IB_METHOD_RESP) { + acm_log(2, "received response\n"); + req = acmp_get_request(ep, mad->tid, &free); + if (!req) { + acm_log(1, "notice - response did not match active request\n"); + return; + } + acm_log(2, "found matching request\n"); + req->resp_handler(req, wc, mad); + if (free) + acmp_free_send(req); + } else { + acm_log(2, "unsolicited request\n"); + acmp_process_addr_req(ep, wc, mad); + } +} + +static void +acmp_sa_resp(struct acm_sa_mad *mad) +{ + struct acmp_request *req = (struct acmp_request *) mad->context; + struct ib_sa_mad *sa_mad = (struct ib_sa_mad *) &mad->sa_mad; + + req->msg.hdr.opcode |= ACM_OP_ACK; + if (!mad->umad.status) { + req->msg.hdr.status = (uint8_t) (be16toh(sa_mad->status) >> 8); + memcpy(&req->msg.resolve_data[0].info.path, sa_mad->data, + sizeof(struct ibv_path_record)); + } else { + req->msg.hdr.status = ACM_STATUS_ETIMEDOUT; + } + acm_log(2, "status 0x%x\n", req->msg.hdr.status); + + if (req->msg.hdr.status) + atomic_inc(&req->ep->counters[ACM_CNTR_ERROR]); + acm_query_response(req->id, &req->msg); + acm_free_sa_mad(mad); + acmp_free_req(req); +} + +static void acmp_process_sa_recv(struct acmp_ep *ep, struct ibv_wc *wc, struct acm_mad *mad) +{ + struct ib_sa_mad *sa_mad = (struct ib_sa_mad *) mad; + struct acmp_send_msg *req; + int free; + + acm_log(2, "\n"); + if (mad->base_version != 1 || mad->class_version != 2 || + !(mad->method & IB_METHOD_RESP) || sa_mad->attr_id != IB_SA_ATTR_PATH_REC) { + acm_log(0, "ERROR - unexpected SA MAD %d %d\n", + mad->base_version, mad->class_version); + return; + } + + req = acmp_get_request(ep, mad->tid, &free); + if (!req) { + acm_log(1, "notice - response did not match active request\n"); + return; + } + acm_log(2, "found matching request\n"); + req->resp_handler(req, wc, mad); + if (free) + acmp_free_send(req); +} + +static void acmp_process_recv(struct acmp_ep *ep, struct ibv_wc *wc) +{ + struct acm_mad *mad; + + acm_log(2, "base endpoint name %s\n", ep->id_string); + mad = (struct acm_mad *) (uintptr_t) (wc->wr_id + 
sizeof(struct ibv_grh)); + switch (mad->mgmt_class) { + case IB_MGMT_CLASS_SA: + acmp_process_sa_recv(ep, wc, mad); + break; + case ACM_MGMT_CLASS: + acmp_process_acm_recv(ep, wc, mad); + break; + default: + acm_log(0, "ERROR - invalid mgmt class 0x%x\n", mad->mgmt_class); + break; + } + + acmp_post_recv(ep, wc->wr_id); +} + +static void acmp_process_comp(struct acmp_ep *ep, struct ibv_wc *wc) +{ + if (wc->status) { + acm_log(0, "ERROR - work completion error\n" + "\topcode %d, completion status %d\n", + wc->opcode, wc->status); + return; + } + + if (wc->opcode & IBV_WC_RECV) + acmp_process_recv(ep, wc); + else + acmp_complete_send((struct acmp_send_msg *) (uintptr_t) wc->wr_id); +} + +static void *acmp_comp_handler(void *context) +{ + struct acmp_device *dev = (struct acmp_device *) context; + struct acmp_ep *ep; + struct ibv_cq *cq; + struct ibv_wc wc; + int cnt; + + acm_log(1, "started\n"); + + if (pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL)) { + acm_log(0, "Error: failed to set cancel type for dev %s\n", + dev->verbs->device->name); + pthread_exit(NULL); + } + + if (pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL)) { + acm_log(0, "Error: failed to set cancel state for dev %s\n", + dev->verbs->device->name); + pthread_exit(NULL); + } + while (1) { + pthread_testcancel(); + ibv_get_cq_event(dev->channel, &cq, (void *) &ep); + + cnt = 0; + while (ibv_poll_cq(cq, 1, &wc) > 0) { + cnt++; + acmp_process_comp(ep, &wc); + } + + ibv_req_notify_cq(cq, 0); + while (ibv_poll_cq(cq, 1, &wc) > 0) { + cnt++; + acmp_process_comp(ep, &wc); + } + + ibv_ack_cq_events(cq, cnt); + } + + return NULL; +} + +static void acmp_format_mgid(union ibv_gid *mgid, uint16_t pkey, uint8_t tos, + uint8_t rate, uint8_t mtu) +{ + mgid->raw[0] = 0xFF; + mgid->raw[1] = 0x10 | 0x05; + mgid->raw[2] = 0x40; + mgid->raw[3] = 0x01; + mgid->raw[4] = (uint8_t) (pkey >> 8); + mgid->raw[5] = (uint8_t) pkey; + mgid->raw[6] = tos; + mgid->raw[7] = rate; + mgid->raw[8] = mtu; + mgid->raw[9] = 0; + mgid->raw[10] = 0; + mgid->raw[11] = 0; + mgid->raw[12] = 0; + mgid->raw[13] = 0; + mgid->raw[14] = 0; + mgid->raw[15] = 0; +} + +static void acmp_init_join(struct ib_sa_mad *mad, union ibv_gid *port_gid, + uint16_t pkey, uint8_t tos, uint8_t tclass, uint8_t sl, uint8_t rate, uint8_t mtu) +{ + struct ib_mc_member_rec *mc_rec; + + acm_log(2, "\n"); + mad->base_version = 1; + mad->mgmt_class = IB_MGMT_CLASS_SA; + mad->class_version = 2; + mad->method = IB_METHOD_SET; + mad->tid = htobe64((uint64_t) atomic_inc(&g_tid)); + mad->attr_id = IB_SA_ATTR_MC_MEMBER_REC; + mad->comp_mask = + IB_COMP_MASK_MC_MGID | IB_COMP_MASK_MC_PORT_GID | + IB_COMP_MASK_MC_QKEY | IB_COMP_MASK_MC_MTU_SEL| IB_COMP_MASK_MC_MTU | + IB_COMP_MASK_MC_TCLASS | IB_COMP_MASK_MC_PKEY | IB_COMP_MASK_MC_RATE_SEL | + IB_COMP_MASK_MC_RATE | IB_COMP_MASK_MC_SL | IB_COMP_MASK_MC_FLOW | + IB_COMP_MASK_MC_SCOPE | IB_COMP_MASK_MC_JOIN_STATE; + + mc_rec = (struct ib_mc_member_rec *) mad->data; + acmp_format_mgid(&mc_rec->mgid, pkey | IB_PKEY_FULL_MEMBER, tos, rate, mtu); + mc_rec->port_gid = *port_gid; + mc_rec->qkey = htobe32(ACM_QKEY); + mc_rec->mtu = umad_sa_set_rate_mtu_or_life(UMAD_SA_SELECTOR_EXACTLY, mtu); + mc_rec->tclass = tclass; + mc_rec->pkey = htobe16(pkey); + mc_rec->rate = umad_sa_set_rate_mtu_or_life(UMAD_SA_SELECTOR_EXACTLY, rate); + mc_rec->sl_flow_hop = umad_sa_mcm_set_sl_flow_hop(sl, 0, 0); + mc_rec->scope_state = umad_sa_mcm_set_scope_state(UMAD_SA_MCM_ADDR_SCOPE_SITE_LOCAL, + UMAD_SA_MCM_JOIN_STATE_FULL_MEMBER); +} + +static void acmp_join_group(struct 
acmp_ep *ep, union ibv_gid *port_gid, + uint8_t tos, uint8_t tclass, uint8_t sl, uint8_t rate, uint8_t mtu) +{ + struct ib_sa_mad *mad; + struct ib_mc_member_rec *mc_rec; + struct acm_sa_mad *sa_mad; + + acm_log(2, "\n"); + sa_mad = acm_alloc_sa_mad(ep->endpoint, ep, acmp_process_join_resp); + if (!sa_mad) { + acm_log(0, "Error - failed to allocate sa_mad\n"); + return; + } + + acm_log(0, "%s %d pkey 0x%x, sl 0x%x, rate 0x%x, mtu 0x%x\n", + ep->port->dev->verbs->device->name, + ep->port->port_num, ep->pkey, sl, rate, mtu); + mad = (struct ib_sa_mad *) &sa_mad->sa_mad; + acmp_init_join(mad, port_gid, ep->pkey, tos, tclass, sl, rate, mtu); + mc_rec = (struct ib_mc_member_rec *) mad->data; + acmp_set_dest_addr(&ep->mc_dest[ep->mc_cnt++], ACM_ADDRESS_GID, + mc_rec->mgid.raw, sizeof(mc_rec->mgid)); + ep->mc_dest[ep->mc_cnt - 1].state = ACMP_INIT; + + if (acm_send_sa_mad(sa_mad)) { + acm_log(0, "Error - Failed to send sa mad\n"); + acm_free_sa_mad(sa_mad); + } +} + +static void acmp_ep_join(struct acmp_ep *ep) +{ + struct acmp_port *port; + union ibv_gid gid; + + port = ep->port; + acm_log(1, "%s\n", ep->id_string); + + if (ep->mc_dest[0].state == ACMP_READY && ep->mc_dest[0].ah) { + ibv_detach_mcast(ep->qp, &ep->mc_dest[0].mgid, + ep->mc_dest[0].av.dlid); + ibv_destroy_ah(ep->mc_dest[0].ah); + ep->mc_dest[0].ah = NULL; + } + ep->mc_cnt = 0; + ep->state = ACMP_INIT; + acm_get_gid((struct acm_port *)ep->port->port, 0, &gid); + acmp_join_group(ep, &gid, 0, 0, 0, min_rate, min_mtu); + + if ((route_prot == ACMP_ROUTE_PROT_ACM) && + (port->rate != min_rate || port->mtu != min_mtu)) + acmp_join_group(ep, &gid, 0, 0, 0, port->rate, port->mtu); + + acm_log(1, "join for %s complete\n", ep->id_string); +} + +static int acmp_port_join(void *port_context) +{ + struct acmp_ep *ep; + struct acmp_port *port = port_context; + + acm_log(1, "device %s port %d\n", port->dev->verbs->device->name, + port->port_num); + + list_for_each(&port->ep_list, ep, entry) { + if (!ep->endpoint) { + /* Stale endpoint */ + continue; + } + acmp_ep_join(ep); + } + acm_log(1, "joins for device %s port %d complete\n", + port->dev->verbs->device->name, port->port_num); + + return 0; +} + +static int acmp_handle_event(void *port_context, enum ibv_event_type type) +{ + int ret = 0; + + acm_log(2, "event %s\n", ibv_event_type_str(type)); + + switch (type) { + case IBV_EVENT_CLIENT_REREGISTER: + ret = acmp_port_join(port_context); + break; + default: + break; + } + return ret; +} + +static void acmp_process_timeouts(void) +{ + struct acmp_send_msg *msg; + struct acm_resolve_rec *rec; + struct acm_mad *mad; + + while ((msg = list_pop(&timeout_list, struct acmp_send_msg, entry))) { + mad = (struct acm_mad *) &msg->data[0]; + rec = (struct acm_resolve_rec *) mad->data; + + acm_format_name(0, log_data, sizeof log_data, + rec->dest_type, rec->dest, sizeof rec->dest); + acm_log(0, "notice - dest %s\n", log_data); + + msg->resp_handler(msg, NULL, NULL); + acmp_free_send(msg); + } +} + +static void acmp_process_wait_queue(struct acmp_ep *ep, uint64_t *next_expire) +{ + struct acmp_send_msg *msg, *next; + struct ibv_send_wr *bad_wr; + + list_for_each_safe(&ep->wait_queue, msg, next, entry) { + if (msg->expires <= time_stamp_ms()) { + list_del(&msg->entry); + (void) atomic_dec(&wait_cnt); + if (--msg->tries) { + acm_log(1, "notice - retrying request\n"); + list_add_tail(&ep->active_queue, &msg->entry); + ibv_post_send(ep->qp, &msg->wr, &bad_wr); + } else { + acm_log(0, "notice - failing request\n"); + acmp_send_available(ep, msg->req_queue); + 
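+				/* Defer cleanup: the retry thread pops this
+				 * message off timeout_list in
+				 * acmp_process_timeouts(), invokes its
+				 * resp_handler with NULL and then frees it. */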
list_add_tail(&timeout_list, &msg->entry); + } + } else { + *next_expire = min(*next_expire, msg->expires); + break; + } + } +} + +/* While the device/port/ep will not be freed, we need to be careful of + * their addition while walking the link lists. Therefore, we need to acquire + * the appropriate locks. + */ +static void *acmp_retry_handler(void *context) +{ + struct acmp_device *dev; + struct acmp_port *port; + struct acmp_ep *ep; + uint64_t next_expire; + int i, wait; + + acm_log(0, "started\n"); + if (pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL)) { + acm_log(0, "Error: failed to set cancel type \n"); + pthread_exit(NULL); + } + if (pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL)) { + acm_log(0, "Error: failed to set cancel state\n"); + pthread_exit(NULL); + } + retry_thread_started = 1; + + while (1) { + while (!atomic_get(&wait_cnt)) { + pthread_testcancel(); + event_wait(&timeout_event, -1); + } + + next_expire = -1; + pthread_mutex_lock(&acmp_dev_lock); + list_for_each(&acmp_dev_list, dev, entry) { + pthread_mutex_unlock(&acmp_dev_lock); + + for (i = 0; i < dev->port_cnt; i++) { + port = &dev->port[i]; + + pthread_mutex_lock(&port->lock); + list_for_each(&port->ep_list, ep, entry) { + pthread_mutex_unlock(&port->lock); + pthread_mutex_lock(&ep->lock); + if (!list_empty(&ep->wait_queue)) + acmp_process_wait_queue(ep, &next_expire); + pthread_mutex_unlock(&ep->lock); + pthread_mutex_lock(&port->lock); + } + pthread_mutex_unlock(&port->lock); + } + pthread_mutex_lock(&acmp_dev_lock); + } + pthread_mutex_unlock(&acmp_dev_lock); + + acmp_process_timeouts(); + if (next_expire != -1) { + wait = (int) (next_expire - time_stamp_ms()); + if (wait > 0 && atomic_get(&wait_cnt)) { + pthread_testcancel(); + event_wait(&timeout_event, wait); + } + } + } + + retry_thread_started = 0; + return NULL; +} + +/* rwlock must be held read-locked */ +static int +__acmp_query(struct acmp_ep *ep, struct acm_msg *msg, uint64_t id) +{ + struct acmp_request *req; + struct ib_sa_mad *mad; + uint8_t status; + struct acm_sa_mad *sa_mad; + + if (ep->state != ACMP_READY) { + status = ACM_STATUS_ENODATA; + goto resp; + } + + req = acmp_alloc_req(id, msg); + if (!req) { + status = ACM_STATUS_ENOMEM; + goto resp; + } + req->ep = ep; + + sa_mad = acm_alloc_sa_mad(ep->endpoint, req, acmp_sa_resp); + if (!sa_mad) { + acm_log(0, "Error - failed to allocate sa_mad\n"); + status = ACM_STATUS_ENOMEM; + goto free_req; + } + + mad = (struct ib_sa_mad *) &sa_mad->sa_mad; + acmp_init_path_query(mad); + + memcpy(mad->data, &msg->resolve_data[0].info.path, + sizeof(struct ibv_path_record)); + mad->comp_mask = acm_path_comp_mask(&msg->resolve_data[0].info.path); + + acm_increment_counter(ACM_CNTR_ROUTE_QUERY); + atomic_inc(&ep->counters[ACM_CNTR_ROUTE_QUERY]); + if (acm_send_sa_mad(sa_mad)) { + acm_log(0, "Error - Failed to send sa mad\n"); + status = ACM_STATUS_ENODATA; + goto free_mad; + } + return ACM_STATUS_SUCCESS; + +free_mad: + acm_free_sa_mad(sa_mad); +free_req: + acmp_free_req(req); +resp: + msg->hdr.opcode |= ACM_OP_ACK; + msg->hdr.status = status; + if (status == ACM_STATUS_ENODATA) + atomic_inc(&ep->counters[ACM_CNTR_NODATA]); + else + atomic_inc(&ep->counters[ACM_CNTR_ERROR]); + return acm_query_response(id, msg); +} + +static int +acmp_query(void *addr_context, struct acm_msg *msg, uint64_t id) +{ + struct acmp_addr_ctx *addr_ctx = addr_context; + struct acmp_addr *address; + int ret; + + pthread_rwlock_rdlock(&addr_ctx->ep->rwlock); + address = addr_ctx->ep->addr_info + addr_ctx->addr_inx; + ret = 
__acmp_query(address->ep, msg, id); + pthread_rwlock_unlock(&addr_ctx->ep->rwlock); + + return ret; +} + +static uint8_t +acmp_send_resolve(struct acmp_ep *ep, struct acmp_dest *dest, + struct acm_ep_addr_data *saddr) +{ + struct acmp_send_msg *msg; + struct acm_mad *mad; + struct acm_resolve_rec *rec; + int i; + + acm_log(2, "\n"); + msg = acmp_alloc_send(ep, &ep->mc_dest[0], sizeof(*mad)); + if (!msg) { + acm_log(0, "ERROR - cannot allocate send msg\n"); + return ACM_STATUS_ENOMEM; + } + + acmp_init_send_req(msg, (void *) dest, acmp_process_addr_resp); + (void) atomic_inc(&dest->refcnt); + + mad = (struct acm_mad *) msg->data; + mad->base_version = 1; + mad->mgmt_class = ACM_MGMT_CLASS; + mad->class_version = 1; + mad->method = IB_METHOD_GET; + mad->control = ACM_CTRL_RESOLVE; + mad->tid = htobe64((uint64_t) atomic_inc(&g_tid)); + + rec = (struct acm_resolve_rec *) mad->data; + rec->src_type = (uint8_t) saddr->type; + rec->src_length = ACM_MAX_ADDRESS; + memcpy(rec->src, saddr->info.addr, ACM_MAX_ADDRESS); + rec->dest_type = dest->addr_type; + rec->dest_length = ACM_MAX_ADDRESS; + memcpy(rec->dest, dest->address, ACM_MAX_ADDRESS); + + rec->gid_cnt = (uint8_t) ep->mc_cnt; + for (i = 0; i < ep->mc_cnt; i++) + memcpy(&rec->gid[i], ep->mc_dest[i].address, 16); + + acm_increment_counter(ACM_CNTR_ADDR_QUERY); + atomic_inc(&ep->counters[ACM_CNTR_ADDR_QUERY]); + acmp_post_send(&ep->resolve_queue, msg); + return 0; +} + +/* Caller must hold dest lock */ +static uint8_t acmp_queue_req(struct acmp_dest *dest, uint64_t id, struct acm_msg *msg) +{ + struct acmp_request *req; + + acm_log(2, "id %" PRIu64 "\n", id); + req = acmp_alloc_req(id, msg); + if (!req) { + return ACM_STATUS_ENOMEM; + } + req->ep = dest->ep; + + list_add_tail(&dest->req_queue, &req->entry); + return ACM_STATUS_SUCCESS; +} + +static int acmp_dest_timeout(struct acmp_dest *dest) +{ + uint64_t timestamp = time_stamp_min(); + + if (timestamp > dest->addr_timeout) { + acm_log(2, "%s address timed out\n", dest->name); + dest->state = ACMP_INIT; + return 1; + } else if (timestamp > dest->route_timeout) { + acm_log(2, "%s route timed out\n", dest->name); + dest->state = ACMP_ADDR_RESOLVED; + return 1; + } + return 0; +} + +static int +acmp_check_addr_match(struct ifaddrs *iap, struct acm_ep_addr_data *saddr, + unsigned int d_family) +{ + char sip[INET6_ADDRSTRLEN] = {0}; + char dip[INET6_ADDRSTRLEN] = {0}; + const char *tmp; + size_t sock_size; + unsigned int s_family; + int ret; + + s_family = iap->ifa_addr->sa_family; + + if (!(iap->ifa_flags & IFF_UP) || + (s_family != d_family)) + return -1; + + sock_size = (s_family == AF_INET) ? sizeof(struct sockaddr_in) : + sizeof(struct sockaddr_in6); + + ret = getnameinfo(iap->ifa_addr, sock_size, + sip, sizeof(sip), + NULL, 0, NI_NUMERICHOST); + + if (ret) + return ret; + + tmp = inet_ntop(d_family, (void *)saddr->info.addr, dip, + sizeof(dip)); + if (!tmp) + return -1; + ret = memcmp(sip, dip, strlen(dip)); + return ret; +} + +static void +acmp_acquire_sgid(struct acm_ep_addr_data *saddr, + struct acmp_dest *dest) +{ + struct ifaddrs *addrs, *iap; + unsigned int d_family; + int ret; + + if (!ib_any_gid(&dest->path.sgid)) + return; + + if (dest->addr_type != ACM_ADDRESS_IP6 && + dest->addr_type != ACM_ADDRESS_IP) + return; + + if (getifaddrs(&addrs)) + return; + + d_family = (dest->addr_type == ACM_ADDRESS_IP) ? 
AF_INET : AF_INET6; + + for (iap = addrs; iap != NULL; iap = iap->ifa_next) { + ret = acmp_check_addr_match(iap, saddr, d_family); + if (!ret) { + ret = acm_if_get_sgid(iap->ifa_name, + &dest->path.sgid); + if (!ret) + break; + } + } + freeifaddrs(addrs); +} + +static int +acmp_resolve_dest(struct acmp_ep *ep, struct acm_msg *msg, uint64_t id) +{ + struct acmp_dest *dest; + struct acm_ep_addr_data *saddr, *daddr; + uint8_t status; + int ret; + + saddr = &msg->resolve_data[msg->hdr.src_index]; + daddr = &msg->resolve_data[msg->hdr.dst_index]; + acm_format_name(2, log_data, sizeof log_data, + daddr->type, daddr->info.addr, sizeof daddr->info.addr); + acm_log(2, "dest %s\n", log_data); + + dest = acmp_acquire_dest(ep, daddr->type, daddr->info.addr); + if (!dest) { + acm_log(0, "ERROR - unable to allocate destination in request\n"); + atomic_inc(&ep->counters[ACM_CNTR_ERROR]); + return acmp_resolve_response(id, msg, NULL, ACM_STATUS_ENOMEM); + } + + pthread_mutex_lock(&dest->lock); +test: + switch (dest->state) { + case ACMP_READY: + if (acmp_dest_timeout(dest)) + goto test; + acm_log(2, "request satisfied from local cache\n"); + acm_increment_counter(ACM_CNTR_ROUTE_CACHE); + atomic_inc(&ep->counters[ACM_CNTR_ROUTE_CACHE]); + status = ACM_STATUS_SUCCESS; + break; + case ACMP_ADDR_RESOLVED: + acm_log(2, "have address, resolving route\n"); + acm_increment_counter(ACM_CNTR_ADDR_CACHE); + atomic_inc(&ep->counters[ACM_CNTR_ADDR_CACHE]); + acmp_acquire_sgid(saddr, dest); + status = acmp_resolve_path_sa(ep, dest, acmp_dest_sa_resp); + if (status) { + break; + } + goto queue; + case ACMP_INIT: + acm_log(2, "sending resolve msg to dest\n"); + status = acmp_send_resolve(ep, dest, saddr); + if (status) { + break; + } + dest->state = ACMP_QUERY_ADDR; + /* fall through */ + default: +queue: + if (daddr->flags & ACM_FLAGS_NODELAY) { + acm_log(2, "lookup initiated, but client wants no delay\n"); + status = ACM_STATUS_ENODATA; + break; + } + status = acmp_queue_req(dest, id, msg); + if (status) { + break; + } + ret = 0; + pthread_mutex_unlock(&dest->lock); + goto put; + } + pthread_mutex_unlock(&dest->lock); + ret = acmp_resolve_response(id, msg, dest, status); +put: + acmp_put_dest(dest); + return ret; +} + +static int +acmp_resolve_path(struct acmp_ep *ep, struct acm_msg *msg, uint64_t id) +{ + struct acmp_dest *dest; + struct ibv_path_record *path; + uint8_t *addr; + uint8_t status; + int ret; + + path = &msg->resolve_data[0].info.path; + addr = msg->resolve_data[1].info.addr; + memset(addr, 0, ACM_MAX_ADDRESS); + if (path->dlid) { + * ((__be16 *) addr) = path->dlid; + dest = acmp_acquire_dest(ep, ACM_ADDRESS_LID, addr); + } else { + memcpy(addr, &path->dgid, sizeof path->dgid); + dest = acmp_acquire_dest(ep, ACM_ADDRESS_GID, addr); + } + if (!dest) { + acm_log(0, "ERROR - unable to allocate destination in request\n"); + atomic_inc(&ep->counters[ACM_CNTR_ERROR]); + return acmp_resolve_response(id, msg, NULL, ACM_STATUS_ENOMEM); + } + + pthread_mutex_lock(&dest->lock); +test: + switch (dest->state) { + case ACMP_READY: + if (acmp_dest_timeout(dest)) + goto test; + acm_log(2, "request satisfied from local cache\n"); + acm_increment_counter(ACM_CNTR_ROUTE_CACHE); + atomic_inc(&ep->counters[ACM_CNTR_ROUTE_CACHE]); + status = ACM_STATUS_SUCCESS; + break; + case ACMP_INIT: + acm_log(2, "have path, bypassing address resolution\n"); + acmp_record_path_addr(ep, dest, path); + /* fall through */ + case ACMP_ADDR_RESOLVED: + acm_log(2, "have address, resolving route\n"); + status = acmp_resolve_path_sa(ep, dest, 
acmp_dest_sa_resp); + if (status) { + break; + } + /* fall through */ + default: + if (msg->resolve_data[0].flags & ACM_FLAGS_NODELAY) { + acm_log(2, "lookup initiated, but client wants no delay\n"); + status = ACM_STATUS_ENODATA; + break; + } + status = acmp_queue_req(dest, id, msg); + if (status) { + break; + } + ret = 0; + pthread_mutex_unlock(&dest->lock); + goto put; + } + pthread_mutex_unlock(&dest->lock); + ret = acmp_resolve_response(id, msg, dest, status); +put: + acmp_put_dest(dest); + return ret; +} + +static int +acmp_resolve(void *addr_context, struct acm_msg *msg, uint64_t id) +{ + struct acmp_addr_ctx *addr_ctx = addr_context; + struct acmp_addr *address = addr_ctx->ep->addr_info + addr_ctx->addr_inx; + struct acmp_ep *ep = address->ep; + + if (ep->state != ACMP_READY) { + atomic_inc(&ep->counters[ACM_CNTR_NODATA]); + return acmp_resolve_response(id, msg, NULL, ACM_STATUS_ENODATA); + } + + atomic_inc(&ep->counters[ACM_CNTR_RESOLVE]); + if (msg->resolve_data[0].type == ACM_EP_INFO_PATH) + return acmp_resolve_path(ep, msg, id); + else + return acmp_resolve_dest(ep, msg, id); +} + +static void acmp_query_perf(void *ep_context, uint64_t *values, uint8_t *cnt) +{ + struct acmp_ep *ep = ep_context; + int i; + + for (i = 0; i < ACM_MAX_COUNTER; i++) + values[i] = htobe64((uint64_t) atomic_get(&ep->counters[i])); + *cnt = ACM_MAX_COUNTER; +} + +static enum acmp_addr_prot acmp_convert_addr_prot(char *param) +{ + if (!strcasecmp("acm", param)) + return ACMP_ADDR_PROT_ACM; + + return addr_prot; +} + +static enum acmp_route_prot acmp_convert_route_prot(char *param) +{ + if (!strcasecmp("acm", param)) + return ACMP_ROUTE_PROT_ACM; + else if (!strcasecmp("sa", param)) + return ACMP_ROUTE_PROT_SA; + + return route_prot; +} + +static enum acmp_loopback_prot acmp_convert_loopback_prot(char *param) +{ + if (!strcasecmp("none", param)) + return ACMP_LOOPBACK_PROT_NONE; + else if (!strcasecmp("local", param)) + return ACMP_LOOPBACK_PROT_LOCAL; + + return loopback_prot; +} + +static enum acmp_route_preload acmp_convert_route_preload(char *param) +{ + if (!strcasecmp("none", param) || !strcasecmp("no", param)) + return ACMP_ROUTE_PRELOAD_NONE; + else if (!strcasecmp("opensm_full_v1", param)) + return ACMP_ROUTE_PRELOAD_OSM_FULL_V1; + + return route_preload; +} + +static enum acmp_addr_preload acmp_convert_addr_preload(char *param) +{ + if (!strcasecmp("none", param) || !strcasecmp("no", param)) + return ACMP_ADDR_PRELOAD_NONE; + else if (!strcasecmp("acm_hosts", param)) + return ACMP_ADDR_PRELOAD_HOSTS; + + return addr_preload; +} + +static int acmp_post_recvs(struct acmp_ep *ep) +{ + int i, size; + + size = recv_depth * ACM_RECV_SIZE; + ep->recv_bufs = malloc(size); + if (!ep->recv_bufs) { + acm_log(0, "ERROR - unable to allocate receive buffer\n"); + return ACM_STATUS_ENOMEM; + } + + ep->mr = ibv_reg_mr(ep->port->dev->pd, ep->recv_bufs, size, + IBV_ACCESS_LOCAL_WRITE); + if (!ep->mr) { + acm_log(0, "ERROR - unable to register receive buffer\n"); + goto err; + } + + for (i = 0; i < recv_depth; i++) { + acmp_post_recv(ep, (uintptr_t) (ep->recv_bufs + ACM_RECV_SIZE * i)); + } + return 0; + +err: + free(ep->recv_bufs); + return -1; +} + +/* Parse "opensm full v1" file to build LID to GUID table */ +static void acmp_parse_osm_fullv1_lid2guid(FILE *f, __be64 *lid2guid) +{ + char s[128]; + char *p, *ptr, *p_guid, *p_lid; + uint64_t guid; + uint16_t lid; + + while (fgets(s, sizeof s, f)) { + if (s[0] == '#') + continue; + if (!(p = strtok_r(s, " \n", &ptr))) + continue; /* ignore blank lines */ + + if 
(strncmp(p, "Switch", sizeof("Switch") - 1) && + strncmp(p, "Channel", sizeof("Channel") - 1) && + strncmp(p, "Router", sizeof("Router") - 1)) + continue; + + if (!strncmp(p, "Channel", sizeof("Channel") - 1)) { + p = strtok_r(NULL, " ", &ptr); /* skip 'Adapter' */ + if (!p) + continue; + } + + p_guid = strtok_r(NULL, ",", &ptr); + if (!p_guid) + continue; + + guid = (uint64_t) strtoull(p_guid, NULL, 16); + + ptr = strstr(ptr, "base LID"); + if (!ptr) + continue; + ptr += sizeof("base LID"); + p_lid = strtok_r(NULL, ",", &ptr); + if (!p_lid) + continue; + + lid = (uint16_t) strtoul(p_lid, NULL, 0); + if (lid >= IB_LID_MCAST_START) + continue; + if (lid2guid[lid]) + acm_log(0, "ERROR - duplicate lid %u\n", lid); + else + lid2guid[lid] = htobe64(guid); + } +} + +/* Parse 'opensm full v1' file to populate PR cache */ +static int acmp_parse_osm_fullv1_paths(FILE *f, __be64 *lid2guid, struct acmp_ep *ep) +{ + union ibv_gid sgid, dgid; + struct ibv_port_attr attr = {}; + struct acmp_dest *dest; + char s[128]; + char *p, *ptr, *p_guid, *p_lid; + uint64_t guid; + uint16_t lid, dlid; + __be16 net_dlid; + int sl, mtu, rate; + int ret = 1, i; + uint8_t addr[ACM_MAX_ADDRESS]; + uint8_t addr_type; + + acm_get_gid((struct acm_port *)ep->port->port, 0, &sgid); + + /* Search for endpoint's SLID */ + while (fgets(s, sizeof s, f)) { + if (s[0] == '#') + continue; + if (!(p = strtok_r(s, " \n", &ptr))) + continue; /* ignore blank lines */ + + if (strncmp(p, "Switch", sizeof("Switch") - 1) && + strncmp(p, "Channel", sizeof("Channel") - 1) && + strncmp(p, "Router", sizeof("Router") - 1)) + continue; + + if (!strncmp(p, "Channel", sizeof("Channel") - 1)) { + p = strtok_r(NULL, " ", &ptr); /* skip 'Adapter' */ + if (!p) + continue; + } + + p_guid = strtok_r(NULL, ",", &ptr); + if (!p_guid) + continue; + + guid = (uint64_t) strtoull(p_guid, NULL, 16); + if (guid != be64toh(sgid.global.interface_id)) + continue; + + ptr = strstr(ptr, "base LID"); + if (!ptr) + continue; + ptr += sizeof("base LID"); + p_lid = strtok_r(NULL, ",", &ptr); + if (!p_lid) + continue; + + lid = (uint16_t) strtoul(p_lid, NULL, 0); + if (lid != ep->port->lid) + continue; + + ibv_query_port(ep->port->dev->verbs, ep->port->port_num, &attr); + ret = 0; + break; + } + + while (fgets(s, sizeof s, f)) { + if (s[0] == '#') + continue; + if (!(p = strtok_r(s, " \n", &ptr))) + continue; /* ignore blank lines */ + + if (!strncmp(p, "Switch", sizeof("Switch") - 1) || + !strncmp(p, "Channel", sizeof("Channel") - 1) || + !strncmp(p, "Router", sizeof("Router") - 1)) + break; + + dlid = strtoul(p, NULL, 0); + net_dlid = htobe16(dlid); + + p = strtok_r(NULL, ":", &ptr); + if (!p) + continue; + if (strcmp(p, "UNREACHABLE") == 0) + continue; + sl = atoi(p); + + p = strtok_r(NULL, ":", &ptr); + if (!p) + continue; + mtu = atoi(p); + + p = strtok_r(NULL, ":", &ptr); + if (!p) + continue; + rate = atoi(p); + + if (!lid2guid[dlid]) { + acm_log(0, "ERROR - dlid %u not found in lid2guid table\n", dlid); + continue; + } + + dgid.global.subnet_prefix = sgid.global.subnet_prefix; + dgid.global.interface_id = lid2guid[dlid]; + + for (i = 0; i < 2; i++) { + memset(addr, 0, ACM_MAX_ADDRESS); + if (i == 0) { + addr_type = ACM_ADDRESS_LID; + memcpy(addr, &net_dlid, sizeof net_dlid); + } else { + addr_type = ACM_ADDRESS_GID; + memcpy(addr, &dgid, sizeof(dgid)); + } + dest = acmp_acquire_dest(ep, addr_type, addr); + if (!dest) { + acm_log(0, "ERROR - unable to create dest\n"); + break; + } + + dest->path.sgid = sgid; + dest->path.slid = htobe16(ep->port->lid); + 
dest->path.dgid = dgid; + dest->path.dlid = net_dlid; + dest->path.reversible_numpath = IBV_PATH_RECORD_REVERSIBLE; + dest->path.pkey = htobe16(ep->pkey); + dest->path.mtu = (uint8_t) mtu; + dest->path.rate = (uint8_t) rate; + dest->path.qosclass_sl = htobe16((uint16_t) sl & 0xF); + if (dlid == ep->port->lid) { + dest->path.packetlifetime = 0; + dest->addr_timeout = (uint64_t)~0ULL; + dest->route_timeout = (uint64_t)~0ULL; + } else { + dest->path.packetlifetime = attr.subnet_timeout; + dest->addr_timeout = time_stamp_min() + (unsigned) addr_timeout; + dest->route_timeout = time_stamp_min() + (unsigned) route_timeout; + } + dest->remote_qpn = 1; + dest->state = ACMP_READY; + acmp_put_dest(dest); + acm_log(1, "added cached dest %s\n", dest->name); + } + } + return ret; +} + +static int acmp_parse_osm_fullv1(struct acmp_ep *ep) +{ + FILE *f; + __be64 *lid2guid; + int ret = 1; + + if (!(f = fopen(route_data_file, "r"))) { + acm_log(0, "ERROR - couldn't open %s\n", route_data_file); + return ret; + } + + lid2guid = calloc(IB_LID_MCAST_START, sizeof(*lid2guid)); + if (!lid2guid) { + acm_log(0, "ERROR - no memory for path record parsing\n"); + goto err; + } + + acmp_parse_osm_fullv1_lid2guid(f, lid2guid); + rewind(f); + ret = acmp_parse_osm_fullv1_paths(f, lid2guid, ep); + free(lid2guid); +err: + fclose(f); + return ret; +} + +static void acmp_parse_hosts_file(struct acmp_ep *ep) +{ + FILE *f; + char s[120]; + char addr[INET6_ADDRSTRLEN], gid[INET6_ADDRSTRLEN]; + uint8_t name[ACM_MAX_ADDRESS]; + struct in6_addr ip_addr, ib_addr; + struct acmp_dest *dest, *gid_dest; + uint8_t addr_type; + + if (!(f = fopen(addr_data_file, "r"))) { + acm_log(0, "ERROR - couldn't open %s\n", addr_data_file); + return; + } + + while (fgets(s, sizeof s, f)) { + if (s[0] == '#') + continue; + + if (sscanf(s, "%46s%46s", addr, gid) != 2) + continue; + + acm_log(2, "%s", s); + if (inet_pton(AF_INET6, gid, &ib_addr) <= 0) { + acm_log(0, "ERROR - %s is not IB GID\n", gid); + continue; + } + memset(name, 0, ACM_MAX_ADDRESS); + if (inet_pton(AF_INET, addr, &ip_addr) > 0) { + addr_type = ACM_ADDRESS_IP; + memcpy(name, &ip_addr, 4); + } else if (inet_pton(AF_INET6, addr, &ip_addr) > 0) { + addr_type = ACM_ADDRESS_IP6; + memcpy(name, &ip_addr, sizeof(ip_addr)); + } else { + addr_type = ACM_ADDRESS_NAME; + strncpy((char *)name, addr, ACM_MAX_ADDRESS); + } + + dest = acmp_acquire_dest(ep, addr_type, name); + if (!dest) { + acm_log(0, "ERROR - unable to create dest %s\n", addr); + continue; + } + + memset(name, 0, ACM_MAX_ADDRESS); + memcpy(name, &ib_addr, sizeof(ib_addr)); + gid_dest = acmp_get_dest(ep, ACM_ADDRESS_GID, name); + if (gid_dest) { + dest->path = gid_dest->path; + dest->state = ACMP_READY; + acmp_put_dest(gid_dest); + } else { + memcpy(&dest->path.dgid, &ib_addr, 16); + //ibv_query_gid(ep->port->dev->verbs, ep->port->port_num, + // 0, &dest->path.sgid); + dest->path.slid = htobe16(ep->port->lid); + dest->path.reversible_numpath = IBV_PATH_RECORD_REVERSIBLE; + dest->path.pkey = htobe16(ep->pkey); + dest->state = ACMP_ADDR_RESOLVED; + } + + dest->remote_qpn = 1; + dest->addr_timeout = time_stamp_min() + (unsigned) addr_timeout; + dest->route_timeout = time_stamp_min() + (unsigned) route_timeout; + acmp_put_dest(dest); + acm_log(1, "added host %s address type %d IB GID %s\n", + addr, addr_type, gid); + } + + fclose(f); +} + +/* + * We currently require that the routing data be preloaded in order to + * load the address data. 
This is backwards from normal operation, which + * usually resolves the address before the route. + */ +static void acmp_ep_preload(struct acmp_ep *ep) +{ + switch (route_preload) { + case ACMP_ROUTE_PRELOAD_OSM_FULL_V1: + if (acmp_parse_osm_fullv1(ep)) + acm_log(0, "ERROR - failed to preload EP\n"); + break; + default: + break; + } + + switch (addr_preload) { + case ACMP_ADDR_PRELOAD_HOSTS: + acmp_parse_hosts_file(ep); + break; + default: + break; + } +} + +/* rwlock must be held write-locked */ +static int __acmp_add_addr(const struct acm_address *addr, struct acmp_ep *ep, + void **addr_context) +{ + struct acmp_dest *dest; + struct acmp_addr_ctx *addr_ctx; + int i; + + for (i = 0; (i < ep->nmbr_ep_addrs) && + (ep->addr_info[i].type != ACM_ADDRESS_INVALID); i++) + ; + + if (i == ep->nmbr_ep_addrs) { + struct acmp_addr *new_info; + + new_info = realloc(ep->addr_info, (i + 1) * sizeof(*ep->addr_info)); + if (!new_info) { + acm_log(0, "ERROR - no more space for local address\n"); + return -1; + } + ep->addr_info = new_info; + /* Added memory is not initialized */ + memset(ep->addr_info + i, 0, sizeof(*ep->addr_info)); + ++ep->nmbr_ep_addrs; + } + ep->addr_info[i].type = addr->type; + memcpy(&ep->addr_info[i].info, &addr->info, sizeof(addr->info)); + memcpy(&ep->addr_info[i].addr, addr, sizeof(*addr)); + ep->addr_info[i].ep = ep; + + addr_ctx = malloc(sizeof(*addr_ctx)); + if (!addr_ctx) { + acm_log(0, "ERROR - unable to alloc address context struct\n"); + return -1; + } + addr_ctx->ep = ep; + addr_ctx->addr_inx = i; + + if (loopback_prot != ACMP_LOOPBACK_PROT_LOCAL) { + *addr_context = addr_ctx; + return 0; + } + + dest = acmp_acquire_dest(ep, addr->type, (uint8_t *)addr->info.addr); + if (!dest) { + acm_log(0, "ERROR - unable to create loopback dest %s\n", + addr->id_string); + memset(&ep->addr_info[i], 0, sizeof(ep->addr_info[i])); + free(addr_ctx); + return -1; + } + + acm_get_gid((struct acm_port *) ep->port->port, 0, &dest->path.sgid); + dest->path.dgid = dest->path.sgid; + dest->path.dlid = dest->path.slid = htobe16(ep->port->lid); + dest->path.reversible_numpath = IBV_PATH_RECORD_REVERSIBLE; + dest->path.pkey = htobe16(ep->pkey); + dest->path.mtu = (uint8_t) ep->port->mtu; + dest->path.rate = (uint8_t) ep->port->rate; + + dest->remote_qpn = ep->qp->qp_num; + dest->addr_timeout = (uint64_t) ~0ULL; + dest->route_timeout = (uint64_t) ~0ULL; + dest->state = ACMP_READY; + acmp_put_dest(dest); + *addr_context = addr_ctx; + acm_log(1, "added loopback dest %s\n", dest->name); + + return 0; +} + +static int acmp_add_addr(const struct acm_address *addr, void *ep_context, + void **addr_context) +{ + struct acmp_ep *ep = ep_context; + int ret; + + acm_log(2, "\n"); + + pthread_rwlock_wrlock(&ep->rwlock); + ret = __acmp_add_addr(addr, ep, addr_context); + pthread_rwlock_unlock(&ep->rwlock); + + return ret; +} + +static void acmp_remove_addr(void *addr_context) +{ + struct acmp_addr_ctx *addr_ctx = addr_context; + struct acmp_addr *address = addr_ctx->ep->addr_info + addr_ctx->addr_inx; + struct acmp_device *dev; + struct acmp_dest *dest; + struct acmp_ep *ep; + int i; + + acm_log(2, "\n"); + + /* + * The address may be a local destination address. If so, + * delete it from the cache. 
+ */ + + pthread_mutex_lock(&acmp_dev_lock); + list_for_each(&acmp_dev_list, dev, entry) { + pthread_mutex_unlock(&acmp_dev_lock); + + for (i = 0; i < dev->port_cnt; i++) { + struct acmp_port *port = &dev->port[i]; + + pthread_mutex_lock(&port->lock); + list_for_each(&port->ep_list, ep, entry) { + pthread_mutex_unlock(&port->lock); + dest = acmp_get_dest(ep, address->type, address->addr.info.addr); + if (dest) { + acm_log(2, "Found a dest addr, deleting it\n"); + pthread_mutex_lock(&ep->lock); + acmp_remove_dest(ep, dest); + pthread_mutex_unlock(&ep->lock); + } + pthread_mutex_lock(&port->lock); + } + pthread_mutex_unlock(&port->lock); + } + pthread_mutex_lock(&acmp_dev_lock); + } + pthread_mutex_unlock(&acmp_dev_lock); + + memset(address, 0, sizeof(*address)); + free(addr_ctx); +} + +static struct acmp_port *acmp_get_port(struct acm_endpoint *endpoint) +{ + struct acmp_device *dev; + + acm_log(1, "dev 0x%" PRIx64 " port %d pkey 0x%x\n", + be64toh(endpoint->port->dev->dev_guid), + endpoint->port->port_num, endpoint->pkey); + + list_for_each(&acmp_dev_list, dev, entry) { + if (dev->guid == endpoint->port->dev->dev_guid) + return &dev->port[endpoint->port->port_num - 1]; + } + + return NULL; +} + +static struct acmp_ep * +acmp_get_ep(struct acmp_port *port, struct acm_endpoint *endpoint) +{ + struct acmp_ep *ep; + + acm_log(1, "dev 0x%" PRIx64 " port %d pkey 0x%x\n", + be64toh(endpoint->port->dev->dev_guid), + endpoint->port->port_num, endpoint->pkey); + + list_for_each(&port->ep_list, ep, entry) { + if (ep->pkey == endpoint->pkey) + return ep; + } + + return NULL; +} + +static uint16_t acmp_get_pkey_index(struct acm_endpoint *endpoint) +{ + struct acmp_port *port; + int i; + + port = acmp_get_port(endpoint); + if (!port) + return 0; + i = ibv_get_pkey_index(port->dev->verbs, port->port_num, + htobe16(endpoint->pkey)); + if (i < 0) + return 0; + return i; +} + +static void acmp_close_endpoint(void *ep_context) +{ + + struct acmp_ep *ep = ep_context; + + acm_log(1, "%s %d pkey 0x%04x\n", + ep->port->dev->verbs->device->name, + ep->port->port_num, ep->pkey); + + ep->endpoint = NULL; +} + +static struct acmp_ep * +acmp_alloc_ep(struct acmp_port *port, struct acm_endpoint *endpoint) +{ + struct acmp_ep *ep; + int i; + + acm_log(1, "\n"); + ep = calloc(1, sizeof *ep); + if (!ep) + return NULL; + + ep->port = port; + ep->endpoint = endpoint; + ep->pkey = endpoint->pkey; + ep->resolve_queue.credits = resolve_depth; + ep->resp_queue.credits = send_depth; + list_head_init(&ep->resolve_queue.pending); + list_head_init(&ep->resp_queue.pending); + list_head_init(&ep->active_queue); + list_head_init(&ep->wait_queue); + pthread_mutex_init(&ep->lock, NULL); + sprintf(ep->id_string, "%s-%d-0x%x", port->dev->verbs->device->name, + port->port_num, endpoint->pkey); + + if (pthread_rwlock_init(&ep->rwlock, NULL)) { + free(ep); + return NULL; + } + ep->addr_info = NULL; + ep->nmbr_ep_addrs = 0; + + for (i = 0; i < ACM_MAX_COUNTER; i++) + atomic_init(&ep->counters[i]); + + return ep; +} + +static int acmp_open_endpoint(const struct acm_endpoint *endpoint, + void *port_context, void **ep_context) +{ + struct acmp_port *port = port_context; + struct acmp_ep *ep; + struct ibv_qp_init_attr init_attr; + struct ibv_qp_attr attr; + int ret, sq_size; + + ep = acmp_get_ep(port, (struct acm_endpoint *) endpoint); + if (ep) { + acm_log(2, "endpoint for pkey 0x%x already exists\n", endpoint->pkey); + pthread_mutex_lock(&ep->lock); + ep->endpoint = (struct acm_endpoint *) endpoint; + pthread_mutex_unlock(&ep->lock); + 
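+		/*
+		 * Existing endpoint: only the core endpoint pointer is
+		 * re-bound here (acmp_close_endpoint() clears ep->endpoint);
+		 * the QP, CQ and receive buffers created earlier are reused.
+		 */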
*ep_context = (void *) ep; + return 0; + } + + acm_log(2, "creating endpoint for pkey 0x%x\n", endpoint->pkey); + ep = acmp_alloc_ep(port, (struct acm_endpoint *) endpoint); + if (!ep) + return -1; + + sprintf(ep->id_string, "%s-%d-0x%x", + port->dev->verbs->device->name, + port->port_num, endpoint->pkey); + + sq_size = resolve_depth + send_depth; + ep->cq = ibv_create_cq(port->dev->verbs, sq_size + recv_depth, + ep, port->dev->channel, 0); + if (!ep->cq) { + acm_log(0, "ERROR - failed to create CQ\n"); + goto err0; + } + + ret = ibv_req_notify_cq(ep->cq, 0); + if (ret) { + acm_log(0, "ERROR - failed to arm CQ\n"); + goto err1; + } + + memset(&init_attr, 0, sizeof init_attr); + init_attr.cap.max_send_wr = sq_size; + init_attr.cap.max_recv_wr = recv_depth; + init_attr.cap.max_send_sge = 1; + init_attr.cap.max_recv_sge = 1; + init_attr.qp_context = ep; + init_attr.sq_sig_all = 1; + init_attr.qp_type = IBV_QPT_UD; + init_attr.send_cq = ep->cq; + init_attr.recv_cq = ep->cq; + ep->qp = ibv_create_qp(ep->port->dev->pd, &init_attr); + if (!ep->qp) { + acm_log(0, "ERROR - failed to create QP\n"); + goto err1; + } + + attr.qp_state = IBV_QPS_INIT; + attr.port_num = port->port_num; + attr.pkey_index = acmp_get_pkey_index((struct acm_endpoint *) endpoint); + attr.qkey = ACM_QKEY; + ret = ibv_modify_qp(ep->qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | + IBV_QP_PORT | IBV_QP_QKEY); + if (ret) { + acm_log(0, "ERROR - failed to modify QP to init\n"); + goto err2; + } + + attr.qp_state = IBV_QPS_RTR; + ret = ibv_modify_qp(ep->qp, &attr, IBV_QP_STATE); + if (ret) { + acm_log(0, "ERROR - failed to modify QP to rtr\n"); + goto err2; + } + + attr.qp_state = IBV_QPS_RTS; + attr.sq_psn = 0; + ret = ibv_modify_qp(ep->qp, &attr, IBV_QP_STATE | IBV_QP_SQ_PSN); + if (ret) { + acm_log(0, "ERROR - failed to modify QP to rts\n"); + goto err2; + } + + ret = acmp_post_recvs(ep); + if (ret) + goto err2; + + pthread_mutex_lock(&port->lock); + list_add(&port->ep_list, &ep->entry); + pthread_mutex_unlock(&port->lock); + acmp_ep_preload(ep); + acmp_ep_join(ep); + *ep_context = (void *) ep; + return 0; + +err2: + ibv_destroy_qp(ep->qp); +err1: + ibv_destroy_cq(ep->cq); +err0: + free(ep); + return -1; +} + +static void acmp_port_up(struct acmp_port *port) +{ + struct ibv_port_attr attr; + uint16_t pkey; + __be16 pkey_be; + __be16 sm_lid; + int i, ret; + int instance; + + acm_log(1, "%s %d\n", port->dev->verbs->device->name, port->port_num); + ret = ibv_query_port(port->dev->verbs, port->port_num, &attr); + if (ret) { + acm_log(0, "ERROR - unable to get port attribute\n"); + return; + } + + port->mtu = attr.active_mtu; + port->rate = acm_get_rate(attr.active_width, attr.active_speed); + if (attr.subnet_timeout >= 8) + port->subnet_timeout = 1 << (attr.subnet_timeout - 8); + + port->lid = attr.lid; + port->lid_mask = 0xffff - ((1 << attr.lmc) - 1); + + port->sa_dest.av.src_path_bits = 0; + port->sa_dest.av.dlid = attr.sm_lid; + port->sa_dest.av.sl = attr.sm_sl; + port->sa_dest.av.port_num = port->port_num; + port->sa_dest.remote_qpn = 1; + sm_lid = htobe16(attr.sm_lid); + acmp_set_dest_addr(&port->sa_dest, ACM_ADDRESS_LID, + (uint8_t *) &sm_lid, sizeof(sm_lid)); + + instance = atomic_inc(&port->sa_dest.refcnt) - 1; + port->sa_dest.state = ACMP_READY; + for (i = 0; i < attr.pkey_tbl_len; i++) { + ret = ibv_query_pkey(port->dev->verbs, port->port_num, i, &pkey_be); + if (ret) + continue; + pkey = be16toh(pkey_be); + if (!(pkey & 0x7fff)) + continue; + + /* Determine pkey index for default partition with preference + * for full 
membership + */ + if ((pkey & 0x7fff) == 0x7fff) { + port->default_pkey_ix = i; + break; + } + } + + port->state = IBV_PORT_ACTIVE; + acm_log(1, "%s %d %d is up\n", port->dev->verbs->device->name, port->port_num, instance); +} + +static void acmp_port_down(struct acmp_port *port) +{ + int instance; + + acm_log(1, "%s %d\n", port->dev->verbs->device->name, port->port_num); + pthread_mutex_lock(&port->lock); + port->state = IBV_PORT_DOWN; + pthread_mutex_unlock(&port->lock); + + /* + * We wait for the SA destination to be released. We could use an + * event instead of a sleep loop, but it's not worth it given how + * infrequently we should be processing a port down event in practice. + */ + instance = atomic_dec(&port->sa_dest.refcnt); + if (instance == 1) { + pthread_mutex_lock(&port->sa_dest.lock); + port->sa_dest.state = ACMP_INIT; + pthread_mutex_unlock(&port->sa_dest.lock); + } + acm_log(1, "%s %d %d is down\n", port->dev->verbs->device->name, port->port_num, instance); +} + +static int acmp_open_port(const struct acm_port *cport, void *dev_context, + void **port_context) +{ + struct acmp_device *dev = dev_context; + struct acmp_port *port; + + if (cport->port_num < 1 || cport->port_num > dev->port_cnt) { + acm_log(0, "Error: port_num %d is out of range (max %d)\n", + cport->port_num, dev->port_cnt); + return -1; + } + + port = &dev->port[cport->port_num - 1]; + pthread_mutex_lock(&port->lock); + port->port = cport; + port->state = IBV_PORT_DOWN; + pthread_mutex_unlock(&port->lock); + acmp_port_up(port); + *port_context = port; + return 0; +} + +static void acmp_close_port(void *port_context) +{ + struct acmp_port *port = port_context; + + acmp_port_down(port); + pthread_mutex_lock(&port->lock); + port->port = NULL; + pthread_mutex_unlock(&port->lock); +} + +static void acmp_init_port(struct acmp_port *port, struct acmp_device *dev, + uint8_t port_num) +{ + acm_log(1, "%s %d\n", dev->verbs->device->name, port_num); + port->dev = dev; + port->port_num = port_num; + pthread_mutex_init(&port->lock, NULL); + list_head_init(&port->ep_list); + acmp_init_dest(&port->sa_dest, ACM_ADDRESS_LID, NULL, 0); + port->state = IBV_PORT_DOWN; +} + +static int acmp_open_dev(const struct acm_device *device, void **dev_context) +{ + struct acmp_device *dev; + size_t size; + struct ibv_device_attr attr; + int i, ret; + struct ibv_context *verbs; + + acm_log(1, "dev_guid 0x%" PRIx64 " %s\n", be64toh(device->dev_guid), + device->verbs->device->name); + + list_for_each(&acmp_dev_list, dev, entry) { + if (dev->guid == device->dev_guid) { + acm_log(2, "dev_guid 0x%" PRIx64 " already exists\n", + be64toh(device->dev_guid)); + *dev_context = dev; + dev->device = device; + return 0; + } + } + + /* We need to release the core device structure when device close is + * called. But this provider does not support dynamic add/removal of + * devices/ports/endpoints. To avoid use-after-free issues, we open + * our own verbs context, rather than using the one in the core + * device structure. 
+ */ + verbs = ibv_open_device(device->verbs->device); + if (!verbs) { + acm_log(0, "ERROR - opening device %s\n", + device->verbs->device->name); + goto err; + } + + ret = ibv_query_device(verbs, &attr); + if (ret) { + acm_log(0, "ERROR - ibv_query_device (%s) %d\n", + verbs->device->name, ret); + goto err; + } + + size = sizeof(*dev) + sizeof(struct acmp_port) * attr.phys_port_cnt; + dev = (struct acmp_device *) calloc(1, size); + if (!dev) + goto err; + + dev->verbs = verbs; + dev->device = device; + dev->port_cnt = attr.phys_port_cnt; + + dev->pd = ibv_alloc_pd(dev->verbs); + if (!dev->pd) { + acm_log(0, "ERROR - unable to allocate PD\n"); + goto err1; + } + + dev->channel = ibv_create_comp_channel(dev->verbs); + if (!dev->channel) { + acm_log(0, "ERROR - unable to create comp channel\n"); + goto err2; + } + + for (i = 0; i < dev->port_cnt; i++) { + acmp_init_port(&dev->port[i], dev, i + 1); + } + + if (pthread_create(&dev->comp_thread_id, NULL, acmp_comp_handler, dev)) { + acm_log(0, "Error -- failed to create the comp thread for dev %s\n", + dev->verbs->device->name); + goto err3; + } + + pthread_mutex_lock(&acmp_dev_lock); + list_add(&acmp_dev_list, &dev->entry); + pthread_mutex_unlock(&acmp_dev_lock); + dev->guid = device->dev_guid; + *dev_context = dev; + + acm_log(1, "%s opened\n", dev->verbs->device->name); + return 0; + +err3: + ibv_destroy_comp_channel(dev->channel); +err2: + ibv_dealloc_pd(dev->pd); +err1: + free(dev); +err: + return -1; +} + +static void acmp_close_dev(void *dev_context) +{ + struct acmp_device *dev = dev_context; + + acm_log(1, "dev_guid 0x%" PRIx64 "\n", be64toh(dev->device->dev_guid)); + dev->device = NULL; +} + +static void acmp_set_options(void) +{ + FILE *f; + char s[120]; + char opt[32], value[256]; + const char *opts_file = acm_get_opts_file(); + + if (!(f = fopen(opts_file, "r"))) + return; + + while (fgets(s, sizeof s, f)) { + if (s[0] == '#') + continue; + + if (sscanf(s, "%31s%255s", opt, value) != 2) + continue; + + if (!strcasecmp("addr_prot", opt)) + addr_prot = acmp_convert_addr_prot(value); + else if (!strcasecmp("addr_timeout", opt)) + addr_timeout = atoi(value); + else if (!strcasecmp("route_prot", opt)) + route_prot = acmp_convert_route_prot(value); + else if (!strcasecmp("route_timeout", opt)) + route_timeout = atoi(value); + else if (!strcasecmp("loopback_prot", opt)) + loopback_prot = acmp_convert_loopback_prot(value); + else if (!strcasecmp("timeout", opt)) + timeout = atoi(value); + else if (!strcasecmp("retries", opt)) + retries = atoi(value); + else if (!strcasecmp("resolve_depth", opt)) + resolve_depth = atoi(value); + else if (!strcasecmp("send_depth", opt)) + send_depth = atoi(value); + else if (!strcasecmp("recv_depth", opt)) + recv_depth = atoi(value); + else if (!strcasecmp("min_mtu", opt)) + min_mtu = acm_convert_mtu(atoi(value)); + else if (!strcasecmp("min_rate", opt)) + min_rate = acm_convert_rate(atoi(value)); + else if (!strcasecmp("route_preload", opt)) + route_preload = acmp_convert_route_preload(value); + else if (!strcasecmp("route_data_file", opt)) + strcpy(route_data_file, value); + else if (!strcasecmp("addr_preload", opt)) + addr_preload = acmp_convert_addr_preload(value); + else if (!strcasecmp("addr_data_file", opt)) + strcpy(addr_data_file, value); + } + + fclose(f); +} + +static void acmp_log_options(void) +{ + acm_log(0, "address resolution %d\n", addr_prot); + acm_log(0, "address timeout %d\n", addr_timeout); + acm_log(0, "route resolution %d\n", route_prot); + acm_log(0, "route timeout %d\n", route_timeout); + 
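+	/*
+	 * These values mirror what acmp_set_options() parsed from the file
+	 * returned by acm_get_opts_file(). A sketch of such a file, using
+	 * option names recognized above (the values shown are illustrative,
+	 * not documented defaults):
+	 *
+	 *   # ibacm acmp provider options
+	 *   addr_prot       acm
+	 *   route_prot      sa
+	 *   timeout         2000
+	 *   retries         2
+	 *   route_preload   none
+	 *   addr_preload    acm_hosts
+	 */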
acm_log(0, "loopback resolution %d\n", loopback_prot); + acm_log(0, "timeout %d ms\n", timeout); + acm_log(0, "retries %d\n", retries); + acm_log(0, "resolve depth %d\n", resolve_depth); + acm_log(0, "send depth %d\n", send_depth); + acm_log(0, "receive depth %d\n", recv_depth); + acm_log(0, "minimum mtu %d\n", min_mtu); + acm_log(0, "minimum rate %d\n", min_rate); + acm_log(0, "route preload %d\n", route_preload); + acm_log(0, "route data file %s\n", route_data_file); + acm_log(0, "address preload %d\n", addr_preload); + acm_log(0, "address data file %s\n", addr_data_file); +} + +static void __attribute__((constructor)) acmp_init(void) +{ + acmp_set_options(); + + acmp_log_options(); + + atomic_init(&g_tid); + atomic_init(&wait_cnt); + pthread_mutex_init(&acmp_dev_lock, NULL); + event_init(&timeout_event); + + umad_init(); + + acm_log(1, "starting timeout/retry thread\n"); + if (pthread_create(&retry_thread_id, NULL, acmp_retry_handler, NULL)) { + acm_log(0, "Error: failed to create the retry thread"); + retry_thread_started = 0; + return; + } + + acmp_initialized = 1; +} + +int provider_query(struct acm_provider **provider, uint32_t *version) +{ + acm_log(1, "\n"); + + if (!acmp_initialized) + return -1; + + if (provider) + *provider = &def_prov; + if (version) + *version = ACM_PROV_VERSION; + + return 0; +} + diff --git a/ibacm/prov/acmp/src/libibacmp.map b/ibacm/prov/acmp/src/libibacmp.map new file mode 100644 index 0000000..cccd166 --- /dev/null +++ b/ibacm/prov/acmp/src/libibacmp.map @@ -0,0 +1,5 @@ +ACMP_1.0 { + global: + provider_query; + local: *; +}; diff --git a/ibacm/src/acm.c b/ibacm/src/acm.c new file mode 100644 index 0000000..1663c89 --- /dev/null +++ b/ibacm/src/acm.c @@ -0,0 +1,3392 @@ +/* + * Copyright (c) 2009-2014 Intel Corporation. All rights reserved. + * Copyright (c) 2013 Mellanox Technologies LTD. All rights reserved. + * + * This software is available to you under the OpenIB.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AWV + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#define _GNU_SOURCE + +#include <config.h> + +#include <stdio.h> +#include <stdarg.h> +#include <string.h> +#include <osd.h> +#include <arpa/inet.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <fcntl.h> +#include <dirent.h> +#include <infiniband/acm.h> +#include <infiniband/acm_prov.h> +#include <infiniband/umad.h> +#include <infiniband/verbs.h> +#include <infiniband/umad_types.h> +#include <infiniband/umad_sa.h> +#include <dlfcn.h> +#include <search.h> +#include <net/if.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <net/if_arp.h> +#include <linux/netlink.h> +#include <linux/rtnetlink.h> +#include <rdma/rdma_netlink.h> +#include <rdma/ib_user_sa.h> +#include <poll.h> +#include <inttypes.h> +#include <getopt.h> +#include <systemd/sd-daemon.h> +#include <ccan/list.h> +#include <util/util.h> +#include "acm_mad.h" +#include "acm_util.h" + +#define NL_MSG_BUF_SIZE 4096 +#define ACM_PROV_NAME_SIZE 64 +#define NL_CLIENT_INDEX 0 + +struct acmc_subnet { + struct list_node entry; + __be64 subnet_prefix; +}; + +struct acmc_prov { + struct acm_provider *prov; + void *handle; + struct list_node entry; + struct list_head subnet_list; +}; + +struct acmc_prov_context { + struct list_node entry; + atomic_t refcnt; + struct acm_provider *prov; + void *context; +}; + +struct acmc_device; + +struct acmc_port { + struct acmc_device *dev; + struct acm_port port; + struct acm_provider *prov; /* limit to 1 provider per port for now */ + void *prov_port_context; + int mad_portid; + int mad_agentid; + struct ib_mad_addr sa_addr; + struct list_head sa_pending; + struct list_head sa_wait; + int sa_credits; + pthread_mutex_t lock; + struct list_head ep_list; + enum ibv_port_state state; + int gid_cnt; + union ibv_gid *gid_tbl; + uint16_t lid; + uint16_t lid_mask; + int sa_pkey_index; + bool pending_rereg; + uint16_t def_acm_pkey; +}; + +struct acmc_device { + struct acm_device device; + struct list_node entry; + struct list_head prov_dev_context_list; + int port_cnt; + struct acmc_port port[0]; +}; + +struct acmc_addr { + struct acm_address addr; + void *prov_addr_context; + char string_buf[ACM_MAX_ADDRESS]; +}; + +struct acmc_ep { + struct acmc_port *port; + struct acm_endpoint endpoint; + void *prov_ep_context; + /* Although the below two entries are used for dynamic allocations, + * they are accessed by a single thread, so no locking is required. 
+ */ + int nmbr_ep_addrs; + struct acmc_addr *addr_info; + struct list_node entry; +}; + +struct acmc_client { + pthread_mutex_t lock; /* acquire ep lock first */ + int sock; + int index; + atomic_t refcnt; +}; + +union socket_addr { + struct sockaddr sa; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; +}; + +struct acmc_sa_req { + struct list_node entry; + struct acmc_ep *ep; + void (*resp_handler)(struct acm_sa_mad *); + struct acm_sa_mad mad; +}; + +struct acm_nl_path { + struct nlattr attr_hdr; + struct ib_path_rec_data rec; +}; + +struct acm_nl_msg { + struct nlmsghdr nlmsg_header; + union { + uint8_t data[ACM_MSG_DATA_LENGTH]; + struct rdma_ls_resolve_header resolve_header; + struct nlattr attr[0]; + struct acm_nl_path path[0]; + }; +}; + +static char def_prov_name[ACM_PROV_NAME_SIZE] = "ibacmp"; +static LIST_HEAD(provider_list); +static struct acmc_prov *def_provider = NULL; + +static LIST_HEAD(dev_list); + +static int listen_socket; +static int ip_mon_socket; +static struct acmc_client client_array[FD_SETSIZE - 1]; + +static FILE *flog; +static pthread_mutex_t log_lock; +static __thread char log_data[ACM_MAX_ADDRESS]; +static atomic_t counter[ACM_MAX_COUNTER]; + +static struct acmc_device * +acm_get_device_from_gid(union ibv_gid *sgid, uint8_t *port); +static struct acmc_ep *acm_find_ep(struct acmc_port *port, uint16_t pkey); +static int acm_ep_insert_addr(struct acmc_ep *ep, const char *name, uint8_t *addr, + uint8_t addr_type); +static void acm_event_handler(struct acmc_device *dev); +static int acm_nl_send(int sock, struct acm_msg *msg); + +static struct sa_data { + int timeout; + int retries; + int depth; + pthread_t thread_id; + struct pollfd *fds; + struct acmc_port **ports; + int nfds; +} sa = { 2000, 2, 1, 0, NULL, NULL, 0}; + +/* + * Service options - may be set through ibacm_opts.cfg file. + */ +static const char *acme = IBACM_BIN_PATH "/ib_acme -A"; +static const char *opts_file = ACM_CONF_DIR "/" ACM_OPTS_FILE; +static const char *addr_file = ACM_CONF_DIR "/" ACM_ADDR_FILE; +static char log_file[128] = IBACM_LOG_FILE; +static int log_level = 0; +static char lock_file[128] = IBACM_PID_FILE; +static short server_port = 6125; +static int server_mode = IBACM_SERVER_MODE_DEFAULT; +static int acme_plus_kernel_only = IBACM_ACME_PLUS_KERNEL_ONLY_DEFAULT; +static int support_ips_in_addr_cfg = 0; +static char prov_lib_path[256] = IBACM_LIB_PATH; + +void acm_write(int level, const char *format, ...) 
+{ + va_list args; + struct timeval tv; + struct tm tmtime; + char buffer[20]; + + if (level > log_level) + return; + + gettimeofday(&tv, NULL); + localtime_r(&tv.tv_sec, &tmtime); + strftime(buffer, 20, "%Y-%m-%dT%H:%M:%S", &tmtime); + va_start(args, format); + pthread_mutex_lock(&log_lock); + fprintf(flog, "%s.%03u: ", buffer, (unsigned) (tv.tv_usec / 1000)); + vfprintf(flog, format, args); + fflush(flog); + pthread_mutex_unlock(&log_lock); + va_end(args); +} + +void acm_format_name(int level, char *name, size_t name_size, + uint8_t addr_type, const uint8_t *addr, size_t addr_size) +{ + struct ibv_path_record *path; + + if (level > log_level) + return; + + switch (addr_type) { + case ACM_EP_INFO_NAME: + memcpy(name, addr, addr_size); + break; + case ACM_EP_INFO_ADDRESS_IP: + inet_ntop(AF_INET, addr, name, name_size); + break; + case ACM_EP_INFO_ADDRESS_IP6: + case ACM_ADDRESS_GID: + inet_ntop(AF_INET6, addr, name, name_size); + break; + case ACM_EP_INFO_PATH: + path = (struct ibv_path_record *) addr; + if (path->dlid) { + snprintf(name, name_size, "SLID(%u) DLID(%u)", + be16toh(path->slid), be16toh(path->dlid)); + } else { + acm_format_name(level, name, name_size, ACM_ADDRESS_GID, + path->dgid.raw, sizeof path->dgid); + } + break; + case ACM_ADDRESS_LID: + snprintf(name, name_size, "LID(%u)", be16toh(*((__be16 *) addr))); + break; + default: + strcpy(name, "Unknown"); + break; + } +} + +int ib_any_gid(union ibv_gid *gid) +{ + return ((gid->global.subnet_prefix | gid->global.interface_id) == 0); +} + +const char *acm_get_opts_file(void) +{ + return opts_file; +} + +void acm_increment_counter(int type) +{ + if (type >= 0 && type < ACM_MAX_COUNTER) + atomic_inc(&counter[type]); +} + +static struct acmc_prov_context * +acm_alloc_prov_context(struct acm_provider *prov) +{ + struct acmc_prov_context *ctx; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + acm_log(0, "Error: failed to allocate prov context\n"); + return NULL; + } + atomic_set(&ctx->refcnt, 1); + ctx->prov = prov; + return ctx; +} + +static struct acmc_prov_context * +acm_get_prov_context(struct list_head *list, struct acm_provider *prov) +{ + struct acmc_prov_context *ctx; + + list_for_each(list, ctx, entry) { + if (ctx->prov == prov) { + return ctx; + } + } + + return NULL; +} + +static struct acmc_prov_context * +acm_acquire_prov_context(struct list_head *list, struct acm_provider *prov) +{ + struct acmc_prov_context *ctx; + + ctx = acm_get_prov_context(list, prov); + if (!ctx) { + ctx = acm_alloc_prov_context(prov); + if (!ctx) { + acm_log(0, "Error -- failed to allocate provider context\n"); + return NULL; + } + list_add_tail(list, &ctx->entry); + } else { + atomic_inc(&ctx->refcnt); + } + + return ctx; +} + +static void +acm_release_prov_context(struct acmc_prov_context *ctx) +{ + if (atomic_dec(&ctx->refcnt) <= 0) { + list_del(&ctx->entry); + free(ctx); + } +} + +uint8_t acm_gid_index(struct acm_port *port, union ibv_gid *gid) +{ + uint8_t i; + struct acmc_port *cport; + + cport = container_of(port, struct acmc_port, port); + for (i = 0; i < cport->gid_cnt; i++) { + if (!memcmp(&cport->gid_tbl[i], gid, sizeof (*gid))) + break; + } + return i; +} + +int acm_get_gid(struct acm_port *port, int index, union ibv_gid *gid) +{ + struct acmc_port *cport; + + cport = container_of(port, struct acmc_port, port); + if (index >= 0 && index < cport->gid_cnt) { + *gid = cport->gid_tbl[index]; + return 0; + } else { + return -1; + } +} + +static size_t acm_addr_len(uint8_t addr_type) +{ + switch (addr_type) { + case ACM_ADDRESS_NAME: + 
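+		/* Name addresses occupy the full fixed-size buffer, so the
+		 * whole ACM_MAX_ADDRESS length is used when comparing them
+		 * (see the strncasecmp() in acm_addr_cmp() below). */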
return ACM_MAX_ADDRESS; + case ACM_ADDRESS_IP: + return sizeof(struct in_addr); + case ACM_ADDRESS_IP6: + return sizeof(struct in6_addr); + case ACM_ADDRESS_GID: + return sizeof(union ibv_gid); + case ACM_ADDRESS_LID: + return sizeof(uint16_t); + default: + acm_log(2, "illegal address type %d\n", addr_type); + } + return 0; +} + +static int acm_addr_cmp(struct acm_address *acm_addr, uint8_t *addr, uint8_t addr_type) +{ + if (acm_addr->type != addr_type) + return -2; + + if (acm_addr->type == ACM_ADDRESS_NAME) + return strncasecmp((char *) acm_addr->info.name, + (char *) addr, acm_addr_len(acm_addr->type)); + return memcmp(acm_addr->info.addr, addr, acm_addr_len(acm_addr->type)); +} + +static void acm_mark_addr_invalid(struct acmc_ep *ep, + struct acm_ep_addr_data *data) +{ + int i; + + for (i = 0; i < ep->nmbr_ep_addrs; i++) { + if (!acm_addr_cmp(&ep->addr_info[i].addr, data->info.addr, data->type)) { + ep->addr_info[i].addr.type = ACM_ADDRESS_INVALID; + ep->port->prov->remove_address(ep->addr_info[i].prov_addr_context); + break; + } + } +} + +static struct acm_address * +acm_addr_lookup(const struct acm_endpoint *endpoint, uint8_t *addr, uint8_t addr_type) +{ + struct acmc_ep *ep; + int i; + + ep = container_of(endpoint, struct acmc_ep, endpoint); + for (i = 0; i < ep->nmbr_ep_addrs; i++) + if (!acm_addr_cmp(&ep->addr_info[i].addr, addr, addr_type)) + return &ep->addr_info[i].addr; + + return NULL; +} + +__be64 acm_path_comp_mask(struct ibv_path_record *path) +{ + uint32_t fl_hop; + uint16_t qos_sl; + __be64 comp_mask = 0; + + acm_log(2, "\n"); + if (path->service_id) + comp_mask |= IB_COMP_MASK_PR_SERVICE_ID; + if (!ib_any_gid(&path->dgid)) + comp_mask |= IB_COMP_MASK_PR_DGID; + if (!ib_any_gid(&path->sgid)) + comp_mask |= IB_COMP_MASK_PR_SGID; + if (path->dlid) + comp_mask |= IB_COMP_MASK_PR_DLID; + if (path->slid) + comp_mask |= IB_COMP_MASK_PR_SLID; + + fl_hop = be32toh(path->flowlabel_hoplimit); + if (fl_hop >> 8) + comp_mask |= IB_COMP_MASK_PR_FLOW_LABEL; + if (fl_hop & 0xFF) + comp_mask |= IB_COMP_MASK_PR_HOP_LIMIT; + + if (path->tclass) + comp_mask |= IB_COMP_MASK_PR_TCLASS; + if (path->reversible_numpath & 0x80) + comp_mask |= IB_COMP_MASK_PR_REVERSIBLE; + if (path->pkey) + comp_mask |= IB_COMP_MASK_PR_PKEY; + + qos_sl = be16toh(path->qosclass_sl); + if (qos_sl >> 4) + comp_mask |= IB_COMP_MASK_PR_QOS_CLASS; + if (qos_sl & 0xF) + comp_mask |= IB_COMP_MASK_PR_SL; + + if (path->mtu & 0xC0) + comp_mask |= IB_COMP_MASK_PR_MTU_SELECTOR; + if (path->mtu & 0x3F) + comp_mask |= IB_COMP_MASK_PR_MTU; + if (path->rate & 0xC0) + comp_mask |= IB_COMP_MASK_PR_RATE_SELECTOR; + if (path->rate & 0x3F) + comp_mask |= IB_COMP_MASK_PR_RATE; + if (path->packetlifetime & 0xC0) + comp_mask |= IB_COMP_MASK_PR_PACKET_LIFETIME_SELECTOR; + if (path->packetlifetime & 0x3F) + comp_mask |= IB_COMP_MASK_PR_PACKET_LIFETIME; + + return comp_mask; +} + +int acm_resolve_response(uint64_t id, struct acm_msg *msg) +{ + struct acmc_client *client = &client_array[id]; + int ret; + + acm_log(2, "client %d, status 0x%x\n", client->index, msg->hdr.status); + + if (msg->hdr.status == ACM_STATUS_ENODATA) + atomic_inc(&counter[ACM_CNTR_NODATA]); + else if (msg->hdr.status) + atomic_inc(&counter[ACM_CNTR_ERROR]); + + pthread_mutex_lock(&client->lock); + if (client->sock == -1) { + acm_log(0, "ERROR - connection lost\n"); + ret = ACM_STATUS_ENOTCONN; + goto release; + } + + if (id == NL_CLIENT_INDEX) + ret = acm_nl_send(client->sock, msg); + else + ret = send(client->sock, (char *) msg, msg->hdr.length, 0); + + if (ret != 
msg->hdr.length) + acm_log(0, "ERROR - failed to send response\n"); + else + ret = 0; + +release: + pthread_mutex_unlock(&client->lock); + (void) atomic_dec(&client->refcnt); + return ret; +} + +static int +acmc_resolve_response(uint64_t id, struct acm_msg *req_msg, uint8_t status) +{ + req_msg->hdr.opcode |= ACM_OP_ACK; + req_msg->hdr.status = status; + if (status != ACM_STATUS_SUCCESS) + req_msg->hdr.length = ACM_MSG_HDR_LENGTH; + memset(req_msg->hdr.data, 0, sizeof(req_msg->hdr.data)); + + return acm_resolve_response(id, req_msg); +} + +int acm_query_response(uint64_t id, struct acm_msg *msg) +{ + struct acmc_client *client = &client_array[id]; + int ret; + + acm_log(2, "status 0x%x\n", msg->hdr.status); + pthread_mutex_lock(&client->lock); + if (client->sock == -1) { + acm_log(0, "ERROR - connection lost\n"); + ret = ACM_STATUS_ENOTCONN; + goto release; + } + + ret = send(client->sock, (char *) msg, msg->hdr.length, 0); + if (ret != msg->hdr.length) + acm_log(0, "ERROR - failed to send response\n"); + else + ret = 0; + +release: + pthread_mutex_unlock(&client->lock); + (void) atomic_dec(&client->refcnt); + return ret; +} + +static int acmc_query_response(uint64_t id, struct acm_msg *msg, uint8_t status) +{ + acm_log(2, "status 0x%x\n", status); + msg->hdr.opcode |= ACM_OP_ACK; + msg->hdr.status = status; + return acm_query_response(id, msg); +} + +static void acm_init_server(void) +{ + FILE *f; + int i; + + for (i = 0; i < FD_SETSIZE - 1; i++) { + pthread_mutex_init(&client_array[i].lock, NULL); + client_array[i].index = i; + client_array[i].sock = -1; + atomic_init(&client_array[i].refcnt); + } + + if (server_mode != IBACM_SERVER_MODE_UNIX) { + f = fopen(IBACM_IBACME_PORT_FILE, "w"); + if (f) { + fprintf(f, "%hu\n", server_port); + fclose(f); + } else + acm_log(0, + "notice - cannot publish ibacm port number\n"); + + unlink(IBACM_PORT_FILE); + if (!acme_plus_kernel_only) { + if (symlink(IBACM_PORT_BASE, IBACM_PORT_FILE) != 0) + acm_log(0, + "notice - can't create port symlink\n"); + } + } else { + unlink(IBACM_IBACME_PORT_FILE); + unlink(IBACM_PORT_FILE); + } +} + +static int acm_listen(void) +{ + union { + struct sockaddr any; + struct sockaddr_in inet; + struct sockaddr_un unx; + } addr; + mode_t saved_mask; + int ret, saved_errno; + + acm_log(2, "\n"); + + memset(&addr, 0, sizeof(addr)); + + if (server_mode == IBACM_SERVER_MODE_UNIX) { + addr.any.sa_family = AF_UNIX; + BUILD_ASSERT(sizeof(IBACM_IBACME_SERVER_PATH) <= + sizeof(addr.unx.sun_path)); + strcpy(addr.unx.sun_path, IBACM_IBACME_SERVER_PATH); + + listen_socket = socket(AF_UNIX, SOCK_STREAM, 0); + if (listen_socket < 0) { + acm_log(0, + "ERROR - unable to allocate unix socket\n"); + return errno; + } + + unlink(addr.unx.sun_path); + saved_mask = umask(0); + ret = bind(listen_socket, &addr.any, sizeof(addr.unx)); + saved_errno = errno; + umask(saved_mask); + + if (ret) { + acm_log(0, + "ERROR - unable to bind listen socket '%s'\n", + addr.unx.sun_path); + return saved_errno; + } + + unlink(IBACM_SERVER_PATH); + if (!acme_plus_kernel_only) { + if (symlink(IBACM_SERVER_BASE, + IBACM_SERVER_PATH) != 0) { + saved_errno = errno; + acm_log(0, + "notice - can't create symlink\n"); + return saved_errno; + } + } + } else { + unlink(IBACM_IBACME_SERVER_PATH); + unlink(IBACM_SERVER_PATH); + + listen_socket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (listen_socket == -1) { + acm_log(0, + "ERROR - unable to allocate TCP socket\n"); + return errno; + } + + addr.any.sa_family = AF_INET; + addr.inet.sin_port = htobe16(server_port); + 
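+		/*
+		 * addr was zeroed above, so unless "loop" server mode pins
+		 * the address to 127.0.0.1 just below, the TCP listener binds
+		 * INADDR_ANY and accepts remote clients on server_port.
+		 */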
if (server_mode == IBACM_SERVER_MODE_LOOP) + addr.inet.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + ret = bind(listen_socket, &addr.any, sizeof(addr.inet)); + if (ret == -1) { + acm_log(0, "ERROR - unable to bind listen socket\n"); + return errno; + } + } + + ret = listen(listen_socket, 0); + if (ret == -1) { + acm_log(0, "ERROR - unable to start listen\n"); + return errno; + } + + acm_log(2, "listen active\n"); + return 0; +} + +/* Retrieve the listening socket from systemd. */ +static int acm_listen_systemd(void) +{ + int fd; + + int rc = sd_listen_fds(1); + if (rc == -1) { + fprintf(stderr, "sd_listen_fds failed %d\n", rc); + return rc; + } + + if (rc > 2) { + fprintf(stderr, + "sd_listen_fds returned %d fds, expected <= 2\n", rc); + return -1; + } + + for (fd = SD_LISTEN_FDS_START; fd != SD_LISTEN_FDS_START + rc; fd++) { + if (sd_is_socket(fd, AF_NETLINK, SOCK_RAW, 0)) { + /* ListenNetlink for RDMA_NL_GROUP_LS multicast + * messages from the kernel + */ + if (client_array[NL_CLIENT_INDEX].sock != -1) { + fprintf(stderr, + "sd_listen_fds returned more than one netlink socket\n"); + return -1; + } + client_array[NL_CLIENT_INDEX].sock = fd; + + /* systemd sets NONBLOCK on the netlink socket, while + * we want blocking send to the kernel. + */ + if (set_fd_nonblock(fd, false)) { + fprintf(stderr, + "Unable to drop O_NONBLOCK on netlink socket"); + return -1; + } + } else if (sd_is_socket(fd, AF_UNSPEC, + SOCK_STREAM, 1)) { + /* Socket for user space client communication */ + if (listen_socket != -1) { + fprintf(stderr, + "sd_listen_fds returned more than one listening socket\n"); + return -1; + } + listen_socket = fd; + } else { + fprintf(stderr, + "sd_listen_fds socket is not a SOCK_STREAM/SOCK_NETLINK listening socket\n"); + return -1; + } + } + + return 0; +} + +static void acm_disconnect_client(struct acmc_client *client) +{ + pthread_mutex_lock(&client->lock); + shutdown(client->sock, SHUT_RDWR); + close(client->sock); + client->sock = -1; + pthread_mutex_unlock(&client->lock); + (void) atomic_dec(&client->refcnt); +} + +static void acm_svr_accept(void) +{ + int s; + int i; + + acm_log(2, "\n"); + s = accept(listen_socket, NULL, NULL); + if (s == -1) { + acm_log(0, "ERROR - failed to accept connection\n"); + return; + } + + for (i = 0; i < FD_SETSIZE - 1; i++) { + if (i == NL_CLIENT_INDEX) + continue; + if (!atomic_get(&client_array[i].refcnt)) + break; + } + + if (i == FD_SETSIZE - 1) { + acm_log(0, "ERROR - all connections busy - rejecting\n"); + close(s); + return; + } + + client_array[i].sock = s; + atomic_set(&client_array[i].refcnt, 1); + acm_log(2, "assigned client %d\n", i); +} + +static int +acm_is_path_from_port(struct acmc_port *port, struct ibv_path_record *path) +{ + uint8_t i; + + if (!ib_any_gid(&path->sgid)) { + return (acm_gid_index(&port->port, &path->sgid) < + port->gid_cnt); + } + + if (path->slid) { + return (port->lid == (be16toh(path->slid) & port->lid_mask)); + } + + if (ib_any_gid(&path->dgid)) { + return 1; + } + + if (acm_gid_index(&port->port, &path->dgid) < port->gid_cnt) { + return 1; + } + + for (i = 0; i < port->gid_cnt; i++) { + if (port->gid_tbl[i].global.subnet_prefix == + path->dgid.global.subnet_prefix) { + return 1; + } + } + + return 0; +} + +static bool acm_same_partition(uint16_t pkey_a, uint16_t pkey_b) +{ + acm_log(2, "pkey_a: 0x%04x pkey_b: 0x%04x\n", pkey_a, pkey_b); + + return ((pkey_a | IB_PKEY_FULL_MEMBER) == (pkey_b | IB_PKEY_FULL_MEMBER)); +} + +static struct acmc_addr * +acm_get_port_ep_address(struct acmc_port *port, 
struct acm_ep_addr_data *data) +{ + struct acmc_ep *ep; + struct acm_address *addr; + int i; + + if (port->state != IBV_PORT_ACTIVE) + return NULL; + + if (data->type == ACM_EP_INFO_PATH && + !acm_is_path_from_port(port, &data->info.path)) + return NULL; + + list_for_each(&port->ep_list, ep, entry) { + if ((data->type == ACM_EP_INFO_PATH) && + (!data->info.path.pkey || + acm_same_partition(be16toh(data->info.path.pkey), ep->endpoint.pkey))) { + for (i = 0; i < ep->nmbr_ep_addrs; i++) { + if (ep->addr_info[i].addr.type) + return &ep->addr_info[i]; + } + return NULL; + } + + if ((addr = acm_addr_lookup(&ep->endpoint, data->info.addr, + (uint8_t) data->type))) + return container_of(addr, struct acmc_addr, addr); + } + + return NULL; +} + +static struct acmc_addr *acm_get_ep_address(struct acm_ep_addr_data *data) +{ + struct acmc_device *dev; + struct acmc_addr *addr; + int i; + + acm_format_name(2, log_data, sizeof log_data, + data->type, data->info.addr, sizeof data->info.addr); + acm_log(2, "%s\n", log_data); + list_for_each(&dev_list, dev, entry) { + for (i = 0; i < dev->port_cnt; i++) { + addr = acm_get_port_ep_address(&dev->port[i], data); + if (addr) + return addr; + } + } + + acm_format_name(0, log_data, sizeof log_data, + data->type, data->info.addr, sizeof data->info.addr); + acm_log(1, "notice - could not find %s\n", log_data); + return NULL; +} + +/* If port_num is zero, iterate through all ports, otherwise consider + * only the specific port_num */ +static struct acmc_ep *acm_get_ep(int index, uint8_t port_num) +{ + struct acmc_device *dev; + struct acmc_ep *ep; + int i, inx = 0; + + acm_log(2, "ep index %d\n", index); + list_for_each(&dev_list, dev, entry) { + for (i = 0; i < dev->port_cnt; i++) { + if (port_num && port_num != (i + 1)) + continue; + if (dev->port[i].state != IBV_PORT_ACTIVE) + continue; + list_for_each(&dev->port[i].ep_list, ep, entry) { + if (index == inx) + return ep; + ++inx; + } + } + } + + acm_log(1, "notice - could not find ep %d\n", index); + return NULL; +} + +static int +acm_svr_query_path(struct acmc_client *client, struct acm_msg *msg) +{ + struct acmc_addr *addr; + struct acmc_ep *ep; + + acm_log(2, "client %d\n", client->index); + if (msg->hdr.length != ACM_MSG_HDR_LENGTH + ACM_MSG_EP_LENGTH) { + acm_log(0, "ERROR - invalid length: 0x%x\n", msg->hdr.length); + return acmc_query_response(client->index, msg, ACM_STATUS_EINVAL); + } + + addr = acm_get_ep_address(&msg->resolve_data[0]); + if (!addr) { + acm_log(1, "notice - could not find local end point address\n"); + return acmc_query_response(client->index, msg, ACM_STATUS_ESRCADDR); + } + + ep = container_of(addr->addr.endpoint, struct acmc_ep, endpoint); + return ep->port->prov->query(addr->prov_addr_context, msg, client->index); +} + +static int acm_svr_select_src(struct acm_ep_addr_data *src, struct acm_ep_addr_data *dst) +{ + union socket_addr addr; + socklen_t len; + int ret; + int s; + + acm_log(2, "selecting source address\n"); + memset(&addr, 0, sizeof addr); + switch (dst->type) { + case ACM_EP_INFO_ADDRESS_IP: + addr.sin.sin_family = AF_INET; + memcpy(&addr.sin.sin_addr, dst->info.addr, 4); + len = sizeof(struct sockaddr_in); + break; + case ACM_EP_INFO_ADDRESS_IP6: + addr.sin6.sin6_family = AF_INET6; + memcpy(&addr.sin6.sin6_addr, dst->info.addr, 16); + len = sizeof(struct sockaddr_in6); + break; + default: + acm_log(1, "notice - bad destination type, cannot lookup source\n"); + return ACM_STATUS_EDESTTYPE; + } + + s = socket(addr.sa.sa_family, SOCK_DGRAM, IPPROTO_UDP); + if (s == -1) { + 
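/* note: acm_svr_select_src() relies on the "UDP connect" trick: for + * SOCK_DGRAM, connect() only performs the kernel route lookup (no + * packet is sent), and getsockname() then reports the local address + * the stack would pick for that destination. A minimal IPv4 sketch + * of the same idea (illustrative only, not part of this daemon): + * + * int s = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); + * struct sockaddr_in dst = { .sin_family = AF_INET }; + * // fill dst.sin_addr / dst.sin_port, then: + * connect(s, (struct sockaddr *)&dst, sizeof(dst)); + * struct sockaddr_in src; + * socklen_t len = sizeof(src); + * getsockname(s, (struct sockaddr *)&src, &len); + * close(s); // src.sin_addr now holds the selected source IP + */ +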
acm_log(0, "ERROR - unable to allocate socket\n"); + return errno; + } + + ret = connect(s, &addr.sa, len); + if (ret) { + acm_log(0, "ERROR - unable to connect socket\n"); + ret = errno; + goto out; + } + + ret = getsockname(s, &addr.sa, &len); + if (ret) { + acm_log(0, "ERROR - failed to get socket address\n"); + ret = errno; + goto out; + } + + src->type = dst->type; + src->flags = ACM_EP_FLAG_SOURCE; + if (dst->type == ACM_EP_INFO_ADDRESS_IP) { + memcpy(&src->info.addr, &addr.sin.sin_addr, 4); + } else { + memcpy(&src->info.addr, &addr.sin6.sin6_addr, 16); + } +out: + close(s); + return ret; +} + +/* + * Verify the resolve message from the client and return + * references to the source and destination addresses. + * The message buffer contains extra address data buffers. If a + * source address is not given, reference an empty address buffer, + * and we'll resolve a source address later. Record the location of + * the source and destination addresses in the message header data + * to avoid further searches. + */ +static uint8_t acm_svr_verify_resolve(struct acm_msg *msg) +{ + int i, cnt, have_dst = 0; + + if (msg->hdr.length < ACM_MSG_HDR_LENGTH) { + acm_log(0, "ERROR - invalid msg hdr length %d\n", msg->hdr.length); + return ACM_STATUS_EINVAL; + } + + msg->hdr.src_out = 1; + cnt = (msg->hdr.length - ACM_MSG_HDR_LENGTH) / ACM_MSG_EP_LENGTH; + for (i = 0; i < cnt; i++) { + if (msg->resolve_data[i].flags & ACM_EP_FLAG_SOURCE) { + if (!msg->hdr.src_out) { + acm_log(0, "ERROR - multiple sources specified\n"); + return ACM_STATUS_ESRCADDR; + } + if (!msg->resolve_data[i].type || + (msg->resolve_data[i].type >= ACM_ADDRESS_RESERVED)) { + acm_log(0, "ERROR - unsupported source address type\n"); + return ACM_STATUS_ESRCTYPE; + } + msg->hdr.src_out = 0; + msg->hdr.src_index = i; + } + if (msg->resolve_data[i].flags & ACM_EP_FLAG_DEST) { + if (have_dst) { + acm_log(0, "ERROR - multiple destinations specified\n"); + return ACM_STATUS_EDESTADDR; + } + if (!msg->resolve_data[i].type || + (msg->resolve_data[i].type >= ACM_ADDRESS_RESERVED)) { + acm_log(0, "ERROR - unsupported destination address type\n"); + return ACM_STATUS_EDESTTYPE; + } + have_dst = 1; + msg->hdr.dst_index = i; + } + } + + if (!have_dst) { + acm_log(0, "ERROR - destination address required\n"); + return ACM_STATUS_EDESTTYPE; + } + + if (msg->hdr.src_out) { + msg->hdr.src_index = i; + memset(&msg->resolve_data[i], 0, sizeof(struct acm_ep_addr_data)); + } + return ACM_STATUS_SUCCESS; +} + +static int +acm_svr_resolve_dest(struct acmc_client *client, struct acm_msg *msg) +{ + struct acmc_addr *addr; + struct acmc_ep *ep; + struct acm_ep_addr_data *saddr, *daddr; + uint8_t status; + + acm_log(2, "client %d\n", client->index); + status = acm_svr_verify_resolve(msg); + if (status) { + acm_log(0, "notice - misformatted or unsupported request\n"); + return acmc_resolve_response(client->index, msg, status); + } + + saddr = &msg->resolve_data[msg->hdr.src_index]; + daddr = &msg->resolve_data[msg->hdr.dst_index]; + if (msg->hdr.src_out) { + status = acm_svr_select_src(saddr, daddr); + if (status) { + acm_log(0, "notice - unable to select suitable source address\n"); + return acmc_resolve_response(client->index, msg, status); + } + } + + acm_format_name(2, log_data, sizeof log_data, + saddr->type, saddr->info.addr, sizeof saddr->info.addr); + acm_log(2, "src %s\n", log_data); + addr = acm_get_ep_address(saddr); + if (!addr) { + acm_log(0, "notice - unknown local end point address\n"); + return acmc_resolve_response(client->index, msg, 
ACM_STATUS_ESRCADDR); + } + + ep = container_of(addr->addr.endpoint, struct acmc_ep, endpoint); + return ep->port->prov->resolve(addr->prov_addr_context, msg, client->index); +} + +/* + * The message buffer contains extra address data buffers. We extract the + * destination address from the path record into an extra buffer, so we can + * lookup the destination by either LID or GID. + */ +static int +acm_svr_resolve_path(struct acmc_client *client, struct acm_msg *msg) +{ + struct acmc_addr *addr; + struct acmc_ep *ep; + struct ibv_path_record *path; + + acm_log(2, "client %d\n", client->index); + if (msg->hdr.length < (ACM_MSG_HDR_LENGTH + ACM_MSG_EP_LENGTH)) { + acm_log(0, "notice - invalid msg hdr length %d\n", msg->hdr.length); + return acmc_resolve_response(client->index, msg, ACM_STATUS_EINVAL); + } + + path = &msg->resolve_data[0].info.path; + if (!path->dlid && ib_any_gid(&path->dgid)) { + acm_log(0, "notice - no destination specified\n"); + return acmc_resolve_response(client->index, msg, + ACM_STATUS_EDESTADDR); + } + + acm_format_name(2, log_data, sizeof log_data, ACM_EP_INFO_PATH, + msg->resolve_data[0].info.addr, sizeof *path); + acm_log(2, "path %s\n", log_data); + addr = acm_get_ep_address(&msg->resolve_data[0]); + if (!addr) { + acm_log(0, "notice - unknown local end point address\n"); + return acmc_resolve_response(client->index, msg, + ACM_STATUS_ESRCADDR); + } + + ep = container_of(addr->addr.endpoint, struct acmc_ep, endpoint); + return ep->port->prov->resolve(addr->prov_addr_context, msg, + client->index); +} + +static int acm_svr_resolve(struct acmc_client *client, struct acm_msg *msg) +{ + (void) atomic_inc(&client->refcnt); + + if (msg->resolve_data[0].type == ACM_EP_INFO_PATH) { + if (msg->resolve_data[0].flags & ACM_FLAGS_QUERY_SA) { + return acm_svr_query_path(client, msg); + } else { + return acm_svr_resolve_path(client, msg); + } + } else { + return acm_svr_resolve_dest(client, msg); + } +} + +static int acm_svr_perf_query(struct acmc_client *client, struct acm_msg *msg) +{ + int ret, i; + uint16_t len; + struct acmc_addr *addr; + struct acmc_ep *ep = NULL; + int index; + + acm_log(2, "client %d\n", client->index); + index = msg->hdr.src_index; + msg->hdr.opcode |= ACM_OP_ACK; + msg->hdr.status = ACM_STATUS_SUCCESS; + msg->hdr.dst_index = 0; + + if ((be16toh(msg->hdr.length) < (ACM_MSG_HDR_LENGTH + ACM_MSG_EP_LENGTH) + && index < 1) || + ((be16toh(msg->hdr.length) >= (ACM_MSG_HDR_LENGTH + ACM_MSG_EP_LENGTH) + && !(msg->resolve_data[0].flags & ACM_EP_FLAG_SOURCE)))) { + for (i = 0; i < ACM_MAX_COUNTER; i++) + msg->perf_data[i] = htobe64((uint64_t) atomic_get(&counter[i])); + + msg->hdr.src_out = ACM_MAX_COUNTER; + len = ACM_MSG_HDR_LENGTH + (ACM_MAX_COUNTER * sizeof(uint64_t)); + } else { + if (index >= 1) { + ep = acm_get_ep(index - 1, msg->hdr.src_index); + } else { + addr = acm_get_ep_address(&msg->resolve_data[0]); + if (addr) + ep = container_of(addr->addr.endpoint, + struct acmc_ep, endpoint); + } + + if (ep) { + ep->port->prov->query_perf(ep->prov_ep_context, + msg->perf_data, &msg->hdr.src_out); + len = ACM_MSG_HDR_LENGTH + (msg->hdr.src_out * sizeof(uint64_t)); + } else { + msg->hdr.status = ACM_STATUS_ESRCADDR; + len = ACM_MSG_HDR_LENGTH; + } + } + msg->hdr.length = htobe16(len); + + ret = send(client->sock, (char *) msg, len, 0); + if (ret != len) + acm_log(0, "ERROR - failed to send response\n"); + else + ret = 0; + + return ret; +} + +static int may_be_realloc(struct acm_msg **msg_ptr, + int len, + int cnt, + int *cur_msg_siz_ptr, + int max_msg_siz) 
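+ /* Grows *msg_ptr in fixed chunks of 16 address slots so repeated + * appends stay cheap: with the usual 64-byte ACM_MAX_ADDRESS each + * realloc adds 1 KiB. Growth is refused once it would pass + * max_msg_siz, which the caller sets to USHRT_MAX because the wire + * hdr.length field is only 16 bits. */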
+{ + + /* Check if a new address exceeds the protocol constrained max size */ + if (len + (cnt + 1) * ACM_MAX_ADDRESS > max_msg_siz) { + acm_log(0, "ERROR - unable to amend more addresses to acm_msg due to protocol constraints\n"); + return ENOMEM; + } + + /* Check if a new address exceeds current size of msg */ + if (len + (cnt + 1) * ACM_MAX_ADDRESS > *cur_msg_siz_ptr) { + const size_t chunk_size = 16 * ACM_MAX_ADDRESS; + struct acm_msg *new_msg = realloc(*msg_ptr, *cur_msg_siz_ptr + chunk_size); + + if (!new_msg) { + acm_log(0, "ERROR - failed to allocate longer acm_msg\n"); + return ENOMEM; + } + + *msg_ptr = new_msg; + *cur_msg_siz_ptr += chunk_size; + } + + return 0; +} + +static int acm_svr_ep_query(struct acmc_client *client, struct acm_msg **_msg) +{ + int sts; + int ret, i; + uint16_t len; + struct acmc_ep *ep; + int index, cnt = 0; + struct acm_msg *msg = *_msg; + int cur_msg_siz = sizeof(*msg); + int max_msg_siz = USHRT_MAX; + + acm_log(2, "client %d\n", client->index); + index = msg->hdr.src_out; + ep = acm_get_ep(index - 1, msg->hdr.src_index); + if (ep) { + msg->hdr.status = ACM_STATUS_SUCCESS; + msg->ep_data[0].dev_guid = ep->port->dev->device.dev_guid; + msg->ep_data[0].port_num = ep->port->port.port_num; + msg->ep_data[0].phys_port_cnt = ep->port->dev->port_cnt; + msg->ep_data[0].pkey = htobe16(ep->endpoint.pkey); + strncpy((char *)msg->ep_data[0].prov_name, ep->port->prov->name, + ACM_MAX_PROV_NAME - 1); + msg->ep_data[0].prov_name[ACM_MAX_PROV_NAME - 1] = '\0'; + len = ACM_MSG_HDR_LENGTH + sizeof(struct acm_ep_config_data); + for (i = 0; i < ep->nmbr_ep_addrs; i++) { + if (ep->addr_info[i].addr.type != ACM_ADDRESS_INVALID) { + sts = may_be_realloc(_msg, len, cnt, &cur_msg_siz, max_msg_siz); + msg = *_msg; + if (sts) + break; + memcpy(msg->ep_data[0].addrs[cnt++].name, + ep->addr_info[i].string_buf, + ACM_MAX_ADDRESS); + } + } + msg->ep_data[0].addr_cnt = htobe16(cnt); + len += cnt * ACM_MAX_ADDRESS; + } else { + msg->hdr.status = ACM_STATUS_EINVAL; + len = ACM_MSG_HDR_LENGTH; + } + msg->hdr.opcode |= ACM_OP_ACK; + msg->hdr.src_index = 0; + msg->hdr.dst_index = 0; + msg->hdr.length = htobe16(len); + + ret = send(client->sock, (char *) msg, len, 0); + if (ret != len) + acm_log(0, "ERROR - failed to send response\n"); + else + ret = 0; + + return ret; +} + +static int acm_msg_length(struct acm_msg *msg) +{ + return (msg->hdr.opcode == ACM_OP_RESOLVE) ? 
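+ /* historical protocol quirk: ACM_OP_RESOLVE carries hdr.length in + * host byte order, every other opcode in network (big-endian) order */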
+ msg->hdr.length : be16toh(msg->hdr.length); +} + +static void acm_svr_receive(struct acmc_client *client) +{ + struct acm_msg *msg = malloc(sizeof(*msg)); + int ret; + + if (!msg) { + acm_log(0, "ERROR - Unable to alloc acm_msg\n"); + ret = ENOMEM; + goto out; + } + + acm_log(2, "client %d\n", client->index); + ret = recv(client->sock, (char *)msg, sizeof(*msg), 0); + if (ret <= 0 || ret != acm_msg_length(msg)) { + acm_log(2, "client disconnected\n"); + ret = ACM_STATUS_ENOTCONN; + goto out; + } + + if (msg->hdr.version != ACM_VERSION) { + acm_log(0, "ERROR - unsupported version %d\n", msg->hdr.version); + goto out; + } + + switch (msg->hdr.opcode & ACM_OP_MASK) { + case ACM_OP_RESOLVE: + atomic_inc(&counter[ACM_CNTR_RESOLVE]); + ret = acm_svr_resolve(client, msg); + break; + case ACM_OP_PERF_QUERY: + ret = acm_svr_perf_query(client, msg); + break; + case ACM_OP_EP_QUERY: + ret = acm_svr_ep_query(client, &msg); + break; + default: + acm_log(0, "ERROR - unknown opcode 0x%x\n", msg->hdr.opcode); + break; + } + +out: + free(msg); + if (ret) + acm_disconnect_client(client); +} + +static int acm_nl_to_addr_data(struct acm_ep_addr_data *ad, + int af_family, uint8_t *addr, size_t addr_len) +{ + if (addr_len > ACM_MAX_ADDRESS) + return EINVAL; + + /* find the ep associated with this address "if any" */ + switch (af_family) { + case AF_INET: + ad->type = ACM_ADDRESS_IP; + break; + case AF_INET6: + ad->type = ACM_ADDRESS_IP6; + break; + default: + return EINVAL; + } + memcpy(&ad->info.addr, addr, addr_len); + return 0; +} + +static void acm_add_ep_ip(char *ifname, struct acm_ep_addr_data *data, char *ip_str) +{ + struct acmc_ep *ep; + struct acmc_device *dev; + uint8_t port_num; + uint16_t pkey; + union ibv_gid sgid; + struct acmc_addr *addr; + + addr = acm_get_ep_address(data); + if (addr) { + acm_log(1, "Address '%s' already available\n", ip_str); + return; + } + + if (acm_if_get_sgid(ifname, &sgid)) + return; + + dev = acm_get_device_from_gid(&sgid, &port_num); + if (!dev) + return; + + if (acm_if_get_pkey(ifname, &pkey)) + return; + + acm_log(0, " %s\n", ip_str); + + ep = acm_find_ep(&dev->port[port_num - 1], pkey); + if (ep) { + if (acm_ep_insert_addr(ep, ip_str, data->info.addr, + data->type)) + acm_log(0, "Failed to add '%s' to EP\n", ip_str); + } else { + acm_log(0, "Failed to add '%s' no EP for pkey\n", ip_str); + } +} + +static void acm_rm_ep_ip(struct acm_ep_addr_data *data) +{ + struct acmc_ep *ep; + struct acmc_addr *addr; + + addr = acm_get_ep_address(data); + if (addr) { + ep = container_of(addr->addr.endpoint, struct acmc_ep, endpoint); + acm_format_name(0, log_data, sizeof log_data, + data->type, data->info.addr, sizeof data->info.addr); + acm_log(0, " %s\n", log_data); + acm_mark_addr_invalid(ep, data); + } +} + +static int acm_ipnl_create(void) +{ + struct sockaddr_nl addr; + + if ((ip_mon_socket = socket(PF_NETLINK, SOCK_RAW | SOCK_NONBLOCK, NETLINK_ROUTE)) == -1) { + acm_log(0, "Failed to open NETLINK_ROUTE socket"); + return EIO; + } + + memset(&addr, 0, sizeof(addr)); + addr.nl_family = AF_NETLINK; + addr.nl_groups = RTMGRP_LINK | RTMGRP_IPV4_IFADDR; + + if (bind(ip_mon_socket, (struct sockaddr *)&addr, sizeof(addr)) == -1) { + acm_log(0, "Failed to bind NETLINK_ROUTE socket"); + return EIO; + } + + return 0; +} + +static void acm_ip_iter_cb(char *ifname, union ibv_gid *gid, uint16_t pkey, + uint8_t addr_type, uint8_t *addr, + char *ip_str, void *ctx) +{ + int ret = EINVAL; + struct acmc_device *dev; + struct acmc_ep *ep; + uint8_t port_num; + char gid_str[INET6_ADDRSTRLEN]; 
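+ /* an InfiniBand GID is 16 bytes, the same size as an IPv6 address, + * so inet_ntop(AF_INET6) below is reused to format it for logging */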
+ + dev = acm_get_device_from_gid(gid, &port_num); + if (dev) { + ep = acm_find_ep(&dev->port[port_num - 1], pkey); + if (ep) + ret = acm_ep_insert_addr(ep, ip_str, addr, addr_type); + } + + if (ret) { + inet_ntop(AF_INET6, gid->raw, gid_str, sizeof(gid_str)); + acm_log(0, "Failed to add '%s' (gid %s; pkey 0x%x)\n", + ip_str, gid_str, pkey); + } +} + +/* Netlink updates have indicated a failure which means we are no longer in + * sync. This should be a rare condition so we handle this with a "big + * hammer" by clearing and re-reading all the system IP's. + */ +static int resync_system_ips(void) +{ + struct acmc_device *dev; + struct acmc_port *port; + struct acmc_ep *ep; + int i, cnt; + + acm_log(0, "Resyncing all IP's\n"); + + /* mark all IP's invalid */ + list_for_each(&dev_list, dev, entry) { + for (cnt = 0; cnt < dev->port_cnt; cnt++) { + port = &dev->port[cnt]; + + list_for_each(&port->ep_list, ep, entry) { + for (i = 0; i < ep->nmbr_ep_addrs; i++) { + if (ep->addr_info[i].addr.type == ACM_ADDRESS_IP || + ep->addr_info[i].addr.type == ACM_ADDRESS_IP6) + ep->addr_info[i].addr.type = ACM_ADDRESS_INVALID; + } + } + } + } + + return acm_if_iter_sys(acm_ip_iter_cb, NULL); +} + +static void acm_ipnl_handler(void) +{ + int len; + char buffer[NL_MSG_BUF_SIZE]; + struct nlmsghdr *nlh; + char ifname[IFNAMSIZ]; + char ip_str[INET6_ADDRSTRLEN]; + struct acm_ep_addr_data ad; + + while ((len = recv(ip_mon_socket, buffer, NL_MSG_BUF_SIZE, 0)) > 0) { + nlh = (struct nlmsghdr *)buffer; + while ((NLMSG_OK(nlh, len)) && (nlh->nlmsg_type != NLMSG_DONE)) { + struct ifaddrmsg *ifa = (struct ifaddrmsg *) NLMSG_DATA(nlh); + struct ifinfomsg *ifi = (struct ifinfomsg *) NLMSG_DATA(nlh); + struct rtattr *rth = IFA_RTA(ifa); + int rtl = IFA_PAYLOAD(nlh); + + switch (nlh->nlmsg_type) { + case RTM_NEWADDR: + if_indextoname(ifa->ifa_index, ifname); + while (rtl && RTA_OK(rth, rtl)) { + if (rth->rta_type == IFA_LOCAL) { + acm_log(1, "New system address available %s : %s\n", + ifname, inet_ntop(ifa->ifa_family, RTA_DATA(rth), + ip_str, sizeof(ip_str))); + if (!acm_nl_to_addr_data(&ad, ifa->ifa_family, + RTA_DATA(rth), + RTA_PAYLOAD(rth))) { + acm_add_ep_ip(ifname, &ad, ip_str); + } + } + rth = RTA_NEXT(rth, rtl); + } + break; + case RTM_DELADDR: + if_indextoname(ifa->ifa_index, ifname); + while (rtl && RTA_OK(rth, rtl)) { + if (rth->rta_type == IFA_LOCAL) { + acm_log(1, "System address removed %s : %s\n", + ifname, inet_ntop(ifa->ifa_family, RTA_DATA(rth), + ip_str, sizeof(ip_str))); + if (!acm_nl_to_addr_data(&ad, ifa->ifa_family, + RTA_DATA(rth), + RTA_PAYLOAD(rth))) { + acm_rm_ep_ip(&ad); + } + } + rth = RTA_NEXT(rth, rtl); + } + break; + case RTM_NEWLINK: + acm_log(2, "Link added : %s\n", + if_indextoname(ifi->ifi_index, ifname)); + break; + case RTM_DELLINK: + acm_log(2, "Link removed : %s\n", + if_indextoname(ifi->ifi_index, ifname)); + break; + default: + acm_log(2, "unknown netlink message\n"); + break; + } + nlh = NLMSG_NEXT(nlh, len); + } + } + + if (len < 0 && errno == ENOBUFS) { + acm_log(0, "ENOBUFS returned from netlink...\n"); + resync_system_ips(); + } +} + +static int acm_nl_send(int sock, struct acm_msg *msg) +{ + struct sockaddr_nl dst_addr; + struct acm_nl_msg acmnlmsg; + struct acm_nl_msg *orig; + int ret; + int datalen; + + orig = (struct acm_nl_msg *)(uintptr_t)msg->hdr.tid; + + memset(&dst_addr, 0, sizeof(dst_addr)); + dst_addr.nl_family = AF_NETLINK; + dst_addr.nl_groups = (1 << (RDMA_NL_GROUP_LS - 1)); + + memset(&acmnlmsg, 0, sizeof(acmnlmsg)); + acmnlmsg.nlmsg_header.nlmsg_len = 
NLMSG_HDRLEN; + acmnlmsg.nlmsg_header.nlmsg_pid = getpid(); + acmnlmsg.nlmsg_header.nlmsg_type = orig->nlmsg_header.nlmsg_type; + acmnlmsg.nlmsg_header.nlmsg_seq = orig->nlmsg_header.nlmsg_seq; + + if (msg->hdr.status != ACM_STATUS_SUCCESS) { + acm_log(2, "acm status no success = %d\n", msg->hdr.status); + acmnlmsg.nlmsg_header.nlmsg_flags |= RDMA_NL_LS_F_ERR; + } else { + acm_log(2, "acm status success\n"); + acmnlmsg.nlmsg_header.nlmsg_len += + NLA_ALIGN(sizeof(struct acm_nl_path)); + acmnlmsg.path[0].attr_hdr.nla_type = LS_NLA_TYPE_PATH_RECORD; + acmnlmsg.path[0].attr_hdr.nla_len = sizeof(struct acm_nl_path); + if (orig->resolve_header.path_use == + LS_RESOLVE_PATH_USE_UNIDIRECTIONAL) + acmnlmsg.path[0].rec.flags = IB_PATH_PRIMARY | + IB_PATH_OUTBOUND; + else + acmnlmsg.path[0].rec.flags = IB_PATH_PRIMARY | + IB_PATH_GMP | IB_PATH_BIDIRECTIONAL; + memcpy(acmnlmsg.path[0].rec.path_rec, + &msg->resolve_data[0].info.path, + sizeof(struct ibv_path_record)); + } + + datalen = NLMSG_ALIGN(acmnlmsg.nlmsg_header.nlmsg_len); + ret = sendto(sock, &acmnlmsg, datalen, 0, + (const struct sockaddr *)&dst_addr, + (socklen_t)sizeof(dst_addr)); + if (ret != datalen) { + acm_log(0, "ERROR - sendto = %d errno = %d\n", ret, errno); + ret = -1; + } else { + ret = msg->hdr.length; + } + + free(orig); + + return ret; +} + +#define NLA_LEN(nla) ((nla)->nla_len - NLA_HDRLEN) +#define NLA_DATA(nla) ((char *)(nla) + NLA_HDRLEN) + +static int acm_nl_parse_path_attr(struct nlattr *attr, + struct acm_ep_addr_data *data) +{ + struct ibv_path_record *path; + uint64_t *sid; + struct rdma_nla_ls_gid *gid; + uint8_t *tcl; + uint16_t *pkey; + uint16_t *qos; + uint16_t val; + int ret = 0; + +#define IBV_PATH_RECORD_QOS_MASK 0xfff0 + + path = &data->info.path; + switch (attr->nla_type & RDMA_NLA_TYPE_MASK) { + case LS_NLA_TYPE_SERVICE_ID: + sid = (uint64_t *) NLA_DATA(attr); + if (NLA_LEN(attr) == sizeof(*sid)) { + acm_log(2, "service_id 0x%" PRIx64 "\n", *sid); + path->service_id = htobe64(*sid); + } else { + ret = -1; + } + break; + + case LS_NLA_TYPE_DGID: + gid = (struct rdma_nla_ls_gid *) NLA_DATA(attr); + if (NLA_LEN(attr) == sizeof(gid->gid)) { + acm_format_name(2, log_data, sizeof(log_data), + ACM_ADDRESS_GID, gid->gid, + sizeof(union ibv_gid)); + acm_log(2, "path dgid %s\n", log_data); + memcpy(path->dgid.raw, gid->gid, sizeof(path->dgid)); + data->flags |= ACM_EP_FLAG_DEST; + } else { + ret = -1; + } + break; + + case LS_NLA_TYPE_SGID: + gid = (struct rdma_nla_ls_gid *) NLA_DATA(attr); + if (NLA_LEN(attr) == sizeof(gid->gid)) { + acm_format_name(2, log_data, sizeof(log_data), + ACM_ADDRESS_GID, gid->gid, + sizeof(union ibv_gid)); + acm_log(2, "path sgid %s\n", log_data); + memcpy(path->sgid.raw, gid->gid, sizeof(path->sgid)); + data->flags |= ACM_EP_FLAG_SOURCE; + } else { + ret = -1; + } + break; + + case LS_NLA_TYPE_TCLASS: + tcl = (uint8_t *) NLA_DATA(attr); + if (NLA_LEN(attr) == sizeof(*tcl)) { + acm_log(2, "tclass 0x%x\n", *tcl); + path->tclass = *tcl; + } else { + ret = -1; + } + break; + + case LS_NLA_TYPE_PKEY: + pkey = (uint16_t *) NLA_DATA(attr); + if (NLA_LEN(attr) == sizeof(*pkey)) { + acm_log(2, "pkey 0x%x\n", *pkey); + path->pkey = htobe16(*pkey); + } else { + ret = -1; + } + break; + + case LS_NLA_TYPE_QOS_CLASS: + qos = (uint16_t *) NLA_DATA(attr); + if (NLA_LEN(attr) == sizeof(*qos)) { + acm_log(2, "qos_class 0x%x\n", *qos); + val = be16toh(path->qosclass_sl); + val &= ~IBV_PATH_RECORD_QOS_MASK; + val |= (*qos & IBV_PATH_RECORD_QOS_MASK); + path->qosclass_sl = htobe16(val); + } else { + ret = -1; 
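+ /* attribute length mismatch: fail the parse so the caller replies + * with ACM_STATUS_EINVAL. The 0xfff0 mask above reflects how the + * path record packs the 12-bit QoS class in bits 15:4 and the 4-bit + * SL in bits 3:0 of the single be16 qosclass_sl field. */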
+ } + break; + + default: + acm_log(1, "WARN: unknown attr %x\n", attr->nla_type); + /* We can not ignore a mandatory attribute */ + if (attr->nla_type & RDMA_NLA_F_MANDATORY) + ret = -1; + break; + } + + return ret; +} + +static void acm_nl_process_invalid_request(struct acmc_client *client, + struct acm_nl_msg *acmnlmsg) +{ + struct acm_msg msg; + + memset(&msg, 0, sizeof(msg)); + msg.hdr.opcode = ACM_OP_RESOLVE; + msg.hdr.version = ACM_VERSION; + msg.hdr.length = ACM_MSG_HDR_LENGTH; + msg.hdr.status = ACM_STATUS_EINVAL; + msg.hdr.tid = (uintptr_t) acmnlmsg; + + acm_nl_send(client->sock, &msg); +} + +static void acm_nl_process_resolve(struct acmc_client *client, + struct acm_nl_msg *acmnlmsg) +{ + struct acm_msg msg; + struct nlattr *attr; + int payload_len; + int resolve_hdr_len; + int rem; + int total_attr_len; + int status; + unsigned char *data; + + memset(&msg, 0, sizeof(msg)); + msg.hdr.opcode = ACM_OP_RESOLVE; + msg.hdr.version = ACM_VERSION; + msg.hdr.length = ACM_MSG_HDR_LENGTH + ACM_MSG_EP_LENGTH; + msg.hdr.status = ACM_STATUS_SUCCESS; + msg.hdr.tid = (uintptr_t) acmnlmsg; + msg.resolve_data[0].type = ACM_EP_INFO_PATH; + + /* We support only one pathrecord */ + acm_log(2, "path use 0x%x\n", acmnlmsg->resolve_header.path_use); + if (acmnlmsg->resolve_header.path_use == + LS_RESOLVE_PATH_USE_UNIDIRECTIONAL) + msg.resolve_data[0].info.path.reversible_numpath = 1; + else + msg.resolve_data[0].info.path.reversible_numpath = + IBV_PATH_RECORD_REVERSIBLE | 1; + + data = (unsigned char *) &acmnlmsg->nlmsg_header + NLMSG_HDRLEN; + resolve_hdr_len = NLMSG_ALIGN(sizeof(struct rdma_ls_resolve_header)); + attr = (struct nlattr *) (data + resolve_hdr_len); + payload_len = acmnlmsg->nlmsg_header.nlmsg_len - NLMSG_HDRLEN - + resolve_hdr_len; + rem = payload_len; + while (1) { + if (rem < (int) sizeof(*attr) || + attr->nla_len < sizeof(*attr) || + attr->nla_len > rem) + break; + + status = acm_nl_parse_path_attr(attr, &msg.resolve_data[0]); + if (status) { + acm_nl_process_invalid_request(client, acmnlmsg); + return; + } + + /* Next attribute */ + total_attr_len = NLA_ALIGN(attr->nla_len); + rem -= total_attr_len; + attr = (struct nlattr *) ((char *) attr + total_attr_len); + } + + atomic_inc(&counter[ACM_CNTR_RESOLVE]); + acm_svr_resolve(client, &msg); +} + +static int acm_nl_is_valid_resolve_request(struct acm_nl_msg *acmnlmsg) +{ + int payload_len; + + payload_len = acmnlmsg->nlmsg_header.nlmsg_len - NLMSG_HDRLEN; + if (payload_len < (sizeof(struct rdma_ls_resolve_header) + + sizeof(struct nlattr))) + return 0; + + return 1; +} + +static void acm_nl_receive(struct acmc_client *client) +{ + struct acm_nl_msg *acmnlmsg; + int datalen = sizeof(*acmnlmsg); + int ret; + uint16_t client_inx, op; + + acmnlmsg = calloc(1, sizeof(*acmnlmsg)); + if (!acmnlmsg) { + acm_log(0, "Out of memory for recving nl msg.\n"); + return; + } + ret = recv(client->sock, acmnlmsg, datalen, 0); + if (!NLMSG_OK(&acmnlmsg->nlmsg_header, ret)) { + acm_log(0, "Netlink receive error: %d.\n", ret); + goto rcv_cleanup; + } + + acm_log(2, "nlmsg: len %d type 0x%x flags 0x%x seq %d pid %d\n", + acmnlmsg->nlmsg_header.nlmsg_len, + acmnlmsg->nlmsg_header.nlmsg_type, + acmnlmsg->nlmsg_header.nlmsg_flags, + acmnlmsg->nlmsg_header.nlmsg_seq, + acmnlmsg->nlmsg_header.nlmsg_pid); + + /* Currently we handle only request from the local service client */ + client_inx = RDMA_NL_GET_CLIENT(acmnlmsg->nlmsg_header.nlmsg_type); + op = RDMA_NL_GET_OP(acmnlmsg->nlmsg_header.nlmsg_type); + if (client_inx != RDMA_NL_LS) { + acm_log_once(0, "ERROR 
- Unknown NL client ID (%d)\n", client_inx); + goto rcv_cleanup; + } + + switch (op) { + case RDMA_NL_LS_OP_RESOLVE: + if (acm_nl_is_valid_resolve_request(acmnlmsg)) + acm_nl_process_resolve(client, acmnlmsg); + else + acm_nl_process_invalid_request(client, acmnlmsg); + break; + default: + /* Not supported*/ + acm_log_once(0, "WARN - invalid opcode %x\n", op); + acm_nl_process_invalid_request(client, acmnlmsg); + break; + } + + return; +rcv_cleanup: + free(acmnlmsg); +} + +static int acm_init_nl(void) +{ + struct sockaddr_nl src_addr; + int ret; + int nl_rcv_socket; + + nl_rcv_socket = socket(PF_NETLINK, SOCK_RAW, NETLINK_RDMA); + if (nl_rcv_socket == -1) { + acm_log(0, "ERROR - unable to allocate netlink recv socket\n"); + return errno; + } + + memset(&src_addr, 0, sizeof(src_addr)); + src_addr.nl_family = AF_NETLINK; + src_addr.nl_pid = getpid(); + src_addr.nl_groups = (1 << (RDMA_NL_GROUP_LS - 1)); + + ret = bind(nl_rcv_socket, (struct sockaddr *)&src_addr, + sizeof(src_addr)); + if (ret == -1) { + acm_log(0, "ERROR - unable to bind netlink socket\n"); + close(nl_rcv_socket); + return errno; + } + + /* init nl client structure */ + client_array[NL_CLIENT_INDEX].sock = nl_rcv_socket; + return 0; +} + +static void acm_server(bool systemd) +{ + fd_set readfds; + int i, n, ret; + struct acmc_device *dev; + + acm_log(0, "started\n"); + acm_init_server(); + + client_array[NL_CLIENT_INDEX].sock = -1; + listen_socket = -1; + if (systemd) { + ret = acm_listen_systemd(); + if (ret) { + acm_log(0, "ERROR - systemd server listen failed\n"); + return; + } + } + + if (listen_socket == -1) { + ret = acm_listen(); + if (ret) { + acm_log(0, "ERROR - server listen failed\n"); + return; + } + } + + if (client_array[NL_CLIENT_INDEX].sock == -1) { + ret = acm_init_nl(); + if (ret) + acm_log(1, "Warn - Netlink init failed\n"); + } + + if (systemd) + sd_notify(0, "READY=1"); + + while (1) { + n = (int) listen_socket; + FD_ZERO(&readfds); + FD_SET(listen_socket, &readfds); + n = max(n, (int) ip_mon_socket); + FD_SET(ip_mon_socket, &readfds); + + for (i = 0; i < FD_SETSIZE - 1; i++) { + if (client_array[i].sock != -1) { + FD_SET(client_array[i].sock, &readfds); + n = max(n, (int) client_array[i].sock); + } + } + + list_for_each(&dev_list, dev, entry) { + FD_SET(dev->device.verbs->async_fd, &readfds); + n = max(n, (int) dev->device.verbs->async_fd); + } + + ret = select(n + 1, &readfds, NULL, NULL, NULL); + if (ret == -1) { + acm_log(0, "ERROR - server select error\n"); + continue; + } + + if (FD_ISSET(listen_socket, &readfds)) + acm_svr_accept(); + + if (FD_ISSET(ip_mon_socket, &readfds)) + acm_ipnl_handler(); + + for (i = 0; i < FD_SETSIZE - 1; i++) { + if (client_array[i].sock != -1 && + FD_ISSET(client_array[i].sock, &readfds)) { + acm_log(2, "receiving from client %d\n", i); + if (i == NL_CLIENT_INDEX) + acm_nl_receive(&client_array[i]); + else + acm_svr_receive(&client_array[i]); + } + } + + list_for_each(&dev_list, dev, entry) { + if (FD_ISSET(dev->device.verbs->async_fd, &readfds)) { + acm_log(2, "handling event from %s\n", + dev->device.verbs->device->name); + acm_event_handler(dev); + } + } + } +} + +enum ibv_rate acm_get_rate(uint8_t width, uint8_t speed) +{ + switch (width) { + case 1: /* 1x */ + switch (speed) { + case 1: return IBV_RATE_2_5_GBPS; + case 2: return IBV_RATE_5_GBPS; + case 4: /* fall through */ + case 8: return IBV_RATE_10_GBPS; + case 16: return IBV_RATE_14_GBPS; + case 32: return IBV_RATE_25_GBPS; + default: return IBV_RATE_MAX; + } + case 2: /* 4x */ + switch (speed) { + case 1: 
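/* IBTA encodings: width 1/2/4/8 = 1x/4x/8x/12x lanes; speed 1 = SDR + * (2.5 Gb/s per lane), 2 = DDR (5), 4 = QDR (10), 8 = FDR10 (~10), + * 16 = FDR (14), 32 = EDR (25). Width 2 is 4x, so SDR here gives + * 4 * 2.5 = 10 Gb/s: */ +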
return IBV_RATE_10_GBPS; + case 2: return IBV_RATE_20_GBPS; + case 4: /* fall through */ + case 8: return IBV_RATE_40_GBPS; + case 16: return IBV_RATE_56_GBPS; + case 32: return IBV_RATE_100_GBPS; + default: return IBV_RATE_MAX; + } + case 4: /* 8x */ + switch (speed) { + case 1: return IBV_RATE_20_GBPS; + case 2: return IBV_RATE_40_GBPS; + case 4: /* fall through */ + case 8: return IBV_RATE_80_GBPS; + case 16: return IBV_RATE_112_GBPS; + case 32: return IBV_RATE_200_GBPS; + default: return IBV_RATE_MAX; + } + case 8: /* 12x */ + switch (speed) { + case 1: return IBV_RATE_30_GBPS; + case 2: return IBV_RATE_60_GBPS; + case 4: /* fall through */ + case 8: return IBV_RATE_120_GBPS; + case 16: return IBV_RATE_168_GBPS; + case 32: return IBV_RATE_300_GBPS; + default: return IBV_RATE_MAX; + } + default: + acm_log(0, "ERROR - unknown link width 0x%x\n", width); + return IBV_RATE_MAX; + } +} + +enum ibv_mtu acm_convert_mtu(int mtu) +{ + switch (mtu) { + case 256: return IBV_MTU_256; + case 512: return IBV_MTU_512; + case 1024: return IBV_MTU_1024; + case 2048: return IBV_MTU_2048; + case 4096: return IBV_MTU_4096; + default: return IBV_MTU_2048; + } +} + +enum ibv_rate acm_convert_rate(int rate) +{ + switch (rate) { + case 2: return IBV_RATE_2_5_GBPS; + case 5: return IBV_RATE_5_GBPS; + case 10: return IBV_RATE_10_GBPS; + case 20: return IBV_RATE_20_GBPS; + case 30: return IBV_RATE_30_GBPS; + case 40: return IBV_RATE_40_GBPS; + case 60: return IBV_RATE_60_GBPS; + case 80: return IBV_RATE_80_GBPS; + case 120: return IBV_RATE_120_GBPS; + case 14: return IBV_RATE_14_GBPS; + case 56: return IBV_RATE_56_GBPS; + case 112: return IBV_RATE_112_GBPS; + case 168: return IBV_RATE_168_GBPS; + case 25: return IBV_RATE_25_GBPS; + case 100: return IBV_RATE_100_GBPS; + case 200: return IBV_RATE_200_GBPS; + case 300: return IBV_RATE_300_GBPS; + default: return IBV_RATE_10_GBPS; + } +} + +static FILE *acm_open_addr_file(void) +{ + FILE *f; + + if ((f = fopen(addr_file, "r"))) + return f; + + acm_log(0, "notice - generating %s file\n", addr_file); + if (!(f = popen(acme, "r"))) { + acm_log(0, "ERROR - cannot generate %s\n", addr_file); + return NULL; + } + pclose(f); + return fopen(addr_file, "r"); +} + +static int +__acm_ep_insert_addr(struct acmc_ep *ep, const char *name, uint8_t *addr, + uint8_t addr_type) +{ + int i; + int ret; + uint8_t tmp[ACM_MAX_ADDRESS] = {}; + + memcpy(tmp, addr, acm_addr_len(addr_type)); + + for (i = 0; (i < ep->nmbr_ep_addrs) && + (ep->addr_info[i].addr.type != ACM_ADDRESS_INVALID); i++) + ; + if (i == ep->nmbr_ep_addrs) { + struct acmc_addr *new_info; + int j; + + new_info = realloc(ep->addr_info, (i + 1) * sizeof(*ep->addr_info)); + if (!new_info) { + ret = ENOMEM; + goto out; + } + + /* id_string needs to point to the reallocated string_buf */ + for (j = 0; (j < ep->nmbr_ep_addrs); j++) { + new_info[j].addr.id_string = new_info[j].string_buf; + } + + ep->addr_info = new_info; + + /* Added memory is not initialized */ + memset(ep->addr_info + i, 0, sizeof(*ep->addr_info)); + ep->addr_info[i].addr.endpoint = &ep->endpoint; + ep->addr_info[i].addr.id_string = ep->addr_info[i].string_buf; + ++ep->nmbr_ep_addrs; + } + + /* Open the provider endpoint only if at least a name or + address is found */ + if (!ep->prov_ep_context) { + ret = ep->port->prov->open_endpoint(&ep->endpoint, + ep->port->prov_port_context, + &ep->prov_ep_context); + if (ret) { + acm_log(0, "Error: failed to open prov ep\n"); + goto out; + } + } + ep->addr_info[i].addr.type = addr_type; + 
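/* mark the slot in use before registering with the provider; on + * add_address() failure below it is reset to ACM_ADDRESS_INVALID so + * the linear scan above can reuse the slot later */ +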
strncpy(ep->addr_info[i].string_buf, name, ACM_MAX_ADDRESS); + memcpy(ep->addr_info[i].addr.info.addr, tmp, ACM_MAX_ADDRESS); + ret = ep->port->prov->add_address(&ep->addr_info[i].addr, + ep->prov_ep_context, + &ep->addr_info[i].prov_addr_context); + if (ret) { + acm_log(0, "Error: failed to add addr to provider\n"); + ep->addr_info[i].addr.type = ACM_ADDRESS_INVALID; + } + +out: + return ret; +} + +static int +acm_ep_insert_addr(struct acmc_ep *ep, const char *name, uint8_t *addr, + uint8_t addr_type) +{ + int ret = -1; + + if (!acm_addr_lookup(&ep->endpoint, addr, addr_type)) { + ret = __acm_ep_insert_addr(ep, name, addr, addr_type); + } + + return ret; +} + +static struct acmc_device * +acm_get_device_from_gid(union ibv_gid *sgid, uint8_t *port) +{ + struct acmc_device *dev; + int i; + + list_for_each(&dev_list, dev, entry) { + for (*port = 1; *port <= dev->port_cnt; (*port)++) { + + for (i = 0; i < dev->port[*port - 1].gid_cnt; i++) { + + if (!memcmp(sgid->raw, + dev->port[*port - 1].gid_tbl[i].raw, + sizeof(*sgid))) + return dev; + } + } + } + return NULL; +} + +static void acm_ep_ip_iter_cb(char *ifname, union ibv_gid *gid, uint16_t pkey, + uint8_t addr_type, uint8_t *addr, + char *ip_str, void *ctx) +{ + uint8_t port_num; + struct acmc_device *dev; + struct acmc_ep *ep = ctx; + + dev = acm_get_device_from_gid(gid, &port_num); + if (dev && ep->port->dev == dev + && ep->port->port.port_num == port_num && + /* pkey retrieved from ipoib has always full mmbr bit set */ + (ep->endpoint.pkey | IB_PKEY_FULL_MEMBER) == pkey) { + if (!acm_ep_insert_addr(ep, ip_str, addr, addr_type)) { + acm_log(0, "Added %s %s %d 0x%x from %s\n", ip_str, + dev->device.verbs->device->name, port_num, ep->endpoint.pkey, + ifname); + } + } +} + +static int acm_get_system_ips(struct acmc_ep *ep) +{ + return acm_if_iter_sys(acm_ep_ip_iter_cb, ep); +} + +static int acm_assign_ep_names(struct acmc_ep *ep) +{ + FILE *faddr; + char *dev_name; + char s[120]; + char dev[32], name[ACM_MAX_ADDRESS], pkey_str[8]; + uint16_t pkey; + uint8_t addr[ACM_MAX_ADDRESS], type; + int port; + + dev_name = ep->port->dev->device.verbs->device->name; + acm_log(1, "device %s, port %d, pkey 0x%x\n", + dev_name, ep->port->port.port_num, ep->endpoint.pkey); + + acm_get_system_ips(ep); + + if (!(faddr = acm_open_addr_file())) { + acm_log(0, "ERROR - address file not found\n"); + goto out; + } + + while (fgets(s, sizeof s, faddr)) { + if (s[0] == '#') + continue; + + if (sscanf(s, "%46s%31s%d%7s", name, dev, &port, pkey_str) != 4) + continue; + + acm_log(2, "%s", s); + if (inet_pton(AF_INET, name, addr) > 0) { + if (!support_ips_in_addr_cfg) { + acm_log(0, "ERROR - IP's are not configured to be read from ibacm_addr.cfg\n"); + continue; + } + type = ACM_ADDRESS_IP; + } else if (inet_pton(AF_INET6, name, addr) > 0) { + if (!support_ips_in_addr_cfg) { + acm_log(0, "ERROR - IP's are not configured to be read from ibacm_addr.cfg\n"); + continue; + } + type = ACM_ADDRESS_IP6; + } else { + type = ACM_ADDRESS_NAME; + strncpy((char *)addr, name, sizeof(addr)); + } + + if (strcasecmp(pkey_str, "default")) { + if (sscanf(pkey_str, "%hx", &pkey) != 1) { + acm_log(0, "ERROR - bad pkey format %s\n", pkey_str); + continue; + } + } else { + pkey = ep->port->def_acm_pkey; + } + + if (!strcasecmp(dev_name, dev) && + (ep->port->port.port_num == (uint8_t) port) && + acm_same_partition(ep->endpoint.pkey, pkey)) { + acm_log(1, "assigning %s\n", name); + if (acm_ep_insert_addr(ep, name, addr, type)) { + acm_log(1, "maximum number of names assigned to EP\n"); + 
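/* any insert failure (duplicate address, allocation or provider + * error) aborts the scan of the remaining address-file lines */ +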
break; + } + } + } + fclose(faddr); + +out: + return (!ep->nmbr_ep_addrs || ep->addr_info[0].addr.type == ACM_ADDRESS_INVALID); +} + +static struct acmc_ep *acm_find_ep(struct acmc_port *port, uint16_t pkey) +{ + struct acmc_ep *ep, *res = NULL; + + acm_log(2, "pkey 0x%x\n", pkey); + + list_for_each(&port->ep_list, ep, entry) { + if (acm_same_partition(ep->endpoint.pkey, pkey)) { + res = ep; + break; + } + } + return res; +} + +static void acm_ep_down(struct acmc_ep *ep) +{ + int i; + + acm_log(1, "%s %d pkey 0x%04x\n", + ep->port->dev->device.verbs->device->name, + ep->port->port.port_num, ep->endpoint.pkey); + + for (i = 0; i < ep->nmbr_ep_addrs; i++) { + if (ep->addr_info[i].addr.type && + ep->addr_info[i].prov_addr_context) + ep->port->prov->remove_address(ep->addr_info[i]. + prov_addr_context); + } + + if (ep->prov_ep_context) + ep->port->prov->close_endpoint(ep->prov_ep_context); + + free(ep); +} + +static struct acmc_ep * +acm_alloc_ep(struct acmc_port *port, uint16_t pkey) +{ + struct acmc_ep *ep; + + acm_log(1, "\n"); + ep = calloc(1, sizeof *ep); + if (!ep) + return NULL; + + ep->port = port; + ep->endpoint.port = &port->port; + ep->endpoint.pkey = pkey; + ep->addr_info = NULL; + ep->nmbr_ep_addrs = 0; + + return ep; +} + +static void acm_ep_up(struct acmc_port *port, uint16_t pkey) +{ + struct acmc_ep *ep; + int ret; + + acm_log(1, "\n"); + if (acm_find_ep(port, pkey)) { + acm_log(2, "endpoint for pkey 0x%x already exists\n", pkey); + return; + } + + acm_log(2, "creating endpoint for pkey 0x%x\n", pkey); + ep = acm_alloc_ep(port, pkey); + if (!ep) + return; + + ret = acm_assign_ep_names(ep); + if (ret) { + acm_log(1, "unable to assign EP name for pkey 0x%x\n", pkey); + goto ep_close; + } + + list_add(&port->ep_list, &ep->entry); + return; + +ep_close: + if (ep->prov_ep_context) + port->prov->close_endpoint(ep->prov_ep_context); + + free(ep); +} + +static void acm_assign_provider(struct acmc_port *port) +{ + struct acmc_prov *prov; + struct acmc_subnet *subnet; + + acm_log(2, "port %s/%d\n", port->port.dev->verbs->device->name, + port->port.port_num); + list_for_each(&provider_list, prov, entry) { + list_for_each(&prov->subnet_list, subnet, entry) { + if (subnet->subnet_prefix == + port->gid_tbl[0].global.subnet_prefix) { + acm_log(2, "Found provider %s for port %s/%d\n", + prov->prov->name, + port->port.dev->verbs->device->name, + port->port.port_num); + port->prov = prov->prov; + return; + } + } + } + + /* If no provider is found, assign the default provider*/ + if (!port->prov) { + acm_log(2, "No prov found, assign default prov %s to %s/%d\n", + def_provider ? def_provider->prov->name: "NULL", + port->port.dev->verbs->device->name, + port->port.port_num); + port->prov = def_provider ? 
def_provider->prov : NULL; + } +} + +static void acm_port_get_gid_tbl(struct acmc_port *port) +{ + union ibv_gid gid; + int i, j, ret; + + for (i = 0;; i++) { + ret = ibv_query_gid(port->port.dev->verbs, port->port.port_num, + i, &gid); + if (ret || !gid.global.interface_id) + break; + } + + if (i > 0) { + port->gid_tbl = calloc(i, sizeof(union ibv_gid)); + if (!port->gid_tbl) { + acm_log(0, "Error: failed to allocate gid table\n"); + port->gid_cnt = 0; + return; + } + + for (j = 0; j < i; j++) { + ret = ibv_query_gid(port->port.dev->verbs, + port->port.port_num, j, + &port->gid_tbl[j]); + if (ret || !port->gid_tbl[j].global.interface_id) + break; + acm_log(2, "guid %d: 0x%" PRIx64 " %" PRIx64 "\n", j, + be64toh(port->gid_tbl[j].global.subnet_prefix), + be64toh(port->gid_tbl[j].global.interface_id)); + } + port->gid_cnt = j; + } + acm_log(2, "port %d gid_cnt %d\n", port->port.port_num, + port->gid_cnt); +} + +static void acm_port_up(struct acmc_port *port) +{ + struct ibv_port_attr attr; + uint16_t pkey; + __be16 pkey_be; + int i, ret; + struct acmc_prov_context *dev_ctx; + int index = -1; + uint16_t first_pkey = 0; + + acm_log(1, "%s %d\n", port->dev->device.verbs->device->name, + port->port.port_num); + ret = ibv_query_port(port->dev->device.verbs, port->port.port_num, + &attr); + if (ret) { + acm_log(0, "ERROR - unable to get port state\n"); + return; + } + if (attr.state != IBV_PORT_ACTIVE) { + acm_log(1, "port not active\n"); + return; + } + + acm_port_get_gid_tbl(port); + port->lid = attr.lid; + port->lid_mask = 0xffff - ((1 << attr.lmc) - 1); + port->sa_addr.lid = htobe16(attr.sm_lid); + port->sa_addr.sl = attr.sm_sl; + port->state = IBV_PORT_ACTIVE; + acm_assign_provider(port); + if (!port->prov) { + acm_log(1, "no provider assigned to port\n"); + return; + } + dev_ctx = acm_acquire_prov_context(&port->dev->prov_dev_context_list, + port->prov); + if (!dev_ctx) { + acm_log(0, "Error -- failed to acquire dev context\n"); + return; + } + + if (atomic_get(&dev_ctx->refcnt) == 1) { + if (port->prov->open_device(&port->dev->device, &dev_ctx->context)) { + acm_log(0, "Error -- failed to open the prov device\n"); + goto err1; + } + } + + if (port->prov->open_port(&port->port, dev_ctx->context, + &port->prov_port_context)) { + acm_log(0, "Error -- failed to open the prov port\n"); + goto err1; + } + + /* Determine the default pkey for SA access first. + * Order of preference: 0xffff, 0x7fff + * Use the first pkey as the default pkey for parsing address file. + */ + for (i = 0; i < attr.pkey_tbl_len; i++) { + ret = ibv_query_pkey(port->dev->device.verbs, + port->port.port_num, i, &pkey_be); + if (ret) + continue; + pkey = be16toh(pkey_be); + if (i == 0) + first_pkey = pkey; + if (pkey == 0xffff) { + index = i; + break; + } + else if (pkey == 0x7fff) { + index = i; + } + } + port->sa_pkey_index = index < 0 ? 
0 : index; + port->def_acm_pkey = first_pkey; + + for (i = 0; i < attr.pkey_tbl_len; i++) { + ret = ibv_query_pkey(port->dev->device.verbs, + port->port.port_num, i, &pkey_be); + if (ret) + continue; + pkey = be16toh(pkey_be); + if (!(pkey & 0x7fff)) + continue; + + acm_ep_up(port, pkey); + } + return; +err1: + acm_release_prov_context(dev_ctx); +} + +static void acm_shutdown_port(struct acmc_port *port) +{ + struct acmc_ep *ep; + struct acmc_prov_context *dev_ctx; + + while ((ep = list_pop(&port->ep_list, struct acmc_ep, entry))) + acm_ep_down(ep); + + if (port->prov_port_context) { + port->prov->close_port(port->prov_port_context); + port->prov_port_context = NULL; + dev_ctx = acm_get_prov_context(&port->dev->prov_dev_context_list, + port->prov); + if (dev_ctx) { + if (atomic_get(&dev_ctx->refcnt) == 1) + port->prov->close_device(dev_ctx->context); + acm_release_prov_context(dev_ctx); + } + } + port->prov = NULL; + if (port->gid_tbl) { + free(port->gid_tbl); + port->gid_tbl = NULL; + } + port->gid_cnt = 0; +} + +static void acm_port_down(struct acmc_port *port) +{ + struct ibv_port_attr attr; + int ret; + + acm_log(1, "%s %d\n", port->port.dev->verbs->device->name, port->port.port_num); + ret = ibv_query_port(port->port.dev->verbs, port->port.port_num, &attr); + if (!ret && attr.state == IBV_PORT_ACTIVE) { + acm_log(1, "port active\n"); + return; + } + + port->state = attr.state; + acm_shutdown_port(port); + + acm_log(1, "%s %d is down\n", port->dev->device.verbs->device->name, + port->port.port_num); +} + +static void acm_port_change(struct acmc_port *port) +{ + struct ibv_port_attr attr; + int ret; + + acm_log(1, "%s %d\n", port->port.dev->verbs->device->name, port->port.port_num); + ret = ibv_query_port(port->port.dev->verbs, port->port.port_num, &attr); + if (ret || attr.state != IBV_PORT_ACTIVE) { + acm_log(1, "port not active: don't care\n"); + return; + } + + port->state = attr.state; + acm_shutdown_port(port); + acm_port_up(port); +} + +static void acm_event_handler(struct acmc_device *dev) +{ + struct ibv_async_event event; + int i, ret; + + ret = ibv_get_async_event(dev->device.verbs, &event); + if (ret) + return; + + acm_log(2, "processing async event %s for %s\n", + ibv_event_type_str(event.event_type), + dev->device.verbs->device->name); + i = event.element.port_num - 1; + + switch (event.event_type) { + case IBV_EVENT_PORT_ACTIVE: + if (dev->port[i].state != IBV_PORT_ACTIVE) + acm_port_up(&dev->port[i]); + if (dev->port[i].pending_rereg && dev->port[i].prov_port_context) { + dev->port[i].prov->handle_event(dev->port[i].prov_port_context, + IBV_EVENT_CLIENT_REREGISTER); + dev->port[i].pending_rereg = false; + acm_log(1, "%s %d delayed reregistration\n", + dev->device.verbs->device->name, i + 1); + } + + break; + case IBV_EVENT_PORT_ERR: + if (dev->port[i].state == IBV_PORT_ACTIVE) + acm_port_down(&dev->port[i]); + break; + case IBV_EVENT_CLIENT_REREGISTER: + if ((dev->port[i].state == IBV_PORT_ACTIVE) && + dev->port[i].prov_port_context) { + dev->port[i].prov->handle_event(dev->port[i].prov_port_context, + event.event_type); + acm_log(1, "%s %d has reregistered\n", + dev->device.verbs->device->name, i + 1); + } else { + acm_log(2, "%s %d rereg on inactive port, postpone handling\n", + dev->device.verbs->device->name, i + 1); + dev->port[i].pending_rereg = true; + } + + break; + case IBV_EVENT_LID_CHANGE: + case IBV_EVENT_GID_CHANGE: + case IBV_EVENT_PKEY_CHANGE: + acm_port_change(&dev->port[i]); + break; + default: + break; + } + + ibv_ack_async_event(&event); +} + +static 
void acm_activate_devices(void) +{ + struct acmc_device *dev; + int i; + + acm_log(1, "\n"); + list_for_each(&dev_list, dev, entry) { + for (i = 0; i < dev->port_cnt; i++) { + acm_port_up(&dev->port[i]); + } + } +} + +static void +acm_open_port(struct acmc_port *port, struct acmc_device *dev, uint8_t port_num) +{ + acm_log(1, "%s %d\n", dev->device.verbs->device->name, port_num); + port->dev = dev; + port->port.dev = &dev->device; + port->port.port_num = port_num; + pthread_mutex_init(&port->lock, NULL); + list_head_init(&port->ep_list); + list_head_init(&port->sa_pending); + list_head_init(&port->sa_wait); + port->sa_credits = sa.depth; + port->sa_addr.qpn = htobe32(1); + port->sa_addr.qkey = htobe32(ACM_QKEY); + + port->mad_portid = umad_open_port(dev->device.verbs->device->name, port_num); + if (port->mad_portid < 0) + acm_log(0, "ERROR - unable to open MAD port\n"); + + port->mad_agentid = umad_register(port->mad_portid, + IB_MGMT_CLASS_SA, 1, 1, NULL); + if (port->mad_agentid < 0) { + umad_close_port(port->mad_portid); + acm_log(0, "ERROR - unable to register MAD client\n"); + } + + port->prov = NULL; + port->state = IBV_PORT_DOWN; +} + +static void acm_open_dev(struct ibv_device *ibdev) +{ + struct acmc_device *dev; + struct ibv_device_attr attr; + struct ibv_port_attr port_attr; + struct ibv_context *verbs; + size_t size; + int i, ret; + bool has_ib_port = false; + + acm_log(1, "%s\n", ibdev->name); + verbs = ibv_open_device(ibdev); + if (verbs == NULL) { + acm_log(0, "ERROR - opening device %s\n", ibdev->name); + return; + } + + ret = ibv_query_device(verbs, &attr); + if (ret) { + acm_log(0, "ERROR - ibv_query_device (%d) %s\n", ret, ibdev->name); + goto err1; + } + + for (i = 0; i < attr.phys_port_cnt; i++) { + ret = ibv_query_port(verbs, i + 1, &port_attr); + if (ret) { + acm_log(0, "ERROR - ibv_query_port (%s, %d) return (%d)\n", + ibdev->name, i + 1, ret); + continue; + } + + if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) { + acm_log(1, "%s port %d is an InfiniBand port\n", ibdev->name, i + 1); + has_ib_port = true; + } else { + acm_log(1, "%s port %d is not an InfiniBand port\n", ibdev->name, i + 1); + } + } + + if (!has_ib_port) { + acm_log(1, "%s does not support InfiniBand.\n", ibdev->name); + goto err1; + } + + size = sizeof(*dev) + sizeof(struct acmc_port) * attr.phys_port_cnt; + dev = (struct acmc_device *) calloc(1, size); + if (!dev) + goto err1; + + dev->device.verbs = verbs; + dev->device.dev_guid = ibv_get_device_guid(ibdev); + dev->port_cnt = attr.phys_port_cnt; + list_head_init(&dev->prov_dev_context_list); + + for (i = 0; i < dev->port_cnt; i++) { + acm_open_port(&dev->port[i], dev, i + 1); + } + + list_add(&dev_list, &dev->entry); + + acm_log(1, "%s opened\n", ibdev->name); + return; + +err1: + ibv_close_device(verbs); +} + +static int acm_open_devices(void) +{ + struct ibv_device **ibdev; + int dev_cnt; + int i; + + acm_log(1, "\n"); + ibdev = ibv_get_device_list(&dev_cnt); + if (!ibdev) { + acm_log(0, "ERROR - unable to get device list\n"); + return -1; + } + + for (i = 0; i < dev_cnt; i++) + acm_open_dev(ibdev[i]); + + ibv_free_device_list(ibdev); + if (list_empty(&dev_list)) { + acm_log(0, "ERROR - no devices\n"); + return -1; + } + + return 0; +} + +static void acm_load_prov_config(void) +{ + FILE *fd; + char s[128]; + char *p, *ptr; + char prov_name[ACM_PROV_NAME_SIZE]; + uint64_t prefix; + struct acmc_prov *prov; + struct acmc_subnet *subnet; + + if (!(fd = fopen(opts_file, "r"))) + return; + + while (fgets(s, sizeof s, fd)) { + if (s[0] == '#') + 
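/* skip comment lines; real entries have the form + * "provider <name> <subnet_prefix>" or "provider <name> default" */ +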
continue; + + /* Ignore blank lines */ + if (!(p = strtok_r(s, " \n", &ptr))) + continue; + + if (strncasecmp(p, "provider", sizeof("provider") - 1)) + continue; + + p = strtok_r(NULL, " ", &ptr); + if (!p) + continue; + + strncpy(prov_name, p, sizeof(prov_name)); + prov_name[sizeof(prov_name) -1] = '\0'; + + p = strtok_r(NULL, " ", &ptr); + if (!p) + continue; + if (!strncasecmp(p, "default", sizeof("default") - 1)) { + strncpy(def_prov_name, prov_name, sizeof(def_prov_name)); + def_prov_name[sizeof(def_prov_name) -1] = '\0'; + acm_log(2, "default provider: %s\n", def_prov_name); + continue; + } + prefix = strtoull(p, NULL, 0); + acm_log(2, "provider %s subnet_prefix 0x%" PRIx64 "\n", + prov_name, prefix); + + list_for_each(&provider_list, prov, entry) { + if (!strcasecmp(prov->prov->name, prov_name)) { + subnet = calloc(1, sizeof (*subnet)); + if (!subnet) { + acm_log(0, "Error: out of memory\n"); + fclose(fd); + return; + } + subnet->subnet_prefix = htobe64(prefix); + list_add_tail(&prov->subnet_list, + &subnet->entry); + } + } + } + + fclose(fd); + + list_for_each(&provider_list, prov, entry) { + if (!strcasecmp(prov->prov->name, def_prov_name)) { + def_provider = prov; + break; + } + } +} + +static int acm_string_end_compare(const char *s1, const char *s2) +{ + size_t s1_len = strlen(s1); + size_t s2_len = strlen(s2); + + if (s1_len < s2_len) + return -1; + + return strcmp(s1 + s1_len - s2_len, s2); +} + +static int acm_open_providers(void) +{ + DIR *shlib_dir; + struct dirent *dent; + char file_name[256]; + struct stat buf; + void *handle; + struct acmc_prov *prov; + struct acm_provider *provider; + uint32_t version; + char *err_str; + int (*query)(struct acm_provider **, uint32_t *); + + acm_log(1, "\n"); + shlib_dir = opendir(prov_lib_path); + if (!shlib_dir) { + acm_log(0, "ERROR - could not open provider lib dir: %s\n", + prov_lib_path); + return -1; + } + + while ((dent = readdir(shlib_dir))) { + if (acm_string_end_compare(dent->d_name, ".so")) + continue; + + if (!check_snprintf(file_name, sizeof(file_name), "%s/%s", + prov_lib_path, dent->d_name)) + continue; + + if (lstat(file_name, &buf)) { + acm_log(0, "Error - could not stat: %s\n", file_name); + continue; + } + if (!S_ISREG(buf.st_mode)) + continue; + + acm_log(2, "Loading provider %s...\n", file_name); + if (!(handle = dlopen(file_name, RTLD_LAZY))) { + acm_log(0, "Error - could not load provider %s (%s)\n", + file_name, dlerror()); + continue; + } + + query = dlsym(handle, "provider_query"); + if ((err_str = dlerror()) != NULL) { + acm_log(0, "Error - provider_query not found in %s (%s)\n", + file_name, err_str); + dlclose(handle); + continue; + } + + if (query(&provider, &version)) { + acm_log(0, "Error - provider_query failed to %s\n", file_name); + dlclose(handle); + continue; + } + + if (version != ACM_PROV_VERSION || + provider->size != sizeof(struct acm_provider)) { + acm_log(0, "Error -unmatched provider version 0x%08x (size %zd)" + " core 0x%08x (size %zd)\n", version, provider->size, + ACM_PROV_VERSION, sizeof(struct acm_provider)); + dlclose(handle); + continue; + } + + acm_log(1, "Provider %s (%s) loaded\n", provider->name, file_name); + + prov = calloc(1, sizeof(*prov)); + if (!prov) { + acm_log(0, "Error -failed to allocate provider %s\n", file_name); + dlclose(handle); + continue; + } + + prov->prov = provider; + prov->handle = handle; + list_head_init(&prov->subnet_list); + list_add_tail(&provider_list, &prov->entry); + if (!strcasecmp(provider->name, def_prov_name)) + def_provider = prov; + } + + 
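/* every .so provider is registered above before any configuration is + * read; acm_load_prov_config() below then binds subnet prefixes to + * providers and re-resolves def_provider by name */ +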
closedir(shlib_dir); + acm_load_prov_config(); + return 0; +} + +static void acm_close_providers(void) +{ + struct acmc_prov *prov; + struct acmc_subnet *subnet; + + acm_log(1, "\n"); + def_provider = NULL; + + while ((prov = list_pop(&provider_list, struct acmc_prov, entry))) { + while ((subnet = list_pop(&prov->subnet_list, + struct acmc_subnet, entry))) + free(subnet); + dlclose(prov->handle); + free(prov); + } +} + +static int acmc_init_sa_fds(void) +{ + struct acmc_device *dev; + int ret, p, i = 0; + + list_for_each(&dev_list, dev, entry) + sa.nfds += dev->port_cnt; + + sa.fds = calloc(sa.nfds, sizeof(*sa.fds)); + sa.ports = calloc(sa.nfds, sizeof(*sa.ports)); + if (!sa.fds || !sa.ports) + return -ENOMEM; + + list_for_each(&dev_list, dev, entry) { + for (p = 0; p < dev->port_cnt; p++) { + sa.fds[i].fd = umad_get_fd(dev->port[p].mad_portid); + sa.fds[i].events = POLLIN; + ret = set_fd_nonblock(sa.fds[i].fd, true); + if (ret) + acm_log(0, "WARNING - umad fd is blocking\n"); + + sa.ports[i++] = &dev->port[p]; + } + } + + return 0; +} + +struct acm_sa_mad * +acm_alloc_sa_mad(const struct acm_endpoint *endpoint, void *context, + void (*handler)(struct acm_sa_mad *)) +{ + struct acmc_sa_req *req; + + if (!endpoint) { + acm_log(0, "Error: NULL endpoint\n"); + return NULL; + } + req = calloc(1, sizeof (*req)); + if (!req) { + acm_log(0, "Error: failed to allocate sa request\n"); + return NULL; + } + + req->ep = container_of(endpoint, struct acmc_ep, endpoint); + req->mad.context = context; + req->resp_handler = handler; + + acm_log(2, "%p\n", req); + return &req->mad; +} + +void acm_free_sa_mad(struct acm_sa_mad *mad) +{ + struct acmc_sa_req *req; + req = container_of(mad, struct acmc_sa_req, mad); + acm_log(2, "%p\n", req); + free(req); +} + +int acm_send_sa_mad(struct acm_sa_mad *mad) +{ + struct acmc_port *port; + struct acmc_sa_req *req; + int ret; + + req = container_of(mad, struct acmc_sa_req, mad); + acm_log(2, "%p from %s\n", req, req->ep->addr_info[0].addr.id_string); + + port = req->ep->port; + mad->umad.addr.qpn = port->sa_addr.qpn; + mad->umad.addr.qkey = port->sa_addr.qkey; + mad->umad.addr.lid = port->sa_addr.lid; + mad->umad.addr.sl = port->sa_addr.sl; + mad->umad.addr.pkey_index = req->ep->port->sa_pkey_index; + + pthread_mutex_lock(&port->lock); + if (port->sa_credits && list_empty(&port->sa_wait)) { + ret = umad_send(port->mad_portid, port->mad_agentid, &mad->umad, + sizeof mad->sa_mad, sa.timeout, sa.retries); + if (!ret) { + port->sa_credits--; + list_add_tail(&port->sa_pending, &req->entry); + } + } else { + ret = 0; + list_add_tail(&port->sa_wait, &req->entry); + } + pthread_mutex_unlock(&port->lock); + return ret; +} + +static void acmc_send_queued_req(struct acmc_port *port) +{ + struct acmc_sa_req *req; + int ret; + + pthread_mutex_lock(&port->lock); + if (list_empty(&port->sa_wait) || !port->sa_credits) { + pthread_mutex_unlock(&port->lock); + return; + } + + req = list_pop(&port->sa_wait, struct acmc_sa_req, entry); + + ret = umad_send(port->mad_portid, port->mad_agentid, &req->mad.umad, + sizeof req->mad.sa_mad, sa.timeout, sa.retries); + if (!ret) { + port->sa_credits--; + list_add_tail(&port->sa_pending, &req->entry); + } + pthread_mutex_unlock(&port->lock); + + if (ret) { + req->mad.umad.status = -ret; + req->resp_handler(&req->mad); + } +} + +static void acmc_recv_mad(struct acmc_port *port) +{ + struct acmc_sa_req *req; + struct acm_sa_mad resp; + int ret, len, found; + struct umad_hdr *hdr; + + if (!port->prov) { + acm_log(1, "no provider assigned to 
port\n"); + return; + } + + acm_log(2, "\n"); + len = sizeof(resp.sa_mad); + ret = umad_recv(port->mad_portid, &resp.umad, &len, 0); + if (ret < 0) { + acm_log(1, "umad_recv error %d\n", ret); + return; + } + + hdr = &resp.sa_mad.mad_hdr; + acm_log(2, "bv %x cls %x cv %x mtd %x st %d tid %" PRIx64 "x at %x atm %x\n", + hdr->base_version, hdr->mgmt_class, hdr->class_version, + hdr->method, hdr->status, be64toh(hdr->tid), hdr->attr_id, hdr->attr_mod); + found = 0; + pthread_mutex_lock(&port->lock); + list_for_each(&port->sa_pending, req, entry) { + /* The upper 32-bit of the tid is used for agentid in umad */ + if (req->mad.sa_mad.mad_hdr.tid == (hdr->tid & htobe64(0xFFFFFFFF))) { + found = 1; + list_del(&req->entry); + port->sa_credits++; + break; + } + } + pthread_mutex_unlock(&port->lock); + + if (found) { + memcpy(&req->mad.umad, &resp.umad, sizeof(resp.umad) + len); + req->resp_handler(&req->mad); + } +} + +static void *acm_sa_handler(void *context) +{ + int i, ret; + + acm_log(0, "started\n"); + ret = acmc_init_sa_fds(); + if (ret) { + acm_log(0, "ERROR - failed to init fds\n"); + return NULL; + } + + if (pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL)) { + acm_log(0, "Error: failed to set cancel type \n"); + return NULL; + } + + if (pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL)) { + acm_log(0, "Error: failed to set cancel state\n"); + return NULL; + } + + for (;;) { + pthread_testcancel(); + ret = poll(sa.fds, sa.nfds, -1); + if (ret < 0) { + acm_log(0, "ERROR - sa poll error: %d\n", errno); + continue; + } + + for (i = 0; i < sa.nfds; i++) { + if (!sa.fds[i].revents) + continue; + + if (sa.fds[i].revents & POLLIN) { + acmc_recv_mad(sa.ports[i]); + acmc_send_queued_req(sa.ports[i]); + } + sa.fds[i].revents = 0; + } + } + return NULL; +} + +static void acm_stop_sa_handler(void) +{ + if (pthread_cancel(sa.thread_id)) { + acm_log(0, "Error: failed to cancel sa resp thread \n"); + return; + } + + if (pthread_join(sa.thread_id, NULL)) { + acm_log(0, "Error: failed to join sa resp thread\n"); + return; + } +} + +static void acm_set_options(void) +{ + FILE *f; + char s[120]; + char opt[32], value[256]; + + if (!(f = fopen(opts_file, "r"))) + return; + + while (fgets(s, sizeof s, f)) { + if (s[0] == '#') + continue; + + if (sscanf(s, "%31s%255s", opt, value) != 2) + continue; + + if (!strcasecmp("log_file", opt)) + strcpy(log_file, value); + else if (!strcasecmp("log_level", opt)) + log_level = atoi(value); + else if (!strcasecmp("lock_file", opt)) + strcpy(lock_file, value); + else if (!strcasecmp("server_port", opt)) + server_port = (short) atoi(value); + else if (!strcasecmp("server_mode", opt)) { + if (!strcasecmp(value, "open")) + server_mode = IBACM_SERVER_MODE_OPEN; + else if (!strcasecmp(value, "loop")) + server_mode = IBACM_SERVER_MODE_LOOP; + else + server_mode = IBACM_SERVER_MODE_UNIX; + } else if (!strcasecmp("acme_plus_kernel_only", opt)) + acme_plus_kernel_only = + !strcasecmp(value, "true") || + !strcasecmp(value, "yes") || + strtol(value, NULL, 0); + else if (!strcasecmp("provider_lib_path", opt)) + strcpy(prov_lib_path, value); + else if (!strcasecmp("support_ips_in_addr_cfg", opt)) + support_ips_in_addr_cfg = atoi(value); + else if (!strcasecmp("timeout", opt)) + sa.timeout = atoi(value); + else if (!strcasecmp("retries", opt)) + sa.retries = atoi(value); + else if (!strcasecmp("sa_depth", opt)) + sa.depth = atoi(value); + } + + fclose(f); +} + +static void acm_log_options(void) +{ + static const char * const server_mode_names[] = { + [IBACM_SERVER_MODE_UNIX] = 
"unix", + [IBACM_SERVER_MODE_LOOP] = "loop", + [IBACM_SERVER_MODE_OPEN] = "open", + }; + + acm_log(0, "log file %s\n", log_file); + acm_log(0, "log level %d\n", log_level); + acm_log(0, "lock file %s\n", lock_file); + acm_log(0, "server_port %d\n", server_port); + acm_log(0, "server_mode %s\n", server_mode_names[server_mode]); + acm_log(0, "acme_plus_kernel_only %s\n", + acme_plus_kernel_only ? "yes" : "no"); + acm_log(0, "timeout %d ms\n", sa.timeout); + acm_log(0, "retries %d\n", sa.retries); + acm_log(0, "sa depth %d\n", sa.depth); + acm_log(0, "options file %s\n", opts_file); + acm_log(0, "addr file %s\n", addr_file); + acm_log(0, "provider lib path %s\n", prov_lib_path); + acm_log(0, "support IP's in ibacm_addr.cfg %d\n", support_ips_in_addr_cfg); +} + +static FILE *acm_open_log(void) +{ + FILE *f; + + if (!strcasecmp(log_file, "stdout")) + return stdout; + + if (!strcasecmp(log_file, "stderr")) + return stderr; + + if (!(f = fopen(log_file, "w"))) + f = stdout; + + return f; +} + +static int acm_open_lock_file(void) +{ + int lock_fd; + char pid[16]; + + lock_fd = open(lock_file, O_RDWR | O_CREAT, 0640); + if (lock_fd < 0) + return lock_fd; + + if (lockf(lock_fd, F_TLOCK, 0)) { + close(lock_fd); + return -1; + } + + snprintf(pid, sizeof pid, "%d\n", getpid()); + if (write(lock_fd, pid, strlen(pid)) != strlen(pid)){ + close(lock_fd); + return -1; + } + return 0; +} + +static void show_usage(char *program) +{ + printf("usage: %s\n", program); + printf(" [-D] - run as a daemon (default)\n"); + printf(" [-P] - run as a standard process\n"); + printf(" [-A addr_file] - address configuration file\n"); + printf(" (default %s/%s)\n", ACM_CONF_DIR, ACM_ADDR_FILE); + printf(" [-O option_file] - option configuration file\n"); + printf(" (default %s/%s)\n", ACM_CONF_DIR, ACM_OPTS_FILE); +} + +int main(int argc, char **argv) +{ + int i, op, as_daemon = 1; + bool systemd = false; + + static const struct option long_opts[] = { + {"systemd", 0, NULL, 's'}, + {} + }; + + while ((op = getopt_long(argc, argv, "DPA:O:", long_opts, NULL)) != + -1) { + switch (op) { + case 'D': + /* option no longer required */ + break; + case 'P': + as_daemon = 0; + break; + case 'A': + addr_file = optarg; + break; + case 'O': + opts_file = optarg; + break; + case 's': + systemd = true; + break; + default: + show_usage(argv[0]); + exit(1); + } + } + + if (as_daemon && !systemd) { + if (daemon(0, 0)) + return EXIT_FAILURE; + } + + acm_set_options(); + + /* usage of systemd implies unix-domain communication */ + if (systemd) + server_mode = IBACM_SERVER_MODE_UNIX; + + if (acm_open_lock_file()) + return -1; + + pthread_mutex_init(&log_lock, NULL); + flog = acm_open_log(); + + acm_log(0, "Assistant to the InfiniBand Communication Manager\n"); + acm_log_options(); + + for (i = 0; i < ACM_MAX_COUNTER; i++) + atomic_init(&counter[i]); + + if (umad_init() != 0) { + acm_log(0, "ERROR - fail to initialize umad\n"); + return -1; + } + + if (acm_open_providers()) { + acm_log(0, "ERROR - unable to open any providers\n"); + return -1; + } + + if (acm_open_devices()) { + acm_log(0, "ERROR - unable to open any devices\n"); + return -1; + } + + acm_log(1, "creating IP Netlink socket\n"); + acm_ipnl_create(); + + acm_log(1, "starting sa response receiving thread\n"); + if (pthread_create(&sa.thread_id, NULL, acm_sa_handler, NULL)) { + acm_log(0, "Error: failed to create sa resp rcving thread"); + return -1; + } + + if (acm_init_if_iter_sys()) { + acm_log(0, "Error: unable to initialize acm_if_iter_sys"); + return -1; + } + + 
acm_activate_devices(); + acm_log(1, "starting server\n"); + acm_server(systemd); + + acm_log(0, "shutting down\n"); + if (client_array[NL_CLIENT_INDEX].sock != -1) + close(client_array[NL_CLIENT_INDEX].sock); + acm_close_providers(); + acm_stop_sa_handler(); + umad_done(); + acm_fini_if_iter_sys(); + fclose(flog); + return 0; +} diff --git a/ibacm/src/acm_util.c b/ibacm/src/acm_util.c new file mode 100644 index 0000000..1c6b969 --- /dev/null +++ b/ibacm/src/acm_util.c @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under the OpenFabrics.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AWV + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <inttypes.h> +#include <net/if_arp.h> +#include <string.h> +#include <unistd.h> +#include <arpa/inet.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <errno.h> +#include <netlink/route/addr.h> +#include <netlink/route/link.h> +#include <netlink/socket.h> + +#include <infiniband/acm.h> +#include "acm_mad.h" +#include "acm_util.h" + +int acm_if_get_pkey(char *ifname, uint16_t *pkey) +{ + char buf[128], *end; + FILE *f; + int ret; + + snprintf(buf, sizeof buf, "//sys//class//net//%s//pkey", ifname); + f = fopen(buf, "r"); + if (!f) { + acm_log(0, "failed to open %s\n", buf); + return -1; + } + + if (fgets(buf, sizeof buf, f)) { + *pkey = strtol(buf, &end, 16); + ret = 0; + } else { + acm_log(0, "failed to read pkey\n"); + ret = -1; + } + + fclose(f); + return ret; +} + +int acm_if_get_sgid(char *ifname, union ibv_gid *sgid) +{ + char buf[128], *end; + FILE *f; + int i, p, ret; + + snprintf(buf, sizeof buf, "//sys//class//net//%s//address", ifname); + f = fopen(buf, "r"); + if (!f) { + acm_log(0, "failed to open %s\n", buf); + return -1; + } + + if (fgets(buf, sizeof buf, f)) { + for (i = 0, p = 12; i < 16; i++, p += 3) { + buf[p + 2] = '\0'; + sgid->raw[i] = (uint8_t) strtol(buf + p, &end, 16); + } + ret = 0; + } else { + acm_log(0, "failed to read sgid\n"); + ret = -1; + } + + fclose(f); + return ret; +} + +static struct nl_sock *sk; +static struct nl_cache *link_cache; +static struct nl_cache *addr_cache; + +int acm_init_if_iter_sys(void) +{ + int sts; + + sk = nl_socket_alloc(); + if (!sk) { + acm_log(0, "nl_socket_alloc"); + return -1; + } + + sts = nl_connect(sk, NETLINK_ROUTE); + if (sts) { + acm_log(0, "nl_connect failed"); + goto out_connect; + } + + sts = rtnl_link_alloc_cache(sk, AF_UNSPEC, &link_cache); + if (sts) { + acm_log(0, "rtnl_link_alloc_cache failed"); + goto 
out_connect; + } + + sts = rtnl_addr_alloc_cache(sk, &addr_cache); + if (sts) { + acm_log(0, "rtnl_addr_alloc_cache"); + goto out_addr; + } + + return 0; + +out_addr: + nl_cache_free(link_cache); + +out_connect: + nl_close(sk); + return sts; +} + +void acm_fini_if_iter_sys(void) +{ + nl_cache_free(link_cache); + nl_cache_free(addr_cache); + nl_close(sk); +} + +static inline int af2acm_addr_type(int af) +{ + switch (af) { + case AF_INET: + return ACM_ADDRESS_IP; + + case AF_INET6: + return ACM_ADDRESS_IP6; + } + + acm_log(0, "Unnkown address family\n"); + return ACM_ADDRESS_INVALID; +} + +struct ctx_and_cb { + void *ctx; + acm_if_iter_cb cb; +}; + +static void acm_if_iter(struct nl_object *obj, void *_ctx_and_cb) +{ + struct ctx_and_cb *ctx_cb = (struct ctx_and_cb *)_ctx_and_cb; + struct rtnl_addr *addr = (struct rtnl_addr *)obj; + struct nl_addr *a = rtnl_addr_get_local(addr); + uint8_t bin_addr[ACM_MAX_ADDRESS] = {}; + int addr_len = nl_addr_get_len(a); + char ip_str[INET6_ADDRSTRLEN]; + struct nl_addr *link_addr; + struct rtnl_link *link; + char flags_str[128]; + union ibv_gid sgid; + uint16_t pkey; + char *label; + int af; + + link = rtnl_link_get(link_cache, rtnl_addr_get_ifindex(addr)); + + if (rtnl_link_get_arptype(link) != ARPHRD_INFINIBAND) + return; + + if (!a) + return; + + if (addr_len > ACM_MAX_ADDRESS) { + acm_log(0, "address too long (%d)\n", addr_len); + return; + } + + af = nl_addr_get_family(a); + if (af != AF_INET && af != AF_INET6) + return; + + label = rtnl_addr_get_label(addr); + + link_addr = rtnl_link_get_addr(link); + /* gid has a 4 byte offset into the link address */ + memcpy(sgid.raw, nl_addr_get_binary_addr(link_addr) + 4, sizeof(sgid)); + + if (acm_if_get_pkey(rtnl_link_get_name(link), &pkey)) + return; + + acm_log(2, "name: %5s label: %9s index: %2d flags: %s addr: %s pkey: 0x%04x guid: 0x%" PRIx64 "\n", + rtnl_link_get_name(link), label, + rtnl_addr_get_ifindex(addr), + rtnl_link_flags2str(rtnl_link_get_flags(link), flags_str, sizeof(flags_str)), + nl_addr2str(a, ip_str, sizeof(ip_str)), pkey, + be64toh(sgid.global.interface_id)); + + memcpy(&bin_addr, nl_addr_get_binary_addr(a), addr_len); + ctx_cb->cb(label ? label : rtnl_link_get_name(link), + &sgid, pkey, af2acm_addr_type(af), bin_addr, ip_str, ctx_cb->ctx); +} + + +int acm_if_iter_sys(acm_if_iter_cb cb, void *ctx) +{ + struct ctx_and_cb ctx_cb; + int sts; + + sts = nl_cache_refill(sk, link_cache); + if (sts) { + acm_log(0, "nl_cache_refill link_cache"); + return sts; + } + + sts = nl_cache_refill(sk, addr_cache); + if (sts) { + acm_log(0, "nl_cache_refill addr_cache"); + return sts; + } + + ctx_cb.ctx = ctx; + ctx_cb.cb = cb; + nl_cache_foreach(addr_cache, acm_if_iter, (void *)&ctx_cb); + + return 0; +} diff --git a/ibacm/src/acm_util.h b/ibacm/src/acm_util.h new file mode 100644 index 0000000..2c143fd --- /dev/null +++ b/ibacm/src/acm_util.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under the OpenFabrics.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AWV + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(ACM_IF_H) +#define ACM_IF_H + +#include <infiniband/verbs.h> +#include <infiniband/acm_prov.h> + + +#ifdef ACME_PRINTS + +#undef acm_log +#define acm_log(level, format, ...) \ + printf(format, ## __VA_ARGS__) +#define acm_log_once(level, format, ...) \ + printf(format, ## __VA_ARGS__) + +#else /* !ACME_PRINTS */ +#define acm_log(level, format, ...) \ + acm_write(level, "%s: "format, __func__, ## __VA_ARGS__) +#define acm_log_once(level, format, ...) do { \ + static bool once; \ + if (!once) { \ + acm_write(level, "%s: "format, __func__, ## __VA_ARGS__); \ + once = true; \ + } \ +} while (0) +#endif /* ACME_PRINTS */ + +int acm_if_is_ib(char *ifname); +int acm_if_get_pkey(char *ifname, uint16_t *pkey); +int acm_if_get_sgid(char *ifname, union ibv_gid *sgid); +int acm_init_if_iter_sys(void); +void acm_fini_if_iter_sys(void); +typedef void (*acm_if_iter_cb)(char *ifname, union ibv_gid *gid, uint16_t pkey, + uint8_t addr_type, uint8_t *addr, + char *ip_str, void *ctx); +int acm_if_iter_sys(acm_if_iter_cb cb, void *ctx); + + +char **parse(const char *args, int *count); + +#endif /* ACM_IF_H */ diff --git a/ibacm/src/acme.c b/ibacm/src/acme.c new file mode 100644 index 0000000..1289fa3 --- /dev/null +++ b/ibacm/src/acme.c @@ -0,0 +1,1120 @@ +/* + * Copyright (c) 2009-2010 Intel Corporation. All rights reserved. + * Copyright (c) 2013 Mellanox Technologies LTD. All rights reserved. + * + * This software is available to you under the OpenIB.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AWV + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <config.h> + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <getopt.h> +#include <netdb.h> +#include <arpa/inet.h> +#include <inttypes.h> + +#include <osd.h> +#include <infiniband/verbs.h> +#include <infiniband/acm.h> +#include "libacm.h" +#include "acm_util.h" + +static const char *dest_dir = ACM_CONF_DIR; +static const char *addr_file = ACM_ADDR_FILE; +static const char *opts_file = ACM_OPTS_FILE; + +static char *dest_addr; +static char *src_addr; +#if IBACM_SERVER_MODE_DEFAULT == IBACM_SERVER_MODE_UNIX +static const char *svc_arg = IBACM_IBACME_SERVER_PATH; +#else +static const char *svc_arg = "localhost"; +#endif +static char *dest_arg; +static char *src_arg; +static char addr_type = 'u'; +static int verify; +static int nodelay; +static int repetitions = 1; +static int ep_index; +static int enum_ep; + +enum perf_query_output { + PERF_QUERY_NONE, + PERF_QUERY_ROW, + PERF_QUERY_COL, + PERF_QUERY_EP_INDEX, + PERF_QUERY_EP_ALL, + PERF_QUERY_EP_ADDR +}; +static enum perf_query_output perf_query; +static int verbose; + +static struct ibv_context **verbs; +static int dev_cnt; + +#define VPRINT(format, ...) do { if (verbose) printf(format, ## __VA_ARGS__ ); } while (0) + +static void show_usage(char *program) +{ + printf("usage 1: %s\n", program); + printf("Query specified ibacm service for data\n"); + printf(" [-e [N]] - display one or all endpoints:\n"); + printf(" No index: all endpoints\n"); + printf(" N: endpoint N (N = 1, 2, ...)\n"); + printf(" [-f addr_format] - i(p), n(ame), l(id), g(gid), or u(nspecified)\n"); + printf(" address format for -s and -d options, default: 'u'\n"); + printf(" [-s src_addr] - source address for path queries\n"); + printf(" [-d dest_addr] - destination addresses for path queries\n"); + printf(" [-v] - verify ACM response against SA query response\n"); + printf(" [-c] - read ACM cached data only\n"); + printf(" [-P [opt]] - query performance data from destination service:\n"); + printf(" No option: output combined data in row format.\n"); + printf(" col: output combined data in column format.\n"); + printf(" N: output data for endpoint N (N = 1, 2,...)\n"); + printf(" all: output data for all endpoints\n"); + printf(" s: output data for the endpoint with the\n"); + printf(" address specified in -s option\n"); + printf(" [-S svc_addr] - address of ACM service, default: local service\n"); + printf(" [-C repetitions] - repeat count for resolution\n"); + printf("usage 2: %s\n", program); + printf("Generate default ibacm service configuration and option files\n"); + printf(" -A [addr_file] - generate local address configuration file\n"); + printf(" (default is %s)\n", ACM_ADDR_FILE); + printf(" -O [opt_file] - generate local ibacm_opts.cfg options file\n"); + printf(" (default is %s)\n", ACM_OPTS_FILE); + printf(" -D dest_dir - specify destination directory for output files\n"); + printf(" (default is %s)\n", ACM_CONF_DIR); + printf(" -V - enable verbose output\n"); +} + +static void gen_opts_temp(FILE *f) +{ + fprintf(f, "# InfiniBand Communication Manager Assistant for clusters configuration file\n"); + fprintf(f, "#\n"); + fprintf(f, "# Use ib_acme utility with -O option to automatically generate a sample\n"); + fprintf(f, "# ibacm_opts.cfg file for the current system.\n"); + fprintf(f, "#\n"); + fprintf(f, "# Entry format is:\n"); + fprintf(f, "# name value\n"); + fprintf(f, "\n"); + fprintf(f, "# log_file:\n"); + fprintf(f, "# Specifies the location of the ACM service output. 
The log file is used to\n"); + fprintf(f, "# assist with ACM service debugging and troubleshooting. The log_file can\n"); + fprintf(f, "# be set to 'stdout', 'stderr', or the name of a file.\n"); + fprintf(f, "# Examples:\n"); + fprintf(f, "# log_file stdout\n"); + fprintf(f, "# log_file stderr\n"); + fprintf(f, "# log_file %s\n", IBACM_LOG_FILE); + fprintf(f, "\n"); + fprintf(f, "log_file %s\n", IBACM_LOG_FILE); + fprintf(f, "\n"); + fprintf(f, "# log_level:\n"); + fprintf(f, "# Indicates the amount of detailed data written to the log file. Log levels\n"); + fprintf(f, "# should be one of the following values:\n"); + fprintf(f, "# 0 - basic configuration & errors\n"); + fprintf(f, "# 1 - verbose configuration & errors\n"); + fprintf(f, "# 2 - verbose operation\n"); + fprintf(f, "\n"); + fprintf(f, "log_level 0\n"); + fprintf(f, "\n"); + fprintf(f, "# lock_file:\n"); + fprintf(f, "# Specifies the location of the ACM lock file used to ensure that only a\n"); + fprintf(f, "# single instance of ACM is running.\n"); + fprintf(f, "\n"); + fprintf(f, "lock_file %s\n", IBACM_PID_FILE); + fprintf(f, "\n"); + fprintf(f, "# addr_prot:\n"); + fprintf(f, "# Default resolution protocol to resolve IP addresses into IB GIDs.\n"); + fprintf(f, "# Supported protocols are:\n"); + fprintf(f, "# acm - Use ACM multicast protocol, which is similar to ARP.\n"); + fprintf(f, "\n"); + fprintf(f, "addr_prot acm\n"); + fprintf(f, "\n"); + fprintf(f, "# addr_timeout:\n"); + fprintf(f, "# Number of minutes to maintain IP address to GID mapping before\n"); + fprintf(f, "# repeating address resolution. A value of -1 indicates that the\n"); + fprintf(f, "# mapping will not time out.\n"); + fprintf(f, "# 1 hour = 60, 1 day = 1440, 1 week = 10080, 1 month ~ 43200"); + fprintf(f, "\n"); + fprintf(f, "addr_timeout 1440\n"); + fprintf(f, "\n"); + fprintf(f, "# route_prot:\n"); + fprintf(f, "# Default resolution protocol to resolve IB routing information.\n"); + fprintf(f, "# Supported protocols are:\n"); + fprintf(f, "# sa - Query SA for path record data and cache results.\n"); + fprintf(f, "# acm - Use ACM multicast protocol.\n"); + fprintf(f, "\n"); + fprintf(f, "route_prot sa\n"); + fprintf(f, "\n"); + fprintf(f, "# route_timeout:\n"); + fprintf(f, "# Number of minutes to maintain IB routing information before\n"); + fprintf(f, "# repeating route resolution. A value of -1 indicates that the\n"); + fprintf(f, "# mapping will not time out. 
However, the route will\n"); + fprintf(f, "# automatically time out when the address times out.\n"); + fprintf(f, "# 1 hour = 60, 1 day = 1440, 1 week = 10080, 1 month ~ 43200"); + fprintf(f, "\n"); + fprintf(f, "route_timeout -1\n"); + fprintf(f, "\n"); + fprintf(f, "# loopback_prot:\n"); + fprintf(f, "# Address and route resolution protocol to resolve local addresses\n"); + fprintf(f, "# Supported protocols are:\n"); + fprintf(f, "# none - Use same protocols defined for addr_prot and route_prot\n"); + fprintf(f, "# local - Resolve information used locally available data\n"); + fprintf(f, "\n"); + fprintf(f, "loopback_prot local\n"); + fprintf(f, "\n"); + fprintf(f, "# server_port:\n"); + fprintf(f, "# TCP port number that the server listens on.\n"); + fprintf(f, "# If this value is changed, then a corresponding change is required for\n"); + fprintf(f, "# client applications.\n"); + fprintf(f, "\n"); + fprintf(f, "server_port 6125\n"); + fprintf(f, "\n"); + fprintf(f, "# server_mode:\n"); + fprintf(f, "# Selects how clients can connect to this server:\n"); + fprintf(f, "# unix - Use unix-domain sockets,"); + fprintf(f, " hence limits service to the same machine.\n"); + fprintf(f, "# loop - Limit incoming connections"); + fprintf(f, " for server_port to 127.0.0.1.\n"); + fprintf(f, "# open - Allow incoming connections"); + fprintf(f, " from any TCP client (internal or external).\n"); + fprintf(f, "\n"); +#if IBACM_SERVER_MODE_DEFAULT == IBACM_SERVER_MODE_OPEN + fprintf(f, "server_mode open\n"); +#elif IBACM_SERVER_MODE_DEFAULT == IBACM_SERVER_MODE_LOOP + fprintf(f, "server_mode loop\n"); +#else + fprintf(f, "server_mode unix\n"); +#endif + fprintf(f, "\n"); + fprintf(f, "# acme_plus_kernel_only:\n"); + fprintf(f, "# If set to 'true', 'yes' or a non-zero number\n"); + fprintf(f, "# ibacm will only serve requests originating\n"); + fprintf(f, "# from the kernel or the ib_acme utility.\n"); + fprintf(f, "# Please note that this option is ignored if the ibacm\n"); + fprintf(f, "# service is started on demand by systemd,\n"); + fprintf(f, "# in which case this option is treated\n"); + fprintf(f, "# as if it were set to 'no'\n"); + fprintf(f, "\n"); +#if IBACM_ACME_PLUS_KERNEL_ONLY_DEFAULT + fprintf(f, "acme_plus_kernel_only yes\n"); +#else + fprintf(f, "acme_plus_kernel_only no\n"); +#endif + fprintf(f, "\n"); + fprintf(f, "# timeout:\n"); + fprintf(f, "# Additional time, in milliseconds, that the ACM service will wait for a\n"); + fprintf(f, "# response from a remote ACM service or the IB SA. The actual request\n"); + fprintf(f, "# timeout is this value plus the subnet timeout.\n"); + fprintf(f, "\n"); + fprintf(f, "timeout 2000\n"); + fprintf(f, "\n"); + fprintf(f, "# retries:\n"); + fprintf(f, "# Number of times that the ACM service will retry a request. This affects\n"); + fprintf(f, "# both ACM multicast messages and and IB SA messages.\n"); + fprintf(f, "\n"); + fprintf(f, "retries 2\n"); + fprintf(f, "\n"); + fprintf(f, "# resolve_depth:\n"); + fprintf(f, "# Specifies the maximum number of outstanding requests that can be in\n"); + fprintf(f, "# progress simultaneously. 
A larger resolve depth allows for greater\n"); + fprintf(f, "# parallelism, but increases system resource usage and subnet load.\n"); + fprintf(f, "# If the number of pending requests is greater than the resolve_depth,\n"); + fprintf(f, "# the additional requests will automatically be queued until some of\n"); + fprintf(f, "# the previous requests complete.\n"); + fprintf(f, "\n"); + fprintf(f, "resolve_depth 1\n"); + fprintf(f, "\n"); + fprintf(f, "# sa_depth:\n"); + fprintf(f, "# Specifies the maximum number of outstanding requests to the SA that\n"); + fprintf(f, "# can be in progress simultaneously. A larger SA depth allows for greater\n"); + fprintf(f, "# parallelism, but increases system resource usage and SA load.\n"); + fprintf(f, "# If the number of pending SA requests is greater than the sa_depth,\n"); + fprintf(f, "# the additional requests will automatically be queued until some of\n"); + fprintf(f, "# the previous requests complete. The number of outstanding SA requests\n"); + fprintf(f, "# is separate from the specified resolve_depth.\n"); + fprintf(f, "\n"); + fprintf(f, "sa_depth 1\n"); + fprintf(f, "\n"); + fprintf(f, "# send_depth:\n"); + fprintf(f, "# Specifies the number of outstanding send operations that can\n"); + fprintf(f, "# be in progress simultaneously. A larger send depth allows for\n"); + fprintf(f, "# greater parallelism, but consumes more system resources and subnet load.\n"); + fprintf(f, "# The send_depth is in addition to resolve_depth and sa_depth, and limits\n"); + fprintf(f, "# the transfer of responses.\n"); + fprintf(f, "\n"); + fprintf(f, "send_depth 1\n"); + fprintf(f, "\n"); + fprintf(f, "# recv_depth:\n"); + fprintf(f, "# Specifies the number of buffers allocated and ready to receive remote\n"); + fprintf(f, "# requests. A larger receive depth consumes more system resources, but\n"); + fprintf(f, "# can avoid dropping requests due to insufficient receive buffers.\n"); + fprintf(f, "\n"); + fprintf(f, "recv_depth 1024\n"); + fprintf(f, "\n"); + fprintf(f, "# min_mtu:\n"); + fprintf(f, "# Indicates the minimum MTU supported by the ACM service. The ACM service\n"); + fprintf(f, "# negotiates to use the largest MTU available between both sides of a\n"); + fprintf(f, "# connection. It is most efficient and recommended that min_mtu be set\n"); + fprintf(f, "# to the largest MTU value supported by all nodes in a cluster.\n"); + fprintf(f, "\n"); + fprintf(f, "min_mtu 2048\n"); + fprintf(f, "\n"); + fprintf(f, "# min_rate:\n"); + fprintf(f, "# Indicates the minimum link rate, in Gbps, supported by the ACM service.\n"); + fprintf(f, "# The ACM service negotiates to use the highest rate available between both\n"); + fprintf(f, "# sides of a connection. 
It is most efficient and recommended that the\n"); + fprintf(f, "# min_rate be set to the largest rate supported by all nodes in a cluster.\n"); + fprintf(f, "\n"); + fprintf(f, "min_rate 10\n"); + fprintf(f, "\n"); + fprintf(f, "# route_preload:\n"); + fprintf(f, "# Specifies if the ACM routing cache should be preloaded, or built on demand.\n"); + fprintf(f, "# If preloaded, indicates the method used to build the cache.\n"); + fprintf(f, "# Supported preload values are:\n"); + fprintf(f, "# none - The routing cache is not pre-built (default)\n"); + fprintf(f, "# opensm_full_v1 - OpenSM 'full' path records dump file format (version 1)\n"); + fprintf(f, "\n"); + fprintf(f, "route_preload none\n"); + fprintf(f, "\n"); + fprintf(f, "# route_data_file:\n"); + fprintf(f, "# Specifies the location of the route data file to use when preloading\n"); + fprintf(f, "# the ACM cache. This option is only valid if route_preload\n"); + fprintf(f, "# indicates that routing data should be read from a file.\n"); + fprintf(f, "# Default is %s/ibacm_route.data\n", ACM_CONF_DIR); + fprintf(f, "# route_data_file %s/ibacm_route.data\n", ACM_CONF_DIR); + fprintf(f, "\n"); + fprintf(f, "# addr_preload:\n"); + fprintf(f, "# Specifies if the ACM address cache should be preloaded, or built on demand.\n"); + fprintf(f, "# If preloaded, indicates the method used to build the cache.\n"); + fprintf(f, "# Supported preload values are:\n"); + fprintf(f, "# none - The address cache is not pre-built (default)\n"); + fprintf(f, "# acm_hosts - ACM address to GID file format\n"); + fprintf(f, "\n"); + fprintf(f, "addr_preload none\n"); + fprintf(f, "\n"); + fprintf(f, "# addr_data_file:\n"); + fprintf(f, "# Specifies the location of the address data file to use when preloading\n"); + fprintf(f, "# the ACM cache. 
This option is only valid if addr_preload\n"); + fprintf(f, "# indicates that address data should be read from a file.\n"); + fprintf(f, "# Default is %s/ibacm_hosts.data\n", ACM_CONF_DIR); + fprintf(f, "# addr_data_file %s/ibacm_hosts.data\n", ACM_CONF_DIR); + fprintf(f, "\n"); + fprintf(f, "# support_ips_in_addr_cfg:\n"); + fprintf(f, "# If 1 continue to read IP addresses from ibacm_addr.cfg\n"); + fprintf(f, "# Default is 0 \"no\"\n"); + fprintf(f, "# support_ips_in_addr_cfg 0\n"); + fprintf(f, "\n"); + fprintf(f, "# provider_lib_path:\n"); + fprintf(f, "# Specifies the directory of the provider libraries\n"); + fprintf(f, "\n"); + fprintf(f, "# provider_lib_path %s\n", IBACM_LIB_PATH); + fprintf(f, "\n"); + fprintf(f, "# provider:\n"); + fprintf(f, "# Specifies the provider to assign to each subnet\n"); + fprintf(f, "# ACM providers may override the address and route resolution\n"); + fprintf(f, "# protocols with provider specific protocols.\n"); + fprintf(f, "# provider name (prefix | default)\n"); + fprintf(f, "# Example:\n"); + fprintf(f, "# provider ibacmp 0xFE80000000000000\n"); + fprintf(f, "# provider ibacmp default\n"); + fprintf(f, "\n"); +} + +static int open_dir(void) +{ + mkdir(dest_dir, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + if (chdir(dest_dir)) { + printf("Failed to open directory %s: %s\n", dest_dir, strerror(errno)); + return -1; + } + return 0; +} + +static int gen_opts(void) +{ + FILE *f; + + VPRINT("Generating %s/%s\n", dest_dir, opts_file); + if (open_dir() || !(f = fopen(opts_file, "w"))) { + printf("Failed to open option configuration file: %s\n", strerror(errno)); + return -1; + } + + gen_opts_temp(f); + fclose(f); + return 0; +} + +static void gen_addr_temp(FILE *f) +{ + fprintf(f, "# InfiniBand Communication Management Assistant for clusters address file\n"); + fprintf(f, "#\n"); + fprintf(f, "# Use ib_acme utility with -A option to automatically generate a sample\n"); + fprintf(f, "# ibacm_addr.cfg file for the current system.\n"); + fprintf(f, "#\n"); + fprintf(f, "# Entry format is:\n"); + fprintf(f, "# address device port pkey\n"); + fprintf(f, "#\n"); + fprintf(f, "# NOTE: IP addresses are now automatically read and monitored on the system.\n"); + fprintf(f, "# Therefore they are no longer required in this file.\n"); + fprintf(f, "#\n"); + fprintf(f, "# The address may be one of the following:\n"); + fprintf(f, "# host_name - ascii character string, up to 31 characters\n"); + fprintf(f, "#\n"); + fprintf(f, "# device name - struct ibv_device name\n"); + fprintf(f, "# port number - valid port number on device (numbering starts at 1)\n"); + fprintf(f, "# pkey - partition key in hex (can specify 'default' for first entry in pkey table)\n"); + fprintf(f, "#\n"); + fprintf(f, "# Up to 4 addresses can be associated with a given <device, port, pkey> tuple\n"); + fprintf(f, "#\n"); + fprintf(f, "# Samples:\n"); + fprintf(f, "# node31 ibv_device0 1 default\n"); + fprintf(f, "# node31-1 ibv_device0 1 0x00FF\n"); + fprintf(f, "# node31-2 ibv_device0 2 0x00FF\n"); +} + +static int open_verbs(void) +{ + struct ibv_device **dev_array; + int i, ret; + + dev_array = ibv_get_device_list(&dev_cnt); + if (!dev_array) { + printf("ibv_get_device_list - no devices present?\n"); + return -1; + } + + verbs = malloc(sizeof(struct ibv_context *) * dev_cnt); + if (!verbs) { + ret = -1; + goto err1; + } + + for (i = 0; i < dev_cnt; i++) { + verbs[i] = ibv_open_device(dev_array[i]); + if (!verbs[i]) { + printf("ibv_open_device - failed to open device\n"); + ret = -1; + goto err2; + 
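+			/*
+			 * The err2 path below unwinds in reverse: it closes
+			 * every device opened so far, then frees the verbs
+			 * array.
+			 */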
} + } + + ibv_free_device_list(dev_array); + return 0; + +err2: + while (i--) + ibv_close_device(verbs[i]); + free(verbs); +err1: + ibv_free_device_list(dev_array); + return ret; +} + +static void close_verbs(void) +{ + int i; + + for (i = 0; i < dev_cnt; i++) + ibv_close_device(verbs[i]); + free(verbs); +} + +static int gen_addr_names(FILE *f) +{ + struct ibv_device_attr dev_attr; + struct ibv_port_attr port_attr; + int i, index, ret, found_active; + char host_name[256]; + uint8_t p; + + ret = gethostname(host_name, sizeof host_name); + if (ret) { + printf("gethostname error: %d\n", ret); + return ret; + } + strtok(host_name, "."); + + found_active = 0; + index = 1; + for (i = 0; i < dev_cnt; i++) { + ret = ibv_query_device(verbs[i], &dev_attr); + if (ret) + break; + + for (p = 1; p <= dev_attr.phys_port_cnt; p++) { + if (!found_active) { + ret = ibv_query_port(verbs[i], p, &port_attr); + if (!ret && port_attr.state == IBV_PORT_ACTIVE) { + VPRINT("%s %s %d default\n", + host_name, verbs[i]->device->name, p); + fprintf(f, "%s %s %d default\n", + host_name, verbs[i]->device->name, p); + found_active = 1; + } + } + + VPRINT("%s-%d %s %d default\n", + host_name, index, verbs[i]->device->name, p); + fprintf(f, "%s-%d %s %d default\n", + host_name, index++, verbs[i]->device->name, p); + } + } + + return ret; +} + +static int gen_addr(void) +{ + FILE *f; + int ret; + + VPRINT("Generating %s/%s\n", dest_dir, addr_file); + if (open_dir() || !(f = fopen(addr_file, "w"))) { + printf("Failed to open address configuration file: %s\n", strerror(errno)); + return -1; + } + + ret = open_verbs(); + if (ret) { + goto out1; + } + + gen_addr_temp(f); + ret = gen_addr_names(f); + if (ret) { + printf("Failed to auto generate host names in config file\n"); + goto out2; + } + +out2: + close_verbs(); +out1: + fclose(f); + return ret; +} + +static void show_path(struct ibv_path_record *path) +{ + char gid[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"]; + uint32_t fl_hop; + + printf("Path information\n"); + inet_ntop(AF_INET6, path->dgid.raw, gid, sizeof gid); + printf(" dgid: %s\n", gid); + inet_ntop(AF_INET6, path->sgid.raw, gid, sizeof gid); + printf(" sgid: %s\n", gid); + printf(" dlid: %u\n", be16toh(path->dlid)); + printf(" slid: %u\n", be16toh(path->slid)); + fl_hop = be32toh(path->flowlabel_hoplimit); + printf(" flow label: 0x%x\n", fl_hop >> 8); + printf(" hop limit: %d\n", (uint8_t) fl_hop); + printf(" tclass: %d\n", path->tclass); + printf(" reversible: %d\n", path->reversible_numpath >> 7); + printf(" pkey: 0x%x\n", be16toh(path->pkey)); + printf(" sl: %d\n", be16toh(path->qosclass_sl) & 0xF); + printf(" mtu: %d\n", path->mtu & 0x1F); + printf(" rate: %d\n", path->rate & 0x1F); + printf(" packet lifetime: %d\n", path->packetlifetime & 0x1F); +} + +static uint32_t get_resolve_flags(void) +{ + uint32_t flags = 0; + + if (nodelay) + flags |= ACM_FLAGS_NODELAY; + + return flags; +} + +static int inet_any_pton(char *addr, struct sockaddr *sa) +{ + struct sockaddr_in *sin; + struct sockaddr_in6 *sin6; + int ret; + + sin = (struct sockaddr_in *) sa; + sa->sa_family = AF_INET; + ret = inet_pton(AF_INET, addr, &sin->sin_addr); + if (ret <= 0) { + sin6 = (struct sockaddr_in6 *) sa; + sa->sa_family = AF_INET6; + ret = inet_pton(AF_INET6, addr, &sin6->sin6_addr); + } + + return ret; +} + +static int resolve_ip(struct ibv_path_record *path) +{ + struct ibv_path_data *paths; + struct sockaddr_storage src, dest; + struct sockaddr *saddr; + int ret, count; + + if (src_addr) { + saddr = (struct sockaddr *) &src; + 
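+		/*
+		 * inet_any_pton() above tries the string first as IPv4 and
+		 * then as IPv6, setting sa_family to whichever parse
+		 * succeeds, so the code below serves both address families.
+		 */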
ret = inet_any_pton(src_addr, saddr); + if (ret <= 0) { + printf("inet_pton error on source address (%s): 0x%x\n", src_addr, ret); + return -1; + } + } else { + saddr = NULL; + } + + ret = inet_any_pton(dest_addr, (struct sockaddr *) &dest); + if (ret <= 0) { + printf("inet_pton error on destination address (%s): 0x%x\n", dest_addr, ret); + return -1; + } + + if (src_addr && src.ss_family != dest.ss_family) { + printf("source and destination address families don't match\n"); + return -1; + } + + ret = ib_acm_resolve_ip(saddr, (struct sockaddr *) &dest, + &paths, &count, get_resolve_flags(), (repetitions == 1)); + if (ret) { + printf("ib_acm_resolve_ip failed: %s\n", strerror(errno)); + return ret; + } + + *path = paths[0].path; + ib_acm_free_paths(paths); + return 0; +} + +static int resolve_name(struct ibv_path_record *path) +{ + struct ibv_path_data *paths; + int ret, count; + + ret = ib_acm_resolve_name(src_addr, dest_addr, &paths, &count, get_resolve_flags(), (repetitions == 1)); + if (ret) { + printf("ib_acm_resolve_name failed: %s\n", strerror(errno)); + return ret; + } + + *path = paths[0].path; + ib_acm_free_paths(paths); + return 0; +} + +static int resolve_lid(struct ibv_path_record *path) +{ + int ret; + + if (src_addr) + path->slid = htobe16((uint16_t) atoi(src_addr)); + path->dlid = htobe16((uint16_t) atoi(dest_addr)); + path->reversible_numpath = IBV_PATH_RECORD_REVERSIBLE | 1; + + ret = ib_acm_resolve_path(path, get_resolve_flags()); + if (ret) + printf("ib_acm_resolve_path failed: %s\n", strerror(errno)); + + return ret; +} + +static int resolve_gid(struct ibv_path_record *path) +{ + int ret; + + if (src_addr) { + ret = inet_pton(AF_INET6, src_addr, &path->sgid); + if (ret <= 0) { + printf("inet_pton error on source address (%s): 0x%x\n", + src_addr, ret); + return ret ? ret : -1; + } + } + + ret = inet_pton(AF_INET6, dest_addr, &path->dgid); + if (ret <= 0) { + printf("inet_pton error on dest address (%s): 0x%x\n", dest_addr, ret); + return ret ? 
ret : -1; + } + + path->reversible_numpath = IBV_PATH_RECORD_REVERSIBLE | 1; + ret = ib_acm_resolve_path(path, get_resolve_flags()); + if (ret) + printf("ib_acm_resolve_path failed: %s\n", strerror(errno)); + + return ret; +} + +static int verify_resolve(struct ibv_path_record *path) +{ + int ret; + + ret = ib_acm_resolve_path(path, ACM_FLAGS_QUERY_SA); + if (ret) + printf("SA verification: failed %s\n", strerror(errno)); + else + printf("SA verification: success\n"); + + return ret; +} + +static char *get_dest(char *arg, char *format) +{ + static char addr[64]; + struct addrinfo hint, *res; + const char *ai; + int ret; + + if (!arg || addr_type != 'u') { + *format = addr_type; + return arg; + } + + if ((inet_pton(AF_INET, arg, addr) > 0) || (inet_pton(AF_INET6, arg, addr) > 0)) { + *format = 'i'; + return arg; + } + + memset(&hint, 0, sizeof hint); + hint.ai_protocol = IPPROTO_TCP; + ret = getaddrinfo(arg, NULL, &hint, &res); + if (ret) { + *format = 'l'; + return arg; + } + + if (res->ai_family == AF_INET) { + ai = inet_ntop(AF_INET, &((struct sockaddr_in *) res->ai_addr)->sin_addr, + addr, sizeof addr); + } else { + ai = inet_ntop(AF_INET6, &((struct sockaddr_in6 *) res->ai_addr)->sin6_addr, + addr, sizeof addr); + } + freeaddrinfo(res); + + if (ai) { + *format = 'i'; + return addr; + } else { + *format = 'n'; + return arg; + } +} + +static int resolve(char *svc) +{ + char **dest_list, **src_list; + struct ibv_path_record path; + int ret = -1, d = 0, s = 0, i; + char dest_type; + + dest_list = parse(dest_arg, NULL); + if (!dest_list) { + printf("Unable to parse destination argument\n"); + return ret; + } + + src_list = src_arg ? parse(src_arg, NULL) : NULL; + + printf("Service: %s\n", svc); + for (dest_addr = get_dest(dest_list[d], &dest_type); dest_addr; + dest_addr = get_dest(dest_list[++d], &dest_type)) { + s = 0; + src_addr = src_list ? 
src_list[s] : NULL; + do { + printf("Destination: %s\n", dest_addr); + if (src_addr) + printf("Source: %s\n", src_addr); + for (i = 0; i < repetitions; i++) { + switch (dest_type) { + case 'i': + ret = resolve_ip(&path); + break; + case 'n': + ret = resolve_name(&path); + break; + case 'l': + memset(&path, 0, sizeof path); + ret = resolve_lid(&path); + break; + case 'g': + memset(&path, 0, sizeof path); + ret = resolve_gid(&path); + break; + default: + break; + } + } + + if (!ret) + show_path(&path); + + if (!ret && verify) + ret = verify_resolve(&path); + printf("\n"); + + if (src_list) + src_addr = src_list[++s]; + } while (src_addr); + } + + free(dest_list); + + return ret; +} + +static int query_perf_ip(uint64_t **counters, int *cnt) +{ + union _sockaddr { + struct sockaddr_storage src; + struct sockaddr saddr; + } addr; + uint8_t type; + struct sockaddr_in *sin; + struct sockaddr_in6 *sin6; + int ret; + + VPRINT("%s: src_addr %s\n", __FUNCTION__, src_addr); + addr.saddr.sa_family = AF_INET; + sin = (struct sockaddr_in *) &addr.saddr; + ret = inet_pton(AF_INET, src_addr, &sin->sin_addr); + if (ret <= 0) { + addr.saddr.sa_family = AF_INET6; + sin6 = (struct sockaddr_in6 *)&addr.saddr; + ret = inet_pton(AF_INET6, src_addr, &sin6->sin6_addr); + if (ret <= 0) { + printf("inet_pton error on src address (%s): 0x%x\n", + src_addr, ret); + return -1; + } + type = ACM_EP_INFO_ADDRESS_IP6; + } else { + type = ACM_EP_INFO_ADDRESS_IP; + } + + ret = ib_acm_query_perf_ep_addr((uint8_t *)&addr.src, type, counters, + cnt); + if (ret) { + printf("ib_acm_query_perf failed: %s\n", strerror(errno)); + return ret; + } + + return 0; +} + +static int query_perf_name(uint64_t **counters, int *cnt) +{ + int ret; + + VPRINT("%s: src_addr %s\n", __FUNCTION__, src_addr); + ret = ib_acm_query_perf_ep_addr((uint8_t *)src_addr, ACM_EP_INFO_NAME, + counters, cnt); + if (ret) { + printf("ib_acm_query_perf failed: %s\n", strerror(errno)); + return ret; + } + + return 0; +} + +static int query_perf_ep_addr(uint64_t **counters, int *cnt) +{ + int ret; + char src_type; + + src_addr = get_dest(src_arg, &src_type); + switch (src_type) { + case 'i': + ret = query_perf_ip(counters, cnt); + break; + case 'n': + ret = query_perf_name(counters, cnt); + break; + default: + printf("Unsupported src_type %d\n", src_type); + return -1; + } + + return ret; +} + +static int query_perf_one(char *svc, int index) +{ + static int labels; + int ret, cnt, i; + uint64_t *counters; + + if (perf_query == PERF_QUERY_EP_ADDR) + ret = query_perf_ep_addr(&counters, &cnt); + else + ret = ib_acm_query_perf(index, &counters, &cnt); + + if (ret) { + if (perf_query != PERF_QUERY_EP_ALL) { + printf("%s: Failed to query perf data: %s\n", svc, + strerror(errno)); + } + return ret; + } + + if (perf_query != PERF_QUERY_COL) { + if (!labels) { + printf("svc,"); + for (i = 0; i < cnt - 1; i++) + printf("%s,", ib_acm_cntr_name(i)); + printf("%s\n", ib_acm_cntr_name(i)); + labels = 1; + } + printf("%s,", svc); + for (i = 0; i < cnt - 1; i++) + printf("%llu,", (unsigned long long) counters[i]); + printf("%llu\n", (unsigned long long) counters[i]); + } else { + printf("%s\n", svc); + for (i = 0; i < cnt; i++) { + printf("%s : ", ib_acm_cntr_name(i)); + printf("%llu\n", (unsigned long long) counters[i]); + } + } + ib_acm_free_perf(counters); + + return 0; +} + +static void query_perf(char *svc) +{ + int index = 1; + + if (perf_query != PERF_QUERY_EP_ALL) { + query_perf_one(svc, ep_index); + } + else { + while (!query_perf_one(svc, index++)); + } +} + +static int 
enumerate_ep(char *svc, int index) +{ + static int labels; + int ret, i; + struct acm_ep_config_data *ep_data; + int phys_port_cnt = 255; + int found = 0; + int port; + + for (port = 1; port <= phys_port_cnt; ++port) { + ret = ib_acm_enum_ep(index, &ep_data, port); + if (ret) + continue; + + found = 1; + + if (!labels) { + printf("svc,guid,port,pkey,ep_index,prov,addr_0,addresses\n"); + labels = 1; + } + + printf("%s,0x%016" PRIx64 ",%d,0x%04x,%d,%s", svc, ep_data->dev_guid, + ep_data->port_num, ep_data->pkey, index, ep_data->prov_name); + for (i = 0; i < ep_data->addr_cnt; i++) + printf(",%s", ep_data->addrs[i].name); + printf("\n"); + phys_port_cnt = ep_data->phys_port_cnt; + ib_acm_free_ep_data(ep_data); + } + + return !found; +} + +static void enumerate_eps(char *svc) +{ + int index = 1; + + if (ep_index > 0) { + if (enumerate_ep(svc, ep_index)) + printf(" Endpoint %d is not available\n", ep_index); + } else { + while (!enumerate_ep(svc, index++)); + } +} + +static int query_svcs(void) +{ + char **svc_list; + int ret = -1, i; + + svc_list = parse(svc_arg, NULL); + if (!svc_list) { + printf("Unable to parse service list argument\n"); + return -1; + } + + for (i = 0; svc_list[i]; i++) { + ret = ib_acm_connect(svc_list[i]); + if (ret) { + printf("%s,unable to contact service: %s\n", + svc_list[i], strerror(errno)); + continue; + } + + if (dest_arg) + ret = resolve(svc_list[i]); + + if (perf_query) + query_perf(svc_list[i]); + + if (enum_ep) + enumerate_eps(svc_list[i]); + + ib_acm_disconnect(); + } + + free(svc_list); + return ret; +} + +static char *opt_arg(int argc, char **argv) +{ + if (optarg) + return optarg; + + if ((optind < argc) && (argv[optind][0] != '-')) + return argv[optind]; + + return NULL; +} + +static void parse_perf_arg(char *arg) +{ + if (!strncasecmp("col", arg, 3)) { + perf_query = PERF_QUERY_COL; + } else if (!strncasecmp("all", arg, 3)) { + perf_query = PERF_QUERY_EP_ALL; + } else if (!strcmp("s", arg)) { + perf_query = PERF_QUERY_EP_ADDR; + } else { + ep_index = atoi(arg); + if (ep_index > 0) + perf_query = PERF_QUERY_EP_INDEX; + else + perf_query = PERF_QUERY_ROW; + } +} + +int main(int argc, char **argv) +{ + int op, ret = 0; + int make_addr = 0; + int make_opts = 0; + + while ((op = getopt(argc, argv, "e::f:s:d:vcA::O::D:P::S:C:V")) != -1) { + switch (op) { + case 'e': + enum_ep = 1; + if (opt_arg(argc, argv)) + ep_index = atoi(opt_arg(argc, argv)); + break; + case 'f': + addr_type = optarg[0]; + if (addr_type != 'i' && addr_type != 'n' && + addr_type != 'l' && addr_type != 'g') + goto show_use; + break; + case 's': + src_arg = optarg; + break; + case 'd': + dest_arg = optarg; + break; + case 'v': + verify = 1; + break; + case 'c': + nodelay = 1; + break; + case 'A': + make_addr = 1; + if (opt_arg(argc, argv)) + addr_file = opt_arg(argc, argv); + break; + case 'O': + make_opts = 1; + if (opt_arg(argc, argv)) + opts_file = opt_arg(argc, argv); + break; + case 'D': + dest_dir = optarg; + break; + case 'P': + if (opt_arg(argc, argv)) + parse_perf_arg(opt_arg(argc, argv)); + else + perf_query = PERF_QUERY_ROW; + break; + case 'S': + svc_arg = optarg; + break; + case 'C': + repetitions = atoi(optarg); + if (!repetitions) + repetitions = 1; + break; + case 'V': + verbose = 1; + break; + default: + goto show_use; + } + } + + if ((src_arg && (!dest_arg && perf_query != PERF_QUERY_EP_ADDR)) || + (perf_query == PERF_QUERY_EP_ADDR && !src_arg) || + (!src_arg && !dest_arg && !perf_query && !make_addr && !make_opts && + !enum_ep)) + goto show_use; + + if (dest_arg || 
perf_query || enum_ep) + ret = query_svcs(); + + if (!ret && make_addr) + ret = gen_addr(); + + if (!ret && make_opts) + ret = gen_opts(); + + if (verbose || !(make_addr || make_opts) || ret) + printf("return status 0x%x\n", ret); + return ret; + +show_use: + show_usage(argv[0]); + exit(1); +} diff --git a/ibacm/src/libacm.c b/ibacm/src/libacm.c new file mode 100644 index 0000000..d09be36 --- /dev/null +++ b/ibacm/src/libacm.c @@ -0,0 +1,529 @@ +/* + * Copyright (c) 2009 Intel Corporation. All rights reserved. + * Copyright (c) 2013 Mellanox Technologies LTD. All rights reserved. + * + * This software is available to you under the OpenIB.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AWV + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <config.h> + +#include <osd.h> +#include "libacm.h" +#include <infiniband/acm.h> +#include <stdio.h> +#include <errno.h> +#include <netdb.h> +#include <arpa/inet.h> +#include <sys/socket.h> +#include <sys/un.h> + +static pthread_mutex_t acm_lock = PTHREAD_MUTEX_INITIALIZER; +static int sock = -1; +static short server_port = 6125; + +static void acm_set_server_port(void) +{ + FILE *f; + + f = fopen(IBACM_IBACME_PORT_FILE, "r"); + if (f) { + if (fscanf(f, "%hu", (unsigned short *) &server_port) != 1) + printf("Failed to read server port\n"); + fclose(f); + } +} + +static int ib_acm_connect_open(char *dest) +{ + struct addrinfo hint, *res; + int ret; + + acm_set_server_port(); + memset(&hint, 0, sizeof hint); + + hint.ai_family = AF_UNSPEC; + hint.ai_protocol = IPPROTO_TCP; + + ret = getaddrinfo(dest, NULL, &hint, &res); + if (ret) + return ret; + + sock = socket(res->ai_family, res->ai_socktype, res->ai_protocol); + if (sock == -1) { + ret = errno; + goto freeaddr; + } + + ((struct sockaddr_in *) res->ai_addr)->sin_port = htobe16(server_port); + ret = connect(sock, res->ai_addr, res->ai_addrlen); + if (ret) { + close(sock); + sock = -1; + } + +freeaddr: + freeaddrinfo(res); + return ret; +} + +static int ib_acm_connect_unix(char *dest) +{ + struct sockaddr_un addr; + int ret; + + addr.sun_family = AF_UNIX; + if (dest) { + if (snprintf(addr.sun_path, sizeof(addr.sun_path), + "%s", dest) >= sizeof(addr.sun_path)) { + errno = ENAMETOOLONG; + return errno; + } + } else { + BUILD_ASSERT(sizeof(IBACM_IBACME_SERVER_PATH) <= + sizeof(addr.sun_path)); + strcpy(addr.sun_path, IBACM_IBACME_SERVER_PATH); + } + + sock = socket(AF_UNIX, SOCK_STREAM, 0); + if (sock < 0) + return errno; + + if (connect(sock, (struct sockaddr *)&addr, sizeof(addr)) != 0) { + ret = errno; + close(sock); + sock = -1; + errno = ret; + return ret; 
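+		/*
+		 * close() can clobber errno, so the connect() error code is
+		 * saved above and restored before being returned.
+		 */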
+ } + + return 0; +} + +int ib_acm_connect(char *dest) +{ + if (dest && *dest == '/') + return ib_acm_connect_unix(dest); + + return ib_acm_connect_open(dest); +} + +void ib_acm_disconnect(void) +{ + if (sock != -1) { + shutdown(sock, SHUT_RDWR); + close(sock); + sock = -1; + } +} + +static int acm_format_resp(struct acm_msg *msg, + struct ibv_path_data **paths, int *count, int print) +{ + struct ibv_path_data *path_data; + char addr[ACM_MAX_ADDRESS]; + int i, addr_cnt; + + *count = 0; + addr_cnt = (msg->hdr.length - ACM_MSG_HDR_LENGTH) / + sizeof(struct acm_ep_addr_data); + path_data = (struct ibv_path_data *) + calloc(1, addr_cnt * sizeof(struct ibv_path_data)); + if (!path_data) + return -1; + + for (i = 0; i < addr_cnt; i++) { + switch (msg->resolve_data[i].type) { + case ACM_EP_INFO_PATH: + path_data[i].flags = msg->resolve_data[i].flags; + path_data[i].path = msg->resolve_data[i].info.path; + (*count)++; + break; + default: + if (!(msg->resolve_data[i].flags & ACM_EP_FLAG_SOURCE)) + goto err; + + switch (msg->resolve_data[i].type) { + case ACM_EP_INFO_ADDRESS_IP: + inet_ntop(AF_INET, msg->resolve_data[i].info.addr, + addr, sizeof addr); + break; + case ACM_EP_INFO_ADDRESS_IP6: + inet_ntop(AF_INET6, msg->resolve_data[i].info.addr, + addr, sizeof addr); + break; + case ACM_EP_INFO_NAME: + memcpy(addr, msg->resolve_data[i].info.name, + ACM_MAX_ADDRESS); + break; + default: + goto err; + } + if (print) + printf("Source: %s\n", addr); + break; + } + } + + *paths = path_data; + return 0; +err: + free(path_data); + return -1; +} + +static int acm_format_ep_addr(struct acm_ep_addr_data *data, uint8_t *addr, + uint8_t type, uint32_t flags) +{ + data->type = type; + data->flags = flags; + + switch (type) { + case ACM_EP_INFO_NAME: + strncpy((char *) data->info.name, (char *) addr, ACM_MAX_ADDRESS); + break; + case ACM_EP_INFO_ADDRESS_IP: + memcpy(data->info.addr, &((struct sockaddr_in *) addr)->sin_addr, 4); + break; + case ACM_EP_INFO_ADDRESS_IP6: + memcpy(data->info.addr, &((struct sockaddr_in6 *) addr)->sin6_addr, 16); + break; + default: + return -1; + } + + return 0; +} + +static inline int ERR(int err) +{ + errno = err; + return -1; +} + +static int acm_error(uint8_t status) +{ + switch (status) { + case ACM_STATUS_SUCCESS: + return 0; + case ACM_STATUS_ENOMEM: + return ERR(ENOMEM); + case ACM_STATUS_EINVAL: + return ERR(EINVAL); + case ACM_STATUS_ENODATA: + return ERR(ENODATA); + case ACM_STATUS_ENOTCONN: + return ERR(ENOTCONN); + case ACM_STATUS_ETIMEDOUT: + return ERR(ETIMEDOUT); + case ACM_STATUS_ESRCADDR: + case ACM_STATUS_EDESTADDR: + return ERR(EADDRNOTAVAIL); + case ACM_STATUS_ESRCTYPE: + case ACM_STATUS_EDESTTYPE: + default: + return ERR(EINVAL); + } +} + +static int acm_resolve(uint8_t *src, uint8_t *dest, uint8_t type, + struct ibv_path_data **paths, int *count, uint32_t flags, int print) +{ + struct acm_msg msg; + int ret, cnt = 0; + + pthread_mutex_lock(&acm_lock); + memset(&msg, 0, sizeof msg); + msg.hdr.version = ACM_VERSION; + msg.hdr.opcode = ACM_OP_RESOLVE; + + if (src) { + ret = acm_format_ep_addr(&msg.resolve_data[cnt++], src, type, + ACM_EP_FLAG_SOURCE); + if (ret) + goto out; + } + + ret = acm_format_ep_addr(&msg.resolve_data[cnt++], dest, type, + ACM_EP_FLAG_DEST | flags); + if (ret) + goto out; + + msg.hdr.length = ACM_MSG_HDR_LENGTH + (cnt * ACM_MSG_EP_LENGTH); + + ret = send(sock, (char *) &msg, msg.hdr.length, 0); + if (ret != msg.hdr.length) + goto out; + + ret = recv(sock, (char *) &msg, sizeof msg, 0); + if (ret < ACM_MSG_HDR_LENGTH || ret != msg.hdr.length) 
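+		/*
+		 * A short recv() or a length that disagrees with the
+		 * returned header means the response is unusable.
+		 */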
+ goto out; + + if (msg.hdr.status) { + ret = acm_error(msg.hdr.status); + goto out; + } + + ret = acm_format_resp(&msg, paths, count, print); +out: + pthread_mutex_unlock(&acm_lock); + return ret; +} + +int ib_acm_resolve_name(char *src, char *dest, + struct ibv_path_data **paths, int *count, uint32_t flags, int print) +{ + return acm_resolve((uint8_t *) src, (uint8_t *) dest, + ACM_EP_INFO_NAME, paths, count, flags, print); +} + +int ib_acm_resolve_ip(struct sockaddr *src, struct sockaddr *dest, + struct ibv_path_data **paths, int *count, uint32_t flags, int print) +{ + if (((struct sockaddr *) dest)->sa_family == AF_INET) { + return acm_resolve((uint8_t *) src, (uint8_t *) dest, + ACM_EP_INFO_ADDRESS_IP, paths, count, flags, print); + } else { + return acm_resolve((uint8_t *) src, (uint8_t *) dest, + ACM_EP_INFO_ADDRESS_IP6, paths, count, flags, print); + } +} + +int ib_acm_resolve_path(struct ibv_path_record *path, uint32_t flags) +{ + struct acm_msg msg; + struct acm_ep_addr_data *data; + int ret; + + pthread_mutex_lock(&acm_lock); + memset(&msg, 0, sizeof msg); + msg.hdr.version = ACM_VERSION; + msg.hdr.opcode = ACM_OP_RESOLVE; + msg.hdr.length = ACM_MSG_HDR_LENGTH + ACM_MSG_EP_LENGTH; + + data = &msg.resolve_data[0]; + data->flags = flags; + data->type = ACM_EP_INFO_PATH; + data->info.path = *path; + + ret = send(sock, (char *) &msg, msg.hdr.length, 0); + if (ret != msg.hdr.length) + goto out; + + ret = recv(sock, (char *) &msg, sizeof msg, 0); + if (ret < ACM_MSG_HDR_LENGTH || ret != msg.hdr.length) + goto out; + + ret = acm_error(msg.hdr.status); + if (!ret) + *path = data->info.path; + +out: + pthread_mutex_unlock(&acm_lock); + return ret; +} + +int ib_acm_query_perf(int index, uint64_t **counters, int *count) +{ + struct acm_msg msg; + int ret, i; + + pthread_mutex_lock(&acm_lock); + memset(&msg, 0, sizeof msg); + msg.hdr.version = ACM_VERSION; + msg.hdr.opcode = ACM_OP_PERF_QUERY; + msg.hdr.src_index = index; + msg.hdr.length = htobe16(ACM_MSG_HDR_LENGTH); + + ret = send(sock, (char *) &msg, ACM_MSG_HDR_LENGTH, 0); + if (ret != ACM_MSG_HDR_LENGTH) + goto out; + + ret = recv(sock, (char *) &msg, sizeof msg, 0); + if (ret < ACM_MSG_HDR_LENGTH || ret != be16toh(msg.hdr.length)) { + ret = ACM_STATUS_EINVAL; + goto out; + } + + if (msg.hdr.status) { + ret = acm_error(msg.hdr.status); + goto out; + } + + *counters = malloc(sizeof(uint64_t) * msg.hdr.src_out); + if (!*counters) { + ret = ACM_STATUS_ENOMEM; + goto out; + } + + *count = msg.hdr.src_out; + for (i = 0; i < *count; i++) + (*counters)[i] = be64toh(msg.perf_data[i]); + ret = 0; +out: + pthread_mutex_unlock(&acm_lock); + return ret; +} + +int ib_acm_enum_ep(int index, struct acm_ep_config_data **data, uint8_t port) +{ + struct acm_ep_config_data *netw_edata = NULL; + struct acm_ep_config_data *host_edata = NULL; + struct acm_hdr hdr; + struct acm_msg msg; + int ret; + int len; + int i; + + pthread_mutex_lock(&acm_lock); + memset(&msg, 0, sizeof msg); + msg.hdr.version = ACM_VERSION; + msg.hdr.opcode = ACM_OP_EP_QUERY; + msg.hdr.src_out = index; + msg.hdr.src_index = port; + msg.hdr.length = htobe16(ACM_MSG_HDR_LENGTH); + + ret = send(sock, (char *) &msg, ACM_MSG_HDR_LENGTH, 0); + if (ret != ACM_MSG_HDR_LENGTH) + goto out; + + ret = recv(sock, (char *) &hdr, sizeof(hdr), 0); + if (ret != sizeof(hdr)) { + ret = ACM_STATUS_EINVAL; + goto out; + } + + if (hdr.status) { + ret = acm_error(hdr.status); + goto out; + } + + len = be16toh(hdr.length) - sizeof(hdr); + netw_edata = malloc(len); + host_edata = malloc(len); + if 
(!netw_edata || !host_edata) {
+ ret = ACM_STATUS_ENOMEM;
+ goto out;
+ }
+
+ ret = recv(sock, (char *)netw_edata, len, 0);
+ if (ret != len) {
+ ret = ACM_STATUS_EINVAL;
+ goto out;
+ }
+
+ host_edata->dev_guid = be64toh(netw_edata->dev_guid);
+ host_edata->port_num = netw_edata->port_num;
+ host_edata->phys_port_cnt = netw_edata->phys_port_cnt;
+ host_edata->pkey = be16toh(netw_edata->pkey);
+ host_edata->addr_cnt = be16toh(netw_edata->addr_cnt);
+
+ memcpy(host_edata->prov_name, netw_edata->prov_name,
+ sizeof(host_edata->prov_name));
+
+ for (i = 0; i < host_edata->addr_cnt; ++i)
+ host_edata->addrs[i] = netw_edata->addrs[i];
+
+ *data = host_edata;
+ ret = 0;
+out:
+ free(netw_edata);
+ if (ret)
+ free(host_edata);
+ pthread_mutex_unlock(&acm_lock);
+ return ret;
+}
+
+int ib_acm_query_perf_ep_addr(uint8_t *src, uint8_t type,
+ uint64_t **counters, int *count)
+{
+ struct acm_msg msg;
+ int ret, i, len;
+
+ if (!src)
+ return -1;
+
+ pthread_mutex_lock(&acm_lock);
+ memset(&msg, 0, sizeof msg);
+ msg.hdr.version = ACM_VERSION;
+ msg.hdr.opcode = ACM_OP_PERF_QUERY;
+
+ ret = acm_format_ep_addr(&msg.resolve_data[0], src, type,
+ ACM_EP_FLAG_SOURCE);
+ if (ret)
+ goto out;
+
+ len = ACM_MSG_HDR_LENGTH + ACM_MSG_EP_LENGTH;
+ msg.hdr.length = htobe16(len);
+
+ ret = send(sock, (char *) &msg, len, 0);
+ if (ret != len)
+ goto out;
+
+ ret = recv(sock, (char *) &msg, sizeof msg, 0);
+ if (ret < ACM_MSG_HDR_LENGTH || ret != be16toh(msg.hdr.length)) {
+ ret = ACM_STATUS_EINVAL;
+ goto out;
+ }
+
+ if (msg.hdr.status) {
+ ret = acm_error(msg.hdr.status);
+ goto out;
+ }
+
+ *counters = malloc(sizeof(uint64_t) * msg.hdr.src_out);
+ if (!*counters) {
+ ret = ACM_STATUS_ENOMEM;
+ goto out;
+ }
+
+ *count = msg.hdr.src_out;
+ for (i = 0; i < *count; i++)
+ (*counters)[i] = be64toh(msg.perf_data[i]);
+
+ ret = 0;
+out:
+ pthread_mutex_unlock(&acm_lock);
+ return ret;
+}
+
+
+const char *ib_acm_cntr_name(int index)
+{
+ static const char *const cntr_name[] = {
+ [ACM_CNTR_ERROR] = "Error Count",
+ [ACM_CNTR_RESOLVE] = "Resolve Count",
+ [ACM_CNTR_NODATA] = "No Data",
+ [ACM_CNTR_ADDR_QUERY] = "Addr Query Count",
+ [ACM_CNTR_ADDR_CACHE] = "Addr Cache Count",
+ [ACM_CNTR_ROUTE_QUERY] = "Route Query Count",
+ [ACM_CNTR_ROUTE_CACHE] = "Route Cache Count",
+ };
+
+ if (index < ACM_CNTR_ERROR || index > ACM_MAX_COUNTER)
+ return "Unknown";
+
+ return cntr_name[index];
+} diff --git a/ibacm/src/libacm.h b/ibacm/src/libacm.h new file mode 100644 index 0000000..fb81b41 --- /dev/null +++ b/ibacm/src/libacm.h @@ -0,0 +1,60 @@ +/*
+ * Copyright (c) 2009 Intel Corporation. All rights reserved.
+ * Copyright (c) 2013 Mellanox Technologies LTD. All rights reserved.
+ *
+ * This software is available to you under the OpenIB.org BSD license
+ * below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef LIBACM_H
+#define LIBACM_H
+
+#include <infiniband/acm.h>
+
+struct sockaddr;
+
+int ib_acm_connect(char *dest_svc);
+void ib_acm_disconnect(void);
+
+int ib_acm_resolve_name(char *src, char *dest,
+ struct ibv_path_data **paths, int *count, uint32_t flags,
+ int print);
+int ib_acm_resolve_ip(struct sockaddr *src, struct sockaddr *dest,
+ struct ibv_path_data **paths, int *count, uint32_t flags,
+ int print);
+int ib_acm_resolve_path(struct ibv_path_record *path, uint32_t flags);
+#define ib_acm_free_paths(paths) free(paths)
+
+int ib_acm_query_perf(int index, uint64_t **counters, int *count);
+int ib_acm_query_perf_ep_addr(uint8_t *src, uint8_t type,
+ uint64_t **counters, int *count);
+#define ib_acm_free_perf(counters) free(counters)
+
+const char *ib_acm_cntr_name(int index);
+
+int ib_acm_enum_ep(int index, struct acm_ep_config_data **data, uint8_t port);
+#define ib_acm_free_ep_data(data) free(data)
+
+#endif /* LIBACM_H */ diff --git a/ibacm/src/parse.c b/ibacm/src/parse.c new file mode 100644 index 0000000..15aab17 --- /dev/null +++ b/ibacm/src/parse.c @@ -0,0 +1,125 @@ +/*
+ * Copyright (c) 2009-2010 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under the OpenIB.org BSD license
+ * below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "acm_util.h" + +static char *expand(char *basename, char *args, int *str_cnt, int *str_size) +{ + char buf[256]; + char *str_buf = NULL; + char *token, *tmp; + int from, to, width; + int size = 0, cnt = 0; + + token = strtok(args, ","); + do { + from = atoi(token); + tmp = index(token, '-'); + if (tmp) { + to = atoi(tmp+1); + width = tmp - token; + } else { + to = from; + width = strlen(token); + } + + while (from <= to) { + snprintf(buf, sizeof buf, "%s%0*d", basename, width, from); + str_buf = realloc(str_buf, size + strlen(buf)+1); + strcpy(&str_buf[size], buf); + + from++; + cnt++; + size += strlen(buf)+1; + } + + token = strtok(NULL, ","); + } while (token); + + *str_size = size; + *str_cnt = cnt; + return str_buf; +} + +char **parse(const char *args, int *count) +{ + char **ptrs = NULL; + char *str_buf, *cpy, *token, *next; + int cnt = 0, str_size = 0; + int i; + + /* make a copy that strtok can modify */ + cpy = strdup(args); + if (!cpy) + return NULL; + + if (args[0] == '[') { + cpy[0] = '\0'; + token = cpy; + next = strtok(cpy + 1, "]"); + } else { + token = strtok(cpy, "["); + next = strtok(NULL, "]"); + } + + if (!next) { + str_size = strlen(token) + 1; + str_buf = malloc(str_size); + if (!str_buf) + goto out_cpy; + + strcpy(str_buf, token); + cnt = 1; + } else { + str_buf = expand(cpy, next, &cnt, &str_size); + } + + ptrs = malloc((sizeof str_buf * (cnt + 1)) + str_size); + if (!ptrs) + goto out_str_buf; + + memcpy(&ptrs[cnt + 1], str_buf, str_size); + + ptrs[0] = (char*) &ptrs[cnt + 1]; + for (i = 1; i < cnt; i++) + ptrs[i] = index(ptrs[i - 1], 0) + 1; + ptrs[i] = NULL; + + if (count) + *count = cnt; + +out_str_buf: + free(str_buf); +out_cpy: + free(cpy); + return ptrs; +} diff --git a/infiniband-diags/CMakeLists.txt b/infiniband-diags/CMakeLists.txt new file mode 100644 index 0000000..1fd9ef2 --- /dev/null +++ b/infiniband-diags/CMakeLists.txt @@ -0,0 +1,49 @@ +publish_internal_headers("" + ibdiag_common.h + ibdiag_sa.h + ) + +install(FILES + etc/error_thresholds + etc/ibdiag.conf + DESTINATION "${IBDIAG_CONFIG_PATH}") + +add_library(ibdiags_tools STATIC + ibdiag_common.c + ibdiag_sa.c + ) + +function(ibdiag_programs) + foreach(I ${ARGN}) + rdma_sbin_executable(${I} "${I}.c") + target_link_libraries(${I} LINK_PRIVATE ${RT_LIBRARIES} ibumad ibmad ibdiags_tools ibnetdisc) + endforeach() +endfunction() + +ibdiag_programs( + dump_fts + ibaddr + ibcacheedit + ibccconfig + ibccquery + iblinkinfo + ibnetdiscover + ibping + ibportstate + ibqueryerrors + ibroute + ibstat + ibsysstat + ibtracert + perfquery + saquery + sminfo + smpdump + smpquery + vendstat + ) + +rdma_test_executable(ibsendtrap "ibsendtrap.c") +target_link_libraries(ibsendtrap LINK_PRIVATE ibumad ibmad ibdiags_tools) +rdma_test_executable(mcm_rereg_test "mcm_rereg_test.c") +target_link_libraries(mcm_rereg_test LINK_PRIVATE ibumad ibmad ibdiags_tools) diff --git a/infiniband-diags/dump_fts.c b/infiniband-diags/dump_fts.c new file mode 100644 index 0000000..14a9ae2 --- /dev/null +++ b/infiniband-diags/dump_fts.c @@ -0,0 +1,494 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * Copyright (c) 2009-2011 Mellanox Technologies LTD. All rights reserved. + * Copyright (c) 2013 Lawrence Livermore National Security. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <inttypes.h> +#include <netinet/in.h> +#include <assert.h> + +#include <infiniband/umad.h> +#include <infiniband/mad.h> +#include <util/node_name_map.h> + +#include <infiniband/ibnetdisc.h> + +#include "ibdiag_common.h" + +static struct ibmad_port *srcport; + +static unsigned startlid, endlid; + +static int brief, dump_all, multicast; + +static char *node_name_map_file = NULL; +static nn_map_t *node_name_map = NULL; + +#define IB_MLIDS_IN_BLOCK (IB_SMP_DATA_SIZE/2) + +static int dump_mlid(char *str, int strlen, unsigned mlid, unsigned nports, + __be16 mft[16][IB_MLIDS_IN_BLOCK]) +{ + uint16_t mask; + unsigned i, chunk, bit, nonzero = 0; + + if (brief) { + int n = 0; + unsigned chunks = ALIGN(nports + 1, 16) / 16; + for (i = 0; i < chunks; i++) { + mask = ntohs(mft[i][mlid % IB_MLIDS_IN_BLOCK]); + if (mask) + nonzero++; + n += snprintf(str + n, strlen - n, "%04hx", mask); + if (n >= strlen) { + n = strlen; + break; + } + } + if (!nonzero && !dump_all) { + str[0] = 0; + return 0; + } + return n; + } + for (i = 0; i <= nports; i++) { + chunk = i / 16; + bit = i % 16; + + mask = ntohs(mft[chunk][mlid % IB_MLIDS_IN_BLOCK]); + if (mask) + nonzero++; + str[i * 2] = (mask & (1 << bit)) ? 
'x' : ' '; + str[i * 2 + 1] = ' '; + } + if (!nonzero && !dump_all) { + str[0] = 0; + return 0; + } + str[i * 2] = 0; + return i * 2; +} + +static __be16 mft[16][IB_MLIDS_IN_BLOCK]; + +static void dump_multicast_tables(ibnd_node_t *node, unsigned startl, + unsigned endl, struct ibmad_port *mad_port) +{ + ib_portid_t *portid = &node->path_portid; + char nd[IB_SMP_DATA_SIZE] = { 0 }; + char str[512]; + char *s; + uint64_t nodeguid; + uint32_t mod; + unsigned block, i, j, e, nports, cap, chunks, startblock, lastblock, + top; + char *mapnd = NULL; + int n = 0; + + memcpy(nd, node->nodedesc, strlen(node->nodedesc)); + nports = node->numports; + nodeguid = node->guid; + + mad_decode_field(node->switchinfo, IB_SW_MCAST_FDB_CAP_F, &cap); + mad_decode_field(node->switchinfo, IB_SW_MCAST_FDB_TOP_F, &top); + + if (!endl || endl > IB_MIN_MCAST_LID + cap - 1) + endl = IB_MIN_MCAST_LID + cap - 1; + if (!dump_all && top && top < endl) { + if (top < IB_MIN_MCAST_LID - 1) + IBWARN("illegal top mlid %x", top); + else + endl = top; + } + + if (!startl) + startl = IB_MIN_MCAST_LID; + else if (startl < IB_MIN_MCAST_LID) { + IBWARN("illegal start mlid %x, set to %x", startl, + IB_MIN_MCAST_LID); + startl = IB_MIN_MCAST_LID; + } + + if (endl > IB_MAX_MCAST_LID) { + IBWARN("illegal end mlid %x, truncate to %x", endl, + IB_MAX_MCAST_LID); + endl = IB_MAX_MCAST_LID; + } + + mapnd = remap_node_name(node_name_map, nodeguid, nd); + + printf("Multicast mlids [0x%x-0x%x] of switch %s guid 0x%016" PRIx64 + " (%s):\n", startl, endl, portid2str(portid), nodeguid, + mapnd); + + if (brief) + printf(" MLid Port Mask\n"); + else { + if (nports > 9) { + for (i = 0, s = str; i <= nports; i++) { + *s++ = (i % 10) ? ' ' : '0' + i / 10; + *s++ = ' '; + } + *s = 0; + printf(" %s\n", str); + } + for (i = 0, s = str; i <= nports; i++) + s += sprintf(s, "%d ", i % 10); + printf(" Ports: %s\n", str); + printf(" MLid\n"); + } + if (ibverbose) + printf("Switch multicast mlid capability is %d top is 0x%x\n", + cap, top); + + chunks = ALIGN(nports + 1, 16) / 16; + + startblock = startl / IB_MLIDS_IN_BLOCK; + lastblock = endl / IB_MLIDS_IN_BLOCK; + for (block = startblock; block <= lastblock; block++) { + for (j = 0; j < chunks; j++) { + int status; + mod = (block - IB_MIN_MCAST_LID / IB_MLIDS_IN_BLOCK) + | (j << 28); + + DEBUG("reading block %x chunk %d mod %x", block, j, + mod); + if (!smp_query_status_via + (mft + j, portid, IB_ATTR_MULTICASTFORWTBL, mod, 0, + &status, mad_port)) { + fprintf(stderr, "SubnGet(MFT) failed on switch " + "'%s' %s Node GUID 0x%"PRIx64 + " SMA LID %d; MAD status 0x%x " + "AM 0x%x\n", + mapnd, portid2str(portid), + node->guid, node->smalid, + status, mod); + } + } + + i = block * IB_MLIDS_IN_BLOCK; + e = i + IB_MLIDS_IN_BLOCK; + if (i < startl) + i = startl; + if (e > endl + 1) + e = endl + 1; + + for (; i < e; i++) { + if (dump_mlid(str, sizeof str, i, nports, mft) == 0) + continue; + printf("0x%04x %s\n", i, str); + n++; + } + } + + printf("%d %smlids dumped \n", n, dump_all ? 
"" : "valid "); + + free(mapnd); +} + +static int dump_lid(char *str, int str_len, int lid, int valid, + ibnd_fabric_t *fabric, int *last_port_lid, + int *base_port_lid, uint64_t *portguid) +{ + char nd[IB_SMP_DATA_SIZE] = { 0 }; + + ibnd_port_t *port = NULL; + + char ntype[50], sguid[30]; + uint64_t nodeguid; + int baselid, lmc, type; + char *mapnd = NULL; + int rc; + + if (brief) { + str[0] = 0; + return 0; + } + + if (lid <= *last_port_lid) { + if (!valid) + return snprintf(str, str_len, + ": (path #%d - illegal port)", + lid - *base_port_lid); + else if (!*portguid) + return snprintf(str, str_len, + ": (path #%d out of %d)", + lid - *base_port_lid + 1, + *last_port_lid - *base_port_lid + 1); + else { + return snprintf(str, str_len, + ": (path #%d out of %d: portguid %s)", + lid - *base_port_lid + 1, + *last_port_lid - *base_port_lid + 1, + mad_dump_val(IB_NODE_PORT_GUID_F, sguid, + sizeof sguid, portguid)); + } + } + + if (!valid) + return snprintf(str, str_len, ": (illegal port)"); + + *portguid = 0; + + port = ibnd_find_port_lid(fabric, lid); + if (!port) { + return snprintf(str, str_len, ": (node info not available fabric scan)"); + } + + nodeguid = port->node->guid; + *portguid = port->guid; + type = port->node->type; + + baselid = port->base_lid; + lmc = port->lmc; + + memcpy(nd, port->node->nodedesc, strlen(port->node->nodedesc)); + + if (lmc > 0) { + *base_port_lid = baselid; + *last_port_lid = baselid + (1 << lmc) - 1; + } + + mapnd = remap_node_name(node_name_map, nodeguid, nd); + + rc = snprintf(str, str_len, ": (%s portguid %s: '%s')", + mad_dump_val(IB_NODE_TYPE_F, ntype, sizeof ntype, + &type), mad_dump_val(IB_NODE_PORT_GUID_F, + sguid, sizeof sguid, + portguid), + mapnd); + + free(mapnd); + return rc; +} + +static void dump_unicast_tables(ibnd_node_t *node, int startl, int endl, + struct ibmad_port *mad_port, + ibnd_fabric_t *fabric) +{ + ib_portid_t * portid = &node->path_portid; + char lft[IB_SMP_DATA_SIZE] = { 0 }; + char nd[IB_SMP_DATA_SIZE] = { 0 }; + char str[200]; + uint64_t nodeguid; + int block, i, e, top; + unsigned nports; + int n = 0, startblock, endblock; + char *mapnd = NULL; + int last_port_lid = 0, base_port_lid = 0; + uint64_t portguid = 0; + + mad_decode_field(node->switchinfo, IB_SW_LINEAR_FDB_TOP_F, &top); + nodeguid = node->guid; + nports = node->numports; + memcpy(nd, node->nodedesc, strlen(node->nodedesc)); + + if (!endl || endl > top) + endl = top; + + if (endl > IB_MAX_UCAST_LID) { + IBWARN("illegal lft top %d, truncate to %d", endl, + IB_MAX_UCAST_LID); + endl = IB_MAX_UCAST_LID; + } + + mapnd = remap_node_name(node_name_map, nodeguid, nd); + + printf("Unicast lids [0x%x-0x%x] of switch %s guid 0x%016" PRIx64 + " (%s):\n", startl, endl, portid2str(portid), nodeguid, + mapnd); + + DEBUG("Switch top is 0x%x\n", top); + + printf(" Lid Out Destination\n"); + printf(" Port Info \n"); + startblock = startl / IB_SMP_DATA_SIZE; + endblock = ALIGN(endl, IB_SMP_DATA_SIZE) / IB_SMP_DATA_SIZE; + for (block = startblock; block < endblock; block++) { + int status; + DEBUG("reading block %d", block); + if (!smp_query_status_via(lft, portid, IB_ATTR_LINEARFORWTBL, block, + 0, &status, mad_port)) { + fprintf(stderr, "SubnGet(LFT) failed on switch " + "'%s' %s Node GUID 0x%"PRIx64 + " SMA LID %d; MAD status 0x%x AM 0x%x\n", + mapnd, portid2str(portid), + node->guid, node->smalid, + status, block); + } + i = block * IB_SMP_DATA_SIZE; + e = i + IB_SMP_DATA_SIZE; + if (i < startl) + i = startl; + if (e > endl + 1) + e = endl + 1; + + for (; i < e; i++) { + unsigned 
outport = lft[i % IB_SMP_DATA_SIZE]; + unsigned valid = (outport <= nports); + + if (!valid && !dump_all) + continue; + dump_lid(str, sizeof str, i, valid, fabric, + &last_port_lid, &base_port_lid, &portguid); + printf("0x%04x %03u %s\n", i, outport & 0xff, str); + n++; + } + } + + printf("%d %slids dumped \n", n, dump_all ? "" : "valid "); + free(mapnd); +} + +static void dump_node(ibnd_node_t *node, struct ibmad_port *mad_port, + ibnd_fabric_t *fabric) +{ + if (multicast) + dump_multicast_tables(node, startlid, endlid, mad_port); + else + dump_unicast_tables(node, startlid, endlid, + mad_port, fabric); +} + +static void process_switch(ibnd_node_t *node, void *fabric) +{ + dump_node(node, srcport, (ibnd_fabric_t *)fabric); +} + +static int process_opt(void *context, int ch) +{ + switch (ch) { + case 'a': + dump_all++; + break; + case 'M': + multicast++; + break; + case 'n': + brief++; + break; + case 1: + node_name_map_file = strdup(optarg); + if (node_name_map_file == NULL) + IBEXIT("out of memory, strdup for node_name_map_file name failed"); + break; + default: + return -1; + } + return 0; +} + +int main(int argc, char **argv) +{ + int rc = 0; + int mgmt_classes[3] = + { IB_SMI_CLASS, IB_SMI_DIRECT_CLASS, IB_SA_CLASS }; + + struct ibnd_config config = { 0 }; + ibnd_fabric_t *fabric = NULL; + + const struct ibdiag_opt opts[] = { + {"all", 'a', 0, NULL, "show all lids, even invalid entries"}, + {"no_dests", 'n', 0, NULL, + "do not try to resolve destinations"}, + {"Multicast", 'M', 0, NULL, "show multicast forwarding tables"}, + {"node-name-map", 1, 1, "<file>", "node name map file"}, + {} + }; + char usage_args[] = "[<dest dr_path|lid|guid> [<startlid> [<endlid>]]]"; + const char *usage_examples[] = { + " -- Unicast examples:", + "-a\t# same, but dump all lids, even with invalid out ports", + "-n\t# simple dump format - no destination resolving", + "10\t# dump lids starting from 10", + "0x10 0x20\t# dump lid range", + " -- Multicast examples:", + "-M\t# dump all non empty mlids of switch with lid 4", + "-M 0xc010 0xc020\t# same, but with range", + "-M -n\t# simple dump format", + NULL, + }; + + ibdiag_process_opts(argc, argv, &config, "KGDLs", opts, process_opt, + usage_args, usage_examples); + + argc -= optind; + argv += optind; + + if (argc > 0) + startlid = strtoul(argv[0], NULL, 0); + if (argc > 1) + endlid = strtoul(argv[1], NULL, 0); + + node_name_map = open_node_name_map(node_name_map_file); + + if (ibd_timeout) + config.timeout_ms = ibd_timeout; + + config.flags = ibd_ibnetdisc_flags; + config.mkey = ibd_mkey; + + if ((fabric = ibnd_discover_fabric(ibd_ca, ibd_ca_port, NULL, + &config)) != NULL) { + + srcport = mad_rpc_open_port(ibd_ca, ibd_ca_port, mgmt_classes, 3); + if (!srcport) { + fprintf(stderr, + "Failed to open '%s' port '%d'\n", ibd_ca, ibd_ca_port); + rc = -1; + goto Exit; + } + smp_mkey_set(srcport, ibd_mkey); + + if (ibd_timeout) { + mad_rpc_set_timeout(srcport, ibd_timeout); + } + + ibnd_iter_nodes_type(fabric, process_switch, IB_NODE_SWITCH, fabric); + + mad_rpc_close_port(srcport); + + } else { + fprintf(stderr, "Failed to discover fabric\n"); + rc = -1; + } +Exit: + ibnd_destroy_fabric(fabric); + + close_node_name_map(node_name_map); + exit(rc); +} diff --git a/infiniband-diags/etc/error_thresholds b/infiniband-diags/etc/error_thresholds new file mode 100644 index 0000000..28cd295 --- /dev/null +++ b/infiniband-diags/etc/error_thresholds @@ -0,0 +1,16 @@ +# Define error thresholds here + +#SymbolErrorCounter=10 +#LinkErrorRecoveryCounter=10 +#LinkDownedCounter=10 
+#PortRcvErrors=10 +#PortRcvRemotePhysicalErrors=100 +#PortRcvSwitchRelayErrors=100 +#PortXmitDiscards=100 +#PortXmitConstraintErrors=100 +#PortRcvConstraintErrors=100 +#LocalLinkIntegrityErrors=10 +#ExcessiveBufferOverrunErrors=10 +#VL15Dropped=100 +#PortXmitWait=1000 + diff --git a/infiniband-diags/etc/ibdiag.conf b/infiniband-diags/etc/ibdiag.conf new file mode 100644 index 0000000..d09524a --- /dev/null +++ b/infiniband-diags/etc/ibdiag.conf @@ -0,0 +1,23 @@ +# Define different defaults for all infiniband-diag tools. These can be +# defined on the command line but this offers a more global config. + +# Defaults are to find the first port with Physical state == "LinkUp" +#CA=mlx4_0 + +# NOTE: that using a different Port may require an altered DR path. +# for example -D 0,1 will not work with port 2 +#Port=1 + +# define a different default timeout +#timeout=50 + +# disable query of Mellanox Extended PortInfo on ibnetdiscover subnet sweeps +# Default = true +#MLX_EPI=false + +# define a default m_key +#m_key=0x00 + +# default smkey to be used for SA requests +#sa_key=0x00 + diff --git a/infiniband-diags/ibaddr.c b/infiniband-diags/ibaddr.c new file mode 100644 index 0000000..19d2ec3 --- /dev/null +++ b/infiniband-diags/ibaddr.c @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * Copyright (c) 2011 Mellanox Technologies LTD. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <arpa/inet.h> + +#include <infiniband/umad.h> +#include <infiniband/mad.h> + +#include "ibdiag_common.h" + +static struct ibmad_port *srcport; + +static int ib_resolve_addr(ib_portid_t * portid, int portnum, int show_lid, + int show_gid) +{ + char gid_str[INET6_ADDRSTRLEN]; + uint8_t portinfo[IB_SMP_DATA_SIZE] = { 0 }; + uint8_t nodeinfo[IB_SMP_DATA_SIZE] = { 0 }; + uint64_t guid, prefix; + ibmad_gid_t gid; + int lmc; + + if (!smp_query_via(nodeinfo, portid, IB_ATTR_NODE_INFO, 0, 0, srcport)) + return -1; + + if (!smp_query_via(portinfo, portid, IB_ATTR_PORT_INFO, portnum, 0, + srcport)) + return -1; + + mad_decode_field(portinfo, IB_PORT_LID_F, &portid->lid); + mad_decode_field(portinfo, IB_PORT_GID_PREFIX_F, &prefix); + mad_decode_field(portinfo, IB_PORT_LMC_F, &lmc); + mad_decode_field(nodeinfo, IB_NODE_PORT_GUID_F, &guid); + + mad_encode_field(gid, IB_GID_PREFIX_F, &prefix); + mad_encode_field(gid, IB_GID_GUID_F, &guid); + + if (show_gid) { + printf("GID %s ", inet_ntop(AF_INET6, gid, gid_str, + sizeof gid_str)); + } + + if (show_lid > 0) + printf("LID start 0x%x end 0x%x", portid->lid, + portid->lid + (1 << lmc) - 1); + else if (show_lid < 0) + printf("LID start %u end %u", portid->lid, + portid->lid + (1 << lmc) - 1); + printf("\n"); + return 0; +} + +static int show_lid, show_gid; + +static int process_opt(void *context, int ch) +{ + switch (ch) { + case 'g': + show_gid = 1; + break; + case 'l': + show_lid++; + break; + case 'L': + show_lid = -100; + break; + default: + return -1; + } + return 0; +} + +int main(int argc, char **argv) +{ + int mgmt_classes[3] = + { IB_SMI_CLASS, IB_SMI_DIRECT_CLASS, IB_SA_CLASS }; + ib_portid_t portid = { 0 }; + int port = 0; + + const struct ibdiag_opt opts[] = { + {"gid_show", 'g', 0, NULL, "show gid address only"}, + {"lid_show", 'l', 0, NULL, "show lid range only"}, + {"Lid_show", 'L', 0, NULL, "show lid range (in decimal) only"}, + {} + }; + char usage_args[] = "[<lid|dr_path|guid>]"; + const char *usage_examples[] = { + "\t\t# local port's address", + "32\t\t# show lid range and gid of lid 32", + "-G 0x8f1040023\t# same but using guid address", + "-l 32\t\t# show lid range only", + "-L 32\t\t# show decimal lid range only", + "-g 32\t\t# show gid address only", + NULL + }; + + ibdiag_process_opts(argc, argv, NULL, "KL", opts, process_opt, + usage_args, usage_examples); + + argc -= optind; + argv += optind; + + if (argc > 1) + port = strtoul(argv[1], NULL, 0); + + if (!show_lid && !show_gid) + show_lid = show_gid = 1; + + srcport = mad_rpc_open_port(ibd_ca, ibd_ca_port, mgmt_classes, 3); + if (!srcport) + IBEXIT("Failed to open '%s' port '%d'", ibd_ca, ibd_ca_port); + + smp_mkey_set(srcport, ibd_mkey); + + if (argc) { + if (resolve_portid_str(ibd_ca, ibd_ca_port, &portid, argv[0], + ibd_dest_type, ibd_sm_id, srcport) < 0) + IBEXIT("can't resolve destination port %s", argv[0]); + } else { + if (resolve_self(ibd_ca, ibd_ca_port, &portid, &port, NULL) < 0) + IBEXIT("can't resolve self port %s", argv[0]); + } + + if (ib_resolve_addr(&portid, port, show_lid, show_gid) < 0) + IBEXIT("can't resolve requested address"); + + mad_rpc_close_port(srcport); + exit(0); +} diff --git a/infiniband-diags/ibcacheedit.c b/infiniband-diags/ibcacheedit.c new file mode 100644 index 0000000..4b8dbcb --- /dev/null +++ b/infiniband-diags/ibcacheedit.c @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2010 Lawrence Livermore National Lab. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <inttypes.h> + +#include <infiniband/mad.h> +#include <infiniband/ibnetdisc.h> + +#include "ibdiag_common.h" + +static uint64_t switchguid_before; +static uint64_t switchguid_after; +static int switchguid_flag; + +static uint64_t caguid_before; +static uint64_t caguid_after; +static int caguid_flag; + +static uint64_t sysimgguid_before; +static uint64_t sysimgguid_after; +static int sysimgguid_flag; + +static uint64_t portguid_nodeguid; +static uint64_t portguid_before; +static uint64_t portguid_after; +static int portguid_flag; + +struct guids { + uint64_t searchguid; + int searchguid_found; + uint64_t before; + uint64_t after; + int found; +}; + +static int parse_beforeafter(char *arg, uint64_t *before, uint64_t *after) +{ + char *ptr; + char *before_str; + char *after_str; + + ptr = strchr(optarg, ':'); + if (!ptr || !(*(ptr + 1))) { + fprintf(stderr, "invalid input '%s'\n", arg); + return -1; + } + (*ptr) = '\0'; + before_str = arg; + after_str = ptr + 1; + + (*before) = strtoull(before_str, NULL, 0); + (*after) = strtoull(after_str, NULL, 0); + return 0; +} + +static int parse_guidbeforeafter(char *arg, + uint64_t *guid, + uint64_t *before, + uint64_t *after) +{ + char *ptr1; + char *ptr2; + char *guid_str; + char *before_str; + char *after_str; + + ptr1 = strchr(optarg, ':'); + if (!ptr1 || !(*(ptr1 + 1))) { + fprintf(stderr, "invalid input '%s'\n", arg); + return -1; + } + guid_str = arg; + before_str = ptr1 + 1; + + ptr2 = strchr(before_str, ':'); + if (!ptr2 || !(*(ptr2 + 1))) { + fprintf(stderr, "invalid input '%s'\n", arg); + return -1; + } + (*ptr1) = '\0'; + (*ptr2) = '\0'; + after_str = ptr2 + 1; + + (*guid) = strtoull(guid_str, NULL, 0); + (*before) = strtoull(before_str, NULL, 0); + (*after) = strtoull(after_str, NULL, 0); + return 0; +} + +static int process_opt(void *context, int ch) +{ + switch (ch) { + case 1: + if (parse_beforeafter(optarg, + &switchguid_before, + &switchguid_after) < 0) + return -1; + switchguid_flag++; + break; + case 2: + if (parse_beforeafter(optarg, + &caguid_before, + &caguid_after) < 0) + return -1; + caguid_flag++; + break; + case 3: + if 
(parse_beforeafter(optarg, + &sysimgguid_before, + &sysimgguid_after) < 0) + return -1; + sysimgguid_flag++; + break; + case 4: + if (parse_guidbeforeafter(optarg, + &portguid_nodeguid, + &portguid_before, + &portguid_after) < 0) + return -1; + portguid_flag++; + break; + default: + return -1; + } + + return 0; +} + +static void update_switchportguids(ibnd_node_t *node) +{ + ibnd_port_t *port; + int p; + + for (p = 0; p <= node->numports; p++) { + port = node->ports[p]; + if (port) + port->guid = node->guid; + } +} + +static void replace_node_guid(ibnd_node_t *node, void *user_data) +{ + struct guids *guids; + + guids = (struct guids *)user_data; + + if (node->guid == guids->before) { + + node->guid = guids->after; + + /* port guids are identical to switch guids on + * switches, so update port guids too + */ + if (node->type == IB_NODE_SWITCH) + update_switchportguids(node); + + guids->found++; + } +} + +static void replace_sysimgguid(ibnd_node_t *node, void *user_data) +{ + struct guids *guids; + uint64_t sysimgguid; + + guids = (struct guids *)user_data; + + sysimgguid = mad_get_field64(node->info, 0, IB_NODE_SYSTEM_GUID_F); + if (sysimgguid == guids->before) { + mad_set_field64(node->info, 0, IB_NODE_SYSTEM_GUID_F, + guids->after); + guids->found++; + } +} + +static void replace_portguid(ibnd_node_t *node, void *user_data) +{ + struct guids *guids; + + guids = (struct guids *)user_data; + + if (node->guid != guids->searchguid) + return; + + guids->searchguid_found++; + + if (node->type == IB_NODE_SWITCH) { + /* port guids are identical to switch guids on + * switches, so update switch guid too + */ + if (node->guid == guids->before) { + node->guid = guids->after; + update_switchportguids(node); + guids->found++; + } + } + else { + ibnd_port_t *port; + int p; + + for (p = 1; p <= node->numports; p++) { + port = node->ports[p]; + if (port + && port->guid == guids->before) { + port->guid = guids->after; + guids->found++; + break; + } + } + } +} + +int main(int argc, char **argv) +{ + ibnd_fabric_t *fabric = NULL; + char *orig_cache_file = NULL; + char *new_cache_file = NULL; + struct guids guids; + + const struct ibdiag_opt opts[] = { + {"switchguid", 1, 1, "BEFOREGUID:AFTERGUID", + "Specify before and after switchguid to edit"}, + {"caguid", 2, 1, "BEFOREGUID:AFTERGUID", + "Specify before and after caguid to edit"}, + {"sysimgguid", 3, 1, "BEFOREGUID:AFTERGUID", + "Specify before and after sysimgguid to edit"}, + {"portguid", 4, 1, "NODEGUID:BEFOREGUID:AFTERGUID", + "Specify before and after port guid to edit"}, + {} + }; + const char *usage_args = "<orig.cache> <new.cache>"; + + ibdiag_process_opts(argc, argv, NULL, "CDdeGKLPstvy", + opts, process_opt, usage_args, + NULL); + + argc -= optind; + argv += optind; + + orig_cache_file = argv[0]; + new_cache_file = argv[1]; + + if (!orig_cache_file) + IBEXIT("original cache file not specified"); + + if (!new_cache_file) + IBEXIT("new cache file not specified"); + + if ((fabric = ibnd_load_fabric(orig_cache_file, 0)) == NULL) + IBEXIT("loading original cached fabric failed"); + + if (switchguid_flag) { + guids.before = switchguid_before; + guids.after = switchguid_after; + guids.found = 0; + ibnd_iter_nodes_type(fabric, + replace_node_guid, + IB_NODE_SWITCH, + &guids); + + if (!guids.found) + IBEXIT("switchguid = %" PRIx64 " not found", + switchguid_before); + } + + if (caguid_flag) { + guids.before = caguid_before; + guids.after = caguid_after; + guids.found = 0; + ibnd_iter_nodes_type(fabric, + replace_node_guid, + IB_NODE_CA, + &guids); + + 
if (!guids.found) + IBEXIT("caguid = %" PRIx64 " not found", + caguid_before); + } + + if (sysimgguid_flag) { + guids.before = sysimgguid_before; + guids.after = sysimgguid_after; + guids.found = 0; + ibnd_iter_nodes(fabric, + replace_sysimgguid, + &guids); + + if (!guids.found) + IBEXIT("sysimgguid = %" PRIx64 " not found", + sysimgguid_before); + } + + if (portguid_flag) { + guids.searchguid = portguid_nodeguid; + guids.searchguid_found = 0; + guids.before = portguid_before; + guids.after = portguid_after; + guids.found = 0; + ibnd_iter_nodes(fabric, + replace_portguid, + &guids); + + if (!guids.searchguid_found) + IBEXIT("nodeguid = %" PRIx64 " not found", + portguid_nodeguid); + + if (!guids.found) + IBEXIT("portguid = %" PRIx64 " not found", + portguid_before); + } + + if (ibnd_cache_fabric(fabric, new_cache_file, 0) < 0) + IBEXIT("caching new cache data failed"); + + ibnd_destroy_fabric(fabric); + exit(0); +} diff --git a/infiniband-diags/ibccconfig.c b/infiniband-diags/ibccconfig.c new file mode 100644 index 0000000..7a02ee7 --- /dev/null +++ b/infiniband-diags/ibccconfig.c @@ -0,0 +1,620 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * Copyright (c) 2011 Mellanox Technologies LTD. All rights reserved. + * Copyright (c) 2011 Lawrence Livermore National Lab. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> +#include <netinet/in.h> +#include <limits.h> +#include <ctype.h> + +#define __STDC_FORMAT_MACROS +#include <inttypes.h> + +#include <infiniband/mad.h> + +#include "ibdiag_common.h" + +static struct ibmad_port *srcport; + +static op_fn_t congestion_key_info; +static op_fn_t switch_congestion_setting; +static op_fn_t switch_port_congestion_setting; +static op_fn_t ca_congestion_setting; +static op_fn_t congestion_control_table; + +static const match_rec_t match_tbl[] = { + {"CongestionKeyInfo", "CK", congestion_key_info, 0, + "<cckey> <cckeyprotectbit> <cckeyleaseperiod> <cckeyviolations>"}, + {"SwitchCongestionSetting", "SS", switch_congestion_setting, 0, + "<controlmap> <victimmask> <creditmask> <threshold> <packetsize> " + "<csthreshold> <csreturndelay> <markingrate>"}, + {"SwitchPortCongestionSetting", "SP", switch_port_congestion_setting, 1, + "<valid> <control_type> <threshold> <packet_size> <cong_parm_marking_rate>"}, + {"CACongestionSetting", "CS", ca_congestion_setting, 0, + "<port_control> <control_map> <ccti_timer> <ccti_increase> " + "<trigger_threshold> <ccti_min>"}, + {"CongestionControlTable", "CT", congestion_control_table, 0, + "<cctilimit> <index> <cctentry> <cctentry> ..."}, + {} +}; + +static uint64_t cckey; + +/*******************************************/ +static const char *parselonglongint(char *arg, uint64_t *val) +{ + char *endptr = NULL; + + errno = 0; + *val = strtoull(arg, &endptr, 0); + if ((endptr && *endptr != '\0') + || errno != 0) { + if (errno == ERANGE) + return "value out of range"; + return "invalid integer input"; + } + + return NULL; +} + +static const char *parseint(char *arg, uint32_t *val, int hexonly) +{ + char *endptr = NULL; + + errno = 0; + *val = strtoul(arg, &endptr, hexonly ? 16 : 0); + if ((endptr && *endptr != '\0') + || errno != 0) { + if (errno == ERANGE) + return "value out of range"; + return "invalid integer input"; + } + + return NULL; +} + +static const char *congestion_key_info(ib_portid_t *dest, char **argv, int argc) +{ + uint8_t rcv[IB_CC_DATA_SZ] = { 0 }; + uint8_t payload[IB_CC_DATA_SZ] = { 0 }; + uint64_t cc_key; + uint32_t cc_keyprotectbit; + uint32_t cc_keyleaseperiod; + uint32_t cc_keyviolations; + const char *errstr; + + if (argc != 4) + return "invalid number of parameters for CongestionKeyInfo"; + + if ((errstr = parselonglongint(argv[0], &cc_key))) + return errstr; + if ((errstr = parseint(argv[1], &cc_keyprotectbit, 0))) + return errstr; + if ((errstr = parseint(argv[2], &cc_keyleaseperiod, 0))) + return errstr; + if ((errstr = parseint(argv[3], &cc_keyviolations, 0))) + return errstr; + + if (cc_keyprotectbit != 0 && cc_keyprotectbit != 1) + return "invalid cc_keyprotectbit value"; + + if (cc_keyleaseperiod > USHRT_MAX) + return "invalid cc_keyleaseperiod value"; + + if (cc_keyviolations > USHRT_MAX) + return "invalid cc_keyviolations value"; + + mad_set_field64(payload, + 0, + IB_CC_CONGESTION_KEY_INFO_CC_KEY_F, + cc_key); + + mad_encode_field(payload, + IB_CC_CONGESTION_KEY_INFO_CC_KEY_PROTECT_BIT_F, + &cc_keyprotectbit); + + mad_encode_field(payload, + IB_CC_CONGESTION_KEY_INFO_CC_KEY_LEASE_PERIOD_F, + &cc_keyleaseperiod); + + /* spec says "setting the counter to a value other than zero results + * in the counter being left unchanged. 
So if user wants no change, + * they gotta input non-zero + */ + mad_encode_field(payload, + IB_CC_CONGESTION_KEY_INFO_CC_KEY_VIOLATIONS_F, + &cc_keyviolations); + + if (!cc_config_status_via(payload, rcv, dest, IB_CC_ATTR_CONGESTION_KEY_INFO, + 0, 0, NULL, srcport, cckey)) + return "congestion key info config failed"; + + return NULL; +} + + +/* parse like it's a hypothetical 256 bit hex code */ +static const char *parse256(char *arg, uint8_t *buf) +{ + int numdigits = 0; + int startindex; + char *ptr; + int i; + + if (!strncmp(arg, "0x", 2) || !strncmp(arg, "0X", 2)) + arg += 2; + + for (ptr = arg; *ptr; ptr++) { + if (!isxdigit(*ptr)) + return "invalid hex digit read"; + numdigits++; + } + + if (numdigits > 64) + return "hex code too long"; + + /* we need to imagine that this is like a 256-bit int stored + * in big endian. So we need to find the first index + * point where the user's input would start in our array. + */ + startindex = 32 - ((numdigits - 1) / 2) - 1; + + for (i = startindex; i <= 31; i++) { + char tmp[3] = { 0 }; + uint32_t tmpint; + const char *errstr; + + /* I can't help but think there is a strtoX that + * will do this for me, but I can't find it. + */ + if (i == startindex && numdigits % 2) { + memcpy(tmp, arg, 1); + arg++; + } + else { + memcpy(tmp, arg, 2); + arg += 2; + } + + if ((errstr = parseint(tmp, &tmpint, 1))) + return errstr; + buf[i] = tmpint; + } + + return NULL; +} + +static const char *parsecct(char *arg, uint32_t *shift, uint32_t *multiplier) +{ + char buf[1024] = { 0 }; + const char *errstr; + char *ptr; + + strcpy(buf, arg); + + if (!(ptr = strchr(buf, ':'))) + return "ccts are formatted shift:multiplier"; + + *ptr = '\0'; + ptr++; + + if ((errstr = parseint(buf, shift, 0))) + return errstr; + + if ((errstr = parseint(ptr, multiplier, 0))) + return errstr; + + return NULL; +} + +static const char *switch_congestion_setting(ib_portid_t *dest, char **argv, + int argc) +{ + uint8_t rcv[IB_CC_DATA_SZ] = { 0 }; + uint8_t payload[IB_CC_DATA_SZ] = { 0 }; + uint32_t control_map; + uint8_t victim_mask[32] = { 0 }; + uint8_t credit_mask[32] = { 0 }; + uint32_t threshold; + uint32_t packet_size; + uint32_t cs_threshold; + uint32_t cs_returndelay_s; + uint32_t cs_returndelay_m; + uint32_t cs_returndelay; + uint32_t marking_rate; + const char *errstr; + + if (argc != 8) + return "invalid number of parameters for SwitchCongestionSetting"; + + if ((errstr = parseint(argv[0], &control_map, 0))) + return errstr; + + if ((errstr = parse256(argv[1], victim_mask))) + return errstr; + + if ((errstr = parse256(argv[2], credit_mask))) + return errstr; + + if ((errstr = parseint(argv[3], &threshold, 0))) + return errstr; + + if ((errstr = parseint(argv[4], &packet_size, 0))) + return errstr; + + if ((errstr = parseint(argv[5], &cs_threshold, 0))) + return errstr; + + if ((errstr = parsecct(argv[6], &cs_returndelay_s, &cs_returndelay_m))) + return errstr; + + cs_returndelay = cs_returndelay_m; + cs_returndelay |= (cs_returndelay_s << 14); + + if ((errstr = parseint(argv[7], &marking_rate, 0))) + return errstr; + + mad_encode_field(payload, + IB_CC_SWITCH_CONGESTION_SETTING_CONTROL_MAP_F, + &control_map); + + mad_set_array(payload, + 0, + IB_CC_SWITCH_CONGESTION_SETTING_VICTIM_MASK_F, + victim_mask); + + mad_set_array(payload, + 0, + IB_CC_SWITCH_CONGESTION_SETTING_CREDIT_MASK_F, + credit_mask); + + mad_encode_field(payload, + IB_CC_SWITCH_CONGESTION_SETTING_THRESHOLD_F, + &threshold); + + mad_encode_field(payload, + IB_CC_SWITCH_CONGESTION_SETTING_PACKET_SIZE_F, + 
&packet_size); + + mad_encode_field(payload, + IB_CC_SWITCH_CONGESTION_SETTING_CS_THRESHOLD_F, + &cs_threshold); + + mad_encode_field(payload, + IB_CC_SWITCH_CONGESTION_SETTING_CS_RETURN_DELAY_F, + &cs_returndelay); + + mad_encode_field(payload, + IB_CC_SWITCH_CONGESTION_SETTING_MARKING_RATE_F, + &marking_rate); + + if (!cc_config_status_via(payload, rcv, dest, IB_CC_ATTR_SWITCH_CONGESTION_SETTING, + 0, 0, NULL, srcport, cckey)) + return "switch congestion setting config failed"; + + return NULL; +} + +static const char *switch_port_congestion_setting(ib_portid_t *dest, + char **argv, int argc) +{ + uint8_t rcv[IB_CC_DATA_SZ] = { 0 }; + uint8_t payload[IB_CC_DATA_SZ] = { 0 }; + uint8_t data[IB_CC_DATA_SZ] = { 0 }; + uint32_t portnum; + uint32_t valid; + uint32_t control_type; + uint32_t threshold; + uint32_t packet_size; + uint32_t cong_parm_marking_rate; + uint32_t type; + uint32_t numports; + uint8_t *ptr; + const char *errstr; + + if (argc != 6) + return "invalid number of parameters for SwitchPortCongestion"; + + if ((errstr = parseint(argv[0], &portnum, 0))) + return errstr; + + if ((errstr = parseint(argv[1], &valid, 0))) + return errstr; + + if ((errstr = parseint(argv[2], &control_type, 0))) + return errstr; + + if ((errstr = parseint(argv[3], &threshold, 0))) + return errstr; + + if ((errstr = parseint(argv[4], &packet_size, 0))) + return errstr; + + if ((errstr = parseint(argv[5], &cong_parm_marking_rate, 0))) + return errstr; + + /* Figure out number of ports first */ + if (!smp_query_via(data, dest, IB_ATTR_NODE_INFO, 0, 0, srcport)) + return "node info config failed"; + + mad_decode_field((uint8_t *)data, IB_NODE_TYPE_F, &type); + mad_decode_field((uint8_t *)data, IB_NODE_NPORTS_F, &numports); + + if (type != IB_NODE_SWITCH) + return "destination not a switch"; + + if (portnum > numports) + return "invalid port number specified"; + + /* We are modifying only 1 port, so get the current config */ + if (!cc_query_status_via(payload, dest, IB_CC_ATTR_SWITCH_PORT_CONGESTION_SETTING, + portnum / 32, 0, NULL, srcport, cckey)) + return "switch port congestion setting query failed"; + + ptr = payload + (((portnum % 32) * 4)); + + mad_encode_field(ptr, + IB_CC_SWITCH_PORT_CONGESTION_SETTING_ELEMENT_VALID_F, + &valid); + + mad_encode_field(ptr, + IB_CC_SWITCH_PORT_CONGESTION_SETTING_ELEMENT_CONTROL_TYPE_F, + &control_type); + + mad_encode_field(ptr, + IB_CC_SWITCH_PORT_CONGESTION_SETTING_ELEMENT_THRESHOLD_F, + &threshold); + + mad_encode_field(ptr, + IB_CC_SWITCH_PORT_CONGESTION_SETTING_ELEMENT_PACKET_SIZE_F, + &packet_size); + + mad_encode_field(ptr, + IB_CC_SWITCH_PORT_CONGESTION_SETTING_ELEMENT_CONG_PARM_MARKING_RATE_F, + &cong_parm_marking_rate); + + if (!cc_config_status_via(payload, rcv, dest, IB_CC_ATTR_SWITCH_PORT_CONGESTION_SETTING, + portnum / 48, 0, NULL, srcport, cckey)) + return "switch port congestion setting config failed"; + + return NULL; +} + +static const char *ca_congestion_setting(ib_portid_t *dest, char **argv, + int argc) +{ + uint8_t rcv[IB_CC_DATA_SZ] = { 0 }; + uint8_t payload[IB_CC_DATA_SZ] = { 0 }; + uint32_t port_control; + uint32_t control_map; + uint32_t ccti_timer; + uint32_t ccti_increase; + uint32_t trigger_threshold; + uint32_t ccti_min; + const char *errstr; + int i; + + if (argc != 6) + return "invalid number of parameters for CACongestionSetting"; + + if ((errstr = parseint(argv[0], &port_control, 0))) + return errstr; + + if ((errstr = parseint(argv[1], &control_map, 0))) + return errstr; + + if ((errstr = parseint(argv[2], &ccti_timer, 0))) + 
return errstr; + + if ((errstr = parseint(argv[3], &ccti_increase, 0))) + return errstr; + + if ((errstr = parseint(argv[4], &trigger_threshold, 0))) + return errstr; + + if ((errstr = parseint(argv[5], &ccti_min, 0))) + return errstr; + + mad_encode_field(payload, + IB_CC_CA_CONGESTION_SETTING_PORT_CONTROL_F, + &port_control); + + mad_encode_field(payload, + IB_CC_CA_CONGESTION_SETTING_CONTROL_MAP_F, + &control_map); + + for (i = 0; i < 16; i++) { + uint8_t *ptr; + + if (!(control_map & (0x1 << i))) + continue; + + ptr = payload + 2 + 2 + i * 8; + + mad_encode_field(ptr, + IB_CC_CA_CONGESTION_ENTRY_CCTI_TIMER_F, + &ccti_timer); + + mad_encode_field(ptr, + IB_CC_CA_CONGESTION_ENTRY_CCTI_INCREASE_F, + &ccti_increase); + + mad_encode_field(ptr, + IB_CC_CA_CONGESTION_ENTRY_TRIGGER_THRESHOLD_F, + &trigger_threshold); + + mad_encode_field(ptr, + IB_CC_CA_CONGESTION_ENTRY_CCTI_MIN_F, + &ccti_min); + } + + if (!cc_config_status_via(payload, rcv, dest, IB_CC_ATTR_CA_CONGESTION_SETTING, + 0, 0, NULL, srcport, cckey)) + return "ca congestion setting config failed"; + + return NULL; +} + +static const char *congestion_control_table(ib_portid_t *dest, char **argv, + int argc) +{ + uint8_t rcv[IB_CC_DATA_SZ] = { 0 }; + uint8_t payload[IB_CC_DATA_SZ] = { 0 }; + uint32_t ccti_limit; + uint32_t index; + uint32_t cctshifts[64]; + uint32_t cctmults[64]; + const char *errstr; + int i; + + if (argc < 2 || argc > 66) + return "invalid number of parameters for CongestionControlTable"; + + if ((errstr = parseint(argv[0], &ccti_limit, 0))) + return errstr; + + if ((errstr = parseint(argv[1], &index, 0))) + return errstr; + + if (ccti_limit && (ccti_limit + 1) != (index * 64 + (argc - 2))) + return "invalid number of cct entries input given ccti_limit and index"; + + for (i = 0; i < (argc - 2); i++) { + if ((errstr = parsecct(argv[i + 2], &cctshifts[i], &cctmults[i]))) + return errstr; + } + + mad_encode_field(payload, + IB_CC_CONGESTION_CONTROL_TABLE_CCTI_LIMIT_F, + &ccti_limit); + + for (i = 0; i < (argc - 2); i++) { + mad_encode_field(payload + 4 + i * 2, + IB_CC_CONGESTION_CONTROL_TABLE_ENTRY_CCT_SHIFT_F, + &cctshifts[i]); + + mad_encode_field(payload + 4 + i * 2, + IB_CC_CONGESTION_CONTROL_TABLE_ENTRY_CCT_MULTIPLIER_F, + &cctmults[i]); + } + + if (!cc_config_status_via(payload, rcv, dest, IB_CC_ATTR_CONGESTION_CONTROL_TABLE, + index, 0, NULL, srcport, cckey)) + return "congestion control table config failed"; + + return NULL; +} + +static int process_opt(void *context, int ch) +{ + switch (ch) { + case 'c': + cckey = (uint64_t) strtoull(optarg, NULL, 0); + break; + default: + return -1; + } + return 0; +} + +int main(int argc, char **argv) +{ + char usage_args[1024]; + int mgmt_classes[3] = { IB_SMI_CLASS, IB_SA_CLASS, IB_CC_CLASS }; + ib_portid_t portid = { 0 }; + const char *err; + op_fn_t *fn; + const match_rec_t *r; + int n; + + const struct ibdiag_opt opts[] = { + {"cckey", 'c', 1, "<key>", "CC key"}, + {} + }; + const char *usage_examples[] = { + "SwitchCongestionSetting 2 0x1F 0x1FFFFFFFFF 0x0 0xF 8 0 0:0 1\t# Configure Switch Congestion Settings", + "CACongestionSetting 1 0 0x3 150 1 0 0\t\t# Configure CA Congestion Settings to SL 0 and SL 1", + "CACongestionSetting 1 0 0x4 200 1 0 0\t\t# Configure CA Congestion Settings to SL 2", + "CongestionControlTable 1 63 0 0:0 0:1 ...\t# Configure first block of Congestion Control Table", + "CongestionControlTable 1 127 0 0:64 0:65 ...\t# Configure second block of Congestion Control Table", + NULL + }; + + n = sprintf(usage_args, "[-c key] <op> <lid|guid>\n" + 
"\nWARNING -- You should understand what you are " + "doing before using this tool. Misuse of this " + "tool could result in a broken fabric.\n" + "\nSupported ops (and aliases, case insensitive):\n"); + for (r = match_tbl; r->name; r++) { + n += snprintf(usage_args + n, sizeof(usage_args) - n, + " %s (%s) <lid|guid>%s%s%s\n", r->name, + r->alias ? r->alias : "", + r->opt_portnum ? " <portnum>" : "", + r->ops_extra ? " " : "", + r->ops_extra ? r->ops_extra : ""); + if (n >= sizeof(usage_args)) + exit(-1); + } + + ibdiag_process_opts(argc, argv, NULL, "DK", opts, process_opt, + usage_args, usage_examples); + + argc -= optind; + argv += optind; + + if (argc < 2) + ibdiag_show_usage(); + + if (!(fn = match_op(match_tbl, argv[0]))) + IBEXIT("operation '%s' not supported", argv[0]); + + srcport = mad_rpc_open_port(ibd_ca, ibd_ca_port, mgmt_classes, 3); + if (!srcport) + IBEXIT("Failed to open '%s' port '%d'", ibd_ca, ibd_ca_port); + + smp_mkey_set(srcport, ibd_mkey); + + if (resolve_portid_str(ibd_ca, ibd_ca_port, &portid, argv[1], + ibd_dest_type, ibd_sm_id, srcport) < 0) + IBEXIT("can't resolve destination %s", argv[1]); + if ((err = fn(&portid, argv + 2, argc - 2))) + IBEXIT("operation %s: %s", argv[0], err); + + mad_rpc_close_port(srcport); + exit(0); +} diff --git a/infiniband-diags/ibccquery.c b/infiniband-diags/ibccquery.c new file mode 100644 index 0000000..e9e8b49 --- /dev/null +++ b/infiniband-diags/ibccquery.c @@ -0,0 +1,408 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * Copyright (c) 2011 Mellanox Technologies LTD. All rights reserved. + * Copyright (c) 2011 Lawrence Livermore National Lab. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <netinet/in.h> + +#define __STDC_FORMAT_MACROS +#include <inttypes.h> + +#include <infiniband/mad.h> + +#include "ibdiag_common.h" + +static struct ibmad_port *srcport; + +static op_fn_t class_port_info; +static op_fn_t congestion_info; +static op_fn_t congestion_key_info; +static op_fn_t congestion_log; +static op_fn_t switch_congestion_setting; +static op_fn_t switch_port_congestion_setting; +static op_fn_t ca_congestion_setting; +static op_fn_t congestion_control_table; +static op_fn_t timestamp_dump; + +static const match_rec_t match_tbl[] = { + {"ClassPortInfo", "CP", class_port_info, 0, ""}, + {"CongestionInfo", "CI", congestion_info, 0, ""}, + {"CongestionKeyInfo", "CK", congestion_key_info, 0, ""}, + {"CongestionLog", "CL", congestion_log, 0, ""}, + {"SwitchCongestionSetting", "SS", switch_congestion_setting, 0, ""}, + {"SwitchPortCongestionSetting", "SP", switch_port_congestion_setting, 1, ""}, + {"CACongestionSetting", "CS", ca_congestion_setting, 0, ""}, + {"CongestionControlTable", "CT", congestion_control_table, 0, ""}, + {"Timestamp", "TI", timestamp_dump, 0, ""}, + {} +}; + +static uint64_t cckey = 0; + +/*******************************************/ +static const char *class_port_info(ib_portid_t *dest, char **argv, int argc) +{ + char buf[2048]; + char data[IB_CC_DATA_SZ] = { 0 }; + + if (!cc_query_status_via(data, dest, CLASS_PORT_INFO, + 0, 0, NULL, srcport, cckey)) + return "class port info query failed"; + + mad_dump_classportinfo(buf, sizeof buf, data, sizeof data); + + printf("# ClassPortInfo: %s\n%s", portid2str(dest), buf); + return NULL; +} + +static const char *congestion_info(ib_portid_t *dest, char **argv, int argc) +{ + char buf[2048]; + char data[IB_CC_DATA_SZ] = { 0 }; + + if (!cc_query_status_via(data, dest, IB_CC_ATTR_CONGESTION_INFO, + 0, 0, NULL, srcport, cckey)) + return "congestion info query failed"; + + mad_dump_cc_congestioninfo(buf, sizeof buf, data, sizeof data); + + printf("# CongestionInfo: %s\n%s", portid2str(dest), buf); + return NULL; +} + +static const char *congestion_key_info(ib_portid_t *dest, char **argv, int argc) +{ + char buf[2048]; + char data[IB_CC_DATA_SZ] = { 0 }; + + if (!cc_query_status_via(data, dest, IB_CC_ATTR_CONGESTION_KEY_INFO, + 0, 0, NULL, srcport, cckey)) + return "congestion key info query failed"; + + mad_dump_cc_congestionkeyinfo(buf, sizeof buf, data, sizeof data); + + printf("# CongestionKeyInfo: %s\n%s", portid2str(dest), buf); + return NULL; +} + +static const char *congestion_log(ib_portid_t *dest, char **argv, int argc) +{ + char buf[2048]; + char data[IB_CC_LOG_DATA_SZ] = { 0 }; + char emptybuf[16] = { 0 }; + int i, type; + + if (!cc_query_status_via(data, dest, IB_CC_ATTR_CONGESTION_LOG, + 0, 0, NULL, srcport, cckey)) + return "congestion log query failed"; + + mad_decode_field((uint8_t *)data, IB_CC_CONGESTION_LOG_LOGTYPE_F, &type); + + if (type != 1 && type != 2) + return "unrecognized log type"; + + mad_dump_cc_congestionlog(buf, sizeof buf, data, sizeof data); + + printf("# CongestionLog: %s\n%s", portid2str(dest), buf); + + if (type == 1) { + mad_dump_cc_congestionlogswitch(buf, sizeof buf, data, sizeof data); + printf("%s\n", buf); + for (i = 0; i < 15; i++) { + /* output only if entry not 0 */ + if (memcmp(data + 40 + i * 12, emptybuf, 12)) { + mad_dump_cc_congestionlogentryswitch(buf, sizeof buf, + data + 40 + i * 12, + 12); + printf("%s\n", buf); + } + } + } + else { + /* XXX: Q3/2010 errata lists 
first entry offset at 80, but we assume + * will be updated to 96 once CurrentTimeStamp field is word aligned. + * In addition, assume max 13 log events instead of 16. Due to + * errata changes increasing size of CA log event, 16 log events is + * no longer possible to fit in max MAD size. + */ + mad_dump_cc_congestionlogca(buf, sizeof buf, data, sizeof data); + printf("%s\n", buf); + for (i = 0; i < 13; i++) { + /* output only if entry not 0 */ + if (memcmp(data + 12 + i * 16, emptybuf, 16)) { + mad_dump_cc_congestionlogentryca(buf, sizeof buf, + data + 12 + i * 16, + 16); + printf("%s\n", buf); + } + } + } + + return NULL; +} + +static const char *switch_congestion_setting(ib_portid_t *dest, char **argv, + int argc) +{ + char buf[2048]; + char data[IB_CC_DATA_SZ] = { 0 }; + + if (!cc_query_status_via(data, dest, IB_CC_ATTR_SWITCH_CONGESTION_SETTING, + 0, 0, NULL, srcport, cckey)) + return "switch congestion setting query failed"; + + mad_dump_cc_switchcongestionsetting(buf, sizeof buf, data, sizeof data); + + printf("# SwitchCongestionSetting: %s\n%s", portid2str(dest), buf); + return NULL; +} + +static const char *switch_port_congestion_setting(ib_portid_t *dest, + char **argv, int argc) +{ + char buf[2048]; + char data[IB_CC_DATA_SZ] = { 0 }; + int type, numports, maxblocks, i, j; + int portnum = 0; + int outputcount = 0; + + if (argc > 0) + portnum = strtol(argv[0], NULL, 0); + + /* Figure out number of ports first */ + if (!smp_query_via(data, dest, IB_ATTR_NODE_INFO, 0, 0, srcport)) + return "node info query failed"; + + mad_decode_field((uint8_t *)data, IB_NODE_TYPE_F, &type); + mad_decode_field((uint8_t *)data, IB_NODE_NPORTS_F, &numports); + + if (type != IB_NODE_SWITCH) + return "destination not a switch"; + + printf("# SwitchPortCongestionSetting: %s\n", portid2str(dest)); + + if (portnum) { + if (portnum > numports) + return "invalid port number specified"; + + memset(data, '\0', sizeof data); + if (!cc_query_status_via(data, dest, IB_CC_ATTR_SWITCH_PORT_CONGESTION_SETTING, + portnum / 48, 0, NULL, srcport, cckey)) + return "switch port congestion setting query failed"; + + mad_dump_cc_switchportcongestionsettingelement(buf, sizeof buf, + data + ((portnum % 48) * 4), + 4); + printf("%s", buf); + return NULL; + } + + /* else get all port info */ + + maxblocks = numports / 48 + 1; + + for (i = 0; i < maxblocks; i++) { + memset(data, '\0', sizeof data); + if (!cc_query_status_via(data, dest, IB_CC_ATTR_SWITCH_PORT_CONGESTION_SETTING, + i, 0, NULL, srcport, cckey)) + return "switch port congestion setting query failed"; + + for (j = 0; j < 48 && outputcount <= numports; j++) { + printf("Port:............................%u\n", i * 48 + j); + mad_dump_cc_switchportcongestionsettingelement(buf, sizeof buf, + data + j * 4, + 4); + printf("%s\n", buf); + outputcount++; + } + } + + return NULL; +} + +static const char *ca_congestion_setting(ib_portid_t *dest, char **argv, + int argc) +{ + char buf[2048]; + char data[IB_CC_DATA_SZ] = { 0 }; + int i; + + if (!cc_query_status_via(data, dest, IB_CC_ATTR_CA_CONGESTION_SETTING, + 0, 0, NULL, srcport, cckey)) + return "ca congestion setting query failed"; + + mad_dump_cc_cacongestionsetting(buf, sizeof buf, data, sizeof data); + + printf("# CACongestionSetting: %s\n%s\n", portid2str(dest), buf); + + for (i = 0; i < 16; i++) { + printf("SL:..............................%u\n", i); + mad_dump_cc_cacongestionentry(buf, sizeof buf, + data + 4 + i * 8, + 8); + printf("%s\n", buf); + } + return NULL; +} + +static const char 
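/* the congestion control table is paged: each query returns a 64-entry
 * block selected by the attribute modifier, and ccti_limit read from the
 * first block is the index of the last valid entry (see the loop below) */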
*congestion_control_table(ib_portid_t *dest, char **argv, + int argc) +{ + char buf[2048]; + char data[IB_CC_DATA_SZ] = { 0 }; + int limit, outputcount = 0; + int i, j; + + if (!cc_query_status_via(data, dest, IB_CC_ATTR_CONGESTION_CONTROL_TABLE, + 0, 0, NULL, srcport, cckey)) + return "congestion control table query failed"; + + mad_decode_field((uint8_t *)data, IB_CC_CONGESTION_CONTROL_TABLE_CCTI_LIMIT_F, &limit); + + mad_dump_cc_congestioncontroltable(buf, sizeof buf, data, sizeof data); + + printf("# CongestionControlTable: %s\n%s\n", portid2str(dest), buf); + + if (!limit) + return NULL; + + for (i = 0; i < (limit/64) + 1; i++) { + + /* first query done */ + if (i) + if (!cc_query_status_via(data, dest, IB_CC_ATTR_CONGESTION_CONTROL_TABLE, + i, 0, NULL, srcport, cckey)) + return "congestion control table query failed"; + + for (j = 0; j < 64 && outputcount <= limit; j++) { + printf("Entry:...........................%u\n", i*64 + j); + mad_dump_cc_congestioncontroltableentry(buf, sizeof buf, + data + 4 + j * 2, + sizeof data - 4 - j * 2); + printf("%s\n", buf); + outputcount++; + } + } + return NULL; +} + +static const char *timestamp_dump(ib_portid_t *dest, char **argv, int argc) +{ + char buf[2048]; + char data[IB_CC_DATA_SZ] = { 0 }; + + if (!cc_query_status_via(data, dest, IB_CC_ATTR_TIMESTAMP, + 0, 0, NULL, srcport, cckey)) + return "timestamp query failed"; + + mad_dump_cc_timestamp(buf, sizeof buf, data, sizeof data); + + printf("# Timestamp: %s\n%s", portid2str(dest), buf); + return NULL; +} + +static int process_opt(void *context, int ch) +{ + switch (ch) { + case 'c': + cckey = (uint64_t) strtoull(optarg, NULL, 0); + break; + default: + return -1; + } + return 0; +} + +int main(int argc, char **argv) +{ + char usage_args[1024]; + int mgmt_classes[3] = { IB_SMI_CLASS, IB_SA_CLASS, IB_CC_CLASS }; + ib_portid_t portid = { 0 }; + const char *err; + op_fn_t *fn; + const match_rec_t *r; + int n; + + const struct ibdiag_opt opts[] = { + {"cckey", 'c', 1, "<key>", "CC key"}, + {} + }; + const char *usage_examples[] = { + "CongestionInfo 3\t\t\t# Congestion Info by lid", + "SwitchPortCongestionSetting 3\t# Query all Switch Port Congestion Settings", + "SwitchPortCongestionSetting 3 1\t# Query Switch Port Congestion Setting for port 1", + NULL + }; + + n = sprintf(usage_args, "[-c key] <op> <lid|guid>\n" + "\nSupported ops (and aliases, case insensitive):\n"); + for (r = match_tbl; r->name; r++) { + n += snprintf(usage_args + n, sizeof(usage_args) - n, + " %s (%s) <lid|guid>%s\n", r->name, + r->alias ? r->alias : "", + r->opt_portnum ? 
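/* only ops flagged opt_portnum in match_tbl take an optional port number */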
" [<portnum>]" : ""); + if (n >= sizeof(usage_args)) + exit(-1); + } + + ibdiag_process_opts(argc, argv, NULL, "DK", opts, process_opt, + usage_args, usage_examples); + + argc -= optind; + argv += optind; + + if (argc < 2) + ibdiag_show_usage(); + + if (!(fn = match_op(match_tbl, argv[0]))) + IBEXIT("operation '%s' not supported", argv[0]); + + srcport = mad_rpc_open_port(ibd_ca, ibd_ca_port, mgmt_classes, 3); + if (!srcport) + IBEXIT("Failed to open '%s' port '%d'", ibd_ca, ibd_ca_port); + + smp_mkey_set(srcport, ibd_mkey); + + if (resolve_portid_str(ibd_ca, ibd_ca_port, &portid, argv[1], + ibd_dest_type, ibd_sm_id, srcport) < 0) + IBEXIT("can't resolve destination %s", argv[1]); + if ((err = fn(&portid, argv + 2, argc - 2))) + IBEXIT("operation %s: %s", argv[0], err); + + mad_rpc_close_port(srcport); + exit(0); +} diff --git a/infiniband-diags/ibdiag_common.c b/infiniband-diags/ibdiag_common.c new file mode 100644 index 0000000..a06438e --- /dev/null +++ b/infiniband-diags/ibdiag_common.c @@ -0,0 +1,950 @@ +/* + * Copyright (c) 2006-2007 The Regents of the University of California. + * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved. + * Copyright (c) 2002-2010 Mellanox Technologies LTD. All rights reserved. + * Copyright (c) 1996-2003 Intel Corporation. All rights reserved. + * Copyright (c) 2009 HNR Consulting. All rights reserved. + * Copyright (c) 2011 Lawrence Livermore National Security. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +/** + * Define common functions which can be included in the various C based diags. 
+ */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <errno.h> +#include <string.h> +#include <stdlib.h> +#include <stdarg.h> +#include <sys/types.h> +#include <unistd.h> +#include <ctype.h> +#include <config.h> +#include <getopt.h> +#include <limits.h> +#include <sys/stat.h> +#include <stdarg.h> + +#include <infiniband/umad.h> +#include <infiniband/mad.h> +#include <ibdiag_common.h> + +int ibverbose; +enum MAD_DEST ibd_dest_type = IB_DEST_LID; +ib_portid_t *ibd_sm_id; +static ib_portid_t sm_portid = { 0 }; + +/* general config options */ +#define IBDIAG_CONFIG_GENERAL IBDIAG_CONFIG_PATH"/ibdiag.conf" +char *ibd_ca = NULL; +int ibd_ca_port = 0; +int ibd_timeout = 0; +uint32_t ibd_ibnetdisc_flags = IBND_CONFIG_MLX_EPI; +uint64_t ibd_mkey; +uint64_t ibd_sakey = 0; +int show_keys = 0; +char *ibd_nd_format = NULL; + +static const char *prog_name; +static const char *prog_args; +static const char **prog_examples; +static struct option *long_opts = NULL; +static const struct ibdiag_opt *opts_map[256]; + +static const char *get_build_version(void) +{ + return "BUILD VERSION: " PACKAGE_VERSION; +} + +static void pretty_print(int start, int width, const char *str) +{ + int len = width - start; + const char *p, *e; + + while (1) { + while (isspace(*str)) + str++; + p = str; + do { + e = p + 1; + p = strchr(e, ' '); + } while (p && p - str < len); + if (!p) { + fprintf(stderr, "%s", str); + break; + } + if (e - str == 1) + e = p; + fprintf(stderr, "%.*s\n%*s", (int)(e - str), str, start, ""); + str = e; + } +} + +static inline int val_str_true(const char *val_str) +{ + return ((strncmp(val_str, "TRUE", strlen("TRUE")) == 0) || + (strncmp(val_str, "true", strlen("true")) == 0)); +} + +static void read_ibdiag_config(const char *file) +{ + char buf[1024]; + FILE *config_fd = NULL; + char *p_prefix, *p_last; + char *name; + char *val_str; + struct stat statbuf; + + /* silently ignore missing config file */ + if (stat(file, &statbuf)) + return; + + config_fd = fopen(file, "r"); + if (!config_fd) + return; + + while (fgets(buf, sizeof buf, config_fd) != NULL) { + p_prefix = strtok_r(buf, "\n", &p_last); + if (!p_prefix) + continue; /* ignore blank lines */ + + if (*p_prefix == '#') + continue; /* ignore comment lines */ + + name = strtok_r(p_prefix, "=", &p_last); + val_str = strtok_r(NULL, "\n", &p_last); + + if (strncmp(name, "CA", strlen("CA")) == 0) { + free(ibd_ca); + ibd_ca = strdup(val_str); + } else if (strncmp(name, "Port", strlen("Port")) == 0) { + ibd_ca_port = strtoul(val_str, NULL, 0); + } else if (strncmp(name, "timeout", strlen("timeout")) == 0) { + ibd_timeout = strtoul(val_str, NULL, 0); + } else if (strncmp(name, "MLX_EPI", strlen("MLX_EPI")) == 0) { + if (val_str_true(val_str)) { + ibd_ibnetdisc_flags |= IBND_CONFIG_MLX_EPI; + } else { + ibd_ibnetdisc_flags &= ~IBND_CONFIG_MLX_EPI; + } + } else if (strncmp(name, "m_key", strlen("m_key")) == 0) { + ibd_mkey = strtoull(val_str, NULL, 0); + } else if (strncmp(name, "sa_key", + strlen("sa_key")) == 0) { + ibd_sakey = strtoull(val_str, NULL, 0); + } else if (strncmp(name, "nd_format", + strlen("nd_format")) == 0) { + if (ibd_nd_format) + free(ibd_nd_format); + ibd_nd_format = strdup(val_str); + } + } + + fclose(config_fd); +} + + +void ibdiag_show_usage(void) +{ + struct option *o = long_opts; + int n; + + fprintf(stderr, "\nUsage: %s [options] %s\n\n", prog_name, + prog_args ? 
prog_args : ""); + + if (long_opts[0].name) + fprintf(stderr, "Options:\n"); + for (o = long_opts; o->name; o++) { + const struct ibdiag_opt *io = opts_map[o->val]; + n = fprintf(stderr, " --%s", io->name); + if (isprint(io->letter)) + n += fprintf(stderr, ", -%c", io->letter); + if (io->has_arg) + n += fprintf(stderr, " %s", + io->arg_tmpl ? io->arg_tmpl : "<val>"); + if (io->description && *io->description) { + n += fprintf(stderr, "%*s ", 24 - n > 0 ? 24 - n : 0, + ""); + pretty_print(n, 74, io->description); + } + fprintf(stderr, "\n"); + } + + if (prog_examples) { + const char **p; + fprintf(stderr, "\nExamples:\n"); + for (p = prog_examples; *p && **p; p++) + fprintf(stderr, " %s %s\n", prog_name, *p); + } + + fprintf(stderr, "\n"); + + exit(2); +} + +static int process_opt(int ch) +{ + char *endp; + long val; + + switch (ch) { + case 'z': + read_ibdiag_config(optarg); + break; + case 'h': + ibdiag_show_usage(); + break; + case 'V': + fprintf(stderr, "%s %s\n", prog_name, get_build_version()); + exit(0); + case 'e': + madrpc_show_errors(1); + break; + case 'v': + ibverbose++; + break; + case 'd': + ibdebug++; + madrpc_show_errors(1); + umad_debug(ibdebug - 1); + break; + case 'C': + ibd_ca = optarg; + break; + case 'P': + ibd_ca_port = strtoul(optarg, NULL, 0); + break; + case 'D': + ibd_dest_type = IB_DEST_DRPATH; + break; + case 'L': + ibd_dest_type = IB_DEST_LID; + break; + case 'G': + ibd_dest_type = IB_DEST_GUID; + break; + case 't': + errno = 0; + val = strtol(optarg, &endp, 0); + if (errno || (endp && *endp != '\0') || val <= 0 || + val > INT_MAX) + IBEXIT("Invalid timeout \"%s\". Timeout requires a " + "positive integer value < %d.", optarg, INT_MAX); + else { + madrpc_set_timeout((int)val); + ibd_timeout = (int)val; + } + break; + case 's': + /* srcport is not required when resolving via IB_DEST_LID */ + if (resolve_portid_str(ibd_ca, ibd_ca_port, &sm_portid, optarg, + IB_DEST_LID, NULL, NULL) < 0) + IBEXIT("cannot resolve SM destination port %s", + optarg); + ibd_sm_id = &sm_portid; + break; + case 'K': + show_keys = 1; + break; + case 'y': + errno = 0; + ibd_mkey = strtoull(optarg, &endp, 0); + if (errno || *endp != '\0') { + errno = 0; + ibd_mkey = strtoull(getpass("M_Key: "), &endp, 0); + if (errno || *endp != '\0') { + IBEXIT("Bad M_Key"); + } + } + break; + default: + return -1; + } + + return 0; +} + +static const struct ibdiag_opt common_opts[] = { + {"config", 'z', 1, "<config>", "use config file, default: " IBDIAG_CONFIG_GENERAL}, + {"Ca", 'C', 1, "<ca>", "Ca name to use"}, + {"Port", 'P', 1, "<port>", "Ca port number to use"}, + {"Direct", 'D', 0, NULL, "use Direct address argument"}, + {"Lid", 'L', 0, NULL, "use LID address argument"}, + {"Guid", 'G', 0, NULL, "use GUID address argument"}, + {"timeout", 't', 1, "<ms>", "timeout in ms"}, + {"sm_port", 's', 1, "<lid>", "SM port lid"}, + {"show_keys", 'K', 0, NULL, "display security keys in output"}, + {"m_key", 'y', 1, "<key>", "M_Key to use in request"}, + {"errors", 'e', 0, NULL, "show send and receive errors"}, + {"verbose", 'v', 0, NULL, "increase verbosity level"}, + {"debug", 'd', 0, NULL, "raise debug level"}, + {"help", 'h', 0, NULL, "help message"}, + {"version", 'V', 0, NULL, "show version"}, + {} +}; + +static void make_opt(struct option *l, const struct ibdiag_opt *o, + const struct ibdiag_opt *map[]) +{ + l->name = o->name; + l->has_arg = o->has_arg; + l->flag = NULL; + l->val = o->letter; + if (!map[l->val]) + map[l->val] = o; +} + +static struct option *make_long_opts(const char *exclude_str, + const 
struct ibdiag_opt *custom_opts, + const struct ibdiag_opt *map[]) +{ + struct option *res, *l; + const struct ibdiag_opt *o; + unsigned n = 0; + + if (custom_opts) + for (o = custom_opts; o->name; o++) + n++; + + res = malloc((sizeof(common_opts) / sizeof(common_opts[0]) + n) * + sizeof(*res)); + if (!res) + return NULL; + + l = res; + + if (custom_opts) + for (o = custom_opts; o->name; o++) + make_opt(l++, o, map); + + for (o = common_opts; o->name; o++) { + if (exclude_str && strchr(exclude_str, o->letter)) + continue; + make_opt(l++, o, map); + } + + memset(l, 0, sizeof(*l)); + + return res; +} + +static void make_str_opts(const struct option *o, char *p, unsigned size) +{ + unsigned i, n = 0; + + for (n = 0; o->name && n + 2 + o->has_arg < size; o++) { + p[n++] = (char)o->val; + for (i = 0; i < (unsigned)o->has_arg; i++) + p[n++] = ':'; + } + p[n] = '\0'; +} + +int ibdiag_process_opts(int argc, char *const argv[], void *cxt, + const char *exclude_common_str, + const struct ibdiag_opt custom_opts[], + int (*custom_handler) (void *cxt, int val), + const char *usage_args, const char *usage_examples[]) +{ + char str_opts[1024]; + const struct ibdiag_opt *o; + + prog_name = argv[0]; + prog_args = usage_args; + prog_examples = usage_examples; + + if (long_opts) + free(long_opts); + + long_opts = make_long_opts(exclude_common_str, custom_opts, opts_map); + if (!long_opts) + return -1; + + read_ibdiag_config(IBDIAG_CONFIG_GENERAL); + + make_str_opts(long_opts, str_opts, sizeof(str_opts)); + + while (1) { + int ch = getopt_long(argc, argv, str_opts, long_opts, NULL); + if (ch == -1) + break; + o = opts_map[ch]; + if (!o) + ibdiag_show_usage(); + if (custom_handler) { + if (custom_handler(cxt, ch) && + process_opt(ch)) + ibdiag_show_usage(); + } else if (process_opt(ch)) + ibdiag_show_usage(); + } + + return 0; +} + +void ibexit(const char *fn, const char *msg, ...) +{ + char buf[512]; + va_list va; + int n; + + va_start(va, msg); + n = vsprintf(buf, msg, va); + va_end(va); + buf[n] = 0; + + if (ibdebug) + printf("%s: iberror: [pid %d] %s: failed: %s\n", + prog_name ? prog_name : "", getpid(), fn, buf); + else + printf("%s: iberror: failed: %s\n", + prog_name ? 
prog_name : "", buf); + + exit(-1); +} + +const char *conv_cnt_human_readable(uint64_t val64, float *val, int data) +{ + uint64_t tmp = val64; + int ui = 0; + uint64_t div = 1; + + tmp /= 1024; + while (tmp) { + ui++; + tmp /= 1024; + div *= 1024; + } + + *val = (float)(val64); + if (data) { + *val *= 4; + if (*val/div > 1024) { + ui++; + div *= 1024; + } + } + *val /= div; + + if (data) { + switch (ui) { + case 0: + return ("B"); + case 1: + return ("KB"); + case 2: + return ("MB"); + case 3: + return ("GB"); + case 4: + return ("TB"); + case 5: + return ("PB"); + case 6: + return ("EB"); + default: + return (""); + } + } else { + switch (ui) { + case 0: + return (""); + case 1: + return ("K"); + case 2: + return ("M"); + case 3: + return ("G"); + case 4: + return ("T"); + case 5: + return ("P"); + case 6: + return ("E"); + default: + return (""); + } + } + return (""); +} + +int is_port_info_extended_supported(ib_portid_t * dest, int port, + struct ibmad_port *srcport) +{ + uint8_t data[IB_SMP_DATA_SIZE] = { 0 }; + uint32_t cap_mask; + uint16_t cap_mask2; + int type, portnum; + + if (!smp_query_via(data, dest, IB_ATTR_NODE_INFO, 0, 0, srcport)) + IBEXIT("node info query failed"); + + mad_decode_field(data, IB_NODE_TYPE_F, &type); + if (type == IB_NODE_SWITCH) + portnum = 0; + else + portnum = port; + + if (!smp_query_via(data, dest, IB_ATTR_PORT_INFO, portnum, 0, srcport)) + IBEXIT("port info query failed"); + + mad_decode_field(data, IB_PORT_CAPMASK_F, &cap_mask); + if (cap_mask & be32toh(IB_PORT_CAP_HAS_CAP_MASK2)) { + mad_decode_field(data, IB_PORT_CAPMASK2_F, &cap_mask2); + if (!(cap_mask2 & + be16toh(IB_PORT_CAP2_IS_PORT_INFO_EXT_SUPPORTED))) { + IBWARN("port info capability mask2 = 0x%x doesn't" + " indicate PortInfoExtended support", cap_mask2); + return 0; + } + } else { + IBWARN("port info capability mask2 not supported"); + return 0; + } + + return 1; +} + +int is_mlnx_ext_port_info_supported(uint32_t vendorid, + uint16_t devid) +{ + if (ibd_ibnetdisc_flags & IBND_CONFIG_MLX_EPI) { + + if ((devid >= 0xc738 && devid <= 0xc73b) || + devid == 0xc839 || devid == 0xcb20 || devid == 0xcf08 || + devid == 0xcf09 || devid == 0xd2f0 || + ((vendorid == 0x119f) && + /* Bull SwitchX */ + (devid == 0x1b02 || devid == 0x1b50 || + /* Bull SwitchIB and SwitchIB2 */ + devid == 0x1ba0 || + (devid >= 0x1bd0 && devid <= 0x1bd5) || + /* Bull Quantum */ + devid == 0x1bf0))) + return 1; + if ((devid >= 0x1003 && devid <= 0x101b) || + (devid == 0xa2d2) || + ((vendorid == 0x119f) && + /* Bull ConnectX3 */ + (devid == 0x1b33 || devid == 0x1b73 || + devid == 0x1b40 || devid == 0x1b41 || + devid == 0x1b60 || devid == 0x1b61 || + /* Bull ConnectIB */ + devid == 0x1b83 || + devid == 0x1b93 || devid == 0x1b94 || + /* Bull ConnectX4, Sequana HDR and HDR100 */ + devid == 0x1bb4 || devid == 0x1bb5 || + (devid >= 0x1bc4 && devid <= 0x1bc6)))) + return 1; + } + + return 0; +} + +/** ========================================================================= + * Resolve the SM portid using the umad layer rather than using + * ib_resolve_smlid_via which requires a PortInfo query on the local port. 
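 *
 * A minimal caller, modeled on sa_get_handle() in ibdiag_sa.c further down
 * in this patch:
 *
 *	ib_portid_t dport;
 *
 *	resolve_sm_portid(ibd_ca, ibd_ca_port, &dport);
 *	if (!dport.lid)
 *		IBWARN("No SM/SA found on port %s:%d",
 *		       ibd_ca ? ibd_ca : "", ibd_ca_port);
 *
 * On success the LID and SL come from the umad port's sm_lid and sm_sl
 * fields, so nothing is sent on the wire.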
+ */ +int resolve_sm_portid(char *ca_name, uint8_t portnum, ib_portid_t *sm_id) +{ + umad_port_t port; + int rc; + + if (!sm_id) + return (-1); + + if ((rc = umad_get_port(ca_name, portnum, &port)) < 0) + return rc; + + memset(sm_id, 0, sizeof(*sm_id)); + sm_id->lid = port.sm_lid; + sm_id->sl = port.sm_sl; + + umad_release_port(&port); + + return 0; +} + +/** ========================================================================= + * Resolve local CA characteristics using the umad layer rather than using + * ib_resolve_self_via which requires SMP queries on the local port. + */ +int resolve_self(char *ca_name, uint8_t ca_port, ib_portid_t *portid, + int *portnum, ibmad_gid_t *gid) +{ + umad_port_t port; + uint64_t prefix, guid; + int rc; + + if (!(portid || portnum || gid)) + return (-1); + + if ((rc = umad_get_port(ca_name, ca_port, &port)) < 0) + return rc; + + if (portid) { + memset(portid, 0, sizeof(*portid)); + portid->lid = port.base_lid; + portid->sl = port.sm_sl; + } + if (portnum) + *portnum = port.portnum; + if (gid) { + memset(gid, 0, sizeof(*gid)); + prefix = be64toh(port.gid_prefix); + guid = be64toh(port.port_guid); + mad_encode_field(*gid, IB_GID_PREFIX_F, &prefix); + mad_encode_field(*gid, IB_GID_GUID_F, &guid); + } + + umad_release_port(&port); + + return 0; +} + +static int resolve_gid(char *ca_name, uint8_t ca_port, ib_portid_t *portid, + ibmad_gid_t gid, ib_portid_t *sm_id, + const struct ibmad_port *srcport) +{ + ib_portid_t tmp; + char buf[IB_SA_DATA_SIZE] = { 0 }; + + if (!sm_id) { + sm_id = &tmp; + if (resolve_sm_portid(ca_name, ca_port, sm_id) < 0) + return -1; + } + + if ((portid->lid = + ib_path_query_via(srcport, gid, gid, sm_id, buf)) < 0) + return -1; + + return 0; +} + +static int resolve_guid(char *ca_name, uint8_t ca_port, ib_portid_t *portid, + uint64_t *guid, ib_portid_t *sm_id, + const struct ibmad_port *srcport) +{ + ib_portid_t tmp; + uint8_t buf[IB_SA_DATA_SIZE] = { 0 }; + __be64 prefix; + ibmad_gid_t selfgid; + + if (!sm_id) { + sm_id = &tmp; + if (resolve_sm_portid(ca_name, ca_port, sm_id) < 0) + return -1; + } + + if (resolve_self(ca_name, ca_port, NULL, NULL, &selfgid) < 0) + return -1; + + memcpy(&prefix, selfgid, sizeof(prefix)); + mad_set_field64(portid->gid, 0, IB_GID_PREFIX_F, + prefix ? be64toh(prefix) : IB_DEFAULT_SUBN_PREFIX); + if (guid) + mad_set_field64(portid->gid, 0, IB_GID_GUID_F, *guid); + + if ((portid->lid = + ib_path_query_via(srcport, selfgid, portid->gid, sm_id, buf)) < 0) + return -1; + + mad_decode_field(buf, IB_SA_PR_SL_F, &portid->sl); + return 0; +} + +/* + * Callers of this function should ensure their ibmad_port has been opened with + * IB_SA_CLASS as this function may require the SA to resolve addresses. + */ +int resolve_portid_str(char *ca_name, uint8_t ca_port, ib_portid_t * portid, + char *addr_str, enum MAD_DEST dest_type, + ib_portid_t *sm_id, const struct ibmad_port *srcport) +{ + ibmad_gid_t gid; + uint64_t guid; + int lid; + char *routepath; + ib_portid_t selfportid = { 0 }; + int selfport = 0; + + memset(portid, 0, sizeof *portid); + + switch (dest_type) { + case IB_DEST_LID: + lid = strtol(addr_str, NULL, 0); + if (!IB_LID_VALID(lid)) + return -1; + return ib_portid_set(portid, lid, 0, 0); + + case IB_DEST_DRPATH: + if (str2drpath(&portid->drpath, addr_str, 0, 0) < 0) + return -1; + return 0; + + case IB_DEST_GUID: + if (!(guid = strtoull(addr_str, NULL, 0))) + return -1; + + /* keep guid in portid? 
*/ + return resolve_guid(ca_name, ca_port, portid, &guid, sm_id, + srcport); + + case IB_DEST_DRSLID: + lid = strtol(addr_str, &routepath, 0); + routepath++; + if (!IB_LID_VALID(lid)) + return -1; + ib_portid_set(portid, lid, 0, 0); + + /* handle DR parsing and set DrSLID to local lid */ + if (resolve_self(ca_name, ca_port, &selfportid, &selfport, + NULL) < 0) + return -1; + if (str2drpath(&portid->drpath, routepath, selfportid.lid, 0) < + 0) + return -1; + return 0; + + case IB_DEST_GID: + if (inet_pton(AF_INET6, addr_str, &gid) <= 0) + return -1; + return resolve_gid(ca_name, ca_port, portid, gid, sm_id, + srcport); + default: + IBWARN("bad dest_type %d", dest_type); + } + + return -1; +} + +static unsigned int get_max_width(unsigned int num) +{ + unsigned r = 0; /* 1x */ + + if (num & 8) + r = 3; /* 12x */ + else { + if (num & 4) + r = 2; /* 8x */ + else if (num & 2) + r = 1; /* 4x */ + else if (num & 0x10) + r = 4; /* 2x */ + } + + return (1 << r); +} + +static unsigned int get_max(unsigned int num) +{ + unsigned r = 0; // r will be lg(num) + + while (num >>= 1) // unroll for more speed... + r++; + + return (1 << r); +} + +void get_max_msg(char *width_msg, char *speed_msg, int msg_size, ibnd_port_t * port) +{ + char buf[64]; + uint32_t max_speed = 0; + uint32_t cap_mask, rem_cap_mask, fdr10; + uint8_t *info = NULL; + + uint32_t max_width = get_max_width(mad_get_field(port->info, 0, + IB_PORT_LINK_WIDTH_SUPPORTED_F) + & mad_get_field(port->remoteport->info, 0, + IB_PORT_LINK_WIDTH_SUPPORTED_F)); + if ((max_width & mad_get_field(port->info, 0, + IB_PORT_LINK_WIDTH_ACTIVE_F)) == 0) + // we are not at the max supported width + // print what we could be at. + snprintf(width_msg, msg_size, "Could be %s", + mad_dump_val(IB_PORT_LINK_WIDTH_ACTIVE_F, + buf, 64, &max_width)); + + if (port->node->type == IB_NODE_SWITCH) { + if (port->node->ports[0]) + info = (uint8_t *)&port->node->ports[0]->info; + } + else + info = (uint8_t *)&port->info; + + if (info) + cap_mask = mad_get_field(info, 0, IB_PORT_CAPMASK_F); + else + cap_mask = 0; + + info = NULL; + if (port->remoteport->node->type == IB_NODE_SWITCH) { + if (port->remoteport->node->ports[0]) + info = (uint8_t *)&port->remoteport->node->ports[0]->info; + } else + info = (uint8_t *)&port->remoteport->info; + + if (info) + rem_cap_mask = mad_get_field(info, 0, IB_PORT_CAPMASK_F); + else + rem_cap_mask = 0; + if (cap_mask & be32toh(IB_PORT_CAP_HAS_EXT_SPEEDS) && + rem_cap_mask & be32toh(IB_PORT_CAP_HAS_EXT_SPEEDS)) + goto check_ext_speed; +check_fdr10_supp: + fdr10 = (mad_get_field(port->ext_info, 0, + IB_MLNX_EXT_PORT_LINK_SPEED_SUPPORTED_F) & FDR10) + && (mad_get_field(port->remoteport->ext_info, 0, + IB_MLNX_EXT_PORT_LINK_SPEED_SUPPORTED_F) & FDR10); + if (fdr10) + goto check_fdr10_active; + + max_speed = get_max(mad_get_field(port->info, 0, + IB_PORT_LINK_SPEED_SUPPORTED_F) + & mad_get_field(port->remoteport->info, 0, + IB_PORT_LINK_SPEED_SUPPORTED_F)); + if ((max_speed & mad_get_field(port->info, 0, + IB_PORT_LINK_SPEED_ACTIVE_F)) == 0) + // we are not at the max supported speed + // print what we could be at. 
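	// max_speed is the highest speed bit both ends support: get_max()
	// rounds the ANDed masks down to a power of two, e.g. supported
	// masks 0x7 & 0x7 give get_max(0x7) == 0x4 (QDR), and if that bit
	// is clear in LinkSpeedActive the link runs below its potential.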
+ snprintf(speed_msg, msg_size, "Could be %s", + mad_dump_val(IB_PORT_LINK_SPEED_ACTIVE_F, + buf, 64, &max_speed)); + return; + +check_ext_speed: + if (mad_get_field(port->info, 0, + IB_PORT_LINK_SPEED_EXT_SUPPORTED_F) == 0 || + mad_get_field(port->remoteport->info, 0, + IB_PORT_LINK_SPEED_EXT_SUPPORTED_F) == 0) + goto check_fdr10_supp; + max_speed = get_max(mad_get_field(port->info, 0, + IB_PORT_LINK_SPEED_EXT_SUPPORTED_F) + & mad_get_field(port->remoteport->info, 0, + IB_PORT_LINK_SPEED_EXT_SUPPORTED_F)); + if ((max_speed & mad_get_field(port->info, 0, + IB_PORT_LINK_SPEED_EXT_ACTIVE_F)) == 0) + // we are not at the max supported extended speed + // print what we could be at. + snprintf(speed_msg, msg_size, "Could be %s", + mad_dump_val(IB_PORT_LINK_SPEED_EXT_ACTIVE_F, + buf, 64, &max_speed)); + return; + +check_fdr10_active: + if ((mad_get_field(port->ext_info, 0, + IB_MLNX_EXT_PORT_LINK_SPEED_ACTIVE_F) & FDR10) == 0) { + /* Special case QDR to try to avoid confusion with FDR10 */ + if (mad_get_field(port->info, 0, IB_PORT_LINK_SPEED_ACTIVE_F) == 4) /* QDR (10.0 Gbps) */ + snprintf(speed_msg, msg_size, + "Could be FDR10 (Found link at QDR but expected speed is FDR10)"); + else + snprintf(speed_msg, msg_size, "Could be FDR10"); + } +} + +int vsnprint_field(char *buf, size_t n, enum MAD_FIELDS f, int spacing, + const char *format, va_list va_args) +{ + int len, i, ret; + + len = strlen(mad_field_name(f)); + if (len + 2 > n || spacing + 1 > n) + return 0; + + strncpy(buf, mad_field_name(f), n); + buf[len] = ':'; + for (i = len+1; i < spacing+1; i++) { + buf[i] = '.'; + } + + ret = vsnprintf(&buf[spacing+1], n - spacing, format, va_args); + if (ret >= n - spacing) + buf[n] = '\0'; + + return ret + spacing; +} + +int snprint_field(char *buf, size_t n, enum MAD_FIELDS f, int spacing, + const char *format, ...) +{ + va_list val; + int ret; + + va_start(val, format); + ret = vsnprint_field(buf, n, f, spacing, format, val); + va_end(val); + + return ret; +} + +void dump_portinfo(void *pi, int tabs) +{ + int field, i; + char val[64]; + char buf[1024]; + + for (field = IB_PORT_FIRST_F; field < IB_PORT_LAST_F; field++) { + for (i=0;i<tabs;i++) + printf("\t"); + if (field == IB_PORT_MKEY_F && show_keys == 0) { + snprint_field(buf, 1024, field, 32, NOT_DISPLAYED_STR); + } else { + mad_decode_field(pi, field, val); + if (!mad_dump_field(field, buf, 1024, val)) + return; + } + printf("%s\n", buf); + } + + for (field = IB_PORT_CAPMASK2_F; + field < IB_PORT_LINK_SPEED_EXT_LAST_F; field++) { + for (i=0;i<tabs;i++) + printf("\t"); + mad_decode_field(pi, field, val); + if (!mad_dump_field(field, buf, 1024, val)) + return; + printf("%s\n", buf); + } +} + +op_fn_t *match_op(const match_rec_t match_tbl[], char *name) +{ + const match_rec_t *r; + for (r = match_tbl; r->name; r++) + if (!strcasecmp(r->name, name) || + (r->alias && !strcasecmp(r->alias, name))) + return r->fn; + return NULL; +} diff --git a/infiniband-diags/ibdiag_common.h b/infiniband-diags/ibdiag_common.h new file mode 100644 index 0000000..bee4078 --- /dev/null +++ b/infiniband-diags/ibdiag_common.h @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2006-2007 The Regents of the University of California. + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * Copyright (c) 2002-2010 Mellanox Technologies LTD. All rights reserved. + * Copyright (c) 1996-2003 Intel Corporation. All rights reserved. + * Copyright (c) 2009 HNR Consulting. All rights reserved. + * Copyright (c) 2011 Lawrence Livermore National Security. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef _IBDIAG_COMMON_H_ +#define _IBDIAG_COMMON_H_ + +#include <endian.h> + +#include <stdarg.h> +#include <infiniband/mad.h> +#include <util/iba_types.h> +#include <infiniband/ibnetdisc.h> +#include <linux/types.h> + +extern int ibverbose; +extern char *ibd_ca; +extern int ibd_ca_port; +extern enum MAD_DEST ibd_dest_type; +extern ib_portid_t *ibd_sm_id; +extern int ibd_timeout; +extern uint32_t ibd_ibnetdisc_flags; +extern uint64_t ibd_mkey; +extern uint64_t ibd_sakey; +extern int show_keys; +extern char *ibd_nd_format; + +/*========================================================*/ +/* External interface */ +/*========================================================*/ + +#undef DEBUG +#define DEBUG(fmt, ...) do { \ + if (ibdebug) IBDEBUG(fmt, ## __VA_ARGS__); \ +} while (0) +#define VERBOSE(fmt, ...) do { \ + if (ibverbose) IBVERBOSE(fmt, ## __VA_ARGS__); \ +} while (0) +#define IBEXIT(fmt, ...) ibexit(__FUNCTION__, fmt, ## __VA_ARGS__) + +#define NOT_DISPLAYED_STR "<not displayed>" + +struct ibdiag_opt { + const char *name; + char letter; + unsigned has_arg; + const char *arg_tmpl; + const char *description; +}; + +extern int ibdiag_process_opts(int argc, char *const argv[], void *context, + const char *exclude_common_str, + const struct ibdiag_opt custom_opts[], + int (*custom_handler) (void *cxt, int val), + const char *usage_args, + const char *usage_examples[]); +extern void ibdiag_show_usage(void); +extern void ibexit(const char *fn, const char *msg, ...) 
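	/* format(printf, 2, 3) below: msg is parameter 2 and the variadic
	 * arguments start at parameter 3, so the compiler can type-check
	 * IBEXIT() call sites */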
+ __attribute__((format(printf, 2, 3))); + +/* convert counter values to a float with a unit specifier returned (using + * binary prefix) + * "data" is a flag indicating this counter is a byte counter multiplied by 4 + * as per PortCounters[Extended] + */ +const char *conv_cnt_human_readable(uint64_t val64, float *val, int data); + +int is_mlnx_ext_port_info_supported(uint32_t vendorid, uint16_t devid); + +int is_port_info_extended_supported(ib_portid_t * dest, int port, + struct ibmad_port *srcport); +void get_max_msg(char *width_msg, char *speed_msg, int msg_size, + ibnd_port_t * port); + +int resolve_sm_portid(char *ca_name, uint8_t portnum, ib_portid_t *sm_id); +int resolve_self(char *ca_name, uint8_t ca_port, ib_portid_t *portid, + int *port, ibmad_gid_t *gid); +int resolve_portid_str(char *ca_name, uint8_t ca_port, ib_portid_t * portid, + char *addr_str, enum MAD_DEST dest_type, + ib_portid_t *sm_id, const struct ibmad_port *srcport); +int vsnprint_field(char *buf, size_t n, enum MAD_FIELDS f, int spacing, + const char *format, va_list va_args) + __attribute__((format(printf, 5, 0))); +int snprint_field(char *buf, size_t n, enum MAD_FIELDS f, int spacing, + const char *format, ...) + __attribute__((format(printf, 5, 6))); +void dump_portinfo(void *pi, int tabs); + +/** + * Some common command line parsing + */ +typedef const char *(op_fn_t)(ib_portid_t *dest, char **argv, int argc); + +typedef struct match_rec { + const char *name, *alias; + op_fn_t *fn; + unsigned opt_portnum; + const char *ops_extra; +} match_rec_t; + +op_fn_t *match_op(const match_rec_t match_tbl[], char *name); + +#endif /* _IBDIAG_COMMON_H_ */ diff --git a/infiniband-diags/ibdiag_sa.c b/infiniband-diags/ibdiag_sa.c new file mode 100644 index 0000000..a5b99ad --- /dev/null +++ b/infiniband-diags/ibdiag_sa.c @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2006-2007 The Regents of the University of California. + * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved. + * Copyright (c) 2002-2010 Mellanox Technologies LTD. All rights reserved. + * Copyright (c) 1996-2003 Intel Corporation. All rights reserved. + * Copyright (c) 2009 HNR Consulting. All rights reserved. + * Copyright (c) 2011 Lawrence Livermore National Security. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + + +#include <errno.h> +#include <infiniband/umad.h> + +#include "ibdiag_common.h" +#include "ibdiag_sa.h" + +/* define a common SA query structure + * This is by no means optimal but it moves the saquery functionality out of + * the saquery tool and provides it to other utilities. + */ + +struct sa_handle * sa_get_handle(void) +{ + struct sa_handle * handle; + handle = calloc(1, sizeof(*handle)); + if (!handle) + IBPANIC("calloc failed"); + + resolve_sm_portid(ibd_ca, ibd_ca_port, &handle->dport); + if (!handle->dport.lid) { + IBWARN("No SM/SA found on port %s:%d", + ibd_ca ? "" : ibd_ca, + ibd_ca_port); + goto err; + } + + handle->dport.qp = 1; + if (!handle->dport.qkey) + handle->dport.qkey = IB_DEFAULT_QP1_QKEY; + + if ((handle->fd = umad_open_port(ibd_ca, ibd_ca_port)) < 0) { + IBWARN("umad_open_port on port %s:%d failed", + ibd_ca ? "" : ibd_ca, + ibd_ca_port); + goto err; + } + if ((handle->agent = umad_register(handle->fd, IB_SA_CLASS, 2, 1, NULL)) < 0) { + umad_close_port(handle->fd); + IBWARN("umad_register for SA class failed on port %s:%d", + ibd_ca ? "" : ibd_ca, + ibd_ca_port); + goto err; + } + + return handle; + +err: + free(handle); + return (NULL); +} + +void sa_free_handle(struct sa_handle * h) +{ + umad_unregister(h->fd, h->agent); + umad_close_port(h->fd); + free(h); +} + +int sa_query(struct sa_handle * h, uint8_t method, + uint16_t attr, uint32_t mod, uint64_t comp_mask, + uint64_t sm_key, void *data, size_t datasz, + struct sa_query_result *result) +{ + ib_rpc_t rpc; + void *umad, *mad; + int ret, offset, len = 256; + + memset(&rpc, 0, sizeof(rpc)); + rpc.mgtclass = IB_SA_CLASS; + rpc.method = method; + rpc.attr.id = attr; + rpc.attr.mod = mod; + rpc.mask = comp_mask; + rpc.datasz = datasz; + rpc.dataoffs = IB_SA_DATA_OFFS; + + umad = calloc(1, len + umad_size()); + if (!umad) + IBPANIC("cannot alloc mem for umad: %s\n", strerror(errno)); + + mad_build_pkt(umad, &rpc, &h->dport, NULL, data); + + mad_set_field64(umad_get_mad(umad), 0, IB_SA_MKEY_F, sm_key); + + if (ibdebug > 1) + xdump(stdout, "SA Request:\n", umad_get_mad(umad), len); + + ret = umad_send(h->fd, h->agent, umad, len, ibd_timeout, 0); + if (ret < 0) { + IBWARN("umad_send failed: attr 0x%x: %s\n", + attr, strerror(errno)); + free(umad); + return (-ret); + } + +recv_mad: + ret = umad_recv(h->fd, umad, &len, ibd_timeout); + if (ret < 0) { + if (errno == ENOSPC) { + umad = realloc(umad, umad_size() + len); + goto recv_mad; + } + IBWARN("umad_recv failed: attr 0x%x: %s\n", attr, + strerror(errno)); + free(umad); + return (-ret); + } + + if ((ret = umad_status(umad))) + return ret; + + mad = umad_get_mad(umad); + + if (ibdebug > 1) + xdump(stdout, "SA Response:\n", mad, len); + + method = (uint8_t) mad_get_field(mad, 0, IB_MAD_METHOD_F); + offset = mad_get_field(mad, 0, IB_SA_ATTROFFS_F); + result->status = mad_get_field(mad, 0, IB_MAD_STATUS_F); + result->p_result_madw = mad; + if (result->status != IB_SA_MAD_STATUS_SUCCESS) + result->result_cnt = 0; + else if (method != IB_MAD_METHOD_GET_TABLE) + result->result_cnt = 1; + else if (!offset) + result->result_cnt = 0; + else + result->result_cnt = (len - IB_SA_DATA_OFFS) / (offset << 3); + + return 0; +} + +void sa_free_result_mad(struct sa_query_result *result) +{ + if 
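/* p_result_madw points at the MAD inside the umad buffer allocated
	 * in sa_query(); stepping back umad_size() bytes recovers the
	 * start of that allocation for free() */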
(result->p_result_madw) { + free((uint8_t *) result->p_result_madw - umad_size()); + result->p_result_madw = NULL; + } +} + +void *sa_get_query_rec(void *mad, unsigned i) +{ + int offset = mad_get_field(mad, 0, IB_SA_ATTROFFS_F); + return (uint8_t *) mad + IB_SA_DATA_OFFS + i * (offset << 3); +} + +static const char *ib_sa_error_str[] = { + "SA_NO_ERROR", + "SA_ERR_NO_RESOURCES", + "SA_ERR_REQ_INVALID", + "SA_ERR_NO_RECORDS", + "SA_ERR_TOO_MANY_RECORDS", + "SA_ERR_REQ_INVALID_GID", + "SA_ERR_REQ_INSUFFICIENT_COMPONENTS", + "SA_ERR_REQ_DENIED", + "SA_ERR_STATUS_PRIO_SUGGESTED", + "SA_ERR_UNKNOWN" +}; + +#define ARR_SIZE(a) (sizeof(a)/sizeof((a)[0])) +#define SA_ERR_UNKNOWN (ARR_SIZE(ib_sa_error_str) - 1) + +static inline const char *ib_sa_err_str(uint8_t status) +{ + if (status > SA_ERR_UNKNOWN) + status = SA_ERR_UNKNOWN; + return (ib_sa_error_str[status]); +} + +static const char *ib_mad_inv_field_str[] = { + "MAD No invalid fields", + "MAD Bad version", + "MAD Method specified is not supported", + "MAD Method/Attribute combination is not supported", + "MAD Reserved", + "MAD Reserved", + "MAD Reserved", + "MAD Invalid value in Attribute field(s) or Attribute Modifier", + "MAD UNKNOWN ERROR" +}; +#define MAD_ERR_UNKNOWN (ARR_SIZE(ib_mad_inv_field_str) - 1) + +static inline const char *ib_mad_inv_field_err_str(uint8_t f) +{ + if (f > MAD_ERR_UNKNOWN) + f = MAD_ERR_UNKNOWN; + return (ib_mad_inv_field_str[f]); +} + +void sa_report_err(int status) +{ + int st = status & 0xff; + char mad_err_str[128] = { 0 }; + char sa_err_str[64] = { 0 }; + int rc; + + if (st) { + rc = snprintf(mad_err_str, sizeof(mad_err_str), " (%s; %s; %s)", + (st & 0x1) ? "BUSY" : "", + (st & 0x2) ? "Redirection Required" : "", + ib_mad_inv_field_err_str(st>>2)); + if (rc > sizeof(mad_err_str)) + fprintf(stderr, "WARN: string buffer overflow\n"); + } + + st = status >> 8; + if (st) { + rc = snprintf(sa_err_str, sizeof(sa_err_str), " SA(%s)", + ib_sa_err_str((uint8_t) st)); + if (rc > sizeof(sa_err_str)) + fprintf(stderr, "WARN: string buffer overflow\n"); + } + + fprintf(stderr, "ERROR: Query result returned 0x%04x, %s%s\n", + status, mad_err_str, sa_err_str); +} diff --git a/infiniband-diags/ibdiag_sa.h b/infiniband-diags/ibdiag_sa.h new file mode 100644 index 0000000..f0347c8 --- /dev/null +++ b/infiniband-diags/ibdiag_sa.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2006-2007 The Regents of the University of California. + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * Copyright (c) 2002-2010 Mellanox Technologies LTD. All rights reserved. + * Copyright (c) 1996-2003 Intel Corporation. All rights reserved. + * Copyright (c) 2009 HNR Consulting. All rights reserved. + * Copyright (c) 2012 Lawrence Livermore National Security. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef _IBDIAG_SA_H_ +#define _IBDIAG_SA_H_ + +#include <infiniband/mad.h> +#include <util/iba_types.h> + +/* define an SA query structure to be common + * This is by no means optimal but it moves the saquery functionality out of + * the saquery tool and provides it to other utilities. + */ +struct sa_handle { + int fd, agent; + ib_portid_t dport; + struct ibmad_port *srcport; +}; + +struct sa_query_result { + uint32_t status; + unsigned result_cnt; + void *p_result_madw; +}; + +/* NOTE: umad_init must be called prior to sa_get_handle */ +struct sa_handle * sa_get_handle(void); +void sa_free_handle(struct sa_handle * h); + +int sa_query(struct sa_handle *h, uint8_t method, + uint16_t attr, uint32_t mod, uint64_t comp_mask, uint64_t sm_key, + void *data, size_t datasz, struct sa_query_result *result); +void sa_free_result_mad(struct sa_query_result *result); +void *sa_get_query_rec(void *mad, unsigned i); +void sa_report_err(int status); + +/* Macros for setting query values and ComponentMasks */ +static inline uint8_t htobe8(uint8_t val) +{ + return val; +} +#define CHECK_AND_SET_VAL(val, size, comp_with, target, name, mask) \ + if ((int##size##_t) val != (int##size##_t) comp_with) { \ + target = htobe##size((uint##size##_t) val); \ + comp_mask |= IB_##name##_COMPMASK_##mask; \ + } + +#define CHECK_AND_SET_GID(val, target, name, mask) \ + if (valid_gid(&(val))) { \ + memcpy(&(target), &(val), sizeof(val)); \ + comp_mask |= IB_##name##_COMPMASK_##mask; \ + } + +#define CHECK_AND_SET_VAL_AND_SEL(val, target, name, mask, sel) \ + if (val) { \ + target = val; \ + comp_mask |= IB_##name##_COMPMASK_##mask##sel; \ + comp_mask |= IB_##name##_COMPMASK_##mask; \ + } + +#endif /* _IBDIAG_SA_H_ */ diff --git a/infiniband-diags/iblinkinfo.c b/infiniband-diags/iblinkinfo.c new file mode 100644 index 0000000..afd2f4e --- /dev/null +++ b/infiniband-diags/iblinkinfo.c @@ -0,0 +1,774 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * Copyright (c) 2007 Xsigo Systems Inc. All rights reserved. + * Copyright (c) 2008 Lawrence Livermore National Lab. All rights reserved. + * Copyright (c) 2010,2011 Mellanox Technologies LTD. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <stdarg.h> +#include <time.h> +#include <string.h> +#include <errno.h> +#include <inttypes.h> + +#include <util/node_name_map.h> +#include <infiniband/ibnetdisc.h> + +#include "ibdiag_common.h" + +#define DIFF_FLAG_PORT_CONNECTION 0x01 +#define DIFF_FLAG_PORT_STATE 0x02 +#define DIFF_FLAG_LID 0x04 +#define DIFF_FLAG_NODE_DESCRIPTION 0x08 + +#define DIFF_FLAG_DEFAULT (DIFF_FLAG_PORT_CONNECTION | DIFF_FLAG_PORT_STATE) + +static char *node_name_map_file = NULL; +static nn_map_t *node_name_map = NULL; +static char *load_cache_file = NULL; +static char *diff_cache_file = NULL; +static unsigned diffcheck_flags = DIFF_FLAG_DEFAULT; +static char *filterdownports_cache_file = NULL; +static ibnd_fabric_t *filterdownports_fabric = NULL; + +static struct { + uint64_t guid; + char *guid_str; +} node_label; +static char *dr_path = NULL; +static int all = 0; + +static int down_links_only = 0; +static int line_mode = 0; +static int add_sw_settings = 0; +static int only_flag = 0; +static int only_type = 0; + +static int filterdownport_check(ibnd_node_t *node, ibnd_port_t *port) +{ + ibnd_node_t *fsw; + ibnd_port_t *fport; + int fistate; + + fsw = ibnd_find_node_guid(filterdownports_fabric, node->guid); + + if (!fsw) + return 0; + + if (port->portnum > fsw->numports) + return 0; + + fport = fsw->ports[port->portnum]; + + if (!fport) + return 0; + + fistate = mad_get_field(fport->info, 0, IB_PORT_STATE_F); + + return (fistate == IB_LINK_DOWN) ? 
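/* 1 == the port was already down in the filterdownports
	 * cache; print_port() uses this to skip such links */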
1 : 0; +} + +static void print_port(ibnd_node_t *node, ibnd_port_t *port, + const char *out_prefix) +{ + char width[64], speed[64], state[64], physstate[64]; + char remote_guid_str[256]; + char remote_str[256]; + char link_str[256]; + char width_msg[256]; + char speed_msg[256]; + char ext_port_str[256]; + int iwidth, ispeed, fdr10, espeed, istate, iphystate, cap_mask; + int n = 0; + uint8_t *info = NULL; + int rc; + + if (!port) + return; + + iwidth = mad_get_field(port->info, 0, IB_PORT_LINK_WIDTH_ACTIVE_F); + ispeed = mad_get_field(port->info, 0, IB_PORT_LINK_SPEED_ACTIVE_F); + fdr10 = mad_get_field(port->ext_info, 0, + IB_MLNX_EXT_PORT_LINK_SPEED_ACTIVE_F) & FDR10; + + if (port->node->type == IB_NODE_SWITCH) { + if (port->node->ports[0]) + info = (uint8_t *)&port->node->ports[0]->info; + } + else + info = (uint8_t *)&port->info; + + if (info) { + cap_mask = mad_get_field(info, 0, IB_PORT_CAPMASK_F); + if (cap_mask & be32toh(IB_PORT_CAP_HAS_EXT_SPEEDS)) + espeed = mad_get_field(port->info, 0, + IB_PORT_LINK_SPEED_EXT_ACTIVE_F); + else + espeed = 0; + } else { + ispeed = 0; + iwidth = 0; + espeed = 0; + } + + istate = mad_get_field(port->info, 0, IB_PORT_STATE_F); + iphystate = mad_get_field(port->info, 0, IB_PORT_PHYS_STATE_F); + + remote_guid_str[0] = '\0'; + remote_str[0] = '\0'; + link_str[0] = '\0'; + width_msg[0] = '\0'; + speed_msg[0] = '\0'; + + if (istate == IB_LINK_DOWN + && filterdownports_fabric + && filterdownport_check(node, port)) + return; + + /* C14-24.2.1 states that a down port allows for invalid data to be + * returned for all PortInfo components except PortState and + * PortPhysicalState */ + if (istate != IB_LINK_DOWN) { + if (!espeed) { + if (fdr10) + sprintf(speed, "10.0 Gbps (FDR10)"); + else + mad_dump_val(IB_PORT_LINK_SPEED_ACTIVE_F, speed, + 64, &ispeed); + } else + mad_dump_val(IB_PORT_LINK_SPEED_EXT_ACTIVE_F, speed, + 64, &espeed); + + n = snprintf(link_str, 256, "(%3s %18s %6s/%8s)", + mad_dump_val(IB_PORT_LINK_WIDTH_ACTIVE_F, width, 64, + &iwidth), + speed, + mad_dump_val(IB_PORT_STATE_F, state, 64, &istate), + mad_dump_val(IB_PORT_PHYS_STATE_F, physstate, 64, + &iphystate)); + } else { + n = snprintf(link_str, 256, "( %6s/%8s)", + mad_dump_val(IB_PORT_STATE_F, state, 64, &istate), + mad_dump_val(IB_PORT_PHYS_STATE_F, physstate, 64, + &iphystate)); + } + + /* again default values due to C14-24.2.1 */ + if (add_sw_settings && istate != IB_LINK_DOWN) { + snprintf(link_str + n, 256 - n, + " (HOQ:%d VL_Stall:%d)", + mad_get_field(port->info, 0, + IB_PORT_HOQ_LIFE_F), + mad_get_field(port->info, 0, + IB_PORT_VL_STALL_COUNT_F)); + } + + if (port->remoteport) { + char *remap = + remap_node_name(node_name_map, port->remoteport->node->guid, + port->remoteport->node->nodedesc); + + if (port->remoteport->ext_portnum) + snprintf(ext_port_str, 256, "%d", + port->remoteport->ext_portnum); + else + ext_port_str[0] = '\0'; + + get_max_msg(width_msg, speed_msg, 256, port); + + if (line_mode) { + snprintf(remote_guid_str, 256, + "0x%016" PRIx64 " ", + port->remoteport->guid); + } + + rc = snprintf(remote_str, sizeof(remote_str), + "%s%6d %4d[%2s] \"%s\" (%s %s)\n", + remote_guid_str, port->remoteport->base_lid ? 
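			      /* a switch's external ports carry no LID of
			       * their own (base_lid == 0 there), so fall
			       * back to the switch management LID */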
+ port->remoteport->base_lid : + port->remoteport->node->smalid, + port->remoteport->portnum, ext_port_str, remap, + width_msg, speed_msg); + if (rc > sizeof(remote_str)) + fprintf(stderr, "WARN: string buffer overflow\n"); + + free(remap); + } else { + if (istate == IB_LINK_DOWN) + snprintf(remote_str, 256, " [ ] \"\" ( )\n"); + else + snprintf(remote_str, 256, " \"Port not available\"\n"); + } + + if (port->ext_portnum) + snprintf(ext_port_str, 256, "%d", port->ext_portnum); + else + ext_port_str[0] = '\0'; + + if (line_mode) { + char *remap = remap_node_name(node_name_map, node->guid, + node->nodedesc); + printf("%s0x%016" PRIx64 " \"%30s\" ", + out_prefix ? out_prefix : "", + port->guid, remap); + free(remap); + } else + printf("%s ", out_prefix ? out_prefix : ""); + + if (port->node->type != IB_NODE_SWITCH) { + if (!line_mode) + printf("0x%016" PRIx64 " ", port->guid); + + printf("%6d %4d[%2s] ==%s==> %s", + port->base_lid, + port->portnum, ext_port_str, link_str, remote_str); + } else + printf("%6d %4d[%2s] ==%s==> %s", + node->smalid, port->portnum, ext_port_str, + link_str, remote_str); +} + +static inline const char *nodetype_str(ibnd_node_t * node) +{ + switch (node->type) { + case IB_NODE_SWITCH: + return "Switch"; + case IB_NODE_CA: + return "CA"; + case IB_NODE_ROUTER: + return "Router"; + } + return "??"; +} + +static void print_node_header(ibnd_node_t *node, int *out_header_flag, + const char *out_prefix) +{ + uint64_t guid = 0; + if ((!out_header_flag || !(*out_header_flag)) && !line_mode) { + char *remap = + remap_node_name(node_name_map, node->guid, node->nodedesc); + if (node->type == IB_NODE_SWITCH) { + if (node->ports[0]) + guid = node->ports[0]->guid; + else + guid = mad_get_field64(node->info, 0, IB_NODE_PORT_GUID_F); + + printf("%s%s: 0x%016" PRIx64 " %s:\n", + out_prefix ? out_prefix : "", + nodetype_str(node), + guid, + remap); + } else + printf("%s%s: %s:\n", + out_prefix ? 
out_prefix : "", + nodetype_str(node), remap); + (*out_header_flag)++; + free(remap); + } +} + +static void print_node(ibnd_node_t *node, void *user_data) +{ + int i = 0; + int head_print = 0; + char *out_prefix = (char *)user_data; + + for (i = 1; i <= node->numports; i++) { + ibnd_port_t *port = node->ports[i]; + if (!port) + continue; + if (!down_links_only || + mad_get_field(port->info, 0, + IB_PORT_STATE_F) == IB_LINK_DOWN) { + print_node_header(node, &head_print, out_prefix); + print_port(node, port, out_prefix); + } + } +} + +struct iter_diff_data { + uint32_t diff_flags; + ibnd_fabric_t *fabric1; + ibnd_fabric_t *fabric2; + const char *fabric1_prefix; + const char *fabric2_prefix; +}; + +static void diff_node_ports(ibnd_node_t *fabric1_node, + ibnd_node_t *fabric2_node, int *head_print, + struct iter_diff_data *data) +{ + int i = 0; + + for (i = 1; i <= fabric1_node->numports; i++) { + ibnd_port_t *fabric1_port, *fabric2_port; + int output_diff = 0; + + fabric1_port = fabric1_node->ports[i]; + fabric2_port = fabric2_node->ports[i]; + + if (!fabric1_port && !fabric2_port) + continue; + + if (data->diff_flags & DIFF_FLAG_PORT_CONNECTION) { + if ((fabric1_port && !fabric2_port) + || (!fabric1_port && fabric2_port) + || (fabric1_port->remoteport + && !fabric2_port->remoteport) + || (!fabric1_port->remoteport + && fabric2_port->remoteport) + || (fabric1_port->remoteport + && fabric2_port->remoteport + && fabric1_port->remoteport->guid != + fabric2_port->remoteport->guid)) + output_diff++; + } + + /* if either fabric1_port or fabric2_port NULL, should be + * handled by port connection diff code + */ + if (data->diff_flags & DIFF_FLAG_PORT_STATE + && fabric1_port + && fabric2_port) { + int state1, state2; + + state1 = mad_get_field(fabric1_port->info, 0, + IB_PORT_STATE_F); + state2 = mad_get_field(fabric2_port->info, 0, + IB_PORT_STATE_F); + + if (state1 != state2) + output_diff++; + } + + if (data->diff_flags & DIFF_FLAG_PORT_CONNECTION + && data->diff_flags & DIFF_FLAG_LID + && fabric1_port && fabric2_port + && fabric1_port->remoteport && fabric2_port->remoteport + && fabric1_port->remoteport->base_lid != fabric2_port->remoteport->base_lid) + output_diff++; + + if (data->diff_flags & DIFF_FLAG_PORT_CONNECTION + && data->diff_flags & DIFF_FLAG_NODE_DESCRIPTION + && fabric1_port && fabric2_port + && fabric1_port->remoteport && fabric2_port->remoteport + && memcmp(fabric1_port->remoteport->node->nodedesc, + fabric2_port->remoteport->node->nodedesc, + IB_SMP_DATA_SIZE)) + output_diff++; + + if (output_diff && fabric1_port) { + print_node_header(fabric1_node, + head_print, + NULL); + print_port(fabric1_node, + fabric1_port, + data->fabric1_prefix); + } + + if (output_diff && fabric2_port) { + print_node_header(fabric1_node, + head_print, + NULL); + print_port(fabric2_node, + fabric2_port, + data->fabric2_prefix); + } + } +} + +static void diff_node_iter(ibnd_node_t *fabric1_node, void *iter_user_data) +{ + struct iter_diff_data *data = iter_user_data; + ibnd_node_t *fabric2_node; + int head_print = 0; + + DEBUG("DEBUG: fabric1_node %p\n", fabric1_node); + + fabric2_node = ibnd_find_node_guid(data->fabric2, fabric1_node->guid); + if (!fabric2_node) + print_node(fabric1_node, (void *)data->fabric1_prefix); + else if (data->diff_flags & + (DIFF_FLAG_PORT_CONNECTION | DIFF_FLAG_PORT_STATE + | DIFF_FLAG_LID | DIFF_FLAG_NODE_DESCRIPTION)) { + + if ((fabric1_node->type == IB_NODE_SWITCH + && data->diff_flags & DIFF_FLAG_LID + && fabric1_node->smalid != fabric2_node->smalid) || + (data->diff_flags 
& DIFF_FLAG_NODE_DESCRIPTION + && memcmp(fabric1_node->nodedesc, fabric2_node->nodedesc, + IB_SMP_DATA_SIZE))) { + print_node_header(fabric1_node, + NULL, + data->fabric1_prefix); + print_node_header(fabric2_node, + NULL, + data->fabric2_prefix); + head_print++; + } + + if (fabric1_node->numports != fabric2_node->numports) { + print_node_header(fabric1_node, + &head_print, + NULL); + printf("%snumports = %d\n", data->fabric1_prefix, + fabric1_node->numports); + printf("%snumports = %d\n", data->fabric2_prefix, + fabric2_node->numports); + return; + } + + diff_node_ports(fabric1_node, fabric2_node, + &head_print, data); + } +} + +static int diff_node(ibnd_node_t *node, ibnd_fabric_t *orig_fabric, + ibnd_fabric_t *new_fabric) +{ + struct iter_diff_data iter_diff_data; + + iter_diff_data.diff_flags = diffcheck_flags; + iter_diff_data.fabric1 = orig_fabric; + iter_diff_data.fabric2 = new_fabric; + iter_diff_data.fabric1_prefix = "< "; + iter_diff_data.fabric2_prefix = "> "; + if (node) + diff_node_iter(node, &iter_diff_data); + else { + if (only_flag) + ibnd_iter_nodes_type(orig_fabric, diff_node_iter, + only_type, &iter_diff_data); + else + ibnd_iter_nodes(orig_fabric, diff_node_iter, + &iter_diff_data); + } + + /* Do opposite diff to find existence of node types + * in new_fabric but not in orig_fabric. + * + * In this diff, we don't need to check port connections, + * port state, lids, or node descriptions since it has already + * been done (i.e. checks are only done when guid exists on both + * orig and new). + */ + iter_diff_data.diff_flags = diffcheck_flags & ~DIFF_FLAG_PORT_CONNECTION; + iter_diff_data.diff_flags &= ~DIFF_FLAG_PORT_STATE; + iter_diff_data.diff_flags &= ~DIFF_FLAG_LID; + iter_diff_data.diff_flags &= ~DIFF_FLAG_NODE_DESCRIPTION; + iter_diff_data.fabric1 = new_fabric; + iter_diff_data.fabric2 = orig_fabric; + iter_diff_data.fabric1_prefix = "> "; + iter_diff_data.fabric2_prefix = "< "; + if (node) + diff_node_iter(node, &iter_diff_data); + else { + if (only_flag) + ibnd_iter_nodes_type(new_fabric, diff_node_iter, + only_type, &iter_diff_data); + else + ibnd_iter_nodes(new_fabric, diff_node_iter, + &iter_diff_data); + } + + return 0; +} + +static int process_opt(void *context, int ch) +{ + struct ibnd_config *cfg = context; + char *p; + + switch (ch) { + case 1: + node_name_map_file = strdup(optarg); + if (node_name_map_file == NULL) + IBEXIT("out of memory, strdup for node_name_map_file name failed"); + break; + case 2: + load_cache_file = strdup(optarg); + break; + case 3: + diff_cache_file = strdup(optarg); + break; + case 4: + diffcheck_flags = 0; + p = strtok(optarg, ","); + while (p) { + if (!strcasecmp(p, "port")) + diffcheck_flags |= DIFF_FLAG_PORT_CONNECTION; + else if (!strcasecmp(p, "state")) + diffcheck_flags |= DIFF_FLAG_PORT_STATE; + else if (!strcasecmp(p, "lid")) + diffcheck_flags |= DIFF_FLAG_LID; + else if (!strcasecmp(p, "nodedesc")) + diffcheck_flags |= DIFF_FLAG_NODE_DESCRIPTION; + else { + fprintf(stderr, "invalid diff check key: %s\n", + p); + return -1; + } + p = strtok(NULL, ","); + } + break; + case 5: + filterdownports_cache_file = strdup(optarg); + break; + case 6: + only_flag = 1; + only_type = IB_NODE_SWITCH; + break; + case 7: + only_flag = 1; + only_type = IB_NODE_CA; + break; + case 'S': + case 'G': + node_label.guid_str = optarg; + node_label.guid = (uint64_t)strtoull(node_label.guid_str, NULL, 0); + break; + case 'D': + dr_path = strdup(optarg); + break; + case 'a': + all = 1; + break; + case 'n': + cfg->max_hops = strtoul(optarg, NULL, 
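/*
 * Note: diff_node() above runs the comparison twice -- once with
 * orig_fabric as the baseline ("< " prefix), then reversed ("> "
 * prefix). The reverse pass only has to report nodes that exist in
 * new_fabric but not in orig_fabric, so every content check is masked
 * out first; otherwise each mismatch would print twice. The mask
 * manipulation, spelled out:
 *
 *   unsigned reverse_flags = diffcheck_flags;
 *
 *   reverse_flags &= ~(DIFF_FLAG_PORT_CONNECTION | DIFF_FLAG_PORT_STATE |
 *                      DIFF_FLAG_LID | DIFF_FLAG_NODE_DESCRIPTION);
 *   // reverse_flags now selects only "does this node exist at all"
 *   // reporting for the new_fabric -> orig_fabric direction.
 */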
0); + break; + case 'd': + down_links_only = 1; + break; + case 'l': + line_mode = 1; + break; + case 'p': + add_sw_settings = 1; + break; + case 'R': /* nop */ + break; + case 'o': + cfg->max_smps = strtoul(optarg, NULL, 0); + break; + default: + return -1; + } + + return 0; +} + +int main(int argc, char **argv) +{ + struct ibnd_config config = { 0 }; + int rc = 0; + int resolved = -1; + ibnd_fabric_t *fabric = NULL; + ibnd_fabric_t *diff_fabric = NULL; + struct ibmad_port *ibmad_port; + ib_portid_t port_id = { 0 }; + uint8_t ni[IB_SMP_DATA_SIZE] = { 0 }; + int mgmt_classes[3] = + { IB_SMI_CLASS, IB_SMI_DIRECT_CLASS, IB_SA_CLASS }; + + const struct ibdiag_opt opts[] = { + {"node-name-map", 1, 1, "<file>", "node name map file"}, + {"switch", 'S', 1, "<port_guid>", + "start partial scan at the port specified by <port_guid> (hex format)"}, + {"port-guid", 'G', 1, "<port_guid>", + "(same as -S)"}, + {"Direct", 'D', 1, "<dr_path>", + "start partial scan at the port specified by <dr_path>"}, + {"all", 'a', 0, NULL, + "print all nodes found in a partial fabric scan"}, + {"hops", 'n', 1, "<hops>", + "Number of hops to include away from specified node"}, + {"down", 'd', 0, NULL, "print only down links"}, + {"line", 'l', 0, NULL, + "(line mode) print all information for each link on a single line"}, + {"additional", 'p', 0, NULL, + "print additional port settings (PktLifeTime, HoqLife, VLStallCount)"}, + {"load-cache", 2, 1, "<file>", + "filename of ibnetdiscover cache to load"}, + {"diff", 3, 1, "<file>", + "filename of ibnetdiscover cache to diff"}, + {"diffcheck", 4, 1, "<key(s)>", + "specify checks to execute for --diff"}, + {"filterdownports", 5, 1, "<file>", + "filename of ibnetdiscover cache to filter downports"}, + {"outstanding_smps", 'o', 1, NULL, + "specify the number of outstanding SMP's which should be " + "issued during the scan"}, + {"switches-only", 6, 0, NULL, + "Output only switches"}, + {"cas-only", 7, 0, NULL, + "Output only CAs"}, + {} + }; + char usage_args[] = ""; + + ibdiag_process_opts(argc, argv, &config, "aDdGgKLlnpRS", opts, + process_opt, usage_args, NULL); + + argc -= optind; + argv += optind; + + ibmad_port = mad_rpc_open_port(ibd_ca, ibd_ca_port, mgmt_classes, 3); + if (!ibmad_port) { + fprintf(stderr, "Failed to open %s port %d\n", ibd_ca, + ibd_ca_port); + exit(1); + } + + smp_mkey_set(ibmad_port, ibd_mkey); + + if (ibd_timeout) { + mad_rpc_set_timeout(ibmad_port, ibd_timeout); + config.timeout_ms = ibd_timeout; + } + + config.flags = ibd_ibnetdisc_flags; + config.mkey = ibd_mkey; + + node_name_map = open_node_name_map(node_name_map_file); + + if (dr_path && load_cache_file) { + mad_rpc_close_port(ibmad_port); + fprintf(stderr, "Cannot specify cache and direct route path\n"); + exit(1); + } + + if (dr_path) { + /* only scan part of the fabric */ + if ((resolved = + resolve_portid_str(ibd_ca, ibd_ca_port, &port_id, dr_path, + IB_DEST_DRPATH, NULL, ibmad_port)) < 0) + IBWARN("Failed to resolve %s; attempting full scan", + dr_path); + } else if (node_label.guid_str) { + if ((resolved = resolve_portid_str( + ibd_ca, ibd_ca_port, &port_id, node_label.guid_str, + IB_DEST_GUID, NULL, ibmad_port)) < 0) + IBWARN("Failed to resolve %s; attempting full scan\n", + node_label.guid_str); + } + + if (!all && dr_path) { + if (!smp_query_via(ni, &port_id, IB_ATTR_NODE_INFO, 0, + ibd_timeout, ibmad_port)){ + mad_rpc_close_port(ibmad_port); + fprintf(stderr, "Failed to get local Node Info\n"); + exit(1); + } + } + mad_rpc_close_port(ibmad_port); + + if (diff_cache_file && + 
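/*
 * Note: the opts[] table above gives long-only options small numeric ids
 * (1..7) instead of printable short-option letters, and process_opt()
 * dispatches on that id. The same idea with plain getopt_long(3), as a
 * standalone sketch (not the ibdiag_process_opts() wrapper used here):
 *
 *   #include <getopt.h>
 *   #include <stdio.h>
 *
 *   int main(int argc, char **argv)
 *   {
 *           static const struct option longopts[] = {
 *                   { "node-name-map", required_argument, NULL, 1 },
 *                   { "load-cache",    required_argument, NULL, 2 },
 *                   { 0 }
 *           };
 *           int ch;
 *
 *           while ((ch = getopt_long(argc, argv, "", longopts, NULL)) != -1)
 *                   switch (ch) {
 *                   case 1: printf("map file: %s\n", optarg); break;
 *                   case 2: printf("cache:    %s\n", optarg); break;
 *                   }
 *           return 0;
 *   }
 */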
!(diff_fabric = ibnd_load_fabric(diff_cache_file, 0))) + IBEXIT("loading cached fabric for diff failed\n"); + + if (filterdownports_cache_file && + !(filterdownports_fabric = ibnd_load_fabric(filterdownports_cache_file, 0))) + IBEXIT("loading cached fabric for filterdownports failed\n"); + + if (load_cache_file) { + if ((fabric = ibnd_load_fabric(load_cache_file, 0)) == NULL) { + fprintf(stderr, "loading cached fabric failed\n"); + exit(1); + } + } else { + if (resolved >= 0) { + if (!config.max_hops) + config.max_hops = 1; + if (!(fabric = + ibnd_discover_fabric(ibd_ca, ibd_ca_port, &port_id, &config))) + IBWARN("Partial fabric scan failed;" + " attempting full scan\n"); + } + + if (!fabric && + !(fabric = ibnd_discover_fabric(ibd_ca, ibd_ca_port, NULL, &config))) { + fprintf(stderr, "discover failed\n"); + rc = 1; + goto close_port; + } + } + + if (!all && node_label.guid_str) { + ibnd_port_t *p = ibnd_find_port_guid(fabric, node_label.guid); + if (p && (!only_flag || p->node->type == only_type)) { + ibnd_node_t *n = p->node; + if (diff_fabric) + diff_node(n, diff_fabric, fabric); + else + print_node(n, NULL); + } + else + fprintf(stderr, "Failed to find port: %s\n", node_label.guid_str); + } else if (!all && dr_path) { + ibnd_port_t *p = NULL; + mad_decode_field(ni, IB_NODE_PORT_GUID_F, &node_label.guid); + + p = ibnd_find_port_guid(fabric, node_label.guid); + if (p && (!only_flag || p->node->type == only_type)) { + ibnd_node_t *n = p->node; + if (diff_fabric) + diff_node(n, diff_fabric, fabric); + else + print_node(n, NULL); + } + else + fprintf(stderr, "Failed to find port: %s\n", dr_path); + } else { + if (diff_fabric) + diff_node(NULL, diff_fabric, fabric); + else { + if (only_flag) + ibnd_iter_nodes_type(fabric, print_node, + only_type, NULL); + else + ibnd_iter_nodes(fabric, print_node, NULL); + } + } + + ibnd_destroy_fabric(fabric); + if (diff_fabric) + ibnd_destroy_fabric(diff_fabric); + +close_port: + close_node_name_map(node_name_map); + exit(rc); +} diff --git a/infiniband-diags/ibnetdiscover.c b/infiniband-diags/ibnetdiscover.c new file mode 100644 index 0000000..88e44f1 --- /dev/null +++ b/infiniband-diags/ibnetdiscover.c @@ -0,0 +1,1154 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * Copyright (c) 2007 Xsigo Systems Inc. All rights reserved. + * Copyright (c) 2008 Lawrence Livermore National Lab. All rights reserved. + * Copyright (c) 2010,2011 Mellanox Technologies LTD. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <time.h> +#include <string.h> +#include <inttypes.h> + +#include <infiniband/umad.h> +#include <infiniband/mad.h> +#include <util/node_name_map.h> +#include <infiniband/ibnetdisc.h> + +#include "ibdiag_common.h" + +#define LIST_CA_NODE (1 << IB_NODE_CA) +#define LIST_SWITCH_NODE (1 << IB_NODE_SWITCH) +#define LIST_ROUTER_NODE (1 << IB_NODE_ROUTER) + +#define DIFF_FLAG_SWITCH 0x01 +#define DIFF_FLAG_CA 0x02 +#define DIFF_FLAG_ROUTER 0x04 +#define DIFF_FLAG_PORT_CONNECTION 0x08 +#define DIFF_FLAG_LID 0x10 +#define DIFF_FLAG_NODE_DESCRIPTION 0x20 + +#define DIFF_FLAG_DEFAULT (DIFF_FLAG_SWITCH | DIFF_FLAG_CA | DIFF_FLAG_ROUTER \ + | DIFF_FLAG_PORT_CONNECTION) + +static FILE *f; + +static char *node_name_map_file = NULL; +static nn_map_t *node_name_map = NULL; +static char *cache_file = NULL; +static char *load_cache_file = NULL; +static char *diff_cache_file = NULL; +static unsigned diffcheck_flags = DIFF_FLAG_DEFAULT; + +static int report_max_hops = 0; +static int full_info; + +/** + * Define our own conversion functions to maintain compatibility with the old + * ibnetdiscover which did not use the ibmad conversion functions. + */ +static const char *dump_linkspeed_compat(uint32_t speed) +{ + switch (speed) { + case 1: + return ("SDR"); + break; + case 2: + return ("DDR"); + break; + case 4: + return ("QDR"); + break; + } + return ("???"); +} + +static const char *dump_linkspeedext_compat(uint32_t espeed, uint32_t speed, + uint32_t fdr10) +{ + switch (espeed) { + case 0: + if (fdr10 & FDR10) + return ("FDR10"); + else + return dump_linkspeed_compat(speed); + break; + case 1: + return ("FDR"); + break; + case 2: + return ("EDR"); + break; + case 4: + return ("HDR"); + break; + } + return ("???"); +} + +static const char *dump_linkwidth_compat(uint32_t width) +{ + switch (width) { + case 1: + return ("1x"); + break; + case 2: + return ("4x"); + break; + case 4: + return ("8x"); + break; + case 8: + return ("12x"); + break; + case 16: + return ("2x"); + break; + } + return ("??"); +} + +static inline const char *ports_nt_str_compat(ibnd_node_t * node) +{ + switch (node->type) { + case IB_NODE_SWITCH: + return "SW"; + case IB_NODE_CA: + return "CA"; + case IB_NODE_ROUTER: + return "RT"; + } + return "??"; +} + +static char *node_name(ibnd_node_t * node) +{ + static char buf[256]; + + switch (node->type) { + case IB_NODE_SWITCH: + sprintf(buf, "\"%s", "S"); + break; + case IB_NODE_CA: + sprintf(buf, "\"%s", "H"); + break; + case IB_NODE_ROUTER: + sprintf(buf, "\"%s", "R"); + break; + default: + sprintf(buf, "\"%s", "?"); + break; + } + sprintf(buf + 2, "-%016" PRIx64 "\"", node->guid); + + return buf; +} + +static void list_node(ibnd_node_t *node, void *user_data) +{ + const char *node_type; + char *nodename = remap_node_name(node_name_map, node->guid, + node->nodedesc); + + switch (node->type) { + case IB_NODE_SWITCH: + node_type = "Switch"; + break; + case IB_NODE_CA: + node_type = "Ca"; + break; + case IB_NODE_ROUTER: + node_type = "Router"; + break; + default: + node_type = "???"; + break; + } + fprintf(f, + "%s\t : 0x%016" PRIx64 + " ports %d devid 0x%x vendid 0x%x \"%s\"\n", node_type, + node->guid, node->numports, 
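/*
 * Note: LinkSpeedActive and LinkWidthActive are bit-encoded, one bit per
 * rate, so the compat dumpers above switch on the single set bit (speed:
 * 1=SDR 2.5 Gbps, 2=DDR 5 Gbps, 4=QDR 10 Gbps per lane; width: 1=1x,
 * 2=4x, 4=8x, 8=12x, 16=2x). Usage sketch of the helpers defined above:
 *
 *   unsigned width = 2, speed = 2, espeed = 0, fdr10 = 0;
 *
 *   printf("%s %s\n", dump_linkwidth_compat(width),
 *          dump_linkspeedext_compat(espeed, speed, fdr10));
 *   // Prints "4x DDR": with no extended speed and no FDR10 bit set,
 *   // dump_linkspeedext_compat() falls through to the base encoding.
 */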
mad_get_field(node->info, 0, + IB_NODE_DEVID_F), + mad_get_field(node->info, 0, IB_NODE_VENDORID_F), nodename); + + free(nodename); +} + +static void list_nodes(ibnd_fabric_t *fabric, int list) +{ + if (list & LIST_CA_NODE) + ibnd_iter_nodes_type(fabric, list_node, IB_NODE_CA, NULL); + if (list & LIST_SWITCH_NODE) + ibnd_iter_nodes_type(fabric, list_node, IB_NODE_SWITCH, NULL); + if (list & LIST_ROUTER_NODE) + ibnd_iter_nodes_type(fabric, list_node, IB_NODE_ROUTER, NULL); +} + +static void out_ids(ibnd_node_t *node, int group, char *chname, + const char *out_prefix) +{ + uint64_t sysimgguid = + mad_get_field64(node->info, 0, IB_NODE_SYSTEM_GUID_F); + + fprintf(f, "\n%svendid=0x%x\n", out_prefix ? out_prefix : "", + mad_get_field(node->info, 0, IB_NODE_VENDORID_F)); + fprintf(f, "%sdevid=0x%x\n", out_prefix ? out_prefix : "", + mad_get_field(node->info, 0, IB_NODE_DEVID_F)); + if (sysimgguid) + fprintf(f, "%ssysimgguid=0x%" PRIx64, + out_prefix ? out_prefix : "", sysimgguid); + if (group && node->chassis && node->chassis->chassisnum) { + fprintf(f, "\t\t# Chassis %d", node->chassis->chassisnum); + if (chname) + fprintf(f, " (%s)", clean_nodedesc(chname)); + if (ibnd_is_xsigo_tca(node->guid) && node->ports[1] && + node->ports[1]->remoteport) + fprintf(f, " slot %d", + node->ports[1]->remoteport->portnum); + } + if (sysimgguid || + (group && node->chassis && node->chassis->chassisnum)) + fprintf(f, "\n"); +} + +static uint64_t out_chassis(ibnd_fabric_t *fabric, unsigned char chassisnum) +{ + uint64_t guid; + + fprintf(f, "\nChassis %u", chassisnum); + guid = ibnd_get_chassis_guid(fabric, chassisnum); + if (guid) + fprintf(f, " (guid 0x%" PRIx64 ")", guid); + fprintf(f, "\n"); + return guid; +} + +static void out_switch_detail(ibnd_node_t *node, const char *sw_prefix) +{ + char *nodename = NULL; + + nodename = remap_node_name(node_name_map, node->guid, node->nodedesc); + + fprintf(f, "%sSwitch\t%d %s\t\t# \"%s\" %s port 0 lid %d lmc %d", + sw_prefix ? sw_prefix : "", node->numports, node_name(node), + nodename, node->smaenhsp0 ? "enhanced" : "base", + node->smalid, node->smalmc); + + free(nodename); +} + +static void out_switch(ibnd_node_t *node, int group, char *chname, + const char *id_prefix, const char *sw_prefix) +{ + const char *str; + char str2[256]; + + out_ids(node, group, chname, id_prefix); + fprintf(f, "%sswitchguid=0x%" PRIx64, + id_prefix ? id_prefix : "", node->guid); + fprintf(f, "(%" PRIx64 ")", + mad_get_field64(node->info, 0, IB_NODE_PORT_GUID_F)); + if (group) { + fprintf(f, "\t# "); + str = ibnd_get_chassis_type(node); + if (str) + fprintf(f, "%s ", str); + str = ibnd_get_chassis_slot_str(node, str2, 256); + if (str) + fprintf(f, "%s", str); + } + fprintf(f, "\n"); + + out_switch_detail(node, sw_prefix); + fprintf(f, "\n"); +} + +static void out_ca_detail(ibnd_node_t *node, const char *ca_prefix) +{ + const char *node_type; + + switch (node->type) { + case IB_NODE_CA: + node_type = "Ca"; + break; + case IB_NODE_ROUTER: + node_type = "Rt"; + break; + default: + node_type = "???"; + break; + } + + fprintf(f, "%s%s\t%d %s\t\t# \"%s\"", ca_prefix ? 
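/*
 * Note: node_name() above formats into a single static buffer and
 * returns a pointer to it, so each call overwrites the previous result.
 * That is safe in the printers shown here, which evaluate it at most
 * once per format string, but it is a hazard when extending them.
 * Sketch, with a and b standing for two ibnd_node_t pointers:
 *
 *   // BROKEN: both arguments point at the same static buffer, so both
 *   // %s specifiers would print the same (last-evaluated) name.
 *   // fprintf(f, "%s -> %s\n", node_name(a), node_name(b));
 *
 *   // Safe: stage one result before computing the next.
 *   char first[256];
 *   snprintf(first, sizeof(first), "%s", node_name(a));
 *   fprintf(f, "%s -> %s\n", first, node_name(b));
 */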
ca_prefix : "", + node_type, node->numports, node_name(node), + clean_nodedesc(node->nodedesc)); +} + +static void out_ca(ibnd_node_t *node, int group, char *chname, + const char *id_prefix, const char *ca_prefix) +{ + const char *node_type; + + out_ids(node, group, chname, id_prefix); + switch (node->type) { + case IB_NODE_CA: + node_type = "ca"; + break; + case IB_NODE_ROUTER: + node_type = "rt"; + break; + default: + node_type = "???"; + break; + } + + fprintf(f, "%s%sguid=0x%" PRIx64 "\n", + id_prefix ? id_prefix : "", node_type, node->guid); + out_ca_detail(node, ca_prefix); + if (group && ibnd_is_xsigo_hca(node->guid)) + fprintf(f, " (scp)"); + fprintf(f, "\n"); +} + +#define OUT_BUFFER_SIZE 16 +static char *out_ext_port(ibnd_port_t * port, int group) +{ + static char mapping[OUT_BUFFER_SIZE]; + + if (group && port->ext_portnum != 0) { + snprintf(mapping, OUT_BUFFER_SIZE, + "[ext %d]", port->ext_portnum); + return (mapping); + } + + return (NULL); +} + +static void out_switch_port(ibnd_port_t *port, int group, + const char *out_prefix) +{ + char *ext_port_str = NULL; + char *rem_nodename = NULL; + uint32_t iwidth = mad_get_field(port->info, 0, + IB_PORT_LINK_WIDTH_ACTIVE_F); + uint32_t ispeed = mad_get_field(port->info, 0, + IB_PORT_LINK_SPEED_ACTIVE_F); + uint32_t vlcap = mad_get_field(port->info, 0, + IB_PORT_VL_CAP_F); + uint32_t fdr10 = mad_get_field(port->ext_info, 0, + IB_MLNX_EXT_PORT_LINK_SPEED_ACTIVE_F); + uint32_t cap_mask, espeed; + + DEBUG("port %p:%d remoteport %p\n", port, port->portnum, + port->remoteport); + fprintf(f, "%s[%d]", out_prefix ? out_prefix : "", port->portnum); + + ext_port_str = out_ext_port(port, group); + if (ext_port_str) + fprintf(f, "%s", ext_port_str); + + rem_nodename = remap_node_name(node_name_map, + port->remoteport->node->guid, + port->remoteport->node->nodedesc); + + ext_port_str = out_ext_port(port->remoteport, group); + + if (!port->node->ports[0]) { + cap_mask = 0; + ispeed = 0; + espeed = 0; + } else { + cap_mask = mad_get_field(port->node->ports[0]->info, 0, + IB_PORT_CAPMASK_F); + if (cap_mask & be32toh(IB_PORT_CAP_HAS_EXT_SPEEDS)) + espeed = mad_get_field(port->info, 0, + IB_PORT_LINK_SPEED_EXT_ACTIVE_F); + else + espeed = 0; + } + fprintf(f, "\t%s[%d]%s", + node_name(port->remoteport->node), port->remoteport->portnum, + ext_port_str ? ext_port_str : ""); + if (port->remoteport->node->type != IB_NODE_SWITCH) + fprintf(f, "(%" PRIx64 ") ", port->remoteport->guid); + fprintf(f, "\t\t# \"%s\" lid %d %s%s", + rem_nodename, + port->remoteport->node->type == IB_NODE_SWITCH ? + port->remoteport->node->smalid : + port->remoteport->base_lid, + dump_linkwidth_compat(iwidth), + (ispeed != 4 && !espeed) ? 
+ dump_linkspeed_compat(ispeed) : + dump_linkspeedext_compat(espeed, ispeed, fdr10)); + + if (full_info) + fprintf(f, " s=%d w=%d v=%d", ispeed, iwidth, vlcap); + + if (ibnd_is_xsigo_tca(port->remoteport->guid)) + fprintf(f, " slot %d", port->portnum); + else if (ibnd_is_xsigo_hca(port->remoteport->guid)) + fprintf(f, " (scp)"); + fprintf(f, "\n"); + + free(rem_nodename); +} + +static void out_ca_port(ibnd_port_t *port, int group, const char *out_prefix) +{ + char *str = NULL; + char *rem_nodename = NULL; + uint32_t iwidth = mad_get_field(port->info, 0, + IB_PORT_LINK_WIDTH_ACTIVE_F); + uint32_t ispeed = mad_get_field(port->info, 0, + IB_PORT_LINK_SPEED_ACTIVE_F); + uint32_t vlcap = mad_get_field(port->info, 0, + IB_PORT_VL_CAP_F); + uint32_t fdr10 = mad_get_field(port->ext_info, 0, + IB_MLNX_EXT_PORT_LINK_SPEED_ACTIVE_F); + uint32_t cap_mask, espeed; + + fprintf(f, "%s[%d]", out_prefix ? out_prefix : "", port->portnum); + if (port->node->type != IB_NODE_SWITCH) + fprintf(f, "(%" PRIx64 ") ", port->guid); + fprintf(f, "\t%s[%d]", + node_name(port->remoteport->node), port->remoteport->portnum); + str = out_ext_port(port->remoteport, group); + if (str) + fprintf(f, "%s", str); + if (port->remoteport->node->type != IB_NODE_SWITCH) + fprintf(f, " (%" PRIx64 ") ", port->remoteport->guid); + + rem_nodename = remap_node_name(node_name_map, + port->remoteport->node->guid, + port->remoteport->node->nodedesc); + + cap_mask = mad_get_field(port->info, 0, IB_PORT_CAPMASK_F); + if (cap_mask & be32toh(IB_PORT_CAP_HAS_EXT_SPEEDS)) + espeed = mad_get_field(port->info, 0, + IB_PORT_LINK_SPEED_EXT_ACTIVE_F); + else + espeed = 0; + + fprintf(f, "\t\t# lid %d lmc %d \"%s\" lid %d %s%s", + port->base_lid, port->lmc, rem_nodename, + port->remoteport->node->type == IB_NODE_SWITCH ? + port->remoteport->node->smalid : + port->remoteport->base_lid, + dump_linkwidth_compat(iwidth), + (ispeed != 4 && !espeed) ? 
+ dump_linkspeed_compat(ispeed) : + dump_linkspeedext_compat(espeed, ispeed, fdr10)); + + if (full_info) + fprintf(f, " s=%d w=%d v=%d", ispeed, iwidth, vlcap); + fprintf(f, "\n"); + + free(rem_nodename); +} + +struct iter_user_data { + int group; + int skip_chassis_nodes; +}; + +static void switch_iter_func(ibnd_node_t * node, void *iter_user_data) +{ + ibnd_port_t *port; + int p = 0; + struct iter_user_data *data = (struct iter_user_data *)iter_user_data; + + DEBUG("SWITCH: node %p\n", node); + + /* skip chassis based switches if flagged */ + if (data->skip_chassis_nodes && node->chassis + && node->chassis->chassisnum) + return; + + out_switch(node, data->group, NULL, NULL, NULL); + for (p = 1; p <= node->numports; p++) { + port = node->ports[p]; + if (port && port->remoteport) + out_switch_port(port, data->group, NULL); + } +} + +static void ca_iter_func(ibnd_node_t * node, void *iter_user_data) +{ + ibnd_port_t *port; + int p = 0; + struct iter_user_data *data = (struct iter_user_data *)iter_user_data; + + DEBUG("CA: node %p\n", node); + /* Now, skip chassis based CAs */ + if (data->group && node->chassis && node->chassis->chassisnum) + return; + out_ca(node, data->group, NULL, NULL, NULL); + + for (p = 1; p <= node->numports; p++) { + port = node->ports[p]; + if (port && port->remoteport) + out_ca_port(port, data->group, NULL); + } +} + +static void router_iter_func(ibnd_node_t * node, void *iter_user_data) +{ + ibnd_port_t *port; + int p = 0; + struct iter_user_data *data = (struct iter_user_data *)iter_user_data; + + DEBUG("RT: node %p\n", node); + /* Now, skip chassis based RTs */ + if (data->group && node->chassis && node->chassis->chassisnum) + return; + out_ca(node, data->group, NULL, NULL, NULL); + for (p = 1; p <= node->numports; p++) { + port = node->ports[p]; + if (port && port->remoteport) + out_ca_port(port, data->group, NULL); + } +} + +static int dump_topology(int group, ibnd_fabric_t *fabric) +{ + ibnd_node_t *node; + ibnd_port_t *port; + int i = 0, p = 0; + time_t t = time(NULL); + uint64_t chguid; + char *chname = NULL; + struct iter_user_data iter_user_data; + + fprintf(f, "#\n# Topology file: generated on %s#\n", ctime(&t)); + if (report_max_hops) + fprintf(f, "# Reported max hops discovered: %u\n" + "# Total MADs used: %u\n", + fabric->maxhops_discovered, fabric->total_mads_used); + fprintf(f, "# Initiated from node %016" PRIx64 " port %016" PRIx64 "\n", + fabric->from_node->guid, + mad_get_field64(fabric->from_node->info, 0, + IB_NODE_PORT_GUID_F)); + + /* Make pass on switches */ + if (group) { + ibnd_chassis_t *ch = NULL; + + /* Chassis based switches first */ + for (ch = fabric->chassis; ch; ch = ch->next) { + int n = 0; + + if (!ch->chassisnum) + continue; + chguid = out_chassis(fabric, ch->chassisnum); + chname = NULL; + if (ibnd_is_xsigo_guid(chguid)) { + for (node = ch->nodes; node; + node = node->next_chassis_node) { + if (ibnd_is_xsigo_hca(node->guid)) { + chname = node->nodedesc; + fprintf(f, "Hostname: %s\n", + clean_nodedesc + (node->nodedesc)); + } + } + } + + fprintf(f, "\n# Spine Nodes"); + for (n = 1; n <= SPINES_MAX_NUM; n++) { + if (ch->spinenode[n]) { + out_switch(ch->spinenode[n], group, + chname, NULL, NULL); + for (p = 1; + p <= ch->spinenode[n]->numports; + p++) { + port = + ch->spinenode[n]->ports[p]; + if (port && port->remoteport) + out_switch_port(port, + group, + NULL); + } + } + } + fprintf(f, "\n# Line Nodes"); + for (n = 1; n <= LINES_MAX_NUM; n++) { + if (ch->linenode[n]) { + out_switch(ch->linenode[n], group, + chname, NULL, NULL); 
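/*
 * Note: in grouping mode dump_topology() prints chassis-based switches
 * first, walking fabric->chassis as a singly linked list and, within
 * each chassis, the node list chained through node->next_chassis_node.
 * The traversal shape, as a standalone sketch over hypothetical minimal
 * structs:
 *
 *   struct node { struct node *next_chassis_node; };
 *   struct chassis { struct chassis *next; struct node *nodes; };
 *
 *   static void walk(struct chassis *head)
 *   {
 *           struct chassis *ch;
 *           struct node *n;
 *
 *           for (ch = head; ch; ch = ch->next)
 *                   for (n = ch->nodes; n; n = n->next_chassis_node)
 *                           ;  // emit n under this chassis heading
 *   }
 */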
+ for (p = 1; + p <= ch->linenode[n]->numports; + p++) { + port = + ch->linenode[n]->ports[p]; + if (port && port->remoteport) + out_switch_port(port, + group, + NULL); + } + } + } + + fprintf(f, "\n# Chassis Switches"); + for (node = ch->nodes; node; + node = node->next_chassis_node) { + if (node->type == IB_NODE_SWITCH) { + out_switch(node, group, chname, NULL, + NULL); + for (p = 1; p <= node->numports; p++) { + port = node->ports[p]; + if (port && port->remoteport) + out_switch_port(port, + group, + NULL); + } + } + + } + + fprintf(f, "\n# Chassis CAs"); + for (node = ch->nodes; node; + node = node->next_chassis_node) { + if (node->type == IB_NODE_CA) { + out_ca(node, group, chname, NULL, NULL); + for (p = 1; p <= node->numports; p++) { + port = node->ports[p]; + if (port && port->remoteport) + out_ca_port(port, group, + NULL); + } + } + } + + } + + } else { /* !group */ + iter_user_data.group = group; + iter_user_data.skip_chassis_nodes = 0; + ibnd_iter_nodes_type(fabric, switch_iter_func, IB_NODE_SWITCH, + &iter_user_data); + } + + chname = NULL; + if (group) { + iter_user_data.group = group; + iter_user_data.skip_chassis_nodes = 1; + + fprintf(f, "\nNon-Chassis Nodes\n"); + + ibnd_iter_nodes_type(fabric, switch_iter_func, IB_NODE_SWITCH, + &iter_user_data); + } + + iter_user_data.group = group; + iter_user_data.skip_chassis_nodes = 0; + /* Make pass on CAs */ + ibnd_iter_nodes_type(fabric, ca_iter_func, IB_NODE_CA, &iter_user_data); + + /* Make pass on routers */ + ibnd_iter_nodes_type(fabric, router_iter_func, IB_NODE_ROUTER, + &iter_user_data); + + return i; +} + +static void dump_ports_report(ibnd_node_t *node, void *user_data) +{ + int p = 0; + ibnd_port_t *port = NULL; + char *nodename = NULL; + char *rem_nodename = NULL; + + /* for each port */ + for (p = node->numports, port = node->ports[p]; p > 0; + port = node->ports[--p]) { + uint32_t iwidth, ispeed, fdr10, espeed, cap_mask; + uint8_t *info = NULL; + if (port == NULL) + continue; + iwidth = + mad_get_field(port->info, 0, IB_PORT_LINK_WIDTH_ACTIVE_F); + ispeed = + mad_get_field(port->info, 0, IB_PORT_LINK_SPEED_ACTIVE_F); + if (port->node->type == IB_NODE_SWITCH) { + if (port->node->ports[0]) + info = (uint8_t *)&port->node->ports[0]->info; + } + else + info = (uint8_t *)&port->info; + if (info) { + cap_mask = mad_get_field(info, 0, IB_PORT_CAPMASK_F); + if (cap_mask & be32toh(IB_PORT_CAP_HAS_EXT_SPEEDS)) + espeed = mad_get_field(port->info, 0, + IB_PORT_LINK_SPEED_EXT_ACTIVE_F); + else + espeed = 0; + } else { + ispeed = 0; + iwidth = 0; + espeed = 0; + } + fdr10 = mad_get_field(port->ext_info, 0, + IB_MLNX_EXT_PORT_LINK_SPEED_ACTIVE_F); + nodename = remap_node_name(node_name_map, + port->node->guid, + port->node->nodedesc); + fprintf(stdout, "%2s %5d %2d 0x%016" PRIx64 " %s %s", + ports_nt_str_compat(node), + node->type == + IB_NODE_SWITCH ? node->smalid : port->base_lid, + port->portnum, port->guid, + dump_linkwidth_compat(iwidth), + (ispeed != 4 && !espeed) ? + dump_linkspeed_compat(ispeed) : + dump_linkspeedext_compat(espeed, ispeed, fdr10)); + if (port->remoteport) { + rem_nodename = remap_node_name(node_name_map, + port->remoteport->node->guid, + port->remoteport->node->nodedesc); + fprintf(stdout, + " - %2s %5d %2d 0x%016" PRIx64 + " ( '%s' - '%s' )\n", + ports_nt_str_compat(port->remoteport->node), + port->remoteport->node->type == IB_NODE_SWITCH ? 
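/*
 * Note: ibnetdisc port arrays are 1-based -- slot 0 holds the switch
 * management port 0 where present, per-port slots run 1..numports -- and
 * unconnected slots are NULL, which is why every loop in this file
 * guards with a NULL check. dump_ports_report() above iterates downward,
 * which is equivalent. The guard pattern over a plain array (NUMPORTS is
 * illustrative):
 *
 *   #define NUMPORTS 36
 *   struct port *ports[NUMPORTS + 1];  // slot 0 reserved, 1.. used
 *   int p;
 *
 *   for (p = 1; p <= NUMPORTS; p++) {
 *           if (!ports[p])
 *                   continue;          // unpopulated slot
 *           // ... inspect ports[p] ...
 *   }
 */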
+ port->remoteport->node->smalid : + port->remoteport->base_lid, + port->remoteport->portnum, + port->remoteport->guid, nodename, rem_nodename); + free(rem_nodename); + } else + fprintf(stdout, "%36s'%s'\n", "", nodename); + + free(nodename); + } +} + +struct iter_diff_data { + uint32_t diff_flags; + ibnd_fabric_t *fabric1; + ibnd_fabric_t *fabric2; + const char *fabric1_prefix; + const char *fabric2_prefix; + void (*out_header)(ibnd_node_t *, int, char *, const char *, + const char *); + void (*out_header_detail)(ibnd_node_t *, const char *); + void (*out_port)(ibnd_port_t *, int, const char *); +}; + +static void diff_iter_out_header(ibnd_node_t * node, + struct iter_diff_data *data, + int *out_header_flag) +{ + if (!(*out_header_flag)) { + (*data->out_header) (node, 0, NULL, NULL, NULL); + (*out_header_flag)++; + } +} + +static void diff_ports(ibnd_node_t * fabric1_node, ibnd_node_t * fabric2_node, + int *out_header_flag, struct iter_diff_data *data) +{ + ibnd_port_t *fabric1_port; + ibnd_port_t *fabric2_port; + int p; + + for (p = 1; p <= fabric1_node->numports; p++) { + int fabric1_out = 0, fabric2_out = 0; + + fabric1_port = fabric1_node->ports[p]; + fabric2_port = fabric2_node->ports[p]; + + if (data->diff_flags & DIFF_FLAG_PORT_CONNECTION) { + if ((fabric1_port && !fabric2_port) + || ((fabric1_port && fabric2_port) + && (fabric1_port->remoteport + && !fabric2_port->remoteport))) + fabric1_out++; + else if ((!fabric1_port && fabric2_port) + || ((fabric1_port && fabric2_port) + && (!fabric1_port->remoteport + && fabric2_port->remoteport))) + fabric2_out++; + else if ((fabric1_port && fabric2_port) + && ((fabric1_port->guid != fabric2_port->guid) + || + ((fabric1_port->remoteport + && fabric2_port->remoteport) + && (fabric1_port->remoteport->guid != + fabric2_port->remoteport->guid)))) { + fabric1_out++; + fabric2_out++; + } + } + + if ((data->diff_flags & DIFF_FLAG_LID) + && fabric1_port && fabric2_port + && fabric1_port->base_lid != fabric2_port->base_lid) { + fabric1_out++; + fabric2_out++; + } + + if (data->diff_flags & DIFF_FLAG_PORT_CONNECTION + && data->diff_flags & DIFF_FLAG_NODE_DESCRIPTION + && fabric1_port && fabric2_port + && fabric1_port->remoteport && fabric2_port->remoteport + && memcmp(fabric1_port->remoteport->node->nodedesc, + fabric2_port->remoteport->node->nodedesc, + IB_SMP_DATA_SIZE)) { + fabric1_out++; + fabric2_out++; + } + + if (data->diff_flags & DIFF_FLAG_PORT_CONNECTION + && data->diff_flags & DIFF_FLAG_NODE_DESCRIPTION + && fabric1_port && fabric2_port + && fabric1_port->remoteport && fabric2_port->remoteport + && memcmp(fabric1_port->remoteport->node->nodedesc, + fabric2_port->remoteport->node->nodedesc, + IB_SMP_DATA_SIZE)) { + fabric1_out++; + fabric2_out++; + } + + if (data->diff_flags & DIFF_FLAG_PORT_CONNECTION + && data->diff_flags & DIFF_FLAG_LID + && fabric1_port && fabric2_port + && fabric1_port->remoteport && fabric2_port->remoteport + && fabric1_port->remoteport->base_lid != fabric2_port->remoteport->base_lid) { + fabric1_out++; + fabric2_out++; + } + + if (fabric1_out) { + diff_iter_out_header(fabric1_node, data, + out_header_flag); + (*data->out_port) (fabric1_port, 0, + data->fabric1_prefix); + } + if (fabric2_out) { + diff_iter_out_header(fabric1_node, data, + out_header_flag); + (*data->out_port) (fabric2_port, 0, + data->fabric2_prefix); + } + } +} + +static void diff_iter_func(ibnd_node_t * fabric1_node, void *iter_user_data) +{ + struct iter_diff_data *data = iter_user_data; + ibnd_node_t *fabric2_node; + ibnd_port_t *fabric1_port; + 
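/*
 * Note: in diff_ports() above, the DIFF_FLAG_NODE_DESCRIPTION block (the
 * memcmp of the remote node descriptions) appears twice back-to-back;
 * the second copy is redundant but harmless, since fabric1_out and
 * fabric2_out are only ever tested for being nonzero. Note also that
 * node descriptions are compared with memcmp() over IB_SMP_DATA_SIZE
 * rather than strcmp(): a NodeDescription is a fixed 64-byte field with
 * no guarantee of NUL termination. Sketch (64 stands in for the real
 * field width):
 *
 *   #include <string.h>
 *
 *   static int desc_differs(const char a[64], const char b[64])
 *   {
 *           return memcmp(a, b, 64) != 0;
 *   }
 */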
int p; + + DEBUG("DEBUG: fabric1_node %p\n", fabric1_node); + + fabric2_node = ibnd_find_node_guid(data->fabric2, fabric1_node->guid); + if (!fabric2_node) { + (*data->out_header) (fabric1_node, 0, NULL, + data->fabric1_prefix, + data->fabric1_prefix); + for (p = 1; p <= fabric1_node->numports; p++) { + fabric1_port = fabric1_node->ports[p]; + if (fabric1_port && fabric1_port->remoteport) + (*data->out_port) (fabric1_port, 0, + data->fabric1_prefix); + } + } else if (data->diff_flags & + (DIFF_FLAG_PORT_CONNECTION | DIFF_FLAG_LID + | DIFF_FLAG_NODE_DESCRIPTION)) { + int out_header_flag = 0; + + if ((data->diff_flags & DIFF_FLAG_LID + && fabric1_node->smalid != fabric2_node->smalid) || + (data->diff_flags & DIFF_FLAG_NODE_DESCRIPTION + && memcmp(fabric1_node->nodedesc, fabric2_node->nodedesc, + IB_SMP_DATA_SIZE))) { + (*data->out_header) (fabric1_node, 0, NULL, NULL, + data->fabric1_prefix); + (*data->out_header_detail) (fabric2_node, + data->fabric2_prefix); + fprintf(f, "\n"); + out_header_flag++; + } + + if (fabric1_node->numports != fabric2_node->numports) { + diff_iter_out_header(fabric1_node, data, + &out_header_flag); + fprintf(f, "%snumports = %d\n", data->fabric1_prefix, + fabric1_node->numports); + fprintf(f, "%snumports = %d\n", data->fabric2_prefix, + fabric2_node->numports); + return; + } + + if (data->diff_flags & DIFF_FLAG_PORT_CONNECTION + || data->diff_flags & DIFF_FLAG_LID) + diff_ports(fabric1_node, fabric2_node, &out_header_flag, + data); + } +} + +static int diff_common(ibnd_fabric_t *orig_fabric, ibnd_fabric_t *new_fabric, + int node_type, uint32_t diff_flags, + void (*out_header)(ibnd_node_t *, int, char *, + const char *, const char *), + void (*out_header_detail)(ibnd_node_t *, const char *), + void (*out_port)(ibnd_port_t *, int, const char *)) +{ + struct iter_diff_data iter_diff_data; + + iter_diff_data.diff_flags = diff_flags; + iter_diff_data.fabric1 = orig_fabric; + iter_diff_data.fabric2 = new_fabric; + iter_diff_data.fabric1_prefix = "< "; + iter_diff_data.fabric2_prefix = "> "; + iter_diff_data.out_header = out_header; + iter_diff_data.out_header_detail = out_header_detail; + iter_diff_data.out_port = out_port; + ibnd_iter_nodes_type(orig_fabric, diff_iter_func, node_type, + &iter_diff_data); + + /* Do opposite diff to find existence of node types + * in new_fabric but not in orig_fabric. + * + * In this diff, we don't need to check port connections, + * lids, or node descriptions since it has already been + * done (i.e. checks are only done when guid exists on both + * orig and new). 
+ */ + iter_diff_data.diff_flags = diff_flags & ~DIFF_FLAG_PORT_CONNECTION; + iter_diff_data.diff_flags &= ~DIFF_FLAG_LID; + iter_diff_data.diff_flags &= ~DIFF_FLAG_NODE_DESCRIPTION; + iter_diff_data.fabric1 = new_fabric; + iter_diff_data.fabric2 = orig_fabric; + iter_diff_data.fabric1_prefix = "> "; + iter_diff_data.fabric2_prefix = "< "; + iter_diff_data.out_header = out_header; + iter_diff_data.out_header_detail = out_header_detail; + iter_diff_data.out_port = out_port; + ibnd_iter_nodes_type(new_fabric, diff_iter_func, node_type, + &iter_diff_data); + + return 0; +} + +static int diff(ibnd_fabric_t *orig_fabric, ibnd_fabric_t *new_fabric) +{ + if (diffcheck_flags & DIFF_FLAG_SWITCH) + diff_common(orig_fabric, new_fabric, IB_NODE_SWITCH, + diffcheck_flags, out_switch, out_switch_detail, + out_switch_port); + + if (diffcheck_flags & DIFF_FLAG_CA) + diff_common(orig_fabric, new_fabric, IB_NODE_CA, + diffcheck_flags, out_ca, out_ca_detail, + out_ca_port); + + if (diffcheck_flags & DIFF_FLAG_ROUTER) + diff_common(orig_fabric, new_fabric, IB_NODE_ROUTER, + diffcheck_flags, out_ca, out_ca_detail, + out_ca_port); + + return 0; +} + +static int list, group, ports_report; + +static int process_opt(void *context, int ch) +{ + struct ibnd_config *cfg = context; + char *p; + + switch (ch) { + case 1: + node_name_map_file = strdup(optarg); + if (node_name_map_file == NULL) + IBEXIT("out of memory, strdup for node_name_map_file name failed"); + break; + case 2: + cache_file = strdup(optarg); + break; + case 3: + load_cache_file = strdup(optarg); + break; + case 4: + diff_cache_file = strdup(optarg); + break; + case 5: + diffcheck_flags = 0; + p = strtok(optarg, ","); + while (p) { + if (!strcasecmp(p, "sw")) + diffcheck_flags |= DIFF_FLAG_SWITCH; + else if (!strcasecmp(p, "ca")) + diffcheck_flags |= DIFF_FLAG_CA; + else if (!strcasecmp(p, "router")) + diffcheck_flags |= DIFF_FLAG_ROUTER; + else if (!strcasecmp(p, "port")) + diffcheck_flags |= DIFF_FLAG_PORT_CONNECTION; + else if (!strcasecmp(p, "lid")) + diffcheck_flags |= DIFF_FLAG_LID; + else if (!strcasecmp(p, "nodedesc")) + diffcheck_flags |= DIFF_FLAG_NODE_DESCRIPTION; + else { + fprintf(stderr, "invalid diff check key: %s\n", + p); + return -1; + } + p = strtok(NULL, ","); + } + break; + case 's': + cfg->show_progress = 1; + break; + case 'f': + full_info = 1; + break; + case 'l': + list = LIST_CA_NODE | LIST_SWITCH_NODE | LIST_ROUTER_NODE; + break; + case 'g': + group = 1; + break; + case 'S': + list = LIST_SWITCH_NODE; + break; + case 'H': + list = LIST_CA_NODE; + break; + case 'R': + list = LIST_ROUTER_NODE; + break; + case 'p': + ports_report = 1; + break; + case 'm': + report_max_hops = 1; + break; + case 'o': + cfg->max_smps = strtoul(optarg, NULL, 0); + break; + default: + return -1; + } + + return 0; +} + +int main(int argc, char **argv) +{ + struct ibnd_config config = { 0 }; + ibnd_fabric_t *fabric = NULL; + ibnd_fabric_t *diff_fabric = NULL; + + const struct ibdiag_opt opts[] = { + {"full", 'f', 0, NULL, "show full information (ports' speed and width, vlcap)"}, + {"show", 's', 0, NULL, "show more information"}, + {"list", 'l', 0, NULL, "list of connected nodes"}, + {"grouping", 'g', 0, NULL, "show grouping"}, + {"Hca_list", 'H', 0, NULL, "list of connected CAs"}, + {"Switch_list", 'S', 0, NULL, "list of connected switches"}, + {"Router_list", 'R', 0, NULL, "list of connected routers"}, + {"node-name-map", 1, 1, "<file>", "node name map file"}, + {"cache", 2, 1, "<file>", + "filename to cache ibnetdiscover data to"}, + {"load-cache", 
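/*
 * Note: the --diffcheck parser above clears diffcheck_flags and then ORs
 * in one bit per comma-separated key, so "sw,port,lid" enables exactly
 * those checks. The same strtok(3) loop as a standalone sketch (strtok
 * modifies its input, which is fine here because optarg is writable):
 *
 *   #include <stdio.h>
 *   #include <string.h>
 *   #include <strings.h>
 *
 *   static unsigned parse_keys(char *arg)
 *   {
 *           unsigned flags = 0;
 *           char *p;
 *
 *           for (p = strtok(arg, ","); p; p = strtok(NULL, ","))
 *                   if (!strcasecmp(p, "sw"))
 *                           flags |= 0x01;  // DIFF_FLAG_SWITCH
 *                   else if (!strcasecmp(p, "port"))
 *                           flags |= 0x08;  // DIFF_FLAG_PORT_CONNECTION
 *                   else
 *                           fprintf(stderr, "invalid key: %s\n", p);
 *           return flags;
 *   }
 */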
3, 1, "<file>", + "filename of ibnetdiscover cache to load"}, + {"diff", 4, 1, "<file>", + "filename of ibnetdiscover cache to diff"}, + {"diffcheck", 5, 1, "<key(s)>", + "specify checks to execute for --diff"}, + {"ports", 'p', 0, NULL, "obtain a ports report"}, + {"max_hops", 'm', 0, NULL, + "report max hops discovered by the library"}, + {"outstanding_smps", 'o', 1, NULL, + "specify the number of outstanding SMP's which should be " + "issued during the scan"}, + {} + }; + char usage_args[] = "[topology-file]"; + + ibdiag_process_opts(argc, argv, &config, "DGKLs", opts, process_opt, + usage_args, NULL); + + f = stdout; + + argc -= optind; + argv += optind; + + if (ibd_timeout) + config.timeout_ms = ibd_timeout; + + config.flags = ibd_ibnetdisc_flags; + + if (argc && !(f = fopen(argv[0], "w"))) + IBEXIT("can't open file %s for writing", argv[0]); + + config.mkey = ibd_mkey; + + node_name_map = open_node_name_map(node_name_map_file); + + if (diff_cache_file && + !(diff_fabric = ibnd_load_fabric(diff_cache_file, 0))) + IBEXIT("loading cached fabric for diff failed\n"); + + if (load_cache_file) { + if ((fabric = ibnd_load_fabric(load_cache_file, 0)) == NULL) + IBEXIT("loading cached fabric failed\n"); + } else { + if ((fabric = + ibnd_discover_fabric(ibd_ca, ibd_ca_port, NULL, &config)) == NULL) + IBEXIT("discover failed\n"); + } + + if (ports_report) + ibnd_iter_nodes(fabric, dump_ports_report, NULL); + else if (list) + list_nodes(fabric, list); + else if (diff_fabric) + diff(diff_fabric, fabric); + else + dump_topology(group, fabric); + + if (cache_file) + if (ibnd_cache_fabric(fabric, cache_file, 0) < 0) + IBEXIT("caching ibnetdiscover data failed\n"); + + ibnd_destroy_fabric(fabric); + if (diff_fabric) + ibnd_destroy_fabric(diff_fabric); + close_node_name_map(node_name_map); + exit(0); +} diff --git a/infiniband-diags/ibping.c b/infiniband-diags/ibping.c new file mode 100644 index 0000000..0d10da0 --- /dev/null +++ b/infiniband-diags/ibping.c @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <signal.h> +#include <time.h> + +#include <infiniband/umad.h> +#include <infiniband/mad.h> + +#include "ibdiag_common.h" + +static struct ibmad_port *srcport; + +static uint64_t time_stamp(void) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + return ((uint64_t)ts.tv_sec * 1000000ULL) + ts.tv_nsec / 10000ULL; +} + +static char host_and_domain[IB_VENDOR_RANGE2_DATA_SIZE]; +static char last_host[IB_VENDOR_RANGE2_DATA_SIZE]; + +static void get_host_and_domain(char *data, int sz) +{ + char *s = data; + int n; + + if (gethostname(s, sz) < 0) + snprintf(s, sz, "?hostname?"); + + s[sz - 1] = 0; + if ((n = strlen(s)) >= sz) + return; + s[n] = '.'; + s += n + 1; + sz -= n + 1; + + if (getdomainname(s, sz) < 0) + snprintf(s, sz, "?domainname?"); + if (strlen(s) == 0) + s[-1] = 0; /* no domain */ +} + +static char *ibping_serv(void) +{ + void *umad; + void *mad; + char *data; + + DEBUG("starting to serve..."); + + while ((umad = mad_receive_via(NULL, -1, srcport))) { + + if (umad_status(umad) == 0) { + mad = umad_get_mad(umad); + data = (char *)mad + IB_VENDOR_RANGE2_DATA_OFFS; + + memcpy(data, host_and_domain, IB_VENDOR_RANGE2_DATA_SIZE); + + DEBUG("Pong: %s", data); + + if (mad_respond_via(umad, NULL, 0, srcport) < 0) + DEBUG("respond failed"); + + } + mad_free(umad); + } + + DEBUG("server out"); + return NULL; +} + +static int oui = IB_OPENIB_OUI; + +static uint64_t ibping(ib_portid_t * portid, int quiet) +{ + char data[IB_VENDOR_RANGE2_DATA_SIZE] = { 0 }; + ib_vendor_call_t call; + uint64_t start, rtt; + + DEBUG("Ping.."); + + start = time_stamp(); + + call.method = IB_MAD_METHOD_GET; + call.mgmt_class = IB_VENDOR_OPENIB_PING_CLASS; + call.attrid = 0; + call.mod = 0; + call.oui = oui; + call.timeout = 0; + memset(&call.rmpp, 0, sizeof call.rmpp); + + if (!ib_vendor_call_via(data, portid, &call, srcport)) + return ~0ull; + + rtt = time_stamp() - start; + + if (!last_host[0]) + memcpy(last_host, data, sizeof last_host); + + if (!quiet) + printf("Pong from %s (%s): time %" PRIu64 ".%03" PRIu64 " ms\n", + data, portid2str(portid), rtt / 1000, rtt % 1000); + + return rtt; +} + +static uint64_t minrtt = ~0ull, maxrtt, total_rtt; +static uint64_t start, total_time, replied, lost, ntrans; +static ib_portid_t portid = { 0 }; + +static void report(int sig) +{ + total_time = time_stamp() - start; + + DEBUG("out due signal %d", sig); + + printf("\n--- %s (%s) ibping statistics ---\n", last_host, + portid2str(&portid)); + printf("%" PRIu64 " packets transmitted, %" PRIu64 " received, %" PRIu64 + "%% packet loss, time %" PRIu64 " ms\n", ntrans, replied, + (lost != 0) ? lost * 100 / ntrans : 0, total_time / 1000); + printf("rtt min/avg/max = %" PRIu64 ".%03" PRIu64 "/%" PRIu64 ".%03" + PRIu64 "/%" PRIu64 ".%03" PRIu64 " ms\n", + minrtt == ~0ull ? 0 : minrtt / 1000, + minrtt == ~0ull ? 0 : minrtt % 1000, + replied ? total_rtt / replied / 1000 : 0, + replied ? 
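/*
 * Note: time_stamp() above is meant to return microseconds -- report()
 * and ibping() format it as "rtt / 1000 . rtt % 1000 ms" -- but it
 * divides tv_nsec by 10000, while nanoseconds-to-microseconds is a
 * division by 1000; this looks like a stray extra zero. Corrected
 * sketch:
 *
 *   #include <stdint.h>
 *   #include <time.h>
 *
 *   static uint64_t time_stamp_us(void)
 *   {
 *           struct timespec ts;
 *
 *           // CLOCK_MONOTONIC: immune to wall-clock steps during a run
 *           clock_gettime(CLOCK_MONOTONIC, &ts);
 *           return (uint64_t)ts.tv_sec * 1000000ULL + ts.tv_nsec / 1000ULL;
 *   }
 */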
(total_rtt / replied) % 1000 : 0, maxrtt / 1000, + maxrtt % 1000); + + exit(0); +} + +static int server = 0, flood = 0; +static unsigned count = ~0; + +static int process_opt(void *context, int ch) +{ + switch (ch) { + case 'c': + count = strtoul(optarg, NULL, 0); + break; + case 'f': + flood++; + break; + case 'o': + oui = strtoul(optarg, NULL, 0); + break; + case 'S': + server++; + break; + default: + return -1; + } + return 0; +} + +int main(int argc, char **argv) +{ + int mgmt_classes[1] = { IB_SA_CLASS }; + int ping_class = IB_VENDOR_OPENIB_PING_CLASS; + uint64_t rtt; + char *err; + + const struct ibdiag_opt opts[] = { + {"count", 'c', 1, "<num>", "stop after count packets"}, + {"flood", 'f', 0, NULL, "flood destination"}, + {"oui", 'o', 1, NULL, "use specified OUI number"}, + {"Server", 'S', 0, NULL, "start in server mode"}, + {} + }; + char usage_args[] = "<dest lid|guid>"; + + ibdiag_process_opts(argc, argv, NULL, "DKy", opts, process_opt, + usage_args, NULL); + + argc -= optind; + argv += optind; + + if (!argc && !server) + ibdiag_show_usage(); + + srcport = mad_rpc_open_port(ibd_ca, ibd_ca_port, mgmt_classes, 1); + if (!srcport) + IBEXIT("Failed to open '%s' port '%d'", ibd_ca, ibd_ca_port); + + if (server) { + if (mad_register_server_via(ping_class, 0, NULL, oui, srcport) < 0) + IBEXIT("can't serve class %d on this port", + ping_class); + + get_host_and_domain(host_and_domain, sizeof host_and_domain); + + if ((err = ibping_serv())) + IBEXIT("ibping to %s: %s", portid2str(&portid), err); + exit(0); + } + + if (mad_register_client_via(ping_class, 0, srcport) < 0) + IBEXIT("can't register ping class %d on this port", + ping_class); + + if (resolve_portid_str(ibd_ca, ibd_ca_port, &portid, argv[0], + ibd_dest_type, ibd_sm_id, srcport) < 0) + IBEXIT("can't resolve destination port %s", argv[0]); + + signal(SIGINT, report); + signal(SIGTERM, report); + + start = time_stamp(); + + while (count-- > 0) { + ntrans++; + if ((rtt = ibping(&portid, flood)) == ~0ull) { + DEBUG("ibping to %s failed", portid2str(&portid)); + lost++; + } else { + if (rtt < minrtt) + minrtt = rtt; + if (rtt > maxrtt) + maxrtt = rtt; + total_rtt += rtt; + replied++; + } + + if (!flood) + sleep(1); + } + + report(0); + + mad_rpc_close_port(srcport); + + exit(-1); +} diff --git a/infiniband-diags/ibportstate.c b/infiniband-diags/ibportstate.c new file mode 100644 index 0000000..17d1e15 --- /dev/null +++ b/infiniband-diags/ibportstate.c @@ -0,0 +1,779 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * Copyright (c) 2010,2011 Mellanox Technologies LTD. All rights reserved. + * Copyright (c) 2011,2016 Oracle and/or its affiliates. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> + +#include <infiniband/umad.h> +#include <infiniband/mad.h> + +#include "ibdiag_common.h" + +#include <util/compiler.h> + +enum port_ops { + QUERY, + ENABLE, + RESET, + DISABLE, + SPEED, + ESPEED, + FDR10SPEED, + WIDTH, + DOWN, + ARM, + ACTIVE, + VLS, + MTU, + LID, + SMLID, + LMC, + MKEY, + MKEYLEASE, + MKEYPROT, + ON, + OFF +}; + +static struct ibmad_port *srcport; +static uint64_t speed; /* no state change */ +static uint64_t espeed; /* no state change */ +static uint64_t fdr10; /* no state change */ +static uint64_t width; /* no state change */ +static uint64_t lid; +static uint64_t smlid; +static uint64_t lmc; +static uint64_t mtu; +static uint64_t vls; /* no state change */ +static uint64_t mkey; +static uint64_t mkeylease; +static uint64_t mkeyprot; + +static struct { + const char *name; + uint64_t *val; + int set; +} port_args[] = { + {"query", NULL, 0}, /* QUERY */ + {"enable", NULL, 0}, /* ENABLE */ + {"reset", NULL, 0}, /* RESET */ + {"disable", NULL, 0}, /* DISABLE */ + {"speed", &speed, 0}, /* SPEED */ + {"espeed", &espeed, 0}, /* EXTENDED SPEED */ + {"fdr10", &fdr10, 0}, /* FDR10 SPEED */ + {"width", &width, 0}, /* WIDTH */ + {"down", NULL, 0}, /* DOWN */ + {"arm", NULL, 0}, /* ARM */ + {"active", NULL, 0}, /* ACTIVE */ + {"vls", &vls, 0}, /* VLS */ + {"mtu", &mtu, 0}, /* MTU */ + {"lid", &lid, 0}, /* LID */ + {"smlid", &smlid, 0}, /* SMLID */ + {"lmc", &lmc, 0}, /* LMC */ + {"mkey", &mkey, 0}, /* MKEY */ + {"mkeylease", &mkeylease, 0}, /* MKEY LEASE */ + {"mkeyprot", &mkeyprot, 0}, /* MKEY PROTECT BITS */ + {"on", NULL, 0}, /* ON */ + {"off", NULL, 0}, /* OFF */ +}; + +#define NPORT_ARGS (sizeof(port_args) / sizeof(port_args[0])) + +/*******************************************/ + +/* + * Return 1 if node is a switch, else zero. + */ +static int get_node_info(ib_portid_t * dest, uint8_t * data) +{ + int node_type; + + if (!smp_query_via(data, dest, IB_ATTR_NODE_INFO, 0, 0, srcport)) + IBEXIT("smp query nodeinfo failed"); + + node_type = mad_get_field(data, 0, IB_NODE_TYPE_F); + if (node_type == IB_NODE_SWITCH) /* Switch NodeType ? 
*/ + return 1; + else + return 0; +} + +static int get_port_info(ib_portid_t * dest, uint8_t * data, int portnum, + int is_switch) +{ + uint8_t smp[IB_SMP_DATA_SIZE]; + uint8_t *info; + int cap_mask; + + if (is_switch) { + if (!smp_query_via(smp, dest, IB_ATTR_PORT_INFO, 0, 0, srcport)) + IBEXIT("smp query port 0 portinfo failed"); + info = smp; + } else + info = data; + + if (!smp_query_via(data, dest, IB_ATTR_PORT_INFO, portnum, 0, srcport)) + IBEXIT("smp query portinfo failed"); + cap_mask = mad_get_field(info, 0, IB_PORT_CAPMASK_F); + return (cap_mask & be32toh(IB_PORT_CAP_HAS_EXT_SPEEDS)); +} + +static void show_port_info(ib_portid_t * dest, uint8_t * data, int portnum, + int espeed_cap, int is_switch) +{ + char buf[2300]; + char val[64]; + + mad_dump_portstates(buf, sizeof buf, data, sizeof *data); + mad_decode_field(data, IB_PORT_LID_F, val); + mad_dump_field(IB_PORT_LID_F, buf + strlen(buf), + sizeof buf - strlen(buf), val); + sprintf(buf + strlen(buf), "%s", "\n"); + mad_decode_field(data, IB_PORT_SMLID_F, val); + mad_dump_field(IB_PORT_SMLID_F, buf + strlen(buf), + sizeof buf - strlen(buf), val); + sprintf(buf + strlen(buf), "%s", "\n"); + mad_decode_field(data, IB_PORT_LMC_F, val); + mad_dump_field(IB_PORT_LMC_F, buf + strlen(buf), + sizeof buf - strlen(buf), val); + sprintf(buf + strlen(buf), "%s", "\n"); + mad_decode_field(data, IB_PORT_LINK_WIDTH_SUPPORTED_F, val); + mad_dump_field(IB_PORT_LINK_WIDTH_SUPPORTED_F, buf + strlen(buf), + sizeof buf - strlen(buf), val); + sprintf(buf + strlen(buf), "%s", "\n"); + mad_decode_field(data, IB_PORT_LINK_WIDTH_ENABLED_F, val); + mad_dump_field(IB_PORT_LINK_WIDTH_ENABLED_F, buf + strlen(buf), + sizeof buf - strlen(buf), val); + sprintf(buf + strlen(buf), "%s", "\n"); + mad_decode_field(data, IB_PORT_LINK_WIDTH_ACTIVE_F, val); + mad_dump_field(IB_PORT_LINK_WIDTH_ACTIVE_F, buf + strlen(buf), + sizeof buf - strlen(buf), val); + sprintf(buf + strlen(buf), "%s", "\n"); + mad_decode_field(data, IB_PORT_LINK_SPEED_SUPPORTED_F, val); + mad_dump_field(IB_PORT_LINK_SPEED_SUPPORTED_F, buf + strlen(buf), + sizeof buf - strlen(buf), val); + sprintf(buf + strlen(buf), "%s", "\n"); + mad_decode_field(data, IB_PORT_LINK_SPEED_ENABLED_F, val); + mad_dump_field(IB_PORT_LINK_SPEED_ENABLED_F, buf + strlen(buf), + sizeof buf - strlen(buf), val); + sprintf(buf + strlen(buf), "%s", "\n"); + mad_decode_field(data, IB_PORT_LINK_SPEED_ACTIVE_F, val); + mad_dump_field(IB_PORT_LINK_SPEED_ACTIVE_F, buf + strlen(buf), + sizeof buf - strlen(buf), val); + sprintf(buf + strlen(buf), "%s", "\n"); + if (espeed_cap) { + mad_decode_field(data, IB_PORT_LINK_SPEED_EXT_SUPPORTED_F, val); + mad_dump_field(IB_PORT_LINK_SPEED_EXT_SUPPORTED_F, + buf + strlen(buf), sizeof buf - strlen(buf), + val); + sprintf(buf + strlen(buf), "%s", "\n"); + mad_decode_field(data, IB_PORT_LINK_SPEED_EXT_ENABLED_F, val); + mad_dump_field(IB_PORT_LINK_SPEED_EXT_ENABLED_F, + buf + strlen(buf), sizeof buf - strlen(buf), + val); + sprintf(buf + strlen(buf), "%s", "\n"); + mad_decode_field(data, IB_PORT_LINK_SPEED_EXT_ACTIVE_F, val); + mad_dump_field(IB_PORT_LINK_SPEED_EXT_ACTIVE_F, + buf + strlen(buf), sizeof buf - strlen(buf), + val); + sprintf(buf + strlen(buf), "%s", "\n"); + } + if (!is_switch || portnum == 0) { + if (show_keys) { + mad_decode_field(data, IB_PORT_MKEY_F, val); + mad_dump_field(IB_PORT_MKEY_F, buf + strlen(buf), + sizeof buf - strlen(buf), val); + } else + snprint_field(buf+strlen(buf), sizeof(buf)-strlen(buf), + IB_PORT_MKEY_F, 32, NOT_DISPLAYED_STR); + sprintf(buf+strlen(buf), 
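/*
 * Note: show_port_info() builds its report by repeatedly appending at
 * buf + strlen(buf). buf is sized generously (2300 bytes) for a bounded
 * set of fields, but each append rescans the string and sprintf() itself
 * is unbounded. A bounds-checked variant of the same pattern, as a
 * sketch:
 *
 *   #include <stdio.h>
 *   #include <string.h>
 *
 *   // Append a formatted field to buf (total capacity cap) without
 *   // overflowing; excess output is silently truncated.
 *   #define APPEND(buf, cap, ...) \
 *           snprintf((buf) + strlen(buf), (cap) - strlen(buf), __VA_ARGS__)
 *
 *   void demo(void)
 *   {
 *           char buf[64] = "";
 *
 *           APPEND(buf, sizeof(buf), "lid %d\n", 3);
 *           APPEND(buf, sizeof(buf), "lmc %d\n", 0);
 *           fputs(buf, stdout);
 *   }
 */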
"%s", "\n"); + mad_decode_field(data, IB_PORT_MKEY_LEASE_F, val); + mad_dump_field(IB_PORT_MKEY_LEASE_F, buf + strlen(buf), + sizeof buf - strlen(buf), val); + sprintf(buf+strlen(buf), "%s", "\n"); + mad_decode_field(data, IB_PORT_MKEY_PROT_BITS_F, val); + mad_dump_field(IB_PORT_MKEY_PROT_BITS_F, buf + strlen(buf), + sizeof buf - strlen(buf), val); + sprintf(buf+strlen(buf), "%s", "\n"); + } + + printf("# Port info: %s port %d\n%s", portid2str(dest), portnum, buf); +} + +static void set_port_info(ib_portid_t * dest, uint8_t * data, int portnum, + int espeed_cap, int is_switch) +{ + unsigned mod; + + mod = portnum; + if (espeed_cap) + mod |= (1U)<<31; + if (!smp_set_via(data, dest, IB_ATTR_PORT_INFO, mod, 0, srcport)) + IBEXIT("smp set portinfo failed"); + + printf("\nAfter PortInfo set:\n"); + show_port_info(dest, data, portnum, espeed_cap, is_switch); +} + +static void get_mlnx_ext_port_info(ib_portid_t * dest, uint8_t * data, int portnum) +{ + if (!smp_query_via(data, dest, IB_ATTR_MLNX_EXT_PORT_INFO, + portnum, 0, srcport)) + IBEXIT("smp query ext portinfo failed"); +} + +static void show_mlnx_ext_port_info(ib_portid_t * dest, uint8_t * data, int portnum) +{ + char buf[256]; + + mad_dump_mlnx_ext_port_info(buf, sizeof buf, data, IB_SMP_DATA_SIZE); + + printf("# MLNX ext Port info: %s port %d\n%s", portid2str(dest), + portnum, buf); +} + +static void set_mlnx_ext_port_info(ib_portid_t * dest, uint8_t * data, int portnum) +{ + if (!smp_set_via(data, dest, IB_ATTR_MLNX_EXT_PORT_INFO, + portnum, 0, srcport)) + IBEXIT("smp set MLNX ext portinfo failed"); + + printf("\nAfter MLNXExtendedPortInfo set:\n"); + show_mlnx_ext_port_info(dest, data, portnum); +} + +static int get_link_width(int lwe, int lws) +{ + if (lwe == 255) + return lws; + else + return lwe; +} + +static int get_link_speed(int lse, int lss) +{ + if (lse == 15) + return lss; + else + return lse; +} + +static int get_link_speed_ext(int lsee, int lses) +{ + if (lsee == 31) + return lses; + else + return lsee; +} + +static void validate_width(int peerwidth, int lwa) +{ + if ((width & peerwidth & 0x8)) { + if (lwa != 8) + IBWARN + ("Peer ports operating at active width %d rather than 8 (12x)", + lwa); + } else if ((width & peerwidth & 0x4)) { + if (lwa != 4) + IBWARN + ("Peer ports operating at active width %d rather than 4 (8x)", + lwa); + } else if ((width & peerwidth & 0x2)) { + if (lwa != 2) + IBWARN + ("Peer ports operating at active width %d rather than 2 (4x)", + lwa); + } else if ((width & peerwidth & 0x10)) { + if (lwa != 16) + IBWARN + ("Peer ports operating at active width %d rather than 16 (2x)", + lwa); + } else if ((width & peerwidth & 0x1)) { + if (lwa != 1) + IBWARN + ("Peer ports operating at active width %d rather than 1 (1x)", + lwa); + } +} + +static void validate_speed(int peerspeed, int lsa) +{ + if ((speed & peerspeed & 0x4)) { + if (lsa != 4) + IBWARN + ("Peer ports operating at active speed %d rather than 4 (10.0 Gbps)", + lsa); + } else if ((speed & peerspeed & 0x2)) { + if (lsa != 2) + IBWARN + ("Peer ports operating at active speed %d rather than 2 (5.0 Gbps)", + lsa); + } else if ((speed & peerspeed & 0x1)) { + if (lsa != 1) + IBWARN + ("Peer ports operating at active speed %d rather than 1 (2.5 Gbps)", + lsa); + } +} + +static void validate_extended_speed(int peerespeed, int lsea) +{ + if ((espeed & peerespeed & 0x4)) { + if (lsea != 4) + IBWARN + ("Peer ports operating at active extended speed %d rather than 4 (53.125 Gbps)", + lsea); + } else if ((espeed & peerespeed & 0x2)) { + if (lsea != 2) + 
IBWARN + ("Peer ports operating at active extended speed %d rather than 2 (25.78125 Gbps)", + lsea); + } else if ((espeed & peerespeed & 0x1)) { + if (lsea != 1) + IBWARN + ("Peer ports operating at active extended speed %d rather than 1 (14.0625 Gbps)", + lsea); + } +} + +int main(int argc, char **argv) +{ + int mgmt_classes[3] = + { IB_SMI_CLASS, IB_SMI_DIRECT_CLASS, IB_SA_CLASS }; + ib_portid_t portid = { 0 }; + int port_op = -1; + int is_switch, is_peer_switch, espeed_cap, peer_espeed_cap; + int state, physstate, lwe, lws, lwa, lse, lss, lsa, lsee, lses, lsea, + fdr10s, fdr10e, fdr10a; + int peerlocalportnum, peerlwe, peerlws, peerlwa, peerlse, peerlss, + peerlsa, peerlsee, peerlses, peerlsea, peerfdr10s, peerfdr10e, + peerfdr10a; + int peerwidth, peerspeed, peerespeed; + uint8_t data[IB_SMP_DATA_SIZE] = { 0 }; + uint8_t data2[IB_SMP_DATA_SIZE] = { 0 }; + ib_portid_t peerportid = { 0 }; + int portnum = 0; + ib_portid_t selfportid = { 0 }; + int selfport = 0; + int changed = 0; + int i; + uint32_t vendorid, rem_vendorid; + uint16_t devid, rem_devid; + uint64_t val; + char *endp; + char usage_args[] = "<dest dr_path|lid|guid> <portnum> [<op>]\n" + "\nSupported ops: enable, disable, on, off, reset, speed, espeed, fdr10,\n" + "\twidth, query, down, arm, active, vls, mtu, lid, smlid, lmc,\n" + "\tmkey, mkeylease, mkeyprot\n"; + const char *usage_examples[] = { + "3 1 disable\t\t\t# by lid", + "-G 0x2C9000100D051 1 enable\t# by guid", + "-D 0 1\t\t\t# (query) by direct route", + "3 1 reset\t\t\t# by lid", + "3 1 speed 1\t\t\t# by lid", + "3 1 width 1\t\t\t# by lid", + "-D 0 1 lid 0x1234 arm\t\t# by direct route", + NULL + }; + + ibdiag_process_opts(argc, argv, NULL, NULL, NULL, NULL, + usage_args, usage_examples); + + argc -= optind; + argv += optind; + + if (argc < 2) + ibdiag_show_usage(); + + srcport = mad_rpc_open_port(ibd_ca, ibd_ca_port, mgmt_classes, 3); + if (!srcport) + IBEXIT("Failed to open '%s' port '%d'", ibd_ca, ibd_ca_port); + + smp_mkey_set(srcport, ibd_mkey); + + if (resolve_portid_str(ibd_ca, ibd_ca_port, &portid, argv[0], + ibd_dest_type, ibd_sm_id, srcport) < 0) + IBEXIT("can't resolve destination port %s", argv[0]); + + if (argc > 1) + portnum = strtol(argv[1], NULL, 0); + + for (i = 2; i < argc; i++) { + int j; + + for (j = 0; j < NPORT_ARGS; j++) { + if (strcmp(argv[i], port_args[j].name)) + continue; + port_args[j].set = 1; + if (!port_args[j].val) { + if (port_op >= 0) + IBEXIT("%s only one of: " + "query, enable, disable, " + "reset, down, arm, active, " + "can be specified", + port_args[j].name); + port_op = j; + break; + } + if (++i >= argc) + IBEXIT("%s requires an additional parameter", + port_args[j].name); + val = strtoull(argv[i], NULL, 0); + switch (j) { + case SPEED: + if (val > 15) + IBEXIT("invalid speed value %" PRIu64, + val); + break; + case ESPEED: + if (val > 31) + IBEXIT("invalid extended speed value %" PRIu64, + val); + break; + case FDR10SPEED: + if (val > 1) + IBEXIT("invalid fdr10 speed value %" PRIu64, + val); + break; + case WIDTH: + if ((val > 31 && val != 255)) + IBEXIT("invalid width value %" PRIu64, + val); + break; + case VLS: + if (val == 0 || val > 5) + IBEXIT("invalid vls value %" PRIu64, + val); + break; + case MTU: + if (val == 0 || val > 5) + IBEXIT("invalid mtu value %" PRIu64, + val); + break; + case LID: + if (val == 0 || val >= 0xC000) + IBEXIT("invalid lid value 0x%" PRIx64, + val); + break; + case SMLID: + if (val == 0 || val >= 0xC000) + IBEXIT("invalid smlid value 0x%" PRIx64, + val); + break; + case LMC: + if (val > 7) + 
IBEXIT("invalid lmc value %" PRIu64, + val); + break; + case MKEY: + errno = 0; + val = strtoull(argv[i], &endp, 0); + if (errno || *endp != '\0') { + errno = 0; + val = strtoull(getpass("New M_Key: "), + &endp, 0); + if (errno || *endp != '\0') { + IBEXIT("Bad new M_Key\n"); + } + } + /* All 64-bit values are legal */ + break; + case MKEYLEASE: + if (val > 0xFFFF) + IBEXIT("invalid mkey lease time %" PRIu64, + val); + break; + case MKEYPROT: + if (val > 3) + IBEXIT("invalid mkey protection bit setting %" PRIu64, + val); + } + *port_args[j].val = val; + changed = 1; + break; + } + if (j == NPORT_ARGS) + IBEXIT("invalid operation: %s", argv[i]); + } + if (port_op < 0) + port_op = QUERY; + + is_switch = get_node_info(&portid, data); + vendorid = (uint32_t) mad_get_field(data, 0, IB_NODE_VENDORID_F); + devid = (uint16_t) mad_get_field(data, 0, IB_NODE_DEVID_F); + + if ((port_args[MKEY].set || port_args[MKEYLEASE].set || + port_args[MKEYPROT].set) && is_switch && portnum != 0) + IBEXIT("Can't set M_Key fields on switch port != 0"); + + if (port_op != QUERY || changed) + printf("Initial %s PortInfo:\n", is_switch ? "Switch" : "CA/RT"); + else + printf("%s PortInfo:\n", is_switch ? "Switch" : "CA/RT"); + espeed_cap = get_port_info(&portid, data, portnum, is_switch); + show_port_info(&portid, data, portnum, espeed_cap, is_switch); + if (is_mlnx_ext_port_info_supported(vendorid, devid)) { + get_mlnx_ext_port_info(&portid, data2, portnum); + show_mlnx_ext_port_info(&portid, data2, portnum); + } + + if (port_op != QUERY || changed) { + /* + * If we aren't setting the LID and the LID is the default, + * the SMA command will fail due to an invalid LID. + * Set it to something unlikely but valid. + */ + physstate = mad_get_field(data, 0, IB_PORT_PHYS_STATE_F); + + val = mad_get_field(data, 0, IB_PORT_LID_F); + if (!port_args[LID].set && (!val || val == 0xFFFF)) + mad_set_field(data, 0, IB_PORT_LID_F, 0x1234); + val = mad_get_field(data, 0, IB_PORT_SMLID_F); + if (!port_args[SMLID].set && (!val || val == 0xFFFF)) + mad_set_field(data, 0, IB_PORT_SMLID_F, 0x1234); + mad_set_field(data, 0, IB_PORT_STATE_F, 0); /* NOP */ + mad_set_field(data, 0, IB_PORT_PHYS_STATE_F, 0); /* NOP */ + + switch (port_op) { + case ON: + /* Enable only if state is Disable */ + if(physstate != 3) { + printf("Port is already in enable state\n"); + goto close_port; + } + SWITCH_FALLTHROUGH; + case ENABLE: + case RESET: + /* Polling */ + mad_set_field(data, 0, IB_PORT_PHYS_STATE_F, 2); + break; + case OFF: + case DISABLE: + printf("Disable may be irreversible\n"); + mad_set_field(data, 0, IB_PORT_PHYS_STATE_F, 3); + break; + case DOWN: + mad_set_field(data, 0, IB_PORT_STATE_F, 1); + break; + case ARM: + mad_set_field(data, 0, IB_PORT_STATE_F, 3); + break; + case ACTIVE: + mad_set_field(data, 0, IB_PORT_STATE_F, 4); + break; + } + + /* always set enabled speeds/width - defaults to NOP */ + mad_set_field(data, 0, IB_PORT_LINK_SPEED_ENABLED_F, speed); + mad_set_field(data, 0, IB_PORT_LINK_SPEED_EXT_ENABLED_F, espeed); + mad_set_field(data, 0, IB_PORT_LINK_WIDTH_ENABLED_F, width); + + if (port_args[VLS].set) + mad_set_field(data, 0, IB_PORT_OPER_VLS_F, vls); + if (port_args[MTU].set) + mad_set_field(data, 0, IB_PORT_NEIGHBOR_MTU_F, mtu); + if (port_args[LID].set) + mad_set_field(data, 0, IB_PORT_LID_F, lid); + if (port_args[SMLID].set) + mad_set_field(data, 0, IB_PORT_SMLID_F, smlid); + if (port_args[LMC].set) + mad_set_field(data, 0, IB_PORT_LMC_F, lmc); + + if (port_args[FDR10SPEED].set) { + mad_set_field(data2, 0, + 
IB_MLNX_EXT_PORT_STATE_CHG_ENABLE_F, + FDR10); + mad_set_field(data2, 0, + IB_MLNX_EXT_PORT_LINK_SPEED_ENABLED_F, + fdr10); + set_mlnx_ext_port_info(&portid, data2, portnum); + } + + if (port_args[MKEY].set) + mad_set_field64(data, 0, IB_PORT_MKEY_F, mkey); + if (port_args[MKEYLEASE].set) + mad_set_field(data, 0, IB_PORT_MKEY_LEASE_F, + mkeylease); + if (port_args[MKEYPROT].set) + mad_set_field(data, 0, IB_PORT_MKEY_PROT_BITS_F, + mkeyprot); + + set_port_info(&portid, data, portnum, espeed_cap, is_switch); + + } else if (is_switch && portnum) { + /* Now, make sure PortState is Active */ + /* Or is PortPhysicalState LinkUp sufficient ? */ + mad_decode_field(data, IB_PORT_STATE_F, &state); + mad_decode_field(data, IB_PORT_PHYS_STATE_F, &physstate); + if (state == 4) { /* Active */ + mad_decode_field(data, IB_PORT_LINK_WIDTH_ENABLED_F, + &lwe); + mad_decode_field(data, IB_PORT_LINK_WIDTH_SUPPORTED_F, + &lws); + mad_decode_field(data, IB_PORT_LINK_WIDTH_ACTIVE_F, + &lwa); + mad_decode_field(data, IB_PORT_LINK_SPEED_SUPPORTED_F, + &lss); + mad_decode_field(data, IB_PORT_LINK_SPEED_ACTIVE_F, + &lsa); + mad_decode_field(data, IB_PORT_LINK_SPEED_ENABLED_F, + &lse); + mad_decode_field(data2, + IB_MLNX_EXT_PORT_LINK_SPEED_SUPPORTED_F, + &fdr10s); + mad_decode_field(data2, + IB_MLNX_EXT_PORT_LINK_SPEED_ENABLED_F, + &fdr10e); + mad_decode_field(data2, + IB_MLNX_EXT_PORT_LINK_SPEED_ACTIVE_F, + &fdr10a); + if (espeed_cap) { + mad_decode_field(data, + IB_PORT_LINK_SPEED_EXT_SUPPORTED_F, + &lses); + mad_decode_field(data, + IB_PORT_LINK_SPEED_EXT_ACTIVE_F, + &lsea); + mad_decode_field(data, + IB_PORT_LINK_SPEED_EXT_ENABLED_F, + &lsee); + } + + /* Setup portid for peer port */ + memcpy(&peerportid, &portid, sizeof(peerportid)); + if (portid.lid == 0) { + peerportid.drpath.cnt++; + if (peerportid.drpath.cnt == IB_SUBNET_PATH_HOPS_MAX) { + IBEXIT("Too many hops"); + } + } else { + peerportid.drpath.cnt = 1; + + /* Set DrSLID to local lid */ + if (resolve_self(ibd_ca, ibd_ca_port, &selfportid, + &selfport, NULL) < 0) + IBEXIT("could not resolve self"); + peerportid.drpath.drslid = (uint16_t) selfportid.lid; + peerportid.drpath.drdlid = 0xffff; + } + peerportid.drpath.p[peerportid.drpath.cnt] = (uint8_t) portnum; + + /* Get peer port NodeInfo to obtain peer port number */ + is_peer_switch = get_node_info(&peerportid, data); + rem_vendorid = (uint32_t) mad_get_field(data, 0, IB_NODE_VENDORID_F); + rem_devid = (uint16_t) mad_get_field(data, 0, IB_NODE_DEVID_F); + + mad_decode_field(data, IB_NODE_LOCAL_PORT_F, + &peerlocalportnum); + + printf("Peer PortInfo:\n"); + /* Get peer port characteristics */ + peer_espeed_cap = get_port_info(&peerportid, data, + peerlocalportnum, + is_peer_switch); + if (is_mlnx_ext_port_info_supported(rem_vendorid, rem_devid)) + get_mlnx_ext_port_info(&peerportid, data2, + peerlocalportnum); + show_port_info(&peerportid, data, peerlocalportnum, + peer_espeed_cap, is_peer_switch); + if (is_mlnx_ext_port_info_supported(rem_vendorid, rem_devid)) + show_mlnx_ext_port_info(&peerportid, data2, + peerlocalportnum); + + mad_decode_field(data, IB_PORT_LINK_WIDTH_ENABLED_F, + &peerlwe); + mad_decode_field(data, IB_PORT_LINK_WIDTH_SUPPORTED_F, + &peerlws); + mad_decode_field(data, IB_PORT_LINK_WIDTH_ACTIVE_F, + &peerlwa); + mad_decode_field(data, IB_PORT_LINK_SPEED_SUPPORTED_F, + &peerlss); + mad_decode_field(data, IB_PORT_LINK_SPEED_ACTIVE_F, + &peerlsa); + mad_decode_field(data, IB_PORT_LINK_SPEED_ENABLED_F, + &peerlse); + mad_decode_field(data2, + IB_MLNX_EXT_PORT_LINK_SPEED_SUPPORTED_F, + 
&peerfdr10s); + mad_decode_field(data2, + IB_MLNX_EXT_PORT_LINK_SPEED_ENABLED_F, + &peerfdr10e); + mad_decode_field(data2, + IB_MLNX_EXT_PORT_LINK_SPEED_ACTIVE_F, + &peerfdr10a); + if (peer_espeed_cap) { + mad_decode_field(data, + IB_PORT_LINK_SPEED_EXT_SUPPORTED_F, + &peerlses); + mad_decode_field(data, + IB_PORT_LINK_SPEED_EXT_ACTIVE_F, + &peerlsea); + mad_decode_field(data, + IB_PORT_LINK_SPEED_EXT_ENABLED_F, + &peerlsee); + } + + /* Now validate peer port characteristics */ + /* Examine Link Width */ + width = get_link_width(lwe, lws); + peerwidth = get_link_width(peerlwe, peerlws); + validate_width(peerwidth, lwa); + + /* Examine Link Speeds */ + speed = get_link_speed(lse, lss); + peerspeed = get_link_speed(peerlse, peerlss); + validate_speed(peerspeed, lsa); + + if (espeed_cap && peer_espeed_cap) { + espeed = get_link_speed_ext(lsee, lses); + peerespeed = get_link_speed_ext(peerlsee, + peerlses); + validate_extended_speed(peerespeed, lsea); + } else { + if (fdr10e & FDR10 && peerfdr10e & FDR10) { + if (!(fdr10a & FDR10)) + IBWARN("Peer ports operating at active speed %d rather than FDR10", lsa); + } + } + } + } + +close_port: + mad_rpc_close_port(srcport); + exit(0); +} diff --git a/infiniband-diags/ibqueryerrors.c b/infiniband-diags/ibqueryerrors.c new file mode 100644 index 0000000..48c9f17 --- /dev/null +++ b/infiniband-diags/ibqueryerrors.c @@ -0,0 +1,1170 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * Copyright (c) 2007 Xsigo Systems Inc. All rights reserved. + * Copyright (c) 2008 Lawrence Livermore National Lab. All rights reserved. + * Copyright (c) 2009 HNR Consulting. All rights reserved. + * Copyright (c) 2010,2011 Mellanox Technologies LTD. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include <config.h> + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <stdarg.h> +#include <time.h> +#include <string.h> +#include <errno.h> +#include <inttypes.h> + +#include <util/node_name_map.h> +#include <infiniband/ibnetdisc.h> +#include <infiniband/mad.h> + +#include "ibdiag_common.h" +#include "ibdiag_sa.h" + +static struct ibmad_port *ibmad_port; +static char *node_name_map_file = NULL; +static nn_map_t *node_name_map = NULL; +static char *load_cache_file = NULL; +static uint16_t lid2sl_table[sizeof(uint8_t) * 1024 * 48] = { 0 }; +static int obtain_sl = 1; + +static int data_counters; +static int data_counters_only; +static int port_config; +static uint64_t port_guid; +static char *port_guid_str; +#define SUP_MAX 64 +static int sup_total; +static enum MAD_FIELDS suppressed_fields[SUP_MAX]; +static char *dr_path; +static uint8_t node_type_to_print; +static unsigned clear_errors, clear_counts, details; + +#define PRINT_SWITCH 0x1 +#define PRINT_CA 0x2 +#define PRINT_ROUTER 0x4 +#define PRINT_ALL 0xFF /* all nodes default flag */ + +#define DEFAULT_HALF_WORLD_PR_TIMEOUT (3000) + +static struct { + int nodes_checked; + int bad_nodes; + int ports_checked; + int bad_ports; + int pma_query_failures; +} summary; + +#define DEF_THRES_FILE IBDIAG_CONFIG_PATH"/error_thresholds" +static const char *threshold_file = DEF_THRES_FILE; + +/* define a "packet" with threshold values in it */ +static uint8_t thresholds[1204]; +static char *threshold_str; + +static unsigned valid_gid(ib_gid_t * gid) +{ + ib_gid_t zero_gid; + memset(&zero_gid, 0, sizeof zero_gid); + return memcmp(&zero_gid, gid, sizeof(*gid)); +} + +static void set_thres(char *name, uint64_t val) +{ + int f; + int n; + char tmp[256]; + for (f = IB_PC_EXT_ERR_SYM_F; f <= IB_PC_EXT_XMT_WAIT_F; f++) { + if (strcmp(name, mad_field_name(f)) == 0) { + mad_encode_field(thresholds, f, &val); + snprintf(tmp, 255, "[%s = %" PRIu64 "]", name, val); + threshold_str = realloc(threshold_str, + strlen(threshold_str)+strlen(tmp)+1); + if (!threshold_str) { + fprintf(stderr, "Failed to allocate memory: " + "%s\n", strerror(errno)); + exit(1); + } + n = strlen(threshold_str); + strcpy(threshold_str+n, tmp); + } + } +} + +static void set_thresholds(void) +{ + char buf[1024]; + uint64_t val = 0; + FILE *thresf = fopen(threshold_file, "r"); + char *p_prefix, *p_last; + char *name; + char *val_str; + char str[64]; + + if (!thresf) + return; + + snprintf(str, 63, "Thresholds: "); + threshold_str = malloc(strlen(str)+1); + if (!threshold_str) { + fprintf(stderr, "Failed to allocate memory: %s\n", + strerror(errno)); + exit(1); + } + strcpy(threshold_str, str); + while (fgets(buf, sizeof buf, thresf) != NULL) { + p_prefix = strtok_r(buf, "\n", &p_last); + if (!p_prefix) + continue; /* ignore blank lines */ + + if (*p_prefix == '#') + continue; /* ignore comment lines */ + + name = strtok_r(p_prefix, "=", &p_last); + val_str = strtok_r(NULL, "\n", &p_last); + + val = strtoul(val_str, NULL, 0); + set_thres(name, val); + } + + fclose(thresf); +} + +static int exceeds_threshold(int field, uint64_t val) +{ + uint64_t thres = 0; + mad_decode_field(thresholds, field, &thres); + return (val > thres); +} + +static void print_port_config(ibnd_node_t * node, int portnum) +{ + char width[64], speed[64], state[64], physstate[64]; + char remote_str[256]; + char link_str[256]; + char width_msg[256]; + char speed_msg[256]; + char ext_port_str[256]; + int iwidth, ispeed, fdr10, espeed, istate, iphystate, cap_mask; + 
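+	/* PortInfo block used below for the extended-speed capability check */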
uint8_t *info; + int rc; + + ibnd_port_t *port = node->ports[portnum]; + + if (!port) + return; + + iwidth = mad_get_field(port->info, 0, IB_PORT_LINK_WIDTH_ACTIVE_F); + ispeed = mad_get_field(port->info, 0, IB_PORT_LINK_SPEED_ACTIVE_F); + fdr10 = mad_get_field(port->ext_info, 0, + IB_MLNX_EXT_PORT_LINK_SPEED_ACTIVE_F) & FDR10; + + if (port->node->type == IB_NODE_SWITCH) + info = (uint8_t *)&port->node->ports[0]->info; + else + info = (uint8_t *)&port->info; + cap_mask = mad_get_field(info, 0, IB_PORT_CAPMASK_F); + if (cap_mask & be32toh(IB_PORT_CAP_HAS_EXT_SPEEDS)) + espeed = mad_get_field(port->info, 0, + IB_PORT_LINK_SPEED_EXT_ACTIVE_F); + else + espeed = 0; + istate = mad_get_field(port->info, 0, IB_PORT_STATE_F); + iphystate = mad_get_field(port->info, 0, IB_PORT_PHYS_STATE_F); + + remote_str[0] = '\0'; + link_str[0] = '\0'; + width_msg[0] = '\0'; + speed_msg[0] = '\0'; + + /* C14-24.2.1 states that a down port allows for invalid data to be + * returned for all PortInfo components except PortState and + * PortPhysicalState */ + if (istate != IB_LINK_DOWN) { + if (!espeed) { + if (fdr10) + sprintf(speed, "10.0 Gbps (FDR10)"); + else + mad_dump_val(IB_PORT_LINK_SPEED_ACTIVE_F, speed, + 64, &ispeed); + } else + mad_dump_val(IB_PORT_LINK_SPEED_EXT_ACTIVE_F, speed, + 64, &espeed); + + snprintf(link_str, 256, "(%3s %18s %6s/%8s)", + mad_dump_val(IB_PORT_LINK_WIDTH_ACTIVE_F, width, 64, &iwidth), + speed, + mad_dump_val(IB_PORT_STATE_F, state, 64, &istate), + mad_dump_val(IB_PORT_PHYS_STATE_F, physstate, 64, &iphystate)); + } else { + snprintf(link_str, 256, "( %6s/%8s)", + mad_dump_val(IB_PORT_STATE_F, state, 64, &istate), + mad_dump_val(IB_PORT_PHYS_STATE_F, physstate, 64, &iphystate)); + } + + if (port->remoteport) { + char *rem_node_name = NULL; + + if (port->remoteport->ext_portnum) + snprintf(ext_port_str, 256, "%d", + port->remoteport->ext_portnum); + else + ext_port_str[0] = '\0'; + + get_max_msg(width_msg, speed_msg, 256, port); + + rem_node_name = remap_node_name(node_name_map, + port->remoteport->node->guid, + port->remoteport->node-> + nodedesc); + + rc = snprintf(remote_str, sizeof(remote_str), + "0x%016" PRIx64 " %6d %4d[%2s] \"%s\" (%s %s)\n", + port->remoteport->guid, + port->remoteport->base_lid ? 
port->remoteport-> + base_lid : port->remoteport->node->smalid, + port->remoteport->portnum, ext_port_str, rem_node_name, + width_msg, speed_msg); + if (rc > sizeof(remote_str)) + fprintf(stderr, "WARN: string buffer overflow\n"); + + free(rem_node_name); + } else + snprintf(remote_str, 256, " [ ] \"\" ( )\n"); + + if (port->ext_portnum) + snprintf(ext_port_str, 256, "%d", port->ext_portnum); + else + ext_port_str[0] = '\0'; + + if (node->type == IB_NODE_SWITCH) + printf(" Link info: %6d", node->smalid); + else + printf(" Link info: %6d", port->base_lid); + + printf("%4d[%2s] ==%s==> %s", + port->portnum, ext_port_str, link_str, remote_str); +} + +static int suppress(enum MAD_FIELDS field) +{ + int i = 0; + for (i = 0; i < sup_total; i++) + if (field == suppressed_fields[i]) + return 1; + return 0; +} + +static void report_suppressed(void) +{ + int i = 0; + printf("## Suppressed:"); + for (i = 0; i < sup_total; i++) + printf(" %s", mad_field_name(suppressed_fields[i])); + printf("\n"); +} + +static int print_summary(void) +{ + printf("\n## Summary: %d nodes checked, %d bad nodes found\n", + summary.nodes_checked, summary.bad_nodes); + printf("## %d ports checked, %d ports have errors beyond threshold\n", + summary.ports_checked, summary.bad_ports); + printf("## %s\n", threshold_str); + if (summary.pma_query_failures) + printf("## %d PMA query failures\n", summary.pma_query_failures); + report_suppressed(); + return (summary.bad_ports); +} + +static void insert_lid2sl_table(struct sa_query_result *r) +{ + unsigned int i; + for (i = 0; i < r->result_cnt; i++) { + ib_path_rec_t *p_pr = (ib_path_rec_t *)sa_get_query_rec(r->p_result_madw, i); + lid2sl_table[be16toh(p_pr->dlid)] = ib_path_rec_sl(p_pr); + } +} + +static int path_record_query(ib_gid_t sgid,uint64_t dguid) +{ + ib_path_rec_t pr; + __be64 comp_mask = 0; + uint8_t reversible = 0; + struct sa_handle * h; + + if (!(h = sa_get_handle())) + return -1; + + ibd_timeout = DEFAULT_HALF_WORLD_PR_TIMEOUT; + memset(&pr, 0, sizeof(pr)); + + CHECK_AND_SET_GID(sgid, pr.sgid, PR, SGID); + if(dguid) { + mad_encode_field(sgid.raw, IB_GID_GUID_F, &dguid); + CHECK_AND_SET_GID(sgid, pr.dgid, PR, DGID); + } + + CHECK_AND_SET_VAL(1, 8, -1, pr.num_path, PR, NUMBPATH);/*to get only one PathRecord for each source and destination pair*/ + CHECK_AND_SET_VAL(1, 8, -1, reversible, PR, REVERSIBLE);/*for a reversible path*/ + pr.num_path |= reversible << 7; + struct sa_query_result result; + int ret = sa_query(h, IB_MAD_METHOD_GET_TABLE, + (uint16_t)IB_SA_ATTR_PATHRECORD,0,be64toh(comp_mask),ibd_sakey, + &pr, sizeof(pr), &result); + if (ret) { + sa_free_handle(h); + fprintf(stderr, "Query SA failed: %s; sa call path_query failed\n", strerror(ret)); + return ret; + } + if (result.status != IB_SA_MAD_STATUS_SUCCESS) { + sa_report_err(result.status); + ret = EIO; + goto Exit; + } + + insert_lid2sl_table(&result); +Exit: + sa_free_handle(h); + sa_free_result_mad(&result); + return ret; +} + +static int query_and_dump(char *buf, size_t size, ib_portid_t * portid, + char *node_name, int portnum, + const char *attr_name, uint16_t attr_id, + int start_field, int end_field) +{ + uint8_t pc[1024]; + uint32_t val = 0; + int i, n; + + memset(pc, 0, sizeof(pc)); + + if (!pma_query_via(pc, portid, portnum, ibd_timeout, attr_id, + ibmad_port)) { + IBWARN("%s query failed on %s, %s port %d", attr_name, + node_name, portid2str(portid), portnum); + summary.pma_query_failures++; + return 0; + } + + for (n = 0, i = start_field; i < end_field; i++) { + mad_decode_field(pc, i, (void 
*)&val); + if (val) + n += snprintf(buf + n, size - n, " [%s == %u]", + mad_field_name(i), val); + } + + return n; +} + +static int check_threshold(uint8_t *pc, uint8_t *pce, uint32_t cap_mask2, + int i, int ext_i, int *n, char *str, size_t size) +{ + uint32_t val32 = 0; + uint64_t val64 = 0; + int is_exceeds = 0; + float val = 0; + const char *unit = ""; + + if (htonl(cap_mask2) & IB_PM_IS_ADDL_PORT_CTRS_EXT_SUP) { + mad_decode_field(pce, ext_i, (void *)&val64); + if (exceeds_threshold(ext_i, val64)) { + unit = conv_cnt_human_readable(val64, &val, 0); + *n += snprintf(str + *n, size - *n, + " [%s == %" PRIu64 " (%5.3f%s)]", + mad_field_name(ext_i), val64, val, unit); + is_exceeds = 1; + } + + } else { + mad_decode_field(pc, i, (void *)&val32); + if (exceeds_threshold(ext_i, val32)) { + *n += snprintf(str + *n, size - *n, " [%s == %u]", + mad_field_name(i), val32); + is_exceeds = 1; + } + } + + return is_exceeds; +} + +static int print_results(ib_portid_t * portid, char *node_name, + ibnd_node_t * node, uint8_t * pc, int portnum, + int *header_printed, uint8_t *pce, __be16 cap_mask, + uint32_t cap_mask2) +{ + char buf[2048]; + char *str = buf; + int i, ext_i, n; + + for (n = 0, i = IB_PC_ERR_SYM_F, ext_i = IB_PC_EXT_ERR_SYM_F; + i <= IB_PC_VL15_DROPPED_F; i++, ext_i++ ) { + if (suppress(i)) + continue; + + /* this is not a counter, skip it */ + if (i == IB_PC_COUNTER_SELECT2_F) { + ext_i--; + continue; + } + + if (check_threshold(pc, pce, cap_mask2, i, ext_i, &n, str, sizeof(buf))) { + + /* If there are PortXmitDiscards, get details (if supported) */ + if (i == IB_PC_XMT_DISCARDS_F && details) { + n += query_and_dump(str + n, sizeof(buf) - n, portid, + node_name, portnum, + "PortXmitDiscardDetails", + IB_GSI_PORT_XMIT_DISCARD_DETAILS, + IB_PC_RCV_LOCAL_PHY_ERR_F, + IB_PC_RCV_ERR_LAST_F); + /* If there are PortRcvErrors, get details (if supported) */ + } else if (i == IB_PC_ERR_RCV_F && details) { + n += query_and_dump(str + n, sizeof(buf) - n, portid, + node_name, portnum, + "PortRcvErrorDetails", + IB_GSI_PORT_RCV_ERROR_DETAILS, + IB_PC_XMT_INACT_DISC_F, + IB_PC_XMT_DISC_LAST_F); + } + } + } + + if (!suppress(IB_PC_XMT_WAIT_F)) { + check_threshold(pc, pce, cap_mask2, IB_PC_XMT_WAIT_F, + IB_PC_EXT_XMT_WAIT_F, &n, str, sizeof(buf)); + } + + /* if we found errors. 
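+	 * print the node header (once per node) and any counters that exceeded their thresholds.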
*/ + if (n != 0) { + if (data_counters) { + uint8_t *pkt = pc; + int start_field = IB_PC_XMT_BYTES_F; + int end_field = IB_PC_RCV_PKTS_F; + + if (pce) { + pkt = pce; + start_field = IB_PC_EXT_XMT_BYTES_F; + if (cap_mask & IB_PM_EXT_WIDTH_SUPPORTED) + end_field = IB_PC_EXT_RCV_MPKTS_F; + else + end_field = IB_PC_EXT_RCV_PKTS_F; + } + + for (i = start_field; i <= end_field; i++) { + uint64_t val64 = 0; + float val = 0; + const char *unit = ""; + mad_decode_field(pkt, i, (void *)&val64); + if (val64) { + int data = 0; + if (i == IB_PC_EXT_XMT_BYTES_F || + i == IB_PC_EXT_RCV_BYTES_F || + i == IB_PC_XMT_BYTES_F || + i == IB_PC_RCV_BYTES_F) + data = 1; + unit = conv_cnt_human_readable(val64, + &val, data); + n += snprintf(str + n, sizeof(buf) - n, + " [%s == %" PRIu64 + " (%5.3f%s)]", + mad_field_name(i), val64, val, + unit); + } + } + } + + if (!*header_printed) { + if (node->type == IB_NODE_SWITCH) + printf("Errors for 0x%" PRIx64 " \"%s\"\n", + node->ports[0]->guid, node_name); + else + printf("Errors for \"%s\"\n", node_name); + *header_printed = 1; + summary.bad_nodes++; + } + + if (portnum == 0xFF) { + if (node->type == IB_NODE_SWITCH) + printf(" GUID 0x%" PRIx64 " port ALL:%s\n", + node->ports[0]->guid, str); + } else { + printf(" GUID 0x%" PRIx64 " port %d:%s\n", + node->ports[portnum]->guid, portnum, str); + if (port_config) + print_port_config(node, portnum); + summary.bad_ports++; + } + } + return (n); +} + +static int query_cap_mask(ib_portid_t * portid, char *node_name, int portnum, + __be16 * cap_mask, uint32_t * cap_mask2) +{ + uint8_t pc[1024] = { 0 }; + __be16 rc_cap_mask; + __be32 rc_cap_mask2; + + portid->sl = lid2sl_table[portid->lid]; + + /* PerfMgt ClassPortInfo is a required attribute */ + if (!pma_query_via(pc, portid, portnum, ibd_timeout, CLASS_PORT_INFO, + ibmad_port)) { + IBWARN("classportinfo query failed on %s, %s port %d", + node_name, portid2str(portid), portnum); + summary.pma_query_failures++; + return -1; + } + + /* ClassPortInfo should be supported as part of libibmad */ + memcpy(&rc_cap_mask, pc + 2, sizeof(rc_cap_mask)); /* CapabilityMask */ + memcpy(&rc_cap_mask2, pc + 4, sizeof(rc_cap_mask2)); /* CapabilityMask2 */ + + *cap_mask = rc_cap_mask; + *cap_mask2 = ntohl(rc_cap_mask2) >> 5; + return 0; +} + +static int print_data_cnts(ib_portid_t * portid, __be16 cap_mask, + char *node_name, ibnd_node_t * node, int portnum, + int *header_printed) +{ + uint8_t pc[1024]; + int i; + int start_field = IB_PC_XMT_BYTES_F; + int end_field = IB_PC_RCV_PKTS_F; + + memset(pc, 0, 1024); + + portid->sl = lid2sl_table[portid->lid]; + + if (cap_mask & (IB_PM_EXT_WIDTH_SUPPORTED | IB_PM_EXT_WIDTH_NOIETF_SUP)) { + if (!pma_query_via(pc, portid, portnum, ibd_timeout, + IB_GSI_PORT_COUNTERS_EXT, ibmad_port)) { + IBWARN("IB_GSI_PORT_COUNTERS_EXT query failed on %s, %s port %d", + node_name, portid2str(portid), portnum); + summary.pma_query_failures++; + return (1); + } + start_field = IB_PC_EXT_XMT_BYTES_F; + if (cap_mask & IB_PM_EXT_WIDTH_SUPPORTED) + end_field = IB_PC_EXT_RCV_MPKTS_F; + else + end_field = IB_PC_EXT_RCV_PKTS_F; + } else { + if (!pma_query_via(pc, portid, portnum, ibd_timeout, + IB_GSI_PORT_COUNTERS, ibmad_port)) { + IBWARN("IB_GSI_PORT_COUNTERS query failed on %s, %s port %d", + node_name, portid2str(portid), portnum); + summary.pma_query_failures++; + return (1); + } + start_field = IB_PC_XMT_BYTES_F; + end_field = IB_PC_RCV_PKTS_F; + } + + if (!*header_printed) { + printf("Data Counters for 0x%" PRIx64 " \"%s\"\n", node->guid, + node_name); + *header_printed = 
1; + } + + if (portnum == 0xFF) + printf(" GUID 0x%" PRIx64 " port ALL:", node->guid); + else + printf(" GUID 0x%" PRIx64 " port %d:", + node->guid, portnum); + + for (i = start_field; i <= end_field; i++) { + uint64_t val64 = 0; + float val = 0; + const char *unit = ""; + int data = 0; + mad_decode_field(pc, i, (void *)&val64); + if (i == IB_PC_EXT_XMT_BYTES_F || i == IB_PC_EXT_RCV_BYTES_F || + i == IB_PC_XMT_BYTES_F || i == IB_PC_RCV_BYTES_F) + data = 1; + unit = conv_cnt_human_readable(val64, &val, data); + printf(" [%s == %" PRIu64 " (%5.3f%s)]", mad_field_name(i), + val64, val, unit); + } + printf("\n"); + + if (portnum != 0xFF && port_config) + print_port_config(node, portnum); + + return (0); +} + +static int print_errors(ib_portid_t * portid, __be16 cap_mask, uint32_t cap_mask2, + char *node_name, ibnd_node_t * node, int portnum, + int *header_printed) +{ + uint8_t pc[1024]; + uint8_t pce[1024]; + uint8_t *pc_ext = NULL; + + memset(pc, 0, 1024); + memset(pce, 0, 1024); + + portid->sl = lid2sl_table[portid->lid]; + + if (!pma_query_via(pc, portid, portnum, ibd_timeout, + IB_GSI_PORT_COUNTERS, ibmad_port)) { + IBWARN("IB_GSI_PORT_COUNTERS query failed on %s, %s port %d", + node_name, portid2str(portid), portnum); + summary.pma_query_failures++; + return (0); + } + + if (cap_mask & (IB_PM_EXT_WIDTH_SUPPORTED | IB_PM_EXT_WIDTH_NOIETF_SUP)) { + if (!pma_query_via(pce, portid, portnum, ibd_timeout, + IB_GSI_PORT_COUNTERS_EXT, ibmad_port)) { + IBWARN("IB_GSI_PORT_COUNTERS_EXT query failed on %s, %s port %d", + node_name, portid2str(portid), portnum); + summary.pma_query_failures++; + return (0); + } + pc_ext = pce; + } + + if (!(cap_mask & IB_PM_PC_XMIT_WAIT_SUP)) { + /* if PortCounters:PortXmitWait not supported clear this counter */ + uint32_t foo = 0; + mad_encode_field(pc, IB_PC_XMT_WAIT_F, &foo); + } + return (print_results(portid, node_name, node, pc, portnum, + header_printed, pc_ext, cap_mask, cap_mask2)); +} + +static uint8_t *reset_pc_ext(void *rcvbuf, ib_portid_t *dest, int port, + unsigned mask, unsigned timeout, + const struct ibmad_port *srcport) +{ + ib_rpc_t rpc = { 0 }; + int lid = dest->lid; + + DEBUG("lid %u port %d mask 0x%x", lid, port, mask); + + if (lid == -1) { + IBWARN("only lid routed is supported"); + return NULL; + } + + if (!mask) + mask = ~0; + + rpc.mgtclass = IB_PERFORMANCE_CLASS; + rpc.method = IB_MAD_METHOD_SET; + rpc.attr.id = IB_GSI_PORT_COUNTERS_EXT; + + memset(rcvbuf, 0, IB_MAD_SIZE); + + /* Same for attribute IDs */ + mad_set_field(rcvbuf, 0, IB_PC_EXT_PORT_SELECT_F, port); + mad_set_field(rcvbuf, 0, IB_PC_EXT_COUNTER_SELECT_F, mask); + mask = mask >> 16; + mad_set_field(rcvbuf, 0, IB_PC_EXT_COUNTER_SELECT2_F, mask); + rpc.attr.mod = 0; + rpc.timeout = timeout; + rpc.datasz = IB_PC_DATA_SZ; + rpc.dataoffs = IB_PC_DATA_OFFS; + if (!dest->qp) + dest->qp = 1; + if (!dest->qkey) + dest->qkey = IB_DEFAULT_QP1_QKEY; + + return mad_rpc(srcport, &rpc, dest, rcvbuf, rcvbuf); +} + +static void clear_port(ib_portid_t * portid, __be16 cap_mask, uint32_t cap_mask2, + char *node_name, int port) +{ + uint8_t pc[1024] = { 0 }; + /* bits defined in Table 228 PortCounters CounterSelect and + * CounterSelect2 + */ + uint32_t mask = 0; + + if (clear_errors) { + mask |= 0xFFF; + if (cap_mask & IB_PM_PC_XMIT_WAIT_SUP) + mask |= 0x10000; + } + if (clear_counts) + mask |= 0xF000; + + if (mask) + if (!performance_reset_via(pc, portid, port, mask, ibd_timeout, + IB_GSI_PORT_COUNTERS, ibmad_port)) + fprintf(stderr, "Failed to reset errors %s port %d\n", node_name, + port); + + 
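+	/* the detail attributes carry their own CounterSelect masks (0xf and 0x3f) */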
if (clear_errors && details) { + memset(pc, 0, 1024); + performance_reset_via(pc, portid, port, 0xf, ibd_timeout, + IB_GSI_PORT_XMIT_DISCARD_DETAILS, + ibmad_port); + memset(pc, 0, 1024); + performance_reset_via(pc, portid, port, 0x3f, ibd_timeout, + IB_GSI_PORT_RCV_ERROR_DETAILS, + ibmad_port); + } + + if (cap_mask & (IB_PM_EXT_WIDTH_SUPPORTED | IB_PM_EXT_WIDTH_NOIETF_SUP)) { + mask = 0; + if (clear_counts) { + if (cap_mask & IB_PM_EXT_WIDTH_SUPPORTED) + mask = 0xFF; + else + mask = 0x0F; + } + + if (clear_errors && (htonl(cap_mask2) & IB_PM_IS_ADDL_PORT_CTRS_EXT_SUP)) { + mask |= 0xfff0000; + if (cap_mask & IB_PM_PC_XMIT_WAIT_SUP) + mask |= (1 << 28); + } + + if (mask && !reset_pc_ext(pc, portid, port, mask, ibd_timeout, + ibmad_port)) + fprintf(stderr, "Failed to reset extended data counters %s, " + "%s port %d\n", node_name, portid2str(portid), + port); + } +} + +static void print_node(ibnd_node_t *node, void *user_data) +{ + int header_printed = 0; + int p = 0; + int startport = 1; + int type = 0; + int all_port_sup = 0; + ib_portid_t portid = { 0 }; + __be16 cap_mask = 0; + uint32_t cap_mask2 = 0; + char *node_name = NULL; + + switch (node->type) { + case IB_NODE_SWITCH: + type = PRINT_SWITCH; + break; + case IB_NODE_CA: + type = PRINT_CA; + break; + case IB_NODE_ROUTER: + type = PRINT_ROUTER; + break; + } + + if ((type & node_type_to_print) == 0) + return; + + if (node->type == IB_NODE_SWITCH && node->smaenhsp0) + startport = 0; + + node_name = remap_node_name(node_name_map, node->guid, node->nodedesc); + + if (node->type == IB_NODE_SWITCH) { + ib_portid_set(&portid, node->smalid, 0, 0); + p = 0; + } else { + for (p = 1; p <= node->numports; p++) { + if (node->ports[p]) { + ib_portid_set(&portid, + node->ports[p]->base_lid, + 0, 0); + break; + } + } + } + + if ((query_cap_mask(&portid, node_name, p, &cap_mask, &cap_mask2) == 0) && + (cap_mask & IB_PM_ALL_PORT_SELECT)) + all_port_sup = 1; + + if (data_counters_only) { + for (p = startport; p <= node->numports; p++) { + if (node->ports[p]) { + if (node->type == IB_NODE_SWITCH) + ib_portid_set(&portid, node->smalid, 0, 0); + else + ib_portid_set(&portid, node->ports[p]->base_lid, + 0, 0); + + print_data_cnts(&portid, cap_mask, node_name, node, p, + &header_printed); + summary.ports_checked++; + if (!all_port_sup) + clear_port(&portid, cap_mask, cap_mask2, node_name, p); + } + } + } else { + if (all_port_sup) + if (!print_errors(&portid, cap_mask, cap_mask2, node_name, node, + 0xFF, &header_printed)) { + summary.ports_checked += node->numports; + goto clear; + } + + for (p = startport; p <= node->numports; p++) { + if (node->ports[p]) { + if (node->type == IB_NODE_SWITCH) + ib_portid_set(&portid, node->smalid, 0, 0); + else + ib_portid_set(&portid, node->ports[p]->base_lid, + 0, 0); + + print_errors(&portid, cap_mask, cap_mask2, node_name, node, p, + &header_printed); + summary.ports_checked++; + if (!all_port_sup) + clear_port(&portid, cap_mask, cap_mask2, node_name, p); + } + } + } + +clear: + summary.nodes_checked++; + if (all_port_sup) + clear_port(&portid, cap_mask, cap_mask2, node_name, 0xFF); + + free(node_name); +} + +static void add_suppressed(enum MAD_FIELDS field) +{ + if (sup_total >= SUP_MAX) { + IBWARN("Maximum (%d) fields have been suppressed; skipping %s", + sup_total, mad_field_name(field)); + return; + } + suppressed_fields[sup_total++] = field; +} + +static void calculate_suppressed_fields(char *str) +{ + enum MAD_FIELDS f; + char *val, *lasts = NULL; + char *tmp = strdup(str); + + val = strtok_r(tmp, ",", &lasts); + 
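+	/* match each comma-separated token against the PMA counter field names */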
while (val) { + for (f = IB_PC_FIRST_F; f <= IB_PC_LAST_F; f++) + if (strcmp(val, mad_field_name(f)) == 0) + add_suppressed(f); + val = strtok_r(NULL, ",", &lasts); + } + + free(tmp); +} + +static int process_opt(void *context, int ch) +{ + struct ibnd_config *cfg = context; + switch (ch) { + case 's': + calculate_suppressed_fields(optarg); + break; + case 'c': + /* Right now this is the only "common" error */ + add_suppressed(IB_PC_ERR_SWITCH_REL_F); + break; + case 1: + node_name_map_file = strdup(optarg); + if (node_name_map_file == NULL) + IBEXIT("out of memory, strdup for node_name_map_file name failed"); + break; + case 2: + data_counters++; + break; + case 3: + node_type_to_print |= PRINT_SWITCH; + break; + case 4: + node_type_to_print |= PRINT_CA; + break; + case 5: + node_type_to_print |= PRINT_ROUTER; + break; + case 6: + details = 1; + break; + case 7: + load_cache_file = strdup(optarg); + break; + case 8: + threshold_file = strdup(optarg); + break; + case 9: + data_counters_only = 1; + break; + case 10: + obtain_sl = 0; + break; + case 'G': + case 'S': + port_guid_str = optarg; + port_guid = strtoull(optarg, NULL, 0); + break; + case 'D': + dr_path = strdup(optarg); + break; + case 'r': + port_config++; + break; + case 'R': /* nop */ + break; + case 'k': + clear_errors = 1; + break; + case 'K': + clear_counts = 1; + break; + case 'o': + cfg->max_smps = strtoul(optarg, NULL, 0); + break; + default: + return -1; + } + + return 0; +} + +int main(int argc, char **argv) +{ + struct ibnd_config config = { 0 }; + int resolved = -1; + ib_portid_t portid = { 0 }; + ib_portid_t self_portid = { 0 }; + int rc = 0; + ibnd_fabric_t *fabric = NULL; + ib_gid_t self_gid; + int port = 0; + + int mgmt_classes[4] = { IB_SMI_CLASS, IB_SMI_DIRECT_CLASS, IB_SA_CLASS, + IB_PERFORMANCE_CLASS + }; + + const struct ibdiag_opt opts[] = { + {"suppress", 's', 1, "<err1,err2,...>", + "suppress errors listed"}, + {"suppress-common", 'c', 0, NULL, + "suppress some of the common counters"}, + {"node-name-map", 1, 1, "<file>", "node name map file"}, + {"port-guid", 'G', 1, "<port_guid>", + "report the node containing the port specified by <port_guid>"}, + {"", 'S', 1, "<port_guid>", + "Same as \"-G\" for backward compatibility"}, + {"Direct", 'D', 1, "<dr_path>", + "report the node containing the port specified by <dr_path>"}, + {"skip-sl", 10, 0, NULL,"don't obtain SL to all destinations"}, + {"report-port", 'r', 0, NULL, + "report port link information"}, + {"threshold-file", 8, 1, NULL, + "specify an alternate threshold file, default: " DEF_THRES_FILE}, + {"GNDN", 'R', 0, NULL, + "(This option is obsolete and does nothing)"}, + {"data", 2, 0, NULL, "include data counters for ports with errors"}, + {"switch", 3, 0, NULL, "print data for switches only"}, + {"ca", 4, 0, NULL, "print data for CA's only"}, + {"router", 5, 0, NULL, "print data for routers only"}, + {"details", 6, 0, NULL, "include transmit discard details"}, + {"counters", 9, 0, NULL, "print data counters only"}, + {"clear-errors", 'k', 0, NULL, + "Clear error counters after read"}, + {"clear-counts", 'K', 0, NULL, + "Clear data counters after read"}, + {"load-cache", 7, 1, "<file>", + "filename of ibnetdiscover cache to load"}, + {"outstanding_smps", 'o', 1, NULL, + "specify the number of outstanding SMP's which should be " + "issued during the scan"}, + {} + }; + char usage_args[] = ""; + + memset(suppressed_fields, 0, sizeof suppressed_fields); + ibdiag_process_opts(argc, argv, &config, "cDGKLnRrSs", opts, process_opt, + usage_args, NULL); + + 
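+	/* skip the options already consumed by ibdiag_process_opts() */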
argc -= optind; + argv += optind; + + if (!node_type_to_print) + node_type_to_print = PRINT_ALL; + + ibmad_port = mad_rpc_open_port(ibd_ca, ibd_ca_port, mgmt_classes, 4); + if (!ibmad_port) + IBEXIT("Failed to open port; %s:%d\n", ibd_ca, ibd_ca_port); + + smp_mkey_set(ibmad_port, ibd_mkey); + + if (ibd_timeout) { + mad_rpc_set_timeout(ibmad_port, ibd_timeout); + config.timeout_ms = ibd_timeout; + } + + config.flags = ibd_ibnetdisc_flags; + config.mkey = ibd_mkey; + + if (dr_path && load_cache_file) { + mad_rpc_close_port(ibmad_port); + fprintf(stderr, "Cannot specify cache and direct route path\n"); + exit(-1); + } + + if (resolve_self(ibd_ca, ibd_ca_port, &self_portid, &port, &self_gid.raw) < 0) { + mad_rpc_close_port(ibmad_port); + IBEXIT("can't resolve self port %s", argv[0]); + } + + node_name_map = open_node_name_map(node_name_map_file); + + /* limit the scan the fabric around the target */ + if (dr_path) { + if ((resolved = + resolve_portid_str(ibd_ca, ibd_ca_port, &portid, dr_path, + IB_DEST_DRPATH, NULL, ibmad_port)) < 0) + IBWARN("Failed to resolve %s; attempting full scan", + dr_path); + } else if (port_guid_str) { + if ((resolved = + resolve_portid_str(ibd_ca, ibd_ca_port, &portid, + port_guid_str, IB_DEST_GUID, ibd_sm_id, + ibmad_port)) < 0) + IBWARN("Failed to resolve %s; attempting full scan", + port_guid_str); + if(obtain_sl) + lid2sl_table[portid.lid] = portid.sl; + } + + mad_rpc_close_port(ibmad_port); + + if (load_cache_file) { + if ((fabric = ibnd_load_fabric(load_cache_file, 0)) == NULL) { + fprintf(stderr, "loading cached fabric failed\n"); + rc = -1; + goto close_name_map; + } + } else { + if (resolved >= 0) { + if (!config.max_hops) + config.max_hops = 1; + if (!(fabric = ibnd_discover_fabric(ibd_ca, ibd_ca_port, + &portid, &config))) + IBWARN("Single node discover failed;" + " attempting full scan"); + } + + if (!fabric && !(fabric = ibnd_discover_fabric(ibd_ca, + ibd_ca_port, + NULL, + &config))) { + fprintf(stderr, "discover failed\n"); + rc = -1; + goto close_name_map; + } + } + + set_thresholds(); + + /* reopen the global ibmad_port */ + ibmad_port = mad_rpc_open_port(ibd_ca, ibd_ca_port, + mgmt_classes, 4); + if (!ibmad_port) { + ibnd_destroy_fabric(fabric); + close_node_name_map(node_name_map); + IBEXIT("Failed to reopen port: %s:%d\n", + ibd_ca, ibd_ca_port); + } + + smp_mkey_set(ibmad_port, ibd_mkey); + + if (ibd_timeout) + mad_rpc_set_timeout(ibmad_port, ibd_timeout); + + if (port_guid_str) { + ibnd_port_t *ndport = ibnd_find_port_guid(fabric, port_guid); + if (ndport) + print_node(ndport->node, NULL); + else + fprintf(stderr, "Failed to find node: %s\n", + port_guid_str); + } else if (dr_path) { + ibnd_port_t *ndport; + + uint8_t ni[IB_SMP_DATA_SIZE] = { 0 }; + if (!smp_query_via(ni, &portid, IB_ATTR_NODE_INFO, 0, + ibd_timeout, ibmad_port)) { + fprintf(stderr, "Failed to query local Node Info\n"); + goto close_port; + } + + mad_decode_field(ni, IB_NODE_PORT_GUID_F, &(port_guid)); + + ndport = ibnd_find_port_guid(fabric, port_guid); + if (ndport) { + if(obtain_sl) + if(path_record_query(self_gid,ndport->guid)) + goto close_port; + print_node(ndport->node, NULL); + } else + fprintf(stderr, "Failed to find node: %s\n", dr_path); + } else { + if(obtain_sl) + if(path_record_query(self_gid,0)) + goto close_port; + + ibnd_iter_nodes(fabric, print_node, NULL); + } + + rc = print_summary(); + if (rc) + rc = 1; + +close_port: + mad_rpc_close_port(ibmad_port); + ibnd_destroy_fabric(fabric); + +close_name_map: + close_node_name_map(node_name_map); + exit(rc); +} 
diff --git a/infiniband-diags/ibroute.c b/infiniband-diags/ibroute.c new file mode 100644 index 0000000..207b3c3 --- /dev/null +++ b/infiniband-diags/ibroute.c @@ -0,0 +1,489 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * Copyright (c) 2009-2011 Mellanox Technologies LTD. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <inttypes.h> +#include <netinet/in.h> + +#include <infiniband/umad.h> +#include <infiniband/mad.h> +#include <util/node_name_map.h> + +#include "ibdiag_common.h" + +static struct ibmad_port *srcport; + +static int brief, dump_all, multicast; + +static char *node_name_map_file = NULL; +static nn_map_t *node_name_map = NULL; + +/*******************************************/ + +static const char *check_switch(ib_portid_t *portid, unsigned int *nports, + uint64_t *guid, uint8_t *sw, char *nd) +{ + uint8_t ni[IB_SMP_DATA_SIZE] = { 0 }; + int type; + + DEBUG("checking node type"); + if (!smp_query_via(ni, portid, IB_ATTR_NODE_INFO, 0, 0, srcport)) { + xdump(stderr, "nodeinfo\n", ni, sizeof ni); + return "node info failed: valid addr?"; + } + + if (!smp_query_via(nd, portid, IB_ATTR_NODE_DESC, 0, 0, srcport)) + return "node desc failed"; + + mad_decode_field(ni, IB_NODE_TYPE_F, &type); + if (type != IB_NODE_SWITCH) + return "not a switch"; + + DEBUG("Gathering information about switch"); + mad_decode_field(ni, IB_NODE_NPORTS_F, nports); + mad_decode_field(ni, IB_NODE_GUID_F, guid); + + if (!smp_query_via(sw, portid, IB_ATTR_SWITCH_INFO, 0, 0, srcport)) + return "switch info failed: is a switch node?"; + + return NULL; +} + +#define IB_MLIDS_IN_BLOCK (IB_SMP_DATA_SIZE/2) + +static int dump_mlid(char *str, int strlen, unsigned mlid, unsigned nports, + __be16 mft[16][IB_MLIDS_IN_BLOCK]) +{ + uint16_t mask; + unsigned i, chunk, bit, nonzero = 0; + + if (brief) { + int n = 0; + unsigned chunks = ALIGN(nports + 1, 16) / 16; + for (i = 0; i < chunks; i++) { + mask = ntohs(mft[i][mlid % IB_MLIDS_IN_BLOCK]); + if (mask) + nonzero++; + n += snprintf(str + n, strlen - n, "%04hx", mask); + if (n >= strlen) { + n = strlen; + break; + } + } + if (!nonzero && !dump_all) { + str[0] = 0; + 
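+			/* no ports subscribed to this mlid and -a not given: print nothing */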
return 0; + } + return n; + } + for (i = 0; i <= nports; i++) { + chunk = i / 16; + bit = i % 16; + + mask = ntohs(mft[chunk][mlid % IB_MLIDS_IN_BLOCK]); + if (mask) + nonzero++; + str[i * 2] = (mask & (1 << bit)) ? 'x' : ' '; + str[i * 2 + 1] = ' '; + } + if (!nonzero && !dump_all) { + str[0] = 0; + return 0; + } + str[i * 2] = 0; + return i * 2; +} + +static __be16 mft[16][IB_MLIDS_IN_BLOCK]; + +static const char *dump_multicast_tables(ib_portid_t *portid, unsigned startlid, + unsigned endlid) +{ + char nd[IB_SMP_DATA_SIZE] = { 0 }; + uint8_t sw[IB_SMP_DATA_SIZE] = { 0 }; + char str[512], *s; + const char *err; + uint64_t nodeguid; + uint32_t mod; + unsigned block, i, j, e, nports, cap, chunks, startblock, lastblock, + top; + char *mapnd = NULL; + int n = 0; + + if ((err = check_switch(portid, &nports, &nodeguid, sw, nd))) + return err; + + mad_decode_field(sw, IB_SW_MCAST_FDB_CAP_F, &cap); + mad_decode_field(sw, IB_SW_MCAST_FDB_TOP_F, &top); + + if (!endlid || endlid > IB_MIN_MCAST_LID + cap - 1) + endlid = IB_MIN_MCAST_LID + cap - 1; + if (!dump_all && top && top < endlid) { + if (top < IB_MIN_MCAST_LID - 1) + IBWARN("illegal top mlid %x", top); + else + endlid = top; + } + + if (!startlid) + startlid = IB_MIN_MCAST_LID; + else if (startlid < IB_MIN_MCAST_LID) { + IBWARN("illegal start mlid %x, set to %x", startlid, + IB_MIN_MCAST_LID); + startlid = IB_MIN_MCAST_LID; + } + + if (endlid > IB_MAX_MCAST_LID) { + IBWARN("illegal end mlid %x, truncate to %x", endlid, + IB_MAX_MCAST_LID); + endlid = IB_MAX_MCAST_LID; + } + + mapnd = remap_node_name(node_name_map, nodeguid, nd); + + printf("Multicast mlids [0x%x-0x%x] of switch %s guid 0x%016" PRIx64 + " (%s):\n", startlid, endlid, portid2str(portid), nodeguid, + mapnd); + + if (brief) + printf(" MLid Port Mask\n"); + else { + if (nports > 9) { + for (i = 0, s = str; i <= nports; i++) { + *s++ = (i % 10) ? ' ' : '0' + i / 10; + *s++ = ' '; + } + *s = 0; + printf(" %s\n", str); + } + for (i = 0, s = str; i <= nports; i++) + s += sprintf(s, "%d ", i % 10); + printf(" Ports: %s\n", str); + printf(" MLid\n"); + } + if (ibverbose) + printf("Switch multicast mlid capability is %d top is 0x%x\n", + cap, top); + + chunks = ALIGN(nports + 1, 16) / 16; + + startblock = startlid / IB_MLIDS_IN_BLOCK; + lastblock = endlid / IB_MLIDS_IN_BLOCK; + for (block = startblock; block <= lastblock; block++) { + for (j = 0; j < chunks; j++) { + int status; + mod = (block - IB_MIN_MCAST_LID / IB_MLIDS_IN_BLOCK) + | (j << 28); + + DEBUG("reading block %x chunk %d mod %x", block, j, + mod); + if (!smp_query_status_via + (mft + j, portid, IB_ATTR_MULTICASTFORWTBL, mod, 0, + &status, srcport)) { + fprintf(stderr, "SubnGet() failed" + "; MAD status 0x%x AM 0x%x\n", + status, mod); + free(mapnd); + return NULL; + } + } + + i = block * IB_MLIDS_IN_BLOCK; + e = i + IB_MLIDS_IN_BLOCK; + if (i < startlid) + i = startlid; + if (e > endlid + 1) + e = endlid + 1; + + for (; i < e; i++) { + if (dump_mlid(str, sizeof str, i, nports, mft) == 0) + continue; + printf("0x%04x %s\n", i, str); + n++; + } + } + + printf("%d %smlids dumped \n", n, dump_all ? 
"" : "valid "); + + free(mapnd); + return NULL; +} + +static int dump_lid(char *str, int strlen, int lid, int valid) +{ + char nd[IB_SMP_DATA_SIZE] = { 0 }; + uint8_t ni[IB_SMP_DATA_SIZE] = { 0 }; + uint8_t pi[IB_SMP_DATA_SIZE] = { 0 }; + ib_portid_t lidport = { 0 }; + static int last_port_lid, base_port_lid; + char ntype[50], sguid[30]; + static uint64_t portguid; + uint64_t nodeguid; + int baselid, lmc, type; + char *mapnd = NULL; + int rc; + + if (brief) { + str[0] = 0; + return 0; + } + + if (lid <= last_port_lid) { + if (!valid) + return snprintf(str, strlen, + ": (path #%d - illegal port)", + lid - base_port_lid); + else if (!portguid) + return snprintf(str, strlen, + ": (path #%d out of %d)", + lid - base_port_lid + 1, + last_port_lid - base_port_lid + 1); + else { + return snprintf(str, strlen, + ": (path #%d out of %d: portguid %s)", + lid - base_port_lid + 1, + last_port_lid - base_port_lid + 1, + mad_dump_val(IB_NODE_PORT_GUID_F, sguid, + sizeof sguid, &portguid)); + } + } + + if (!valid) + return snprintf(str, strlen, ": (illegal port)"); + + portguid = 0; + lidport.lid = lid; + + if (!smp_query_via(nd, &lidport, IB_ATTR_NODE_DESC, 0, 100, srcport) || + !smp_query_via(pi, &lidport, IB_ATTR_PORT_INFO, 0, 100, srcport) || + !smp_query_via(ni, &lidport, IB_ATTR_NODE_INFO, 0, 100, srcport)) + return snprintf(str, strlen, ": (unknown node and type)"); + + mad_decode_field(ni, IB_NODE_GUID_F, &nodeguid); + mad_decode_field(ni, IB_NODE_PORT_GUID_F, &portguid); + mad_decode_field(ni, IB_NODE_TYPE_F, &type); + + mad_decode_field(pi, IB_PORT_LID_F, &baselid); + mad_decode_field(pi, IB_PORT_LMC_F, &lmc); + + if (lmc > 0) { + base_port_lid = baselid; + last_port_lid = baselid + (1 << lmc) - 1; + } + + mapnd = remap_node_name(node_name_map, nodeguid, nd); + + rc = snprintf(str, strlen, ": (%s portguid %s: '%s')", + mad_dump_val(IB_NODE_TYPE_F, ntype, sizeof ntype, + &type), mad_dump_val(IB_NODE_PORT_GUID_F, + sguid, sizeof sguid, + &portguid), + mapnd); + + free(mapnd); + return rc; +} + +static const char *dump_unicast_tables(ib_portid_t *portid, int startlid, + int endlid) +{ + char lft[IB_SMP_DATA_SIZE] = { 0 }; + char nd[IB_SMP_DATA_SIZE] = { 0 }; + uint8_t sw[IB_SMP_DATA_SIZE] = { 0 }; + char str[200]; + const char *s; + uint64_t nodeguid; + int block, i, e, top; + unsigned nports; + int n = 0, startblock, endblock; + char *mapnd = NULL; + + if ((s = check_switch(portid, &nports, &nodeguid, sw, nd))) + return s; + + mad_decode_field(sw, IB_SW_LINEAR_FDB_TOP_F, &top); + + if (!endlid || endlid > top) + endlid = top; + + if (endlid > IB_MAX_UCAST_LID) { + IBWARN("illegal lft top %d, truncate to %d", endlid, + IB_MAX_UCAST_LID); + endlid = IB_MAX_UCAST_LID; + } + + mapnd = remap_node_name(node_name_map, nodeguid, nd); + + printf("Unicast lids [0x%x-0x%x] of switch %s guid 0x%016" PRIx64 + " (%s):\n", startlid, endlid, portid2str(portid), nodeguid, + mapnd); + + DEBUG("Switch top is 0x%x\n", top); + + printf(" Lid Out Destination\n"); + printf(" Port Info \n"); + startblock = startlid / IB_SMP_DATA_SIZE; + endblock = ALIGN(endlid, IB_SMP_DATA_SIZE) / IB_SMP_DATA_SIZE; + for (block = startblock; block < endblock; block++) { + int status; + DEBUG("reading block %d", block); + if (!smp_query_status_via(lft, portid, IB_ATTR_LINEARFORWTBL, block, + 0, &status, srcport)) { + fprintf(stderr, "SubnGet() failed" + "; MAD status 0x%x AM 0x%x\n", + status, block); + free(mapnd); + return NULL; + } + i = block * IB_SMP_DATA_SIZE; + e = i + IB_SMP_DATA_SIZE; + if (i < startlid) + i = startlid; + if (e 
> endlid + 1) + e = endlid + 1; + + for (; i < e; i++) { + unsigned outport = lft[i % IB_SMP_DATA_SIZE]; + unsigned valid = (outport <= nports); + + if (!valid && !dump_all) + continue; + dump_lid(str, sizeof str, i, valid); + printf("0x%04x %03u %s\n", i, outport & 0xff, str); + n++; + } + } + + printf("%d %slids dumped \n", n, dump_all ? "" : "valid "); + free(mapnd); + return NULL; +} + +static int process_opt(void *context, int ch) +{ + switch (ch) { + case 'a': + dump_all++; + break; + case 'M': + multicast++; + break; + case 'n': + brief++; + break; + case 1: + node_name_map_file = strdup(optarg); + if (node_name_map_file == NULL) + IBEXIT("out of memory, strdup for node_name_map_file name failed"); + break; + default: + return -1; + } + return 0; +} + +int main(int argc, char **argv) +{ + int mgmt_classes[3] = + { IB_SMI_CLASS, IB_SMI_DIRECT_CLASS, IB_SA_CLASS }; + ib_portid_t portid = { 0 }; + unsigned startlid = 0, endlid = 0; + const char *err; + + const struct ibdiag_opt opts[] = { + {"all", 'a', 0, NULL, "show all lids, even invalid entries"}, + {"no_dests", 'n', 0, NULL, + "do not try to resolve destinations"}, + {"Multicast", 'M', 0, NULL, "show multicast forwarding tables"}, + {"node-name-map", 1, 1, "<file>", "node name map file"}, + {} + }; + char usage_args[] = "[<dest dr_path|lid|guid> [<startlid> [<endlid>]]]"; + const char *usage_examples[] = { + " -- Unicast examples:", + "4\t# dump all lids with valid out ports of switch with lid 4", + "-a 4\t# same, but dump all lids, even with invalid out ports", + "-n 4\t# simple dump format - no destination resolving", + "4 10\t# dump lids starting from 10", + "4 0x10 0x20\t# dump lid range", + "-G 0x08f1040023\t# resolve switch by GUID", + "-D 0,1\t# resolve switch by direct path", + " -- Multicast examples:", + "-M 4\t# dump all non empty mlids of switch with lid 4", + "-M 4 0xc010 0xc020\t# same, but with range", + "-M -n 4\t# simple dump format", + NULL, + }; + + ibdiag_process_opts(argc, argv, NULL, "K", opts, process_opt, + usage_args, usage_examples); + + argc -= optind; + argv += optind; + + if (!argc) + ibdiag_show_usage(); + + if (argc > 1) + startlid = strtoul(argv[1], NULL, 0); + if (argc > 2) + endlid = strtoul(argv[2], NULL, 0); + + node_name_map = open_node_name_map(node_name_map_file); + + srcport = mad_rpc_open_port(ibd_ca, ibd_ca_port, mgmt_classes, 3); + if (!srcport) + IBEXIT("Failed to open '%s' port '%d'", ibd_ca, ibd_ca_port); + + smp_mkey_set(srcport, ibd_mkey); + + if (resolve_portid_str(ibd_ca, ibd_ca_port, &portid, argv[0], + ibd_dest_type, ibd_sm_id, srcport) < 0) + IBEXIT("can't resolve destination port %s", argv[0]); + + if (multicast) + err = dump_multicast_tables(&portid, startlid, endlid); + else + err = dump_unicast_tables(&portid, startlid, endlid); + + if (err) + IBEXIT("dump tables: %s", err); + + mad_rpc_close_port(srcport); + close_node_name_map(node_name_map); + exit(0); +} diff --git a/infiniband-diags/ibsendtrap.c b/infiniband-diags/ibsendtrap.c new file mode 100644 index 0000000..4c6bbd4 --- /dev/null +++ b/infiniband-diags/ibsendtrap.c @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2008 Lawrence Livermore National Security + * Copyright (c) 2008-2009 Voltaire Inc. All rights reserved. + * Copyright (c) 2009 HNR Consulting. All rights reserved. + * Copyright (c) 2011 Mellanox Technologies LTD. All rights reserved. + * + * Produced at Lawrence Livermore National Laboratory. + * Written by Ira Weiny <weiny2@llnl.gov>. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#include <string.h> + +#define _GNU_SOURCE + +#include <infiniband/mad.h> + +#include "ibdiag_common.h" + +static struct ibmad_port *srcport; +/* for local link integrity */ +static int error_port = 1; + +static uint16_t get_node_type(ib_portid_t * port) +{ + uint16_t node_type = IB_NODE_TYPE_CA; + uint8_t data[IB_SMP_DATA_SIZE] = { 0 }; + + if (smp_query_via(data, port, IB_ATTR_NODE_INFO, 0, 0, srcport)) + node_type = (uint16_t) mad_get_field(data, 0, IB_NODE_TYPE_F); + return node_type; +} + +static uint32_t get_cap_mask(ib_portid_t * port) +{ + uint8_t data[IB_SMP_DATA_SIZE] = { 0 }; + uint32_t cap_mask = 0; + + if (smp_query_via(data, port, IB_ATTR_PORT_INFO, 0, 0, srcport)) + cap_mask = (uint32_t) mad_get_field(data, 0, IB_PORT_CAPMASK_F); + return cap_mask; +} + +static void build_trap145(ib_mad_notice_attr_t * n, ib_portid_t * port) +{ + n->generic_type = 0x80 | IB_NOTICE_TYPE_INFO; + n->g_or_v.generic.prod_type_lsb = htobe16(get_node_type(port)); + n->g_or_v.generic.trap_num = htobe16(145); + n->issuer_lid = htobe16((uint16_t) port->lid); + n->data_details.ntc_145.new_sys_guid = htobe64(0x1234567812345678ULL); +} + +static void build_trap144_local(ib_mad_notice_attr_t * n, ib_portid_t * port) +{ + n->generic_type = 0x80 | IB_NOTICE_TYPE_INFO; + n->g_or_v.generic.prod_type_lsb = htobe16(get_node_type(port)); + n->g_or_v.generic.trap_num = htobe16(144); + n->issuer_lid = htobe16((uint16_t) port->lid); + n->data_details.ntc_144.lid = n->issuer_lid; + n->data_details.ntc_144.new_cap_mask = htobe32(get_cap_mask(port)); + n->data_details.ntc_144.local_changes = + TRAP_144_MASK_OTHER_LOCAL_CHANGES; +} + +static void build_trap144_nodedesc(ib_mad_notice_attr_t * n, ib_portid_t * port) +{ + build_trap144_local(n, port); + n->data_details.ntc_144.change_flgs = + TRAP_144_MASK_NODE_DESCRIPTION_CHANGE; +} + +static void build_trap144_linkspeed(ib_mad_notice_attr_t * n, + ib_portid_t * port) +{ + build_trap144_local(n, port); + n->data_details.ntc_144.change_flgs = + TRAP_144_MASK_LINK_SPEED_ENABLE_CHANGE; +} + +static void build_trap129(ib_mad_notice_attr_t * n, ib_portid_t * port) +{ + n->generic_type = 0x80 | 
IB_NOTICE_TYPE_URGENT; + n->g_or_v.generic.prod_type_lsb = htobe16(get_node_type(port)); + n->g_or_v.generic.trap_num = htobe16(129); + n->issuer_lid = htobe16((uint16_t) port->lid); + n->data_details.ntc_129_131.lid = n->issuer_lid; + n->data_details.ntc_129_131.pad = 0; + n->data_details.ntc_129_131.port_num = (uint8_t) error_port; +} + +static void build_trap256_local(ib_mad_notice_attr_t * n, ib_portid_t * port) +{ + n->generic_type = 0x80 | IB_NOTICE_TYPE_SECURITY; + n->g_or_v.generic.prod_type_lsb = htobe16(get_node_type(port)); + n->g_or_v.generic.trap_num = htobe16(256); + n->issuer_lid = htobe16((uint16_t) port->lid); + n->data_details.ntc_256.lid = n->issuer_lid; + n->data_details.ntc_256.dr_slid = htobe16(0xffff); + n->data_details.ntc_256.method = 1; + n->data_details.ntc_256.attr_id = htobe16(0x15); + n->data_details.ntc_256.attr_mod = htobe32(0x12); + n->data_details.ntc_256.mkey = htobe64(0x1234567812345678ULL); +} + +static void build_trap256_lid(ib_mad_notice_attr_t * n, ib_portid_t * port) +{ + build_trap256_local(n, port); + n->data_details.ntc_256.dr_trunc_hop = 0; +} + +static void build_trap256_dr(ib_mad_notice_attr_t * n, ib_portid_t * port) +{ + build_trap256_local(n, port); + n->data_details.ntc_256.dr_trunc_hop = 0x80 | 0x4; + n->data_details.ntc_256.dr_rtn_path[0] = 5; + n->data_details.ntc_256.dr_rtn_path[1] = 6; + n->data_details.ntc_256.dr_rtn_path[2] = 7; + n->data_details.ntc_256.dr_rtn_path[3] = 8; +} + +static void build_trap257_258(ib_mad_notice_attr_t * n, ib_portid_t * port, + uint16_t trap_num) +{ + n->generic_type = 0x80 | IB_NOTICE_TYPE_SECURITY; + n->g_or_v.generic.prod_type_lsb = htobe16(get_node_type(port)); + n->g_or_v.generic.trap_num = htobe16(trap_num); + n->issuer_lid = htobe16((uint16_t) port->lid); + n->data_details.ntc_257_258.lid1 = htobe16(1); + n->data_details.ntc_257_258.lid2 = htobe16(2); + n->data_details.ntc_257_258.key = htobe32(0x12345678); + n->data_details.ntc_257_258.qp1 = htobe32(0x010101); + n->data_details.ntc_257_258.qp2 = htobe32(0x020202); + n->data_details.ntc_257_258.gid1.unicast.prefix = htobe64(0xf8c0000000000001ULL); + n->data_details.ntc_257_258.gid1.unicast.interface_id = htobe64(0x1111222233334444ULL); + n->data_details.ntc_257_258.gid2.unicast.prefix = htobe64(0xf8c0000000000001ULL); + n->data_details.ntc_257_258.gid2.unicast.interface_id = htobe64(0x5678567812341234ULL); +} + +static void build_trap257(ib_mad_notice_attr_t * n, ib_portid_t * port) +{ + build_trap257_258(n, port, 257); +} + +static void build_trap258(ib_mad_notice_attr_t * n, ib_portid_t * port) +{ + build_trap257_258(n, port, 258); +} + +static int send_trap(void (*build) (ib_mad_notice_attr_t *, ib_portid_t *)) +{ + ib_portid_t sm_port; + ib_portid_t selfportid; + int selfport; + ib_rpc_t trap_rpc; + ib_mad_notice_attr_t notice; + + if (resolve_self(ibd_ca, ibd_ca_port, &selfportid, &selfport, NULL)) + IBEXIT("can't resolve self"); + + if (resolve_sm_portid(ibd_ca, ibd_ca_port, &sm_port)) + IBEXIT("can't resolve SM destination port"); + + memset(&trap_rpc, 0, sizeof(trap_rpc)); + trap_rpc.mgtclass = IB_SMI_CLASS; + trap_rpc.method = IB_MAD_METHOD_TRAP; + trap_rpc.trid = mad_trid(); + trap_rpc.attr.id = NOTICE; + trap_rpc.datasz = IB_SMP_DATA_SIZE; + trap_rpc.dataoffs = IB_SMP_DATA_OFFS; + + memset(¬ice, 0, sizeof(notice)); + build(¬ice, &selfportid); + + return mad_send_via(&trap_rpc, &sm_port, NULL, ¬ice, srcport); +} + +typedef struct _trap_def { + const char *trap_name; + void (*build_func) (ib_mad_notice_attr_t *, ib_portid_t *); +} 
trap_def_t; + +static const trap_def_t traps[] = { + {"node_desc_change", build_trap144_nodedesc}, + {"link_speed_enabled_change", build_trap144_linkspeed}, + {"local_link_integrity", build_trap129}, + {"sys_image_guid_change", build_trap145}, + {"mkey_lid", build_trap256_lid}, + {"mkey_dr", build_trap256_dr}, + {"pkey", build_trap257}, + {"qkey", build_trap258}, + {NULL, NULL} +}; + +static int process_send_trap(const char *trap_name) +{ + int i; + + for (i = 0; traps[i].trap_name; i++) + if (strcmp(traps[i].trap_name, trap_name) == 0) + return send_trap(traps[i].build_func); + ibdiag_show_usage(); + return 1; +} + +int main(int argc, char **argv) +{ + char usage_args[1024]; + int mgmt_classes[2] = { IB_SMI_CLASS, IB_SMI_DIRECT_CLASS }; + const char *trap_name = NULL; + int i, n, rc; + + n = sprintf(usage_args, "[<trap_name>] [<error_port>]\n" + "\nArgument <trap_name> can be one of the following:\n"); + for (i = 0; traps[i].trap_name; i++) { + n += snprintf(usage_args + n, sizeof(usage_args) - n, + " %s\n", traps[i].trap_name); + if (n >= sizeof(usage_args)) + exit(-1); + } + snprintf(usage_args + n, sizeof(usage_args) - n, + "\n default behavior is to send \"%s\"", traps[0].trap_name); + + ibdiag_process_opts(argc, argv, NULL, "DGKL", NULL, NULL, + usage_args, NULL); + + argc -= optind; + argv += optind; + + trap_name = argv[0] ? argv[0] : traps[0].trap_name; + + if (argc > 1) + error_port = atoi(argv[1]); + + madrpc_show_errors(1); + + srcport = mad_rpc_open_port(ibd_ca, ibd_ca_port, mgmt_classes, 2); + if (!srcport) + IBEXIT("Failed to open '%s' port '%d'", ibd_ca, ibd_ca_port); + + smp_mkey_set(srcport, ibd_mkey); + + rc = process_send_trap(trap_name); + mad_rpc_close_port(srcport); + return rc; +} diff --git a/infiniband-diags/ibstat.c b/infiniband-diags/ibstat.c new file mode 100644 index 0000000..47b918f --- /dev/null +++ b/infiniband-diags/ibstat.c @@ -0,0 +1,342 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * Copyright (c) 2011 Mellanox Technologies LTD. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#define _GNU_SOURCE + +#include <inttypes.h> +#include <string.h> +#include <stdio.h> +#include <stdlib.h> +#include <netinet/in.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <errno.h> +#include <linux/types.h> /* __be64 */ + +#include <infiniband/umad.h> + +#include <ibdiag_common.h> + +static const char * const node_type_str[] = { + "???", + "CA", + "Switch", + "Router", + "iWARP RNIC" +}; + +static void ca_dump(umad_ca_t * ca) +{ + if (!ca->node_type) + return; + printf("%s '%s'\n", + ((unsigned)ca->node_type <= + IB_NODE_MAX ? node_type_str[ca->node_type] : "???"), + ca->ca_name); + printf("\t%s type: %s\n", + ((unsigned)ca->node_type <= + IB_NODE_MAX ? node_type_str[ca->node_type] : "???"), + ca->ca_type); + printf("\tNumber of ports: %d\n", ca->numports); + printf("\tFirmware version: %s\n", ca->fw_ver); + printf("\tHardware version: %s\n", ca->hw_ver); + printf("\tNode GUID: 0x%016" PRIx64 "\n", be64toh(ca->node_guid)); + printf("\tSystem image GUID: 0x%016" PRIx64 "\n", + be64toh(ca->system_guid)); +} + +static const char * const port_state_str[] = { + "???", + "Down", + "Initializing", + "Armed", + "Active" +}; + +static const char * const port_phy_state_str[] = { + "No state change", + "Sleep", + "Polling", + "Disabled", + "PortConfigurationTraining", + "LinkUp", + "LinkErrorRecovery", + "PhyTest" +}; + +static int ret_code(void) +{ + int e = errno; + + if (e > 0) + return -e; + return e; +} + +static int sys_read_string(const char *dir_name, const char *file_name, + char *str, int max_len) +{ + char path[256], *s; + int fd, r; + + r = snprintf(path, sizeof(path), "%s/%s", dir_name, file_name); + if (r > sizeof(path)) + return -ENOENT; + + if ((fd = open(path, O_RDONLY)) < 0) + return ret_code(); + + if ((r = read(fd, str, max_len)) < 0) { + int e = errno; + close(fd); + errno = e; + return ret_code(); + } + + str[(r < max_len) ? r : max_len - 1] = 0; + + if ((s = strrchr(str, '\n'))) + *s = 0; + + close(fd); + return 0; +} + +static int is_fdr10(umad_port_t *port) +{ + char port_dir[256]; + char rate[32]; + int len, fdr10 = 0; + char *p; + + len = snprintf(port_dir, sizeof(port_dir), "%s/%s/%s/%d", + SYS_INFINIBAND, port->ca_name, SYS_CA_PORTS_DIR, + port->portnum); + if (len < 0 || len > sizeof(port_dir)) + goto done; + + if (sys_read_string(port_dir, SYS_PORT_RATE, rate, sizeof(rate)) == 0) { + if ((p = strchr(rate, ')'))) { + if (!strncasecmp(p - 5, "fdr10", 5)) + fdr10 = 1; + } + } + +done: + return fdr10; +} + +static int port_dump(umad_port_t * port, int alone) +{ + const char *pre = ""; + const char *hdrpre = ""; + + if (!port) + return -1; + + if (!alone) { + pre = " "; + hdrpre = " "; + } + + printf("%sPort %d:\n", hdrpre, port->portnum); + printf("%sState: %s\n", pre, + (unsigned)port->state <= + 4 ? port_state_str[port->state] : "???"); + printf("%sPhysical state: %s\n", pre, + (unsigned)port->phys_state <= + 7 ? 
port_phy_state_str[port->phys_state] : "???"); + if (is_fdr10(port)) + printf("%sRate: %d (FDR10)\n", pre, port->rate); + else if (port->rate != 2) /* 1x SDR */ + printf("%sRate: %d\n", pre, port->rate); + else + printf("%sRate: 2.5\n", pre); + printf("%sBase lid: %d\n", pre, port->base_lid); + printf("%sLMC: %d\n", pre, port->lmc); + printf("%sSM lid: %d\n", pre, port->sm_lid); + printf("%sCapability mask: 0x%08x\n", pre, ntohl(port->capmask)); + printf("%sPort GUID: 0x%016" PRIx64 "\n", pre, be64toh(port->port_guid)); + printf("%sLink layer: %s\n", pre, port->link_layer); + return 0; +} + +static int ca_stat(const char *ca_name, int portnum, int no_ports) +{ + umad_ca_t ca; + int r; + + if ((r = umad_get_ca(ca_name, &ca)) < 0) + return r; + + if (!ca.node_type) + return 0; + + if (!no_ports && portnum >= 0) { + if (portnum > ca.numports || !ca.ports[portnum]) { + IBWARN("%s: '%s' has no port number %d - max (%d)", + ((unsigned)ca.node_type <= + IB_NODE_MAX ? node_type_str[ca.node_type] : + "???"), ca_name, portnum, ca.numports); + return -1; + } + printf("%s: '%s'\n", + ((unsigned)ca.node_type <= + IB_NODE_MAX ? node_type_str[ca.node_type] : "???"), + ca.ca_name); + port_dump(ca.ports[portnum], 1); + return 0; + } + + /* print ca header */ + ca_dump(&ca); + + if (no_ports) + return 0; + + for (portnum = 0; portnum <= ca.numports; portnum++) + port_dump(ca.ports[portnum], 0); + + return 0; +} + +static int ports_list(struct umad_device_node *first_node, + struct umad_device_node *last_node) +{ + __be64 guids[64]; + struct umad_device_node *node; + int ports, j; + + for (node = first_node; node && node != last_node; node = node->next) { + if ((ports = + umad_get_ca_portguids(node->ca_name, &guids[0], 64)) < 0) + return -1; + + for (j = 0; j < ports; j++) + if (guids[j]) + printf("0x%016" PRIx64 "\n", be64toh(guids[j])); + } + return 0; +} + +static int list_only, short_format, list_ports; + +static int process_opt(void *context, int ch) +{ + switch (ch) { + case 'l': + list_only++; + break; + case 's': + short_format++; + break; + case 'p': + list_ports++; + break; + default: + return -1; + } + return 0; +} + +int main(int argc, char *argv[]) +{ + struct umad_device_node *device_list; + struct umad_device_node *node; + struct umad_device_node *first_node; + struct umad_device_node *last_node; + int dev_port = -1; + const char *ca_name; + + const struct ibdiag_opt opts[] = { + {"list_of_cas", 'l', 0, NULL, "list all IB devices"}, + {"short", 's', 0, NULL, "short output"}, + {"port_list", 'p', 0, NULL, "show port list"}, + {} + }; + char usage_args[] = "<ca_name> [portnum]"; + const char *usage_examples[] = { + "-l # list all IB devices", + "mthca0 2 # stat port 2 of 'mthca0'", + NULL + }; + + ibdiag_process_opts(argc, argv, NULL, "CDeGKLPsty", opts, process_opt, + usage_args, usage_examples); + + argc -= optind; + argv += optind; + + if (argc > 1) + dev_port = strtol(argv[1], NULL, 0); + + if (umad_init() < 0) + IBPANIC("can't init UMAD library"); + + device_list = umad_get_ca_device_list(); + if (!device_list && errno) + IBPANIC("can't list IB device names"); + + if (argc) { + for (node = device_list; node; node = node->next) + if (!strcmp(node->ca_name, argv[0])) + break; + if (!node) + IBPANIC("'%s' IB device can't be found", argv[0]); + + first_node = node; + last_node = node->next; + } else { + first_node = device_list; + last_node = NULL; + } + + if (list_ports) { + if (ports_list(first_node, last_node) < 0) + IBPANIC("can't list ports"); + umad_free_ca_device_list(device_list); + 
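+		/* -p mode prints port GUIDs only; nothing further to stat */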
return 0; + } + + for (node = first_node; node != last_node; node = node->next) { + ca_name = node->ca_name; + if (list_only) + printf("%s\n", ca_name); + else if (ca_stat(ca_name, dev_port, short_format) < 0) + IBPANIC("stat of IB device '%s' failed", ca_name); + } + umad_free_ca_device_list(device_list); + return 0; +} diff --git a/infiniband-diags/ibsysstat.c b/infiniband-diags/ibsysstat.c new file mode 100644 index 0000000..6ff7ca0 --- /dev/null +++ b/infiniband-diags/ibsysstat.c @@ -0,0 +1,366 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> + +#include <infiniband/umad.h> +#include <infiniband/mad.h> + +#include "ibdiag_common.h" + +#define MAX_CPUS 8 + +static struct ibmad_port *srcport; + +enum ib_sysstat_attr_t { + IB_PING_ATTR = 0x10, + IB_HOSTINFO_ATTR = 0x11, + IB_CPUINFO_ATTR = 0x12, +}; + +typedef struct cpu_info { + char *model; + char *mhz; +} cpu_info; + +static cpu_info cpus[MAX_CPUS]; +static int host_ncpu; +static int server = 0, oui = IB_OPENIB_OUI; + +static int server_respond(void *umad, int size) +{ + ib_rpc_t rpc = { 0 }; + ib_rmpp_hdr_t rmpp = { 0 }; + ib_portid_t rport; + uint8_t *mad = umad_get_mad(umad); + ib_mad_addr_t *mad_addr; + + if (!(mad_addr = umad_get_mad_addr(umad))) + return -1; + + memset(&rport, 0, sizeof(rport)); + + rport.lid = ntohs(mad_addr->lid); + rport.qp = ntohl(mad_addr->qpn); + rport.qkey = ntohl(mad_addr->qkey); + rport.sl = mad_addr->sl; + if (!rport.qkey && rport.qp == 1) + rport.qkey = IB_DEFAULT_QP1_QKEY; + + rpc.mgtclass = mad_get_field(mad, 0, IB_MAD_MGMTCLASS_F); + rpc.method = IB_MAD_METHOD_GET | IB_MAD_RESPONSE; + rpc.attr.id = mad_get_field(mad, 0, IB_MAD_ATTRID_F); + rpc.attr.mod = mad_get_field(mad, 0, IB_MAD_ATTRMOD_F); + rpc.oui = mad_get_field(mad, 0, IB_VEND2_OUI_F); + rpc.trid = mad_get_field64(mad, 0, IB_MAD_TRID_F); + + if (size > IB_MAD_SIZE) + rmpp.flags = IB_RMPP_FLAG_ACTIVE; + + DEBUG("responding %d bytes to %s, attr 0x%x mod 0x%x qkey %x", + size, portid2str(&rport), rpc.attr.id, rpc.attr.mod, rport.qkey); + + if (mad_build_pkt(umad, &rpc, &rport, &rmpp, NULL) < 0) + return -1; + + if (ibdebug > 1) + xdump(stderr, "mad respond pkt\n", mad, IB_MAD_SIZE); + + if (umad_send(mad_rpc_portid(srcport), + mad_rpc_class_agent(srcport, rpc.mgtclass), umad, size, + rpc.timeout, 0) < 0) { + DEBUG("send failed; %m"); + return -1; + } + + return 0; +} + +static int mk_reply(int attr, void *data, int sz) +{ + char *s = data; + int n, i, ret = 0; + + switch (attr) { + case IB_PING_ATTR: + break; /* nothing to do here, just reply */ + case IB_HOSTINFO_ATTR: + if (gethostname(s, sz) < 0) + snprintf(s, sz, "?hostname?"); + s[sz - 1] = 0; + if ((n = strlen(s)) >= sz - 1) { + ret = sz; + break; + } + s[n] = '.'; + s += n + 1; + sz -= n + 1; + ret += n + 1; + if (getdomainname(s, sz) < 0) + snprintf(s, sz, "?domainname?"); + if ((n = strlen(s)) == 0) + s[-1] = 0; /* no domain */ + else + ret += n; + break; + case IB_CPUINFO_ATTR: + s[0] = '\0'; + for (i = 0; i < host_ncpu && sz > 0; i++) { + n = snprintf(s, sz, "cpu %d: model %s MHZ %s\n", + i, cpus[i].model, cpus[i].mhz); + if (n >= sz) { + IBWARN("cpuinfo truncated"); + ret = sz; + break; + } + sz -= n; + s += n; + ret += n; + } + ret++; + break; + default: + DEBUG("unknown attr %d", attr); + } + return ret; +} + +static uint8_t buf[2048]; + +static char *ibsystat_serv(void) +{ + void *umad; + void *mad; + int attr, mod, size; + + DEBUG("starting to serve..."); + + while ((umad = mad_receive_via(buf, -1, srcport))) { + if (umad_status(buf)) { + DEBUG("drop mad with status %x: %s", umad_status(buf), + strerror(umad_status(buf))); + continue; + } + + mad = umad_get_mad(umad); + + attr = mad_get_field(mad, 0, IB_MAD_ATTRID_F); + mod = mad_get_field(mad, 0, IB_MAD_ATTRMOD_F); + + DEBUG("got packet: attr 0x%x mod 0x%x", attr, mod); + + size = + mk_reply(attr, (uint8_t *) mad + IB_VENDOR_RANGE2_DATA_OFFS, + sizeof(buf) - umad_size() - + IB_VENDOR_RANGE2_DATA_OFFS); + + if (server_respond(umad, IB_VENDOR_RANGE2_DATA_OFFS + size) < 0) + 
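+			/* a failed reply is not fatal; keep serving requests */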
DEBUG("respond failed"); + } + + DEBUG("server out"); + return NULL; +} + +static int match_attr(char *str) +{ + if (!strcmp(str, "ping")) + return IB_PING_ATTR; + if (!strcmp(str, "host")) + return IB_HOSTINFO_ATTR; + if (!strcmp(str, "cpu")) + return IB_CPUINFO_ATTR; + return -1; +} + +static char *ibsystat(ib_portid_t * portid, int attr) +{ + ib_rpc_t rpc = { 0 }; + int fd, agent, timeout, len; + void *data = (uint8_t *) umad_get_mad(buf) + IB_VENDOR_RANGE2_DATA_OFFS; + + DEBUG("Sysstat ping.."); + + rpc.mgtclass = IB_VENDOR_OPENIB_SYSSTAT_CLASS; + rpc.method = IB_MAD_METHOD_GET; + rpc.attr.id = attr; + rpc.attr.mod = 0; + rpc.oui = oui; + rpc.timeout = 0; + rpc.datasz = IB_VENDOR_RANGE2_DATA_SIZE; + rpc.dataoffs = IB_VENDOR_RANGE2_DATA_OFFS; + + portid->qp = 1; + if (!portid->qkey) + portid->qkey = IB_DEFAULT_QP1_QKEY; + + if ((len = mad_build_pkt(buf, &rpc, portid, NULL, NULL)) < 0) + IBPANIC("cannot build packet."); + + fd = mad_rpc_portid(srcport); + agent = mad_rpc_class_agent(srcport, rpc.mgtclass); + timeout = ibd_timeout ? ibd_timeout : MAD_DEF_TIMEOUT_MS; + + if (umad_send(fd, agent, buf, len, timeout, 0) < 0) + IBPANIC("umad_send failed."); + + len = sizeof(buf) - umad_size(); + if (umad_recv(fd, buf, &len, timeout) < 0) + IBPANIC("umad_recv failed."); + + if (umad_status(buf)) + return strerror(umad_status(buf)); + + DEBUG("Got sysstat pong.."); + if (attr != IB_PING_ATTR) + puts(data); + else + printf("sysstat ping succeeded\n"); + return NULL; +} + +static int build_cpuinfo(void) +{ + char line[1024] = { 0 }, *s, *e; + FILE *f; + int ncpu = 0; + + if (!(f = fopen("/proc/cpuinfo", "r"))) { + IBWARN("couldn't open /proc/cpuinfo"); + return 0; + } + + while (fgets(line, sizeof(line) - 1, f)) { + if (!strncmp(line, "processor\t", 10)) { + ncpu++; + if (ncpu > MAX_CPUS) { + fclose(f); + return MAX_CPUS; + } + continue; + } + + if (!ncpu || !(s = strchr(line, ':'))) + continue; + + if ((e = strchr(s, '\n'))) + *e = 0; + if (!strncmp(line, "model name\t", 11)) + cpus[ncpu - 1].model = strdup(s + 1); + else if (!strncmp(line, "cpu MHz\t", 8)) + cpus[ncpu - 1].mhz = strdup(s + 1); + } + + fclose(f); + + DEBUG("ncpu %d", ncpu); + + return ncpu; +} + +static int process_opt(void *context, int ch) +{ + switch (ch) { + case 'o': + oui = strtoul(optarg, NULL, 0); + break; + case 'S': + server++; + break; + default: + return -1; + } + return 0; +} + +int main(int argc, char **argv) +{ + int mgmt_classes[3] = + { IB_SMI_CLASS, IB_SMI_DIRECT_CLASS, IB_SA_CLASS }; + int sysstat_class = IB_VENDOR_OPENIB_SYSSTAT_CLASS; + ib_portid_t portid = { 0 }; + int attr = IB_PING_ATTR; + char *err; + + const struct ibdiag_opt opts[] = { + {"oui", 'o', 1, NULL, "use specified OUI number"}, + {"Server", 'S', 0, NULL, "start in server mode"}, + {} + }; + char usage_args[] = "<dest lid|guid> [<op>]"; + + ibdiag_process_opts(argc, argv, NULL, "DKy", opts, process_opt, + usage_args, NULL); + + argc -= optind; + argv += optind; + + if (!argc && !server) + ibdiag_show_usage(); + + if (argc > 1 && (attr = match_attr(argv[1])) < 0) + ibdiag_show_usage(); + + srcport = mad_rpc_open_port(ibd_ca, ibd_ca_port, mgmt_classes, 3); + if (!srcport) + IBEXIT("Failed to open '%s' port '%d'", ibd_ca, ibd_ca_port); + + if (server) { + if (mad_register_server_via(sysstat_class, 1, NULL, oui, srcport) < + 0) + IBEXIT("can't serve class %d", sysstat_class); + + host_ncpu = build_cpuinfo(); + + if ((err = ibsystat_serv())) + IBEXIT("ibssystat to %s: %s", portid2str(&portid), + err); + exit(0); + } + + if 
(mad_register_client_via(sysstat_class, 1, srcport) < 0) + IBEXIT("can't register to sysstat class %d", sysstat_class); + + if (resolve_portid_str(ibd_ca, ibd_ca_port, &portid, argv[0], + ibd_dest_type, ibd_sm_id, srcport) < 0) + IBEXIT("can't resolve destination port %s", argv[0]); + + if ((err = ibsystat(&portid, attr))) + IBEXIT("ibsystat to %s: %s", portid2str(&portid), err); + + mad_rpc_close_port(srcport); + exit(0); +} diff --git a/infiniband-diags/ibtracert.c b/infiniband-diags/ibtracert.c new file mode 100644 index 0000000..316580c --- /dev/null +++ b/infiniband-diags/ibtracert.c @@ -0,0 +1,955 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * Copyright (c) 2009 HNR Consulting. All rights reserved. + * Copyright (c) 2010,2011 Mellanox Technologies LTD. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <ctype.h> +#include <netinet/in.h> +#include <inttypes.h> + +#include <infiniband/umad.h> +#include <infiniband/mad.h> +#include <util/node_name_map.h> + +#include "ibdiag_common.h" + +static struct ibmad_port *srcport; + +#define MAXHOPS 63 + +static const char * const node_type_str[] = { + "???", + "ca", + "switch", + "router", + "iwarp rnic" +}; + +static int timeout = 0; /* ms */ +static int force; +static FILE *f; +static FILE *ports_fd; + +static char *node_name_map_file = NULL; +static char *ports_file = NULL; +static nn_map_t *node_name_map = NULL; + +typedef struct Port Port; +typedef struct Switch Switch; +typedef struct Node Node; + +struct Port { + Port *next; + Port *remoteport; + uint64_t portguid; + int portnum; + int lid; + int lmc; + int state; + int physstate; + char portinfo[64]; +}; + +struct Switch { + int linearcap; + int mccap; + int linearFDBtop; + int fdb_base; + int enhsp0; + int8_t fdb[64]; + char switchinfo[64]; +}; + +struct Node { + Node *htnext; + Node *dnext; + Port *ports; + ib_portid_t path; + int type; + int dist; + int numports; + int upport; + Node *upnode; + uint64_t nodeguid; /* also portguid */ + char nodedesc[64]; + char nodeinfo[64]; +}; + +static Node *nodesdist[MAXHOPS]; +static uint64_t target_portguid; + +/* + * is_port_inactive + * Checks whether or not the port state is other than active. + * The "sw" argument is only relevant when the port is on a + * switch; for HCAs and routers, this argument is ignored. + * Returns 1 when port is not active and 0 when active. + * Base switch port 0 is considered always active. + */ +static int is_port_inactive(Node * node, Port * port, Switch * sw) +{ + int res = 0; + if (port->state != 4 && + (node->type != IB_NODE_SWITCH || + (node->type == IB_NODE_SWITCH && sw->enhsp0))) + res = 1; + return res; +} + +static int get_node(Node * node, Port * port, ib_portid_t * portid) +{ + void *pi = port->portinfo, *ni = node->nodeinfo, *nd = node->nodedesc; + char *s, *e; + + memset(ni, 0, sizeof(node->nodeinfo)); + if (!smp_query_via(ni, portid, IB_ATTR_NODE_INFO, 0, timeout, srcport)) + return -1; + + memset(nd, 0, sizeof(node->nodedesc)); + if (!smp_query_via(nd, portid, IB_ATTR_NODE_DESC, 0, timeout, srcport)) + return -1; + + for (s = nd, e = s + 64; s < e; s++) { + if (!*s) + break; + if (!isprint(*s)) + *s = ' '; + } + + memset(pi, 0, sizeof(port->portinfo)); + if (!smp_query_via(pi, portid, IB_ATTR_PORT_INFO, 0, timeout, srcport)) + return -1; + + mad_decode_field(ni, IB_NODE_GUID_F, &node->nodeguid); + mad_decode_field(ni, IB_NODE_TYPE_F, &node->type); + mad_decode_field(ni, IB_NODE_NPORTS_F, &node->numports); + + mad_decode_field(ni, IB_NODE_PORT_GUID_F, &port->portguid); + mad_decode_field(ni, IB_NODE_LOCAL_PORT_F, &port->portnum); + mad_decode_field(pi, IB_PORT_LID_F, &port->lid); + mad_decode_field(pi, IB_PORT_LMC_F, &port->lmc); + mad_decode_field(pi, IB_PORT_STATE_F, &port->state); + + DEBUG("portid %s: got node %" PRIx64 " '%s'", portid2str(portid), + node->nodeguid, node->nodedesc); + return 0; +} + +static int switch_lookup(Switch * sw, ib_portid_t * portid, int lid) +{ + void *si = sw->switchinfo, *fdb = sw->fdb; + + memset(si, 0, sizeof(sw->switchinfo)); + if (!smp_query_via(si, portid, IB_ATTR_SWITCH_INFO, 0, timeout, + srcport)) + return -1; + + mad_decode_field(si, IB_SW_LINEAR_FDB_CAP_F, &sw->linearcap); + mad_decode_field(si, IB_SW_LINEAR_FDB_TOP_F, &sw->linearFDBtop); + 
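+	/* The LFT is read below in 64-entry blocks: the attribute
+	 * modifier selects the block (lid / 64) and lid % 64 indexes
+	 * the out-port byte within it. */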
mad_decode_field(si, IB_SW_ENHANCED_PORT0_F, &sw->enhsp0); + + if (lid >= sw->linearcap && lid > sw->linearFDBtop) + return -1; + + memset(fdb, 0, sizeof(sw->fdb)); + if (!smp_query_via(fdb, portid, IB_ATTR_LINEARFORWTBL, lid / 64, + timeout, srcport)) + return -1; + + DEBUG("portid %s: forward lid %d to port %d", + portid2str(portid), lid, sw->fdb[lid % 64]); + return sw->fdb[lid % 64]; +} + +static int sameport(Port * a, Port * b) +{ + return a->portguid == b->portguid || (force && a->lid == b->lid); +} + +static int extend_dpath(ib_dr_path_t * path, int nextport) +{ + if (path->cnt + 2 >= sizeof(path->p)) + return -1; + ++path->cnt; + path->p[path->cnt] = (uint8_t) nextport; + return path->cnt; +} + +static void dump_endnode(int dump, const char *prompt, Node *node, Port *port) +{ + char *nodename = NULL; + + if (!dump) + return; + if (dump == 1) { + fprintf(f, "%s {0x%016" PRIx64 "}[%d]\n", + prompt, node->nodeguid, + node->type == IB_NODE_SWITCH ? 0 : port->portnum); + return; + } + + nodename = + remap_node_name(node_name_map, node->nodeguid, node->nodedesc); + + fprintf(f, "%s %s {0x%016" PRIx64 "} portnum %d lid %u-%u \"%s\"\n", + prompt, + (node->type <= IB_NODE_MAX ? node_type_str[node->type] : "???"), + node->nodeguid, + node->type == IB_NODE_SWITCH ? 0 : port->portnum, port->lid, + port->lid + (1 << port->lmc) - 1, nodename); + + free(nodename); +} + +static void dump_route(int dump, Node * node, int outport, Port * port) +{ + char *nodename = NULL; + + if (!dump && !ibverbose) + return; + + nodename = + remap_node_name(node_name_map, node->nodeguid, node->nodedesc); + + if (dump == 1) + fprintf(f, "[%d] -> {0x%016" PRIx64 "}[%d]\n", + outport, port->portguid, port->portnum); + else + fprintf(f, "[%d] -> %s port {0x%016" PRIx64 + "}[%d] lid %u-%u \"%s\"\n", outport, + (node->type <= + IB_NODE_MAX ? 
node_type_str[node->type] : "???"), + port->portguid, port->portnum, port->lid, + port->lid + (1 << port->lmc) - 1, nodename); + + free(nodename); +} + +static int find_route(ib_portid_t * from, ib_portid_t * to, int dump) +{ + Node *node, fromnode, tonode, nextnode; + Port *port, fromport, toport, nextport; + Switch sw; + int maxhops = MAXHOPS; + int portnum, outport = 255, next_sw_outport = 255; + + memset(&fromnode,0,sizeof(Node)); + memset(&tonode,0,sizeof(Node)); + memset(&nextnode,0,sizeof(Node)); + memset(&fromport,0,sizeof(Port)); + memset(&toport,0,sizeof(Port)); + memset(&nextport,0,sizeof(Port)); + + DEBUG("from %s", portid2str(from)); + + if (get_node(&fromnode, &fromport, from) < 0 || + get_node(&tonode, &toport, to) < 0) { + IBWARN("can't reach to/from ports"); + if (!force) + return -1; + if (to->lid > 0) + toport.lid = to->lid; + IBWARN("Force: look for lid %d", to->lid); + } + + node = &fromnode; + port = &fromport; + portnum = port->portnum; + + dump_endnode(dump, "From", node, port); + if (node->type == IB_NODE_SWITCH) { + next_sw_outport = switch_lookup(&sw, from, to->lid); + if (next_sw_outport < 0 || next_sw_outport > node->numports) { + /* needed to print the port in badtbl */ + outport = next_sw_outport; + goto badtbl; + } + } + + while (maxhops--) { + if (is_port_inactive(node, port, &sw)) + goto badport; + + if (sameport(port, &toport)) + break; /* found */ + + if (node->type == IB_NODE_SWITCH) { + DEBUG("switch node"); + outport = next_sw_outport; + + if (extend_dpath(&from->drpath, outport) < 0) + goto badpath; + + if (get_node(&nextnode, &nextport, from) < 0) { + IBWARN("can't reach port at %s", + portid2str(from)); + return -1; + } + if (outport == 0) { + if (!sameport(&nextport, &toport)) + goto badtbl; + else + break; /* found SMA port */ + } + } else if ((node->type == IB_NODE_CA) || + (node->type == IB_NODE_ROUTER)) { + int ca_src = 0; + + outport = portnum; + DEBUG("ca or router node"); + if (!sameport(port, &fromport)) { + IBWARN + ("can't continue: reached CA or router port %" + PRIx64 ", lid %d", port->portguid, + port->lid); + return -1; + } + /* we are at CA or router "from" - go one hop back to (hopefully) a switch */ + if (from->drpath.cnt > 0) { + DEBUG("ca or router node - return back 1 hop"); + from->drpath.cnt--; + } else { + ca_src = 1; + if (portnum + && extend_dpath(&from->drpath, portnum) < 0) + goto badpath; + } + if (get_node(&nextnode, &nextport, from) < 0) { + IBWARN("can't reach port at %s", + portid2str(from)); + return -1; + } + /* fix port num to be seen from the CA or router side */ + if (!ca_src) + nextport.portnum = + from->drpath.p[from->drpath.cnt + 1]; + } + /* only if the next node is a switch, get switch info */ + if (nextnode.type == IB_NODE_SWITCH) { + next_sw_outport = switch_lookup(&sw, from, to->lid); + if (next_sw_outport < 0 || + next_sw_outport > nextnode.numports) { + /* needed to print the port in badtbl */ + outport = next_sw_outport; + goto badtbl; + } + } + + port = &nextport; + if (is_port_inactive(&nextnode, port, &sw)) + goto badoutport; + node = &nextnode; + portnum = port->portnum; + dump_route(dump, node, outport, port); + } + + if (maxhops <= 0) { + IBWARN("no route found after %d hops", MAXHOPS); + return -1; + } + dump_endnode(dump, "To", node, port); + return 0; + +badport: + IBWARN("Bad port state found: node \"%s\" port %d state %d", + clean_nodedesc(node->nodedesc), portnum, port->state); + return -1; +badoutport: + IBWARN("Bad out port state found: node \"%s\" outport %d state %d", + 
clean_nodedesc(node->nodedesc), outport, port->state); + return -1; +badtbl: + IBWARN + ("Bad forwarding table entry found at: node \"%s\" lid entry %d is %d (top %d)", + clean_nodedesc(node->nodedesc), to->lid, outport, sw.linearFDBtop); + return -1; +badpath: + IBWARN("Direct path too long!"); + return -1; +} + +/************************** + * MC span part + */ + +#define HASHGUID(guid) ((uint32_t)(((uint32_t)(guid) * 101) ^ ((uint32_t)((guid) >> 32) * 103))) +#define HTSZ 137 + +static int insert_node(Node * new) +{ + static Node *nodestbl[HTSZ]; + int hash = HASHGUID(new->nodeguid) % HTSZ; + Node *node; + + for (node = nodestbl[hash]; node; node = node->htnext) + if (node->nodeguid == new->nodeguid) { + DEBUG("node %" PRIx64 " already exists", new->nodeguid); + return -1; + } + + new->htnext = nodestbl[hash]; + nodestbl[hash] = new; + + return 0; +} + +static int get_port(Port * port, int portnum, ib_portid_t * portid) +{ + char portinfo[64] = { 0 }; + void *pi = portinfo; + + port->portnum = portnum; + + if (!smp_query_via(pi, portid, IB_ATTR_PORT_INFO, portnum, timeout, + srcport)) + return -1; + + mad_decode_field(pi, IB_PORT_LID_F, &port->lid); + mad_decode_field(pi, IB_PORT_LMC_F, &port->lmc); + mad_decode_field(pi, IB_PORT_STATE_F, &port->state); + mad_decode_field(pi, IB_PORT_PHYS_STATE_F, &port->physstate); + + VERBOSE("portid %s portnum %d: lid %d state %d physstate %d", + portid2str(portid), portnum, port->lid, port->state, + port->physstate); + return 1; +} + +static void link_port(Port * port, Node * node) +{ + port->next = node->ports; + node->ports = port; +} + +static int new_node(Node * node, Port * port, ib_portid_t * path, int dist) +{ + if (port->portguid == target_portguid) { + node->dist = -1; /* tag as target */ + link_port(port, node); + dump_endnode(ibverbose, "found target", node, port); + return 1; /* found; */ + } + + /* BFS search start with my self */ + if (insert_node(node) < 0) + return -1; /* known switch */ + + VERBOSE("insert dist %d node %p port %d lid %d", dist, node, + port->portnum, port->lid); + + link_port(port, node); + + node->dist = dist; + node->path = *path; + node->dnext = nodesdist[dist]; + nodesdist[dist] = node; + + return 0; +} + +static int switch_mclookup(Node * node, ib_portid_t * portid, int mlid, + char *map) +{ + Switch sw; + char mdb[64]; + void *si = sw.switchinfo; + __be16 *msets = (__be16 *) mdb; + int maxsets, block, i, set; + + memset(map, 0, 256); + + memset(si, 0, sizeof(sw.switchinfo)); + if (!smp_query_via(si, portid, IB_ATTR_SWITCH_INFO, 0, timeout, + srcport)) + return -1; + + mlid -= 0xc000; + + mad_decode_field(si, IB_SW_MCAST_FDB_CAP_F, &sw.mccap); + + if (mlid >= sw.mccap) + return -1; + + block = mlid / 32; + maxsets = (node->numports + 15) / 16; /* round up */ + + for (set = 0; set < maxsets; set++) { + memset(mdb, 0, sizeof(mdb)); + if (!smp_query_via(mdb, portid, IB_ATTR_MULTICASTFORWTBL, + block | (set << 28), timeout, srcport)) + return -1; + + for (i = 0; i < 16; i++, map++) { + uint16_t mask = ntohs(msets[mlid % 32]); + if (mask & (1 << i)) + *map = 1; + else + continue; + VERBOSE("Switch guid 0x%" PRIx64 + ": mlid 0x%x is forwarded to port %d", + node->nodeguid, mlid + 0xc000, i + set * 16); + } + } + + return 0; +} + +/* + * Return 1 if found, 0 if not, -1 on errors. 
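+ * (This convention describes new_node(), used by the search below;
+ * find_mcpath itself returns the target Node pointer on success and
+ * NULL otherwise.)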
+ */ +static Node *find_mcpath(ib_portid_t * from, int mlid) +{ + Node *node, *remotenode; + Port *port, *remoteport; + char map[256]; + int r, i; + int dist = 0, leafport = 0; + ib_portid_t *path; + + DEBUG("from %s", portid2str(from)); + + if (!(node = calloc(1, sizeof(Node)))) + IBEXIT("out of memory"); + + if (!(port = calloc(1, sizeof(Port)))) + IBEXIT("out of memory"); + + if (get_node(node, port, from) < 0) { + IBWARN("can't reach node %s", portid2str(from)); + free(node); + free(port); + return NULL; + } + + node->upnode = NULL; /* root */ + if ((r = new_node(node, port, from, 0)) > 0) { + if (node->type != IB_NODE_SWITCH) { + IBWARN("ibtracert from CA to CA is unsupported"); + free(node); + free(port); + return NULL; /* ibtracert from host to itself is unsupported */ + } + + if (switch_mclookup(node, from, mlid, map) < 0 || !map[0]) + return NULL; + return node; + } + + for (dist = 0; dist < MAXHOPS; dist++) { + + for (node = nodesdist[dist]; node; node = node->dnext) { + + path = &node->path; + + VERBOSE("dist %d node %p", dist, node); + dump_endnode(ibverbose, "processing", node, + node->ports); + + memset(map, 0, sizeof(map)); + + if (node->type != IB_NODE_SWITCH) { + if (dist) + continue; + leafport = path->drpath.p[path->drpath.cnt]; + map[port->portnum] = 1; + node->upport = 0; /* starting here */ + DEBUG("Starting from CA 0x%" PRIx64 + " lid %d port %d (leafport %d)", + node->nodeguid, port->lid, port->portnum, + leafport); + } else { /* switch */ + + /* if starting from a leaf port fix up port (up port) */ + if (dist == 1 && leafport) + node->upport = leafport; + + if (switch_mclookup(node, path, mlid, map) < 0) { + IBWARN("skipping bad Switch 0x%" PRIx64 + "", node->nodeguid); + continue; + } + } + + for (i = 1; i <= node->numports; i++) { + if (!map[i] || i == node->upport) + continue; + + if (dist == 0 && leafport) { + if (from->drpath.cnt > 0) + path->drpath.cnt--; + } else { + if (!(port = calloc(1, sizeof(Port)))) + IBEXIT("out of memory"); + + if (get_port(port, i, path) < 0) { + IBWARN + ("can't reach node %s port %d", + portid2str(path), i); + free(port); + return NULL; + } + + if (port->physstate != 5) { /* LinkUP */ + free(port); + continue; + } +#if 0 + link_port(port, node); +#endif + + if (extend_dpath(&path->drpath, i) < 0) { + free(port); + return NULL; + } + } + + if (!(remotenode = calloc(1, sizeof(Node)))) + IBEXIT("out of memory"); + + if (!(remoteport = calloc(1, sizeof(Port)))) + IBEXIT("out of memory"); + + if (get_node(remotenode, remoteport, path) < 0) { + IBWARN + ("NodeInfo on %s port %d failed, skipping port", + portid2str(path), i); + path->drpath.cnt--; /* restore path */ + free(remotenode); + free(remoteport); + continue; + } + + remotenode->upnode = node; + remotenode->upport = remoteport->portnum; + remoteport->remoteport = port; + + if ((r = new_node(remotenode, remoteport, path, + dist + 1)) > 0) + return remotenode; + + if (r == 0) + dump_endnode(ibverbose, "new remote", + remotenode, remoteport); + else if (remotenode->type == IB_NODE_SWITCH) + dump_endnode(2, + "ERR: circle discovered at", + remotenode, remoteport); + + path->drpath.cnt--; /* restore path */ + } + } + } + + return NULL; /* not found */ +} + +static uint64_t find_target_portguid(ib_portid_t * to) +{ + Node tonode; + Port toport; + + if (get_node(&tonode, &toport, to) < 0) { + IBWARN("can't find to port\n"); + return -1; + } + + return toport.portguid; +} + +static void dump_mcpath(Node * node, int dumplevel) +{ + char *nodename = NULL; + + if (node->upnode) + 
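+		/* recurse toward the root first so the path prints
+		 * from source to destination */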
dump_mcpath(node->upnode, dumplevel); + + nodename = + remap_node_name(node_name_map, node->nodeguid, node->nodedesc); + + if (!node->dist) { + printf("From %s 0x%" PRIx64 " port %d lid %u-%u \"%s\"\n", + (node->type <= + IB_NODE_MAX ? node_type_str[node->type] : "???"), + node->nodeguid, node->ports->portnum, node->ports->lid, + node->ports->lid + (1 << node->ports->lmc) - 1, + nodename); + goto free_name; + } + + if (node->dist) { + if (dumplevel == 1) + printf("[%d] -> %s {0x%016" PRIx64 "}[%d]\n", + node->ports->remoteport->portnum, + (node->type <= + IB_NODE_MAX ? node_type_str[node->type] : + "???"), node->nodeguid, node->upport); + else + printf("[%d] -> %s 0x%" PRIx64 "[%d] lid %u \"%s\"\n", + node->ports->remoteport->portnum, + (node->type <= + IB_NODE_MAX ? node_type_str[node->type] : + "???"), node->nodeguid, node->upport, + node->ports->lid, nodename); + } + + if (node->dist < 0) + /* target node */ + printf("To %s 0x%" PRIx64 " port %d lid %u-%u \"%s\"\n", + (node->type <= + IB_NODE_MAX ? node_type_str[node->type] : "???"), + node->nodeguid, node->ports->portnum, node->ports->lid, + node->ports->lid + (1 << node->ports->lmc) - 1, + nodename); + +free_name: + free(nodename); +} + +static int resolve_lid(ib_portid_t *portid) +{ + uint8_t portinfo[64] = { 0 }; + uint16_t lid; + + if (!smp_query_via(portinfo, portid, IB_ATTR_PORT_INFO, 0, 0, NULL)) + return -1; + mad_decode_field(portinfo, IB_PORT_LID_F, &lid); + + ib_portid_set(portid, lid, 0, 0); + + return 0; +} + +static int dumplevel = 2, multicast, mlid; + +static int process_opt(void *context, int ch) +{ + switch (ch) { + case 1: + node_name_map_file = strdup(optarg); + if (node_name_map_file == NULL) + IBEXIT("out of memory, strdup for node_name_map_file name failed"); + break; + case 2: + ports_file = strdup(optarg); + if (ports_file == NULL) + IBEXIT("out of memory, strdup for ports_file name failed"); + break; + case 'm': + multicast++; + mlid = strtoul(optarg, NULL, 0); + break; + case 'f': + force++; + break; + case 'n': + dumplevel = 1; + break; + default: + return -1; + } + return 0; +} + +static int get_route(char *srcid, char *dstid) { + ib_portid_t my_portid = { 0 }; + ib_portid_t src_portid = { 0 }; + ib_portid_t dest_portid = { 0 }; + Node *endnode; + + if (resolve_portid_str(ibd_ca, ibd_ca_port, &src_portid, srcid, + ibd_dest_type, ibd_sm_id, srcport) < 0) { + IBWARN("can't resolve source port %s", srcid); + return -1; + } + + if (resolve_portid_str(ibd_ca, ibd_ca_port, &dest_portid, dstid, + ibd_dest_type, ibd_sm_id, srcport) < 0) { + IBWARN("can't resolve destination port %s", dstid); + return -1; + } + + if (ibd_dest_type == IB_DEST_DRPATH) { + if (resolve_lid(&src_portid) < 0) { + IBWARN("cannot resolve lid for port \'%s\'", + portid2str(&src_portid)); + return -1; + } + if (resolve_lid(&dest_portid) < 0) { + IBWARN("cannot resolve lid for port \'%s\'", + portid2str(&dest_portid)); + return -1; + } + } + + if (dest_portid.lid == 0 || src_portid.lid == 0) { + IBWARN("bad src/dest lid"); + ibdiag_show_usage(); + } + + if (ibd_dest_type != IB_DEST_DRPATH) { + /* first find a direct path to the src port */ + if (find_route(&my_portid, &src_portid, 0) < 0) { + IBWARN("can't find a route to the src port"); + return -1; + } + + src_portid = my_portid; + } + + if (!multicast) { + if (find_route(&src_portid, &dest_portid, dumplevel) < 0) { + IBWARN("can't find a route from src to dest"); + return -1; + } + return 0; + } else { + if (mlid < 0xc000) + IBWARN("invalid MLID; must be 0xc000 or larger"); + } + + if 
(!(target_portguid = find_target_portguid(&dest_portid))) { + IBWARN("can't reach target lid %d", dest_portid.lid); + return -1; + } + + if (!(endnode = find_mcpath(&src_portid, mlid))) { + IBWARN("can't find a multicast route from src to dest"); + return -1; + } + + /* dump multicast path */ + dump_mcpath(endnode, dumplevel); + + return 0; +} + +int main(int argc, char **argv) +{ + char dstbuf[21]; + char srcbuf[21]; + char portsbuf[80]; + char *p_first; + int len, i; + int line_count = 0; + int num_port_pairs = 0; + int mgmt_classes[3] = + { IB_SMI_CLASS, IB_SMI_DIRECT_CLASS, IB_SA_CLASS }; + + const struct ibdiag_opt opts[] = { + {"force", 'f', 0, NULL, "force"}, + {"no_info", 'n', 0, NULL, "simple format"}, + {"mlid", 'm', 1, "<mlid>", "multicast trace of the mlid"}, + {"node-name-map", 1, 1, "<file>", "node name map file"}, + {"ports-file", 2, 1, "<file>", "port pairs file"}, + {} + }; + char usage_args[] = "<src-addr> <dest-addr>"; + const char *usage_examples[] = { + "- Unicast examples:", + "4 16\t\t\t# show path between lids 4 and 16", + "-n 4 16\t\t# same, but using simple output format", + "-G 0x8f1040396522d 0x002c9000100d051\t# use guid addresses", + + " - Multicast examples:", + "-m 0xc000 4 16\t# show multicast path of mlid 0xc000 between lids 4 and 16", + NULL, + }; + + ibdiag_process_opts(argc, argv, NULL, "DK", opts, process_opt, + usage_args, usage_examples); + + f = stdout; + argc -= optind; + argv += optind; + + if (argc < 2 && ports_file == NULL) + ibdiag_show_usage(); + + if (ibd_timeout) + timeout = ibd_timeout; + + srcport = mad_rpc_open_port(ibd_ca, ibd_ca_port, mgmt_classes, 3); + if (!srcport) + IBEXIT("Failed to open '%s' port '%d'", ibd_ca, ibd_ca_port); + + smp_mkey_set(srcport, ibd_mkey); + + node_name_map = open_node_name_map(node_name_map_file); + + if (ports_file == NULL) { + /* single get_route call when lids/guids on command line */ + if (get_route(argv[0], argv[1]) != 0) + IBEXIT("Failed to get route information"); + } else { + /* multiple get_route calls when reading lids/guids from a file */ + ports_fd = fopen(ports_file, "r"); + if (!ports_fd) + IBEXIT("cannot open ports-file %s", ports_file); + + while (fgets(portsbuf, sizeof(portsbuf), ports_fd) != NULL) { + line_count++; + p_first = strtok(portsbuf, "\n"); + if (!p_first) + continue; /* ignore blank lines */ + + len = (int) strlen(p_first); + for (i = 0; i < len; i++) { + if (!isspace(p_first[i])) + break; + } + if (i == len) /* ignore all spaces */ + continue; + if (p_first[i] == '#') + continue; /* ignore comment lines */ + + if (sscanf(portsbuf, "%20s %20s", srcbuf, dstbuf) != 2) + IBEXIT("ports-file, %s, at line %i contains bad data", + ports_file, line_count); + num_port_pairs++; + if (get_route(srcbuf, dstbuf) != 0) + IBEXIT("Failed to get route information at line %i", + line_count); + } + printf("%i lid/guid pairs processed from %s\n", + num_port_pairs, ports_file); + } + close_node_name_map(node_name_map); + + mad_rpc_close_port(srcport); + + exit(0); +} diff --git a/infiniband-diags/man/CMakeLists.txt b/infiniband-diags/man/CMakeLists.txt new file mode 100644 index 0000000..1f1f91c --- /dev/null +++ b/infiniband-diags/man/CMakeLists.txt @@ -0,0 +1,106 @@ +# rst2man has no way to set the include search path and we need to substitute +# into the common files, so subst/link them all into the build directory +function(rdma_rst_common) + foreach(I ${ARGN}) + if ("${I}" MATCHES "\\.in.rst$") + string(REGEX REPLACE "^(.+)\\.in.rst$" "\\1" BASE_NAME "${I}") + configure_file("common/${I}" 
"${CMAKE_CURRENT_BINARY_DIR}/common/${BASE_NAME}.rst" @ONLY) + else() + if (NOT CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_CURRENT_BINARY_DIR) + rdma_create_symlink("${CMAKE_CURRENT_SOURCE_DIR}/common/${I}" "${CMAKE_CURRENT_BINARY_DIR}/common/${I}") + endif() + endif() + endforeach() +endfunction() + +rdma_rst_common( + opt_cache.rst + opt_C.rst + opt_diffcheck.rst + opt_diff.rst + opt_d.rst + opt_D.rst + opt_D_with_param.rst + opt_e.rst + opt_G.rst + opt_G_with_param.rst + opt_h.rst + opt_K.rst + opt_load-cache.rst + opt_L.rst + opt_node_name_map.rst + opt_o-outstanding_smps.rst + opt_ports-file.rst + opt_P.rst + opt_s.rst + opt_t.rst + opt_v.rst + opt_V.rst + opt_y.rst + opt_z-config.in.rst + sec_config-file.in.rst + sec_node-name-map.rst + sec_portselection.rst + sec_ports-file.rst + sec_topology-file.rst +) + +rdma_man_pages( + check_lft_balance.8.in.rst + dump_fts.8.in.rst + ibaddr.8.in.rst + ibcacheedit.8.in.rst + ibccconfig.8.in.rst + ibccquery.8.in.rst + ibfindnodesusing.8.in.rst + ibhosts.8.in.rst + ibidsverify.8.in.rst + iblinkinfo.8.in.rst + ibnetdiscover.8.in.rst + ibnodes.8.in.rst + ibping.8.in.rst + ibportstate.8.in.rst + ibqueryerrors.8.in.rst + ibroute.8.in.rst + ibrouters.8.in.rst + ibstat.8.in.rst + ibstatus.8.in.rst + ibswitches.8.in.rst + ibsysstat.8.in.rst + ibtracert.8.in.rst + infiniband-diags.8.in.rst + perfquery.8.in.rst + saquery.8.in.rst + sminfo.8.in.rst + smpdump.8.in.rst + smpquery.8.in.rst + vendstat.8.in.rst + ) + +rdma_alias_man_pages( + dump_fts.8 dump_lfts.8 + dump_fts.8 dump_mfts.8 + ) + +if (ENABLE_IBDIAGS_COMPAT) + rdma_man_pages( + ibcheckerrors.8 + ibcheckerrs.8 + ibchecknet.8 + ibchecknode.8 + ibcheckport.8 + ibcheckportstate.8 + ibcheckportwidth.8 + ibcheckstate.8 + ibcheckwidth.8 + ibclearcounters.8 + ibclearerrors.8 + ibdatacounters.8 + ibdatacounts.8 + ibdiscover.8 + ibprintca.8 + ibprintrt.8 + ibprintswitch.8 + ibswportwatch.8 + ) +endif() diff --git a/infiniband-diags/man/check_lft_balance.8.in.rst b/infiniband-diags/man/check_lft_balance.8.in.rst new file mode 100644 index 0000000..92b7a8a --- /dev/null +++ b/infiniband-diags/man/check_lft_balance.8.in.rst @@ -0,0 +1,51 @@ +================= +check_lft_balance +================= + +-------------------------------------------------- +check InfiniBand unicast forwarding tables balance +-------------------------------------------------- + +:Date: 2017-08-21 +:Manual section: 8 +:Manual group: Open IB Diagnostics + +SYNOPSIS +======== + +check_lft_balance.sh [-hRv] + + +DESCRIPTION +=========== + +check_lft_balance.sh is a script which checks for balancing in Infiniband +unicast forwarding tables. It analyzes the output of +**dump_lfts(8)** and **iblinkinfo(8)** + +OPTIONS +======= + +**-h** + show help + +**-R** + Recalculate dump_lfts information, ie do not use the cached + information. This option is slower but should be used if the diag + tools have not been used for some time or if there are other reasons to + believe that the fabric has changed. + +**-v** + verbose output + +SEE ALSO +======== + +**dump_lfts(8)** +**iblinkinfo(8)** + +AUTHORS +======= + +Albert Chu + < chu11@llnl.gov > diff --git a/infiniband-diags/man/common/opt_C.rst b/infiniband-diags/man/common/opt_C.rst new file mode 100644 index 0000000..223edf3 --- /dev/null +++ b/infiniband-diags/man/common/opt_C.rst @@ -0,0 +1,4 @@ +.. Define the common option -C + +**-C, --Ca <ca_name>** use the specified ca_name. 
+ diff --git a/infiniband-diags/man/common/opt_D.rst b/infiniband-diags/man/common/opt_D.rst new file mode 100644 index 0000000..5f87a16 --- /dev/null +++ b/infiniband-diags/man/common/opt_D.rst @@ -0,0 +1,14 @@ +.. Define the common option -D for Directed routes + +**-D, --Direct** The address specified is a directed route +:: + + Examples: + [options] -D [options] "0" # self port + [options] -D [options] "0,1,2,1,4" # out via port 1, then 2, ... + + (Note the second number in the path specified must match the port being + used. This can be specified using the port selection flag '-P' or the + port found through the automatic selection process.) + + diff --git a/infiniband-diags/man/common/opt_D_with_param.rst b/infiniband-diags/man/common/opt_D_with_param.rst new file mode 100644 index 0000000..f040345 --- /dev/null +++ b/infiniband-diags/man/common/opt_D_with_param.rst @@ -0,0 +1,14 @@ +.. Define the common option -D for Directed routes + +**-D, --Direct <dr_path>** The address specified is a directed route +:: + + Examples: + -D "0" # self port + -D "0,1,2,1,4" # out via port 1, then 2, ... + + (Note the second number in the path specified must match the port being + used. This can be specified using the port selection flag '-P' or the + port found through the automatic selection process.) + + diff --git a/infiniband-diags/man/common/opt_G.rst b/infiniband-diags/man/common/opt_G.rst new file mode 100644 index 0000000..2647f88 --- /dev/null +++ b/infiniband-diags/man/common/opt_G.rst @@ -0,0 +1,4 @@ +.. Define the common option -G + +**-G, --Guid** The address specified is a Port GUID + diff --git a/infiniband-diags/man/common/opt_G_with_param.rst b/infiniband-diags/man/common/opt_G_with_param.rst new file mode 100644 index 0000000..0edb2d5 --- /dev/null +++ b/infiniband-diags/man/common/opt_G_with_param.rst @@ -0,0 +1,4 @@ +.. Define the common option -G + +**--port-guid, -G <port_guid>** Specify a port_guid + diff --git a/infiniband-diags/man/common/opt_K.rst b/infiniband-diags/man/common/opt_K.rst new file mode 100644 index 0000000..2b96bf2 --- /dev/null +++ b/infiniband-diags/man/common/opt_K.rst @@ -0,0 +1,4 @@ +.. Define the common option -K + +**-K, --show_keys** + show security keys (mkey, smkey, etc.) associated with the request. diff --git a/infiniband-diags/man/common/opt_L.rst b/infiniband-diags/man/common/opt_L.rst new file mode 100644 index 0000000..46c6b95 --- /dev/null +++ b/infiniband-diags/man/common/opt_L.rst @@ -0,0 +1,4 @@ +.. Define the common option -L + +**-L, --Lid** The address specified is a LID + diff --git a/infiniband-diags/man/common/opt_P.rst b/infiniband-diags/man/common/opt_P.rst new file mode 100644 index 0000000..bf9a191 --- /dev/null +++ b/infiniband-diags/man/common/opt_P.rst @@ -0,0 +1,4 @@ +.. Define the common option -P + +**-P, --Port <ca_port>** use the specified ca_port. + diff --git a/infiniband-diags/man/common/opt_V.rst b/infiniband-diags/man/common/opt_V.rst new file mode 100644 index 0000000..2ee7526 --- /dev/null +++ b/infiniband-diags/man/common/opt_V.rst @@ -0,0 +1,4 @@ +.. Define the common option -V + +**-V, --version** show the version info. + diff --git a/infiniband-diags/man/common/opt_cache.rst b/infiniband-diags/man/common/opt_cache.rst new file mode 100644 index 0000000..4cad3ae --- /dev/null +++ b/infiniband-diags/man/common/opt_cache.rst @@ -0,0 +1,7 @@ +.. Define the common option cache + +**--cache <filename>** +Cache the ibnetdiscover network data in the specified filename. 
This +cache may be used by other tools for later analysis. + + diff --git a/infiniband-diags/man/common/opt_d.rst b/infiniband-diags/man/common/opt_d.rst new file mode 100644 index 0000000..2a37ccc --- /dev/null +++ b/infiniband-diags/man/common/opt_d.rst @@ -0,0 +1,6 @@ +.. Define the common option -d + +-d + raise the IB debugging level. + May be used several times (-ddd or -d -d -d). + diff --git a/infiniband-diags/man/common/opt_diff.rst b/infiniband-diags/man/common/opt_diff.rst new file mode 100644 index 0000000..44aa1b7 --- /dev/null +++ b/infiniband-diags/man/common/opt_diff.rst @@ -0,0 +1,9 @@ +.. Define the common option diff + +**--diff <filename>** +Load cached ibnetdiscover data and do a diff comparison to the current +network or another cache. A special diff output for ibnetdiscover +output will be displayed showing differences between the old and current +fabric. By default, the following are compared for differences: switches, +channel adapters, routers, and port connections. + diff --git a/infiniband-diags/man/common/opt_diffcheck.rst b/infiniband-diags/man/common/opt_diffcheck.rst new file mode 100644 index 0000000..9f478c7 --- /dev/null +++ b/infiniband-diags/man/common/opt_diffcheck.rst @@ -0,0 +1,13 @@ +.. Define the common option diffcheck + +**--diffcheck <key(s)>** +Specify what diff checks should be done in the **--diff** option above. +Comma separate multiple diff check key(s). The available diff checks +are: **sw = switches**, **ca = channel adapters**, **router** = routers, +**port** = port connections, **lid** = lids, **nodedesc** = node +descriptions. Note that **port**, **lid**, and **nodedesc** are +checked only for the node types that are specified (e.g. **sw**, +**ca**, **router**). If **port** is specified alongside **lid** +or **nodedesc**, remote port lids and node descriptions will also be compared. + + diff --git a/infiniband-diags/man/common/opt_e.rst b/infiniband-diags/man/common/opt_e.rst new file mode 100644 index 0000000..f0c9103 --- /dev/null +++ b/infiniband-diags/man/common/opt_e.rst @@ -0,0 +1,3 @@ +.. Define the common option -e + +-e show send and receive errors (timeouts and others) diff --git a/infiniband-diags/man/common/opt_h.rst b/infiniband-diags/man/common/opt_h.rst new file mode 100644 index 0000000..2f74999 --- /dev/null +++ b/infiniband-diags/man/common/opt_h.rst @@ -0,0 +1,4 @@ +.. Define the common option -h + +**-h, --help** show the usage message + diff --git a/infiniband-diags/man/common/opt_load-cache.rst b/infiniband-diags/man/common/opt_load-cache.rst new file mode 100644 index 0000000..88dc05f --- /dev/null +++ b/infiniband-diags/man/common/opt_load-cache.rst @@ -0,0 +1,8 @@ +.. Define the common option load-cache + +**--load-cache <filename>** +Load and use the cached ibnetdiscover data stored in the specified +filename. May be useful for outputting and learning about other +fabrics or a previous state of a fabric. + + diff --git a/infiniband-diags/man/common/opt_node_name_map.rst b/infiniband-diags/man/common/opt_node_name_map.rst new file mode 100644 index 0000000..dd2c6cf --- /dev/null +++ b/infiniband-diags/man/common/opt_node_name_map.rst @@ -0,0 +1,6 @@ +.. Define the common option --node-name-map + +**--node-name-map <node-name-map>** Specify a node name map. + + This file maps GUIDs to more user friendly names. See FILES section. 
+ diff --git a/infiniband-diags/man/common/opt_o-outstanding_smps.rst b/infiniband-diags/man/common/opt_o-outstanding_smps.rst new file mode 100644 index 0000000..a52f09e --- /dev/null +++ b/infiniband-diags/man/common/opt_o-outstanding_smps.rst @@ -0,0 +1,7 @@ +.. Define the common option -z + +**--outstanding_smps, -o <val>** + Specify the number of outstanding SMPs which should be issued during the scan + + Default: 2 + diff --git a/infiniband-diags/man/common/opt_ports-file.rst b/infiniband-diags/man/common/opt_ports-file.rst new file mode 100644 index 0000000..5a0f487 --- /dev/null +++ b/infiniband-diags/man/common/opt_ports-file.rst @@ -0,0 +1,6 @@ +.. Define the common option --ports-file + +**--ports-file <ports-file>** Specify a ports file. + + This file contains multiple source and destination lid or guid pairs. See FILES section. + diff --git a/infiniband-diags/man/common/opt_s.rst b/infiniband-diags/man/common/opt_s.rst new file mode 100644 index 0000000..c7ada5e --- /dev/null +++ b/infiniband-diags/man/common/opt_s.rst @@ -0,0 +1,4 @@ +.. Define the common option -s + +**-s, --sm_port <smlid>** use 'smlid' as the target lid for SA queries. + diff --git a/infiniband-diags/man/common/opt_t.rst b/infiniband-diags/man/common/opt_t.rst new file mode 100644 index 0000000..59d3694 --- /dev/null +++ b/infiniband-diags/man/common/opt_t.rst @@ -0,0 +1,4 @@ +.. Define the common option -t + +**-t, --timeout <timeout_ms>** override the default timeout for the solicited mads. + diff --git a/infiniband-diags/man/common/opt_v.rst b/infiniband-diags/man/common/opt_v.rst new file mode 100644 index 0000000..9912007 --- /dev/null +++ b/infiniband-diags/man/common/opt_v.rst @@ -0,0 +1,6 @@ +.. Define the common option -v + +**-v, --verbose** + increase the application verbosity level. + May be used several times (-vv or -v -v -v) + diff --git a/infiniband-diags/man/common/opt_y.rst b/infiniband-diags/man/common/opt_y.rst new file mode 100644 index 0000000..91caf0c --- /dev/null +++ b/infiniband-diags/man/common/opt_y.rst @@ -0,0 +1,6 @@ +.. Define the common option -y + +**-y, --m_key <key>** + use the specified M_key for requests. If a non-numeric value (like 'x') + is specified, then a value will be prompted for. + diff --git a/infiniband-diags/man/common/opt_z-config.in.rst b/infiniband-diags/man/common/opt_z-config.in.rst new file mode 100644 index 0000000..449a6ef --- /dev/null +++ b/infiniband-diags/man/common/opt_z-config.in.rst @@ -0,0 +1,6 @@ +.. Define the common option -z + +**--config, -z <config_file>** Specify alternate config file. + + Default: @IBDIAG_CONFIG_PATH@/ibdiag.conf + diff --git a/infiniband-diags/man/common/sec_config-file.in.rst b/infiniband-diags/man/common/sec_config-file.in.rst new file mode 100644 index 0000000..060cf7b --- /dev/null +++ b/infiniband-diags/man/common/sec_config-file.in.rst @@ -0,0 +1,10 @@ +.. Common text for the config file + +CONFIG FILE +----------- + +@IBDIAG_CONFIG_PATH@/ibdiag.conf + +A global config file is provided to set some of the common options for all +tools. See supplied config file for details. + diff --git a/infiniband-diags/man/common/sec_node-name-map.rst b/infiniband-diags/man/common/sec_node-name-map.rst new file mode 100644 index 0000000..3e18d1b --- /dev/null +++ b/infiniband-diags/man/common/sec_node-name-map.rst @@ -0,0 +1,43 @@ +.. Common text to describe the node name map file. + +NODE NAME MAP FILE FORMAT +------------------------- + +The node name map is used to specify user-friendly names for nodes in the +output.
GUIDs are used to perform the lookup. + +This functionality is provided by the opensm-libs package. See **opensm(8)** +for the file location for your installation. + +**Generically:** + +:: + + # comment + <guid> "<name>" + +**Example:** + +:: + + # IB1 + # Line cards + 0x0008f104003f125c "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB-24D" + 0x0008f104003f125d "IB1 (Rack 11 slot 1 ) ISR9288/ISR9096 Voltaire sLB-24D" + 0x0008f104003f10d2 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB-24D" + 0x0008f104003f10d3 "IB1 (Rack 11 slot 2 ) ISR9288/ISR9096 Voltaire sLB-24D" + 0x0008f104003f10bf "IB1 (Rack 11 slot 12 ) ISR9288/ISR9096 Voltaire sLB-24D" + + # Spines + 0x0008f10400400e2d "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB-12D" + 0x0008f10400400e2e "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB-12D" + 0x0008f10400400e2f "IB1 (Rack 11 spine 1 ) ISR9288 Voltaire sFB-12D" + 0x0008f10400400e31 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB-12D" + 0x0008f10400400e32 "IB1 (Rack 11 spine 2 ) ISR9288 Voltaire sFB-12D" + + # GUID Node Name + 0x0008f10400411a08 "SW1 (Rack 3) ISR9024 Voltaire 9024D" + 0x0008f10400411a28 "SW2 (Rack 3) ISR9024 Voltaire 9024D" + 0x0008f10400411a34 "SW3 (Rack 3) ISR9024 Voltaire 9024D" + 0x0008f104004119d0 "SW4 (Rack 3) ISR9024 Voltaire 9024D" + diff --git a/infiniband-diags/man/common/sec_ports-file.rst b/infiniband-diags/man/common/sec_ports-file.rst new file mode 100644 index 0000000..c65299e --- /dev/null +++ b/infiniband-diags/man/common/sec_ports-file.rst @@ -0,0 +1,28 @@ +.. Common text to describe the port file. + +PORTS FILE FORMAT +------------------------- + +The ports file can be used to specify multiple source and destination pairs. They can be lids or guids. If guids, use the -G option to indicate that. + +**Generically:** + +:: + + # comment + <src> <dst> + +**Example:** + +:: + + 73 207 + 203 657 + 531 101 + + > OR < + + 0x0008f104003f125c 0x0008f104003f133d + 0x0008f1040011ab07 0x0008f104004265c0 + 0x0008f104007c5510 0x0008f1040099bb08 + diff --git a/infiniband-diags/man/common/sec_portselection.rst b/infiniband-diags/man/common/sec_portselection.rst new file mode 100644 index 0000000..56936b6 --- /dev/null +++ b/infiniband-diags/man/common/sec_portselection.rst @@ -0,0 +1,24 @@ +.. Explanation of local port selection + +Local port Selection +-------------------- + +Multiple port/Multiple CA support: when no IB device or port is specified +(see the "local umad parameters" below), the libibumad library +selects the port to use by the following criteria: + + 1. the first port that is ACTIVE. + 2. if not found, the first port that is UP (physical link up). + + If a port and/or CA name is specified, the libibumad library attempts + to fulfill the user request, and will fail if it is not possible. + + For example: + + :: + + ibaddr # use the first port (criteria #1 above) + ibaddr -C mthca1 # pick the best port from "mthca1" only. + ibaddr -P 2 # use the second (active/up) port from the first available IB device. + ibaddr -C mthca0 -P 2 # use the specified port only. + diff --git a/infiniband-diags/man/common/sec_topology-file.rst b/infiniband-diags/man/common/sec_topology-file.rst new file mode 100644 index 0000000..8249f2d --- /dev/null +++ b/infiniband-diags/man/common/sec_topology-file.rst @@ -0,0 +1,89 @@ +.. Common text to describe the Topology file. + +TOPOLOGY FILE FORMAT +-------------------- + +The topology file format is human readable and largely intuitive. 
+Most identifiers are given textual names like vendor ID (vendid), device ID +(devid), GUIDs of various types (sysimgguid, caguid, switchguid, etc.). +PortGUIDs are shown in parentheses (). For switches, this is shown on the +switchguid line. For CA and router ports, it is shown on the connectivity +lines. The IB node is identified, followed by the number of ports and +the node GUID in quotes. On the right of this line is a comment (#) followed by the +NodeDescription in quotes. If the node is a switch, this line also contains +whether switch port 0 is base or enhanced, and the LID and LMC of port 0. +Subsequent lines pertaining to this node show the connectivity. On the +left is the port number of the current node. On the right is the peer node +(node at other end of link). It is identified in quotes with nodetype +followed by - followed by NodeGUID with the port number in square brackets. +Further on the right is a comment (#). What follows the comment is +dependent on the node type. If it is a switch node, it is followed by +the NodeDescription in quotes and the LID of the peer node. If it is a +CA or router node, it is followed by the local LID and LMC and then +followed by the NodeDescription in quotes and the LID of the peer node. +The active link width and speed are then appended to the end of this +output line. + +An example of this is: + +:: + + # + # Topology file: generated on Tue Jun 5 14:15:10 2007 + # + # Max of 3 hops discovered + # Initiated from node 0008f10403960558 port 0008f10403960559 + + Non-Chassis Nodes + + vendid=0x8f1 + devid=0x5a06 + sysimgguid=0x5442ba00003000 + switchguid=0x5442ba00003080(5442ba00003080) + Switch 24 "S-005442ba00003080" # "ISR9024 Voltaire" base port 0 lid 6 lmc 0 + [22] "H-0008f10403961354"[1](8f10403961355) # "MT23108 InfiniHost Mellanox Technologies" lid 4 4xSDR + [10] "S-0008f10400410015"[1] # "SW-6IB4 Voltaire" lid 3 4xSDR + [8] "H-0008f10403960558"[2](8f1040396055a) # "MT23108 InfiniHost Mellanox Technologies" lid 14 4xSDR + [6] "S-0008f10400410015"[3] # "SW-6IB4 Voltaire" lid 3 4xSDR + [12] "H-0008f10403960558"[1](8f10403960559) # "MT23108 InfiniHost Mellanox Technologies" lid 10 4xSDR + + vendid=0x8f1 + devid=0x5a05 + switchguid=0x8f10400410015(8f10400410015) + Switch 8 "S-0008f10400410015" # "SW-6IB4 Voltaire" base port 0 lid 3 lmc 0 + [6] "H-0008f10403960984"[1](8f10403960985) # "MT23108 InfiniHost Mellanox Technologies" lid 16 4xSDR + [4] "H-005442b100004900"[1](5442b100004901) # "MT23108 InfiniHost Mellanox Technologies" lid 12 4xSDR + [1] "S-005442ba00003080"[10] # "ISR9024 Voltaire" lid 6 1xSDR + [3] "S-005442ba00003080"[6] # "ISR9024 Voltaire" lid 6 4xSDR + + vendid=0x2c9 + devid=0x5a44 + caguid=0x8f10403960984 + Ca 2 "H-0008f10403960984" # "MT23108 InfiniHost Mellanox Technologies" + [1](8f10403960985) "S-0008f10400410015"[6] # lid 16 lmc 1 "SW-6IB4 Voltaire" lid 3 4xSDR + + vendid=0x2c9 + devid=0x5a44 + caguid=0x5442b100004900 + Ca 2 "H-005442b100004900" # "MT23108 InfiniHost Mellanox Technologies" + [1](5442b100004901) "S-0008f10400410015"[4] # lid 12 lmc 1 "SW-6IB4 Voltaire" lid 3 4xSDR + + vendid=0x2c9 + devid=0x5a44 + caguid=0x8f10403961354 + Ca 2 "H-0008f10403961354" # "MT23108 InfiniHost Mellanox Technologies" + [1](8f10403961355) "S-005442ba00003080"[22] # lid 4 lmc 1 "ISR9024 Voltaire" lid 6 4xSDR + + vendid=0x2c9 + devid=0x5a44 + caguid=0x8f10403960558 + Ca 2 "H-0008f10403960558" # "MT23108 InfiniHost Mellanox Technologies" + [2](8f1040396055a) "S-005442ba00003080"[8] # lid 14 lmc 1 "ISR9024 Voltaire" lid 6 4xSDR
+ [1](8f10403960559) "S-005442ba00003080"[12] # lid 10 lmc 1 "ISR9024 Voltaire" lid 6 1xSDR + + +When grouping is used, IB nodes are organized into chassis which are +numbered. Nodes which cannot be determined to be in a chassis are +displayed as "Non-Chassis Nodes". External ports are also shown on the +connectivity lines. + diff --git a/infiniband-diags/man/dump_fts.8.in.rst b/infiniband-diags/man/dump_fts.8.in.rst new file mode 100644 index 0000000..a932bc1 --- /dev/null +++ b/infiniband-diags/man/dump_fts.8.in.rst @@ -0,0 +1,85 @@ +======== +DUMP_FTS +======== + +--------------------------------- +dump InfiniBand forwarding tables +--------------------------------- + +:Date: 2013-03-26 +:Manual section: 8 +:Manual group: OpenIB Diagnostics + + + +SYNOPSIS +======== + +dump_fts [options] [<startlid> [<endlid>]] + + +DESCRIPTION +=========== + +dump_fts is similar to ibroute but dumps tables for every switch found in an +ibnetdiscover scan of the subnet. + +The dump file format is compatible with loading into OpenSM using +the -R file -U /path/to/dump-file syntax. + +OPTIONS +======= + +**-a, --all** + show all lids in range, even invalid entries + +**-n, --no_dests** + do not try to resolve destinations + +**-M, --Multicast** + show multicast forwarding tables + In this case, the range parameters specify the mlid range. + + +Port Selection flags +-------------------- + +.. include:: common/opt_C.rst +.. include:: common/opt_P.rst +.. include:: common/sec_portselection.rst + +Debugging flags +--------------- + +.. include:: common/opt_d.rst +.. include:: common/opt_e.rst +.. include:: common/opt_h.rst +.. include:: common/opt_v.rst +.. include:: common/opt_V.rst + +Configuration flags +------------------- + +.. include:: common/opt_t.rst +.. include:: common/opt_y.rst +.. include:: common/opt_node_name_map.rst +.. include:: common/opt_z-config.rst + +FILES +===== + +.. include:: common/sec_config-file.rst +.. include:: common/sec_node-name-map.rst + + +SEE ALSO +======== + +**dump_lfts(8), dump_mfts(8), ibroute(8), ibswitches(8), opensm(8)** + + +AUTHORS +======= + +Ira Weiny + < ira.weiny@intel.com > diff --git a/infiniband-diags/man/ibaddr.8.in.rst b/infiniband-diags/man/ibaddr.8.in.rst new file mode 100644 index 0000000..be080de --- /dev/null +++ b/infiniband-diags/man/ibaddr.8.in.rst @@ -0,0 +1,102 @@ +====== +IBADDR +====== + +---------------------------- +query InfiniBand address(es) +---------------------------- + +:Date: 2013-10-11 +:Manual section: 8 +:Manual group: OpenIB Diagnostics + + +SYNOPSIS +======== + +ibaddr [options] + + +DESCRIPTION +=========== + + +Display the lid (and range) as well as the GID address of the +port specified (by DR path, lid, or GUID) or the local port by default. + +Note: this utility can be used as a simple address resolver. + +OPTIONS +======= + +**--gid_show, -g** +show gid address only + +**--lid_show, -l** +show lid range only + +**--Lid_show, -L** +show lid range (in decimal) only + + +Addressing Flags +---------------- + +.. include:: common/opt_D.rst +.. include:: common/opt_G.rst +.. include:: common/opt_s.rst + + +Debugging flags +--------------- + +.. include:: common/opt_d.rst +.. include:: common/opt_e.rst +.. include:: common/opt_h.rst +.. include:: common/opt_v.rst +.. include:: common/opt_V.rst + + +Port Selection flags +-------------------- + +.. include:: common/opt_C.rst +.. include:: common/opt_P.rst +.. include:: common/sec_portselection.rst + + +Configuration flags +------------------- + +.. include:: common/opt_y.rst
+.. include:: common/opt_t.rst +.. include:: common/opt_z-config.rst + +FILES +===== + +.. include:: common/sec_config-file.rst + + +EXAMPLES +======== + +:: + + ibaddr # local port\'s address + ibaddr 32 # show lid range and gid of lid 32 + ibaddr -G 0x8f1040023 # same but using guid address + ibaddr -l 32 # show lid range only + ibaddr -L 32 # show decimal lid range only + ibaddr -g 32 # show gid address only + +SEE ALSO +======== + +**ibroute (8), ibtracert (8)** + +AUTHOR +====== + +Hal Rosenstock + < halr@voltaire.com > diff --git a/infiniband-diags/man/ibcacheedit.8.in.rst b/infiniband-diags/man/ibcacheedit.8.in.rst new file mode 100644 index 0000000..d287142 --- /dev/null +++ b/infiniband-diags/man/ibcacheedit.8.in.rst @@ -0,0 +1,58 @@ +=========== +ibcacheedit +=========== + +--------------------------- +edit an ibnetdiscover cache +--------------------------- + +:Date: 2017-08-21 +:Manual section: 8 +:Manual group: Open IB Diagnostics + +SYNOPSIS +======== + +ibcacheedit [options] <orig.cache> <new.cache> + +DESCRIPTION +=========== + +ibcacheedit allows users to edit an ibnetdiscover cache created through the +**--cache** option in **ibnetdiscover(8)**. + +OPTIONS +======= + +**--switchguid BEFOREGUID:AFTERGUID** + Specify a switchguid that should be changed. The before and after guid + should be separated by a colon. On switches, port guids are identical + to the switch guid, so port guids will be adjusted as well on switches. + +**--caguid BEFOREGUID:AFTERGUID** + Specify a caguid that should be changed. The before and after guid + should be separated by a colon. + +**--sysimgguid BEFOREGUID:AFTERGUID** + Specify a sysimgguid that should be changed. The before and after guid + should be separated by a colon. + +**--portguid NODEGUID:BEFOREGUID:AFTERGUID** + Specify a portguid that should be changed. The nodeguid of the port + (e.g. switchguid or caguid) should be specified first, followed by a + colon, the before port guid, another colon, then the after port guid. + On switches, port guids are identical to the switch guid, so the switch + guid will be adjusted as well on switches. + +Debugging flags +--------------- + +.. include:: common/opt_h.rst +.. include:: common/opt_V.rst + + +AUTHORS +======= + +Albert Chu + < chu11@llnl.gov > diff --git a/infiniband-diags/man/ibccconfig.8.in.rst b/infiniband-diags/man/ibccconfig.8.in.rst new file mode 100644 index 0000000..741d50c --- /dev/null +++ b/infiniband-diags/man/ibccconfig.8.in.rst @@ -0,0 +1,94 @@ +========== +IBCCCONFIG +========== + +------------------------------------- +configure congestion control settings +------------------------------------- + +:Date: 2012-05-31 +:Manual section: 8 +:Manual group: OpenIB Diagnostics + + +SYNOPSIS +======== + +ibccconfig [common_options] [-c cckey] <op> <lid|guid> [port] + +DESCRIPTION +=========== + +**ibccconfig** +supports the configuration of congestion control settings on switches +and HCAs. + +**WARNING -- You should understand what you are doing before using this tool.
+Misuse of this tool could result in a broken fabric.** + +OPTIONS +======= + +Currently supported operations and their parameters: + CongestionKeyInfo (CK) <lid|guid> <cckey> <cckeyprotectbit> <cckeyleaseperiod> <cckeyviolations> + SwitchCongestionSetting (SS) <lid|guid> <controlmap> <victimmask> <creditmask> <threshold> <packetsize> <csthreshold> <csreturndelay> <markingrate> + SwitchPortCongestionSetting (SP) <lid|guid> <portnum> <valid> <control_type> <threshold> <packet_size> <cong_parm_marking_rate> + CACongestionSetting (CS) <lid|guid> <port_control> <control_map> <ccti_timer> <ccti_increase> <trigger_threshold> <ccti_min> + CongestionControlTable (CT) <lid|guid> <cctilimit> <index> <cctentry> <cctentry> ... + +**--cckey, -c <cckey>** +Specify a congestion control (CC) key. If none is specified, a key of 0 is used. + + +Debugging flags +--------------- + +.. include:: common/opt_d.rst +.. include:: common/opt_e.rst +.. include:: common/opt_h.rst +.. include:: common/opt_v.rst +.. include:: common/opt_V.rst + +Addressing Flags +---------------- + +.. include:: common/opt_G.rst +.. include:: common/opt_L.rst +.. include:: common/opt_s.rst + + +Port Selection flags +-------------------- + +.. include:: common/opt_C.rst +.. include:: common/opt_P.rst +.. include:: common/sec_portselection.rst + +Configuration flags +------------------- + +.. include:: common/opt_y.rst +.. include:: common/opt_z-config.rst + + +EXAMPLES +======== + +:: + + ibccconfig SwitchCongestionSetting 2 0x1F 0x1FFFFFFFFF 0x0 0xF 8 0 0:0 1 # Configure Switch Congestion Settings + ibccconfig CACongestionSetting 1 0 0x3 150 1 0 0 # Configure CA Congestion Settings to SL 0 and SL 1 + ibccconfig CACongestionSetting 1 0 0x4 200 1 0 0 # Configure CA Congestion Settings to SL 2 + ibccconfig CongestionControlTable 1 63 0 0:0 0:1 ... # Configure first block of Congestion Control Table + ibccconfig CongestionControlTable 1 127 0 0:64 0:65 ... # Configure second block of Congestion Control Table + +FILES +===== + +.. include:: common/sec_config-file.rst + +AUTHOR +====== + +Albert Chu + < chu11@llnl.gov > diff --git a/infiniband-diags/man/ibccquery.8.in.rst b/infiniband-diags/man/ibccquery.8.in.rst new file mode 100644 index 0000000..85c0510 --- /dev/null +++ b/infiniband-diags/man/ibccquery.8.in.rst @@ -0,0 +1,90 @@ +========= +IBCCQUERY +========= + +-------------------------------------- +query congestion control settings/info +-------------------------------------- + +:Date: 2012-05-31 +:Manual section: 8 +:Manual group: OpenIB Diagnostics + + +SYNOPSIS +======== +ibccquery [common_options] [-c cckey] <op> <lid|guid> [port] + +DESCRIPTION +=========== + +ibccquery supports the querying of settings and other information related +to congestion control. + +OPTIONS +======= + +Currently supported operations and their parameters: + CongestionInfo (CI) <addr> + CongestionKeyInfo (CK) <addr> + CongestionLog (CL) <addr> + SwitchCongestionSetting (SS) <addr> + SwitchPortCongestionSetting (SP) <addr> [<portnum>] + CACongestionSetting (CS) <addr> + CongestionControlTable (CT) <addr> + Timestamp (TI) <addr> + + +**--cckey, -c <cckey>** +Specify a congestion control (CC) key. If none is specified, a key of 0 is used. + + +Debugging flags +--------------- + +.. include:: common/opt_d.rst +.. include:: common/opt_e.rst +.. include:: common/opt_h.rst +.. include:: common/opt_v.rst +.. include:: common/opt_V.rst + +Addressing Flags +---------------- + +.. include:: common/opt_G.rst +.. include:: common/opt_L.rst
+.. include:: common/opt_s.rst + + +Port Selection flags +-------------------- + +.. include:: common/opt_C.rst +.. include:: common/opt_P.rst +.. include:: common/sec_portselection.rst + +Configuration flags +------------------- + +.. include:: common/opt_y.rst +.. include:: common/opt_z-config.rst + +FILES +===== + +.. include:: common/sec_config-file.rst + +EXAMPLES +======== + +:: + + ibccquery CongestionInfo 3 # Congestion Info by lid + ibccquery SwitchPortCongestionSetting 3 # Query all Switch Port Congestion Settings + ibccquery SwitchPortCongestionSetting 3 1 # Query Switch Port Congestion Setting for port 1 + +AUTHOR +====== + +Albert Chu + < chu11@llnl.gov > diff --git a/infiniband-diags/man/ibcheckerrors.8 b/infiniband-diags/man/ibcheckerrors.8 new file mode 100644 index 0000000..7c2467f --- /dev/null +++ b/infiniband-diags/man/ibcheckerrors.8 @@ -0,0 +1,41 @@ +.TH IBCHECKERRORS 8 "May 21, 2007" "OpenIB" "OpenIB Diagnostics" + +.SH NAME +ibcheckerrors \- validate IB subnet and report errors + +.SH SYNOPSIS +.B ibcheckerrors +[\-h] [\-b] [\-v] [\-N | \-nocolor] [<topology-file> | \-C ca_name +\-P ca_port \-t(imeout) timeout_ms] + +.SH DESCRIPTION +.PP +ibcheckerrors is a script which uses a full topology file that was created by +ibnetdiscover, scans the network to validate the connectivity and reports +errors (from port counters). + +.SH OPTIONS +.PP +\-v increase the verbosity level +.PP +\-b brief mode. Reduce the output to show only if errors are present, + not what they are. +.PP +\-N | \-nocolor use mono rather than color mode +.PP +\-C <ca_name> use the specified ca_name. +.PP +\-P <ca_port> use the specified ca_port. +.PP +\-t <timeout_ms> override the default timeout for the solicited mads. + +.SH SEE ALSO +.BR ibnetdiscover(8), +.BR ibchecknode(8), +.BR ibcheckport(8), +.BR ibcheckerrs(8) + +.SH AUTHOR +.TP +Hal Rosenstock +.RI < halr@voltaire.com > diff --git a/infiniband-diags/man/ibcheckerrs.8 b/infiniband-diags/man/ibcheckerrs.8 new file mode 100644 index 0000000..f8aa848 --- /dev/null +++ b/infiniband-diags/man/ibcheckerrs.8 @@ -0,0 +1,59 @@ +.TH IBCHECKERRS 8 "May 30, 2007" "OpenIB" "OpenIB Diagnostics" + +.SH NAME +ibcheckerrs \- validate IB port (or node) and report errors in counters above threshold + +.SH SYNOPSIS +.B ibcheckerrs +[\-h] [\-b] [\-v] [\-G] [\-T <threshold_file>] [\-s(how_thresholds)] +[\-N | \-nocolor] [\-C ca_name] [\-P ca_port] [\-t(imeout) timeout_ms] +<lid|guid> <port> + + +.SH DESCRIPTION +.PP +Check the specified port (or node) and report errors that surpassed their predefined +threshold. Port address is lid unless -G option is used to specify a GUID +address. The predefined thresholds can be dumped using the -s option, and a +user defined threshold_file (using the same format as the dump) can be +specified using the -T <file> option. + +.SH OPTIONS +.PP +\-G use GUID address argument. In most cases, it is the Port GUID. + Example: + "0x08f1040023" +.PP +\-s show predefined thresholds +.PP +\-T use specified threshold file +.PP +\-v increase the verbosity level +.PP +\-b brief mode. Reduce the output to show only if errors are + present, not what they are. +.PP +\-N | \-nocolor use mono rather than color mode +.PP +\-C <ca_name> use the specified ca_name. +.PP +\-P <ca_port> use the specified ca_port. +.PP +\-t <timeout_ms> override the default timeout for the solicited mads.
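+.PP
+For instance, a plausible threshold workflow (the lid and the file name
+my_thresholds are illustrative, and this sketch assumes the \-s dump is
+written to stdout) is to dump the predefined thresholds, edit the dump,
+and feed it back:
+.PP
+ibcheckerrs -s 2 > my_thresholds # dump the predefined thresholds
+.PP
+ibcheckerrs -T my_thresholds 2   # recheck lid 2 against the edited thresholds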
+ +.SH EXAMPLE +.PP +ibcheckerrs 2 # check aggregated node counter for lid 2 +.PP +ibcheckerrs 2 4 # check port counters for lid 2 port 4 +.PP +ibcheckerrs -T xxx 2 # check node using xxx threshold file + +.SH SEE ALSO +.BR perfquery(8), +.BR ibaddr(8) + +.SH AUTHOR +.TP +Hal Rosenstock +.RI < halr@voltaire.com > diff --git a/infiniband-diags/man/ibchecknet.8 b/infiniband-diags/man/ibchecknet.8 new file mode 100644 index 0000000..f907823 --- /dev/null +++ b/infiniband-diags/man/ibchecknet.8 @@ -0,0 +1,36 @@ +.TH IBCHECKNET 8 "May 21, 2007" "OpenIB" "OpenIB Diagnostics" + +.SH NAME +ibchecknet \- validate IB subnet and report errors + +.SH SYNOPSIS +.B ibchecknet +[\-h] [\-N | \-nocolor] [<topology-file> | \-C ca_name \-P ca_port +\-t(imeout) timeout_ms] + +.SH DESCRIPTION +.PP +ibchecknet is a script which uses a full topology file that was created +by ibnetdiscover, and scans the network to validate the connectivity and +reports errors (from port counters). + +.SH OPTIONS +.PP +\-N | \-nocolor use mono rather than color mode +.PP +\-C <ca_name> use the specified ca_name. +.PP +\-P <ca_port> use the specified ca_port. +.PP +\-t <timeout_ms> override the default timeout for the solicited mads. + +.SH SEE ALSO +.BR ibnetdiscover(8), +.BR ibchecknode(8), +.BR ibcheckport(8), +.BR ibcheckerrs(8) + +.SH AUTHOR +.TP +Hal Rosenstock +.RI < halr@voltaire.com > diff --git a/infiniband-diags/man/ibchecknode.8 b/infiniband-diags/man/ibchecknode.8 new file mode 100644 index 0000000..3d65d8a --- /dev/null +++ b/infiniband-diags/man/ibchecknode.8 @@ -0,0 +1,43 @@ +.TH IBCHECKNODE 8 "May 21, 2007" "OpenIB" "OpenIB Diagnostics" + +.SH NAME +ibchecknode \- validate IB node and report errors + +.SH SYNOPSIS +.B ibchecknode +[\-h] [\-v] [\-N | \-nocolor] [\-G] [\-C ca_name] [\-P ca_port] +[\-t(imeout) timeout_ms] <lid|guid> + +.SH DESCRIPTION +.PP +Check connectivity and do some simple sanity checks for the specified node. +Port address is a lid unless -G option is used to specify a GUID address. + +.SH OPTIONS +.PP +\-G use GUID address argument. In most cases, it is the Port GUID. + Example: + "0x08f1040023" +.PP +\-v increase the verbosity level +.PP +\-N | \-nocolor use mono rather than color mode +.PP +\-C <ca_name> use the specified ca_name. +.PP +\-P <ca_port> use the specified ca_port. +.PP +\-t <timeout_ms> override the default timeout for the solicited mads. + +.SH EXAMPLE +.PP +ibchecknode 2 # check node via lid 2 + +.SH SEE ALSO +.BR smpquery(8), +.BR ibaddr(8) + +.SH AUTHOR +.TP +Hal Rosenstock +.RI < halr@voltaire.com > diff --git a/infiniband-diags/man/ibcheckport.8 b/infiniband-diags/man/ibcheckport.8 new file mode 100644 index 0000000..f01095b --- /dev/null +++ b/infiniband-diags/man/ibcheckport.8 @@ -0,0 +1,43 @@ +.TH IBCHECKPORT 8 "May 21, 2007" "OpenIB" "OpenIB Diagnostics" + +.SH NAME +ibcheckport \- validate IB port and report errors + +.SH SYNOPSIS +.B ibcheckport +[\-h] [\-v] [\-N | \-nocolor] [\-G] [\-C ca_name] [\-P ca_port] +[\-t(imeout) timeout_ms] <lid|guid> <port> + +.SH DESCRIPTION +.PP +Check connectivity and do some simple sanity checks for the specified port. +Port address is a lid unless -G option is used to specify a GUID address. + +.SH OPTIONS +.PP +\-G use GUID address argument. In most cases, it is the Port GUID. + Example: + "0x08f1040023" +.PP +\-v increase the verbosity level +.PP +\-N | \-nocolor use mono rather than color mode +.PP +\-C <ca_name> use the specified ca_name. +.PP +\-P <ca_port> use the specified ca_port. 
+.PP +\-t <timeout_ms> override the default timeout for the solicited mads. + +.SH EXAMPLE +.PP +ibcheckport 2 3 # check lid 2 port 3 + +.SH SEE ALSO +.BR smpquery(8), +.BR ibaddr(8) + +.SH AUTHOR +.TP +Hal Rosenstock +.RI < halr@voltaire.com > diff --git a/infiniband-diags/man/ibcheckportstate.8 b/infiniband-diags/man/ibcheckportstate.8 new file mode 100644 index 0000000..8d7f38b --- /dev/null +++ b/infiniband-diags/man/ibcheckportstate.8 @@ -0,0 +1,44 @@ +.TH IBCHECKPORTSTATE 8 "May 21, 2007" "OpenIB" "OpenIB Diagnostics" + +.SH NAME +ibcheckportstate \- validate IB port for LinkUp and not Active state + +.SH SYNOPSIS +.B ibcheckportstate +[\-h] [\-v] [\-N | \-nocolor] [\-G] [\-C ca_name] [\-P ca_port] +[\-t(imeout) timeout_ms] <lid|guid> <port> + +.SH DESCRIPTION +.PP +Check connectivity and check the specified port for proper port state +(Active) and port physical state (LinkUp). +Port address is a lid unless -G option is used to specify a GUID address. + +.SH OPTIONS +.PP +\-G use GUID address argument. In most cases, it is the Port GUID. + Example: + "0x08f1040023" +.PP +\-v increase the verbosity level +.PP +\-N | \-nocolor use mono rather than color mode +.PP +\-C <ca_name> use the specified ca_name. +.PP +\-P <ca_port> use the specified ca_port. +.PP +\-t <timeout_ms> override the default timeout for the solicited mads. + +.SH EXAMPLE +.PP +ibcheckportstate 2 3 # check lid 2 port 3 + +.SH SEE ALSO +.BR smpquery(8), +.BR ibaddr(8) + +.SH AUTHOR +.TP +Hal Rosenstock +.RI < halr@voltaire.com > diff --git a/infiniband-diags/man/ibcheckportwidth.8 b/infiniband-diags/man/ibcheckportwidth.8 new file mode 100644 index 0000000..c368467 --- /dev/null +++ b/infiniband-diags/man/ibcheckportwidth.8 @@ -0,0 +1,43 @@ +.TH IBCHECKPORTWIDTH 8 "May 21, 2007" "OpenIB" "OpenIB Diagnostics" + +.SH NAME +ibcheckportwidth \- validate IB port for 1x link width + +.SH SYNOPSIS +.B ibcheckportwidth +[\-h] [\-v] [\-N | \-nocolor] [\-G] [\-C ca_name] [\-P ca_port] +[\-t(imeout) timeout_ms] <lid|guid> <port> + +.SH DESCRIPTION +.PP +Check connectivity and check the specified port for 1x link width. +Port address is a lid unless -G option is used to specify a GUID address. + +.SH OPTIONS +.PP +\-G use GUID address argument. In most cases, it is the Port GUID. + Example: + "0x08f1040023" +.PP +\-v increase the verbosity level +.PP +\-N | \-nocolor use mono rather than color mode +.PP +\-C <ca_name> use the specified ca_name. +.PP +\-P <ca_port> use the specified ca_port. +.PP +\-t <timeout_ms> override the default timeout for the solicited mads. + +.SH EXAMPLE +.PP +ibcheckportwidth 2 3 # check lid 2 port 3 + +.SH SEE ALSO +.BR smpquery(8), +.BR ibaddr(8) + +.SH AUTHOR +.TP +Hal Rosenstock +.RI < halr@voltaire.com > diff --git a/infiniband-diags/man/ibcheckstate.8 b/infiniband-diags/man/ibcheckstate.8 new file mode 100644 index 0000000..89daeb8 --- /dev/null +++ b/infiniband-diags/man/ibcheckstate.8 @@ -0,0 +1,36 @@ +.TH IBCHECKSTATE 8 "May 21, 2007" "OpenIB" "OpenIB Diagnostics" + +.SH NAME +ibcheckstate \- find ports in IB subnet which are link up but not active + +.SH SYNOPSIS +.B ibcheckstate +[\-h] [\-v] [\-N | \-nocolor] [<topology-file> | \-C ca_name \-P ca_port +\-t(imeout) timeout_ms] + +.SH DESCRIPTION +.PP +ibcheckstate is a script which uses a full topology file that was created by +ibnetdiscover, scans the network to validate the port state and port physical +state, and reports any ports which have a port state other than Active or +a port physical state other than LinkUp.
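+.PP
+For example (a minimal sketch; with no arguments the whole subnet is scanned
+through the first usable local port):
+.PP
+ibcheckstate    # report ports that are LinkUp but not Active
+.PP
+ibcheckstate -v # same scan with increased verbosity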
+ +.SH OPTIONS +.PP +\-N | \-nocolor use mono rather than color mode +.PP +\-C <ca_name> use the specified ca_name. +.PP +\-P <ca_port> use the specified ca_port. +.PP +\-t <timeout_ms> override the default timeout for the solicited mads. + +.SH SEE ALSO +.BR ibnetdiscover(8), +.BR ibchecknode(8), +.BR ibcheckportstate(8) + +.SH AUTHOR +.TP +Hal Rosenstock +.RI < halr@voltaire.com > diff --git a/infiniband-diags/man/ibcheckwidth.8 b/infiniband-diags/man/ibcheckwidth.8 new file mode 100644 index 0000000..1414fb2 --- /dev/null +++ b/infiniband-diags/man/ibcheckwidth.8 @@ -0,0 +1,36 @@ +.TH IBCHECKWIDTH 8 "May 21, 2007" "OpenIB" "OpenIB Diagnostics" + +.SH NAME +ibcheckwidth \- find 1x links in IB subnet + +.SH SYNOPSIS +.B ibcheckwidth +[\-h] [\-v] [\-N | \-nocolor] [<topology-file> | \-C ca_name +\-P ca_port \-t(imeout) timeout_ms] + + +.SH DESCRIPTION +.PP +ibcheckwidth is a script which uses a full topology file that was created by +ibnetdiscover, scans the network to validate the active link widths and +reports any 1x links. + +.SH OPTIONS +.PP +\-N | \-nocolor use mono rather than color mode +.PP +\-C <ca_name> use the specified ca_name. +.PP +\-P <ca_port> use the specified ca_port. +.PP +\-t <timeout_ms> override the default timeout for the solicited mads. + +.SH SEE ALSO +.BR ibnetdiscover(8), +.BR ibchecknode(8), +.BR ibcheckportwidth(8) + +.SH AUTHOR +.TP +Hal Rosenstock +.RI < halr@voltaire.com > diff --git a/infiniband-diags/man/ibclearcounters.8 b/infiniband-diags/man/ibclearcounters.8 new file mode 100644 index 0000000..1fca7bd --- /dev/null +++ b/infiniband-diags/man/ibclearcounters.8 @@ -0,0 +1,30 @@ +.TH IBCLEARCOUNTERS 8 "May 21, 2007" "OpenIB" "OpenIB Diagnostics" + +.SH NAME +ibclearcounters \- clear port counters in IB subnet + +.SH SYNOPSIS +.B ibclearcounters +[\-h] [<topology-file> | \-C ca_name \-P ca_port \-t(imeout) timeout_ms] + +.SH DESCRIPTION +.PP +ibclearcounters is a script that clears the PMA port counters by either walking +the IB subnet topology or using an already saved topology file. + +.SH OPTIONS +.PP +\-C <ca_name> use the specified ca_name. +.PP +\-P <ca_port> use the specified ca_port. +.PP +\-t <timeout_ms> override the default timeout for the solicited mads. + +.SH SEE ALSO +.BR ibnetdiscover(8), +.BR perfquery(8) + +.SH AUTHOR +.TP +Hal Rosenstock +.RI < halr@voltaire.com > diff --git a/infiniband-diags/man/ibclearerrors.8 b/infiniband-diags/man/ibclearerrors.8 new file mode 100644 index 0000000..7692c64 --- /dev/null +++ b/infiniband-diags/man/ibclearerrors.8 @@ -0,0 +1,34 @@ +.TH IBCLEARERRORS 8 "May 21, 2007" "OpenIB" "OpenIB Diagnostics" + +.SH NAME +ibclearerrors \- clear error counters in IB subnet + +.SH SYNOPSIS +.B ibclearerrors +[\-h] [\-N | \-nocolor] [<topology-file> | \-C ca_name \-P ca_port +\-t(imeout) timeout_ms] + +.SH DESCRIPTION +.PP +ibclearerrors is a script which clears the PMA error counters in PortCounters +by either walking the IB subnet topology or using an already saved topology +file. + +.SH OPTIONS +.PP +\-N | \-nocolor use mono rather than color mode +.PP +\-C <ca_name> use the specified ca_name. +.PP +\-P <ca_port> use the specified ca_port. +.PP +\-t <timeout_ms> override the default timeout for the solicited mads. 
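+.SH EXAMPLE
+.PP
+Hypothetical invocations (the CA name mthca0 is illustrative only):
+.PP
+ibclearerrors                # walk the subnet and clear the PMA error counters
+.PP
+ibclearerrors -C mthca0 -P 1 # issue the clears through a chosen local port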
+ +.SH SEE ALSO +.BR ibnetdiscover(8), +.BR perfquery(8) + +.SH AUTHOR +.TP +Hal Rosenstock +.RI < halr@voltaire.com > diff --git a/infiniband-diags/man/ibdatacounters.8 b/infiniband-diags/man/ibdatacounters.8 new file mode 100644 index 0000000..cc1a622 --- /dev/null +++ b/infiniband-diags/man/ibdatacounters.8 @@ -0,0 +1,39 @@ +.TH IBDATACOUNTERS 8 "May 31, 2007" "OpenIB" "OpenIB Diagnostics" + +.SH NAME +ibdatacounters \- query IB subnet for data counters + +.SH SYNOPSIS +.B ibdatacounters +[\-h] [\-b] [\-v] [\-N | \-nocolor] [<topology-file> | \-C ca_name \-P ca_port +\-t(imeout) timeout_ms] + +.SH DESCRIPTION +.PP +ibdatacounters is a script which uses a full topology file that was created by +ibnetdiscover, scans the network to validate the connectivity and reports +the data counters (from port counters). + +.SH OPTIONS +.PP +\-v increase the verbosity level +.PP +\-b brief mode. Reduce the output to show only if errors are present, + not what they are. +.PP +\-N | \-nocolor use mono rather than color mode +.PP +\-C <ca_name> use the specified ca_name. +.PP +\-P <ca_port> use the specified ca_port. +.PP +\-t <timeout_ms> override the default timeout for the solicited mads. + +.SH SEE ALSO +.BR ibnetdiscover(8), +.BR ibdatacounts(8) + +.SH AUTHOR +.TP +Hal Rosenstock +.RI < halr@voltaire.com > diff --git a/infiniband-diags/man/ibdatacounts.8 b/infiniband-diags/man/ibdatacounts.8 new file mode 100644 index 0000000..d1b31e3 --- /dev/null +++ b/infiniband-diags/man/ibdatacounts.8 @@ -0,0 +1,48 @@ +.TH IBDATACOUNTS 8 "May 30, 2007" "OpenIB" "OpenIB Diagnostics" + +.SH NAME +ibdatacounts \- get IB port data counters + +.SH SYNOPSIS +.B ibdatacounts +[\-h] [\-b] [\-v] [\-G] [\-N | \-nocolor] [\-C ca_name] [\-P ca_port] +[\-t(imeout) timeout_ms] <lid|guid> [<port>] + +.SH DESCRIPTION +.PP +Obtain PMA data counters from the specified port (or node). +Port address is lid unless -G option is used to specify a GUID +address. + +.SH OPTIONS +.PP +\-G use GUID address argument. In most cases, it is the Port GUID. + Example: + "0x08f1040023" +.PP +\-v increase the verbosity level +.PP +\-b brief mode +.PP +\-N | \-nocolor use mono rather than color mode +.PP +\-C <ca_name> use the specified ca_name. +.PP +\-P <ca_port> use the specified ca_port. +.PP +\-t <timeout_ms> override the default timeout for the solicited mads. + +.SH EXAMPLE +.PP +ibdatacounts 2 # show data counters for lid 2 +.PP +ibdatacounts 2 4 # show data counters for lid 2 port 4 + +.SH SEE ALSO +.BR perfquery(8), +.BR ibaddr(8) + +.SH AUTHOR +.TP +Hal Rosenstock +.RI < halr@voltaire.com > diff --git a/infiniband-diags/man/ibdiscover.8 b/infiniband-diags/man/ibdiscover.8 new file mode 100644 index 0000000..5e1e019 --- /dev/null +++ b/infiniband-diags/man/ibdiscover.8 @@ -0,0 +1,50 @@ +.TH IBDISCOVER.PL 8 "September 21, 2006" "OpenIB" "OpenIB Diagnostics" + +.SH NAME +ibdiscover.pl \- annotate and compare InfiniBand topology + +.SH SYNOPSIS +.B ibdiscover.pl + +.SH DESCRIPTION +.PP +ibdiscover.pl uses a topology file created by ibnetdiscover, a discover.map +file created by the network administrator which indicates the nodes to be +expected, and an ibdiscover.topo file which is the expected connectivity; it +produces a new connectivity file (discover.topo.new) and outputs the changes +to stdout. The network administrator can choose to replace +the "old" topo file with the new one or to merge certain changes in. + +The syntax of the ibdiscover.map file is: + +<nodeGUID>|port|"Text for node"|<NodeDescription from ibnetdiscover format> + +e.g.
+ +8f10400410015|8|"ISR 6000"|# SW-6IB4 Voltaire port 0 lid 5 + +8f10403960558|2|"HCA 1"|# MT23108 InfiniHost Mellanox Technologies + +The syntax of the old and new topo files (ibdiscover.topo and +ibdiscover.topo.new) are: + +<LocalPort>|<LocalNodeGUID>|<RemotePort>|<RemoteNodeGUID> + +e.g. + +10|5442ba00003080|1|8f10400410015 + +These topo files are produced by the ibdiscover.pl tool. + +.SH USAGE + +.PP +ibnetdiscover | ibdiscover.pl + +.SH SEE ALSO +.BR ibnetdiscover(8) + +.SH AUTHOR +.TP +Hal Rosenstock +.RI < halr@voltaire.com > diff --git a/infiniband-diags/man/ibfindnodesusing.8.in.rst b/infiniband-diags/man/ibfindnodesusing.8.in.rst new file mode 100644 index 0000000..82c34ce --- /dev/null +++ b/infiniband-diags/man/ibfindnodesusing.8.in.rst @@ -0,0 +1,53 @@ +================ +ibfindnodesusing +================ + +------------------------------------------------------------------------------- +find a list of end nodes which are routed through the specified switch and port +------------------------------------------------------------------------------- + +:Date: 2017-08-21 +:Manual section: 8 +:Manual group: Open IB Diagnostics + +SYNOPSIS +======== + +ibfindnodesusing.pl [options] <switch_guid|switch_name> <port> + +DESCRIPTION +=========== + +ibfindnodesusing.pl uses ibroute and detects the current nodes which are routed +through both directions of the link specified. The link is specified by one +switch port end; the script finds the remote end automatically. + + +OPTIONS +======= + +**-h** + show help + +**-R** + Recalculate the ibnetdiscover information, ie do not use the cached + information. This option is slower but should be used if the diag + tools have not been used for some time or if there are other reasons to + believe that the fabric has changed. + +**-C <ca_name>** use the specified ca_name. + +**-P <ca_port>** use the specified ca_port. + + +FILES +===== + +.. include:: common/sec_config-file.rst +.. include:: common/sec_node-name-map.rst + +AUTHOR +====== + +Ira Weiny + < ira.weiny@intel.com > diff --git a/infiniband-diags/man/ibhosts.8.in.rst b/infiniband-diags/man/ibhosts.8.in.rst new file mode 100644 index 0000000..7a7fe61 --- /dev/null +++ b/infiniband-diags/man/ibhosts.8.in.rst @@ -0,0 +1,57 @@ +======= +IBHOSTS +======= + +-------------------------------------- +show InfiniBand host nodes in topology +-------------------------------------- + +:Date: 2016-12-20 +:Manual section: 8 +:Manual group: OpenIB Diagnostics + +SYNOPSIS +======== + +ibhosts [options] [<topology-file>] + + +DESCRIPTION +=========== + +ibhosts is a script which either walks the IB subnet topology or uses an +already saved topology file and extracts the CA nodes. + +OPTIONS +======= + +.. include:: common/opt_C.rst +.. include:: common/opt_P.rst +.. include:: common/opt_t.rst +.. include:: common/opt_y.rst +.. include:: common/opt_h.rst +.. include:: common/opt_z-config.rst + +.. include:: common/sec_portselection.rst + +FILES +===== + +.. include:: common/sec_config-file.rst +.. 
include:: common/sec_node-name-map.rst + +SEE ALSO +======== + +ibnetdiscover(8) + +DEPENDENCIES +============ + +ibnetdiscover, ibnetdiscover format + +AUTHOR +====== + +Hal Rosenstock + < halr@voltaire.com > diff --git a/infiniband-diags/man/ibidsverify.8.in.rst b/infiniband-diags/man/ibidsverify.8.in.rst new file mode 100644 index 0000000..834485d --- /dev/null +++ b/infiniband-diags/man/ibidsverify.8.in.rst @@ -0,0 +1,63 @@ +=========== +ibidsverify +=========== + +--------------------------------------------------- +validate IB identifiers in subnet and report errors +--------------------------------------------------- + +:Date: 2017-08-21 +:Manual section: 8 +:Manual group: Open IB Diagnostics + +SYNOPSIS +======== + +ibidsverify.pl [-h] [-R] + +DESCRIPTION +=========== + +ibidsverify.pl is a perl script which uses a full topology file that was +created by ibnetdiscover, scans the network to validate the LIDs and GUIDs +in the subnet. The validation consists of checking that there are no zero +or duplicate identifiers. + +Finally, ibidsverify.pl will also reuse the cached ibnetdiscover output from +some of the other diag tools which makes it a bit faster than running +ibnetdiscover from scratch. + +OPTIONS +======= + +**-R** +Recalculate the ibnetdiscover information, ie do not use the cached +information. This option is slower but should be used if the diag tools have +not been used for some time or if there are other reasons to believe the +fabric has changed. + +**-C <ca_name>** use the specified ca_name. + +**-P <ca_port>** use the specified ca_port. + +EXIT STATUS +=========== + +Exit status is 1 if errors are found, 0 otherwise. + +FILES +===== + +.. include:: common/sec_config-file.rst + + +SEE ALSO +======== + +**ibnetdiscover(8)** + +AUTHOR +====== + +Hal Rosenstock + < halr@voltaire.com > diff --git a/infiniband-diags/man/iblinkinfo.8.in.rst b/infiniband-diags/man/iblinkinfo.8.in.rst new file mode 100644 index 0000000..7e6b240 --- /dev/null +++ b/infiniband-diags/man/iblinkinfo.8.in.rst @@ -0,0 +1,139 @@ +========== +IBLINKINFO +========== + +-------------------------------------------- +report link info for all links in the fabric +-------------------------------------------- + +:Date: 2018-07-09 +:Manual section: 8 +:Manual group: OpenIB Diagnostics + + +SYNOPSIS +======== + +iblinkinfo <options> + +DESCRIPTION +=========== + +iblinkinfo reports link info for each port in an IB fabric, node by node. +Optionally, iblinkinfo can do partial scans and limit its output to parts of a +fabric. + +OPTIONS +======= + +**--down, -d** +Print only nodes which have a port in the "Down" state. + +**--line, -l** +Print all information for each link on one line. Default is to print a header +with the node information and then a list for each port (useful for +grep'ing output). + + +**--additional, -p** +Print additional port settings (<LifeTime>,<HoqLife>,<VLStallCount>) + +**--switches-only** +Show only switches in output. + +**--cas-only** +Show only CAs in output. + + +Partial Scan flags +------------------ + +The node to start a partial scan can be specified with the following addresses. + +.. include:: common/opt_G_with_param.rst +.. include:: common/opt_D_with_param.rst + +**Note:** For switches results are printed for all ports not just switch port 0. + +**--switch, -S <port_guid>** same as "-G". (provided only for backward compatibility) + +How much of the scan to be printed can be controlled with the following. + +**--all, -a** +Print all nodes found in a partial fabric scan. 
Normally a +partial fabric scan will return only the node specified. This option will +print the other nodes found as well. + +**--hops, -n <hops>** +Specify the number of hops away from a specified node to scan. This is useful +to expand a partial fabric scan beyond the node specified. + + +Cache File flags +---------------- + +.. include:: common/opt_load-cache.rst +.. include:: common/opt_diff.rst + +**--diffcheck <key(s)>** +Specify what diff checks should be done in the **--diff** option above. Comma +separate multiple diff check key(s). The available diff checks are: **port** = +port connections, **state** = port state, **lid** = lids, **nodedesc** = node +descriptions. Note that **port**, **lid**, and **nodedesc** are checked only +for the node types that are specified (e.g. **switches-only**, **cas-only**). +If **port** is specified alongside **lid** or **nodedesc**, remote port lids +and node descriptions will also be compared. + + +**--filterdownports <filename>** +Filter downports indicated in a ibnetdiscover cache. If a port was previously +indicated as down in the specified cache, and is still down, do not output it in the +resulting output. This option may be particularly useful for environments +where switches are not fully populated, thus much of the default iblinkinfo +info is considered useless. See **ibnetdiscover** for information on caching +ibnetdiscover output. + + +Port Selection flags +-------------------- + +.. include:: common/opt_C.rst +.. include:: common/opt_P.rst +.. include:: common/sec_portselection.rst + +Configuration flags +------------------- + +.. include:: common/opt_z-config.rst +.. include:: common/opt_o-outstanding_smps.rst +.. include:: common/opt_node_name_map.rst +.. include:: common/opt_t.rst +.. include:: common/opt_y.rst + +Debugging flags +--------------- + +.. include:: common/opt_e.rst +.. include:: common/opt_h.rst +.. include:: common/opt_v.rst +.. include:: common/opt_V.rst + +EXIT STATUS +=========== + +0 on success, -1 on failure to scan the fabric, 1 if check mode is used and +inconsistencies are found. + +FILES +===== + +.. include:: common/sec_config-file.rst + +.. include:: common/sec_node-name-map.rst + + +AUTHOR +====== + +Ira Weiny + < ira.weiny@intel.com > diff --git a/infiniband-diags/man/ibnetdiscover.8.in.rst b/infiniband-diags/man/ibnetdiscover.8.in.rst new file mode 100644 index 0000000..28bf9e5 --- /dev/null +++ b/infiniband-diags/man/ibnetdiscover.8.in.rst @@ -0,0 +1,118 @@ +============= +IBNETDISCOVER +============= + +---------------------------- +discover InfiniBand topology +---------------------------- + +:Date: 2013-06-22 +:Manual section: 8 +:Manual group: Open IB Diagnostics + + +SYNOPSIS +======== + +ibnetdiscover [options] [<topology-file>] + + +DESCRIPTION +=========== + +ibnetdiscover performs IB subnet discovery and outputs a human readable +topology file. GUIDs, node types, and port numbers are displayed +as well as port LIDs and NodeDescriptions. All nodes (and links) are displayed +(full topology). Optionally, this utility can be used to list the current +connected nodes by nodetype. The output is printed to standard output +unless a topology file is specified. + +OPTIONS +======= + +**-l, --list** +List of connected nodes + +**-g, --grouping** +Show grouping. Grouping correlates IB nodes by different vendor specific +schemes. It may also show the switch external ports correspondence. 
+ +**-H, --Hca_list** +List of connected CAs + +**-S, --Switch_list** +List of connected switches + +**-R, --Router_list** +List of connected routers + +**-s, --show** +Show progress information during discovery. + +**-f, --full** +Show full information (ports' speed and width, vlcap) + +**-p, --ports** +Obtain a ports report which is a +list of connected ports with relevant information (like LID, portnum, +GUID, width, speed, and NodeDescription). + +**-m, --max_hops** +Report max hops discovered. + +.. include:: common/opt_o-outstanding_smps.rst + + +Cache File flags +---------------- + +.. include:: common/opt_cache.rst +.. include:: common/opt_load-cache.rst +.. include:: common/opt_diff.rst +.. include:: common/opt_diffcheck.rst + + +Port Selection flags +-------------------- + +.. include:: common/opt_C.rst +.. include:: common/opt_P.rst +.. include:: common/sec_portselection.rst + +Configuration flags +------------------- + +.. include:: common/opt_z-config.rst +.. include:: common/opt_o-outstanding_smps.rst +.. include:: common/opt_node_name_map.rst +.. include:: common/opt_t.rst +.. include:: common/opt_y.rst + +Debugging flags +--------------- + +.. include:: common/opt_d.rst +.. include:: common/opt_e.rst +.. include:: common/opt_h.rst +.. include:: common/opt_v.rst +.. include:: common/opt_V.rst + + + +FILES +===== + +.. include:: common/sec_config-file.rst +.. include:: common/sec_node-name-map.rst +.. include:: common/sec_topology-file.rst + + + +AUTHORS +======= + +Hal Rosenstock + < halr@voltaire.com > + +Ira Weiny + < ira.weiny@intel.com > diff --git a/infiniband-diags/man/ibnodes.8.in.rst b/infiniband-diags/man/ibnodes.8.in.rst new file mode 100644 index 0000000..f6a32d3 --- /dev/null +++ b/infiniband-diags/man/ibnodes.8.in.rst @@ -0,0 +1,58 @@ +======= +IBNODES +======= + +--------------------------------- +show InfiniBand nodes in topology +--------------------------------- + +:Date: 2012-05-14 +:Manual section: 8 +:Manual group: OpenIB Diagnostics + + +SYNOPSIS +======== + +ibnodes [options] [<topology-file>] + +DESCRIPTION +=========== + +ibnodes is a script which either walks the IB subnet topology or uses an +already saved topology file and extracts the IB nodes (CAs and switches). + + +OPTIONS +======= + +.. include:: common/opt_C.rst +.. include:: common/opt_P.rst +.. include:: common/opt_t.rst +.. include:: common/opt_h.rst +.. include:: common/opt_z-config.rst + +.. include:: common/sec_portselection.rst + +FILES +===== + +.. include:: common/sec_config-file.rst +.. include:: common/sec_node-name-map.rst + + +SEE ALSO +======== + +ibnetdiscover(8) + +DEPENDENCIES +============ + +ibnetdiscover, ibnetdiscover format + +AUTHOR +====== + +Hal Rosenstock + < halr@voltaire.com > diff --git a/infiniband-diags/man/ibping.8.in.rst b/infiniband-diags/man/ibping.8.in.rst new file mode 100644 index 0000000..e4d11db --- /dev/null +++ b/infiniband-diags/man/ibping.8.in.rst @@ -0,0 +1,86 @@ +====== +IBPING +====== + +-------------------------- +ping an InfiniBand address +-------------------------- + +:Date: 2012-05-14 +:Manual section: 8 +:Manual group: Open IB Diagnostics + + +SYNOPSIS +======== + +ibping [options] <dest lid | guid> + +DESCRIPTION +=========== + +ibping uses vendor mads to validate connectivity between IB nodes. +On exit, (IP) ping-like output is shown. ibping is run as client/server. +Default is to run as client. Note also that a default ping server is +implemented within the kernel.
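+A typical session (a sketch; lid 2 stands in for the server's lid) runs the
+server on one node and pings it by lid from another:
+
+::
+
+    ibping -S      # on the server node: answer pings until killed
+    ibping -c 3 2  # on the client: send 3 pings to the server at lid 2
+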
+ + +OPTIONS +======= + +**-c, --count** +stop after count packets + +**-f, --flood** +flood destination: send packets back to back without delay + +**-o, --oui** +use specified OUI number to multiplex vendor mads + +**-S, --Server** +start in server mode (do not return) + + +Addressing Flags +---------------- + +.. include:: common/opt_L.rst +.. include:: common/opt_G.rst +.. include:: common/opt_s.rst + + +Port Selection flags +-------------------- + +.. include:: common/opt_C.rst +.. include:: common/opt_P.rst +.. include:: common/sec_portselection.rst + + +Configuration flags +------------------- + +.. include:: common/opt_z-config.rst +.. include:: common/opt_t.rst + + +Debugging flags +--------------- + +.. include:: common/opt_h.rst +.. include:: common/opt_d.rst +.. include:: common/opt_e.rst +.. include:: common/opt_v.rst +.. include:: common/opt_V.rst + +FILES +===== + +.. include:: common/sec_config-file.rst + + +AUTHOR +====== + +Hal Rosenstock + < halr@voltaire.com > diff --git a/infiniband-diags/man/ibportstate.8.in.rst b/infiniband-diags/man/ibportstate.8.in.rst new file mode 100644 index 0000000..da1de4c --- /dev/null +++ b/infiniband-diags/man/ibportstate.8.in.rst @@ -0,0 +1,129 @@ +=========== +IBPORTSTATE +=========== + +----------------------------------------------------------------- +handle port (physical) state and link speed of an InfiniBand port +----------------------------------------------------------------- + +:Date: 2013-03-26 +:Manual section: 8 +:Manual group: Open IB Diagnostics + + +SYNOPSIS +======== + +ibportstate [options] <dest dr_path|lid|guid> <portnum> [<op>] + +DESCRIPTION +=========== + +ibportstate allows the port state and port physical state of an IB port +to be queried (in addition to link width and speed being validated +relative to the peer port when the port queried is a switch port), +or a switch port to be disabled, enabled, or reset. It +also allows the link speed/width enabled on any IB port to be adjusted. + +OPTIONS +======= + +**<op>** + Supported ops: enable, disable, reset, speed, espeed, fdr10, width, query, + on, off, down, arm, active, vls, mtu, lid, smlid, lmc, + mkey, mkeylease, mkeyprot + (Default is query) + + **enable, disable, and reset** are only allowed on switch ports (An + error is indicated if attempted on CA or router ports) + + **off** change the port state to disable. + + **on** change the port state to enable(only when the current state is disable). + + **speed and width** are allowed on any port + + **speed** values are the legal values for PortInfo:LinkSpeedEnabled (An + error is indicated if PortInfo:LinkSpeedSupported does not support this + setting) + + **espeed** is allowed on any port supporting extended link speeds + + **fdr10** is allowed on any port supporting fdr10 (An error is + indicated if port's capability mask indicates extended link speeds are + not supported or if PortInfo:LinkSpeedExtSupported does not support + this setting) + + **width** values are legal values for PortInfo:LinkWidthEnabled (An + error is indicated if PortInfo:LinkWidthSupported does not support this + setting) (NOTE: Speed and width changes are not effected until the port + goes through link renegotiation) + + **query** also validates port characteristics (link width, speed, + espeed, and fdr10) based on the peer port. This checking is done when + the port queried is a switch port as it relies on combined routing (an + initial LID route with directed routing to the peer) which can only be + done on a switch. 
This peer port validation feature of query op + requires LID routing to be functioning in the subnet. + + **mkey, mkeylease, and mkeyprot** are only allowed on CAs, routers, or + switch port 0 (An error is generated if attempted on external switch + ports). Hexadecimal and octal mkeys may be specified by prepending the + key with '0x' or '0', respectively. If a non-numeric value (like 'x') + is specified for the mkey, then ibportstate will prompt for a value. + + +Addressing Flags +---------------- + +.. include:: common/opt_L.rst +.. include:: common/opt_G.rst +.. include:: common/opt_D.rst +.. include:: common/opt_s.rst + +Port Selection flags +-------------------- + +.. include:: common/opt_C.rst +.. include:: common/opt_P.rst +.. include:: common/sec_portselection.rst + +Configuration flags +------------------- + +.. include:: common/opt_z-config.rst +.. include:: common/opt_t.rst +.. include:: common/opt_y.rst + +Debugging flags +--------------- + +.. include:: common/opt_h.rst +.. include:: common/opt_d.rst +.. include:: common/opt_e.rst +.. include:: common/opt_K.rst +.. include:: common/opt_v.rst +.. include:: common/opt_V.rst + +FILES +===== + +.. include:: common/sec_config-file.rst + +EXAMPLES +======== + +:: + + ibportstate 3 1 disable # by lid + ibportstate -G 0x2C9000100D051 1 enable # by guid + ibportstate -D 0 1 # (query) by direct route + ibportstate 3 1 reset # by lid + ibportstate 3 1 speed 1 # by lid + ibportstate 3 1 width 1 # by lid + ibportstate -D 0 1 lid 0x1234 arm # by direct route + +AUTHOR +====== + +Hal Rosenstock + < hal.rosenstock@gmail.com > diff --git a/infiniband-diags/man/ibprintca.8 b/infiniband-diags/man/ibprintca.8 new file mode 100644 index 0000000..ae304f7 --- /dev/null +++ b/infiniband-diags/man/ibprintca.8 @@ -0,0 +1,44 @@ +.TH IBPRINTCA 8 "May 31, 2007" "OpenIB" "OpenIB Diagnostics" + +.SH NAME +ibprintca.pl \- print either the ca specified or the list of cas from the ibnetdiscover output + +.SH SYNOPSIS +.B ibprintca.pl +[-R -l -C <ca_name> -P <ca_port>] [<ca_guid|node_name>] + +.SH DESCRIPTION +.PP +Faster than grepping through or viewing the output of ibnetdiscover in an editor, +ibprintca.pl will parse out and print either the CA information for the +specified CA or a list of all the CAs in the subnet. + +Finally, ibprintca.pl will also reuse the cached ibnetdiscover output from +some of the other diag tools which makes it a bit faster than running +ibnetdiscover from scratch. + + +.SH OPTIONS + +.PP +.TP +\fB\-l\fR +List the CAs (simply a wrapper for ibhosts). +.TP +\fB\-R\fR +Recalculate the ibnetdiscover information, ie do not use the cached +information. This option is slower but should be used if the diag tools have +not been used for some time or if there are other reasons to believe that +the fabric has changed. +.TP +\fB\-C <ca_name>\fR use the specified ca_name for the search. +.TP +\fB\-P <ca_port>\fR use the specified ca_port for the search.
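+.SH EXAMPLE
+.PP
+Illustrative invocations (the node name "somenode" is hypothetical):
+.PP
+ibprintca.pl -l       # list all CAs (a wrapper for ibhosts)
+.PP
+ibprintca.pl somenode # print the information for the CA named "somenode"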
+
+.SH AUTHORS
+.TP
+Ira Weiny
+.RI < weiny2@llnl.gov >
+.TP
+Hal Rosenstock
+.RI < halr@voltaire.com >
diff --git a/infiniband-diags/man/ibprintrt.8 b/infiniband-diags/man/ibprintrt.8
new file mode 100644
index 0000000..1151c70
--- /dev/null
+++ b/infiniband-diags/man/ibprintrt.8
@@ -0,0 +1,43 @@
+.TH IBPRINTRT 8 "May 31, 2007" "OpenIB" "OpenIB Diagnostics"
+
+.SH NAME
+ibprintrt.pl \- print either only the router specified or a list of routers
+from the ibnetdiscover output
+
+.SH SYNOPSIS
+.B ibprintrt.pl
+[-R -l -C <ca_name> -P <ca_port>] [<rt_guid|node_name>]
+
+.SH DESCRIPTION
+.PP
+Faster than grepping through or viewing the output of ibnetdiscover in an
+editor, ibprintrt.pl will parse out and print either the router information
+for the specified IB router or a list of all IB routers in the subnet.
+
+In addition, ibprintrt.pl will reuse the cached ibnetdiscover output from
+some of the other diag tools, which makes it a bit faster than running
+ibnetdiscover from scratch.
+
+
+.SH OPTIONS
+
+.PP
+.TP
+\fB\-l\fR
+List the routers (simply a wrapper for ibrouters).
+.TP
+\fB\-R\fR
+Recalculate the ibnetdiscover information, i.e. do not use the cached
+information. This option is slower but should be used if the diag tools have
+not been used for some time or if there are other reasons to believe that
+the fabric has changed.
+.TP
+\fB\-C <ca_name>\fR use the specified ca_name for the search.
+.TP
+\fB\-P <ca_port>\fR use the specified ca_port for the search.
+
+
+.SH AUTHOR
+.TP
+Hal Rosenstock
+.RI < halr@voltaire.com >
diff --git a/infiniband-diags/man/ibprintswitch.8 b/infiniband-diags/man/ibprintswitch.8
new file mode 100644
index 0000000..9538e7f
--- /dev/null
+++ b/infiniband-diags/man/ibprintswitch.8
@@ -0,0 +1,48 @@
+.TH IBPRINTSWITCH 8 "May 31, 2007" "OpenIB" "OpenIB Diagnostics"
+
+.SH NAME
+ibprintswitch.pl \- print either the switch specified or a list of switches
+from the ibnetdiscover output
+
+.SH SYNOPSIS
+.B ibprintswitch.pl
+[-R -l -C <ca_name> -P <ca_port>] [<switch_guid|switch_name>]
+
+.SH DESCRIPTION
+.PP
+Faster than grepping through or viewing the output of ibnetdiscover in an
+editor, ibprintswitch.pl will parse out and print either the switch information
+for the switch specified or a list of all the switches found in the subnet.
+In addition, it will crudely parse the node description
+information and, if found, report all the information for an entire chassis
+if the description information is consistent.
+
+Finally, ibprintswitch.pl will also reuse the cached ibnetdiscover output
+from some of the other diag tools, which makes it a bit faster than running
+ibnetdiscover from scratch.
+
+.SH OPTIONS
+
+.PP
+.TP
+\fB\-l\fR
+List the switches (simply a wrapper for ibswitches).
+.TP
+\fB\-R\fR
+Recalculate the ibnetdiscover information, i.e. do not use the cached
+information. This option is slower but should be used if the diag tools have
+not been used for some time or if there are other reasons to believe that
+the fabric has changed.
+.TP
+\fB\-C <ca_name>\fR use the specified ca_name for the search.
+.TP
+\fB\-P <ca_port>\fR use the specified ca_port for the search. 
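+
+.SH EXAMPLES
+.PP
+A few illustrative invocations of the options described above; the GUID and
+switch name shown are placeholders, so substitute values from your own fabric:
+.PP
+.nf
+ibprintswitch.pl -l                    # list all switches in the subnet
+ibprintswitch.pl 0x0008f10400411a08    # print the switch with this GUID
+ibprintswitch.pl -R sw-leaf01          # recalculate the cache, search by name
+.fi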
+
+
+.SH AUTHORS
+.TP
+Ira Weiny
+.RI < weiny2@llnl.gov >
+.TP
+Hal Rosenstock
+.RI < halr@voltaire.com >
diff --git a/infiniband-diags/man/ibqueryerrors.8.in.rst b/infiniband-diags/man/ibqueryerrors.8.in.rst
new file mode 100644
index 0000000..6ddfc2c
--- /dev/null
+++ b/infiniband-diags/man/ibqueryerrors.8.in.rst
@@ -0,0 +1,159 @@
+=============
+IBQUERYERRORS
+=============
+
+---------------------------------
+query and report IB port counters
+---------------------------------
+
+:Date: 2016-09-26
+:Manual section: 8
+:Manual group: OpenIB Diagnostics
+
+
+SYNOPSIS
+========
+
+ibqueryerrors [options]
+
+DESCRIPTION
+===========
+
+The default behavior is to report the port error counters which exceed a
+threshold for each port in the fabric. The default threshold is zero (0).
+Error fields can also be suppressed entirely.
+
+In addition to reporting errors on every port, ibqueryerrors can report the
+port transmit and receive data as well as report full link information to the
+remote port if available.
+
+OPTIONS
+=======
+
+**-s, --suppress <err1,err2,...>**
+Suppress the errors listed in the comma separated list provided.
+
+**-c, --suppress-common**
+Suppress some of the common "side effect" counters. These counters usually do
+not indicate an error condition and can usually be safely ignored.
+
+**-r, --report-port**
+Report the port information. This includes LID, port, external port (if
+applicable), link speed setting, remote GUID, remote port, remote external port
+(if applicable), and remote node description information.
+
+**--data**
+Include the optional transmit and receive data counters.
+
+**--threshold-file <filename>**
+Specify an alternate threshold file. The default is @IBDIAG_CONFIG_PATH@/error_thresholds
+
+**--switch** print data for switches only
+
+**--ca** print data for CAs only
+
+**--skip-sl** Use the default sl for queries. This is not recommended when
+using a QoS aware routing engine as it can cause a credit deadlock.
+
+**--router** print data for routers only
+
+**--clear-errors -k** Clear error counters after read.
+
+**--clear-counts -K** Clear data counters after read.
+
+**CAUTION** clearing of data or error counters occurs regardless of whether
+they are printed. See **--counters** and **--data** for details on
+controlling which counters are printed.
+
+**--details** include receive error and transmit discard details
+
+**--counters** print data counters only
+
+
+Partial Scan flags
+------------------
+
+The node to start a partial scan can be specified with the following addresses.
+
+.. include:: common/opt_G_with_param.rst
+.. include:: common/opt_D_with_param.rst
+
+**Note:** For switches, results are printed for all ports, not just switch port 0.
+
+**-S <port_guid>** same as "-G". (provided only for backward compatibility)
+
+
+Cache File flags
+----------------
+
+.. include:: common/opt_load-cache.rst
+
+
+
+
+Port Selection flags
+--------------------
+
+.. include:: common/opt_C.rst
+.. include:: common/opt_P.rst
+.. include:: common/sec_portselection.rst
+
+Configuration flags
+-------------------
+
+.. include:: common/opt_z-config.rst
+.. include:: common/opt_o-outstanding_smps.rst
+.. include:: common/opt_node_name_map.rst
+.. include:: common/opt_t.rst
+.. include:: common/opt_y.rst
+
+Debugging flags
+---------------
+
+.. include:: common/opt_d.rst
+.. include:: common/opt_e.rst
+.. include:: common/opt_h.rst
+.. include:: common/opt_v.rst
+.. 
include:: common/opt_V.rst
+
+**-R** (This option is obsolete and does nothing)
+
+EXIT STATUS
+===========
+
+**-1** if the scan fails.
+
+**0** if the scan succeeds without errors beyond thresholds.
+
+**1** if errors are found beyond thresholds or inconsistencies are found in check mode.
+
+FILES
+=====
+
+ERROR THRESHOLD
+---------------
+
+@IBDIAG_CONFIG_PATH@/error_thresholds
+
+Define threshold values for errors. The file format is simple "name=val".
+Comments begin with '#'.
+
+**Example:**
+
+::
+
+    # Define thresholds for error counters
+    SymbolErrorCounter=10
+    LinkErrorRecoveryCounter=10
+    VL15Dropped=100
+
+
+.. include:: common/sec_config-file.rst
+
+.. include:: common/sec_node-name-map.rst
+
+AUTHOR
+======
+
+Ira Weiny
+ < ira.weiny@intel.com >
diff --git a/infiniband-diags/man/ibroute.8.in.rst b/infiniband-diags/man/ibroute.8.in.rst
new file mode 100644
index 0000000..25b1076
--- /dev/null
+++ b/infiniband-diags/man/ibroute.8.in.rst
@@ -0,0 +1,110 @@
+=======
+ibroute
+=======
+
+-----------------------------------------
+query InfiniBand switch forwarding tables
+-----------------------------------------
+
+:Date: 2017-08-21
+:Manual section: 8
+:Manual group: Open IB Diagnostics
+
+SYNOPSIS
+========
+
+ibroute [options] [<dest dr_path|lid|guid> [<startlid> [<endlid>]]]
+
+DESCRIPTION
+===========
+
+ibroute uses SMPs to display the forwarding tables (unicast
+(LinearForwardingTable or LFT) or multicast (MulticastForwardingTable or MFT))
+for the specified switch LID and the optional lid (mlid) range.
+The default range is all valid entries in the range 1...FDBTop.
+
+OPTIONS
+=======
+
+**-a, --all**
+    show all lids in range, even invalid entries
+
+**-n, --no_dests**
+    do not try to resolve destinations
+
+**-M, --Multicast**
+    show multicast forwarding tables
+    In this case, the range parameters specify the mlid range.
+
+
+Addressing Flags
+----------------
+
+.. include:: common/opt_D.rst
+.. include:: common/opt_G.rst
+.. include:: common/opt_L.rst
+.. include:: common/opt_s.rst
+
+Port Selection flags
+--------------------
+
+.. include:: common/opt_C.rst
+.. include:: common/opt_P.rst
+.. include:: common/sec_portselection.rst
+
+Debugging flags
+---------------
+
+.. include:: common/opt_d.rst
+.. include:: common/opt_e.rst
+.. include:: common/opt_h.rst
+.. include:: common/opt_v.rst
+.. include:: common/opt_V.rst
+
+
+Configuration flags
+-------------------
+
+.. include:: common/opt_t.rst
+.. include:: common/opt_y.rst
+.. include:: common/opt_node_name_map.rst
+.. include:: common/opt_z-config.rst
+
+FILES
+=====
+
+.. include:: common/sec_config-file.rst
+
+.. 
include:: common/sec_node-name-map.rst
+
+EXAMPLES
+========
+
+Unicast examples
+
+::
+
+    ibroute 4                   # dump all lids with valid out ports of switch with lid 4
+    ibroute -a 4                # same, but dump all lids, even with invalid out ports
+    ibroute -n 4                # simple dump format - no destination resolution
+    ibroute 4 10                # dump lids starting from 10 (up to FDBTop)
+    ibroute 4 0x10 0x20         # dump lid range
+    ibroute -G 0x08f1040023     # resolve switch by GUID
+    ibroute -D 0,1              # resolve switch by direct path
+
+Multicast examples
+
+::
+
+    ibroute -M 4                # dump all non empty mlids of switch with lid 4
+    ibroute -M 4 0xc010 0xc020  # same, but with range
+    ibroute -M -n 4             # simple dump format
+
+SEE ALSO
+========
+
+ibtracert (8)
+
+AUTHOR
+======
+
+Hal Rosenstock
+ < halr@voltaire.com >
diff --git a/infiniband-diags/man/ibrouters.8.in.rst b/infiniband-diags/man/ibrouters.8.in.rst
new file mode 100644
index 0000000..5dc93f5
--- /dev/null
+++ b/infiniband-diags/man/ibrouters.8.in.rst
@@ -0,0 +1,58 @@
+=========
+IBROUTERS
+=========
+
+----------------------------------------
+show InfiniBand router nodes in topology
+----------------------------------------
+
+:Date: 2016-12-20
+:Manual section: 8
+:Manual group: OpenIB Diagnostics
+
+
+SYNOPSIS
+========
+
+ibrouters [options] [<topology-file>]
+
+DESCRIPTION
+===========
+
+ibrouters is a script which either walks the IB subnet topology or uses an
+already saved topology file and extracts the router nodes.
+
+OPTIONS
+=======
+
+.. include:: common/opt_C.rst
+.. include:: common/opt_P.rst
+.. include:: common/opt_t.rst
+.. include:: common/opt_y.rst
+.. include:: common/opt_h.rst
+.. include:: common/opt_z-config.rst
+
+.. include:: common/sec_portselection.rst
+
+FILES
+=====
+
+.. include:: common/sec_config-file.rst
+.. include:: common/sec_node-name-map.rst
+
+SEE ALSO
+========
+
+ibnetdiscover(8)
+
+DEPENDENCIES
+============
+
+ibnetdiscover, ibnetdiscover format
+
+
+AUTHOR
+======
+
+Hal Rosenstock
+ < halr@voltaire.com >
diff --git a/infiniband-diags/man/ibstat.8.in.rst b/infiniband-diags/man/ibstat.8.in.rst
new file mode 100644
index 0000000..a332de1
--- /dev/null
+++ b/infiniband-diags/man/ibstat.8.in.rst
@@ -0,0 +1,82 @@
+======
+ibstat
+======
+
+------------------------------------------
+query basic status of InfiniBand device(s)
+------------------------------------------
+
+:Date: 2017-08-21
+:Manual section: 8
+:Manual group: Open IB Diagnostics
+
+SYNOPSIS
+========
+
+ibstat [options] <ca_name> [portnum]
+
+DESCRIPTION
+===========
+
+ibstat is a binary which displays basic information obtained from the local
+IB driver. Output includes LID, SMLID, port state, link width active, and port
+physical state.
+
+It is similar to the ibstatus utility but implemented as a binary rather
+than a script. It has options to list CAs and/or ports and displays more
+information than ibstatus.
+
+OPTIONS
+=======
+
+**-l, --list_of_cas**
+    list all IB devices
+
+**-s, --short**
+    short output
+
+**-p, --port_list**
+    show port list
+
+**ca_name**
+    InfiniBand device name
+
+**portnum**
+    port number of InfiniBand device
+
+
+
+Debugging flags
+---------------
+
+.. include:: common/opt_d.rst
+.. include:: common/opt_h.rst
+.. include:: common/opt_v.rst
+.. include:: common/opt_V.rst
+
+
+Configuration flags
+-------------------
+
+.. 
include:: common/opt_z-config.rst
+
+
+
+EXAMPLES
+========
+
+::
+
+    ibstat            # display status of all ports on all IB devices
+    ibstat -l         # list all IB devices
+    ibstat -p         # show port guids
+    ibstat mthca0 2   # show status of port 2 of 'mthca0'
+
+SEE ALSO
+========
+ibstatus (8)
+
+AUTHOR
+======
+
+Hal Rosenstock
+ < halr@voltaire.com >
diff --git a/infiniband-diags/man/ibstatus.8.in.rst b/infiniband-diags/man/ibstatus.8.in.rst
new file mode 100644
index 0000000..e9f25e5
--- /dev/null
+++ b/infiniband-diags/man/ibstatus.8.in.rst
@@ -0,0 +1,53 @@
+========
+ibstatus
+========
+
+------------------------------------------
+query basic status of InfiniBand device(s)
+------------------------------------------
+
+:Date: 2017-08-21
+:Manual section: 8
+:Manual group: Open IB Diagnostics
+
+SYNOPSIS
+========
+
+ibstatus [\-h] [devname[:port]]...
+
+DESCRIPTION
+===========
+
+ibstatus is a script which displays basic information obtained from the local
+IB driver. Output includes LID, SMLID, port state, link width active, and port
+physical state.
+
+OPTIONS
+=======
+
+.. include:: common/opt_h.rst
+
+**devname**
+    InfiniBand device name
+
+**portnum**
+    port number of InfiniBand device
+
+EXAMPLES
+========
+
+::
+
+    ibstatus                    # display status of all IB ports
+    ibstatus mthca1             # status of mthca1 ports
+    ibstatus mthca1:1 mthca0:2  # show status of specified ports
+
+SEE ALSO
+========
+
+**ibstat (8)**
+
+AUTHOR
+======
+
+Hal Rosenstock
+ < halr@voltaire.com >
diff --git a/infiniband-diags/man/ibswitches.8.in.rst b/infiniband-diags/man/ibswitches.8.in.rst
new file mode 100644
index 0000000..15303b8
--- /dev/null
+++ b/infiniband-diags/man/ibswitches.8.in.rst
@@ -0,0 +1,56 @@
+==========
+IBSWITCHES
+==========
+
+----------------------------------------
+show InfiniBand switch nodes in topology
+----------------------------------------
+
+:Date: 2016-12-20
+:Manual section: 8
+:Manual group: OpenIB Diagnostics
+
+
+SYNOPSIS
+========
+
+ibswitches [options] [<topology-file>]
+
+DESCRIPTION
+===========
+
+ibswitches is a script which either walks the IB subnet topology or uses an
+already saved topology file and extracts the switch nodes.
+
+OPTIONS
+=======
+
+.. include:: common/opt_C.rst
+.. include:: common/opt_P.rst
+.. include:: common/opt_t.rst
+.. include:: common/opt_y.rst
+.. include:: common/opt_h.rst
+.. include:: common/opt_z-config.rst
+
+.. include:: common/sec_portselection.rst
+
+FILES
+=====
+
+.. include:: common/sec_config-file.rst
+.. include:: common/sec_node-name-map.rst
+
+SEE ALSO
+========
+ibnetdiscover(8)
+
+DEPENDENCIES
+============
+
+ibnetdiscover, ibnetdiscover format
+
+AUTHOR
+======
+
+Hal Rosenstock
+ < halr@voltaire.com >
diff --git a/infiniband-diags/man/ibswportwatch.8 b/infiniband-diags/man/ibswportwatch.8
new file mode 100644
index 0000000..7191d37
--- /dev/null
+++ b/infiniband-diags/man/ibswportwatch.8
@@ -0,0 +1,35 @@
+.TH IBSWPORTWATCH 8 "September 27, 2006" "OpenIB" "OpenIB Diagnostics"
+
+.SH NAME
+ibswportwatch.pl \- poll the counters on the specified switch/port and
+report rate of change information.
+
+.SH SYNOPSIS
+.B ibswportwatch.pl
+[-p <pause_time> -v -n <cycles> -G] <guid|lid> <port>
+
+.SH DESCRIPTION
+.PP
+ibswportwatch.pl polls the port counters of the specified port and
+calculates rate of change information.
+
+.SH OPTIONS
+
+.PP
+.TP
+\fB\-p <pause_time>\fR
+Specify a pause time (polling interval) other than the default.
+.TP
+\fB\-v\fR
+Be verbose.
+.TP
+\fB\-n <cycles>\fR
+Run for a set number of poll intervals and stop. 
(Default == -1 == forever)
+.TP
+\fB\-G\fR
+The address provided is a GUID rather than a LID.
+
+.SH AUTHOR
+.TP
+Ira Weiny
+.RI < weiny2@llnl.gov >
diff --git a/infiniband-diags/man/ibsysstat.8.in.rst b/infiniband-diags/man/ibsysstat.8.in.rst
new file mode 100644
index 0000000..9e8bc60
--- /dev/null
+++ b/infiniband-diags/man/ibsysstat.8.in.rst
@@ -0,0 +1,87 @@
+=========
+ibsysstat
+=========
+
+--------------------------------------
+system status on an InfiniBand address
+--------------------------------------
+
+:Date: 2017-08-21
+:Manual section: 8
+:Manual group: Open IB Diagnostics
+
+SYNOPSIS
+========
+
+ibsysstat [options] <dest lid | guid> [<op>]
+
+DESCRIPTION
+===========
+
+ibsysstat uses vendor mads to validate connectivity between IB nodes
+and obtain other information about the IB node. ibsysstat is run as
+client/server; the default is to run as a client.
+
+OPTIONS
+=======
+
+Currently supported operations:
+
+::
+
+    ping - verify connectivity to server (default)
+    host - obtain host information from server
+    cpu  - obtain cpu information from server
+
+**-o, --oui**
+    use specified OUI number to multiplex vendor mads
+
+**-S, --Server**
+    start in server mode (do not return)
+
+
+Addressing Flags
+----------------
+
+.. include:: common/opt_G.rst
+.. include:: common/opt_L.rst
+.. include:: common/opt_s.rst
+
+
+Port Selection flags
+--------------------
+
+.. include:: common/opt_C.rst
+.. include:: common/opt_P.rst
+.. include:: common/sec_portselection.rst
+
+
+Debugging flags
+---------------
+
+.. include:: common/opt_d.rst
+.. include:: common/opt_e.rst
+.. include:: common/opt_h.rst
+.. include:: common/opt_v.rst
+.. include:: common/opt_V.rst
+
+
+Configuration flags
+-------------------
+
+.. include:: common/opt_t.rst
+.. include:: common/opt_z-config.rst
+
+
+FILES
+=====
+
+.. include:: common/sec_config-file.rst
+
+
+
+AUTHOR
+======
+
+Hal Rosenstock
+ < halr@voltaire.com >
diff --git a/infiniband-diags/man/ibtracert.8.in.rst b/infiniband-diags/man/ibtracert.8.in.rst
new file mode 100644
index 0000000..9dea157
--- /dev/null
+++ b/infiniband-diags/man/ibtracert.8.in.rst
@@ -0,0 +1,112 @@
+=========
+ibtracert
+=========
+
+---------------------
+trace InfiniBand path
+---------------------
+
+:Date: 2018-04-02
+:Manual section: 8
+:Manual group: Open IB Diagnostics
+
+SYNOPSIS
+========
+
+ibtracert [options] [<lid|guid> [<startlid> [<endlid>]]]
+
+
+DESCRIPTION
+===========
+
+ibtracert uses SMPs to trace the path from a source GID/LID to a
+destination GID/LID. Each hop along the path is displayed until
+the destination is reached or a hop does not respond. By using
+the -m option, multicast path tracing can be performed between source
+and destination nodes.
+
+OPTIONS
+=======
+
+**-n, --no_info**
+    simple format; don't show additional information
+
+**-m**
+    show the multicast trace of the specified mlid
+
+**-f, --force**
+    force route to destination port
+
+
+Addressing Flags
+----------------
+
+.. include:: common/opt_G.rst
+.. include:: common/opt_L.rst
+.. include:: common/opt_s.rst
+.. include:: common/opt_ports-file.rst
+
+
+Port Selection flags
+--------------------
+
+.. include:: common/opt_C.rst
+.. include:: common/opt_P.rst
+.. include:: common/sec_portselection.rst
+
+
+Debugging flags
+---------------
+
+.. include:: common/opt_d.rst
+.. include:: common/opt_e.rst
+.. include:: common/opt_h.rst
+.. include:: common/opt_v.rst
+.. include:: common/opt_V.rst
+
+
+Configuration flags
+-------------------
+
+
+.. include:: common/opt_t.rst
+.. 
include:: common/opt_node_name_map.rst
+.. include:: common/opt_y.rst
+.. include:: common/opt_z-config.rst
+
+FILES
+=====
+
+.. include:: common/sec_config-file.rst
+.. include:: common/sec_node-name-map.rst
+.. include:: common/sec_ports-file.rst
+
+
+EXAMPLES
+========
+
+Unicast examples
+
+::
+
+    ibtracert 4 16                                  # show path between lids 4 and 16
+    ibtracert -n 4 16                               # same, but using simple output format
+    ibtracert -G 0x8f1040396522d 0x002c9000100d051  # use guid addresses
+
+Multicast example
+
+::
+
+    ibtracert -m 0xc000 4 16  # show multicast path of mlid 0xc000 between lids 4 and 16
+
+SEE ALSO
+========
+ibroute (8)
+
+
+AUTHOR
+======
+
+Hal Rosenstock
+ <hal.rosenstock@gmail.com>
+
+Ira Weiny
+ < ira.weiny@intel.com >
diff --git a/infiniband-diags/man/infiniband-diags.8.in.rst b/infiniband-diags/man/infiniband-diags.8.in.rst
new file mode 100644
index 0000000..b56f604
--- /dev/null
+++ b/infiniband-diags/man/infiniband-diags.8.in.rst
@@ -0,0 +1,165 @@
+================
+infiniband-diags
+================
+
+----------------------------------
+Diagnostics for InfiniBand Fabrics
+----------------------------------
+
+:Date: 2017-08-21
+:Manual section: 8
+:Manual group: Open IB Diagnostics
+
+DESCRIPTION
+===========
+
+infiniband-diags is a set of utilities designed to help configure, debug, and
+maintain InfiniBand fabrics. Many tools and utilities are provided, some with
+similar functionality.
+
+The base utilities use directed route MADs to perform their operations. They
+may therefore work even in unconfigured subnets. Other, higher level
+utilities require LID routed MADs and, to some extent, SA/SM access.
+
+
+THE USE OF SMPs (QP0)
+=====================
+
+Many of the tools in this package rely on the use of SMPs via QP0 to acquire
+data directly from the SMA. While this mode of operation is not technically in
+compliance with the InfiniBand specification, practical experience has found
+that this level of diagnostics is valuable when working with a fabric which is
+broken or only partially configured. For this reason, many of these tools may
+require the use of an MKey, or their operation from Virtual Machines may be
+restricted, for security reasons.
+
+
+COMMON OPTIONS
+==============
+
+Most OpenIB diagnostics take some of the following common flags. The exact list
+of supported flags per utility can be found in the documentation for those
+commands.
+
+
+Addressing Flags
+----------------
+
+The -D and -G options have two forms:
+
+.. include:: common/opt_D.rst
+.. include:: common/opt_D_with_param.rst
+.. include:: common/opt_G.rst
+.. include:: common/opt_G_with_param.rst
+
+.. include:: common/opt_L.rst
+.. include:: common/opt_s.rst
+
+
+Port Selection flags
+--------------------
+
+.. include:: common/opt_C.rst
+.. include:: common/opt_P.rst
+.. include:: common/sec_portselection.rst
+
+
+Debugging flags
+---------------
+
+.. include:: common/opt_d.rst
+.. include:: common/opt_e.rst
+.. include:: common/opt_h.rst
+.. include:: common/opt_v.rst
+.. include:: common/opt_V.rst
+
+
+Configuration flags
+-------------------
+
+.. include:: common/opt_t.rst
+.. include:: common/opt_o-outstanding_smps.rst
+.. include:: common/opt_node_name_map.rst
+.. include:: common/opt_z-config.rst
+
+
+
+COMMON FILES
+============
+
+The following config files are common amongst many of the utilities.
+
+.. include:: common/sec_config-file.rst
+
+.. include:: common/sec_node-name-map.rst
+.. 
include:: common/sec_topology-file.rst
+
+
+
+Utilities list
+==============
+
+Basic fabric connectivity
+-------------------------
+
+    See: ibnetdiscover, iblinkinfo
+
+Node information
+----------------
+
+    See: ibnodes, ibswitches, ibhosts, ibrouters
+
+Port information
+----------------
+
+    See: ibportstate, ibaddr
+
+Switch Forwarding Table info
+----------------------------
+
+    See: ibtracert, ibroute, dump_lfts, dump_mfts, check_lft_balance, ibfindnodesusing
+
+Performance counters
+--------------------
+
+    See: ibqueryerrors, perfquery
+
+Local HCA info
+--------------
+
+    See: ibstat, ibstatus
+
+Connectivity check
+------------------
+
+    See: ibping, ibsysstat
+
+Low level query tools
+---------------------
+
+    See: smpquery, smpdump, saquery, sminfo
+
+Fabric verification tools
+-------------------------
+
+    See: ibidsverify
+
+
+Backwards compatibility scripts
+===============================
+
+The following scripts have been identified as redundant and/or lower performing
+compared to the scripts above. They are provided as legacy scripts when
+--enable-compat-utils is specified at build time.
+
+ibcheckerrors, ibclearcounters, ibclearerrors, ibdatacounters
+ibchecknet, ibchecknode, ibcheckport, ibcheckportstate,
+ibcheckportwidth, ibcheckstate, ibcheckwidth, ibswportwatch,
+ibprintca, ibprintrt, ibprintswitch, set_nodedesc.sh
+
+
+AUTHORS
+=======
+
+Ira Weiny
+ < ira.weiny@intel.com >
diff --git a/infiniband-diags/man/perfquery.8.in.rst b/infiniband-diags/man/perfquery.8.in.rst
new file mode 100644
index 0000000..b511a7b
--- /dev/null
+++ b/infiniband-diags/man/perfquery.8.in.rst
@@ -0,0 +1,185 @@
+=========
+perfquery
+=========
+
+-----------------------------------------------
+query InfiniBand port counters on a single port
+-----------------------------------------------
+
+:Date: 2017-08-21
+:Manual section: 8
+:Manual group: Open IB Diagnostics
+
+SYNOPSIS
+========
+
+perfquery [options] [<lid|guid> [[port(s)] [reset_mask]]]
+
+DESCRIPTION
+===========
+
+perfquery uses PerfMgt GMPs to obtain the PortCounters (basic performance and
+error counters), PortExtendedCounters, PortXmitDataSL, PortRcvDataSL,
+PortRcvErrorDetails, PortXmitDiscardDetails, PortExtendedSpeedsCounters, or
+PortSamplesControl from the PMA at the node/port specified. Optionally shows
+aggregated counters for all ports of a node. Finally, it can reset counters
+after reading them, or just reset the counters.
+
+Note: In PortCounters, PortCountersExtended, PortXmitDataSL, and PortRcvDataSL,
+components that represent Data (e.g. PortXmitData and PortRcvData) indicate
+octets divided by 4 rather than just octets.
+
+Note: Specifying a port of 255 indicates that the operation is to be performed
+on all ports.
+
+Note: For PortCounters, ExtendedCounters, and resets, multiple ports can be
+specified by either a comma separated list or a port range. See examples below.
+
+
+OPTIONS
+=======
+
+**-x, --extended**
+    show extended port counters rather than (basic) port counters.
+    Note that the extended port counters attribute is optional.
+
+**-X, --xmtsl**
+    show transmit data SL counter. This is an optional counter for QoS.
+
+**-S, --rcvsl**
+    show receive data SL counter. This is an optional counter for QoS.
+
+**-D, --xmtdisc**
+    show transmit discard details. This is an optional counter.
+
+**-E, --rcverr**
+    show receive error details. This is an optional counter.
+
+**-T, --extended_speeds**
+    show extended speeds port counters. 
This is an optional counter.
+
+**--oprcvcounters**
+    show Rcv Counters per Op code. This is an optional counter.
+
+**--flowctlcounters**
+    show flow control counters. This is an optional counter.
+
+**--vloppackets**
+    show packets received per Op code per VL. This is an optional counter.
+
+**--vlopdata**
+    show data received per Op code per VL. This is an optional counter.
+
+**--vlxmitflowctlerrors**
+    show flow control update errors per VL. This is an optional counter.
+
+**--vlxmitcounters**
+    show ticks waiting to transmit counters per VL. This is an optional counter.
+
+**--swportvlcong**
+    show sw port VL congestion. This is an optional counter.
+
+**--rcvcc**
+    show Rcv congestion control counters. This is an optional counter.
+
+**--slrcvfecn**
+    show SL Rcv FECN counters. This is an optional counter.
+
+**--slrcvbecn**
+    show SL Rcv BECN counters. This is an optional counter.
+
+**--xmitcc**
+    show Xmit congestion control counters. This is an optional counter.
+
+**--vlxmittimecc**
+    show VL Xmit Time congestion control counters. This is an optional counter.
+
+**-c, --smplctl**
+    show port samples control.
+
+**-a, --all_ports**
+    show aggregated counters for all ports of the destination lid, reset
+    all counters for all ports, or, if multiple ports are specified, aggregate
+    the counters of the specified ports. If the destination lid does not support
+    the AllPortSelect flag, all ports will be iterated through to emulate
+    AllPortSelect behavior.
+
+**-l, --loop_ports**
+    If all ports are selected by the user (either through the **-a** option
+    or port 255) or multiple ports are specified, iterate through each port
+    rather than performing the aggregate operation.
+
+**-r, --reset_after_read**
+    reset counters after read
+
+**-R, --Reset_only**
+    only reset counters
+
+
+Addressing Flags
+----------------
+
+.. include:: common/opt_G.rst
+.. include:: common/opt_L.rst
+.. include:: common/opt_s.rst
+
+
+Port Selection flags
+--------------------
+
+.. include:: common/opt_C.rst
+.. include:: common/opt_P.rst
+.. include:: common/sec_portselection.rst
+
+
+Debugging flags
+---------------
+
+.. include:: common/opt_d.rst
+.. include:: common/opt_e.rst
+.. include:: common/opt_h.rst
+.. include:: common/opt_v.rst
+.. include:: common/opt_V.rst
+
+
+Configuration flags
+-------------------
+
+.. include:: common/opt_t.rst
+.. include:: common/opt_y.rst
+.. include:: common/opt_z-config.rst
+
+
+FILES
+=====
+
+.. 
include:: common/sec_config-file.rst
+
+EXAMPLES
+========
+
+::
+
+    perfquery                # read local port performance counters
+    perfquery 32 1           # read performance counters from lid 32, port 1
+    perfquery -x 32 1        # read extended performance counters from lid 32, port 1
+    perfquery -a 32          # read perf counters from lid 32, all ports
+    perfquery -r 32 1        # read performance counters and reset
+    perfquery -x -r 32 1     # read extended performance counters and reset
+    perfquery -R 0x20 1      # reset performance counters of port 1 only
+    perfquery -x -R 0x20 1   # reset extended performance counters of port 1 only
+    perfquery -R -a 32       # reset performance counters of all ports
+    perfquery -R 32 2 0x0fff # reset only error counters of port 2
+    perfquery -R 32 2 0xf000 # reset only non-error counters of port 2
+    perfquery -a 32 1-10     # read performance counters from lid 32, port 1-10, aggregate output
+    perfquery -l 32 1-10     # read performance counters from lid 32, port 1-10, output each port
+    perfquery -a 32 1,4,8    # read performance counters from lid 32, port 1, 4, and 8, aggregate output
+    perfquery -l 32 1,4,8    # read performance counters from lid 32, port 1, 4, and 8, output each port
+
+AUTHOR
+======
+
+Hal Rosenstock
+ < hal.rosenstock@gmail.com >
diff --git a/infiniband-diags/man/saquery.8.in.rst b/infiniband-diags/man/saquery.8.in.rst
new file mode 100644
index 0000000..93043e1
--- /dev/null
+++ b/infiniband-diags/man/saquery.8.in.rst
@@ -0,0 +1,211 @@
+=======
+saquery
+=======
+
+-------------------------------------------------
+query InfiniBand subnet administration attributes
+-------------------------------------------------
+
+:Date: 2017-08-21
+:Manual section: 8
+:Manual group: Open IB Diagnostics
+
+SYNOPSIS
+========
+
+saquery [options] [<name> | <lid> | <guid>]
+
+DESCRIPTION
+===========
+
+saquery issues the selected SA query. Node records are queried by default.
+
+OPTIONS
+=======
+
+**-p**
+    get PathRecord info
+
+**-N**
+    get NodeRecord info
+
+**-D, --list**
+    get NodeDescriptions of CAs only
+
+**-S**
+    get ServiceRecord info
+
+**-I**
+    get InformInfoRecord (subscription) info
+
+**-L**
+    return the Lids of the name specified
+
+**-l**
+    return the unique Lid of the name specified
+
+**-G**
+    return the Guids of the name specified
+
+**-O**
+    return the name for the Lid specified
+
+**-U**
+    return the name for the Guid specified
+
+**-c**
+    get the SA's class port info
+
+**-s**
+    return the PortInfoRecords with isSM or isSMdisabled capability mask bit on
+
+**-g**
+    get multicast group info
+
+**-m**
+    get multicast member info. If a group is specified, limit the output
+    to the group specified and print one line containing only the GUID and
+    node description for each entry. Example: saquery -m 0xc000
+
+**-x**
+    get LinkRecord info
+
+**--src-to-dst <src:dst>**
+    get a PathRecord for <src:dst>
+    where src and dst are either node names or LIDs
+
+**--sgid-to-dgid <sgid:dgid>**
+    get a PathRecord for **sgid** to **dgid**
+    where both GIDs are in an IPv6 format acceptable to **inet_pton (3)**
+
+**--smkey <val>**
+    use SM_Key value for the query. Will be used only with "trusted"
+    queries. If a non-numeric value (like 'x') is specified, then saquery
+    will prompt for a value.
+    Default (when not specified here or in
+    @IBDIAG_CONFIG_PATH@/ibdiag.conf) is to use SM_Key == 0 (or
+    \"untrusted\")
+
+.. 
include:: common/opt_K.rst
+
+**--slid <lid>** Source LID (PathRecord)
+
+**--dlid <lid>** Destination LID (PathRecord)
+
+**--mlid <lid>** Multicast LID (MCMemberRecord)
+
+**--sgid <gid>** Source GID (IPv6 format) (PathRecord)
+
+**--dgid <gid>** Destination GID (IPv6 format) (PathRecord)
+
+**--gid <gid>** Port GID (MCMemberRecord)
+
+**--mgid <gid>** Multicast GID (MCMemberRecord)
+
+**--reversible** Reversible path (PathRecord)
+
+**--numb_path** Number of paths (PathRecord)
+
+**--pkey** P_Key (PathRecord, MCMemberRecord). If a non-numeric value (like 'x')
+is specified, then saquery will prompt for a value
+
+**--qos_class** QoS Class (PathRecord)
+
+**--sl** Service level (PathRecord, MCMemberRecord)
+
+**--mtu** MTU and selector (PathRecord, MCMemberRecord)
+
+**--rate** Rate and selector (PathRecord, MCMemberRecord)
+
+**--pkt_lifetime** Packet lifetime and selector (PathRecord, MCMemberRecord)
+
+**--qkey** Q_Key (MCMemberRecord). If a non-numeric value (like 'x') is
+specified, then saquery will prompt for a value
+
+**--tclass** Traffic Class (PathRecord, MCMemberRecord)
+
+**--flow_label** Flow Label (PathRecord, MCMemberRecord)
+
+**--hop_limit** Hop limit (PathRecord, MCMemberRecord)
+
+**--scope** Scope (MCMemberRecord)
+
+**--join_state** Join state (MCMemberRecord)
+
+**--proxy_join** Proxy join (MCMemberRecord)
+
+**--service_id** ServiceID (PathRecord)
+
+Supported query names (and aliases):
+
+::
+
+    ClassPortInfo (CPI)
+    NodeRecord (NR) [lid]
+    PortInfoRecord (PIR) [[lid]/[port]/[options]]
+    SL2VLTableRecord (SL2VL) [[lid]/[in_port]/[out_port]]
+    PKeyTableRecord (PKTR) [[lid]/[port]/[block]]
+    VLArbitrationTableRecord (VLAR) [[lid]/[port]/[block]]
+    InformInfoRecord (IIR)
+    LinkRecord (LR) [[from_lid]/[from_port]] [[to_lid]/[to_port]]
+    ServiceRecord (SR)
+    PathRecord (PR)
+    MCMemberRecord (MCMR)
+    LFTRecord (LFTR) [[lid]/[block]]
+    MFTRecord (MFTR) [[mlid]/[position]/[block]]
+    GUIDInfoRecord (GIR) [[lid]/[block]]
+    SwitchInfoRecord (SWIR) [lid]
+    SMInfoRecord (SMIR) [lid]
+
+
+
+Port Selection flags
+--------------------
+
+.. include:: common/opt_C.rst
+.. include:: common/opt_P.rst
+.. include:: common/sec_portselection.rst
+
+
+Debugging flags
+---------------
+
+.. include:: common/opt_d.rst
+.. include:: common/opt_e.rst
+.. include:: common/opt_h.rst
+.. include:: common/opt_v.rst
+.. include:: common/opt_V.rst
+
+
+Configuration flags
+-------------------
+
+.. include:: common/opt_t.rst
+.. include:: common/opt_o-outstanding_smps.rst
+.. include:: common/opt_node_name_map.rst
+.. include:: common/opt_z-config.rst
+
+
+
+COMMON FILES
+============
+
+.. include:: common/sec_config-file.rst
+
+.. include:: common/sec_node-name-map.rst
+
+
+
+DEPENDENCIES
+============
+
+OpenSM (or other running SM/SA), libosmcomp, libibumad, libibmad
+
+AUTHORS
+=======
+
+Ira Weiny
+ < ira.weiny@intel.com >
+
+Hal Rosenstock
+ < halr@mellanox.com >
diff --git a/infiniband-diags/man/sminfo.8.in.rst b/infiniband-diags/man/sminfo.8.in.rst
new file mode 100644
index 0000000..6ac8e27
--- /dev/null
+++ b/infiniband-diags/man/sminfo.8.in.rst
@@ -0,0 +1,102 @@
+======
+sminfo
+======
+
+---------------------------------
+query InfiniBand SMInfo attribute
+---------------------------------
+
+:Date: 2017-08-21
+:Manual section: 8
+:Manual group: Open IB Diagnostics
+
+SYNOPSIS
+========
+
+sminfo [options] sm_lid | sm_dr_path [modifier]
+
+DESCRIPTION
+===========
+
+Optionally set and display the output of an sminfo query in human readable
+format. 
The target SM is the one listed in the local port info, or the SM
+specified by the optional SM lid or by the SM direct routed path.
+
+Note: using sminfo for any purpose other than a simple query may be very
+dangerous, and may result in a malfunction of the target SM.
+
+OPTIONS
+=======
+
+**-s, --state <state>** set SM state
+    0 not active
+
+    1 discovering
+
+    2 standby
+
+    3 master
+
+**-p, --priority <priority>** set priority (0-15)
+
+**-a, --activity <val>** set activity count
+
+Addressing Flags
+----------------
+
+.. include:: common/opt_D.rst
+.. include:: common/opt_G.rst
+.. include:: common/opt_L.rst
+
+
+Port Selection flags
+--------------------
+
+.. include:: common/opt_C.rst
+.. include:: common/opt_P.rst
+.. include:: common/sec_portselection.rst
+
+
+Debugging flags
+---------------
+
+.. include:: common/opt_d.rst
+.. include:: common/opt_e.rst
+.. include:: common/opt_h.rst
+.. include:: common/opt_v.rst
+.. include:: common/opt_V.rst
+
+
+Configuration flags
+-------------------
+
+.. include:: common/opt_t.rst
+.. include:: common/opt_y.rst
+.. include:: common/opt_z-config.rst
+
+
+FILES
+=====
+
+.. include:: common/sec_config-file.rst
+
+
+EXAMPLES
+========
+
+::
+
+    sminfo                 # local port's sminfo
+    sminfo 32              # show sminfo of lid 32
+    sminfo -G 0x8f1040023  # same but using guid address
+
+
+SEE ALSO
+========
+
+smpdump (8)
+
+AUTHOR
+======
+
+Hal Rosenstock
+ < halr@voltaire.com >
diff --git a/infiniband-diags/man/smpdump.8.in.rst b/infiniband-diags/man/smpdump.8.in.rst
new file mode 100644
index 0000000..5c7bcfc
--- /dev/null
+++ b/infiniband-diags/man/smpdump.8.in.rst
@@ -0,0 +1,103 @@
+=======
+smpdump
+=======
+
+--------------------------------------------
+dump InfiniBand subnet management attributes
+--------------------------------------------
+
+:Date: 2017-08-21
+:Manual section: 8
+:Manual group: Open IB Diagnostics
+
+SYNOPSIS
+========
+
+smpdump [options] <dlid|dr_path> <attribute> [attribute_modifier]
+
+DESCRIPTION
+===========
+
+smpdump is a general purpose SMP utility which gets SM attributes from a
+specified SMA. The result is dumped in hex by default.
+
+OPTIONS
+=======
+
+**dlid|dr_path**
+    LID or DR path to SMA
+
+**attribute**
+    IBA attribute ID for SM attribute
+
+**attribute_modifier**
+    IBA modifier for SM attribute
+
+**-s, --string**
+    Print strings in packet if possible
+
+
+Addressing Flags
+----------------
+
+.. include:: common/opt_D.rst
+.. include:: common/opt_L.rst
+
+
+Port Selection flags
+--------------------
+
+.. include:: common/opt_C.rst
+.. include:: common/opt_P.rst
+.. include:: common/sec_portselection.rst
+
+
+Debugging flags
+---------------
+
+.. include:: common/opt_d.rst
+.. include:: common/opt_e.rst
+.. include:: common/opt_h.rst
+.. include:: common/opt_v.rst
+.. include:: common/opt_V.rst
+
+
+Configuration flags
+-------------------
+
+.. include:: common/opt_t.rst
+.. include:: common/opt_z-config.rst
+
+
+FILES
+=====
+
+.. 
include:: common/sec_config-file.rst
+
+
+EXAMPLES
+========
+
+Direct Routed Examples
+
+::
+
+    smpdump -D 0,1,2,3,5 16  # NODE DESC
+    smpdump -D 0,1,2 0x15 2  # PORT INFO, port 2
+
+LID Routed Examples
+
+::
+
+    smpdump 3 0x15 2   # PORT INFO, lid 3 port 2
+    smpdump 0xa0 0x11  # NODE INFO, lid 0xa0
+
+SEE ALSO
+========
+
+smpquery (8)
+
+
+AUTHOR
+======
+
+Hal Rosenstock
+ < halr@voltaire.com >
diff --git a/infiniband-diags/man/smpquery.8.in.rst b/infiniband-diags/man/smpquery.8.in.rst
new file mode 100644
index 0000000..dcb26fb
--- /dev/null
+++ b/infiniband-diags/man/smpquery.8.in.rst
@@ -0,0 +1,127 @@
+========
+smpquery
+========
+
+---------------------------------------------
+query InfiniBand subnet management attributes
+---------------------------------------------
+
+:Date: 2017-08-21
+:Manual section: 8
+:Manual group: Open IB Diagnostics
+
+SYNOPSIS
+========
+
+smpquery [options] <op> <dest dr_path|lid|guid> [op params]
+
+DESCRIPTION
+===========
+
+smpquery allows a basic subset of standard SMP queries including the following:
+node info, node description, switch info, port info. Fields are displayed in
+human readable format.
+
+OPTIONS
+=======
+
+Currently supported operations (case insensitive) and their parameters:
+
+
+::
+
+    Nodeinfo (NI) <addr>
+
+    Nodedesc (ND) <addr>
+
+    Portinfo (PI) <addr> [<portnum>]     # default port is zero
+
+    PortInfoExtended (PIE) <addr> [<portnum>]
+
+    Switchinfo (SI) <addr>
+
+    PKeyTable (PKeys) <addr> [<portnum>]
+
+    SL2VLTable (SL2VL) <addr> [<portnum>]
+
+    VLArbitration (VLArb) <addr> [<portnum>]
+
+    GUIDInfo (GI) <addr>
+
+    MlnxExtPortInfo (MEPI) <addr> [<portnum>]     # default port is zero
+
+
+**-c, --combined**
+    Use Combined route address argument ``<lid> <DR_Path>``
+
+**-x, --extended**
+    Set SMSupportsExtendedSpeeds bit 31 in AttributeModifier
+    (only impacts PortInfo queries).
+
+.. include:: common/opt_K.rst
+
+
+Addressing Flags
+----------------
+
+.. include:: common/opt_D.rst
+.. include:: common/opt_G.rst
+.. include:: common/opt_L.rst
+.. include:: common/opt_s.rst
+
+
+Port Selection flags
+--------------------
+
+.. include:: common/opt_C.rst
+.. include:: common/opt_P.rst
+.. include:: common/sec_portselection.rst
+
+
+Debugging flags
+---------------
+
+.. include:: common/opt_d.rst
+.. include:: common/opt_e.rst
+.. include:: common/opt_h.rst
+.. include:: common/opt_v.rst
+.. include:: common/opt_V.rst
+
+
+Configuration flags
+-------------------
+
+.. include:: common/opt_t.rst
+.. include:: common/opt_node_name_map.rst
+.. include:: common/opt_y.rst
+.. include:: common/opt_z-config.rst
+
+
+
+FILES
+=====
+
+.. include:: common/sec_config-file.rst
+
+.. 
include:: common/sec_node-name-map.rst
+
+
+EXAMPLES
+========
+
+::
+
+    smpquery portinfo 3 1                     # portinfo by lid, with port modifier
+    smpquery -G switchinfo 0x2C9000100D051 1  # switchinfo by guid
+    smpquery -D nodeinfo 0                    # nodeinfo by direct route
+    smpquery -c nodeinfo 6 0,12               # nodeinfo by combined route
+
+SEE ALSO
+========
+
+smpdump (8)
+
+AUTHOR
+======
+
+Hal Rosenstock
+ < hal@mellanox.com >
diff --git a/infiniband-diags/man/vendstat.8.in.rst b/infiniband-diags/man/vendstat.8.in.rst
new file mode 100644
index 0000000..4d2c0a3
--- /dev/null
+++ b/infiniband-diags/man/vendstat.8.in.rst
@@ -0,0 +1,119 @@
+========
+vendstat
+========
+
+------------------------------------------
+query InfiniBand vendor specific functions
+------------------------------------------
+
+:Date: 2017-08-21
+:Manual section: 8
+:Manual group: Open IB Diagnostics
+
+SYNOPSIS
+========
+
+vendstat [options] <lid|guid>
+
+DESCRIPTION
+===========
+
+vendstat uses vendor specific MADs to access vendor specific functionality
+beyond the IB spec. Currently, there is support for
+Mellanox InfiniSwitch-III (IS3) and InfiniSwitch-IV (IS4).
+
+OPTIONS
+=======
+
+**-N**
+    show IS3 or IS4 general information.
+
+**-w**
+    show IS3 port xmit wait counters.
+
+**-i**
+    show IS4 counter group info.
+
+**-c <num,num>**
+    configure IS4 counter groups.
+
+    Configure IS4 counter groups 0 and 1. Such configuration is not
+    persistent across IS4 reboot. The first number is for counter group 0
+    and the second is for counter group 1.
+
+    Group 0 counter config values:
+
+::
+
+    0 - PortXmitDataSL0-7
+    1 - PortXmitDataSL8-15
+    2 - PortRcvDataSL0-7
+
+    Group 1 counter config values:
+
+::
+
+    1 - PortXmitDataSL8-15
+    2 - PortRcvDataSL0-7
+    8 - PortRcvDataSL8-15
+
+**-R, --Read <addr,mask>**
+    Read configuration space record at addr
+
+**-W, --Write <addr,val,mask>**
+    Write configuration space record at addr
+
+
+Addressing Flags
+----------------
+
+.. include:: common/opt_G.rst
+.. include:: common/opt_L.rst
+.. include:: common/opt_s.rst
+
+
+Port Selection flags
+--------------------
+
+.. include:: common/opt_C.rst
+.. include:: common/opt_P.rst
+.. include:: common/sec_portselection.rst
+
+
+Debugging flags
+---------------
+
+.. include:: common/opt_d.rst
+.. include:: common/opt_e.rst
+.. include:: common/opt_h.rst
+.. include:: common/opt_v.rst
+.. include:: common/opt_V.rst
+
+
+Configuration flags
+-------------------
+
+.. include:: common/opt_t.rst
+.. include:: common/opt_z-config.rst
+
+
+
+FILES
+=====
+
+.. include:: common/sec_config-file.rst
+
+
+EXAMPLES
+========
+
+::
+
+    vendstat -N 6         # read IS3 or IS4 general information
+    vendstat -w 6         # read IS3 port xmit wait counters
+    vendstat -i 6 12      # read IS4 port 12 counter group info
+    vendstat -c 0,1 6 12  # configure IS4 port 12 counter groups for PortXmitDataSL
+    vendstat -c 2,8 6 12  # configure IS4 port 12 counter groups for PortRcvDataSL
+
+AUTHOR
+======
+
+Hal Rosenstock
+ < hal.rosenstock@gmail.com >
diff --git a/infiniband-diags/mcm_rereg_test.c b/infiniband-diags/mcm_rereg_test.c
new file mode 100644
index 0000000..5d141df
--- /dev/null
+++ b/infiniband-diags/mcm_rereg_test.c
@@ -0,0 +1,397 @@
+/*
+ * Copyright (c) 2006-2009 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <inttypes.h> + +#include <infiniband/umad.h> +#include <infiniband/mad.h> + +#include "ibdiag_common.h" + +#define info(fmt, ...) fprintf(stderr, "INFO: " fmt, ## __VA_ARGS__ ) +#define err(fmt, ...) fprintf(stderr, "ERR: " fmt, ## __VA_ARGS__ ) +#ifdef NOISY_DEBUG +#define dbg(fmt, ...) fprintf(stderr, "DBG: " fmt, ## __VA_ARGS__ ) +#else +__attribute__((format(printf, 1, 2))) static inline void dbg(const char *fmt, + ...) +{ +} +#endif + +#define TMO 100 + +static ibmad_gid_t mgid_ipoib = { + 0xff, 0x12, 0x40, 0x1b, 0xff, 0xff, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff +}; + +static struct ibmad_port *srcport; + +static uint64_t build_mcm_rec(uint8_t *data, ibmad_gid_t mgid, + ibmad_gid_t port_gid) +{ + memset(data, 0, IB_SA_DATA_SIZE); + mad_set_array(data, 0, IB_SA_MCM_MGID_F, mgid); + mad_set_array(data, 0, IB_SA_MCM_PORTGID_F, port_gid); + mad_set_field(data, 0, IB_SA_MCM_JOIN_STATE_F, 1); + + return be64toh(IB_MCR_COMPMASK_MGID | IB_MCR_COMPMASK_PORT_GID | + IB_MCR_COMPMASK_JOIN_STATE); +} + +static void build_mcm_rec_umad(void *umad, ib_portid_t * dport, int method, + uint64_t comp_mask, uint8_t * data) +{ + ib_rpc_t rpc; + + memset(&rpc, 0, sizeof(rpc)); + rpc.mgtclass = IB_SA_CLASS; + rpc.method = method; + rpc.attr.id = IB_SA_ATTR_MCRECORD; + rpc.attr.mod = 0; // ??? + rpc.mask = comp_mask; + rpc.datasz = IB_SA_DATA_SIZE; + rpc.dataoffs = IB_SA_DATA_OFFS; + + mad_build_pkt(umad, &rpc, dport, NULL, data); +} + +static int rereg_send(int port, int agent, ib_portid_t * dport, + uint8_t * umad, int len, int method, ibmad_gid_t port_gid) +{ + uint8_t data[IB_SA_DATA_SIZE]; + uint64_t comp_mask; + + comp_mask = build_mcm_rec(data, mgid_ipoib, port_gid); + + build_mcm_rec_umad(umad, dport, method, comp_mask, data); + if (umad_send(port, agent, umad, len, TMO, 0) < 0) { + err("umad_send %s failed: %s\n", + (method == IB_MAD_METHOD_GET) ? 
"query" : "non query", + strerror(errno)); + return -1; + } + dbg("umad_send %d: tid = 0x%016" PRIx64 "\n", method, + mad_get_field64(umad_get_mad(umad), 0, IB_MAD_TRID_F)); + + return 0; +} + +static int rereg_port_gid(int port, int agent, ib_portid_t * dport, + uint8_t * umad, int len, ibmad_gid_t port_gid) +{ + uint8_t data[IB_SA_DATA_SIZE]; + uint64_t comp_mask; + + comp_mask = build_mcm_rec(data, mgid_ipoib, port_gid); + + build_mcm_rec_umad(umad, dport, IB_MAD_METHOD_DELETE, comp_mask, data); + if (umad_send(port, agent, umad, len, TMO, 0) < 0) { + err("umad_send leave failed: %s\n", strerror(errno)); + return -1; + } + dbg("umad_send leave: tid = 0x%016" PRIx64 "\n", + mad_get_field64(umad_get_mad(umad), 0, IB_MAD_TRID_F)); + + build_mcm_rec_umad(umad, dport, IB_MAD_METHOD_SET, comp_mask, data); + if (umad_send(port, agent, umad, len, TMO, 0) < 0) { + err("umad_send join failed: %s\n", strerror(errno)); + return -1; + } + dbg("umad_send join: tid = 0x%016" PRIx64 "\n", + mad_get_field64(umad_get_mad(umad), 0, IB_MAD_TRID_F)); + + return 0; +} + +struct guid_trid { + ibmad_gid_t gid; + __be64 guid; + uint64_t trid; +}; + +static int rereg_send_all(int port, int agent, ib_portid_t * dport, + struct guid_trid *list, unsigned cnt) +{ + uint8_t *umad; + int len = umad_size() + 256; + unsigned i; + int ret; + + info("rereg_send_all... cnt = %u\n", cnt); + + umad = calloc(1, len); + if (!umad) { + err("cannot alloc mem for umad: %s\n", strerror(errno)); + return -1; + } + + for (i = 0; i < cnt; i++) { + ret = + rereg_port_gid(port, agent, dport, umad, len, list[i].gid); + if (ret < 0) { + err("rereg_send_all: rereg_port_gid 0x%016" PRIx64 + " failed\n", be64toh(list[i].guid)); + continue; + } + list[i].trid = mad_get_field64(umad_get_mad(umad), 0, + IB_MAD_TRID_F); + } + + info("rereg_send_all: sent %u requests\n", cnt * 2); + + free(umad); + + return 0; +} + +static int rereg_recv(int port, int agent, ib_portid_t * dport, + uint8_t * umad, int length, int tmo) +{ + int ret, retry = 0; + int len = length; + + while ((ret = umad_recv(port, umad, &len, tmo)) < 0 && + errno == ETIMEDOUT) { + if (retry++ > 3) + return 0; + } + if (ret < 0) { + err("umad_recv %d failed: %s\n", ret, strerror(errno)); + return -1; + } + dbg("umad_recv (retries %d), tid = 0x%016" PRIx64 + ": len = %d, status = %d\n", retry, + mad_get_field64(umad_get_mad(umad), 0, IB_MAD_TRID_F), len, + umad_status(umad)); + + return 1; +} + +static int rereg_recv_all(int port, int agent, ib_portid_t * dport, + struct guid_trid *list, unsigned cnt) +{ + uint8_t *umad, *mad; + int len = umad_size() + 256; + uint64_t trid; + unsigned n, method, status; + unsigned i; + + info("rereg_recv_all...\n"); + + umad = calloc(1, len); + if (!umad) { + err("cannot alloc mem for umad: %s\n", strerror(errno)); + return -1; + } + + n = 0; + while (rereg_recv(port, agent, dport, umad, len, TMO) > 0) { + dbg("rereg_recv_all: done %d\n", n); + n++; + mad = umad_get_mad(umad); + + method = mad_get_field(mad, 0, IB_MAD_METHOD_F); + status = mad_get_field(mad, 0, IB_MAD_STATUS_F); + + if (status) + dbg("MAD status %x, method %x\n", status, method); + + if (status && + (method & 0x7f) == (IB_MAD_METHOD_GET_RESPONSE & 0x7f)) { + trid = mad_get_field64(mad, 0, IB_MAD_TRID_F); + for (i = 0; i < cnt; i++) + if (trid == list[i].trid) + break; + if (i == cnt) { + err("cannot find trid 0x%016" PRIx64 "\n", + trid); + continue; + } + info("guid 0x%016" PRIx64 + ": method = %x status = %x. 
Resending\n", + be64toh(list[i].guid), method, status); + rereg_port_gid(port, agent, dport, umad, len, + list[i].gid); + list[i].trid = + mad_get_field64(umad_get_mad(umad), 0, + IB_MAD_TRID_F); + } + } + + info("rereg_recv_all: got %u responses\n", n); + + free(umad); + return 0; +} + +static int rereg_query_all(int port, int agent, ib_portid_t * dport, + struct guid_trid *list, unsigned cnt) +{ + uint8_t *umad, *mad; + int len = umad_size() + 256; + unsigned method, status; + unsigned i; + int ret; + + info("rereg_query_all...\n"); + + umad = calloc(1, len); + if (!umad) { + err("cannot alloc mem for umad: %s\n", strerror(errno)); + return -1; + } + + for (i = 0; i < cnt; i++) { + ret = rereg_send(port, agent, dport, umad, len, + IB_MAD_METHOD_GET, list[i].gid); + if (ret < 0) { + err("query_all: rereg_send failed.\n"); + continue; + } + + ret = rereg_recv(port, agent, dport, umad, len, TMO); + if (ret < 0) { + err("query_all: rereg_recv failed.\n"); + continue; + } + + mad = umad_get_mad(umad); + + method = mad_get_field(mad, 0, IB_MAD_METHOD_F); + status = mad_get_field(mad, 0, IB_MAD_STATUS_F); + + if (status) + info("guid 0x%016" PRIx64 ": status %x, method %x\n", + be64toh(list[i].guid), status, method); + } + + info("rereg_query_all: %u queried.\n", cnt); + + free(umad); + return 0; +} + +#define MAX_CLIENTS 50 + +static int rereg_and_test_port(const char *guid_file, int port, int agent, + ib_portid_t *dport, int timeout) +{ + char line[256]; + FILE *f; + ibmad_gid_t port_gid; + __be64 prefix = htobe64(0xfe80000000000000ull); + __be64 guid = htobe64(0x0002c90200223825ull); + struct guid_trid *list; + int i = 0; + + list = calloc(MAX_CLIENTS, sizeof(*list)); + if (!list) { + err("cannot alloc mem for guid/trid list: %s\n", + strerror(errno)); + return -1; + } + + f = fopen(guid_file, "r"); + if (!f) { + err("cannot open %s: %s\n", guid_file, strerror(errno)); + free(list); + return -1; + } + + while (fgets(line, sizeof(line), f)) { + guid = htobe64(strtoull(line, NULL, 0)); + memcpy(&port_gid[0], &prefix, 8); + memcpy(&port_gid[8], &guid, 8); + + list[i].guid = guid; + memcpy(list[i].gid, port_gid, sizeof(list[i].gid)); + list[i].trid = 0; + if (++i >= MAX_CLIENTS) + break; + } + fclose(f); + + rereg_send_all(port, agent, dport, list, i); + rereg_recv_all(port, agent, dport, list, i); + + rereg_query_all(port, agent, dport, list, i); + + free(list); + return 0; +} + +int main(int argc, const char **argv) +{ + const char *guid_file = "port_guids.list"; + int mgmt_classes[2] = { IB_SMI_CLASS, IB_SMI_DIRECT_CLASS }; + ib_portid_t dport_id; + int port, agent; + uint8_t *umad; + int len; + + if (argc > 1) + guid_file = argv[1]; + + srcport = mad_rpc_open_port(NULL, 0, mgmt_classes, 2); + if (!srcport) + err("Failed to open port"); + + resolve_sm_portid(NULL, 0, &dport_id); + dport_id.qp = 1; + if (!dport_id.qkey) + dport_id.qkey = IB_DEFAULT_QP1_QKEY; + + len = umad_size() + 256; + umad = calloc(1, len); + if (!umad) { + err("cannot alloc mem for umad: %s\n", strerror(errno)); + return -1; + } + port = mad_rpc_portid(srcport); + + agent = umad_register(port, IB_SA_CLASS, 2, 0, NULL); + + rereg_and_test_port(guid_file, port, agent, &dport_id, TMO); + + free(umad); + umad_unregister(port, agent); + umad_close_port(port); + umad_done(); + + return 0; +} diff --git a/infiniband-diags/perfquery.c b/infiniband-diags/perfquery.c new file mode 100644 index 0000000..c7d320d --- /dev/null +++ b/infiniband-diags/perfquery.c @@ -0,0 +1,1134 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. 
All rights reserved. + * Copyright (c) 2007 Xsigo Systems Inc. All rights reserved. + * Copyright (c) 2009 HNR Consulting. All rights reserved. + * Copyright (c) 2011 Mellanox Technologies LTD. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <netinet/in.h> + +#include <infiniband/umad.h> +#include <infiniband/mad.h> + +#include "ibdiag_common.h" + +static struct ibmad_port *srcport; + +struct perf_count { + uint32_t portselect; + uint32_t counterselect; + uint32_t symbolerrors; + uint32_t linkrecovers; + uint32_t linkdowned; + uint32_t rcverrors; + uint32_t rcvremotephyerrors; + uint32_t rcvswrelayerrors; + uint32_t xmtdiscards; + uint32_t xmtconstrainterrors; + uint32_t rcvconstrainterrors; + uint32_t linkintegrityerrors; + uint32_t excbufoverrunerrors; + uint32_t qp1dropped; + uint32_t vl15dropped; + uint32_t xmtdata; + uint32_t rcvdata; + uint32_t xmtpkts; + uint32_t rcvpkts; + uint32_t xmtwait; +}; + +struct perf_count_ext { + uint32_t portselect; + uint32_t counterselect; + uint64_t portxmitdata; + uint64_t portrcvdata; + uint64_t portxmitpkts; + uint64_t portrcvpkts; + uint64_t portunicastxmitpkts; + uint64_t portunicastrcvpkts; + uint64_t portmulticastxmitpkits; + uint64_t portmulticastrcvpkts; + + uint32_t counterSelect2; + uint64_t symbolErrorCounter; + uint64_t linkErrorRecoveryCounter; + uint64_t linkDownedCounter; + uint64_t portRcvErrors; + uint64_t portRcvRemotePhysicalErrors; + uint64_t portRcvSwitchRelayErrors; + uint64_t portXmitDiscards; + uint64_t portXmitConstraintErrors; + uint64_t portRcvConstraintErrors; + uint64_t localLinkIntegrityErrors; + uint64_t excessiveBufferOverrunErrors; + uint64_t VL15Dropped; + uint64_t portXmitWait; + uint64_t QP1Dropped; +}; + +static uint8_t pc[1024]; + +static struct perf_count perf_count = {}; +static struct perf_count_ext perf_count_ext = {}; + +#define ALL_PORTS 0xFF +#define MAX_PORTS 255 + +/* Notes: IB semantics is to cap counters if count has exceeded limits. + * Therefore we must check for overflows and cap the counters if necessary. + * + * mad_decode_field and mad_encode_field assume 32 bit integers passed in + * for fields < 32 bits in length. 
+ */ + +static void aggregate_4bit(uint32_t * dest, uint32_t val) +{ + if ((((*dest) + val) < (*dest)) || ((*dest) + val) > 0xf) + (*dest) = 0xf; + else + (*dest) = (*dest) + val; +} + +static void aggregate_8bit(uint32_t * dest, uint32_t val) +{ + if ((((*dest) + val) < (*dest)) + || ((*dest) + val) > 0xff) + (*dest) = 0xff; + else + (*dest) = (*dest) + val; +} + +static void aggregate_16bit(uint32_t * dest, uint32_t val) +{ + if ((((*dest) + val) < (*dest)) + || ((*dest) + val) > 0xffff) + (*dest) = 0xffff; + else + (*dest) = (*dest) + val; +} + +static void aggregate_32bit(uint32_t * dest, uint32_t val) +{ + if (((*dest) + val) < (*dest)) + (*dest) = 0xffffffff; + else + (*dest) = (*dest) + val; +} + +static void aggregate_64bit(uint64_t * dest, uint64_t val) +{ + if (((*dest) + val) < (*dest)) + (*dest) = 0xffffffffffffffffULL; + else + (*dest) = (*dest) + val; +} + +static void aggregate_perfcounters(void) +{ + uint32_t val; + + mad_decode_field(pc, IB_PC_PORT_SELECT_F, &val); + perf_count.portselect = val; + mad_decode_field(pc, IB_PC_COUNTER_SELECT_F, &val); + perf_count.counterselect = val; + mad_decode_field(pc, IB_PC_ERR_SYM_F, &val); + aggregate_16bit(&perf_count.symbolerrors, val); + mad_decode_field(pc, IB_PC_LINK_RECOVERS_F, &val); + aggregate_8bit(&perf_count.linkrecovers, val); + mad_decode_field(pc, IB_PC_LINK_DOWNED_F, &val); + aggregate_8bit(&perf_count.linkdowned, val); + mad_decode_field(pc, IB_PC_ERR_RCV_F, &val); + aggregate_16bit(&perf_count.rcverrors, val); + mad_decode_field(pc, IB_PC_ERR_PHYSRCV_F, &val); + aggregate_16bit(&perf_count.rcvremotephyerrors, val); + mad_decode_field(pc, IB_PC_ERR_SWITCH_REL_F, &val); + aggregate_16bit(&perf_count.rcvswrelayerrors, val); + mad_decode_field(pc, IB_PC_XMT_DISCARDS_F, &val); + aggregate_16bit(&perf_count.xmtdiscards, val); + mad_decode_field(pc, IB_PC_ERR_XMTCONSTR_F, &val); + aggregate_8bit(&perf_count.xmtconstrainterrors, val); + mad_decode_field(pc, IB_PC_ERR_RCVCONSTR_F, &val); + aggregate_8bit(&perf_count.rcvconstrainterrors, val); + mad_decode_field(pc, IB_PC_ERR_LOCALINTEG_F, &val); + aggregate_4bit(&perf_count.linkintegrityerrors, val); + mad_decode_field(pc, IB_PC_ERR_EXCESS_OVR_F, &val); + aggregate_4bit(&perf_count.excbufoverrunerrors, val); + mad_decode_field(pc, IB_PC_QP1_DROP_F, &val); + aggregate_16bit(&perf_count.qp1dropped, val); + mad_decode_field(pc, IB_PC_VL15_DROPPED_F, &val); + aggregate_16bit(&perf_count.vl15dropped, val); + mad_decode_field(pc, IB_PC_XMT_BYTES_F, &val); + aggregate_32bit(&perf_count.xmtdata, val); + mad_decode_field(pc, IB_PC_RCV_BYTES_F, &val); + aggregate_32bit(&perf_count.rcvdata, val); + mad_decode_field(pc, IB_PC_XMT_PKTS_F, &val); + aggregate_32bit(&perf_count.xmtpkts, val); + mad_decode_field(pc, IB_PC_RCV_PKTS_F, &val); + aggregate_32bit(&perf_count.rcvpkts, val); + mad_decode_field(pc, IB_PC_XMT_WAIT_F, &val); + aggregate_32bit(&perf_count.xmtwait, val); +} + +static void output_aggregate_perfcounters(ib_portid_t * portid, + __be16 cap_mask) +{ + char buf[1024]; + uint32_t val = ALL_PORTS; + + /* set port_select to 255 to emulate AllPortSelect */ + mad_encode_field(pc, IB_PC_PORT_SELECT_F, &val); + mad_encode_field(pc, IB_PC_COUNTER_SELECT_F, &perf_count.counterselect); + mad_encode_field(pc, IB_PC_ERR_SYM_F, &perf_count.symbolerrors); + mad_encode_field(pc, IB_PC_LINK_RECOVERS_F, &perf_count.linkrecovers); + mad_encode_field(pc, IB_PC_LINK_DOWNED_F, &perf_count.linkdowned); + mad_encode_field(pc, IB_PC_ERR_RCV_F, &perf_count.rcverrors); + mad_encode_field(pc, 
IB_PC_ERR_PHYSRCV_F, + &perf_count.rcvremotephyerrors); + mad_encode_field(pc, IB_PC_ERR_SWITCH_REL_F, + &perf_count.rcvswrelayerrors); + mad_encode_field(pc, IB_PC_XMT_DISCARDS_F, &perf_count.xmtdiscards); + mad_encode_field(pc, IB_PC_ERR_XMTCONSTR_F, + &perf_count.xmtconstrainterrors); + mad_encode_field(pc, IB_PC_ERR_RCVCONSTR_F, + &perf_count.rcvconstrainterrors); + mad_encode_field(pc, IB_PC_ERR_LOCALINTEG_F, + &perf_count.linkintegrityerrors); + mad_encode_field(pc, IB_PC_ERR_EXCESS_OVR_F, + &perf_count.excbufoverrunerrors); + mad_encode_field(pc, IB_PC_QP1_DROP_F, &perf_count.qp1dropped); + mad_encode_field(pc, IB_PC_VL15_DROPPED_F, &perf_count.vl15dropped); + mad_encode_field(pc, IB_PC_XMT_BYTES_F, &perf_count.xmtdata); + mad_encode_field(pc, IB_PC_RCV_BYTES_F, &perf_count.rcvdata); + mad_encode_field(pc, IB_PC_XMT_PKTS_F, &perf_count.xmtpkts); + mad_encode_field(pc, IB_PC_RCV_PKTS_F, &perf_count.rcvpkts); + mad_encode_field(pc, IB_PC_XMT_WAIT_F, &perf_count.xmtwait); + + mad_dump_perfcounters(buf, sizeof buf, pc, sizeof pc); + + printf("# Port counters: %s port %d (CapMask: 0x%02X)\n%s", + portid2str(portid), ALL_PORTS, ntohs(cap_mask), buf); +} + +static void aggregate_perfcounters_ext(__be16 cap_mask, uint32_t cap_mask2) +{ + uint32_t val; + uint64_t val64; + + mad_decode_field(pc, IB_PC_EXT_PORT_SELECT_F, &val); + perf_count_ext.portselect = val; + mad_decode_field(pc, IB_PC_EXT_COUNTER_SELECT_F, &val); + perf_count_ext.counterselect = val; + mad_decode_field(pc, IB_PC_EXT_XMT_BYTES_F, &val64); + aggregate_64bit(&perf_count_ext.portxmitdata, val64); + mad_decode_field(pc, IB_PC_EXT_RCV_BYTES_F, &val64); + aggregate_64bit(&perf_count_ext.portrcvdata, val64); + mad_decode_field(pc, IB_PC_EXT_XMT_PKTS_F, &val64); + aggregate_64bit(&perf_count_ext.portxmitpkts, val64); + mad_decode_field(pc, IB_PC_EXT_RCV_PKTS_F, &val64); + aggregate_64bit(&perf_count_ext.portrcvpkts, val64); + + if (cap_mask & IB_PM_EXT_WIDTH_SUPPORTED) { + mad_decode_field(pc, IB_PC_EXT_XMT_UPKTS_F, &val64); + aggregate_64bit(&perf_count_ext.portunicastxmitpkts, val64); + mad_decode_field(pc, IB_PC_EXT_RCV_UPKTS_F, &val64); + aggregate_64bit(&perf_count_ext.portunicastrcvpkts, val64); + mad_decode_field(pc, IB_PC_EXT_XMT_MPKTS_F, &val64); + aggregate_64bit(&perf_count_ext.portmulticastxmitpkits, val64); + mad_decode_field(pc, IB_PC_EXT_RCV_MPKTS_F, &val64); + aggregate_64bit(&perf_count_ext.portmulticastrcvpkts, val64); + } + + if (htonl(cap_mask2) & IB_PM_IS_ADDL_PORT_CTRS_EXT_SUP) { + mad_decode_field(pc, IB_PC_EXT_COUNTER_SELECT2_F, &val); + perf_count_ext.counterSelect2 = val; + mad_decode_field(pc, IB_PC_EXT_ERR_SYM_F, &val64); + aggregate_64bit(&perf_count_ext.symbolErrorCounter, val64); + mad_decode_field(pc, IB_PC_EXT_LINK_RECOVERS_F, &val64); + aggregate_64bit(&perf_count_ext.linkErrorRecoveryCounter, val64); + mad_decode_field(pc, IB_PC_EXT_LINK_DOWNED_F, &val64); + aggregate_64bit(&perf_count_ext.linkDownedCounter, val64); + mad_decode_field(pc, IB_PC_EXT_ERR_RCV_F, &val64); + aggregate_64bit(&perf_count_ext.portRcvErrors, val64); + mad_decode_field(pc, IB_PC_EXT_ERR_PHYSRCV_F, &val64); + aggregate_64bit(&perf_count_ext.portRcvRemotePhysicalErrors, val64); + mad_decode_field(pc, IB_PC_EXT_ERR_SWITCH_REL_F, &val64); + aggregate_64bit(&perf_count_ext.portRcvSwitchRelayErrors, val64); + mad_decode_field(pc, IB_PC_EXT_XMT_DISCARDS_F, &val64); + aggregate_64bit(&perf_count_ext.portXmitDiscards, val64); + mad_decode_field(pc, IB_PC_EXT_ERR_XMTCONSTR_F, &val64); + 
aggregate_64bit(&perf_count_ext.portXmitConstraintErrors, val64); + mad_decode_field(pc, IB_PC_EXT_ERR_RCVCONSTR_F, &val64); + aggregate_64bit(&perf_count_ext.portRcvConstraintErrors, val64); + mad_decode_field(pc, IB_PC_EXT_ERR_LOCALINTEG_F, &val64); + aggregate_64bit(&perf_count_ext.localLinkIntegrityErrors, val64); + mad_decode_field(pc, IB_PC_EXT_ERR_EXCESS_OVR_F, &val64); + aggregate_64bit(&perf_count_ext.excessiveBufferOverrunErrors, val64); + mad_decode_field(pc, IB_PC_EXT_VL15_DROPPED_F, &val64); + aggregate_64bit(&perf_count_ext.VL15Dropped, val64); + mad_decode_field(pc, IB_PC_EXT_XMT_WAIT_F, &val64); + aggregate_64bit(&perf_count_ext.portXmitWait, val64); + mad_decode_field(pc, IB_PC_EXT_QP1_DROP_F, &val64); + aggregate_64bit(&perf_count_ext.QP1Dropped, val64); + } +} + +static void dump_perfcounters_ext(char *buf, int size, __be16 cap_mask, + uint32_t cap_mask2) +{ + size_t offset, tmp_offset; + + mad_dump_fields(buf, size, pc, sizeof(pc), IB_PC_EXT_FIRST_F, + IB_PC_EXT_XMT_UPKTS_F); + offset = strlen(buf); + + if (cap_mask & IB_PM_EXT_WIDTH_SUPPORTED) { + mad_dump_fields(buf + offset, size - offset, pc, sizeof(pc), + IB_PC_EXT_XMT_UPKTS_F, IB_PC_EXT_LAST_F); + tmp_offset = strlen(buf + offset); + offset += tmp_offset; + } + + if (htonl(cap_mask2) & IB_PM_IS_ADDL_PORT_CTRS_EXT_SUP) { + mad_dump_fields(buf + offset, size - offset, pc, sizeof(pc), + IB_PC_EXT_COUNTER_SELECT2_F, + IB_PC_EXT_ERR_LAST_F); + } +} + +static void output_aggregate_perfcounters_ext(ib_portid_t * portid, + __be16 cap_mask, uint32_t cap_mask2) +{ + char buf[1536]; + uint32_t val = ALL_PORTS; + + memset(buf, 0, sizeof(buf)); + + /* set port_select to 255 to emulate AllPortSelect */ + mad_encode_field(pc, IB_PC_EXT_PORT_SELECT_F, &val); + mad_encode_field(pc, IB_PC_EXT_COUNTER_SELECT_F, + &perf_count_ext.counterselect); + mad_encode_field(pc, IB_PC_EXT_XMT_BYTES_F, + &perf_count_ext.portxmitdata); + mad_encode_field(pc, IB_PC_EXT_RCV_BYTES_F, + &perf_count_ext.portrcvdata); + mad_encode_field(pc, IB_PC_EXT_XMT_PKTS_F, + &perf_count_ext.portxmitpkts); + mad_encode_field(pc, IB_PC_EXT_RCV_PKTS_F, &perf_count_ext.portrcvpkts); + + if (cap_mask & IB_PM_EXT_WIDTH_SUPPORTED) { + mad_encode_field(pc, IB_PC_EXT_XMT_UPKTS_F, + &perf_count_ext.portunicastxmitpkts); + mad_encode_field(pc, IB_PC_EXT_RCV_UPKTS_F, + &perf_count_ext.portunicastrcvpkts); + mad_encode_field(pc, IB_PC_EXT_XMT_MPKTS_F, + &perf_count_ext.portmulticastxmitpkits); + mad_encode_field(pc, IB_PC_EXT_RCV_MPKTS_F, + &perf_count_ext.portmulticastrcvpkts); + } + + if (htonl(cap_mask2) & IB_PM_IS_ADDL_PORT_CTRS_EXT_SUP) { + mad_encode_field(pc, IB_PC_EXT_COUNTER_SELECT2_F, + &perf_count_ext.counterSelect2); + mad_encode_field(pc, IB_PC_EXT_ERR_SYM_F, + &perf_count_ext.symbolErrorCounter); + mad_encode_field(pc, IB_PC_EXT_LINK_RECOVERS_F, + &perf_count_ext.linkErrorRecoveryCounter); + mad_encode_field(pc, IB_PC_EXT_LINK_DOWNED_F, + &perf_count_ext.linkDownedCounter); + mad_encode_field(pc, IB_PC_EXT_ERR_RCV_F, + &perf_count_ext.portRcvErrors); + mad_encode_field(pc, IB_PC_EXT_ERR_PHYSRCV_F, + &perf_count_ext.portRcvRemotePhysicalErrors); + mad_encode_field(pc, IB_PC_EXT_ERR_SWITCH_REL_F, + &perf_count_ext.portRcvSwitchRelayErrors); + mad_encode_field(pc, IB_PC_EXT_XMT_DISCARDS_F, + &perf_count_ext.portXmitDiscards); + mad_encode_field(pc, IB_PC_EXT_ERR_XMTCONSTR_F, + &perf_count_ext.portXmitConstraintErrors); + mad_encode_field(pc, IB_PC_EXT_ERR_RCVCONSTR_F, + &perf_count_ext.portRcvConstraintErrors); + mad_encode_field(pc, IB_PC_EXT_ERR_LOCALINTEG_F, + 
&perf_count_ext.localLinkIntegrityErrors); + mad_encode_field(pc, IB_PC_EXT_ERR_EXCESS_OVR_F, + &perf_count_ext.excessiveBufferOverrunErrors); + mad_encode_field(pc, IB_PC_EXT_VL15_DROPPED_F, + &perf_count_ext.VL15Dropped); + mad_encode_field(pc, IB_PC_EXT_XMT_WAIT_F, + &perf_count_ext.portXmitWait); + mad_encode_field(pc, IB_PC_EXT_QP1_DROP_F, + &perf_count_ext.QP1Dropped); + } + + dump_perfcounters_ext(buf, sizeof(buf), cap_mask, cap_mask2); + + printf("# Port extended counters: %s port %d (CapMask: 0x%02X CapMask2: 0x%07X)\n%s", + portid2str(portid), ALL_PORTS, ntohs(cap_mask), cap_mask2, buf); +} + +static void dump_perfcounters(int extended, int timeout, __be16 cap_mask, + uint32_t cap_mask2, ib_portid_t * portid, + int port, int aggregate) +{ + char buf[1536]; + + if (extended != 1) { + memset(pc, 0, sizeof(pc)); + if (!pma_query_via(pc, portid, port, timeout, + IB_GSI_PORT_COUNTERS, srcport)) + IBEXIT("perfquery"); + if (!(cap_mask & IB_PM_PC_XMIT_WAIT_SUP)) { + /* if PortCounters:PortXmitWait not supported clear this counter */ + VERBOSE("PortXmitWait not indicated" + " so ignore this counter"); + perf_count.xmtwait = 0; + mad_encode_field(pc, IB_PC_XMT_WAIT_F, + &perf_count.xmtwait); + } + if (aggregate) + aggregate_perfcounters(); + else + mad_dump_perfcounters(buf, sizeof buf, pc, sizeof pc); + + } else { + /* 1.2 errata: bit 9 is extended counter support + * bit 10 is extended counter NoIETF + */ + if (!(cap_mask & IB_PM_EXT_WIDTH_SUPPORTED) && + !(cap_mask & IB_PM_EXT_WIDTH_NOIETF_SUP)) + IBWARN + ("PerfMgt ClassPortInfo CapMask 0x%02X; No extended counter support indicated\n", + ntohs(cap_mask)); + + memset(pc, 0, sizeof(pc)); + if (!pma_query_via(pc, portid, port, timeout, + IB_GSI_PORT_COUNTERS_EXT, srcport)) + IBEXIT("perfextquery"); + if (aggregate) + aggregate_perfcounters_ext(cap_mask, cap_mask2); + else + dump_perfcounters_ext(buf, sizeof(buf), cap_mask, + cap_mask2); + } + + if (!aggregate) { + if (extended) + printf("# Port extended counters: %s port %d " + "(CapMask: 0x%02X CapMask2: 0x%07X)\n%s", + portid2str(portid), port, ntohs(cap_mask), + cap_mask2, buf); + else + printf("# Port counters: %s port %d " + "(CapMask: 0x%02X)\n%s", + portid2str(portid), port, ntohs(cap_mask), buf); + } +} + +static void reset_counters(int extended, int timeout, int mask, + ib_portid_t * portid, int port) +{ + memset(pc, 0, sizeof(pc)); + if (extended != 1) { + if (!performance_reset_via(pc, portid, port, mask, timeout, + IB_GSI_PORT_COUNTERS, srcport)) + IBEXIT("perf reset"); + } else { + if (!performance_reset_via(pc, portid, port, mask, timeout, + IB_GSI_PORT_COUNTERS_EXT, srcport)) + IBEXIT("perf ext reset"); + } +} + +static struct +{ + int reset, reset_only, all_ports, loop_ports, port, extended, xmt_sl, + rcv_sl, xmt_disc, rcv_err, extended_speeds, smpl_ctl, + oprcvcounters, flowctlcounters, vloppackets, vlopdata, + vlxmitflowctlerrors, vlxmitcounters, swportvlcong, rcvcc, + slrcvfecn, slrcvbecn, xmitcc, vlxmittimecc; + int ports[MAX_PORTS]; + int ports_count; +} info; + +static void common_func(ib_portid_t * portid, int port_num, int mask, + unsigned query, unsigned reset, + const char *name, uint16_t attr, + void dump_func(char *, int, void *, int)) +{ + char buf[1536]; + + if (query) { + memset(pc, 0, sizeof(pc)); + if (!pma_query_via(pc, portid, port_num, ibd_timeout, attr, + srcport)) + IBEXIT("cannot query %s", name); + + dump_func(buf, sizeof(buf), pc, sizeof(pc)); + + printf("# %s counters: %s port %d\n%s", name, + portid2str(portid), port_num, buf); + } + + 
memset(pc, 0, sizeof(pc)); + if (reset && !performance_reset_via(pc, portid, info.port, mask, + ibd_timeout, attr, srcport)) + IBEXIT("cannot reset %s", name); +} + +static void xmt_sl_query(ib_portid_t * portid, int port, int mask) +{ + common_func(portid, port, mask, !info.reset_only, + (info.reset_only || info.reset), "PortXmitDataSL", + IB_GSI_PORT_XMIT_DATA_SL, mad_dump_perfcounters_xmt_sl); +} + +static void rcv_sl_query(ib_portid_t * portid, int port, int mask) +{ + common_func(portid, port, mask, !info.reset_only, + (info.reset_only || info.reset), "PortRcvDataSL", + IB_GSI_PORT_RCV_DATA_SL, mad_dump_perfcounters_rcv_sl); +} + +static void xmt_disc_query(ib_portid_t * portid, int port, int mask) +{ + common_func(portid, port, mask, !info.reset_only, + (info.reset_only || info.reset), "PortXmitDiscardDetails", + IB_GSI_PORT_XMIT_DISCARD_DETAILS, + mad_dump_perfcounters_xmt_disc); +} + +static void rcv_err_query(ib_portid_t * portid, int port, int mask) +{ + common_func(portid, port, mask, !info.reset_only, + (info.reset_only || info.reset), "PortRcvErrorDetails", + IB_GSI_PORT_RCV_ERROR_DETAILS, + mad_dump_perfcounters_rcv_err); +} + +static uint8_t *ext_speeds_reset_via(void *rcvbuf, ib_portid_t * dest, + int port, uint64_t mask, unsigned timeout) +{ + ib_rpc_t rpc = { 0 }; + int lid = dest->lid; + + DEBUG("lid %u port %d mask 0x%" PRIx64, lid, port, mask); + + if (lid == -1) { + IBWARN("only lid routed is supported"); + return NULL; + } + + if (!mask) + mask = ~0; + + rpc.mgtclass = IB_PERFORMANCE_CLASS; + rpc.method = IB_MAD_METHOD_SET; + rpc.attr.id = IB_GSI_PORT_EXT_SPEEDS_COUNTERS; + + memset(rcvbuf, 0, IB_MAD_SIZE); + + mad_set_field(rcvbuf, 0, IB_PESC_PORT_SELECT_F, port); + mad_set_field64(rcvbuf, 0, IB_PESC_COUNTER_SELECT_F, mask); + rpc.attr.mod = 0; + rpc.timeout = timeout; + rpc.datasz = IB_PC_DATA_SZ; + rpc.dataoffs = IB_PC_DATA_OFFS; + if (!dest->qp) + dest->qp = 1; + if (!dest->qkey) + dest->qkey = IB_DEFAULT_QP1_QKEY; + + return mad_rpc(srcport, &rpc, dest, rcvbuf, rcvbuf); +} + +static uint8_t is_rsfec_mode_active(ib_portid_t * portid, int port, + __be16 cap_mask) +{ + uint8_t data[IB_SMP_DATA_SIZE] = { 0 }; + uint32_t fec_mode_active = 0; + uint32_t pie_capmask = 0; + if (cap_mask & IS_PM_RSFEC_COUNTERS_SUP) { + if (!is_port_info_extended_supported(portid, port, srcport)) { + IBWARN("Port Info Extended not supported"); + return 0; + } + + if (!smp_query_via(data, portid, IB_ATTR_PORT_INFO_EXT, port, 0, + srcport)) + IBEXIT("smp query portinfo extended failed"); + + mad_decode_field(data, IB_PORT_EXT_CAPMASK_F, &pie_capmask); + mad_decode_field(data, IB_PORT_EXT_FEC_MODE_ACTIVE_F, + &fec_mode_active); + if((pie_capmask & + be32toh(IB_PORT_EXT_CAP_IS_FEC_MODE_SUPPORTED)) && + ((be16toh(IB_PORT_EXT_RS_FEC_MODE_ACTIVE) == (fec_mode_active & 0xffff)) || + (be16toh(IB_PORT_EXT_RS_FEC2_MODE_ACTIVE) == (fec_mode_active & 0xffff)))) + return 1; + } + + return 0; +} + +static void extended_speeds_query(ib_portid_t * portid, int port, + uint64_t ext_mask, __be16 cap_mask) +{ + int mask = ext_mask; + + if (!info.reset_only) { + if (is_rsfec_mode_active(portid, port, cap_mask)) + common_func(portid, port, mask, 1, 0, + "PortExtendedSpeedsCounters with RS-FEC Active", + IB_GSI_PORT_EXT_SPEEDS_COUNTERS, + mad_dump_port_ext_speeds_counters_rsfec_active); + else + common_func(portid, port, mask, 1, 0, + "PortExtendedSpeedsCounters", + IB_GSI_PORT_EXT_SPEEDS_COUNTERS, + mad_dump_port_ext_speeds_counters); + } + + if ((info.reset_only || info.reset) && + !ext_speeds_reset_via(pc, 
portid, port, ext_mask, ibd_timeout)) + IBEXIT("cannot reset PortExtendedSpeedsCounters"); +} + +static void oprcvcounters_query(ib_portid_t * portid, int port, int mask) +{ + common_func(portid, port, mask, !info.reset_only, + (info.reset_only || info.reset), "PortOpRcvCounters", + IB_GSI_PORT_PORT_OP_RCV_COUNTERS, + mad_dump_perfcounters_port_op_rcv_counters); +} + +static void flowctlcounters_query(ib_portid_t * portid, int port, int mask) +{ + common_func(portid, port, mask, !info.reset_only, + (info.reset_only || info.reset), "PortFlowCtlCounters", + IB_GSI_PORT_PORT_FLOW_CTL_COUNTERS, + mad_dump_perfcounters_port_flow_ctl_counters); +} + +static void vloppackets_query(ib_portid_t * portid, int port, int mask) +{ + common_func(portid, port, mask, !info.reset_only, + (info.reset_only || info.reset), "PortVLOpPackets", + IB_GSI_PORT_PORT_VL_OP_PACKETS, + mad_dump_perfcounters_port_vl_op_packet); +} + +static void vlopdata_query(ib_portid_t * portid, int port, int mask) +{ + common_func(portid, port, mask, !info.reset_only, + (info.reset_only || info.reset), "PortVLOpData", + IB_GSI_PORT_PORT_VL_OP_DATA, + mad_dump_perfcounters_port_vl_op_data); +} + +static void vlxmitflowctlerrors_query(ib_portid_t * portid, int port, int mask) +{ + common_func(portid, port, mask, !info.reset_only, + (info.reset_only || info.reset), + "PortVLXmitFlowCtlUpdateErrors", + IB_GSI_PORT_PORT_VL_XMIT_FLOW_CTL_UPDATE_ERRORS, + mad_dump_perfcounters_port_vl_xmit_flow_ctl_update_errors); +} + +static void vlxmitcounters_query(ib_portid_t * portid, int port, int mask) +{ + common_func(portid, port, mask, !info.reset_only, + (info.reset_only || info.reset), "PortVLXmitWaitCounters", + IB_GSI_PORT_PORT_VL_XMIT_WAIT_COUNTERS, + mad_dump_perfcounters_port_vl_xmit_wait_counters); +} + +static void swportvlcong_query(ib_portid_t * portid, int port, int mask) +{ + common_func(portid, port, mask, !info.reset_only, + (info.reset_only || info.reset), "SwPortVLCongestion", + IB_GSI_SW_PORT_VL_CONGESTION, + mad_dump_perfcounters_sw_port_vl_congestion); +} + +static void rcvcc_query(ib_portid_t * portid, int port, int mask) +{ + common_func(portid, port, mask, !info.reset_only, + (info.reset_only || info.reset), "PortRcvConCtrl", + IB_GSI_PORT_RCV_CON_CTRL, + mad_dump_perfcounters_rcv_con_ctrl); +} + +static void slrcvfecn_query(ib_portid_t * portid, int port, int mask) +{ + common_func(portid, port, mask, !info.reset_only, + (info.reset_only || info.reset), "PortSLRcvFECN", + IB_GSI_PORT_SL_RCV_FECN, mad_dump_perfcounters_sl_rcv_fecn); +} + +static void slrcvbecn_query(ib_portid_t * portid, int port, int mask) +{ + common_func(portid, port, mask, !info.reset_only, + (info.reset_only || info.reset), "PortSLRcvBECN", + IB_GSI_PORT_SL_RCV_BECN, mad_dump_perfcounters_sl_rcv_becn); +} + +static void xmitcc_query(ib_portid_t * portid, int port, int mask) +{ + common_func(portid, port, mask, !info.reset_only, + (info.reset_only || info.reset), "PortXmitConCtrl", + IB_GSI_PORT_XMIT_CON_CTRL, + mad_dump_perfcounters_xmit_con_ctrl); +} + +static void vlxmittimecc_query(ib_portid_t * portid, int port, int mask) +{ + common_func(portid, port, mask, !info.reset_only, + (info.reset_only || info.reset), "PortVLXmitTimeCong", + IB_GSI_PORT_VL_XMIT_TIME_CONG, + mad_dump_perfcounters_vl_xmit_time_cong); +} + +static void dump_portsamples_control(ib_portid_t *portid, int port) +{ + char buf[1280]; + + memset(pc, 0, sizeof(pc)); + if (!pma_query_via(pc, portid, port, ibd_timeout, + IB_GSI_PORT_SAMPLES_CONTROL, srcport)) + 
IBEXIT("sampctlquery"); + + mad_dump_portsamples_control(buf, sizeof buf, pc, sizeof pc); + printf("# PortSamplesControl: %s port %d\n%s", portid2str(portid), + port, buf); +} + +static int process_opt(void *context, int ch) +{ + switch (ch) { + case 'x': + info.extended = 1; + break; + case 'X': + info.xmt_sl = 1; + break; + case 'S': + info.rcv_sl = 1; + break; + case 'D': + info.xmt_disc = 1; + break; + case 'E': + info.rcv_err = 1; + break; + case 'T': + info.extended_speeds = 1; + break; + case 'c': + info.smpl_ctl = 1; + break; + case 1: + info.oprcvcounters = 1; + break; + case 2: + info.flowctlcounters = 1; + break; + case 3: + info.vloppackets = 1; + break; + case 4: + info.vlopdata = 1; + break; + case 5: + info.vlxmitflowctlerrors = 1; + break; + case 6: + info.vlxmitcounters = 1; + break; + case 7: + info.swportvlcong = 1; + break; + case 8: + info.rcvcc = 1; + break; + case 9: + info.slrcvfecn = 1; + break; + case 10: + info.slrcvbecn = 1; + break; + case 11: + info.xmitcc = 1; + break; + case 12: + info.vlxmittimecc = 1; + break; + case 'a': + info.all_ports++; + info.port = ALL_PORTS; + break; + case 'l': + info.loop_ports++; + break; + case 'r': + info.reset++; + break; + case 'R': + info.reset_only++; + break; + default: + return -1; + } + return 0; +} + +int main(int argc, char **argv) +{ + int mgmt_classes[3] = { IB_SMI_CLASS, IB_SA_CLASS, IB_PERFORMANCE_CLASS }; + ib_portid_t portid = { 0 }; + int mask = 0xffff; + uint64_t ext_mask = 0xffffffffffffffffULL; + __be32 cap_mask2_be; + uint32_t cap_mask2; + __be16 cap_mask; + int all_ports_loop = 0; + int node_type, num_ports = 0; + uint8_t data[IB_SMP_DATA_SIZE] = { 0 }; + int start_port = 1; + int enhancedport0; + char *tmpstr; + int i; + + const struct ibdiag_opt opts[] = { + {"extended", 'x', 0, NULL, "show extended port counters"}, + {"xmtsl", 'X', 0, NULL, "show Xmt SL port counters"}, + {"rcvsl", 'S', 0, NULL, "show Rcv SL port counters"}, + {"xmtdisc", 'D', 0, NULL, "show Xmt Discard Details"}, + {"rcverr", 'E', 0, NULL, "show Rcv Error Details"}, + {"extended_speeds", 'T', 0, NULL, "show port extended speeds counters"}, + {"oprcvcounters", 1, 0, NULL, "show Rcv Counters per Op code"}, + {"flowctlcounters", 2, 0, NULL, "show flow control counters"}, + {"vloppackets", 3, 0, NULL, "show packets received per Op code per VL"}, + {"vlopdata", 4, 0, NULL, "show data received per Op code per VL"}, + {"vlxmitflowctlerrors", 5, 0, NULL, "show flow control update errors per VL"}, + {"vlxmitcounters", 6, 0, NULL, "show ticks waiting to transmit counters per VL"}, + {"swportvlcong", 7, 0, NULL, "show sw port VL congestion"}, + {"rcvcc", 8, 0, NULL, "show Rcv congestion control counters"}, + {"slrcvfecn", 9, 0, NULL, "show SL Rcv FECN counters"}, + {"slrcvbecn", 10, 0, NULL, "show SL Rcv BECN counters"}, + {"xmitcc", 11, 0, NULL, "show Xmit congestion control counters"}, + {"vlxmittimecc", 12, 0, NULL, "show VL Xmit Time congestion control counters"}, + {"smplctl", 'c', 0, NULL, "show samples control"}, + {"all_ports", 'a', 0, NULL, "show aggregated counters"}, + {"loop_ports", 'l', 0, NULL, "iterate through each port"}, + {"reset_after_read", 'r', 0, NULL, "reset counters after read"}, + {"Reset_only", 'R', 0, NULL, "only reset counters"}, + {} + }; + char usage_args[] = " [<lid|guid> [[port(s)] [reset_mask]]]"; + const char *usage_examples[] = { + "\t\t# read local port's performance counters", + "32 1\t\t# read performance counters from lid 32, port 1", + "-x 32 1\t# read extended performance counters from lid 32, port 1", + 
"-a 32\t\t# read performance counters from lid 32, all ports", + "-r 32 1\t# read performance counters and reset", + "-x -r 32 1\t# read extended performance counters and reset", + "-R 0x20 1\t# reset performance counters of port 1 only", + "-x -R 0x20 1\t# reset extended performance counters of port 1 only", + "-R -a 32\t# reset performance counters of all ports", + "-R 32 2 0x0fff\t# reset only error counters of port 2", + "-R 32 2 0xf000\t# reset only non-error counters of port 2", + "-a 32 1-10\t# read performance counters from lid 32, port 1-10, aggregate output", + "-l 32 1-10\t# read performance counters from lid 32, port 1-10, output each port", + "-a 32 1,4,8\t# read performance counters from lid 32, port 1, 4, and 8, aggregate output", + "-l 32 1,4,8\t# read performance counters from lid 32, port 1, 4, and 8, output each port", + NULL, + }; + + ibdiag_process_opts(argc, argv, NULL, "DK", opts, process_opt, + usage_args, usage_examples); + + argc -= optind; + argv += optind; + + if (argc > 1) { + if (strchr(argv[1], ',')) { + tmpstr = strtok(argv[1], ","); + while (tmpstr) { + info.ports[info.ports_count++] = + strtoul(tmpstr, NULL, 0); + tmpstr = strtok(NULL, ","); + } + info.port = info.ports[0]; + } + else if ((tmpstr = strchr(argv[1], '-'))) { + int pmin, pmax; + + *tmpstr = '\0'; + tmpstr++; + + pmin = strtoul(argv[1], NULL, 0); + pmax = strtoul(tmpstr, NULL, 0); + + if (pmin >= pmax) + IBEXIT("max port must be greater than min port in range"); + + while (pmin <= pmax) + info.ports[info.ports_count++] = pmin++; + + info.port = info.ports[0]; + } + else + info.port = strtoul(argv[1], NULL, 0); + } + if (argc > 2) { + ext_mask = strtoull(argv[2], NULL, 0); + mask = ext_mask; + } + + srcport = mad_rpc_open_port(ibd_ca, ibd_ca_port, mgmt_classes, 3); + if (!srcport) + IBEXIT("Failed to open '%s' port '%d'", ibd_ca, ibd_ca_port); + + smp_mkey_set(srcport, ibd_mkey); + + if (argc) { + if (resolve_portid_str(ibd_ca, ibd_ca_port, &portid, argv[0], + ibd_dest_type, ibd_sm_id, srcport) < 0) + IBEXIT("can't resolve destination port %s", argv[0]); + } else { + if (resolve_self(ibd_ca, ibd_ca_port, &portid, &info.port, NULL) < + 0) + IBEXIT("can't resolve self port %s", argv[0]); + } + + /* PerfMgt ClassPortInfo is a required attribute */ + memset(pc, 0, sizeof(pc)); + if (!pma_query_via(pc, &portid, info.port, ibd_timeout, CLASS_PORT_INFO, + srcport)) + IBEXIT("classportinfo query"); + /* ClassPortInfo should be supported as part of libibmad */ + memcpy(&cap_mask, pc + 2, sizeof(cap_mask)); /* CapabilityMask */ + memcpy(&cap_mask2_be, pc + 4, sizeof(cap_mask2_be)); /* CapabilityMask2 */ + cap_mask2 = ntohl(cap_mask2_be) >> 5; + + if (!(cap_mask & IB_PM_ALL_PORT_SELECT)) { /* bit 8 is AllPortSelect */ + if (!info.all_ports && info.port == ALL_PORTS) + IBEXIT("AllPortSelect not supported"); + if (info.all_ports && info.port == ALL_PORTS) + all_ports_loop = 1; + } + + if (info.xmt_sl) { + xmt_sl_query(&portid, info.port, mask); + goto done; + } + + if (info.rcv_sl) { + rcv_sl_query(&portid, info.port, mask); + goto done; + } + + if (info.xmt_disc) { + xmt_disc_query(&portid, info.port, mask); + goto done; + } + + if (info.rcv_err) { + rcv_err_query(&portid, info.port, mask); + goto done; + } + + if (info.extended_speeds) { + extended_speeds_query(&portid, info.port, ext_mask, cap_mask); + goto done; + } + + if (info.oprcvcounters) { + oprcvcounters_query(&portid, info.port, mask); + goto done; + } + + if (info.flowctlcounters) { + flowctlcounters_query(&portid, info.port, mask); + goto 
done; + } + + if (info.vloppackets) { + vloppackets_query(&portid, info.port, mask); + goto done; + } + + if (info.vlopdata) { + vlopdata_query(&portid, info.port, mask); + goto done; + } + + if (info.vlxmitflowctlerrors) { + vlxmitflowctlerrors_query(&portid, info.port, mask); + goto done; + } + + if (info.vlxmitcounters) { + vlxmitcounters_query(&portid, info.port, mask); + goto done; + } + + if (info.swportvlcong) { + swportvlcong_query(&portid, info.port, mask); + goto done; + } + + if (info.rcvcc) { + rcvcc_query(&portid, info.port, mask); + goto done; + } + + if (info.slrcvfecn) { + slrcvfecn_query(&portid, info.port, mask); + goto done; + } + + if (info.slrcvbecn) { + slrcvbecn_query(&portid, info.port, mask); + goto done; + } + + if (info.xmitcc) { + xmitcc_query(&portid, info.port, mask); + goto done; + } + + if (info.vlxmittimecc) { + vlxmittimecc_query(&portid, info.port, mask); + goto done; + } + + if (info.smpl_ctl) { + dump_portsamples_control(&portid, info.port); + goto done; + } + + if (all_ports_loop || + (info.loop_ports && (info.all_ports || info.port == ALL_PORTS))) { + if (!smp_query_via(data, &portid, IB_ATTR_NODE_INFO, 0, 0, + srcport)) + IBEXIT("smp query nodeinfo failed"); + node_type = mad_get_field(data, 0, IB_NODE_TYPE_F); + mad_decode_field(data, IB_NODE_NPORTS_F, &num_ports); + if (!num_ports) + IBEXIT("smp query nodeinfo: num ports invalid"); + + if (node_type == IB_NODE_SWITCH) { + if (!smp_query_via(data, &portid, IB_ATTR_SWITCH_INFO, + 0, 0, srcport)) + IBEXIT("smp query nodeinfo failed"); + enhancedport0 = + mad_get_field(data, 0, IB_SW_ENHANCED_PORT0_F); + if (enhancedport0) + start_port = 0; + } + if (all_ports_loop && !info.loop_ports) + IBWARN + ("Emulating AllPortSelect by iterating through all ports"); + } + + if (info.reset_only) + goto do_reset; + + if (all_ports_loop || + (info.loop_ports && (info.all_ports || info.port == ALL_PORTS))) { + for (i = start_port; i <= num_ports; i++) + dump_perfcounters(info.extended, ibd_timeout, cap_mask, + cap_mask2, &portid, i, + (all_ports_loop && !info.loop_ports)); + if (all_ports_loop && !info.loop_ports) { + if (info.extended != 1) + output_aggregate_perfcounters(&portid, + cap_mask); + else + output_aggregate_perfcounters_ext(&portid, + cap_mask, cap_mask2); + } + } else if (info.ports_count > 1) { + for (i = 0; i < info.ports_count; i++) + dump_perfcounters(info.extended, ibd_timeout, cap_mask, + cap_mask2, &portid, info.ports[i], + (info.all_ports && !info.loop_ports)); + if (info.all_ports && !info.loop_ports) { + if (info.extended != 1) + output_aggregate_perfcounters(&portid, + cap_mask); + else + output_aggregate_perfcounters_ext(&portid, + cap_mask, cap_mask2); + } + } else + dump_perfcounters(info.extended, ibd_timeout, cap_mask, + cap_mask2, &portid, info.port, 0); + + if (!info.reset) + goto done; + +do_reset: + if (argc <= 2 && !info.extended) { + if (cap_mask & IB_PM_PC_XMIT_WAIT_SUP) + mask |= (1 << 16); /* reset portxmitwait */ + if (cap_mask & IB_PM_IS_QP1_DROP_SUP) + mask |= (1 << 17); /* reset qp1dropped */ + } + + if (info.extended) { + mask |= 0xfff0000; + if (cap_mask & IB_PM_PC_XMIT_WAIT_SUP) + mask |= (1 << 28); + if (cap_mask & IB_PM_IS_QP1_DROP_SUP) + mask |= (1 << 29); + } + + if (all_ports_loop || + (info.loop_ports && (info.all_ports || info.port == ALL_PORTS))) { + for (i = start_port; i <= num_ports; i++) + reset_counters(info.extended, ibd_timeout, mask, + &portid, i); + } else if (info.ports_count > 1) { + for (i = 0; i < info.ports_count; i++) + reset_counters(info.extended, 
ibd_timeout, mask, + &portid, info.ports[i]); + } else + reset_counters(info.extended, ibd_timeout, mask, &portid, info.port); + +done: + mad_rpc_close_port(srcport); + exit(0); +} diff --git a/infiniband-diags/saquery.c b/infiniband-diags/saquery.c new file mode 100644 index 0000000..224865f --- /dev/null +++ b/infiniband-diags/saquery.c @@ -0,0 +1,1917 @@ +/* + * Copyright (c) 2006,2007 The Regents of the University of California. + * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved. + * Copyright (c) 2002-2013 Mellanox Technologies LTD. All rights reserved. + * Copyright (c) 1996-2013 Intel Corporation. All rights reserved. + * Copyright (c) 2009 HNR Consulting. All rights reserved. + * + * Produced at Lawrence Livermore National Laboratory. + * Written by Ira Weiny <weiny2@llnl.gov>. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <unistd.h> +#include <stdio.h> +#include <arpa/inet.h> +#include <ctype.h> +#include <string.h> +#include <errno.h> + +#define _GNU_SOURCE + +#include <infiniband/umad.h> +#include <infiniband/mad.h> +#include <util/node_name_map.h> + +#include "ibdiag_common.h" +#include "ibdiag_sa.h" + +#ifndef IB_PR_COMPMASK_SERVICEID +#define IB_PR_COMPMASK_SERVICEID (IB_PR_COMPMASK_SERVICEID_MSB | \ + IB_PR_COMPMASK_SERVICEID_LSB) +#endif + +#define UMAD_SA_CAP_MASK2_IS_MCAST_TOP_SUP (1 << 3) + +struct query_params { + uint64_t service_id; + ibmad_gid_t sgid, dgid, gid, mgid; + uint16_t slid, dlid, mlid; + uint32_t flow_label; + int hop_limit; + uint8_t tclass; + int reversible, numb_path; + uint16_t pkey; + int qos_class, sl; + uint8_t mtu, rate, pkt_life; + uint32_t qkey; + uint8_t scope; + uint8_t join_state; + int proxy_join; + ib_class_port_info_t cpi; +}; + +struct query_cmd { + const char *name, *alias; + uint16_t query_type; + const char *usage; + int (*handler) (const struct query_cmd * q, struct sa_handle * h, + struct query_params * p, int argc, char *argv[]); +}; + +static char *node_name_map_file = NULL; +static nn_map_t *node_name_map = NULL; + +/** + * Declare some globals because I don't want this to be too complex. 
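+ * These cover the requested output mode (node_print_desc) and the
+ * name/lid/guid filters consumed by the node record routines below.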
+ */ +#define MAX_PORTS (8) +#define DEFAULT_SA_TIMEOUT_MS (1000) + +static enum { + ALL, + LID_ONLY, + UNIQUE_LID_ONLY, + GUID_ONLY, + ALL_DESC, + NAME_OF_LID, + NAME_OF_GUID, +} node_print_desc = ALL; + +static char *requested_name; +static uint16_t requested_lid; +static int requested_lid_flag; +static uint64_t requested_guid; +static int requested_guid_flag; + +static unsigned valid_gid(ibmad_gid_t * gid) +{ + ibmad_gid_t zero_gid; + memset(&zero_gid, 0, sizeof zero_gid); + return memcmp(&zero_gid, gid, sizeof(*gid)); +} + +static void print_node_desc(ib_node_record_t * node_record) +{ + ib_node_info_t *p_ni = &(node_record->node_info); + ib_node_desc_t *p_nd = &(node_record->node_desc); + char *name; + + if (p_ni->node_type == IB_NODE_TYPE_CA) { + name = remap_node_name(node_name_map, + be64toh(node_record->node_info.node_guid), + (char *)p_nd->description); + printf("%6d \"%s\"\n", be16toh(node_record->lid), name); + free(name); + } +} + +static void dump_node_record(void *data, struct query_params *p) +{ + ib_node_record_t *nr = data; + ib_node_info_t *ni = &nr->node_info; + char *name = remap_node_name(node_name_map, + be64toh(ni->node_guid), + (char *)nr->node_desc.description); + + printf("NodeRecord dump:\n" + "\t\tlid.....................%u\n" + "\t\treserved................0x%X\n" + "\t\tbase_version............0x%X\n" + "\t\tclass_version...........0x%X\n" + "\t\tnode_type...............%s\n" + "\t\tnum_ports...............%u\n" + "\t\tsys_guid................0x%016" PRIx64 "\n" + "\t\tnode_guid...............0x%016" PRIx64 "\n" + "\t\tport_guid...............0x%016" PRIx64 "\n" + "\t\tpartition_cap...........0x%X\n" + "\t\tdevice_id...............0x%X\n" + "\t\trevision................0x%X\n" + "\t\tport_num................%u\n" + "\t\tvendor_id...............0x%X\n" + "\t\tNodeDescription.........%s\n", + be16toh(nr->lid), be16toh(nr->resv), + ni->base_version, ni->class_version, + ib_get_node_type_str(ni->node_type), ni->num_ports, + be64toh(ni->sys_guid), be64toh(ni->node_guid), + be64toh(ni->port_guid), be16toh(ni->partition_cap), + be16toh(ni->device_id), be32toh(ni->revision), + ib_node_info_get_local_port_num(ni), + be32toh(ib_node_info_get_vendor_id(ni)), + name); + + free(name); +} + +static void print_node_record(ib_node_record_t * node_record) +{ + ib_node_info_t *p_ni = &node_record->node_info; + ib_node_desc_t *p_nd = &node_record->node_desc; + char *name; + + switch (node_print_desc) { + case LID_ONLY: + case UNIQUE_LID_ONLY: + printf("%u\n", be16toh(node_record->lid)); + return; + case GUID_ONLY: + printf("0x%016" PRIx64 "\n", be64toh(p_ni->port_guid)); + return; + case NAME_OF_LID: + case NAME_OF_GUID: + name = remap_node_name(node_name_map, + be64toh(p_ni->node_guid), + (char *)p_nd->description); + printf("%s\n", name); + free(name); + return; + case ALL: + default: + break; + } + + dump_node_record(node_record, NULL); +} + +static void dump_path_record(void *data, struct query_params *p) +{ + char gid_str[INET6_ADDRSTRLEN]; + char gid_str2[INET6_ADDRSTRLEN]; + ib_path_rec_t *p_pr = data; + printf("PathRecord dump:\n" + "\t\tservice_id..............0x%016" PRIx64 "\n" + "\t\tdgid....................%s\n" + "\t\tsgid....................%s\n" + "\t\tdlid....................%u\n" + "\t\tslid....................%u\n" + "\t\thop_flow_raw............0x%X\n" + "\t\ttclass..................0x%X\n" + "\t\tnum_path_revers.........0x%X\n" + "\t\tpkey....................0x%X\n" + "\t\tqos_class...............0x%X\n" + "\t\tsl......................0x%X\n" + 
"\t\tmtu.....................0x%X\n" + "\t\trate....................0x%X\n" + "\t\tpkt_life................0x%X\n" + "\t\tpreference..............0x%X\n" + "\t\tresv2...................0x%02X%02X%02X%02X%02X%02X\n", + be64toh(p_pr->service_id), + inet_ntop(AF_INET6, p_pr->dgid.raw, gid_str, sizeof gid_str), + inet_ntop(AF_INET6, p_pr->sgid.raw, gid_str2, sizeof gid_str2), + be16toh(p_pr->dlid), be16toh(p_pr->slid), + be32toh(p_pr->hop_flow_raw), p_pr->tclass, p_pr->num_path, + be16toh(p_pr->pkey), ib_path_rec_qos_class(p_pr), + ib_path_rec_sl(p_pr), p_pr->mtu, p_pr->rate, p_pr->pkt_life, + p_pr->preference, + p_pr->resv2[0], p_pr->resv2[1], p_pr->resv2[2], + p_pr->resv2[3], p_pr->resv2[4], p_pr->resv2[5]); +} + +static void dump_class_port_info(ib_class_port_info_t *cpi) +{ + char gid_str[INET6_ADDRSTRLEN]; + char gid_str2[INET6_ADDRSTRLEN]; + + printf("SA ClassPortInfo:\n" + "\t\tBase version.............%d\n" + "\t\tClass version............%d\n" + "\t\tCapability mask..........0x%04X\n" + "\t\tCapability mask 2........0x%08X\n" + "\t\tResponse time value......0x%02X\n" + "\t\tRedirect GID.............%s\n" + "\t\tRedirect TC/SL/FL........0x%08X\n" + "\t\tRedirect LID.............%u\n" + "\t\tRedirect PKey............0x%04X\n" + "\t\tRedirect QP..............0x%08X\n" + "\t\tRedirect QKey............0x%08X\n" + "\t\tTrap GID.................%s\n" + "\t\tTrap TC/SL/FL............0x%08X\n" + "\t\tTrap LID.................%u\n" + "\t\tTrap PKey................0x%04X\n" + "\t\tTrap HL/QP...............0x%08X\n" + "\t\tTrap QKey................0x%08X\n", + cpi->base_ver, cpi->class_ver, be16toh(cpi->cap_mask), + ib_class_cap_mask2(cpi), ib_class_resp_time_val(cpi), + inet_ntop(AF_INET6, &(cpi->redir_gid), gid_str, sizeof gid_str), + be32toh(cpi->redir_tc_sl_fl), be16toh(cpi->redir_lid), + be16toh(cpi->redir_pkey), be32toh(cpi->redir_qp), + be32toh(cpi->redir_qkey), + inet_ntop(AF_INET6, &(cpi->trap_gid), gid_str2, sizeof gid_str2), + be32toh(cpi->trap_tc_sl_fl), be16toh(cpi->trap_lid), + be16toh(cpi->trap_pkey), be32toh(cpi->trap_hop_qp), + be32toh(cpi->trap_qkey)); +} + +static void dump_portinfo_record(void *data, struct query_params *p) +{ + ib_portinfo_record_t *p_pir = data; + const ib_port_info_t *const p_pi = &p_pir->port_info; + + printf("PortInfoRecord dump:\n" + "\t\tEndPortLid..............%u\n" + "\t\tPortNum.................%u\n" + "\t\tbase_lid................%u\n" + "\t\tmaster_sm_base_lid......%u\n" + "\t\tcapability_mask.........0x%X\n", + be16toh(p_pir->lid), p_pir->port_num, + be16toh(p_pi->base_lid), be16toh(p_pi->master_sm_base_lid), + be32toh(p_pi->capability_mask)); +} + +static void dump_one_portinfo_record(void *data, struct query_params *p) +{ + ib_portinfo_record_t *pir = data; + ib_port_info_t *pi = &pir->port_info; + + printf("PortInfoRecord dump:\n" + "\tRID\n" + "\t\tEndPortLid..............%u\n" + "\t\tPortNum.................%u\n" + "\t\tOptions.................0x%x\n" + "\tPortInfo dump:\n", + be16toh(pir->lid), pir->port_num, pir->options); + dump_portinfo(pi, 2); +} + +static void dump_one_mcmember_record(void *data, struct query_params *p) +{ + char mgid[INET6_ADDRSTRLEN], gid[INET6_ADDRSTRLEN]; + ib_member_rec_t *mr = data; + uint32_t flow; + uint8_t sl, hop, scope, join; + ib_member_get_sl_flow_hop(mr->sl_flow_hop, &sl, &flow, &hop); + ib_member_get_scope_state(mr->scope_state, &scope, &join); + printf("MCMember Record dump:\n" + "\t\tMGID....................%s\n" + "\t\tPortGid.................%s\n" + "\t\tqkey....................0x%x\n" + 
"\t\tmlid....................0x%x\n" + "\t\tmtu.....................0x%x\n" + "\t\tTClass..................0x%x\n" + "\t\tpkey....................0x%x\n" + "\t\trate....................0x%x\n" + "\t\tpkt_life................0x%x\n" + "\t\tSL......................0x%x\n" + "\t\tFlowLabel...............0x%x\n" + "\t\tHopLimit................0x%x\n" + "\t\tScope...................0x%x\n" + "\t\tJoinState...............0x%x\n" + "\t\tProxyJoin...............0x%x\n", + inet_ntop(AF_INET6, mr->mgid.raw, mgid, sizeof(mgid)), + inet_ntop(AF_INET6, mr->port_gid.raw, gid, sizeof(gid)), + be32toh(mr->qkey), be16toh(mr->mlid), mr->mtu, mr->tclass, + be16toh(mr->pkey), mr->rate, mr->pkt_life, sl, + flow, hop, scope, join, mr->proxy_join); +} + +static void dump_multicast_group_record(void *data, struct query_params *p) +{ + char gid_str[INET6_ADDRSTRLEN]; + ib_member_rec_t *p_mcmr = data; + uint8_t sl; + ib_member_get_sl_flow_hop(p_mcmr->sl_flow_hop, &sl, NULL, NULL); + printf("MCMemberRecord group dump:\n" + "\t\tMGID....................%s\n" + "\t\tMlid....................0x%X\n" + "\t\tMtu.....................0x%X\n" + "\t\tpkey....................0x%X\n" + "\t\tRate....................0x%X\n" + "\t\tSL......................0x%X\n", + inet_ntop(AF_INET6, p_mcmr->mgid.raw, gid_str, sizeof gid_str), + be16toh(p_mcmr->mlid), + p_mcmr->mtu, be16toh(p_mcmr->pkey), p_mcmr->rate, sl); +} + +static void dump_multicast_member_record(ib_member_rec_t *p_mcmr, + struct sa_query_result *nr_result, + struct query_params *params) +{ + char gid_str[INET6_ADDRSTRLEN]; + char gid_str2[INET6_ADDRSTRLEN]; + uint16_t mlid = be16toh(p_mcmr->mlid); + unsigned i = 0; + char *node_name = strdup("<unknown>"); + + /* go through the node records searching for a port guid which matches + * this port gid interface id. + * This gives us a node name to print, if available. 
+ */ + for (i = 0; i < nr_result->result_cnt; i++) { + ib_node_record_t *nr = sa_get_query_rec(nr_result->p_result_madw, i); + if (nr->node_info.port_guid == + p_mcmr->port_gid.unicast.interface_id) { + if(node_name != NULL) + free(node_name); + node_name = remap_node_name(node_name_map, + be64toh(nr->node_info.node_guid), + (char *)nr->node_desc.description); + break; + } + } + + if (requested_name) { + if (strtol(requested_name, NULL, 0) == mlid) + printf("\t\tPortGid.................%s (%s)\n", + inet_ntop(AF_INET6, p_mcmr->port_gid.raw, + gid_str, sizeof gid_str), node_name); + } else { + printf("MCMemberRecord member dump:\n" + "\t\tMGID....................%s\n" + "\t\tMlid....................0x%X\n" + "\t\tPortGid.................%s\n" + "\t\tScopeState..............0x%X\n" + "\t\tProxyJoin...............0x%X\n" + "\t\tNodeDescription.........%s\n", + inet_ntop(AF_INET6, p_mcmr->mgid.raw, gid_str, + sizeof gid_str), + be16toh(p_mcmr->mlid), + inet_ntop(AF_INET6, p_mcmr->port_gid.raw, + gid_str2, sizeof gid_str2), + p_mcmr->scope_state, p_mcmr->proxy_join, node_name); + } + free(node_name); +} + +static void dump_service_record(void *data, struct query_params *p) +{ + char gid[INET6_ADDRSTRLEN]; + char buf_service_key[35]; + char buf_service_name[65]; + ib_service_record_t *p_sr = data; + + sprintf(buf_service_key, + "0x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x", + p_sr->service_key[0], p_sr->service_key[1], + p_sr->service_key[2], p_sr->service_key[3], + p_sr->service_key[4], p_sr->service_key[5], + p_sr->service_key[6], p_sr->service_key[7], + p_sr->service_key[8], p_sr->service_key[9], + p_sr->service_key[10], p_sr->service_key[11], + p_sr->service_key[12], p_sr->service_key[13], + p_sr->service_key[14], p_sr->service_key[15]); + strncpy(buf_service_name, (char *)p_sr->service_name, 64); + buf_service_name[64] = '\0'; + + printf("ServiceRecord dump:\n" + "\t\tServiceID...............0x%016" PRIx64 "\n" + "\t\tServiceGID..............%s\n" + "\t\tServiceP_Key............0x%X\n" + "\t\tServiceLease............0x%X\n" + "\t\tServiceKey..............%s\n" + "\t\tServiceName.............%s\n" + "\t\tServiceData8.1..........0x%X\n" + "\t\tServiceData8.2..........0x%X\n" + "\t\tServiceData8.3..........0x%X\n" + "\t\tServiceData8.4..........0x%X\n" + "\t\tServiceData8.5..........0x%X\n" + "\t\tServiceData8.6..........0x%X\n" + "\t\tServiceData8.7..........0x%X\n" + "\t\tServiceData8.8..........0x%X\n" + "\t\tServiceData8.9..........0x%X\n" + "\t\tServiceData8.10.........0x%X\n" + "\t\tServiceData8.11.........0x%X\n" + "\t\tServiceData8.12.........0x%X\n" + "\t\tServiceData8.13.........0x%X\n" + "\t\tServiceData8.14.........0x%X\n" + "\t\tServiceData8.15.........0x%X\n" + "\t\tServiceData8.16.........0x%X\n" + "\t\tServiceData16.1.........0x%X\n" + "\t\tServiceData16.2.........0x%X\n" + "\t\tServiceData16.3.........0x%X\n" + "\t\tServiceData16.4.........0x%X\n" + "\t\tServiceData16.5.........0x%X\n" + "\t\tServiceData16.6.........0x%X\n" + "\t\tServiceData16.7.........0x%X\n" + "\t\tServiceData16.8.........0x%X\n" + "\t\tServiceData32.1.........0x%X\n" + "\t\tServiceData32.2.........0x%X\n" + "\t\tServiceData32.3.........0x%X\n" + "\t\tServiceData32.4.........0x%X\n" + "\t\tServiceData64.1.........0x%016" PRIx64 "\n" + "\t\tServiceData64.2.........0x%016" PRIx64 "\n", + be64toh(p_sr->service_id), + inet_ntop(AF_INET6, p_sr->service_gid.raw, gid, sizeof gid), + be16toh(p_sr->service_pkey), be32toh(p_sr->service_lease), + (show_keys ? 
buf_service_key : NOT_DISPLAYED_STR), + buf_service_name, + p_sr->service_data8[0], p_sr->service_data8[1], + p_sr->service_data8[2], p_sr->service_data8[3], + p_sr->service_data8[4], p_sr->service_data8[5], + p_sr->service_data8[6], p_sr->service_data8[7], + p_sr->service_data8[8], p_sr->service_data8[9], + p_sr->service_data8[10], p_sr->service_data8[11], + p_sr->service_data8[12], p_sr->service_data8[13], + p_sr->service_data8[14], p_sr->service_data8[15], + be16toh(p_sr->service_data16[0]), + be16toh(p_sr->service_data16[1]), + be16toh(p_sr->service_data16[2]), + be16toh(p_sr->service_data16[3]), + be16toh(p_sr->service_data16[4]), + be16toh(p_sr->service_data16[5]), + be16toh(p_sr->service_data16[6]), + be16toh(p_sr->service_data16[7]), + be32toh(p_sr->service_data32[0]), + be32toh(p_sr->service_data32[1]), + be32toh(p_sr->service_data32[2]), + be32toh(p_sr->service_data32[3]), + be64toh(p_sr->service_data64[0]), + be64toh(p_sr->service_data64[1])); +} + +static void dump_sm_info_record(void *data, struct query_params *p) +{ + ib_sminfo_record_t *p_smr = data; + const ib_sm_info_t *const p_smi = &p_smr->sm_info; + uint8_t priority, state; + priority = ib_sminfo_get_priority(p_smi); + state = ib_sminfo_get_state(p_smi); + + printf("SMInfoRecord dump:\n" + "\t\tRID\n" + "\t\tLID...................%u\n" + "\t\tSMInfo dump:\n" + "\t\tGUID..................0x%016" PRIx64 "\n" + "\t\tSM_Key................0x%016" PRIx64 "\n" + "\t\tActCount..............%u\n" + "\t\tPriority..............%u\n" + "\t\tSMState...............%u\n", + be16toh(p_smr->lid), + be64toh(p_smr->sm_info.guid), + be64toh(p_smr->sm_info.sm_key), + be32toh(p_smr->sm_info.act_count), + priority, state); +} + +static void dump_switch_info_record(void *data, struct query_params *p) +{ + ib_switch_info_record_t *p_sir = data; + uint32_t sa_cap_mask2 = ib_class_cap_mask2(&p->cpi); + + printf("SwitchInfoRecord dump:\n" + "\t\tRID\n" + "\t\tLID.....................................%u\n" + "\t\tSwitchInfo dump:\n" + "\t\tLinearFDBCap............................0x%X\n" + "\t\tRandomFDBCap............................0x%X\n" + "\t\tMulticastFDBCap.........................0x%X\n" + "\t\tLinearFDBTop............................0x%X\n" + "\t\tDefaultPort.............................%u\n" + "\t\tDefaultMulticastPrimaryPort.............%u\n" + "\t\tDefaultMulticastNotPrimaryPort..........%u\n" + "\t\tLifeTimeValue/PortStateChange/OpSL2VL...0x%X\n" + "\t\tLIDsPerPort.............................0x%X\n" + "\t\tPartitionEnforcementCap.................0x%X\n" + "\t\tflags...................................0x%X\n", + be16toh(p_sir->lid), + be16toh(p_sir->switch_info.lin_cap), + be16toh(p_sir->switch_info.rand_cap), + be16toh(p_sir->switch_info.mcast_cap), + be16toh(p_sir->switch_info.lin_top), + p_sir->switch_info.def_port, + p_sir->switch_info.def_mcast_pri_port, + p_sir->switch_info.def_mcast_not_port, + p_sir->switch_info.life_state, + be16toh(p_sir->switch_info.lids_per_port), + be16toh(p_sir->switch_info.enforce_cap), + p_sir->switch_info.flags); + if (sa_cap_mask2 & UMAD_SA_CAP_MASK2_IS_MCAST_TOP_SUP) + printf("\t\tMulticastFDBTop.........................0x%X\n", + be16toh(p_sir->switch_info.mcast_top)); +} + +static void dump_inform_info_record(void *data, struct query_params *p) +{ + char gid_str[INET6_ADDRSTRLEN]; + char gid_str2[INET6_ADDRSTRLEN]; + ib_inform_info_record_t *p_iir = data; + __be32 qpn; + uint8_t resp_time_val; + + ib_inform_info_get_qpn_resp_time(p_iir->inform_info.g_or_v. 
+ generic.qpn_resp_time_val, &qpn, + &resp_time_val); + if (p_iir->inform_info.is_generic) { + printf("InformInfoRecord dump:\n" + "\t\tRID\n" + "\t\tSubscriberGID...........%s\n" + "\t\tSubscriberEnum..........0x%X\n" + "\t\tInformInfo dump:\n" + "\t\tgid.....................%s\n" + "\t\tlid_range_begin.........%u\n" + "\t\tlid_range_end...........%u\n" + "\t\tis_generic..............0x%X\n" + "\t\tsubscribe...............0x%X\n" + "\t\ttrap_type...............0x%X\n" + "\t\ttrap_num................%u\n", + inet_ntop(AF_INET6, p_iir->subscriber_gid.raw, gid_str, + sizeof gid_str), + be16toh(p_iir->subscriber_enum), + inet_ntop(AF_INET6, p_iir->inform_info.gid.raw, gid_str2, + sizeof gid_str2), + be16toh(p_iir->inform_info.lid_range_begin), + be16toh(p_iir->inform_info.lid_range_end), + p_iir->inform_info.is_generic, + p_iir->inform_info.subscribe, + be16toh(p_iir->inform_info.trap_type), + be16toh(p_iir->inform_info.g_or_v.generic.trap_num)); + if (show_keys) { + printf("\t\tqpn.....................0x%06X\n", + be32toh(qpn)); + } else { + printf("\t\tqpn....................." + NOT_DISPLAYED_STR "\n"); + } + printf("\t\tresp_time_val...........0x%X\n" + "\t\tnode_type...............0x%06X\n", + resp_time_val, + be32toh(ib_inform_info_get_prod_type + (&p_iir->inform_info))); + } else { + printf("InformInfoRecord dump:\n" + "\t\tRID\n" + "\t\tSubscriberGID...........%s\n" + "\t\tSubscriberEnum..........0x%X\n" + "\t\tInformInfo dump:\n" + "\t\tgid.....................%s\n" + "\t\tlid_range_begin.........%u\n" + "\t\tlid_range_end...........%u\n" + "\t\tis_generic..............0x%X\n" + "\t\tsubscribe...............0x%X\n" + "\t\ttrap_type...............0x%X\n" + "\t\tdev_id..................0x%X\n", + inet_ntop(AF_INET6, p_iir->subscriber_gid.raw, gid_str, + sizeof gid_str), + be16toh(p_iir->subscriber_enum), + inet_ntop(AF_INET6, p_iir->inform_info.gid.raw, + gid_str2, sizeof gid_str2), + be16toh(p_iir->inform_info.lid_range_begin), + be16toh(p_iir->inform_info.lid_range_end), + p_iir->inform_info.is_generic, + p_iir->inform_info.subscribe, + be16toh(p_iir->inform_info.trap_type), + be16toh(p_iir->inform_info.g_or_v.vend.dev_id)); + if (show_keys) { + printf("\t\tqpn.....................0x%06X\n", + be32toh(qpn)); + } else { + printf("\t\tqpn....................." 
+ NOT_DISPLAYED_STR "\n"); + } + printf("\t\tresp_time_val...........0x%X\n" + "\t\tvendor_id...............0x%06X\n", + resp_time_val, + be32toh(ib_inform_info_get_prod_type + (&p_iir->inform_info))); + } +} + +static void dump_one_link_record(void *data, struct query_params *p) +{ + ib_link_record_t *lr = data; + printf("LinkRecord dump:\n" + "\t\tFromLID....................%u\n" + "\t\tFromPort...................%u\n" + "\t\tToPort.....................%u\n" + "\t\tToLID......................%u\n", + be16toh(lr->from_lid), lr->from_port_num, + lr->to_port_num, be16toh(lr->to_lid)); +} + +static void dump_one_slvl_record(void *data, struct query_params *p) +{ + ib_slvl_table_record_t *slvl = data; + ib_slvl_table_t *t = &slvl->slvl_tbl; + printf("SL2VLTableRecord dump:\n" + "\t\tLID........................%u\n" + "\t\tInPort.....................%u\n" + "\t\tOutPort....................%u\n" + "\t\tSL: 0| 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15|\n" + "\t\tVL:%2u|%2u|%2u|%2u|%2u|%2u|%2u|%2u|%2u|%2u|%2u|%2u|%2u" + "|%2u|%2u|%2u|\n", + be16toh(slvl->lid), slvl->in_port_num, slvl->out_port_num, + ib_slvl_table_get(t, 0), ib_slvl_table_get(t, 1), + ib_slvl_table_get(t, 2), ib_slvl_table_get(t, 3), + ib_slvl_table_get(t, 4), ib_slvl_table_get(t, 5), + ib_slvl_table_get(t, 6), ib_slvl_table_get(t, 7), + ib_slvl_table_get(t, 8), ib_slvl_table_get(t, 9), + ib_slvl_table_get(t, 10), ib_slvl_table_get(t, 11), + ib_slvl_table_get(t, 12), ib_slvl_table_get(t, 13), + ib_slvl_table_get(t, 14), ib_slvl_table_get(t, 15)); +} + +static void dump_one_vlarb_record(void *data, struct query_params *p) +{ + ib_vl_arb_table_record_t *vlarb = data; + ib_vl_arb_element_t *e = vlarb->vl_arb_tbl.vl_entry; + int i; + printf("VLArbTableRecord dump:\n" + "\t\tLID........................%u\n" + "\t\tPort.......................%u\n" + "\t\tBlock......................%u\n", + be16toh(vlarb->lid), vlarb->port_num, vlarb->block_num); + for (i = 0; i < 32; i += 16) + printf("\t\tVL :%2u|%2u|%2u|%2u|%2u|%2u|%2u|%2u|" + "%2u|%2u|%2u|%2u|%2u|%2u|%2u|%2u|\n" + "\t\tWeight:%2u|%2u|%2u|%2u|%2u|%2u|%2u|%2u|" + "%2u|%2u|%2u|%2u|%2u|%2u|%2u|%2u|\n", + e[i + 0].vl, e[i + 1].vl, e[i + 2].vl, e[i + 3].vl, + e[i + 4].vl, e[i + 5].vl, e[i + 6].vl, e[i + 7].vl, + e[i + 8].vl, e[i + 9].vl, e[i + 10].vl, e[i + 11].vl, + e[i + 12].vl, e[i + 13].vl, e[i + 14].vl, e[i + 15].vl, + e[i + 0].weight, e[i + 1].weight, e[i + 2].weight, + e[i + 3].weight, e[i + 4].weight, e[i + 5].weight, + e[i + 6].weight, e[i + 7].weight, e[i + 8].weight, + e[i + 9].weight, e[i + 10].weight, e[i + 11].weight, + e[i + 12].weight, e[i + 13].weight, e[i + 14].weight, + e[i + 15].weight); +} + +static void dump_one_pkey_tbl_record(void *data, struct query_params *params) +{ + ib_pkey_table_record_t *pktr = data; + __be16 *p = pktr->pkey_tbl.pkey_entry; + int i; + printf("PKeyTableRecord dump:\n" + "\t\tLID........................%u\n" + "\t\tPort.......................%u\n" + "\t\tBlock......................%u\n" + "\t\tPKey Table:\n", + be16toh(pktr->lid), pktr->port_num, pktr->block_num); + for (i = 0; i < 32; i += 8) + printf("\t\t0x%04x 0x%04x 0x%04x 0x%04x" + " 0x%04x 0x%04x 0x%04x 0x%04x\n", + be16toh(p[i + 0]), be16toh(p[i + 1]), + be16toh(p[i + 2]), be16toh(p[i + 3]), + be16toh(p[i + 4]), be16toh(p[i + 5]), + be16toh(p[i + 6]), be16toh(p[i + 7])); + printf("\n"); +} + +static void dump_one_lft_record(void *data, struct query_params *p) +{ + ib_lft_record_t *lftr = data; + unsigned block = be16toh(lftr->block_num); + int i; + printf("LFT Record dump:\n" + 
"\t\tLID........................%u\n" + "\t\tBlock......................%u\n" + "\t\tLFT:\n\t\tLID\tPort Number\n", be16toh(lftr->lid), block); + for (i = 0; i < 64; i++) + printf("\t\t%u\t%u\n", block * 64 + i, lftr->lft[i]); + printf("\n"); +} + +static void dump_one_guidinfo_record(void *data, struct query_params *p) +{ + ib_guidinfo_record_t *gir = data; + printf("GUIDInfo Record dump:\n" + "\t\tLID........................%u\n" + "\t\tBlock......................%u\n" + "\t\tGUID 0.....................0x%016" PRIx64 "\n" + "\t\tGUID 1.....................0x%016" PRIx64 "\n" + "\t\tGUID 2.....................0x%016" PRIx64 "\n" + "\t\tGUID 3.....................0x%016" PRIx64 "\n" + "\t\tGUID 4.....................0x%016" PRIx64 "\n" + "\t\tGUID 5.....................0x%016" PRIx64 "\n" + "\t\tGUID 6.....................0x%016" PRIx64 "\n" + "\t\tGUID 7.....................0x%016" PRIx64 "\n", + be16toh(gir->lid), gir->block_num, + be64toh(gir->guid_info.guid[0]), + be64toh(gir->guid_info.guid[1]), + be64toh(gir->guid_info.guid[2]), + be64toh(gir->guid_info.guid[3]), + be64toh(gir->guid_info.guid[4]), + be64toh(gir->guid_info.guid[5]), + be64toh(gir->guid_info.guid[6]), + be64toh(gir->guid_info.guid[7])); +} + +static void dump_one_mft_record(void *data, struct query_params *p) +{ + ib_mft_record_t *mftr = data; + unsigned position = be16toh(mftr->position_block_num) >> 12; + unsigned block = be16toh(mftr->position_block_num) & + IB_MCAST_BLOCK_ID_MASK_HO; + int i; + unsigned offset; + + printf("MFT Record dump:\n" + "\t\tLID........................%u\n" + "\t\tPosition...................%u\n" + "\t\tBlock......................%u\n" + "\t\tMFT:\n\t\tMLID\tPort Mask\n", + be16toh(mftr->lid), position, block); + offset = IB_LID_MCAST_START_HO + block * 32; + for (i = 0; i < IB_MCAST_BLOCK_SIZE; i++) + printf("\t\t0x%04x\t0x%04x\n", + offset + i, be16toh(mftr->mft[i])); + printf("\n"); +} + +static void dump_results(struct sa_query_result *r, + void (*dump_func) (void *, struct query_params *), + struct query_params *p) +{ + unsigned i; + for (i = 0; i < r->result_cnt; i++) { + void *data = sa_get_query_rec(r->p_result_madw, i); + dump_func(data, p); + } +} + +/** + * Get any record(s) + */ +static int get_any_records(struct sa_handle * h, + uint16_t attr_id, uint32_t attr_mod, + __be64 comp_mask, void *attr, + size_t attr_size, + struct sa_query_result *result) +{ + int ret = sa_query(h, IB_MAD_METHOD_GET_TABLE, attr_id, attr_mod, + be64toh(comp_mask), ibd_sakey, attr, attr_size, result); + if (ret) { + fprintf(stderr, "Query SA failed: %s\n", strerror(ret)); + return ret; + } + + if (result->status != IB_SA_MAD_STATUS_SUCCESS) { + sa_report_err(result->status); + return EIO; + } + + return ret; +} + +static int get_and_dump_any_records(struct sa_handle * h, uint16_t attr_id, + uint32_t attr_mod, __be64 comp_mask, + void *attr, + size_t attr_size, + void (*dump_func) (void *, + struct query_params *), + struct query_params *p) +{ + struct sa_query_result result; + int ret = get_any_records(h, attr_id, attr_mod, comp_mask, attr, + attr_size, &result); + if (ret) + return ret; + + dump_results(&result, dump_func, p); + sa_free_result_mad(&result); + return 0; +} + +/** + * Get all the records available for requested query type. 
+ */ +static int get_all_records(struct sa_handle * h, uint16_t attr_id, + struct sa_query_result *result) +{ + return get_any_records(h, attr_id, 0, 0, NULL, 0, result); +} + +static int get_and_dump_all_records(struct sa_handle * h, uint16_t attr_id, + void (*dump_func) (void *, + struct query_params *p), + struct query_params *p) +{ + struct sa_query_result result; + int ret = get_all_records(h, attr_id, &result); + if (ret) + return ret; + + dump_results(&result, dump_func, p); + sa_free_result_mad(&result); + return ret; +} + +/** + * return the lid from the node descriptor (name) supplied + */ +static int get_lid_from_name(struct sa_handle * h, const char *name, uint16_t * lid) +{ + ib_node_record_t *node_record = NULL; + unsigned i; + int ret; + struct sa_query_result result; + + ret = get_all_records(h, IB_SA_ATTR_NODERECORD, &result); + if (ret) + return ret; + + ret = ENONET; + for (i = 0; i < result.result_cnt; i++) { + node_record = sa_get_query_rec(result.p_result_madw, i); + if (name + && strncmp(name, (char *)node_record->node_desc.description, + sizeof(node_record->node_desc.description)) == + 0) { + *lid = be16toh(node_record->lid); + ret = 0; + break; + } + } + sa_free_result_mad(&result); + return ret; +} + +static uint16_t get_lid(struct sa_handle * h, const char *name) +{ + int rc = 0; + uint16_t rc_lid = 0; + + if (!name) + return 0; + if (isalpha(name[0])) { + if ((rc = get_lid_from_name(h, name, &rc_lid)) != 0) { + fprintf(stderr, "Failed to find lid for \"%s\": %s\n", + name, strerror(rc)); + exit(rc); + } + } else { + long val; + errno = 0; + val = strtol(name, NULL, 0); + if (errno != 0 || val <= 0 || val > UINT16_MAX) { + fprintf(stderr, "Invalid lid specified: \"%s\"\n", name); + exit(EINVAL); + } + rc_lid = (uint16_t)val; + } + + return rc_lid; +} + +static int parse_iir_subscriber_gid(char *str, ib_inform_info_record_t *ir) +{ + int rc = inet_pton(AF_INET6,str,&(ir->subscriber_gid.raw)); + if(rc < 1){ + fprintf(stderr, "Invalid SubscriberGID specified: \"%s\"\n",str); + exit(EINVAL); + } + return rc; +} + +static int parse_lid_and_ports(struct sa_handle * h, + char *str, int *lid, int *port1, int *port2) +{ + char *p, *e; + + if (port1) + *port1 = -1; + if (port2) + *port2 = -1; + + p = strchr(str, '/'); + if (p) + *p = '\0'; + if (lid) + *lid = get_lid(h, str); + + if (!p) + return 0; + str = p + 1; + p = strchr(str, '/'); + if (p) + *p = '\0'; + if (port1) { + *port1 = strtoul(str, &e, 0); + if (e == str) + *port1 = -1; + } + + if (!p) + return 0; + str = p + 1; + if (port2) { + *port2 = strtoul(str, &e, 0); + if (e == str) + *port2 = -1; + } + + return 0; +} + +/* + * Get the portinfo records available with IsSM or IsSMdisabled CapabilityMask bit on. 
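+ * Per the SA's PortInfoRecord matching rules, setting the high bit of the
+ * attribute modifier (the 1 << 31 passed below) requests "match if any of
+ * the given CapabilityMask bits are set" semantics rather than an exact
+ * mask comparison, so one query covers all matching ports.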
+ */ +static int get_issm_records(struct sa_handle * h, __be32 capability_mask, + struct sa_query_result *result) +{ + ib_portinfo_record_t attr; + + memset(&attr, 0, sizeof(attr)); + attr.port_info.capability_mask = capability_mask; + + return get_any_records(h, IB_SA_ATTR_PORTINFORECORD, 1 << 31, + IB_PIR_COMPMASK_CAPMASK, &attr, sizeof(attr), result); +} + +static int print_node_records(struct sa_handle * h, struct query_params *p) +{ + unsigned i; + int ret; + struct sa_query_result result; + + ret = get_all_records(h, IB_SA_ATTR_NODERECORD, &result); + if (ret) + return ret; + + if (node_print_desc == ALL_DESC) { + printf(" LID \"name\"\n"); + printf("================\n"); + } + for (i = 0; i < result.result_cnt; i++) { + ib_node_record_t *node_record; + node_record = sa_get_query_rec(result.p_result_madw, i); + if (node_print_desc == ALL_DESC) { + print_node_desc(node_record); + } else if (node_print_desc == NAME_OF_LID) { + if (requested_lid == be16toh(node_record->lid)) + print_node_record(node_record); + } else if (node_print_desc == NAME_OF_GUID) { + ib_node_info_t *p_ni = &(node_record->node_info); + + if (requested_guid == be64toh(p_ni->port_guid)) + print_node_record(node_record); + } else { + ib_node_info_t *p_ni = &(node_record->node_info); + ib_node_desc_t *p_nd = &(node_record->node_desc); + char *name; + + name = remap_node_name (node_name_map, + be64toh(p_ni->node_guid), + (char *)p_nd->description); + + if (!requested_name || + (strncmp(requested_name, + (char *)node_record->node_desc.description, + sizeof(node_record-> + node_desc.description)) == 0) || + (strncmp(requested_name, + name, + sizeof(node_record-> + node_desc.description)) == 0)) { + print_node_record(node_record); + if (node_print_desc == UNIQUE_LID_ONLY) { + sa_free_result_mad(&result); + exit(0); + } + } + + free(name); + } + } + sa_free_result_mad(&result); + return ret; +} + +static int query_path_records(const struct query_cmd *q, struct sa_handle * h, + struct query_params *p, int argc, char *argv[]) +{ + ib_path_rec_t pr; + __be64 comp_mask = 0; + uint32_t flow = 0; + int qos_class = 0; + uint8_t reversible = 0; + + memset(&pr, 0, sizeof(pr)); + CHECK_AND_SET_VAL(p->service_id, 64, 0, pr.service_id, PR, SERVICEID); + CHECK_AND_SET_GID(p->sgid, pr.sgid, PR, SGID); + CHECK_AND_SET_GID(p->dgid, pr.dgid, PR, DGID); + CHECK_AND_SET_VAL(p->slid, 16, 0, pr.slid, PR, SLID); + CHECK_AND_SET_VAL(p->dlid, 16, 0, pr.dlid, PR, DLID); + CHECK_AND_SET_VAL(p->hop_limit, 32, -1, pr.hop_flow_raw, PR, HOPLIMIT); + CHECK_AND_SET_VAL(p->flow_label, 8, 0, flow, PR, FLOWLABEL); + pr.hop_flow_raw |= htobe32(flow << 8); + CHECK_AND_SET_VAL(p->tclass, 8, 0, pr.tclass, PR, TCLASS); + CHECK_AND_SET_VAL(p->reversible, 8, -1, reversible, PR, REVERSIBLE); + CHECK_AND_SET_VAL(p->numb_path, 8, -1, pr.num_path, PR, NUMBPATH); + pr.num_path |= reversible << 7; + CHECK_AND_SET_VAL(p->pkey, 16, 0, pr.pkey, PR, PKEY); + CHECK_AND_SET_VAL(p->sl, 16, -1, pr.qos_class_sl, PR, SL); + + if (p->qos_class != -1) { + qos_class = p->qos_class; + comp_mask |= IB_PR_COMPMASK_QOS_CLASS; + } + ib_path_rec_set_qos_class(&pr, qos_class); + + CHECK_AND_SET_VAL_AND_SEL(p->mtu, pr.mtu, PR, MTU, SELEC); + CHECK_AND_SET_VAL_AND_SEL(p->rate, pr.rate, PR, RATE, SELEC); + CHECK_AND_SET_VAL_AND_SEL(p->pkt_life, pr.pkt_life, PR, PKTLIFETIME, + SELEC); + + return get_and_dump_any_records(h, IB_SA_ATTR_PATHRECORD, 0, comp_mask, + &pr, sizeof(pr), dump_path_record, p); +} + +static int print_issm_records(struct sa_handle * h, struct query_params *p) +{ + struct 
sa_query_result result; + int ret = 0; + + /* First, get IsSM records */ + ret = get_issm_records(h, IB_PORT_CAP_IS_SM, &result); + if (ret != 0) + return (ret); + + printf("IsSM ports\n"); + dump_results(&result, dump_portinfo_record, p); + sa_free_result_mad(&result); + + /* Now, get IsSMdisabled records */ + ret = get_issm_records(h, IB_PORT_CAP_SM_DISAB, &result); + if (ret != 0) + return (ret); + + printf("\nIsSMdisabled ports\n"); + dump_results(&result, dump_portinfo_record, p); + sa_free_result_mad(&result); + + return (ret); +} + +static int print_multicast_member_records(struct sa_handle * h, + struct query_params *params) +{ + struct sa_query_result mc_group_result; + struct sa_query_result nr_result; + int ret; + unsigned i; + + ret = get_all_records(h, IB_SA_ATTR_MCRECORD, &mc_group_result); + if (ret) + return ret; + + ret = get_all_records(h, IB_SA_ATTR_NODERECORD, &nr_result); + if (ret) + goto return_mc; + + for (i = 0; i < mc_group_result.result_cnt; i++) { + ib_member_rec_t *rec = (ib_member_rec_t *) + sa_get_query_rec(mc_group_result.p_result_madw, + i); + dump_multicast_member_record(rec, &nr_result, params); + } + + sa_free_result_mad(&nr_result); + +return_mc: + sa_free_result_mad(&mc_group_result); + + return ret; +} + +static int print_multicast_group_records(struct sa_handle * h, + struct query_params *p) +{ + return get_and_dump_all_records(h, IB_SA_ATTR_MCRECORD, + dump_multicast_group_record, p); +} + +static int query_class_port_info(const struct query_cmd *q, struct sa_handle * h, + struct query_params *p, int argc, char *argv[]) +{ + dump_class_port_info(&p->cpi); + return (0); +} + +static int query_node_records(const struct query_cmd *q, struct sa_handle * h, + struct query_params *p, int argc, char *argv[]) +{ + ib_node_record_t nr; + __be64 comp_mask = 0; + int lid = 0; + + if (argc > 0) + parse_lid_and_ports(h, argv[0], &lid, NULL, NULL); + + memset(&nr, 0, sizeof(nr)); + CHECK_AND_SET_VAL(lid, 16, 0, nr.lid, NR, LID); + + return get_and_dump_any_records(h, IB_SA_ATTR_NODERECORD, 0, comp_mask, + &nr, sizeof(nr), dump_node_record, p); +} + +static int query_portinfo_records(const struct query_cmd *q, + struct sa_handle * h, struct query_params *p, + int argc, char *argv[]) +{ + ib_portinfo_record_t pir; + __be64 comp_mask = 0; + int lid = 0, port = -1, options = -1; + + if (argc > 0) + parse_lid_and_ports(h, argv[0], &lid, &port, &options); + + memset(&pir, 0, sizeof(pir)); + CHECK_AND_SET_VAL(lid, 16, 0, pir.lid, PIR, LID); + CHECK_AND_SET_VAL(port, 8, -1, pir.port_num, PIR, PORTNUM); + CHECK_AND_SET_VAL(options, 8, -1, pir.options, PIR, OPTIONS); + + return get_and_dump_any_records(h, IB_SA_ATTR_PORTINFORECORD, 0, + comp_mask, &pir, sizeof(pir), + dump_one_portinfo_record, p); +} + +static int query_mcmember_records(const struct query_cmd *q, + struct sa_handle * h, struct query_params *p, + int argc, char *argv[]) +{ + ib_member_rec_t mr; + __be64 comp_mask = 0; + uint32_t flow = 0; + uint8_t sl = 0, hop = 0, scope = 0; + + memset(&mr, 0, sizeof(mr)); + CHECK_AND_SET_GID(p->mgid, mr.mgid, MCR, MGID); + CHECK_AND_SET_GID(p->gid, mr.port_gid, MCR, PORT_GID); + CHECK_AND_SET_VAL(p->mlid, 16, 0, mr.mlid, MCR, MLID); + CHECK_AND_SET_VAL(p->qkey, 32, 0, mr.qkey, MCR, QKEY); + CHECK_AND_SET_VAL_AND_SEL(p->mtu, mr.mtu, MCR, MTU, _SEL); + CHECK_AND_SET_VAL_AND_SEL(p->rate, mr.rate, MCR, RATE, _SEL); + CHECK_AND_SET_VAL_AND_SEL(p->pkt_life, mr.pkt_life, MCR, LIFE, _SEL); + CHECK_AND_SET_VAL(p->tclass, 8, 0, mr.tclass, MCR, TCLASS); + CHECK_AND_SET_VAL(p->pkey, 
16, 0, mr.pkey, MCR, PKEY); + CHECK_AND_SET_VAL(p->sl, 8, -1, sl, MCR, SL); + CHECK_AND_SET_VAL(p->flow_label, 8, 0, flow, MCR, FLOW); + CHECK_AND_SET_VAL(p->hop_limit, 8, -1, hop, MCR, HOP); + mr.sl_flow_hop = ib_member_set_sl_flow_hop(sl, flow, hop); + CHECK_AND_SET_VAL(p->scope, 8, 0, scope, MCR, SCOPE); + CHECK_AND_SET_VAL(p->join_state, 8, 0, mr.scope_state, MCR, JOIN_STATE); + mr.scope_state |= scope << 4; + CHECK_AND_SET_VAL(p->proxy_join, 8, -1, mr.proxy_join, MCR, PROXY); + + return get_and_dump_any_records(h, IB_SA_ATTR_MCRECORD, 0, comp_mask, + &mr, sizeof(mr), dump_one_mcmember_record, p); +} + +static int query_service_records(const struct query_cmd *q, struct sa_handle * h, + struct query_params *p, int argc, char *argv[]) +{ + return get_and_dump_all_records(h, IB_SA_ATTR_SERVICERECORD, + dump_service_record, p); +} + +static int query_sm_info_records(const struct query_cmd *q, + struct sa_handle * h, struct query_params *p, + int argc, char *argv[]) +{ + ib_sminfo_record_t smir; + __be64 comp_mask = 0; + int lid = 0; + + if (argc > 0) + parse_lid_and_ports(h, argv[0], &lid, NULL, NULL); + + memset(&smir, 0, sizeof(smir)); + CHECK_AND_SET_VAL(lid, 16, 0, smir.lid, SMIR, LID); + + return get_and_dump_any_records(h, IB_SA_ATTR_SMINFORECORD, 0, + comp_mask, &smir, sizeof(smir), + dump_sm_info_record, p); +} + +static int query_switchinfo_records(const struct query_cmd *q, + struct sa_handle * h, struct query_params *p, + int argc, char *argv[]) +{ + ib_switch_info_record_t swir; + __be64 comp_mask = 0; + int lid = 0; + + if (argc > 0) + parse_lid_and_ports(h, argv[0], &lid, NULL, NULL); + + memset(&swir, 0, sizeof(swir)); + CHECK_AND_SET_VAL(lid, 16, 0, swir.lid, SWIR, LID); + + return get_and_dump_any_records(h, IB_SA_ATTR_SWITCHINFORECORD, 0, + comp_mask, &swir, sizeof(swir), + dump_switch_info_record, p); +} + +static int query_inform_info_records(const struct query_cmd *q, + struct sa_handle * h, struct query_params *p, + int argc, char *argv[]) +{ + int rc = 0; + ib_inform_info_record_t ir; + __be64 comp_mask = 0; + memset(&ir, 0, sizeof(ir)); + + if (argc > 0) { + comp_mask = IB_IIR_COMPMASK_SUBSCRIBERGID; + if((rc = parse_iir_subscriber_gid(argv[0], &ir)) < 1) + return rc; + } + + return get_and_dump_any_records(h, IB_SA_ATTR_INFORMINFORECORD, 0, comp_mask, + &ir, sizeof(ir), dump_inform_info_record, p); + +} + +static int query_link_records(const struct query_cmd *q, struct sa_handle * h, + struct query_params *p, int argc, char *argv[]) +{ + ib_link_record_t lr; + __be64 comp_mask = 0; + int from_lid = 0, to_lid = 0, from_port = -1, to_port = -1; + + if (argc > 0) + parse_lid_and_ports(h, argv[0], &from_lid, &from_port, NULL); + + if (argc > 1) + parse_lid_and_ports(h, argv[1], &to_lid, &to_port, NULL); + + memset(&lr, 0, sizeof(lr)); + CHECK_AND_SET_VAL(from_lid, 16, 0, lr.from_lid, LR, FROM_LID); + CHECK_AND_SET_VAL(from_port, 8, -1, lr.from_port_num, LR, FROM_PORT); + CHECK_AND_SET_VAL(to_lid, 16, 0, lr.to_lid, LR, TO_LID); + CHECK_AND_SET_VAL(to_port, 8, -1, lr.to_port_num, LR, TO_PORT); + + return get_and_dump_any_records(h, IB_SA_ATTR_LINKRECORD, 0, comp_mask, + &lr, sizeof(lr), dump_one_link_record, p); +} + +static int query_sl2vl_records(const struct query_cmd *q, struct sa_handle * h, + struct query_params *p, int argc, char *argv[]) +{ + ib_slvl_table_record_t slvl; + __be64 comp_mask = 0; + int lid = 0, in_port = -1, out_port = -1; + + if (argc > 0) + parse_lid_and_ports(h, argv[0], &lid, &in_port, &out_port); + + memset(&slvl, 0, sizeof(slvl)); + 
CHECK_AND_SET_VAL(lid, 16, 0, slvl.lid, SLVL, LID); + CHECK_AND_SET_VAL(in_port, 8, -1, slvl.in_port_num, SLVL, IN_PORT); + CHECK_AND_SET_VAL(out_port, 8, -1, slvl.out_port_num, SLVL, OUT_PORT); + + return get_and_dump_any_records(h, IB_SA_ATTR_SL2VLTABLERECORD, 0, + comp_mask, &slvl, sizeof(slvl), + dump_one_slvl_record, p); +} + +static int query_vlarb_records(const struct query_cmd *q, struct sa_handle * h, + struct query_params *p, int argc, char *argv[]) +{ + ib_vl_arb_table_record_t vlarb; + __be64 comp_mask = 0; + int lid = 0, port = -1, block = -1; + + if (argc > 0) + parse_lid_and_ports(h, argv[0], &lid, &port, &block); + + memset(&vlarb, 0, sizeof(vlarb)); + CHECK_AND_SET_VAL(lid, 16, 0, vlarb.lid, VLA, LID); + CHECK_AND_SET_VAL(port, 8, -1, vlarb.port_num, VLA, OUT_PORT); + CHECK_AND_SET_VAL(block, 8, -1, vlarb.block_num, VLA, BLOCK); + + return get_and_dump_any_records(h, IB_SA_ATTR_VLARBTABLERECORD, 0, + comp_mask, &vlarb, sizeof(vlarb), + dump_one_vlarb_record, p); +} + +static int query_pkey_tbl_records(const struct query_cmd *q, + struct sa_handle * h, struct query_params *p, + int argc, char *argv[]) +{ + ib_pkey_table_record_t pktr; + __be64 comp_mask = 0; + int lid = 0, port = -1, block = -1; + + if (argc > 0) + parse_lid_and_ports(h, argv[0], &lid, &port, &block); + + memset(&pktr, 0, sizeof(pktr)); + CHECK_AND_SET_VAL(lid, 16, 0, pktr.lid, PKEY, LID); + CHECK_AND_SET_VAL(port, 8, -1, pktr.port_num, PKEY, PORT); + CHECK_AND_SET_VAL(block, 16, -1, pktr.block_num, PKEY, BLOCK); + + return get_and_dump_any_records(h, IB_SA_ATTR_PKEYTABLERECORD, 0, + comp_mask, &pktr, sizeof(pktr), + dump_one_pkey_tbl_record, p); +} + +static int query_lft_records(const struct query_cmd *q, struct sa_handle * h, + struct query_params *p, int argc, char *argv[]) +{ + ib_lft_record_t lftr; + __be64 comp_mask = 0; + int lid = 0, block = -1; + + if (argc > 0) + parse_lid_and_ports(h, argv[0], &lid, &block, NULL); + + memset(&lftr, 0, sizeof(lftr)); + CHECK_AND_SET_VAL(lid, 16, 0, lftr.lid, LFTR, LID); + CHECK_AND_SET_VAL(block, 16, -1, lftr.block_num, LFTR, BLOCK); + + return get_and_dump_any_records(h, IB_SA_ATTR_LFTRECORD, 0, comp_mask, + &lftr, sizeof(lftr), dump_one_lft_record, p); +} + +static int query_guidinfo_records(const struct query_cmd *q, struct sa_handle * h, + struct query_params *p, int argc, char *argv[]) +{ + ib_guidinfo_record_t gir; + __be64 comp_mask = 0; + int lid = 0, block = -1; + + if (argc > 0) + parse_lid_and_ports(h, argv[0], &lid, &block, NULL); + + memset(&gir, 0, sizeof(gir)); + CHECK_AND_SET_VAL(lid, 16, 0, gir.lid, GIR, LID); + CHECK_AND_SET_VAL(block, 8, -1, gir.block_num, GIR, BLOCKNUM); + + return get_and_dump_any_records(h, IB_SA_ATTR_GUIDINFORECORD, 0, + comp_mask, &gir, sizeof(gir), + dump_one_guidinfo_record, p); +} + +static int query_mft_records(const struct query_cmd *q, struct sa_handle * h, + struct query_params *p, int argc, char *argv[]) +{ + ib_mft_record_t mftr; + __be64 comp_mask = 0; + int lid = 0, block = -1, position = -1; + uint16_t pos = 0; + + if (argc > 0) + parse_lid_and_ports(h, argv[0], &lid, &position, &block); + + memset(&mftr, 0, sizeof(mftr)); + CHECK_AND_SET_VAL(lid, 16, 0, mftr.lid, MFTR, LID); + CHECK_AND_SET_VAL(block, 16, -1, mftr.position_block_num, MFTR, BLOCK); + mftr.position_block_num &= htobe16(IB_MCAST_BLOCK_ID_MASK_HO); + CHECK_AND_SET_VAL(position, 8, -1, pos, MFTR, POSITION); + mftr.position_block_num |= htobe16(pos << 12); + + return get_and_dump_any_records(h, IB_SA_ATTR_MFTRECORD, 0, comp_mask, + &mftr, 
sizeof(mftr), dump_one_mft_record, p); +} + +static int query_sa_cpi(struct sa_handle *h, struct query_params *query_params) +{ + ib_class_port_info_t *cpi; + struct sa_query_result result; + int ret = sa_query(h, IB_MAD_METHOD_GET, CLASS_PORT_INFO, 0, 0, + ibd_sakey, NULL, 0, &result); + if (ret) { + fprintf(stderr, "Query SA failed: %s\n", strerror(ret)); + return ret; + } + + if (result.status != IB_SA_MAD_STATUS_SUCCESS) { + sa_report_err(result.status); + ret = EIO; + goto Exit; + } + cpi = sa_get_query_rec(result.p_result_madw, 0); + memcpy(&query_params->cpi, cpi, sizeof(query_params->cpi)); +Exit: + sa_free_result_mad(&result); + return ret; +} + +static const struct query_cmd query_cmds[] = { + {"ClassPortInfo", "CPI", CLASS_PORT_INFO, + NULL, query_class_port_info}, + {"NodeRecord", "NR", IB_SA_ATTR_NODERECORD, + "[lid]", query_node_records}, + {"PortInfoRecord", "PIR", IB_SA_ATTR_PORTINFORECORD, + "[[lid]/[port]/[options]]", query_portinfo_records}, + {"SL2VLTableRecord", "SL2VL", IB_SA_ATTR_SL2VLTABLERECORD, + "[[lid]/[in_port]/[out_port]]", query_sl2vl_records}, + {"PKeyTableRecord", "PKTR", IB_SA_ATTR_PKEYTABLERECORD, + "[[lid]/[port]/[block]]", query_pkey_tbl_records}, + {"VLArbitrationTableRecord", "VLAR", IB_SA_ATTR_VLARBTABLERECORD, + "[[lid]/[port]/[block]]", query_vlarb_records}, + {"InformInfoRecord", "IIR", IB_SA_ATTR_INFORMINFORECORD, + "[subscriber_gid]", query_inform_info_records}, + {"LinkRecord", "LR", IB_SA_ATTR_LINKRECORD, + "[[from_lid]/[from_port]] [[to_lid]/[to_port]]", query_link_records}, + {"ServiceRecord", "SR", IB_SA_ATTR_SERVICERECORD, + NULL, query_service_records}, + {"PathRecord", "PR", IB_SA_ATTR_PATHRECORD, + NULL, query_path_records}, + {"MCMemberRecord", "MCMR", IB_SA_ATTR_MCRECORD, + NULL, query_mcmember_records}, + {"LFTRecord", "LFTR", IB_SA_ATTR_LFTRECORD, + "[[lid]/[block]]", query_lft_records}, + {"MFTRecord", "MFTR", IB_SA_ATTR_MFTRECORD, + "[[mlid]/[position]/[block]]", query_mft_records}, + {"GUIDInfoRecord", "GIR", IB_SA_ATTR_GUIDINFORECORD, + "[[lid]/[block]]", query_guidinfo_records}, + {"SwitchInfoRecord", "SWIR", IB_SA_ATTR_SWITCHINFORECORD, + "[lid]", query_switchinfo_records}, + {"SMInfoRecord", "SMIR", IB_SA_ATTR_SMINFORECORD, + "[lid]", query_sm_info_records}, + {} +}; + +static const struct query_cmd *find_query(const char *name) +{ + const struct query_cmd *q; + + for (q = query_cmds; q->name; q++) + if (!strcasecmp(name, q->name) || + (q->alias && !strcasecmp(name, q->alias))) + return q; + + return NULL; +} + +static const struct query_cmd *find_query_by_type(uint16_t type) +{ + const struct query_cmd *q; + + for (q = query_cmds; q->name; q++) + if (q->query_type == type) + return q; + + return NULL; +} + +enum saquery_command { + SAQUERY_CMD_QUERY, + SAQUERY_CMD_NODE_RECORD, + SAQUERY_CMD_CLASS_PORT_INFO, + SAQUERY_CMD_ISSM, + SAQUERY_CMD_MCGROUPS, + SAQUERY_CMD_MCMEMBERS, +}; + +static enum saquery_command command = SAQUERY_CMD_QUERY; +static uint16_t query_type; +static char *src_lid, *dst_lid; + +static int process_opt(void *context, int ch) +{ + struct query_params *p = context; + + switch (ch) { + case 1: + { + src_lid = strdup(optarg); + dst_lid = strchr(src_lid, ':'); + if (!dst_lid) + ibdiag_show_usage(); + *dst_lid++ = '\0'; + } + p->numb_path = 0x7f; + query_type = IB_SA_ATTR_PATHRECORD; + break; + case 2: + { + char *src_addr = strdup(optarg); + char *dst_addr = strchr(src_addr, '-'); + if (!dst_addr) + ibdiag_show_usage(); + *dst_addr++ = '\0'; + if (inet_pton(AF_INET6, src_addr, &p->sgid) <= 0) + 
ibdiag_show_usage(); + if (inet_pton(AF_INET6, dst_addr, &p->dgid) <= 0) + ibdiag_show_usage(); + free(src_addr); + } + p->numb_path = 0x7f; + query_type = IB_SA_ATTR_PATHRECORD; + break; + case 3: + node_name_map_file = strdup(optarg); + if (node_name_map_file == NULL) + IBEXIT("out of memory, strdup for node_name_map_file name failed"); + break; + case 4: + if (!isxdigit(*optarg) && !(optarg = getpass("SM_Key: "))) { + fprintf(stderr, "cannot get SM_Key\n"); + ibdiag_show_usage(); + } + ibd_sakey = strtoull(optarg, NULL, 0); + break; + case 'p': + query_type = IB_SA_ATTR_PATHRECORD; + break; + case 'D': + node_print_desc = ALL_DESC; + command = SAQUERY_CMD_NODE_RECORD; + break; + case 'c': + command = SAQUERY_CMD_CLASS_PORT_INFO; + break; + case 'S': + query_type = IB_SA_ATTR_SERVICERECORD; + break; + case 'I': + query_type = IB_SA_ATTR_INFORMINFORECORD; + break; + case 'N': + command = SAQUERY_CMD_NODE_RECORD; + break; + case 'L': + node_print_desc = LID_ONLY; + command = SAQUERY_CMD_NODE_RECORD; + break; + case 'l': + node_print_desc = UNIQUE_LID_ONLY; + command = SAQUERY_CMD_NODE_RECORD; + break; + case 'G': + node_print_desc = GUID_ONLY; + command = SAQUERY_CMD_NODE_RECORD; + break; + case 'O': + node_print_desc = NAME_OF_LID; + command = SAQUERY_CMD_NODE_RECORD; + break; + case 'U': + node_print_desc = NAME_OF_GUID; + command = SAQUERY_CMD_NODE_RECORD; + break; + case 's': + command = SAQUERY_CMD_ISSM; + break; + case 'g': + command = SAQUERY_CMD_MCGROUPS; + break; + case 'm': + command = SAQUERY_CMD_MCMEMBERS; + break; + case 'x': + query_type = IB_SA_ATTR_LINKRECORD; + break; + case 5: + p->slid = (uint16_t) strtoul(optarg, NULL, 0); + break; + case 6: + p->dlid = (uint16_t) strtoul(optarg, NULL, 0); + break; + case 7: + p->mlid = (uint16_t) strtoul(optarg, NULL, 0); + break; + case 14: + if (inet_pton(AF_INET6, optarg, &p->sgid) <= 0) + ibdiag_show_usage(); + break; + case 15: + if (inet_pton(AF_INET6, optarg, &p->dgid) <= 0) + ibdiag_show_usage(); + break; + case 16: + if (inet_pton(AF_INET6, optarg, &p->gid) <= 0) + ibdiag_show_usage(); + break; + case 17: + if (inet_pton(AF_INET6, optarg, &p->mgid) <= 0) + ibdiag_show_usage(); + break; + case 'r': + p->reversible = strtoul(optarg, NULL, 0); + break; + case 'n': + p->numb_path = strtoul(optarg, NULL, 0); + break; + case 18: + if (!isxdigit(*optarg) && !(optarg = getpass("P_Key: "))) { + fprintf(stderr, "cannot get P_Key\n"); + ibdiag_show_usage(); + } + p->pkey = (uint16_t) strtoul(optarg, NULL, 0); + break; + case 'Q': + p->qos_class = strtoul(optarg, NULL, 0); + break; + case 19: + p->sl = strtoul(optarg, NULL, 0); + break; + case 'M': + p->mtu = (uint8_t) strtoul(optarg, NULL, 0); + break; + case 'R': + p->rate = (uint8_t) strtoul(optarg, NULL, 0); + break; + case 20: + p->pkt_life = (uint8_t) strtoul(optarg, NULL, 0); + break; + case 'q': + if (!isxdigit(*optarg) && !(optarg = getpass("Q_Key: "))) { + fprintf(stderr, "cannot get Q_Key\n"); + ibdiag_show_usage(); + } + p->qkey = strtoul(optarg, NULL, 0); + break; + case 'T': + p->tclass = (uint8_t) strtoul(optarg, NULL, 0); + break; + case 'F': + p->flow_label = strtoul(optarg, NULL, 0); + break; + case 'H': + p->hop_limit = strtoul(optarg, NULL, 0); + break; + case 21: + p->scope = (uint8_t) strtoul(optarg, NULL, 0); + break; + case 'J': + p->join_state = (uint8_t) strtoul(optarg, NULL, 0); + break; + case 'X': + p->proxy_join = strtoul(optarg, NULL, 0); + break; + case 22: + p->service_id = strtoull(optarg, NULL, 0); + break; + default: + return -1; + } + return 0; +} + 
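+/*
+ * Illustrative invocations (the LID and port values here are made-up
+ * examples, not defaults from this code):
+ *
+ *   saquery                      - dump all NodeRecords
+ *   saquery --src-to-dst 6:12    - PathRecord between LIDs 6 and 12
+ *   saquery -g                   - dump all multicast groups
+ *   saquery PIR 2/1              - PortInfoRecord for LID 2, port 1
+ *
+ * Numeric arguments are parsed with strtol()/strtoul() using base 0, so
+ * plain decimal and 0x-prefixed hex are both accepted.
+ */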
+int main(int argc, char **argv) +{ + int sa_cpi_required = 0; + char usage_args[1024]; + struct sa_handle * h; + struct query_params params; + const struct query_cmd *q; + int status; + int n; + + const struct ibdiag_opt opts[] = { + {"p", 'p', 0, NULL, "get PathRecord info"}, + {"N", 'N', 0, NULL, "get NodeRecord info"}, + {"L", 'L', 0, NULL, "return the Lids of the name specified"}, + {"l", 'l', 0, NULL, + "return the unique Lid of the name specified"}, + {"G", 'G', 0, NULL, "return the Guids of the name specified"}, + {"O", 'O', 0, NULL, "return name for the Lid specified"}, + {"U", 'U', 0, NULL, "return name for the Guid specified"}, + {"s", 's', 0, NULL, "return the PortInfoRecords with isSM or" + " isSMdisabled capability mask bit on"}, + {"g", 'g', 0, NULL, "get multicast group info"}, + {"m", 'm', 0, NULL, "get multicast member info (if multicast" + " group specified, list member GIDs only for group specified," + " for example 'saquery -m 0xC000')"}, + {"x", 'x', 0, NULL, "get LinkRecord info"}, + {"c", 'c', 0, NULL, "get the SA's class port info"}, + {"S", 'S', 0, NULL, "get ServiceRecord info"}, + {"I", 'I', 0, NULL, "get InformInfoRecord (subscription) info"}, + {"list", 'D', 0, NULL, "the node desc of the CA's"}, + {"src-to-dst", 1, 1, "<src:dst>", "get a PathRecord for" + " <src:dst> where src and dst are either node names or LIDs"}, + {"sgid-to-dgid", 2, 1, "<sgid-dgid>", "get a PathRecord for" + " <sgid-dgid> where sgid and dgid are addresses in IPv6 format"}, + {"node-name-map", 3, 1, "<file>", + "specify a node name map file"}, + {"smkey", 4, 1, "<val>", + "SA SM_Key value for the query." + " If non-numeric value (like 'x') is specified then" + " saquery will prompt for a value. " + " Default (when not specified here or in ibdiag.conf) is to " + " use SM_Key == 0 (or \"untrusted\")"}, + {"slid", 5, 1, "<lid>", "Source LID (PathRecord)"}, + {"dlid", 6, 1, "<lid>", "Destination LID (PathRecord)"}, + {"mlid", 7, 1, "<lid>", "Multicast LID (MCMemberRecord)"}, + {"sgid", 14, 1, "<gid>", + "Source GID (IPv6 format) (PathRecord)"}, + {"dgid", 15, 1, "<gid>", + "Destination GID (IPv6 format) (PathRecord)"}, + {"gid", 16, 1, "<gid>", "Port GID (MCMemberRecord)"}, + {"mgid", 17, 1, "<gid>", "Multicast GID (MCMemberRecord)"}, + {"reversible", 'r', 1, NULL, "Reversible path (PathRecord)"}, + {"numb_path", 'n', 1, NULL, "Number of paths (PathRecord)"}, + {"pkey", 18, 1, NULL, "P_Key (PathRecord, MCMemberRecord)." + " If non-numeric value (like 'x') is specified then" + " saquery will prompt for a value"}, + {"qos_class", 'Q', 1, NULL, "QoS Class (PathRecord)"}, + {"sl", 19, 1, NULL, + "Service level (PathRecord, MCMemberRecord)"}, + {"mtu", 'M', 1, NULL, + "MTU and selector (PathRecord, MCMemberRecord)"}, + {"rate", 'R', 1, NULL, + "Rate and selector (PathRecord, MCMemberRecord)"}, + {"pkt_lifetime", 20, 1, NULL, + "Packet lifetime and selector (PathRecord, MCMemberRecord)"}, + {"qkey", 'q', 1, NULL, "Q_Key (MCMemberRecord)." 
+ " If non-numeric value (like 'x') is specified then" + " saquery will prompt for a value"}, + {"tclass", 'T', 1, NULL, + "Traffic Class (PathRecord, MCMemberRecord)"}, + {"flow_label", 'F', 1, NULL, + "Flow Label (PathRecord, MCMemberRecord)"}, + {"hop_limit", 'H', 1, NULL, + "Hop limit (PathRecord, MCMemberRecord)"}, + {"scope", 21, 1, NULL, "Scope (MCMemberRecord)"}, + {"join_state", 'J', 1, NULL, "Join state (MCMemberRecord)"}, + {"proxy_join", 'X', 1, NULL, "Proxy join (MCMemberRecord)"}, + {"service_id", 22, 1, NULL, "ServiceID (PathRecord)"}, + {} + }; + + memset(¶ms, 0, sizeof params); + params.hop_limit = -1; + params.reversible = -1; + params.numb_path = -1; + params.qos_class = -1; + params.sl = -1; + params.proxy_join = -1; + + n = sprintf(usage_args, "[query-name] [<name> | <lid> | <guid>]\n" + "\nSupported query names (and aliases):\n"); + for (q = query_cmds; q->name; q++) { + n += snprintf(usage_args + n, sizeof(usage_args) - n, + " %s (%s) %s\n", q->name, + q->alias ? q->alias : "", + q->usage ? q->usage : ""); + if (n >= sizeof(usage_args)) + exit(-1); + } + snprintf(usage_args + n, sizeof(usage_args) - n, + "\n Queries node records by default."); + + q = NULL; + ibd_timeout = DEFAULT_SA_TIMEOUT_MS; + + ibdiag_process_opts(argc, argv, ¶ms, "DGLsy", opts, process_opt, + usage_args, NULL); + + argc -= optind; + argv += optind; + + if (!query_type && command == SAQUERY_CMD_QUERY) { + if (!argc || !(q = find_query(argv[0]))) + query_type = IB_SA_ATTR_NODERECORD; + else { + query_type = q->query_type; + argc--; + argv++; + } + } + + if (argc) { + if (node_print_desc == NAME_OF_LID) { + requested_lid = (uint16_t) strtoul(argv[0], NULL, 0); + requested_lid_flag++; + } else if (node_print_desc == NAME_OF_GUID) { + requested_guid = strtoul(argv[0], NULL, 0); + requested_guid_flag++; + } else + requested_name = argv[0]; + } + + if ((node_print_desc == LID_ONLY || + node_print_desc == UNIQUE_LID_ONLY || + node_print_desc == GUID_ONLY) && !requested_name) { + fprintf(stderr, "ERROR: name not specified\n"); + ibdiag_show_usage(); + } + + if (node_print_desc == NAME_OF_LID && !requested_lid_flag) { + fprintf(stderr, "ERROR: lid not specified\n"); + ibdiag_show_usage(); + } + + if (node_print_desc == NAME_OF_GUID && !requested_guid_flag) { + fprintf(stderr, "ERROR: guid not specified\n"); + ibdiag_show_usage(); + } + + /* Note: lid cannot be 0; see infiniband spec 4.1.3 */ + if (node_print_desc == NAME_OF_LID && !requested_lid) { + fprintf(stderr, "ERROR: lid invalid\n"); + ibdiag_show_usage(); + } + + if (umad_init()) + IBEXIT("Failed to initialized umad library"); + + h = sa_get_handle(); + if (!h) + IBPANIC("Failed to bind to the SA"); + + node_name_map = open_node_name_map(node_name_map_file); + + if (src_lid && *src_lid) + params.slid = get_lid(h, src_lid); + if (dst_lid && *dst_lid) + params.dlid = get_lid(h, dst_lid); + + if (command == SAQUERY_CMD_CLASS_PORT_INFO || + query_type == CLASS_PORT_INFO || + query_type == IB_SA_ATTR_SWITCHINFORECORD) + sa_cpi_required = 1; + + if (sa_cpi_required && (status = query_sa_cpi(h, ¶ms)) != 0) { + fprintf(stderr, "Failed to query SA:ClassPortInfo\n"); + goto error; + } + + switch (command) { + case SAQUERY_CMD_NODE_RECORD: + status = print_node_records(h, ¶ms); + break; + case SAQUERY_CMD_CLASS_PORT_INFO: + dump_class_port_info(¶ms.cpi); + status = 0; + break; + case SAQUERY_CMD_ISSM: + status = print_issm_records(h, ¶ms); + break; + case SAQUERY_CMD_MCGROUPS: + status = print_multicast_group_records(h, ¶ms); + break; + case 
SAQUERY_CMD_MCMEMBERS: + status = print_multicast_member_records(h, ¶ms); + break; + default: + if ((!q && !(q = find_query_by_type(query_type))) + || !q->handler) { + fprintf(stderr, "Unknown query type %d\n", query_type); + status = EINVAL; + } else + status = q->handler(q, h, ¶ms, argc, argv); + break; + } + +error: + if (src_lid) + free(src_lid); + sa_free_handle(h); + umad_done(); + close_node_name_map(node_name_map); + return (status); +} diff --git a/infiniband-diags/scripts/CMakeLists.txt b/infiniband-diags/scripts/CMakeLists.txt new file mode 100644 index 0000000..377a388 --- /dev/null +++ b/infiniband-diags/scripts/CMakeLists.txt @@ -0,0 +1,114 @@ +function(_rdma_sbin_interp INTERP IFN OFN) + configure_file("${IFN}" "${CMAKE_CURRENT_BINARY_DIR}/${OFN}" @ONLY) + file(WRITE "${BUILD_BIN}/${OFN}" "#!${INTERP}\nexec ${INTERP} ${CMAKE_CURRENT_BINARY_DIR}/${OFN} \"$@\"\n") + execute_process(COMMAND "chmod" "a+x" "${BUILD_BIN}/${OFN}") + + install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${OFN}" + DESTINATION "${CMAKE_INSTALL_SBINDIR}" + PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ OWNER_EXECUTE GROUP_EXECUTE WORLD_EXECUTE) +endfunction() + +function(_rdma_sbin_interp_link INTERP IFN OFN) + file(WRITE "${BUILD_BIN}/${OFN}" "#!${INTERP}\nexec ${INTERP} ${CMAKE_CURRENT_SOURCE_DIR}/${IFN} \"$@\"\n") + execute_process(COMMAND "chmod" "a+x" "${BUILD_BIN}/${OFN}") + + install(FILES "${IFN}" + DESTINATION "${CMAKE_INSTALL_SBINDIR}" + RENAME "${OFN}" + PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ OWNER_EXECUTE GROUP_EXECUTE WORLD_EXECUTE) +endfunction() + +function(rdma_sbin_shell_program) + foreach(IFN ${ARGN}) + if (IFN MATCHES "\\.sh\\.in") + if (DISTRO_FLAVOUR STREQUAL Debian) + string(REGEX REPLACE "^(.+)\\.sh\\.in$" "\\1" OFN "${IFN}") + else() + string(REGEX REPLACE "^(.+)\\.in$" "\\1" OFN "${IFN}") + endif() + _rdma_sbin_interp("/bin/bash" "${IFN}" "${OFN}") + elseif (IFN MATCHES "\\.in") + string(REGEX REPLACE "^(.+)\\.in$" "\\1" OFN "${IFN}") + _rdma_sbin_interp("/bin/bash" "${IFN}" "${OFN}") + elseif (IFN MATCHES "\\.sh") + if (DISTRO_FLAVOUR STREQUAL Debian) + string(REGEX REPLACE "^(.+)\\.sh$" "\\1" OFN "${IFN}") + else() + set(OFN "${IFN}") + endif() + _rdma_sbin_interp_link("/bin/bash" "${IFN}" "${OFN}") + else() + _rdma_sbin_interp_link("/bin/bash" "${IFN}" "${IFN}") + endif() + endforeach() +endfunction() + +function(rdma_sbin_perl_program) + foreach(IFN ${ARGN}) + if (IFN MATCHES "\\.pl\\.in") + if (DISTRO_FLAVOUR STREQUAL Debian) + string(REGEX REPLACE "^(.+)\\.pl\\.in$" "\\1" OFN "${IFN}") + else() + string(REGEX REPLACE "^(.+)\\.in$" "\\1" OFN "${IFN}") + endif() + _rdma_sbin_interp("/usr/bin/perl" "${IFN}" "${OFN}") + elseif (IFN MATCHES "\\.pl") + if (DISTRO_FLAVOUR STREQUAL Debian) + string(REGEX REPLACE "^(.+)\\.pl$" "\\1" OFN "${IFN}") + else() + set(OFN "${IFN}") + endif() + _rdma_sbin_interp_link("/usr/bin/perl" "${IFN}" "${OFN}") + endif() + endforeach() +endfunction() + +set(IBSCRIPTPATH "${CMAKE_INSTALL_FULL_SBINDIR}") + +rdma_sbin_shell_program( + dump_lfts.sh.in + dump_mfts.sh.in + ibhosts.in + ibnodes.in + ibrouters.in + ibstatus + ibswitches.in + ) + +rdma_sbin_perl_program( + check_lft_balance.pl + ibfindnodesusing.pl + ibidsverify.pl + ) + +install(FILES "IBswcountlimits.pm" + DESTINATION "${CMAKE_INSTALL_PERLDIR}") + +if (ENABLE_IBDIAGS_COMPAT) + rdma_sbin_shell_program( + ibcheckerrors.in + ibcheckerrs.in + ibchecknet.in + ibchecknode.in + ibcheckport.in + ibcheckportstate.in + ibcheckportwidth.in + ibcheckstate.in + ibcheckwidth.in + 
ibclearcounters.in + ibclearerrors.in + ibdatacounters.in + ibdatacounts.in + set_nodedesc.sh + ) + + rdma_sbin_perl_program( + ibdiscover.pl + iblinkinfo.pl.in + ibprintca.pl + ibprintrt.pl + ibprintswitch.pl + ibqueryerrors.pl.in + ibswportwatch.pl + ) +endif() diff --git a/infiniband-diags/scripts/IBswcountlimits.pm b/infiniband-diags/scripts/IBswcountlimits.pm new file mode 100755 index 0000000..7531aef --- /dev/null +++ b/infiniband-diags/scripts/IBswcountlimits.pm @@ -0,0 +1,501 @@ +#!/usr/bin/perl +# +# Copyright (c) 2006 The Regents of the University of California. +# Copyright (c) 2006-2008 Voltaire, Inc. All rights reserved. +# +# Produced at Lawrence Livermore National Laboratory. +# Written by Ira Weiny <weiny2@llnl.gov>. +# Erez Strauss from Voltaire for help in the get_link_ends code. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# + +use strict; + +%IBswcountlimits::cur_counts = (); +%IBswcountlimits::new_counts = (); +@IBswcountlimits::suppress_errors = (); +$IBswcountlimits::link_ends = undef; +$IBswcountlimits::pause_time = 10; +$IBswcountlimits::cache_dir = "/var/cache/infiniband-diags"; + +# all the PerfMgt counters +@IBswcountlimits::counters = ( + "SymbolErrorCounter", "LinkErrorRecoveryCounter", + "LinkDownedCounter", "PortRcvErrors", + "PortRcvRemotePhysicalErrors", "PortRcvSwitchRelayErrors", + "PortXmitDiscards", "PortXmitConstraintErrors", + "PortRcvConstraintErrors", "LocalLinkIntegrityErrors", + "ExcessiveBufferOverrunErrors", "VL15Dropped", + "PortXmitData", "PortRcvData", + "PortXmitPkts", "PortRcvPkts" +); + +# non-critical counters +%IBswcountlimits::error_counters = ( + "SymbolErrorCounter", +"No action is required except if counter is increasing along with LinkErrorRecoveryCounter", + "LinkErrorRecoveryCounter", +"If this is increasing along with SymbolErrorCounter this may indicate a bad link, run ibswportwatch.pl on this port", + "LinkDownedCounter", + "Number of times the port has gone down (Usually for valid reasons)", + "PortRcvErrors", +"This is a bad link, if the link is internal to a 288 try setting SDR, otherwise check the cable", + "PortRcvRemotePhysicalErrors", + "This indicates a problem ELSEWHERE in the fabric.", + "PortXmitDiscards", +"This is a symptom of congestion and may require tweaking either HOQ or switch lifetime values", + "PortXmitConstraintErrors", + "This is a result of bad partitioning, check partition configuration.", + "PortRcvConstraintErrors", + "This is a result of bad partitioning, check partition configuration.", + "LocalLinkIntegrityErrors", + "May indicate a bad link, run ibswportwatch.pl on this port", + "ExcessiveBufferOverrunErrors", +"This is a flow control state machine error and can be caused by packets with physical errors", + "VL15Dropped", + "check with ibswportwatch.pl, if increasing in SMALL increments, OK", + "PortRcvSwitchRelayErrors", + "This counter can increase due to a valid network event" +); + +sub check_counters +{ + my $print_action = $_[0]; + my $actions = undef; + + COUNTER: foreach my $cnt (keys %IBswcountlimits::error_counters) { + if ($IBswcountlimits::cur_counts{$cnt} > 0) { + foreach my $sup_cnt (@IBswcountlimits::suppress_errors) { + if ("$cnt" eq $sup_cnt) { next COUNTER; } + } + print " [$cnt == $IBswcountlimits::cur_counts{$cnt}]"; + if ("$print_action" eq "yes") { + $actions = join " ", + ( + $actions, + " $cnt: $IBswcountlimits::error_counters{$cnt}\n" + ); + } + } + } + + if ($actions) { + print "\n Actions:\n$actions"; + } +} + +# Data counters +%IBswcountlimits::data_counters = ( + "PortXmitData", +"Total number of data octets, divided by 4, transmitted on all VLs from the port", + "PortRcvData", +"Total number of data octets, divided by 4, received on all VLs to the port", + "PortXmitPkts", +"Total number of packets, excluding link packets, transmitted on all VLs from the port", + "PortRcvPkts", +"Total number of packets, excluding link packets, received on all VLs to the port" +); + +sub check_data_counters +{ + my $print_action = $_[0]; + my $actions = undef; + + COUNTER: foreach my $cnt (keys %IBswcountlimits::data_counters) { + print " [$cnt == $IBswcountlimits::cur_counts{$cnt}]"; + if ("$print_action" eq "yes") { + $actions = join " ", + ( + $actions, + " $cnt: $IBswcountlimits::data_counters{$cnt}\n" + ); + } + } + if ($actions) { + print "\n Descriptions:\n$actions"; + } +} + +sub print_data_rates +{ + COUNTER: 
foreach my $cnt (keys %IBswcountlimits::data_counters) {
+ my $cnt_per_second = calculate_rate(
+ $IBswcountlimits::cur_counts{$cnt},
+ $IBswcountlimits::new_counts{$cnt}
+ );
+ print " $cnt_per_second $cnt/second\n";
+ }
+}
+
+# =========================================================================
+# Rate dependent counters
+# calculate the count/sec
+# calculate_rate old_count new_count
sub calculate_rate
+{
+ my $old_val = $_[0];
+ my $new_val = $_[1];
+ my $rate = ($new_val - $old_val) / $IBswcountlimits::pause_time;
+ return ($rate);
+}
+%IBswcountlimits::rate_dep_thresholds = (
+ "SymbolErrorCounter", 10, "LinkErrorRecoveryCounter", 10,
+ "PortRcvErrors", 10, "LocalLinkIntegrityErrors", 10,
+ "PortXmitDiscards", 10
+);
+
+sub check_counter_rates
+{
+ foreach my $rate_count (keys %IBswcountlimits::rate_dep_thresholds) {
+ my $rate = calculate_rate(
+ $IBswcountlimits::cur_counts{$rate_count},
+ $IBswcountlimits::new_counts{$rate_count}
+ );
+ if ($rate > $IBswcountlimits::rate_dep_thresholds{$rate_count}) {
+ print "Detected excessive rate for $rate_count ($rate cnts/sec)\n";
+ } elsif ($rate > 0) {
+ print "Detected rate for $rate_count ($rate cnts/sec)\n";
+ }
+ }
+}
+
+# =========================================================================
+#
+sub clear_counters
+{
+ # clear the counters
+ foreach my $count (@IBswcountlimits::counters) {
+ $IBswcountlimits::cur_counts{$count} = 0;
+ }
+}
+
+# =========================================================================
+#
+sub any_counts
+{
+ my $total = 0;
+ my $count = 0;
+ foreach $count (keys %IBswcountlimits::critical) {
+ $total = $total + $IBswcountlimits::cur_counts{$count};
+ }
+ COUNTER: foreach $count (keys %IBswcountlimits::error_counters) {
+ foreach my $sup_cnt (@IBswcountlimits::suppress_errors) {
+ if ("$count" eq $sup_cnt) { next COUNTER; }
+ }
+ $total = $total + $IBswcountlimits::cur_counts{$count};
+ }
+ return ($total);
+}
+
+# =========================================================================
+#
+sub ensure_cache_dir
+{
+ if (!(-d "$IBswcountlimits::cache_dir") &&
+ !mkdir($IBswcountlimits::cache_dir, 0700)) {
+ die "cannot create $IBswcountlimits::cache_dir: $!\n";
+ }
+}
+
+# =========================================================================
+# get_cache_file(ca_name, ca_port)
+#
+sub get_cache_file
+{
+ my $ca_name = $_[0];
+ my $ca_port = $_[1];
+ ensure_cache_dir;
+ return (
+ "$IBswcountlimits::cache_dir/ibnetdiscover-$ca_name-$ca_port.topology");
+}
+
+# =========================================================================
+# get_ca_name_port_param_string(ca_name, ca_port)
+#
+sub get_ca_name_port_param_string
+{
+ my $ca_name = $_[0];
+ my $ca_port = $_[1];
+
+ if ("$ca_name" ne "") { $ca_name = "-C $ca_name"; }
+ if ("$ca_port" ne "") { $ca_port = "-P $ca_port"; }
+
+ return ("$ca_name $ca_port");
+}
+
+# =========================================================================
+# generate_ibnetdiscover_topology(ca_name, ca_port)
+#
+sub generate_ibnetdiscover_topology
+{
+ my $ca_name = $_[0];
+ my $ca_port = $_[1];
+ my $cache_file = get_cache_file($ca_name, $ca_port);
+ my $extra_params = get_ca_name_port_param_string($ca_name, $ca_port);
+
+ if (`ibnetdiscover -g $extra_params > $cache_file`) {
+ die "Execution of ibnetdiscover failed: $!\n";
+ }
+}
+
+# =========================================================================
+# get_link_ends(regenerate_map, ca_name, ca_port)
+#
+sub get_link_ends
+{
+ my $regenerate_map = $_[0];
+ my $ca_name = $_[1];
+ my 
$ca_port = $_[2]; + + my $cache_file = get_cache_file($ca_name, $ca_port); + + if ($regenerate_map || !(-f "$cache_file")) { + generate_ibnetdiscover_topology($ca_name, $ca_port); + } + open IBNET_TOPO, "<$cache_file" + or die "Failed to open ibnet topology: $!\n"; + my $in_switch = "no"; + my $desc = ""; + my $guid = ""; + my $loc_sw_lid = ""; + + my $loc_port = ""; + my $line = ""; + + while ($line = <IBNET_TOPO>) { + if ($line =~ /^Switch.*\"S-(.*)\"\s+#.*\"(.*)\".* lid (\d+).*/) { + $guid = $1; + $desc = $2; + $loc_sw_lid = $3; + $in_switch = "yes"; + } + if ($in_switch eq "yes") { + my $rec = undef; + if ($line =~ +/^\[(\d+)\]\s+\"[HSR]-(.+)\"\[(\d+)\](\(.+\))?\s+#.*\"(.*)\"\.* lid (\d+).*/ + ) + { + $loc_port = $1; + my $rem_guid = $2; + my $rem_port = $3; + my $rem_port_guid = $4; + my $rem_desc = $5; + my $rem_lid = $6; + $rec = { + loc_guid => "0x$guid", + loc_port => $loc_port, + loc_ext_port => "", + loc_desc => $desc, + loc_sw_lid => $loc_sw_lid, + rem_guid => "0x$rem_guid", + rem_lid => $rem_lid, + rem_port => $rem_port, + rem_ext_port => "", + rem_desc => $rem_desc, + rem_port_guid => $rem_port_guid + }; + } + if ($line =~ +/^\[(\d+)\]\[ext (\d+)\]\s+\"[HSR]-(.+)\"\[(\d+)\](\(.+\))?\s+#.*\"(.*)\"\.* lid (\d+).*/ + ) + { + $loc_port = $1; + my $loc_ext_port = $2; + my $rem_guid = $3; + my $rem_port = $4; + my $rem_port_guid = $5; + my $rem_desc = $6; + my $rem_lid = $7; + $rec = { + loc_guid => "0x$guid", + loc_port => $loc_port, + loc_ext_port => $loc_ext_port, + loc_desc => $desc, + loc_sw_lid => $loc_sw_lid, + rem_guid => "0x$rem_guid", + rem_lid => $rem_lid, + rem_port => $rem_port, + rem_ext_port => "", + rem_desc => $rem_desc, + rem_port_guid => $rem_port_guid + }; + } + if ($line =~ +/^\[(\d+)\]\s+\"[HSR]-(.+)\"\[(\d+)\]\[ext (\d+)\](\(.+\))?\s+#.*\"(.*)\"\.* lid (\d+).*/ + ) + { + $loc_port = $1; + my $rem_guid = $2; + my $rem_port = $3; + my $rem_ext_port = $4; + my $rem_port_guid = $5; + my $rem_desc = $6; + my $rem_lid = $7; + $rec = { + loc_guid => "0x$guid", + loc_port => $loc_port, + loc_ext_port => "", + loc_desc => $desc, + loc_sw_lid => $loc_sw_lid, + rem_guid => "0x$rem_guid", + rem_lid => $rem_lid, + rem_port => $rem_port, + rem_ext_port => $rem_ext_port, + rem_desc => $rem_desc, + rem_port_guid => $rem_port_guid + }; + } + if ($line =~ +/^\[(\d+)\]\[ext (\d+)\]\s+\"[HSR]-(.+)\"\[(\d+)\]\[ext (\d+)\](\(.+\))?\s+#.*\"(.*)\"\.* lid (\d+).*/ + ) + { + $loc_port = $1; + my $loc_ext_port = $2; + my $rem_guid = $3; + my $rem_port = $4; + my $rem_ext_port = $5; + my $rem_port_guid = $6; + my $rem_desc = $7; + my $rem_lid = $8; + $rec = { + loc_guid => "0x$guid", + loc_port => $loc_port, + loc_ext_port => $loc_ext_port, + loc_desc => $desc, + loc_sw_lid => $loc_sw_lid, + rem_guid => "0x$rem_guid", + rem_lid => $rem_lid, + rem_port => $rem_port, + rem_ext_port => $rem_ext_port, + rem_desc => $rem_desc, + rem_port_guid => $rem_port_guid + }; + } + if ($rec) { + $rec->{rem_port_guid} =~ s/\((.*)\)/$1/; + $IBswcountlimits::link_ends{"0x$guid"}{$loc_port} = $rec; + } + } + + if ($line =~ /^Ca.*/ || $line =~ /^Rt.*/) { $in_switch = "no"; } + } + close IBNET_TOPO; +} + +# ========================================================================= +# get_num_ports(switch_guid, ca_name, ca_port) +# +sub get_num_ports +{ + my $guid = $_[0]; + my $ca_name = $_[1]; + my $ca_port = $_[2]; + my $num_ports = 0; + my $extra_params = get_ca_name_port_param_string($ca_name, $ca_port); + + my $data = `smpquery $extra_params -G nodeinfo $guid` || + die "'smpquery $extra_params -G 
nodeinfo $guid' failed\n";
+ my @lines = split("\n", $data);
+ foreach my $line (@lines) {
+ if ($line =~ /^NumPorts:\.+(.*)/) { $num_ports = $1; }
+ }
+ return ($num_ports);
+}
+
+# =========================================================================
+# format_guid(guid)
+# The diags store the guids as strings. This converts the guid supplied
+# to the correct string format.
+# eg: 0x0008f10400411f56 == 0x8f10400411f56
+#
+sub format_guid
+{
+ my $guid = $_[0];
+ my $guid_str = "";
+
+ $guid =~ tr/[A-F]/[a-f]/;
+ if ($guid =~ /0x(.*)/) {
+ $guid_str = sprintf("0x%016s", $1);
+ } else {
+ $guid_str = sprintf("0x%016s", $guid);
+ }
+ return ($guid_str);
+}
+
+# =========================================================================
+# convert_dr_to_guid(direct_route)
+#
+sub convert_dr_to_guid
+{
+ my $guid = undef;
+
+ my $data = `smpquery nodeinfo -D $_[0]` ||
+ die "'smpquery nodeinfo -D $_[0]' failed\n";
+ my @lines = split("\n", $data);
+ foreach my $line (@lines) {
+ if ($line =~ /^PortGuid:\.+(.*)/) { $guid = $1; }
+ }
+ return format_guid($guid);
+}
+
+# =========================================================================
+# get_node_type(guid_or_direct_route)
+#
+sub get_node_type
+{
+ my $type = undef;
+ my $query_arg = "smpquery nodeinfo ";
+ if ($_[0] =~ /x/) {
+ # assume arg is a guid if contains an x
+ $query_arg .= "-G " . $_[0];
+ } else {
+ # assume arg is a direct path
+ $query_arg .= "-D " . $_[0];
+ }
+
+ my $data = `$query_arg` ||
+ die "'$query_arg' failed\n";
+ my @lines = split("\n", $data);
+ foreach my $line (@lines) {
+ if ($line =~ /^NodeType:\.+(.*)/) { $type = $1; }
+ }
+ return $type;
+}
+
+# =========================================================================
+# is_switch(guid_or_direct_route)
+#
+sub is_switch
+{
+ my $node_type = &get_node_type($_[0]);
+ return ($node_type =~ /Switch/);
+}
diff --git a/infiniband-diags/scripts/check_lft_balance.pl b/infiniband-diags/scripts/check_lft_balance.pl
new file mode 100755
index 0000000..cbe0690
--- /dev/null
+++ b/infiniband-diags/scripts/check_lft_balance.pl
@@ -0,0 +1,419 @@
+#!/usr/bin/perl
+#
+# Copyright (C) 2001-2003 The Regents of the University of California.
+# Copyright (c) 2006 The Regents of the University of California.
+# Copyright (c) 2007-2008 Voltaire, Inc. All rights reserved.
+#
+# Produced at Lawrence Livermore National Laboratory.
+# Written by Ira Weiny <weiny2@llnl.gov>
+# Jim Garlick <garlick@llnl.gov>
+# Albert Chu <chu11@llnl.gov>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# + +use strict; + +use Getopt::Std; + +my $ibnetdiscover_cache = ""; +my $dump_lft_file = ""; +my $verbose = 0; + +my $switch_lid = undef; +my $switch_guid = undef; +my $switch_name = undef; +my %switch_port_count = (); +my @switch_maybe_directly_connected_hosts = (); +my $host = undef; +my @host_ports = (); + +my @lft_lines = (); +my $lft_line; + +my $lids_per_port; +my $lids_per_port_calculated; + +my $heuristic_flag = 0; + +sub usage +{ + my $prog = `basename $0`; + + chomp($prog); + print "Usage: $prog -l lft-output -i ibnetdiscover-cache [-e] [-v]\n"; + print " Generate lft-output via \"dump_lfts.sh > lft-output\"\n"; + print " Generate ibnetdiscover-cache via \"ibnetdiscover --cache ibnetdiscover-cache\"\n"; + print " -e turn on heuristic(s) to look at switch balances deeper\n"; + print " -v verbose output, output all switches\n"; + exit 2; +} + +sub is_port_up +{ + my $iblinkinfo_output = $_[0]; + my $port = $_[1]; + my $decport; + my @lines; + my $line; + + $port =~ /0+(.+)/; + $decport = $1; + + # Add a space if necessary + if ($decport >= 1 && $decport <= 9) { + $decport = " $decport"; + } + + @lines = split("\n", $iblinkinfo_output); + foreach $line (@lines) { + if ($line =~ /$decport\[..\] ==/) { + if ($line =~ /Down/) { + return 0; + } + else { + return 1; + } + } + } + + # return 0 if not found + return 0; +} + +sub is_directly_connected +{ + my $iblinkinfo_output = $_[0]; + my $port = $_[1]; + my $decport; + my $str; + my $rv = 0; + my $host_tmp; + my @lines; + my $line; + + if (($switch_port_count{$port} != $lids_per_port) + || !(@switch_maybe_directly_connected_hosts)) + { + return $rv; + } + + $port =~ /0+(.+)/; + $decport = $1; + + # Add a space if necessary + if ($decport >= 1 && $decport <= 9) { + $decport = " $decport"; + } + + @lines = split("\n", $iblinkinfo_output); + foreach $line (@lines) { + if ($line =~ /$decport\[..\] ==/) { + $str = $line; + } + } + + if ($str =~ "Active") { + $str =~ +/[\d]+[\s]+[\d]+\[.+\] \=\=.+\=\=>[\s]+[\d]+[\s]+[\d]+\[.+\] \"(.+)\".+/; + for $host_tmp (@switch_maybe_directly_connected_hosts) { + if ($1 == $host_tmp) { + $rv = 1; + last; + } + } + } + + return $rv; +} + +sub output_switch_port_usage +{ + my $min_usage = 999999; + my $max_usage = 0; + my $min_usage2 = 999999; + my $max_usage2 = 0; + my @ports = ( + "001", "002", "003", "004", "005", "006", "007", "008", + "009", "010", "011", "012", "013", "014", "015", "016", + "017", "018", "019", "020", "021", "022", "023", "024", + "025", "026", "027", "028", "029", "030", "031", "032", + "033", "034", "035", "036" + ); + my @output_ports = (); + my @double_check_ports = (); + my $port; + my $iblinkinfo_output; + my $is_unbalanced = 0; + my $ports_on_switch = 0; + my $all_zero_flag = 1; + my $ret; + + $iblinkinfo_output = `iblinkinfo --load-cache $ibnetdiscover_cache -S $switch_guid`; + + for $port (@ports) { + if (!defined($switch_port_count{$port})) { + $switch_port_count{$port} = 0; + } + + if ($switch_port_count{$port} == 0) { + # If port is down, don't use it in this calculation + $ret = is_port_up($iblinkinfo_output, $port); + if ($ret == 0) { + next; + } + } + + $ports_on_switch++; + + # If port is directly connected to a node, don't use + # it in this calculation. 
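+ # (such a port carries only that one host's LIDs, so it
+ # would skew the min/max balance comparison)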
+		if (is_directly_connected($iblinkinfo_output, $port) == 1) {
+			next;
+		}
+
+		# Save off ports that should be output later
+		push(@output_ports, $port);
+
+		if ($switch_port_count{$port} < $min_usage) {
+			$min_usage = $switch_port_count{$port};
+		}
+		if ($switch_port_count{$port} > $max_usage) {
+			$max_usage = $switch_port_count{$port};
+		}
+	}
+
+	if ($max_usage > ($min_usage + 1)) {
+		$is_unbalanced = 1;
+	}
+
+	# In the event this is a switch line board, it will almost never be
+	# balanced. Half the ports go up to the spine, and the rest of the
+	# ports go down to HCAs. So we will do a special heuristic:
+	#
+	# If about 1/2 of the remaining ports are balanced, then we will
+	# consider the entire switch balanced.
+	#
+	# Also, we do this only if there are enough alive ports on the
+	# switch to care. I picked 12 somewhat randomly.
+	if ($heuristic_flag == 1
+		&& $is_unbalanced == 1
+		&& $ports_on_switch > 12) {
+
+		@double_check_ports = ();
+
+		for $port (@output_ports) {
+			if ($switch_port_count{$port} == $max_usage
+				|| $switch_port_count{$port} == ($max_usage - 1)
+				|| $switch_port_count{$port} == 0) {
+				next;
+			}
+
+			push(@double_check_ports, $port);
+		}
+
+		# we'll call half +/- 1 "about half"
+		if (@double_check_ports == int($ports_on_switch / 2)
+			|| @double_check_ports == int($ports_on_switch / 2) + 1
+			|| @double_check_ports == int($ports_on_switch / 2) - 1) {
+			for $port (@double_check_ports) {
+				if ($switch_port_count{$port} < $min_usage2) {
+					$min_usage2 = $switch_port_count{$port};
+				}
+				if ($switch_port_count{$port} > $max_usage2) {
+					$max_usage2 = $switch_port_count{$port};
+				}
+			}
+
+			if (!($max_usage2 > ($min_usage2 + 1))) {
+				$is_unbalanced = 0;
+			}
+		}
+	}
+
+	# Another special case is a non-fully-populated switch, where many
+	# ports will be zero. So if every active port other than those at
+	# max or max-1 is 0, we will also consider this balanced.
+	if ($heuristic_flag == 1
+		&& $is_unbalanced == 1
+		&& $ports_on_switch > 12) {
+
+		@double_check_ports = ();
+
+		for $port (@output_ports) {
+			if ($switch_port_count{$port} == $max_usage
+				|| $switch_port_count{$port} == ($max_usage - 1)) {
+				next;
+			}
+
+			push(@double_check_ports, $port);
+		}
+
+		for $port (@double_check_ports) {
+			if ($switch_port_count{$port} != 0) {
+				$all_zero_flag = 0;
+				last;
+			}
+		}
+
+		if ($all_zero_flag == 1) {
+			$is_unbalanced = 0;
+		}
+	}
+
+	if ($verbose || $is_unbalanced == 1) {
+		if ($is_unbalanced == 1) {
+			print "Unbalanced Switch Port Usage: ";
+			print "$switch_name, $switch_guid\n";
+		} else {
+			print
+			  "Switch Port Usage: $switch_name, $switch_guid\n";
+		}
+		for $port (@output_ports) {
+			print "Port $port: $switch_port_count{$port}\n";
+		}
+	}
+}
+
+sub process_host_ports
+{
+	my $test_port;
+	my $tmp;
+	my $flag = 0;
+
+	if (@host_ports == $lids_per_port) {
+		# Are all the host ports identical?
+		$test_port = $host_ports[0];
+		for $tmp (@host_ports) {
+			if ($tmp != $test_port) {
+				$flag = 1;
+				last;
+			}
+		}
+		# If all host ports are identical, maybe it's directly
+		# connected to a host.
+		if ($flag == 0) {
+			push(@switch_maybe_directly_connected_hosts, $host);
+		}
+	}
+}
+
+if (!getopts("hl:i:ve")) {
+	usage();
+}
+
+if (defined($main::opt_h)) {
+	usage();
+}
+
+if (defined($main::opt_l)) {
+	$dump_lft_file = $main::opt_l;
+} else {
+	print STDERR ("Must specify dump lfts file\n");
+	usage();
+	exit 1;
+}
+
+if (defined($main::opt_i)) {
+	$ibnetdiscover_cache = $main::opt_i;
+} else {
+	print STDERR ("Must specify ibnetdiscover cache\n");
+	usage();
+	exit 1;
+}
+
+if (defined($main::opt_v)) {
+	$verbose = 1;
+}
+
+if (defined($main::opt_e)) {
+	$heuristic_flag = 1;
+}
+
+if (!open(FH, "< $dump_lft_file")) {
+	print STDERR ("Couldn't open dump lfts file: $dump_lft_file: $!\n");
+	exit 1;
+}
+
+@lft_lines = <FH>;
+
+foreach $lft_line (@lft_lines) {
+	chomp($lft_line);
+	if ($lft_line =~ /Unicast/) {
+		if (@host_ports) {
+			process_host_ports();
+		}
+		if (defined($switch_name)) {
+			output_switch_port_usage();
+		}
+		if ($lft_line =~ /Unicast lids .+ of switch DR path slid .+ guid (.+) \((.+)\)/) {
+			$switch_guid = $1;
+			$switch_name = $2;
+		}
+		if ($lft_line =~ /Unicast lids .+ of switch Lid .+ guid (.+) \((.+)\)/) {
+			$switch_guid = $1;
+			$switch_name = $2;
+		}
+		@switch_maybe_directly_connected_hosts = ();
+		%switch_port_count = ();
+		@host_ports = ();
+		$lids_per_port = 0;
+		$lids_per_port_calculated = 0;
+	} elsif ($lft_line =~ /Channel/ || $lft_line =~ /Router/) {
+		$lft_line =~ /.+ (.+) : \(.+ portguid .+: '(.+)'\)/;
+		$host = $2;
+		$switch_port_count{$1}++;
+		if (@host_ports) {
+			process_host_ports();
+		}
+		@host_ports = ($1);
+
+		if ($lids_per_port == 0) {
+			$lids_per_port++;
+		} else {
+			$lids_per_port_calculated++;
+		}
+	} elsif ($lft_line =~ /path/) {
+		$lft_line =~ /.+ (.+) : \(path #. out of .: portguid .+\)/;
+		$switch_port_count{$1}++;
+		if ($lids_per_port_calculated == 0) {
+			$lids_per_port++;
+		}
+		push(@host_ports, $1);
+	} else {
+		if ($lids_per_port) {
+			$lids_per_port_calculated++;
+		}
+		next;
+	}
+}
+
+if (@host_ports) {
+	process_host_ports();
+}
+output_switch_port_usage();
diff --git a/infiniband-diags/scripts/dump_lfts.sh.in b/infiniband-diags/scripts/dump_lfts.sh.in
new file mode 100755
index 0000000..ac2c0fc
--- /dev/null
+++ b/infiniband-diags/scripts/dump_lfts.sh.in
@@ -0,0 +1,12 @@
+#!/bin/sh
+#
+# This simple script will collect outputs of ibroute for all switches
+# on the subnet and drop it on stdout. It can be used for LFTs dump
+# generation.
+#
+
+@IBSCRIPTPATH@/dump_fts $@
+echo ""
+echo "*** WARNING ***: this command has been replaced by dump_fts"
+echo ""
+echo ""
diff --git a/infiniband-diags/scripts/dump_mfts.sh.in b/infiniband-diags/scripts/dump_mfts.sh.in
new file mode 100755
index 0000000..652b567
--- /dev/null
+++ b/infiniband-diags/scripts/dump_mfts.sh.in
@@ -0,0 +1,12 @@
+#!/bin/sh
+#
+# This simple script will collect outputs of ibroute for all switches
+# on the subnet and drop it on stdout. It can be used for MFTs dump
+# generation.
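+#
+# Typical use, by analogy with dump_lfts.sh (illustrative invocation):
+#
+#	dump_mfts.sh > mfts-output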
+# + +@IBSCRIPTPATH@/dump_fts -M $@ +echo "" +echo "*** WARNING ***: this command has been replaced by dump_fts -M" +echo "" +echo "" diff --git a/infiniband-diags/scripts/ibcheckerrors.in b/infiniband-diags/scripts/ibcheckerrors.in new file mode 100644 index 0000000..6ebbe7f --- /dev/null +++ b/infiniband-diags/scripts/ibcheckerrors.in @@ -0,0 +1,134 @@ +#!/bin/sh + +IBPATH=${IBPATH:-@IBSCRIPTPATH@} + +usage() { + echo Usage: `basename $0` "[-h] [-b] [-v] [-N | -nocolor]"\ + "[<topology-file> | -C ca_name -P ca_port -t(imeout) timeout_ms]" + exit -1 +} + +user_abort() { + echo "Aborted" + exit 1 +} + +trap user_abort SIGINT + +gflags="" +verbose="" +brief="" +v=0 +ntype="" +nodeguid="" +topofile="" +ca_info="" + +while [ "$1" ]; do + case $1 in + -h) + usage + ;; + -N|-nocolor) + gflags=-N + ;; + -v) + verbose=-v + brief="" + v=1 + ;; + -b) + brief=-b + verbose="" + ;; + -P | -C | -t | -timeout) + case $2 in + -*) + usage + ;; + esac + if [ x$2 = x ] ; then + usage + fi + ca_info="$ca_info $1 $2" + shift + ;; + -*) + usage + ;; + *) + if [ "$topofile" ]; then + usage + fi + topofile="$1" + ;; + esac + shift +done + +if [ "$topofile" ]; then + netcmd="cat $topofile" +else + netcmd="$IBPATH/ibnetdiscover $ca_info" +fi + +text="`eval $netcmd`" +rv=$? +echo "$text" | awk ' +BEGIN { + ne=0 +} +function check_node(lid, port) +{ + if (system("'$IBPATH'/ibchecknode -S '"$ca_info"' '$gflags' '$verbose' " lid)) { + ne++ + print "\n# " ntype ": nodeguid 0x" nodeguid " failed" + return 1; + } + if (system("'$IBPATH'/ibcheckerrs -S '"$ca_info"' '$gflags' '$verbose' '$brief' " lid " " port)) + return 2; + return 0; +} + +/^Ca/ || /^Switch/ || /^Rt/ { + nnodes++ + ntype=$1; nodeguid=substr($3, 4, 16); ports=$2 + if ('$v') + print "\n# Checking " ntype ": nodeguid 0x" nodeguid + + err = 0; + if (ntype != "Switch") + next + + lid = substr($0, index($0, "port 0 lid ") + 11) + lid = substr(lid, 1, index(lid, " ") - 1) + err = check_node(lid, 255) + } +/^\[/ { + nports++ + port = $1 + sub("\\(.*\\)", "", port) + gsub("[\\[\\]]", "", port) + if (ntype != "Switch") { + lid = substr($0, index($0, " lid ") + 5) + lid = substr(lid, 1, index(lid, " ") - 1) + if (check_node(lid, port) == 2) + pcnterr++; + } else if (err && + system("'$IBPATH'/ibcheckerrs -S '"$ca_info"' '$gflags' '$verbose' '$brief' " lid " " port)) + pcnterr++; +} + +/^ib/ {print $0; next} +/ibpanic:/ {print $0} +/ibwarn:/ {print $0} +/iberror:/ {print $0} + +END { + printf "\n*** WARNING ***: this command is deprecated; Please use \"ibqueryerrors\"" + printf "\n## Summary: %d nodes checked, %d bad nodes found\n", nnodes, ne + printf "## %d ports checked, %d ports have errors beyond threshold\n", nports, pcnterr + exit (ne + pcnterr) +} +' +exit $rv diff --git a/infiniband-diags/scripts/ibcheckerrs.in b/infiniband-diags/scripts/ibcheckerrs.in new file mode 100644 index 0000000..1edd1eb --- /dev/null +++ b/infiniband-diags/scripts/ibcheckerrs.in @@ -0,0 +1,239 @@ +#!/bin/sh + +IBPATH=${IBPATH:-@IBSCRIPTPATH@} + +usage() { + echo Usage: `basename $0` "[-h] [-b] [-v] [-G] [-T <threshold_file>]" \ + "[-s(how_thresholds)] [-N \| -nocolor] [-C ca_name] [-P ca_port]" \ + "[-t(imeout) timeout_ms] <lid|guid> [<port>]" + exit -1 +} + +green() { + if [ "$bw" = "yes" ]; then + if [ "$verbose" = "yes" ]; then + echo $1 + fi + return + fi + if [ "$verbose" = "yes" ]; then + echo -e "\\033[1;032m" $1 "\\033[0;39m" + fi +} + +red() { + if [ "$bw" = "yes" ]; then + echo $1 + return + fi + echo -e "\\033[1;031m" $1 "\\033[0;39m" +} + +show_thresholds() { + echo 
"SymbolErrorCounter=$SymbolErrorCounter" + echo "LinkErrorRecoveryCounter=$LinkErrorRecoveryCounter" + echo "LinkDownedCounter=$LinkDownedCounter" + echo "PortRcvErrors=$PortRcvErrors" + echo "PortRcvRemotePhysicalErrors=$PortRcvRemotePhysicalErrors" + echo "PortRcvSwitchRelayErrors=$PortRcvSwitchRelayErrors" + echo "PortXmitDiscards=$PortXmitDiscards" + echo "PortXmitConstraintErrors=$PortXmitConstraintErrors" + echo "PortRcvConstraintErrors=$PortRcvConstraintErrors" + echo "LocalLinkIntegrityErrors=$LocalLinkIntegrityErrors" + echo "ExcessiveBufferOverrunErrors=$ExcessiveBufferOverrunErrors" + echo "VL15Dropped=$VL15Dropped" +} + +get_thresholds() { + . $1 +} + +# Default thresholds +SymbolErrorCounter=10 +LinkErrorRecoveryCounter=10 +LinkDownedCounter=10 +PortRcvErrors=10 +PortRcvRemotePhysicalErrors=100 +PortRcvSwitchRelayErrors=100 +PortXmitDiscards=100 +PortXmitConstraintErrors=100 +PortRcvConstraintErrors=100 +LocalLinkIntegrityErrors=10 +ExcessiveBufferOverrunErrors=10 +VL15Dropped=100 + +guid_addr="" +bw="" +verbose="" +brief="" +ca_info="" +suppress_deprecated="no" + +while [ "$1" ]; do + case $1 in + -G) + guid_addr=yes + ;; + -nocolor|-N) + bw=yes + ;; + -v) + verbose=yes + brief="" + ;; + -b) + brief=yes + verbose="" + ;; + -T) + if ! [ -r $2 ]; then + echo "Can't use threshold file '$2'" + usage + fi + get_thresholds $2 + shift + ;; + -s) + show_thresholds + exit 0 + ;; + -S) + suppress_deprecated="yes" + ;; + -P | -C | -t | -timeout) + case $2 in + -*) + usage + ;; + esac + if [ x$2 = x ] ; then + usage + fi + ca_info="$ca_info $1 $2" + shift + ;; + -*) + usage + ;; + *) + break + ;; + esac + shift +done + +#default is all ports +portnum=255 + +if [ $# -lt 1 ]; then + usage +fi + +if [ "$2" ]; then + portnum=$2 +fi + +if [ "$portnum" = "255" ]; then + portname="all" +else + portname=$2 +fi + +if [ "$suppress_deprecated" = "no" ]; then +echo "*** WARNING ***: this command is deprecated; Please use \"ibqueryerrors\"\n\n" 1>&2 +fi + +if [ "$guid_addr" ]; then + if ! lid=`$IBPATH/ibaddr $ca_info -G -L $1 | awk '/failed/{exit -1} {print $3}'`; then + echo -n "guid $1 address resolution: " + red "FAILED" + exit -1 + fi + guid=$1 +else + lid=$1 + if ! temp=`$IBPATH/ibaddr $ca_info -L $1 | awk '/failed/{exit -1} {print $1}'`; then + echo -n "lid $1 address resolution: " + red "FAILED" + exit -1 + fi +fi + +nodename=`$IBPATH/smpquery $ca_info nodedesc $lid | sed -e "s/^Node Description:\.*\(.*\)/\1/"` + +text="`eval $IBPATH/perfquery $ca_info $lid $portnum`" +rv=$? 
+if echo $text | grep -q 'AllPortSelect not supported'; then + if [ "$verbose" = "yes" ]; then + echo -n "Error check on lid $lid ($nodename) port $portname: " + green "AllPortSelect not supported" + fi + exit 0 +fi + +if echo "$text" | awk -v mono=$bw -v brief=$brief -F '[.:]*' ' +function blue(s) +{ + if (brief == "yes") { + return + } + if (mono) + printf s + else if (!quiet) { + printf "\033[1;034m" s + printf "\033[0;39m" + } +} + +BEGIN { + th["SymbolErrorCounter"] = '$SymbolErrorCounter' + th["LinkErrorRecoveryCounter"] = '$LinkErrorRecoveryCounter' + th["LinkDownedCounter"] = '$LinkDownedCounter' + th["PortRcvErrors"] = '$PortRcvErrors' + th["PortRcvRemotePhysicalErrors"] = '$PortRcvRemotePhysicalErrors' + th["PortRcvSwitchRelayErrors"] = '$PortRcvSwitchRelayErrors' + th["PortXmitDiscards"] = '$PortXmitDiscards' + th["PortXmitConstraintErrors"] = '$PortXmitConstraintErrors' + th["PortRcvConstraintErrors"] = '$PortRcvConstraintErrors' + th["LocalLinkIntegrityErrors"] = '$LocalLinkIntegrityErrors' + th["ExcessiveBufferOverrunErrors"] = '$ExcessiveBufferOverrunErrors' + th["VL15Dropped"] = '$VL15Dropped' +} + +/^CounterSelect/ {next} + +/AllPortSelect/ {next} + +/^ib/ {print $0; next} +/ibpanic:/ {print $0} +/ibwarn:/ {print $0} +/iberror:/ {print $0} + +/^PortSelect/ { if ($2 != '$portnum') {err = err "error: lid '$lid' port " $2 " does not match query ('$portnum')\n"; exit -1}} + +$1 ~ "(Xmt|Rcv)(Pkts|Data)" { next } + + { if (th[$1] > 0 && $2 >= th[$1]) + warn = warn "#warn: counter " $1 " = " $2 " \t(threshold " th[$1] ") lid '$lid' port '$portnum'\n" + } +END { + if (err != "") { + blue(err) + exit -1 + } + if (warn != "") { + blue(warn) + exit -1 + } + exit 0 +}' 2>&1 && test $rv -eq 0 ; then + if [ "$verbose" = "yes" ]; then + echo -n "Error check on lid $lid ($nodename) port $portname: " + green OK + fi + exit 0 +else + echo -n "Error check on lid $lid ($nodename) port $portname: " + red FAILED + exit -1 +fi diff --git a/infiniband-diags/scripts/ibchecknet.in b/infiniband-diags/scripts/ibchecknet.in new file mode 100644 index 0000000..3fd4ae7 --- /dev/null +++ b/infiniband-diags/scripts/ibchecknet.in @@ -0,0 +1,141 @@ +#!/bin/sh + +IBPATH=${IBPATH:-@IBSCRIPTPATH@} + +usage() { + echo Usage: `basename $0` "[-h] [-v] [-N | -nocolor]" \ + "[<topology-file> | -C ca_name -P ca_port -t(imeout) timeout_ms]" + exit -1 +} + +user_abort() { + echo "Aborted" + exit 1 +} + +trap user_abort SIGINT + +gflags="" +verbose="" +v=0 +oldlid="" +topofile="" +ca_info="" + +while [ "$1" ]; do + case $1 in + -h) + usage + ;; + -N|-nocolor) + gflags=-N + ;; + -v) + verbose=-v + v=0 + ;; + -P | -C | -t | -timeout) + case $2 in + -*) + usage + ;; + esac + if [ x$2 = x ] ; then + usage + fi + ca_info="$ca_info $1 $2" + shift + ;; + -*) + usage + ;; + *) + if [ "$topofile" ]; then + usage + fi + topofile="$1" + ;; + esac + shift +done + +if [ "$topofile" ]; then + netcmd="cat $topofile" +else + netcmd="$IBPATH/ibnetdiscover $ca_info" +fi + +text="`eval $netcmd`" +rv=$? 
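+
+# The awk program below walks the ibnetdiscover output (or the supplied
+# topology file): each "Ca"/"Switch"/"Rt" header selects a node, and
+# each "[port](...)" line runs ibchecknode, ibcheckerrs and ibcheckport
+# against the corresponding lid/port pair; failures are tallied for the
+# summary.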
+echo "$text" | awk ' +BEGIN { + ne=0 + pe=0 +} +function check_node(lid, port) +{ + if (system("'$IBPATH'/ibchecknode -S '"$ca_info"' '$gflags' '$verbose' " lid)) { + ne++ + print "\n# " ntype ": nodeguid 0x" nodeguid " failed" + return 1; + } + if (system("'$IBPATH'/ibcheckerrs -S '"$ca_info"' '$gflags' '$verbose' '$brief' " lid " " port)) + return 2; + return 0; +} + +/^Ca/ || /^Switch/ || /^Rt/ { + nnodes++ + ntype=$1; nodeguid=substr($3, 4, 16); ports=$2 + if ('$v' || ntype != "Switch") + print "\n# Checking " ntype ": nodeguid 0x" nodeguid + + err = 0; + if (ntype != "Switch") + next + + lid = substr($0, index($0, "port 0 lid ") + 11) + lid = substr(lid, 1, index(lid, " ") - 1) + err = check_node(lid, 255) + } +/^\[/ { + nports++ + port = $1 + sub("\\(.*\\)", "", port) + gsub("[\\[\\]]", "", port) + if (ntype != "Switch") { + lid = substr($0, index($0, " lid ") + 5) + lid = substr(lid, 1, index(lid, " ") - 1) + if (check_node(lid, port) == 2) + pcnterr++; + } else if (err && + system("'$IBPATH'/ibcheckerrs -S '"$ca_info"' '$gflags' '$verbose' '$brief' " lid " " port)) + pcnterr++; + if (system("'$IBPATH'/ibcheckport -S '"$ca_info"' '$gflags' '$verbose' " lid " " port)) { + if (!'$v' && oldlid != lid) { + print "# Checked " ntype ": nodeguid 0x" nodeguid " with failure" + oldlid = lid + } + pe++; + } +} + +/^ib/ {print $0; next} +/ibpanic:/ {print $0} +/ibwarn:/ {print $0} +/iberror:/ {print $0} + +END { + printf "\n*** WARNING ***: this command is deprecated; Please use \"ibqueryerrors -f\"" + printf "\n## Summary: %d nodes checked, %d bad nodes found\n", nnodes, ne + printf "## %d ports checked, %d bad ports found\n", nports, pe + printf "## %d ports have errors beyond threshold\n", pcnterr + exit (ne + pe + pcnterr) +} +' +av=$? +if [ $av -ne 0 ] ; then + exit $av +else + exit $rv +fi diff --git a/infiniband-diags/scripts/ibchecknode.in b/infiniband-diags/scripts/ibchecknode.in new file mode 100644 index 0000000..d70d5c8 --- /dev/null +++ b/infiniband-diags/scripts/ibchecknode.in @@ -0,0 +1,108 @@ +#!/bin/sh + +IBPATH=${IBPATH:-@IBSCRIPTPATH@} + +usage() { + echo Usage: `basename $0` "[-h] [-v] [-N | -nocolor] [-G]" \ + "[-C ca_name] [-P ca_port] [-t(imeout) timeout_ms] <lid|guid>" + exit -1 +} + +green() { + if [ "$bw" = "yes" ]; then + if [ "$verbose" = "yes" ]; then + echo $1 + fi + return + fi + if [ "$verbose" = "yes" ]; then + echo -e "\\033[1;032m" $1 "\\033[0;39m" + fi +} + +red() { + if [ "$bw" = "yes" ]; then + echo $1 + return + fi + echo -e "\\033[1;031m" $1 "\\033[0;39m" +} + +guid_addr="" +bw="" +verbose="" +ca_info="" +suppress_deprecated="no" + +while [ "$1" ]; do + case $1 in + -G) + guid_addr=yes + ;; + -nocolor|-N) + bw=yes + ;; + -v) + verbose=yes + ;; + -P | -C | -t | -timeout) + case $2 in + -*) + usage + ;; + esac + if [ x$2 = x ] ; then + usage + fi + ca_info="$ca_info $1 $2" + shift + ;; + -S) + suppress_deprecated="yes" + ;; + -*) + usage + ;; + *) + break + ;; + esac + shift +done + +if [ -z "$1" ]; then + usage +fi + +if [ "$suppress_deprecated" = "no" ]; then +echo "*** WARNING ***: this command is deprecated; Please use \"smpquery nodeinfo\"\n\n" 1>&2 +fi + +if [ "$guid_addr" ]; then + if ! lid=`$IBPATH/ibaddr $ca_info -G -L $1 | awk '/failed/{exit -1} {print $3}'`; then + echo -n "guid $1 address resolution: " + red "FAILED" + exit -1 + fi +else + lid=$1 + if ! 
temp=`$IBPATH/ibaddr $ca_info -L $1 | awk '/failed/{exit -1} {print $1}'`; then + echo -n "lid $1 address resolution: " + red "FAILED" + exit -1 + fi +fi + +## For now, check node only checks if node info is replied + +if $IBPATH/smpquery $ca_info nodeinfo $lid > /dev/null 2>&1 ; then + if [ "$verbose" = "yes" ]; then + echo -n "Node check lid $lid: " + green OK + fi + exit 0 +else + echo -n "Node check lid $lid: " + red FAILED + exit -1 +fi diff --git a/infiniband-diags/scripts/ibcheckport.in b/infiniband-diags/scripts/ibcheckport.in new file mode 100644 index 0000000..16a8c24 --- /dev/null +++ b/infiniband-diags/scripts/ibcheckport.in @@ -0,0 +1,157 @@ +#!/bin/sh + +IBPATH=${IBPATH:-@IBSCRIPTPATH@} + +usage() { + echo Usage: `basename $0` "[-h] [-v] [-N | -nocolor] [-G]" \ + "[-C ca_name] [-P ca_port] [-t(imeout) timeout_ms] <lid|guid> <port>" + exit -1 +} + +green() { + if [ "$bw" = "yes" ]; then + if [ "$verbose" = "yes" ]; then + echo $1 + fi + return + fi + if [ "$verbose" = "yes" ]; then + echo -e "\\033[1;032m" $1 "\\033[0;39m" + fi +} + +red() { + if [ "$bw" = "yes" ]; then + echo $1 + return + fi + echo -e "\\033[1;031m" $1 "\\033[0;39m" +} + +guid_addr="" +bw="" +verbose="" +ca_info="" +suppress_deprecated="no" + +while [ "$1" ]; do + case $1 in + -G) + guid_addr=yes + ;; + -nocolor|-N) + bw=yes + ;; + -v) + verbose=yes + ;; + -P | -C | -t | -timeout) + case $2 in + -*) + usage + ;; + esac + if [ x$2 = x ] ; then + usage + fi + ca_info="$ca_info $1 $2" + shift + ;; + -S) + suppress_deprecated="yes" + ;; + -*) + usage + ;; + *) + break + ;; + esac + shift +done + +if [ $# -lt 2 ]; then + usage +fi + +portnum=$2 + +if [ "$suppress_deprecated" = "no" ]; then +echo "*** WARNING ***: this command is deprecated\n\n" 1>&2 +fi + +if [ "$guid_addr" ]; then + if ! lid=`$IBPATH/ibaddr $ca_info -G -L $1 | awk '/failed/{exit -1} {print $3}'`; then + echo -n "guid $1 address resolution: " + red "FAILED" + exit -1 + fi + guid=$1 +else + lid=$1 + if ! temp=`$IBPATH/ibaddr $ca_info -L $1 | awk '/failed/{exit -1} {print $1}'`; then + echo -n "lid $1 address resolution: " + red "FAILED" + exit -1 + fi +fi + +is_switch=`$IBPATH/smpquery $ca_info nodeinfo $lid $portnum | awk -F '[.:]*' '/^NodeType/{ if ($2 == "Switch") {print 1}}'` + +if [ "$is_switch" -a "$portnum" == "0" ]; then + ignore_check=true +fi + +text="`eval $IBPATH/smpquery $ca_info portinfo $lid $portnum`" +rv=$? 
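+
+# smpquery portinfo prints one "Field:.....value" line per PortInfo
+# field. In the checks below a PhysLinkState other than "LinkUp" is an
+# error, while a LinkState other than "Active" or a link trained at 1X
+# only produces a warning.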
+if echo "$text" | awk -v ignore_check=$ignore_check -v mono=$bw -F '[.:]*' ' +function blue(s) +{ + if (mono) + printf s + else if (!quiet) { + printf "\033[1;034m" s + printf "\033[0;39m" + } +} + +# Checks + +/^PhysLinkState/{ if ($2 != "LinkUp") {err = err "#error: Physical link state is " $2 " lid '$lid' port '$portnum'\n"; exit -1}} + +/^LinkState/{ if ($2 != "Active") warn = warn "#warn: Logical link state is " $2 " lid '$lid' port '$portnum'\n"} + +/^LinkWidthActive/{ if ($2 == "1X") warn = warn "#warn: Link configured as 1X lid '$lid' port '$portnum'\n"} + +/^Lid/{ if (ignore_check == "0" && $2 == "0") warn = warn "#warn: Lid is not configured lid '$lid' port '$portnum'\n"} + +/^SMLid/{ if (ignore_check == "0" && $2 == "0") warn = warn "#warn: SM Lid is not configured\n"} + +#/^LocalPort/ { if ($2 != '$portnum') {err = err "#error: port " $2 " does not match query ('$portnum')\n"; exit -1}} + +/^ib/ {print $0; next} +/ibpanic:/ {print $0} +/ibwarn:/ {print $0} +/iberror:/ {print $0} + +END { + if (err != "") { + blue(err) + exit -1 + } + if (warn != "") { + blue(warn) + exit -1 + } + exit 0 +}' 2>&1 && test $rv -eq 0 ; then + if [ "$verbose" = "yes" ]; then + echo -n "Port check lid $lid port $portnum: " + green "OK" + fi + exit 0 +else + echo -n "Port check lid $lid port $portnum: " + red "FAILED" + exit -1 +fi diff --git a/infiniband-diags/scripts/ibcheckportstate.in b/infiniband-diags/scripts/ibcheckportstate.in new file mode 100644 index 0000000..c5fb948 --- /dev/null +++ b/infiniband-diags/scripts/ibcheckportstate.in @@ -0,0 +1,144 @@ +#!/bin/sh + +IBPATH=${IBPATH:-@IBSCRIPTPATH@} + +usage() { + echo Usage: `basename $0` "[-h] [-v] [-N | -nocolor] [-G]" \ + "[-C ca_name] [-P ca_port] [-t(imeout) timeout_ms] <lid|guid> <port>" + exit -1 +} + +green() { + if [ "$bw" = "yes" ]; then + if [ "$verbose" = "yes" ]; then + echo $1 + fi + return + fi + if [ "$verbose" = "yes" ]; then + echo -e "\\033[1;032m" $1 "\\033[0;39m" + fi +} + +red() { + if [ "$bw" = "yes" ]; then + echo $1 + return + fi + echo -e "\\033[1;031m" $1 "\\033[0;39m" +} + +guid_addr="" +bw="" +verbose="" +ca_info="" +suppress_deprecated="no" + +while [ "$1" ]; do + case $1 in + -G) + guid_addr=yes + ;; + -nocolor|-N) + bw=yes + ;; + -v) + verbose=yes + ;; + -S) + suppress_deprecated="yes" + ;; + -P | -C | -t | -timeout) + case $2 in + -*) + usage + ;; + esac + if [ x$2 = x ] ; then + usage + fi + ca_info="$ca_info $1 $2" + shift + ;; + -*) + usage + ;; + *) + break + ;; + esac + shift +done + +if [ $# -lt 2 ]; then + usage +fi + +portnum=$2 + +if [ "$suppress_deprecated" = "no" ]; then +echo "*** WARNING ***: this command is deprecated\n\n" 1>&2 +fi + +if [ "$guid_addr" ]; then + if ! lid=`$IBPATH/ibaddr $ca_info -G -L $1 | awk '/failed/{exit -1} {print $3}'`; then + echo -n "guid $1 address resolution: " + red "FAILED" + exit -1 + fi + guid=$1 +else + lid=$1 + if ! temp=`$IBPATH/ibaddr $ca_info -L $1 | awk '/failed/{exit -1} {print $1}'`; then + echo -n "lid $1 address resolution: " + red "FAILED" + exit -1 + fi +fi + + +text="`eval $IBPATH/smpquery $ca_info portinfo $lid $portnum`" +rv=$? 
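+
+# Unlike ibcheckport above, only the physical and logical link states
+# are examined here; width and LID sanity are left to the other checks.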
+if echo "$text" | awk -v mono=$bw -F '[.:]*' ' +function blue(s) +{ + if (mono) + printf s + else if (!quiet) { + printf "\033[1;034m" s + printf "\033[0;39m" + } +} + +# Only check PortPhysicalState and PortState + +/^PhysLinkState/{ if ($2 != "LinkUp") {err = err "#error: Physical link state is " $2 " lid '$lid' port '$portnum'\n"; exit -1}} + +/^LinkState/{ if ($2 != "Active") warn = warn "#warn: Logical link state is " $2 " lid '$lid' port '$portnum'\n"} + +/^ib/ {print $0; next} +/ibpanic:/ {print $0} +/ibwarn:/ {print $0} +/iberror:/ {print $0} + +END { + if (err != "") { + blue(err) + exit -1 + } + if (warn != "") { + blue(warn) + exit -1 + } + exit 0 +}' 2>&1 && test $rv -eq 0 ; then + if [ "$verbose" = "yes" ]; then + echo -n "Port check lid $lid port $portnum: " + green "OK" + fi + exit 0 +else + echo -n "Port check lid $lid port $portnum: " + red "FAILED" + exit -1 +fi diff --git a/infiniband-diags/scripts/ibcheckportwidth.in b/infiniband-diags/scripts/ibcheckportwidth.in new file mode 100644 index 0000000..9cfca11 --- /dev/null +++ b/infiniband-diags/scripts/ibcheckportwidth.in @@ -0,0 +1,142 @@ +#!/bin/sh + +IBPATH=${IBPATH:-@IBSCRIPTPATH@} + +usage() { + echo Usage: `basename $0` "[-h] [-v] [-N | -nocolor] [-G]" \ + "[-C ca_name] [-P ca_port] [-t(imeout) timeout_ms] <lid|guid> <port>" + exit -1 +} + +green() { + if [ "$bw" = "yes" ]; then + if [ "$verbose" = "yes" ]; then + echo $1 + fi + return + fi + if [ "$verbose" = "yes" ]; then + echo -e "\\033[1;032m" $1 "\\033[0;39m" + fi +} + +red() { + if [ "$bw" = "yes" ]; then + echo $1 + return + fi + echo -e "\\033[1;031m" $1 "\\033[0;39m" +} + +guid_addr="" +bw="" +verbose="" +ca_info="" +suppress_deprecated="no" + +while [ "$1" ]; do + case $1 in + -G) + guid_addr=yes + ;; + -nocolor|-N) + bw=yes + ;; + -v) + verbose=yes + ;; + -S) + suppress_deprecated="yes" + ;; + -P | -C | -t | -timeout) + case $2 in + -*) + usage + ;; + esac + if [ x$2 = x ] ; then + usage + fi + ca_info="$ca_info $1 $2" + shift + ;; + -*) + usage + ;; + *) + break + ;; + esac + shift +done + +if [ $# -lt 2 ]; then + usage +fi + +portnum=$2 + +if [ "$suppress_deprecated" = "no" ]; then +echo "*** WARNING ***: this command is deprecated\n\n" 1>&2 +fi + +if [ "$guid_addr" ]; then + if ! lid=`$IBPATH/ibaddr $ca_info -G -L $1 | awk '/failed/{exit -1} {print $3}'`; then + echo -n "guid $1 address resolution: " + red "FAILED" + exit -1 + fi + guid=$1 +else + lid=$1 + if ! temp=`$IBPATH/ibaddr $ca_info -L $1 | awk '/failed/{exit -1} {print $1}'`; then + echo -n "lid $1 address resolution: " + red "FAILED" + exit -1 + fi +fi + + +text="`eval $IBPATH/smpquery $ca_info portinfo $lid $portnum`" +rv=$? 
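+
+# Only links that trained at 1X while the port supports wider widths
+# are flagged: a LinkWidthSupported of plain "1X" exits early, so
+# 1X-only hardware is never reported.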
+if echo "$text" | awk -v mono=$bw -F '[.:]*' ' +function blue(s) +{ + if (mono) + printf s + else if (!quiet) { + printf "\033[1;034m" s + printf "\033[0;39m" + } +} + +# Only check LinkWidthActive if LinkWidthSupported is not 1X +/^LinkWidthSupported/{ if ($2 == "1X") { exit } } +/^LinkWidthActive/{ if ($2 == "1X") warn = warn "#warn: Link configured as 1X lid '$lid' port '$portnum'\n"} + +/^ib/ {print $0; next} +/ibpanic:/ {print $0} +/ibwarn:/ {print $0} +/iberror:/ {print $0} + +END { + if (err != "") { + blue(err) + exit -1 + } + if (warn != "") { + blue(warn) + exit -1 + } + exit 0 +}' 2>&1 && test $rv -eq 0 ; then + if [ "$verbose" = "yes" ]; then + echo -n "Port check lid $lid port $portnum: " + green "OK" + fi + exit 0 +else + echo -n "Port check lid $lid port $portnum: " + red "FAILED" + exit -1 +fi diff --git a/infiniband-diags/scripts/ibcheckstate.in b/infiniband-diags/scripts/ibcheckstate.in new file mode 100644 index 0000000..ce89df5 --- /dev/null +++ b/infiniband-diags/scripts/ibcheckstate.in @@ -0,0 +1,137 @@ +#!/bin/sh + +IBPATH=${IBPATH:-@IBSCRIPTPATH@} + +usage() { + echo Usage: `basename $0` "[-h] [-v] [-N | -nocolor]" \ + "[<topology-file> | -C ca_name -P ca_port -t(imeout) timeout_ms]" + exit -1 +} + +user_abort() { + echo "Aborted" + exit 1 +} + +trap user_abort SIGINT + +gflags="" +verbose="" +v=0 +ntype="" +nodeguid="" +oldlid="" +topofile="" +ca_info="" + +while [ "$1" ]; do + case $1 in + -h) + usage + ;; + -N|-nocolor) + gflags=-N + ;; + -v) + verbose=-v + v=1 + ;; + -P | -C | -t | -timeout) + case $2 in + -*) + usage + ;; + esac + if [ x$2 = x ] ; then + usage + fi + ca_info="$ca_info $1 $2" + shift + ;; + -*) + usage + ;; + *) + if [ "$topofile" ]; then + usage + fi + topofile="$1" + ;; + esac + shift +done + +if [ "$topofile" ]; then + netcmd="cat $topofile" +else + netcmd="$IBPATH/ibnetdiscover $ca_info" +fi + +text="`eval $netcmd`" +rv=$? 
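+
+# Same ibnetdiscover walk as the other ibcheck* wrappers: each node is
+# probed once with ibchecknode, then every port is tested with
+# ibcheckportstate, and bad nodes/ports are counted for the summary.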
+echo "$text" | awk ' +BEGIN { + ne=0 + pe=0 +} +function check_node(lid) +{ + nodechecked=1 + if (system("'$IBPATH'/ibchecknode -S '"$ca_info"' '$gflags' '$verbose' " lid)) { + ne++ + badnode=1 + return + } +} + + +/^Ca/ || /^Switch/ || /^Rt/ { + nnodes++ + ntype=$1; nodeguid=substr($3, 4, 16); ports=$2 + if ('$v') + print "\n# Checking " ntype ": nodeguid 0x" nodeguid + + nodechecked=0 + badnode=0 + if (ntype != "Switch") + next + + lid = substr($0, index($0, "port 0 lid ") + 11) + lid = substr(lid, 1, index(lid, " ") - 1) + check_node(lid) + } +/^\[/ { + nports++ + port = $1 + if (!nodechecked) { + lid = substr($0, index($0, " lid ") + 5) + lid = substr(lid, 1, index(lid, " ") - 1) + check_node(lid) + } + if (badnode) { + print "\n# " ntype ": nodeguid 0x" nodeguid " failed" + next + } + sub("\\(.*\\)", "", port) + gsub("[\\[\\]]", "", port) + if (system("'$IBPATH'/ibcheckportstate -S '"$ca_info"' '$gflags' '$verbose' " lid " " port)) { + if (!'$v' && oldlid != lid) { + print "# Checked " ntype ": nodeguid 0x" nodeguid " with failure" + oldlid = lid + } + pe++; + } +} + +/^ib/ {print $0; next} +/ibpanic:/ {print $0} +/ibwarn:/ {print $0} +/iberror:/ {print $0} + +END { + printf "\n*** WARNING ***: this command is deprecated\n" + printf "\n## Summary: %d nodes checked, %d bad nodes found\n", nnodes, ne + printf "## %d ports checked, %d ports with bad state found\n", nports, pe +} +' +exit $rv diff --git a/infiniband-diags/scripts/ibcheckwidth.in b/infiniband-diags/scripts/ibcheckwidth.in new file mode 100644 index 0000000..c8a08ee --- /dev/null +++ b/infiniband-diags/scripts/ibcheckwidth.in @@ -0,0 +1,136 @@ +#!/bin/sh + +IBPATH=${IBPATH:-@IBSCRIPTPATH@} + +usage() { + echo Usage: `basename $0` "[-h] [-v] [-N | -nocolor]" \ + "[<topology-file> \| -C ca_name -P ca_port -t(imeout) timeout_ms]" + exit -1 +} + +user_abort() { + echo "Aborted" + exit 1 +} + +trap user_abort SIGINT + +gflags="" +verbose="" +v=0 +ntype="" +nodeguid="" +oldlid="" +topofile="" +ca_info="" + +while [ "$1" ]; do + case $1 in + -h) + usage + ;; + -N|-nocolor) + gflags=-N + ;; + -v) + verbose="-v" + v=1 + ;; + -P | -C | -t | -timeout) + case $2 in + -*) + usage + ;; + esac + if [ x$2 = x ] ; then + usage + fi + ca_info="$ca_info $1 $2" + shift + ;; + -*) + usage + ;; + *) + if [ "$topofile" ]; then + usage + fi + topofile="$1" + ;; + esac + shift +done + +if [ "$topofile" ]; then + netcmd="cat $topofile" +else + netcmd="$IBPATH/ibnetdiscover $ca_info" +fi + +text="`eval $netcmd`" +rv=$? 
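+
+# Identical traversal to ibcheckstate, but each port is tested with
+# ibcheckportwidth, so the summary counts links running at 1X width.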
+echo "$text" | awk ' +BEGIN { + ne=0 + pe=0 +} +function check_node(lid) +{ + nodechecked=1 + if (system("'$IBPATH'/ibchecknode -S '"$ca_info"' '$gflags' '$verbose' " lid)) { + ne++ + badnode=1 + return + } +} + +/^Ca/ || /^Switch/ || /^Rt/ { + nnodes++ + ntype=$1; nodeguid=substr($3, 4, 16); ports=$2 + if ('$v') + print "\n# Checking " ntype ": nodeguid 0x" nodeguid + + nodechecked=0 + badnode=0 + if (ntype != "Switch") + next + + lid = substr($0, index($0, "port 0 lid ") + 11) + lid = substr(lid, 1, index(lid, " ") - 1) + check_node(lid) + } +/^\[/ { + nports++ + port = $1 + if (!nodechecked) { + lid = substr($0, index($0, " lid ") + 5) + lid = substr(lid, 1, index(lid, " ") - 1) + check_node(lid) + } + if (badnode) { + print "\n# " ntype ": nodeguid 0x" nodeguid " failed" + next + } + sub("\\(.*\\)", "", port) + gsub("[\\[\\]]", "", port) + if (system("'$IBPATH'/ibcheckportwidth -S '"$ca_info"' '$gflags' '$verbose' " lid " " port)) { + if (!'$v' && oldlid != lid) { + print "# Checked " ntype ": nodeguid 0x" nodeguid " with failure" + oldlid = lid + } + pe++; + } +} + +/^ib/ {print $0; next} +/ibpanic:/ {print $0} +/ibwarn:/ {print $0} +/iberror:/ {print $0} + +END { + printf "\n*** WARNING ***: this command is deprecated\n" + printf "\n## Summary: %d nodes checked, %d bad nodes found\n", nnodes, ne + printf "## %d ports checked, %d ports with 1x width in error found\n", nports, pe +} +' +exit $rv diff --git a/infiniband-diags/scripts/ibclearcounters.in b/infiniband-diags/scripts/ibclearcounters.in new file mode 100644 index 0000000..789e02e --- /dev/null +++ b/infiniband-diags/scripts/ibclearcounters.in @@ -0,0 +1,108 @@ +#!/bin/sh + +IBPATH=${IBPATH:-@IBSCRIPTPATH@} + +usage() { + echo Usage: `basename $0` "[-h] [<topology-file>" \ + "| -C ca_name -P ca_port -t(imeout) timeout_ms]" + exit -1 +} + +user_abort() { + echo "Aborted" + exit 1 +} + +trap user_abort SIGINT + +gflags="" +verbose="" +v=0 +topofile="" +ca_info="" + +while [ "$1" ]; do + case $1 in + -h) + usage + ;; + -P | -C | -t | -timeout) + case $2 in + -*) + usage + ;; + esac + if [ x$2 = x ] ; then + usage + fi + ca_info="$ca_info $1 $2" + shift + ;; + -*) + usage + ;; + *) + if [ "$topofile" ]; then + usage + fi + topofile="$1" + ;; + esac + shift +done + +if [ "$topofile" ]; then + netcmd="cat $topofile" +else + netcmd="$IBPATH/ibnetdiscover $ca_info" +fi + +text="`eval $netcmd`" +rv=$? 
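+
+# perfquery -R resets the counters it reads: "-R -a <lid>" clears a
+# switch's ports in one query (AllPortSelect), while "-R <lid> <port>"
+# clears a single CA or router port.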
+echo "$text" | awk ' + +function clear_counters(lid) +{ + if (system("'$IBPATH'/perfquery'"$ca_info"' '$gflags' -R -a " lid)) + nodeerr++ +} + +function clear_port_counters(lid, port) +{ + if (system("'$IBPATH'/perfquery'"$ca_info"' '$gflags' -R " lid " " port)) + nodeerr++ +} + +/^Ca/ || /^Switch/ || /^Rt/ { + nnodes++ + ntype=$1; nodeguid=substr($3, 4, 16); ports=$2 + if (ntype != "Switch") + next + + lid = substr($0, index($0, "port 0 lid ") + 11) + lid = substr(lid, 1, index(lid, " ") - 1) + clear_counters(lid) + } + +/^\[/ { + port = $1 + sub("\\(.*\\)", "", port) + gsub("[\\[\\]]", "", port) + if (ntype != "Switch") { + lid = substr($0, index($0, " lid ") + 5) + lid = substr(lid, 1, index(lid, " ") - 1) + clear_port_counters(lid, port) + } + } + +/^ib/ {print $0; next} +/ibpanic:/ {print $0} +/ibwarn:/ {print $0} +/iberror:/ {print $0} + +END { + printf "\n*** WARNING ***: this command is deprecated; Please use \"ibqueryerrors -K\"\n" + printf "\n## Summary: %d nodes cleared %d errors\n", nnodes, nodeerr +} +' +exit $rv diff --git a/infiniband-diags/scripts/ibclearerrors.in b/infiniband-diags/scripts/ibclearerrors.in new file mode 100644 index 0000000..ff7a452 --- /dev/null +++ b/infiniband-diags/scripts/ibclearerrors.in @@ -0,0 +1,112 @@ +#!/bin/sh + +IBPATH=${IBPATH:-@IBSCRIPTPATH@} + +usage() { + echo Usage: `basename $0` "[-h] [-N | -nocolor] [<topology-file>" \ + "| -C ca_name -P ca_port -t(imeout) timeout_ms]" + exit -1 +} + +user_abort() { + echo "Aborted" + exit 1 +} + +trap user_abort SIGINT + +gflags="" +verbose="" +v=0 +oldlid="" +topofile="" +ca_info="" + +while [ "$1" ]; do + case $1 in + -h) + usage + ;; + -N|-nocolor) + gflags=-N + ;; + -P | -C | -t | -timeout) + case $2 in + -*) + usage + ;; + esac + if [ x$2 = x ] ; then + usage + fi + ca_info="$ca_info $1 $2" + shift + ;; + -*) + usage + ;; + *) + if [ "$topofile" ]; then + usage + fi + topofile="$1" + ;; + esac + shift +done + +if [ "$topofile" ]; then + netcmd="cat $topofile" +else + netcmd="$IBPATH/ibnetdiscover $ca_info" +fi + +text="`eval $netcmd`" +rv=$? 
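+
+# The trailing 0x0fff is the PortCounters CounterSelect mask: bits 0-11
+# select the twelve error counters (SymbolErrorCounter through
+# VL15Dropped), leaving the data and packet counters untouched.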
+echo "$text" | awk ' + +function clear_all_errors(lid, port) +{ + if (system("'$IBPATH'/perfquery'"$ca_info"' '$gflags' -R -a " lid " " port " 0x0fff")) + nodeerr++ +} + +function clear_errors(lid, port) +{ + if (system("'$IBPATH'/perfquery'"$ca_info"' '$gflags' -R " lid " " port " 0x0fff")) + nodeerr++ +} + +/^Ca/ || /^Switch/ || /^Rt/ { + nnodes++ + ntype=$1; nodeguid=substr($3, 4, 16); ports=$2 + if (ntype != "Switch") + next + + lid = substr($0, index($0, "port 0 lid ") + 11) + lid = substr(lid, 1, index(lid, " ") - 1) + clear_all_errors(lid, 255) + } + +/^\[/ { + port = $1 + sub("\\(.*\\)", "", port) + gsub("[\\[\\]]", "", port) + if (ntype != "Switch") { + lid = substr($0, index($0, " lid ") + 5) + lid = substr(lid, 1, index(lid, " ") - 1) + clear_errors(lid, port) + } + } + +/^ib/ {print $0; next} +/ibpanic:/ {print $0} +/ibwarn:/ {print $0} +/iberror:/ {print $0} + +END { + printf "\n*** WARNING ***: this command is deprecated; Please use \"ibqueryerrors -k\"\n" + printf "\n## Summary: %d nodes cleared %d errors\n", nnodes, nodeerr +} +' +exit $rv diff --git a/infiniband-diags/scripts/ibdatacounters.in b/infiniband-diags/scripts/ibdatacounters.in new file mode 100644 index 0000000..6f0389e --- /dev/null +++ b/infiniband-diags/scripts/ibdatacounters.in @@ -0,0 +1,131 @@ +#!/bin/sh + +IBPATH=${IBPATH:-@IBSCRIPTPATH@} + +usage() { + echo Usage: `basename $0` "[-h] [-b] [-v] [-N | -nocolor]" \ + "[<topology-file> \| -C ca_name -P ca_port -t(imeout) timeout_ms]" + exit -1 +} + +user_abort() { + echo "Aborted" + exit 1 +} + +trap user_abort SIGINT + +gflags="" +verbose="" +brief="" +v=0 +ntype="" +nodeguid="" +topofile="" +ca_info="" + +while [ "$1" ]; do + case $1 in + -h) + usage + ;; + -N|-nocolor) + gflags=-N + ;; + -v) + verbose=-v + brief="" + v=1 + ;; + -b) + brief=-b + verbose="" + ;; + -P | -C | -t | -timeout) + case $2 in + -*) + usage + ;; + esac + if [ x$2 = x ] ; then + usage + fi + ca_info="$ca_info $1 $2" + shift + ;; + -*) + usage + ;; + *) + if [ "$topofile" ]; then + usage + fi + topofile="$1" + ;; + esac + shift +done + +if [ "$topofile" ]; then + netcmd="cat $topofile" +else + netcmd="$IBPATH/ibnetdiscover $ca_info" +fi + +text="`eval $netcmd`" +rv=$? 
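+
+# Same walk as ibcheckerrors, but the per-port worker is ibdatacounts,
+# which prints only the transmit/receive data and packet counters.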
+echo "$text" | awk ' +BEGIN { + ne=0 +} +function check_node(lid, port) +{ + if (system("'$IBPATH'/ibchecknode -S '"$ca_info"' '$gflags' '$verbose' " lid)) { + ne++ + print "\n# " ntype ": nodeguid 0x" nodeguid " failed" + return 1; + } + return system("'$IBPATH'/ibdatacounts -S '"$ca_info"' '$gflags' '$verbose' '$brief' " lid " " port); +} + +/^Ca/ || /^Switch/ || /^Rt/ { + nnodes++ + ntype=$1; nodeguid=substr($3, 4, 16); ports=$2 + if ('$v') + print "\n# Checking " ntype ": nodeguid 0x" nodeguid + + err = 0; + if (ntype != "Switch") + next + + lid = substr($0, index($0, "port 0 lid ") + 11) + lid = substr(lid, 1, index(lid, " ") - 1) + err = check_node(lid, 255) + } +/^\[/ { + nports++ + port = $1 + sub("\\(.*\\)", "", port) + gsub("[\\[\\]]", "", port) + if (ntype != "Switch") { + lid = substr($0, index($0, " lid ") + 5) + lid = substr(lid, 1, index(lid, " ") - 1) + check_node(lid, port) + } else if (err) + system("'$IBPATH'/ibdatacounts -S '"$ca_info"' '$gflags' '$verbose' '$brief' " lid " " port); +} + +/^ib/ {print $0; next} +/ibpanic:/ {print $0} +/ibwarn:/ {print $0} +/iberror:/ {print $0} + +END { + printf "*** WARNING ***: this command is deprecated; Please use \"ibqueryerrors --counters\n" + printf "\n## Summary: %d nodes checked, %d bad nodes found\n", nnodes, ne + printf "## %d ports checked\n", nports + exit (ne ) +} +' + +exit $rv diff --git a/infiniband-diags/scripts/ibdatacounts.in b/infiniband-diags/scripts/ibdatacounts.in new file mode 100644 index 0000000..ac5f8c4 --- /dev/null +++ b/infiniband-diags/scripts/ibdatacounts.in @@ -0,0 +1,172 @@ +#!/bin/sh + +IBPATH=${IBPATH:-@IBSCRIPTPATH@} + +usage() { + echo Usage: `basename $0` "[-h] [-b] [-v] [-G] [-N | -nocolor]" \ + "[-C ca_name] [-P ca_port] [-t(imeout) timeout_ms] <lid|guid>" \ + "[<port>]" + exit -1 +} + +green() { + if [ "$bw" = "yes" ]; then + if [ "$verbose" = "yes" ]; then + echo $1 + fi + return + fi + if [ "$verbose" = "yes" ]; then + echo -e "\\033[1;032m" $1 "\\033[0;39m" + fi +} + +red() { + if [ "$bw" = "yes" ]; then + echo $1 + return + fi + echo -e "\\033[1;031m" $1 "\\033[0;39m" +} + +guid_addr="" +bw="" +verbose="" +brief="" +ca_info="" +suppress_deprecated="no" + +while [ "$1" ]; do + case $1 in + -G) + guid_addr=yes + ;; + -nocolor|-N) + bw=yes + ;; + -v) + verbose=yes + brief="" + ;; + -b) + brief=yes + verbose="" + ;; + -P | -C | -t | -timeout) + case $2 in + -*) + usage + ;; + esac + if [ x$2 = x ] ; then + usage + fi + ca_info="$ca_info $1 $2" + shift + ;; + -S) + suppress_deprecated="yes" + ;; + -*) + usage + ;; + *) + break + ;; + esac + shift +done + +#default is all ports +portnum=255 + +if [ $# -lt 1 ]; then + usage +fi + +if [ "$2" ]; then + portnum=$2 +fi + +if [ "$portnum" = "255" ]; then + portname="all" +else + portname=$2 +fi + +if [ "$guid_addr" ]; then + if ! lid=`$IBPATH/ibaddr $ca_info -G -L $1 | awk '/failed/{exit -1} {print $3}'`; then + echo -n "guid $1 address resolution: " + red "FAILED" + exit -1 + fi + guid=$1 +else + lid=$1 + if ! temp=`$IBPATH/ibaddr $ca_info -L $1 | awk '/failed/{exit -1} {print $1}'`; then + echo -n "lid $1 address resolution: " + red "FAILED" + exit -1 + fi +fi + +nodename=`$IBPATH/smpquery $ca_info nodedesc $lid | sed -e "s/^Node Description:\.*\(.*\)/\1/"` + +if [ "$suppress_deprecated" = "no" ]; then +echo "*** WARNING ***: this command is deprecated; Please use \"ibqueryerrors --counters\"\n\n" 1>&2 +fi + +text="`eval $IBPATH/perfquery $ca_info $lid $portnum`" +rv=$? 
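+
+# Everything except the Xmt/Rcv data and packet counters is filtered
+# out below: fields whose names match (Xmt|Rcv)(Pkts|Data) are
+# reprinted with dot padding, and all other counter lines are dropped.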
+if echo "$text" | awk -v mono=$bw -v brief=$brief -F '[.:]*' ' +function blue(s) +{ + if (brief == "yes") { + return + } + if (mono) + printf s + else if (!quiet) { + printf "\033[1;034m" s + printf "\033[0;39m" + } +} + +# Only display Xmit/Rcv Pkts/Data + +/^# Port counters/ {print} + +/^CounterSelect/ {next} + +/AllPortSelect/ {next} + +/^ib/ {print $0; next} +/ibpanic:/ {print $0} +/ibwarn:/ {print $0} +/iberror:/ {print $0} + +/^PortSelect/ { if ($2 != '$portnum') {err = err "error: lid '$lid' port " $2 " does not match query ('$portnum')\n"; exit -1}} + +$1 ~ "(Xmt|Rcv)(Pkts|Data)" { print $1 ":........................." $2 } + +END { + if (err != "") { + blue(err) + exit -1 + } + if (warn != "") { + blue(warn) + exit -1 + } + exit 0 +}' 2>&1 && test $rv -eq 0 ; then + if [ "$verbose" = "yes" ]; then + echo -n "Error on lid $lid ($nodename) port $portname: " + green OK + fi + exit 0 +else + echo -n "Error on lid $lid ($nodename) port $portname: " + red FAILED + exit -1 +fi diff --git a/infiniband-diags/scripts/ibdiscover.map b/infiniband-diags/scripts/ibdiscover.map new file mode 100644 index 0000000..58c69da --- /dev/null +++ b/infiniband-diags/scripts/ibdiscover.map @@ -0,0 +1,6 @@ +8f10400410015|8|"ISR 6000"|# SW-6IB4 Voltaire port 0 lid 5 +5442ba00003080|24|"ISR 9024"|# ISR9024 Voltaire port 0 lid 2 +8f10403960558|2|"HCA 1"|# MT23108 InfiniHost Mellanox Technologies +5442b100004900|2|"HCA 2"|# MT23108 InfiniHost Mellanox Technologies +8f10403961354|2|"HCA 3"|# MT23108 InfiniHost Mellanox Technologies +8f10403960984|2|"HCA 4"|# MT23108 InfiniHost Mellanox Technologies diff --git a/infiniband-diags/scripts/ibdiscover.pl b/infiniband-diags/scripts/ibdiscover.pl new file mode 100755 index 0000000..1462584 --- /dev/null +++ b/infiniband-diags/scripts/ibdiscover.pl @@ -0,0 +1,95 @@ +#!/usr/bin/perl + +printf (STDERR "*** WARNING ***; this command is deprecated;\n"); +printf (STDERR " see ibnetdiscover cache features\n"); +printf (STDERR " and/or iblinkinfo \"check\" features\n\n"); + +# +# Read mapfile +# +open(MAP, "< ibdiscover.map"); + +while (<MAP>) { + ($pre, $port, $desc) = split /\|/; + $val{$pre} = $desc; + # print "Ack1 - $pre - $port - $desc\n"; +} +close(MAP); + +# +# Read old topo map in +# +open(TOPO, "< ibdiscover.topo"); +$topomap = 0; + +while (<TOPO>) { + $topomap = 1; + ($localPort, $localGuid, $remotePort, $remoteGuid) = split /\|/; + chomp $remoteGuid; + $var = sprintf("%s|%2s|%2s|%s", $localGuid, $localPort, $remotePort, + $remoteGuid); + $topo{$var} = 1; + # ${$pre} = $desc; + # print "Ack1 - $pre - $port - $desc\n"; +} +close(TOPO); + +# +# Read stdin and output enhanced output +# +# Search and replace =0x???? with value +# Search and replace -000???? with value + +open(TOPO2, " >ibdiscover.topo.new"); +while (<STDIN>) { + ($a, $b, $local, $d) = /([sh])([\s\S]*)=0x([a-f\d]*)([\s\S]*)/; + if ($local ne "") { + printf( + "\n%s GUID: %s %s\n", + ($a eq "s" ? 
"Switch" : "Host"), + $local, $val{$local} + ); + chomp $local; + $localGuid = $local; + } else { + ($localPort, $type, $remoteGuid, $remotePort) = + /([\s\S]*)"([SH])\-000([a-f\d]*)"([\s\S]*)\n/; + ($localPort) = $localPort =~ /\[(\d*)]/; + ($remotePort) = $remotePort =~ /\[(\d*)]/; + if ($remoteGuid ne "" && $localPort ne "") { + printf(TOPO2 "%d|%s|%d|%s\n", + $localPort, $localGuid, $remotePort, $remoteGuid); + $var = sprintf("%s|%2s|%2s|%s", + $localGuid, $localPort, $remotePort, $remoteGuid); + $topo{$var} += 1; + printf( + "Local: %2s Remote: %2s %7s GUID: %s Location: %s\n", + $localPort, + $remotePort, + ($type eq "H" ? "Host" : "Switch"), + $remoteGuid, + ($val{$remoteGuid} ne "" ? $val{$remoteGuid} : $remoteGuid) + ); + } + } +} +close(STDIN); +close(TOPO2); + +printf("\nDelta change in topo (change between successive runs)\n\n"); + +foreach $el (keys %topo) { + if ($topo{$el} < 2 || $topomap == 0) { + ($lg, $lp, $rp, $rg) = split(/\|/, $el); + printf( +"Link change: Local/Remote Port %2d/%2d Local/Remote GUID: %s/%s\n", + $lp, $rp, $lg, $rg); + printf("\tLocations: Local/Remote\n\t\t%s\n\t\t%s\n\n", + $val{$lg}, $val{$rg}); + } +} + +printf (STDERR "*** WARNING ***; this command is deprecated;\n"); +printf (STDERR " see ibnetdiscover cache features\n"); +printf (STDERR " and/or iblinkinfo \"check\" features\n\n"); + diff --git a/infiniband-diags/scripts/ibfindnodesusing.pl b/infiniband-diags/scripts/ibfindnodesusing.pl new file mode 100755 index 0000000..a2102c7 --- /dev/null +++ b/infiniband-diags/scripts/ibfindnodesusing.pl @@ -0,0 +1,231 @@ +#!/usr/bin/perl +# +# Copyright (C) 2001-2003 The Regents of the University of California. +# Copyright (c) 2006 The Regents of the University of California. +# Copyright (c) 2007-2008 Voltaire, Inc. All rights reserved. +# +# Produced at Lawrence Livermore National Laboratory. +# Written by Ira Weiny <weiny2@llnl.gov> +# Jim Garlick <garlick@llnl.gov> +# Albert Chu <chu11@llnl.gov> +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# + +use strict; + +use Getopt::Std; +use IBswcountlimits; +my $ca_name = ""; +my $ca_port = ""; + +# ========================================================================= +# +sub get_hosts_routed +{ + my $sw_guid = $_[0]; + my $sw_port = $_[1]; + my @hosts = undef; + my $extra_params = get_ca_name_port_param_string($ca_name, $ca_port); + + if ($sw_guid eq "") { return (@hosts); } + + my $data = `ibroute $extra_params -G $sw_guid`; + my @lines = split("\n", $data); + foreach my $line (@lines) { + if ($line =~ /\w+\s+(\d+)\s+:\s+\(Channel Adapter.*:\s+'(.*)'\)/) { + if ($1 == $sw_port) { + push @hosts, $2; + } + } + } + + return (@hosts); +} + +# ========================================================================= +# +sub usage_and_exit +{ + my $prog = $_[0]; + print +"Usage: $prog [-R -C <ca_name> -P <ca_port>] <switch_guid|switch_name> <port>\n"; + print " find a list of nodes which are routed through switch:port\n"; + print " -R Recalculate ibnetdiscover information\n"; + print " -C <ca_name> use selected Channel Adaptor name for queries\n"; + print " -P <ca_port> use selected channel adaptor port for queries\n"; + exit 2; +} + +my $argv0 = `basename $0`; +my $regenerate_map = undef; +chomp $argv0; +if (!getopts("hRC:P:")) { usage_and_exit $argv0; } +if (defined $Getopt::Std::opt_h) { usage_and_exit $argv0; } +if (defined $Getopt::Std::opt_R) { $regenerate_map = $Getopt::Std::opt_R; } +if (defined $Getopt::Std::opt_C) { $ca_name = $Getopt::Std::opt_C; } +if (defined $Getopt::Std::opt_P) { $ca_port = $Getopt::Std::opt_P; } + +my $target_switch = format_guid($ARGV[0]); +my $target_port = $ARGV[1]; + +get_link_ends($regenerate_map, $ca_name, $ca_port); + +if ($target_switch eq "" || $target_port eq "") { + usage_and_exit $argv0; +} + +# sortn: +# +# sort a group of alphanumeric strings by the last group of digits on +# those strings, if such exists (good for numerically suffixed host lists) +# +sub sortn +{ + map { $$_[0] } + sort { ($$a[1] || 0) <=> ($$b[1] || 0) } map { [$_, /(\d*)$/] } @_; +} + +# comp2(): +# +# takes a list of names and returns a hash of arrays, indexed by name prefix, +# each containing a list of numerical ranges describing the initial list. +# +# e.g.: %hash = comp2(lx01,lx02,lx03,lx05,dev0,dev1,dev21) +# will return: +# $hash{"lx"} = ["01-03", "05"] +# $hash{"dev"} = ["0-1", "21"] +# +sub comp2 +{ + my (%i) = (); + my (%s) = (); + + # turn off warnings here to avoid perl complaints about + # uninitialized values for members of %i and %s + local ($^W) = 0; + push( + @{ + $s{$$_[0]}[ + ( + $s{$$_[0]}[$i{$$_[0]}][$#{$s{$$_[0]}[$i{$$_[0]}]}] == + ($$_[1] - 1) + ) ? $i{$$_[0]} : ++$i{$$_[0]} + ] + }, + ($$_[1]) + ) for map { [/(.*?)(\d*)$/] } sortn(@_); + + for my $key (keys %s) { + @{$s{$key}} = + map { $#$_ > 0 ? 
"$$_[0]-$$_[$#$_]" : @{$_} } @{$s{$key}}; + } + + return %s; +} + +sub compress_hostlist +{ + my %rng = comp2(@_); + my @list = (); + + local $" = ","; + + foreach my $k (keys %rng) { + @{$rng{$k}} = map { "$k$_" } @{$rng{$k}}; + } + @list = map { @{$rng{$_}} } sort keys %rng; + return "@list"; +} + +# ========================================================================= +# +sub main +{ + my $found_switch = undef; + my $cache_file = get_cache_file($ca_name, $ca_port); + open IBNET_TOPO, "<$cache_file" or die "Failed to open ibnet topology\n"; + my $in_switch = "no"; + my $switch_guid = ""; + my $desc = undef; + my %ports = undef; + while (my $line = <IBNET_TOPO>) { + + if ($line =~ /^Switch.*\"S-(.*)\"\s+# (.*) port.*/) { + $switch_guid = $1; + $desc = $2; + if ("0x$switch_guid" eq $target_switch + || $desc =~ /.*$target_switch\s+.*/) + { + $found_switch = "yes"; + goto FOUND; + } + } + if ($line =~ /^Ca.*/ || $line =~ /^Rt.*/) { $in_switch = "no"; } + + if ($line =~ /^\[(\d+)\].*/ && $in_switch eq "yes") { + $ports{$1} = $line; + } + + } + + FOUND: + close IBNET_TOPO; + if (!$found_switch) { + print "Switch \"$target_switch\" not found\n"; + print " Try running with the \"-R\" or \"-P\" option.\n"; + exit 1; + } + + $switch_guid = "0x$switch_guid"; + + my $hr = $IBswcountlimits::link_ends{$switch_guid}{$target_port}; + my $rem_sw_guid = $hr->{rem_guid}; + my $rem_sw_port = $hr->{rem_port}; + my $rem_sw_desc = $hr->{rem_desc}; + + my @hosts = undef; + @hosts = get_hosts_routed($switch_guid, $target_port); + + my $hosts = compress_hostlist(@hosts); + @hosts = split ",", $hosts; + print +"$switch_guid $target_port ($desc) ==>> $rem_sw_guid $rem_sw_port ($rem_sw_desc)\n"; + print "@hosts\n\n"; + + @hosts = get_hosts_routed($rem_sw_guid, $rem_sw_port); + + $hosts = compress_hostlist(@hosts); + @hosts = split ",", $hosts; + print +"$switch_guid $target_port ($desc) <<== $rem_sw_guid $rem_sw_port ($rem_sw_desc)\n"; + print "@hosts\n"; +} +main + diff --git a/infiniband-diags/scripts/ibhosts.in b/infiniband-diags/scripts/ibhosts.in new file mode 100644 index 0000000..c37260c --- /dev/null +++ b/infiniband-diags/scripts/ibhosts.in @@ -0,0 +1,73 @@ +#!/bin/sh + +IBPATH=${IBPATH:-@IBSCRIPTPATH@} + +usage() { + echo Usage: `basename $0` "[-h] [<topology-file> | -y mkey" \ + "-C ca_name -P ca_port -t timeout_ms]" + exit -1 +} + +topofile="" +ca_info="" +mkey="0" + +while [ "$1" ]; do + case $1 in + -h | --help) + usage + ;; + -y | --m_key) + case $2 in + -*) + usage + ;; + esac + if [ x$2 = x ] ; then + usage + fi + shift + mkey="$1" + ;; + -P | --Port | -C | --Ca | -t | --timeout) + case $2 in + -*) + usage + ;; + esac + if [ x$2 = x ] ; then + usage + fi + ca_info="$ca_info $1 $2" + shift + ;; + -*) + usage + ;; + *) + if [ "$topofile" ]; then + usage + fi + topofile="$1" + ;; + esac + shift +done + +if [ "$topofile" ]; then + netcmd="cat $topofile" +else + netcmd="$IBPATH/ibnetdiscover -y $mkey $ca_info" +fi + +text="`eval $netcmd`" +rv=$? +echo "$text" | awk ' +/^Ca/ {print $1 "\t: 0x" substr($3, 4, 16) " ports " $2 " "\ + substr($0, match($0, "#[ \t]*")+RLENGTH)} +/^ib/ {print $0; next} +/ibpanic:/ {print $0} +/ibwarn:/ {print $0} +/iberror:/ {print $0} +' +exit $rv diff --git a/infiniband-diags/scripts/ibidsverify.pl b/infiniband-diags/scripts/ibidsverify.pl new file mode 100755 index 0000000..cf290de --- /dev/null +++ b/infiniband-diags/scripts/ibidsverify.pl @@ -0,0 +1,272 @@ +#!/usr/bin/perl +# +# Copyright (c) 2007-2008 Voltaire, Inc. All rights reserved. 
+# Copyright (c) 2006 The Regents of the University of California. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# + +use strict; + +use Getopt::Std; +use IBswcountlimits; + +my $return_code = 0; + +sub usage_and_exit +{ + my $prog = $_[0]; + print "Usage: $prog [-Rh]\n"; + print +" Validate LIDs and GUIDs (check for zero and duplicates) in the local subnet\n"; + print " -h This help message\n"; + print +" -R Recalculate ibnetdiscover information (Default is to reuse ibnetdiscover output)\n"; + print " -C <ca_name> use selected Channel Adaptor name for queries\n"; + print " -P <ca_port> use selected channel adaptor port for queries\n"; + exit 2; +} + +my $argv0 = `basename $0`; +my $regenerate_map = undef; +my $ca_name = ""; +my $ca_port = ""; + +chomp $argv0; +if (!getopts("hRC:P:")) { usage_and_exit $argv0; } +if (defined $Getopt::Std::opt_h) { usage_and_exit $argv0; } +if (defined $Getopt::Std::opt_R) { $regenerate_map = $Getopt::Std::opt_R; } +if (defined $Getopt::Std::opt_C) { $ca_name = $Getopt::Std::opt_C; } +if (defined $Getopt::Std::opt_P) { $ca_port = $Getopt::Std::opt_P; } + +sub validate_non_zero_lid +{ + my ($lid) = shift(@_); + my ($nodeguid) = shift(@_); + my ($nodetype) = shift(@_); + + if ($lid eq 0) { + print "LID 0 found for $nodetype NodeGUID $nodeguid\n"; + return 1; + } + return 0; +} + +sub validate_non_zero_guid +{ + my ($lid) = shift(@_); + my ($guid) = shift(@_); + my ($nodetype) = shift(@_); + + if ($guid eq 0x0) { + print "$nodetype GUID 0x0 found with LID $lid\n"; + return 1; + } + return 0; +} + +$insert_lid::lids = undef; +$insert_nodeguid::nodeguids = undef; +$insert_portguid::portguids = undef; + +sub insert_lid +{ + my ($lid) = shift(@_); + my ($nodeguid) = shift(@_); + my ($nodetype) = shift(@_); + my $rec = undef; + my $status = ""; + + $status = validate_non_zero_lid($lid, $nodeguid, $nodetype); + if ($status eq 0) { + if (defined($insert_lid::lids{$lid})) { + print +"LID $lid already defined for NodeGUID $insert_lid::lids{$lid}->{nodeguid}\n"; + $return_code = 1; + } else { + $rec = {lid => $lid, nodeguid => $nodeguid}; + $insert_lid::lids{$lid} = $rec; + } + } else { + $return_code = $status; + } +} + +sub insert_nodeguid +{ + my ($lid) = shift(@_); + my ($nodeguid) = shift(@_); + my ($nodetype) = shift(@_); + my $rec 
= undef; + my $status = ""; + + $status = validate_non_zero_guid($lid, $nodeguid, $nodetype); + if ($status eq 0) { + if (defined($insert_nodeguid::nodeguids{$nodeguid})) { + print +"NodeGUID $nodeguid already defined for LID $insert_nodeguid::nodeguids{$nodeguid}->{lid}\n"; + $return_code = 1; + } else { + $rec = {lid => $lid, nodeguid => $nodeguid}; + $insert_nodeguid::nodeguids{$nodeguid} = $rec; + } + } else { + $return_code = $status; + } +} + +sub validate_portguid +{ + my ($portguid) = shift(@_); + my ($nodeguid) = shift(@_); + + if (($nodeguid ne $portguid) + && defined($insert_nodeguid::nodeguids{$portguid})) { + print "PortGUID $portguid is an invalid duplicate of a NodeGUID\n"; + $return_code = 1; + } +} + +sub insert_portguid +{ + my ($lid) = shift(@_); + my ($portguid) = shift(@_); + my ($nodetype) = shift(@_); + my ($nodeguid) = shift(@_); + my $rec = undef; + my $status = ""; + + $status = validate_non_zero_guid($lid, $portguid, $nodetype); + if ($status eq 0) { + if (defined($insert_portguid::portguids{$portguid})) { + print +"PortGUID $portguid already defined for LID $insert_portguid::portguids{$portguid}->{lid}\n"; + $return_code = 1; + } else { + $rec = {lid => $lid, portguid => $portguid}; + $insert_portguid::portguids{$portguid} = $rec; + validate_portguid($portguid, $nodeguid); + } + } else { + $return_code = $status; + } +} + +sub main +{ + my $cache_file = get_cache_file($ca_name, $ca_port); + + if ($regenerate_map || !(-f "$cache_file")) { + generate_ibnetdiscover_topology($ca_name, $ca_port); + } + open IBNET_TOPO, "<$cache_file" + or die "Failed to open ibnet topology: $!\n"; + + my $nodetype = ""; + my $nodeguid = ""; + my $portguid = ""; + my $lid = ""; + my $line = ""; + my $firstport = ""; + + while ($line = <IBNET_TOPO>) { + + if ($line =~ /^caguid=(.*)/ || $line =~ /^rtguid=(.*)/) { + $nodeguid = $1; + $nodetype = ""; + } + + if ($line =~ /^switchguid=(.*)/) { + $nodeguid = $1; + $portguid = ""; + $nodetype = ""; + } + if ($line =~ /^switchguid=(.*)\((.*)\)/) { + $nodeguid = $1; + $portguid = "0x" . $2; + } + + if ($line =~ /^Switch.*\"S-(.*)\"\s+# (.*) port.* lid (\d+) .*/) { + $nodetype = "switch"; + $firstport = "yes"; + $lid = $3; + insert_lid($lid, $nodeguid, $nodetype); + insert_nodeguid($lid, $nodeguid, $nodetype); + if ($portguid ne "") { + insert_portguid($lid, $portguid, $nodetype, $nodeguid); + } + } + if ($line =~ /^Ca.*/) { + $nodetype = "ca"; + $firstport = "yes"; + } + if ($line =~ /^Rt.*/) { + $nodetype = "router"; + $firstport = "yes"; + } + + if ($nodetype eq "ca" || $nodetype eq "router") { + if ($line =~ /"S-(.*)\# lid (\d+) .*/) { + $lid = $2; + insert_lid($lid, $nodeguid, $nodetype); + if ($firstport eq "yes") { + insert_nodeguid($lid, $nodeguid, $nodetype); + $firstport = "no"; + } + } + if ($line =~ /^.*"H-(.*)\# lid (\d+) .*/) { + $lid = $2; + insert_lid($lid, $nodeguid, $nodetype); + if ($firstport eq "yes") { + insert_nodeguid($lid, $nodeguid, $nodetype); + $firstport = "no"; + } + } + if ($line =~ /^.*"R-(.*)\# lid (\d+) .*/) { + $lid = $2; + insert_lid($lid, $nodeguid, $nodetype); + if ($firstport eq "yes") { + insert_nodeguid($lid, $nodeguid, $nodetype); + $firstport = "no"; + } + } + if ($line =~ /^\[(\d+)\]\((.*)\)/) { + $portguid = "0x" . 
$2; + insert_portguid($lid, $portguid, $nodetype, $nodeguid); + } + } + + } + + close IBNET_TOPO; +} +main; + +exit ($return_code); + diff --git a/infiniband-diags/scripts/iblinkinfo.pl.in b/infiniband-diags/scripts/iblinkinfo.pl.in new file mode 100755 index 0000000..865be49 --- /dev/null +++ b/infiniband-diags/scripts/iblinkinfo.pl.in @@ -0,0 +1,42 @@ +#!/usr/bin/perl +# +# Copyright (c) 2009 Lawrence Livermore National Security +# +# Produced at Lawrence Livermore National Laboratory. +# Written by Ira Weiny <weiny2@llnl.gov>. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# + + +# this is now just a wrapper for the C based utility +$str = join " ",@ARGV; +system "@IBSCRIPTPATH@/iblinkinfo $str"; +printf (STDERR "\n*** WARNING ***: this command has been replaced by iblinkinfo\n\n"); + diff --git a/infiniband-diags/scripts/ibnodes.in b/infiniband-diags/scripts/ibnodes.in new file mode 100644 index 0000000..5871da8 --- /dev/null +++ b/infiniband-diags/scripts/ibnodes.in @@ -0,0 +1,5 @@ +#!/bin/sh + +IBPATH=${IBPATH:-@IBSCRIPTPATH@} + +$IBPATH/ibhosts $@; $IBPATH/ibswitches $@ diff --git a/infiniband-diags/scripts/ibprintca.pl b/infiniband-diags/scripts/ibprintca.pl new file mode 100755 index 0000000..598229d --- /dev/null +++ b/infiniband-diags/scripts/ibprintca.pl @@ -0,0 +1,145 @@ +#!/usr/bin/perl +# +# Copyright (c) 2006 The Regents of the University of California. +# Copyright (c) 2007-2008 Voltaire, Inc. All rights reserved. +# +# Produced at Lawrence Livermore National Laboratory. +# Written by Ira Weiny <weiny2@llnl.gov>. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. 
+# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# + +use strict; + +use Getopt::Std; +use IBswcountlimits; + +printf (STDERR "*** WARNING ***: this command is deprecated; Please use \"ibhosts\"\n\n"); + +# ========================================================================= +# +sub usage_and_exit +{ + my $prog = $_[0]; + print "Usage: $prog [-R -l] [-G <ca_guid> | <node_name>]\n"; + print " print only the ca specified from the ibnetdiscover output\n"; + print " -R Recalculate ibnetdiscover information\n"; + print " -l list cas\n"; + print " -C <ca_name> use selected channel adaptor name for queries\n"; + print " -P <ca_port> use selected channel adaptor port for queries\n"; + print " -G node is specified with GUID\n"; + exit 2; +} + +my $argv0 = `basename $0`; +my $regenerate_map = undef; +my $list_hcas = undef; +my $ca_name = ""; +my $ca_port = ""; +my $name_is_guid = "no"; +chomp $argv0; +if (!getopts("hRlC:P:G")) { usage_and_exit $argv0; } +if (defined $Getopt::Std::opt_h) { usage_and_exit $argv0; } +if (defined $Getopt::Std::opt_R) { $regenerate_map = $Getopt::Std::opt_R; } +if (defined $Getopt::Std::opt_l) { $list_hcas = $Getopt::Std::opt_l; } +if (defined $Getopt::Std::opt_C) { $ca_name = $Getopt::Std::opt_C; } +if (defined $Getopt::Std::opt_P) { $ca_port = $Getopt::Std::opt_P; } +if (defined $Getopt::Std::opt_G) { $name_is_guid = "yes"; } + +my $target_hca = $ARGV[0]; + +if ($name_is_guid eq "yes") { + $target_hca = format_guid($target_hca); +} + +my $cache_file = get_cache_file($ca_name, $ca_port); + +if ($regenerate_map || !(-f "$cache_file")) { + generate_ibnetdiscover_topology($ca_name, $ca_port); +} + +if ($list_hcas) { + system("ibhosts $cache_file"); + exit 1; +} + +if ($target_hca eq "") { + usage_and_exit $argv0; +} + +# ========================================================================= +# +sub main +{ + my $found_hca = 0; + open IBNET_TOPO, "<$cache_file" or die "Failed to open ibnet topology\n"; + my $in_hca = "no"; + my %ports = undef; + while (my $line = <IBNET_TOPO>) { + if ($line =~ /^Ca.*\"H-(.*)\"\s+# (.*)/) { + my $guid = $1; + my $desc = $2; + if ($in_hca eq "yes") { + $in_hca = "no"; + foreach my $port (sort { $a <=> $b } (keys %ports)) { + print $ports{$port}; + } + } + if ("0x$guid" eq $target_hca || $desc =~ /[\s\"]$target_hca[\s\"]/) { + print $line; + $in_hca = "yes"; + $found_hca++; + } + } + if ($line =~ /^Switch.*/ || $line =~ /^Rt.*/) { $in_hca = "no"; } + + if ($line =~ /^\[(\d+)\].*/ && $in_hca eq "yes") { + $ports{$1} = $line; + } + + } + + if ($in_hca eq "yes") { + foreach my $port (sort { $a <=> $b } (keys %ports)) { + print $ports{$port}; + } + } + + if ($found_hca == 0) { + die "\"$target_hca\" not found\n" . + " Try running with the \"-R\" option.\n" . 
+ " If still not found the node is probably down.\n"; + } + if ($found_hca > 1) { + print "\nWARNING: Found $found_hca CA's with the name \"$target_hca\"\n"; + } + close IBNET_TOPO; +} +main + diff --git a/infiniband-diags/scripts/ibprintrt.pl b/infiniband-diags/scripts/ibprintrt.pl new file mode 100755 index 0000000..616be88 --- /dev/null +++ b/infiniband-diags/scripts/ibprintrt.pl @@ -0,0 +1,138 @@ +#!/usr/bin/perl +# +# Copyright (c) 2006 The Regents of the University of California. +# Copyright (c) 2007-2008 Voltaire, Inc. All rights reserved. +# +# Produced at Lawrence Livermore National Laboratory. +# Written by Ira Weiny <weiny2@llnl.gov>. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# + +use strict; + +use Getopt::Std; +use IBswcountlimits; + +printf (STDERR "*** WARNING ***: this command is deprecated; Please use \"ibrouters\"\n\n"); + +# ========================================================================= +# +sub usage_and_exit +{ + my $prog = $_[0]; + print "Usage: $prog [-R -l] [-G <rt_guid> | <node_name>]\n"; + print " print only the rt specified from the ibnetdiscover output\n"; + print " -R Recalculate ibnetdiscover information\n"; + print " -l list rts\n"; + print " -C <ca_name> use selected channel adaptor name for queries\n"; + print " -P <ca_port> use selected channel adaptor port for queries\n"; + print " -G node is specified with GUID\n"; + exit 2; +} + +my $argv0 = `basename $0`; +my $regenerate_map = undef; +my $list_rts = undef; +my $ca_name = ""; +my $ca_port = ""; +my $name_is_guid = "no"; +chomp $argv0; +if (!getopts("hRlC:P:G")) { usage_and_exit $argv0; } +if (defined $Getopt::Std::opt_h) { usage_and_exit $argv0; } +if (defined $Getopt::Std::opt_R) { $regenerate_map = $Getopt::Std::opt_R; } +if (defined $Getopt::Std::opt_l) { $list_rts = $Getopt::Std::opt_l; } +if (defined $Getopt::Std::opt_C) { $ca_name = $Getopt::Std::opt_C; } +if (defined $Getopt::Std::opt_P) { $ca_port = $Getopt::Std::opt_P; } +if (defined $Getopt::Std::opt_G) { $name_is_guid = "yes"; } + +my $target_rt = $ARGV[0]; + +if ($name_is_guid eq "yes") { + $target_rt = format_guid($target_rt); } + +my $cache_file = get_cache_file($ca_name, $ca_port); + +if ($regenerate_map || !(-f "$cache_file")) { + generate_ibnetdiscover_topology($ca_name, $ca_port); +} + +if ($list_rts) { + system("ibrouters $cache_file"); + exit 1; +} + +if ($target_rt eq "") { + usage_and_exit $argv0; +} + +# ========================================================================= +# +sub main +{ + my $found_rt = 0; + open IBNET_TOPO, "<$cache_file" or die "Failed to open ibnet topology\n"; + my $in_rt = "no"; + my %ports = undef; + while (my $line = <IBNET_TOPO>) { + if ($line =~ /^Rt.*\"R-(.*)\"\s+# (.*)/) { + my $guid = $1; + my $desc = $2; + if ($in_rt eq "yes") { + $in_rt = "no"; + foreach my $port (sort { $a <=> $b } (keys %ports)) { + print $ports{$port}; + } + } + if ("0x$guid" eq $target_rt || $desc =~ /[\s\"]$target_rt[\s\"]/) { + print $line; + $in_rt = "yes"; + $found_rt++; + } + } + if ($line =~ /^Switch.*/ || $line =~ /^Ca.*/) { $in_rt = "no"; } + + if ($line =~ /^\[(\d+)\].*/ && $in_rt eq "yes") { + $ports{$1} = $line; + } + + } + if ($found_rt == 0) { + die "\"$target_rt\" not found\n" . + " Try running with the \"-R\" option.\n" . + " If still not found the node is probably down.\n"; + } + if ($found_rt > 1) { + print "\nWARNING: Found $found_rt routers with the name \"$target_rt\"\n"; + } + close IBNET_TOPO; +} +main + diff --git a/infiniband-diags/scripts/ibprintswitch.pl b/infiniband-diags/scripts/ibprintswitch.pl new file mode 100755 index 0000000..69d506a --- /dev/null +++ b/infiniband-diags/scripts/ibprintswitch.pl @@ -0,0 +1,137 @@ +#!/usr/bin/perl +# +# Copyright (c) 2008 Voltaire, Inc. All rights reserved. +# Copyright (c) 2006 The Regents of the University of California. +# +# Produced at Lawrence Livermore National Laboratory. +# Written by Ira Weiny <weiny2@llnl.gov>. +# +# This software is available to you under a choice of one of two +# licenses. 
You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# + +use strict; + +use Getopt::Std; +use IBswcountlimits; + +printf (STDERR "*** WARNING ***: this command is deprecated; Please use \"ibswitches\"\n\n"); + +# ========================================================================= +# +sub usage_and_exit +{ + my $prog = $_[0]; + print "Usage: $prog [-R -l] [-G <switch_guid> | <switch_name>]\n"; + print " print only the switch specified from the ibnetdiscover output\n"; + print " -R Recalculate ibnetdiscover information\n"; + print " -l list switches\n"; + print " -C <ca_name> use selected channel adaptor name for queries\n"; + print " -P <ca_port> use selected channel adaptor port for queries\n"; + print " -G node is specified with GUID\n"; + exit 2; +} + +my $argv0 = `basename $0`; +my $regenerate_map = undef; +my $list_switches = undef; +my $ca_name = ""; +my $ca_port = ""; +my $name_is_guid = "no"; +chomp $argv0; +if (!getopts("hRlC:P:G")) { usage_and_exit $argv0; } +if (defined $Getopt::Std::opt_h) { usage_and_exit $argv0; } +if (defined $Getopt::Std::opt_R) { $regenerate_map = $Getopt::Std::opt_R; } +if (defined $Getopt::Std::opt_l) { $list_switches = $Getopt::Std::opt_l; } +if (defined $Getopt::Std::opt_C) { $ca_name = $Getopt::Std::opt_C; } +if (defined $Getopt::Std::opt_P) { $ca_port = $Getopt::Std::opt_P; } +if (defined $Getopt::Std::opt_G) { $name_is_guid = "yes"; } + +my $target_switch = $ARGV[0]; + +if ($name_is_guid eq "yes") { + $target_switch = format_guid($target_switch); +} + +my $cache_file = get_cache_file($ca_name, $ca_port); + +if ($regenerate_map || !(-f "$cache_file")) { + generate_ibnetdiscover_topology($ca_name, $ca_port); +} + +if ($list_switches) { + system("ibswitches $cache_file"); + exit 1; +} + +if ($target_switch eq "") { + usage_and_exit $argv0; +} + +# ========================================================================= +# +sub main +{ + my $found_switch = 0; + open IBNET_TOPO, "<$cache_file" or die "Failed to open ibnet topology\n"; + my $in_switch = "no"; + my %ports = undef; + while (my $line = <IBNET_TOPO>) { + if ($line =~ /^Switch.*\"S-(.*)\"\s+# (.*) port.*/) { + my $guid = $1; + my $desc = $2; + if ($in_switch eq "yes") { + $in_switch = "no"; + foreach my $port (sort { $a <=> $b } (keys %ports)) { + print $ports{$port}; + } + } + if ("0x$guid" eq $target_switch || $desc =~ 
/[\s\"]$target_switch[\s\"]/) { + print $line; + $in_switch = "yes"; + $found_switch++; + } + } + if ($line =~ /^Ca.*/) { $in_switch = "no"; } + + if ($line =~ /^\[(\d+)\].*/ && $in_switch eq "yes") { + $ports{$1} = $line; + } + + } + if ($found_switch == 0) { + die "Switch \"$target_switch\" not found\n" . + " Try running with the \"-R\" option.\n"; + } + if ($found_switch > 1) { + print "\nWARNING: Found $found_switch switches with the name \"$target_switch\"\n"; + } + close IBNET_TOPO; +} +main + diff --git a/infiniband-diags/scripts/ibqueryerrors.pl.in b/infiniband-diags/scripts/ibqueryerrors.pl.in new file mode 100644 index 0000000..3d48751 --- /dev/null +++ b/infiniband-diags/scripts/ibqueryerrors.pl.in @@ -0,0 +1,41 @@ +#!/usr/bin/perl +# +# Copyright (c) 2009 Lawrence Livermore National Security +# +# Produced at Lawrence Livermore National Laboratory. +# Written by Ira Weiny <weiny2@llnl.gov>. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# + +# this is now just a wrapper for the C based utility +$str = join " ",@ARGV; +system "@IBSCRIPTPATH@/ibqueryerrors $str"; +printf (STDERR "\n*** WARNING ***: this command has been replaced by ibqueryerrors\n\n"); + diff --git a/infiniband-diags/scripts/ibrouters.in b/infiniband-diags/scripts/ibrouters.in new file mode 100644 index 0000000..b3e5a1d --- /dev/null +++ b/infiniband-diags/scripts/ibrouters.in @@ -0,0 +1,73 @@ +#!/bin/sh + +IBPATH=${IBPATH:-@IBSCRIPTPATH@} + +usage() { + echo Usage: `basename $0` "[-h] [<topology-file> | -y mkey" \ + "-C ca_name -P ca_port -t timeout_ms]" + exit -1 +} + +topofile="" +ca_info="" +mkey="0" + +while [ "$1" ]; do + case $1 in + -h | --help) + usage + ;; + -y | --m_key) + case $2 in + -*) + usage + ;; + esac + if [ x$2 = x ] ; then + usage + fi + shift + mkey="$1" + ;; + -P | --Port | -C | --Ca | -t | --timeout) + case $2 in + -*) + usage + ;; + esac + if [ x$2 = x ] ; then + usage + fi + ca_info="$ca_info $1 $2" + shift + ;; + -*) + usage + ;; + *) + if [ "$topofile" ]; then + usage + fi + topofile="$1" + ;; + esac + shift +done + +if [ "$topofile" ]; then + netcmd="cat $topofile" +else + netcmd="$IBPATH/ibnetdiscover -y $mkey $ca_info" +fi + +text="`eval $netcmd`" +rv=$? 
+echo "$text" | awk ' +/^Rt/ {print $1 "\t: 0x" substr($3, 4, 16) " ports " $2 " "\ + substr($0, match($0, "#[ \t]*")+RLENGTH)} +/^ib/ {print $0; next} +/ibpanic:/ {print $0} +/ibwarn:/ {print $0} +/iberror:/ {print $0} +' +exit $rv diff --git a/infiniband-diags/scripts/ibstatus b/infiniband-diags/scripts/ibstatus new file mode 100755 index 0000000..f7fbbc2 --- /dev/null +++ b/infiniband-diags/scripts/ibstatus @@ -0,0 +1,78 @@ +#!/bin/sh + +# Usage ibstatus [devname[:port]] + +infiniband_base="/sys/class/infiniband" +def_ibdev="mthca0" + +usage() { + prog=`basename $0` + echo "Usage: " $prog " [-h] [devname[:portnum]]" + echo " -h: this help screen" + echo " Examples:" + echo " $prog mthca1 # shows status of all ports of 'mthca1'" + echo " $prog mthca0:2 # shows status port number 2 of 'mthca0'" + echo " $prog # default: shows status of all '$def_ibdev' ports" + exit -1 +} + +fatal() { + echo "Fatal error: " $* + exit -1 +} + + +port_status() { + port_dir="$infiniband_base/$1/ports/$2" + echo "Infiniband device '$1' port $2 status:" + echo " default gid: " `[ -r $port_dir/gids/0 ] && cat $port_dir/gids/0 || echo unknown` + echo " base lid: " `[ -r $port_dir/lid ] && cat $port_dir/lid || echo unknown` + echo " sm lid: " `[ -r $port_dir/sm_lid ] && cat $port_dir/sm_lid || echo unknown` + echo " state: " `[ -r $port_dir/state ] && cat $port_dir/state || echo unknown` + echo " phys state: " `[ -r $port_dir/phys_state ] && cat $port_dir/phys_state || echo unknown` + echo " rate: " `[ -r $port_dir/rate ] && cat $port_dir/rate || echo unknown` + echo " link_layer: " `[ -r $port_dir/link_layer ] && cat $port_dir/link_layer || echo IB` + echo +} + +ib_status() { + ports_dir="$infiniband_base/$1/ports" + + if ! [ -d "$ports_dir" ]; then + fatal "device '$1': sys files not found ($ports_dir)" + fi + + if [ "$2" = "+" ]; then + ports=`(cd "$infiniband_base/$1/ports" 2>/dev/null || fatal No devices; echo *)` + else + ports=$2 + fi + + for i in $ports; do + port_status $1 $i + done +} + +if [ "$1" = "-h" ]; then + usage +fi + +if [ -z "$1" ]; then + cd $infiniband_base 2>/dev/null || fatal No devices + for dev in *; do + ib_status $dev "+"; + done + exit 0 +fi + +while [ "$1" ]; do + dev=`echo $1 | sed 's/:.*$//'` + port=`echo $1 | sed 's/^.*://'` + + if [ "$port" = "$dev" ]; then + port="+" + fi + + ib_status $dev $port + shift +done diff --git a/infiniband-diags/scripts/ibswitches.in b/infiniband-diags/scripts/ibswitches.in new file mode 100644 index 0000000..743f1db --- /dev/null +++ b/infiniband-diags/scripts/ibswitches.in @@ -0,0 +1,92 @@ +#!/bin/sh + +IBPATH=${IBPATH:-@IBSCRIPTPATH@} + +usage() { + echo Usage: `basename $0` "[-h] [<topology-file> | -y mkey" \ + "-C ca_name -P ca_port -t timeout_ms]" + exit -1 +} + +topofile="" +ca_info="" +mkey="0" + +while [ "$1" ]; do + case $1 in + -h | --help) + usage + ;; + -y | --m_key) + case $2 in + -*) + usage + ;; + esac + if [ x$2 = x ] ; then + usage + fi + shift + mkey="$1" + ;; + -P | --Port | -C | --Ca | -t | --timeout) + case $2 in + -*) + usage + ;; + esac + if [ x$2 = x ] ; then + usage + fi + ca_info="$ca_info $1 $2" + shift + ;; + -*) + usage + ;; + *) + if [ "$topofile" ]; then + usage + fi + topofile="$1" + ;; + esac + shift +done + +if [ "$topofile" ]; then + netcmd="cat $topofile" +else + netcmd="$IBPATH/ibnetdiscover -y $mkey $ca_info" +fi + +text="`eval $netcmd`" +rv=$? 
+echo "$text" | awk ' +/^Switch/ { + l=$0 + desc=substr(l, match(l, "#[ \t]*")+RLENGTH) + pi=match(desc, "port 0.*") + pinfo=substr(desc, pi) + desc=substr(desc, 1, pi-2) + type="base port 0" + ti=match(desc, type) + if (ti==0) { + type="enhanced port 0" + ti=match(desc, type) + if (ti!=0) + desc=substr(desc, 1, ti-2) + } else + desc=substr(desc, 1, ti-2) + if (ti==0) + print $1 "\t: 0x" substr($3, 4, 16) " ports " $2 " "\ + desc " " pinfo + else + print $1 "\t: 0x" substr($3, 4, 16) " ports " $2 " "\ + desc " " type " " pinfo} +/^ib/ {print $0; next} +/ibpanic:/ {print $0} +/ibwarn:/ {print $0} +/iberror:/ {print $0} +' +exit $rv diff --git a/infiniband-diags/scripts/ibswportwatch.pl b/infiniband-diags/scripts/ibswportwatch.pl new file mode 100755 index 0000000..a2880aa --- /dev/null +++ b/infiniband-diags/scripts/ibswportwatch.pl @@ -0,0 +1,174 @@ +#!/usr/bin/perl +# +# Copyright (c) 2008 Voltaire, Inc. All rights reserved. +# Copyright (c) 2006 The Regents of the University of California. +# +# Produced at Lawrence Livermore National Laboratory. +# Written by Ira Weiny <weiny2@llnl.gov>. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# + +use strict; + +use Getopt::Std; +use IBswcountlimits; + +my $sw_addr = ""; +my $sw_port = ""; +my $verbose = undef; + +# ========================================================================= +# +sub print_verbose +{ + if ($verbose) { + print $_[0]; + } +} + +# ========================================================================= +# +sub print_all_counts +{ + if (!$verbose) { return; } + + print " Counter\t\t\tNew ==> Old\n"; + foreach my $cnt (@IBswcountlimits::counters) { + print +" $cnt\t\t\t$IBswcountlimits::new_counts{$cnt} ==> $IBswcountlimits::cur_counts{$cnt}\n"; + } +} + +# ========================================================================= +# +sub usage_and_exit +{ + my $prog = $_[0]; + print + "Usage: $prog [-p <pause_time> -b -v -n <cycles> -G] <guid|lid> <port>\n"; + print " Attempt to diagnose a problem on a port\n"; + print +" Run this on a link while a job is running which utilizes that link.\n"; + print +" -p <pause_time> define the amount of time between counter polls (default $IBswcountlimits::pause_time)\n"; + print " -v Be verbose\n"; + print " -n <cycles> run n cycles then exit (default -1 == forever)\n"; + print " -G Address provided is a GUID\n"; + print " -b report bytes/second and packets/second\n"; + exit 2; +} + +# ========================================================================= +# +sub clear_counters +{ + # clear the counters + foreach my $count (@IBswcountlimits::counters) { + $IBswcountlimits::cur_counts{$count} = 0; + $IBswcountlimits::new_counts{$count} = 0; + } +} + +# ========================================================================= +# +sub mv_counts +{ + foreach my $count (@IBswcountlimits::counters) { + $IBswcountlimits::cur_counts{$count} = + $IBswcountlimits::new_counts{$count}; + } +} + +# ========================================================================= +# use perfquery to get the counters. 
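+# get_new_counts() below shells out to perfquery, caches the output, and
+# then scans for one "<counter>:...<value>" line per counter name, e.g.
+# this hypothetical sample line (exact perfquery padding is assumed):
+#   PortXmitData:....................1234
+# The match only requires the counter name, a colon, dot padding, and a
+# decimal value, per the /^$count:\.+(\d+)/ regex in the read loop.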
+my $GUID = ""; + +sub get_new_counts +{ + my $addr = $_[0]; + my $port = $_[1]; + mv_counts; + ensure_cache_dir; + if ( + system( +"perfquery $GUID $addr $port > $IBswcountlimits::cache_dir/perfquery.out" + ) + ) + { + die "perfquery failed : \"perfquery $GUID $addr $port\"\n"; + } + open PERF_QUERY, "<$IBswcountlimits::cache_dir/perfquery.out" + or die "cannot read '$IBswcountlimits::cache_dir/perfquery.out': $!\n"; + while (my $line = <PERF_QUERY>) { + foreach my $count (@IBswcountlimits::counters) { + if ($line =~ /^$count:\.+(\d+)/) { + $IBswcountlimits::new_counts{$count} = $1; + } + } + } + close PERF_QUERY; +} + +my $cycle = -1; # forever + +my $bytes_per_second = undef; +my $argv0 = `basename $0`; +chomp $argv0; +if (!getopts("hbvp:n:G")) { usage_and_exit $argv0; } +if (defined $Getopt::Std::opt_h) { usage_and_exit $argv0; } +if (defined $Getopt::Std::opt_p) { + $IBswcountlimits::pause_time = $Getopt::Std::opt_p; +} +if (defined $Getopt::Std::opt_v) { $verbose = $Getopt::Std::opt_v; } +if (defined $Getopt::Std::opt_n) { $cycle = $Getopt::Std::opt_n; } +if (defined $Getopt::Std::opt_G) { $GUID = "-G"; } +if (defined $Getopt::Std::opt_b) { $bytes_per_second = $Getopt::Std::opt_b; } + +my $sw_addr = $ARGV[0]; +my $sw_port = $ARGV[1]; + +sub main +{ + clear_counters; + get_new_counts($sw_addr, $sw_port); + while ($cycle != 0) { + print "Checking counts...\n"; + sleep($IBswcountlimits::pause_time); + get_new_counts($sw_addr, $sw_port); + check_counter_rates; + if ($bytes_per_second) { + print_data_rates; + } + print_all_counts; + if ($cycle != -1) { $cycle = $cycle - 1; } + } +} +main; + diff --git a/infiniband-diags/scripts/set_nodedesc.sh b/infiniband-diags/scripts/set_nodedesc.sh new file mode 100755 index 0000000..1e42ac8 --- /dev/null +++ b/infiniband-diags/scripts/set_nodedesc.sh @@ -0,0 +1,60 @@ +#!/bin/sh + +if [ -f /etc/sysconfig/network ]; then +. /etc/sysconfig/network +fi + +ib_sysfs="/sys/class/infiniband" +newname="$HOSTNAME" + +echo "" +echo "*** WARNING ***: this command is deprecated." +echo "" + +function usage +{ + echo "Usage: `basename $0` [-hv] [<name>]" + echo " set the node_desc field of all hca's found in \"$ib_sysfs\"" + echo " -h this help" + echo " -v view all node descriptors" + echo " [<name>] set name to name specified." + echo " Default is to use the hostname: \"$HOSTNAME\"" + exit 2 +} + +function viewall +{ + for hca in `ls $ib_sysfs`; do + if [ -f $ib_sysfs/$hca/node_desc ]; then + echo -n "$hca: " + cat $ib_sysfs/$hca/node_desc + else + logger -s "Failed to set node_desc for : $hca" + fi + done + exit 0 +} + +while getopts "hv" flag +do + case $flag in + "h") usage;; + "v") viewall;; + esac +done + +shift $(($OPTIND - 1)) + +if [ "$1" != "" ]; then + newname="$1" +fi + +for hca in `ls $ib_sysfs`; do + if [ -f $ib_sysfs/$hca/node_desc ]; then + echo -n "$newname" >> $ib_sysfs/$hca/node_desc + else + logger -s "Failed to set node_desc for : $hca" + fi +done + +exit 0 diff --git a/infiniband-diags/sminfo.c b/infiniband-diags/sminfo.c new file mode 100644 index 0000000..7193a99 --- /dev/null +++ b/infiniband-diags/sminfo.c @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * Copyright (c) 2011 Mellanox Technologies LTD. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <inttypes.h> + +#include <infiniband/umad.h> +#include <infiniband/mad.h> + +#include "ibdiag_common.h" + +static uint8_t sminfo[1024] = { 0 }; + +static struct ibmad_port *srcport; + +enum { + SMINFO_NOTACT, + SMINFO_DISCOVER, + SMINFO_STANDBY, + SMINFO_MASTER, + + SMINFO_STATE_LAST, +}; + +static const char *const statestr[] = { + "SMINFO_NOTACT", + "SMINFO_DISCOVER", + "SMINFO_STANDBY", + "SMINFO_MASTER", +}; + +#define STATESTR(s) (((unsigned)(s)) < SMINFO_STATE_LAST ? 
statestr[s] : "???") + +static unsigned act; +static int prio, state = SMINFO_STANDBY; + +static int process_opt(void *context, int ch) +{ + switch (ch) { + case 'a': + act = strtoul(optarg, NULL, 0); + break; + case 's': + state = strtoul(optarg, NULL, 0); + break; + case 'p': + prio = strtoul(optarg, NULL, 0); + break; + default: + return -1; + } + return 0; +} + +int main(int argc, char **argv) +{ + int mgmt_classes[3] = + { IB_SMI_CLASS, IB_SMI_DIRECT_CLASS, IB_SA_CLASS }; + int mod = 0; + ib_portid_t portid = { 0 }; + uint8_t *p; + uint64_t guid = 0, key = 0; + + const struct ibdiag_opt opts[] = { + {"state", 's', 1, "<0-3>", "set SM state"}, + {"priority", 'p', 1, "<0-15>", "set SM priority"}, + {"activity", 'a', 1, NULL, "set activity count"}, + {} + }; + char usage_args[] = "<sm_lid|sm_dr_path> [modifier]"; + + ibdiag_process_opts(argc, argv, NULL, "sK", opts, process_opt, + usage_args, NULL); + + argc -= optind; + argv += optind; + + if (argc > 1) + mod = atoi(argv[1]); + + srcport = mad_rpc_open_port(ibd_ca, ibd_ca_port, mgmt_classes, 3); + if (!srcport) + IBEXIT("Failed to open '%s' port '%d'", ibd_ca, ibd_ca_port); + + smp_mkey_set(srcport, ibd_mkey); + + if (argc) { + if (resolve_portid_str(ibd_ca, ibd_ca_port, &portid, argv[0], + ibd_dest_type, NULL, srcport) < 0) + IBEXIT("can't resolve destination port %s", argv[0]); + } else { + if (resolve_sm_portid(ibd_ca, ibd_ca_port, &portid) < 0) + IBEXIT("can't resolve sm port"); + } + + mad_encode_field(sminfo, IB_SMINFO_GUID_F, &guid); + mad_encode_field(sminfo, IB_SMINFO_ACT_F, &act); + mad_encode_field(sminfo, IB_SMINFO_KEY_F, &key); + mad_encode_field(sminfo, IB_SMINFO_PRIO_F, &prio); + mad_encode_field(sminfo, IB_SMINFO_STATE_F, &state); + + if (mod) { + if (!(p = smp_set_via(sminfo, &portid, IB_ATTR_SMINFO, mod, + ibd_timeout, srcport))) + IBEXIT("set"); + } else if (!(p = smp_query_via(sminfo, &portid, IB_ATTR_SMINFO, 0, + ibd_timeout, srcport))) + IBEXIT("query"); + + mad_decode_field(sminfo, IB_SMINFO_GUID_F, &guid); + mad_decode_field(sminfo, IB_SMINFO_ACT_F, &act); + mad_decode_field(sminfo, IB_SMINFO_KEY_F, &key); + mad_decode_field(sminfo, IB_SMINFO_PRIO_F, &prio); + mad_decode_field(sminfo, IB_SMINFO_STATE_F, &state); + + printf("sminfo: sm lid %d sm guid 0x%" PRIx64 + ", activity count %u priority %d state %d %s\n", portid.lid, + guid, act, prio, state, STATESTR(state)); + + mad_rpc_close_port(srcport); + exit(0); +} diff --git a/infiniband-diags/smpdump.c b/infiniband-diags/smpdump.c new file mode 100644 index 0000000..719e64d --- /dev/null +++ b/infiniband-diags/smpdump.c @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#define _GNU_SOURCE + +#include <inttypes.h> +#include <string.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <netinet/in.h> +#include <endian.h> + +#include <infiniband/mad.h> +#include <infiniband/umad.h> + +#include <ibdiag_common.h> + +static int mad_agent; +static int drmad_tid = 0x123; + +typedef struct { + char path[64]; + int hop_cnt; +} DRPath; + +struct drsmp { + uint8_t base_version; + uint8_t mgmt_class; + uint8_t class_version; + uint8_t method; + __be16 status; + uint8_t hop_ptr; + uint8_t hop_cnt; + __be64 tid; + __be16 attr_id; + uint16_t resv; + __be32 attr_mod; + __be64 mkey; + __be16 dr_slid; + __be16 dr_dlid; + uint8_t reserved[28]; + uint8_t data[64]; + uint8_t initial_path[64]; + uint8_t return_path[64]; +}; + +static void drsmp_get_init(void *umad, DRPath * path, int attr, int mod) +{ + struct drsmp *smp = (struct drsmp *)(umad_get_mad(umad)); + + memset(smp, 0, sizeof(*smp)); + + smp->base_version = 1; + smp->mgmt_class = IB_SMI_DIRECT_CLASS; + smp->class_version = 1; + + smp->method = 1; + smp->attr_id = htons(attr); + smp->attr_mod = htonl(mod); + smp->tid = htobe64(drmad_tid); + drmad_tid++; + smp->dr_slid = htobe16(0xffff); + smp->dr_dlid = htobe16(0xffff); + + umad_set_addr(umad, 0xffff, 0, 0, 0); + + if (path) + memcpy(smp->initial_path, path->path, path->hop_cnt + 1); + + smp->hop_cnt = (uint8_t) path->hop_cnt; +} + +static void smp_get_init(void *umad, int lid, int attr, int mod) +{ + struct drsmp *smp = (struct drsmp *)(umad_get_mad(umad)); + + memset(smp, 0, sizeof(*smp)); + + smp->base_version = 1; + smp->mgmt_class = IB_SMI_CLASS; + smp->class_version = 1; + + smp->method = 1; + smp->attr_id = htons(attr); + smp->attr_mod = htonl(mod); + smp->tid = htobe64(drmad_tid); + drmad_tid++; + + umad_set_addr(umad, lid, 0, 0, 0); +} + +static int str2DRPath(char *str, DRPath * path) +{ + char *s; + + path->hop_cnt = -1; + + DEBUG("DR str: %s", str); + while (str && *str) { + if ((s = strchr(str, ','))) + *s = 0; + path->path[++path->hop_cnt] = (char)atoi(str); + if (!s) + break; + str = s + 1; + } + +#if 0 + if (path->path[0] != 0 || + (path->hop_cnt > 0 && dev_port && path->path[1] != dev_port)) { + DEBUG("hop 0 != 0 or hop 1 != dev_port"); + return -1; + } +#endif + + return path->hop_cnt; +} + +static int dump_char, mgmt_class = IB_SMI_CLASS; + +static int process_opt(void *context, int ch) +{ + switch (ch) { + case 's': + dump_char++; + break; + case 'D': + mgmt_class = IB_SMI_DIRECT_CLASS; + break; + case 'L': + mgmt_class = IB_SMI_CLASS; + break; + default: + return -1; + } + return 0; +} + +int main(int argc, char *argv[]) +{ + int dlid = 0; + void *umad; + struct drsmp *smp; + int i, portid, mod = 0, attr; + DRPath path; + uint8_t *desc; + int length; + + const struct ibdiag_opt opts[] = { + {"string", 's', 0, NULL, ""}, + {} + }; + char usage_args[] = "<dlid|dr_path> <attr> [mod]"; + const char *usage_examples[] = { + " -- DR routed examples:", + "-D 0,1,2,3,5 16 # NODE DESC", + "-D 0,1,2 0x15 2 # PORT INFO, port 2", + " -- LID 
routed examples:", + "3 0x15 2 # PORT INFO, lid 3 port 2", + "0xa0 0x11 # NODE INFO, lid 0xa0", + NULL + }; + + ibd_timeout = 1000; + + ibdiag_process_opts(argc, argv, NULL, "GKs", opts, process_opt, + usage_args, usage_examples); + + argc -= optind; + argv += optind; + + if (argc < 2) + ibdiag_show_usage(); + + if (mgmt_class == IB_SMI_DIRECT_CLASS && + str2DRPath(strdupa(argv[0]), &path) < 0) + IBPANIC("bad path str '%s'", argv[0]); + + if (mgmt_class == IB_SMI_CLASS) + dlid = strtoul(argv[0], NULL, 0); + + attr = strtoul(argv[1], NULL, 0); + if (argc > 2) + mod = strtoul(argv[2], NULL, 0); + + if (umad_init() < 0) + IBPANIC("can't init UMAD library"); + + if ((portid = umad_open_port(ibd_ca, ibd_ca_port)) < 0) + IBPANIC("can't open UMAD port (%s:%d)", ibd_ca, ibd_ca_port); + + if ((mad_agent = umad_register(portid, mgmt_class, 1, 0, NULL)) < 0) + IBPANIC("Couldn't register agent for SMPs"); + + if (!(umad = umad_alloc(1, umad_size() + IB_MAD_SIZE))) + IBPANIC("can't alloc MAD"); + + smp = umad_get_mad(umad); + + if (mgmt_class == IB_SMI_DIRECT_CLASS) + drsmp_get_init(umad, &path, attr, mod); + else + smp_get_init(umad, dlid, attr, mod); + + if (ibdebug > 1) + xdump(stderr, "before send:\n", smp, 256); + + length = IB_MAD_SIZE; + if (umad_send(portid, mad_agent, umad, length, ibd_timeout, 0) < 0) + IBPANIC("send failed"); + + if (umad_recv(portid, umad, &length, -1) != mad_agent) + IBPANIC("recv error: %s", strerror(errno)); + + if (ibdebug) + fprintf(stderr, "%d bytes received\n", length); + + if (!dump_char) { + xdump(stdout, NULL, smp->data, 64); + if (smp->status) + fprintf(stdout, "SMP status: 0x%x\n", + ntohs(smp->status)); + goto exit; + } + + desc = smp->data; + for (i = 0; i < 64; ++i) { + if (!desc[i]) + break; + putchar(desc[i]); + } + putchar('\n'); + if (smp->status) + fprintf(stdout, "SMP status: 0x%x\n", ntohs(smp->status)); + +exit: + umad_free(umad); + return 0; +} diff --git a/infiniband-diags/smpquery.c b/infiniband-diags/smpquery.c new file mode 100644 index 0000000..ade400d --- /dev/null +++ b/infiniband-diags/smpquery.c @@ -0,0 +1,506 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * Copyright (c) 2011 Mellanox Technologies LTD. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <netinet/in.h> + +#define __STDC_FORMAT_MACROS +#include <inttypes.h> + +#include <infiniband/umad.h> +#include <infiniband/mad.h> +#include <util/node_name_map.h> + +#include "ibdiag_common.h" + +static struct ibmad_port *srcport; + +static op_fn_t node_desc, node_info, port_info, switch_info, pkey_table, + sl2vl_table, vlarb_table, guid_info, mlnx_ext_port_info, port_info_extended; + +static const match_rec_t match_tbl[] = { + {"NodeInfo", "NI", node_info, 0, ""}, + {"NodeDesc", "ND", node_desc, 0, ""}, + {"PortInfo", "PI", port_info, 1, ""}, + {"PortInfoExtended", "PIE", port_info_extended, 1, ""}, + {"SwitchInfo", "SI", switch_info, 0, ""}, + {"PKeyTable", "PKeys", pkey_table, 1, ""}, + {"SL2VLTable", "SL2VL", sl2vl_table, 1, ""}, + {"VLArbitration", "VLArb", vlarb_table, 1, ""}, + {"GUIDInfo", "GI", guid_info, 0, ""}, + {"MlnxExtPortInfo", "MEPI", mlnx_ext_port_info, 1, ""}, + {} +}; + +static char *node_name_map_file = NULL; +static nn_map_t *node_name_map = NULL; +static int extended_speeds = 0; + +/*******************************************/ +static const char *node_desc(ib_portid_t *dest, char **argv, int argc) +{ + int node_type, l; + uint64_t node_guid; + char nd[IB_SMP_DATA_SIZE] = { 0 }; + uint8_t data[IB_SMP_DATA_SIZE] = { 0 }; + char dots[128]; + char *nodename = NULL; + + if (!smp_query_via(data, dest, IB_ATTR_NODE_INFO, 0, 0, srcport)) + return "node info query failed"; + + mad_decode_field(data, IB_NODE_TYPE_F, &node_type); + mad_decode_field(data, IB_NODE_GUID_F, &node_guid); + + if (!smp_query_via(nd, dest, IB_ATTR_NODE_DESC, 0, 0, srcport)) + return "node desc query failed"; + + nodename = remap_node_name(node_name_map, node_guid, nd); + + l = strlen(nodename); + if (l < 32) { + memset(dots, '.', 32 - l); + dots[32 - l] = '\0'; + } else { + dots[0] = '.'; + dots[1] = '\0'; + } + + printf("Node Description:%s%s\n", dots, nodename); + free(nodename); + return NULL; +} + +static const char *node_info(ib_portid_t * dest, char **argv, int argc) +{ + char buf[2048]; + char data[IB_SMP_DATA_SIZE] = { 0 }; + + if (!smp_query_via(data, dest, IB_ATTR_NODE_INFO, 0, 0, srcport)) + return "node info query failed"; + + mad_dump_nodeinfo(buf, sizeof buf, data, sizeof data); + + printf("# Node info: %s\n%s", portid2str(dest), buf); + return NULL; +} + +static const char *port_info_extended(ib_portid_t *dest, char **argv, int argc) +{ + char buf[2048]; + uint8_t data[IB_SMP_DATA_SIZE] = { 0 }; + int portnum = 0; + + if (argc > 0) + portnum = strtol(argv[0], NULL, 0); + + if (!is_port_info_extended_supported(dest, portnum, srcport)) + return "port info extended not supported"; + + if (!smp_query_via(data, dest, IB_ATTR_PORT_INFO_EXT, portnum, 0, + srcport)) + return "port info extended query failed"; + + mad_dump_portinfo_ext(buf, sizeof buf, data, sizeof data); + printf("# Port info Extended: %s port %d\n%s", portid2str(dest), + portnum, buf); + return NULL; +} + +static const char *port_info(ib_portid_t *dest, char **argv, int argc) +{ + char data[IB_SMP_DATA_SIZE] = { 0 }; + int portnum = 0, orig_portnum; + + if (argc > 0) + portnum = strtol(argv[0], NULL, 0); + orig_portnum = portnum; + if (extended_speeds) + 
portnum |= (1U) << 31; + + if (!smp_query_via(data, dest, IB_ATTR_PORT_INFO, portnum, 0, srcport)) + return "port info query failed"; + + printf("# Port info: %s port %d\n", portid2str(dest), orig_portnum); + dump_portinfo(data, 0); + return NULL; +} + +static const char *mlnx_ext_port_info(ib_portid_t *dest, char **argv, int argc) +{ + char buf[2300]; + char data[IB_SMP_DATA_SIZE]; + int portnum = 0; + + if (argc > 0) + portnum = strtol(argv[0], NULL, 0); + + if (!smp_query_via(data, dest, IB_ATTR_MLNX_EXT_PORT_INFO, portnum, 0, srcport)) + return "Mellanox ext port info query failed"; + + mad_dump_mlnx_ext_port_info(buf, sizeof buf, data, sizeof data); + + printf("# MLNX ext Port info: %s port %d\n%s", portid2str(dest), portnum, buf); + return NULL; +} + +static const char *switch_info(ib_portid_t *dest, char **argv, int argc) +{ + char buf[2048]; + char data[IB_SMP_DATA_SIZE] = { 0 }; + + if (!smp_query_via(data, dest, IB_ATTR_SWITCH_INFO, 0, 0, srcport)) + return "switch info query failed"; + + mad_dump_switchinfo(buf, sizeof buf, data, sizeof data); + + printf("# Switch info: %s\n%s", portid2str(dest), buf); + return NULL; +} + +static const char *pkey_table(ib_portid_t *dest, char **argv, int argc) +{ + uint8_t data[IB_SMP_DATA_SIZE] = { 0 }; + int i, j, k; + __be16 *p; + unsigned mod; + int n, t, phy_ports; + int portnum = 0; + + if (argc > 0) + portnum = strtol(argv[0], NULL, 0); + + /* Get the partition capacity */ + if (!smp_query_via(data, dest, IB_ATTR_NODE_INFO, 0, 0, srcport)) + return "node info query failed"; + + mad_decode_field(data, IB_NODE_TYPE_F, &t); + mad_decode_field(data, IB_NODE_NPORTS_F, &phy_ports); + if (portnum > phy_ports) + return "invalid port number"; + + if ((t == IB_NODE_SWITCH) && (portnum != 0)) { + if (!smp_query_via(data, dest, IB_ATTR_SWITCH_INFO, 0, 0, + srcport)) + return "switch info failed"; + mad_decode_field(data, IB_SW_PARTITION_ENFORCE_CAP_F, &n); + } else + mad_decode_field(data, IB_NODE_PARTITION_CAP_F, &n); + + for (i = 0; i < (n + 31) / 32; i++) { + mod = i | (portnum << 16); + if (!smp_query_via(data, dest, IB_ATTR_PKEY_TBL, mod, 0, + srcport)) + return "pkey table query failed"; + if (i + 1 == (n + 31) / 32) + k = ((n + 7 - i * 32) / 8) * 8; + else + k = 32; + p = (__be16 *) data; + for (j = 0; j < k; j += 8, p += 8) { + printf + ("%4u: 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x 0x%04x\n", + (i * 32) + j, ntohs(p[0]), ntohs(p[1]), + ntohs(p[2]), ntohs(p[3]), ntohs(p[4]), ntohs(p[5]), + ntohs(p[6]), ntohs(p[7])); + } + } + printf("%d pkeys capacity for this port\n", n); + + return NULL; +} + +static const char *sl2vl_dump_table_entry(ib_portid_t *dest, int in, int out) +{ + char buf[2048]; + char data[IB_SMP_DATA_SIZE] = { 0 }; + int portnum = (in << 8) | out; + + if (!smp_query_via(data, dest, IB_ATTR_SLVL_TABLE, portnum, 0, srcport)) + return "slvl query failed"; + + mad_dump_sltovl(buf, sizeof buf, data, sizeof data); + printf("ports: in %2d, out %2d: ", in, out); + printf("%s", buf); + return NULL; +} + +static const char *sl2vl_table(ib_portid_t *dest, char **argv, int argc) +{ + uint8_t data[IB_SMP_DATA_SIZE] = { 0 }; + int type, num_ports, portnum = 0; + int i; + const char *ret; + + if (argc > 0) + portnum = strtol(argv[0], NULL, 0); + + if (!smp_query_via(data, dest, IB_ATTR_NODE_INFO, 0, 0, srcport)) + return "node info query failed"; + + mad_decode_field(data, IB_NODE_TYPE_F, &type); + mad_decode_field(data, IB_NODE_NPORTS_F, &num_ports); + if (portnum > num_ports) + return "invalid port number"; + + printf("# SL2VL 
table: %s\n", portid2str(dest)); + printf("# SL: |"); + for (i = 0; i < 16; i++) + printf("%2d|", i); + printf("\n"); + + if (type != IB_NODE_SWITCH) + return sl2vl_dump_table_entry(dest, 0, 0); + + for (i = 0; i <= num_ports; i++) { + ret = sl2vl_dump_table_entry(dest, i, portnum); + if (ret) + return ret; + } + return NULL; +} + +static const char *vlarb_dump_table_entry(ib_portid_t *dest, int portnum, + int offset, unsigned cap) +{ + char buf[2048]; + char data[IB_SMP_DATA_SIZE] = { 0 }; + + if (!smp_query_via(data, dest, IB_ATTR_VL_ARBITRATION, + (offset << 16) | portnum, 0, srcport)) + return "vl arb query failed"; + mad_dump_vlarbitration(buf, sizeof(buf), data, cap * 2); + printf("%s", buf); + return NULL; +} + +static const char *vlarb_dump_table(ib_portid_t *dest, int portnum, + const char *name, int offset, int cap) +{ + const char *ret; + + printf("# %s priority VL Arbitration Table:", name); + ret = vlarb_dump_table_entry(dest, portnum, offset, + cap < 32 ? cap : 32); + if (!ret && cap > 32) + ret = vlarb_dump_table_entry(dest, portnum, offset + 1, + cap - 32); + return ret; +} + +static const char *vlarb_table(ib_portid_t *dest, char **argv, int argc) +{ + uint8_t data[IB_SMP_DATA_SIZE] = { 0 }; + int portnum = 0; + int type, enhsp0, lowcap, highcap; + const char *ret = NULL; + + if (argc > 0) + portnum = strtol(argv[0], NULL, 0); + + /* port number of 0 could mean SP0 or port MAD arrives on */ + if (portnum == 0) { + if (!smp_query_via(data, dest, IB_ATTR_NODE_INFO, 0, 0, + srcport)) + return "node info query failed"; + + mad_decode_field(data, IB_NODE_TYPE_F, &type); + if (type == IB_NODE_SWITCH) { + memset(data, 0, sizeof(data)); + if (!smp_query_via(data, dest, IB_ATTR_SWITCH_INFO, 0, + 0, srcport)) + return "switch info query failed"; + mad_decode_field(data, IB_SW_ENHANCED_PORT0_F, &enhsp0); + if (!enhsp0) { + printf + ("# No VLArbitration tables (BSP0): %s port %d\n", + portid2str(dest), 0); + return NULL; + } + memset(data, 0, sizeof(data)); + } + } + + if (!smp_query_via(data, dest, IB_ATTR_PORT_INFO, portnum, 0, srcport)) + return "port info query failed"; + + mad_decode_field(data, IB_PORT_VL_ARBITRATION_LOW_CAP_F, &lowcap); + mad_decode_field(data, IB_PORT_VL_ARBITRATION_HIGH_CAP_F, &highcap); + + printf("# VLArbitration tables: %s port %d LowCap %d HighCap %d\n", + portid2str(dest), portnum, lowcap, highcap); + + if (lowcap > 0) + ret = vlarb_dump_table(dest, portnum, "Low", 1, lowcap); + + if (!ret && highcap > 0) + ret = vlarb_dump_table(dest, portnum, "High", 3, highcap); + + return ret; +} + +static const char *guid_info(ib_portid_t *dest, char **argv, int argc) +{ + uint8_t data[IB_SMP_DATA_SIZE] = { 0 }; + int i, j, k; + __be64 *p; + unsigned mod; + int n; + + /* Get the guid capacity */ + if (!smp_query_via(data, dest, IB_ATTR_PORT_INFO, 0, 0, srcport)) + return "port info failed"; + mad_decode_field(data, IB_PORT_GUID_CAP_F, &n); + + for (i = 0; i < (n + 7) / 8; i++) { + mod = i; + if (!smp_query_via(data, dest, IB_ATTR_GUID_INFO, mod, 0, + srcport)) + return "guid info query failed"; + if (i + 1 == (n + 7) / 8) + k = ((n + 1 - i * 8) / 2) * 2; + else + k = 8; + p = (__be64 *) data; + for (j = 0; j < k; j += 2, p += 2) { + printf("%4u: 0x%016" PRIx64 " 0x%016" PRIx64 "\n", + (i * 8) + j, be64toh(p[0]), be64toh(p[1])); + } + } + printf("%d guids capacity for this port\n", n); + + return NULL; +} + +static int process_opt(void *context, int ch) +{ + switch (ch) { + case 1: + node_name_map_file = strdup(optarg); + if (node_name_map_file == NULL) + IBEXIT("out 
of memory, strdup for node_name_map_file name failed"); + break; + case 'c': + ibd_dest_type = IB_DEST_DRSLID; + break; + case 'x': + extended_speeds = 1; + break; + default: + return -1; + } + return 0; +} + +int main(int argc, char **argv) +{ + char usage_args[1024]; + int mgmt_classes[3] = + { IB_SMI_CLASS, IB_SMI_DIRECT_CLASS, IB_SA_CLASS }; + ib_portid_t portid = { 0 }; + const char *err; + op_fn_t *fn; + const match_rec_t *r; + int n; + + const struct ibdiag_opt opts[] = { + {"combined", 'c', 0, NULL, + "use Combined route address argument"}, + {"node-name-map", 1, 1, "<file>", "node name map file"}, + {"extended", 'x', 0, NULL, "use extended speeds"}, + {} + }; + const char *usage_examples[] = { + "portinfo 3 1\t\t\t\t# portinfo by lid, with port modifier", + "-G switchinfo 0x2C9000100D051 1\t# switchinfo by guid", + "-D nodeinfo 0\t\t\t\t# nodeinfo by direct route", + "-c nodeinfo 6 0,12\t\t\t# nodeinfo by combined route", + NULL + }; + + n = sprintf(usage_args, "<op> <dest dr_path|lid|guid> [op params]\n" + "\nSupported ops (and aliases, case insensitive):\n"); + for (r = match_tbl; r->name; r++) { + n += snprintf(usage_args + n, sizeof(usage_args) - n, + " %s (%s) <addr>%s\n", r->name, + r->alias ? r->alias : "", + r->opt_portnum ? " [<portnum>]" : ""); + if (n >= sizeof(usage_args)) + exit(-1); + } + + ibdiag_process_opts(argc, argv, NULL, NULL, opts, process_opt, + usage_args, usage_examples); + + argc -= optind; + argv += optind; + + if (argc < 2) + ibdiag_show_usage(); + + if (!(fn = match_op(match_tbl, argv[0]))) + IBEXIT("operation '%s' not supported", argv[0]); + + srcport = mad_rpc_open_port(ibd_ca, ibd_ca_port, mgmt_classes, 3); + if (!srcport) + IBEXIT("Failed to open '%s' port '%d'", ibd_ca, ibd_ca_port); + + smp_mkey_set(srcport, ibd_mkey); + + node_name_map = open_node_name_map(node_name_map_file); + + if (ibd_dest_type != IB_DEST_DRSLID) { + if (resolve_portid_str(ibd_ca, ibd_ca_port, &portid, argv[1], + ibd_dest_type, ibd_sm_id, srcport) < 0) + IBEXIT("can't resolve destination port %s", argv[1]); + if ((err = fn(&portid, argv + 2, argc - 2))) + IBEXIT("operation %s: %s", argv[0], err); + } else { + char concat[64]; + + memset(concat, 0, 64); + snprintf(concat, sizeof(concat), "%s %s", argv[1], argv[2]); + if (resolve_portid_str(ibd_ca, ibd_ca_port, &portid, concat, + ibd_dest_type, ibd_sm_id, srcport) < 0) + IBEXIT("can't resolve destination port %s", concat); + if ((err = fn(&portid, argv + 3, argc - 3))) + IBEXIT("operation %s: %s", argv[0], err); + } + close_node_name_map(node_name_map); + mad_rpc_close_port(srcport); + exit(0); +} diff --git a/infiniband-diags/vendstat.c b/infiniband-diags/vendstat.c new file mode 100644 index 0000000..2f7d7bd --- /dev/null +++ b/infiniband-diags/vendstat.c @@ -0,0 +1,526 @@ +/* + * Copyright (c) 2012 Mellanox Technologies LTD. All rights reserved. + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <netinet/in.h> + +#include <infiniband/umad.h> +#include <infiniband/mad.h> + +#include "ibdiag_common.h" + +#define IS3_DEVICE_ID 47396 + +#define IB_MLX_VENDOR_CLASS 10 +/* Vendor specific Attribute IDs */ +#define IB_MLX_IS3_GENERAL_INFO 0x17 +#define IB_MLX_IS3_CONFIG_SPACE_ACCESS 0x50 +#define IB_MLX_IS4_COUNTER_GROUP_INFO 0x90 +#define IB_MLX_IS4_CONFIG_COUNTER_GROUP 0x91 +/* Config space addresses */ +#define IB_MLX_IS3_PORT_XMIT_WAIT 0x10013C + + +static struct ibmad_port *srcport; + +typedef struct { + __be16 hw_revision; + __be16 device_id; + uint8_t reserved[24]; + __be32 uptime; +} is3_hw_info_t; + +typedef struct { + uint8_t resv1; + uint8_t major; + uint8_t minor; + uint8_t sub_minor; + __be32 build_id; + uint8_t month; + uint8_t day; + __be16 year; + __be16 resv2; + __be16 hour; + uint8_t psid[16]; + __be32 ini_file_version; +} is3_fw_info_t; + +typedef struct { + __be32 ext_major; + __be32 ext_minor; + __be32 ext_sub_minor; + __be32 reserved[4]; +} is4_fw_ext_info_t; + +typedef struct { + uint8_t resv1; + uint8_t major; + uint8_t minor; + uint8_t sub_minor; + uint8_t resv2[28]; +} is3_sw_info_t; + +typedef struct { + uint8_t reserved[8]; + is3_hw_info_t hw_info; + is3_fw_info_t fw_info; + is3_sw_info_t sw_info; +} is3_general_info_t; + +typedef struct { + uint8_t reserved[8]; + is3_hw_info_t hw_info; + is3_fw_info_t fw_info; + is4_fw_ext_info_t ext_fw_info; + is3_sw_info_t sw_info; +} is4_general_info_t; + +typedef struct { + uint8_t reserved[8]; + struct is3_record { + __be32 address; + __be32 data; + __be32 mask; + } record[18]; +} is3_config_space_t; + +#define COUNTER_GROUPS_NUM 2 + +typedef struct { + uint8_t reserved1[8]; + uint8_t reserved[3]; + uint8_t num_of_counter_groups; + __be32 group_masks[COUNTER_GROUPS_NUM]; +} is4_counter_group_info_t; + +typedef struct { + uint8_t reserved[3]; + uint8_t group_select; +} is4_group_select_t; + +typedef struct { + uint8_t reserved1[8]; + uint8_t reserved[4]; + is4_group_select_t group_selects[COUNTER_GROUPS_NUM]; +} is4_config_counter_groups_t; + +static uint16_t ext_fw_info_device[][2] = { + {0x0245, 0x0245}, /* Switch-X */ + {0xc738, 0xc73b}, /* Switch-X */ + {0xcb20, 0xcb20}, /* Switch-IB */ + {0xcf08, 0xcf08}, /* Switch-IB2 */ + {0xd2f0, 0xd2f0}, /* Quantum */ + {0x01b3, 0x01b3}, /* IS-4 */ + {0x1003, 0x101b}, /* Connect-X */ + {0xa2d2, 0xa2d2}, /* BlueField */ + {0x1b02, 0x1b02}, /* Bull SwitchX */ + {0x1b50, 0x1b50}, /* Bull SwitchX */ + {0x1ba0, 0x1ba0}, /* Bull SwitchIB */ + {0x1bd0, 0x1bd5}, /* Bull SwitchIB and SwitchIB2 */ + {0x1bf0, 0x1bf0}, /* Bull Sequana Quantum */ + {0x1b33, 0x1b33}, /* Bull ConnectX3 */ + {0x1b73, 0x1b73}, /* Bull ConnectX3 */ + {0x1b40, 0x1b41}, /* Bull ConnectX3 */ + {0x1b60, 0x1b61}, /* Bull 
ConnectX3 */
+	{0x1b83, 0x1b83},	/* Bull ConnectIB */
+	{0x1b93, 0x1b94},	/* Bull ConnectIB */
+	{0x1bb4, 0x1bb5},	/* Bull ConnectX4 */
+	{0x1bc4, 0x1bc6},	/* Bull ConnectX4, Sequana HDR and HDR100 */
+	{0x0000, 0x0000}};
+
+static int is_ext_fw_info_supported(uint16_t device_id) {
+	int i;
+	for (i = 0; ext_fw_info_device[i][0]; i++)
+		if (ext_fw_info_device[i][0] <= device_id &&
+		    device_id <= ext_fw_info_device[i][1])
+			return 1;
+	return 0;
+}
+
+static int do_vendor(ib_portid_t *portid, uint8_t class, uint8_t method,
+		     uint16_t attr_id, uint32_t attr_mod, void *data)
+{
+	ib_vendor_call_t call;
+
+	memset(&call, 0, sizeof(call));
+	call.mgmt_class = class;
+	call.method = method;
+	call.timeout = ibd_timeout;
+	call.attrid = attr_id;
+	call.mod = attr_mod;
+
+	if (!ib_vendor_call_via(data, portid, &call, srcport)) {
+		fprintf(stderr, "vendstat: method %u, attribute %u failure\n", method, attr_id);
+		return -1;
+	}
+	return 0;
+}
+
+static int do_config_space_records(ib_portid_t *portid, unsigned set,
+				   is3_config_space_t *cs, unsigned records)
+{
+	unsigned i;
+
+	if (records > 18)
+		records = 18;
+
+	if (do_vendor(portid, IB_MLX_VENDOR_CLASS,
+		      set ? IB_MAD_METHOD_SET : IB_MAD_METHOD_GET,
+		      IB_MLX_IS3_CONFIG_SPACE_ACCESS, 2 << 22 | records << 16,
+		      cs)) {
+		fprintf(stderr, "cannot %s config space records\n", set ? "set" : "get");
+		return -1;
+	}
+	for (i = 0; i < records; i++) {
+		printf("Config space record at 0x%x: 0x%x\n",
+		       ntohl(cs->record[i].address),
+		       ntohl(cs->record[i].data & cs->record[i].mask));
+	}
+	return 0;
+}
+
+static int counter_groups_info(ib_portid_t * portid, int port)
+{
+	char buf[1024];
+	is4_counter_group_info_t *cg_info;
+	int i, num_cg;
+
+	/* Counter Group Info */
+	memset(&buf, 0, sizeof(buf));
+	if (do_vendor(portid, IB_MLX_VENDOR_CLASS, IB_MAD_METHOD_GET,
+		      IB_MLX_IS4_COUNTER_GROUP_INFO, port, buf)) {
+		fprintf(stderr, "counter group info query failure\n");
+		return -1;
+	}
+	cg_info = (is4_counter_group_info_t *) & buf;
+	num_cg = cg_info->num_of_counter_groups;
+	/* don't read past the end of group_masks[] if the device reports more groups */
+	if (num_cg > COUNTER_GROUPS_NUM)
+		num_cg = COUNTER_GROUPS_NUM;
+	printf("counter_group_info:\n");
+	printf("%d counter groups\n", num_cg);
+	for (i = 0; i < num_cg; i++)
+		printf("group%d mask %#x\n", i, ntohl(cg_info->group_masks[i]));
+	return 0;
+}
+
+/* Group0 counter config values */
+#define IS4_G0_PortXmtDataSL_0_7	0
+#define IS4_G0_PortXmtDataSL_8_15	1
+#define IS4_G0_PortRcvDataSL_0_7	2
+
+/* Group1 counter config values */
+#define IS4_G1_PortXmtDataSL_8_15	1
+#define IS4_G1_PortRcvDataSL_0_7	2
+#define IS4_G1_PortRcvDataSL_8_15	8
+
+static int cg0, cg1;
+
+static int config_counter_groups(ib_portid_t * portid, int port)
+{
+	char buf[1024];
+	is4_config_counter_groups_t *cg_config;
+
+	/* configure counter groups for groups 0 and 1 */
+	memset(&buf, 0, sizeof(buf));
+	cg_config = (is4_config_counter_groups_t *) & buf;
+
+	printf("counter_groups_config: configuring group0 %d group1 %d\n", cg0,
+	       cg1);
+	cg_config->group_selects[0].group_select = (uint8_t) cg0;
+	cg_config->group_selects[1].group_select = (uint8_t) cg1;
+
+	if (do_vendor(portid, IB_MLX_VENDOR_CLASS, IB_MAD_METHOD_SET,
+		      IB_MLX_IS4_CONFIG_COUNTER_GROUP, port, buf)) {
+		fprintf(stderr, "config counter group set failure\n");
+		return -1;
+	}
+	/* get config counter groups */
+	memset(&buf, 0, sizeof(buf));
+
+	if (do_vendor(portid, IB_MLX_VENDOR_CLASS, IB_MAD_METHOD_GET,
+		      IB_MLX_IS4_CONFIG_COUNTER_GROUP, port, buf)) {
+		fprintf(stderr, "config counter group query failure\n");
+		return -1;
+	}
+	return 0;
+}
+
+static int general_info, xmit_wait, counter_group_info, 
config_counter_group; +static is3_config_space_t write_cs, read_cs; +static unsigned write_cs_records, read_cs_records; + + +static int process_opt(void *context, int ch) +{ + int ret; + unsigned int address, data, mask; + + switch (ch) { + case 'N': + general_info = 1; + break; + case 'w': + xmit_wait = 1; + break; + case 'i': + counter_group_info = 1; + break; + case 'c': + config_counter_group = 1; + ret = sscanf(optarg, "%d,%d", &cg0, &cg1); + if (ret != 2) + return -1; + break; + case 'R': + if (read_cs_records >= 18) + break; + ret = sscanf(optarg, "%x,%x", &address, &mask); + if (ret < 1) + return -1; + else if (ret == 1) + mask = 0xffffffff; + read_cs.record[read_cs_records].address = htobe32(address); + read_cs.record[read_cs_records].mask = htobe32(mask); + read_cs_records++; + break; + case 'W': + if (write_cs_records >= 18) + break; + ret = sscanf(optarg, "%x,%x,%x", &address, &data, &mask); + if (ret < 2) + return -1; + else if (ret == 2) + mask = 0xffffffff; + write_cs.record[write_cs_records].address = htobe32(address); + write_cs.record[write_cs_records].data = htobe32(data); + write_cs.record[write_cs_records].mask = htobe32(mask); + write_cs_records++; + break; + default: + return -1; + } + return 0; +} + +int main(int argc, char **argv) +{ + int mgmt_classes[2] = { IB_SA_CLASS, IB_MLX_VENDOR_CLASS }; + ib_portid_t portid = { 0 }; + int port = 0; + char buf[1024]; + uint32_t fw_ver_major = 0; + uint32_t fw_ver_minor = 0; + uint32_t fw_ver_sub_minor = 0; + uint8_t sw_ver_major = 0, sw_ver_minor = 0, sw_ver_sub_minor = 0; + is3_general_info_t *gi_is3; + is4_general_info_t *gi_is4; + const struct ibdiag_opt opts[] = { + {"N", 'N', 0, NULL, "show IS3 or IS4 general information"}, + {"w", 'w', 0, NULL, "show IS3 port xmit wait counters"}, + {"i", 'i', 0, NULL, "show IS4 counter group info"}, + {"c", 'c', 1, "<num,num>", "configure IS4 counter groups"}, + {"Read", 'R', 1, "<addr,mask>", "Read configuration space record at addr"}, + {"Write", 'W', 1, "<addr,val,mask>", "Write configuration space record at addr"}, + {} + }; + + char usage_args[] = "<lid|guid> [port]"; + const char *usage_examples[] = { + "-N 6\t\t# read IS3 or IS4 general information", + "-w 6\t\t# read IS3 port xmit wait counters", + "-i 6 12\t# read IS4 port 12 counter group info", + "-c 0,1 6 12\t# configure IS4 port 12 counter groups for PortXmitDataSL", + "-c 2,8 6 12\t# configure IS4 port 12 counter groups for PortRcvDataSL", + NULL + }; + + ibdiag_process_opts(argc, argv, NULL, "DKy", opts, process_opt, + usage_args, usage_examples); + + argc -= optind; + argv += optind; + + if (argc > 1) + port = strtoul(argv[1], NULL, 0); + + srcport = mad_rpc_open_port(ibd_ca, ibd_ca_port, mgmt_classes, 2); + if (!srcport) + IBEXIT("Failed to open '%s' port '%d'", ibd_ca, ibd_ca_port); + + if (argc) { + if (resolve_portid_str(ibd_ca, ibd_ca_port, &portid, argv[0], + ibd_dest_type, ibd_sm_id, srcport) < 0) { + mad_rpc_close_port(srcport); + IBEXIT("can't resolve destination port %s", argv[0]); + } + } else { + if (resolve_self(ibd_ca, ibd_ca_port, &portid, &port, NULL) < 0) { + mad_rpc_close_port(srcport); + IBEXIT("can't resolve self port %s", argv[0]); + } + } + + if (counter_group_info) { + counter_groups_info(&portid, port); + mad_rpc_close_port(srcport); + exit(0); + } + + if (config_counter_group) { + config_counter_groups(&portid, port); + mad_rpc_close_port(srcport); + exit(0); + } + + if (read_cs_records || write_cs_records) { + if (read_cs_records) + do_config_space_records(&portid, 0, &read_cs, + 
read_cs_records);
+		if (write_cs_records)
+			do_config_space_records(&portid, 1, &write_cs,
+						write_cs_records);
+		mad_rpc_close_port(srcport);
+		exit(0);
+	}
+
+	/* These are Mellanox specific vendor MADs, but vendors can change */
+	/* the VendorId, so there is no way to know for sure. */
+	/* Only General Info and Port Xmit Wait Counters */
+	/* queries are currently supported */
+	if (!general_info && !xmit_wait) {
+		mad_rpc_close_port(srcport);
+		IBEXIT("at least one of -N and -w must be specified");
+	}
+	/* Checking would require a list of such devices, which might not */
+	/* be complete, so for right now, punt on this */
+
+	/* vendor ClassPortInfo is required attribute if class supported */
+	memset(&buf, 0, sizeof(buf));
+	if (do_vendor(&portid, IB_MLX_VENDOR_CLASS, IB_MAD_METHOD_GET,
+		      CLASS_PORT_INFO, 0, buf)) {
+		mad_rpc_close_port(srcport);
+		IBEXIT("classportinfo query");
+	}
+	memset(&buf, 0, sizeof(buf));
+	gi_is3 = (is3_general_info_t *) &buf;
+	if (do_vendor(&portid, IB_MLX_VENDOR_CLASS, IB_MAD_METHOD_GET,
+		      IB_MLX_IS3_GENERAL_INFO, 0, gi_is3)) {
+		mad_rpc_close_port(srcport);
+		IBEXIT("generalinfo query");
+	}
+
+	if (is_ext_fw_info_supported(ntohs(gi_is3->hw_info.device_id))) {
+		gi_is4 = (is4_general_info_t *) &buf;
+		fw_ver_major = ntohl(gi_is4->ext_fw_info.ext_major);
+		fw_ver_minor = ntohl(gi_is4->ext_fw_info.ext_minor);
+		fw_ver_sub_minor = ntohl(gi_is4->ext_fw_info.ext_sub_minor);
+		sw_ver_major = gi_is4->sw_info.major;
+		sw_ver_minor = gi_is4->sw_info.minor;
+		sw_ver_sub_minor = gi_is4->sw_info.sub_minor;
+	} else {
+		fw_ver_major = gi_is3->fw_info.major;
+		fw_ver_minor = gi_is3->fw_info.minor;
+		fw_ver_sub_minor = gi_is3->fw_info.sub_minor;
+		sw_ver_major = gi_is3->sw_info.major;
+		sw_ver_minor = gi_is3->sw_info.minor;
+		sw_ver_sub_minor = gi_is3->sw_info.sub_minor;
+	}
+
+	if (general_info) {
+		/* dump IS3 or IS4 general info here */
+		printf("hw_dev_rev: 0x%04x\n", ntohs(gi_is3->hw_info.hw_revision));
+		printf("hw_dev_id: 0x%04x\n", ntohs(gi_is3->hw_info.device_id));
+		printf("hw_uptime: 0x%08x\n", ntohl(gi_is3->hw_info.uptime));
+		printf("fw_version: %02d.%02d.%02d\n",
+		       fw_ver_major, fw_ver_minor, fw_ver_sub_minor);
+		printf("fw_build_id: 0x%04x\n", ntohl(gi_is3->fw_info.build_id));
+		printf("fw_date: %02x/%02x/%04x\n",
+		       gi_is3->fw_info.month, gi_is3->fw_info.day,
+		       ntohs(gi_is3->fw_info.year));
+		printf("fw_psid: '%s'\n", gi_is3->fw_info.psid);
+		printf("fw_ini_ver: %d\n",
+		       ntohl(gi_is3->fw_info.ini_file_version));
+		printf("sw_version: %02d.%02d.%02d\n", sw_ver_major,
+		       sw_ver_minor, sw_ver_sub_minor);
+	}
+
+	if (xmit_wait) {
+		is3_config_space_t *cs;
+		unsigned i;
+
+		if (ntohs(gi_is3->hw_info.device_id) != IS3_DEVICE_ID) {
+			mad_rpc_close_port(srcport);
+			IBEXIT("Unsupported device ID 0x%x",
+			       ntohs(gi_is3->hw_info.device_id));
+		}
+		memset(&buf, 0, sizeof(buf));
+		/* Set record addresses for each port */
+		cs = (is3_config_space_t *) & buf;
+		for (i = 0; i < 16; i++)
+			cs->record[i].address =
+			    htonl(IB_MLX_IS3_PORT_XMIT_WAIT + ((i + 1) << 12));
+		if (do_vendor(&portid, IB_MLX_VENDOR_CLASS,
+			      IB_MAD_METHOD_GET, IB_MLX_IS3_CONFIG_SPACE_ACCESS,
+			      2 << 22 | 16 << 16, cs)) {
+			mad_rpc_close_port(srcport);
+			IBEXIT("vendstat");
+		}
+		for (i = 0; i < 16; i++)
+			if (cs->record[i].data)	/* PortXmitWait is 32 bit counter */
+				printf("Port %d: PortXmitWait 0x%x\n", i + 4,
+				       ntohl(cs->record[i].data));	/* port 4 is first port */
+
+		/* Last 8 ports is another query */
+		memset(&buf, 0, sizeof(buf));
+		/* Set record addresses for each port */
+		cs = (is3_config_space_t *) & buf;
+		for (i = 0; i < 8; 
i++) + cs->record[i].address = + htonl(IB_MLX_IS3_PORT_XMIT_WAIT + ((i + 17) << 12)); + if (do_vendor(&portid, IB_MLX_VENDOR_CLASS, + IB_MAD_METHOD_GET, IB_MLX_IS3_CONFIG_SPACE_ACCESS, + 2 << 22 | 8 << 16, cs)) { + mad_rpc_close_port(srcport); + IBEXIT("vendstat"); + } + + for (i = 0; i < 8; i++) + if (cs->record[i].data) /* PortXmitWait is 32 bit counter */ + printf("Port %d: PortXmitWait 0x%x\n", + i < 4 ? i + 21 : i - 3, + ntohl(cs->record[i].data)); + } + + mad_rpc_close_port(srcport); + exit(0); +} diff --git a/iwpmd/CMakeLists.txt b/iwpmd/CMakeLists.txt new file mode 100644 index 0000000..bf53803 --- /dev/null +++ b/iwpmd/CMakeLists.txt @@ -0,0 +1,32 @@ +rdma_sbin_executable(iwpmd + iwarp_pm_common.c + iwarp_pm_helper.c + iwarp_pm_server.c + ) +target_link_libraries(iwpmd LINK_PRIVATE + ${SYSTEMD_LIBRARIES} + ${NL_LIBRARIES} + ${CMAKE_THREAD_LIBS_INIT} + ) + +rdma_man_pages( + iwpmd.8.in + iwpmd.conf.5.in + ) + +rdma_subst_install(FILES "iwpmd.service.in" + RENAME "iwpmd.service" + DESTINATION "${CMAKE_INSTALL_SYSTEMD_SERVICEDIR}") +rdma_subst_install(FILES "iwpmd_init.in" + DESTINATION "${CMAKE_INSTALL_INITDDIR}" + RENAME "iwpmd" + PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ OWNER_EXECUTE GROUP_EXECUTE WORLD_EXECUTE) +install(FILES "iwpmd.conf" DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}") + +install(FILES "iwpmd.rules" + RENAME "90-iwpmd.rules" + DESTINATION "${CMAKE_INSTALL_UDEV_RULESDIR}") + +install(FILES modules-iwpmd.conf + RENAME "iwpmd.conf" + DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}/rdma/modules") diff --git a/iwpmd/iwarp_pm.h b/iwpmd/iwarp_pm.h new file mode 100644 index 0000000..83cbb8e --- /dev/null +++ b/iwpmd/iwarp_pm.h @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef IWARP_PM_H +#define IWARP_PM_H + +#include <endian.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/socket.h> +#include <sys/select.h> +#include <sys/stat.h> +#include <arpa/inet.h> +#include <errno.h> +#include <linux/netlink.h> +#include <netlink/attr.h> +#include <signal.h> +#include <ifaddrs.h> +#include <pthread.h> +#include <syslog.h> +#include <netlink/msg.h> +#include <ccan/list.h> +#include <rdma/rdma_netlink.h> +#include <stdatomic.h> + +#define IWARP_PM_PORT 3935 +#define IWARP_PM_VER_SHIFT 6 +#define IWARP_PM_VER_MASK 0xc0 +#define IWARP_PM_MT_SHIFT 4 +#define IWARP_PM_MT_MASK 0x30 +#define IWARP_PM_IPVER_SHIFT 0 +#define IWARP_PM_IPVER_MASK 0x0F +#define IWARP_PM_MESSAGE_SIZE 48 /* bytes */ +#define IWARP_PM_ASSOC_OFFSET 0x10 /* different assochandles for passive/active side map requests */ +#define IWARP_PM_IPV4_ADDR 4 + +#define IWARP_PM_MT_REQ 0 +#define IWARP_PM_MT_ACC 1 +#define IWARP_PM_MT_ACK 2 +#define IWARP_PM_MT_REJ 3 + +#define IWARP_PM_REQ_QUERY 1 +#define IWARP_PM_REQ_ACCEPT 2 +#define IWARP_PM_REQ_ACK 4 + +#define IWARP_PM_RECV_PAYLOAD 4096 +#define IWARP_PM_MAX_CLIENTS 64 +#define IWPM_MAP_REQ_TIMEOUT 10 /* sec */ +#define IWPM_SEND_MSG_RETRIES 3 + +#define IWPM_ULIB_NAME "iWarpPortMapperUser" +#define IWPM_ULIBNAME_SIZE 32 +#define IWPM_DEVNAME_SIZE 32 +#define IWPM_IFNAME_SIZE 16 +#define IWPM_IPADDR_SIZE 16 + +#define IWPM_PARAM_NUM 1 +#define IWPM_PARAM_NAME_LEN 64 + +#define IWARP_PM_NETLINK_DBG 0x01 +#define IWARP_PM_WIRE_DBG 0x02 +#define IWARP_PM_RETRY_DBG 0x04 +#define IWARP_PM_ALL_DBG 0x07 +#define IWARP_PM_DEBUG 0x08 + +#define iwpm_debug(dbg_level, str, args...) \ + do { if (dbg_level & IWARP_PM_DEBUG) { \ + syslog(LOG_WARNING, str, ##args); } \ + } while (0) + +/* Port Mapper errors */ +enum { + IWPM_INVALID_NLMSG_ERR = 10, + IWPM_CREATE_MAPPING_ERR, + IWPM_DUPLICATE_MAPPING_ERR, + IWPM_UNKNOWN_MAPPING_ERR, + IWPM_CLIENT_DEV_INFO_ERR, + IWPM_USER_LIB_INFO_ERR, + IWPM_REMOTE_QUERY_REJECT, + IWPM_VERSION_MISMATCH_ERR, +}; + +/* iwpm param indexes */ +enum { + NL_SOCK_RBUF_SIZE +}; + +typedef struct iwpm_client { + char ifname[IWPM_IFNAME_SIZE]; /* netdev interface name */ + char ibdevname[IWPM_DEVNAME_SIZE]; /* OFED device name */ + char ulibname[IWPM_ULIBNAME_SIZE]; /* library name of the userpace PM agent provider */ + __u32 nl_seq; + char valid; +} iwpm_client; + +typedef union sockaddr_union { + struct sockaddr_storage s_sockaddr; + struct sockaddr sock_addr; + struct sockaddr_in v4_sockaddr; + struct sockaddr_in6 v6_sockaddr; + struct sockaddr_nl nl_sockaddr; +} sockaddr_union; + +typedef struct iwpm_mapped_port { + struct list_node entry; + int owner_client; + int sd; + struct sockaddr_storage local_addr; + struct sockaddr_storage mapped_addr; + int wcard; + _Atomic(int) ref_cnt; /* the number of owners */ +} iwpm_mapped_port; + +typedef struct iwpm_wire_msg { + __u8 magic; + __u8 pmtime; + __be16 reserved; + __be16 apport; + __be16 cpport; + __be64 assochandle; + /* big endian IP addresses and ports */ + __u8 cpipaddr[IWPM_IPADDR_SIZE]; + __u8 apipaddr[IWPM_IPADDR_SIZE]; + __u8 mapped_cpipaddr[IWPM_IPADDR_SIZE]; +} iwpm_wire_msg; + +typedef struct iwpm_send_msg { + int pm_sock; + struct sockaddr_storage dest_addr; + iwpm_wire_msg data; + int length; +} iwpm_send_msg; + +typedef struct iwpm_mapping_request { + struct list_node entry; + struct sockaddr_storage src_addr; + struct sockaddr_storage remote_addr; + __u16 nlmsg_type; /* Message content */ + __u32 nlmsg_seq; /* 
Sequence number */ + __u32 nlmsg_pid; + __u64 assochandle; + iwpm_send_msg * send_msg; + int timeout; + int complete; + int msg_type; +} iwpm_mapping_request; + +typedef struct iwpm_pending_msg { + struct list_node entry; + iwpm_send_msg send_msg; +} iwpm_pending_msg; + +typedef struct iwpm_msg_parms { + __u32 ip_ver; + __u16 address_family; + char apipaddr[IWPM_IPADDR_SIZE]; + __be16 apport; + char cpipaddr[IWPM_IPADDR_SIZE]; + __be16 cpport; + char mapped_cpipaddr[IWPM_IPADDR_SIZE]; + __be16 mapped_cpport; + unsigned char ver; + unsigned char mt; + unsigned char pmtime; + __u64 assochandle; + int msize; +} iwpm_msg_parms; + +/* iwarp_pm_common.c */ + +void parse_iwpm_config(FILE *); + +int create_iwpm_socket_v4(__u16); + +int create_iwpm_socket_v6(__u16); + +int create_netlink_socket(void); + +void destroy_iwpm_socket(int); + +int parse_iwpm_nlmsg(struct nlmsghdr *, int, struct nla_policy *, struct nlattr * [], const char *); + +int parse_iwpm_msg(iwpm_wire_msg *, iwpm_msg_parms *); + +void form_iwpm_request(iwpm_wire_msg *, iwpm_msg_parms *); + +void form_iwpm_accept(iwpm_wire_msg *, iwpm_msg_parms *); + +void form_iwpm_ack(iwpm_wire_msg *, iwpm_msg_parms *); + +void form_iwpm_reject(iwpm_wire_msg *, iwpm_msg_parms *); + +int send_iwpm_nlmsg(int, struct nl_msg *, int); + +struct nl_msg *create_iwpm_nlmsg(__u16, int); + +void print_iwpm_sockaddr(struct sockaddr_storage *, const char *, __u32); + +__be16 get_sockaddr_port(struct sockaddr_storage *sockaddr); + +void copy_iwpm_sockaddr(__u16, struct sockaddr_storage *, struct sockaddr_storage *, + char *, char *, __be16 *); + +int is_wcard_ipaddr(struct sockaddr_storage *); + +/* iwarp_pm_helper.c */ + +iwpm_mapped_port *create_iwpm_mapped_port(struct sockaddr_storage *, int, __u32 flags); + +iwpm_mapped_port *reopen_iwpm_mapped_port(struct sockaddr_storage *, struct sockaddr_storage *, int, + __u32 flags); + +void add_iwpm_mapped_port(iwpm_mapped_port *); + +iwpm_mapped_port *find_iwpm_mapping(struct sockaddr_storage *, int); + +iwpm_mapped_port *find_iwpm_same_mapping(struct sockaddr_storage *, int); + +void remove_iwpm_mapped_port(iwpm_mapped_port *); + +void print_iwpm_mapped_ports(void); + +void free_iwpm_port(iwpm_mapped_port *); + +iwpm_mapping_request *create_iwpm_map_request(struct nlmsghdr *, struct sockaddr_storage *, + struct sockaddr_storage *, __u64, int, iwpm_send_msg *); + +void add_iwpm_map_request(iwpm_mapping_request *); + +int update_iwpm_map_request(__u64, struct sockaddr_storage *, int, iwpm_mapping_request *, int); + +void remove_iwpm_map_request(iwpm_mapping_request *); + +void form_iwpm_send_msg(int, struct sockaddr_storage *, int, iwpm_send_msg *); + +int send_iwpm_msg(void (*form_msg_type)(iwpm_wire_msg *, iwpm_msg_parms *), + iwpm_msg_parms *, struct sockaddr_storage *, int); + +int add_iwpm_pending_msg(iwpm_send_msg *); + +int check_same_sockaddr(struct sockaddr_storage *, struct sockaddr_storage *); + +void free_iwpm_mapped_ports(void); + +extern struct list_head pending_messages; +extern struct list_head mapping_reqs; + +extern iwpm_client client_list[IWARP_PM_MAX_CLIENTS]; + +extern pthread_cond_t cond_req_complete; +extern pthread_mutex_t map_req_mutex; +extern int wake; +extern pthread_cond_t cond_pending_msg; +extern pthread_mutex_t pending_msg_mutex; + +#endif diff --git a/iwpmd/iwarp_pm_common.c b/iwpmd/iwarp_pm_common.c new file mode 100644 index 0000000..8160180 --- /dev/null +++ b/iwpmd/iwarp_pm_common.c @@ -0,0 +1,639 @@ +/* + * Copyright (c) 2013-2015 Intel Corporation. All rights reserved. 
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include "iwarp_pm.h"
+#include <endian.h>
+
+/* iwpm config params */
+static const char * iwpm_param_names[IWPM_PARAM_NUM] =
+	{ "nl_sock_rbuf_size" };
+static int iwpm_param_vals[IWPM_PARAM_NUM] =
+	{ 0 };
+
+/**
+ * get_iwpm_param - Match a config parameter name and store its value
+ * @param_name: parameter name read from the config file
+ * @val: parameter value (must be positive)
+ *
+ * Return 0 on success, -EINVAL if the name is unknown or the value invalid
+ */
+static int get_iwpm_param(char *param_name, int val)
+{
+	int i;
+
+	for (i = 0; i < IWPM_PARAM_NUM; i++) {
+		if (!strcmp(param_name, iwpm_param_names[i]) && val > 0) {
+			syslog(LOG_WARNING, "get_iwpm_param: Got param (name = %s val = %d)\n", param_name, val);
+			iwpm_param_vals[i] = val;
+			return 0;
+		}
+	}
+	return -EINVAL;
+}
+
+/**
+ * parse_iwpm_config - Parse "name = value" lines of the config file
+ */
+void parse_iwpm_config(FILE *fp)
+{
+	char line_buf[128];
+	char param_name[IWPM_PARAM_NAME_LEN];
+	int n, val, ret;
+	char *str;
+
+	str = fgets(line_buf, 128, fp);
+	while (str) {
+		if (line_buf[0] == '#' || line_buf[0] == '\n')
+			goto parse_next_line;
+		/* width 63 leaves room for the terminating NUL in param_name */
+		n = sscanf(line_buf, "%63[^= ] %*[=]%d", param_name, &val);
+		if (n != 2) {
+			syslog(LOG_WARNING, "parse_iwpm_config: Couldn't parse a line (n = %d, name = %s, val = %d)\n", n, param_name, val);
+			goto parse_next_line;
+		}
+		ret = get_iwpm_param(param_name, val);
+		if (ret)
+			syslog(LOG_WARNING, "parse_iwpm_config: Couldn't set param (ret = %d)\n", ret);
+parse_next_line:
+		str = fgets(line_buf, 128, fp);
+	}
+}
+
+/**
+ * create_iwpm_socket_v4 - Create an ipv4 socket for the iwarp port mapper
+ * @bind_port: UDP port to bind the socket
+ *
+ * Return a handle of ipv4 socket
+ */
+int create_iwpm_socket_v4(__u16 bind_port)
+{
+	sockaddr_union bind_addr;
+	struct sockaddr_in *bind_in4;
+	int pm_sock;
+	socklen_t sockname_len;
+	char ip_address_text[INET6_ADDRSTRLEN];
+
+	/* create a socket */
+	pm_sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
+	if (pm_sock < 0) {
+		syslog(LOG_WARNING, "create_iwpm_socket_v4: Unable to create socket. 
%s.\n", + strerror(errno)); + pm_sock = -errno; + goto create_socket_v4_exit; + } + /* bind the socket to the given port */ + memset(&bind_addr, 0, sizeof(bind_addr)); + bind_in4 = &bind_addr.v4_sockaddr; + bind_in4->sin_family = AF_INET; + bind_in4->sin_addr.s_addr = htobe32(INADDR_ANY); + bind_in4->sin_port = htobe16(bind_port); + + if (bind(pm_sock, &bind_addr.sock_addr, sizeof(struct sockaddr_in))) { + syslog(LOG_WARNING, "create_iwpm_socket_v4: Unable to bind socket (port = %u). %s.\n", + bind_port, strerror(errno)); + close(pm_sock); + pm_sock = -errno; + goto create_socket_v4_exit; + } + + /* get the socket name (local port number) */ + sockname_len = sizeof(struct sockaddr_in); + if (getsockname(pm_sock, &bind_addr.sock_addr, &sockname_len)) { + syslog(LOG_WARNING, "create_iwpm_socket_v4: Unable to get socket name. %s.\n", + strerror(errno)); + close(pm_sock); + pm_sock = -errno; + goto create_socket_v4_exit; + } + + iwpm_debug(IWARP_PM_WIRE_DBG, "create_iwpm_socket_v4: Socket IP address:port %s:%u\n", + inet_ntop(bind_in4->sin_family, &bind_in4->sin_addr.s_addr, ip_address_text, + INET6_ADDRSTRLEN), be16toh(bind_in4->sin_port)); +create_socket_v4_exit: + return pm_sock; +} + +/** + * create_iwpm_socket_v6 - Create an ipv6 socket for the iwarp port mapper + * @bind_port: UDP port to bind the socket + * + * Return a handle of ipv6 socket + */ +int create_iwpm_socket_v6(__u16 bind_port) +{ + sockaddr_union bind_addr; + struct sockaddr_in6 *bind_in6; + int pm_sock, ret_value, ipv6_only; + socklen_t sockname_len; + char ip_address_text[INET6_ADDRSTRLEN]; + + /* create a socket */ + pm_sock = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP); + if (pm_sock < 0) { + syslog(LOG_WARNING, "create_iwpm_socket_v6: Unable to create socket. %s.\n", + strerror(errno)); + pm_sock = -errno; + goto create_socket_v6_exit; + } + + ipv6_only = 1; + ret_value = setsockopt(pm_sock, IPPROTO_IPV6, IPV6_V6ONLY, &ipv6_only, sizeof(ipv6_only)); + if (ret_value < 0) { + syslog(LOG_WARNING, "create_iwpm_socket_v6: Unable to set sock options. %s.\n", + strerror(errno)); + close(pm_sock); + pm_sock = -errno; + goto create_socket_v6_exit; + } + + /* bind the socket to the given port */ + memset(&bind_addr, 0, sizeof(bind_addr)); + bind_in6 = &bind_addr.v6_sockaddr; + bind_in6->sin6_family = AF_INET6; + bind_in6->sin6_addr = in6addr_any; + bind_in6->sin6_port = htobe16(bind_port); + + if (bind(pm_sock, &bind_addr.sock_addr, sizeof(struct sockaddr_in6))) { + syslog(LOG_WARNING, "create_iwpm_socket_v6: Unable to bind socket (port = %u). %s.\n", + bind_port, strerror(errno)); + close(pm_sock); + pm_sock = -errno; + goto create_socket_v6_exit; + } + + /* get the socket name (local port number) */ + sockname_len = sizeof(struct sockaddr_in6); + if (getsockname(pm_sock, &bind_addr.sock_addr, &sockname_len)) { + syslog(LOG_WARNING, "create_iwpm_socket_v6: Unable to get socket name. 
%s.\n", + strerror(errno)); + close(pm_sock); + pm_sock = -errno; + goto create_socket_v6_exit; + } + + iwpm_debug(IWARP_PM_WIRE_DBG, "create_iwpm_socket_v6: Socket IP address:port %s:%04X\n", + inet_ntop(bind_in6->sin6_family, &bind_in6->sin6_addr, ip_address_text, + INET6_ADDRSTRLEN), be16toh(bind_in6->sin6_port)); +create_socket_v6_exit: + return pm_sock; +} + +/** + * create_netlink_socket - Create netlink socket for the iwarp port mapper + */ +int create_netlink_socket(void) +{ + sockaddr_union bind_addr; + struct sockaddr_nl *bind_nl; + int nl_sock; + __u32 rbuf_size, opt_len; + + /* create a socket */ + nl_sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_RDMA); + if (nl_sock < 0) { + syslog(LOG_WARNING, "create_netlink_socket: Unable to create socket. %s.\n", + strerror(errno)); + nl_sock = -errno; + goto create_nl_socket_exit; + } + + /* bind the socket */ + memset(&bind_addr, 0, sizeof(bind_addr)); + bind_nl = &bind_addr.nl_sockaddr; + bind_nl->nl_family = AF_NETLINK; + bind_nl->nl_pid = getpid(); + bind_nl->nl_groups = 3; /* != 0 support multicast */ + + if (bind(nl_sock, &bind_addr.sock_addr, sizeof(struct sockaddr_nl))) { + syslog(LOG_WARNING, "create_netlink_socket: Unable to bind socket. %s.\n", + strerror(errno)); + close(nl_sock); + nl_sock = -errno; + goto create_nl_socket_exit; + } + if (iwpm_param_vals[NL_SOCK_RBUF_SIZE] > 0) { + rbuf_size = iwpm_param_vals[NL_SOCK_RBUF_SIZE]; + + if (setsockopt(nl_sock, SOL_SOCKET, SO_RCVBUFFORCE, &rbuf_size, sizeof rbuf_size)) { + syslog(LOG_WARNING, "create_netlink_socket: Unable to set sock option " + "(rbuf_size = %u). %s.\n", rbuf_size, strerror(errno)); + if (setsockopt(nl_sock, SOL_SOCKET, SO_RCVBUF, + &rbuf_size, sizeof rbuf_size)) { + syslog(LOG_WARNING, "create_netlink_socket: " + "Unable to set sock option %s. 
Closing socket\n", strerror(errno));
+				close(nl_sock);
+				nl_sock = -errno;
+				goto create_nl_socket_exit;
+			}
+		}
+	}
+	/* opt_len must be initialized for getsockopt() to fill in rbuf_size */
+	opt_len = sizeof(rbuf_size);
+	getsockopt(nl_sock, SOL_SOCKET, SO_RCVBUF, &rbuf_size, &opt_len);
+	iwpm_debug(IWARP_PM_NETLINK_DBG, "create_netlink_socket: Netlink socket receive buffer size (rbuf_size = %u).\n", rbuf_size);
+
+create_nl_socket_exit:
+	return nl_sock;
+}
+
+/**
+ * destroy_iwpm_socket - Close socket
+ */
+void destroy_iwpm_socket(int pm_sock)
+{
+	if (pm_sock > 0)
+		close(pm_sock);
+}
+
+/**
+ * check_iwpm_nlattr - Check for NULL netlink attribute
+ */
+static int check_iwpm_nlattr(struct nlattr *nltb[], int nla_count)
+{
+	int i, ret = 0;
+	for (i = 1; i < nla_count; i++) {
+		if (!nltb[i]) {
+			iwpm_debug(IWARP_PM_NETLINK_DBG, "check_iwpm_nlattr: NULL (attr idx = %d)\n", i);
+			ret = -EINVAL;
+		}
+	}
+	return ret;
+}
+
+/**
+ * parse_iwpm_nlmsg - Parse a netlink message
+ * @req_nlh: netlink header of the received message to parse
+ * @policy_max: the number of attributes in the policy
+ * @nlmsg_policy: the attribute policy
+ * @nltb: array to store the parsed attributes
+ * @msg_type: netlink message type (dbg purpose)
+ */
+int parse_iwpm_nlmsg(struct nlmsghdr *req_nlh, int policy_max,
+			struct nla_policy *nlmsg_policy, struct nlattr *nltb [],
+			const char *msg_type)
+{
+	const char *str_err;
+	int ret;
+
+	if ((ret = nlmsg_validate(req_nlh, 0, policy_max-1, nlmsg_policy))) {
+		str_err = "nlmsg_validate error";
+		goto parse_nlmsg_error;
+	}
+	if ((ret = nlmsg_parse(req_nlh, 0, nltb, policy_max-1, nlmsg_policy))) {
+		str_err = "nlmsg_parse error";
+		goto parse_nlmsg_error;
+	}
+	if (check_iwpm_nlattr(nltb, policy_max)) {
+		ret = -EINVAL;
+		str_err = "NULL nlmsg attribute";
+		goto parse_nlmsg_error;
+	}
+	return 0;
+parse_nlmsg_error:
+	syslog(LOG_WARNING, "parse_iwpm_nlmsg: msg type = %s (%s ret = %d)\n",
+			msg_type, str_err, ret);
+	return ret;
+}
+
+/**
+ * send_iwpm_nlmsg - Send a netlink message
+ * @nl_sock: netlink socket to use for sending the message
+ * @nlmsg: netlink message to send
+ * @dest_pid: pid of the destination of the nlmsg
+ */
+int send_iwpm_nlmsg(int nl_sock, struct nl_msg *nlmsg, int dest_pid)
+{
+	struct sockaddr_nl dest_addr;
+	struct nlmsghdr *nlh = nlmsg_hdr(nlmsg);
+	__u32 nlmsg_len = nlh->nlmsg_len;
+	int len;
+
+	/* fill in the netlink address of the client */
+	memset(&dest_addr, 0, sizeof(dest_addr));
+	dest_addr.nl_groups = 0;
+	dest_addr.nl_family = AF_NETLINK;
+	dest_addr.nl_pid = dest_pid;
+
+	/* send response to the client */
+	len = sendto(nl_sock, (char *)nlh, nlmsg_len, 0,
+			(struct sockaddr *)&dest_addr, sizeof(dest_addr));
+	if (len != nlmsg_len)
+		return -errno;
+	return 0;
+}
+
+/**
+ * create_iwpm_nlmsg - Create a netlink message
+ * @nlmsg_type: type of the netlink message
+ * @client_idx: the port mapper client to receive the message
+ */
+struct nl_msg *create_iwpm_nlmsg(__u16 nlmsg_type, int client_idx)
+{
+	struct nl_msg *nlmsg;
+	struct nlmsghdr *nlh;
+	__u32 seq = 0;
+
+	nlmsg = nlmsg_alloc();
+	if (!nlmsg)
+		return NULL;
+	if (client_idx > 0)
+		seq = client_list[client_idx].nl_seq++;
+
+	nlh = nlmsg_put(nlmsg, getpid(), seq, nlmsg_type, 0, NLM_F_REQUEST);
+	if (!nlh) {
+		nlmsg_free(nlmsg);
+		return NULL;
+	}
+	return nlmsg;
+}
+
+/**
+ * parse_iwpm_msg - Parse iwarp port mapper wire message
+ * @pm_msg: iwpm message to be parsed
+ * @msg_parms: contains the parameters of the iwpm message after parsing
+ */
+int parse_iwpm_msg(iwpm_wire_msg *pm_msg, iwpm_msg_parms *msg_parms)
+{
+	int ret_value = 0;
+
+	msg_parms->pmtime = 
pm_msg->pmtime; + msg_parms->assochandle = be64toh(pm_msg->assochandle); + msg_parms->ip_ver = (pm_msg->magic & IWARP_PM_IPVER_MASK) >> IWARP_PM_IPVER_SHIFT; + switch (msg_parms->ip_ver) { + case 4: + msg_parms->address_family = AF_INET; + break; + case 6: + msg_parms->address_family = AF_INET6; + break; + default: + syslog(LOG_WARNING, "parse_iwpm_msg: Invalid IP version = %d.\n", + msg_parms->ip_ver); + return -EINVAL; + } + /* port mapper protocol version */ + msg_parms->ver = (pm_msg->magic & IWARP_PM_VER_MASK) >> IWARP_PM_VER_SHIFT; + /* message type */ + msg_parms->mt = (pm_msg->magic & IWARP_PM_MT_MASK) >> IWARP_PM_MT_SHIFT; + msg_parms->apport = pm_msg->apport; /* accepting peer port */ + msg_parms->cpport = pm_msg->cpport; /* connecting peer port */ + /* copy accepting peer IP address */ + memcpy(&msg_parms->apipaddr, &pm_msg->apipaddr, IWPM_IPADDR_SIZE); + /* copy connecting peer IP address */ + memcpy(&msg_parms->cpipaddr, &pm_msg->cpipaddr, IWPM_IPADDR_SIZE); + if (msg_parms->mt == IWARP_PM_MT_REQ) { + msg_parms->mapped_cpport = pm_msg->reserved; + memcpy(&msg_parms->mapped_cpipaddr, &pm_msg->mapped_cpipaddr, IWPM_IPADDR_SIZE); + } + return ret_value; +} + +/** + * form_iwpm_msg - Form iwarp port mapper wire message + * @pm_msg: iwpm message to be formed + * @msg_parms: the parameters to be packed in a iwpm message + */ +static void form_iwpm_msg(iwpm_wire_msg *pm_msg, iwpm_msg_parms *msg_parms) +{ + memset(pm_msg, 0, sizeof(struct iwpm_wire_msg)); + pm_msg->pmtime = msg_parms->pmtime; + pm_msg->assochandle = htobe64(msg_parms->assochandle); + /* record IP version, port mapper version, message type */ + pm_msg->magic = (msg_parms->ip_ver << IWARP_PM_IPVER_SHIFT) & IWARP_PM_IPVER_MASK; + pm_msg->magic |= (msg_parms->ver << IWARP_PM_VER_SHIFT) & IWARP_PM_VER_MASK; + pm_msg->magic |= (msg_parms->mt << IWARP_PM_MT_SHIFT) & IWARP_PM_MT_MASK; + + pm_msg->apport = msg_parms->apport; + pm_msg->cpport = msg_parms->cpport; + memcpy(&pm_msg->apipaddr, &msg_parms->apipaddr, IWPM_IPADDR_SIZE); + memcpy(&pm_msg->cpipaddr, &msg_parms->cpipaddr, IWPM_IPADDR_SIZE); + if (msg_parms->mt == IWARP_PM_MT_REQ) { + pm_msg->reserved = msg_parms->mapped_cpport; + memcpy(&pm_msg->mapped_cpipaddr, &msg_parms->mapped_cpipaddr, IWPM_IPADDR_SIZE); + } +} + +/** + * form_iwpm_request - Form iwarp port mapper request message + * @pm_msg: iwpm message to be formed + * @msg_parms: the parameters to be packed in a iwpm message + **/ +void form_iwpm_request(struct iwpm_wire_msg *pm_msg, + struct iwpm_msg_parms *msg_parms) +{ + msg_parms->mt = IWARP_PM_MT_REQ; + msg_parms->msize = IWARP_PM_MESSAGE_SIZE + IWPM_IPADDR_SIZE; + form_iwpm_msg(pm_msg, msg_parms); +} + +/** + * form_iwpm_accept - Form iwarp port mapper accept message + * @pm_msg: iwpm message to be formed + * @msg_parms: the parameters to be packed in a iwpm message + **/ +void form_iwpm_accept(struct iwpm_wire_msg *pm_msg, + struct iwpm_msg_parms *msg_parms) +{ + msg_parms->mt = IWARP_PM_MT_ACC; + msg_parms->msize = IWARP_PM_MESSAGE_SIZE; + form_iwpm_msg(pm_msg, msg_parms); +} + +/** + * form_iwpm_ack - Form iwarp port mapper ack message + * @pm_msg: iwpm message to be formed + * @msg_parms: the parameters to be packed in a iwpm message + **/ +void form_iwpm_ack(struct iwpm_wire_msg *pm_msg, + struct iwpm_msg_parms *msg_parms) +{ + msg_parms->mt = IWARP_PM_MT_ACK; + msg_parms->msize = IWARP_PM_MESSAGE_SIZE; + form_iwpm_msg(pm_msg, msg_parms); +} + +/** + * form_iwpm_reject - Form iwarp port mapper reject message + * @pm_msg: iwpm message to be formed + * 
@msg_parms: the parameters to be packed in a iwpm message + */ +void form_iwpm_reject(struct iwpm_wire_msg *pm_msg, + struct iwpm_msg_parms *msg_parms) +{ + msg_parms->mt = IWARP_PM_MT_REJ; + msg_parms->msize = IWARP_PM_MESSAGE_SIZE; + form_iwpm_msg(pm_msg, msg_parms); +} + +/** + * get_sockaddr_port - Report the tcp port number, contained in the sockaddr + * @sockaddr: sockaddr storage to get the tcp port from + */ +__be16 get_sockaddr_port(struct sockaddr_storage *sockaddr) +{ + struct sockaddr_in *sockaddr_v4; + struct sockaddr_in6 *sockaddr_v6; + __be16 port = 0; + + switch (sockaddr->ss_family) { + case AF_INET: + sockaddr_v4 = (struct sockaddr_in *)sockaddr; + port = sockaddr_v4->sin_port; + break; + case AF_INET6: + sockaddr_v6 = (struct sockaddr_in6 *)sockaddr; + port = sockaddr_v6->sin6_port; + break; + default: + syslog(LOG_WARNING, "get_sockaddr_port: Invalid sockaddr family.\n"); + break; + } + return port; +} + +/** + * copy_iwpm_sockaddr - Copy (IP address and Port) from src to dst + * @address_family: Internet address family + * @src_sockaddr: socket address to copy (if NULL, use src_addr) + * @dst_sockaddr: socket address to update (if NULL, use dst_addr) + * @src_addr: IP address to copy (if NULL, use src_sockaddr) + * @dst_addr: IP address to update (if NULL, use dst_sockaddr) + * @src_port: port to copy in dst_sockaddr, if src_sockaddr = NULL + * port to update, if src_sockaddr != NULL and dst_sockaddr = NULL + */ +void copy_iwpm_sockaddr(__u16 addr_family, struct sockaddr_storage *src_sockaddr, + struct sockaddr_storage *dst_sockaddr, + char *src_addr, char *dst_addr, __be16 *src_port) +{ + switch (addr_family) { + case AF_INET: { + const struct in_addr *src = (void *)src_addr; + struct in_addr *dst = (void *)dst_addr; + const struct sockaddr_in *src_sockaddr_in; + struct sockaddr_in *dst_sockaddr_in; + + if (src_sockaddr) { + src_sockaddr_in = (const void *)src_sockaddr; + src = &src_sockaddr_in->sin_addr; + *src_port = src_sockaddr_in->sin_port; + } + if (dst_sockaddr) { + dst_sockaddr_in = (void *)dst_sockaddr; + dst = &dst_sockaddr_in->sin_addr; + dst_sockaddr_in->sin_port = *src_port; + dst_sockaddr_in->sin_family = AF_INET; + } + *dst = *src; + break; + } + case AF_INET6: { + const struct in6_addr *src = (void *)src_addr; + struct in6_addr *dst = (void *)dst_addr; + const struct sockaddr_in6 *src_sockaddr_in6; + struct sockaddr_in6 *dst_sockaddr_in6; + + if (src_sockaddr) { + src_sockaddr_in6 = (const void *)src_sockaddr; + src = &src_sockaddr_in6->sin6_addr; + *src_port = src_sockaddr_in6->sin6_port; + } + if (dst_sockaddr) { + dst_sockaddr_in6 = (void *)dst_sockaddr; + dst = &dst_sockaddr_in6->sin6_addr; + dst_sockaddr_in6->sin6_port = *src_port; + dst_sockaddr_in6->sin6_family = AF_INET6; + } + *dst = *src; + break; + } + } +} + +/** + * is_wcard_ipaddr - Check if the search_addr has a wild card ip address + */ +int is_wcard_ipaddr(struct sockaddr_storage *search_addr) +{ + int ret = 0; + + switch (search_addr->ss_family) { + case AF_INET: { + struct sockaddr_in wcard_addr; + struct sockaddr_in *in4addr = (struct sockaddr_in *)search_addr; + inet_pton(AF_INET, "0.0.0.0", &wcard_addr.sin_addr); + + if (in4addr->sin_addr.s_addr == wcard_addr.sin_addr.s_addr) + ret = 1; + break; + } + case AF_INET6: { + struct sockaddr_in6 wcard_addr; + struct sockaddr_in6 *in6addr = (struct sockaddr_in6 *)search_addr; + inet_pton(AF_INET6, "::", &wcard_addr.sin6_addr); + + if (!memcmp(in6addr->sin6_addr.s6_addr, + wcard_addr.sin6_addr.s6_addr, IWPM_IPADDR_SIZE)) + ret = 1; + 
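+			/* the address matched the IPv6 unspecified address ("::"), i.e. a wildcard */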
break; + } + default: + syslog(LOG_WARNING, "check_same_sockaddr: Invalid addr family 0x%02X\n", + search_addr->ss_family); + break; + } + return ret; +} + +/** + * print_iwpm_sockaddr - Print socket address (IP address and Port) + * @sockaddr: socket address to print + * @msg: message to print + */ +void print_iwpm_sockaddr(struct sockaddr_storage *sockaddr, const char *msg, + __u32 dbg_flag) +{ + struct sockaddr_in6 *sockaddr_v6; + struct sockaddr_in *sockaddr_v4; + char ip_address_text[INET6_ADDRSTRLEN]; + + switch (sockaddr->ss_family) { + case AF_INET: + sockaddr_v4 = (struct sockaddr_in *)sockaddr; + iwpm_debug(dbg_flag, "%s IPV4 %s:%u(0x%04X)\n", msg, + inet_ntop(AF_INET, &sockaddr_v4->sin_addr, ip_address_text, INET6_ADDRSTRLEN), + be16toh(sockaddr_v4->sin_port), be16toh(sockaddr_v4->sin_port)); + break; + case AF_INET6: + sockaddr_v6 = (struct sockaddr_in6 *)sockaddr; + iwpm_debug(dbg_flag, "%s IPV6 %s:%u(0x%04X)\n", msg, + inet_ntop(AF_INET6, &sockaddr_v6->sin6_addr, ip_address_text, INET6_ADDRSTRLEN), + be16toh(sockaddr_v6->sin6_port), be16toh(sockaddr_v6->sin6_port)); + break; + default: + break; + } +} diff --git a/iwpmd/iwarp_pm_helper.c b/iwpmd/iwarp_pm_helper.c new file mode 100644 index 0000000..b82de5c --- /dev/null +++ b/iwpmd/iwarp_pm_helper.c @@ -0,0 +1,605 @@ +/* + * Copyright (c) 2013-2016 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include "iwarp_pm.h" + +static LIST_HEAD(mapped_ports); /* list of mapped ports */ + +/** + * create_iwpm_map_request - Create a new map request tracking object + * @req_nlh: netlink header of the received client message + * @src_addr: the local address of the client initiating the request + * @remote_addr: the destination (the port mapper peer) address + * @assochandle: unique number per host + * @msg_type: message types are request, accept and ack + * @send_msg: message to retransmit to the remote port mapper peer, + * if the request isn't serviced on time. 
+ */ +iwpm_mapping_request *create_iwpm_map_request(struct nlmsghdr *req_nlh, + struct sockaddr_storage *src_addr, struct sockaddr_storage *remote_addr, + __u64 assochandle, int msg_type, iwpm_send_msg *send_msg) +{ + iwpm_mapping_request *iwpm_map_req; + __u32 type = 0, seq = 0, pid = 0; + + /* create iwpm conversation tracking object */ + iwpm_map_req = malloc(sizeof(iwpm_mapping_request)); + if (!iwpm_map_req) + return NULL; + if (req_nlh) { + type = req_nlh->nlmsg_type; + seq = req_nlh->nlmsg_seq; + pid = req_nlh->nlmsg_pid; + } + memset(iwpm_map_req, 0, sizeof(iwpm_mapping_request)); + iwpm_map_req->timeout = IWPM_MAP_REQ_TIMEOUT; + iwpm_map_req->complete = 0; + iwpm_map_req->msg_type = msg_type; + iwpm_map_req->send_msg = send_msg; + + iwpm_map_req->nlmsg_type = type; + iwpm_map_req->nlmsg_seq = seq; + iwpm_map_req->nlmsg_pid = pid; + /* assochandle helps match iwpm request sent to remote peer with future iwpm accept/reject */ + iwpm_map_req->assochandle = assochandle; + if (!assochandle) + iwpm_map_req->assochandle = (uintptr_t)iwpm_map_req; + + memcpy(&iwpm_map_req->src_addr, src_addr, sizeof(struct sockaddr_storage)); + /* keep record of remote IP address and port */ + memcpy(&iwpm_map_req->remote_addr, remote_addr, sizeof(struct sockaddr_storage)); + return iwpm_map_req; +} + +/** + * add_iwpm_map_request - Add a map request tracking object to a global list + * @iwpm_map_req: mapping request to be saved + */ +void add_iwpm_map_request(iwpm_mapping_request *iwpm_map_req) +{ + pthread_mutex_lock(&map_req_mutex); + list_add(&mapping_reqs, &iwpm_map_req->entry); + /* if not wake, signal the thread that a new request has been posted */ + if (!wake) + pthread_cond_signal(&cond_req_complete); + pthread_mutex_unlock(&map_req_mutex); +} + +/** + * remove_iwpm_map_request - Free a map request tracking object + * @iwpm_map_req: mapping request to be removed + * + * Routine must be called within lock context + */ +void remove_iwpm_map_request(iwpm_mapping_request *iwpm_map_req) +{ + if (!iwpm_map_req->complete && iwpm_map_req->msg_type != IWARP_PM_REQ_ACK) { + iwpm_debug(IWARP_PM_RETRY_DBG, "remove_iwpm_map_request: " + "Timeout for request (type = %u pid = %d)\n", + iwpm_map_req->msg_type, iwpm_map_req->nlmsg_pid); + } + list_del(&iwpm_map_req->entry); + if (iwpm_map_req->send_msg) + free(iwpm_map_req->send_msg); + free(iwpm_map_req); +} + +/** + * update_iwpm_map_request - Find and update a map request tracking object + * @assochandle: the request assochandle to search for + * @src_addr: the request src address to search for + * @msg_type: the request type to search for + * @iwpm_copy_req: to store a copy of the found map request object + * @update: if set update the found request, otherwise don't update + */ +int update_iwpm_map_request(__u64 assochandle, struct sockaddr_storage *src_addr, + int msg_type, iwpm_mapping_request *iwpm_copy_req, int update) +{ + iwpm_mapping_request *iwpm_map_req; + int ret = -EINVAL; + + pthread_mutex_lock(&map_req_mutex); + /* look for a matching entry in the list */ + list_for_each(&mapping_reqs, iwpm_map_req, entry) { + if (assochandle == iwpm_map_req->assochandle && + (msg_type & iwpm_map_req->msg_type) && + check_same_sockaddr(src_addr, &iwpm_map_req->src_addr)) { + ret = 0; + /* get a copy of the request (a different thread is in charge of freeing it) */ + memcpy(iwpm_copy_req, iwpm_map_req, sizeof(iwpm_mapping_request)); + if (!update) + goto update_map_request_exit; + if (iwpm_map_req->complete) + goto update_map_request_exit; + + /* update the 
request object */
+			if (iwpm_map_req->msg_type == IWARP_PM_REQ_ACK) {
+				iwpm_map_req->timeout = IWPM_MAP_REQ_TIMEOUT;
+				iwpm_map_req->complete = 0;
+			} else {
+				/* already serviced request could be freed */
+				iwpm_map_req->timeout = 0;
+				iwpm_map_req->complete = 1;
+			}
+			goto update_map_request_exit;
+		}
+	}
+update_map_request_exit:
+	pthread_mutex_unlock(&map_req_mutex);
+	return ret;
+}
+
+/**
+ * send_iwpm_msg - Form and send iwpm message to the remote peer
+ */
+int send_iwpm_msg(void (*form_msg_type)(iwpm_wire_msg *, iwpm_msg_parms *),
+		iwpm_msg_parms *msg_parms, struct sockaddr_storage *recv_addr, int send_sock)
+{
+	iwpm_send_msg send_msg;
+
+	form_msg_type(&send_msg.data, msg_parms);
+	form_iwpm_send_msg(send_sock, recv_addr, msg_parms->msize, &send_msg);
+	return add_iwpm_pending_msg(&send_msg);
+}
+
+/**
+ * check_iwpm_ip_addr - Check if the local IP address is valid
+ * @local_addr: local IP address to verify
+ *
+ * Check if the local IP address is used by the host ethernet interfaces
+ */
+static int check_iwpm_ip_addr(struct sockaddr_storage *local_addr)
+{
+	struct ifaddrs *ifa_list;
+	struct ifaddrs *ifa_current;
+	int found_addr = 0;
+	int ret = -EINVAL;
+
+	/* get a list of host ethernet interfaces */
+	if (getifaddrs(&ifa_list) < 0) {
+		syslog(LOG_WARNING, "check_iwpm_ip_addr: Unable to get the list of interfaces (%s).\n",
+				strerror(errno));
+		return -errno;
+	}
+	/* go through the list to make sure local IP address is valid */
+	ifa_current = ifa_list;
+	while (ifa_current != NULL && !found_addr) {
+		/* entries without an address have a NULL ifa_addr */
+		if (ifa_current->ifa_addr &&
+		    local_addr->ss_family == ifa_current->ifa_addr->sa_family) {
+			switch (ifa_current->ifa_addr->sa_family) {
+			case AF_INET: {
+				if (!memcmp(&((struct sockaddr_in *)
+					ifa_current->ifa_addr)->sin_addr.s_addr,
+					&((struct sockaddr_in *)local_addr)->sin_addr.s_addr,
+					IWARP_PM_IPV4_ADDR)) {
+
+					found_addr = 1;
+				}
+				break;
+			}
+			case AF_INET6: {
+				/* compare the 16 byte IPv6 address, not INET6_ADDRSTRLEN bytes */
+				if (!memcmp(&((struct sockaddr_in6 *)
+					ifa_current->ifa_addr)->sin6_addr.s6_addr,
+					&((struct sockaddr_in6 *)local_addr)->sin6_addr.s6_addr,
+					IWPM_IPADDR_SIZE))
+
+					found_addr = 1;
+				break;
+			}
+			default:
+				break;
+			}
+		}
+		ifa_current = ifa_current->ifa_next;
+	}
+	if (found_addr)
+		ret = 0;
+
+	freeifaddrs(ifa_list);
+	return ret;
+}
+
+/**
+ * get_iwpm_ip_addr - Get a mapped IP address
+ * @local_addr: local IP address to map
+ * @mapped_addr: to store the mapped local IP address
+ *
+ * Currently, don't map the local IP address
+ */
+static int get_iwpm_ip_addr(struct sockaddr_storage *local_addr,
+				struct sockaddr_storage *mapped_addr)
+{
+	int ret = check_iwpm_ip_addr(local_addr);
+	if (!ret)
+		memcpy(mapped_addr, local_addr, sizeof(struct sockaddr_storage));
+	else
+		iwpm_debug(IWARP_PM_ALL_DBG, "get_iwpm_ip_addr: Invalid local IP address.\n");
+
+	return ret;
+}
+
+/**
+ * get_iwpm_tcp_port - Get a new TCP port from the host stack
+ * @addr_family: should be valid AF_INET or AF_INET6
+ * @requested_port: set only if reopening of mapped port
+ * @mapped_addr: to store the mapped TCP port
+ * @new_sock: to store socket handle (bound to the mapped TCP port)
+ */
+static int get_iwpm_tcp_port(__u16 addr_family, __be16 requested_port,
+				struct sockaddr_storage *mapped_addr, int *new_sock)
+{
+	sockaddr_union bind_addr;
+	struct sockaddr_in *bind_in4;
+	struct sockaddr_in6 *bind_in6;
+	socklen_t sockname_len;
+	__be16 *new_port = NULL, *mapped_port = NULL;
+	const char *str_err = "";
+
+	/* create a socket */
+	*new_sock = socket(addr_family, SOCK_STREAM, 0);
+	if (*new_sock < 0) {
+		str_err = "Unable to create socket";
+		goto get_tcp_port_error;
+	}
+
+	memset(&bind_addr, 0, sizeof(bind_addr));
+	switch (addr_family) {
+	case AF_INET:
+		mapped_port = &((struct sockaddr_in *)mapped_addr)->sin_port;
+		bind_in4 = &bind_addr.v4_sockaddr;
+		bind_in4->sin_family = addr_family;
+		bind_in4->sin_addr.s_addr = htobe32(INADDR_ANY);
+		/* nonzero requested_port is a flag: reuse the port already saved in mapped_addr */
+		if (requested_port)
+			requested_port = *mapped_port;
+		bind_in4->sin_port = requested_port;
+		new_port = &bind_in4->sin_port;
+		break;
+	case AF_INET6:
+		mapped_port = &((struct sockaddr_in6 *)mapped_addr)->sin6_port;
+		bind_in6 = &bind_addr.v6_sockaddr;
+		bind_in6->sin6_family = addr_family;
+		bind_in6->sin6_addr = in6addr_any;
+		if (requested_port)
+			requested_port = *mapped_port;
+		bind_in6->sin6_port = requested_port;
+		new_port = &bind_in6->sin6_port;
+		break;
+	default:
+		str_err = "Invalid Internet address family";
+		goto get_tcp_port_error;
+	}
+
+	if (bind(*new_sock, &bind_addr.sock_addr, sizeof(bind_addr))) {
+		str_err = "Unable to bind the socket";
+		goto get_tcp_port_error;
+	}
+	/* get the TCP port */
+	sockname_len = sizeof(bind_addr);
+	if (getsockname(*new_sock, &bind_addr.sock_addr, &sockname_len)) {
+		str_err = "Unable to get socket name";
+		goto get_tcp_port_error;
+	}
+	*mapped_port = *new_port;
+	iwpm_debug(IWARP_PM_ALL_DBG, "get_iwpm_tcp_port: Open tcp port "
+		"(addr family = %04X, requested port = %04X, mapped port = %04X).\n",
+		addr_family, be16toh(requested_port), be16toh(*mapped_port));
+	return 0;
+get_tcp_port_error:
+	/* don't leak the socket fd if it was created before the failure */
+	if (*new_sock >= 0) {
+		close(*new_sock);
+		*new_sock = -1;
+	}
+	syslog(LOG_WARNING, "get_iwpm_tcp_port: %s (addr family = %04X, requested port = %04X).\n",
+			str_err, addr_family, be16toh(requested_port));
+	return -errno;
+}
+
+/**
+ * get_iwpm_port - Allocate and initialize a new mapped port object
+ */
+static iwpm_mapped_port *get_iwpm_port(int client_idx, struct sockaddr_storage *local_addr,
+		struct sockaddr_storage *mapped_addr, int sd)
+{
+	iwpm_mapped_port *iwpm_port;
+
+	iwpm_port = malloc(sizeof(iwpm_mapped_port));
+	if (!iwpm_port) {
+		syslog(LOG_WARNING, "get_iwpm_port: Unable to allocate a mapped port.\n");
+		return NULL;
+	}
+	memset(iwpm_port, 0, sizeof(*iwpm_port));
+
+	/* record local and mapped address in the mapped port object */
+	memcpy(&iwpm_port->local_addr, local_addr, sizeof(struct sockaddr_storage));
+	memcpy(&iwpm_port->mapped_addr, mapped_addr, sizeof(struct sockaddr_storage));
+	iwpm_port->owner_client = client_idx;
+	iwpm_port->sd = sd;
+	atomic_init(&iwpm_port->ref_cnt, 1);
+	if (is_wcard_ipaddr(local_addr))
+		iwpm_port->wcard = 1;
+	return iwpm_port;
+}
+
+/**
+ * create_iwpm_mapped_port - Create a new mapped port object
+ * @local_addr: local address to be mapped (IP address and TCP port)
+ * @client_idx: the index of the client owner of the mapped port
+ */
+iwpm_mapped_port *create_iwpm_mapped_port(struct sockaddr_storage *local_addr, int client_idx, __u32 flags)
+{
+	iwpm_mapped_port *iwpm_port;
+	struct sockaddr_storage mapped_addr;
+	int new_sd;
+
+	/* check the local IP address */
+	if (get_iwpm_ip_addr(local_addr, &mapped_addr))
+		goto create_mapped_port_error;
+	/* get a tcp port from the host net stack */
+	if (flags & IWPM_FLAGS_NO_PORT_MAP) {
+		mapped_addr = *local_addr;
+		new_sd = -1;
+	} else {
+		if (get_iwpm_tcp_port(local_addr->ss_family, 0, &mapped_addr, &new_sd))
+			goto create_mapped_port_error;
+	}
+
+	iwpm_port = get_iwpm_port(client_idx, local_addr, &mapped_addr, new_sd);
+	return iwpm_port;
+
+create_mapped_port_error:
+	iwpm_debug(IWARP_PM_ALL_DBG, "create_iwpm_mapped_port: Could not 
make port mapping.\n"); + return NULL; +} + +/** + * reopen_iwpm_mapped_port - Create a new mapped port object + * @local_addr: local address to be mapped (IP address and TCP port) + * @mapped_addr: mapped address to be remapped (IP address and TCP port) + * @client_idx: the index of the client owner of the mapped port + */ +iwpm_mapped_port *reopen_iwpm_mapped_port(struct sockaddr_storage *local_addr, + struct sockaddr_storage *mapped_addr, int client_idx, + __u32 flags) +{ + iwpm_mapped_port *iwpm_port; + int new_sd = -1; + const char *str_err = ""; + int ret = check_iwpm_ip_addr(local_addr); + if (ret) { + str_err = "Invalid local IP address"; + goto reopen_mapped_port_error; + } + if (local_addr->ss_family != mapped_addr->ss_family) { + str_err = "Different local and mapped sockaddr families"; + goto reopen_mapped_port_error; + } + if (!(flags & IWPM_FLAGS_NO_PORT_MAP)) { + if (get_iwpm_tcp_port(local_addr->ss_family, htobe16(1), mapped_addr, &new_sd)) + goto reopen_mapped_port_error; + } + iwpm_port = get_iwpm_port(client_idx, local_addr, mapped_addr, new_sd); + return iwpm_port; + +reopen_mapped_port_error: + iwpm_debug(IWARP_PM_ALL_DBG, "reopen_iwpm_mapped_port: Could not make port mapping (%s).\n", + str_err); + if (new_sd >= 0) + close(new_sd); + return NULL; +} + +/** + * add_iwpm_mapped_port - Add mapping to a global list + * @iwpm_port: mapping to be saved + */ +void add_iwpm_mapped_port(iwpm_mapped_port *iwpm_port) +{ + static int dbg_idx = 1; + if (atomic_load(&iwpm_port->ref_cnt) > 1) + return; + iwpm_debug(IWARP_PM_ALL_DBG, "add_iwpm_mapped_port: Adding a new mapping #%d\n", dbg_idx++); + list_add(&mapped_ports, &iwpm_port->entry); +} + +/** + * check_same_sockaddr - Compare two sock addresses; + * return true if they are same, false otherwise + */ +int check_same_sockaddr(struct sockaddr_storage *sockaddr_a, struct sockaddr_storage *sockaddr_b) +{ + int ret = 0; + if (sockaddr_a->ss_family == sockaddr_b->ss_family) { + switch (sockaddr_a->ss_family) { + case AF_INET: { + struct sockaddr_in *in4addr_a = (struct sockaddr_in *)sockaddr_a; + struct sockaddr_in *in4addr_b = (struct sockaddr_in *)sockaddr_b; + + if ((in4addr_a->sin_addr.s_addr == in4addr_b->sin_addr.s_addr) + && (in4addr_a->sin_port == in4addr_b->sin_port)) + ret = 1; + + break; + } + case AF_INET6: { + struct sockaddr_in6 *in6addr_a = (struct sockaddr_in6 *)sockaddr_a; + struct sockaddr_in6 *in6addr_b = (struct sockaddr_in6 *)sockaddr_b; + + if ((!memcmp(in6addr_a->sin6_addr.s6_addr, + in6addr_b->sin6_addr.s6_addr, IWPM_IPADDR_SIZE)) && + (in6addr_a->sin6_port == in6addr_b->sin6_port)) + ret = 1; + + break; + } + default: + syslog(LOG_WARNING, "check_same_sockaddr: Invalid addr family 0x%02X\n", + sockaddr_a->ss_family); + break; + } + } + return ret; +} + +/** + * find_iwpm_mapping - Find saved mapped port object + * @search_addr: IP address and port to search for in the list + * @not_mapped: if set, compare local addresses, otherwise compare mapped addresses + * + * Compares the search_sockaddr to the addresses in the list, + * to find a saved port object with the sockaddr or + * a wild card address with the same tcp port + */ +iwpm_mapped_port *find_iwpm_mapping(struct sockaddr_storage *search_addr, + int not_mapped) +{ + iwpm_mapped_port *iwpm_port, *saved_iwpm_port = NULL; + struct sockaddr_storage *current_addr; + + list_for_each(&mapped_ports, iwpm_port, entry) { + current_addr = (not_mapped)? 
&iwpm_port->local_addr : &iwpm_port->mapped_addr; + + if (get_sockaddr_port(search_addr) == get_sockaddr_port(current_addr)) { + if (check_same_sockaddr(search_addr, current_addr) || + iwpm_port->wcard || is_wcard_ipaddr(search_addr)) { + saved_iwpm_port = iwpm_port; + goto find_mapping_exit; + } + } + } +find_mapping_exit: + return saved_iwpm_port; +} + +/** + * find_iwpm_same_mapping - Find saved mapped port object + * @search_addr: IP address and port to search for in the list + * @not_mapped: if set, compare local addresses, otherwise compare mapped addresses + * + * Compares the search_sockaddr to the addresses in the list, + * to find a saved port object with the same sockaddr + */ +iwpm_mapped_port *find_iwpm_same_mapping(struct sockaddr_storage *search_addr, + int not_mapped) +{ + iwpm_mapped_port *iwpm_port, *saved_iwpm_port = NULL; + struct sockaddr_storage *current_addr; + + list_for_each(&mapped_ports, iwpm_port, entry) { + current_addr = (not_mapped)? &iwpm_port->local_addr : &iwpm_port->mapped_addr; + if (check_same_sockaddr(search_addr, current_addr)) { + saved_iwpm_port = iwpm_port; + goto find_same_mapping_exit; + } + } +find_same_mapping_exit: + return saved_iwpm_port; +} + +/** + * free_iwpm_port - Free mapping object + * @iwpm_port: mapped port object to be freed + */ +void free_iwpm_port(iwpm_mapped_port *iwpm_port) +{ + if (iwpm_port->sd != -1) + close(iwpm_port->sd); + free(iwpm_port); +} + +/** + * remove_iwpm_mapped_port - Remove a mapping from a global list + * @iwpm_port: mapping to be removed + * + * Called only by the main iwarp port mapper thread + */ +void remove_iwpm_mapped_port(iwpm_mapped_port *iwpm_port) +{ + static int dbg_idx = 1; + iwpm_debug(IWARP_PM_ALL_DBG, "remove_iwpm_mapped_port: index = %d\n", dbg_idx++); + + list_del(&iwpm_port->entry); +} + +void print_iwpm_mapped_ports(void) +{ + iwpm_mapped_port *iwpm_port; + int i = 0; + + syslog(LOG_WARNING, "print_iwpm_mapped_ports:\n"); + + list_for_each(&mapped_ports, iwpm_port, entry) { + syslog(LOG_WARNING, "Mapping #%d\n", i++); + print_iwpm_sockaddr(&iwpm_port->local_addr, "Local address", IWARP_PM_DEBUG); + print_iwpm_sockaddr(&iwpm_port->mapped_addr, "Mapped address", IWARP_PM_DEBUG); + } +} + +/** + * form_iwpm_send_msg - Form a message to send on the wire + */ +void form_iwpm_send_msg(int pm_sock, struct sockaddr_storage *dest, + int length, iwpm_send_msg *send_msg) +{ + send_msg->pm_sock = pm_sock; + send_msg->length = length; + memcpy(&send_msg->dest_addr, dest, sizeof(send_msg->dest_addr)); +} + +/** + * add_iwpm_pending_msg - Add wire message to a global list of pending messages + * @send_msg: message to send to the remote port mapper peer + */ +int add_iwpm_pending_msg(iwpm_send_msg *send_msg) +{ + iwpm_pending_msg *pending_msg = malloc(sizeof(iwpm_pending_msg)); + if (!pending_msg) { + syslog(LOG_WARNING, "add_iwpm_pending_msg: Unable to allocate message.\n"); + return -ENOMEM; + } + memcpy(&pending_msg->send_msg, send_msg, sizeof(iwpm_send_msg)); + + pthread_mutex_lock(&pending_msg_mutex); + list_add(&pending_messages, &pending_msg->entry); + pthread_mutex_unlock(&pending_msg_mutex); + /* signal the thread that a new message has been posted */ + pthread_cond_signal(&cond_pending_msg); + return 0; +} + +/** + * free_iwpm_mapped_ports - Free all iwpm mapped port objects + */ +void free_iwpm_mapped_ports(void) +{ + iwpm_mapped_port *iwpm_port; + + while ((iwpm_port = list_pop(&mapped_ports, iwpm_mapped_port, entry))) + free_iwpm_port(iwpm_port); +} diff --git a/iwpmd/iwarp_pm_server.c 
b/iwpmd/iwarp_pm_server.c
new file mode 100644
index 0000000..0f9f459
--- /dev/null
+++ b/iwpmd/iwarp_pm_server.c
@@ -0,0 +1,1579 @@
+/*
+ * Copyright (c) 2013-2016 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ *	copyright notice, this list of conditions and the following
+ *	disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *	copyright notice, this list of conditions and the following
+ *	disclaimer in the documentation and/or other materials
+ *	provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include "config.h"
+#include <systemd/sd-daemon.h>
+#include <getopt.h>
+#include "iwarp_pm.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+static const char iwpm_ulib_name [] = "iWarpPortMapperUser";
+static __u16 iwpm_version = IWPM_UABI_VERSION;
+
+LIST_HEAD(mapping_reqs); /* list of map tracking objects */
+LIST_HEAD(pending_messages); /* list of pending wire messages */
+iwpm_client client_list[IWARP_PM_MAX_CLIENTS]; /* list of iwarp port mapper clients */
+static int mapinfo_num_list[IWARP_PM_MAX_CLIENTS]; /* number of mapinfo messages processed per client */
+
+/* socket handles */
+static int pmv4_sock, pmv6_sock, netlink_sock, pmv4_client_sock, pmv6_client_sock;
+
+static pthread_t map_req_thread; /* handles mapping request timeouts */
+pthread_cond_t cond_req_complete;
+pthread_mutex_t map_req_mutex = PTHREAD_MUTEX_INITIALIZER;
+int wake = 0; /* set if map_req_thread is awake */
+
+static pthread_t pending_msg_thread; /* sending iwpm wire messages */
+pthread_cond_t cond_pending_msg;
+pthread_mutex_t pending_msg_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static void iwpm_cleanup(void);
+static int print_mappings = 0;
+static int send_iwpm_mapinfo_request(int nl_sock, int client);
+
+/**
+ * iwpm_signal_handler - Handle signals which iwarp port mapper receives
+ * @signum: the number of the caught signal
+ */
+static void iwpm_signal_handler(int signum)
+{
+	switch(signum) {
+	case SIGHUP:
+		syslog(LOG_WARNING, "iwpm_signal_handler: Received SIGHUP signal\n");
+		iwpm_cleanup();
+		exit(signum);
+		break;
+	case SIGTERM:
+		syslog(LOG_WARNING, "iwpm_signal_handler: Received SIGTERM signal\n");
+		iwpm_cleanup();
+		exit(EXIT_SUCCESS);
+		break;
+	case SIGUSR1:
+		syslog(LOG_WARNING, "iwpm_signal_handler: Received SIGUSR1 signal\n");
+		print_mappings = 1;
+		break;
+	default:
+		syslog(LOG_WARNING, "iwpm_signal_handler: Unhandled signal %d\n", signum);
+		break;
+	}
+}
+
+/**
+ * iwpm_mapping_reqs_handler - Handle mapping request timeouts and retries
+ */
+static void 
*iwpm_mapping_reqs_handler(void *unused) +{ + iwpm_mapping_request *iwpm_map_req, *next_map_req; + int ret = 0; + + while (1) { + pthread_mutex_lock(&map_req_mutex); + wake = 0; + if (list_empty(&mapping_reqs)) { + /* wait until a new mapping request is posted */ + ret = pthread_cond_wait(&cond_req_complete, &map_req_mutex); + if (ret) { + syslog(LOG_WARNING, "mapping_reqs_handler: " + "Condition wait failed (ret = %d)\n", ret); + pthread_mutex_unlock(&map_req_mutex); + goto mapping_reqs_handler_exit; + } + } + pthread_mutex_unlock(&map_req_mutex); + /* update timeouts of the posted mapping requests */ + do { + pthread_mutex_lock(&map_req_mutex); + wake = 1; + list_for_each_safe(&mapping_reqs, iwpm_map_req, next_map_req, entry) { + if (iwpm_map_req->timeout > 0) { + if (iwpm_map_req->timeout < IWPM_MAP_REQ_TIMEOUT && + iwpm_map_req->msg_type != IWARP_PM_REQ_ACK) { + /* the request is still incomplete, retransmit the message (every 1sec) */ + add_iwpm_pending_msg(iwpm_map_req->send_msg); + + iwpm_debug(IWARP_PM_RETRY_DBG, "mapping_reqs_handler: " + "Going to retransmit a msg, map request " + "(assochandle = %llu, type = %u, timeout = %d)\n", + iwpm_map_req->assochandle, iwpm_map_req->msg_type, + iwpm_map_req->timeout); + } + iwpm_map_req->timeout--; /* hang around for 10s */ + } else { + remove_iwpm_map_request(iwpm_map_req); + } + } + pthread_mutex_unlock(&map_req_mutex); + sleep(1); + } while (!list_empty(&mapping_reqs)); + } +mapping_reqs_handler_exit: + return NULL; +} + +/** + * iwpm_pending_msgs_handler - Handle sending iwarp port mapper wire messages + */ +static void *iwpm_pending_msgs_handler(void *unused) +{ + iwpm_pending_msg *pending_msg; + iwpm_send_msg *send_msg; + int retries = IWPM_SEND_MSG_RETRIES; + int ret = 0; + + pthread_mutex_lock(&pending_msg_mutex); + while (1) { + /* wait until a new message is posted */ + ret = pthread_cond_wait(&cond_pending_msg, &pending_msg_mutex); + if (ret) { + syslog(LOG_WARNING, "pending_msgs_handler: " + "Condition wait failed (ret = %d)\n", ret); + pthread_mutex_unlock(&pending_msg_mutex); + goto pending_msgs_handler_exit; + } + + /* try sending out each pending message and remove it from the list */ + while ((pending_msg = list_pop(&pending_messages, + iwpm_pending_msg, entry))) { + retries = IWPM_SEND_MSG_RETRIES; + while (retries) { + send_msg = &pending_msg->send_msg; + /* send out the message */ + int bytes_sent = sendto(send_msg->pm_sock, (char *)&send_msg->data, + send_msg->length, 0, + (struct sockaddr *)&send_msg->dest_addr, + sizeof(send_msg->dest_addr)); + if (bytes_sent != send_msg->length) { + retries--; + syslog(LOG_WARNING, "pending_msgs_handler: " + "Could not send to PM Socket send_msg = %p, retries = %d\n", + send_msg, retries); + } else + retries = 0; /* no need to retry */ + } + free(pending_msg); + } + } + pthread_mutex_unlock(&pending_msg_mutex); + +pending_msgs_handler_exit: + return NULL; +} + +static int send_iwpm_error_msg(__u32, __u16, int, int); + +/* Register pid query - nlmsg attributes */ +static struct nla_policy reg_pid_policy[IWPM_NLA_REG_PID_MAX] = { + [IWPM_NLA_REG_PID_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_REG_IF_NAME] = { .type = NLA_STRING, + .maxlen = IWPM_IFNAME_SIZE }, + [IWPM_NLA_REG_IBDEV_NAME] = { .type = NLA_STRING, + .maxlen = IWPM_ULIBNAME_SIZE }, + [IWPM_NLA_REG_ULIB_NAME] = { .type = NLA_STRING, + .maxlen = IWPM_ULIBNAME_SIZE } +}; + +/** + * process_iwpm_register_pid - Service a client query for port mapper pid + * @req_nlh: netlink header of the received client message + * @client_idx: 
the index of the client (unique for each iwpm client) + * @nl_sock: netlink socket to send a message back to the client + * + * Process a query and send a response to the client which contains the iwpm pid + * nlmsg response attributes: + * IWPM_NLA_RREG_PID_SEQ + * IWPM_NLA_RREG_IBDEV_NAME + * IWPM_NLA_RREG_ULIB_NAME + * IWPM_NLA_RREG_ULIB_VER + * IWPM_NLA_RREG_PID_ERR + */ +static int process_iwpm_register_pid(struct nlmsghdr *req_nlh, int client_idx, int nl_sock) +{ + iwpm_client *client; + struct nlattr *nltb [IWPM_NLA_REG_PID_MAX]; + struct nl_msg *resp_nlmsg = NULL; + const char *ifname, *devname, *libname; + __u16 err_code = 0; + const char *msg_type = "Register Pid Request"; + const char *str_err; + int ret = -EINVAL; + + if (parse_iwpm_nlmsg(req_nlh, IWPM_NLA_REG_PID_MAX, reg_pid_policy, nltb, msg_type)) { + str_err = "Received Invalid nlmsg"; + err_code = IWPM_INVALID_NLMSG_ERR; + goto register_pid_error; + } + + ifname = (const char *)nla_get_string(nltb[IWPM_NLA_REG_IF_NAME]); + devname = (const char *)nla_get_string(nltb[IWPM_NLA_REG_IBDEV_NAME]); + libname = (const char *)nla_get_string(nltb[IWPM_NLA_REG_ULIB_NAME]); + + iwpm_debug(IWARP_PM_NETLINK_DBG, "process_register_pid: PID request from " + "IB device %s Ethernet device %s User library %s " + "(client idx = %d, msg seq = %u).\n", + devname, ifname, libname, client_idx, req_nlh->nlmsg_seq); + + /* register a first time client */ + client = &client_list[client_idx]; + if (!client->valid) { + memcpy(client->ibdevname, devname, IWPM_DEVNAME_SIZE); + memcpy(client->ifname, ifname, IWPM_IFNAME_SIZE); + memcpy(client->ulibname, libname, IWPM_ULIBNAME_SIZE); + client->valid = 1; + } else { /* check client info */ + if (strcmp(client->ulibname, libname)) { + str_err = "Incorrect library version"; + err_code = IWPM_USER_LIB_INFO_ERR; + goto register_pid_error; + } + } + resp_nlmsg = create_iwpm_nlmsg(req_nlh->nlmsg_type, client_idx); + if (!resp_nlmsg) { + ret = -ENOMEM; + str_err = "Unable to create nlmsg response"; + goto register_pid_error; + } + str_err = "Invalid nlmsg attribute"; + if ((ret = nla_put_u32(resp_nlmsg, IWPM_NLA_RREG_PID_SEQ, req_nlh->nlmsg_seq))) + goto register_pid_error; + if ((ret = nla_put_string(resp_nlmsg, IWPM_NLA_RREG_IBDEV_NAME, devname))) + goto register_pid_error; + if ((ret = nla_put_string(resp_nlmsg, IWPM_NLA_RREG_ULIB_NAME, iwpm_ulib_name))) + goto register_pid_error; + if ((ret = nla_put_u16(resp_nlmsg, IWPM_NLA_RREG_ULIB_VER, iwpm_version))) + goto register_pid_error; + if ((ret = nla_put_u16(resp_nlmsg, IWPM_NLA_RREG_PID_ERR, err_code))) + goto register_pid_error; + + if ((ret = send_iwpm_nlmsg(nl_sock, resp_nlmsg, req_nlh->nlmsg_pid))) { + str_err = "Unable to send nlmsg response"; + goto register_pid_error; + } + nlmsg_free(resp_nlmsg); + return 0; +register_pid_error: + if (resp_nlmsg) + nlmsg_free(resp_nlmsg); + syslog(LOG_WARNING, "process_register_pid: %s ret = %d.\n", str_err, ret); + if (err_code) + send_iwpm_error_msg(req_nlh->nlmsg_seq, err_code, client_idx, nl_sock); + return ret; +} + +/* Add mapping request - nlmsg attributes */ +static struct nla_policy manage_map_policy[IWPM_NLA_MANAGE_MAPPING_MAX] = { + [IWPM_NLA_MANAGE_MAPPING_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_MANAGE_ADDR] = { .minlen = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_MANAGE_FLAGS] = { .type = NLA_U32 } +}; + +/** + * process_iwpm_add_mapping - Service a client request for mapping of a local address + * @req_nlh: netlink header of the received client message + * @client_idx: the index of the client 
(unique for each iwpm client) + * @nl_sock: netlink socket to send a message back to the client + * + * Process a mapping request for a local address and send a response to the client + * which contains the mapped local address (IP address and TCP port) + * nlmsg response attributes: + * [IWPM_NLA_RMANAGE_MAPPING_SEQ] + * [IWPM_NLA_RMANAGE_ADDR] + * [IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR] + * [IWPM_NLA_RMANAGE_MAPPING_ERR] + */ +static int process_iwpm_add_mapping(struct nlmsghdr *req_nlh, int client_idx, int nl_sock) +{ + iwpm_mapped_port *iwpm_port = NULL; + struct nlattr *nltb [IWPM_NLA_MANAGE_MAPPING_MAX] = {}; + struct nl_msg *resp_nlmsg = NULL; + struct sockaddr_storage *local_addr; + int not_mapped = 1; + __u16 err_code = 0; + const char *msg_type = "Add Mapping Request"; + const char *str_err = ""; + int ret = -EINVAL; + __u32 flags; + int max = IWPM_NLA_MANAGE_MAPPING_MAX; + + if (iwpm_version != IWPM_UABI_VERSION) + max--; + if (parse_iwpm_nlmsg(req_nlh, max, manage_map_policy, nltb, msg_type)) { + err_code = IWPM_INVALID_NLMSG_ERR; + str_err = "Received Invalid nlmsg"; + goto add_mapping_error; + } + local_addr = (struct sockaddr_storage *)nla_data(nltb[IWPM_NLA_MANAGE_ADDR]); + flags = nltb[IWPM_NLA_MANAGE_FLAGS] ? nla_get_u32(nltb[IWPM_NLA_MANAGE_FLAGS]) : 0; + + iwpm_port = find_iwpm_mapping(local_addr, not_mapped); + if (iwpm_port) { + if (check_same_sockaddr(local_addr, &iwpm_port->local_addr) && iwpm_port->wcard) { + atomic_fetch_add(&iwpm_port->ref_cnt, 1); + } else { + err_code = IWPM_DUPLICATE_MAPPING_ERR; + str_err = "Duplicate mapped port"; + goto add_mapping_error; + } + + } else { + iwpm_port = create_iwpm_mapped_port(local_addr, client_idx, flags); + if (!iwpm_port) { + err_code = IWPM_CREATE_MAPPING_ERR; + str_err = "Unable to create new mapping"; + goto add_mapping_error; + } + } + resp_nlmsg = create_iwpm_nlmsg(req_nlh->nlmsg_type, client_idx); + if (!resp_nlmsg) { + ret = -ENOMEM; + str_err = "Unable to create nlmsg response"; + goto add_mapping_free_error; + } + str_err = "Invalid nlmsg attribute"; + if ((ret = nla_put_u32(resp_nlmsg, IWPM_NLA_RMANAGE_MAPPING_SEQ, req_nlh->nlmsg_seq))) + goto add_mapping_free_error; + if ((ret = nla_put(resp_nlmsg, IWPM_NLA_RMANAGE_ADDR, + sizeof(struct sockaddr_storage), &iwpm_port->local_addr))) + goto add_mapping_free_error; + if ((ret = nla_put(resp_nlmsg, IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR, + sizeof(struct sockaddr_storage), &iwpm_port->mapped_addr))) + goto add_mapping_free_error; + if ((ret = nla_put_u16(resp_nlmsg, IWPM_NLA_RMANAGE_MAPPING_ERR, err_code))) + goto add_mapping_free_error; + + if ((ret = send_iwpm_nlmsg(nl_sock, resp_nlmsg, req_nlh->nlmsg_pid))) { + str_err = "Unable to send nlmsg response"; + goto add_mapping_free_error; + } + /* add the new mapping to the list */ + add_iwpm_mapped_port(iwpm_port); + nlmsg_free(resp_nlmsg); + return 0; + +add_mapping_free_error: + if (resp_nlmsg) + nlmsg_free(resp_nlmsg); + if (iwpm_port) { + if (atomic_fetch_sub(&iwpm_port->ref_cnt, 1) == 1) + free_iwpm_port(iwpm_port); + } +add_mapping_error: + syslog(LOG_WARNING, "process_add_mapping: %s (failed request from client = %s).\n", + str_err, client_list[client_idx].ibdevname); + if (err_code) { + /* send error message to the client */ + send_iwpm_error_msg(req_nlh->nlmsg_seq, err_code, client_idx, nl_sock); + } + return ret; +} + +/* Query mapping request - nlmsg attributes */ +static struct nla_policy query_map_policy[IWPM_NLA_QUERY_MAPPING_MAX] = { + [IWPM_NLA_QUERY_MAPPING_SEQ] = { .type = NLA_U32 }, + 
[IWPM_NLA_QUERY_LOCAL_ADDR] = { .minlen = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_QUERY_REMOTE_ADDR] = { .minlen = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_QUERY_FLAGS] = { .type = NLA_U32 } +}; + +/** + * process_iwpm_query_mapping - Service a client request for local and remote mapping + * @req_nlh: netlink header of the received client message + * @client_idx: the index of the client (the index is unique for each iwpm client) + * @nl_sock: netlink socket to send a message back to the client + * + * Process a client request for local and remote address mapping + * Create mapping for the local address (IP address and TCP port) + * Send a request to the remote port mapper peer to find out the remote address mapping + */ +static int process_iwpm_query_mapping(struct nlmsghdr *req_nlh, int client_idx, int nl_sock) +{ + iwpm_mapped_port *iwpm_port = NULL; + iwpm_mapping_request *iwpm_map_req = NULL; + struct nlattr *nltb [IWPM_NLA_QUERY_MAPPING_MAX] = {}; + struct sockaddr_storage *local_addr, *remote_addr; + sockaddr_union dest_addr; + iwpm_msg_parms msg_parms; + iwpm_send_msg *send_msg = NULL; + int pm_client_sock; + int not_mapped = 1; + __u16 err_code = 0; + const char *msg_type = "Add & Query Mapping Request"; + const char *str_err = ""; + int ret = -EINVAL; + __u32 flags; + int max = IWPM_NLA_QUERY_MAPPING_MAX; + + if (iwpm_version != IWPM_UABI_VERSION) + max--; + if (parse_iwpm_nlmsg(req_nlh, max, query_map_policy, nltb, msg_type)) { + err_code = IWPM_INVALID_NLMSG_ERR; + str_err = "Received Invalid nlmsg"; + goto query_mapping_error; + } + local_addr = (struct sockaddr_storage *)nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]); + remote_addr = (struct sockaddr_storage *)nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]); + flags = nltb[IWPM_NLA_QUERY_FLAGS] ? 
nla_get_u32(nltb[IWPM_NLA_QUERY_FLAGS]) : 0; + + iwpm_port = find_iwpm_mapping(local_addr, not_mapped); + if (iwpm_port) { + atomic_fetch_add(&iwpm_port->ref_cnt, 1); + } else { + iwpm_port = create_iwpm_mapped_port(local_addr, client_idx, flags); + if (!iwpm_port) { + err_code = IWPM_CREATE_MAPPING_ERR; + str_err = "Unable to create new mapping"; + goto query_mapping_error; + } + } + if (iwpm_port->wcard) { + err_code = IWPM_CREATE_MAPPING_ERR; + str_err = "Invalid wild card mapping"; + goto query_mapping_free_error; + } + /* create iwpm wire message */ + memcpy(&dest_addr.s_sockaddr, remote_addr, sizeof(struct sockaddr_storage)); + switch (dest_addr.s_sockaddr.ss_family) { + case AF_INET: + dest_addr.v4_sockaddr.sin_port = htobe16(IWARP_PM_PORT); + msg_parms.ip_ver = 4; + msg_parms.address_family = AF_INET; + pm_client_sock = pmv4_client_sock; + break; + case AF_INET6: + dest_addr.v6_sockaddr.sin6_port = htobe16(IWARP_PM_PORT); + msg_parms.ip_ver = 6; + msg_parms.address_family = AF_INET6; + pm_client_sock = pmv6_client_sock; + break; + default: + str_err = "Invalid Internet address family"; + goto query_mapping_free_error; + } + /* fill in the remote peer address and the local mapped address */ + copy_iwpm_sockaddr(dest_addr.s_sockaddr.ss_family, remote_addr, NULL, NULL, + &msg_parms.apipaddr[0], &msg_parms.apport); + copy_iwpm_sockaddr(dest_addr.s_sockaddr.ss_family, local_addr, NULL, NULL, + &msg_parms.cpipaddr[0], &msg_parms.cpport); + copy_iwpm_sockaddr(dest_addr.s_sockaddr.ss_family, &iwpm_port->mapped_addr, NULL, NULL, + &msg_parms.mapped_cpipaddr[0], &msg_parms.mapped_cpport); + msg_parms.pmtime = 0; + msg_parms.ver = 0; + iwpm_debug(IWARP_PM_WIRE_DBG, "process_query_mapping: Local port = 0x%04X, " + "remote port = 0x%04X\n", + be16toh(msg_parms.cpport), be16toh(msg_parms.apport)); + ret = -ENOMEM; + send_msg = malloc(sizeof(iwpm_send_msg)); + if (!send_msg) { + str_err = "Unable to allocate send msg buffer"; + goto query_mapping_free_error; + } + iwpm_map_req = create_iwpm_map_request(req_nlh, &iwpm_port->local_addr, remote_addr, 0, + IWARP_PM_REQ_QUERY, send_msg); + if (!iwpm_map_req) { + str_err = "Unable to allocate mapping request"; + goto query_mapping_free_error; + } + msg_parms.assochandle = iwpm_map_req->assochandle; + form_iwpm_request(&send_msg->data, &msg_parms); + form_iwpm_send_msg(pm_client_sock, &dest_addr.s_sockaddr, msg_parms.msize, send_msg); + + add_iwpm_map_request(iwpm_map_req); + add_iwpm_mapped_port(iwpm_port); + + return send_iwpm_msg(form_iwpm_request, &msg_parms, &dest_addr.s_sockaddr, pm_client_sock); +query_mapping_free_error: + if (iwpm_port) { + if (atomic_fetch_sub(&iwpm_port->ref_cnt, 1) == 1) + free_iwpm_port(iwpm_port); + } + if (send_msg) + free(send_msg); + if (iwpm_map_req) + free(iwpm_map_req); +query_mapping_error: + syslog(LOG_WARNING, "process_query_mapping: %s (failed request from client = %s).\n", + str_err, client_list[client_idx].ibdevname); + if (err_code) { + /* send error message to the client */ + send_iwpm_error_msg(req_nlh->nlmsg_seq, err_code, client_idx, nl_sock); + } + return ret; +} + +/** + * process_iwpm_remove_mapping - Remove a local mapping and close the mapped TCP port + * @req_nlh: netlink header of the received client message + * @client_idx: the index of the client (the index is unique for each iwpm client) + * @nl_sock: netlink socket to send a message to the client + */ +static int process_iwpm_remove_mapping(struct nlmsghdr *req_nlh, int client_idx, int nl_sock) +{ + iwpm_mapped_port *iwpm_port = NULL; + struct 
sockaddr_storage *local_addr; + struct nlattr *nltb [IWPM_NLA_MANAGE_MAPPING_MAX]; + int not_mapped = 1; + const char *msg_type = "Remove Mapping Request"; + int ret = 0; + + if (parse_iwpm_nlmsg(req_nlh, IWPM_NLA_REMOVE_MAPPING_MAX, manage_map_policy, nltb, msg_type)) { + send_iwpm_error_msg(req_nlh->nlmsg_seq, IWPM_INVALID_NLMSG_ERR, client_idx, nl_sock); + syslog(LOG_WARNING, "process_remove_mapping: Received Invalid nlmsg from client = %d\n", + client_idx); + ret = -EINVAL; + goto remove_mapping_exit; + } + local_addr = (struct sockaddr_storage *)nla_data(nltb[IWPM_NLA_MANAGE_ADDR]); + iwpm_debug(IWARP_PM_NETLINK_DBG, "process_remove_mapping: Going to remove mapping" + " (client idx = %d)\n", client_idx); + + iwpm_port = find_iwpm_same_mapping(local_addr, not_mapped); + if (!iwpm_port) { + iwpm_debug(IWARP_PM_NETLINK_DBG, "process_remove_mapping: Unable to find mapped port object\n"); + print_iwpm_sockaddr(local_addr, "process_remove_mapping: Local address", IWARP_PM_ALL_DBG); + /* the client sends a remove mapping request when terminating a connection + and it is possible that there isn't a successful mapping for this connection */ + goto remove_mapping_exit; + } + if (iwpm_port->owner_client != client_idx) { + syslog(LOG_WARNING, "process_remove_mapping: Invalid request from client = %d\n", + client_idx); + goto remove_mapping_exit; + } + if (atomic_fetch_sub(&iwpm_port->ref_cnt, 1) == 1) { + remove_iwpm_mapped_port(iwpm_port); + free_iwpm_port(iwpm_port); + } +remove_mapping_exit: + return ret; +} + +static int send_conn_info_nlmsg(struct sockaddr_storage *local_addr, + struct sockaddr_storage *remote_addr, + struct sockaddr_storage *mapped_loc_addr, + struct sockaddr_storage *mapped_rem_addr, + int owner_client, __u16 nlmsg_type, __u32 nlmsg_seq, + __u32 nlmsg_pid, __u16 nlmsg_err, int nl_sock) + +{ + struct nl_msg *resp_nlmsg = NULL; + const char *str_err; + int ret; + + resp_nlmsg = create_iwpm_nlmsg(nlmsg_type, owner_client); + if (!resp_nlmsg) { + str_err = "Unable to create nlmsg response"; + ret = -ENOMEM; + goto nlmsg_error; + } + str_err = "Invalid nlmsg attribute"; + if ((ret = nla_put_u32(resp_nlmsg, IWPM_NLA_QUERY_MAPPING_SEQ, nlmsg_seq))) + goto nlmsg_free_error; + if ((ret = nla_put(resp_nlmsg, IWPM_NLA_QUERY_LOCAL_ADDR, + sizeof(struct sockaddr_storage), local_addr))) + goto nlmsg_free_error; + if ((ret = nla_put(resp_nlmsg, IWPM_NLA_QUERY_REMOTE_ADDR, + sizeof(struct sockaddr_storage), remote_addr))) + goto nlmsg_free_error; + if ((ret = nla_put(resp_nlmsg, IWPM_NLA_RQUERY_MAPPED_LOC_ADDR, + sizeof(struct sockaddr_storage), mapped_loc_addr))) + goto nlmsg_free_error; + if ((ret = nla_put(resp_nlmsg, IWPM_NLA_RQUERY_MAPPED_REM_ADDR, + sizeof(struct sockaddr_storage), mapped_rem_addr))) + goto nlmsg_free_error; + if ((ret = nla_put_u16(resp_nlmsg, IWPM_NLA_RQUERY_MAPPING_ERR, nlmsg_err))) + goto nlmsg_free_error; + + if ((ret = send_iwpm_nlmsg(nl_sock, resp_nlmsg, nlmsg_pid))) { + str_err = "Unable to send nlmsg response"; + goto nlmsg_free_error; + } + nlmsg_free(resp_nlmsg); + return 0; +nlmsg_free_error: + if (resp_nlmsg) + nlmsg_free(resp_nlmsg); +nlmsg_error: + syslog(LOG_WARNING, "send_conn_info_nlmsg: %s.\n", str_err); + return ret; +} + +/** + * process_iwpm_wire_request - Process a mapping query from remote port mapper peer + * @msg_parms: the received iwpm request message + * @recv_addr: address of the remote peer + * @pm_sock: socket handle to send a response to the remote iwpm peer + * + * Look up the accepting peer local address to find the corresponding 
mapping, + * send reject message to the remote connecting peer, if no mapping is found, + * otherwise, send accept message with the accepting peer mapping info + */ +static int process_iwpm_wire_request(iwpm_msg_parms *msg_parms, int nl_sock, + struct sockaddr_storage *recv_addr, int pm_sock) +{ + iwpm_mapped_port *iwpm_port; + iwpm_mapping_request *iwpm_map_req = NULL; + iwpm_mapping_request iwpm_copy_req; + iwpm_send_msg *send_msg = NULL; + struct sockaddr_storage local_addr, mapped_loc_addr; + struct sockaddr_storage remote_addr, mapped_rem_addr; + __u16 nlmsg_type; + int not_mapped = 1; + int ret = 0; + + copy_iwpm_sockaddr(msg_parms->address_family, NULL, &local_addr, + &msg_parms->apipaddr[0], NULL, &msg_parms->apport); + iwpm_port = find_iwpm_mapping(&local_addr, not_mapped); + if (!iwpm_port) { + /* could not find mapping for the requested address */ + iwpm_debug(IWARP_PM_WIRE_DBG, "process_wire_request: " + "Sending Reject to port mapper peer.\n"); + print_iwpm_sockaddr(&local_addr, "process_wire_request: Local address", + IWARP_PM_ALL_DBG); + return send_iwpm_msg(form_iwpm_reject, msg_parms, recv_addr, pm_sock); + } + /* record mapping in the accept message */ + if (iwpm_port->wcard) + msg_parms->apport = get_sockaddr_port(&iwpm_port->mapped_addr); + else + copy_iwpm_sockaddr(msg_parms->address_family, &iwpm_port->mapped_addr, + NULL, NULL, &msg_parms->apipaddr[0], &msg_parms->apport); + + copy_iwpm_sockaddr(msg_parms->address_family, NULL, &mapped_loc_addr, + &msg_parms->apipaddr[0], NULL, &msg_parms->apport); + + /* check if there is already a request */ + ret = update_iwpm_map_request(msg_parms->assochandle, &mapped_loc_addr, + IWARP_PM_REQ_ACCEPT, &iwpm_copy_req, 0); + if (!ret) { /* found request */ + iwpm_debug(IWARP_PM_WIRE_DBG,"process_wire_request: Detected retransmission " + "map request (assochandle = %llu type = %d timeout = %u complete = %d)\n", + iwpm_copy_req.assochandle, iwpm_copy_req.msg_type, + iwpm_copy_req.timeout, iwpm_copy_req.complete); + return 0; + } + /* allocate response message */ + send_msg = malloc(sizeof(iwpm_send_msg)); + if (!send_msg) { + syslog(LOG_WARNING, "process_wire_request: Unable to allocate send msg.\n"); + return -ENOMEM; + } + form_iwpm_accept(&send_msg->data, msg_parms); + form_iwpm_send_msg(pm_sock, recv_addr, msg_parms->msize, send_msg); + + copy_iwpm_sockaddr(msg_parms->address_family, NULL, &remote_addr, + &msg_parms->cpipaddr[0], NULL, &msg_parms->cpport); + copy_iwpm_sockaddr(msg_parms->address_family, NULL, &mapped_rem_addr, + &msg_parms->mapped_cpipaddr[0], NULL, &msg_parms->mapped_cpport); + + iwpm_map_req = create_iwpm_map_request(NULL, &mapped_loc_addr, &remote_addr, + msg_parms->assochandle, IWARP_PM_REQ_ACCEPT, send_msg); + if (!iwpm_map_req) { + syslog(LOG_WARNING, "process_wire_request: Unable to allocate mapping request.\n"); + free(send_msg); + return -ENOMEM; + } + add_iwpm_map_request(iwpm_map_req); + ret = send_iwpm_msg(form_iwpm_accept, msg_parms, recv_addr, pm_sock); + if (ret) { + syslog(LOG_WARNING, "process_wire_request: Unable to allocate accept message.\n"); + return ret; + } + nlmsg_type = RDMA_NL_GET_TYPE(iwpm_port->owner_client, RDMA_NL_IWPM_REMOTE_INFO); + ret = send_conn_info_nlmsg(&iwpm_port->local_addr, &remote_addr, + &iwpm_port->mapped_addr, &mapped_rem_addr, + iwpm_port->owner_client, nlmsg_type, 0, 0, 0, nl_sock); + return ret; +} + +/** + * process_iwpm_wire_accept - Process accept message from the remote port mapper peer + * @msg_parms: the received iwpm accept message, containing the remote 
peer mapping info + * @nl_sock: netlink socket to send a message to the iwpm client + * @recv_addr: address of the remote peer + * @pm_sock: socket handle to send ack message back to the remote peer + * + * Send acknowledgement to the remote/accepting peer, + * send a netlink message with the local and remote mapping info to the iwpm client + * nlmsg response attributes: + * [IWPM_NLA_QUERY_MAPPING_SEQ] + * [IWPM_NLA_QUERY_LOCAL_ADDR] + * [IWPM_NLA_QUERY_REMOTE_ADDR] + * [IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] + * [IWPM_NLA_RQUERY_MAPPED_REM_ADDR] + * [IWPM_NLA_RQUERY_MAPPING_ERR] + */ +static int process_iwpm_wire_accept(iwpm_msg_parms *msg_parms, int nl_sock, + struct sockaddr_storage *recv_addr, int pm_sock) +{ + iwpm_mapping_request iwpm_map_req; + iwpm_mapping_request *iwpm_retry_req = NULL; + iwpm_mapped_port *iwpm_port; + struct sockaddr_storage local_addr, remote_mapped_addr; + int not_mapped = 1; + const char *str_err; + int ret; + + copy_iwpm_sockaddr(msg_parms->address_family, NULL, &local_addr, + &msg_parms->cpipaddr[0], NULL, &msg_parms->cpport); + copy_iwpm_sockaddr(msg_parms->address_family, NULL, &remote_mapped_addr, + &msg_parms->apipaddr[0], NULL, &msg_parms->apport); + ret = -EINVAL; + iwpm_port = find_iwpm_same_mapping(&local_addr, not_mapped); + if (!iwpm_port) { + iwpm_debug(IWARP_PM_WIRE_DBG, "process_wire_accept: " + "Received accept for unknown mapping.\n"); + return 0; + } + /* there should be a request for the accept message */ + ret = update_iwpm_map_request(msg_parms->assochandle, &iwpm_port->local_addr, + (IWARP_PM_REQ_QUERY|IWARP_PM_REQ_ACK), &iwpm_map_req, 1); + if (ret) { + iwpm_debug(IWARP_PM_WIRE_DBG, "process_wire_accept: " + "No matching mapping request (assochandle = %llu)\n", + msg_parms->assochandle); + return 0; /* ok when retransmission */ + } + if (iwpm_map_req.complete) + return 0; + /* if the accept has already been processed and this is retransmission */ + if (iwpm_map_req.msg_type == IWARP_PM_REQ_ACK) { + iwpm_debug(IWARP_PM_RETRY_DBG, "process_wire_accept: Detected retransmission " + "(map request assochandle = %llu)\n", iwpm_map_req.assochandle); + goto wire_accept_send_ack; + } + ret = send_conn_info_nlmsg(&iwpm_port->local_addr, &iwpm_map_req.remote_addr, + &iwpm_port->mapped_addr, &remote_mapped_addr, + iwpm_port->owner_client, iwpm_map_req.nlmsg_type, + iwpm_map_req.nlmsg_seq, iwpm_map_req.nlmsg_pid, 0, nl_sock); + if (ret) { + str_err = "Unable to send nlmsg response"; + goto wire_accept_error; + } + /* object to detect retransmission */ + iwpm_retry_req = create_iwpm_map_request(NULL, &iwpm_map_req.src_addr, &iwpm_map_req.remote_addr, + iwpm_map_req.assochandle, IWARP_PM_REQ_ACK, NULL); + if (!iwpm_retry_req) { + ret = -ENOMEM; + str_err = "Unable to allocate retry request"; + goto wire_accept_error; + } + add_iwpm_map_request(iwpm_retry_req); +wire_accept_send_ack: + return send_iwpm_msg(form_iwpm_ack, msg_parms, recv_addr, pm_sock); +wire_accept_error: + syslog(LOG_WARNING, "process_iwpm_wire_accept: %s.\n", str_err); + return ret; +} + +/** + * process_iwpm_wire_reject - Process reject message from the port mapper remote peer + * @msg_parms: the received iwpm reject message + * @nl_sock: netlink socket to send through a message to the iwpm client + * + * Send notification to the iwpm client that its + * mapping request is rejected by the remote/accepting port mapper peer + */ +static int process_iwpm_wire_reject(iwpm_msg_parms *msg_parms, int nl_sock) +{ + iwpm_mapping_request iwpm_map_req; + iwpm_mapped_port *iwpm_port; + struct 
sockaddr_storage local_addr, remote_addr; + int not_mapped = 1; + __u16 err_code = IWPM_REMOTE_QUERY_REJECT; + const char *str_err; + int ret = -EINVAL; + + copy_iwpm_sockaddr(msg_parms->address_family, NULL, &local_addr, + &msg_parms->cpipaddr[0], NULL, &msg_parms->cpport); + copy_iwpm_sockaddr(msg_parms->address_family, NULL, &remote_addr, + &msg_parms->apipaddr[0], NULL, &msg_parms->apport); + + print_iwpm_sockaddr(&local_addr, "process_wire_reject: Local address", + IWARP_PM_ALL_DBG); + print_iwpm_sockaddr(&remote_addr, "process_wire_reject: Remote address", + IWARP_PM_ALL_DBG); + ret = -EINVAL; + iwpm_port = find_iwpm_same_mapping(&local_addr, not_mapped); + if (!iwpm_port) { + syslog(LOG_WARNING, "process_wire_reject: Received reject for unknown mapping.\n"); + return 0; + } + /* make sure there is request posted */ + ret = update_iwpm_map_request(msg_parms->assochandle, &iwpm_port->local_addr, + IWARP_PM_REQ_QUERY, &iwpm_map_req, 1); + if (ret) { + iwpm_debug(IWARP_PM_WIRE_DBG, "process_wire_reject: " + "No matching mapping request (assochandle = %llu)\n", + msg_parms->assochandle); + return 0; /* ok when retransmission */ + } + if (iwpm_map_req.complete) + return 0; + + ret = send_conn_info_nlmsg(&iwpm_port->local_addr, &iwpm_map_req.remote_addr, + &iwpm_port->mapped_addr, &iwpm_map_req.remote_addr, + iwpm_port->owner_client, iwpm_map_req.nlmsg_type, + iwpm_map_req.nlmsg_seq, iwpm_map_req.nlmsg_pid, err_code, nl_sock); + if (ret) { + str_err = "Unable to send nlmsg response"; + goto wire_reject_error; + } + return 0; +wire_reject_error: + syslog(LOG_WARNING, "process_wire_reject: %s.\n", str_err); + return ret; +} + +/** + * process_iwpm_wire_ack - Process acknowledgement from the remote port mapper peer + * @msg_parms: received iwpm acknowledgement + */ +static int process_iwpm_wire_ack(iwpm_msg_parms *msg_parms) +{ + iwpm_mapped_port *iwpm_port; + iwpm_mapping_request iwpm_map_req; + struct sockaddr_storage local_mapped_addr; + int not_mapped = 0; + int ret; + + copy_iwpm_sockaddr(msg_parms->address_family, NULL, &local_mapped_addr, + &msg_parms->apipaddr[0], NULL, &msg_parms->apport); + iwpm_port = find_iwpm_mapping(&local_mapped_addr, not_mapped); + if (!iwpm_port) { + iwpm_debug(IWARP_PM_WIRE_DBG, "process_wire_ack: Received ack for unknown mapping.\n"); + return 0; + } + /* make sure there is accept for the ack */ + ret = update_iwpm_map_request(msg_parms->assochandle, &local_mapped_addr, + IWARP_PM_REQ_ACCEPT, &iwpm_map_req, 1); + if (ret) + iwpm_debug(IWARP_PM_WIRE_DBG, "process_wire_ack: No matching mapping request\n"); + return 0; +} + +/* Mapping info message - nlmsg attributes */ +static struct nla_policy mapinfo_policy[IWPM_NLA_MAPINFO_MAX] = { + [IWPM_NLA_MAPINFO_LOCAL_ADDR] = { .minlen = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_MAPINFO_MAPPED_ADDR] = { .minlen = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_MAPINFO_FLAGS] = { .type = NLA_U32 } +}; + +/** + * process_iwpm_mapinfo - Process a mapping info message from the port mapper client + * @req_nlh: netlink header of the received client message + * @client_idx: the index of the client (the index is unique for each iwpm client) + * @nl_sock: netlink socket to send a message to the client + * + * In case the userspace iwarp port mapper daemon is restarted, + * the iwpm client needs to send a record of mappings it is currently using. + * The port mapper needs to reopen the mapped ports used by the client. 
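+ *
+ * Reopening is done by reopen_iwpm_mapped_port(), which passes a non-zero
+ * requested_port (htobe16(1)) down to get_iwpm_tcp_port(); the non-zero
+ * value tells get_iwpm_tcp_port() to bind the exact port already recorded
+ * in mapped_addr, instead of letting the host network stack pick a new one.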
+ */
+static int process_iwpm_mapinfo(struct nlmsghdr *req_nlh, int client_idx, int nl_sock)
+{
+	iwpm_mapped_port *iwpm_port = NULL;
+	struct sockaddr_storage *local_addr, *local_mapped_addr;
+	struct nlattr *nltb [IWPM_NLA_MAPINFO_MAX] = {};
+	int not_mapped = 1;
+	__u16 err_code = 0;
+	const char *msg_type = "Mapping Info Msg";
+	const char *str_err = "";
+	int ret = -EINVAL;
+	__u32 flags;
+	int max = IWPM_NLA_MAPINFO_MAX;
+
+	if (iwpm_version != IWPM_UABI_VERSION)
+		max--;
+	if (parse_iwpm_nlmsg(req_nlh, max, mapinfo_policy, nltb, msg_type)) {
+		err_code = IWPM_INVALID_NLMSG_ERR;
+		str_err = "Received Invalid nlmsg";
+		goto process_mapinfo_error;
+	}
+	local_addr = (struct sockaddr_storage *)nla_data(nltb[IWPM_NLA_MAPINFO_LOCAL_ADDR]);
+	local_mapped_addr = (struct sockaddr_storage *)nla_data(nltb[IWPM_NLA_MAPINFO_MAPPED_ADDR]);
+	flags = nltb[IWPM_NLA_MAPINFO_FLAGS] ? nla_get_u32(nltb[IWPM_NLA_MAPINFO_FLAGS]) : 0;
+
+	iwpm_port = find_iwpm_mapping(local_addr, not_mapped);
+	if (iwpm_port) {
+		/* Can be safely ignored if the mapinfo is exactly the same,
+		 * because the client will provide all the port information it has and
+		 * it could have started using the port mapper service already */
+		if (check_same_sockaddr(&iwpm_port->local_addr, local_addr) &&
+				check_same_sockaddr(&iwpm_port->mapped_addr, local_mapped_addr))
+			goto process_mapinfo_exit;
+
+		/* partial duplicates matching a wcard IP address aren't allowed either */
+		err_code = IWPM_DUPLICATE_MAPPING_ERR;
+		str_err = "Duplicate mapped port";
+		goto process_mapinfo_error;
+	}
+	iwpm_port = reopen_iwpm_mapped_port(local_addr, local_mapped_addr, client_idx, flags);
+	if (!iwpm_port) {
+		err_code = IWPM_CREATE_MAPPING_ERR;
+		str_err = "Unable to create new mapping";
+		goto process_mapinfo_error;
+	}
+	/* add the new mapping to the list */
+	add_iwpm_mapped_port(iwpm_port);
+process_mapinfo_exit:
+	mapinfo_num_list[client_idx]++;
+	return 0;
+process_mapinfo_error:
+	syslog(LOG_WARNING, "process_mapinfo: %s.\n", str_err);
+	if (err_code) {
+		/* send error message to the client */
+		send_iwpm_error_msg(req_nlh->nlmsg_seq, err_code, client_idx, nl_sock);
+	}
+	return ret;
+}
+
+/* Mapping info message count - nlmsg attributes */
+static struct nla_policy mapinfo_count_policy[IWPM_NLA_MAPINFO_SEND_MAX] = {
+	[IWPM_NLA_MAPINFO_SEQ]      = { .type = NLA_U32 },
+	[IWPM_NLA_MAPINFO_SEND_NUM] = { .type = NLA_U32 }
+};
+
+/**
+ * process_iwpm_mapinfo_count - Process mapinfo count message
+ * @req_nlh: netlink header of the received message from the client
+ * @client_idx: the index of the client
+ * @nl_sock: netlink socket to send a message to the client
+ *
+ * Mapinfo count message is a mechanism for the port mapper and the client to
+ * synchronize on the number of mapinfo messages which were successfully exchanged and processed
+ */
+static int process_iwpm_mapinfo_count(struct nlmsghdr *req_nlh, int client_idx, int nl_sock)
+{
+	struct nlattr *nltb [IWPM_NLA_MAPINFO_SEND_MAX];
+	struct nl_msg *resp_nlmsg = NULL;
+	const char *msg_type = "Number of Mappings Msg";
+	__u32 map_count;
+	__u16 err_code = 0;
+	const char *str_err = "";
+	int ret = -EINVAL;
+
+	if (parse_iwpm_nlmsg(req_nlh, IWPM_NLA_MAPINFO_SEND_MAX,
+				mapinfo_count_policy, nltb, msg_type)) {
+		str_err = "Received Invalid nlmsg";
+		err_code = IWPM_INVALID_NLMSG_ERR;
+		goto mapinfo_count_error;
+	}
+	map_count = nla_get_u32(nltb[IWPM_NLA_MAPINFO_SEND_NUM]);
+	if (map_count != mapinfo_num_list[client_idx])
+		iwpm_debug(IWARP_PM_NETLINK_DBG, "get_mapinfo_count: Client (idx = %d) 
+ "send mapinfo count = %u processed mapinfo count = %u.\n", + client_idx, map_count, mapinfo_num_list[client_idx]); + + resp_nlmsg = create_iwpm_nlmsg(req_nlh->nlmsg_type, client_idx); + if (!resp_nlmsg) { + str_err = "Unable to create nlmsg response"; + ret = -ENOMEM; + goto mapinfo_count_error; + } + str_err = "Invalid nlmsg attribute"; + if ((ret = nla_put_u32(resp_nlmsg, IWPM_NLA_MAPINFO_SEQ, req_nlh->nlmsg_seq))) + goto mapinfo_count_free_error; + if ((ret = nla_put_u32(resp_nlmsg, IWPM_NLA_MAPINFO_SEND_NUM, map_count))) + goto mapinfo_count_free_error; + if ((ret = nla_put_u32(resp_nlmsg, IWPM_NLA_MAPINFO_ACK_NUM, + mapinfo_num_list[client_idx]))) + goto mapinfo_count_free_error; + + if ((ret = send_iwpm_nlmsg(nl_sock, resp_nlmsg, req_nlh->nlmsg_pid))) { + str_err = "Unable to send nlmsg response"; + goto mapinfo_count_free_error; + } + nlmsg_free(resp_nlmsg); + return 0; +mapinfo_count_free_error: + if (resp_nlmsg) + nlmsg_free(resp_nlmsg); +mapinfo_count_error: + syslog(LOG_WARNING, "process_mapinfo_count: %s.\n", str_err); + if (err_code) { + /* send error message to the client */ + send_iwpm_error_msg(req_nlh->nlmsg_seq, err_code, client_idx, nl_sock); + } + return ret; +} + +/** + * send_iwpm_error_msg - Send error message to the iwpm client + * @seq: last received netlink message sequence + * @err_code: used to differentiante between errors + * @client_idx: the index of the client + * @nl_sock: netlink socket to send a message to the client + */ +static int send_iwpm_error_msg(__u32 seq, __u16 err_code, int client_idx, int nl_sock) +{ + struct nl_msg *resp_nlmsg; + __u16 nlmsg_type; + const char *str_err = ""; + int ret; + + nlmsg_type = RDMA_NL_GET_TYPE(client_idx, RDMA_NL_IWPM_HANDLE_ERR); + resp_nlmsg = create_iwpm_nlmsg(nlmsg_type, client_idx); + if (!resp_nlmsg) { + ret = -ENOMEM; + str_err = "Unable to create nlmsg response"; + goto send_error_msg_exit; + } + str_err = "Invalid nlmsg attribute"; + if ((ret = nla_put_u32(resp_nlmsg, IWPM_NLA_ERR_SEQ, seq))) + goto send_error_msg_exit; + if ((ret = nla_put_u16(resp_nlmsg, IWPM_NLA_ERR_CODE, err_code))) + goto send_error_msg_exit; + + if ((ret = send_iwpm_nlmsg(nl_sock, resp_nlmsg, 0))) { + str_err = "Unable to send nlmsg response"; + goto send_error_msg_exit; + } + nlmsg_free(resp_nlmsg); + return 0; +send_error_msg_exit: + if (resp_nlmsg) + nlmsg_free(resp_nlmsg); + syslog(LOG_WARNING, "send_iwpm_error_msg: %s (ret = %d).\n", str_err, ret); + return ret; +} + +/* Hello message - nlmsg attributes */ +static struct nla_policy hello_policy[IWPM_NLA_HELLO_MAX] = { + [IWPM_NLA_HELLO_ABI_VERSION] = { .type = NLA_U16 } +}; + +/** + * process_iwpm_hello - Process mapinfo count message + * @req_nlh: netlink header of the received message from the client + * @client_idx: the index of the client + * @nl_sock: netlink socket to send a message to the client + * + * Mapinfo count message is a mechanism for the port mapper and the client to + * synchronize on the number of mapinfo messages which were sucessfully exchanged and processed + */ +static int process_iwpm_hello(struct nlmsghdr *req_nlh, int client_idx, int nl_sock) +{ + struct nlattr *nltb [IWPM_NLA_HELLO_MAX]; + const char *msg_type = "Hello Msg"; + __u16 abi_version; + __u16 err_code = 0; + const char *str_err = ""; + int ret = -EINVAL; + + if (req_nlh->nlmsg_type == NLMSG_ERROR) { + abi_version = IWPM_UABI_VERSION_MIN; + } else { + if (parse_iwpm_nlmsg(req_nlh, IWPM_NLA_HELLO_MAX, + hello_policy, nltb, msg_type)) { + str_err = "Received Invalid nlmsg"; + err_code = 
IWPM_INVALID_NLMSG_ERR; + goto hello_error; + } + abi_version = nla_get_u16(nltb[IWPM_NLA_HELLO_ABI_VERSION]); + } + if (abi_version > IWPM_UABI_VERSION) { + str_err = "UABI Version mismatch"; + err_code = IWPM_VERSION_MISMATCH_ERR; + goto hello_error; + } + iwpm_version = abi_version; + iwpm_debug(IWARP_PM_NETLINK_DBG, "process_iwpm_hello: using abi_version %u\n", iwpm_version); + + send_iwpm_mapinfo_request(nl_sock, RDMA_NL_IWCM); + if (iwpm_version == 3) { + /* Legacy RDMA_NL_C4IW for old kernels */ + send_iwpm_mapinfo_request(nl_sock, RDMA_NL_IWCM+1); + } + return 0; +hello_error: + syslog(LOG_WARNING, "process_iwpm_hello: %s.\n", str_err); + if (err_code) { + /* send error message to the client */ + send_iwpm_error_msg(req_nlh->nlmsg_seq, err_code, client_idx, nl_sock); + } + return ret; +} + +/** + * process_iwpm_netlink_msg - Dispatch received netlink messages + * @nl_sock: netlink socket to read the messages from + */ +static int process_iwpm_netlink_msg(int nl_sock) +{ + char *recv_buffer = NULL; + struct nlmsghdr *nlh; + struct sockaddr_nl src_addr; + int len, type, client_idx, op; + socklen_t src_addr_len; + const char *str_err = ""; + int ret = 0; + + recv_buffer = malloc(NLMSG_SPACE(IWARP_PM_RECV_PAYLOAD)); + if (!recv_buffer) { + ret = -ENOMEM; + str_err = "Unable to allocate receive socket buffer"; + goto process_netlink_msg_exit; + } + /* receive a new message */ + nlh = (struct nlmsghdr *)recv_buffer; + memset(nlh, 0, NLMSG_SPACE(IWARP_PM_RECV_PAYLOAD)); + memset(&src_addr, 0, sizeof(src_addr)); + + src_addr_len = sizeof(src_addr); + len = recvfrom(nl_sock, (void *)nlh, NLMSG_SPACE(IWARP_PM_RECV_PAYLOAD), 0, + (struct sockaddr *)&src_addr, &src_addr_len); + if (len <= 0) { + ret = -errno; + str_err = "Unable to receive data from netlink socket"; + goto process_netlink_msg_exit; + } + /* loop for multiple netlink messages packed together */ + while (NLMSG_OK(nlh, len) != 0) { + if (nlh->nlmsg_type == NLMSG_DONE) { + goto process_netlink_msg_exit; + } + + type = nlh->nlmsg_type; + client_idx = RDMA_NL_GET_CLIENT(type); + if (type == NLMSG_ERROR) { + + /* RDMA_NL_IWCM HELLO error indicates V3 kernel */ + if (nlh->nlmsg_seq == 0) { + ret = process_iwpm_hello(nlh, client_idx, nl_sock); + } else { + iwpm_debug(IWARP_PM_NETLINK_DBG, "process_netlink_msg: " + "Netlink error message seq = %u\n", nlh->nlmsg_seq); + } + goto process_netlink_msg_exit; + } + op = RDMA_NL_GET_OP(type); + iwpm_debug(IWARP_PM_NETLINK_DBG, "process_netlink_msg: Received a new message: " + "opcode = %u client idx = %u, client pid = %u," + " msg seq = %u, type = %u, length = %u.\n", + op, client_idx, nlh->nlmsg_pid, nlh->nlmsg_seq, type, len); + + if (client_idx >= IWARP_PM_MAX_CLIENTS) { + ret = -EINVAL; + str_err = "Invalid client index"; + goto process_netlink_msg_exit; + } + switch (op) { + case RDMA_NL_IWPM_REG_PID: + str_err = "Register Pid request"; + ret = process_iwpm_register_pid(nlh, client_idx, nl_sock); + break; + case RDMA_NL_IWPM_ADD_MAPPING: + str_err = "Add Mapping request"; + if (!client_list[client_idx].valid) { + ret = -EINVAL; + goto process_netlink_msg_exit; + } + ret = process_iwpm_add_mapping(nlh, client_idx, nl_sock); + break; + case RDMA_NL_IWPM_QUERY_MAPPING: + str_err = "Query Mapping request"; + if (!client_list[client_idx].valid) { + ret = -EINVAL; + goto process_netlink_msg_exit; + } + ret = process_iwpm_query_mapping(nlh, client_idx, nl_sock); + break; + case RDMA_NL_IWPM_REMOVE_MAPPING: + str_err = "Remove Mapping request"; + ret = process_iwpm_remove_mapping(nlh, client_idx, 
nl_sock);
+			break;
+		case RDMA_NL_IWPM_MAPINFO:
+			ret = process_iwpm_mapinfo(nlh, client_idx, nl_sock);
+			break;
+		case RDMA_NL_IWPM_MAPINFO_NUM:
+			ret = process_iwpm_mapinfo_count(nlh, client_idx, nl_sock);
+			break;
+		case RDMA_NL_IWPM_HELLO:
+			ret = process_iwpm_hello(nlh, client_idx, nl_sock);
+			break;
+		default:
+			str_err = "Netlink message with invalid opcode";
+			ret = -1;
+			break;
+		}
+		nlh = NLMSG_NEXT(nlh, len);
+		if (ret)
+			goto process_netlink_msg_exit;
+	}
+
+process_netlink_msg_exit:
+	if (recv_buffer)
+		free(recv_buffer);
+	if (ret)
+		syslog(LOG_WARNING, "process_netlink_msg: %s error (ret = %d).\n", str_err, ret);
+	return ret;
+}
+
+/**
+ * process_iwpm_msg - Dispatch iwpm wire messages, sent by the remote peer
+ * @pm_sock: socket handle to read the messages from
+ */
+static int process_iwpm_msg(int pm_sock)
+{
+	iwpm_msg_parms msg_parms;
+	struct sockaddr_storage recv_addr;
+	iwpm_wire_msg recv_buffer; /* received message */
+	int bytes_recv, ret = 0;
+	int max_bytes_send = IWARP_PM_MESSAGE_SIZE + IWPM_IPADDR_SIZE;
+	socklen_t recv_addr_len = sizeof(recv_addr);
+
+	bytes_recv = recvfrom(pm_sock, &recv_buffer, max_bytes_send, 0,
+				(struct sockaddr *)&recv_addr, &recv_addr_len);
+
+	if (bytes_recv != IWARP_PM_MESSAGE_SIZE && bytes_recv != max_bytes_send) {
+		syslog(LOG_WARNING,
+			"process_iwpm_msg: Unable to receive data from PM socket. %s.\n",
+			strerror(errno));
+		ret = -errno;
+		goto process_iwpm_msg_exit;
+	}
+	parse_iwpm_msg(&recv_buffer, &msg_parms);
+
+	switch (msg_parms.mt) {
+	case IWARP_PM_MT_REQ:
+		iwpm_debug(IWARP_PM_WIRE_DBG, "process_iwpm_msg: Received Request message.\n");
+		ret = process_iwpm_wire_request(&msg_parms, netlink_sock, &recv_addr, pm_sock);
+		break;
+	case IWARP_PM_MT_ACK:
+		iwpm_debug(IWARP_PM_WIRE_DBG, "process_iwpm_msg: Received Acknowledgement.\n");
+		ret = process_iwpm_wire_ack(&msg_parms);
+		break;
+	case IWARP_PM_MT_ACC:
+		iwpm_debug(IWARP_PM_WIRE_DBG, "process_iwpm_msg: Received Accept message.\n");
+		ret = process_iwpm_wire_accept(&msg_parms, netlink_sock, &recv_addr, pm_sock);
+		break;
+	case IWARP_PM_MT_REJ:
+		iwpm_debug(IWARP_PM_WIRE_DBG, "process_iwpm_msg: Received Reject message.\n");
+		ret = process_iwpm_wire_reject(&msg_parms, netlink_sock);
+		break;
+	default:
+		syslog(LOG_WARNING, "process_iwpm_msg: Received Invalid message type = %u.\n",
+			msg_parms.mt);
+	}
+process_iwpm_msg_exit:
+	return ret;
+}
+
+/**
+ * send_iwpm_hello - Notify the client that the V4 iwarp port mapper is available
+ * @nl_sock: netlink socket to send a message to the client
+ *
+ * Send a HELLO message including the ABI_VERSION supported by iwpmd. If the
+ * response is an ERROR message, then we know the kernel driver is < V4, so we
+ * drop back to the V3 protocol. If the kernel is >= V4, then it will reply
+ * with its ABI Version. The response is handled in iwarp_port_mapper(). Once
+ * the ABI version is negotiated, iwpmd will send a mapinfo request to get any
+ * current mappings, using the correct ABI version. This allows working with
+ * V3 kernels. 
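+ *
+ * The exchange, as implemented in this file:
+ *   iwpmd -> kernel: RDMA_NL_IWPM_HELLO carrying IWPM_NLA_HELLO_ABI_VERSION
+ *   kernel -> iwpmd: RDMA_NL_IWPM_HELLO with the kernel's ABI version (V4+),
+ *                    or NLMSG_ERROR with seq == 0 (V3 kernel, iwpmd falls
+ *                    back to IWPM_UABI_VERSION_MIN)
+ *   iwpmd -> kernel: RDMA_NL_IWPM_MAPINFO request(s), sent from
+ *                    process_iwpm_hello() with the negotiated version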
+ */ +static int send_iwpm_hello(int nl_sock) +{ + struct nl_msg *req_nlmsg; + const char *str_err; + __u16 nlmsg_type; + int ret; + + nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_IWCM, RDMA_NL_IWPM_HELLO); + req_nlmsg = create_iwpm_nlmsg(nlmsg_type, RDMA_NL_IWCM); + if (!req_nlmsg) { + ret = -ENOMEM; + str_err = "Unable to create nlmsg request"; + goto send_hello_error; + } + str_err = "Invalid nlmsg attribute"; + if ((ret = nla_put_u16(req_nlmsg, IWPM_NLA_HELLO_ABI_VERSION, iwpm_version))) + goto send_hello_error; + + if ((ret = send_iwpm_nlmsg(nl_sock, req_nlmsg, 0))) { + str_err = "Unable to send nlmsg response"; + goto send_hello_error; + } + nlmsg_free(req_nlmsg); + return 0; +send_hello_error: + if (req_nlmsg) + nlmsg_free(req_nlmsg); + syslog(LOG_WARNING, "send_hello_request: %s ret = %d.\n", str_err, ret); + return ret; +} + +/** + * send_iwpm_mapinfo_request - Notify the client that the iwarp port mapper is available + * @nl_sock: netlink socket to send a message to the client + * @client - client to receive the message + */ +static int send_iwpm_mapinfo_request(int nl_sock, int client) +{ + struct nl_msg *req_nlmsg; + __u16 nlmsg_type; + const char *str_err; + int ret; + + nlmsg_type = RDMA_NL_GET_TYPE(client, RDMA_NL_IWPM_MAPINFO); + req_nlmsg = create_iwpm_nlmsg(nlmsg_type, client); + if (!req_nlmsg) { + ret = -ENOMEM; + str_err = "Unable to create nlmsg request"; + goto send_mapinfo_error; + } + str_err = "Invalid nlmsg attribute"; + if ((ret = nla_put_string(req_nlmsg, IWPM_NLA_MAPINFO_ULIB_NAME, iwpm_ulib_name))) + goto send_mapinfo_error; + + if ((ret = nla_put_u16(req_nlmsg, IWPM_NLA_MAPINFO_ULIB_VER, iwpm_version))) + goto send_mapinfo_error; + + if ((ret = send_iwpm_nlmsg(nl_sock, req_nlmsg, 0))) { + str_err = "Unable to send nlmsg response"; + goto send_mapinfo_error; + } + nlmsg_free(req_nlmsg); + return 0; +send_mapinfo_error: + if (req_nlmsg) + nlmsg_free(req_nlmsg); + syslog(LOG_WARNING, "send_mapinfo_request: %s ret = %d.\n", str_err, ret); + return ret; +} + +/** iwpm_cleanup - Close socket handles and free mapped ports */ +static void iwpm_cleanup(void) +{ + free_iwpm_mapped_ports(); + + destroy_iwpm_socket(netlink_sock); + destroy_iwpm_socket(pmv6_client_sock); + destroy_iwpm_socket(pmv6_sock); + destroy_iwpm_socket(pmv4_client_sock); + destroy_iwpm_socket(pmv4_sock); + /* close up logging */ + closelog(); +} + +/** + * iwarp_port_mapper - Distribute work orders for processing different types of iwpm messages + */ +static int iwarp_port_mapper(void) +{ + fd_set select_fdset; /* read fdset */ + struct timeval select_timeout; + int select_rc, max_sock = 0, ret = 0; + + if (pmv4_sock > max_sock) + max_sock = pmv4_sock; + if (pmv6_sock > max_sock) + max_sock = pmv6_sock; + if (netlink_sock > max_sock) + max_sock = netlink_sock; + if (pmv4_client_sock > max_sock) + max_sock = pmv4_client_sock; + if (pmv6_client_sock > max_sock) + max_sock = pmv6_client_sock; + + /* poll a set of sockets */ + do { + do { + if (print_mappings) { + print_iwpm_mapped_ports(); + print_mappings = 0; + } + /* initialize the file sets for select */ + FD_ZERO(&select_fdset); + /* add the UDP and Netlink sockets to the file set */ + FD_SET(pmv4_sock, &select_fdset); + FD_SET(pmv4_client_sock, &select_fdset); + FD_SET(pmv6_sock, &select_fdset); + FD_SET(pmv6_client_sock, &select_fdset); + FD_SET(netlink_sock, &select_fdset); + + /* set the timeout for select */ + select_timeout.tv_sec = 10; + select_timeout.tv_usec = 0; + /* timeout is an upper bound of time elapsed before select returns */ + 
select_rc = select(max_sock + 1, &select_fdset, NULL, NULL, &select_timeout); + } while (select_rc == 0); + /* select_rc is the number of fds ready for IO ( IO won't block) */ + + if (select_rc == -1) { + if (errno == EINTR) + continue; + syslog(LOG_WARNING, "iwarp_port_mapper: Select failed (%s).\n", strerror(errno)); + ret = -errno; + goto iwarp_port_mapper_exit; + } + + if (FD_ISSET(pmv4_sock, &select_fdset)) { + ret = process_iwpm_msg(pmv4_sock); + } + + if (FD_ISSET(pmv6_sock, &select_fdset)) { + ret = process_iwpm_msg(pmv6_sock); + } + + if (FD_ISSET(pmv4_client_sock, &select_fdset)) { + ret = process_iwpm_msg(pmv4_client_sock); + } + + if (FD_ISSET(pmv6_client_sock, &select_fdset)) { + ret = process_iwpm_msg(pmv6_client_sock); + } + + if (FD_ISSET(netlink_sock, &select_fdset)) { + ret = process_iwpm_netlink_msg(netlink_sock); + } + } while (1); + +iwarp_port_mapper_exit: + return ret; +} + +/** + * daemonize_iwpm_server - Make iwarp port mapper a daemon process + */ +static void daemonize_iwpm_server(void) +{ + if (daemon(0, 0) != 0) { + syslog(LOG_ERR, "Failed to daemonize\n"); + exit(EXIT_FAILURE); + } + + syslog(LOG_WARNING, "daemonize_iwpm_server: Starting iWarp Port Mapper V%d process\n", + iwpm_version); +} + +int main(int argc, char *argv[]) +{ + FILE *fp; + int c; + int ret = EXIT_FAILURE; + bool systemd = false; + + while (1) { + static const struct option long_opts[] = { + {"systemd", 0, NULL, 's'}, + {} + }; + + c = getopt_long(argc, argv, "fs", long_opts, NULL); + if (c == -1) + break; + + switch (c) { + case 's': + systemd = true; + break; + default: + break; + + } + } + + openlog(NULL, LOG_NDELAY | LOG_CONS | LOG_PID, LOG_DAEMON); + + if (!systemd) + daemonize_iwpm_server(); + umask(0); /* change file mode mask */ + + fp = fopen(IWPM_CONFIG_FILE, "r"); + if (fp) { + parse_iwpm_config(fp); + fclose(fp); + } + memset(client_list, 0, sizeof(client_list)); + + pmv4_sock = create_iwpm_socket_v4(IWARP_PM_PORT); + if (pmv4_sock < 0) + goto error_exit_v4; + + pmv4_client_sock = create_iwpm_socket_v4(0); + if (pmv4_client_sock < 0) + goto error_exit_v4_client; + + pmv6_sock = create_iwpm_socket_v6(IWARP_PM_PORT); + if (pmv6_sock < 0) + goto error_exit_v6; + + pmv6_client_sock = create_iwpm_socket_v6(0); + if (pmv6_client_sock < 0) + goto error_exit_v6_client; + + netlink_sock = create_netlink_socket(); + if (netlink_sock < 0) + goto error_exit_nl; + + signal(SIGHUP, iwpm_signal_handler); + signal(SIGTERM, iwpm_signal_handler); + signal(SIGUSR1, iwpm_signal_handler); + + pthread_cond_init(&cond_req_complete, NULL); + pthread_cond_init(&cond_pending_msg, NULL); + + ret = pthread_create(&map_req_thread, NULL, iwpm_mapping_reqs_handler, NULL); + if (ret) + goto error_exit; + + ret = pthread_create(&pending_msg_thread, NULL, iwpm_pending_msgs_handler, NULL); + if (ret) + goto error_exit; + + ret = send_iwpm_hello(netlink_sock); + if (ret) + goto error_exit; + + if (systemd) + sd_notify(0, "READY=1"); + + iwarp_port_mapper(); /* start iwarp port mapper process */ + + free_iwpm_mapped_ports(); + closelog(); + +error_exit: + destroy_iwpm_socket(netlink_sock); +error_exit_nl: + destroy_iwpm_socket(pmv6_client_sock); +error_exit_v6_client: + destroy_iwpm_socket(pmv6_sock); +error_exit_v6: + destroy_iwpm_socket(pmv4_client_sock); +error_exit_v4_client: + destroy_iwpm_socket(pmv4_sock); +error_exit_v4: + syslog(LOG_WARNING, "main: Couldn't start iWarp Port Mapper.\n"); + return ret; +} diff --git a/iwpmd/iwpmd.8.in b/iwpmd/iwpmd.8.in new file mode 100644 index 0000000..76efaa4 --- 
/dev/null +++ b/iwpmd/iwpmd.8.in @@ -0,0 +1,60 @@ +.TH "iwpmd" 8 "2016-09-16" "iwpmd" "iwpmd" iwpmd
+.SH NAME
+iwpmd \- port mapping services for iWARP.
+.SH SYNOPSIS
+.sp
+.nf
+\fIiwpmd\fR
+.fi
+.SH "DESCRIPTION"
+The iWARP Port Mapper Daemon provides a user space service (iwpmd) for the
+iWARP drivers to claim TCP ports through the standard socket interface.
+.P
+The kernel space support for the port mapper is part of the iw_cm module.
+The ib_core module includes netlink support, which is used by the port
+mapper clients to exchange messages with iwpmd. Both the iw_cm and
+ib_core modules need to be loaded in order for the iwpmd service to start
+successfully.
+.SH "IWARP PORT MAPPING DETAILS"
+The iWARP Port Mapper implementation is based on the port mapper
+specification section in the Sockets Direct Protocol:
+http://www.rdmaconsortium.org/home/draft-pinkerton-iwarp-sdp-v1.0.pdf
+.P
+Existing iWARP RDMA providers use the same IP address as the native
+TCP/IP stack when creating RDMA connections. They need a mechanism to
+claim the TCP ports used for RDMA connections to prevent TCP port
+collisions when other host applications use TCP ports. The iWARP Port
+Mapper provides a standard mechanism to accomplish this. Without this
+service it is possible for an RDMA application to bind/listen on the same
+port that is already being used by a native TCP host application. If
+that happens, the incoming TCP connection data can be passed to the
+RDMA stack in error.
+.P
+When starting a connection, the iWARP Connection Manager (port mapper
+client) sends the IWPM service the local IP address and TCP port it has
+received from the RDMA application. The IWPM service then performs
+a socket bind from user space to get an available TCP port, called a
+mapped port, and communicates it back to the client. In that sense,
+the IWPM service is used to map the TCP port that the RDMA application
+uses to any port available from the host TCP port space. The mapped ports
+are used in iWARP RDMA connections to avoid collisions with the native TCP
+stack, which is aware that these ports are taken. When an RDMA connection
+using a mapped port is terminated, the client notifies the IWPM service,
+which then releases the TCP port.
+.P
+The message exchange between iwpmd and the iWARP Connection Manager
+(between user space and kernel space) is implemented using netlink
+sockets.
+.SH OPTIONS
+.sp
+\fB\-s, \-\-systemd\fP
+Enable systemd integration.
+.SH "SIGNALS"
+SIGUSR1 will force a dump of the current mappings
+to the system message log.
+.P
+SIGTERM/SIGHUP will force iwpmd to exit.
+.SH "FILES"
+@CMAKE_INSTALL_FULL_SYSCONFDIR@/iwpmd.conf
+.SH "SEE ALSO"
+rdma_cm(7) diff --git a/iwpmd/iwpmd.conf b/iwpmd/iwpmd.conf new file mode 100644 index 0000000..4bed199 --- /dev/null +++ b/iwpmd/iwpmd.conf @@ -0,0 +1 @@ +nl_sock_rbuf_size=419430400 diff --git a/iwpmd/iwpmd.conf.5.in b/iwpmd/iwpmd.conf.5.in new file mode 100644 index 0000000..8cb3b0c --- /dev/null +++ b/iwpmd/iwpmd.conf.5.in @@ -0,0 +1,20 @@ +.TH "iwpmd.conf" 5 "2016-09-16" "iwpmd.conf" "iwpmd.conf" iwpmd.conf
+.SH NAME
+iwpmd.conf \- iWARP port mapper config file.
+.SH SYNOPSIS
+.sp
+.nf
+\fIiwpmd.conf\fR
+.fi
+.SH "DESCRIPTION"
+The iwpmd.conf file provides configuration parameters for iwpmd. Parameters
+are given one per line, in the form param=value. Parameters include:
+.P
+nl_sock_rbuf_size - The socket buffer size of the netlink socket used
+to communicate with the kernel port map client. The default is 400 MB.
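+The value is given in bytes: the shipped default of 419430400 is
+400*1024*1024 bytes, i.e. 400 MB.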
+.SH "EXAMPLES" +nl_sock_rbuf_size=419430400 +.SH "FILES" +@CMAKE_INSTALL_FULL_SYSCONFDIR@/iwpmd.conf +.SH "SEE ALSO" +iwpmd(8) diff --git a/iwpmd/iwpmd.rules b/iwpmd/iwpmd.rules new file mode 100644 index 0000000..5b22cce --- /dev/null +++ b/iwpmd/iwpmd.rules @@ -0,0 +1 @@ +TAG+="systemd", ENV{ID_RDMA_IWARP}=="1", ENV{SYSTEMD_WANTS}+="iwpmd.service" diff --git a/iwpmd/iwpmd.service.in b/iwpmd/iwpmd.service.in new file mode 100644 index 0000000..596b705 --- /dev/null +++ b/iwpmd/iwpmd.service.in @@ -0,0 +1,28 @@ +[Unit] +Description=iWarp Port Mapper +Documentation=man:iwpmd file:/etc/iwpmd.conf +StopWhenUnneeded=yes +# iwpmd is a kernel support program and needs to run as early as possible, +# otherwise the kernel or userspace cannot establish RDMA connections and +# things will just fail, not block until iwpmd arrives. +DefaultDependencies=no +Before=sysinit.target +# Do not execute concurrently with an ongoing shutdown (required for DefaultDependencies=no) +Conflicts=shutdown.target +Before=shutdown.target +# Ensure required kernel modules are loaded before starting +Wants=rdma-load-modules@iwpmd.service +After=rdma-load-modules@iwpmd.service +# iwpmd needs to start before networking is brought up, even kernel networking +# (eg NFS) since it provides kernel support for iWarp's RDMA CM. +Wants=network-pre.target +Before=network-pre.target +# rdma-hw is not ready until iwpmd is running +Before=rdma-hw.target + +[Service] +Type=notify +ExecStart=@CMAKE_INSTALL_FULL_SBINDIR@/iwpmd --systemd +LimitNOFILE=102400 + +# iwpmd is automatically wanted by udev when an iWarp RDMA device is present diff --git a/iwpmd/iwpmd_init.in b/iwpmd/iwpmd_init.in new file mode 100644 index 0000000..f3ae269 --- /dev/null +++ b/iwpmd/iwpmd_init.in @@ -0,0 +1,118 @@ +#!/bin/bash +# Start the IWPMD daemon +# +# chkconfig: 1235 90 15 +# description: iWarp Port Mapper Daemon for opening sockets to reserve ports from userspace +# processname: iwpmd +# pidfile: /var/run/iwpmd.pid +# +### BEGIN INIT INFO +# Provides: iwpmd +# Required-Start: $network $syslog $remote_fs +# Required-Stop: $remote_fs +# Default-Stop: 0 1 6 +# Default-Start: 2 3 4 5 +# Short-Description: iWarp Port Mapper Daemon +# Description: iWarp Port Mapper Daemon for opening sockets to claim TCP ports from userspace +### END INIT INFO + +IWPMD_BIN="@CMAKE_INSTALL_FULL_SBINDIR@/iwpmd" +LOCK="/var/lock/subsys/iwpmd" +IWPMD_PID=0 +RETVAL=0 + +# Source function library. +if [ -f "/etc/redhat-release" ]; then + . /etc/rc.d/init.d/functions + STARTD=daemon + STOPD=killproc + STATUSD=status + GETPID=/sbin/pidof + +else + # Debian / openSUSE / Ubuntu + . /lib/lsb/init-functions + STARTD=start_daemon + STOPD=killproc + STATUSD=/sbin/checkproc + GETPID=pidofproc +fi + +check() { + # Check if iwpm is executable + test -x $IWPMD_BIN || ( echo "Couldn't find $IWPMD_BIN"; exit 5 ) +} + +start() { + check + RETVAL=$? + [ $RETVAL -gt 0 ] && exit $RETVAL + + echo -n $"Starting iwpm daemon: " + if [ ! -f "$LOCK" ]; then + ulimit -n 102400 + $STARTD $IWPMD_BIN &> /dev/null + RETVAL=$? + [ $RETVAL -eq 0 ] && ( touch $LOCK; echo "OK" ) || echo "NO" + else + echo "NO (iwpm is already running)" + fi + return $RETVAL +} + +stop() { + check + RETVAL=$? + [ $RETVAL -gt 0 ] && exit $RETVAL + + echo -n $"Stopping iwpm daemon: " + if [ -f "$LOCK" ]; then + $STOPD $IWPMD_BIN &> /dev/null + RETVAL=$? 
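+		# killproc reports success (exit 0) only when the daemon was stopped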
+ [ $RETVAL -eq 0 ] && ( rm -f $LOCK; echo "OK" ) || echo "NO" + else + echo "NO (iwpm is already stopped)" + fi + return $RETVAL +} + +restart() { + stop + start +} + +show_status() { + check + RETVAL=$? + [ $RETVAL -gt 0 ] && exit $RETVAL + + IWPMD_PID="$($GETPID $IWPMD_BIN)" + $STATUSD $IWPMD_BIN &> /dev/null + RETVAL=$? + [ $RETVAL -eq 0 ] && echo "iwpm daemon (pid $IWPMD_PID) is running" || echo "iwpm daemon isn't available" + + return $RETVAL +} + +case "$1" in +start) + start + ;; +stop) + stop + ;; +restart) + restart + ;; +force-reload) + restart + ;; +status) + show_status + ;; +*) + echo $"Usage: $0 {start|stop|restart|force-reload|status}" + RETVAL=2 +esac + +exit $RETVAL diff --git a/iwpmd/modules-iwpmd.conf b/iwpmd/modules-iwpmd.conf new file mode 100644 index 0000000..5544b35 --- /dev/null +++ b/iwpmd/modules-iwpmd.conf @@ -0,0 +1,2 @@ +# These modules are loaded by the system if iwpmd is to be run +iw_cm diff --git a/kernel-boot/CMakeLists.txt b/kernel-boot/CMakeLists.txt new file mode 100644 index 0000000..e40a316 --- /dev/null +++ b/kernel-boot/CMakeLists.txt @@ -0,0 +1,63 @@ +rdma_subst_install(FILES rdma-load-modules@.service.in + DESTINATION "${CMAKE_INSTALL_SYSTEMD_SERVICEDIR}" + RENAME rdma-load-modules@.service + PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ) + +rdma_subst_install(FILES "rdma-hw.target.in" + RENAME "rdma-hw.target" + DESTINATION "${CMAKE_INSTALL_SYSTEMD_SERVICEDIR}" + PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ) + +install(FILES + modules/infiniband.conf + modules/iwarp.conf + modules/opa.conf + modules/rdma.conf + modules/roce.conf + DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}/rdma/modules") + +install(FILES "rdma-persistent-naming.rules" + RENAME "60-rdma-persistent-naming.rules" + DESTINATION "${CMAKE_INSTALL_UDEV_RULESDIR}") + +install(FILES "rdma-description.rules" + RENAME "75-rdma-description.rules" + DESTINATION "${CMAKE_INSTALL_UDEV_RULESDIR}") + +install(FILES "rdma-hw-modules.rules" + RENAME "90-rdma-hw-modules.rules" + DESTINATION "${CMAKE_INSTALL_UDEV_RULESDIR}") + +install(FILES "rdma-ulp-modules.rules" + RENAME "90-rdma-ulp-modules.rules" + DESTINATION "${CMAKE_INSTALL_UDEV_RULESDIR}") + +install(FILES "rdma-umad.rules" + RENAME "90-rdma-umad.rules" + DESTINATION "${CMAKE_INSTALL_UDEV_RULESDIR}") + +# This file is intended to be customized by the user, so it is installed in +# /etc/ +install(FILES "persistent-ipoib.rules" + RENAME "70-persistent-ipoib.rules" + DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}/udev/rules.d") + +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") + +# Create an installed executable (under /usr/lib/udev) +function(rdma_udev_executable EXEC) + add_executable(${EXEC} ${ARGN}) + target_link_libraries(${EXEC} LINK_PRIVATE ${COMMON_LIBS}) + set_target_properties(${EXEC} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_BIN}") + install(TARGETS ${EXEC} DESTINATION "${CMAKE_INSTALL_UDEV_RULESDIR}/../") +endfunction() + +if (NOT NL_KIND EQUAL 0) + rdma_udev_executable(rdma_rename + rdma_rename.c + ) + + target_link_libraries(rdma_rename LINK_PRIVATE + ${NL_LIBRARIES} + ) +endif() diff --git a/kernel-boot/modules/infiniband.conf b/kernel-boot/modules/infiniband.conf new file mode 100644 index 0000000..99526e1 --- /dev/null +++ b/kernel-boot/modules/infiniband.conf @@ -0,0 +1,12 @@ +# These modules are loaded by the system if any InfiniBand device is installed +# InfiniBand over IP netdevice +ib_ipoib + +# Access to fabric management SMPs and GMPs from userspace. 
+ib_umad
+
+# SCSI Remote Protocol target support
+# ib_srpt
+
+# ib_ucm provides the obsolete /dev/infiniband/ucm0
+# ib_ucm diff --git a/kernel-boot/modules/iwarp.conf b/kernel-boot/modules/iwarp.conf new file mode 100644 index 0000000..0cb831d --- /dev/null +++ b/kernel-boot/modules/iwarp.conf @@ -0,0 +1 @@ +# These modules are loaded by the system if any iWarp device is installed diff --git a/kernel-boot/modules/opa.conf b/kernel-boot/modules/opa.conf new file mode 100644 index 0000000..b9bc9f1 --- /dev/null +++ b/kernel-boot/modules/opa.conf @@ -0,0 +1,10 @@ +# These modules are loaded by the system if any OmniPath Architecture device
+# is installed
+# InfiniBand over IP netdevice
+ib_ipoib
+
+# Access to fabric management SMPs and GMPs from userspace.
+ib_umad
+
+# OmniPath Ethernet Virtual NIC netdevice
+opa_vnic diff --git a/kernel-boot/modules/rdma.conf b/kernel-boot/modules/rdma.conf new file mode 100644 index 0000000..2d342dd --- /dev/null +++ b/kernel-boot/modules/rdma.conf @@ -0,0 +1,21 @@ +# These modules are loaded by the system if any RDMA device is installed
+# iSCSI over RDMA client support
+ib_iser
+
+# iSCSI over RDMA target support
+# ib_isert
+
+# User access to RDMA verbs (supports libibverbs)
+ib_uverbs
+
+# User access to RDMA connection management (supports librdmacm)
+rdma_ucm
+
+# RDS over RDMA support
+# rds_rdma
+
+# NFS over RDMA client support
+xprtrdma
+
+# NFS over RDMA server support
+svcrdma diff --git a/kernel-boot/modules/roce.conf b/kernel-boot/modules/roce.conf new file mode 100644 index 0000000..8e4927c --- /dev/null +++ b/kernel-boot/modules/roce.conf @@ -0,0 +1,2 @@ +# These modules are loaded by the system if any RDMA over Converged Ethernet
+# device is installed diff --git a/kernel-boot/persistent-ipoib.rules b/kernel-boot/persistent-ipoib.rules new file mode 100644 index 0000000..f8d700a --- /dev/null +++ b/kernel-boot/persistent-ipoib.rules @@ -0,0 +1,12 @@ +# This is a sample udev rules file that demonstrates how to get udev to
+# set the name of IPoIB interfaces to whatever you wish. There is a
+# 16 character limit on network device names.
+#
+# Important items to note: ATTR{type}=="32" is IPoIB interfaces, and the
+# ATTR{address} match must start with ?* and only reference the last 8
+# bytes of the address or else the address might not match the variable QPN
+# portion.
+#
+# Modern udev is case sensitive and all addresses need to be in lower case.
+#
+# ACTION=="add", SUBSYSTEM=="net", DRIVERS=="?*", ATTR{type}=="32", ATTR{address}=="?*00:02:c9:03:00:31:78:f2", NAME="mlx4_ib3" diff --git a/kernel-boot/rdma-description.rules b/kernel-boot/rdma-description.rules new file mode 100644 index 0000000..48a7ced --- /dev/null +++ b/kernel-boot/rdma-description.rules @@ -0,0 +1,43 @@ +# This is a version of net-description.rules for /sys/class/infiniband devices
+
+ACTION=="remove", GOTO="rdma_description_end"
+SUBSYSTEM!="infiniband", GOTO="rdma_description_end"
+
+# NOTE: DRIVERS searches up the sysfs path to find the driver that is bound to
+# the PCI/etc device that the RDMA device is linked to. This is not the kernel
+# driver that is supplying the RDMA device (eg as seen in ID_NET_DRIVER)
+
+# FIXME: with kernel support we could actually detect the protocols the RDMA
+# driver itself supports; this is a workaround for the lack of that support.
+# In the future we could do this with a udev IMPORT{program} helper
+# that extracts the ID information from RDMA netlink.
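+#
+# For example, `udevadm info /sys/class/infiniband/mlx5_0` can be used to
+# inspect which of the ID_RDMA_* protocol properties below were assigned.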
+
+# Hardware that supports InfiniBand
+DRIVERS=="ib_mthca", ENV{ID_RDMA_INFINIBAND}="1"
+DRIVERS=="mlx4_core", ENV{ID_RDMA_INFINIBAND}="1"
+DRIVERS=="mlx5_core", ENV{ID_RDMA_INFINIBAND}="1"
+DRIVERS=="ib_qib", ENV{ID_RDMA_INFINIBAND}="1"
+
+# Hardware that supports OPA
+DRIVERS=="hfi1", ENV{ID_RDMA_OPA}="1"
+
+# Hardware that supports iWarp
+DRIVERS=="cxgb4", ENV{ID_RDMA_IWARP}="1"
+DRIVERS=="i40e", ENV{ID_RDMA_IWARP}="1"
+
+# Hardware that supports RoCE
+DRIVERS=="be2net", ENV{ID_RDMA_ROCE}="1"
+DRIVERS=="bnxt_en", ENV{ID_RDMA_ROCE}="1"
+DRIVERS=="hns", ENV{ID_RDMA_ROCE}="1"
+DRIVERS=="mlx4_core", ENV{ID_RDMA_ROCE}="1"
+DRIVERS=="mlx5_core", ENV{ID_RDMA_ROCE}="1"
+DRIVERS=="qede", ENV{ID_RDMA_ROCE}="1"
+DRIVERS=="vmw_pvrdma", ENV{ID_RDMA_ROCE}="1"
+DEVPATH=="*/infiniband/rxe*", ATTR{parent}=="*", ENV{ID_RDMA_ROCE}="1"
+
+# Setup the usual ID information so that systemd will display a sane name for
+# the RDMA device units.
+SUBSYSTEMS=="pci", ENV{ID_BUS}="pci", ENV{ID_VENDOR_ID}="$attr{vendor}", ENV{ID_MODEL_ID}="$attr{device}"
+SUBSYSTEMS=="pci", IMPORT{builtin}="hwdb --subsystem=pci"
+
+LABEL="rdma_description_end" diff --git a/kernel-boot/rdma-hw-modules.rules b/kernel-boot/rdma-hw-modules.rules new file mode 100644 index 0000000..bee416d --- /dev/null +++ b/kernel-boot/rdma-hw-modules.rules @@ -0,0 +1,37 @@ +ACTION=="remove", GOTO="rdma_hw_modules_end"
+SUBSYSTEM!="net", GOTO="rdma_hw_modules_end"
+
+# Automatically load RDMA-specific kernel modules when a multi-function device is installed
+
+# These drivers autoload an Ethernet driver based on hardware detection and
+# need userspace to load the module that has their RDMA component to turn on
+# RDMA.
+ENV{ID_NET_DRIVER}=="be2net", RUN{builtin}+="kmod load ocrdma"
+ENV{ID_NET_DRIVER}=="bnxt_en", RUN{builtin}+="kmod load bnxt_re"
+ENV{ID_NET_DRIVER}=="cxgb4", RUN{builtin}+="kmod load iw_cxgb4"
+ENV{ID_NET_DRIVER}=="hns", RUN{builtin}+="kmod load hns_roce"
+ENV{ID_NET_DRIVER}=="i40e", RUN{builtin}+="kmod load i40iw"
+ENV{ID_NET_DRIVER}=="mlx4_en", RUN{builtin}+="kmod load mlx4_ib"
+ENV{ID_NET_DRIVER}=="mlx5_core", RUN{builtin}+="kmod load mlx5_ib"
+ENV{ID_NET_DRIVER}=="qede", RUN{builtin}+="kmod load qedr"
+
+# The user must explicitly load these modules via /etc/modules-load.d/ or otherwise
+# rxe
+
+# When in IB mode the kernel PCI core module autoloads the protocol modules
+# for these providers
+# mlx4
+# mlx5
+
+# enic no longer has a userspace verbs driver; this rule should probably be
+# owned by libfabric
+ENV{ID_NET_DRIVER}=="enic", RUN{builtin}+="kmod load usnic_verbs"
+
+# These providers are single function and autoload RDMA automatically based on
+# PCI probing
+# hfi1verbs
+# ipathverbs
+# mthca
+# vmw_pvrdma
+
+LABEL="rdma_hw_modules_end" diff --git a/kernel-boot/rdma-hw.target.in b/kernel-boot/rdma-hw.target.in new file mode 100644 index 0000000..010e21e --- /dev/null +++ b/kernel-boot/rdma-hw.target.in @@ -0,0 +1,13 @@ +[Unit]
+Description=RDMA Hardware
+Documentation=file:@CMAKE_INSTALL_FULL_DOCDIR@/udev.md
+StopWhenUnneeded=yes
+
+# Start the basic ULP RDMA kernel modules when RDMA hardware is detected (note
+# the rdma-load-modules@.service is already ordered before this target)
+Wants=rdma-load-modules@rdma.service
+# Order before the standard network.target for compatibility with init.d
+# scripts that order after networking - this way RDMA is ready for them too.
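+# (For example, a classic init script declaring "Required-Start: $network",
+# like the iwpmd init script above, starts only after networking is up and
+# therefore after RDMA hardware setup.)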
+Before=network.target
+# We do not order rdma-hw before basic.target; units for daemons that use RDMA
+# have to order themselves after rdma-hw.target manually. diff --git a/kernel-boot/rdma-load-modules@.service.in b/kernel-boot/rdma-load-modules@.service.in new file mode 100644 index 0000000..c390a8c --- /dev/null +++ b/kernel-boot/rdma-load-modules@.service.in @@ -0,0 +1,25 @@ +[Unit]
+Description=Load RDMA modules from @CMAKE_INSTALL_FULL_SYSCONFDIR@/rdma/modules/%I.conf
+Documentation=file:@CMAKE_INSTALL_FULL_DOCDIR@/udev.md
+# Kernel module loading must take place before sysinit.target, similar to
+# systemd-modules-load.service
+DefaultDependencies=no
+Before=sysinit.target
+# Do not execute concurrently with an ongoing shutdown
+Conflicts=shutdown.target
+Before=shutdown.target
+# Partially support distro network setup scripts that run after
+# systemd-modules-load.service but before sysinit.target, eg a classic network
+# setup script. Run them after modules have loaded.
+Wants=network-pre.target
+Before=network-pre.target
+# Orders all kernel module startup before rdma-hw.target can become ready
+Before=rdma-hw.target
+
+ConditionCapability=CAP_SYS_MODULE
+
+[Service]
+Type=oneshot
+RemainAfterExit=yes
+ExecStart=@CMAKE_INSTALL_SYSTEMD_BINDIR@/systemd-modules-load @CMAKE_INSTALL_FULL_SYSCONFDIR@/rdma/modules/%I.conf
+TimeoutSec=90s diff --git a/kernel-boot/rdma-persistent-naming.rules b/kernel-boot/rdma-persistent-naming.rules new file mode 100644 index 0000000..9b61e16 --- /dev/null +++ b/kernel-boot/rdma-persistent-naming.rules @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
+# Copyright (c) 2019, Mellanox Technologies. All rights reserved. See COPYING file
+#
+# Rename modes:
+# NAME_FALLBACK - Try to name devices in the following order:
+# by-pci -> by-guid -> kernel
+# NAME_KERNEL - leave name as kernel provided
+# NAME_PCI - based on PCI/slot/function location
+# NAME_GUID - based on system image GUID
+#
+# The stable names are a combination of the device type technology and the rename mode.
+# Infiniband - ib* +# RoCE - roce* +# iWARP - iw* +# OPA - opa* +# Default (unknown protocol) - rdma* +# +# Example: +# * NAME_PCI +# pci = 0000:00:0c.4 +# Device type = IB +# mlx5_0 -> ibp0s12f4 +# * NAME_GUID +# GUID = 5254:00c0:fe12:3455 +# Device type = RoCE +# mlx5_0 -> rocex525400c0fe123455 +# +ACTION=="add", SUBSYSTEM=="infiniband", PROGRAM="rdma_rename %k NAME_FALLBACK" diff --git a/kernel-boot/rdma-ulp-modules.rules b/kernel-boot/rdma-ulp-modules.rules new file mode 100644 index 0000000..fbd195a --- /dev/null +++ b/kernel-boot/rdma-ulp-modules.rules @@ -0,0 +1,11 @@ +ACTION=="remove", GOTO="rdma_ulp_modules_end" +SUBSYSTEM!="infiniband", GOTO="rdma_ulp_modules_end" + +# Automatically load general RDMA ULP modules when RDMA hardware is installed +TAG+="systemd", ENV{SYSTEMD_WANTS}+="rdma-hw.target" +TAG+="systemd", ENV{ID_RDMA_INFINIBAND}=="1", ENV{SYSTEMD_WANTS}+="rdma-load-modules@infiniband.service" +TAG+="systemd", ENV{ID_RDMA_IWARP}=="1", ENV{SYSTEMD_WANTS}+="rdma-load-modules@iwarp.service" +TAG+="systemd", ENV{ID_RDMA_OPA}=="1", ENV{SYSTEMD_WANTS}+="rdma-load-modules@opa.service" +TAG+="systemd", ENV{ID_RDMA_ROCE}=="1", ENV{SYSTEMD_WANTS}+="rdma-load-modules@roce.service" + +LABEL="rdma_ulp_modules_end" diff --git a/kernel-boot/rdma-umad.rules b/kernel-boot/rdma-umad.rules new file mode 100644 index 0000000..ba7ee61 --- /dev/null +++ b/kernel-boot/rdma-umad.rules @@ -0,0 +1 @@ +SUBSYSTEM=="infiniband_mad", KERNEL=="*umad*", TAG+="systemd", ENV{SYSTEMD_ALIAS}="/sys/subsystem/rdma/devices/$attr{ibdev}:$attr{port}/umad" diff --git a/kernel-boot/rdma_rename.c b/kernel-boot/rdma_rename.c new file mode 100644 index 0000000..cf2f46b --- /dev/null +++ b/kernel-boot/rdma_rename.c @@ -0,0 +1,652 @@ +// SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +/* Copyright (c) 2019, Mellanox Technologies. All rights reserved. See COPYING file */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stdbool.h> +#include <errno.h> +#include <unistd.h> +#include <getopt.h> +#include <sys/types.h> +#include <dirent.h> +#include <syslog.h> +#include <rdma/rdma_netlink.h> +#include <netlink/netlink.h> +#include <netlink/msg.h> +#include <netlink/attr.h> +#include <linux/pci_regs.h> +#include <util/rdma_nl.h> + +/* + * Rename modes: + * NAME_FALLBACK - Try to name devices in the following order: + * by->onboard -> by-pci -> by-guid -> kernel + * NAME_KERNEL - leave name as kernel provided + * NAME_PCI - based on PCI/slot/function location + * NAME_GUID - based on node GUID + * NAME_ONBOARD - based on-board device index + * + * The stable names are combination of device type technology and rename mode. + * Infiniband - ib* + * RoCE - roce* + * iWARP - iw* + * OPA - opa* + * Default (unknown protocol) - rdma* + * + * Example: + * NAME_PCI + * pci = 0000:00:0c.4 + * Device type = IB + * mlx5_0 -> ibp0s12f4 + * NAME_GUID + * GUID = 5254:00c0:fe12:3455 + * Device type = RoCE + * mlx5_0 -> rocex525400c0fe123455 + * NAME_ONBOARD + * Index = 3 + * Device type = OPA + * hfi1_1 -> opao3 + */ + +struct data { + const char *curr; + char *prefix; + uint64_t node_guid; + char *name; + int idx; +}; + +static bool debug_mode; +#define pr_err(args...) syslog(LOG_ERR, ##args) +#define pr_dbg(args...) 
\ + do { \ + if (debug_mode) \ + syslog(LOG_ERR, ##args); \ + } while (0) + +#define ONBOARD_INDEX_MAX (16*1024-1) +static int by_onboard(struct data *d) +{ + char *index = NULL; + char *acpi = NULL; + unsigned int o; + FILE *fp; + int ret; + + /* + * ACPI_DSM - device specific method for naming + * PCI or PCI Express device + */ + ret = asprintf(&acpi, "/sys/class/infiniband/%s/device/acpi_index", + d->curr); + if (ret < 0) + return -ENOMEM; + + /* SMBIOS type 41 - Onboard Devices Extended Information */ + ret = asprintf(&index, "/sys/class/infiniband/%s/device/index", d->curr); + if (ret < 0) { + index = NULL; + ret = -ENOMEM; + goto out; + } + + fp = fopen(acpi, "r"); + if (!fp) + fp = fopen(index, "r"); + if (!fp) { + pr_dbg("%s: Device is not embedded onboard\n", d->curr); + ret = -ENOENT; + goto out; + } + + ret = fscanf(fp, "%u", &o); + fclose(fp); + /* https://github.com/systemd/systemd/blob/master/src/udev/udev-builtin-net_id.c#L263 */ + if (!ret || o > ONBOARD_INDEX_MAX) { + pr_err("%s: Onboard index %d and ret %d\n", d->curr, o, ret); + ret = -ENOENT; + goto out; + } + + ret = asprintf(&d->name, "%so%u", d->prefix, o); + if (ret < 0) { + pr_err("%s: Failed to allocate name with prefix %s and onboard index %d\n", + d->curr, d->prefix, o); + ret = -ENOENT; + d->name = NULL; + goto out; + } + ret = 0; +out: + free(index); + free(acpi); + return ret; +} + +static int find_sun(char *devname, char *pci) +{ + char bof[256], tmp[256]; + struct dirent *dent; + char *slots; + DIR *dir; + int ret; + + ret = asprintf(&slots, "%s/subsystem/slots", devname); + if (ret < 0) + return 0; + + ret = 0; + dir = opendir(slots); + if (!dir) + goto err_dir; + + if (sscanf(pci, "%s.%s", bof, tmp) != 2) + goto out; + + while ((dent = readdir(dir))) { + char *str, address[256]; + FILE *fp; + int i; + + if (dent->d_name[0] == '.') + continue; + i = atoi(dent->d_name); + if (i <= 0) + continue; + + ret = asprintf(&str, "%s/%s/address", slots, dent->d_name); + if (ret < 0) { + ret = 0; + goto out; + } + + fp = fopen(str, "r"); + free(str); + if (!fp) { + ret = 0; + goto out; + } + + ret = fscanf(fp, "%255s", address); + fclose(fp); + + if (ret != 1) { + ret = 0; + goto out; + } + + if (!strcmp(bof, address)) { + ret = i; + break; + } + } +out: + closedir(dir); +err_dir: + free(slots); + return ret; +} + +static int is_pci_multifunction(char *devname) +{ + char c[64] = {}; + char *config; + FILE *fp; + int ret; + + ret = asprintf(&config, "%s/config", devname); + if (ret < 0) + return 0; + + fp = fopen(config, "r"); + free(config); + if (!fp) + return 0; + + ret = fread(c, 1, sizeof(c), fp); + fclose(fp); + if (ret != sizeof(c)) + return 0; + + /* bit 0-6 header type, bit 7 multi/single function device */ + return c[PCI_HEADER_TYPE] & 0x80; +} + +static int is_pci_ari_enabled(char *devname) +{ + int ret, a; + char *ari; + FILE *fp; + + ret = asprintf(&ari, "%s/ari_enabled", devname); + if (ret < 0) + return 0; + + fp = fopen(ari, "r"); + free(ari); + if (!fp) + return 0; + + ret = fscanf(fp, "%d", &a); + fclose(fp); + return (ret) ? 
a == 1 : 0; +} + +struct pci_info { + char *pcidev; + + unsigned int domain; + unsigned int bus; + unsigned int slot; + unsigned int func; + unsigned int sun; + unsigned int vf; + bool valid_vf; +}; + +static int fill_pci_info(struct data *d, struct pci_info *p) +{ + char buf[256] = {}; + char *pci; + int ret; + + ret = readlink(p->pcidev, buf, sizeof(buf)-1); + if (ret == -1 || ret == sizeof(buf)) + return -EINVAL; + + buf[ret] = 0; + + pci = basename(buf); + /* + * pci = 0000:00:0c.0 + */ + ret = sscanf(pci, "%x:%x:%x.%u", &p->domain, &p->bus, &p->slot, + &p->func); + if (ret != 4) { + pr_err("%s: Failed to read PCI BOF\n", d->curr); + return -ENOENT; + } + + if (is_pci_ari_enabled(p->pcidev)) { + /* + * ARI devices support up to 256 functions on a single device + * ("slot"), and interpret the traditional 5-bit slot and 3-bit + * function number as a single 8-bit function number, where the + * slot makes up the upper 5 bits. + * + * https://github.com/systemd/systemd/blob/master/src/udev/udev-builtin-net_id.c#L344 + */ + p->func += p->slot * 8; + pr_dbg("%s: This is ARI device, new PCI BOF is %04x:%02x:%02x.%u\n", + d->curr, p->domain, p->bus, p->slot, p->func); + } + + p->sun = find_sun(p->pcidev, pci); + + return 0; +} + +static int get_virtfn_info(struct data *d, struct pci_info *p) +{ + struct pci_info vf = {}; + char *physfn_pcidev; + struct dirent *dent; + DIR *dir; + int ret; + + /* Check if this is a virtual function. */ + ret = asprintf(&physfn_pcidev, "%s/physfn", p->pcidev); + if (ret < 0) + return -ENOMEM; + + /* We are VF, get VF number and replace pcidev to point to PF */ + dir = opendir(physfn_pcidev); + if (!dir) { + /* + * -ENOENT means that we are already in PF + * and pcidev points to right PCI. + */ + ret = (errno == ENOENT) ? 
0 : -ENOMEM; + goto err_free; + } + + p->valid_vf = true; + vf.pcidev = p->pcidev; + ret = fill_pci_info(d, &vf); + if (ret) + goto err_dir; + + while ((dent = readdir(dir))) { + const char *s = "virtfn"; + struct pci_info v = {}; + + if (strncmp(dent->d_name, s, strlen(s)) || + strlen(dent->d_name) == strlen(s)) + continue; + + ret = asprintf(&v.pcidev, "%s/%s", physfn_pcidev, dent->d_name); + if (ret < 0) { + ret = -ENOMEM; + goto err_dir; + } + ret = fill_pci_info(d, &v); + free(v.pcidev); + if (ret) { + ret = -ENOMEM; + goto err_dir; + } + if (vf.func == v.func) { + p->vf = atoi(&dent->d_name[6]); + break; + } + } + + p->pcidev = physfn_pcidev; + closedir(dir); + + return 0; + +err_dir: + closedir(dir); +err_free: + free(physfn_pcidev); + return ret; +} + +static int by_pci(struct data *d) +{ + struct pci_info p = {}; + char *subsystem; + char buf[256] = {}; + char *subs; + int ret; + + ret = asprintf(&subsystem, "/sys/class/infiniband/%s/device/subsystem", + d->curr); + if (ret < 0) + return -ENOMEM; + + ret = readlink(subsystem, buf, sizeof(buf)-1); + if (ret == -1 || ret == sizeof(buf)) { + ret = -EINVAL; + goto out; + } + buf[ret] = 0; + + subs = basename(buf); + if (strcmp(subs, "pci")) { + /* Ball out virtual devices */ + pr_dbg("%s: Non-PCI device (%s) was detected\n", d->curr, subs); + ret = -EINVAL; + goto out; + } + + /* Real devices */ + ret = asprintf(&p.pcidev, "/sys/class/infiniband/%s/device", d->curr); + if (ret < 0) { + ret = -ENOMEM; + p.pcidev = NULL; + goto out; + } + + ret = get_virtfn_info(d, &p); + if (ret) + goto out; + + ret = fill_pci_info(d, &p); + if (ret) { + pr_err("%s: Failed to fill PCI device information\n", d->curr); + goto out; + } + + d->name = calloc(256, sizeof(char)); + if (!d->name) { + ret = -ENOMEM; + goto out; + } + + ret = sprintf(d->name, "%s", d->prefix); + if (ret == -1) { + ret = -EINVAL; + goto out; + } + + if (p.domain > 0) { + ret = sprintf(buf, "P%u", p.domain); + if (ret == -1) { + ret = -ENOMEM; + goto out; + } + strcat(d->name, buf); + } + + if (p.sun > 0) + ret = sprintf(buf, "s%u", p.sun); + else + ret = sprintf(buf, "p%us%u", p.bus, p.slot); + if (ret == -1) { + ret = -ENOMEM; + goto out; + } + + strcat(d->name, buf); + + if (p.func > 0 || is_pci_multifunction(p.pcidev)) { + ret = sprintf(buf, "f%u", p.func); + if (ret == -1) { + ret = -ENOMEM; + goto out; + } + strcat(d->name, buf); + + if (p.valid_vf) { + ret = sprintf(buf, "v%u", p.vf); + if (ret == -1) { + ret = -ENOMEM; + goto out; + } + strcat(d->name, buf); + } + } + ret = 0; +out: + free(p.pcidev); + free(subsystem); + if (ret) { + free(d->name); + d->name = NULL; + } + + return ret; +} + +static int by_guid(struct data *d) +{ + uint16_t vp[4]; + int ret = -1; + + if (!d->node_guid) + /* virtual devices start without GUID */ + goto out; + + memcpy(vp, &d->node_guid, sizeof(uint64_t)); + ret = asprintf(&d->name, "%sx%04x%04x%04x%04x", d->prefix, vp[3], vp[2], + vp[1], vp[0]); +out: + if (ret == -1) { + d->name = NULL; + return -ENOMEM; + } + + return 0; +} + +static int device_rename(struct nl_sock *nl, struct data *d) +{ + struct nlmsghdr *hdr; + struct nl_msg *msg; + int ret = -1; + + msg = nlmsg_alloc(); + if (!msg) + return -ENOMEM; + + hdr = nlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_SET), + 0, 0); + if (!hdr) { + ret = -ENOMEM; + goto nla_put_failure; + } + + NLA_PUT_U32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, d->idx); + NLA_PUT_STRING(msg, RDMA_NLDEV_ATTR_DEV_NAME, d->name); + ret = nl_send_auto(nl, msg); + if (ret < 0) + 
return ret; +nla_put_failure: + nlmsg_free(msg); + return (ret < 0) ? ret : 0; +} + +static int get_nldata_cb(struct nl_msg *msg, void *data) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX] = {}; + struct nlmsghdr *hdr = nlmsg_hdr(msg); + struct data *d = data; + int ret; + + ret = nlmsg_parse(hdr, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, rdmanl_policy); + if (ret < 0) + return NL_STOP; + + if (!tb[RDMA_NLDEV_ATTR_DEV_NAME] || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || + !tb[RDMA_NLDEV_ATTR_NODE_GUID]) + return NL_STOP; + + ret = strcmp(d->curr, nla_get_string(tb[RDMA_NLDEV_ATTR_DEV_NAME])); + if (ret) + return NL_OK; + + if (tb[RDMA_NLDEV_ATTR_DEV_PROTOCOL]) + d->prefix = strdup( + nla_get_string(tb[RDMA_NLDEV_ATTR_DEV_PROTOCOL])); + if (!d->prefix) + ret = asprintf(&d->prefix, "rdma"); + if (ret < 0) + return NL_STOP; + + d->idx = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + d->node_guid = nla_get_u64(tb[RDMA_NLDEV_ATTR_NODE_GUID]); + return NL_STOP; +} + +enum name_policy { + NAME_KERNEL = 1 << 0, + NAME_PCI = 1 << 1, + NAME_GUID = 1 << 2, + NAME_ONBOARD = 1 << 3, + NAME_ERROR = 1 << 8 +}; + +static int str2policy(const char *np) +{ + if (!strcmp(np, "NAME_KERNEL")) + return NAME_KERNEL; + if (!strcmp(np, "NAME_PCI")) + return NAME_PCI; + if (!strcmp(np, "NAME_GUID")) + return NAME_GUID; + if (!strcmp(np, "NAME_ONBOARD")) + return NAME_ONBOARD; + if (!strcmp(np, "NAME_FALLBACK")) + return NAME_ONBOARD | NAME_PCI; + return NAME_ERROR; +}; + +int main(int argc, char **argv) +{ + struct data d = { .idx = -1 }; + struct nl_sock *nl; + int ret = -1; + int np, opt; + + if (argc < 3) + goto err; + + while ((opt = getopt(argc, argv, "v")) >= 0) { + switch (opt) { + case 'v': + debug_mode = true; + break; + default: + goto err; + } + } + + argc -= optind; + argv += optind; + + d.curr = argv[0]; + + np = str2policy(argv[1]); + if (np & NAME_ERROR) { + pr_err("%s: Unknown policy %s\n", d.curr, argv[1]); + goto err; + } + + pr_dbg("%s: Requested policy is %s\n", d.curr, argv[1]); + + if (np & NAME_KERNEL) { + pr_dbg("%s: Leave kernel names, do nothing\n", d.curr); + /* Do nothing */ + exit(0); + } + + nl = rdmanl_socket_alloc(); + if (!nl) { + pr_err("%s: Failed to allocate netlink socket\n", d.curr); + goto err; + } + + if (rdmanl_get_devices(nl, get_nldata_cb, &d)) { + pr_err("%s: Failed to connect to NETLINK_RDMA\n", d.curr); + goto out; + } + + if (d.idx == -1 || !d.prefix) { + pr_err("%s: Failed to get current device name and index\n", + d.curr); + goto out; + } + + ret = -1; + if (np & NAME_ONBOARD) + ret = by_onboard(&d); + if (ret && (np & NAME_PCI)) + ret = by_pci(&d); + if (ret && (np & NAME_GUID)) + ret = by_guid(&d); + if (ret) + goto out; + + ret = device_rename(nl, &d); + if (ret) { + pr_err("%s: Device rename to %s failed with error %d\n", d.curr, + d.name, ret); + goto out; + } + pr_dbg("%s: Successfully renamed device to be %s\n", d.curr, d.name); + + printf("%s\n", d.name); + free(d.name); + +out: + free(d.prefix); + nl_socket_free(nl); +err: + ret = (ret) ? 
1 : 0; + exit(ret); +} diff --git a/kernel-headers/CMakeLists.txt b/kernel-headers/CMakeLists.txt new file mode 100644 index 0000000..2d0766d --- /dev/null +++ b/kernel-headers/CMakeLists.txt @@ -0,0 +1,79 @@ +publish_internal_headers(rdma + rdma/bnxt_re-abi.h + rdma/cxgb4-abi.h + rdma/efa-abi.h + rdma/hns-abi.h + rdma/i40iw-abi.h + rdma/ib_user_ioctl_cmds.h + rdma/ib_user_ioctl_verbs.h + rdma/ib_user_mad.h + rdma/ib_user_sa.h + rdma/ib_user_verbs.h + rdma/mlx4-abi.h + rdma/mlx5-abi.h + rdma/mlx5_user_ioctl_cmds.h + rdma/mlx5_user_ioctl_verbs.h + rdma/mthca-abi.h + rdma/ocrdma-abi.h + rdma/qedr-abi.h + rdma/rdma_netlink.h + rdma/rdma_user_cm.h + rdma/rdma_user_ioctl.h + rdma/rdma_user_ioctl_cmds.h + rdma/rdma_user_rxe.h + rdma/rvt-abi.h + rdma/siw-abi.h + rdma/vmw_pvrdma-abi.h + ) + +publish_internal_headers(rdma/hfi + rdma/hfi/hfi1_ioctl.h + rdma/hfi/hfi1_user.h + ) + +function(rdma_kernel_provider_abi) + # Older versions of cmake do not create the output directory automatically + set(DDIR "${BUILD_INCLUDE}/kernel-abi") + rdma_make_dir("${DDIR}") + + set(HDRS "") + foreach(IHDR ${ARGN}) + get_filename_component(FIL ${IHDR} NAME) + set(OHDR "${DDIR}/${FIL}") + set(HDRS ${HDRS} ${OHDR}) + add_custom_command( + OUTPUT "${OHDR}" + COMMAND "${PYTHON_EXECUTABLE}" "${CMAKE_SOURCE_DIR}/buildlib/make_abi_structs.py" "${IHDR}" "${OHDR}" + MAIN_DEPENDENCY "${IHDR}" + DEPENDS "${CMAKE_SOURCE_DIR}/buildlib/make_abi_structs.py" + WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" + COMMENT "Creating ABI wrapper ${OHDR}" + ) + endforeach() + + # This weird construction is needed to ensure ordering of the build. + add_library(kern-abi STATIC kern-abi.c ${HDRS}) +endfunction() + +# Transform the kernel ABIs used by the providers +rdma_kernel_provider_abi( + rdma/bnxt_re-abi.h + rdma/cxgb4-abi.h + rdma/efa-abi.h + rdma/hns-abi.h + rdma/i40iw-abi.h + rdma/ib_user_verbs.h + rdma/mlx4-abi.h + rdma/mlx5-abi.h + rdma/mthca-abi.h + rdma/ocrdma-abi.h + rdma/qedr-abi.h + rdma/rdma_user_rxe.h + rdma/siw-abi.h + rdma/vmw_pvrdma-abi.h + ) + +publish_headers(infiniband + rdma/ib_user_ioctl_verbs.h + ) + diff --git a/kernel-headers/kern-abi.c b/kernel-headers/kern-abi.c new file mode 100644 index 0000000..cd2941e --- /dev/null +++ b/kernel-headers/kern-abi.c @@ -0,0 +1 @@ +/* empty file for cmake */ diff --git a/kernel-headers/rdma/bnxt_re-abi.h b/kernel-headers/rdma/bnxt_re-abi.h new file mode 100644 index 0000000..dc52e3c --- /dev/null +++ b/kernel-headers/rdma/bnxt_re-abi.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ +/* + * Broadcom NetXtreme-E RoCE driver. + * + * Copyright (c) 2016 - 2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Description: Uverbs ABI header file + */ + +#ifndef __BNXT_RE_UVERBS_ABI_H__ +#define __BNXT_RE_UVERBS_ABI_H__ + +#include <linux/types.h> + +#define BNXT_RE_ABI_VERSION 1 + +#define BNXT_RE_CHIP_ID0_CHIP_NUM_SFT 0x00 +#define BNXT_RE_CHIP_ID0_CHIP_REV_SFT 0x10 +#define BNXT_RE_CHIP_ID0_CHIP_MET_SFT 0x18 + +enum { + BNXT_RE_UCNTX_CMASK_HAVE_CCTX = 0x1ULL +}; + +struct bnxt_re_uctx_resp { + __u32 dev_id; + __u32 max_qp; + __u32 pg_size; + __u32 cqe_sz; + __u32 max_cqd; + __u32 rsvd; + __aligned_u64 comp_mask; + __u32 chip_id0; + __u32 chip_id1; +}; + +/* + * This struct is placed after the ib_uverbs_alloc_pd_resp struct, which is + * not 8 byted aligned. To avoid undesired padding in various cases we have to + * set this struct to packed. + */ +struct bnxt_re_pd_resp { + __u32 pdid; + __u32 dpi; + __u64 dbr; +} __attribute__((packed, aligned(4))); + +struct bnxt_re_cq_req { + __aligned_u64 cq_va; + __aligned_u64 cq_handle; +}; + +struct bnxt_re_cq_resp { + __u32 cqid; + __u32 tail; + __u32 phase; + __u32 rsvd; +}; + +struct bnxt_re_qp_req { + __aligned_u64 qpsva; + __aligned_u64 qprva; + __aligned_u64 qp_handle; +}; + +struct bnxt_re_qp_resp { + __u32 qpid; + __u32 rsvd; +}; + +struct bnxt_re_srq_req { + __aligned_u64 srqva; + __aligned_u64 srq_handle; +}; + +struct bnxt_re_srq_resp { + __u32 srqid; +}; + +enum bnxt_re_shpg_offt { + BNXT_RE_BEG_RESV_OFFT = 0x00, + BNXT_RE_AVID_OFFT = 0x10, + BNXT_RE_AVID_SIZE = 0x04, + BNXT_RE_END_RESV_OFFT = 0xFF0 +}; + +#endif /* __BNXT_RE_UVERBS_ABI_H__*/ diff --git a/kernel-headers/rdma/cxgb4-abi.h b/kernel-headers/rdma/cxgb4-abi.h new file mode 100644 index 0000000..f85ec1a --- /dev/null +++ b/kernel-headers/rdma/cxgb4-abi.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef CXGB4_ABI_USER_H +#define CXGB4_ABI_USER_H + +#include <linux/types.h> + +#define C4IW_UVERBS_ABI_VERSION 3 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __aligned_u64 + * instead. + */ + +enum { + C4IW_64B_CQE = (1 << 0) +}; + +struct c4iw_create_cq { + __u32 flags; + __u32 reserved; +}; + +struct c4iw_create_cq_resp { + __aligned_u64 key; + __aligned_u64 gts_key; + __aligned_u64 memsize; + __u32 cqid; + __u32 size; + __u32 qid_mask; + __u32 flags; +}; + +enum { + C4IW_QPF_ONCHIP = (1 << 0), + C4IW_QPF_WRITE_W_IMM = (1 << 1) +}; + +struct c4iw_create_qp_resp { + __aligned_u64 ma_sync_key; + __aligned_u64 sq_key; + __aligned_u64 rq_key; + __aligned_u64 sq_db_gts_key; + __aligned_u64 rq_db_gts_key; + __aligned_u64 sq_memsize; + __aligned_u64 rq_memsize; + __u32 sqid; + __u32 rqid; + __u32 sq_size; + __u32 rq_size; + __u32 qid_mask; + __u32 flags; +}; + +struct c4iw_create_srq_resp { + __aligned_u64 srq_key; + __aligned_u64 srq_db_gts_key; + __aligned_u64 srq_memsize; + __u32 srqid; + __u32 srq_size; + __u32 rqt_abs_idx; + __u32 qid_mask; + __u32 flags; + __u32 reserved; /* explicit padding */ +}; + +enum { + /* HW supports SRQ_LIMIT_REACHED event */ + T4_SRQ_LIMIT_SUPPORT = 1 << 0, +}; + +struct c4iw_alloc_ucontext_resp { + __aligned_u64 status_page_key; + __u32 status_page_size; + __u32 reserved; /* explicit padding (optional for i386) */ +}; + +struct c4iw_alloc_pd_resp { + __u32 pdid; +}; + +#endif /* CXGB4_ABI_USER_H */ diff --git a/kernel-headers/rdma/efa-abi.h b/kernel-headers/rdma/efa-abi.h new file mode 100644 index 0000000..53b6e20 --- /dev/null +++ b/kernel-headers/rdma/efa-abi.h @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ +/* + * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef EFA_ABI_USER_H +#define EFA_ABI_USER_H + +#include <linux/types.h> + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define EFA_UVERBS_ABI_VERSION 1 + +/* + * Keep structs aligned to 8 bytes. + * Keep reserved fields as arrays of __u8 named reserved_XXX where XXX is the + * hex bit offset of the field. 
+ */ + +enum efa_ibv_user_cmds_supp_udata { + EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE = 1 << 0, + EFA_USER_CMDS_SUPP_UDATA_CREATE_AH = 1 << 1, +}; + +struct efa_ibv_alloc_ucontext_resp { + __u32 comp_mask; + __u32 cmds_supp_udata_mask; + __u16 sub_cqs_per_cq; + __u16 inline_buf_size; + __u32 max_llq_size; /* bytes */ +}; + +struct efa_ibv_alloc_pd_resp { + __u32 comp_mask; + __u16 pdn; + __u8 reserved_30[2]; +}; + +struct efa_ibv_create_cq { + __u32 comp_mask; + __u32 cq_entry_size; + __u16 num_sub_cqs; + __u8 reserved_50[6]; +}; + +struct efa_ibv_create_cq_resp { + __u32 comp_mask; + __u8 reserved_20[4]; + __aligned_u64 q_mmap_key; + __aligned_u64 q_mmap_size; + __u16 cq_idx; + __u8 reserved_d0[6]; +}; + +enum { + EFA_QP_DRIVER_TYPE_SRD = 0, +}; + +struct efa_ibv_create_qp { + __u32 comp_mask; + __u32 rq_ring_size; /* bytes */ + __u32 sq_ring_size; /* bytes */ + __u32 driver_qp_type; +}; + +struct efa_ibv_create_qp_resp { + __u32 comp_mask; + /* the offset inside the page of the rq db */ + __u32 rq_db_offset; + /* the offset inside the page of the sq db */ + __u32 sq_db_offset; + /* the offset inside the page of descriptors buffer */ + __u32 llq_desc_offset; + __aligned_u64 rq_mmap_key; + __aligned_u64 rq_mmap_size; + __aligned_u64 rq_db_mmap_key; + __aligned_u64 sq_db_mmap_key; + __aligned_u64 llq_desc_mmap_key; + __u16 send_sub_cq_idx; + __u16 recv_sub_cq_idx; + __u8 reserved_1e0[4]; +}; + +struct efa_ibv_create_ah_resp { + __u32 comp_mask; + __u16 efa_address_handle; + __u8 reserved_30[2]; +}; + +enum { + EFA_QUERY_DEVICE_CAPS_RDMA_READ = 1 << 0, +}; + +struct efa_ibv_ex_query_device_resp { + __u32 comp_mask; + __u32 max_sq_wr; + __u32 max_rq_wr; + __u16 max_sq_sge; + __u16 max_rq_sge; + __u32 max_rdma_size; + __u32 device_caps; +}; + +#endif /* EFA_ABI_USER_H */ diff --git a/kernel-headers/rdma/hfi/hfi1_ioctl.h b/kernel-headers/rdma/hfi/hfi1_ioctl.h new file mode 100644 index 0000000..8f3d9fe --- /dev/null +++ b/kernel-headers/rdma/hfi/hfi1_ioctl.h @@ -0,0 +1,174 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _LINUX__HFI1_IOCTL_H +#define _LINUX__HFI1_IOCTL_H +#include <linux/types.h> + +/* + * This structure is passed to the driver to tell it where + * user code buffers are, sizes, etc. The offsets and sizes of the + * fields must remain unchanged, for binary compatibility. It can + * be extended, if userversion is changed so user code can tell, if needed + */ +struct hfi1_user_info { + /* + * version of user software, to detect compatibility issues. + * Should be set to HFI1_USER_SWVERSION. + */ + __u32 userversion; + __u32 pad; + /* + * If two or more processes wish to share a context, each process + * must set the subcontext_cnt and subcontext_id to the same + * values. The only restriction on the subcontext_id is that + * it be unique for a given node. + */ + __u16 subctxt_cnt; + __u16 subctxt_id; + /* 128bit UUID passed in by PSM. */ + __u8 uuid[16]; +}; + +struct hfi1_ctxt_info { + __aligned_u64 runtime_flags; /* chip/drv runtime flags (HFI1_CAP_*) */ + __u32 rcvegr_size; /* size of each eager buffer */ + __u16 num_active; /* number of active units */ + __u16 unit; /* unit (chip) assigned to caller */ + __u16 ctxt; /* ctxt on unit assigned to caller */ + __u16 subctxt; /* subctxt on unit assigned to caller */ + __u16 rcvtids; /* number of Rcv TIDs for this context */ + __u16 credits; /* number of PIO credits for this context */ + __u16 numa_node; /* NUMA node of the assigned device */ + __u16 rec_cpu; /* cpu # for affinity (0xffff if none) */ + __u16 send_ctxt; /* send context in use by this user context */ + __u16 egrtids; /* number of RcvArray entries for Eager Rcvs */ + __u16 rcvhdrq_cnt; /* number of RcvHdrQ entries */ + __u16 rcvhdrq_entsize; /* size (in bytes) for each RcvHdrQ entry */ + __u16 sdma_ring_size; /* number of entries in SDMA request ring */ +}; + +struct hfi1_tid_info { + /* virtual address of first page in transfer */ + __aligned_u64 vaddr; + /* pointer to tid array. this array is big enough */ + __aligned_u64 tidlist; + /* number of tids programmed by this request */ + __u32 tidcnt; + /* length of transfer buffer programmed by this request */ + __u32 length; +}; + +/* + * This structure is returned by the driver immediately after + * open to get implementation-specific info, and info specific to this + * instance. + * + * This struct must have explicit pad fields where type sizes + * may result in different alignments between 32 and 64 bit + * programs, since the 64 bit * bit kernel requires the user code + * to have matching offsets + */ +struct hfi1_base_info { + /* version of hardware, for feature checking. */ + __u32 hw_version; + /* version of software, for feature checking. 
*/ + __u32 sw_version; + /* Job key */ + __u16 jkey; + __u16 padding1; + /* + * The special QP (queue pair) value that identifies PSM + * protocol packet from standard IB packets. + */ + __u32 bthqp; + /* PIO credit return address, */ + __aligned_u64 sc_credits_addr; + /* + * Base address of write-only pio buffers for this process. + * Each buffer has sendpio_credits*64 bytes. + */ + __aligned_u64 pio_bufbase_sop; + /* + * Base address of write-only pio buffers for this process. + * Each buffer has sendpio_credits*64 bytes. + */ + __aligned_u64 pio_bufbase; + /* address where receive buffer queue is mapped into */ + __aligned_u64 rcvhdr_bufbase; + /* base address of Eager receive buffers. */ + __aligned_u64 rcvegr_bufbase; + /* base address of SDMA completion ring */ + __aligned_u64 sdma_comp_bufbase; + /* + * User register base for init code, not to be used directly by + * protocol or applications. Always maps real chip register space. + * the register addresses are: + * ur_rcvhdrhead, ur_rcvhdrtail, ur_rcvegrhead, ur_rcvegrtail, + * ur_rcvtidflow + */ + __aligned_u64 user_regbase; + /* notification events */ + __aligned_u64 events_bufbase; + /* status page */ + __aligned_u64 status_bufbase; + /* rcvhdrtail update */ + __aligned_u64 rcvhdrtail_base; + /* + * shared memory pages for subctxts if ctxt is shared; these cover + * all the processes in the group sharing a single context. + * all have enough space for the num_subcontexts value on this job. + */ + __aligned_u64 subctxt_uregbase; + __aligned_u64 subctxt_rcvegrbuf; + __aligned_u64 subctxt_rcvhdrbuf; +}; +#endif /* _LINIUX__HFI1_IOCTL_H */ diff --git a/kernel-headers/rdma/hfi/hfi1_user.h b/kernel-headers/rdma/hfi/hfi1_user.h new file mode 100644 index 0000000..01ac585 --- /dev/null +++ b/kernel-headers/rdma/hfi/hfi1_user.h @@ -0,0 +1,267 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 - 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This file contains defines, structures, etc. that are used + * to communicate between kernel and user code. + */ + +#ifndef _LINUX__HFI1_USER_H +#define _LINUX__HFI1_USER_H + +#include <linux/types.h> +#include <rdma/rdma_user_ioctl.h> + +/* + * This version number is given to the driver by the user code during + * initialization in the spu_userversion field of hfi1_user_info, so + * the driver can check for compatibility with user code. + * + * The major version changes when data structures change in an incompatible + * way. The driver must be the same for initialization to succeed. + */ +#define HFI1_USER_SWMAJOR 6 + +/* + * Minor version differences are always compatible + * a within a major version, however if user software is larger + * than driver software, some new features and/or structure fields + * may not be implemented; the user code must deal with this if it + * cares, or it must abort after initialization reports the difference. + */ +#define HFI1_USER_SWMINOR 3 + +/* + * We will encode the major/minor inside a single 32bit version number. + */ +#define HFI1_SWMAJOR_SHIFT 16 + +/* + * Set of HW and driver capability/feature bits. + * These bit values are used to configure enabled/disabled HW and + * driver features. The same set of bits are communicated to user + * space. + */ +#define HFI1_CAP_DMA_RTAIL (1UL << 0) /* Use DMA'ed RTail value */ +#define HFI1_CAP_SDMA (1UL << 1) /* Enable SDMA support */ +#define HFI1_CAP_SDMA_AHG (1UL << 2) /* Enable SDMA AHG support */ +#define HFI1_CAP_EXTENDED_PSN (1UL << 3) /* Enable Extended PSN support */ +#define HFI1_CAP_HDRSUPP (1UL << 4) /* Enable Header Suppression */ +#define HFI1_CAP_TID_RDMA (1UL << 5) /* Enable TID RDMA operations */ +#define HFI1_CAP_USE_SDMA_HEAD (1UL << 6) /* DMA Hdr Q tail vs. 
use CSR */ +#define HFI1_CAP_MULTI_PKT_EGR (1UL << 7) /* Enable multi-packet Egr buffs */ +#define HFI1_CAP_NODROP_RHQ_FULL (1UL << 8) /* Don't drop on Hdr Q full */ +#define HFI1_CAP_NODROP_EGR_FULL (1UL << 9) /* Don't drop on EGR buffs full */ +#define HFI1_CAP_TID_UNMAP (1UL << 10) /* Disable Expected TID caching */ +#define HFI1_CAP_PRINT_UNIMPL (1UL << 11) /* Show for unimplemented feats */ +#define HFI1_CAP_ALLOW_PERM_JKEY (1UL << 12) /* Allow use of permissive JKEY */ +#define HFI1_CAP_NO_INTEGRITY (1UL << 13) /* Disable ctxt integrity checks */ +#define HFI1_CAP_PKEY_CHECK (1UL << 14) /* Enable ctxt PKey checking */ +#define HFI1_CAP_STATIC_RATE_CTRL (1UL << 15) /* Allow PBC.StaticRateControl */ +#define HFI1_CAP_OPFN (1UL << 16) /* Enable the OPFN protocol */ +#define HFI1_CAP_SDMA_HEAD_CHECK (1UL << 17) /* SDMA head checking */ +#define HFI1_CAP_EARLY_CREDIT_RETURN (1UL << 18) /* early credit return */ + +#define HFI1_RCVHDR_ENTSIZE_2 (1UL << 0) +#define HFI1_RCVHDR_ENTSIZE_16 (1UL << 1) +#define HFI1_RCVDHR_ENTSIZE_32 (1UL << 2) + +#define _HFI1_EVENT_FROZEN_BIT 0 +#define _HFI1_EVENT_LINKDOWN_BIT 1 +#define _HFI1_EVENT_LID_CHANGE_BIT 2 +#define _HFI1_EVENT_LMC_CHANGE_BIT 3 +#define _HFI1_EVENT_SL2VL_CHANGE_BIT 4 +#define _HFI1_EVENT_TID_MMU_NOTIFY_BIT 5 +#define _HFI1_MAX_EVENT_BIT _HFI1_EVENT_TID_MMU_NOTIFY_BIT + +#define HFI1_EVENT_FROZEN (1UL << _HFI1_EVENT_FROZEN_BIT) +#define HFI1_EVENT_LINKDOWN (1UL << _HFI1_EVENT_LINKDOWN_BIT) +#define HFI1_EVENT_LID_CHANGE (1UL << _HFI1_EVENT_LID_CHANGE_BIT) +#define HFI1_EVENT_LMC_CHANGE (1UL << _HFI1_EVENT_LMC_CHANGE_BIT) +#define HFI1_EVENT_SL2VL_CHANGE (1UL << _HFI1_EVENT_SL2VL_CHANGE_BIT) +#define HFI1_EVENT_TID_MMU_NOTIFY (1UL << _HFI1_EVENT_TID_MMU_NOTIFY_BIT) + +/* + * These are the status bits readable (in ASCII form, 64-bit value) + * from the "status" sysfs file. For binary compatibility, values + * must remain as is; removed states can be reused for different + * purposes. + */ +#define HFI1_STATUS_INITTED 0x1 /* basic initialization done */ +/* Chip has been found and initialized */ +#define HFI1_STATUS_CHIP_PRESENT 0x20 +/* IB link is at ACTIVE, usable for data traffic */ +#define HFI1_STATUS_IB_READY 0x40 +/* link is configured, LID, MTU, etc. have been set */ +#define HFI1_STATUS_IB_CONF 0x80 +/* A fatal hardware error has occurred. */ +#define HFI1_STATUS_HWERROR 0x200 + +/* + * Number of supported shared contexts. + * This is the maximum number of software contexts that can share + * a hardware send/receive context. + */ +#define HFI1_MAX_SHARED_CTXTS 8 + +/* + * Poll types + */ +#define HFI1_POLL_TYPE_ANYRCV 0x0 +#define HFI1_POLL_TYPE_URGENT 0x1 + +enum hfi1_sdma_comp_state { + FREE = 0, + QUEUED, + COMPLETE, + ERROR +}; + +/* + * SDMA completion ring entry + */ +struct hfi1_sdma_comp_entry { + __u32 status; + __u32 errcode; +}; + +/* + * Device status and notifications from driver to user-space. + */
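The version encoding and capability bits above combine with plain shifts and masks. A minimal, self-contained sketch (macro values mirrored from this header so the snippet stands alone; the enabled-capability set is hypothetical):

#include <stdio.h>

/* Mirrored from hfi1_user.h above. */
#define HFI1_USER_SWMAJOR 6
#define HFI1_USER_SWMINOR 3
#define HFI1_SWMAJOR_SHIFT 16
#define HFI1_CAP_SDMA (1UL << 1)

int main(void)
{
	/* Pack major/minor into the single 32-bit version number. */
	unsigned int swversion =
		(HFI1_USER_SWMAJOR << HFI1_SWMAJOR_SHIFT) | HFI1_USER_SWMINOR;
	unsigned long caps = HFI1_CAP_SDMA;	/* hypothetical enabled set */

	printf("sw_version = 0x%x (major %u, minor %u)\n", swversion,
	       swversion >> HFI1_SWMAJOR_SHIFT, swversion & 0xffff);
	if (caps & HFI1_CAP_SDMA)
		printf("SDMA enabled\n");
	return 0;
}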
+struct hfi1_status { + __aligned_u64 dev; /* device/hw status bits */ + __aligned_u64 port; /* port state and status bits */ + char freezemsg[0]; +}; + +enum sdma_req_opcode { + EXPECTED = 0, + EAGER +}; + +#define HFI1_SDMA_REQ_VERSION_MASK 0xF +#define HFI1_SDMA_REQ_VERSION_SHIFT 0x0 +#define HFI1_SDMA_REQ_OPCODE_MASK 0xF +#define HFI1_SDMA_REQ_OPCODE_SHIFT 0x4 +#define HFI1_SDMA_REQ_IOVCNT_MASK 0xFF +#define HFI1_SDMA_REQ_IOVCNT_SHIFT 0x8 + +struct sdma_req_info { + /* + * bits 0-3 - version (currently unused) + * bits 4-7 - opcode (enum sdma_req_opcode) + * bits 8-15 - io vector count + */ + __u16 ctrl; + /* + * Number of fragments contained in this request. + * User-space has already computed how many + * fragment-sized packets the user buffer will be + * split into. + */ + __u16 npkts; + /* + * Size of each fragment the user buffer will be + * split into. + */ + __u16 fragsize; + /* + * Index of the slot in the SDMA completion ring + * this request should be using. User-space is + * in charge of managing its own ring. + */ + __u16 comp_idx; +} __attribute__((__packed__)); + +/* + * SW KDETH header. + * swdata is the SW-defined portion. + */ +struct hfi1_kdeth_header { + __le32 ver_tid_offset; + __le16 jkey; + __le16 hcrc; + __le32 swdata[7]; +} __attribute__((__packed__)); + +/* + * Structure describing the headers that user space uses. The + * structure above is a subset of this one. + */ +struct hfi1_pkt_header { + __le16 pbc[4]; + __be16 lrh[4]; + __be32 bth[3]; + struct hfi1_kdeth_header kdeth; +} __attribute__((__packed__)); + + +/* + * The list of usermode accessible registers. + */ +enum hfi1_ureg { + /* (RO) DMA RcvHdr to be used next. */ + ur_rcvhdrtail = 0, + /* (RW) RcvHdr entry to be processed next by host. */ + ur_rcvhdrhead = 1, + /* (RO) Index of the next Eager entry to use. */ + ur_rcvegrindextail = 2, + /* (RW) Eager TID to be processed next */ + ur_rcvegrindexhead = 3, + /* (RO) Receive Eager Offset Tail */ + ur_rcvegroffsettail = 4, + /* For internal use only; max register number. */ + ur_maxreg, + /* (RW) Receive TID flow table */ + ur_rcvtidflowtable = 256 +}; + +#endif /* _LINUX__HFI1_USER_H */ diff --git a/kernel-headers/rdma/hns-abi.h b/kernel-headers/rdma/hns-abi.h new file mode 100644 index 0000000..eb76b38 --- /dev/null +++ b/kernel-headers/rdma/hns-abi.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* + * Copyright (c) 2016 Hisilicon Limited. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef HNS_ABI_USER_H +#define HNS_ABI_USER_H + +#include <linux/types.h> + +struct hns_roce_ib_create_cq { + __aligned_u64 buf_addr; + __aligned_u64 db_addr; +}; + +struct hns_roce_ib_create_cq_resp { + __aligned_u64 cqn; /* Only 32 bits used, 64 for compat */ + __aligned_u64 cap_flags; +}; + +struct hns_roce_ib_create_srq { + __aligned_u64 buf_addr; + __aligned_u64 db_addr; + __aligned_u64 que_addr; +}; + +struct hns_roce_ib_create_srq_resp { + __u32 srqn; + __u32 reserved; +}; + +struct hns_roce_ib_create_qp { + __aligned_u64 buf_addr; + __aligned_u64 db_addr; + __u8 log_sq_bb_count; + __u8 log_sq_stride; + __u8 sq_no_prefetch; + __u8 reserved[5]; + __aligned_u64 sdb_addr; +}; + +struct hns_roce_ib_create_qp_resp { + __aligned_u64 cap_flags; +}; + +struct hns_roce_ib_alloc_ucontext_resp { + __u32 qp_tab_size; + __u32 reserved; +}; + +struct hns_roce_ib_alloc_pd_resp { + __u32 pdn; +}; + +#endif /* HNS_ABI_USER_H */ diff --git a/kernel-headers/rdma/i40iw-abi.h b/kernel-headers/rdma/i40iw-abi.h new file mode 100644 index 0000000..79890ba --- /dev/null +++ b/kernel-headers/rdma/i40iw-abi.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2006 - 2016 Intel Corporation. All rights reserved. + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
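For the hns-abi command structures above, a sketch of how a provider library might fill the create-CQ request; the local struct mirrors the header layout, and cq_buf/db_page are hypothetical userspace allocations. Narrowing pointers through uintptr_t keeps the command block laid out identically for 32-bit and 64-bit userspace:

#include <stdint.h>
#include <string.h>

struct hns_roce_ib_create_cq {	/* mirrors the header layout above */
	uint64_t buf_addr;
	uint64_t db_addr;
};

static void fill_create_cq(struct hns_roce_ib_create_cq *cmd,
			   void *cq_buf, void *db_page)
{
	memset(cmd, 0, sizeof(*cmd));		/* keep unused bits zeroed */
	cmd->buf_addr = (uintptr_t)cq_buf;	/* userspace VA carried as a 64-bit field */
	cmd->db_addr = (uintptr_t)db_page;
}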
+ * + */ + +#ifndef I40IW_ABI_H +#define I40IW_ABI_H + +#include <linux/types.h> + +#define I40IW_ABI_VER 5 + +struct i40iw_alloc_ucontext_req { + __u32 reserved32; + __u8 userspace_ver; + __u8 reserved8[3]; +}; + +struct i40iw_alloc_ucontext_resp { + __u32 max_pds; /* maximum pds allowed for this user process */ + __u32 max_qps; /* maximum qps allowed for this user process */ + __u32 wq_size; /* size of the WQs (sq+rq) allocated to the mmaped area */ + __u8 kernel_ver; + __u8 reserved[3]; +}; + +struct i40iw_alloc_pd_resp { + __u32 pd_id; + __u8 reserved[4]; +}; + +struct i40iw_create_cq_req { + __aligned_u64 user_cq_buffer; + __aligned_u64 user_shadow_area; +}; + +struct i40iw_create_qp_req { + __aligned_u64 user_wqe_buffers; + __aligned_u64 user_compl_ctx; + + /* UDA QP PHB */ + __aligned_u64 user_sq_phb; /* place for VA of the sq phb buff */ + __aligned_u64 user_rq_phb; /* place for VA of the rq phb buff */ +}; + +enum i40iw_memreg_type { + IW_MEMREG_TYPE_MEM = 0x0000, + IW_MEMREG_TYPE_QP = 0x0001, + IW_MEMREG_TYPE_CQ = 0x0002, +}; + +struct i40iw_mem_reg_req { + __u16 reg_type; /* Memory, QP or CQ */ + __u16 cq_pages; + __u16 rq_pages; + __u16 sq_pages; +}; + +struct i40iw_create_cq_resp { + __u32 cq_id; + __u32 cq_size; + __u32 mmap_db_index; + __u32 reserved; +}; + +struct i40iw_create_qp_resp { + __u32 qp_id; + __u32 actual_sq_size; + __u32 actual_rq_size; + __u32 i40iw_drv_opt; + __u16 push_idx; + __u8 lsmm; + __u8 rsvd2; +}; + +#endif diff --git a/kernel-headers/rdma/ib_user_ioctl_cmds.h b/kernel-headers/rdma/ib_user_ioctl_cmds.h new file mode 100644 index 0000000..d4ddbe4 --- /dev/null +++ b/kernel-headers/rdma/ib_user_ioctl_cmds.h @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
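A sketch of the i40iw ABI handshake above (definitions mirrored locally so the snippet stands alone): userspace announces the version it was built against, and the kernel reports its own back in i40iw_alloc_ucontext_resp.kernel_ver, letting either side reject a mismatch.

#include <stdint.h>
#include <string.h>

#define I40IW_ABI_VER 5			/* mirrored from the header above */

struct i40iw_alloc_ucontext_req {	/* mirrors the header layout above */
	uint32_t reserved32;
	uint8_t userspace_ver;
	uint8_t reserved8[3];
};

static void init_ucontext_req(struct i40iw_alloc_ucontext_req *req)
{
	memset(req, 0, sizeof(*req));	/* reserved fields must stay zero */
	req->userspace_ver = I40IW_ABI_VER;
}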
+ */ + +#ifndef IB_USER_IOCTL_CMDS_H +#define IB_USER_IOCTL_CMDS_H + +#define UVERBS_ID_NS_MASK 0xF000 +#define UVERBS_ID_NS_SHIFT 12 + +#define UVERBS_UDATA_DRIVER_DATA_NS 1 +#define UVERBS_UDATA_DRIVER_DATA_FLAG (1UL << UVERBS_ID_NS_SHIFT) + +enum uverbs_default_objects { + UVERBS_OBJECT_DEVICE, /* No instances of DEVICE are allowed */ + UVERBS_OBJECT_PD, + UVERBS_OBJECT_COMP_CHANNEL, + UVERBS_OBJECT_CQ, + UVERBS_OBJECT_QP, + UVERBS_OBJECT_SRQ, + UVERBS_OBJECT_AH, + UVERBS_OBJECT_MR, + UVERBS_OBJECT_MW, + UVERBS_OBJECT_FLOW, + UVERBS_OBJECT_XRCD, + UVERBS_OBJECT_RWQ_IND_TBL, + UVERBS_OBJECT_WQ, + UVERBS_OBJECT_FLOW_ACTION, + UVERBS_OBJECT_DM, + UVERBS_OBJECT_COUNTERS, + UVERBS_OBJECT_ASYNC_EVENT, +}; + +enum { + UVERBS_ATTR_UHW_IN = UVERBS_UDATA_DRIVER_DATA_FLAG, + UVERBS_ATTR_UHW_OUT, +}; + +enum uverbs_methods_device { + UVERBS_METHOD_INVOKE_WRITE, + UVERBS_METHOD_INFO_HANDLES, + UVERBS_METHOD_QUERY_PORT, + UVERBS_METHOD_GET_CONTEXT, +}; + +enum uverbs_attrs_invoke_write_cmd_attr_ids { + UVERBS_ATTR_CORE_IN, + UVERBS_ATTR_CORE_OUT, + UVERBS_ATTR_WRITE_CMD, +}; + +enum uverbs_attrs_query_port_cmd_attr_ids { + UVERBS_ATTR_QUERY_PORT_PORT_NUM, + UVERBS_ATTR_QUERY_PORT_RESP, +}; + +enum uverbs_attrs_get_context_attr_ids { + UVERBS_ATTR_GET_CONTEXT_NUM_COMP_VECTORS, + UVERBS_ATTR_GET_CONTEXT_CORE_SUPPORT, +}; + +enum uverbs_attrs_create_cq_cmd_attr_ids { + UVERBS_ATTR_CREATE_CQ_HANDLE, + UVERBS_ATTR_CREATE_CQ_CQE, + UVERBS_ATTR_CREATE_CQ_USER_HANDLE, + UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL, + UVERBS_ATTR_CREATE_CQ_COMP_VECTOR, + UVERBS_ATTR_CREATE_CQ_FLAGS, + UVERBS_ATTR_CREATE_CQ_RESP_CQE, +}; + +enum uverbs_attrs_destroy_cq_cmd_attr_ids { + UVERBS_ATTR_DESTROY_CQ_HANDLE, + UVERBS_ATTR_DESTROY_CQ_RESP, +}; + +enum uverbs_attrs_create_flow_action_esp { + UVERBS_ATTR_CREATE_FLOW_ACTION_ESP_HANDLE, + UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS, + UVERBS_ATTR_FLOW_ACTION_ESP_ESN, + UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT, + UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY, + UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP, +}; + +enum uverbs_attrs_modify_flow_action_esp { + UVERBS_ATTR_MODIFY_FLOW_ACTION_ESP_HANDLE = + UVERBS_ATTR_CREATE_FLOW_ACTION_ESP_HANDLE, +}; + +enum uverbs_attrs_destroy_flow_action_esp { + UVERBS_ATTR_DESTROY_FLOW_ACTION_HANDLE, +}; + +enum uverbs_methods_cq { + UVERBS_METHOD_CQ_CREATE, + UVERBS_METHOD_CQ_DESTROY, +}; + +enum uverbs_methods_actions_flow_action_ops { + UVERBS_METHOD_FLOW_ACTION_ESP_CREATE, + UVERBS_METHOD_FLOW_ACTION_DESTROY, + UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY, +}; + +enum uverbs_attrs_alloc_dm_cmd_attr_ids { + UVERBS_ATTR_ALLOC_DM_HANDLE, + UVERBS_ATTR_ALLOC_DM_LENGTH, + UVERBS_ATTR_ALLOC_DM_ALIGNMENT, +}; + +enum uverbs_attrs_free_dm_cmd_attr_ids { + UVERBS_ATTR_FREE_DM_HANDLE, +}; + +enum uverbs_methods_dm { + UVERBS_METHOD_DM_ALLOC, + UVERBS_METHOD_DM_FREE, +}; + +enum uverbs_attrs_reg_dm_mr_cmd_attr_ids { + UVERBS_ATTR_REG_DM_MR_HANDLE, + UVERBS_ATTR_REG_DM_MR_OFFSET, + UVERBS_ATTR_REG_DM_MR_LENGTH, + UVERBS_ATTR_REG_DM_MR_PD_HANDLE, + UVERBS_ATTR_REG_DM_MR_ACCESS_FLAGS, + UVERBS_ATTR_REG_DM_MR_DM_HANDLE, + UVERBS_ATTR_REG_DM_MR_RESP_LKEY, + UVERBS_ATTR_REG_DM_MR_RESP_RKEY, +}; + +enum uverbs_methods_mr { + UVERBS_METHOD_DM_MR_REG, + UVERBS_METHOD_MR_DESTROY, + UVERBS_METHOD_ADVISE_MR, +}; + +enum uverbs_attrs_mr_destroy_ids { + UVERBS_ATTR_DESTROY_MR_HANDLE, +}; + +enum uverbs_attrs_advise_mr_cmd_attr_ids { + UVERBS_ATTR_ADVISE_MR_PD_HANDLE, + UVERBS_ATTR_ADVISE_MR_ADVICE, + UVERBS_ATTR_ADVISE_MR_FLAGS, + UVERBS_ATTR_ADVISE_MR_SGE_LIST, +}; + +enum uverbs_attrs_create_counters_cmd_attr_ids 
{ + UVERBS_ATTR_CREATE_COUNTERS_HANDLE, +}; + +enum uverbs_attrs_destroy_counters_cmd_attr_ids { + UVERBS_ATTR_DESTROY_COUNTERS_HANDLE, +}; + +enum uverbs_attrs_read_counters_cmd_attr_ids { + UVERBS_ATTR_READ_COUNTERS_HANDLE, + UVERBS_ATTR_READ_COUNTERS_BUFF, + UVERBS_ATTR_READ_COUNTERS_FLAGS, +}; + +enum uverbs_methods_actions_counters_ops { + UVERBS_METHOD_COUNTERS_CREATE, + UVERBS_METHOD_COUNTERS_DESTROY, + UVERBS_METHOD_COUNTERS_READ, +}; + +enum uverbs_attrs_info_handles_id { + UVERBS_ATTR_INFO_OBJECT_ID, + UVERBS_ATTR_INFO_TOTAL_HANDLES, + UVERBS_ATTR_INFO_HANDLES_LIST, +}; + +enum uverbs_methods_pd { + UVERBS_METHOD_PD_DESTROY, +}; + +enum uverbs_attrs_pd_destroy_ids { + UVERBS_ATTR_DESTROY_PD_HANDLE, +}; + +enum uverbs_methods_mw { + UVERBS_METHOD_MW_DESTROY, +}; + +enum uverbs_attrs_mw_destroy_ids { + UVERBS_ATTR_DESTROY_MW_HANDLE, +}; + +enum uverbs_methods_xrcd { + UVERBS_METHOD_XRCD_DESTROY, +}; + +enum uverbs_attrs_xrcd_destroy_ids { + UVERBS_ATTR_DESTROY_XRCD_HANDLE, +}; + +enum uverbs_methods_ah { + UVERBS_METHOD_AH_DESTROY, +}; + +enum uverbs_attrs_ah_destroy_ids { + UVERBS_ATTR_DESTROY_AH_HANDLE, +}; + +enum uverbs_methods_rwq_ind_tbl { + UVERBS_METHOD_RWQ_IND_TBL_DESTROY, +}; + +enum uverbs_attrs_rwq_ind_tbl_destroy_ids { + UVERBS_ATTR_DESTROY_RWQ_IND_TBL_HANDLE, +}; + +enum uverbs_methods_flow { + UVERBS_METHOD_FLOW_DESTROY, +}; + +enum uverbs_attrs_flow_destroy_ids { + UVERBS_ATTR_DESTROY_FLOW_HANDLE, +}; + +enum uverbs_method_async_event { + UVERBS_METHOD_ASYNC_EVENT_ALLOC, +}; + +enum uverbs_attrs_async_event_create { + UVERBS_ATTR_ASYNC_EVENT_ALLOC_FD_HANDLE, +}; + +#endif diff --git a/kernel-headers/rdma/ib_user_ioctl_verbs.h b/kernel-headers/rdma/ib_user_ioctl_verbs.h new file mode 100644 index 0000000..a640bb8 --- /dev/null +++ b/kernel-headers/rdma/ib_user_ioctl_verbs.h @@ -0,0 +1,210 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* + * Copyright (c) 2017-2018, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
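A sketch of the attribute-ID namespacing defined at the top of this header: the top nibble selects the namespace, so driver-private IDs (namespace 1) start at 0x1000 and never collide with core IDs (namespace 0), even though both count up from zero. The attribute value here is hypothetical:

#include <stdio.h>

#define UVERBS_ID_NS_MASK 0xF000	/* mirrored from the header above */
#define UVERBS_ID_NS_SHIFT 12

int main(void)
{
	/* Hypothetical driver-private attribute: namespace 1, local id 3. */
	unsigned int attr = (1u << UVERBS_ID_NS_SHIFT) | 3;

	printf("namespace %u, local id %u\n",
	       (attr & UVERBS_ID_NS_MASK) >> UVERBS_ID_NS_SHIFT,
	       attr & ~UVERBS_ID_NS_MASK);	/* prints: namespace 1, local id 3 */
	return 0;
}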
+ */ + +#ifndef IB_USER_IOCTL_VERBS_H +#define IB_USER_IOCTL_VERBS_H + +#include <linux/types.h> +#include <rdma/ib_user_verbs.h> + +#ifndef RDMA_UAPI_PTR +#define RDMA_UAPI_PTR(_type, _name) __aligned_u64 _name +#endif + +#define IB_UVERBS_ACCESS_OPTIONAL_FIRST (1 << 20) +#define IB_UVERBS_ACCESS_OPTIONAL_LAST (1 << 29) + +enum ib_uverbs_core_support { + IB_UVERBS_CORE_SUPPORT_OPTIONAL_MR_ACCESS = 1 << 0, +}; + +enum ib_uverbs_access_flags { + IB_UVERBS_ACCESS_LOCAL_WRITE = 1 << 0, + IB_UVERBS_ACCESS_REMOTE_WRITE = 1 << 1, + IB_UVERBS_ACCESS_REMOTE_READ = 1 << 2, + IB_UVERBS_ACCESS_REMOTE_ATOMIC = 1 << 3, + IB_UVERBS_ACCESS_MW_BIND = 1 << 4, + IB_UVERBS_ACCESS_ZERO_BASED = 1 << 5, + IB_UVERBS_ACCESS_ON_DEMAND = 1 << 6, + IB_UVERBS_ACCESS_HUGETLB = 1 << 7, + + IB_UVERBS_ACCESS_RELAXED_ORDERING = IB_UVERBS_ACCESS_OPTIONAL_FIRST, + IB_UVERBS_ACCESS_OPTIONAL_RANGE = + ((IB_UVERBS_ACCESS_OPTIONAL_LAST << 1) - 1) & + ~(IB_UVERBS_ACCESS_OPTIONAL_FIRST - 1) +}; + +enum ib_uverbs_query_port_cap_flags { + IB_UVERBS_PCF_SM = 1 << 1, + IB_UVERBS_PCF_NOTICE_SUP = 1 << 2, + IB_UVERBS_PCF_TRAP_SUP = 1 << 3, + IB_UVERBS_PCF_OPT_IPD_SUP = 1 << 4, + IB_UVERBS_PCF_AUTO_MIGR_SUP = 1 << 5, + IB_UVERBS_PCF_SL_MAP_SUP = 1 << 6, + IB_UVERBS_PCF_MKEY_NVRAM = 1 << 7, + IB_UVERBS_PCF_PKEY_NVRAM = 1 << 8, + IB_UVERBS_PCF_LED_INFO_SUP = 1 << 9, + IB_UVERBS_PCF_SM_DISABLED = 1 << 10, + IB_UVERBS_PCF_SYS_IMAGE_GUID_SUP = 1 << 11, + IB_UVERBS_PCF_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12, + IB_UVERBS_PCF_EXTENDED_SPEEDS_SUP = 1 << 14, + IB_UVERBS_PCF_CM_SUP = 1 << 16, + IB_UVERBS_PCF_SNMP_TUNNEL_SUP = 1 << 17, + IB_UVERBS_PCF_REINIT_SUP = 1 << 18, + IB_UVERBS_PCF_DEVICE_MGMT_SUP = 1 << 19, + IB_UVERBS_PCF_VENDOR_CLASS_SUP = 1 << 20, + IB_UVERBS_PCF_DR_NOTICE_SUP = 1 << 21, + IB_UVERBS_PCF_CAP_MASK_NOTICE_SUP = 1 << 22, + IB_UVERBS_PCF_BOOT_MGMT_SUP = 1 << 23, + IB_UVERBS_PCF_LINK_LATENCY_SUP = 1 << 24, + IB_UVERBS_PCF_CLIENT_REG_SUP = 1 << 25, + /* + * IsOtherLocalChangesNoticeSupported is aliased by IP_BASED_GIDS and + * is inaccessible + */ + IB_UVERBS_PCF_LINK_SPEED_WIDTH_TABLE_SUP = 1 << 27, + IB_UVERBS_PCF_VENDOR_SPECIFIC_MADS_TABLE_SUP = 1 << 28, + IB_UVERBS_PCF_MCAST_PKEY_TRAP_SUPPRESSION_SUP = 1 << 29, + IB_UVERBS_PCF_MCAST_FDB_TOP_SUP = 1 << 30, + IB_UVERBS_PCF_HIERARCHY_INFO_SUP = 1ULL << 31, + + /* NOTE this is an internal flag, not an IBA flag */ + IB_UVERBS_PCF_IP_BASED_GIDS = 1 << 26, +}; + +enum ib_uverbs_query_port_flags { + IB_UVERBS_QPF_GRH_REQUIRED = 1 << 0, +}; + +enum ib_uverbs_flow_action_esp_keymat { + IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM, +}; + +enum ib_uverbs_flow_action_esp_keymat_aes_gcm_iv_algo { + IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ, +}; + +struct ib_uverbs_flow_action_esp_keymat_aes_gcm { + __aligned_u64 iv; + __u32 iv_algo; /* Use enum ib_uverbs_flow_action_esp_keymat_aes_gcm_iv_algo */ + + __u32 salt; + __u32 icv_len; + + __u32 key_len; + __u32 aes_key[256 / 32]; +}; + +enum ib_uverbs_flow_action_esp_replay { + IB_UVERBS_FLOW_ACTION_ESP_REPLAY_NONE, + IB_UVERBS_FLOW_ACTION_ESP_REPLAY_BMP, +}; + +struct ib_uverbs_flow_action_esp_replay_bmp { + __u32 size; +}; + +enum ib_uverbs_flow_action_esp_flags { + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_INLINE_CRYPTO = 0UL << 0, /* Default */ + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD = 1UL << 0, + + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_TUNNEL = 0UL << 1, /* Default */ + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_TRANSPORT = 1UL << 1, + + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_DECRYPT = 0UL << 2, /* Default */ + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT = 1UL << 2, + + 
IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW = 1UL << 3, +}; + +struct ib_uverbs_flow_action_esp_encap { + /* This struct represents a list of pointers to flow_xxxx_filter that + * encapsulates the payload in ESP tunnel mode. + */ + RDMA_UAPI_PTR(void *, val_ptr); /* pointer to a flow_xxxx_filter */ + RDMA_UAPI_PTR(struct ib_uverbs_flow_action_esp_encap *, next_ptr); + __u16 len; /* Len of the filter struct val_ptr points to */ + __u16 type; /* Use flow_spec_type enum */ +}; + +struct ib_uverbs_flow_action_esp { + __u32 spi; + __u32 seq; + __u32 tfc_pad; + __u32 flags; + __aligned_u64 hard_limit_pkts; +}; + +enum ib_uverbs_read_counters_flags { + /* prefer read values from driver cache */ + IB_UVERBS_READ_COUNTERS_PREFER_CACHED = 1 << 0, +}; + +enum ib_uverbs_advise_mr_advice { + IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH, + IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE, +}; + +enum ib_uverbs_advise_mr_flag { + IB_UVERBS_ADVISE_MR_FLAG_FLUSH = 1 << 0, +}; + +struct ib_uverbs_query_port_resp_ex { + struct ib_uverbs_query_port_resp legacy_resp; + __u16 port_cap_flags2; + __u8 reserved[6]; +}; + +enum rdma_driver_id { + RDMA_DRIVER_UNKNOWN, + RDMA_DRIVER_MLX5, + RDMA_DRIVER_MLX4, + RDMA_DRIVER_CXGB3, + RDMA_DRIVER_CXGB4, + RDMA_DRIVER_MTHCA, + RDMA_DRIVER_BNXT_RE, + RDMA_DRIVER_OCRDMA, + RDMA_DRIVER_NES, + RDMA_DRIVER_I40IW, + RDMA_DRIVER_VMW_PVRDMA, + RDMA_DRIVER_QEDR, + RDMA_DRIVER_HNS, + RDMA_DRIVER_USNIC, + RDMA_DRIVER_RXE, + RDMA_DRIVER_HFI1, + RDMA_DRIVER_QIB, + RDMA_DRIVER_EFA, + RDMA_DRIVER_SIW, +}; + +#endif diff --git a/kernel-headers/rdma/ib_user_mad.h b/kernel-headers/rdma/ib_user_mad.h new file mode 100644 index 0000000..90c0cf2 --- /dev/null +++ b/kernel-headers/rdma/ib_user_mad.h @@ -0,0 +1,234 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_USER_MAD_H +#define IB_USER_MAD_H + +#include <linux/types.h> +#include <rdma/rdma_user_ioctl.h> + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. 
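The 0UL << n entries in the ESP flag enum above only name the default (cleared) state of each bit; composing a non-default configuration is an ordinary bit-OR. A sketch with locally mirrored values and a hypothetical combination:

#include <stdint.h>

/* Mirrored from the ESP flag enum above. */
#define ESP_FLAGS_FULL_OFFLOAD (1UL << 0)	/* cleared = inline crypto */
#define ESP_FLAGS_TRANSPORT (1UL << 1)		/* cleared = tunnel mode */
#define ESP_FLAGS_ENCRYPT (1UL << 2)		/* cleared = decrypt */

/* Hypothetical choice: full-offload, transport-mode encryption.
 * Tunnel mode, inline crypto and decrypt need no bits at all,
 * since they are the defaults. */
static uint32_t esp_tx_flags(void)
{
	return ESP_FLAGS_FULL_OFFLOAD | ESP_FLAGS_TRANSPORT |
	       ESP_FLAGS_ENCRYPT;
}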
+ */ +#define IB_USER_MAD_ABI_VERSION 5 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + */ + +/** + * ib_user_mad_hdr_old - Old version of MAD packet header without pkey_index + * @id - ID of agent MAD received with/to be sent with + * @status - 0 on successful receive, ETIMEDOUT if no response + * received (transaction ID in data[] will be set to TID of original + * request) (ignored on send) + * @timeout_ms - Milliseconds to wait for response (unset on receive) + * @retries - Number of automatic retries to attempt + * @qpn - Remote QP number received from/to be sent to + * @qkey - Remote Q_Key to be sent with (unset on receive) + * @lid - Remote lid received from/to be sent to + * @sl - Service level received with/to be sent with + * @path_bits - Local path bits received with/to be sent with + * @grh_present - If set, GRH was received/should be sent + * @gid_index - Local GID index to send with (unset on receive) + * @hop_limit - Hop limit in GRH + * @traffic_class - Traffic class in GRH + * @gid - Remote GID in GRH + * @flow_label - Flow label in GRH + */ +struct ib_user_mad_hdr_old { + __u32 id; + __u32 status; + __u32 timeout_ms; + __u32 retries; + __u32 length; + __be32 qpn; + __be32 qkey; + __be16 lid; + __u8 sl; + __u8 path_bits; + __u8 grh_present; + __u8 gid_index; + __u8 hop_limit; + __u8 traffic_class; + __u8 gid[16]; + __be32 flow_label; +}; + +/** + * ib_user_mad_hdr - MAD packet header + * This layout allows specifying/receiving the P_Key index. To use + * this capability, an application must call the + * IB_USER_MAD_ENABLE_PKEY ioctl on the user MAD file handle before + * any other actions with the file handle. + * @id - ID of agent MAD received with/to be sent with + * @status - 0 on successful receive, ETIMEDOUT if no response + * received (transaction ID in data[] will be set to TID of original + * request) (ignored on send) + * @timeout_ms - Milliseconds to wait for response (unset on receive) + * @retries - Number of automatic retries to attempt + * @qpn - Remote QP number received from/to be sent to + * @qkey - Remote Q_Key to be sent with (unset on receive) + * @lid - Remote lid received from/to be sent to + * @sl - Service level received with/to be sent with + * @path_bits - Local path bits received with/to be sent with + * @grh_present - If set, GRH was received/should be sent + * @gid_index - Local GID index to send with (unset on receive) + * @hop_limit - Hop limit in GRH + * @traffic_class - Traffic class in GRH + * @gid - Remote GID in GRH + * @flow_label - Flow label in GRH + * @pkey_index - P_Key index + */ +struct ib_user_mad_hdr { + __u32 id; + __u32 status; + __u32 timeout_ms; + __u32 retries; + __u32 length; + __be32 qpn; + __be32 qkey; + __be16 lid; + __u8 sl; + __u8 path_bits; + __u8 grh_present; + __u8 gid_index; + __u8 hop_limit; + __u8 traffic_class; + __u8 gid[16]; + __be32 flow_label; + __u16 pkey_index; + __u8 reserved[6]; +}; + +/** + * ib_user_mad - MAD packet + * @hdr - MAD packet header + * @data - Contents of MAD + * + */ +struct ib_user_mad { + struct ib_user_mad_hdr hdr; + __aligned_u64 data[0]; +}; + +/* + * Earlier versions of this interface definition declared the + * method_mask[] member as an array of __u32 but treated it as a + * bitmap made up of longs in the kernel. 
This ambiguity meant that + * 32-bit big-endian applications that can run on both 32-bit and + * 64-bit kernels had no consistent ABI to rely on, and 64-bit + * big-endian applications that treated method_mask as being made up + * of 32-bit words would have their bitmap misinterpreted. + * + * To clear up this confusion, we change the declaration of + * method_mask[] to use unsigned long and handle the conversion from + * 32-bit userspace to 64-bit kernel for big-endian systems in the + * compat_ioctl method. Unfortunately, to keep the structure layout + * the same, we need the method_mask[] array to be aligned only to 4 + * bytes even when long is 64 bits, which forces us into this ugly + * typedef. + */ +typedef unsigned long __attribute__((aligned(4))) packed_ulong; +#define IB_USER_MAD_LONGS_PER_METHOD_MASK (128 / (8 * sizeof (long))) + +/** + * ib_user_mad_reg_req - MAD registration request + * @id - Set by the kernel; used to identify the agent in future requests. + * @qpn - Queue pair number; must be 0 or 1. + * @method_mask - The caller will receive unsolicited MADs for any method + * where @method_mask = 1. + * @mgmt_class - Indicates which management class of MADs should be received + * by the caller. This field is only required if the user wishes to + * receive unsolicited MADs, otherwise it should be 0. + * @mgmt_class_version - Indicates which version of MADs for the given + * management class to receive. + * @oui - Indicates IEEE OUI when mgmt_class is a vendor class + * in the range from 0x30 to 0x4f. Otherwise not used. + * @rmpp_version - If set, indicates the RMPP version used. + * + */ +struct ib_user_mad_reg_req { + __u32 id; + packed_ulong method_mask[IB_USER_MAD_LONGS_PER_METHOD_MASK]; + __u8 qpn; + __u8 mgmt_class; + __u8 mgmt_class_version; + __u8 oui[3]; + __u8 rmpp_version; +}; + +/** + * ib_user_mad_reg_req2 - MAD registration request + * + * @id - Set by the _kernel_; used by userspace to identify the + * registered agent in future requests. + * @qpn - Queue pair number; must be 0 or 1. + * @mgmt_class - Indicates which management class of MADs should be + * received by the caller. This field is only required if + * the user wishes to receive unsolicited MADs, otherwise + * it should be 0. + * @mgmt_class_version - Indicates which version of MADs for the given + * management class to receive. + * @res - Ignored. + * @flags - additional registration flags; must be in the set of + * flags defined in IB_USER_MAD_REG_FLAGS_CAP + * @method_mask - The caller wishes to receive unsolicited MADs for the + * methods whose bits are set. + * @oui - Indicates IEEE OUI to use when mgmt_class is a vendor + * class in the range from 0x30 to 0x4f. Otherwise not + * used. + * @rmpp_version - If set, indicates the RMPP version to use. + */ +enum { + IB_USER_MAD_USER_RMPP = (1 << 0), +}; +#define IB_USER_MAD_REG_FLAGS_CAP (IB_USER_MAD_USER_RMPP) +struct ib_user_mad_reg_req2 { + __u32 id; + __u32 qpn; + __u8 mgmt_class; + __u8 mgmt_class_version; + __u16 res; + __u32 flags; + __aligned_u64 method_mask[2]; + __u32 oui; + __u8 rmpp_version; + __u8 reserved[3]; +}; + +#endif /* IB_USER_MAD_H */ diff --git a/kernel-headers/rdma/ib_user_sa.h b/kernel-headers/rdma/ib_user_sa.h new file mode 100644 index 0000000..435155d --- /dev/null +++ b/kernel-headers/rdma/ib_user_sa.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* + * Copyright (c) 2005 Intel Corporation. All rights reserved.
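A sketch of driving the method_mask bitmap described in ib_user_mad.h above; set_method_bit is a hypothetical helper, with the typedef and array sizing mirrored from the header. The array covers the same 128 method bits whether a long is 32 or 64 bits wide:

/* Mirrored from ib_user_mad.h above. */
typedef unsigned long __attribute__((aligned(4))) packed_ulong;
#define IB_USER_MAD_LONGS_PER_METHOD_MASK (128 / (8 * sizeof(long)))

/* Set one of the 128 method bits (method must be < 128). */
static void set_method_bit(packed_ulong *mask, unsigned int method)
{
	mask[method / (8 * sizeof(long))] |=
		1UL << (method % (8 * sizeof(long)));
}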
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_USER_SA_H +#define IB_USER_SA_H + +#include <linux/types.h> + +enum { + IB_PATH_GMP = 1, + IB_PATH_PRIMARY = (1<<1), + IB_PATH_ALTERNATE = (1<<2), + IB_PATH_OUTBOUND = (1<<3), + IB_PATH_INBOUND = (1<<4), + IB_PATH_INBOUND_REVERSE = (1<<5), + IB_PATH_BIDIRECTIONAL = IB_PATH_OUTBOUND | IB_PATH_INBOUND_REVERSE +}; + +struct ib_path_rec_data { + __u32 flags; + __u32 reserved; + __u32 path_rec[16]; +}; + +struct ib_user_path_rec { + __u8 dgid[16]; + __u8 sgid[16]; + __be16 dlid; + __be16 slid; + __u32 raw_traffic; + __be32 flow_label; + __u32 reversible; + __u32 mtu; + __be16 pkey; + __u8 hop_limit; + __u8 traffic_class; + __u8 numb_path; + __u8 sl; + __u8 mtu_selector; + __u8 rate_selector; + __u8 rate; + __u8 packet_life_time_selector; + __u8 packet_life_time; + __u8 preference; +}; + +#endif /* IB_USER_SA_H */ diff --git a/kernel-headers/rdma/ib_user_verbs.h b/kernel-headers/rdma/ib_user_verbs.h new file mode 100644 index 0000000..0474c74 --- /dev/null +++ b/kernel-headers/rdma/ib_user_verbs.h @@ -0,0 +1,1304 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * Copyright (c) 2006 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_USER_VERBS_H +#define IB_USER_VERBS_H + +#include <linux/types.h> + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define IB_USER_VERBS_ABI_VERSION 6 +#define IB_USER_VERBS_CMD_THRESHOLD 50 + +enum ib_uverbs_write_cmds { + IB_USER_VERBS_CMD_GET_CONTEXT, + IB_USER_VERBS_CMD_QUERY_DEVICE, + IB_USER_VERBS_CMD_QUERY_PORT, + IB_USER_VERBS_CMD_ALLOC_PD, + IB_USER_VERBS_CMD_DEALLOC_PD, + IB_USER_VERBS_CMD_CREATE_AH, + IB_USER_VERBS_CMD_MODIFY_AH, + IB_USER_VERBS_CMD_QUERY_AH, + IB_USER_VERBS_CMD_DESTROY_AH, + IB_USER_VERBS_CMD_REG_MR, + IB_USER_VERBS_CMD_REG_SMR, + IB_USER_VERBS_CMD_REREG_MR, + IB_USER_VERBS_CMD_QUERY_MR, + IB_USER_VERBS_CMD_DEREG_MR, + IB_USER_VERBS_CMD_ALLOC_MW, + IB_USER_VERBS_CMD_BIND_MW, + IB_USER_VERBS_CMD_DEALLOC_MW, + IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL, + IB_USER_VERBS_CMD_CREATE_CQ, + IB_USER_VERBS_CMD_RESIZE_CQ, + IB_USER_VERBS_CMD_DESTROY_CQ, + IB_USER_VERBS_CMD_POLL_CQ, + IB_USER_VERBS_CMD_PEEK_CQ, + IB_USER_VERBS_CMD_REQ_NOTIFY_CQ, + IB_USER_VERBS_CMD_CREATE_QP, + IB_USER_VERBS_CMD_QUERY_QP, + IB_USER_VERBS_CMD_MODIFY_QP, + IB_USER_VERBS_CMD_DESTROY_QP, + IB_USER_VERBS_CMD_POST_SEND, + IB_USER_VERBS_CMD_POST_RECV, + IB_USER_VERBS_CMD_ATTACH_MCAST, + IB_USER_VERBS_CMD_DETACH_MCAST, + IB_USER_VERBS_CMD_CREATE_SRQ, + IB_USER_VERBS_CMD_MODIFY_SRQ, + IB_USER_VERBS_CMD_QUERY_SRQ, + IB_USER_VERBS_CMD_DESTROY_SRQ, + IB_USER_VERBS_CMD_POST_SRQ_RECV, + IB_USER_VERBS_CMD_OPEN_XRCD, + IB_USER_VERBS_CMD_CLOSE_XRCD, + IB_USER_VERBS_CMD_CREATE_XSRQ, + IB_USER_VERBS_CMD_OPEN_QP, +}; + +enum { + IB_USER_VERBS_EX_CMD_QUERY_DEVICE = IB_USER_VERBS_CMD_QUERY_DEVICE, + IB_USER_VERBS_EX_CMD_CREATE_CQ = IB_USER_VERBS_CMD_CREATE_CQ, + IB_USER_VERBS_EX_CMD_CREATE_QP = IB_USER_VERBS_CMD_CREATE_QP, + IB_USER_VERBS_EX_CMD_MODIFY_QP = IB_USER_VERBS_CMD_MODIFY_QP, + IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD, + IB_USER_VERBS_EX_CMD_DESTROY_FLOW, + IB_USER_VERBS_EX_CMD_CREATE_WQ, + IB_USER_VERBS_EX_CMD_MODIFY_WQ, + IB_USER_VERBS_EX_CMD_DESTROY_WQ, + IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL, + IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL, + IB_USER_VERBS_EX_CMD_MODIFY_CQ +}; + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * Specifically: + * - Do not use pointer types -- pass pointers in __u64 instead. + * - Make sure that any structure larger than 4 bytes is padded to a + * multiple of 8 bytes. Otherwise the structure size will be + * different between 32-bit and 64-bit architectures. 
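+ */

The layout rule above can be checked at build time. A sketch assuming a C11 compiler, with a local mirror of the ib_uverbs_cmd_hdr defined just below in this header:

#include <assert.h>	/* C11 static_assert */

struct cmd_hdr_check {	/* mirrors ib_uverbs_cmd_hdr, defined just below */
	unsigned int command;
	unsigned short in_words;
	unsigned short out_words;
};

/* 4 + 2 + 2 bytes: already a multiple of 8, so the size is the same
 * for 32-bit and 64-bit builds, as the rule above requires. */
static_assert(sizeof(struct cmd_hdr_check) == 8,
	      "command header must pack identically everywhere");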
+ +struct ib_uverbs_async_event_desc { + __aligned_u64 element; + __u32 event_type; /* enum ib_event_type */ + __u32 reserved; +}; + +struct ib_uverbs_comp_event_desc { + __aligned_u64 cq_handle; +}; + +struct ib_uverbs_cq_moderation_caps { + __u16 max_cq_moderation_count; + __u16 max_cq_moderation_period; + __u32 reserved; +}; + +/* + * All commands from userspace should start with a __u32 command field + * followed by __u16 in_words and out_words fields (which give the + * length of the command block and response buffer if any in 32-bit + * words). The kernel driver will read these fields first and read + * the rest of the command struct based on these values. + */ + +#define IB_USER_VERBS_CMD_COMMAND_MASK 0xff +#define IB_USER_VERBS_CMD_FLAG_EXTENDED 0x80000000u + +struct ib_uverbs_cmd_hdr { + __u32 command; + __u16 in_words; + __u16 out_words; +}; + +struct ib_uverbs_ex_cmd_hdr { + __aligned_u64 response; + __u16 provider_in_words; + __u16 provider_out_words; + __u32 cmd_hdr_reserved; +}; + +struct ib_uverbs_get_context { + __aligned_u64 response; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_get_context_resp { + __u32 async_fd; + __u32 num_comp_vectors; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_query_device { + __aligned_u64 response; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_query_device_resp { + __aligned_u64 fw_ver; + __be64 node_guid; + __be64 sys_image_guid; + __aligned_u64 max_mr_size; + __aligned_u64 page_size_cap; + __u32 vendor_id; + __u32 vendor_part_id; + __u32 hw_ver; + __u32 max_qp; + __u32 max_qp_wr; + __u32 device_cap_flags; + __u32 max_sge; + __u32 max_sge_rd; + __u32 max_cq; + __u32 max_cqe; + __u32 max_mr; + __u32 max_pd; + __u32 max_qp_rd_atom; + __u32 max_ee_rd_atom; + __u32 max_res_rd_atom; + __u32 max_qp_init_rd_atom; + __u32 max_ee_init_rd_atom; + __u32 atomic_cap; + __u32 max_ee; + __u32 max_rdd; + __u32 max_mw; + __u32 max_raw_ipv6_qp; + __u32 max_raw_ethy_qp; + __u32 max_mcast_grp; + __u32 max_mcast_qp_attach; + __u32 max_total_mcast_qp_attach; + __u32 max_ah; + __u32 max_fmr; + __u32 max_map_per_fmr; + __u32 max_srq; + __u32 max_srq_wr; + __u32 max_srq_sge; + __u16 max_pkeys; + __u8 local_ca_ack_delay; + __u8 phys_port_cnt; + __u8 reserved[4]; +}; + +struct ib_uverbs_ex_query_device { + __u32 comp_mask; + __u32 reserved; +}; + +struct ib_uverbs_odp_caps { + __aligned_u64 general_caps; + struct { + __u32 rc_odp_caps; + __u32 uc_odp_caps; + __u32 ud_odp_caps; + } per_transport_caps; + __u32 reserved; +}; + +struct ib_uverbs_rss_caps { + /* Corresponding bit will be set if qp type from + * 'enum ib_qp_type' is supported, e.g.
+ * supported_qpts |= 1 << IB_QPT_UD + */ + __u32 supported_qpts; + __u32 max_rwq_indirection_tables; + __u32 max_rwq_indirection_table_size; + __u32 reserved; +}; + +struct ib_uverbs_tm_caps { + /* Max size of rendezvous request message */ + __u32 max_rndv_hdr_size; + /* Max number of entries in tag matching list */ + __u32 max_num_tags; + /* TM flags */ + __u32 flags; + /* Max number of outstanding list operations */ + __u32 max_ops; + /* Max number of SGE in tag matching entry */ + __u32 max_sge; + __u32 reserved; +}; + +struct ib_uverbs_ex_query_device_resp { + struct ib_uverbs_query_device_resp base; + __u32 comp_mask; + __u32 response_length; + struct ib_uverbs_odp_caps odp_caps; + __aligned_u64 timestamp_mask; + __aligned_u64 hca_core_clock; /* in KHZ */ + __aligned_u64 device_cap_flags_ex; + struct ib_uverbs_rss_caps rss_caps; + __u32 max_wq_type_rq; + __u32 raw_packet_caps; + struct ib_uverbs_tm_caps tm_caps; + struct ib_uverbs_cq_moderation_caps cq_moderation_caps; + __aligned_u64 max_dm_size; + __u32 xrc_odp_caps; + __u32 reserved; +}; + +struct ib_uverbs_query_port { + __aligned_u64 response; + __u8 port_num; + __u8 reserved[7]; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_query_port_resp { + __u32 port_cap_flags; /* see ib_uverbs_query_port_cap_flags */ + __u32 max_msg_sz; + __u32 bad_pkey_cntr; + __u32 qkey_viol_cntr; + __u32 gid_tbl_len; + __u16 pkey_tbl_len; + __u16 lid; + __u16 sm_lid; + __u8 state; + __u8 max_mtu; + __u8 active_mtu; + __u8 lmc; + __u8 max_vl_num; + __u8 sm_sl; + __u8 subnet_timeout; + __u8 init_type_reply; + __u8 active_width; + __u8 active_speed; + __u8 phys_state; + __u8 link_layer; + __u8 flags; /* see ib_uverbs_query_port_flags */ + __u8 reserved; +}; + +struct ib_uverbs_alloc_pd { + __aligned_u64 response; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_alloc_pd_resp { + __u32 pd_handle; + __u32 driver_data[0]; +}; + +struct ib_uverbs_dealloc_pd { + __u32 pd_handle; +}; + +struct ib_uverbs_open_xrcd { + __aligned_u64 response; + __u32 fd; + __u32 oflags; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_open_xrcd_resp { + __u32 xrcd_handle; + __u32 driver_data[0]; +}; + +struct ib_uverbs_close_xrcd { + __u32 xrcd_handle; +}; + +struct ib_uverbs_reg_mr { + __aligned_u64 response; + __aligned_u64 start; + __aligned_u64 length; + __aligned_u64 hca_va; + __u32 pd_handle; + __u32 access_flags; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_reg_mr_resp { + __u32 mr_handle; + __u32 lkey; + __u32 rkey; + __u32 driver_data[0]; +}; + +struct ib_uverbs_rereg_mr { + __aligned_u64 response; + __u32 mr_handle; + __u32 flags; + __aligned_u64 start; + __aligned_u64 length; + __aligned_u64 hca_va; + __u32 pd_handle; + __u32 access_flags; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_rereg_mr_resp { + __u32 lkey; + __u32 rkey; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_dereg_mr { + __u32 mr_handle; +}; + +struct ib_uverbs_alloc_mw { + __aligned_u64 response; + __u32 pd_handle; + __u8 mw_type; + __u8 reserved[3]; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_alloc_mw_resp { + __u32 mw_handle; + __u32 rkey; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_dealloc_mw { + __u32 mw_handle; +}; + +struct ib_uverbs_create_comp_channel { + __aligned_u64 response; +}; + +struct ib_uverbs_create_comp_channel_resp { + __u32 fd; +}; + +struct ib_uverbs_create_cq { + __aligned_u64 response; + __aligned_u64 user_handle; + __u32 cqe; + __u32 comp_vector; + __s32 comp_channel; + __u32 reserved; + __aligned_u64 
driver_data[0]; +}; + +enum ib_uverbs_ex_create_cq_flags { + IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION = 1 << 0, + IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN = 1 << 1, +}; + +struct ib_uverbs_ex_create_cq { + __aligned_u64 user_handle; + __u32 cqe; + __u32 comp_vector; + __s32 comp_channel; + __u32 comp_mask; + __u32 flags; /* bitmask of ib_uverbs_ex_create_cq_flags */ + __u32 reserved; +}; + +struct ib_uverbs_create_cq_resp { + __u32 cq_handle; + __u32 cqe; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_ex_create_cq_resp { + struct ib_uverbs_create_cq_resp base; + __u32 comp_mask; + __u32 response_length; +}; + +struct ib_uverbs_resize_cq { + __aligned_u64 response; + __u32 cq_handle; + __u32 cqe; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_resize_cq_resp { + __u32 cqe; + __u32 reserved; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_poll_cq { + __aligned_u64 response; + __u32 cq_handle; + __u32 ne; +}; + +struct ib_uverbs_wc { + __aligned_u64 wr_id; + __u32 status; + __u32 opcode; + __u32 vendor_err; + __u32 byte_len; + union { + __be32 imm_data; + __u32 invalidate_rkey; + } ex; + __u32 qp_num; + __u32 src_qp; + __u32 wc_flags; + __u16 pkey_index; + __u16 slid; + __u8 sl; + __u8 dlid_path_bits; + __u8 port_num; + __u8 reserved; +}; + +struct ib_uverbs_poll_cq_resp { + __u32 count; + __u32 reserved; + struct ib_uverbs_wc wc[0]; +}; + +struct ib_uverbs_req_notify_cq { + __u32 cq_handle; + __u32 solicited_only; +}; + +struct ib_uverbs_destroy_cq { + __aligned_u64 response; + __u32 cq_handle; + __u32 reserved; +}; + +struct ib_uverbs_destroy_cq_resp { + __u32 comp_events_reported; + __u32 async_events_reported; +}; + +struct ib_uverbs_global_route { + __u8 dgid[16]; + __u32 flow_label; + __u8 sgid_index; + __u8 hop_limit; + __u8 traffic_class; + __u8 reserved; +}; + +struct ib_uverbs_ah_attr { + struct ib_uverbs_global_route grh; + __u16 dlid; + __u8 sl; + __u8 src_path_bits; + __u8 static_rate; + __u8 is_global; + __u8 port_num; + __u8 reserved; +}; + +struct ib_uverbs_qp_attr { + __u32 qp_attr_mask; + __u32 qp_state; + __u32 cur_qp_state; + __u32 path_mtu; + __u32 path_mig_state; + __u32 qkey; + __u32 rq_psn; + __u32 sq_psn; + __u32 dest_qp_num; + __u32 qp_access_flags; + + struct ib_uverbs_ah_attr ah_attr; + struct ib_uverbs_ah_attr alt_ah_attr; + + /* ib_qp_cap */ + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + + __u16 pkey_index; + __u16 alt_pkey_index; + __u8 en_sqd_async_notify; + __u8 sq_draining; + __u8 max_rd_atomic; + __u8 max_dest_rd_atomic; + __u8 min_rnr_timer; + __u8 port_num; + __u8 timeout; + __u8 retry_cnt; + __u8 rnr_retry; + __u8 alt_port_num; + __u8 alt_timeout; + __u8 reserved[5]; +}; + +struct ib_uverbs_create_qp { + __aligned_u64 response; + __aligned_u64 user_handle; + __u32 pd_handle; + __u32 send_cq_handle; + __u32 recv_cq_handle; + __u32 srq_handle; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + __u8 sq_sig_all; + __u8 qp_type; + __u8 is_srq; + __u8 reserved; + __aligned_u64 driver_data[0]; +}; + +enum ib_uverbs_create_qp_mask { + IB_UVERBS_CREATE_QP_MASK_IND_TABLE = 1UL << 0, +}; + +enum { + IB_UVERBS_CREATE_QP_SUP_COMP_MASK = IB_UVERBS_CREATE_QP_MASK_IND_TABLE, +}; + +enum { + /* + * This value is equal to IB_QP_DEST_QPN. + */ + IB_USER_LEGACY_LAST_QP_ATTR_MASK = 1ULL << 20, +}; + +enum { + /* + * This value is equal to IB_QP_RATE_LIMIT. 
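A sketch of how user code might gate on the last attribute bit this ABI definition knows about, mirroring the IB_USER_LAST_QP_ATTR_MASK value defined just below; the helper name is hypothetical:

#include <stdint.h>

#define IB_USER_LAST_QP_ATTR_MASK (1ULL << 25)	/* mirrored from below */

/* Reject attribute bits beyond the last known one, rather than
 * passing unknown bits through to the kernel. */
static int qp_attr_mask_known(uint64_t attr_mask)
{
	return (attr_mask & ~((IB_USER_LAST_QP_ATTR_MASK << 1) - 1)) == 0;
}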
+ */ + IB_USER_LAST_QP_ATTR_MASK = 1ULL << 25, +}; + +struct ib_uverbs_ex_create_qp { + __aligned_u64 user_handle; + __u32 pd_handle; + __u32 send_cq_handle; + __u32 recv_cq_handle; + __u32 srq_handle; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + __u8 sq_sig_all; + __u8 qp_type; + __u8 is_srq; + __u8 reserved; + __u32 comp_mask; + __u32 create_flags; + __u32 rwq_ind_tbl_handle; + __u32 source_qpn; +}; + +struct ib_uverbs_open_qp { + __aligned_u64 response; + __aligned_u64 user_handle; + __u32 pd_handle; + __u32 qpn; + __u8 qp_type; + __u8 reserved[7]; + __aligned_u64 driver_data[0]; +}; + +/* also used for open response */ +struct ib_uverbs_create_qp_resp { + __u32 qp_handle; + __u32 qpn; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + __u32 reserved; + __u32 driver_data[0]; +}; + +struct ib_uverbs_ex_create_qp_resp { + struct ib_uverbs_create_qp_resp base; + __u32 comp_mask; + __u32 response_length; +}; + +/* + * This struct needs to remain a multiple of 8 bytes to keep the + * alignment of the modify QP parameters. + */ +struct ib_uverbs_qp_dest { + __u8 dgid[16]; + __u32 flow_label; + __u16 dlid; + __u16 reserved; + __u8 sgid_index; + __u8 hop_limit; + __u8 traffic_class; + __u8 sl; + __u8 src_path_bits; + __u8 static_rate; + __u8 is_global; + __u8 port_num; +}; + +struct ib_uverbs_query_qp { + __aligned_u64 response; + __u32 qp_handle; + __u32 attr_mask; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_query_qp_resp { + struct ib_uverbs_qp_dest dest; + struct ib_uverbs_qp_dest alt_dest; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + __u32 qkey; + __u32 rq_psn; + __u32 sq_psn; + __u32 dest_qp_num; + __u32 qp_access_flags; + __u16 pkey_index; + __u16 alt_pkey_index; + __u8 qp_state; + __u8 cur_qp_state; + __u8 path_mtu; + __u8 path_mig_state; + __u8 sq_draining; + __u8 max_rd_atomic; + __u8 max_dest_rd_atomic; + __u8 min_rnr_timer; + __u8 port_num; + __u8 timeout; + __u8 retry_cnt; + __u8 rnr_retry; + __u8 alt_port_num; + __u8 alt_timeout; + __u8 sq_sig_all; + __u8 reserved[5]; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_modify_qp { + struct ib_uverbs_qp_dest dest; + struct ib_uverbs_qp_dest alt_dest; + __u32 qp_handle; + __u32 attr_mask; + __u32 qkey; + __u32 rq_psn; + __u32 sq_psn; + __u32 dest_qp_num; + __u32 qp_access_flags; + __u16 pkey_index; + __u16 alt_pkey_index; + __u8 qp_state; + __u8 cur_qp_state; + __u8 path_mtu; + __u8 path_mig_state; + __u8 en_sqd_async_notify; + __u8 max_rd_atomic; + __u8 max_dest_rd_atomic; + __u8 min_rnr_timer; + __u8 port_num; + __u8 timeout; + __u8 retry_cnt; + __u8 rnr_retry; + __u8 alt_port_num; + __u8 alt_timeout; + __u8 reserved[2]; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_ex_modify_qp { + struct ib_uverbs_modify_qp base; + __u32 rate_limit; + __u32 reserved; +}; + +struct ib_uverbs_ex_modify_qp_resp { + __u32 comp_mask; + __u32 response_length; +}; + +struct ib_uverbs_destroy_qp { + __aligned_u64 response; + __u32 qp_handle; + __u32 reserved; +}; + +struct ib_uverbs_destroy_qp_resp { + __u32 events_reported; +}; + +/* + * The ib_uverbs_sge structure isn't used anywhere, since we assume + * the ib_sge structure is packed the same way on 32-bit and 64-bit + * architectures in both kernel and user space. It's just here to + * document the ABI. 
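A sketch (C11) that makes the stated assumption checkable, using a local mirror of the ib_uverbs_sge defined just below: one scatter/gather entry must occupy the same 16 bytes, with no hidden padding, in every build.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct sge_check {	/* mirrors ib_uverbs_sge, defined just below */
	uint64_t addr;
	uint32_t length;
	uint32_t lkey;
};

static_assert(sizeof(struct sge_check) == 16, "SGE ABI size");
static_assert(offsetof(struct sge_check, lkey) == 12, "SGE ABI layout");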
+ */ +struct ib_uverbs_sge { + __aligned_u64 addr; + __u32 length; + __u32 lkey; +}; + +enum ib_uverbs_wr_opcode { + IB_UVERBS_WR_RDMA_WRITE = 0, + IB_UVERBS_WR_RDMA_WRITE_WITH_IMM = 1, + IB_UVERBS_WR_SEND = 2, + IB_UVERBS_WR_SEND_WITH_IMM = 3, + IB_UVERBS_WR_RDMA_READ = 4, + IB_UVERBS_WR_ATOMIC_CMP_AND_SWP = 5, + IB_UVERBS_WR_ATOMIC_FETCH_AND_ADD = 6, + IB_UVERBS_WR_LOCAL_INV = 7, + IB_UVERBS_WR_BIND_MW = 8, + IB_UVERBS_WR_SEND_WITH_INV = 9, + IB_UVERBS_WR_TSO = 10, + IB_UVERBS_WR_RDMA_READ_WITH_INV = 11, + IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP = 12, + IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD = 13, + /* Review enum ib_wr_opcode before modifying this */ +}; + +struct ib_uverbs_send_wr { + __aligned_u64 wr_id; + __u32 num_sge; + __u32 opcode; /* see enum ib_uverbs_wr_opcode */ + __u32 send_flags; + union { + __be32 imm_data; + __u32 invalidate_rkey; + } ex; + union { + struct { + __aligned_u64 remote_addr; + __u32 rkey; + __u32 reserved; + } rdma; + struct { + __aligned_u64 remote_addr; + __aligned_u64 compare_add; + __aligned_u64 swap; + __u32 rkey; + __u32 reserved; + } atomic; + struct { + __u32 ah; + __u32 remote_qpn; + __u32 remote_qkey; + __u32 reserved; + } ud; + } wr; +}; + +struct ib_uverbs_post_send { + __aligned_u64 response; + __u32 qp_handle; + __u32 wr_count; + __u32 sge_count; + __u32 wqe_size; + struct ib_uverbs_send_wr send_wr[0]; +}; + +struct ib_uverbs_post_send_resp { + __u32 bad_wr; +}; + +struct ib_uverbs_recv_wr { + __aligned_u64 wr_id; + __u32 num_sge; + __u32 reserved; +}; + +struct ib_uverbs_post_recv { + __aligned_u64 response; + __u32 qp_handle; + __u32 wr_count; + __u32 sge_count; + __u32 wqe_size; + struct ib_uverbs_recv_wr recv_wr[0]; +}; + +struct ib_uverbs_post_recv_resp { + __u32 bad_wr; +}; + +struct ib_uverbs_post_srq_recv { + __aligned_u64 response; + __u32 srq_handle; + __u32 wr_count; + __u32 sge_count; + __u32 wqe_size; + struct ib_uverbs_recv_wr recv[0]; +}; + +struct ib_uverbs_post_srq_recv_resp { + __u32 bad_wr; +}; + +struct ib_uverbs_create_ah { + __aligned_u64 response; + __aligned_u64 user_handle; + __u32 pd_handle; + __u32 reserved; + struct ib_uverbs_ah_attr attr; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_create_ah_resp { + __u32 ah_handle; + __u32 driver_data[0]; +}; + +struct ib_uverbs_destroy_ah { + __u32 ah_handle; +}; + +struct ib_uverbs_attach_mcast { + __u8 gid[16]; + __u32 qp_handle; + __u16 mlid; + __u16 reserved; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_detach_mcast { + __u8 gid[16]; + __u32 qp_handle; + __u16 mlid; + __u16 reserved; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_flow_spec_hdr { + __u32 type; + __u16 size; + __u16 reserved; + /* followed by flow_spec */ + __aligned_u64 flow_spec_data[0]; +}; + +struct ib_uverbs_flow_eth_filter { + __u8 dst_mac[6]; + __u8 src_mac[6]; + __be16 ether_type; + __be16 vlan_tag; +}; + +struct ib_uverbs_flow_spec_eth { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_eth_filter val; + struct ib_uverbs_flow_eth_filter mask; +}; + +struct ib_uverbs_flow_ipv4_filter { + __be32 src_ip; + __be32 dst_ip; + __u8 proto; + __u8 tos; + __u8 ttl; + __u8 flags; +}; + +struct ib_uverbs_flow_spec_ipv4 { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_ipv4_filter val; + struct ib_uverbs_flow_ipv4_filter mask; +}; + +struct ib_uverbs_flow_tcp_udp_filter { + __be16 dst_port; + __be16 
src_port; +}; + +struct ib_uverbs_flow_spec_tcp_udp { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_tcp_udp_filter val; + struct ib_uverbs_flow_tcp_udp_filter mask; +}; + +struct ib_uverbs_flow_ipv6_filter { + __u8 src_ip[16]; + __u8 dst_ip[16]; + __be32 flow_label; + __u8 next_hdr; + __u8 traffic_class; + __u8 hop_limit; + __u8 reserved; +}; + +struct ib_uverbs_flow_spec_ipv6 { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_ipv6_filter val; + struct ib_uverbs_flow_ipv6_filter mask; +}; + +struct ib_uverbs_flow_spec_action_tag { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + __u32 tag_id; + __u32 reserved1; +}; + +struct ib_uverbs_flow_spec_action_drop { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; +}; + +struct ib_uverbs_flow_spec_action_handle { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + __u32 handle; + __u32 reserved1; +}; + +struct ib_uverbs_flow_spec_action_count { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + __u32 handle; + __u32 reserved1; +}; + +struct ib_uverbs_flow_tunnel_filter { + __be32 tunnel_id; +}; + +struct ib_uverbs_flow_spec_tunnel { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_tunnel_filter val; + struct ib_uverbs_flow_tunnel_filter mask; +}; + +struct ib_uverbs_flow_spec_esp_filter { + __u32 spi; + __u32 seq; +}; + +struct ib_uverbs_flow_spec_esp { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_spec_esp_filter val; + struct ib_uverbs_flow_spec_esp_filter mask; +}; + +struct ib_uverbs_flow_gre_filter { + /* c_ks_res0_ver field is bits 0-15 in offset 0 of a standard GRE header: + * bit 0 - C - checksum bit. + * bit 1 - reserved. set to 0. + * bit 2 - key bit. + * bit 3 - sequence number bit. + * bits 4:12 - reserved. set to 0. + * bits 13:15 - GRE version. + */ + __be16 c_ks_res0_ver; + __be16 protocol; + __be32 key; +}; + +struct ib_uverbs_flow_spec_gre { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_gre_filter val; + struct ib_uverbs_flow_gre_filter mask; +}; + +struct ib_uverbs_flow_mpls_filter { + /* The field includes the entire MPLS label: + * bits 0:19 - label field. + * bits 20:22 - traffic class field. + * bits 23 - bottom of stack bit. + * bits 24:31 - ttl field. 
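+ *
+ * As an illustration only (not part of this header), the fields can be
+ * recovered after byte-swapping, reading bit 0 as the most-significant
+ * (network-order) bit:
+ *
+ *	__u32 v = ntohl(label);
+ *	lbl = v >> 12; tc = (v >> 9) & 0x7; bos = (v >> 8) & 1; ttl = v & 0xff;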
+ */ + __be32 label; +}; + +struct ib_uverbs_flow_spec_mpls { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_mpls_filter val; + struct ib_uverbs_flow_mpls_filter mask; +}; + +struct ib_uverbs_flow_attr { + __u32 type; + __u16 size; + __u16 priority; + __u8 num_of_specs; + __u8 reserved[2]; + __u8 port; + __u32 flags; + /* Following are the optional layers according to user request + * struct ib_flow_spec_xxx + * struct ib_flow_spec_yyy + */ + struct ib_uverbs_flow_spec_hdr flow_specs[0]; +}; + +struct ib_uverbs_create_flow { + __u32 comp_mask; + __u32 qp_handle; + struct ib_uverbs_flow_attr flow_attr; +}; + +struct ib_uverbs_create_flow_resp { + __u32 comp_mask; + __u32 flow_handle; +}; + +struct ib_uverbs_destroy_flow { + __u32 comp_mask; + __u32 flow_handle; +}; + +struct ib_uverbs_create_srq { + __aligned_u64 response; + __aligned_u64 user_handle; + __u32 pd_handle; + __u32 max_wr; + __u32 max_sge; + __u32 srq_limit; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_create_xsrq { + __aligned_u64 response; + __aligned_u64 user_handle; + __u32 srq_type; + __u32 pd_handle; + __u32 max_wr; + __u32 max_sge; + __u32 srq_limit; + __u32 max_num_tags; + __u32 xrcd_handle; + __u32 cq_handle; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_create_srq_resp { + __u32 srq_handle; + __u32 max_wr; + __u32 max_sge; + __u32 srqn; + __u32 driver_data[0]; +}; + +struct ib_uverbs_modify_srq { + __u32 srq_handle; + __u32 attr_mask; + __u32 max_wr; + __u32 srq_limit; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_query_srq { + __aligned_u64 response; + __u32 srq_handle; + __u32 reserved; + __aligned_u64 driver_data[0]; +}; + +struct ib_uverbs_query_srq_resp { + __u32 max_wr; + __u32 max_sge; + __u32 srq_limit; + __u32 reserved; +}; + +struct ib_uverbs_destroy_srq { + __aligned_u64 response; + __u32 srq_handle; + __u32 reserved; +}; + +struct ib_uverbs_destroy_srq_resp { + __u32 events_reported; +}; + +struct ib_uverbs_ex_create_wq { + __u32 comp_mask; + __u32 wq_type; + __aligned_u64 user_handle; + __u32 pd_handle; + __u32 cq_handle; + __u32 max_wr; + __u32 max_sge; + __u32 create_flags; /* Use enum ib_wq_flags */ + __u32 reserved; +}; + +struct ib_uverbs_ex_create_wq_resp { + __u32 comp_mask; + __u32 response_length; + __u32 wq_handle; + __u32 max_wr; + __u32 max_sge; + __u32 wqn; +}; + +struct ib_uverbs_ex_destroy_wq { + __u32 comp_mask; + __u32 wq_handle; +}; + +struct ib_uverbs_ex_destroy_wq_resp { + __u32 comp_mask; + __u32 response_length; + __u32 events_reported; + __u32 reserved; +}; + +struct ib_uverbs_ex_modify_wq { + __u32 attr_mask; + __u32 wq_handle; + __u32 wq_state; + __u32 curr_wq_state; + __u32 flags; /* Use enum ib_wq_flags */ + __u32 flags_mask; /* Use enum ib_wq_flags */ +}; + +/* Prevent memory allocation rather than max expected size */ +#define IB_USER_VERBS_MAX_LOG_IND_TBL_SIZE 0x0d +struct ib_uverbs_ex_create_rwq_ind_table { + __u32 comp_mask; + __u32 log_ind_tbl_size; + /* Following are the wq handles according to log_ind_tbl_size + * wq_handle1 + * wq_handle2 + */ + __u32 wq_handles[0]; +}; + +struct ib_uverbs_ex_create_rwq_ind_table_resp { + __u32 comp_mask; + __u32 response_length; + __u32 ind_tbl_handle; + __u32 ind_tbl_num; +}; + +struct ib_uverbs_ex_destroy_rwq_ind_table { + __u32 comp_mask; + __u32 ind_tbl_handle; +}; + +struct ib_uverbs_cq_moderation { + __u16 cq_count; + __u16 cq_period; +}; + +struct ib_uverbs_ex_modify_cq { + __u32 cq_handle; + __u32 
attr_mask; + struct ib_uverbs_cq_moderation attr; + __u32 reserved; +}; + +#define IB_DEVICE_NAME_MAX 64 + +#endif /* IB_USER_VERBS_H */ diff --git a/kernel-headers/rdma/mlx4-abi.h b/kernel-headers/rdma/mlx4-abi.h new file mode 100644 index 0000000..f745575 --- /dev/null +++ b/kernel-headers/rdma/mlx4-abi.h @@ -0,0 +1,191 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_ABI_USER_H +#define MLX4_ABI_USER_H + +#include <linux/types.h> + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ + +#define MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION 3 +#define MLX4_IB_UVERBS_ABI_VERSION 4 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. 
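+ *
+ * For example (illustrative only, not part of this header), a userspace
+ * buffer address is passed as:
+ *
+ *	cmd.buf_addr = (__u64)(uintptr_t)buf;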
+ */
+
+struct mlx4_ib_alloc_ucontext_resp_v3 {
+	__u32	qp_tab_size;
+	__u16	bf_reg_size;
+	__u16	bf_regs_per_page;
+};
+
+enum {
+	MLX4_USER_DEV_CAP_LARGE_CQE	= 1L << 0,
+};
+
+struct mlx4_ib_alloc_ucontext_resp {
+	__u32	dev_caps;
+	__u32	qp_tab_size;
+	__u16	bf_reg_size;
+	__u16	bf_regs_per_page;
+	__u32	cqe_size;
+};
+
+struct mlx4_ib_alloc_pd_resp {
+	__u32	pdn;
+	__u32	reserved;
+};
+
+struct mlx4_ib_create_cq {
+	__aligned_u64 buf_addr;
+	__aligned_u64 db_addr;
+};
+
+struct mlx4_ib_create_cq_resp {
+	__u32	cqn;
+	__u32	reserved;
+};
+
+struct mlx4_ib_resize_cq {
+	__aligned_u64 buf_addr;
+};
+
+struct mlx4_ib_create_srq {
+	__aligned_u64 buf_addr;
+	__aligned_u64 db_addr;
+};
+
+struct mlx4_ib_create_srq_resp {
+	__u32	srqn;
+	__u32	reserved;
+};
+
+struct mlx4_ib_create_qp_rss {
+	__aligned_u64 rx_hash_fields_mask; /* Use enum mlx4_ib_rx_hash_fields */
+	__u8 rx_hash_function; /* Use enum mlx4_ib_rx_hash_function_flags */
+	__u8 reserved[7];
+	__u8 rx_hash_key[40];
+	__u32 comp_mask;
+	__u32 reserved1;
+};
+
+struct mlx4_ib_create_qp {
+	__aligned_u64 buf_addr;
+	__aligned_u64 db_addr;
+	__u8	log_sq_bb_count;
+	__u8	log_sq_stride;
+	__u8	sq_no_prefetch;
+	__u8	reserved;
+	__u32	inl_recv_sz;
+};
+
+struct mlx4_ib_create_wq {
+	__aligned_u64 buf_addr;
+	__aligned_u64 db_addr;
+	__u8	log_range_size;
+	__u8	reserved[3];
+	__u32	comp_mask;
+};
+
+struct mlx4_ib_modify_wq {
+	__u32	comp_mask;
+	__u32	reserved;
+};
+
+struct mlx4_ib_create_rwq_ind_tbl_resp {
+	__u32	response_length;
+	__u32	reserved;
+};
+
+/* RX Hash function flags */
+enum mlx4_ib_rx_hash_function_flags {
+	MLX4_IB_RX_HASH_FUNC_TOEPLITZ	= 1 << 0,
+};
+
+/*
+ * RX Hash flags. These flags allow selecting which fields of an incoming
+ * packet participate in RX Hash. Each flag represents a certain packet
+ * field; when a flag is set, the field it represents is included in the
+ * RX Hash calculation.
+ */
+enum mlx4_ib_rx_hash_fields {
+	MLX4_IB_RX_HASH_SRC_IPV4	= 1 << 0,
+	MLX4_IB_RX_HASH_DST_IPV4	= 1 << 1,
+	MLX4_IB_RX_HASH_SRC_IPV6	= 1 << 2,
+	MLX4_IB_RX_HASH_DST_IPV6	= 1 << 3,
+	MLX4_IB_RX_HASH_SRC_PORT_TCP	= 1 << 4,
+	MLX4_IB_RX_HASH_DST_PORT_TCP	= 1 << 5,
+	MLX4_IB_RX_HASH_SRC_PORT_UDP	= 1 << 6,
+	MLX4_IB_RX_HASH_DST_PORT_UDP	= 1 << 7,
+	MLX4_IB_RX_HASH_INNER		= 1ULL << 31,
+};
+
+struct mlx4_ib_rss_caps {
+	__aligned_u64 rx_hash_fields_mask; /* enum mlx4_ib_rx_hash_fields */
+	__u8 rx_hash_function; /* enum mlx4_ib_rx_hash_function_flags */
+	__u8 reserved[7];
+};
+
+enum query_device_resp_mask {
+	MLX4_IB_QUERY_DEV_RESP_MASK_CORE_CLOCK_OFFSET = 1UL << 0,
+};
+
+struct mlx4_ib_tso_caps {
+	__u32 max_tso; /* Maximum tso payload size in bytes */
+	/* Corresponding bit will be set if qp type from
+	 * 'enum ib_qp_type' is supported.
+	 */
+	__u32 supported_qpts;
+};
+
+struct mlx4_uverbs_ex_query_device_resp {
+	__u32			comp_mask;
+	__u32			response_length;
+	__aligned_u64		hca_core_clock_offset;
+	__u32			max_inl_recv_sz;
+	__u32			reserved;
+	struct mlx4_ib_rss_caps	rss_caps;
+	struct mlx4_ib_tso_caps tso_caps;
+};
+
+#endif /* MLX4_ABI_USER_H */
diff --git a/kernel-headers/rdma/mlx5-abi.h b/kernel-headers/rdma/mlx5-abi.h
new file mode 100644
index 0000000..df1cc36
--- /dev/null
+++ b/kernel-headers/rdma/mlx5-abi.h
@@ -0,0 +1,501 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_ABI_USER_H +#define MLX5_ABI_USER_H + +#include <linux/types.h> +#include <linux/if_ether.h> /* For ETH_ALEN. */ +#include <rdma/ib_user_ioctl_verbs.h> + +enum { + MLX5_QP_FLAG_SIGNATURE = 1 << 0, + MLX5_QP_FLAG_SCATTER_CQE = 1 << 1, + MLX5_QP_FLAG_TUNNEL_OFFLOADS = 1 << 2, + MLX5_QP_FLAG_BFREG_INDEX = 1 << 3, + MLX5_QP_FLAG_TYPE_DCT = 1 << 4, + MLX5_QP_FLAG_TYPE_DCI = 1 << 5, + MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC = 1 << 6, + MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC = 1 << 7, + MLX5_QP_FLAG_ALLOW_SCATTER_CQE = 1 << 8, + MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE = 1 << 9, + MLX5_QP_FLAG_UAR_PAGE_INDEX = 1 << 10, +}; + +enum { + MLX5_SRQ_FLAG_SIGNATURE = 1 << 0, +}; + +enum { + MLX5_WQ_FLAG_SIGNATURE = 1 << 0, +}; + +/* Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define MLX5_IB_UVERBS_ABI_VERSION 1 + +/* Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. + */ + +struct mlx5_ib_alloc_ucontext_req { + __u32 total_num_bfregs; + __u32 num_low_latency_bfregs; +}; + +enum mlx5_lib_caps { + MLX5_LIB_CAP_4K_UAR = (__u64)1 << 0, + MLX5_LIB_CAP_DYN_UAR = (__u64)1 << 1, +}; + +enum mlx5_ib_alloc_uctx_v2_flags { + MLX5_IB_ALLOC_UCTX_DEVX = 1 << 0, +}; +struct mlx5_ib_alloc_ucontext_req_v2 { + __u32 total_num_bfregs; + __u32 num_low_latency_bfregs; + __u32 flags; + __u32 comp_mask; + __u8 max_cqe_version; + __u8 reserved0; + __u16 reserved1; + __u32 reserved2; + __aligned_u64 lib_caps; +}; + +enum mlx5_ib_alloc_ucontext_resp_mask { + MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET = 1UL << 0, + MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_DUMP_FILL_MKEY = 1UL << 1, +}; + +enum mlx5_user_cmds_supp_uhw { + MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE = 1 << 0, + MLX5_USER_CMDS_SUPP_UHW_CREATE_AH = 1 << 1, +}; + +/* The eth_min_inline response value is set to off-by-one vs the FW + * returned value to allow user-space to deal with older kernels. 
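+ *
+ * One plausible way for userspace to consume this (illustrative only):
+ * 0 (MLX5_USER_INLINE_MODE_NA) means the kernel reported no mode, and
+ * any other value is decoded by subtracting one:
+ *
+ *	if (resp.eth_min_inline)
+ *		mode = resp.eth_min_inline - 1;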
+ */ +enum mlx5_user_inline_mode { + MLX5_USER_INLINE_MODE_NA, + MLX5_USER_INLINE_MODE_NONE, + MLX5_USER_INLINE_MODE_L2, + MLX5_USER_INLINE_MODE_IP, + MLX5_USER_INLINE_MODE_TCP_UDP, +}; + +enum { + MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM = 1 << 0, + MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_REQ_METADATA = 1 << 1, + MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_SPI_STEERING = 1 << 2, + MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_FULL_OFFLOAD = 1 << 3, + MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_TX_IV_IS_ESN = 1 << 4, +}; + +struct mlx5_ib_alloc_ucontext_resp { + __u32 qp_tab_size; + __u32 bf_reg_size; + __u32 tot_bfregs; + __u32 cache_line_size; + __u16 max_sq_desc_sz; + __u16 max_rq_desc_sz; + __u32 max_send_wqebb; + __u32 max_recv_wr; + __u32 max_srq_recv_wr; + __u16 num_ports; + __u16 flow_action_flags; + __u32 comp_mask; + __u32 response_length; + __u8 cqe_version; + __u8 cmds_supp_uhw; + __u8 eth_min_inline; + __u8 clock_info_versions; + __aligned_u64 hca_core_clock_offset; + __u32 log_uar_size; + __u32 num_uars_per_page; + __u32 num_dyn_bfregs; + __u32 dump_fill_mkey; +}; + +struct mlx5_ib_alloc_pd_resp { + __u32 pdn; +}; + +struct mlx5_ib_tso_caps { + __u32 max_tso; /* Maximum tso payload size in bytes */ + + /* Corresponding bit will be set if qp type from + * 'enum ib_qp_type' is supported, e.g. + * supported_qpts |= 1 << IB_QPT_UD + */ + __u32 supported_qpts; +}; + +struct mlx5_ib_rss_caps { + __aligned_u64 rx_hash_fields_mask; /* enum mlx5_rx_hash_fields */ + __u8 rx_hash_function; /* enum mlx5_rx_hash_function_flags */ + __u8 reserved[7]; +}; + +enum mlx5_ib_cqe_comp_res_format { + MLX5_IB_CQE_RES_FORMAT_HASH = 1 << 0, + MLX5_IB_CQE_RES_FORMAT_CSUM = 1 << 1, + MLX5_IB_CQE_RES_FORMAT_CSUM_STRIDX = 1 << 2, +}; + +struct mlx5_ib_cqe_comp_caps { + __u32 max_num; + __u32 supported_format; /* enum mlx5_ib_cqe_comp_res_format */ +}; + +enum mlx5_ib_packet_pacing_cap_flags { + MLX5_IB_PP_SUPPORT_BURST = 1 << 0, +}; + +struct mlx5_packet_pacing_caps { + __u32 qp_rate_limit_min; + __u32 qp_rate_limit_max; /* In kpbs */ + + /* Corresponding bit will be set if qp type from + * 'enum ib_qp_type' is supported, e.g. + * supported_qpts |= 1 << IB_QPT_RAW_PACKET + */ + __u32 supported_qpts; + __u8 cap_flags; /* enum mlx5_ib_packet_pacing_cap_flags */ + __u8 reserved[3]; +}; + +enum mlx5_ib_mpw_caps { + MPW_RESERVED = 1 << 0, + MLX5_IB_ALLOW_MPW = 1 << 1, + MLX5_IB_SUPPORT_EMPW = 1 << 2, +}; + +enum mlx5_ib_sw_parsing_offloads { + MLX5_IB_SW_PARSING = 1 << 0, + MLX5_IB_SW_PARSING_CSUM = 1 << 1, + MLX5_IB_SW_PARSING_LSO = 1 << 2, +}; + +struct mlx5_ib_sw_parsing_caps { + __u32 sw_parsing_offloads; /* enum mlx5_ib_sw_parsing_offloads */ + + /* Corresponding bit will be set if qp type from + * 'enum ib_qp_type' is supported, e.g. + * supported_qpts |= 1 << IB_QPT_RAW_PACKET + */ + __u32 supported_qpts; +}; + +struct mlx5_ib_striding_rq_caps { + __u32 min_single_stride_log_num_of_bytes; + __u32 max_single_stride_log_num_of_bytes; + __u32 min_single_wqe_log_num_of_strides; + __u32 max_single_wqe_log_num_of_strides; + + /* Corresponding bit will be set if qp type from + * 'enum ib_qp_type' is supported, e.g. 
+ * supported_qpts |= 1 << IB_QPT_RAW_PACKET
+ */
+	__u32 supported_qpts;
+	__u32 reserved;
+};
+
+enum mlx5_ib_query_dev_resp_flags {
+	/* Support 128B CQE compression */
+	MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_COMP	= 1 << 0,
+	MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_PAD	= 1 << 1,
+	MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE	= 1 << 2,
+	MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT	= 1 << 3,
+};
+
+enum mlx5_ib_tunnel_offloads {
+	MLX5_IB_TUNNELED_OFFLOADS_VXLAN		= 1 << 0,
+	MLX5_IB_TUNNELED_OFFLOADS_GRE		= 1 << 1,
+	MLX5_IB_TUNNELED_OFFLOADS_GENEVE	= 1 << 2,
+	MLX5_IB_TUNNELED_OFFLOADS_MPLS_GRE	= 1 << 3,
+	MLX5_IB_TUNNELED_OFFLOADS_MPLS_UDP	= 1 << 4,
+};
+
+struct mlx5_ib_query_device_resp {
+	__u32	comp_mask;
+	__u32	response_length;
+	struct	mlx5_ib_tso_caps tso_caps;
+	struct	mlx5_ib_rss_caps rss_caps;
+	struct	mlx5_ib_cqe_comp_caps cqe_comp_caps;
+	struct	mlx5_packet_pacing_caps packet_pacing_caps;
+	__u32	mlx5_ib_support_multi_pkt_send_wqes;
+	__u32	flags; /* Use enum mlx5_ib_query_dev_resp_flags */
+	struct mlx5_ib_sw_parsing_caps sw_parsing_caps;
+	struct mlx5_ib_striding_rq_caps striding_rq_caps;
+	__u32	tunnel_offloads_caps; /* enum mlx5_ib_tunnel_offloads */
+	__u32	reserved;
+};
+
+enum mlx5_ib_create_cq_flags {
+	MLX5_IB_CREATE_CQ_FLAGS_CQE_128B_PAD	= 1 << 0,
+	MLX5_IB_CREATE_CQ_FLAGS_UAR_PAGE_INDEX	= 1 << 1,
+};
+
+struct mlx5_ib_create_cq {
+	__aligned_u64 buf_addr;
+	__aligned_u64 db_addr;
+	__u32	cqe_size;
+	__u8	cqe_comp_en;
+	__u8	cqe_comp_res_format;
+	__u16	flags;
+	__u16	uar_page_index;
+	__u16	reserved0;
+	__u32	reserved1;
+};
+
+struct mlx5_ib_create_cq_resp {
+	__u32	cqn;
+	__u32	reserved;
+};
+
+struct mlx5_ib_resize_cq {
+	__aligned_u64 buf_addr;
+	__u16	cqe_size;
+	__u16	reserved0;
+	__u32	reserved1;
+};
+
+struct mlx5_ib_create_srq {
+	__aligned_u64 buf_addr;
+	__aligned_u64 db_addr;
+	__u32	flags;
+	__u32	reserved0; /* explicit padding (optional on i386) */
+	__u32	uidx;
+	__u32	reserved1;
+};
+
+struct mlx5_ib_create_srq_resp {
+	__u32	srqn;
+	__u32	reserved;
+};
+
+struct mlx5_ib_create_qp {
+	__aligned_u64 buf_addr;
+	__aligned_u64 db_addr;
+	__u32	sq_wqe_count;
+	__u32	rq_wqe_count;
+	__u32	rq_wqe_shift;
+	__u32	flags;
+	__u32	uidx;
+	__u32	bfreg_index;
+	union {
+		__aligned_u64 sq_buf_addr;
+		__aligned_u64 access_key;
+	};
+};
+
+/* RX Hash function flags */
+enum mlx5_rx_hash_function_flags {
+	MLX5_RX_HASH_FUNC_TOEPLITZ	= 1 << 0,
+};
+
+/*
+ * RX Hash flags. These flags allow selecting which fields of an incoming
+ * packet participate in RX Hash. Each flag represents a certain packet
+ * field; when a flag is set, the field it represents is included in the
+ * RX Hash calculation.
+ * Note: *IPV4 and *IPV6 flags can't be enabled together on the same QP
+ * and *TCP and *UDP flags can't be enabled together on the same QP.
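+ *
+ * For example, a consistent IPv4/TCP selection would be (illustrative
+ * only):
+ *
+ *	rx_hash_fields_mask = MLX5_RX_HASH_SRC_IPV4 | MLX5_RX_HASH_DST_IPV4 |
+ *			      MLX5_RX_HASH_SRC_PORT_TCP | MLX5_RX_HASH_DST_PORT_TCP;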
+*/ +enum mlx5_rx_hash_fields { + MLX5_RX_HASH_SRC_IPV4 = 1 << 0, + MLX5_RX_HASH_DST_IPV4 = 1 << 1, + MLX5_RX_HASH_SRC_IPV6 = 1 << 2, + MLX5_RX_HASH_DST_IPV6 = 1 << 3, + MLX5_RX_HASH_SRC_PORT_TCP = 1 << 4, + MLX5_RX_HASH_DST_PORT_TCP = 1 << 5, + MLX5_RX_HASH_SRC_PORT_UDP = 1 << 6, + MLX5_RX_HASH_DST_PORT_UDP = 1 << 7, + MLX5_RX_HASH_IPSEC_SPI = 1 << 8, + /* Save bits for future fields */ + MLX5_RX_HASH_INNER = (1UL << 31), +}; + +struct mlx5_ib_create_qp_rss { + __aligned_u64 rx_hash_fields_mask; /* enum mlx5_rx_hash_fields */ + __u8 rx_hash_function; /* enum mlx5_rx_hash_function_flags */ + __u8 rx_key_len; /* valid only for Toeplitz */ + __u8 reserved[6]; + __u8 rx_hash_key[128]; /* valid only for Toeplitz */ + __u32 comp_mask; + __u32 flags; +}; + +enum mlx5_ib_create_qp_resp_mask { + MLX5_IB_CREATE_QP_RESP_MASK_TIRN = 1UL << 0, + MLX5_IB_CREATE_QP_RESP_MASK_TISN = 1UL << 1, + MLX5_IB_CREATE_QP_RESP_MASK_RQN = 1UL << 2, + MLX5_IB_CREATE_QP_RESP_MASK_SQN = 1UL << 3, + MLX5_IB_CREATE_QP_RESP_MASK_TIR_ICM_ADDR = 1UL << 4, +}; + +struct mlx5_ib_create_qp_resp { + __u32 bfreg_index; + __u32 reserved; + __u32 comp_mask; + __u32 tirn; + __u32 tisn; + __u32 rqn; + __u32 sqn; + __u32 reserved1; + __u64 tir_icm_addr; +}; + +struct mlx5_ib_alloc_mw { + __u32 comp_mask; + __u8 num_klms; + __u8 reserved1; + __u16 reserved2; +}; + +enum mlx5_ib_create_wq_mask { + MLX5_IB_CREATE_WQ_STRIDING_RQ = (1 << 0), +}; + +struct mlx5_ib_create_wq { + __aligned_u64 buf_addr; + __aligned_u64 db_addr; + __u32 rq_wqe_count; + __u32 rq_wqe_shift; + __u32 user_index; + __u32 flags; + __u32 comp_mask; + __u32 single_stride_log_num_of_bytes; + __u32 single_wqe_log_num_of_strides; + __u32 two_byte_shift_en; +}; + +struct mlx5_ib_create_ah_resp { + __u32 response_length; + __u8 dmac[ETH_ALEN]; + __u8 reserved[6]; +}; + +struct mlx5_ib_burst_info { + __u32 max_burst_sz; + __u16 typical_pkt_sz; + __u16 reserved; +}; + +struct mlx5_ib_modify_qp { + __u32 comp_mask; + struct mlx5_ib_burst_info burst_info; + __u32 reserved; +}; + +struct mlx5_ib_modify_qp_resp { + __u32 response_length; + __u32 dctn; +}; + +struct mlx5_ib_create_wq_resp { + __u32 response_length; + __u32 reserved; +}; + +struct mlx5_ib_create_rwq_ind_tbl_resp { + __u32 response_length; + __u32 reserved; +}; + +struct mlx5_ib_modify_wq { + __u32 comp_mask; + __u32 reserved; +}; + +struct mlx5_ib_clock_info { + __u32 sign; + __u32 resv; + __aligned_u64 nsec; + __aligned_u64 cycles; + __aligned_u64 frac; + __u32 mult; + __u32 shift; + __aligned_u64 mask; + __aligned_u64 overflow_period; +}; + +enum mlx5_ib_mmap_cmd { + MLX5_IB_MMAP_REGULAR_PAGE = 0, + MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES = 1, + MLX5_IB_MMAP_WC_PAGE = 2, + MLX5_IB_MMAP_NC_PAGE = 3, + /* 5 is chosen in order to be compatible with old versions of libmlx5 */ + MLX5_IB_MMAP_CORE_CLOCK = 5, + MLX5_IB_MMAP_ALLOC_WC = 6, + MLX5_IB_MMAP_CLOCK_INFO = 7, + MLX5_IB_MMAP_DEVICE_MEM = 8, +}; + +enum { + MLX5_IB_CLOCK_INFO_KERNEL_UPDATING = 1, +}; + +/* Bit indexes for the mlx5_alloc_ucontext_resp.clock_info_versions bitmap */ +enum { + MLX5_IB_CLOCK_INFO_V1 = 0, +}; + +struct mlx5_ib_flow_counters_desc { + __u32 description; + __u32 index; +}; + +struct mlx5_ib_flow_counters_data { + RDMA_UAPI_PTR(struct mlx5_ib_flow_counters_desc *, counters_data); + __u32 ncounters; + __u32 reserved; +}; + +struct mlx5_ib_create_flow { + __u32 ncounters_data; + __u32 reserved; + /* + * Following are counters data based on ncounters_data, each + * entry in the data[] should match a corresponding counter object + * that was 
pointed by a counters spec upon the flow creation + */ + struct mlx5_ib_flow_counters_data data[]; +}; + +#endif /* MLX5_ABI_USER_H */ diff --git a/kernel-headers/rdma/mlx5_user_ioctl_cmds.h b/kernel-headers/rdma/mlx5_user_ioctl_cmds.h new file mode 100644 index 0000000..24f3388 --- /dev/null +++ b/kernel-headers/rdma/mlx5_user_ioctl_cmds.h @@ -0,0 +1,283 @@ +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_USER_IOCTL_CMDS_H +#define MLX5_USER_IOCTL_CMDS_H + +#include <linux/types.h> +#include <rdma/ib_user_ioctl_cmds.h> + +enum mlx5_ib_create_flow_action_attrs { + /* This attribute belong to the driver namespace */ + MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS = (1U << UVERBS_ID_NS_SHIFT), +}; + +enum mlx5_ib_alloc_dm_attrs { + MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX, + MLX5_IB_ATTR_ALLOC_DM_REQ_TYPE, +}; + +enum mlx5_ib_devx_methods { + MLX5_IB_METHOD_DEVX_OTHER = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_METHOD_DEVX_QUERY_UAR, + MLX5_IB_METHOD_DEVX_QUERY_EQN, + MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT, +}; + +enum mlx5_ib_devx_other_attrs { + MLX5_IB_ATTR_DEVX_OTHER_CMD_IN = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_DEVX_OTHER_CMD_OUT, +}; + +enum mlx5_ib_devx_obj_create_attrs { + MLX5_IB_ATTR_DEVX_OBJ_CREATE_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_IN, + MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT, +}; + +enum mlx5_ib_devx_query_uar_attrs { + MLX5_IB_ATTR_DEVX_QUERY_UAR_USER_IDX = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_DEVX_QUERY_UAR_DEV_IDX, +}; + +enum mlx5_ib_devx_obj_destroy_attrs { + MLX5_IB_ATTR_DEVX_OBJ_DESTROY_HANDLE = (1U << UVERBS_ID_NS_SHIFT), +}; + +enum mlx5_ib_devx_obj_modify_attrs { + MLX5_IB_ATTR_DEVX_OBJ_MODIFY_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_IN, + MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_OUT, +}; + +enum mlx5_ib_devx_obj_query_attrs { + MLX5_IB_ATTR_DEVX_OBJ_QUERY_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_OUT, +}; + +enum mlx5_ib_devx_obj_query_async_attrs { + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + 
MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_CMD_IN, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_FD, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_WR_ID, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_OUT_LEN, +}; + +enum mlx5_ib_devx_subscribe_event_attrs { + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_OBJ_HANDLE, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE, +}; + +enum mlx5_ib_devx_query_eqn_attrs { + MLX5_IB_ATTR_DEVX_QUERY_EQN_USER_VEC = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_DEVX_QUERY_EQN_DEV_EQN, +}; + +enum mlx5_ib_devx_obj_methods { + MLX5_IB_METHOD_DEVX_OBJ_CREATE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_METHOD_DEVX_OBJ_DESTROY, + MLX5_IB_METHOD_DEVX_OBJ_MODIFY, + MLX5_IB_METHOD_DEVX_OBJ_QUERY, + MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY, +}; + +enum mlx5_ib_var_alloc_attrs { + MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_OFFSET, + MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_LENGTH, + MLX5_IB_ATTR_VAR_OBJ_ALLOC_PAGE_ID, +}; + +enum mlx5_ib_var_obj_destroy_attrs { + MLX5_IB_ATTR_VAR_OBJ_DESTROY_HANDLE = (1U << UVERBS_ID_NS_SHIFT), +}; + +enum mlx5_ib_var_obj_methods { + MLX5_IB_METHOD_VAR_OBJ_ALLOC = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_METHOD_VAR_OBJ_DESTROY, +}; + +enum mlx5_ib_uar_alloc_attrs { + MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_UAR_OBJ_ALLOC_TYPE, + MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_OFFSET, + MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_LENGTH, + MLX5_IB_ATTR_UAR_OBJ_ALLOC_PAGE_ID, +}; + +enum mlx5_ib_uar_obj_destroy_attrs { + MLX5_IB_ATTR_UAR_OBJ_DESTROY_HANDLE = (1U << UVERBS_ID_NS_SHIFT), +}; + +enum mlx5_ib_uar_obj_methods { + MLX5_IB_METHOD_UAR_OBJ_ALLOC = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_METHOD_UAR_OBJ_DESTROY, +}; + +enum mlx5_ib_devx_umem_reg_attrs { + MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_DEVX_UMEM_REG_ADDR, + MLX5_IB_ATTR_DEVX_UMEM_REG_LEN, + MLX5_IB_ATTR_DEVX_UMEM_REG_ACCESS, + MLX5_IB_ATTR_DEVX_UMEM_REG_OUT_ID, +}; + +enum mlx5_ib_devx_umem_dereg_attrs { + MLX5_IB_ATTR_DEVX_UMEM_DEREG_HANDLE = (1U << UVERBS_ID_NS_SHIFT), +}; + +enum mlx5_ib_pp_obj_methods { + MLX5_IB_METHOD_PP_OBJ_ALLOC = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_METHOD_PP_OBJ_DESTROY, +}; + +enum mlx5_ib_pp_alloc_attrs { + MLX5_IB_ATTR_PP_OBJ_ALLOC_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_PP_OBJ_ALLOC_CTX, + MLX5_IB_ATTR_PP_OBJ_ALLOC_FLAGS, + MLX5_IB_ATTR_PP_OBJ_ALLOC_INDEX, +}; + +enum mlx5_ib_pp_obj_destroy_attrs { + MLX5_IB_ATTR_PP_OBJ_DESTROY_HANDLE = (1U << UVERBS_ID_NS_SHIFT), +}; + +enum mlx5_ib_devx_umem_methods { + MLX5_IB_METHOD_DEVX_UMEM_REG = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_METHOD_DEVX_UMEM_DEREG, +}; + +enum mlx5_ib_devx_async_cmd_fd_alloc_attrs { + MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE = (1U << UVERBS_ID_NS_SHIFT), +}; + +enum mlx5_ib_devx_async_event_fd_alloc_attrs { + MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_FLAGS, +}; + +enum mlx5_ib_devx_async_cmd_fd_methods { + MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC = (1U << UVERBS_ID_NS_SHIFT), +}; + +enum mlx5_ib_devx_async_event_fd_methods { + MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC = (1U << UVERBS_ID_NS_SHIFT), +}; + +enum mlx5_ib_objects { + MLX5_IB_OBJECT_DEVX = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_OBJECT_DEVX_OBJ, + MLX5_IB_OBJECT_DEVX_UMEM, + MLX5_IB_OBJECT_FLOW_MATCHER, + MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD, + 
MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD, + MLX5_IB_OBJECT_VAR, + MLX5_IB_OBJECT_PP, + MLX5_IB_OBJECT_UAR, +}; + +enum mlx5_ib_flow_matcher_create_attrs { + MLX5_IB_ATTR_FLOW_MATCHER_CREATE_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK, + MLX5_IB_ATTR_FLOW_MATCHER_FLOW_TYPE, + MLX5_IB_ATTR_FLOW_MATCHER_MATCH_CRITERIA, + MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS, + MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE, +}; + +enum mlx5_ib_flow_matcher_destroy_attrs { + MLX5_IB_ATTR_FLOW_MATCHER_DESTROY_HANDLE = (1U << UVERBS_ID_NS_SHIFT), +}; + +enum mlx5_ib_flow_matcher_methods { + MLX5_IB_METHOD_FLOW_MATCHER_CREATE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_METHOD_FLOW_MATCHER_DESTROY, +}; + +#define MLX5_IB_DW_MATCH_PARAM 0x80 + +struct mlx5_ib_match_params { + __u32 match_params[MLX5_IB_DW_MATCH_PARAM]; +}; + +enum mlx5_ib_flow_type { + MLX5_IB_FLOW_TYPE_NORMAL, + MLX5_IB_FLOW_TYPE_SNIFFER, + MLX5_IB_FLOW_TYPE_ALL_DEFAULT, + MLX5_IB_FLOW_TYPE_MC_DEFAULT, +}; + +enum mlx5_ib_create_flow_attrs { + MLX5_IB_ATTR_CREATE_FLOW_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE, + MLX5_IB_ATTR_CREATE_FLOW_DEST_QP, + MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX, + MLX5_IB_ATTR_CREATE_FLOW_MATCHER, + MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, + MLX5_IB_ATTR_CREATE_FLOW_TAG, + MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, + MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET, +}; + +enum mlx5_ib_destoy_flow_attrs { + MLX5_IB_ATTR_DESTROY_FLOW_HANDLE = (1U << UVERBS_ID_NS_SHIFT), +}; + +enum mlx5_ib_flow_methods { + MLX5_IB_METHOD_CREATE_FLOW = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_METHOD_DESTROY_FLOW, +}; + +enum mlx5_ib_flow_action_methods { + MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT, +}; + +enum mlx5_ib_create_flow_action_create_modify_header_attrs { + MLX5_IB_ATTR_CREATE_MODIFY_HEADER_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM, + MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE, +}; + +enum mlx5_ib_create_flow_action_create_packet_reformat_attrs { + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_TYPE, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_FT_TYPE, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF, +}; + +#endif diff --git a/kernel-headers/rdma/mlx5_user_ioctl_verbs.h b/kernel-headers/rdma/mlx5_user_ioctl_verbs.h new file mode 100644 index 0000000..56b26ea --- /dev/null +++ b/kernel-headers/rdma/mlx5_user_ioctl_verbs.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_USER_IOCTL_VERBS_H +#define MLX5_USER_IOCTL_VERBS_H + +#include <linux/types.h> + +enum mlx5_ib_uapi_flow_action_flags { + MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA = 1 << 0, +}; + +enum mlx5_ib_uapi_flow_table_type { + MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX = 0x0, + MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX = 0x1, + MLX5_IB_UAPI_FLOW_TABLE_TYPE_FDB = 0x2, + MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_RX = 0x3, + MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_TX = 0x4, +}; + +enum mlx5_ib_uapi_flow_action_packet_reformat_type { + MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2 = 0x0, + MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL = 0x1, + MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2 = 0x2, + MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x3, +}; + +struct mlx5_ib_uapi_devx_async_cmd_hdr { + __aligned_u64 wr_id; + __u8 out_data[]; +}; + +enum mlx5_ib_uapi_dm_type { + MLX5_IB_UAPI_DM_TYPE_MEMIC, + MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM, + MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM, +}; + +enum mlx5_ib_uapi_devx_create_event_channel_flags { + MLX5_IB_UAPI_DEVX_CR_EV_CH_FLAGS_OMIT_DATA = 1 << 0, +}; + +struct mlx5_ib_uapi_devx_async_event_hdr { + __aligned_u64 cookie; + __u8 out_data[]; +}; + +enum mlx5_ib_uapi_pp_alloc_flags { + MLX5_IB_UAPI_PP_ALLOC_FLAGS_DEDICATED_INDEX = 1 << 0, +}; + +enum mlx5_ib_uapi_uar_alloc_type { + MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF = 0x0, + MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC = 0x1, +}; + +#endif + diff --git a/kernel-headers/rdma/mthca-abi.h b/kernel-headers/rdma/mthca-abi.h new file mode 100644 index 0000000..91b12e1 --- /dev/null +++ b/kernel-headers/rdma/mthca-abi.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MTHCA_ABI_USER_H +#define MTHCA_ABI_USER_H + +#include <linux/types.h> + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define MTHCA_UVERBS_ABI_VERSION 1 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. + */ +struct mthca_alloc_ucontext_resp { + __u32 qp_tab_size; + __u32 uarc_size; +}; + +struct mthca_alloc_pd_resp { + __u32 pdn; + __u32 reserved; +}; + +/* + * Mark the memory region with a DMA attribute that causes + * in-flight DMA to be flushed when the region is written to: + */ +#define MTHCA_MR_DMASYNC 0x1 + +struct mthca_reg_mr { + __u32 mr_attrs; + __u32 reserved; +}; + +struct mthca_create_cq { + __u32 lkey; + __u32 pdn; + __aligned_u64 arm_db_page; + __aligned_u64 set_db_page; + __u32 arm_db_index; + __u32 set_db_index; +}; + +struct mthca_create_cq_resp { + __u32 cqn; + __u32 reserved; +}; + +struct mthca_resize_cq { + __u32 lkey; + __u32 reserved; +}; + +struct mthca_create_srq { + __u32 lkey; + __u32 db_index; + __aligned_u64 db_page; +}; + +struct mthca_create_srq_resp { + __u32 srqn; + __u32 reserved; +}; + +struct mthca_create_qp { + __u32 lkey; + __u32 reserved; + __aligned_u64 sq_db_page; + __aligned_u64 rq_db_page; + __u32 sq_db_index; + __u32 rq_db_index; +}; +#endif /* MTHCA_ABI_USER_H */ diff --git a/kernel-headers/rdma/ocrdma-abi.h b/kernel-headers/rdma/ocrdma-abi.h new file mode 100644 index 0000000..284d47b --- /dev/null +++ b/kernel-headers/rdma/ocrdma-abi.h @@ -0,0 +1,152 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ +/* This file is part of the Emulex RoCE Device Driver for + * RoCE (RDMA over Converged Ethernet) adapters. + * Copyright (C) 2012-2015 Emulex. All rights reserved. + * EMULEX and SLI are trademarks of Emulex. + * www.emulex.com + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Contact Information: + * linux-drivers@emulex.com + * + * Emulex + * 3333 Susan Street + * Costa Mesa, CA 92626 + */ + +#ifndef OCRDMA_ABI_USER_H +#define OCRDMA_ABI_USER_H + +#include <linux/types.h> + +#define OCRDMA_ABI_VERSION 2 +#define OCRDMA_BE_ROCE_ABI_VERSION 1 +/* user kernel communication data structures. */ + +struct ocrdma_alloc_ucontext_resp { + __u32 dev_id; + __u32 wqe_size; + __u32 max_inline_data; + __u32 dpp_wqe_size; + __aligned_u64 ah_tbl_page; + __u32 ah_tbl_len; + __u32 rqe_size; + __u8 fw_ver[32]; + /* for future use/new features in progress */ + __aligned_u64 rsvd1; + __aligned_u64 rsvd2; +}; + +struct ocrdma_alloc_pd_ureq { + __u32 rsvd[2]; +}; + +struct ocrdma_alloc_pd_uresp { + __u32 id; + __u32 dpp_enabled; + __u32 dpp_page_addr_hi; + __u32 dpp_page_addr_lo; + __u32 rsvd[2]; +}; + +struct ocrdma_create_cq_ureq { + __u32 dpp_cq; + __u32 rsvd; /* pad */ +}; + +#define MAX_CQ_PAGES 8 +struct ocrdma_create_cq_uresp { + __u32 cq_id; + __u32 page_size; + __u32 num_pages; + __u32 max_hw_cqe; + __aligned_u64 page_addr[MAX_CQ_PAGES]; + __aligned_u64 db_page_addr; + __u32 db_page_size; + __u32 phase_change; + /* for future use/new features in progress */ + __aligned_u64 rsvd1; + __aligned_u64 rsvd2; +}; + +#define MAX_QP_PAGES 8 +#define MAX_UD_AV_PAGES 8 + +struct ocrdma_create_qp_ureq { + __u8 enable_dpp_cq; + __u8 rsvd; + __u16 dpp_cq_id; + __u32 rsvd1; /* pad */ +}; + +struct ocrdma_create_qp_uresp { + __u16 qp_id; + __u16 sq_dbid; + __u16 rq_dbid; + __u16 resv0; /* pad */ + __u32 sq_page_size; + __u32 rq_page_size; + __u32 num_sq_pages; + __u32 num_rq_pages; + __aligned_u64 sq_page_addr[MAX_QP_PAGES]; + __aligned_u64 rq_page_addr[MAX_QP_PAGES]; + __aligned_u64 db_page_addr; + __u32 db_page_size; + __u32 dpp_credit; + __u32 dpp_offset; + __u32 num_wqe_allocated; + __u32 num_rqe_allocated; + __u32 db_sq_offset; + __u32 db_rq_offset; + __u32 db_shift; + __aligned_u64 rsvd[11]; +}; + +struct ocrdma_create_srq_uresp { + __u16 rq_dbid; + __u16 resv0; /* pad */ + __u32 resv1; + + __u32 rq_page_size; + __u32 num_rq_pages; + + __aligned_u64 rq_page_addr[MAX_QP_PAGES]; + __aligned_u64 db_page_addr; + + __u32 db_page_size; + __u32 num_rqe_allocated; + __u32 db_rq_offset; + __u32 db_shift; + + __aligned_u64 rsvd2; + __aligned_u64 rsvd3; +}; + +#endif /* OCRDMA_ABI_USER_H */ diff --git a/kernel-headers/rdma/qedr-abi.h b/kernel-headers/rdma/qedr-abi.h new file mode 100644 index 0000000..a0b83c9 --- /dev/null +++ b/kernel-headers/rdma/qedr-abi.h @@ -0,0 +1,170 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* QLogic qedr NIC Driver + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __QEDR_USER_H__ +#define __QEDR_USER_H__ + +#include <linux/types.h> + +#define QEDR_ABI_VERSION (8) + +/* user kernel communication data structures. */ +enum qedr_alloc_ucontext_flags { + QEDR_ALLOC_UCTX_RESERVED = 1 << 0, + QEDR_ALLOC_UCTX_DB_REC = 1 << 1 +}; + +struct qedr_alloc_ucontext_req { + __u32 context_flags; + __u32 reserved; +}; + +#define QEDR_LDPM_MAX_SIZE (8192) +#define QEDR_EDPM_TRANS_SIZE (64) + +enum qedr_rdma_dpm_type { + QEDR_DPM_TYPE_NONE = 0, + QEDR_DPM_TYPE_ROCE_ENHANCED = 1 << 0, + QEDR_DPM_TYPE_ROCE_LEGACY = 1 << 1, + QEDR_DPM_TYPE_IWARP_LEGACY = 1 << 2, + QEDR_DPM_TYPE_RESERVED = 1 << 3, + QEDR_DPM_SIZES_SET = 1 << 4, +}; + +struct qedr_alloc_ucontext_resp { + __aligned_u64 db_pa; + __u32 db_size; + + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_srq_wr; + __u32 sges_per_send_wr; + __u32 sges_per_recv_wr; + __u32 sges_per_srq_wr; + __u32 max_cqes; + __u8 dpm_flags; + __u8 wids_enabled; + __u16 wid_count; + __u16 ldpm_limit_size; + __u8 edpm_trans_size; + __u8 reserved; +}; + +struct qedr_alloc_pd_ureq { + __aligned_u64 rsvd1; +}; + +struct qedr_alloc_pd_uresp { + __u32 pd_id; + __u32 reserved; +}; + +struct qedr_create_cq_ureq { + __aligned_u64 addr; + __aligned_u64 len; +}; + +struct qedr_create_cq_uresp { + __u32 db_offset; + __u16 icid; + __u16 reserved; + __aligned_u64 db_rec_addr; +}; + +struct qedr_create_qp_ureq { + __u32 qp_handle_hi; + __u32 qp_handle_lo; + + /* SQ */ + /* user space virtual address of SQ buffer */ + __aligned_u64 sq_addr; + + /* length of SQ buffer */ + __aligned_u64 sq_len; + + /* RQ */ + /* user space virtual address of RQ buffer */ + __aligned_u64 rq_addr; + + /* length of RQ buffer */ + __aligned_u64 rq_len; +}; + +struct qedr_create_qp_uresp { + __u32 qp_id; + __u32 atomic_supported; + + /* SQ */ + __u32 sq_db_offset; + __u16 sq_icid; + + /* RQ */ + __u32 rq_db_offset; + __u16 rq_icid; + + __u32 rq_db2_offset; + __u32 reserved; + + /* address of SQ doorbell recovery user entry */ + __aligned_u64 sq_db_rec_addr; + + /* address of RQ doorbell recovery user entry */ + __aligned_u64 rq_db_rec_addr; + +}; + +struct qedr_create_srq_ureq { + /* user space virtual address of producer pair */ + __aligned_u64 prod_pair_addr; + + /* user space virtual address of SRQ buffer */ + __aligned_u64 srq_addr; 
+ + /* length of SRQ buffer */ + __aligned_u64 srq_len; +}; + +struct qedr_create_srq_uresp { + __u16 srq_id; + __u16 reserved0; + __u32 reserved1; +}; + +/* doorbell recovery entry allocated and populated by userspace doorbelling + * entities and mapped to kernel. Kernel uses this to register doorbell + * information with doorbell drop recovery mechanism. + */ +struct qedr_user_db_rec { + __aligned_u64 db_data; /* doorbell data */ +}; + +#endif /* __QEDR_USER_H__ */ diff --git a/kernel-headers/rdma/rdma_netlink.h b/kernel-headers/rdma/rdma_netlink.h new file mode 100644 index 0000000..8e27778 --- /dev/null +++ b/kernel-headers/rdma/rdma_netlink.h @@ -0,0 +1,565 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_RDMA_NETLINK_H +#define _UAPI_RDMA_NETLINK_H + +#include <linux/types.h> + +enum { + RDMA_NL_IWCM = 2, + RDMA_NL_RSVD, + RDMA_NL_LS, /* RDMA Local Services */ + RDMA_NL_NLDEV, /* RDMA device interface */ + RDMA_NL_NUM_CLIENTS +}; + +enum { + RDMA_NL_GROUP_IWPM = 2, + RDMA_NL_GROUP_LS, + RDMA_NL_NUM_GROUPS +}; + +#define RDMA_NL_GET_CLIENT(type) ((type & (((1 << 6) - 1) << 10)) >> 10) +#define RDMA_NL_GET_OP(type) (type & ((1 << 10) - 1)) +#define RDMA_NL_GET_TYPE(client, op) ((client << 10) + op) + +/* The minimum version that the iwpm kernel supports */ +#define IWPM_UABI_VERSION_MIN 3 + +/* The latest version that the iwpm kernel supports */ +#define IWPM_UABI_VERSION 4 + +/* iwarp port mapper message flags */ +enum { + + /* Do not map the port for this IWPM request */ + IWPM_FLAGS_NO_PORT_MAP = (1 << 0), +}; + +/* iwarp port mapper op-codes */ +enum { + RDMA_NL_IWPM_REG_PID = 0, + RDMA_NL_IWPM_ADD_MAPPING, + RDMA_NL_IWPM_QUERY_MAPPING, + RDMA_NL_IWPM_REMOVE_MAPPING, + RDMA_NL_IWPM_REMOTE_INFO, + RDMA_NL_IWPM_HANDLE_ERR, + RDMA_NL_IWPM_MAPINFO, + RDMA_NL_IWPM_MAPINFO_NUM, + RDMA_NL_IWPM_HELLO, + RDMA_NL_IWPM_NUM_OPS +}; + +enum { + IWPM_NLA_REG_PID_UNSPEC = 0, + IWPM_NLA_REG_PID_SEQ, + IWPM_NLA_REG_IF_NAME, + IWPM_NLA_REG_IBDEV_NAME, + IWPM_NLA_REG_ULIB_NAME, + IWPM_NLA_REG_PID_MAX +}; + +enum { + IWPM_NLA_RREG_PID_UNSPEC = 0, + IWPM_NLA_RREG_PID_SEQ, + IWPM_NLA_RREG_IBDEV_NAME, + IWPM_NLA_RREG_ULIB_NAME, + IWPM_NLA_RREG_ULIB_VER, + IWPM_NLA_RREG_PID_ERR, + IWPM_NLA_RREG_PID_MAX + +}; + +enum { + IWPM_NLA_MANAGE_MAPPING_UNSPEC = 0, + IWPM_NLA_MANAGE_MAPPING_SEQ, + IWPM_NLA_MANAGE_ADDR, + IWPM_NLA_MANAGE_FLAGS, + IWPM_NLA_MANAGE_MAPPING_MAX +}; + +enum { + IWPM_NLA_RMANAGE_MAPPING_UNSPEC = 0, + IWPM_NLA_RMANAGE_MAPPING_SEQ, + IWPM_NLA_RMANAGE_ADDR, + IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR, + /* The following maintains bisectability of rdma-core */ + IWPM_NLA_MANAGE_MAPPED_LOC_ADDR = IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR, + IWPM_NLA_RMANAGE_MAPPING_ERR, + IWPM_NLA_RMANAGE_MAPPING_MAX +}; + +#define IWPM_NLA_MAPINFO_SEND_MAX 3 +#define IWPM_NLA_REMOVE_MAPPING_MAX 3 + +enum { + IWPM_NLA_QUERY_MAPPING_UNSPEC = 0, + IWPM_NLA_QUERY_MAPPING_SEQ, + IWPM_NLA_QUERY_LOCAL_ADDR, + IWPM_NLA_QUERY_REMOTE_ADDR, + IWPM_NLA_QUERY_FLAGS, + IWPM_NLA_QUERY_MAPPING_MAX, +}; + +enum { + IWPM_NLA_RQUERY_MAPPING_UNSPEC = 0, + IWPM_NLA_RQUERY_MAPPING_SEQ, + IWPM_NLA_RQUERY_LOCAL_ADDR, + IWPM_NLA_RQUERY_REMOTE_ADDR, + IWPM_NLA_RQUERY_MAPPED_LOC_ADDR, + IWPM_NLA_RQUERY_MAPPED_REM_ADDR, + IWPM_NLA_RQUERY_MAPPING_ERR, + IWPM_NLA_RQUERY_MAPPING_MAX +}; + +enum { + IWPM_NLA_MAPINFO_REQ_UNSPEC = 0, + IWPM_NLA_MAPINFO_ULIB_NAME, + IWPM_NLA_MAPINFO_ULIB_VER, + IWPM_NLA_MAPINFO_REQ_MAX +}; + +enum { + IWPM_NLA_MAPINFO_UNSPEC = 0, + IWPM_NLA_MAPINFO_LOCAL_ADDR, + IWPM_NLA_MAPINFO_MAPPED_ADDR, + 
IWPM_NLA_MAPINFO_FLAGS, + IWPM_NLA_MAPINFO_MAX +}; + +enum { + IWPM_NLA_MAPINFO_NUM_UNSPEC = 0, + IWPM_NLA_MAPINFO_SEQ, + IWPM_NLA_MAPINFO_SEND_NUM, + IWPM_NLA_MAPINFO_ACK_NUM, + IWPM_NLA_MAPINFO_NUM_MAX +}; + +enum { + IWPM_NLA_ERR_UNSPEC = 0, + IWPM_NLA_ERR_SEQ, + IWPM_NLA_ERR_CODE, + IWPM_NLA_ERR_MAX +}; + +enum { + IWPM_NLA_HELLO_UNSPEC = 0, + IWPM_NLA_HELLO_ABI_VERSION, + IWPM_NLA_HELLO_MAX +}; + +/* For RDMA_NLDEV_ATTR_DEV_NODE_TYPE */ +enum { + /* IB values map to NodeInfo:NodeType. */ + RDMA_NODE_IB_CA = 1, + RDMA_NODE_IB_SWITCH, + RDMA_NODE_IB_ROUTER, + RDMA_NODE_RNIC, + RDMA_NODE_USNIC, + RDMA_NODE_USNIC_UDP, + RDMA_NODE_UNSPECIFIED, +}; + +/* + * Local service operations: + * RESOLVE - The client requests the local service to resolve a path. + * SET_TIMEOUT - The local service requests the client to set the timeout. + * IP_RESOLVE - The client requests the local service to resolve an IP to GID. + */ +enum { + RDMA_NL_LS_OP_RESOLVE = 0, + RDMA_NL_LS_OP_SET_TIMEOUT, + RDMA_NL_LS_OP_IP_RESOLVE, + RDMA_NL_LS_NUM_OPS +}; + +/* Local service netlink message flags */ +#define RDMA_NL_LS_F_ERR 0x0100 /* Failed response */ + +/* + * Local service resolve operation family header. + * The layout for the resolve operation: + * nlmsg header + * family header + * attributes + */ + +/* + * Local service path use: + * Specify how the path(s) will be used. + * ALL - For connected CM operation (6 pathrecords) + * UNIDIRECTIONAL - For unidirectional UD (1 pathrecord) + * GMP - For miscellaneous GMP like operation (at least 1 reversible + * pathrecord) + */ +enum { + LS_RESOLVE_PATH_USE_ALL = 0, + LS_RESOLVE_PATH_USE_UNIDIRECTIONAL, + LS_RESOLVE_PATH_USE_GMP, + LS_RESOLVE_PATH_USE_MAX +}; + +#define LS_DEVICE_NAME_MAX 64 + +struct rdma_ls_resolve_header { + __u8 device_name[LS_DEVICE_NAME_MAX]; + __u8 port_num; + __u8 path_use; +}; + +struct rdma_ls_ip_resolve_header { + __u32 ifindex; +}; + +/* Local service attribute type */ +#define RDMA_NLA_F_MANDATORY (1 << 13) +#define RDMA_NLA_TYPE_MASK (~(NLA_F_NESTED | NLA_F_NET_BYTEORDER | \ + RDMA_NLA_F_MANDATORY)) + +/* + * Local service attributes: + * Attr Name Size Byte order + * ----------------------------------------------------- + * PATH_RECORD struct ib_path_rec_data + * TIMEOUT u32 cpu + * SERVICE_ID u64 cpu + * DGID u8[16] BE + * SGID u8[16] BE + * TCLASS u8 + * PKEY u16 cpu + * QOS_CLASS u16 cpu + * IPV4 u32 BE + * IPV6 u8[16] BE + */ +enum { + LS_NLA_TYPE_UNSPEC = 0, + LS_NLA_TYPE_PATH_RECORD, + LS_NLA_TYPE_TIMEOUT, + LS_NLA_TYPE_SERVICE_ID, + LS_NLA_TYPE_DGID, + LS_NLA_TYPE_SGID, + LS_NLA_TYPE_TCLASS, + LS_NLA_TYPE_PKEY, + LS_NLA_TYPE_QOS_CLASS, + LS_NLA_TYPE_IPV4, + LS_NLA_TYPE_IPV6, + LS_NLA_TYPE_MAX +}; + +/* Local service DGID/SGID attribute: big endian */ +struct rdma_nla_ls_gid { + __u8 gid[16]; +}; + +enum rdma_nldev_command { + RDMA_NLDEV_CMD_UNSPEC, + + RDMA_NLDEV_CMD_GET, /* can dump */ + RDMA_NLDEV_CMD_SET, + + RDMA_NLDEV_CMD_NEWLINK, + + RDMA_NLDEV_CMD_DELLINK, + + RDMA_NLDEV_CMD_PORT_GET, /* can dump */ + + RDMA_NLDEV_CMD_SYS_GET, + RDMA_NLDEV_CMD_SYS_SET, + + /* 8 is free to use */ + + RDMA_NLDEV_CMD_RES_GET = 9, /* can dump */ + + RDMA_NLDEV_CMD_RES_QP_GET, /* can dump */ + + RDMA_NLDEV_CMD_RES_CM_ID_GET, /* can dump */ + + RDMA_NLDEV_CMD_RES_CQ_GET, /* can dump */ + + RDMA_NLDEV_CMD_RES_MR_GET, /* can dump */ + + RDMA_NLDEV_CMD_RES_PD_GET, /* can dump */ + + RDMA_NLDEV_CMD_GET_CHARDEV, + + RDMA_NLDEV_CMD_STAT_SET, + + RDMA_NLDEV_CMD_STAT_GET, /* can dump */ + + RDMA_NLDEV_CMD_STAT_DEL, + + RDMA_NLDEV_NUM_OPS +}; + +enum 
rdma_nldev_print_type { + RDMA_NLDEV_PRINT_TYPE_UNSPEC, + RDMA_NLDEV_PRINT_TYPE_HEX, +}; + +enum rdma_nldev_attr { + /* don't change the order or add anything between, this is ABI! */ + RDMA_NLDEV_ATTR_UNSPEC, + + /* Pad attribute for 64b alignment */ + RDMA_NLDEV_ATTR_PAD = RDMA_NLDEV_ATTR_UNSPEC, + + /* Identifier for ib_device */ + RDMA_NLDEV_ATTR_DEV_INDEX, /* u32 */ + + RDMA_NLDEV_ATTR_DEV_NAME, /* string */ + /* + * Device index together with port index are identifiers + * for port/link properties. + * + * For RDMA_NLDEV_CMD_GET commamnd, port index will return number + * of available ports in ib_device, while for port specific operations, + * it will be real port index as it appears in sysfs. Port index follows + * sysfs notation and starts from 1 for the first port. + */ + RDMA_NLDEV_ATTR_PORT_INDEX, /* u32 */ + + /* + * Device and port capabilities + * + * When used for port info, first 32-bits are CapabilityMask followed by + * 16-bit CapabilityMask2. + */ + RDMA_NLDEV_ATTR_CAP_FLAGS, /* u64 */ + + /* + * FW version + */ + RDMA_NLDEV_ATTR_FW_VERSION, /* string */ + + /* + * Node GUID (in host byte order) associated with the RDMA device. + */ + RDMA_NLDEV_ATTR_NODE_GUID, /* u64 */ + + /* + * System image GUID (in host byte order) associated with + * this RDMA device and other devices which are part of a + * single system. + */ + RDMA_NLDEV_ATTR_SYS_IMAGE_GUID, /* u64 */ + + /* + * Subnet prefix (in host byte order) + */ + RDMA_NLDEV_ATTR_SUBNET_PREFIX, /* u64 */ + + /* + * Local Identifier (LID), + * According to IB specification, It is 16-bit address assigned + * by the Subnet Manager. Extended to be 32-bit for OmniPath users. + */ + RDMA_NLDEV_ATTR_LID, /* u32 */ + RDMA_NLDEV_ATTR_SM_LID, /* u32 */ + + /* + * LID mask control (LMC) + */ + RDMA_NLDEV_ATTR_LMC, /* u8 */ + + RDMA_NLDEV_ATTR_PORT_STATE, /* u8 */ + RDMA_NLDEV_ATTR_PORT_PHYS_STATE, /* u8 */ + + RDMA_NLDEV_ATTR_DEV_NODE_TYPE, /* u8 */ + + RDMA_NLDEV_ATTR_RES_SUMMARY, /* nested table */ + RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY, /* nested table */ + RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME, /* string */ + RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR, /* u64 */ + + RDMA_NLDEV_ATTR_RES_QP, /* nested table */ + RDMA_NLDEV_ATTR_RES_QP_ENTRY, /* nested table */ + /* + * Local QPN + */ + RDMA_NLDEV_ATTR_RES_LQPN, /* u32 */ + /* + * Remote QPN, + * Applicable for RC and UC only IBTA 11.2.5.3 QUERY QUEUE PAIR + */ + RDMA_NLDEV_ATTR_RES_RQPN, /* u32 */ + /* + * Receive Queue PSN, + * Applicable for RC and UC only 11.2.5.3 QUERY QUEUE PAIR + */ + RDMA_NLDEV_ATTR_RES_RQ_PSN, /* u32 */ + /* + * Send Queue PSN + */ + RDMA_NLDEV_ATTR_RES_SQ_PSN, /* u32 */ + RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE, /* u8 */ + /* + * QP types as visible to RDMA/core, the reserved QPT + * are not exported through this interface. + */ + RDMA_NLDEV_ATTR_RES_TYPE, /* u8 */ + RDMA_NLDEV_ATTR_RES_STATE, /* u8 */ + /* + * Process ID which created object, + * in case of kernel origin, PID won't exist. + */ + RDMA_NLDEV_ATTR_RES_PID, /* u32 */ + /* + * The name of process created following resource. + * It will exist only for kernel objects. + * For user created objects, the user is supposed + * to read /proc/PID/comm file. + */ + RDMA_NLDEV_ATTR_RES_KERN_NAME, /* string */ + + RDMA_NLDEV_ATTR_RES_CM_ID, /* nested table */ + RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY, /* nested table */ + /* + * rdma_cm_id port space. 
+	 */
+	RDMA_NLDEV_ATTR_RES_PS,			/* u32 */
+	/*
+	 * Source and destination socket addresses
+	 */
+	RDMA_NLDEV_ATTR_RES_SRC_ADDR,		/* __kernel_sockaddr_storage */
+	RDMA_NLDEV_ATTR_RES_DST_ADDR,		/* __kernel_sockaddr_storage */
+
+	RDMA_NLDEV_ATTR_RES_CQ,			/* nested table */
+	RDMA_NLDEV_ATTR_RES_CQ_ENTRY,		/* nested table */
+	RDMA_NLDEV_ATTR_RES_CQE,		/* u32 */
+	RDMA_NLDEV_ATTR_RES_USECNT,		/* u64 */
+	RDMA_NLDEV_ATTR_RES_POLL_CTX,		/* u8 */
+
+	RDMA_NLDEV_ATTR_RES_MR,			/* nested table */
+	RDMA_NLDEV_ATTR_RES_MR_ENTRY,		/* nested table */
+	RDMA_NLDEV_ATTR_RES_RKEY,		/* u32 */
+	RDMA_NLDEV_ATTR_RES_LKEY,		/* u32 */
+	RDMA_NLDEV_ATTR_RES_IOVA,		/* u64 */
+	RDMA_NLDEV_ATTR_RES_MRLEN,		/* u64 */
+
+	RDMA_NLDEV_ATTR_RES_PD,			/* nested table */
+	RDMA_NLDEV_ATTR_RES_PD_ENTRY,		/* nested table */
+	RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY,	/* u32 */
+	RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY,	/* u32 */
+	/*
+	 * Provides the logical name and index of the netdevice which is
+	 * connected to the physical port. This information is relevant
+	 * for RoCE and iWARP.
+	 *
+	 * The netdevices which are associated with containers are
+	 * supposed to be exported together with the GID table once it
+	 * is exposed through netlink, because the associated
+	 * netdevices are properties of GIDs.
+	 */
+	RDMA_NLDEV_ATTR_NDEV_INDEX,		/* u32 */
+	RDMA_NLDEV_ATTR_NDEV_NAME,		/* string */
+	/*
+	 * Driver-specific attributes.
+	 */
+	RDMA_NLDEV_ATTR_DRIVER,			/* nested table */
+	RDMA_NLDEV_ATTR_DRIVER_ENTRY,		/* nested table */
+	RDMA_NLDEV_ATTR_DRIVER_STRING,		/* string */
+	/*
+	 * u8 values from enum rdma_nldev_print_type
+	 */
+	RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE,	/* u8 */
+	RDMA_NLDEV_ATTR_DRIVER_S32,		/* s32 */
+	RDMA_NLDEV_ATTR_DRIVER_U32,		/* u32 */
+	RDMA_NLDEV_ATTR_DRIVER_S64,		/* s64 */
+	RDMA_NLDEV_ATTR_DRIVER_U64,		/* u64 */
+
+	/*
+	 * Indexes to get/set a specific entry;
+	 * for QP use RDMA_NLDEV_ATTR_RES_LQPN
+	 */
+	RDMA_NLDEV_ATTR_RES_PDN,		/* u32 */
+	RDMA_NLDEV_ATTR_RES_CQN,		/* u32 */
+	RDMA_NLDEV_ATTR_RES_MRN,		/* u32 */
+	RDMA_NLDEV_ATTR_RES_CM_IDN,		/* u32 */
+	RDMA_NLDEV_ATTR_RES_CTXN,		/* u32 */
+	/*
+	 * Identifies the rdma driver, e.g. "rxe" or "siw"
+	 */
+	RDMA_NLDEV_ATTR_LINK_TYPE,		/* string */
+
+	/*
+	 * net namespace mode for the rdma subsystem:
+	 * either shared or exclusive among multiple net namespaces.
+	 */
+	RDMA_NLDEV_SYS_ATTR_NETNS_MODE,	/* u8 */
+	/*
+	 * Device protocol, e.g. ib, iw, usnic, roce and opa
+	 */
+	RDMA_NLDEV_ATTR_DEV_PROTOCOL,		/* string */
+
+	/*
+	 * File descriptor handle of the net namespace object
+	 */
+	RDMA_NLDEV_NET_NS_FD,			/* u32 */
+	/*
+	 * Information about a chardev.
+	 * CHARDEV_TYPE is the name of the chardev ABI (e.g. uverbs, umad, etc.)
+	 * CHARDEV_ABI signals the ABI revision (historical)
+	 * CHARDEV_NAME is the kernel name for the /dev/ file (no directory)
+	 * CHARDEV is the 64 bit dev_t for the inode
+	 */
+	RDMA_NLDEV_ATTR_CHARDEV_TYPE,		/* string */
+	RDMA_NLDEV_ATTR_CHARDEV_NAME,		/* string */
+	RDMA_NLDEV_ATTR_CHARDEV_ABI,		/* u64 */
+	RDMA_NLDEV_ATTR_CHARDEV,		/* u64 */
+	RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID,	/* u64 */
+	/*
+	 * Counter-specific attributes.
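+	 * These are carried by the RDMA_NLDEV_CMD_STAT_SET/GET/DEL
+	 * commands declared above.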
+	 */
+	RDMA_NLDEV_ATTR_STAT_MODE,		/* u32 */
+	RDMA_NLDEV_ATTR_STAT_RES,		/* u32 */
+	RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK,	/* u32 */
+	RDMA_NLDEV_ATTR_STAT_COUNTER,		/* nested table */
+	RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY,	/* nested table */
+	RDMA_NLDEV_ATTR_STAT_COUNTER_ID,	/* u32 */
+	RDMA_NLDEV_ATTR_STAT_HWCOUNTERS,	/* nested table */
+	RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY,	/* nested table */
+	RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME,	/* string */
+	RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE,	/* u64 */
+
+	/*
+	 * CQ adaptive moderation (DIM)
+	 */
+	RDMA_NLDEV_ATTR_DEV_DIM,	/* u8 */
+
+	/*
+	 * Always the end
+	 */
+	RDMA_NLDEV_ATTR_MAX
+};
+
+/*
+ * Supported counter bind modes. All modes are mutually exclusive.
+ */
+enum rdma_nl_counter_mode {
+	RDMA_COUNTER_MODE_NONE,
+
+	/*
+	 * A qp is bound to a counter automatically during initialization
+	 * based on the auto mode (e.g., qp type, ...)
+	 */
+	RDMA_COUNTER_MODE_AUTO,
+
+	/*
+	 * Which qp is bound to which counter is explicitly specified
+	 * by the user
+	 */
+	RDMA_COUNTER_MODE_MANUAL,
+
+	/*
+	 * Always the end
+	 */
+	RDMA_COUNTER_MODE_MAX,
+};
+
+/*
+ * Supported criteria in counter auto mode.
+ * Currently only "qp type" is supported.
+ */
+enum rdma_nl_counter_mask {
+	RDMA_COUNTER_MASK_QP_TYPE = 1,
+};
+#endif /* _UAPI_RDMA_NETLINK_H */
diff --git a/kernel-headers/rdma/rdma_user_cm.h b/kernel-headers/rdma/rdma_user_cm.h
new file mode 100644
index 0000000..e42940a
--- /dev/null
+++ b/kernel-headers/rdma/rdma_user_cm.h
@@ -0,0 +1,328 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */
+/*
+ * Copyright (c) 2005-2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ + +#ifndef RDMA_USER_CM_H +#define RDMA_USER_CM_H + +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/in6.h> +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_user_sa.h> + +#define RDMA_USER_CM_ABI_VERSION 4 + +#define RDMA_MAX_PRIVATE_DATA 256 + +enum { + RDMA_USER_CM_CMD_CREATE_ID, + RDMA_USER_CM_CMD_DESTROY_ID, + RDMA_USER_CM_CMD_BIND_IP, + RDMA_USER_CM_CMD_RESOLVE_IP, + RDMA_USER_CM_CMD_RESOLVE_ROUTE, + RDMA_USER_CM_CMD_QUERY_ROUTE, + RDMA_USER_CM_CMD_CONNECT, + RDMA_USER_CM_CMD_LISTEN, + RDMA_USER_CM_CMD_ACCEPT, + RDMA_USER_CM_CMD_REJECT, + RDMA_USER_CM_CMD_DISCONNECT, + RDMA_USER_CM_CMD_INIT_QP_ATTR, + RDMA_USER_CM_CMD_GET_EVENT, + RDMA_USER_CM_CMD_GET_OPTION, + RDMA_USER_CM_CMD_SET_OPTION, + RDMA_USER_CM_CMD_NOTIFY, + RDMA_USER_CM_CMD_JOIN_IP_MCAST, + RDMA_USER_CM_CMD_LEAVE_MCAST, + RDMA_USER_CM_CMD_MIGRATE_ID, + RDMA_USER_CM_CMD_QUERY, + RDMA_USER_CM_CMD_BIND, + RDMA_USER_CM_CMD_RESOLVE_ADDR, + RDMA_USER_CM_CMD_JOIN_MCAST +}; + +/* See IBTA Annex A11, servies ID bytes 4 & 5 */ +enum rdma_ucm_port_space { + RDMA_PS_IPOIB = 0x0002, + RDMA_PS_IB = 0x013F, + RDMA_PS_TCP = 0x0106, + RDMA_PS_UDP = 0x0111, +}; + +/* + * command ABI structures. + */ +struct rdma_ucm_cmd_hdr { + __u32 cmd; + __u16 in; + __u16 out; +}; + +struct rdma_ucm_create_id { + __aligned_u64 uid; + __aligned_u64 response; + __u16 ps; /* use enum rdma_ucm_port_space */ + __u8 qp_type; + __u8 reserved[5]; +}; + +struct rdma_ucm_create_id_resp { + __u32 id; +}; + +struct rdma_ucm_destroy_id { + __aligned_u64 response; + __u32 id; + __u32 reserved; +}; + +struct rdma_ucm_destroy_id_resp { + __u32 events_reported; +}; + +struct rdma_ucm_bind_ip { + __aligned_u64 response; + struct sockaddr_in6 addr; + __u32 id; +}; + +struct rdma_ucm_bind { + __u32 id; + __u16 addr_size; + __u16 reserved; + struct __kernel_sockaddr_storage addr; +}; + +struct rdma_ucm_resolve_ip { + struct sockaddr_in6 src_addr; + struct sockaddr_in6 dst_addr; + __u32 id; + __u32 timeout_ms; +}; + +struct rdma_ucm_resolve_addr { + __u32 id; + __u32 timeout_ms; + __u16 src_size; + __u16 dst_size; + __u32 reserved; + struct __kernel_sockaddr_storage src_addr; + struct __kernel_sockaddr_storage dst_addr; +}; + +struct rdma_ucm_resolve_route { + __u32 id; + __u32 timeout_ms; +}; + +enum { + RDMA_USER_CM_QUERY_ADDR, + RDMA_USER_CM_QUERY_PATH, + RDMA_USER_CM_QUERY_GID +}; + +struct rdma_ucm_query { + __aligned_u64 response; + __u32 id; + __u32 option; +}; + +struct rdma_ucm_query_route_resp { + __aligned_u64 node_guid; + struct ib_user_path_rec ib_route[2]; + struct sockaddr_in6 src_addr; + struct sockaddr_in6 dst_addr; + __u32 num_paths; + __u8 port_num; + __u8 reserved[3]; +}; + +struct rdma_ucm_query_addr_resp { + __aligned_u64 node_guid; + __u8 port_num; + __u8 reserved; + __u16 pkey; + __u16 src_size; + __u16 dst_size; + struct __kernel_sockaddr_storage src_addr; + struct __kernel_sockaddr_storage dst_addr; +}; + +struct rdma_ucm_query_path_resp { + __u32 num_paths; + __u32 reserved; + struct ib_path_rec_data path_data[0]; +}; + +struct rdma_ucm_conn_param { + __u32 qp_num; + __u32 qkey; + __u8 private_data[RDMA_MAX_PRIVATE_DATA]; + __u8 private_data_len; + __u8 srq; + __u8 responder_resources; + __u8 initiator_depth; + __u8 flow_control; + __u8 retry_count; + __u8 rnr_retry_count; + __u8 valid; +}; + +struct rdma_ucm_ud_param { + __u32 qp_num; + __u32 qkey; + struct ib_uverbs_ah_attr ah_attr; + __u8 private_data[RDMA_MAX_PRIVATE_DATA]; + __u8 private_data_len; + __u8 reserved[7]; +}; + +struct rdma_ucm_connect { + struct 
rdma_ucm_conn_param conn_param; + __u32 id; + __u32 reserved; +}; + +struct rdma_ucm_listen { + __u32 id; + __u32 backlog; +}; + +struct rdma_ucm_accept { + __aligned_u64 uid; + struct rdma_ucm_conn_param conn_param; + __u32 id; + __u32 reserved; +}; + +struct rdma_ucm_reject { + __u32 id; + __u8 private_data_len; + __u8 reserved[3]; + __u8 private_data[RDMA_MAX_PRIVATE_DATA]; +}; + +struct rdma_ucm_disconnect { + __u32 id; +}; + +struct rdma_ucm_init_qp_attr { + __aligned_u64 response; + __u32 id; + __u32 qp_state; +}; + +struct rdma_ucm_notify { + __u32 id; + __u32 event; +}; + +struct rdma_ucm_join_ip_mcast { + __aligned_u64 response; /* rdma_ucm_create_id_resp */ + __aligned_u64 uid; + struct sockaddr_in6 addr; + __u32 id; +}; + +/* Multicast join flags */ +enum { + RDMA_MC_JOIN_FLAG_FULLMEMBER, + RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER, + RDMA_MC_JOIN_FLAG_RESERVED, +}; + +struct rdma_ucm_join_mcast { + __aligned_u64 response; /* rdma_ucma_create_id_resp */ + __aligned_u64 uid; + __u32 id; + __u16 addr_size; + __u16 join_flags; + struct __kernel_sockaddr_storage addr; +}; + +struct rdma_ucm_get_event { + __aligned_u64 response; +}; + +struct rdma_ucm_event_resp { + __aligned_u64 uid; + __u32 id; + __u32 event; + __u32 status; + /* + * NOTE: This union is not aligned to 8 bytes so none of the union + * members may contain a u64 or anything with higher alignment than 4. + */ + union { + struct rdma_ucm_conn_param conn; + struct rdma_ucm_ud_param ud; + } param; + __u32 reserved; +}; + +/* Option levels */ +enum { + RDMA_OPTION_ID = 0, + RDMA_OPTION_IB = 1 +}; + +/* Option details */ +enum { + RDMA_OPTION_ID_TOS = 0, + RDMA_OPTION_ID_REUSEADDR = 1, + RDMA_OPTION_ID_AFONLY = 2, + RDMA_OPTION_ID_ACK_TIMEOUT = 3 +}; + +enum { + RDMA_OPTION_IB_PATH = 1 +}; + +struct rdma_ucm_set_option { + __aligned_u64 optval; + __u32 id; + __u32 level; + __u32 optname; + __u32 optlen; +}; + +struct rdma_ucm_migrate_id { + __aligned_u64 response; + __u32 id; + __u32 fd; +}; + +struct rdma_ucm_migrate_resp { + __u32 events_reported; +}; + +#endif /* RDMA_USER_CM_H */ diff --git a/kernel-headers/rdma/rdma_user_ioctl.h b/kernel-headers/rdma/rdma_user_ioctl.h new file mode 100644 index 0000000..d92d272 --- /dev/null +++ b/kernel-headers/rdma/rdma_user_ioctl.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* + * Copyright (c) 2016 Mellanox Technologies, LTD. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RDMA_USER_IOCTL_H
+#define RDMA_USER_IOCTL_H
+
+#include <rdma/ib_user_mad.h>
+#include <rdma/hfi/hfi1_ioctl.h>
+#include <rdma/rdma_user_ioctl_cmds.h>
+
+/* Legacy name, for user space applications which already use it */
+#define IB_IOCTL_MAGIC		RDMA_IOCTL_MAGIC
+
+/*
+ * General block assignments.
+ * This is closed on purpose; do not expose it to user space.
+ * #define MAD_CMD_BASE	0x00
+ * #define HFI1_CMD_BASE	0xE0
+ */
+
+/* MAD specific section */
+#define IB_USER_MAD_REGISTER_AGENT	_IOWR(RDMA_IOCTL_MAGIC, 0x01, struct ib_user_mad_reg_req)
+#define IB_USER_MAD_UNREGISTER_AGENT	_IOW(RDMA_IOCTL_MAGIC, 0x02, __u32)
+#define IB_USER_MAD_ENABLE_PKEY	_IO(RDMA_IOCTL_MAGIC, 0x03)
+#define IB_USER_MAD_REGISTER_AGENT2	_IOWR(RDMA_IOCTL_MAGIC, 0x04, struct ib_user_mad_reg_req2)
+
+/* HFI specific section */
+/* allocate HFI and context */
+#define HFI1_IOCTL_ASSIGN_CTXT	_IOWR(RDMA_IOCTL_MAGIC, 0xE1, struct hfi1_user_info)
+/* find out what resources we got */
+#define HFI1_IOCTL_CTXT_INFO	_IOW(RDMA_IOCTL_MAGIC, 0xE2, struct hfi1_ctxt_info)
+/* set up userspace */
+#define HFI1_IOCTL_USER_INFO	_IOW(RDMA_IOCTL_MAGIC, 0xE3, struct hfi1_base_info)
+/* update expected TID entries */
+#define HFI1_IOCTL_TID_UPDATE	_IOWR(RDMA_IOCTL_MAGIC, 0xE4, struct hfi1_tid_info)
+/* free expected TID entries */
+#define HFI1_IOCTL_TID_FREE	_IOWR(RDMA_IOCTL_MAGIC, 0xE5, struct hfi1_tid_info)
+/* force an update of PIO credit */
+#define HFI1_IOCTL_CREDIT_UPD	_IO(RDMA_IOCTL_MAGIC, 0xE6)
+/* control receipt of packets */
+#define HFI1_IOCTL_RECV_CTRL	_IOW(RDMA_IOCTL_MAGIC, 0xE8, int)
+/* set the kind of polling we want */
+#define HFI1_IOCTL_POLL_TYPE	_IOW(RDMA_IOCTL_MAGIC, 0xE9, int)
+/* ack & clear user status bits */
+#define HFI1_IOCTL_ACK_EVENT	_IOW(RDMA_IOCTL_MAGIC, 0xEA, unsigned long)
+/* set context's pkey */
+#define HFI1_IOCTL_SET_PKEY	_IOW(RDMA_IOCTL_MAGIC, 0xEB, __u16)
+/* reset context's HW send context */
+#define HFI1_IOCTL_CTXT_RESET	_IO(RDMA_IOCTL_MAGIC, 0xEC)
+/* read TID cache invalidations */
+#define HFI1_IOCTL_TID_INVAL_READ _IOWR(RDMA_IOCTL_MAGIC, 0xED, struct hfi1_tid_info)
+/* get the version of the user cdev */
+#define HFI1_IOCTL_GET_VERS	_IOR(RDMA_IOCTL_MAGIC, 0xEE, int)
+
+#endif /* RDMA_USER_IOCTL_H */
diff --git a/kernel-headers/rdma/rdma_user_ioctl_cmds.h b/kernel-headers/rdma/rdma_user_ioctl_cmds.h
new file mode 100644
index 0000000..7b1ec80
--- /dev/null
+++ b/kernel-headers/rdma/rdma_user_ioctl_cmds.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RDMA_USER_IOCTL_CMDS_H +#define RDMA_USER_IOCTL_CMDS_H + +#include <linux/types.h> +#include <linux/ioctl.h> + +/* Documentation/ioctl/ioctl-number.rst */ +#define RDMA_IOCTL_MAGIC 0x1b +#define RDMA_VERBS_IOCTL \ + _IOWR(RDMA_IOCTL_MAGIC, 1, struct ib_uverbs_ioctl_hdr) + +enum { + /* User input */ + UVERBS_ATTR_F_MANDATORY = 1U << 0, + /* + * Valid output bit should be ignored and considered set in + * mandatory fields. This bit is kernel output. + */ + UVERBS_ATTR_F_VALID_OUTPUT = 1U << 1, +}; + +struct ib_uverbs_attr { + __u16 attr_id; /* command specific type attribute */ + __u16 len; /* only for pointers and IDRs array */ + __u16 flags; /* combination of UVERBS_ATTR_F_XXXX */ + union { + struct { + __u8 elem_id; + __u8 reserved; + } enum_data; + __u16 reserved; + } attr_data; + union { + /* + * ptr to command, inline data, idr/fd or + * ptr to __u32 array of IDRs + */ + __aligned_u64 data; + /* Used by FD_IN and FD_OUT */ + __s64 data_s64; + }; +}; + +struct ib_uverbs_ioctl_hdr { + __u16 length; + __u16 object_id; + __u16 method_id; + __u16 num_attrs; + __aligned_u64 reserved1; + __u32 driver_id; + __u32 reserved2; + struct ib_uverbs_attr attrs[0]; +}; + +#endif diff --git a/kernel-headers/rdma/rdma_user_rxe.h b/kernel-headers/rdma/rdma_user_rxe.h new file mode 100644 index 0000000..aae2e69 --- /dev/null +++ b/kernel-headers/rdma/rdma_user_rxe.h @@ -0,0 +1,178 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RDMA_USER_RXE_H +#define RDMA_USER_RXE_H + +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/in6.h> + +union rxe_gid { + __u8 raw[16]; + struct { + __be64 subnet_prefix; + __be64 interface_id; + } global; +}; + +struct rxe_global_route { + union rxe_gid dgid; + __u32 flow_label; + __u8 sgid_index; + __u8 hop_limit; + __u8 traffic_class; +}; + +struct rxe_av { + __u8 port_num; + __u8 network_type; + __u8 dmac[6]; + struct rxe_global_route grh; + union { + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid_addr, dgid_addr; +}; + +struct rxe_send_wr { + __aligned_u64 wr_id; + __u32 num_sge; + __u32 opcode; + __u32 send_flags; + union { + __be32 imm_data; + __u32 invalidate_rkey; + } ex; + union { + struct { + __aligned_u64 remote_addr; + __u32 rkey; + __u32 reserved; + } rdma; + struct { + __aligned_u64 remote_addr; + __aligned_u64 compare_add; + __aligned_u64 swap; + __u32 rkey; + __u32 reserved; + } atomic; + struct { + __u32 remote_qpn; + __u32 remote_qkey; + __u16 pkey_index; + } ud; + /* reg is only used by the kernel and is not part of the uapi */ + struct { + union { + struct ib_mr *mr; + __aligned_u64 reserved; + }; + __u32 key; + __u32 access; + } reg; + } wr; +}; + +struct rxe_sge { + __aligned_u64 addr; + __u32 length; + __u32 lkey; +}; + +struct mminfo { + __aligned_u64 offset; + __u32 size; + __u32 pad; +}; + +struct rxe_dma_info { + __u32 length; + __u32 resid; + __u32 cur_sge; + __u32 num_sge; + __u32 sge_offset; + __u32 reserved; + union { + __u8 inline_data[0]; + struct rxe_sge sge[0]; + }; +}; + +struct rxe_send_wqe { + struct rxe_send_wr wr; + struct rxe_av av; + __u32 status; + __u32 state; + __aligned_u64 iova; + __u32 mask; + __u32 first_psn; + __u32 last_psn; + __u32 ack_length; + __u32 ssn; + __u32 has_rd_atomic; + struct rxe_dma_info dma; +}; + +struct rxe_recv_wqe { + __aligned_u64 wr_id; + __u32 num_sge; + __u32 padding; + struct rxe_dma_info dma; +}; + +struct rxe_create_cq_resp { + struct mminfo mi; +}; + +struct rxe_resize_cq_resp { + struct mminfo mi; +}; + +struct rxe_create_qp_resp { + struct mminfo rq_mi; + struct mminfo sq_mi; +}; + +struct rxe_create_srq_resp { + struct mminfo mi; + __u32 srq_num; + __u32 reserved; +}; + +struct rxe_modify_srq_cmd { + __aligned_u64 mmap_info_addr; +}; + +#endif /* RDMA_USER_RXE_H */ diff --git a/kernel-headers/rdma/rvt-abi.h b/kernel-headers/rdma/rvt-abi.h new file mode 100644 index 0000000..7c05a02 --- /dev/null +++ b/kernel-headers/rdma/rvt-abi.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ + +/* + * This file contains defines, structures, etc. that are used + * to communicate between kernel and user code. + */ + +#ifndef RVT_ABI_USER_H +#define RVT_ABI_USER_H + +#include <linux/types.h> +#include <rdma/ib_user_verbs.h> +#ifndef RDMA_ATOMIC_UAPI +#define RDMA_ATOMIC_UAPI(_type, _name) struct{ _type val; } _name +#endif + +struct rvt_wqe_sge { + __aligned_u64 addr; + __u32 length; + __u32 lkey; +}; + +/* + * This structure is used to contain the head pointer, tail pointer, + * and completion queue entries as a single memory allocation so + * it can be mmap'ed into user space. 
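+ * The head and tail indices below are declared through RDMA_ATOMIC_UAPI
+ * (defined above) because producer and consumer update them concurrently
+ * through the shared mapping.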
+ */ +struct rvt_cq_wc { + /* index of next entry to fill */ + RDMA_ATOMIC_UAPI(__u32, head); + /* index of next ib_poll_cq() entry */ + RDMA_ATOMIC_UAPI(__u32, tail); + + /* these are actually size ibcq.cqe + 1 */ + struct ib_uverbs_wc uqueue[]; +}; + +/* + * Receive work request queue entry. + * The size of the sg_list is determined when the QP (or SRQ) is created + * and stored in qp->r_rq.max_sge (or srq->rq.max_sge). + */ +struct rvt_rwqe { + __u64 wr_id; + __u8 num_sge; + __u8 padding[7]; + struct rvt_wqe_sge sg_list[]; +}; + +/* + * This structure is used to contain the head pointer, tail pointer, + * and receive work queue entries as a single memory allocation so + * it can be mmap'ed into user space. + * Note that the wq array elements are variable size so you can't + * just index into the array to get the N'th element; + * use get_rwqe_ptr() for user space and rvt_get_rwqe_ptr() + * for kernel space. + */ +struct rvt_rwq { + /* new work requests posted to the head */ + RDMA_ATOMIC_UAPI(__u32, head); + /* receives pull requests from here. */ + RDMA_ATOMIC_UAPI(__u32, tail); + struct rvt_rwqe wq[]; +}; +#endif /* RVT_ABI_USER_H */ diff --git a/kernel-headers/rdma/siw-abi.h b/kernel-headers/rdma/siw-abi.h new file mode 100644 index 0000000..af735f5 --- /dev/null +++ b/kernel-headers/rdma/siw-abi.h @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) or BSD-3-Clause */ + +/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#ifndef _SIW_USER_H +#define _SIW_USER_H + +#include <linux/types.h> + +#define SIW_NODE_DESC_COMMON "Software iWARP stack" +#define SIW_ABI_VERSION 1 +#define SIW_MAX_SGE 6 +#define SIW_UOBJ_MAX_KEY 0x08FFFF +#define SIW_INVAL_UOBJ_KEY (SIW_UOBJ_MAX_KEY + 1) + +struct siw_uresp_create_cq { + __u32 cq_id; + __u32 num_cqe; + __aligned_u64 cq_key; +}; + +struct siw_uresp_create_qp { + __u32 qp_id; + __u32 num_sqe; + __u32 num_rqe; + __u32 pad; + __aligned_u64 sq_key; + __aligned_u64 rq_key; +}; + +struct siw_ureq_reg_mr { + __u8 stag_key; + __u8 reserved[3]; + __u32 pad; +}; + +struct siw_uresp_reg_mr { + __u32 stag; + __u32 pad; +}; + +struct siw_uresp_create_srq { + __u32 num_rqe; + __u32 pad; + __aligned_u64 srq_key; +}; + +struct siw_uresp_alloc_ctx { + __u32 dev_id; + __u32 pad; +}; + +enum siw_opcode { + SIW_OP_WRITE, + SIW_OP_READ, + SIW_OP_READ_LOCAL_INV, + SIW_OP_SEND, + SIW_OP_SEND_WITH_IMM, + SIW_OP_SEND_REMOTE_INV, + + /* Unsupported */ + SIW_OP_FETCH_AND_ADD, + SIW_OP_COMP_AND_SWAP, + + SIW_OP_RECEIVE, + /* provider internal SQE */ + SIW_OP_READ_RESPONSE, + /* + * below opcodes valid for + * in-kernel clients only + */ + SIW_OP_INVAL_STAG, + SIW_OP_REG_MR, + SIW_NUM_OPCODES +}; + +/* Keep it same as ibv_sge to allow for memcpy */ +struct siw_sge { + __aligned_u64 laddr; + __u32 length; + __u32 lkey; +}; + +/* + * Inline data are kept within the work request itself occupying + * the space of sge[1] .. sge[n]. Therefore, inline data cannot be + * supported if SIW_MAX_SGE is below 2 elements. 
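+ * With the values above (SIW_MAX_SGE == 6 and a 16-byte struct siw_sge),
+ * SIW_MAX_INLINE below evaluates to 5 * 16 == 80 bytes.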
+ */ +#define SIW_MAX_INLINE (sizeof(struct siw_sge) * (SIW_MAX_SGE - 1)) + +#if SIW_MAX_SGE < 2 +#error "SIW_MAX_SGE must be at least 2" +#endif + +enum siw_wqe_flags { + SIW_WQE_VALID = 1, + SIW_WQE_INLINE = (1 << 1), + SIW_WQE_SIGNALLED = (1 << 2), + SIW_WQE_SOLICITED = (1 << 3), + SIW_WQE_READ_FENCE = (1 << 4), + SIW_WQE_REM_INVAL = (1 << 5), + SIW_WQE_COMPLETED = (1 << 6) +}; + +/* Send Queue Element */ +struct siw_sqe { + __aligned_u64 id; + __u16 flags; + __u8 num_sge; + /* Contains enum siw_opcode values */ + __u8 opcode; + __u32 rkey; + union { + __aligned_u64 raddr; + __aligned_u64 base_mr; + }; + union { + struct siw_sge sge[SIW_MAX_SGE]; + __aligned_u64 access; + }; +}; + +/* Receive Queue Element */ +struct siw_rqe { + __aligned_u64 id; + __u16 flags; + __u8 num_sge; + /* + * only used by kernel driver, + * ignored if set by user + */ + __u8 opcode; + __u32 unused; + struct siw_sge sge[SIW_MAX_SGE]; +}; + +enum siw_notify_flags { + SIW_NOTIFY_NOT = (0), + SIW_NOTIFY_SOLICITED = (1 << 0), + SIW_NOTIFY_NEXT_COMPLETION = (1 << 1), + SIW_NOTIFY_MISSED_EVENTS = (1 << 2), + SIW_NOTIFY_ALL = SIW_NOTIFY_SOLICITED | SIW_NOTIFY_NEXT_COMPLETION | + SIW_NOTIFY_MISSED_EVENTS +}; + +enum siw_wc_status { + SIW_WC_SUCCESS, + SIW_WC_LOC_LEN_ERR, + SIW_WC_LOC_PROT_ERR, + SIW_WC_LOC_QP_OP_ERR, + SIW_WC_WR_FLUSH_ERR, + SIW_WC_BAD_RESP_ERR, + SIW_WC_LOC_ACCESS_ERR, + SIW_WC_REM_ACCESS_ERR, + SIW_WC_REM_INV_REQ_ERR, + SIW_WC_GENERAL_ERR, + SIW_NUM_WC_STATUS +}; + +struct siw_cqe { + __aligned_u64 id; + __u8 flags; + __u8 opcode; + __u16 status; + __u32 bytes; + union { + __aligned_u64 imm_data; + __u32 inval_stag; + }; + /* QP number or QP pointer */ + union { + struct ib_qp *base_qp; + __aligned_u64 qp_id; + }; +}; + +/* + * Shared structure between user and kernel + * to control CQ arming. + */ +struct siw_cq_ctrl { + __u32 flags; + __u32 pad; +}; +#endif diff --git a/kernel-headers/rdma/vmw_pvrdma-abi.h b/kernel-headers/rdma/vmw_pvrdma-abi.h new file mode 100644 index 0000000..f8b638c --- /dev/null +++ b/kernel-headers/rdma/vmw_pvrdma-abi.h @@ -0,0 +1,303 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ +/* + * Copyright (c) 2012-2016 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of EITHER the GNU General Public License + * version 2 as published by the Free Software Foundation or the BSD + * 2-Clause License. This program is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License version 2 for more details at + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html. + * + * You should have received a copy of the GNU General Public License + * along with this program available in the file COPYING in the main + * directory of this source tree. + * + * The BSD 2-Clause License + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __VMW_PVRDMA_ABI_H__ +#define __VMW_PVRDMA_ABI_H__ + +#include <linux/types.h> + +#define PVRDMA_UVERBS_ABI_VERSION 3 /* ABI Version. */ +#define PVRDMA_UAR_HANDLE_MASK 0x00FFFFFF /* Bottom 24 bits. */ +#define PVRDMA_UAR_QP_OFFSET 0 /* QP doorbell. */ +#define PVRDMA_UAR_QP_SEND (1 << 30) /* Send bit. */ +#define PVRDMA_UAR_QP_RECV (1 << 31) /* Recv bit. */ +#define PVRDMA_UAR_CQ_OFFSET 4 /* CQ doorbell. */ +#define PVRDMA_UAR_CQ_ARM_SOL (1 << 29) /* Arm solicited bit. */ +#define PVRDMA_UAR_CQ_ARM (1 << 30) /* Arm bit. */ +#define PVRDMA_UAR_CQ_POLL (1 << 31) /* Poll bit. */ +#define PVRDMA_UAR_SRQ_OFFSET 8 /* SRQ doorbell. */ +#define PVRDMA_UAR_SRQ_RECV (1 << 30) /* Recv bit. */ + +enum pvrdma_wr_opcode { + PVRDMA_WR_RDMA_WRITE, + PVRDMA_WR_RDMA_WRITE_WITH_IMM, + PVRDMA_WR_SEND, + PVRDMA_WR_SEND_WITH_IMM, + PVRDMA_WR_RDMA_READ, + PVRDMA_WR_ATOMIC_CMP_AND_SWP, + PVRDMA_WR_ATOMIC_FETCH_AND_ADD, + PVRDMA_WR_LSO, + PVRDMA_WR_SEND_WITH_INV, + PVRDMA_WR_RDMA_READ_WITH_INV, + PVRDMA_WR_LOCAL_INV, + PVRDMA_WR_FAST_REG_MR, + PVRDMA_WR_MASKED_ATOMIC_CMP_AND_SWP, + PVRDMA_WR_MASKED_ATOMIC_FETCH_AND_ADD, + PVRDMA_WR_BIND_MW, + PVRDMA_WR_REG_SIG_MR, + PVRDMA_WR_ERROR, +}; + +enum pvrdma_wc_status { + PVRDMA_WC_SUCCESS, + PVRDMA_WC_LOC_LEN_ERR, + PVRDMA_WC_LOC_QP_OP_ERR, + PVRDMA_WC_LOC_EEC_OP_ERR, + PVRDMA_WC_LOC_PROT_ERR, + PVRDMA_WC_WR_FLUSH_ERR, + PVRDMA_WC_MW_BIND_ERR, + PVRDMA_WC_BAD_RESP_ERR, + PVRDMA_WC_LOC_ACCESS_ERR, + PVRDMA_WC_REM_INV_REQ_ERR, + PVRDMA_WC_REM_ACCESS_ERR, + PVRDMA_WC_REM_OP_ERR, + PVRDMA_WC_RETRY_EXC_ERR, + PVRDMA_WC_RNR_RETRY_EXC_ERR, + PVRDMA_WC_LOC_RDD_VIOL_ERR, + PVRDMA_WC_REM_INV_RD_REQ_ERR, + PVRDMA_WC_REM_ABORT_ERR, + PVRDMA_WC_INV_EECN_ERR, + PVRDMA_WC_INV_EEC_STATE_ERR, + PVRDMA_WC_FATAL_ERR, + PVRDMA_WC_RESP_TIMEOUT_ERR, + PVRDMA_WC_GENERAL_ERR, +}; + +enum pvrdma_wc_opcode { + PVRDMA_WC_SEND, + PVRDMA_WC_RDMA_WRITE, + PVRDMA_WC_RDMA_READ, + PVRDMA_WC_COMP_SWAP, + PVRDMA_WC_FETCH_ADD, + PVRDMA_WC_BIND_MW, + PVRDMA_WC_LSO, + PVRDMA_WC_LOCAL_INV, + PVRDMA_WC_FAST_REG_MR, + PVRDMA_WC_MASKED_COMP_SWAP, + PVRDMA_WC_MASKED_FETCH_ADD, + PVRDMA_WC_RECV = 1 << 7, + PVRDMA_WC_RECV_RDMA_WITH_IMM, +}; + +enum pvrdma_wc_flags { + PVRDMA_WC_GRH = 1 << 0, + PVRDMA_WC_WITH_IMM = 1 << 1, + PVRDMA_WC_WITH_INVALIDATE = 1 << 2, + PVRDMA_WC_IP_CSUM_OK = 1 << 3, + PVRDMA_WC_WITH_SMAC = 1 << 4, + PVRDMA_WC_WITH_VLAN = 1 << 5, + PVRDMA_WC_WITH_NETWORK_HDR_TYPE = 1 << 6, + PVRDMA_WC_FLAGS_MAX = PVRDMA_WC_WITH_NETWORK_HDR_TYPE, +}; + +struct pvrdma_alloc_ucontext_resp { + __u32 qp_tab_size; + __u32 reserved; +}; + +struct pvrdma_alloc_pd_resp { + __u32 pdn; + __u32 reserved; +}; + +struct pvrdma_create_cq { + __aligned_u64 buf_addr; + __u32 buf_size; + __u32 reserved; +}; + 
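+/*
+ * Illustrative sketch (not part of the upstream header): a userspace
+ * provider would fill the command structure above when creating a CQ,
+ * for example
+ *
+ *	struct pvrdma_create_cq cmd = {
+ *		.buf_addr = (__u64)(uintptr_t)cq_ring,	// hypothetical ring buffer
+ *		.buf_size = cq_ring_bytes,
+ *	};
+ *
+ * and would read the device-assigned CQ number back from the cqn field
+ * of the response structure below.
+ */
+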
+struct pvrdma_create_cq_resp { + __u32 cqn; + __u32 reserved; +}; + +struct pvrdma_resize_cq { + __aligned_u64 buf_addr; + __u32 buf_size; + __u32 reserved; +}; + +struct pvrdma_create_srq { + __aligned_u64 buf_addr; + __u32 buf_size; + __u32 reserved; +}; + +struct pvrdma_create_srq_resp { + __u32 srqn; + __u32 reserved; +}; + +struct pvrdma_create_qp { + __aligned_u64 rbuf_addr; + __aligned_u64 sbuf_addr; + __u32 rbuf_size; + __u32 sbuf_size; + __aligned_u64 qp_addr; +}; + +struct pvrdma_create_qp_resp { + __u32 qpn; + __u32 qp_handle; +}; + +/* PVRDMA masked atomic compare and swap */ +struct pvrdma_ex_cmp_swap { + __aligned_u64 swap_val; + __aligned_u64 compare_val; + __aligned_u64 swap_mask; + __aligned_u64 compare_mask; +}; + +/* PVRDMA masked atomic fetch and add */ +struct pvrdma_ex_fetch_add { + __aligned_u64 add_val; + __aligned_u64 field_boundary; +}; + +/* PVRDMA address vector. */ +struct pvrdma_av { + __u32 port_pd; + __u32 sl_tclass_flowlabel; + __u8 dgid[16]; + __u8 src_path_bits; + __u8 gid_index; + __u8 stat_rate; + __u8 hop_limit; + __u8 dmac[6]; + __u8 reserved[6]; +}; + +/* PVRDMA scatter/gather entry */ +struct pvrdma_sge { + __aligned_u64 addr; + __u32 length; + __u32 lkey; +}; + +/* PVRDMA receive queue work request */ +struct pvrdma_rq_wqe_hdr { + __aligned_u64 wr_id; /* wr id */ + __u32 num_sge; /* size of s/g array */ + __u32 total_len; /* reserved */ +}; +/* Use pvrdma_sge (ib_sge) for receive queue s/g array elements. */ + +/* PVRDMA send queue work request */ +struct pvrdma_sq_wqe_hdr { + __aligned_u64 wr_id; /* wr id */ + __u32 num_sge; /* size of s/g array */ + __u32 total_len; /* reserved */ + __u32 opcode; /* operation type */ + __u32 send_flags; /* wr flags */ + union { + __be32 imm_data; + __u32 invalidate_rkey; + } ex; + __u32 reserved; + union { + struct { + __aligned_u64 remote_addr; + __u32 rkey; + __u8 reserved[4]; + } rdma; + struct { + __aligned_u64 remote_addr; + __aligned_u64 compare_add; + __aligned_u64 swap; + __u32 rkey; + __u32 reserved; + } atomic; + struct { + __aligned_u64 remote_addr; + __u32 log_arg_sz; + __u32 rkey; + union { + struct pvrdma_ex_cmp_swap cmp_swap; + struct pvrdma_ex_fetch_add fetch_add; + } wr_data; + } masked_atomics; + struct { + __aligned_u64 iova_start; + __aligned_u64 pl_pdir_dma; + __u32 page_shift; + __u32 page_list_len; + __u32 length; + __u32 access_flags; + __u32 rkey; + __u32 reserved; + } fast_reg; + struct { + __u32 remote_qpn; + __u32 remote_qkey; + struct pvrdma_av av; + } ud; + } wr; +}; +/* Use pvrdma_sge (ib_sge) for send queue s/g array elements. */ + +/* Completion queue element. */ +struct pvrdma_cqe { + __aligned_u64 wr_id; + __aligned_u64 qp; + __u32 opcode; + __u32 status; + __u32 byte_len; + __be32 imm_data; + __u32 src_qp; + __u32 wc_flags; + __u32 vendor_err; + __u16 pkey_index; + __u16 slid; + __u8 sl; + __u8 dlid_path_bits; + __u8 port_num; + __u8 smac[6]; + __u8 network_hdr_type; + __u8 reserved2[6]; /* Pad to next power of 2 (64). */ +}; + +#endif /* __VMW_PVRDMA_ABI_H__ */ diff --git a/kernel-headers/update b/kernel-headers/update new file mode 100755 index 0000000..6bdf73b --- /dev/null +++ b/kernel-headers/update @@ -0,0 +1,197 @@ +#!/usr/bin/env python3 +# Copyright 2018 Mellanox Technologies Inc. +# Licensed under BSD (MIT variant) or GPLv2. See COPYING. +# PYTHON_ARGCOMPLETE_OK +"""This script takes a commitish from a kernel tree and synchronizes the RDMA +headers we use with that tree. 
+ +During development, before commits are accepted to the official kernel git +tree, the --not-final option should be used. Once finalized the commit should +be revised using --amend, eg using the exec feature of 'git rebase'""" +import argparse +import subprocess +import tempfile +import os +import contextlib +import textwrap +import email.utils +import collections + +def git_call(args): + """Run git and display the output to the terminal""" + return subprocess.check_call(['git',] + args); + +def git_output(args,mode=None,input=None): + """Run git and return the output""" + o = subprocess.check_output(['git',] + args,input=input); + if mode == "raw": + return o; + return o.strip(); + +@contextlib.contextmanager +def in_directory(dir): + """Context manager that chdirs into a directory and restores the original + directory when closed.""" + cdir = os.getcwd(); + old_env = {}; + try: + # git rebase invokes its exec with a bunch of git variables set that + # prevent us from invoking git in another tree, blow them away. + for k in list(os.environ.keys()): + if k.startswith("GIT"): + old_env[k] = os.environ[k]; + del os.environ[k]; + + os.chdir(dir); + yield True; + finally: + os.chdir(cdir); + os.environ.update(old_env); + +def copy_objects(args): + """Copy the uapi header objects from the kernel repo at the commit indicated + into our repo. This is done by having git copy the tree object and blobs from the + kernel tree into this tree and then revising our index. This is a simple way + to ensure they match exactly.""" + with in_directory(args.KERNEL_GIT): + if args.not_final: + fmt = "--format=?? (\"%s\")"; + else: + fmt = "--format=%h (\"%s\")"; + + kernel_desc = git_output(["log", + "--abbrev=12","-1", + fmt, + args.COMMIT]); + ntree = git_output(["rev-parse", + "%s:include/uapi/rdma"%(args.COMMIT)]); + pack = git_output(["pack-objects", + "-q","--revs","--stdout"], + mode="raw", + input=ntree); + git_output(["unpack-objects","-q"],input=pack); + return (ntree,kernel_desc); + +def update_cmake(args,ntree): + """Create a new CMakeLists.txt that lists all of the kernel headers + for installation.""" + # We need to expand to a publish_internal_headers for each directory + fns = git_output(["ls-tree","--name-only","--full-tree","-r",ntree]).splitlines(); + groups = collections.defaultdict(list); + for I in fns: + d,p = os.path.split(os.path.join("rdma",I.decode())); + groups[d].append(p); + + data = subprocess.check_output(['git',"cat-file","blob", + ":kernel-headers/CMakeLists.txt"]); + data = data.decode(); + + # Build a new CMakeLists.txt in a temporary file + with tempfile.NamedTemporaryFile("wt") as F: + # Emit the headers lists + for I,vals in sorted(groups.items()): + F.write("publish_internal_headers(%s\n"%(I)); + for J in sorted(vals): + F.write(" %s\n"%(os.path.join(I,J))); + F.write(" )\n"); + F.write("\n"); + + # Throw away the old header lists + cur = iter(data.splitlines()); + for ln in cur: + if not ln: + continue; + if ln.startswith("publish_internal_headers(rdma"): + while not next(cur).startswith(" )"): + pass; + continue; + F.write(ln + '\n'); + break; + + # and copy the remaining lines + for ln in cur: + F.write(ln + '\n'); + + F.flush(); + blob = git_output(["hash-object","-w",F.name]); + + git_call(["update-index","--cacheinfo", + b"0644,%s,kernel-headers/CMakeLists.txt"%(blob)]); + +def make_commit(args,ntree,kernel_desc): + """Make the rdma-core commit that syncs the kernel header directory.""" + head_id = git_output(["rev-parse","HEAD"]); + old_tree_id = 
git_output(["rev-parse",b"%s^{tree}"%(head_id)]); + + if args.amend: + subject = git_output(["log","-1","--format=%s"]).decode(); + if subject != "Update kernel headers": + raise ValueError("In amend mode, but current HEAD does not seem to be a kernel update with subject %r"%( + subject)); + parent = git_output(["rev-parse",head_id + b"^"]); + else: + parent = head_id; + + emaila = email.utils.formataddr((git_output(["config","user.name"]).decode(), + git_output(["config","user.email"]).decode())); + + # Build a new tree object that replaces the kernel headers directory + with tempfile.NamedTemporaryFile() as F: + os.environ["GIT_INDEX_FILE"] = F.name; + git_call(["read-tree",head_id]); + git_call(["rm","-r","--quiet","--cached", + "kernel-headers/rdma"]); + git_call(["read-tree","--prefix=kernel-headers/rdma",ntree]); + update_cmake(args,ntree); + all_tree = git_output(["write-tree"]); + del os.environ["GIT_INDEX_FILE"]; + + if not args.amend and old_tree_id == all_tree: + raise ValueError("Commit is empty, aborting"); + + # And now create the commit + msg="Update kernel headers\n\n"; + p = textwrap.fill("To commit %s"%(kernel_desc.decode()), + width=74) + msg = msg + p; + msg = msg + "\n\nSigned-off-by: %s\n"%(emaila); + + commit = git_output(["commit-tree",all_tree,"-p",parent, + "-F","-"], + input=msg.encode()); + return commit,head_id; + +parser = argparse.ArgumentParser(description='Update kernel headers from the kernel tree') +parser.add_argument("--amend", + action="store_true", + default=False, + help="Replace the top commit with the the kernel header commit"); +parser.add_argument("--not-final", + action="store_true", + default=False, + help="Use if the git commit given is not part of the official kernel git tree. This option should be used during development."); +parser.add_argument("KERNEL_GIT", + action="store", + help="Kernel git directory"); +parser.add_argument("COMMIT", + action="store", + help="Kernel commitish to synchronize headers with"); + +try: + import argcomplete; + argcomplete.autocomplete(parser); +except ImportError: + pass; + +args = parser.parse_args(); + +ntree,kernel_desc = copy_objects(args); +commit,head_id = make_commit(args,ntree,kernel_desc); + +# Finalize +if args.amend: + print("Commit amended"); + git_call(["--no-pager","diff","--stat",head_id,commit]); + git_call(["reset","--merge",commit]); +else: + git_call(["merge","--ff","--ff-only",commit]); diff --git a/libibmad/CMakeLists.txt b/libibmad/CMakeLists.txt new file mode 100644 index 0000000..43d560a --- /dev/null +++ b/libibmad/CMakeLists.txt @@ -0,0 +1,31 @@ +publish_headers(infiniband + mad.h + mad_osd.h + ) + +publish_internal_headers(util + iba_types.h + ) + +rdma_library(ibmad libibmad.map + # See Documentation/versioning.md + 5 5.3.${PACKAGE_VERSION} + bm.c + cc.c + dump.c + fields.c + gs.c + mad.c + portid.c + register.c + resolve.c + rpc.c + sa.c + serv.c + smp.c + vendor.c + ) +target_link_libraries(ibmad LINK_PRIVATE + ibumad + ) +rdma_pkg_config("ibmad" "libibumad" "") diff --git a/libibmad/bm.c b/libibmad/bm.c new file mode 100644 index 0000000..921dbb8 --- /dev/null +++ b/libibmad/bm.c @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <string.h> + +#include <infiniband/mad.h> + +#undef DEBUG +#define DEBUG if (ibdebug) IBWARN + +static inline int response_expected(int method) +{ + return method == IB_MAD_METHOD_GET || + method == IB_MAD_METHOD_SET || method == IB_MAD_METHOD_TRAP; +} + +uint8_t *bm_call_via(void *data, ib_portid_t * portid, ib_bm_call_t * call, + struct ibmad_port * srcport) +{ + ib_rpc_t rpc = { 0 }; + int resp_expected; + struct { + uint64_t bkey; + uint8_t reserved[32]; + uint8_t data[IB_BM_DATA_SZ]; + } bm_data; + + DEBUG("route %s data %p", portid2str(portid), data); + if (portid->lid <= 0) { + IBWARN("only lid routes are supported"); + return NULL; + } + + resp_expected = response_expected(call->method); + + rpc.mgtclass = IB_BOARD_MGMT_CLASS; + + rpc.method = call->method; + rpc.attr.id = call->attrid; + rpc.attr.mod = call->mod; + rpc.timeout = resp_expected ? call->timeout : 0; + // send data and bkey + rpc.datasz = IB_BM_BKEY_AND_DATA_SZ; + rpc.dataoffs = IB_BM_BKEY_OFFS; + + // copy data to a buffer which also includes the bkey + bm_data.bkey = htonll(call->bkey); + memset(bm_data.reserved, 0, sizeof(bm_data.reserved)); + memcpy(bm_data.data, data, IB_BM_DATA_SZ); + + DEBUG + ("method 0x%x attr 0x%x mod 0x%x datasz %d off %d res_ex %d bkey 0x%08x%08x", + rpc.method, rpc.attr.id, rpc.attr.mod, rpc.datasz, rpc.dataoffs, + resp_expected, (int)(call->bkey >> 32), (int)call->bkey); + + portid->qp = 1; + if (!portid->qkey) + portid->qkey = IB_DEFAULT_QP1_QKEY; + + if (resp_expected) { + /* FIXME: no RMPP for now */ + if (mad_rpc(srcport, &rpc, portid, &bm_data, &bm_data)) + goto return_ok; + return NULL; + } + + if (mad_send_via(&rpc, portid, NULL, &bm_data, srcport) < 0) + return NULL; + +return_ok: + memcpy(data, bm_data.data, IB_BM_DATA_SZ); + return data; +} diff --git a/libibmad/cc.c b/libibmad/cc.c new file mode 100644 index 0000000..5cc2ebb --- /dev/null +++ b/libibmad/cc.c @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2011 Lawrence Livermore National Lab. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <infiniband/mad.h> +#include "mad_internal.h" + +#undef DEBUG +#define DEBUG if (ibdebug) IBWARN + +void *cc_query_status_via(void *rcvbuf, ib_portid_t * portid, + unsigned attrid, unsigned mod, unsigned timeout, + int *rstatus, const struct ibmad_port * srcport, + uint64_t cckey) +{ + ib_rpc_cc_t rpc = { 0 }; + void *res; + + DEBUG("attr 0x%x mod 0x%x route %s", attrid, mod, portid2str(portid)); + rpc.method = IB_MAD_METHOD_GET; + rpc.attr.id = attrid; + rpc.attr.mod = mod; + rpc.timeout = timeout; + if (attrid == IB_CC_ATTR_CONGESTION_LOG) { + rpc.datasz = IB_CC_LOG_DATA_SZ; + rpc.dataoffs = IB_CC_LOG_DATA_OFFS; + } + else { + rpc.datasz = IB_CC_DATA_SZ; + rpc.dataoffs = IB_CC_DATA_OFFS; + } + rpc.mgtclass = IB_CC_CLASS; + rpc.cckey = cckey; + + portid->qp = 1; + if (!portid->qkey) + portid->qkey = IB_DEFAULT_QP1_QKEY; + + res = mad_rpc(srcport, (ib_rpc_t *)&rpc, portid, rcvbuf, rcvbuf); + if (rstatus) + *rstatus = rpc.rstatus; + + return res; +} + +void *cc_config_status_via(void *payload, void *rcvbuf, ib_portid_t * portid, + unsigned attrid, unsigned mod, unsigned timeout, + int *rstatus, const struct ibmad_port * srcport, + uint64_t cckey) +{ + ib_rpc_cc_t rpc = { 0 }; + void *res; + + DEBUG("attr 0x%x mod 0x%x route %s", attrid, mod, portid2str(portid)); + rpc.method = IB_MAD_METHOD_SET; + rpc.attr.id = attrid; + rpc.attr.mod = mod; + rpc.timeout = timeout; + if (attrid == IB_CC_ATTR_CONGESTION_LOG) { + rpc.datasz = IB_CC_LOG_DATA_SZ; + rpc.dataoffs = IB_CC_LOG_DATA_OFFS; + } + else { + rpc.datasz = IB_CC_DATA_SZ; + rpc.dataoffs = IB_CC_DATA_OFFS; + } + rpc.mgtclass = IB_CC_CLASS; + rpc.cckey = cckey; + + portid->qp = 1; + if (!portid->qkey) + portid->qkey = IB_DEFAULT_QP1_QKEY; + + res = mad_rpc(srcport, (ib_rpc_t *)&rpc, portid, payload, rcvbuf); + if (rstatus) + *rstatus = rpc.rstatus; + + return res; +} + + diff --git a/libibmad/dump.c b/libibmad/dump.c new file mode 100644 index 0000000..20d8e4e --- /dev/null +++ b/libibmad/dump.c @@ -0,0 +1,1218 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * Copyright (c) 2007 Xsigo Systems Inc. All rights reserved. + * Copyright (c) 2009-2011 Mellanox Technologies LTD. All rights reserved. 
+ * Copyright (c) 2009 HNR Consulting. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <infiniband/mad.h> + +void mad_dump_int(char *buf, int bufsz, void *val, int valsz) +{ + switch (valsz) { + case 1: + snprintf(buf, bufsz, "%d", *(uint32_t *) val & 0xff); + break; + case 2: + snprintf(buf, bufsz, "%d", *(uint32_t *) val & 0xffff); + break; + case 3: + case 4: + snprintf(buf, bufsz, "%d", *(uint32_t *) val); + break; + case 5: + case 6: + case 7: + case 8: + snprintf(buf, bufsz, "%" PRIu64, *(uint64_t *) val); + break; + default: + IBWARN("bad int sz %d", valsz); + buf[0] = 0; + } +} + +void mad_dump_uint(char *buf, int bufsz, void *val, int valsz) +{ + switch (valsz) { + case 1: + snprintf(buf, bufsz, "%u", *(uint32_t *) val & 0xff); + break; + case 2: + snprintf(buf, bufsz, "%u", *(uint32_t *) val & 0xffff); + break; + case 3: + case 4: + snprintf(buf, bufsz, "%u", *(uint32_t *) val); + break; + case 5: + case 6: + case 7: + case 8: + snprintf(buf, bufsz, "%" PRIu64, *(uint64_t *) val); + break; + default: + IBWARN("bad int sz %u", valsz); + buf[0] = 0; + } +} + +void mad_dump_hex(char *buf, int bufsz, void *val, int valsz) +{ + switch (valsz) { + case 1: + snprintf(buf, bufsz, "0x%02x", *(uint32_t *) val & 0xff); + break; + case 2: + snprintf(buf, bufsz, "0x%04x", *(uint32_t *) val & 0xffff); + break; + case 3: + snprintf(buf, bufsz, "0x%06x", *(uint32_t *) val & 0xffffff); + break; + case 4: + snprintf(buf, bufsz, "0x%08x", *(uint32_t *) val); + break; + case 5: + snprintf(buf, bufsz, "0x%010" PRIx64, + *(uint64_t *) val & (uint64_t) 0xffffffffffULL); + break; + case 6: + snprintf(buf, bufsz, "0x%012" PRIx64, + *(uint64_t *) val & (uint64_t) 0xffffffffffffULL); + break; + case 7: + snprintf(buf, bufsz, "0x%014" PRIx64, + *(uint64_t *) val & (uint64_t) 0xffffffffffffffULL); + break; + case 8: + snprintf(buf, bufsz, "0x%016" PRIx64, *(uint64_t *) val); + break; + default: + IBWARN("bad int sz %d", valsz); + buf[0] = 0; + } +} + +void mad_dump_rhex(char *buf, int bufsz, void *val, int valsz) +{ + switch (valsz) { + case 1: + snprintf(buf, bufsz, "%02x", *(uint32_t *) val & 0xff); + break; + case 2: + snprintf(buf, bufsz, 
"%04x", *(uint32_t *) val & 0xffff); + break; + case 3: + snprintf(buf, bufsz, "%06x", *(uint32_t *) val & 0xffffff); + break; + case 4: + snprintf(buf, bufsz, "%08x", *(uint32_t *) val); + break; + case 5: + snprintf(buf, bufsz, "%010" PRIx64, + *(uint64_t *) val & (uint64_t) 0xffffffffffULL); + break; + case 6: + snprintf(buf, bufsz, "%012" PRIx64, + *(uint64_t *) val & (uint64_t) 0xffffffffffffULL); + break; + case 7: + snprintf(buf, bufsz, "%014" PRIx64, + *(uint64_t *) val & (uint64_t) 0xffffffffffffffULL); + break; + case 8: + snprintf(buf, bufsz, "%016" PRIx64, *(uint64_t *) val); + break; + default: + IBWARN("bad int sz %d", valsz); + buf[0] = 0; + } +} + +void mad_dump_linkwidth(char *buf, int bufsz, void *val, int valsz) +{ + int width = *(int *)val; + + switch (width) { + case 1: + snprintf(buf, bufsz, "1X"); + break; + case 2: + snprintf(buf, bufsz, "4X"); + break; + case 4: + snprintf(buf, bufsz, "8X"); + break; + case 8: + snprintf(buf, bufsz, "12X"); + break; + case 16: + snprintf(buf, bufsz, "2X"); + break; + default: + IBWARN("bad width %d", width); + snprintf(buf, bufsz, "undefined (%d)", width); + break; + } +} + +static void dump_linkwidth(char *buf, int bufsz, int width) +{ + int n = 0; + + if (width & 0x1) + n += snprintf(buf + n, bufsz - n, "1X or "); + if (n < bufsz && (width & 0x2)) + n += snprintf(buf + n, bufsz - n, "4X or "); + if (n < bufsz && (width & 0x4)) + n += snprintf(buf + n, bufsz - n, "8X or "); + if (n < bufsz && (width & 0x8)) + n += snprintf(buf + n, bufsz - n, "12X or "); + if (n < bufsz && (width & 0x10)) + n += snprintf(buf + n, bufsz - n, "2X or "); + + if (n >= bufsz) + return; + else if (width == 0 || (width >> 5)) + snprintf(buf + n, bufsz - n, "undefined (%d)", width); + else if (bufsz > 3) + buf[n - 4] = '\0'; +} + +void mad_dump_linkwidthsup(char *buf, int bufsz, void *val, int valsz) +{ + int width = *(int *)val; + + dump_linkwidth(buf, bufsz, width); + + switch (width) { + case 1: + case 3: + case 7: + case 11: + case 15: + case 17: + case 19: + case 23: + case 27: + case 31: + break; + + default: + if (!(width >> 5)) + snprintf(buf + strlen(buf), bufsz - strlen(buf), + " (IBA extension)"); + break; + } +} + +void mad_dump_linkwidthen(char *buf, int bufsz, void *val, int valsz) +{ + int width = *(int *)val; + + dump_linkwidth(buf, bufsz, width); +} + +void mad_dump_linkspeed(char *buf, int bufsz, void *val, int valsz) +{ + int speed = *(int *)val; + + switch (speed) { + case 0: + snprintf(buf, bufsz, "Extended speed"); + break; + case 1: + snprintf(buf, bufsz, "2.5 Gbps"); + break; + case 2: + snprintf(buf, bufsz, "5.0 Gbps"); + break; + case 4: + snprintf(buf, bufsz, "10.0 Gbps"); + break; + default: + snprintf(buf, bufsz, "undefined (%d)", speed); + break; + } +} + +static void dump_linkspeed(char *buf, int bufsz, int speed) +{ + int n = 0; + + if (speed & 0x1) + n += snprintf(buf + n, bufsz - n, "2.5 Gbps or "); + if (n < bufsz && (speed & 0x2)) + n += snprintf(buf + n, bufsz - n, "5.0 Gbps or "); + if (n < bufsz && (speed & 0x4)) + n += snprintf(buf + n, bufsz - n, "10.0 Gbps or "); + + if (n >= bufsz) + return; + else if (speed == 0 || (speed >> 3)) { + n += snprintf(buf + n, bufsz - n, "undefined (%d)", speed); + if (n >= bufsz) + return; + } else if (bufsz > 3) { + buf[n - 4] = '\0'; + n -= 4; + } + + switch (speed) { + case 1: + case 3: + case 5: + case 7: + break; + default: + if (!(speed >> 3)) + snprintf(buf + n, bufsz - n, " (IBA extension)"); + break; + } +} + +void mad_dump_linkspeedsup(char *buf, int bufsz, void *val, int 
valsz) +{ + int speed = *(int *)val; + + dump_linkspeed(buf, bufsz, speed); +} + +void mad_dump_linkspeeden(char *buf, int bufsz, void *val, int valsz) +{ + int speed = *(int *)val; + + dump_linkspeed(buf, bufsz, speed); +} + +void mad_dump_linkspeedext(char *buf, int bufsz, void *val, int valsz) +{ + int speed = *(int *)val; + + switch (speed) { + case 0: + snprintf(buf, bufsz, "No Extended Speed"); + break; + case 1: + snprintf(buf, bufsz, "14.0625 Gbps"); + break; + case 2: + snprintf(buf, bufsz, "25.78125 Gbps"); + break; + case 4: + snprintf(buf, bufsz, "53.125 Gbps"); + break; + default: + snprintf(buf, bufsz, "undefined (%d)", speed); + break; + } +} + +static void dump_linkspeedext(char *buf, int bufsz, int speed) +{ + int n = 0; + + if (speed == 0) { + sprintf(buf, "%d", speed); + return; + } + + if (speed & 0x1) + n += snprintf(buf + n, bufsz - n, "14.0625 Gbps or "); + if (n < bufsz && speed & 0x2) + n += snprintf(buf + n, bufsz - n, "25.78125 Gbps or "); + if (n < bufsz && speed & 0x4) + n += snprintf(buf + n, bufsz - n, "53.125 Gbps or "); + + if (n >= bufsz) { + if (bufsz > 3) + buf[n - 4] = '\0'; + return; + } + + if (speed >> 3) { + n += snprintf(buf + n, bufsz - n, "undefined (%d)", speed); + return; + } else if (bufsz > 3) + buf[n - 4] = '\0'; +} + +void mad_dump_linkspeedextsup(char *buf, int bufsz, void *val, int valsz) +{ + int speed = *(int *)val; + + dump_linkspeedext(buf, bufsz, speed); +} + +void mad_dump_linkspeedexten(char *buf, int bufsz, void *val, int valsz) +{ + int speed = *(int *)val; + + if (speed == 30) { + sprintf(buf, "%s", "Extended link speeds disabled"); + return; + } + dump_linkspeedext(buf, bufsz, speed); +} + +void mad_dump_portstate(char *buf, int bufsz, void *val, int valsz) +{ + int state = *(int *)val; + + switch (state) { + case 0: + snprintf(buf, bufsz, "NoChange"); + break; + case 1: + snprintf(buf, bufsz, "Down"); + break; + case 2: + snprintf(buf, bufsz, "Initialize"); + break; + case 3: + snprintf(buf, bufsz, "Armed"); + break; + case 4: + snprintf(buf, bufsz, "Active"); + break; + default: + snprintf(buf, bufsz, "?(%d)", state); + } +} + +void mad_dump_linkdowndefstate(char *buf, int bufsz, void *val, int valsz) +{ + int state = *(int *)val; + + switch (state) { + case 0: + snprintf(buf, bufsz, "NoChange"); + break; + case 1: + snprintf(buf, bufsz, "Sleep"); + break; + case 2: + snprintf(buf, bufsz, "Polling"); + break; + default: + snprintf(buf, bufsz, "?(%d)", state); + break; + } +} + +void mad_dump_physportstate(char *buf, int bufsz, void *val, int valsz) +{ + int state = *(int *)val; + + switch (state) { + case 0: + snprintf(buf, bufsz, "NoChange"); + break; + case 1: + snprintf(buf, bufsz, "Sleep"); + break; + case 2: + snprintf(buf, bufsz, "Polling"); + break; + case 3: + snprintf(buf, bufsz, "Disabled"); + break; + case 4: + snprintf(buf, bufsz, "PortConfigurationTraining"); + break; + case 5: + snprintf(buf, bufsz, "LinkUp"); + break; + case 6: + snprintf(buf, bufsz, "LinkErrorRecovery"); + break; + case 7: + snprintf(buf, bufsz, "PhyTest"); + break; + default: + snprintf(buf, bufsz, "?(%d)", state); + } +} + +void mad_dump_mtu(char *buf, int bufsz, void *val, int valsz) +{ + int mtu = *(int *)val; + + switch (mtu) { + case 1: + snprintf(buf, bufsz, "256"); + break; + case 2: + snprintf(buf, bufsz, "512"); + break; + case 3: + snprintf(buf, bufsz, "1024"); + break; + case 4: + snprintf(buf, bufsz, "2048"); + break; + case 5: + snprintf(buf, bufsz, "4096"); + break; + default: + snprintf(buf, bufsz, "?(%d)", mtu); + } +} + +void 
mad_dump_vlcap(char *buf, int bufsz, void *val, int valsz)
+{
+	int vlcap = *(int *)val;
+
+	switch (vlcap) {
+	case 1:
+		snprintf(buf, bufsz, "VL0");
+		break;
+	case 2:
+		snprintf(buf, bufsz, "VL0-1");
+		break;
+	case 3:
+		snprintf(buf, bufsz, "VL0-3");
+		break;
+	case 4:
+		snprintf(buf, bufsz, "VL0-7");
+		break;
+	case 5:
+		snprintf(buf, bufsz, "VL0-14");
+		break;
+	default:
+		snprintf(buf, bufsz, "?(%d)", vlcap);
+	}
+}
+
+void mad_dump_opervls(char *buf, int bufsz, void *val, int valsz)
+{
+	int opervls = *(int *)val;
+
+	switch (opervls) {
+	case 0:
+		snprintf(buf, bufsz, "No change");
+		break;
+	case 1:
+		snprintf(buf, bufsz, "VL0");
+		break;
+	case 2:
+		snprintf(buf, bufsz, "VL0-1");
+		break;
+	case 3:
+		snprintf(buf, bufsz, "VL0-3");
+		break;
+	case 4:
+		snprintf(buf, bufsz, "VL0-7");
+		break;
+	case 5:
+		snprintf(buf, bufsz, "VL0-14");
+		break;
+	default:
+		snprintf(buf, bufsz, "?(%d)", opervls);
+	}
+}
+
+void mad_dump_portcapmask(char *buf, int bufsz, void *val, int valsz)
+{
+	unsigned mask = *(unsigned *)val;
+	char *s = buf;
+
+	s += sprintf(s, "0x%x\n", mask);
+	if (mask & (1 << 1))
+		s += sprintf(s, "\t\t\t\tIsSM\n");
+	if (mask & (1 << 2))
+		s += sprintf(s, "\t\t\t\tIsNoticeSupported\n");
+	if (mask & (1 << 3))
+		s += sprintf(s, "\t\t\t\tIsTrapSupported\n");
+	if (mask & (1 << 4))
+		s += sprintf(s, "\t\t\t\tIsOptionalIPDSupported\n");
+	if (mask & (1 << 5))
+		s += sprintf(s, "\t\t\t\tIsAutomaticMigrationSupported\n");
+	if (mask & (1 << 6))
+		s += sprintf(s, "\t\t\t\tIsSLMappingSupported\n");
+	if (mask & (1 << 7))
+		s += sprintf(s, "\t\t\t\tIsMKeyNVRAM\n");
+	if (mask & (1 << 8))
+		s += sprintf(s, "\t\t\t\tIsPKeyNVRAM\n");
+	if (mask & (1 << 9))
+		s += sprintf(s, "\t\t\t\tIsLedInfoSupported\n");
+	if (mask & (1 << 10))
+		s += sprintf(s, "\t\t\t\tIsSMdisabled\n");
+	if (mask & (1 << 11))
+		s += sprintf(s, "\t\t\t\tIsSystemImageGUIDsupported\n");
+	if (mask & (1 << 12))
+		s += sprintf(s,
+			     "\t\t\t\tIsPkeySwitchExternalPortTrapSupported\n");
+	if (mask & (1 << 14))
+		s += sprintf(s, "\t\t\t\tIsExtendedSpeedsSupported\n");
+	if (mask & (1 << 15))
+		s += sprintf(s, "\t\t\t\tIsCapabilityMask2Supported\n");
+	if (mask & (1 << 16))
+		s += sprintf(s, "\t\t\t\tIsCommunicationManagementSupported\n");
+	if (mask & (1 << 17))
+		s += sprintf(s, "\t\t\t\tIsSNMPTunnelingSupported\n");
+	if (mask & (1 << 18))
+		s += sprintf(s, "\t\t\t\tIsReinitSupported\n");
+	if (mask & (1 << 19))
+		s += sprintf(s, "\t\t\t\tIsDeviceManagementSupported\n");
+	if (mask & (1 << 20))
+		s += sprintf(s, "\t\t\t\tIsVendorClassSupported\n");
+	if (mask & (1 << 21))
+		s += sprintf(s, "\t\t\t\tIsDRNoticeSupported\n");
+	if (mask & (1 << 22))
+		s += sprintf(s, "\t\t\t\tIsCapabilityMaskNoticeSupported\n");
+	if (mask & (1 << 23))
+		s += sprintf(s, "\t\t\t\tIsBootManagementSupported\n");
+	if (mask & (1 << 24))
+		s += sprintf(s, "\t\t\t\tIsLinkRoundTripLatencySupported\n");
+	if (mask & (1 << 25))
+		s += sprintf(s, "\t\t\t\tIsClientRegistrationSupported\n");
+	if (mask & (1 << 26))
+		s += sprintf(s, "\t\t\t\tIsOtherLocalChangesNoticeSupported\n");
+	if (mask & (1 << 27))
+		s += sprintf(s,
+			     "\t\t\t\tIsLinkSpeedWidthPairsTableSupported\n");
+	if (mask & (1 << 28))
+		s += sprintf(s, "\t\t\t\tIsVendorSpecificMadsTableSupported\n");
+	if (mask & (1 << 29))
+		s += sprintf(s, "\t\t\t\tIsMcastPkeyTrapSuppressionSupported\n");
+	if (mask & (1 << 30))
+		s += sprintf(s, "\t\t\t\tIsMulticastFDBTopSupported\n");
+	if (mask & ((1U) << 31))
+		s += sprintf(s, "\t\t\t\tIsHierarchyInfoSupported\n");
+
+	if (s != buf)
+		*(--s) = 0;
+}
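Every dumper above follows the same calling convention: a destination text buffer and its size, then a pointer to the raw (host-order) value and that value's size. A minimal caller sketch, assuming nothing beyond the signatures shown in this file (the mask value is illustrative):

	char buf[2048];					/* sized for the worst case: every flag set */
	uint32_t capmask = (1 << 1) | (1 << 22);	/* IsSM | IsCapabilityMaskNoticeSupported */

	mad_dump_portcapmask(buf, sizeof(buf), &capmask, sizeof(capmask));
	puts(buf);					/* "0x400002", then one flag name per line */

The helpers never allocate, so the caller must size the buffer for the largest possible decode.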
+ +void mad_dump_portcapmask2(char *buf, int bufsz, void *val, int valsz) +{ + int mask = *(int *)val; + char *s = buf; + + s += sprintf(s, "0x%x\n", mask); + if (mask & (1 << 0)) + s += sprintf(s, "\t\t\t\tIsSetNodeDescriptionSupported\n"); + if (mask & (1 << 1)) + s += sprintf(s, "\t\t\t\tIsPortInfoExtendedSupported\n"); + if (mask & (1 << 2)) + s += sprintf(s, "\t\t\t\tIsVirtualizationSupported\n"); + if (mask & (1 << 3)) + s += sprintf(s, "\t\t\t\tIsSwitchPortStateTableSupported\n"); + if (mask & (1 << 4)) + s += sprintf(s, "\t\t\t\tIsLinkWidth2xSupported\n"); + if (mask & (1 << 5)) + s += sprintf(s, "\t\t\t\tIsLinkSpeedHDRSupported\n"); + + if (s != buf) + *(--s) = 0; +} + +void mad_dump_bitfield(char *buf, int bufsz, void *val, int valsz) +{ + snprintf(buf, bufsz, "0x%x", *(uint32_t *) val); +} + +void mad_dump_array(char *buf, int bufsz, void *val, int valsz) +{ + uint8_t *p = val, *e; + char *s = buf; + + if (bufsz < valsz * 2) + valsz = bufsz / 2; + + for (p = val, e = p + valsz; p < e; p++, s += 2) + sprintf(s, "%02x", *p); +} + +void mad_dump_string(char *buf, int bufsz, void *val, int valsz) +{ + if (bufsz < valsz) + valsz = bufsz; + + snprintf(buf, valsz, "'%s'", (char *)val); +} + +void mad_dump_node_type(char *buf, int bufsz, void *val, int valsz) +{ + int nodetype = *(int *)val; + + switch (nodetype) { + case 1: + snprintf(buf, bufsz, "Channel Adapter"); + break; + case 2: + snprintf(buf, bufsz, "Switch"); + break; + case 3: + snprintf(buf, bufsz, "Router"); + break; + default: + snprintf(buf, bufsz, "?(%d)?", nodetype); + break; + } +} + +#define IB_MAX_NUM_VLS 16 +#define IB_MAX_NUM_VLS_TO_U8 ((IB_MAX_NUM_VLS)/2) + +typedef struct _ib_slvl_table { + uint8_t vl_by_sl_num[IB_MAX_NUM_VLS_TO_U8]; +} ib_slvl_table_t; + +static inline void ib_slvl_get_i(ib_slvl_table_t * tbl, int i, uint8_t * vl) +{ + *vl = (tbl->vl_by_sl_num[i >> 1] >> ((!(i & 1)) << 2)) & 0xf; +} + +#define IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK 32 + +typedef struct _ib_vl_arb_table { + struct { + uint8_t res_vl; + uint8_t weight; + } vl_entry[IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK]; +} ib_vl_arb_table_t; + +static inline void ib_vl_arb_get_vl(uint8_t res_vl, uint8_t * const vl) +{ + *vl = res_vl & 0x0F; +} + +void mad_dump_sltovl(char *buf, int bufsz, void *val, int valsz) +{ + ib_slvl_table_t *p_slvl_tbl = val; + uint8_t vl; + int i, n = 0; + n = snprintf(buf, bufsz, "|"); + for (i = 0; i < 16; i++) { + ib_slvl_get_i(p_slvl_tbl, i, &vl); + n += snprintf(buf + n, bufsz - n, "%2u|", vl); + if (n >= bufsz) + break; + } + snprintf(buf + n, bufsz - n, "\n"); +} + +void mad_dump_vlarbitration(char *buf, int bufsz, void *val, int num) +{ + ib_vl_arb_table_t *p_vla_tbl = val; + int i, n; + uint8_t vl; + + num /= sizeof(p_vla_tbl->vl_entry[0]); + + n = snprintf(buf, bufsz, "\nVL : |"); + if (n >= bufsz) + return; + for (i = 0; i < num; i++) { + ib_vl_arb_get_vl(p_vla_tbl->vl_entry[i].res_vl, &vl); + n += snprintf(buf + n, bufsz - n, "0x%-2X|", vl); + if (n >= bufsz) + return; + } + + n += snprintf(buf + n, bufsz - n, "\nWEIGHT: |"); + if (n >= bufsz) + return; + for (i = 0; i < num; i++) { + n += snprintf(buf + n, bufsz - n, "0x%-2X|", + p_vla_tbl->vl_entry[i].weight); + if (n >= bufsz) + return; + } + + snprintf(buf + n, bufsz - n, "\n"); +} + +static int _dump_fields(char *buf, int bufsz, void *data, int start, int end) +{ + char val[64]; + char *s = buf; + int n, field; + + for (field = start; field < end && bufsz > 0; field++) { + mad_decode_field(data, field, val); + if (!mad_dump_field(field, s, bufsz-1, val)) + return -1; 
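+		/* step past the text just decoded and separate fields with a newline */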
+ n = strlen(s); + s += n; + *s++ = '\n'; + *s = 0; + n++; + bufsz -= n; + } + + return (int)(s - buf); +} + +void mad_dump_fields(char *buf, int bufsz, void *val, int valsz, int start, + int end) +{ + _dump_fields(buf, bufsz, val, start, end); +} + +void mad_dump_nodedesc(char *buf, int bufsz, void *val, int valsz) +{ + strncpy(buf, val, bufsz); + + if (valsz < bufsz) + buf[valsz] = 0; +} + +void mad_dump_nodeinfo(char *buf, int bufsz, void *val, int valsz) +{ + _dump_fields(buf, bufsz, val, IB_NODE_FIRST_F, IB_NODE_LAST_F); +} + +void mad_dump_portinfo(char *buf, int bufsz, void *val, int valsz) +{ + int cnt; + + cnt = _dump_fields(buf, bufsz, val, IB_PORT_FIRST_F, IB_PORT_LAST_F); + if (cnt < 0) + return; + + _dump_fields(buf + cnt, bufsz - cnt, val, + IB_PORT_CAPMASK2_F, IB_PORT_LINK_SPEED_EXT_LAST_F); +} + +void mad_dump_portstates(char *buf, int bufsz, void *val, int valsz) +{ + _dump_fields(buf, bufsz, val, IB_PORT_STATE_F, IB_PORT_LINK_DOWN_DEF_F); +} + +void mad_dump_switchinfo(char *buf, int bufsz, void *val, int valsz) +{ + _dump_fields(buf, bufsz, val, IB_SW_FIRST_F, IB_SW_LAST_F); +} + +void mad_dump_perfcounters(char *buf, int bufsz, void *val, int valsz) +{ + int cnt, cnt2; + + cnt = _dump_fields(buf, bufsz, val, + IB_PC_FIRST_F, IB_PC_VL15_DROPPED_F); + if (cnt < 0) + return; + + cnt2 = _dump_fields(buf + cnt, bufsz - cnt, val, + IB_PC_QP1_DROP_F, IB_PC_QP1_DROP_F + 1); + if (cnt2 < 0) + return; + + _dump_fields(buf + cnt + cnt2, bufsz - cnt - cnt2, val, + IB_PC_VL15_DROPPED_F, IB_PC_LAST_F); +} + +void mad_dump_perfcounters_ext(char *buf, int bufsz, void *val, int valsz) +{ + int cnt; + + cnt = _dump_fields(buf, bufsz, val, IB_PC_EXT_FIRST_F, IB_PC_EXT_LAST_F); + if (cnt < 0) + return; + + _dump_fields(buf + cnt, bufsz - cnt, val, + IB_PC_EXT_COUNTER_SELECT2_F, IB_PC_EXT_ERR_LAST_F); +} + +void mad_dump_perfcounters_xmt_sl(char *buf, int bufsz, void *val, int valsz) +{ + int cnt; + + cnt = _dump_fields(buf, bufsz, val, IB_PC_EXT_PORT_SELECT_F, + IB_PC_EXT_XMT_BYTES_F); + if (cnt < 0) + return; + + _dump_fields(buf + cnt, bufsz - cnt, val, IB_PC_XMT_DATA_SL_FIRST_F, + IB_PC_XMT_DATA_SL_LAST_F); +} + +void mad_dump_perfcounters_rcv_sl(char *buf, int bufsz, void *val, int valsz) +{ + int cnt; + + cnt = _dump_fields(buf, bufsz, val, IB_PC_EXT_PORT_SELECT_F, + IB_PC_EXT_XMT_BYTES_F); + if (cnt < 0) + return; + + _dump_fields(buf + cnt, bufsz - cnt, val, IB_PC_RCV_DATA_SL_FIRST_F, + IB_PC_RCV_DATA_SL_LAST_F); +} + +void mad_dump_perfcounters_xmt_disc(char *buf, int bufsz, void *val, int valsz) +{ + int cnt; + + cnt = _dump_fields(buf, bufsz, val, IB_PC_EXT_PORT_SELECT_F, + IB_PC_EXT_XMT_BYTES_F); + if (cnt < 0) + return; + + _dump_fields(buf + cnt, bufsz - cnt, val, IB_PC_XMT_INACT_DISC_F, + IB_PC_XMT_DISC_LAST_F); +} + +void mad_dump_perfcounters_rcv_err(char *buf, int bufsz, void *val, int valsz) +{ + int cnt; + + cnt = _dump_fields(buf, bufsz, val, IB_PC_EXT_PORT_SELECT_F, + IB_PC_EXT_XMT_BYTES_F); + if (cnt < 0) + return; + + _dump_fields(buf + cnt, bufsz - cnt, val, IB_PC_RCV_LOCAL_PHY_ERR_F, + IB_PC_RCV_ERR_LAST_F); +} + +void mad_dump_portsamples_control(char *buf, int bufsz, void *val, int valsz) +{ + _dump_fields(buf, bufsz, val, IB_PSC_OPCODE_F, IB_PSC_LAST_F); +} + +void mad_dump_portsamples_result(char *buf, int bufsz, void *val, int valsz) +{ + _dump_fields(buf, bufsz, val, IB_PSR_TAG_F, IB_PSR_LAST_F); +} + +void mad_dump_port_ext_speeds_counters_rsfec_active(char *buf, int bufsz, + void *val, int valsz) +{ + _dump_fields(buf, bufsz, val, IB_PESC_RSFEC_FIRST_F, + 
IB_PESC_RSFEC_LAST_F); +} + +void mad_dump_port_ext_speeds_counters(char *buf, int bufsz, void *val, int valsz) +{ + _dump_fields(buf, bufsz, val, IB_PESC_PORT_SELECT_F, IB_PESC_LAST_F); +} + +void mad_dump_perfcounters_port_op_rcv_counters(char *buf, int bufsz, void *val, int valsz) +{ + int cnt; + + cnt = _dump_fields(buf, bufsz, val, IB_PC_EXT_PORT_SELECT_F, + IB_PC_EXT_XMT_BYTES_F); + if (cnt < 0) + return; + + _dump_fields(buf + cnt, bufsz - cnt, val, IB_PC_PORT_OP_RCV_COUNTERS_FIRST_F, + IB_PC_PORT_OP_RCV_COUNTERS_LAST_F); +} + +void mad_dump_perfcounters_port_flow_ctl_counters(char *buf, int bufsz, void *val, int valsz) +{ + int cnt; + + cnt = _dump_fields(buf, bufsz, val, IB_PC_EXT_PORT_SELECT_F, + IB_PC_EXT_XMT_BYTES_F); + if (cnt < 0) + return; + + _dump_fields(buf + cnt, bufsz - cnt, val, IB_PC_PORT_FLOW_CTL_COUNTERS_FIRST_F, + IB_PC_PORT_FLOW_CTL_COUNTERS_LAST_F); +} + +void mad_dump_perfcounters_port_vl_op_packet(char *buf, int bufsz, void *val, int valsz) +{ + int cnt; + + cnt = _dump_fields(buf, bufsz, val, IB_PC_EXT_PORT_SELECT_F, + IB_PC_EXT_XMT_BYTES_F); + if (cnt < 0) + return; + + _dump_fields(buf + cnt, bufsz - cnt, val, IB_PC_PORT_VL_OP_PACKETS_FIRST_F, + IB_PC_PORT_VL_OP_PACKETS_LAST_F); +} + +void mad_dump_perfcounters_port_vl_op_data(char *buf, int bufsz, void *val, int valsz) +{ + int cnt; + + cnt = _dump_fields(buf, bufsz, val, IB_PC_EXT_PORT_SELECT_F, + IB_PC_EXT_XMT_BYTES_F); + if (cnt < 0) + return; + + _dump_fields(buf + cnt, bufsz - cnt, val, IB_PC_PORT_VL_OP_DATA_FIRST_F, + IB_PC_PORT_VL_OP_DATA_LAST_F); +} + +void mad_dump_perfcounters_port_vl_xmit_flow_ctl_update_errors(char *buf, int bufsz, void *val, int valsz) +{ + int cnt; + + cnt = _dump_fields(buf, bufsz, val, IB_PC_EXT_PORT_SELECT_F, + IB_PC_EXT_XMT_BYTES_F); + if (cnt < 0) + return; + + _dump_fields(buf + cnt, bufsz - cnt, val, IB_PC_PORT_VL_XMIT_FLOW_CTL_UPDATE_ERRORS_FIRST_F, + IB_PC_PORT_VL_XMIT_FLOW_CTL_UPDATE_ERRORS_LAST_F); +} + +void mad_dump_perfcounters_port_vl_xmit_wait_counters(char *buf, int bufsz, void *val, int valsz) +{ + int cnt; + + cnt = _dump_fields(buf, bufsz, val, IB_PC_EXT_PORT_SELECT_F, + IB_PC_EXT_XMT_BYTES_F); + if (cnt < 0) + return; + + _dump_fields(buf + cnt, bufsz - cnt, val, IB_PC_PORT_VL_XMIT_WAIT_COUNTERS_FIRST_F, + IB_PC_PORT_VL_XMIT_WAIT_COUNTERS_LAST_F); +} + +void mad_dump_perfcounters_sw_port_vl_congestion(char *buf, int bufsz, void *val, int valsz) +{ + int cnt; + + cnt = _dump_fields(buf, bufsz, val, IB_PC_EXT_PORT_SELECT_F, + IB_PC_EXT_XMT_BYTES_F); + if (cnt < 0) + return; + + _dump_fields(buf + cnt, bufsz - cnt, val, IB_PC_SW_PORT_VL_CONGESTION_FIRST_F, + IB_PC_SW_PORT_VL_CONGESTION_LAST_F); +} + +void mad_dump_perfcounters_rcv_con_ctrl(char *buf, int bufsz, void *val, int valsz) +{ + int cnt; + + cnt = _dump_fields(buf, bufsz, val, IB_PC_EXT_PORT_SELECT_F, + IB_PC_EXT_XMT_BYTES_F); + if (cnt < 0) + return; + + _dump_fields(buf + cnt, bufsz - cnt, val, IB_PC_RCV_CON_CTRL_FIRST_F, + IB_PC_RCV_CON_CTRL_LAST_F); +} + + +void mad_dump_perfcounters_sl_rcv_fecn(char *buf, int bufsz, void *val, int valsz) +{ + int cnt; + + cnt = _dump_fields(buf, bufsz, val, IB_PC_EXT_PORT_SELECT_F, + IB_PC_EXT_XMT_BYTES_F); + if (cnt < 0) + return; + + _dump_fields(buf + cnt, bufsz - cnt, val, IB_PC_SL_RCV_FECN_FIRST_F, + IB_PC_SL_RCV_FECN_LAST_F); +} + +void mad_dump_perfcounters_sl_rcv_becn(char *buf, int bufsz, void *val, int valsz) +{ + int cnt; + + cnt = _dump_fields(buf, bufsz, val, IB_PC_EXT_PORT_SELECT_F, + IB_PC_EXT_XMT_BYTES_F); + if (cnt < 0) + return; + + 
_dump_fields(buf + cnt, bufsz - cnt, val, IB_PC_SL_RCV_BECN_FIRST_F, + IB_PC_SL_RCV_BECN_LAST_F); +} + +void mad_dump_perfcounters_xmit_con_ctrl(char *buf, int bufsz, void *val, int valsz) +{ + int cnt; + + cnt = _dump_fields(buf, bufsz, val, IB_PC_EXT_PORT_SELECT_F, + IB_PC_EXT_XMT_BYTES_F); + if (cnt < 0) + return; + + _dump_fields(buf + cnt, bufsz - cnt, val, IB_PC_XMIT_CON_CTRL_FIRST_F, + IB_PC_XMIT_CON_CTRL_LAST_F); +} + +void mad_dump_perfcounters_vl_xmit_time_cong(char *buf, int bufsz, void *val, int valsz) +{ + int cnt; + + cnt = _dump_fields(buf, bufsz, val, IB_PC_EXT_PORT_SELECT_F, + IB_PC_EXT_XMT_BYTES_F); + if (cnt < 0) + return; + + _dump_fields(buf + cnt, bufsz - cnt, val, IB_PC_VL_XMIT_TIME_CONG_FIRST_F, + IB_PC_VL_XMIT_TIME_CONG_LAST_F); +} + +void mad_dump_mlnx_ext_port_info(char *buf, int bufsz, void *val, int valsz) +{ + _dump_fields(buf, bufsz, val, IB_MLNX_EXT_PORT_STATE_CHG_ENABLE_F, + IB_MLNX_EXT_PORT_LAST_F); +} + +void mad_dump_cc_congestioninfo(char *buf, int bufsz, void *val, int valsz) +{ + _dump_fields(buf, bufsz, val, IB_CC_CONGESTION_INFO_FIRST_F, + IB_CC_CONGESTION_INFO_LAST_F); +} + +void mad_dump_cc_congestionkeyinfo(char *buf, int bufsz, void *val, int valsz) +{ + _dump_fields(buf, bufsz, val, IB_CC_CONGESTION_KEY_INFO_FIRST_F, + IB_CC_CONGESTION_KEY_INFO_LAST_F); +} + +void mad_dump_cc_congestionlog(char *buf, int bufsz, void *val, int valsz) +{ + _dump_fields(buf, bufsz, val, IB_CC_CONGESTION_LOG_FIRST_F, + IB_CC_CONGESTION_LOG_LAST_F); +} + +void mad_dump_cc_congestionlogswitch(char *buf, int bufsz, void *val, int valsz) +{ + _dump_fields(buf, bufsz, val, IB_CC_CONGESTION_LOG_SWITCH_FIRST_F, + IB_CC_CONGESTION_LOG_SWITCH_LAST_F); +} + +void mad_dump_cc_congestionlogentryswitch(char *buf, int bufsz, void *val, int valsz) +{ + _dump_fields(buf, bufsz, val, IB_CC_CONGESTION_LOG_ENTRY_SWITCH_FIRST_F, + IB_CC_CONGESTION_LOG_ENTRY_SWITCH_LAST_F); +} + +void mad_dump_cc_congestionlogca(char *buf, int bufsz, void *val, int valsz) +{ + _dump_fields(buf, bufsz, val, IB_CC_CONGESTION_LOG_CA_FIRST_F, + IB_CC_CONGESTION_LOG_CA_LAST_F); +} + +void mad_dump_cc_congestionlogentryca(char *buf, int bufsz, void *val, int valsz) +{ + _dump_fields(buf, bufsz, val, IB_CC_CONGESTION_LOG_ENTRY_CA_FIRST_F, + IB_CC_CONGESTION_LOG_ENTRY_CA_LAST_F); +} + +void mad_dump_cc_switchcongestionsetting(char *buf, int bufsz, void *val, int valsz) +{ + _dump_fields(buf, bufsz, val, IB_CC_SWITCH_CONGESTION_SETTING_FIRST_F, + IB_CC_SWITCH_CONGESTION_SETTING_LAST_F); +} + +void mad_dump_cc_switchportcongestionsettingelement(char *buf, int bufsz, void *val, int valsz) +{ + _dump_fields(buf, bufsz, val, IB_CC_SWITCH_PORT_CONGESTION_SETTING_ELEMENT_FIRST_F, + IB_CC_SWITCH_PORT_CONGESTION_SETTING_ELEMENT_LAST_F); +} + +void mad_dump_cc_cacongestionsetting(char *buf, int bufsz, void *val, int valsz) +{ + _dump_fields(buf, bufsz, val, IB_CC_CA_CONGESTION_SETTING_FIRST_F, + IB_CC_CA_CONGESTION_SETTING_LAST_F); +} + +void mad_dump_cc_cacongestionentry(char *buf, int bufsz, void *val, int valsz) +{ + _dump_fields(buf, bufsz, val, IB_CC_CA_CONGESTION_ENTRY_FIRST_F, + IB_CC_CA_CONGESTION_ENTRY_LAST_F); +} + +void mad_dump_cc_congestioncontroltable(char *buf, int bufsz, void *val, int valsz) +{ + _dump_fields(buf, bufsz, val, IB_CC_CONGESTION_CONTROL_TABLE_FIRST_F, + IB_CC_CONGESTION_CONTROL_TABLE_LAST_F); +} + +void mad_dump_cc_congestioncontroltableentry(char *buf, int bufsz, void *val, int valsz) +{ + _dump_fields(buf, bufsz, val, IB_CC_CONGESTION_CONTROL_TABLE_ENTRY_FIRST_F, + 
IB_CC_CONGESTION_CONTROL_TABLE_ENTRY_LAST_F);
+}
+
+void mad_dump_cc_timestamp(char *buf, int bufsz, void *val, int valsz)
+{
+	_dump_fields(buf, bufsz, val, IB_CC_TIMESTAMP_FIRST_F,
+		     IB_CC_TIMESTAMP_LAST_F);
+}
+
+void mad_dump_classportinfo(char *buf, int bufsz, void *val, int valsz)
+{
+	/* no FIRST_F and LAST_F for CPI field enums, must do a hack */
+	_dump_fields(buf, bufsz, val, IB_CPI_BASEVER_F, IB_CPI_TRAP_QKEY_F + 1);
+}
+
+void mad_dump_portinfo_ext(char *buf, int bufsz, void *val, int valsz)
+{
+	int cnt;
+
+	cnt = _dump_fields(buf, bufsz, val, IB_PORT_EXT_FIRST_F,
+			   IB_PORT_EXT_LAST_F);
+	if (cnt < 0)
+		return;
+
+	_dump_fields(buf + cnt, bufsz - cnt, val,
+		     IB_PORT_EXT_HDR_FEC_MODE_SUPPORTED_F,
+		     IB_PORT_EXT_HDR_FEC_MODE_LAST_F);
+}
+
+void xdump(FILE * file, const char *msg, void *p, int size)
+{
+#define HEX(x) ((x) < 10 ? '0' + (x) : 'a' + ((x) - 10))
+	uint8_t *cp = p;
+	int i;
+
+	if (msg)
+		fputs(msg, file);
+
+	for (i = 0; i < size;) {
+		fputc(HEX(*cp >> 4), file);
+		fputc(HEX(*cp & 0xf), file);
+		if (++i >= size)
+			break;
+		fputc(HEX(cp[1] >> 4), file);
+		fputc(HEX(cp[1] & 0xf), file);
+		if ((++i) % 16)
+			fputc(' ', file);
+		else
+			fputc('\n', file);
+		cp += 2;
+	}
+	if (i % 16)
+		fputc('\n', file);
+}
diff --git a/libibmad/fields.c b/libibmad/fields.c
new file mode 100644
index 0000000..233eb16
--- /dev/null
+++ b/libibmad/fields.c
@@ -0,0 +1,1268 @@
+/*
+ * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved.
+ * Copyright (c) 2009 HNR Consulting. All rights reserved.
+ * Copyright (c) 2009-2011 Mellanox Technologies LTD. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <infiniband/mad.h>
+
+/*
+ * BITSOFFS and BE_OFFS are required because the bit offsets are inconsistently
+ * encoded in the IB spec - IB headers are encoded such that the bit offsets
+ * are in big endian convention (BE_OFFS), while the SMI/GSI query data fields'
+ * bit offsets are specified using real bit offsets (?!).
+ * The following macros normalize everything to big endian offsets.
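+ * For example, a 16-bit field at real bit offset 128 is declared below as
+ * BITSOFFS(128, 16), which normalizes to bit offset 144 within the same
+ * 32-bit word (see the "Lid" PortInfo entry).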
+ */ +#define BITSOFFS(o, w) (((o) & ~31) | ((32 - ((o) & 31) - (w)))), (w) +#define BE_OFFS(o, w) (o), (w) +#define BE_TO_BITSOFFS(o, w) (((o) & ~31) | ((32 - ((o) & 31) - (w)))) + +static const ib_field_t ib_mad_f[] = { + {}, /* IB_NO_FIELD - reserved as invalid */ + + {0, 64, "GidPrefix", mad_dump_rhex}, + {64, 64, "GidGuid", mad_dump_rhex}, + + /* + * MAD: common MAD fields (IB spec 13.4.2) + * SMP: Subnet Management packets - lid routed (IB spec 14.2.1.1) + * DSMP: Subnet Management packets - direct route (IB spec 14.2.1.2) + * SA: Subnet Administration packets (IB spec 15.2.1.1) + */ + + /* first MAD word (0-3 bytes) */ + {BE_OFFS(0, 7), "MadMethod", mad_dump_hex}, /* TODO: add dumper */ + {BE_OFFS(7, 1), "MadIsResponse", mad_dump_uint}, /* TODO: add dumper */ + {BE_OFFS(8, 8), "MadClassVersion", mad_dump_uint}, + {BE_OFFS(16, 8), "MadMgmtClass", mad_dump_uint}, /* TODO: add dumper */ + {BE_OFFS(24, 8), "MadBaseVersion", mad_dump_uint}, + + /* second MAD word (4-7 bytes) */ + {BE_OFFS(48, 16), "MadStatus", mad_dump_hex}, /* TODO: add dumper */ + + /* DR SMP only */ + {BE_OFFS(32, 8), "DrSmpHopCnt", mad_dump_uint}, + {BE_OFFS(40, 8), "DrSmpHopPtr", mad_dump_uint}, + {BE_OFFS(48, 15), "DrSmpStatus", mad_dump_hex}, /* TODO: add dumper */ + {BE_OFFS(63, 1), "DrSmpDirection", mad_dump_uint}, /* TODO: add dumper */ + + /* words 3,4,5,6 (8-23 bytes) */ + {64, 64, "MadTRID", mad_dump_hex}, + {BE_OFFS(144, 16), "MadAttr", mad_dump_hex}, /* TODO: add dumper */ + {160, 32, "MadModifier", mad_dump_hex}, /* TODO: add dumper */ + + /* word 7,8 (24-31 bytes) */ + {192, 64, "MadMkey", mad_dump_hex}, + + /* word 9 (32-37 bytes) */ + {BE_OFFS(256, 16), "DrSmpDLID", mad_dump_uint}, + {BE_OFFS(272, 16), "DrSmpSLID", mad_dump_uint}, + + /* word 10,11 (36-43 bytes) */ + {288, 64, "SaSMkey", mad_dump_hex}, + + /* word 12 (44-47 bytes) */ + {BE_OFFS(46 * 8, 16), "SaAttrOffs", mad_dump_uint}, + + /* word 13,14 (48-55 bytes) */ + {48 * 8, 64, "SaCompMask", mad_dump_hex}, + + /* word 13,14 (56-255 bytes) */ + {56 * 8, (256 - 56) * 8, "SaData", mad_dump_hex}, + + /* bytes 64 - 127 */ + {}, /* IB_SM_DATA_F - reserved as invalid */ + + /* bytes 64 - 256 */ + {64 * 8, (256 - 64) * 8, "GsData", mad_dump_hex}, + + /* bytes 128 - 191 */ + {1024, 512, "DrSmpPath", mad_dump_hex}, + + /* bytes 192 - 255 */ + {1536, 512, "DrSmpRetPath", mad_dump_hex}, + + /* + * PortInfo fields + */ + {0, 64, "Mkey", mad_dump_hex}, + {64, 64, "GidPrefix", mad_dump_hex}, + {BITSOFFS(128, 16), "Lid", mad_dump_uint}, + {BITSOFFS(144, 16), "SMLid", mad_dump_uint}, + {160, 32, "CapMask", mad_dump_portcapmask}, + {BITSOFFS(192, 16), "DiagCode", mad_dump_hex}, + {BITSOFFS(208, 16), "MkeyLeasePeriod", mad_dump_uint}, + {BITSOFFS(224, 8), "LocalPort", mad_dump_uint}, + {BITSOFFS(232, 8), "LinkWidthEnabled", mad_dump_linkwidthen}, + {BITSOFFS(240, 8), "LinkWidthSupported", mad_dump_linkwidthsup}, + {BITSOFFS(248, 8), "LinkWidthActive", mad_dump_linkwidth}, + {BITSOFFS(256, 4), "LinkSpeedSupported", mad_dump_linkspeedsup}, + {BITSOFFS(260, 4), "LinkState", mad_dump_portstate}, + {BITSOFFS(264, 4), "PhysLinkState", mad_dump_physportstate}, + {BITSOFFS(268, 4), "LinkDownDefState", mad_dump_linkdowndefstate}, + {BITSOFFS(272, 2), "ProtectBits", mad_dump_uint}, + {BITSOFFS(277, 3), "LMC", mad_dump_uint}, + {BITSOFFS(280, 4), "LinkSpeedActive", mad_dump_linkspeed}, + {BITSOFFS(284, 4), "LinkSpeedEnabled", mad_dump_linkspeeden}, + {BITSOFFS(288, 4), "NeighborMTU", mad_dump_mtu}, + {BITSOFFS(292, 4), "SMSL", mad_dump_uint}, + {BITSOFFS(296, 4), "VLCap", 
mad_dump_vlcap}, + {BITSOFFS(300, 4), "InitType", mad_dump_hex}, + {BITSOFFS(304, 8), "VLHighLimit", mad_dump_uint}, + {BITSOFFS(312, 8), "VLArbHighCap", mad_dump_uint}, + {BITSOFFS(320, 8), "VLArbLowCap", mad_dump_uint}, + {BITSOFFS(328, 4), "InitReply", mad_dump_hex}, + {BITSOFFS(332, 4), "MtuCap", mad_dump_mtu}, + {BITSOFFS(336, 3), "VLStallCount", mad_dump_uint}, + {BITSOFFS(339, 5), "HoqLife", mad_dump_uint}, + {BITSOFFS(344, 4), "OperVLs", mad_dump_opervls}, + {BITSOFFS(348, 1), "PartEnforceInb", mad_dump_uint}, + {BITSOFFS(349, 1), "PartEnforceOutb", mad_dump_uint}, + {BITSOFFS(350, 1), "FilterRawInb", mad_dump_uint}, + {BITSOFFS(351, 1), "FilterRawOutb", mad_dump_uint}, + {BITSOFFS(352, 16), "MkeyViolations", mad_dump_uint}, + {BITSOFFS(368, 16), "PkeyViolations", mad_dump_uint}, + {BITSOFFS(384, 16), "QkeyViolations", mad_dump_uint}, + {BITSOFFS(400, 8), "GuidCap", mad_dump_uint}, + {BITSOFFS(408, 1), "ClientReregister", mad_dump_uint}, + {BITSOFFS(409, 2), "McastPkeyTrapSuppressionEnabled", mad_dump_uint}, + {BITSOFFS(411, 5), "SubnetTimeout", mad_dump_uint}, + {BITSOFFS(419, 5), "RespTimeVal", mad_dump_uint}, + {BITSOFFS(424, 4), "LocalPhysErr", mad_dump_uint}, + {BITSOFFS(428, 4), "OverrunErr", mad_dump_uint}, + {BITSOFFS(432, 16), "MaxCreditHint", mad_dump_uint}, + {BITSOFFS(456, 24), "RoundTrip", mad_dump_uint}, + {}, /* IB_PORT_LAST_F */ + + /* + * NodeInfo fields + */ + {BITSOFFS(0, 8), "BaseVers", mad_dump_uint}, + {BITSOFFS(8, 8), "ClassVers", mad_dump_uint}, + {BITSOFFS(16, 8), "NodeType", mad_dump_node_type}, + {BITSOFFS(24, 8), "NumPorts", mad_dump_uint}, + {32, 64, "SystemGuid", mad_dump_hex}, + {96, 64, "Guid", mad_dump_hex}, + {160, 64, "PortGuid", mad_dump_hex}, + {BITSOFFS(224, 16), "PartCap", mad_dump_uint}, + {BITSOFFS(240, 16), "DevId", mad_dump_hex}, + {256, 32, "Revision", mad_dump_hex}, + {BITSOFFS(288, 8), "LocalPort", mad_dump_uint}, + {BITSOFFS(296, 24), "VendorId", mad_dump_hex}, + {}, /* IB_NODE_LAST_F */ + + /* + * SwitchInfo fields + */ + {BITSOFFS(0, 16), "LinearFdbCap", mad_dump_uint}, + {BITSOFFS(16, 16), "RandomFdbCap", mad_dump_uint}, + {BITSOFFS(32, 16), "McastFdbCap", mad_dump_uint}, + {BITSOFFS(48, 16), "LinearFdbTop", mad_dump_uint}, + {BITSOFFS(64, 8), "DefPort", mad_dump_uint}, + {BITSOFFS(72, 8), "DefMcastPrimPort", mad_dump_uint}, + {BITSOFFS(80, 8), "DefMcastNotPrimPort", mad_dump_uint}, + {BITSOFFS(88, 5), "LifeTime", mad_dump_uint}, + {BITSOFFS(93, 1), "StateChange", mad_dump_uint}, + {BITSOFFS(94, 2), "OptSLtoVLMapping", mad_dump_uint}, + {BITSOFFS(96, 16), "LidsPerPort", mad_dump_uint}, + {BITSOFFS(112, 16), "PartEnforceCap", mad_dump_uint}, + {BITSOFFS(128, 1), "InboundPartEnf", mad_dump_uint}, + {BITSOFFS(129, 1), "OutboundPartEnf", mad_dump_uint}, + {BITSOFFS(130, 1), "FilterRawInbound", mad_dump_uint}, + {BITSOFFS(131, 1), "FilterRawOutbound", mad_dump_uint}, + {BITSOFFS(132, 1), "EnhancedPort0", mad_dump_uint}, + {BITSOFFS(144, 16), "MulticastFDBTop", mad_dump_hex}, + {}, /* IB_SW_LAST_F */ + + /* + * SwitchLinearForwardingTable fields + */ + {0, 512, "LinearForwTbl", mad_dump_array}, + + /* + * SwitchMulticastForwardingTable fields + */ + {0, 512, "MulticastForwTbl", mad_dump_array}, + + /* + * NodeDescription fields + */ + {0, 64 * 8, "NodeDesc", mad_dump_string}, + + /* + * Notice/Trap fields + */ + {BITSOFFS(0, 1), "NoticeIsGeneric", mad_dump_uint}, + {BITSOFFS(1, 7), "NoticeType", mad_dump_uint}, + {BITSOFFS(8, 24), "NoticeProducerType", mad_dump_node_type}, + {BITSOFFS(32, 16), "NoticeTrapNumber", mad_dump_uint}, + 
{BITSOFFS(48, 16), "NoticeIssuerLID", mad_dump_uint}, + {BITSOFFS(64, 1), "NoticeToggle", mad_dump_uint}, + {BITSOFFS(65, 15), "NoticeCount", mad_dump_uint}, + {80, 432, "NoticeDataDetails", mad_dump_array}, + {BITSOFFS(80, 16), "NoticeDataLID", mad_dump_uint}, + {BITSOFFS(96, 16), "NoticeDataTrap144LID", mad_dump_uint}, + {BITSOFFS(128, 32), "NoticeDataTrap144CapMask", mad_dump_uint}, + + /* + * Port counters + */ + {BITSOFFS(8, 8), "PortSelect", mad_dump_uint}, + {BITSOFFS(16, 16), "CounterSelect", mad_dump_hex}, + {BITSOFFS(32, 16), "SymbolErrorCounter", mad_dump_uint}, + {BITSOFFS(48, 8), "LinkErrorRecoveryCounter", mad_dump_uint}, + {BITSOFFS(56, 8), "LinkDownedCounter", mad_dump_uint}, + {BITSOFFS(64, 16), "PortRcvErrors", mad_dump_uint}, + {BITSOFFS(80, 16), "PortRcvRemotePhysicalErrors", mad_dump_uint}, + {BITSOFFS(96, 16), "PortRcvSwitchRelayErrors", mad_dump_uint}, + {BITSOFFS(112, 16), "PortXmitDiscards", mad_dump_uint}, + {BITSOFFS(128, 8), "PortXmitConstraintErrors", mad_dump_uint}, + {BITSOFFS(136, 8), "PortRcvConstraintErrors", mad_dump_uint}, + {BITSOFFS(144, 8), "CounterSelect2", mad_dump_hex}, + {BITSOFFS(152, 4), "LocalLinkIntegrityErrors", mad_dump_uint}, + {BITSOFFS(156, 4), "ExcessiveBufferOverrunErrors", mad_dump_uint}, + {BITSOFFS(176, 16), "VL15Dropped", mad_dump_uint}, + {192, 32, "PortXmitData", mad_dump_uint}, + {224, 32, "PortRcvData", mad_dump_uint}, + {256, 32, "PortXmitPkts", mad_dump_uint}, + {288, 32, "PortRcvPkts", mad_dump_uint}, + {320, 32, "PortXmitWait", mad_dump_uint}, + {}, /* IB_PC_LAST_F */ + + /* + * SMInfo + */ + {0, 64, "SmInfoGuid", mad_dump_hex}, + {64, 64, "SmInfoKey", mad_dump_hex}, + {128, 32, "SmActivity", mad_dump_uint}, + {BITSOFFS(160, 4), "SmPriority", mad_dump_uint}, + {BITSOFFS(164, 4), "SmState", mad_dump_uint}, + + /* + * SA RMPP + */ + {BE_OFFS(24 * 8 + 24, 8), "RmppVers", mad_dump_uint}, + {BE_OFFS(24 * 8 + 16, 8), "RmppType", mad_dump_uint}, + {BE_OFFS(24 * 8 + 11, 5), "RmppResp", mad_dump_uint}, + {BE_OFFS(24 * 8 + 8, 3), "RmppFlags", mad_dump_hex}, + {BE_OFFS(24 * 8 + 0, 8), "RmppStatus", mad_dump_hex}, + + /* data1 */ + {28 * 8, 32, "RmppData1", mad_dump_hex}, + {28 * 8, 32, "RmppSegNum", mad_dump_uint}, + /* data2 */ + {32 * 8, 32, "RmppData2", mad_dump_hex}, + {32 * 8, 32, "RmppPayload", mad_dump_uint}, + {32 * 8, 32, "RmppNewWin", mad_dump_uint}, + + /* + * SA Get Multi Path + */ + {BITSOFFS(41, 7), "MultiPathNumPath", mad_dump_uint}, + {BITSOFFS(120, 8), "MultiPathNumSrc", mad_dump_uint}, + {BITSOFFS(128, 8), "MultiPathNumDest", mad_dump_uint}, + {192, 128, "MultiPathGid", mad_dump_array}, + + /* + * SA Path rec + */ + {64, 128, "PathRecDGid", mad_dump_array}, + {192, 128, "PathRecSGid", mad_dump_array}, + {BITSOFFS(320, 16), "PathRecDLid", mad_dump_uint}, + {BITSOFFS(336, 16), "PathRecSLid", mad_dump_uint}, + {BITSOFFS(393, 7), "PathRecNumPath", mad_dump_uint}, + {BITSOFFS(428, 4), "PathRecSL", mad_dump_uint}, + + /* + * MC Member rec + */ + {0, 128, "McastMemMGid", mad_dump_array}, + {128, 128, "McastMemPortGid", mad_dump_array}, + {256, 32, "McastMemQkey", mad_dump_hex}, + {BITSOFFS(288, 16), "McastMemMLid", mad_dump_hex}, + {BITSOFFS(352, 4), "McastMemSL", mad_dump_uint}, + {BITSOFFS(306, 6), "McastMemMTU", mad_dump_uint}, + {BITSOFFS(338, 6), "McastMemRate", mad_dump_uint}, + {BITSOFFS(312, 8), "McastMemTClass", mad_dump_uint}, + {BITSOFFS(320, 16), "McastMemPkey", mad_dump_uint}, + {BITSOFFS(356, 20), "McastMemFlowLbl", mad_dump_uint}, + {BITSOFFS(388, 4), "McastMemJoinState", mad_dump_uint}, + {BITSOFFS(392, 1), 
"McastMemProxyJoin", mad_dump_uint}, + + /* + * Service record + */ + {0, 64, "ServRecID", mad_dump_hex}, + {64, 128, "ServRecGid", mad_dump_array}, + {BITSOFFS(192, 16), "ServRecPkey", mad_dump_hex}, + {224, 32, "ServRecLease", mad_dump_hex}, + {256, 128, "ServRecKey", mad_dump_hex}, + {384, 512, "ServRecName", mad_dump_string}, + {896, 512, "ServRecData", mad_dump_array}, /* ATS for example */ + + /* + * ATS SM record - within SA_SR_DATA + */ + {12 * 8, 32, "ATSNodeAddr", mad_dump_hex}, + {BITSOFFS(16 * 8, 16), "ATSMagicKey", mad_dump_hex}, + {BITSOFFS(18 * 8, 16), "ATSNodeType", mad_dump_hex}, + {32 * 8, 32 * 8, "ATSNodeName", mad_dump_string}, + + /* + * SLTOVL MAPPING TABLE + */ + {0, 64, "SLToVLMap", mad_dump_hex}, + + /* + * VL ARBITRATION TABLE + */ + {0, 512, "VLArbTbl", mad_dump_array}, + + /* + * IB vendor classes range 2 + */ + {BE_OFFS(36 * 8, 24), "OUI", mad_dump_array}, + {40 * 8, (256 - 40) * 8, "Vendor2Data", mad_dump_array}, + + /* + * Extended port counters + */ + {BITSOFFS(8, 8), "PortSelect", mad_dump_uint}, + {BITSOFFS(16, 16), "CounterSelect", mad_dump_hex}, + {64, 64, "PortXmitData", mad_dump_uint}, + {128, 64, "PortRcvData", mad_dump_uint}, + {192, 64, "PortXmitPkts", mad_dump_uint}, + {256, 64, "PortRcvPkts", mad_dump_uint}, + {320, 64, "PortUnicastXmitPkts", mad_dump_uint}, + {384, 64, "PortUnicastRcvPkts", mad_dump_uint}, + {448, 64, "PortMulticastXmitPkts", mad_dump_uint}, + {512, 64, "PortMulticastRcvPkts", mad_dump_uint}, + {}, /* IB_PC_EXT_LAST_F */ + + /* + * GUIDInfo fields + */ + {0, 64, "GUID0", mad_dump_hex}, + + /* + * ClassPortInfo fields + */ + {BITSOFFS(0, 8), "BaseVersion", mad_dump_uint}, + {BITSOFFS(8, 8), "ClassVersion", mad_dump_uint}, + {BITSOFFS(16, 16), "CapabilityMask", mad_dump_hex}, + {BITSOFFS(32, 27), "CapabilityMask2", mad_dump_hex}, + {BITSOFFS(59, 5), "RespTimeVal", mad_dump_uint}, + {64, 128, "RedirectGID", mad_dump_array}, + {BITSOFFS(192, 8), "RedirectTC", mad_dump_hex}, + {BITSOFFS(200, 4), "RedirectSL", mad_dump_uint}, + {BITSOFFS(204, 20), "RedirectFL", mad_dump_hex}, + {BITSOFFS(224, 16), "RedirectLID", mad_dump_uint}, + {BITSOFFS(240, 16), "RedirectPKey", mad_dump_hex}, + {BITSOFFS(264, 24), "RedirectQP", mad_dump_hex}, + {288, 32, "RedirectQKey", mad_dump_hex}, + {320, 128, "TrapGID", mad_dump_array}, + {BITSOFFS(448, 8), "TrapTC", mad_dump_hex}, + {BITSOFFS(456, 4), "TrapSL", mad_dump_uint}, + {BITSOFFS(460, 20), "TrapFL", mad_dump_hex}, + {BITSOFFS(480, 16), "TrapLID", mad_dump_uint}, + {BITSOFFS(496, 16), "TrapPKey", mad_dump_hex}, + {BITSOFFS(512, 8), "TrapHL", mad_dump_uint}, + {BITSOFFS(520, 24), "TrapQP", mad_dump_hex}, + {544, 32, "TrapQKey", mad_dump_hex}, + + /* + * PortXmitDataSL fields + */ + {32, 32, "XmtDataSL0", mad_dump_uint}, + {64, 32, "XmtDataSL1", mad_dump_uint}, + {96, 32, "XmtDataSL2", mad_dump_uint}, + {128, 32, "XmtDataSL3", mad_dump_uint}, + {160, 32, "XmtDataSL4", mad_dump_uint}, + {192, 32, "XmtDataSL5", mad_dump_uint}, + {224, 32, "XmtDataSL6", mad_dump_uint}, + {256, 32, "XmtDataSL7", mad_dump_uint}, + {288, 32, "XmtDataSL8", mad_dump_uint}, + {320, 32, "XmtDataSL9", mad_dump_uint}, + {352, 32, "XmtDataSL10", mad_dump_uint}, + {384, 32, "XmtDataSL11", mad_dump_uint}, + {416, 32, "XmtDataSL12", mad_dump_uint}, + {448, 32, "XmtDataSL13", mad_dump_uint}, + {480, 32, "XmtDataSL14", mad_dump_uint}, + {512, 32, "XmtDataSL15", mad_dump_uint}, + {}, /* IB_PC_XMT_DATA_SL_LAST_F */ + + /* + * PortRcvDataSL fields + */ + {32, 32, "RcvDataSL0", mad_dump_uint}, + {64, 32, "RcvDataSL1", mad_dump_uint}, + {96, 
32, "RcvDataSL2", mad_dump_uint}, + {128, 32, "RcvDataSL3", mad_dump_uint}, + {160, 32, "RcvDataSL4", mad_dump_uint}, + {192, 32, "RcvDataSL5", mad_dump_uint}, + {224, 32, "RcvDataSL6", mad_dump_uint}, + {256, 32, "RcvDataSL7", mad_dump_uint}, + {288, 32, "RcvDataSL8", mad_dump_uint}, + {320, 32, "RcvDataSL9", mad_dump_uint}, + {352, 32, "RcvDataSL10", mad_dump_uint}, + {384, 32, "RcvDataSL11", mad_dump_uint}, + {416, 32, "RcvDataSL12", mad_dump_uint}, + {448, 32, "RcvDataSL13", mad_dump_uint}, + {480, 32, "RcvDataSL14", mad_dump_uint}, + {512, 32, "RcvDataSL15", mad_dump_uint}, + {}, /* IB_PC_RCV_DATA_SL_LAST_F */ + + /* + * PortXmitDiscardDetails fields + */ + {BITSOFFS(32, 16), "PortInactiveDiscards", mad_dump_uint}, + {BITSOFFS(48, 16), "PortNeighborMTUDiscards", mad_dump_uint}, + {BITSOFFS(64, 16), "PortSwLifetimeLimitDiscards", mad_dump_uint}, + {BITSOFFS(80, 16), "PortSwHOQLifetimeLimitDiscards", mad_dump_uint}, + {}, /* IB_PC_XMT_DISC_LAST_F */ + + /* + * PortRcvErrorDetails fields + */ + {BITSOFFS(32, 16), "PortLocalPhysicalErrors", mad_dump_uint}, + {BITSOFFS(48, 16), "PortMalformedPktErrors", mad_dump_uint}, + {BITSOFFS(64, 16), "PortBufferOverrunErrors", mad_dump_uint}, + {BITSOFFS(80, 16), "PortDLIDMappingErrors", mad_dump_uint}, + {BITSOFFS(96, 16), "PortVLMappingErrors", mad_dump_uint}, + {BITSOFFS(112, 16), "PortLoopingErrors", mad_dump_uint}, + {}, /* IB_PC_RCV_ERR_LAST_F */ + + /* + * PortSamplesControl fields + */ + {BITSOFFS(0, 8), "OpCode", mad_dump_hex}, + {BITSOFFS(8, 8), "PortSelect", mad_dump_uint}, + {BITSOFFS(16, 8), "Tick", mad_dump_hex}, + {BITSOFFS(29, 3), "CounterWidth", mad_dump_uint}, + {BITSOFFS(34, 3), "CounterMask0", mad_dump_hex}, + {BITSOFFS(37, 27), "CounterMasks1to9", mad_dump_hex}, + {BITSOFFS(65, 15), "CounterMasks10to14", mad_dump_hex}, + {BITSOFFS(80, 8), "SampleMechanisms", mad_dump_uint}, + {BITSOFFS(94, 2), "SampleStatus", mad_dump_uint}, + {96, 64, "OptionMask", mad_dump_hex}, + {160, 64, "VendorMask", mad_dump_hex}, + {224, 32, "SampleStart", mad_dump_uint}, + {256, 32, "SampleInterval", mad_dump_uint}, + {BITSOFFS(288, 16), "Tag", mad_dump_hex}, + {BITSOFFS(304, 16), "CounterSelect0", mad_dump_hex}, + {BITSOFFS(320, 16), "CounterSelect1", mad_dump_hex}, + {BITSOFFS(336, 16), "CounterSelect2", mad_dump_hex}, + {BITSOFFS(352, 16), "CounterSelect3", mad_dump_hex}, + {BITSOFFS(368, 16), "CounterSelect4", mad_dump_hex}, + {BITSOFFS(384, 16), "CounterSelect5", mad_dump_hex}, + {BITSOFFS(400, 16), "CounterSelect6", mad_dump_hex}, + {BITSOFFS(416, 16), "CounterSelect7", mad_dump_hex}, + {BITSOFFS(432, 16), "CounterSelect8", mad_dump_hex}, + {BITSOFFS(448, 16), "CounterSelect9", mad_dump_hex}, + {BITSOFFS(464, 16), "CounterSelect10", mad_dump_hex}, + {BITSOFFS(480, 16), "CounterSelect11", mad_dump_hex}, + {BITSOFFS(496, 16), "CounterSelect12", mad_dump_hex}, + {BITSOFFS(512, 16), "CounterSelect13", mad_dump_hex}, + {BITSOFFS(528, 16), "CounterSelect14", mad_dump_hex}, + {576, 64, "SamplesOnlyOptionMask", mad_dump_hex}, + {}, /* IB_PSC_LAST_F */ + + /* GUIDInfo fields */ + {0, 64, "GUID0", mad_dump_hex}, + {64, 64, "GUID1", mad_dump_hex}, + {128, 64, "GUID2", mad_dump_hex}, + {192, 64, "GUID3", mad_dump_hex}, + {256, 64, "GUID4", mad_dump_hex}, + {320, 64, "GUID5", mad_dump_hex}, + {384, 64, "GUID6", mad_dump_hex}, + {448, 64, "GUID7", mad_dump_hex}, + + /* GUID Info Record */ + {BITSOFFS(0, 16), "Lid", mad_dump_uint}, + {BITSOFFS(16, 8), "BlockNum", mad_dump_uint}, + {64, 64, "Guid0", mad_dump_hex}, + {128, 64, "Guid1", mad_dump_hex}, + {192, 64, 
"Guid2", mad_dump_hex}, + {256, 64, "Guid3", mad_dump_hex}, + {320, 64, "Guid4", mad_dump_hex}, + {384, 64, "Guid5", mad_dump_hex}, + {448, 64, "Guid6", mad_dump_hex}, + {512, 64, "Guid7", mad_dump_hex}, + + /* + * More PortInfo fields + */ + {BITSOFFS(480, 16), "CapabilityMask2", mad_dump_portcapmask2}, + {BITSOFFS(496, 4), "LinkSpeedExtActive", mad_dump_linkspeedext}, + {BITSOFFS(500, 4), "LinkSpeedExtSupported", mad_dump_linkspeedextsup}, + {BITSOFFS(507, 5), "LinkSpeedExtEnabled", mad_dump_linkspeedexten}, + {}, /* IB_PORT_LINK_SPEED_EXT_LAST_F */ + + /* + * PortExtendedSpeedsCounters fields + */ + {BITSOFFS(8, 8), "PortSelect", mad_dump_uint}, + {64, 64, "CounterSelect", mad_dump_hex}, + {BITSOFFS(128, 16), "SyncHeaderErrorCounter", mad_dump_uint}, + {BITSOFFS(144, 16), "UnknownBlockCounter", mad_dump_uint}, + {BITSOFFS(160, 16), "ErrorDetectionCounterLane0", mad_dump_uint}, + {BITSOFFS(176, 16), "ErrorDetectionCounterLane1", mad_dump_uint}, + {BITSOFFS(192, 16), "ErrorDetectionCounterLane2", mad_dump_uint}, + {BITSOFFS(208, 16), "ErrorDetectionCounterLane3", mad_dump_uint}, + {BITSOFFS(224, 16), "ErrorDetectionCounterLane4", mad_dump_uint}, + {BITSOFFS(240, 16), "ErrorDetectionCounterLane5", mad_dump_uint}, + {BITSOFFS(256, 16), "ErrorDetectionCounterLane6", mad_dump_uint}, + {BITSOFFS(272, 16), "ErrorDetectionCounterLane7", mad_dump_uint}, + {BITSOFFS(288, 16), "ErrorDetectionCounterLane8", mad_dump_uint}, + {BITSOFFS(304, 16), "ErrorDetectionCounterLane9", mad_dump_uint}, + {BITSOFFS(320, 16), "ErrorDetectionCounterLane10", mad_dump_uint}, + {BITSOFFS(336, 16), "ErrorDetectionCounterLane11", mad_dump_uint}, + {352, 32, "FECCorrectableBlockCtrLane0", mad_dump_uint}, + {384, 32, "FECCorrectableBlockCtrLane1", mad_dump_uint}, + {416, 32, "FECCorrectableBlockCtrLane2", mad_dump_uint}, + {448, 32, "FECCorrectableBlockCtrLane3", mad_dump_uint}, + {480, 32, "FECCorrectableBlockCtrLane4", mad_dump_uint}, + {512, 32, "FECCorrectableBlockCtrLane5", mad_dump_uint}, + {544, 32, "FECCorrectableBlockCtrLane6", mad_dump_uint}, + {576, 32, "FECCorrectableBlockCtrLane7", mad_dump_uint}, + {608, 32, "FECCorrectableBlockCtrLane8", mad_dump_uint}, + {640, 32, "FECCorrectableBlockCtrLane9", mad_dump_uint}, + {672, 32, "FECCorrectableBlockCtrLane10", mad_dump_uint}, + {704, 32, "FECCorrectableBlockCtrLane11", mad_dump_uint}, + {736, 32, "FECUncorrectableBlockCtrLane0", mad_dump_uint}, + {768, 32, "FECUncorrectableBlockCtrLane1", mad_dump_uint}, + {800, 32, "FECUncorrectableBlockCtrLane2", mad_dump_uint}, + {832, 32, "FECUncorrectableBlockCtrLane3", mad_dump_uint}, + {864, 32, "FECUncorrectableBlockCtrLane4", mad_dump_uint}, + {896, 32, "FECUncorrectableBlockCtrLane5", mad_dump_uint}, + {928, 32, "FECUncorrectableBlockCtrLane6", mad_dump_uint}, + {960, 32, "FECUncorrectableBlockCtrLane7", mad_dump_uint}, + {992, 32, "FECUncorrectableBlockCtrLane8", mad_dump_uint}, + {1024, 32, "FECUncorrectableBlockCtrLane9", mad_dump_uint}, + {1056, 32, "FECUncorrectableBlockCtrLane10", mad_dump_uint}, + {1088, 32, "FECUncorrectableBlockCtrLane11", mad_dump_uint}, + {}, /* IB_PESC_LAST_F */ + + /* + * PortOpRcvCounters fields + */ + {32, 32, "PortOpRcvPkts", mad_dump_uint}, + {64, 32, "PortOpRcvData", mad_dump_uint}, + {}, /* IB_PC_PORT_OP_RCV_COUNTERS_LAST_F */ + + /* + * PortFlowCtlCounters fields + */ + {32, 32, "PortXmitFlowPkts", mad_dump_uint}, + {64, 32, "PortRcvFlowPkts", mad_dump_uint}, + {}, /* IB_PC_PORT_FLOW_CTL_COUNTERS_LAST_F */ + + /* + * PortVLOpPackets fields + */ + {BITSOFFS(32, 16), 
"PortVLOpPackets0", mad_dump_uint}, + {BITSOFFS(48, 16), "PortVLOpPackets1", mad_dump_uint}, + {BITSOFFS(64, 16), "PortVLOpPackets2", mad_dump_uint}, + {BITSOFFS(80, 16), "PortVLOpPackets3", mad_dump_uint}, + {BITSOFFS(96, 16), "PortVLOpPackets4", mad_dump_uint}, + {BITSOFFS(112, 16), "PortVLOpPackets5", mad_dump_uint}, + {BITSOFFS(128, 16), "PortVLOpPackets6", mad_dump_uint}, + {BITSOFFS(144, 16), "PortVLOpPackets7", mad_dump_uint}, + {BITSOFFS(160, 16), "PortVLOpPackets8", mad_dump_uint}, + {BITSOFFS(176, 16), "PortVLOpPackets9", mad_dump_uint}, + {BITSOFFS(192, 16), "PortVLOpPackets10", mad_dump_uint}, + {BITSOFFS(208, 16), "PortVLOpPackets11", mad_dump_uint}, + {BITSOFFS(224, 16), "PortVLOpPackets12", mad_dump_uint}, + {BITSOFFS(240, 16), "PortVLOpPackets13", mad_dump_uint}, + {BITSOFFS(256, 16), "PortVLOpPackets14", mad_dump_uint}, + {BITSOFFS(272, 16), "PortVLOpPackets15", mad_dump_uint}, + {}, /* IB_PC_PORT_VL_OP_PACKETS_LAST_F */ + + /* + * PortVLOpData fields + */ + {32, 32, "PortVLOpData0", mad_dump_uint}, + {64, 32, "PortVLOpData1", mad_dump_uint}, + {96, 32, "PortVLOpData2", mad_dump_uint}, + {128, 32, "PortVLOpData3", mad_dump_uint}, + {160, 32, "PortVLOpData4", mad_dump_uint}, + {192, 32, "PortVLOpData5", mad_dump_uint}, + {224, 32, "PortVLOpData6", mad_dump_uint}, + {256, 32, "PortVLOpData7", mad_dump_uint}, + {288, 32, "PortVLOpData8", mad_dump_uint}, + {320, 32, "PortVLOpData9", mad_dump_uint}, + {352, 32, "PortVLOpData10", mad_dump_uint}, + {384, 32, "PortVLOpData11", mad_dump_uint}, + {416, 32, "PortVLOpData12", mad_dump_uint}, + {448, 32, "PortVLOpData13", mad_dump_uint}, + {480, 32, "PortVLOpData14", mad_dump_uint}, + {512, 32, "PortVLOpData15", mad_dump_uint}, + {}, /* IB_PC_PORT_VL_OP_DATA_LAST_F */ + + /* + * PortVLXmitFlowCtlUpdateErrors fields + */ + {BITSOFFS(32, 2), "PortVLXmitFlowCtlUpdateErrors0", mad_dump_uint}, + {BITSOFFS(34, 2), "PortVLXmitFlowCtlUpdateErrors1", mad_dump_uint}, + {BITSOFFS(36, 2), "PortVLXmitFlowCtlUpdateErrors2", mad_dump_uint}, + {BITSOFFS(38, 2), "PortVLXmitFlowCtlUpdateErrors3", mad_dump_uint}, + {BITSOFFS(40, 2), "PortVLXmitFlowCtlUpdateErrors4", mad_dump_uint}, + {BITSOFFS(42, 2), "PortVLXmitFlowCtlUpdateErrors5", mad_dump_uint}, + {BITSOFFS(44, 2), "PortVLXmitFlowCtlUpdateErrors6", mad_dump_uint}, + {BITSOFFS(46, 2), "PortVLXmitFlowCtlUpdateErrors7", mad_dump_uint}, + {BITSOFFS(48, 2), "PortVLXmitFlowCtlUpdateErrors8", mad_dump_uint}, + {BITSOFFS(50, 2), "PortVLXmitFlowCtlUpdateErrors9", mad_dump_uint}, + {BITSOFFS(52, 2), "PortVLXmitFlowCtlUpdateErrors10", mad_dump_uint}, + {BITSOFFS(54, 2), "PortVLXmitFlowCtlUpdateErrors11", mad_dump_uint}, + {BITSOFFS(56, 2), "PortVLXmitFlowCtlUpdateErrors12", mad_dump_uint}, + {BITSOFFS(58, 2), "PortVLXmitFlowCtlUpdateErrors13", mad_dump_uint}, + {BITSOFFS(60, 2), "PortVLXmitFlowCtlUpdateErrors14", mad_dump_uint}, + {BITSOFFS(62, 2), "PortVLXmitFlowCtlUpdateErrors15", mad_dump_uint}, + {}, /* IB_PC_PORT_VL_XMIT_FLOW_CTL_UPDATE_ERRORS_LAST_F */ + + /* + * PortVLXmitWaitCounters fields + */ + {BITSOFFS(32, 16), "PortVLXmitWait0", mad_dump_uint}, + {BITSOFFS(48, 16), "PortVLXmitWait1", mad_dump_uint}, + {BITSOFFS(64, 16), "PortVLXmitWait2", mad_dump_uint}, + {BITSOFFS(80, 16), "PortVLXmitWait3", mad_dump_uint}, + {BITSOFFS(96, 16), "PortVLXmitWait4", mad_dump_uint}, + {BITSOFFS(112, 16), "PortVLXmitWait5", mad_dump_uint}, + {BITSOFFS(128, 16), "PortVLXmitWait6", mad_dump_uint}, + {BITSOFFS(144, 16), "PortVLXmitWait7", mad_dump_uint}, + {BITSOFFS(160, 16), "PortVLXmitWait8", mad_dump_uint}, + 
{BITSOFFS(176, 16), "PortVLXmitWait9", mad_dump_uint}, + {BITSOFFS(192, 16), "PortVLXmitWait10", mad_dump_uint}, + {BITSOFFS(208, 16), "PortVLXmitWait11", mad_dump_uint}, + {BITSOFFS(224, 16), "PortVLXmitWait12", mad_dump_uint}, + {BITSOFFS(240, 16), "PortVLXmitWait13", mad_dump_uint}, + {BITSOFFS(256, 16), "PortVLXmitWait14", mad_dump_uint}, + {BITSOFFS(272, 16), "PortVLXmitWait15", mad_dump_uint}, + {}, /* IB_PC_PORT_VL_XMIT_WAIT_COUNTERS_LAST_F */ + + /* + * SwPortVLCongestion fields + */ + {BITSOFFS(32, 16), "SWPortVLCongestion0", mad_dump_uint}, + {BITSOFFS(48, 16), "SWPortVLCongestion1", mad_dump_uint}, + {BITSOFFS(64, 16), "SWPortVLCongestion2", mad_dump_uint}, + {BITSOFFS(80, 16), "SWPortVLCongestion3", mad_dump_uint}, + {BITSOFFS(96, 16), "SWPortVLCongestion4", mad_dump_uint}, + {BITSOFFS(112, 16), "SWPortVLCongestion5", mad_dump_uint}, + {BITSOFFS(128, 16), "SWPortVLCongestion6", mad_dump_uint}, + {BITSOFFS(144, 16), "SWPortVLCongestion7", mad_dump_uint}, + {BITSOFFS(160, 16), "SWPortVLCongestion8", mad_dump_uint}, + {BITSOFFS(176, 16), "SWPortVLCongestion9", mad_dump_uint}, + {BITSOFFS(192, 16), "SWPortVLCongestion10", mad_dump_uint}, + {BITSOFFS(208, 16), "SWPortVLCongestion11", mad_dump_uint}, + {BITSOFFS(224, 16), "SWPortVLCongestion12", mad_dump_uint}, + {BITSOFFS(240, 16), "SWPortVLCongestion13", mad_dump_uint}, + {BITSOFFS(256, 16), "SWPortVLCongestion14", mad_dump_uint}, + {BITSOFFS(272, 16), "SWPortVLCongestion15", mad_dump_uint}, + {}, /* IB_PC_SW_PORT_VL_CONGESTION_LAST_F */ + + /* + * PortRcvConCtrl fields + */ + {32, 32, "PortPktRcvFECN", mad_dump_uint}, + {64, 32, "PortPktRcvBECN", mad_dump_uint}, + {}, /* IB_PC_RCV_CON_CTRL_LAST_F */ + + /* + * PortSLRcvFECN fields + */ + {32, 32, "PortSLRcvFECN0", mad_dump_uint}, + {64, 32, "PortSLRcvFECN1", mad_dump_uint}, + {96, 32, "PortSLRcvFECN2", mad_dump_uint}, + {128, 32, "PortSLRcvFECN3", mad_dump_uint}, + {160, 32, "PortSLRcvFECN4", mad_dump_uint}, + {192, 32, "PortSLRcvFECN5", mad_dump_uint}, + {224, 32, "PortSLRcvFECN6", mad_dump_uint}, + {256, 32, "PortSLRcvFECN7", mad_dump_uint}, + {288, 32, "PortSLRcvFECN8", mad_dump_uint}, + {320, 32, "PortSLRcvFECN9", mad_dump_uint}, + {352, 32, "PortSLRcvFECN10", mad_dump_uint}, + {384, 32, "PortSLRcvFECN11", mad_dump_uint}, + {416, 32, "PortSLRcvFECN12", mad_dump_uint}, + {448, 32, "PortSLRcvFECN13", mad_dump_uint}, + {480, 32, "PortSLRcvFECN14", mad_dump_uint}, + {512, 32, "PortSLRcvFECN15", mad_dump_uint}, + {}, /* IB_PC_SL_RCV_FECN_LAST_F */ + + /* + * PortSLRcvBECN fields + */ + {32, 32, "PortSLRcvBECN0", mad_dump_uint}, + {64, 32, "PortSLRcvBECN1", mad_dump_uint}, + {96, 32, "PortSLRcvBECN2", mad_dump_uint}, + {128, 32, "PortSLRcvBECN3", mad_dump_uint}, + {160, 32, "PortSLRcvBECN4", mad_dump_uint}, + {192, 32, "PortSLRcvBECN5", mad_dump_uint}, + {224, 32, "PortSLRcvBECN6", mad_dump_uint}, + {256, 32, "PortSLRcvBECN7", mad_dump_uint}, + {288, 32, "PortSLRcvBECN8", mad_dump_uint}, + {320, 32, "PortSLRcvBECN9", mad_dump_uint}, + {352, 32, "PortSLRcvBECN10", mad_dump_uint}, + {384, 32, "PortSLRcvBECN11", mad_dump_uint}, + {416, 32, "PortSLRcvBECN12", mad_dump_uint}, + {448, 32, "PortSLRcvBECN13", mad_dump_uint}, + {480, 32, "PortSLRcvBECN14", mad_dump_uint}, + {512, 32, "PortSLRcvBECN15", mad_dump_uint}, + {}, /* IB_PC_SL_RCV_BECN_LAST_F */ + + /* + * PortXmitConCtrl fields + */ + {32, 32, "PortXmitTimeCong", mad_dump_uint}, + {}, /* IB_PC_XMIT_CON_CTRL_LAST_F */ + + /* + * PortVLXmitTimeCong fields + */ + {32, 32, "PortVLXmitTimeCong0", mad_dump_uint}, + {64, 32, 
"PortVLXmitTimeCong1", mad_dump_uint}, + {96, 32, "PortVLXmitTimeCong2", mad_dump_uint}, + {128, 32, "PortVLXmitTimeCong3", mad_dump_uint}, + {160, 32, "PortVLXmitTimeCong4", mad_dump_uint}, + {192, 32, "PortVLXmitTimeCong5", mad_dump_uint}, + {224, 32, "PortVLXmitTimeCong6", mad_dump_uint}, + {256, 32, "PortVLXmitTimeCong7", mad_dump_uint}, + {288, 32, "PortVLXmitTimeCong8", mad_dump_uint}, + {320, 32, "PortVLXmitTimeCong9", mad_dump_uint}, + {352, 32, "PortVLXmitTimeCong10", mad_dump_uint}, + {384, 32, "PortVLXmitTimeCong11", mad_dump_uint}, + {416, 32, "PortVLXmitTimeCong12", mad_dump_uint}, + {448, 32, "PortVLXmitTimeCong13", mad_dump_uint}, + {480, 32, "PortVLXmitTimeCong14", mad_dump_uint}, + {}, /* IB_PC_VL_XMIT_TIME_CONG_LAST_F */ + + /* + * Mellanox ExtendedPortInfo fields + */ + {BITSOFFS(24, 8), "StateChangeEnable", mad_dump_hex}, + {BITSOFFS(56, 8), "LinkSpeedSupported", mad_dump_hex}, + {BITSOFFS(88, 8), "LinkSpeedEnabled", mad_dump_hex}, + {BITSOFFS(120, 8), "LinkSpeedActive", mad_dump_hex}, + {}, /* IB_MLNX_EXT_PORT_LAST_F */ + + /* + * Congestion Control Mad fields + * bytes 24-31 of congestion control mad + */ + {192, 64, "CC_Key", mad_dump_hex}, /* IB_CC_CCKEY_F */ + + /* + * CongestionInfo fields + */ + {BITSOFFS(0, 16), "CongestionInfo", mad_dump_hex}, + {BITSOFFS(16, 8), "ControlTableCap", mad_dump_uint}, + {}, /* IB_CC_CONGESTION_INFO_LAST_F */ + + /* + * CongestionKeyInfo fields + */ + {0, 64, "CC_Key", mad_dump_hex}, + {BITSOFFS(64, 1), "CC_KeyProtectBit", mad_dump_uint}, + {BITSOFFS(80, 16), "CC_KeyLeasePeriod", mad_dump_uint}, + {BITSOFFS(96, 16), "CC_KeyViolations", mad_dump_uint}, + {}, /* IB_CC_CONGESTION_KEY_INFO_LAST_F */ + + /* + * CongestionLog (common) fields + */ + {BITSOFFS(0, 8), "LogType", mad_dump_uint}, + {BITSOFFS(8, 8), "CongestionFlags", mad_dump_hex}, + {}, /* IB_CC_CONGESTION_LOG_LAST_F */ + + /* + * CongestionLog (Switch) fields + */ + {BITSOFFS(16, 16), "LogEventsCounter", mad_dump_uint}, + {32, 32, "CurrentTimeStamp", mad_dump_uint}, + {64, 256, "PortMap", mad_dump_array}, + {}, /* IB_CC_CONGESTION_LOG_SWITCH_LAST_F */ + + /* + * CongestionLogEvent (Switch) fields + */ + {BITSOFFS(0, 16), "SLID", mad_dump_uint}, + {BITSOFFS(16, 16), "DLID", mad_dump_uint}, + {BITSOFFS(32, 4), "SL", mad_dump_uint}, + {64, 32, "Timestamp", mad_dump_uint}, + {}, /* IB_CC_CONGESTION_LOG_ENTRY_SWITCH_LAST_F */ + + /* + * CongestionLog (CA) fields + */ + {BITSOFFS(16, 16), "ThresholdEventCounter", mad_dump_uint}, + {BITSOFFS(32, 16), "ThresholdCongestionEventMap", mad_dump_hex}, + /* XXX: Q3/2010 errata lists offset 48, but that means field is not + * word aligned. Assume will be aligned to offset 64 later. 
+ */ + {BITSOFFS(64, 32), "CurrentTimeStamp", mad_dump_uint}, + {}, /* IB_CC_CONGESTION_LOG_CA_LAST_F */ + + /* + * CongestionLogEvent (CA) fields + */ + {BITSOFFS(0, 24), "Local_QP_CN_Entry", mad_dump_uint}, + {BITSOFFS(24, 4), "SL_CN_Entry", mad_dump_uint}, + {BITSOFFS(28, 4), "Service_Type_CN_Entry", mad_dump_hex}, + {BITSOFFS(32, 24), "Remote_QP_Number_CN_Entry", mad_dump_uint}, + {BITSOFFS(64, 16), "Local_LID_CN", mad_dump_uint}, + {BITSOFFS(80, 16), "Remote_LID_CN_Entry", mad_dump_uint}, + {BITSOFFS(96, 32), "Timestamp_CN_Entry", mad_dump_uint}, + {}, /* IB_CC_CONGESTION_LOG_ENTRY_CA_LAST_F */ + + /* + * SwitchCongestionSetting fields + */ + {0, 32, "Control_Map", mad_dump_hex}, + {32, 256, "Victim_Mask", mad_dump_array}, + {288, 256, "Credit_Mask", mad_dump_array}, + {BITSOFFS(544, 4), "Threshold", mad_dump_hex}, + {BITSOFFS(552, 8), "Packet_Size", mad_dump_uint}, + {BITSOFFS(560, 4), "CS_Threshold", mad_dump_hex}, + {BITSOFFS(576, 16), "CS_ReturnDelay", mad_dump_hex}, /* TODO: CCT dump */ + {BITSOFFS(592, 16), "Marking_Rate", mad_dump_uint}, + {}, /* IB_CC_SWITCH_CONGESTION_SETTING_LAST_F */ + + /* + * SwitchPortCongestionSettingElement fields + */ + {BITSOFFS(0, 1), "Valid", mad_dump_uint}, + {BITSOFFS(1, 1), "Control_Type", mad_dump_uint}, + {BITSOFFS(4, 4), "Threshold", mad_dump_hex}, + {BITSOFFS(8, 8), "Packet_Size", mad_dump_uint}, + {BITSOFFS(16, 16), "Cong_Parm_Marking_Rate", mad_dump_uint}, + {}, /* IB_CC_SWITCH_PORT_CONGESTION_SETTING_ELEMENT_LAST_F */ + + /* + * CACongestionSetting fields + */ + {BITSOFFS(0, 16), "Port_Control", mad_dump_hex}, + {BITSOFFS(16, 16), "Control_Map", mad_dump_hex}, + {}, /* IB_CC_CA_CONGESTION_SETTING_LAST_F */ + + /* + * CACongestionEntry fields + */ + {BITSOFFS(0, 16), "CCTI_Timer", mad_dump_uint}, + {BITSOFFS(16, 8), "CCTI_Increase", mad_dump_uint}, + {BITSOFFS(24, 8), "Trigger_Threshold", mad_dump_uint}, + {BITSOFFS(32, 8), "CCTI_Min", mad_dump_uint}, + {}, /* IB_CC_CA_CONGESTION_SETTING_ENTRY_LAST_F */ + + /* + * CongestionControlTable fields + */ + {BITSOFFS(0, 16), "CCTI_Limit", mad_dump_uint}, + {}, /* IB_CC_CONGESTION_CONTROL_TABLE_LAST_F */ + + /* + * CongestionControlTableEntry fields + */ + {BITSOFFS(0, 2), "CCT_Shift", mad_dump_uint}, + {BITSOFFS(2, 14), "CCT_Multiplier", mad_dump_uint}, + {}, /* IB_CC_CONGESTION_CONTROL_TABLE_ENTRY_LAST_F */ + + /* + * Timestamp fields + */ + {0, 32, "Timestamp", mad_dump_uint}, + {}, /* IB_CC_TIMESTAMP_LAST_F */ + + /* Node Record */ + {BITSOFFS(0, 16), "Lid", mad_dump_uint}, + {BITSOFFS(32, 8), "BaseVers", mad_dump_uint}, + {BITSOFFS(40, 8), "ClassVers", mad_dump_uint}, + {BITSOFFS(48, 8), "NodeType", mad_dump_node_type}, + {BITSOFFS(56, 8), "NumPorts", mad_dump_uint}, + {64, 64, "SystemGuid", mad_dump_hex}, + {128, 64, "Guid", mad_dump_hex}, + {192, 64, "PortGuid", mad_dump_hex}, + {BITSOFFS(256, 16), "PartCap", mad_dump_uint}, + {BITSOFFS(272, 16), "DevId", mad_dump_hex}, + {288, 32, "Revision", mad_dump_hex}, + {BITSOFFS(320, 8), "LocalPort", mad_dump_uint}, + {BITSOFFS(328, 24), "VendorId", mad_dump_hex}, + {352, 64 * 8, "NodeDesc", mad_dump_string}, + {}, /* IB_SA_NR_LAST_F */ + + /* + * PortSamplesResult fields + */ + {BITSOFFS(0, 16), "Tag", mad_dump_hex}, + {BITSOFFS(30, 2), "SampleStatus", mad_dump_hex}, + {32, 32, "Counter0", mad_dump_uint}, + {64, 32, "Counter1", mad_dump_uint}, + {96, 32, "Counter2", mad_dump_uint}, + {128, 32, "Counter3", mad_dump_uint}, + {160, 32, "Counter4", mad_dump_uint}, + {192, 32, "Counter5", mad_dump_uint}, + {224, 32, "Counter6", mad_dump_uint}, + {256, 
32, "Counter7", mad_dump_uint}, + {288, 32, "Counter8", mad_dump_uint}, + {320, 32, "Counter9", mad_dump_uint}, + {352, 32, "Counter10", mad_dump_uint}, + {384, 32, "Counter11", mad_dump_uint}, + {416, 32, "Counter12", mad_dump_uint}, + {448, 32, "Counter13", mad_dump_uint}, + {480, 32, "Counter14", mad_dump_uint}, + {}, /* IB_PSR_LAST_F */ + + /* + * PortInfoExtended fields + */ + {0, 32, "CapMask", mad_dump_hex}, + {BITSOFFS(32, 16), "FECModeActive", mad_dump_uint}, + {BITSOFFS(48, 16), "FDRFECModeSupported", mad_dump_hex}, + {BITSOFFS(64, 16), "FDRFECModeEnabled", mad_dump_hex}, + {BITSOFFS(80, 16), "EDRFECModeSupported", mad_dump_hex}, + {BITSOFFS(96, 16), "EDRFECModeEnabled", mad_dump_hex}, + {}, /* IB_PORT_EXT_LAST_F */ + + /* + * PortExtendedSpeedsCounters RSFEC Active fields + */ + {BITSOFFS(8, 8), "PortSelect", mad_dump_uint}, + {64, 64, "CounterSelect", mad_dump_hex}, + {BITSOFFS(128, 16), "SyncHeaderErrorCounter", mad_dump_uint}, + {BITSOFFS(144, 16), "UnknownBlockCounter", mad_dump_uint}, + {352, 32, "FECCorrectableSymbolCtrLane0", mad_dump_uint}, + {384, 32, "FECCorrectableSymbolCtrLane1", mad_dump_uint}, + {416, 32, "FECCorrectableSymbolCtrLane2", mad_dump_uint}, + {448, 32, "FECCorrectableSymbolCtrLane3", mad_dump_uint}, + {480, 32, "FECCorrectableSymbolCtrLane4", mad_dump_uint}, + {512, 32, "FECCorrectableSymbolCtrLane5", mad_dump_uint}, + {544, 32, "FECCorrectableSymbolCtrLane6", mad_dump_uint}, + {576, 32, "FECCorrectableSymbolCtrLane7", mad_dump_uint}, + {608, 32, "FECCorrectableSymbolCtrLane8", mad_dump_uint}, + {640, 32, "FECCorrectableSymbolCtrLane9", mad_dump_uint}, + {672, 32, "FECCorrectableSymbolCtrLane10", mad_dump_uint}, + {704, 32, "FECCorrectableSymbolCtrLane11", mad_dump_uint}, + {1120, 32, "PortFECCorrectableBlockCtr", mad_dump_uint}, + {1152, 32, "PortFECUncorrectableBlockCtr", mad_dump_uint}, + {1184, 32, "PortFECCorrectedSymbolCtr", mad_dump_uint}, + {}, /* IB_PESC_RSFEC_LAST_F */ + + /* + * More PortCountersExtended fields + */ + {32, 32, "CounterSelect2", mad_dump_hex}, + {576, 64, "SymbolErrorCounter", mad_dump_uint}, + {640, 64, "LinkErrorRecoveryCounter", mad_dump_uint}, + {704, 64, "LinkDownedCounter", mad_dump_uint}, + {768, 64, "PortRcvErrors", mad_dump_uint}, + {832, 64, "PortRcvRemotePhysicalErrors", mad_dump_uint}, + {896, 64, "PortRcvSwitchRelayErrors", mad_dump_uint}, + {960, 64, "PortXmitDiscards", mad_dump_uint}, + {1024, 64, "PortXmitConstraintErrors", mad_dump_uint}, + {1088, 64, "PortRcvConstraintErrors", mad_dump_uint}, + {1152, 64, "LocalLinkIntegrityErrors", mad_dump_uint}, + {1216, 64, "ExcessiveBufferOverrunErrors", mad_dump_uint}, + {1280, 64, "VL15Dropped", mad_dump_uint}, + {1344, 64, "PortXmitWait", mad_dump_uint}, + {1408, 64, "QP1Dropped", mad_dump_uint}, + {}, /* IB_PC_EXT_ERR_LAST_F */ + + /* + * Another PortCounters field + */ + {160, 16, "QP1Dropped", mad_dump_uint}, + + /* + * More PortInfoExtended fields + */ + {112, 16, "HDRFECModeSupported", mad_dump_hex}, + {128, 16, "HDRFECModeEnabled", mad_dump_hex}, + {}, /* IB_PORT_EXT_HDR_FEC_MODE_LAST_F */ + + {} /* IB_FIELD_LAST_ */ +}; + +static void _set_field64(void *buf, int base_offs, const ib_field_t * f, + uint64_t val) +{ + uint64_t nval; + + nval = htonll(val); + memcpy(((void *)(char *)buf + base_offs + f->bitoffs / 8), + (void *)&nval, sizeof(uint64_t)); +} + +static uint64_t _get_field64(void *buf, int base_offs, const ib_field_t * f) +{ + uint64_t val; + memcpy((void *)&val, (void *)((char *)buf + base_offs + f->bitoffs / 8), + sizeof(uint64_t)); + return 
ntohll(val); +} + +static void _set_field(void *buf, int base_offs, const ib_field_t * f, + uint32_t val) +{ + int prebits = (8 - (f->bitoffs & 7)) & 7; + int postbits = (f->bitoffs + f->bitlen) & 7; + int bytelen = f->bitlen / 8; + unsigned idx = base_offs + f->bitoffs / 8; + char *p = (char *)buf; + + if (!bytelen && (f->bitoffs & 7) + f->bitlen < 8) { + p[3 ^ idx] &= ~((((1 << f->bitlen) - 1)) << (f->bitoffs & 7)); + p[3 ^ idx] |= + (val & ((1 << f->bitlen) - 1)) << (f->bitoffs & 7); + return; + } + + if (prebits) { /* val lsb in byte msb */ + p[3 ^ idx] &= (1 << (8 - prebits)) - 1; + p[3 ^ idx++] |= (val & ((1 << prebits) - 1)) << (8 - prebits); + val >>= prebits; + } + + /* BIG endian byte order */ + for (; bytelen--; val >>= 8) + p[3 ^ idx++] = val & 0xff; + + if (postbits) { /* val msb in byte lsb */ + p[3 ^ idx] &= ~((1 << postbits) - 1); + p[3 ^ idx] |= val; + } +} + +static uint32_t _get_field(void *buf, int base_offs, const ib_field_t * f) +{ + int prebits = (8 - (f->bitoffs & 7)) & 7; + int postbits = (f->bitoffs + f->bitlen) & 7; + int bytelen = f->bitlen / 8; + unsigned idx = base_offs + f->bitoffs / 8; + uint8_t *p = (uint8_t *) buf; + uint32_t val = 0, v = 0, i; + + if (!bytelen && (f->bitoffs & 7) + f->bitlen < 8) + return (p[3 ^ idx] >> (f->bitoffs & 7)) & ((1 << f->bitlen) - + 1); + + if (prebits) /* val lsb from byte msb */ + v = p[3 ^ idx++] >> (8 - prebits); + + if (postbits) { /* val msb from byte lsb */ + i = base_offs + (f->bitoffs + f->bitlen) / 8; + val = (p[3 ^ i] & ((1 << postbits) - 1)); + } + + /* BIG endian byte order */ + for (idx += bytelen - 1; bytelen--; idx--) + val = (val << 8) | p[3 ^ idx]; + + return (val << prebits) | v; +} + +/* field must be byte aligned */ +static void _set_array(void *buf, int base_offs, const ib_field_t * f, + void *val) +{ + int bitoffs = f->bitoffs; + + if (f->bitlen < 32) + bitoffs = BE_TO_BITSOFFS(bitoffs, f->bitlen); + + memcpy((uint8_t *) buf + base_offs + bitoffs / 8, val, f->bitlen / 8); +} + +static void _get_array(void *buf, int base_offs, const ib_field_t * f, + void *val) +{ + int bitoffs = f->bitoffs; + + if (f->bitlen < 32) + bitoffs = BE_TO_BITSOFFS(bitoffs, f->bitlen); + + memcpy(val, (uint8_t *) buf + base_offs + bitoffs / 8, f->bitlen / 8); +} + +uint32_t mad_get_field(void *buf, int base_offs, enum MAD_FIELDS field) +{ + return _get_field(buf, base_offs, ib_mad_f + field); +} + +void mad_set_field(void *buf, int base_offs, enum MAD_FIELDS field, + uint32_t val) +{ + _set_field(buf, base_offs, ib_mad_f + field, val); +} + +uint64_t mad_get_field64(void *buf, int base_offs, enum MAD_FIELDS field) +{ + return _get_field64(buf, base_offs, ib_mad_f + field); +} + +void mad_set_field64(void *buf, int base_offs, enum MAD_FIELDS field, + uint64_t val) +{ + _set_field64(buf, base_offs, ib_mad_f + field, val); +} + +void mad_set_array(void *buf, int base_offs, enum MAD_FIELDS field, void *val) +{ + _set_array(buf, base_offs, ib_mad_f + field, val); +} + +void mad_get_array(void *buf, int base_offs, enum MAD_FIELDS field, void *val) +{ + _get_array(buf, base_offs, ib_mad_f + field, val); +} + +void mad_decode_field(uint8_t * buf, enum MAD_FIELDS field, void *val) +{ + const ib_field_t *f = ib_mad_f + field; + + if (!field) { + *(int *)val = *(int *)buf; + return; + } + if (f->bitlen <= 32) { + *(uint32_t *) val = _get_field(buf, 0, f); + return; + } + if (f->bitlen == 64) { + *(uint64_t *) val = _get_field64(buf, 0, f); + return; + } + _get_array(buf, 0, f, val); +} + +void mad_encode_field(uint8_t * buf, enum 
MAD_FIELDS field, void *val) +{ + const ib_field_t *f = ib_mad_f + field; + + if (!field) { + *(int *)buf = *(int *)val; + return; + } + if (f->bitlen <= 32) { + _set_field(buf, 0, f, *(uint32_t *) val); + return; + } + if (f->bitlen == 64) { + _set_field64(buf, 0, f, *(uint64_t *) val); + return; + } + _set_array(buf, 0, f, val); +} + +/************************/ + +static char *_mad_dump_val(const ib_field_t * f, char *buf, int bufsz, + void *val) +{ + f->def_dump_fn(buf, bufsz, val, ALIGN(f->bitlen, 8) / 8); + buf[bufsz - 1] = 0; + + return buf; +} + +static char *_mad_dump_field(const ib_field_t * f, const char *name, char *buf, + int bufsz, void *val) +{ + char dots[128]; + int l, n; + + if (bufsz <= 32) + return NULL; /* buf too small */ + + if (!name) + name = f->name; + + l = strlen(name); + if (l < 32) { + memset(dots, '.', 32 - l); + dots[32 - l] = 0; + } + + n = snprintf(buf, bufsz, "%s:%s", name, dots); + _mad_dump_val(f, buf + n, bufsz - n, val); + buf[bufsz - 1] = 0; + + return buf; +} + +static int _mad_dump(ib_mad_dump_fn * fn, const char *name, void *val, + int valsz) +{ + ib_field_t f; + char buf[512]; + + f.def_dump_fn = fn; + f.bitlen = valsz * 8; + + return printf("%s\n", _mad_dump_field(&f, name, buf, sizeof buf, val)); +} + +static int _mad_print_field(const ib_field_t * f, const char *name, void *val, + int valsz) +{ + return _mad_dump(f->def_dump_fn, name ? name : f->name, val, + valsz ? valsz : ALIGN(f->bitlen, 8) / 8); +} + +int mad_print_field(enum MAD_FIELDS field, const char *name, void *val) +{ + if (field <= IB_NO_FIELD || field >= IB_FIELD_LAST_) + return -1; + return _mad_print_field(ib_mad_f + field, name, val, 0); +} + +char *mad_dump_field(enum MAD_FIELDS field, char *buf, int bufsz, void *val) +{ + if (field <= IB_NO_FIELD || field >= IB_FIELD_LAST_) + return NULL; + return _mad_dump_field(ib_mad_f + field, NULL, buf, bufsz, val); +} + +char *mad_dump_val(enum MAD_FIELDS field, char *buf, int bufsz, void *val) +{ + if (field <= IB_NO_FIELD || field >= IB_FIELD_LAST_) + return NULL; + return _mad_dump_val(ib_mad_f + field, buf, bufsz, val); +} + +const char *mad_field_name(enum MAD_FIELDS field) +{ + return (ib_mad_f[field].name); +} diff --git a/libibmad/gs.c b/libibmad/gs.c new file mode 100644 index 0000000..7f9c129 --- /dev/null +++ b/libibmad/gs.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * Copyright (c) 2011 Mellanox Technologies LTD. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> + +#include <infiniband/umad.h> +#include <infiniband/mad.h> + +#undef DEBUG +#define DEBUG if (ibdebug) IBWARN + +uint8_t *pma_query_via(void *rcvbuf, ib_portid_t * dest, int port, + unsigned timeout, unsigned id, + const struct ibmad_port * srcport) +{ + ib_rpc_v1_t rpc = { 0 }; + ib_rpc_t *rpcold = (ib_rpc_t *)(void *)&rpc; + int lid = dest->lid; + void *p_ret; + + DEBUG("lid %u port %d", lid, port); + + if (lid == -1) { + IBWARN("only lid routed is supported"); + return NULL; + } + + rpc.mgtclass = IB_PERFORMANCE_CLASS | IB_MAD_RPC_VERSION1; + rpc.method = IB_MAD_METHOD_GET; + rpc.attr.id = id; + + /* Same for attribute IDs */ + mad_set_field(rcvbuf, 0, IB_PC_PORT_SELECT_F, port); + rpc.attr.mod = 0; + rpc.timeout = timeout; + rpc.datasz = IB_PC_DATA_SZ; + rpc.dataoffs = IB_PC_DATA_OFFS; + + if (!dest->qp) + dest->qp = 1; + if (!dest->qkey) + dest->qkey = IB_DEFAULT_QP1_QKEY; + + p_ret = mad_rpc(srcport, rpcold, dest, rcvbuf, rcvbuf); + errno = rpc.error; + return p_ret; +} + +uint8_t *performance_reset_via(void *rcvbuf, ib_portid_t * dest, + int port, unsigned mask, unsigned timeout, + unsigned id, const struct ibmad_port * srcport) +{ + ib_rpc_v1_t rpc = { 0 }; + ib_rpc_t *rpcold = (ib_rpc_t *)(void *)&rpc; + + int lid = dest->lid; + void *p_ret; + + DEBUG("lid %u port %d mask 0x%x", lid, port, mask); + + if (lid == -1) { + IBWARN("only lid routed is supported"); + return NULL; + } + + if (!mask) + mask = ~0; + + rpc.mgtclass = IB_PERFORMANCE_CLASS | IB_MAD_RPC_VERSION1; + rpc.method = IB_MAD_METHOD_SET; + rpc.attr.id = id; + + memset(rcvbuf, 0, IB_MAD_SIZE); + + /* Next 2 lines - same for attribute IDs */ + mad_set_field(rcvbuf, 0, IB_PC_PORT_SELECT_F, port); + mad_set_field(rcvbuf, 0, IB_PC_COUNTER_SELECT_F, mask); + mask = mask >> 16; + if (id == IB_GSI_PORT_COUNTERS_EXT) + mad_set_field(rcvbuf, 0, IB_PC_EXT_COUNTER_SELECT2_F, mask); + else + mad_set_field(rcvbuf, 0, IB_PC_COUNTER_SELECT2_F, mask); + rpc.attr.mod = 0; + rpc.timeout = timeout; + rpc.datasz = IB_PC_DATA_SZ; + rpc.dataoffs = IB_PC_DATA_OFFS; + if (!dest->qp) + dest->qp = 1; + if (!dest->qkey) + dest->qkey = IB_DEFAULT_QP1_QKEY; + + p_ret = mad_rpc(srcport, rpcold, dest, rcvbuf, rcvbuf); + errno = rpc.error; + return p_ret; +} diff --git a/libibmad/iba_types.h b/libibmad/iba_types.h new file mode 100644 index 0000000..0805aa9 --- /dev/null +++ b/libibmad/iba_types.h @@ -0,0 +1,1734 @@ +/* + * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved. + * Copyright (c) 2002-2019 Mellanox Technologies LTD. All rights reserved. + * Copyright (c) 1996-2003 Intel Corporation. All rights reserved. + * Copyright (c) 2009 HNR Consulting. All rights reserved. + * Copyright (c) 2013 Oracle and/or its affiliates. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef __LIBIBMAD_IB_TYPES_H__ +#define __LIBIBMAD_IB_TYPES_H__ + +#include <endian.h> +#include <stdint.h> +#include <stdbool.h> +#include <assert.h> +#include <linux/types.h> + +#define MAD_BLOCK_SIZE 256 +#define MAD_RMPP_HDR_SIZE 36 +#define MAD_BLOCK_GRH_SIZE 296 +#define IB_LID_PERMISSIVE 0xFFFF +#define IB_DEFAULT_PKEY 0xFFFF +#define IB_QP1_WELL_KNOWN_Q_KEY htobe32(0x80010000) +#define IB_QP0 0 +#define IB_QP1 htobe32(1) +#define IB_QP_PRIVILEGED_Q_KEY htobe32(0x80000000) +#define IB_LID_UCAST_START_HO 0x0001 +#define IB_LID_UCAST_START htobe16(IB_LID_UCAST_START_HO) +#define IB_LID_UCAST_END_HO 0xBFFF +#define IB_LID_UCAST_END htobe16(IB_LID_UCAST_END_HO) +#define IB_LID_MCAST_START_HO 0xC000 +#define IB_LID_MCAST_START htobe16(IB_LID_MCAST_START_HO) +#define IB_LID_MCAST_END_HO 0xFFFE +#define IB_LID_MCAST_END htobe16(IB_LID_MCAST_END_HO) +#define IB_DEFAULT_SUBNET_PREFIX htobe64(0xFE80000000000000ULL) +#define IB_DEFAULT_SUBNET_PREFIX_HO 0xFE80000000000000ULL +#define IB_NODE_NUM_PORTS_MAX 0xFE +#define IB_INVALID_PORT_NUM 0xFF +#define IB_SUBNET_PATH_HOPS_MAX 64 +#define IB_HOPLIMIT_MAX 255 +#define IB_MC_SCOPE_LINK_LOCAL 0x2 +#define IB_MC_SCOPE_SITE_LOCAL 0x5 +#define IB_MC_SCOPE_ORG_LOCAL 0x8 +#define IB_MC_SCOPE_GLOBAL 0xE +#define IB_PKEY_MAX_BLOCKS 2048 +#define IB_MCAST_MAX_BLOCK_ID 511 +#define IB_MCAST_BLOCK_ID_MASK_HO 0x000001FF +#define IB_MCAST_BLOCK_SIZE 32 +#define IB_MCAST_MASK_SIZE 16 +#define IB_MCAST_POSITION_MASK_HO 0xF0000000 +#define IB_MCAST_POSITION_MAX 0xF +#define IB_MCAST_POSITION_SHIFT 28 +#define IB_PKEY_BASE_MASK htobe16(0x7FFF) +#define IB_PKEY_TYPE_MASK htobe16(0x8000) +#define IB_DEFAULT_PARTIAL_PKEY htobe16(0x7FFF) +#define IB_MCLASS_SUBN_LID 0x01 +#define IB_MCLASS_SUBN_DIR 0x81 +#define IB_MCLASS_SUBN_ADM 0x03 +#define IB_MCLASS_PERF 0x04 +#define IB_MCLASS_BM 0x05 +#define IB_MCLASS_DEV_MGMT 0x06 +#define IB_MCLASS_COMM_MGMT 0x07 +#define IB_MCLASS_SNMP 0x08 +#define IB_MCLASS_VENDOR_LOW_RANGE_MIN 0x09 +#define IB_MCLASS_VENDOR_LOW_RANGE_MAX 0x0F +#define IB_MCLASS_DEV_ADM 0x10 +#define IB_MCLASS_BIS 0x12 +#define IB_MCLASS_CC 0x21 +#define IB_MCLASS_VENDOR_HIGH_RANGE_MIN 0x30 +#define IB_MCLASS_VENDOR_HIGH_RANGE_MAX 0x4F +#define IB_MAX_METHODS 128 +#define IB_MAD_METHOD_RESP_MASK 0x80 +#define 
IB_MAD_METHOD_GET 0x01 +#define IB_MAD_METHOD_SET 0x02 +#define IB_MAD_METHOD_GET_RESP 0x81 +#define IB_MAD_METHOD_DELETE 0x15 +#define IB_MAD_METHOD_GETTABLE 0x12 +#define IB_MAD_METHOD_GETTABLE_RESP 0x92 +#define IB_MAD_METHOD_GETTRACETABLE 0x13 +#define IB_MAD_METHOD_GETMULTI 0x14 +#define IB_MAD_METHOD_GETMULTI_RESP 0x94 +#define IB_MAD_METHOD_SEND 0x03 +#define IB_MAD_METHOD_TRAP 0x05 +#define IB_MAD_METHOD_REPORT 0x06 +#define IB_MAD_METHOD_REPORT_RESP 0x86 +#define IB_MAD_METHOD_TRAP_REPRESS 0x07 +#define IB_MAD_STATUS_BUSY htobe16(0x0001) +#define IB_MAD_STATUS_REDIRECT htobe16(0x0002) +#define IB_MAD_STATUS_UNSUP_CLASS_VER htobe16(0x0004) +#define IB_MAD_STATUS_UNSUP_METHOD htobe16(0x0008) +#define IB_MAD_STATUS_UNSUP_METHOD_ATTR htobe16(0x000C) +#define IB_MAD_STATUS_INVALID_FIELD htobe16(0x001C) +#define IB_MAD_STATUS_CLASS_MASK htobe16(0xFF00) +#define IB_SA_MAD_STATUS_SUCCESS 0x0000 +#define IB_SA_MAD_STATUS_NO_RESOURCES htobe16(0x0100) +#define IB_SA_MAD_STATUS_REQ_INVALID htobe16(0x0200) +#define IB_SA_MAD_STATUS_NO_RECORDS htobe16(0x0300) +#define IB_SA_MAD_STATUS_TOO_MANY_RECORDS htobe16(0x0400) +#define IB_SA_MAD_STATUS_INVALID_GID htobe16(0x0500) +#define IB_SA_MAD_STATUS_INSUF_COMPS htobe16(0x0600) +#define IB_SA_MAD_STATUS_DENIED htobe16(0x0700) +#define IB_SA_MAD_STATUS_PRIO_SUGGESTED htobe16(0x0800) +#define IB_DM_MAD_STATUS_NO_IOC_RESP htobe16(0x0100) +#define IB_DM_MAD_STATUS_NO_SVC_ENTRIES htobe16(0x0200) +#define IB_DM_MAD_STATUS_IOC_FAILURE htobe16(0x8000) +#define IB_MAD_ATTR_CLASS_PORT_INFO htobe16(0x0001) +#define IB_MAD_ATTR_NOTICE htobe16(0x0002) +#define IB_MAD_ATTR_INFORM_INFO htobe16(0x0003) +#define IB_MAD_ATTR_NODE_DESC htobe16(0x0010) +#define IB_MAD_ATTR_PORT_SMPL_CTRL htobe16(0x0010) +#define IB_MAD_ATTR_NODE_INFO htobe16(0x0011) +#define IB_MAD_ATTR_PORT_SMPL_RSLT htobe16(0x0011) +#define IB_MAD_ATTR_SWITCH_INFO htobe16(0x0012) +#define IB_MAD_ATTR_PORT_CNTRS htobe16(0x0012) +#define IB_MAD_ATTR_PORT_CNTRS_EXT htobe16(0x001D) +#define IB_MAD_ATTR_PORT_XMIT_DATA_SL htobe16(0x0036) +#define IB_MAD_ATTR_PORT_RCV_DATA_SL htobe16(0x0037) +#define IB_MAD_ATTR_GUID_INFO htobe16(0x0014) +#define IB_MAD_ATTR_PORT_INFO htobe16(0x0015) +#define IB_MAD_ATTR_P_KEY_TABLE htobe16(0x0016) +#define IB_MAD_ATTR_SLVL_TABLE htobe16(0x0017) +#define IB_MAD_ATTR_VL_ARBITRATION htobe16(0x0018) +#define IB_MAD_ATTR_LIN_FWD_TBL htobe16(0x0019) +#define IB_MAD_ATTR_RND_FWD_TBL htobe16(0x001A) +#define IB_MAD_ATTR_MCAST_FWD_TBL htobe16(0x001B) +#define IB_MAD_ATTR_NODE_RECORD htobe16(0x0011) +#define IB_MAD_ATTR_PORTINFO_RECORD htobe16(0x0012) +#define IB_MAD_ATTR_SWITCH_INFO_RECORD htobe16(0x0014) +#define IB_MAD_ATTR_LINK_RECORD htobe16(0x0020) +#define IB_MAD_ATTR_SM_INFO htobe16(0x0020) +#define IB_MAD_ATTR_SMINFO_RECORD htobe16(0x0018) +#define IB_MAD_ATTR_GUIDINFO_RECORD htobe16(0x0030) +#define IB_MAD_ATTR_VENDOR_DIAG htobe16(0x0030) +#define IB_MAD_ATTR_LED_INFO htobe16(0x0031) +#define IB_MAD_ATTR_MLNX_EXTENDED_PORT_INFO htobe16(0xFF90) +#define IB_MAD_ATTR_SERVICE_RECORD htobe16(0x0031) +#define IB_MAD_ATTR_LFT_RECORD htobe16(0x0015) +#define IB_MAD_ATTR_MFT_RECORD htobe16(0x0017) +#define IB_MAD_ATTR_PKEY_TBL_RECORD htobe16(0x0033) +#define IB_MAD_ATTR_PATH_RECORD htobe16(0x0035) +#define IB_MAD_ATTR_VLARB_RECORD htobe16(0x0036) +#define IB_MAD_ATTR_SLVL_RECORD htobe16(0x0013) +#define IB_MAD_ATTR_MCMEMBER_RECORD htobe16(0x0038) +#define IB_MAD_ATTR_TRACE_RECORD htobe16(0x0039) +#define IB_MAD_ATTR_MULTIPATH_RECORD htobe16(0x003A) +#define 
IB_MAD_ATTR_SVC_ASSOCIATION_RECORD htobe16(0x003B) +#define IB_MAD_ATTR_INFORM_INFO_RECORD htobe16(0x00F3) +#define IB_MAD_ATTR_IO_UNIT_INFO htobe16(0x0010) +#define IB_MAD_ATTR_IO_CONTROLLER_PROFILE htobe16(0x0011) +#define IB_MAD_ATTR_SERVICE_ENTRIES htobe16(0x0012) +#define IB_MAD_ATTR_DIAGNOSTIC_TIMEOUT htobe16(0x0020) +#define IB_MAD_ATTR_PREPARE_TO_TEST htobe16(0x0021) +#define IB_MAD_ATTR_TEST_DEVICE_ONCE htobe16(0x0022) +#define IB_MAD_ATTR_TEST_DEVICE_LOOP htobe16(0x0023) +#define IB_MAD_ATTR_DIAG_CODE htobe16(0x0024) +#define IB_MAD_ATTR_SVC_ASSOCIATION_RECORD htobe16(0x003B) +#define IB_MAD_ATTR_CONG_INFO htobe16(0x0011) +#define IB_MAD_ATTR_CONG_KEY_INFO htobe16(0x0012) +#define IB_MAD_ATTR_CONG_LOG htobe16(0x0013) +#define IB_MAD_ATTR_SW_CONG_SETTING htobe16(0x0014) +#define IB_MAD_ATTR_SW_PORT_CONG_SETTING htobe16(0x0015) +#define IB_MAD_ATTR_CA_CONG_SETTING htobe16(0x0016) +#define IB_MAD_ATTR_CC_TBL htobe16(0x0017) +#define IB_MAD_ATTR_TIME_STAMP htobe16(0x0018) +#define IB_NODE_TYPE_CA 0x01 +#define IB_NODE_TYPE_SWITCH 0x02 +#define IB_NODE_TYPE_ROUTER 0x03 +#define IB_NOTICE_PRODUCER_TYPE_CA htobe32(0x000001) +#define IB_NOTICE_PRODUCER_TYPE_SWITCH htobe32(0x000002) +#define IB_NOTICE_PRODUCER_TYPE_ROUTER htobe32(0x000003) +#define IB_NOTICE_PRODUCER_TYPE_CLASS_MGR htobe32(0x000004) +#define IB_MTU_LEN_256 1 +#define IB_MTU_LEN_512 2 +#define IB_MTU_LEN_1024 3 +#define IB_MTU_LEN_2048 4 +#define IB_MTU_LEN_4096 5 +#define IB_PATH_SELECTOR_GREATER_THAN 0 +#define IB_PATH_SELECTOR_LESS_THAN 1 +#define IB_PATH_SELECTOR_EXACTLY 2 +#define IB_PATH_SELECTOR_LARGEST 3 +#define IB_SMINFO_STATE_NOTACTIVE 0 +#define IB_SMINFO_STATE_DISCOVERING 1 +#define IB_SMINFO_STATE_STANDBY 2 +#define IB_SMINFO_STATE_MASTER 3 +#define IB_PATH_REC_SL_MASK 0x000F +#define IB_MULTIPATH_REC_SL_MASK 0x000F +#define IB_PATH_REC_QOS_CLASS_MASK 0xFFF0 +#define IB_MULTIPATH_REC_QOS_CLASS_MASK 0xFFF0 +#define IB_PATH_REC_SELECTOR_MASK 0xC0 +#define IB_MULTIPATH_REC_SELECTOR_MASK 0xC0 +#define IB_PATH_REC_BASE_MASK 0x3F +#define IB_MULTIPATH_REC_BASE_MASK 0x3F +#define IB_LINK_NO_CHANGE 0 +#define IB_LINK_DOWN 1 +#define IB_LINK_INIT 2 +#define IB_LINK_ARMED 3 +#define IB_LINK_ACTIVE 4 +#define IB_LINK_ACT_DEFER 5 +#define IB_JOIN_STATE_FULL 1 +#define IB_JOIN_STATE_NON 2 +#define IB_JOIN_STATE_SEND_ONLY 4 +#define IB_JOIN_STATE_SEND_ONLY_FULL 8 +typedef union { + uint8_t raw[16]; + struct _ib_gid_unicast { + __be64 prefix; + __be64 interface_id; + } __attribute__((packed)) unicast; + struct _ib_gid_multicast { + uint8_t header[2]; + uint8_t raw_group_id[14]; + } __attribute__((packed)) multicast; + struct _ib_gid_ip_multicast { + uint8_t header[2]; + __be16 signature; + __be16 p_key; + uint8_t group_id[10]; + } __attribute__((packed)) ip_multicast; +} __attribute__((packed)) ib_gid_t; +typedef struct { + __be64 service_id; + ib_gid_t dgid; + ib_gid_t sgid; + __be16 dlid; + __be16 slid; + __be32 hop_flow_raw; + uint8_t tclass; + uint8_t num_path; + __be16 pkey; + __be16 qos_class_sl; + uint8_t mtu; + uint8_t rate; + uint8_t pkt_life; + uint8_t preference; + uint8_t resv2[6]; +} __attribute__((packed)) ib_path_rec_t; +#define IB_PR_COMPMASK_SERVICEID_MSB htobe64(((uint64_t)1) << 0) +#define IB_PR_COMPMASK_SERVICEID_LSB htobe64(((uint64_t)1) << 1) +#define IB_PR_COMPMASK_DGID htobe64(((uint64_t)1) << 2) +#define IB_PR_COMPMASK_SGID htobe64(((uint64_t)1) << 3) +#define IB_PR_COMPMASK_DLID htobe64(((uint64_t)1) << 4) +#define IB_PR_COMPMASK_SLID htobe64(((uint64_t)1) << 5) +#define IB_PR_COMPMASK_RAWTRAFFIC 
htobe64(((uint64_t)1) << 6) +#define IB_PR_COMPMASK_RESV0 htobe64(((uint64_t)1) << 7) +#define IB_PR_COMPMASK_FLOWLABEL htobe64(((uint64_t)1) << 8) +#define IB_PR_COMPMASK_HOPLIMIT htobe64(((uint64_t)1) << 9) +#define IB_PR_COMPMASK_TCLASS htobe64(((uint64_t)1) << 10) +#define IB_PR_COMPMASK_REVERSIBLE htobe64(((uint64_t)1) << 11) +#define IB_PR_COMPMASK_NUMBPATH htobe64(((uint64_t)1) << 12) +#define IB_PR_COMPMASK_PKEY htobe64(((uint64_t)1) << 13) +#define IB_PR_COMPMASK_QOS_CLASS htobe64(((uint64_t)1) << 14) +#define IB_PR_COMPMASK_SL htobe64(((uint64_t)1) << 15) +#define IB_PR_COMPMASK_MTUSELEC htobe64(((uint64_t)1) << 16) +#define IB_PR_COMPMASK_MTU htobe64(((uint64_t)1) << 17) +#define IB_PR_COMPMASK_RATESELEC htobe64(((uint64_t)1) << 18) +#define IB_PR_COMPMASK_RATE htobe64(((uint64_t)1) << 19) +#define IB_PR_COMPMASK_PKTLIFETIMESELEC htobe64(((uint64_t)1) << 20) +#define IB_PR_COMPMASK_PKTLIFETIME htobe64(((uint64_t)1) << 21) +#define IB_LR_COMPMASK_FROM_LID htobe64(((uint64_t)1) << 0) +#define IB_LR_COMPMASK_FROM_PORT htobe64(((uint64_t)1) << 1) +#define IB_LR_COMPMASK_TO_PORT htobe64(((uint64_t)1) << 2) +#define IB_LR_COMPMASK_TO_LID htobe64(((uint64_t)1) << 3) +#define IB_VLA_COMPMASK_LID htobe64(((uint64_t)1) << 0) +#define IB_VLA_COMPMASK_OUT_PORT htobe64(((uint64_t)1) << 1) +#define IB_VLA_COMPMASK_BLOCK htobe64(((uint64_t)1) << 2) +#define IB_SLVL_COMPMASK_LID htobe64(((uint64_t)1) << 0) +#define IB_SLVL_COMPMASK_IN_PORT htobe64(((uint64_t)1) << 1) +#define IB_SLVL_COMPMASK_OUT_PORT htobe64(((uint64_t)1) << 2) +#define IB_PKEY_COMPMASK_LID htobe64(((uint64_t)1) << 0) +#define IB_PKEY_COMPMASK_BLOCK htobe64(((uint64_t)1) << 1) +#define IB_PKEY_COMPMASK_PORT htobe64(((uint64_t)1) << 2) +#define IB_SWIR_COMPMASK_LID htobe64(((uint64_t)1) << 0) +#define IB_SWIR_COMPMASK_RESERVED1 htobe64(((uint64_t)1) << 1) +#define IB_LFTR_COMPMASK_LID htobe64(((uint64_t)1) << 0) +#define IB_LFTR_COMPMASK_BLOCK htobe64(((uint64_t)1) << 1) +#define IB_MFTR_COMPMASK_LID htobe64(((uint64_t)1) << 0) +#define IB_MFTR_COMPMASK_POSITION htobe64(((uint64_t)1) << 1) +#define IB_MFTR_COMPMASK_RESERVED1 htobe64(((uint64_t)1) << 2) +#define IB_MFTR_COMPMASK_BLOCK htobe64(((uint64_t)1) << 3) +#define IB_MFTR_COMPMASK_RESERVED2 htobe64(((uint64_t)1) << 4) +#define IB_NR_COMPMASK_LID htobe64(((uint64_t)1) << 0) +#define IB_NR_COMPMASK_RESERVED1 htobe64(((uint64_t)1) << 1) +#define IB_NR_COMPMASK_BASEVERSION htobe64(((uint64_t)1) << 2) +#define IB_NR_COMPMASK_CLASSVERSION htobe64(((uint64_t)1) << 3) +#define IB_NR_COMPMASK_NODETYPE htobe64(((uint64_t)1) << 4) +#define IB_NR_COMPMASK_NUMPORTS htobe64(((uint64_t)1) << 5) +#define IB_NR_COMPMASK_SYSIMAGEGUID htobe64(((uint64_t)1) << 6) +#define IB_NR_COMPMASK_NODEGUID htobe64(((uint64_t)1) << 7) +#define IB_NR_COMPMASK_PORTGUID htobe64(((uint64_t)1) << 8) +#define IB_NR_COMPMASK_PARTCAP htobe64(((uint64_t)1) << 9) +#define IB_NR_COMPMASK_DEVID htobe64(((uint64_t)1) << 10) +#define IB_NR_COMPMASK_REV htobe64(((uint64_t)1) << 11) +#define IB_NR_COMPMASK_PORTNUM htobe64(((uint64_t)1) << 12) +#define IB_NR_COMPMASK_VENDID htobe64(((uint64_t)1) << 13) +#define IB_NR_COMPMASK_NODEDESC htobe64(((uint64_t)1) << 14) +#define IB_SR_COMPMASK_SID htobe64(((uint64_t)1) << 0) +#define IB_SR_COMPMASK_SGID htobe64(((uint64_t)1) << 1) +#define IB_SR_COMPMASK_SPKEY htobe64(((uint64_t)1) << 2) +#define IB_SR_COMPMASK_RES1 htobe64(((uint64_t)1) << 3) +#define IB_SR_COMPMASK_SLEASE htobe64(((uint64_t)1) << 4) +#define IB_SR_COMPMASK_SKEY htobe64(((uint64_t)1) << 5) +#define 
IB_SR_COMPMASK_SNAME htobe64(((uint64_t)1) << 6) +#define IB_SR_COMPMASK_SDATA8_0 htobe64(((uint64_t)1) << 7) +#define IB_SR_COMPMASK_SDATA8_1 htobe64(((uint64_t)1) << 8) +#define IB_SR_COMPMASK_SDATA8_2 htobe64(((uint64_t)1) << 9) +#define IB_SR_COMPMASK_SDATA8_3 htobe64(((uint64_t)1) << 10) +#define IB_SR_COMPMASK_SDATA8_4 htobe64(((uint64_t)1) << 11) +#define IB_SR_COMPMASK_SDATA8_5 htobe64(((uint64_t)1) << 12) +#define IB_SR_COMPMASK_SDATA8_6 htobe64(((uint64_t)1) << 13) +#define IB_SR_COMPMASK_SDATA8_7 htobe64(((uint64_t)1) << 14) +#define IB_SR_COMPMASK_SDATA8_8 htobe64(((uint64_t)1) << 15) +#define IB_SR_COMPMASK_SDATA8_9 htobe64(((uint64_t)1) << 16) +#define IB_SR_COMPMASK_SDATA8_10 htobe64(((uint64_t)1) << 17) +#define IB_SR_COMPMASK_SDATA8_11 htobe64(((uint64_t)1) << 18) +#define IB_SR_COMPMASK_SDATA8_12 htobe64(((uint64_t)1) << 19) +#define IB_SR_COMPMASK_SDATA8_13 htobe64(((uint64_t)1) << 20) +#define IB_SR_COMPMASK_SDATA8_14 htobe64(((uint64_t)1) << 21) +#define IB_SR_COMPMASK_SDATA8_15 htobe64(((uint64_t)1) << 22) +#define IB_SR_COMPMASK_SDATA16_0 htobe64(((uint64_t)1) << 23) +#define IB_SR_COMPMASK_SDATA16_1 htobe64(((uint64_t)1) << 24) +#define IB_SR_COMPMASK_SDATA16_2 htobe64(((uint64_t)1) << 25) +#define IB_SR_COMPMASK_SDATA16_3 htobe64(((uint64_t)1) << 26) +#define IB_SR_COMPMASK_SDATA16_4 htobe64(((uint64_t)1) << 27) +#define IB_SR_COMPMASK_SDATA16_5 htobe64(((uint64_t)1) << 28) +#define IB_SR_COMPMASK_SDATA16_6 htobe64(((uint64_t)1) << 29) +#define IB_SR_COMPMASK_SDATA16_7 htobe64(((uint64_t)1) << 30) +#define IB_SR_COMPMASK_SDATA32_0 htobe64(((uint64_t)1) << 31) +#define IB_SR_COMPMASK_SDATA32_1 htobe64(((uint64_t)1) << 32) +#define IB_SR_COMPMASK_SDATA32_2 htobe64(((uint64_t)1) << 33) +#define IB_SR_COMPMASK_SDATA32_3 htobe64(((uint64_t)1) << 34) +#define IB_SR_COMPMASK_SDATA64_0 htobe64(((uint64_t)1) << 35) +#define IB_SR_COMPMASK_SDATA64_1 htobe64(((uint64_t)1) << 36) +#define IB_PIR_COMPMASK_LID htobe64(((uint64_t)1) << 0) +#define IB_PIR_COMPMASK_PORTNUM htobe64(((uint64_t)1) << 1) +#define IB_PIR_COMPMASK_OPTIONS htobe64(((uint64_t)1) << 2) +#define IB_PIR_COMPMASK_MKEY htobe64(((uint64_t)1) << 3) +#define IB_PIR_COMPMASK_GIDPRE htobe64(((uint64_t)1) << 4) +#define IB_PIR_COMPMASK_BASELID htobe64(((uint64_t)1) << 5) +#define IB_PIR_COMPMASK_SMLID htobe64(((uint64_t)1) << 6) +#define IB_PIR_COMPMASK_CAPMASK htobe64(((uint64_t)1) << 7) +#define IB_PIR_COMPMASK_DIAGCODE htobe64(((uint64_t)1) << 8) +#define IB_PIR_COMPMASK_MKEYLEASEPRD htobe64(((uint64_t)1) << 9) +#define IB_PIR_COMPMASK_LOCALPORTNUM htobe64(((uint64_t)1) << 10) +#define IB_PIR_COMPMASK_LINKWIDTHENABLED htobe64(((uint64_t)1) << 11) +#define IB_PIR_COMPMASK_LNKWIDTHSUPPORT htobe64(((uint64_t)1) << 12) +#define IB_PIR_COMPMASK_LNKWIDTHACTIVE htobe64(((uint64_t)1) << 13) +#define IB_PIR_COMPMASK_LNKSPEEDSUPPORT htobe64(((uint64_t)1) << 14) +#define IB_PIR_COMPMASK_PORTSTATE htobe64(((uint64_t)1) << 15) +#define IB_PIR_COMPMASK_PORTPHYSTATE htobe64(((uint64_t)1) << 16) +#define IB_PIR_COMPMASK_LINKDWNDFLTSTATE htobe64(((uint64_t)1) << 17) +#define IB_PIR_COMPMASK_MKEYPROTBITS htobe64(((uint64_t)1) << 18) +#define IB_PIR_COMPMASK_RESV2 htobe64(((uint64_t)1) << 19) +#define IB_PIR_COMPMASK_LMC htobe64(((uint64_t)1) << 20) +#define IB_PIR_COMPMASK_LINKSPEEDACTIVE htobe64(((uint64_t)1) << 21) +#define IB_PIR_COMPMASK_LINKSPEEDENABLE htobe64(((uint64_t)1) << 22) +#define IB_PIR_COMPMASK_NEIGHBORMTU htobe64(((uint64_t)1) << 23) +#define IB_PIR_COMPMASK_MASTERSMSL htobe64(((uint64_t)1) << 24) +#define 
IB_PIR_COMPMASK_VLCAP htobe64(((uint64_t)1) << 25) +#define IB_PIR_COMPMASK_INITTYPE htobe64(((uint64_t)1) << 26) +#define IB_PIR_COMPMASK_VLHIGHLIMIT htobe64(((uint64_t)1) << 27) +#define IB_PIR_COMPMASK_VLARBHIGHCAP htobe64(((uint64_t)1) << 28) +#define IB_PIR_COMPMASK_VLARBLOWCAP htobe64(((uint64_t)1) << 29) +#define IB_PIR_COMPMASK_INITTYPEREPLY htobe64(((uint64_t)1) << 30) +#define IB_PIR_COMPMASK_MTUCAP htobe64(((uint64_t)1) << 31) +#define IB_PIR_COMPMASK_VLSTALLCNT htobe64(((uint64_t)1) << 32) +#define IB_PIR_COMPMASK_HOQLIFE htobe64(((uint64_t)1) << 33) +#define IB_PIR_COMPMASK_OPVLS htobe64(((uint64_t)1) << 34) +#define IB_PIR_COMPMASK_PARENFIN htobe64(((uint64_t)1) << 35) +#define IB_PIR_COMPMASK_PARENFOUT htobe64(((uint64_t)1) << 36) +#define IB_PIR_COMPMASK_FILTERRAWIN htobe64(((uint64_t)1) << 37) +#define IB_PIR_COMPMASK_FILTERRAWOUT htobe64(((uint64_t)1) << 38) +#define IB_PIR_COMPMASK_MKEYVIO htobe64(((uint64_t)1) << 39) +#define IB_PIR_COMPMASK_PKEYVIO htobe64(((uint64_t)1) << 40) +#define IB_PIR_COMPMASK_QKEYVIO htobe64(((uint64_t)1) << 41) +#define IB_PIR_COMPMASK_GUIDCAP htobe64(((uint64_t)1) << 42) +#define IB_PIR_COMPMASK_CLIENTREREG htobe64(((uint64_t)1) << 43) +#define IB_PIR_COMPMASK_RESV3 htobe64(((uint64_t)1) << 44) +#define IB_PIR_COMPMASK_SUBNTO htobe64(((uint64_t)1) << 45) +#define IB_PIR_COMPMASK_RESV4 htobe64(((uint64_t)1) << 46) +#define IB_PIR_COMPMASK_RESPTIME htobe64(((uint64_t)1) << 47) +#define IB_PIR_COMPMASK_LOCALPHYERR htobe64(((uint64_t)1) << 48) +#define IB_PIR_COMPMASK_OVERRUNERR htobe64(((uint64_t)1) << 49) +#define IB_PIR_COMPMASK_MAXCREDHINT htobe64(((uint64_t)1) << 50) +#define IB_PIR_COMPMASK_RESV5 htobe64(((uint64_t)1) << 51) +#define IB_PIR_COMPMASK_LINKRTLAT htobe64(((uint64_t)1) << 52) +#define IB_PIR_COMPMASK_CAPMASK2 htobe64(((uint64_t)1) << 53) +#define IB_PIR_COMPMASK_LINKSPDEXTACT htobe64(((uint64_t)1) << 54) +#define IB_PIR_COMPMASK_LINKSPDEXTSUPP htobe64(((uint64_t)1) << 55) +#define IB_PIR_COMPMASK_RESV7 htobe64(((uint64_t)1) << 56) +#define IB_PIR_COMPMASK_LINKSPDEXTENAB htobe64(((uint64_t)1) << 57) +#define IB_MCR_COMPMASK_GID htobe64(((uint64_t)1) << 0) +#define IB_MCR_COMPMASK_MGID htobe64(((uint64_t)1) << 0) +#define IB_MCR_COMPMASK_PORT_GID htobe64(((uint64_t)1) << 1) +#define IB_MCR_COMPMASK_QKEY htobe64(((uint64_t)1) << 2) +#define IB_MCR_COMPMASK_MLID htobe64(((uint64_t)1) << 3) +#define IB_MCR_COMPMASK_MTU_SEL htobe64(((uint64_t)1) << 4) +#define IB_MCR_COMPMASK_MTU htobe64(((uint64_t)1) << 5) +#define IB_MCR_COMPMASK_TCLASS htobe64(((uint64_t)1) << 6) +#define IB_MCR_COMPMASK_PKEY htobe64(((uint64_t)1) << 7) +#define IB_MCR_COMPMASK_RATE_SEL htobe64(((uint64_t)1) << 8) +#define IB_MCR_COMPMASK_RATE htobe64(((uint64_t)1) << 9) +#define IB_MCR_COMPMASK_LIFE_SEL htobe64(((uint64_t)1) << 10) +#define IB_MCR_COMPMASK_LIFE htobe64(((uint64_t)1) << 11) +#define IB_MCR_COMPMASK_SL htobe64(((uint64_t)1) << 12) +#define IB_MCR_COMPMASK_FLOW htobe64(((uint64_t)1) << 13) +#define IB_MCR_COMPMASK_HOP htobe64(((uint64_t)1) << 14) +#define IB_MCR_COMPMASK_SCOPE htobe64(((uint64_t)1) << 15) +#define IB_MCR_COMPMASK_JOIN_STATE htobe64(((uint64_t)1) << 16) +#define IB_MCR_COMPMASK_PROXY htobe64(((uint64_t)1) << 17) +#define IB_GIR_COMPMASK_LID htobe64(((uint64_t)1) << 0) +#define IB_GIR_COMPMASK_BLOCKNUM htobe64(((uint64_t)1) << 1) +#define IB_GIR_COMPMASK_RESV1 htobe64(((uint64_t)1) << 2) +#define IB_GIR_COMPMASK_RESV2 htobe64(((uint64_t)1) << 3) +#define IB_GIR_COMPMASK_GID0 htobe64(((uint64_t)1) << 4) +#define IB_GIR_COMPMASK_GID1 
htobe64(((uint64_t)1) << 5) +#define IB_GIR_COMPMASK_GID2 htobe64(((uint64_t)1) << 6) +#define IB_GIR_COMPMASK_GID3 htobe64(((uint64_t)1) << 7) +#define IB_GIR_COMPMASK_GID4 htobe64(((uint64_t)1) << 8) +#define IB_GIR_COMPMASK_GID5 htobe64(((uint64_t)1) << 9) +#define IB_GIR_COMPMASK_GID6 htobe64(((uint64_t)1) << 10) +#define IB_GIR_COMPMASK_GID7 htobe64(((uint64_t)1) << 11) +#define IB_MPR_COMPMASK_RAWTRAFFIC htobe64(((uint64_t)1) << 0) +#define IB_MPR_COMPMASK_RESV0 htobe64(((uint64_t)1) << 1) +#define IB_MPR_COMPMASK_FLOWLABEL htobe64(((uint64_t)1) << 2) +#define IB_MPR_COMPMASK_HOPLIMIT htobe64(((uint64_t)1) << 3) +#define IB_MPR_COMPMASK_TCLASS htobe64(((uint64_t)1) << 4) +#define IB_MPR_COMPMASK_REVERSIBLE htobe64(((uint64_t)1) << 5) +#define IB_MPR_COMPMASK_NUMBPATH htobe64(((uint64_t)1) << 6) +#define IB_MPR_COMPMASK_PKEY htobe64(((uint64_t)1) << 7) +#define IB_MPR_COMPMASK_QOS_CLASS htobe64(((uint64_t)1) << 8) +#define IB_MPR_COMPMASK_SL htobe64(((uint64_t)1) << 9) +#define IB_MPR_COMPMASK_MTUSELEC htobe64(((uint64_t)1) << 10) +#define IB_MPR_COMPMASK_MTU htobe64(((uint64_t)1) << 11) +#define IB_MPR_COMPMASK_RATESELEC htobe64(((uint64_t)1) << 12) +#define IB_MPR_COMPMASK_RATE htobe64(((uint64_t)1) << 13) +#define IB_MPR_COMPMASK_PKTLIFETIMESELEC htobe64(((uint64_t)1) << 14) +#define IB_MPR_COMPMASK_PKTLIFETIME htobe64(((uint64_t)1) << 15) +#define IB_MPR_COMPMASK_SERVICEID_MSB htobe64(((uint64_t)1) << 16) +#define IB_MPR_COMPMASK_INDEPSELEC htobe64(((uint64_t)1) << 17) +#define IB_MPR_COMPMASK_RESV3 htobe64(((uint64_t)1) << 18) +#define IB_MPR_COMPMASK_SGIDCOUNT htobe64(((uint64_t)1) << 19) +#define IB_MPR_COMPMASK_DGIDCOUNT htobe64(((uint64_t)1) << 20) +#define IB_MPR_COMPMASK_SERVICEID_LSB htobe64(((uint64_t)1) << 21) +#define IB_SMIR_COMPMASK_LID htobe64(((uint64_t)1) << 0) +#define IB_SMIR_COMPMASK_RESV0 htobe64(((uint64_t)1) << 1) +#define IB_SMIR_COMPMASK_GUID htobe64(((uint64_t)1) << 2) +#define IB_SMIR_COMPMASK_SMKEY htobe64(((uint64_t)1) << 3) +#define IB_SMIR_COMPMASK_ACTCOUNT htobe64(((uint64_t)1) << 4) +#define IB_SMIR_COMPMASK_PRIORITY htobe64(((uint64_t)1) << 5) +#define IB_SMIR_COMPMASK_SMSTATE htobe64(((uint64_t)1) << 6) +#define IB_IIR_COMPMASK_SUBSCRIBERGID htobe64(((uint64_t)1) << 0) +#define IB_IIR_COMPMASK_ENUM htobe64(((uint64_t)1) << 1) +#define IB_IIR_COMPMASK_RESV0 htobe64(((uint64_t)1) << 2) +#define IB_IIR_COMPMASK_GID htobe64(((uint64_t)1) << 3) +#define IB_IIR_COMPMASK_LIDRANGEBEGIN htobe64(((uint64_t)1) << 4) +#define IB_IIR_COMPMASK_LIDRANGEEND htobe64(((uint64_t)1) << 5) +#define IB_IIR_COMPMASK_RESV1 htobe64(((uint64_t)1) << 6) +#define IB_IIR_COMPMASK_ISGENERIC htobe64(((uint64_t)1) << 7) +#define IB_IIR_COMPMASK_SUBSCRIBE htobe64(((uint64_t)1) << 8) +#define IB_IIR_COMPMASK_TYPE htobe64(((uint64_t)1) << 9) +#define IB_IIR_COMPMASK_TRAPNUMB htobe64(((uint64_t)1) << 10) +#define IB_IIR_COMPMASK_DEVICEID htobe64(((uint64_t)1) << 10) +#define IB_IIR_COMPMASK_QPN htobe64(((uint64_t)1) << 11) +#define IB_IIR_COMPMASK_RESV2 htobe64(((uint64_t)1) << 12) +#define IB_IIR_COMPMASK_RESPTIME htobe64(((uint64_t)1) << 13) +#define IB_IIR_COMPMASK_RESV3 htobe64(((uint64_t)1) << 14) +#define IB_IIR_COMPMASK_PRODTYPE htobe64(((uint64_t)1) << 15) +#define IB_IIR_COMPMASK_VENDID htobe64(((uint64_t)1) << 15) +#define IB_CLASS_CAP_TRAP 0x0001 +#define IB_CLASS_CAP_GETSET 0x0002 +#define IB_CLASS_CAP_CAPMASK2 0x0004 +#define IB_CLASS_ENH_PORT0_CC_MASK 0x0100 +#define IB_CLASS_RESP_TIME_MASK 0x1F +#define IB_CLASS_CAPMASK2_SHIFT 5 +typedef struct { + uint8_t base_ver; + 
uint8_t class_ver; + __be16 cap_mask; + __be32 cap_mask2_resp_time; + ib_gid_t redir_gid; + __be32 redir_tc_sl_fl; + __be16 redir_lid; + __be16 redir_pkey; + __be32 redir_qp; + __be32 redir_qkey; + ib_gid_t trap_gid; + __be32 trap_tc_sl_fl; + __be16 trap_lid; + __be16 trap_pkey; + __be32 trap_hop_qp; + __be32 trap_qkey; +} __attribute__((packed)) ib_class_port_info_t; +#define IB_PM_ALL_PORT_SELECT htobe16(1 << 8) +#define IB_PM_EXT_WIDTH_SUPPORTED htobe16(1 << 9) +#define IB_PM_EXT_WIDTH_NOIETF_SUP htobe16(1 << 10) +#define IB_PM_SAMPLES_ONLY_SUP htobe16(1 << 11) +#define IB_PM_PC_XMIT_WAIT_SUP htobe16(1 << 12) +#define IS_PM_INH_LMTD_PKEY_MC_CONSTR_ERR htobe16(1 << 13) +#define IS_PM_RSFEC_COUNTERS_SUP htobe16(1 << 14) +#define IB_PM_IS_QP1_DROP_SUP htobe16(1 << 15) +#define IB_PM_IS_PM_KEY_SUPPORTED htobe32(1 << 0) +#define IB_PM_IS_ADDL_PORT_CTRS_EXT_SUP htobe32(1 << 1) +typedef struct { + __be64 guid; + __be64 sm_key; + __be32 act_count; + uint8_t pri_state; +} __attribute__((packed)) ib_sm_info_t; +typedef struct { + uint8_t base_ver; + uint8_t mgmt_class; + uint8_t class_ver; + uint8_t method; + __be16 status; + __be16 class_spec; + __be64 trans_id; + __be16 attr_id; + __be16 resv; + __be32 attr_mod; +} __attribute__((packed)) ib_mad_t; +typedef struct { + ib_mad_t common_hdr; + uint8_t rmpp_version; + uint8_t rmpp_type; + uint8_t rmpp_flags; + uint8_t rmpp_status; + __be32 seg_num; + __be32 paylen_newwin; +} __attribute__((packed)) ib_rmpp_mad_t; +#define IB_RMPP_TYPE_DATA 1 +#define IB_RMPP_TYPE_ACK 2 +#define IB_RMPP_TYPE_STOP 3 +#define IB_RMPP_TYPE_ABORT 4 +#define IB_RMPP_NO_RESP_TIME 0x1F +#define IB_RMPP_FLAG_ACTIVE 0x01 +#define IB_RMPP_FLAG_FIRST 0x02 +#define IB_RMPP_FLAG_LAST 0x04 +#define IB_RMPP_STATUS_SUCCESS 0 +#define IB_RMPP_STATUS_RESX 1 +#define IB_RMPP_STATUS_T2L 118 +#define IB_RMPP_STATUS_BAD_LEN 119 +#define IB_RMPP_STATUS_BAD_SEG 120 +#define IB_RMPP_STATUS_BADT 121 +#define IB_RMPP_STATUS_W2S 122 +#define IB_RMPP_STATUS_S2B 123 +#define IB_RMPP_STATUS_BAD_STATUS 124 +#define IB_RMPP_STATUS_UNV 125 +#define IB_RMPP_STATUS_TMR 126 +#define IB_RMPP_STATUS_UNSPEC 127 +#define IB_SMP_DIRECTION_HO 0x8000 +#define IB_SMP_DIRECTION htobe16(IB_SMP_DIRECTION_HO) +#define IB_SMP_STATUS_MASK_HO 0x7FFF +#define IB_SMP_STATUS_MASK htobe16(IB_SMP_STATUS_MASK_HO) +#define IB_SMP_DATA_SIZE 64 +typedef struct { + uint8_t base_ver; + uint8_t mgmt_class; + uint8_t class_ver; + uint8_t method; + __be16 status; + uint8_t hop_ptr; + uint8_t hop_count; + __be64 trans_id; + __be16 attr_id; + __be16 resv; + __be32 attr_mod; + __be64 m_key; + __be16 dr_slid; + __be16 dr_dlid; + uint32_t resv1[7]; + uint8_t data[IB_SMP_DATA_SIZE]; + uint8_t initial_path[IB_SUBNET_PATH_HOPS_MAX]; + uint8_t return_path[IB_SUBNET_PATH_HOPS_MAX]; +} __attribute__((packed)) ib_smp_t; +typedef struct { + uint8_t base_version; + uint8_t class_version; + uint8_t node_type; + uint8_t num_ports; + __be64 sys_guid; + __be64 node_guid; + __be64 port_guid; + __be16 partition_cap; + __be16 device_id; + __be32 revision; + __be32 port_num_vendor_id; +} __attribute__((packed)) ib_node_info_t; +#define IB_SA_DATA_SIZE 200 +typedef struct { + uint8_t base_ver; + uint8_t mgmt_class; + uint8_t class_ver; + uint8_t method; + __be16 status; + __be16 resv; + __be64 trans_id; + __be16 attr_id; + __be16 resv1; + __be32 attr_mod; + uint8_t rmpp_version; + uint8_t rmpp_type; + uint8_t rmpp_flags; + uint8_t rmpp_status; + __be32 seg_num; + __be32 paylen_newwin; + __be64 sm_key; + __be16 attr_offset; + __be16 resv3; + __be64 
comp_mask; + uint8_t data[IB_SA_DATA_SIZE]; +} __attribute__((packed)) ib_sa_mad_t; +#define IB_NODE_INFO_PORT_NUM_MASK htobe32(0xFF000000) +#define IB_NODE_INFO_VEND_ID_MASK htobe32(0x00FFFFFF) +#define IB_NODE_DESCRIPTION_SIZE 64 +typedef struct { + // Node String is an array of UTF-8 characters + // that describe the node in text format + // Note that this string is NOT NULL TERMINATED! + uint8_t description[IB_NODE_DESCRIPTION_SIZE]; +} __attribute__((packed)) ib_node_desc_t; +typedef struct { + __be16 lid; + __be16 resv; + ib_node_info_t node_info; + ib_node_desc_t node_desc; + uint8_t pad[4]; +} __attribute__((packed)) ib_node_record_t; +typedef struct { + __be64 m_key; + __be64 subnet_prefix; + __be16 base_lid; + __be16 master_sm_base_lid; + __be32 capability_mask; + __be16 diag_code; + __be16 m_key_lease_period; + uint8_t local_port_num; + uint8_t link_width_enabled; + uint8_t link_width_supported; + uint8_t link_width_active; + uint8_t state_info1; /* LinkSpeedSupported and PortState */ + uint8_t state_info2; /* PortPhysState and LinkDownDefaultState */ + uint8_t mkey_lmc; /* M_KeyProtectBits and LMC */ + uint8_t link_speed; /* LinkSpeedEnabled and LinkSpeedActive */ + uint8_t mtu_smsl; + uint8_t vl_cap; /* VLCap and InitType */ + uint8_t vl_high_limit; + uint8_t vl_arb_high_cap; + uint8_t vl_arb_low_cap; + uint8_t mtu_cap; + uint8_t vl_stall_life; + uint8_t vl_enforce; + __be16 m_key_violations; + __be16 p_key_violations; + __be16 q_key_violations; + uint8_t guid_cap; + uint8_t subnet_timeout; /* cli_rereg(1b), mcast_pkey_trap_suppr(2b), timeout(5b) */ + uint8_t resp_time_value; /* reserv(3b), rtv(5b) */ + uint8_t error_threshold; /* local phy errors(4b), overrun errors(4b) */ + __be16 max_credit_hint; + __be32 link_rt_latency; /* reserv(8b), link round trip lat(24b) */ + __be16 capability_mask2; + uint8_t link_speed_ext; /* LinkSpeedExtActive and LinkSpeedExtSupported */ + uint8_t link_speed_ext_enabled; /* reserv(3b), LinkSpeedExtEnabled(5b) */ +} __attribute__((packed)) ib_port_info_t; +#define IB_PORT_STATE_MASK 0x0F +#define IB_PORT_LMC_MASK 0x07 +#define IB_PORT_LMC_MAX 0x07 +#define IB_PORT_MPB_MASK 0xC0 +#define IB_PORT_MPB_SHIFT 6 +#define IB_PORT_LINK_SPEED_SHIFT 4 +#define IB_PORT_LINK_SPEED_SUPPORTED_MASK 0xF0 +#define IB_PORT_LINK_SPEED_ACTIVE_MASK 0xF0 +#define IB_PORT_LINK_SPEED_ENABLED_MASK 0x0F +#define IB_PORT_PHYS_STATE_MASK 0xF0 +#define IB_PORT_PHYS_STATE_SHIFT 4 +#define IB_PORT_PHYS_STATE_NO_CHANGE 0 +#define IB_PORT_PHYS_STATE_SLEEP 1 +#define IB_PORT_PHYS_STATE_POLLING 2 +#define IB_PORT_PHYS_STATE_DISABLED 3 +#define IB_PORT_PHYS_STATE_PORTCONFTRAIN 4 +#define IB_PORT_PHYS_STATE_LINKUP 5 +#define IB_PORT_PHYS_STATE_LINKERRRECOVER 6 +#define IB_PORT_PHYS_STATE_PHYTEST 7 +#define IB_PORT_LNKDWNDFTSTATE_MASK 0x0F +#define IB_PORT_CAP_RESV0 htobe32(0x00000001) +#define IB_PORT_CAP_IS_SM htobe32(0x00000002) +#define IB_PORT_CAP_HAS_NOTICE htobe32(0x00000004) +#define IB_PORT_CAP_HAS_TRAP htobe32(0x00000008) +#define IB_PORT_CAP_HAS_IPD htobe32(0x00000010) +#define IB_PORT_CAP_HAS_AUTO_MIG htobe32(0x00000020) +#define IB_PORT_CAP_HAS_SL_MAP htobe32(0x00000040) +#define IB_PORT_CAP_HAS_NV_MKEY htobe32(0x00000080) +#define IB_PORT_CAP_HAS_NV_PKEY htobe32(0x00000100) +#define IB_PORT_CAP_HAS_LED_INFO htobe32(0x00000200) +#define IB_PORT_CAP_SM_DISAB htobe32(0x00000400) +#define IB_PORT_CAP_HAS_SYS_IMG_GUID htobe32(0x00000800) +#define IB_PORT_CAP_HAS_PKEY_SW_EXT_PORT_TRAP htobe32(0x00001000) +#define IB_PORT_CAP_HAS_CABLE_INFO htobe32(0x00002000) +#define 
IB_PORT_CAP_HAS_EXT_SPEEDS htobe32(0x00004000) +#define IB_PORT_CAP_HAS_CAP_MASK2 htobe32(0x00008000) +#define IB_PORT_CAP_HAS_COM_MGT htobe32(0x00010000) +#define IB_PORT_CAP_HAS_SNMP htobe32(0x00020000) +#define IB_PORT_CAP_REINIT htobe32(0x00040000) +#define IB_PORT_CAP_HAS_DEV_MGT htobe32(0x00080000) +#define IB_PORT_CAP_HAS_VEND_CLS htobe32(0x00100000) +#define IB_PORT_CAP_HAS_DR_NTC htobe32(0x00200000) +#define IB_PORT_CAP_HAS_CAP_NTC htobe32(0x00400000) +#define IB_PORT_CAP_HAS_BM htobe32(0x00800000) +#define IB_PORT_CAP_HAS_LINK_RT_LATENCY htobe32(0x01000000) +#define IB_PORT_CAP_HAS_CLIENT_REREG htobe32(0x02000000) +#define IB_PORT_CAP_HAS_OTHER_LOCAL_CHANGES_NTC htobe32(0x04000000) +#define IB_PORT_CAP_HAS_LINK_SPEED_WIDTH_PAIRS_TBL htobe32(0x08000000) +#define IB_PORT_CAP_HAS_VEND_MADS htobe32(0x10000000) +#define IB_PORT_CAP_HAS_MCAST_PKEY_TRAP_SUPPRESS htobe32(0x20000000) +#define IB_PORT_CAP_HAS_MCAST_FDB_TOP htobe32(0x40000000) +#define IB_PORT_CAP_HAS_HIER_INFO htobe32(0x80000000) +#define IB_PORT_CAP2_IS_SET_NODE_DESC_SUPPORTED htobe16(0x0001) +#define IB_PORT_CAP2_IS_PORT_INFO_EXT_SUPPORTED htobe16(0x0002) +#define IB_PORT_CAP2_IS_VIRT_SUPPORTED htobe16(0x0004) +#define IB_PORT_CAP2_IS_SWITCH_PORT_STATE_TBL_SUPP htobe16(0x0008) +#define IB_PORT_CAP2_IS_LINK_WIDTH_2X_SUPPORTED htobe16(0x0010) +#define IB_PORT_CAP2_IS_LINK_SPEED_HDR_SUPPORTED htobe16(0x0020) +typedef struct { + __be32 cap_mask; + __be16 fec_mode_active; + __be16 fdr_fec_mode_sup; + __be16 fdr_fec_mode_enable; + __be16 edr_fec_mode_sup; + __be16 edr_fec_mode_enable; + __be16 hdr_fec_mode_sup; + __be16 hdr_fec_mode_enable; + uint8_t reserved[46]; +} __attribute__((packed)) ib_port_info_ext_t; +#define IB_PORT_EXT_NO_FEC_MODE_ACTIVE 0 +#define IB_PORT_EXT_FIRE_CODE_FEC_MODE_ACTIVE htobe16(0x0001) +#define IB_PORT_EXT_RS_FEC_MODE_ACTIVE htobe16(0x0002) +#define IB_PORT_EXT_LOW_LATENCY_RS_FEC_MODE_ACTIVE htobe16(0x0003) +#define IB_PORT_EXT_CAP_IS_FEC_MODE_SUPPORTED htobe32(0x00000001) +#define IB_LINK_WIDTH_ACTIVE_1X 1 +#define IB_LINK_WIDTH_ACTIVE_4X 2 +#define IB_LINK_WIDTH_ACTIVE_8X 4 +#define IB_LINK_WIDTH_ACTIVE_12X 8 +#define IB_LINK_WIDTH_ACTIVE_2X 16 +#define IB_LINK_WIDTH_SET_LWS 255 +#define IB_LINK_SPEED_ACTIVE_EXTENDED 0 +#define IB_LINK_SPEED_ACTIVE_2_5 1 +#define IB_LINK_SPEED_ACTIVE_5 2 +#define IB_LINK_SPEED_ACTIVE_10 4 +#define IB_LINK_SPEED_SET_LSS 15 +#define IB_LINK_SPEED_EXT_ACTIVE_NONE 0 +#define IB_LINK_SPEED_EXT_ACTIVE_14 1 +#define IB_LINK_SPEED_EXT_ACTIVE_25 2 +#define IB_LINK_SPEED_EXT_ACTIVE_50 4 +#define IB_LINK_SPEED_EXT_DISABLE 30 +#define IB_LINK_SPEED_EXT_SET_LSES 31 +#define IB_PATH_RECORD_RATE_2_5_GBS 2 +#define IB_PATH_RECORD_RATE_10_GBS 3 +#define IB_PATH_RECORD_RATE_30_GBS 4 +#define IB_PATH_RECORD_RATE_5_GBS 5 +#define IB_PATH_RECORD_RATE_20_GBS 6 +#define IB_PATH_RECORD_RATE_40_GBS 7 +#define IB_PATH_RECORD_RATE_60_GBS 8 +#define IB_PATH_RECORD_RATE_80_GBS 9 +#define IB_PATH_RECORD_RATE_120_GBS 10 +#define IB_PATH_RECORD_RATE_14_GBS 11 +#define IB_PATH_RECORD_RATE_56_GBS 12 +#define IB_PATH_RECORD_RATE_112_GBS 13 +#define IB_PATH_RECORD_RATE_168_GBS 14 +#define IB_PATH_RECORD_RATE_25_GBS 15 +#define IB_PATH_RECORD_RATE_100_GBS 16 +#define IB_PATH_RECORD_RATE_200_GBS 17 +#define IB_PATH_RECORD_RATE_300_GBS 18 +#define IB_PATH_RECORD_RATE_28_GBS 19 +#define IB_PATH_RECORD_RATE_50_GBS 20 +#define IB_PATH_RECORD_RATE_400_GBS 21 +#define IB_PATH_RECORD_RATE_600_GBS 22 +#define FDR10 0x01 +typedef struct { + uint8_t resvd1[3]; + uint8_t state_change_enable; + uint8_t resvd2[3]; 
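+	/* the link_speed_* fields below carry the FDR10 capability bit (0x01) */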
+ uint8_t link_speed_supported; + uint8_t resvd3[3]; + uint8_t link_speed_enabled; + uint8_t resvd4[3]; + uint8_t link_speed_active; + uint8_t resvd5[48]; +} __attribute__((packed)) ib_mlnx_ext_port_info_t; +typedef struct { + __be64 service_id; + ib_gid_t service_gid; + __be16 service_pkey; + __be16 resv; + __be32 service_lease; + uint8_t service_key[16]; + uint8_t service_name[64]; + uint8_t service_data8[16]; + __be16 service_data16[8]; + __be32 service_data32[4]; + __be64 service_data64[2]; +} __attribute__((packed)) ib_service_record_t; +typedef struct { + __be16 lid; + uint8_t port_num; + uint8_t options; + ib_port_info_t port_info; + uint8_t pad[4]; +} __attribute__((packed)) ib_portinfo_record_t; +typedef struct { + __be16 lid; + uint8_t port_num; + uint8_t options; + ib_port_info_ext_t port_info_ext; +} __attribute__((packed)) ib_portinfoext_record_t; +typedef struct { + __be16 from_lid; + uint8_t from_port_num; + uint8_t to_port_num; + __be16 to_lid; + uint8_t pad[2]; +} __attribute__((packed)) ib_link_record_t; +typedef struct { + __be16 lid; + uint16_t resv0; + ib_sm_info_t sm_info; + uint8_t pad[7]; +} __attribute__((packed)) ib_sminfo_record_t; +typedef struct { + __be16 lid; + __be16 block_num; + uint32_t resv0; + uint8_t lft[64]; +} __attribute__((packed)) ib_lft_record_t; +typedef struct { + __be16 lid; + __be16 position_block_num; + uint32_t resv0; + __be16 mft[IB_MCAST_BLOCK_SIZE]; +} __attribute__((packed)) ib_mft_record_t; +typedef struct { + __be16 lin_cap; + __be16 rand_cap; + __be16 mcast_cap; + __be16 lin_top; + uint8_t def_port; + uint8_t def_mcast_pri_port; + uint8_t def_mcast_not_port; + uint8_t life_state; + __be16 lids_per_port; + __be16 enforce_cap; + uint8_t flags; + uint8_t resvd; + __be16 mcast_top; +} __attribute__((packed)) ib_switch_info_t; +typedef struct { + __be16 lid; + uint16_t resv0; + ib_switch_info_t switch_info; +} __attribute__((packed)) ib_switch_info_record_t; +#define IB_SWITCH_PSC 0x04 +#define GUID_TABLE_MAX_ENTRIES 8 +typedef struct { + __be64 guid[GUID_TABLE_MAX_ENTRIES]; +} __attribute__((packed)) ib_guid_info_t; +typedef struct { + __be16 lid; + uint8_t block_num; + uint8_t resv; + uint32_t reserved; + ib_guid_info_t guid_info; +} __attribute__((packed)) ib_guidinfo_record_t; +#define IB_MULTIPATH_MAX_GIDS 11 +typedef struct { + __be32 hop_flow_raw; + uint8_t tclass; + uint8_t num_path; + __be16 pkey; + __be16 qos_class_sl; + uint8_t mtu; + uint8_t rate; + uint8_t pkt_life; + uint8_t service_id_8msb; + uint8_t independence; /* formerly resv2 */ + uint8_t sgid_count; + uint8_t dgid_count; + uint8_t service_id_56lsb[7]; + ib_gid_t gids[IB_MULTIPATH_MAX_GIDS]; +} __attribute__((packed)) ib_multipath_rec_t; +#define IB_NUM_PKEY_ELEMENTS_IN_BLOCK 32 +typedef struct { + __be16 pkey_entry[IB_NUM_PKEY_ELEMENTS_IN_BLOCK]; +} ib_pkey_table_t; +typedef struct { + __be16 lid; // for CA: lid of port, for switch lid of port 0 + __be16 block_num; + uint8_t port_num; // for switch: port number, for CA: reserved + uint8_t reserved1; + uint16_t reserved2; + ib_pkey_table_t pkey_tbl; +} ib_pkey_table_record_t; +#define IB_DROP_VL 15 +#define IB_MAX_NUM_VLS 16 +typedef struct { + uint8_t raw_vl_by_sl[IB_MAX_NUM_VLS / 2]; +} __attribute__((packed)) ib_slvl_table_t; +typedef struct { + __be16 lid; // for CA: lid of port, for switch lid of port 0 + uint8_t in_port_num; // reserved for CAs + uint8_t out_port_num; // reserved for CAs + uint32_t resv; + ib_slvl_table_t slvl_tbl; +} __attribute__((packed)) ib_slvl_table_record_t; +typedef struct { + uint8_t vl; 
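+	/* weight: number of 64-byte units this VL may transmit; 0 skips the entry */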
+ uint8_t weight; +} __attribute__((packed)) ib_vl_arb_element_t; +#define IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK 32 +typedef struct { + ib_vl_arb_element_t vl_entry[IB_NUM_VL_ARB_ELEMENTS_IN_BLOCK]; +} __attribute__((packed)) ib_vl_arb_table_t; +typedef struct { + __be16 lid; // for CA: lid of port, for switch lid of port 0 + uint8_t port_num; + uint8_t block_num; + uint32_t reserved; + ib_vl_arb_table_t vl_arb_tbl; +} __attribute__((packed)) ib_vl_arb_table_record_t; +typedef struct { + __be32 ver_class_flow; + __be16 resv1; + uint8_t resv2; + uint8_t hop_limit; + ib_gid_t src_gid; + ib_gid_t dest_gid; +} __attribute__((packed)) ib_grh_t; +typedef struct { + ib_gid_t mgid; + ib_gid_t port_gid; + __be32 qkey; + __be16 mlid; + uint8_t mtu; + uint8_t tclass; + __be16 pkey; + uint8_t rate; + uint8_t pkt_life; + __be32 sl_flow_hop; + uint8_t scope_state; + uint8_t proxy_join : 1; + uint8_t reserved[2]; + uint8_t pad[4]; +} __attribute__((packed)) ib_member_rec_t; +#define IB_MC_REC_STATE_FULL_MEMBER 0x01 +#define IB_MC_REC_STATE_NON_MEMBER 0x02 +#define IB_MC_REC_STATE_SEND_ONLY_NON_MEMBER 0x04 +#define IB_MC_REC_STATE_SEND_ONLY_FULL_MEMBER 0x08 +#define IB_NOTICE_TYPE_FATAL 0x00 +#define IB_NOTICE_TYPE_URGENT 0x01 +#define IB_NOTICE_TYPE_SECURITY 0x02 +#define IB_NOTICE_TYPE_SUBN_MGMT 0x03 +#define IB_NOTICE_TYPE_INFO 0x04 +#define IB_NOTICE_TYPE_EMPTY 0x7F +#define SM_GID_IN_SERVICE_TRAP 64 +#define SM_GID_OUT_OF_SERVICE_TRAP 65 +#define SM_MGID_CREATED_TRAP 66 +#define SM_MGID_DESTROYED_TRAP 67 +#define SM_UNPATH_TRAP 68 +#define SM_REPATH_TRAP 69 +#define SM_LINK_STATE_CHANGED_TRAP 128 +#define SM_LINK_INTEGRITY_THRESHOLD_TRAP 129 +#define SM_BUFFER_OVERRUN_THRESHOLD_TRAP 130 +#define SM_WATCHDOG_TIMER_EXPIRED_TRAP 131 +#define SM_LOCAL_CHANGES_TRAP 144 +#define SM_SYS_IMG_GUID_CHANGED_TRAP 145 +#define SM_BAD_MKEY_TRAP 256 +#define SM_BAD_PKEY_TRAP 257 +#define SM_BAD_QKEY_TRAP 258 +#define SM_BAD_SWITCH_PKEY_TRAP 259 +typedef struct { + uint8_t generic_type; // 1 1 + union _notice_g_or_v { + struct _notice_generic // 5 6 + { + uint8_t prod_type_msb; + __be16 prod_type_lsb; + __be16 trap_num; + } __attribute__((packed)) generic; + struct _notice_vend { + uint8_t vend_id_msb; + __be16 vend_id_lsb; + __be16 dev_id; + } __attribute__((packed)) vend; + } g_or_v; + __be16 issuer_lid; // 2 8 + __be16 toggle_count; // 2 10 + union _data_details // 54 64 + { + struct _raw_data { + uint8_t details[54]; + } __attribute__((packed)) raw_data; + struct _ntc_64_67 { + uint8_t res[6]; + ib_gid_t gid; // the Node or Multicast Group that came in/out + } __attribute__((packed)) ntc_64_67; + struct _ntc_128 { + __be16 sw_lid; // the sw lid of which link state changed + } __attribute__((packed)) ntc_128; + struct _ntc_129_131 { + __be16 pad; + __be16 lid; // lid and port number of the violation + uint8_t port_num; + } __attribute__((packed)) ntc_129_131; + struct _ntc_144 { + __be16 pad1; + __be16 lid; // lid where change occured + uint8_t pad2; // reserved + uint8_t local_changes; // 7b reserved 1b local changes + __be32 new_cap_mask; // new capability mask + __be16 change_flgs; // 10b reserved 6b change flags + __be16 cap_mask2; + } __attribute__((packed)) ntc_144; + struct _ntc_145 { + __be16 pad1; + __be16 lid; // lid where sys guid changed + __be16 pad2; + __be64 new_sys_guid; // new system image guid + } __attribute__((packed)) ntc_145; + struct _ntc_256 { // total: 54 + __be16 pad1; // 2 + __be16 lid; // 2 + __be16 dr_slid; // 2 + uint8_t method; // 1 + uint8_t pad2; // 1 + __be16 attr_id; // 2 + __be32 
attr_mod; // 4 + __be64 mkey; // 8 + uint8_t pad3; // 1 + uint8_t dr_trunc_hop; // 1 + uint8_t dr_rtn_path[30]; // 30 + } __attribute__((packed)) ntc_256; + struct _ntc_257_258 // violation of p/q_key // 49 + { + __be16 pad1; // 2 + __be16 lid1; // 2 + __be16 lid2; // 2 + __be32 key; // 4 + __be32 qp1; // 4b sl, 4b pad, 24b qp1 + __be32 qp2; // 8b pad, 24b qp2 + ib_gid_t gid1; // 16 + ib_gid_t gid2; // 16 + } __attribute__((packed)) ntc_257_258; + struct _ntc_259 // pkey violation from switch 51 + { + __be16 data_valid; // 2 + __be16 lid1; // 2 + __be16 lid2; // 2 + __be16 pkey; // 2 + __be32 sl_qp1; // 4b sl, 4b pad, 24b qp1 + __be32 qp2; // 8b pad, 24b qp2 + ib_gid_t gid1; // 16 + ib_gid_t gid2; // 16 + __be16 sw_lid; // 2 + uint8_t port_no; // 1 + } __attribute__((packed)) ntc_259; + struct _ntc_bkey_259 // bkey violation + { + __be16 lidaddr; + uint8_t method; + uint8_t reserved; + __be16 attribute_id; + __be32 attribute_modifier; + __be32 qp; // qp is low 24 bits + __be64 bkey; + ib_gid_t gid; + } __attribute__((packed)) ntc_bkey_259; + struct _ntc_cckey_0 // CC key violation + { + __be16 slid; // source LID from offending packet LRH + uint8_t method; // method, from common MAD header + uint8_t resv0; + __be16 attribute_id; // Attribute ID, from common MAD header + __be16 resv1; + __be32 attribute_modifier; // Attribute Modif, from common MAD header + __be32 qp; // 8b pad, 24b dest QP from BTH + __be64 cc_key; // CC key of the offending packet + ib_gid_t source_gid; // GID from GRH of the offending packet + uint8_t padding[14]; // Padding - ignored on read + } __attribute__((packed)) ntc_cckey_0; + } data_details; + ib_gid_t issuer_gid; // 16 80 +} __attribute__((packed)) ib_mad_notice_attr_t; +#define TRAP_259_MASK_SL htobe32(0xF0000000) +#define TRAP_259_MASK_QP htobe32(0x00FFFFFF) +#define TRAP_144_MASK_OTHER_LOCAL_CHANGES 0x01 +#define TRAP_144_MASK_CAPABILITY_MASK2_CHANGE htobe16(0x0020) +#define TRAP_144_MASK_HIERARCHY_INFO_CHANGE htobe16(0x0010) +#define TRAP_144_MASK_SM_PRIORITY_CHANGE htobe16(0x0008) +#define TRAP_144_MASK_LINK_SPEED_ENABLE_CHANGE htobe16(0x0004) +#define TRAP_144_MASK_LINK_WIDTH_ENABLE_CHANGE htobe16(0x0002) +#define TRAP_144_MASK_NODE_DESCRIPTION_CHANGE htobe16(0x0001) +typedef struct { + ib_gid_t gid; + __be16 lid_range_begin; + __be16 lid_range_end; + __be16 reserved1; + uint8_t is_generic; + uint8_t subscribe; + __be16 trap_type; + union _inform_g_or_v { + struct _inform_generic { + __be16 trap_num; + __be32 qpn_resp_time_val; + uint8_t reserved2; + uint8_t node_type_msb; + __be16 node_type_lsb; + } __attribute__((packed)) generic; + struct _inform_vend { + __be16 dev_id; + __be32 qpn_resp_time_val; + uint8_t reserved2; + uint8_t vendor_id_msb; + __be16 vendor_id_lsb; + } __attribute__((packed)) vend; + } __attribute__((packed)) g_or_v; +} __attribute__((packed)) ib_inform_info_t; +typedef struct { + ib_gid_t subscriber_gid; + __be16 subscriber_enum; + uint8_t reserved[6]; + ib_inform_info_t inform_info; + uint8_t pad[4]; +} __attribute__((packed)) ib_inform_info_record_t; +typedef struct { + ib_mad_t header; + uint8_t resv[40]; +#define IB_PM_DATA_SIZE 192 + uint8_t data[IB_PM_DATA_SIZE]; +} __attribute__((packed)) ib_perfmgt_mad_t; +typedef struct { + uint8_t reserved; + uint8_t port_select; + __be16 counter_select; + __be16 symbol_err_cnt; + uint8_t link_err_recover; + uint8_t link_downed; + __be16 rcv_err; + __be16 rcv_rem_phys_err; + __be16 rcv_switch_relay_err; + __be16 xmit_discards; + uint8_t xmit_constraint_err; + uint8_t rcv_constraint_err; + 
uint8_t counter_select2; + uint8_t link_int_buffer_overrun; + __be16 qp1_dropped; + __be16 vl15_dropped; + __be32 xmit_data; + __be32 rcv_data; + __be32 xmit_pkts; + __be32 rcv_pkts; + __be32 xmit_wait; +} __attribute__((packed)) ib_port_counters_t; +typedef struct { + uint8_t reserved; + uint8_t port_select; + __be16 counter_select; + __be32 counter_select2; + __be64 xmit_data; + __be64 rcv_data; + __be64 xmit_pkts; + __be64 rcv_pkts; + __be64 unicast_xmit_pkts; + __be64 unicast_rcv_pkts; + __be64 multicast_xmit_pkts; + __be64 multicast_rcv_pkts; + __be64 symbol_err_cnt; + __be64 link_err_recover; + __be64 link_downed; + __be64 rcv_err; + __be64 rcv_rem_phys_err; + __be64 rcv_switch_relay_err; + __be64 xmit_discards; + __be64 xmit_constraint_err; + __be64 rcv_constraint_err; + __be64 link_integrity_err; + __be64 buffer_overrun; + __be64 vl15_dropped; + __be64 xmit_wait; + __be64 qp1_dropped; +} __attribute__((packed)) ib_port_counters_ext_t; +typedef struct { + uint8_t op_code; + uint8_t port_select; + uint8_t tick; + uint8_t counter_width; /* 5 bits res : 3 bits counter_width */ + __be32 counter_mask; /* 2 bits res : 3 bits counter_mask : 27 bits counter_masks_1to9 */ + __be16 counter_mask_10to14; /* 1 bit res : 15 bits counter_masks_10to14 */ + uint8_t sample_mech; + uint8_t sample_status; /* 6 bits res : 2 bits sample_status */ + __be64 option_mask; + __be64 vendor_mask; + __be32 sample_start; + __be32 sample_interval; + __be16 tag; + __be16 counter_select0; + __be16 counter_select1; + __be16 counter_select2; + __be16 counter_select3; + __be16 counter_select4; + __be16 counter_select5; + __be16 counter_select6; + __be16 counter_select7; + __be16 counter_select8; + __be16 counter_select9; + __be16 counter_select10; + __be16 counter_select11; + __be16 counter_select12; + __be16 counter_select13; + __be16 counter_select14; +} __attribute__((packed)) ib_port_samples_control_t; +#define IB_CS_PORT_XMIT_DATA htobe16(0x0001) +#define IB_CS_PORT_RCV_DATA htobe16(0x0002) +#define IB_CS_PORT_XMIT_PKTS htobe16(0x0003) +#define IB_CS_PORT_RCV_PKTS htobe16(0x0004) +#define IB_CS_PORT_XMIT_WAIT htobe16(0x0005) +typedef struct { + __be16 tag; + __be16 sample_status; /* 14 bits res : 2 bits sample_status */ + __be32 counter0; + __be32 counter1; + __be32 counter2; + __be32 counter3; + __be32 counter4; + __be32 counter5; + __be32 counter6; + __be32 counter7; + __be32 counter8; + __be32 counter9; + __be32 counter10; + __be32 counter11; + __be32 counter12; + __be32 counter13; + __be32 counter14; +} __attribute__((packed)) ib_port_samples_result_t; +typedef struct { + uint8_t reserved; + uint8_t port_select; + __be16 counter_select; + __be32 port_xmit_data_sl[16]; + uint8_t resv[124]; +} __attribute__((packed)) ib_port_xmit_data_sl_t; +typedef struct { + uint8_t reserved; + uint8_t port_select; + __be16 counter_select; + __be32 port_rcv_data_sl[16]; + uint8_t resv[124]; +} __attribute__((packed)) ib_port_rcv_data_sl_t; +typedef struct { + ib_mad_t header; + uint8_t resv[40]; +#define IB_DM_DATA_SIZE 192 + uint8_t data[IB_DM_DATA_SIZE]; +} __attribute__((packed)) ib_dm_mad_t; +typedef struct { + __be16 change_id; + uint8_t max_controllers; + uint8_t diag_rom; +#define IB_DM_CTRL_LIST_SIZE 128 + uint8_t controller_list[IB_DM_CTRL_LIST_SIZE]; +#define IOC_NOT_INSTALLED 0x0 +#define IOC_INSTALLED 0x1 +// Reserved values 0x02-0xE +#define SLOT_DOES_NOT_EXIST 0xF +} __attribute__((packed)) ib_iou_info_t; +typedef struct { + __be64 ioc_guid; + __be32 vend_id; + __be32 dev_id; + __be16 dev_ver; + __be16 resv2; + 
__be32 subsys_vend_id; + __be32 subsys_id; + __be16 io_class; + __be16 io_subclass; + __be16 protocol; + __be16 protocol_ver; + __be32 resv3; + __be16 send_msg_depth; + uint8_t resv4; + uint8_t rdma_read_depth; + __be32 send_msg_size; + __be32 rdma_size; + uint8_t ctrl_ops_cap; +#define CTRL_OPS_CAP_ST 0x01 +#define CTRL_OPS_CAP_SF 0x02 +#define CTRL_OPS_CAP_RT 0x04 +#define CTRL_OPS_CAP_RF 0x08 +#define CTRL_OPS_CAP_WT 0x10 +#define CTRL_OPS_CAP_WF 0x20 +#define CTRL_OPS_CAP_AT 0x40 +#define CTRL_OPS_CAP_AF 0x80 + uint8_t resv5; + uint8_t num_svc_entries; +#define MAX_NUM_SVC_ENTRIES 0xff + uint8_t resv6[9]; +#define CTRL_ID_STRING_LEN 64 + char id_string[CTRL_ID_STRING_LEN]; +} __attribute__((packed)) ib_ioc_profile_t; +typedef struct { +#define MAX_SVC_ENTRY_NAME_LEN 40 + char name[MAX_SVC_ENTRY_NAME_LEN]; + __be64 id; +} __attribute__((packed)) ib_svc_entry_t; +typedef struct { +#define SVC_ENTRY_COUNT 4 + ib_svc_entry_t service_entry[SVC_ENTRY_COUNT]; +} __attribute__((packed)) ib_svc_entries_t; +typedef struct { + __be64 module_guid; + __be64 iou_guid; + ib_ioc_profile_t ioc_profile; + __be64 access_key; + uint16_t initiators_conf; + uint8_t resv[38]; +} __attribute__((packed)) ib_ioc_info_t; +typedef struct { + bool cm; + bool snmp; + bool dev_mgmt; + bool vend; + bool sm; + bool sm_disable; + bool qkey_ctr; + bool pkey_ctr; + bool notice; + bool trap; + bool apm; + bool slmap; + bool pkey_nvram; + bool mkey_nvram; + bool sysguid; + bool dr_notice; + bool boot_mgmt; + bool capm_notice; + bool reinit; + bool ledinfo; + bool port_active; +} ib_port_cap_t; +#define IB_INIT_TYPE_NO_LOAD 0x01 +#define IB_INIT_TYPE_PRESERVE_CONTENT 0x02 +#define IB_INIT_TYPE_PRESERVE_PRESENCE 0x04 +#define IB_INIT_TYPE_DO_NOT_RESUSCITATE 0x08 +typedef struct { + uint8_t port_num; + uint8_t sl; + __be16 dlid; + bool grh_valid; + ib_grh_t grh; + uint8_t static_rate; + uint8_t path_bits; + struct _av_conn { + uint8_t path_mtu; + uint8_t local_ack_timeout; + uint8_t seq_err_retry_cnt; + uint8_t rnr_retry_cnt; + } conn; +} ib_av_attr_t; +#define IB_AC_RDMA_READ 0x00000001 +#define IB_AC_RDMA_WRITE 0x00000002 +#define IB_AC_ATOMIC 0x00000004 +#define IB_AC_LOCAL_WRITE 0x00000008 +#define IB_AC_MW_BIND 0x00000010 +#define IB_QPS_RESET 0x00000001 +#define IB_QPS_INIT 0x00000002 +#define IB_QPS_RTR 0x00000004 +#define IB_QPS_RTS 0x00000008 +#define IB_QPS_SQD 0x00000010 +#define IB_QPS_SQD_DRAINING 0x00000030 +#define IB_QPS_SQD_DRAINED 0x00000050 +#define IB_QPS_SQERR 0x00000080 +#define IB_QPS_ERROR 0x00000100 +#define IB_QPS_TIME_WAIT 0xDEAD0000 +#define IB_MOD_QP_ALTERNATE_AV 0x00000001 +#define IB_MOD_QP_PKEY 0x00000002 +#define IB_MOD_QP_APM_STATE 0x00000004 +#define IB_MOD_QP_PRIMARY_AV 0x00000008 +#define IB_MOD_QP_RNR_NAK_TIMEOUT 0x00000010 +#define IB_MOD_QP_RESP_RES 0x00000020 +#define IB_MOD_QP_INIT_DEPTH 0x00000040 +#define IB_MOD_QP_PRIMARY_PORT 0x00000080 +#define IB_MOD_QP_ACCESS_CTRL 0x00000100 +#define IB_MOD_QP_QKEY 0x00000200 +#define IB_MOD_QP_SQ_DEPTH 0x00000400 +#define IB_MOD_QP_RQ_DEPTH 0x00000800 +#define IB_MOD_QP_CURRENT_STATE 0x00001000 +#define IB_MOD_QP_RETRY_CNT 0x00002000 +#define IB_MOD_QP_LOCAL_ACK_TIMEOUT 0x00004000 +#define IB_MOD_QP_RNR_RETRY_CNT 0x00008000 +#define IB_MOD_EEC_ALTERNATE_AV 0x00000001 +#define IB_MOD_EEC_PKEY 0x00000002 +#define IB_MOD_EEC_APM_STATE 0x00000004 +#define IB_MOD_EEC_PRIMARY_AV 0x00000008 +#define IB_MOD_EEC_RNR 0x00000010 +#define IB_MOD_EEC_RESP_RES 0x00000020 +#define IB_MOD_EEC_OUTSTANDING 0x00000040 +#define IB_MOD_EEC_PRIMARY_PORT 0x00000080 
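+/*
+ * Usage sketch (illustrative only): the IB_MOD_QP_* and IB_MOD_EEC_*
+ * values above are single-bit masks, so a modify request describes
+ * which attributes it touches by OR-ing masks together, and a consumer
+ * tests for a given attribute with a bitwise AND. The qp_mod_mask
+ * variable and handle_qkey_change() helper below are hypothetical,
+ * named only for this example:
+ *
+ *	uint32_t qp_mod_mask = IB_MOD_QP_PRIMARY_AV |
+ *			       IB_MOD_QP_ACCESS_CTRL | IB_MOD_QP_QKEY;
+ *
+ *	if (qp_mod_mask & IB_MOD_QP_QKEY)
+ *		handle_qkey_change();
+ */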
+#define IB_SEND_OPT_IMMEDIATE 0x00000001 +#define IB_SEND_OPT_FENCE 0x00000002 +#define IB_SEND_OPT_SIGNALED 0x00000004 +#define IB_SEND_OPT_SOLICITED 0x00000008 +#define IB_SEND_OPT_INLINE 0x00000010 +#define IB_SEND_OPT_LOCAL 0x00000020 +#define IB_SEND_OPT_VEND_MASK 0xFFFF0000 +#define IB_RECV_OPT_IMMEDIATE 0x00000001 +#define IB_RECV_OPT_FORWARD 0x00000002 +#define IB_RECV_OPT_GRH_VALID 0x00000004 +#define IB_RECV_OPT_VEND_MASK 0xFFFF0000 +#define IB_CA_MOD_IS_CM_SUPPORTED 0x00000001 +#define IB_CA_MOD_IS_SNMP_SUPPORTED 0x00000002 +#define IB_CA_MOD_IS_DEV_MGMT_SUPPORTED 0x00000004 +#define IB_CA_MOD_IS_VEND_SUPPORTED 0x00000008 +#define IB_CA_MOD_IS_SM 0x00000010 +#define IB_CA_MOD_IS_SM_DISABLED 0x00000020 +#define IB_CA_MOD_QKEY_CTR 0x00000040 +#define IB_CA_MOD_PKEY_CTR 0x00000080 +#define IB_CA_MOD_IS_NOTICE_SUPPORTED 0x00000100 +#define IB_CA_MOD_IS_TRAP_SUPPORTED 0x00000200 +#define IB_CA_MOD_IS_APM_SUPPORTED 0x00000400 +#define IB_CA_MOD_IS_SLMAP_SUPPORTED 0x00000800 +#define IB_CA_MOD_IS_PKEY_NVRAM_SUPPORTED 0x00001000 +#define IB_CA_MOD_IS_MKEY_NVRAM_SUPPORTED 0x00002000 +#define IB_CA_MOD_IS_SYSGUID_SUPPORTED 0x00004000 +#define IB_CA_MOD_IS_DR_NOTICE_SUPPORTED 0x00008000 +#define IB_CA_MOD_IS_BOOT_MGMT_SUPPORTED 0x00010000 +#define IB_CA_MOD_IS_CAPM_NOTICE_SUPPORTED 0x00020000 +#define IB_CA_MOD_IS_REINIT_SUPORTED 0x00040000 +#define IB_CA_MOD_IS_LEDINFO_SUPPORTED 0x00080000 +#define IB_CA_MOD_SHUTDOWN_PORT 0x00100000 +#define IB_CA_MOD_INIT_TYPE_VALUE 0x00200000 +#define IB_CA_MOD_SYSTEM_IMAGE_GUID 0x00400000 +#define IB_MR_MOD_ADDR 0x00000001 +#define IB_MR_MOD_PD 0x00000002 +#define IB_MR_MOD_ACCESS 0x00000004 +#define IB_SMINFO_ATTR_MOD_HANDOVER htobe32(0x000001) +#define IB_SMINFO_ATTR_MOD_ACKNOWLEDGE htobe32(0x000002) +#define IB_SMINFO_ATTR_MOD_DISABLE htobe32(0x000003) +#define IB_SMINFO_ATTR_MOD_STANDBY htobe32(0x000004) +#define IB_SMINFO_ATTR_MOD_DISCOVER htobe32(0x000005) +#define IB_CC_LOG_DATA_SIZE 32 +#define IB_CC_MGT_DATA_SIZE 192 +typedef struct { + ib_mad_t header; + __be64 cc_key; + uint8_t log_data[IB_CC_LOG_DATA_SIZE]; + uint8_t mgt_data[IB_CC_MGT_DATA_SIZE]; +} __attribute__((packed)) ib_cc_mad_t; +typedef struct { + uint8_t cong_info; + uint8_t resv; + uint8_t ctrl_table_cap; +} __attribute__((packed)) ib_cong_info_t; +typedef struct { + __be64 cc_key; + __be16 protect_bit; + __be16 lease_period; + __be16 violations; +} __attribute__((packed)) ib_cong_key_info_t; +typedef struct { + __be16 slid; + __be16 dlid; + __be32 sl; + __be32 time_stamp; +} __attribute__((packed)) ib_cong_log_event_sw_t; +typedef struct { + __be32 local_qp_resv0; + __be32 remote_qp_sl_service_type; + __be16 remote_lid; + __be16 resv1; + __be32 time_stamp; +} __attribute__((packed)) ib_cong_log_event_ca_t; +typedef struct { + uint8_t log_type; + union _log_details { + struct _log_sw { + uint8_t cong_flags; + __be16 event_counter; + __be32 time_stamp; + uint8_t port_map[32]; + ib_cong_log_event_sw_t entry_list[15]; + } __attribute__((packed)) log_sw; + + struct _log_ca { + uint8_t cong_flags; + __be16 event_counter; + __be16 event_map; + __be16 resv; + __be32 time_stamp; + ib_cong_log_event_ca_t log_event[13]; + } __attribute__((packed)) log_ca; + + } log_details; +} __attribute__((packed)) ib_cong_log_t; +#define IB_CC_PORT_MASK_DATA_SIZE 32 +typedef struct { + __be32 control_map; + uint8_t victim_mask[IB_CC_PORT_MASK_DATA_SIZE]; + uint8_t credit_mask[IB_CC_PORT_MASK_DATA_SIZE]; + uint8_t threshold_resv; + uint8_t packet_size; + __be16 cs_threshold_resv; + __be16 
cs_return_delay; + __be16 marking_rate; +} __attribute__((packed)) ib_sw_cong_setting_t; +typedef struct { + uint8_t valid_ctrl_type_res_threshold; + uint8_t packet_size; + __be16 cong_param; +} __attribute__((packed)) ib_sw_port_cong_setting_element_t; +#define IB_CC_SW_PORT_SETTING_ELEMENTS 32 +typedef struct { + ib_sw_port_cong_setting_element_t block[IB_CC_SW_PORT_SETTING_ELEMENTS]; +} __attribute__((packed)) ib_sw_port_cong_setting_t; +typedef struct { + __be16 ccti_timer; + uint8_t ccti_increase; + uint8_t trigger_threshold; + uint8_t ccti_min; + uint8_t resv0; + __be16 resv1; +} __attribute__((packed)) ib_ca_cong_entry_t; +#define IB_CA_CONG_ENTRY_DATA_SIZE 16 +typedef struct { + __be16 port_control; + __be16 control_map; + ib_ca_cong_entry_t entry_list[IB_CA_CONG_ENTRY_DATA_SIZE]; +} __attribute__((packed)) ib_ca_cong_setting_t; +typedef struct { + __be16 shift_multiplier; +} __attribute__((packed)) ib_cc_tbl_entry_t; +#define IB_CC_TBL_ENTRY_LIST_MAX 64 +typedef struct { + __be16 ccti_limit; + __be16 resv; + ib_cc_tbl_entry_t entry_list[IB_CC_TBL_ENTRY_LIST_MAX]; +} __attribute__((packed)) ib_cc_tbl_t; +typedef struct { + __be32 value; +} __attribute__((packed)) ib_time_stamp_t; + +#define IB_PM_PC_XMIT_WAIT_SUP htobe16(1 << 12) +#define IS_PM_RSFEC_COUNTERS_SUP htobe16(1 << 14) +#define IB_PM_IS_QP1_DROP_SUP htobe16(1 << 15) +#define IB_PM_IS_ADDL_PORT_CTRS_EXT_SUP htobe32(1 << 1) +#define IB_PORT_CAP2_IS_PORT_INFO_EXT_SUPPORTED htobe16(0x0002) +#define IB_PORT_EXT_NO_FEC_MODE_ACTIVE 0 +#define IB_PORT_EXT_FIRE_CODE_FEC_MODE_ACTIVE htobe16(0x0001) +#define IB_PORT_EXT_RS_FEC_MODE_ACTIVE htobe16(0x0002) +#define IB_PORT_EXT_LOW_LATENCY_RS_FEC_MODE_ACTIVE htobe16(0x0003) +#define IB_PORT_EXT_RS_FEC2_MODE_ACTIVE htobe16(0x0004) +#define IB_PORT_EXT_CAP_IS_FEC_MODE_SUPPORTED htobe32(0x00000001) + +static inline uint32_t ib_class_cap_mask2(const ib_class_port_info_t *p_cpi) +{ + return (be32toh(p_cpi->cap_mask2_resp_time) >> IB_CLASS_CAPMASK2_SHIFT); +} + +static inline uint8_t ib_class_resp_time_val(ib_class_port_info_t *p_cpi) +{ + return (uint8_t)(be32toh(p_cpi->cap_mask2_resp_time) & + IB_CLASS_RESP_TIME_MASK); +} + +static inline const char *ib_get_node_type_str(uint8_t node_type) +{ + static const char *const __ib_node_type_str[] = { + "UNKNOWN", + "Channel Adapter", + "Switch", + "Router", + }; + + if (node_type > IB_NODE_TYPE_ROUTER) + node_type = 0; + return (__ib_node_type_str[node_type]); +} + +static inline __be32 ib_inform_info_get_prod_type(const ib_inform_info_t *p_inf) +{ + uint32_t nt; + + nt = be16toh(p_inf->g_or_v.generic.node_type_lsb) | + (p_inf->g_or_v.generic.node_type_msb << 16); + return htobe32(nt); +} + +static inline void +ib_inform_info_get_qpn_resp_time(const __be32 qpn_resp_time_val, __be32 *p_qpn, + uint8_t *p_resp_time_val) +{ + uint32_t tmp = be32toh(qpn_resp_time_val); + + if (p_qpn) + *p_qpn = htobe32((tmp & 0xffffff00) >> 8); + if (p_resp_time_val) + *p_resp_time_val = (uint8_t)(tmp & 0x0000001f); +} + +static inline void ib_member_get_scope_state(const uint8_t scope_state, + uint8_t *p_scope, uint8_t *p_state) +{ + uint8_t tmp_scope_state; + + if (p_state) + *p_state = (uint8_t)(scope_state & 0x0f); + + tmp_scope_state = scope_state >> 4; + + if (p_scope) + *p_scope = (uint8_t)(tmp_scope_state & 0x0f); +} + +static inline void ib_member_get_sl_flow_hop(const __be32 sl_flow_hop, + uint8_t *p_sl, + uint32_t *p_flow_lbl, + uint8_t *p_hop) +{ + uint32_t tmp; + + tmp = be32toh(sl_flow_hop); + if (p_hop) + *p_hop = (uint8_t)tmp; + tmp >>= 8; + + if 
(p_flow_lbl) + *p_flow_lbl = (uint32_t)(tmp & 0xfffff); + tmp >>= 20; + + if (p_sl) + *p_sl = (uint8_t)tmp; +} + +static inline __be32 ib_member_set_sl_flow_hop(const uint8_t sl, + const uint32_t flow_label, + const uint8_t hop_limit) +{ + uint32_t tmp; + + tmp = (sl << 28) | ((flow_label & 0xfffff) << 8) | hop_limit; + return htobe32(tmp); +} + +static inline __be32 ib_node_info_get_vendor_id(const ib_node_info_t *p_ni) +{ + return ((__be32)(p_ni->port_num_vendor_id & IB_NODE_INFO_VEND_ID_MASK)); +} + +static inline uint8_t +ib_node_info_get_local_port_num(const ib_node_info_t *p_ni) +{ + return be32toh(p_ni->port_num_vendor_id & IB_NODE_INFO_PORT_NUM_MASK) >> + 24; +} + +static inline uint16_t ib_path_rec_qos_class(const ib_path_rec_t *p_rec) +{ + return (be16toh(p_rec->qos_class_sl) >> 4); +} + +static inline void ib_path_rec_set_qos_class(ib_path_rec_t *p_rec, + const uint16_t qos_class) +{ + p_rec->qos_class_sl = + (p_rec->qos_class_sl & htobe16(IB_PATH_REC_SL_MASK)) | + htobe16(qos_class << 4); +} + +static inline uint8_t ib_path_rec_sl(const ib_path_rec_t *p_rec) +{ + return (uint8_t)(be16toh(p_rec->qos_class_sl) & IB_PATH_REC_SL_MASK); +} + +static inline uint8_t ib_slvl_table_get(const ib_slvl_table_t *p_slvl_tbl, + uint8_t sl_index) +{ + uint8_t idx = sl_index / 2; + assert(sl_index <= 15); + + if (sl_index % 2) + /* this is an odd sl. Need to return the ls bits. */ + return (p_slvl_tbl->raw_vl_by_sl[idx] & 0x0F); + else + /* this is an even sl. Need to return the ms bits. */ + return ((p_slvl_tbl->raw_vl_by_sl[idx] & 0xF0) >> 4); +} + +static inline uint8_t ib_sminfo_get_priority(const ib_sm_info_t *p_smi) +{ + return ((uint8_t)((p_smi->pri_state & 0xF0) >> 4)); +} + +static inline uint8_t ib_sminfo_get_state(const ib_sm_info_t *p_smi) +{ + return ((uint8_t)(p_smi->pri_state & 0x0F)); +} + +#endif diff --git a/libibmad/libibmad.map b/libibmad/libibmad.map new file mode 100644 index 0000000..e08adf0 --- /dev/null +++ b/libibmad/libibmad.map @@ -0,0 +1,156 @@ +IBMAD_1.3 { + global: + xdump; + mad_dump_field; + mad_dump_val; + mad_print_field; + mad_dump_array; + mad_dump_bitfield; + mad_dump_hex; + mad_dump_int; + mad_dump_linkdowndefstate; + mad_dump_linkspeed; + mad_dump_linkspeeden; + mad_dump_linkspeedsup; + mad_dump_linkspeedext; + mad_dump_linkspeedexten; + mad_dump_linkspeedextsup; + mad_dump_linkwidth; + mad_dump_linkwidthen; + mad_dump_linkwidthsup; + mad_dump_mlnx_ext_port_info; + mad_dump_portinfo_ext; + mad_dump_mtu; + mad_dump_node_type; + mad_dump_nodedesc; + mad_dump_nodeinfo; + mad_dump_opervls; + mad_dump_fields; + mad_dump_perfcounters; + mad_dump_perfcounters_ext; + mad_dump_perfcounters_xmt_sl; + mad_dump_perfcounters_rcv_sl; + mad_dump_perfcounters_xmt_disc; + mad_dump_perfcounters_rcv_err; + mad_dump_physportstate; + mad_dump_portcapmask; + mad_dump_portcapmask2; + mad_dump_portinfo; + mad_dump_portsamples_control; + mad_dump_portsamples_result; + mad_dump_perfcounters_port_op_rcv_counters; + mad_dump_perfcounters_port_flow_ctl_counters; + mad_dump_perfcounters_port_vl_op_packet; + mad_dump_perfcounters_port_vl_op_data; + mad_dump_perfcounters_port_vl_xmit_flow_ctl_update_errors; + mad_dump_perfcounters_port_vl_xmit_wait_counters; + mad_dump_perfcounters_sw_port_vl_congestion; + mad_dump_perfcounters_rcv_con_ctrl; + mad_dump_perfcounters_sl_rcv_fecn; + mad_dump_perfcounters_sl_rcv_becn; + mad_dump_perfcounters_xmit_con_ctrl; + mad_dump_perfcounters_vl_xmit_time_cong; + mad_dump_cc_congestioninfo; + mad_dump_cc_congestionkeyinfo; + mad_dump_cc_congestionlog; + 
mad_dump_cc_congestionlogswitch; + mad_dump_cc_congestionlogentryswitch; + mad_dump_cc_congestionlogca; + mad_dump_cc_congestionlogentryca; + mad_dump_cc_switchcongestionsetting; + mad_dump_cc_switchportcongestionsettingelement; + mad_dump_cc_cacongestionsetting; + mad_dump_cc_cacongestionentry; + mad_dump_cc_congestioncontroltable; + mad_dump_cc_congestioncontroltableentry; + mad_dump_cc_timestamp; + mad_dump_classportinfo; + mad_dump_portstates; + mad_dump_portstate; + mad_dump_rhex; + mad_dump_sltovl; + mad_dump_string; + mad_dump_switchinfo; + mad_dump_uint; + mad_dump_vlarbitration; + mad_dump_vlcap; + mad_get_field; + mad_set_field; + mad_get_field64; + mad_set_field64; + mad_get_array; + mad_set_array; + pma_query_via; + performance_reset_via; + mad_build_pkt; + mad_decode_field; + mad_encode; + mad_encode_field; + mad_trid; + portid2portnum; + portid2str; + str2drpath; + drpath2str; + mad_class_agent; + mad_register_client; + mad_register_server; + mad_register_client_via; + mad_register_server_via; + ib_resolve_portid_str; + ib_resolve_self; + ib_resolve_smlid; + ibdebug; + mad_rpc_open_port; + mad_rpc_close_port; + mad_rpc; + mad_rpc_rmpp; + mad_rpc_portid; + mad_rpc_class_agent; + mad_rpc_set_retries; + mad_rpc_set_timeout; + mad_get_timeout; + mad_get_retries; + madrpc; + madrpc_init; + madrpc_portid; + madrpc_rmpp; + madrpc_save_mad; + madrpc_set_retries; + madrpc_set_timeout; + madrpc_show_errors; + ib_path_query; + sa_call; + sa_rpc_call; + mad_alloc; + mad_free; + mad_receive; + mad_respond; + mad_receive_via; + mad_respond_via; + mad_send; + mad_send_via; + smp_query; + smp_set; + ib_vendor_call; + ib_vendor_call_via; + smp_query_via; + smp_query_status_via; + smp_set_via; + smp_set_status_via; + ib_path_query_via; + ib_resolve_smlid_via; + ib_resolve_guid_via; + ib_resolve_gid_via; + ib_resolve_portid_str_via; + ib_resolve_self_via; + mad_field_name; + bm_call_via; + mad_dump_port_ext_speeds_counters; + mad_dump_port_ext_speeds_counters_rsfec_active; + cc_query_status_via; + cc_config_status_via; + smp_mkey_get; + smp_mkey_set; + ib_node_query_via; + local: *; +}; diff --git a/libibmad/mad.c b/libibmad/mad.c new file mode 100644 index 0000000..d222264 --- /dev/null +++ b/libibmad/mad.c @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * Copyright (c) 2009 HNR Consulting. All rights reserved. + * Copyright (c) 2011 Mellanox Technologies LTD. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <errno.h> + +#include <infiniband/umad.h> +#include <infiniband/mad.h> + +#include "mad_internal.h" + +#undef DEBUG +#define DEBUG if (ibdebug) IBWARN + +#define GET_IB_USERLAND_TID(tid) (tid & 0x00000000ffffffff) +/* + * Generate the 64-bit MAD transaction ID. The upper 32 bits are reserved for + * use by the kernel. We clear the upper 32 bits here, but MADs received from + * the kernel may contain kernel-specific data in these bits; consequently, + * userland TID matching should only be done on the lower 32 bits. + */ +uint64_t mad_trid(void) +{ + static uint64_t trid; + uint64_t next; + + if (!trid) { + srandom((int)time(NULL) * getpid()); + trid = random(); + } + next = ++trid; + next = GET_IB_USERLAND_TID(next); + return next; +} + +int mad_get_timeout(const struct ibmad_port *srcport, int override_ms) +{ + return (override_ms ? override_ms : + srcport->timeout ? srcport->timeout : madrpc_timeout); +} + +int mad_get_retries(const struct ibmad_port *srcport) +{ + return (srcport->retries ? srcport->retries : madrpc_retries); +} + +void *mad_encode(void *buf, ib_rpc_t * rpc, ib_dr_path_t * drpath, void *data) +{ + int is_resp = rpc->method & IB_MAD_RESPONSE; + int mgtclass; + + /* first word */ + mad_set_field(buf, 0, IB_MAD_METHOD_F, rpc->method); + mad_set_field(buf, 0, IB_MAD_RESPONSE_F, is_resp ? 1 : 0); + mgtclass = rpc->mgtclass & 0xff; + if (mgtclass == IB_SA_CLASS || mgtclass == IB_CC_CLASS) + mad_set_field(buf, 0, IB_MAD_CLASSVER_F, 2); + else + mad_set_field(buf, 0, IB_MAD_CLASSVER_F, 1); + mad_set_field(buf, 0, IB_MAD_MGMTCLASS_F, rpc->mgtclass & 0xff); + mad_set_field(buf, 0, IB_MAD_BASEVER_F, 1); + + /* second word */ + if ((rpc->mgtclass & 0xff) == IB_SMI_DIRECT_CLASS) { + if (!drpath) { + IBWARN("encoding dr mad without drpath (null)"); + errno = EINVAL; + return NULL; + } + if (drpath->cnt >= IB_SUBNET_PATH_HOPS_MAX) { + IBWARN("dr path with hop count %d", drpath->cnt); + errno = EINVAL; + return NULL; + } + mad_set_field(buf, 0, IB_DRSMP_HOPCNT_F, drpath->cnt); + mad_set_field(buf, 0, IB_DRSMP_HOPPTR_F, + is_resp ? drpath->cnt + 1 : 0x0); + mad_set_field(buf, 0, IB_DRSMP_STATUS_F, rpc->rstatus); + mad_set_field(buf, 0, IB_DRSMP_DIRECTION_F, is_resp ? 1 : 0); /* out */ + } else + mad_set_field(buf, 0, IB_MAD_STATUS_F, rpc->rstatus); + + /* words 3,4,5,6 */ + if (!rpc->trid) + rpc->trid = mad_trid(); + + mad_set_field64(buf, 0, IB_MAD_TRID_F, rpc->trid); + mad_set_field(buf, 0, IB_MAD_ATTRID_F, rpc->attr.id); + mad_set_field(buf, 0, IB_MAD_ATTRMOD_F, rpc->attr.mod); + + /* words 7,8 */ + mad_set_field64(buf, 0, IB_MAD_MKEY_F, rpc->mkey); + + if ((rpc->mgtclass & 0xff) == IB_SMI_DIRECT_CLASS) { + /* word 9 */ + mad_set_field(buf, 0, IB_DRSMP_DRDLID_F, + drpath->drdlid ? drpath->drdlid : 0xffff); + mad_set_field(buf, 0, IB_DRSMP_DRSLID_F, + drpath->drslid ? 
drpath->drslid : 0xffff); + + /* bytes 128 - 255 - by default should be zero due to memset */ + if (is_resp) + mad_set_array(buf, 0, IB_DRSMP_RPATH_F, drpath->p); + else + mad_set_array(buf, 0, IB_DRSMP_PATH_F, drpath->p); + } + + if ((rpc->mgtclass & 0xff) == IB_SA_CLASS) + mad_set_field64(buf, 0, IB_SA_COMPMASK_F, rpc->mask); + + if ((rpc->mgtclass & 0xff) == IB_CC_CLASS) { + ib_rpc_cc_t *rpccc = (ib_rpc_cc_t *)rpc; + mad_set_field64(buf, 0, IB_CC_CCKEY_F, rpccc->cckey); + } + + if (data) + memcpy((char *)buf + rpc->dataoffs, data, rpc->datasz); + + /* vendor mads range 2 */ + if (mad_is_vendor_range2(rpc->mgtclass & 0xff)) + mad_set_field(buf, 0, IB_VEND2_OUI_F, rpc->oui); + + return (uint8_t *) buf + IB_MAD_SIZE; +} + +int mad_build_pkt(void *umad, ib_rpc_t * rpc, ib_portid_t * dport, + ib_rmpp_hdr_t * rmpp, void *data) +{ + uint8_t *p, *mad; + int lid_routed = (rpc->mgtclass & 0xff) != IB_SMI_DIRECT_CLASS; + int is_smi = ((rpc->mgtclass & 0xff) == IB_SMI_CLASS || + (rpc->mgtclass & 0xff) == IB_SMI_DIRECT_CLASS); + struct ib_mad_addr addr; + + if (!is_smi) + umad_set_addr(umad, dport->lid, dport->qp, dport->sl, + dport->qkey); + else if (lid_routed) + umad_set_addr(umad, dport->lid, dport->qp, 0, 0); + else if ((dport->drpath.drslid != 0xffff) && (dport->lid > 0)) + umad_set_addr(umad, dport->lid, 0, 0, 0); + else + umad_set_addr(umad, 0xffff, 0, 0, 0); + + if (dport->grh_present && !is_smi) { + addr.grh_present = 1; + memcpy(addr.gid, dport->gid, 16); + addr.hop_limit = 0xff; + addr.traffic_class = 0; + addr.flow_label = 0; + umad_set_grh(umad, &addr); + } else + umad_set_grh(umad, NULL); + umad_set_pkey(umad, is_smi ? 0 : dport->pkey_idx); + + mad = umad_get_mad(umad); + p = mad_encode(mad, rpc, lid_routed ? NULL : &dport->drpath, data); + if (!p) + return -1; + + if (!is_smi && rmpp) { + mad_set_field(mad, 0, IB_SA_RMPP_VERS_F, 1); + mad_set_field(mad, 0, IB_SA_RMPP_TYPE_F, rmpp->type); + mad_set_field(mad, 0, IB_SA_RMPP_RESP_F, 0x3f); + mad_set_field(mad, 0, IB_SA_RMPP_FLAGS_F, rmpp->flags); + mad_set_field(mad, 0, IB_SA_RMPP_STATUS_F, rmpp->status); + mad_set_field(mad, 0, IB_SA_RMPP_D1_F, rmpp->d1.u); + mad_set_field(mad, 0, IB_SA_RMPP_D2_F, rmpp->d2.u); + } + + return ((int)(p - mad)); +} diff --git a/libibmad/mad.h b/libibmad/mad.h new file mode 100644 index 0000000..0945c03 --- /dev/null +++ b/libibmad/mad.h @@ -0,0 +1,1712 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * Copyright (c) 2009 HNR Consulting. All rights reserved. + * Copyright (c) 2009-2011 Mellanox Technologies LTD. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#ifndef _MAD_H_ +#define _MAD_H_ + +#include <stdint.h> +#include <string.h> +#include <stdlib.h> +#include <stdio.h> +#include <sys/types.h> +#include <unistd.h> +#include <byteswap.h> +#include <inttypes.h> +#include <arpa/inet.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define IB_MAD_RPC_VERSION_MASK 0x0f00 +#define IB_MAD_RPC_VERSION1 (1<<8) + +#define IB_SUBNET_PATH_HOPS_MAX 64 +#define IB_DEFAULT_SUBN_PREFIX 0xfe80000000000000ULL +#define IB_DEFAULT_QP1_QKEY 0x80010000 +#define IB_MAD_SIZE 256 +#define IB_SMP_DATA_OFFS 64 +#define IB_SMP_DATA_SIZE 64 +#define IB_VENDOR_RANGE1_DATA_OFFS 24 +#define IB_VENDOR_RANGE1_DATA_SIZE (IB_MAD_SIZE - IB_VENDOR_RANGE1_DATA_OFFS) +#define IB_VENDOR_RANGE2_DATA_OFFS 40 +#define IB_VENDOR_RANGE2_DATA_SIZE (IB_MAD_SIZE - IB_VENDOR_RANGE2_DATA_OFFS) +#define IB_SA_DATA_SIZE 200 +#define IB_SA_DATA_OFFS 56 +#define IB_PC_DATA_OFFS 64 +#define IB_PC_DATA_SZ (IB_MAD_SIZE - IB_PC_DATA_OFFS) +#define IB_SA_MCM_RECSZ 53 +#define IB_SA_PR_RECSZ 64 +#define IB_SA_NR_RECSZ 108 +#define IB_SA_GIR_RECSZ 72 +#define IB_BM_DATA_OFFS 64 +#define IB_BM_DATA_SZ (IB_MAD_SIZE - IB_BM_DATA_OFFS) +#define IB_BM_BKEY_OFFS 24 +#define IB_BM_BKEY_AND_DATA_SZ (IB_MAD_SIZE - IB_BM_BKEY_OFFS) +#define IB_CC_DATA_OFFS 64 +#define IB_CC_DATA_SZ (IB_MAD_SIZE - IB_CC_DATA_OFFS) +#define IB_CC_LOG_DATA_OFFS 32 +#define IB_CC_LOG_DATA_SZ (IB_MAD_SIZE - IB_CC_LOG_DATA_OFFS) + +enum MAD_CLASSES { + IB_SMI_CLASS = 0x1, + IB_SMI_DIRECT_CLASS = 0x81, + IB_SA_CLASS = 0x3, + IB_PERFORMANCE_CLASS = 0x4, + IB_BOARD_MGMT_CLASS = 0x5, + IB_DEVICE_MGMT_CLASS = 0x6, + IB_CM_CLASS = 0x7, + IB_SNMP_CLASS = 0x8, + IB_VENDOR_RANGE1_START_CLASS = 0x9, + IB_VENDOR_RANGE1_END_CLASS = 0x0f, + IB_CC_CLASS = 0x21, + IB_VENDOR_RANGE2_START_CLASS = 0x30, + IB_VENDOR_RANGE2_END_CLASS = 0x4f, +}; + +enum MAD_METHODS { + IB_MAD_METHOD_GET = 0x1, + IB_MAD_METHOD_SET = 0x2, + IB_MAD_METHOD_GET_RESPONSE = 0x81, + + IB_MAD_METHOD_SEND = 0x3, + IB_MAD_METHOD_TRAP = 0x5, + IB_MAD_METHOD_TRAP_REPRESS = 0x7, + + IB_MAD_METHOD_REPORT = 0x6, + IB_MAD_METHOD_REPORT_RESPONSE = 0x86, + IB_MAD_METHOD_GET_TABLE = 0x12, + IB_MAD_METHOD_GET_TABLE_RESPONSE = 0x92, + IB_MAD_METHOD_GET_TRACE_TABLE = 0x13, + IB_MAD_METHOD_GET_TRACE_TABLE_RESPONSE = 0x93, + IB_MAD_METHOD_GETMULTI = 0x14, + IB_MAD_METHOD_GETMULTI_RESPONSE = 0x94, + IB_MAD_METHOD_DELETE = 0x15, + IB_MAD_METHOD_DELETE_RESPONSE = 0x95, + + IB_MAD_RESPONSE = 0x80, +}; + +enum MAD_ATTR_ID { + CLASS_PORT_INFO = 0x1, + NOTICE = 0x2, + INFORM_INFO = 0x3, +}; + +enum MAD_STATUS { + IB_MAD_STS_OK = (0 << 2), + IB_MAD_STS_BUSY = (1 << 0), + IB_MAD_STS_REDIRECT = (1 << 1), + IB_MAD_STS_BAD_BASE_VER_OR_CLASS = (1 << 2), + IB_MAD_STS_METHOD_NOT_SUPPORTED = (2 << 2), + IB_MAD_STS_METHOD_ATTR_NOT_SUPPORTED = (3 << 2), + IB_MAD_STS_INV_ATTR_VALUE = (7 << 2), +}; + +enum SMI_ATTR_ID { + IB_ATTR_NODE_DESC = 0x10, + IB_ATTR_NODE_INFO = 0x11, + IB_ATTR_SWITCH_INFO = 0x12, + IB_ATTR_GUID_INFO = 0x14, + IB_ATTR_PORT_INFO = 0x15, + IB_ATTR_PKEY_TBL = 0x16, + IB_ATTR_SLVL_TABLE = 0x17, 
+ IB_ATTR_VL_ARBITRATION = 0x18, + IB_ATTR_LINEARFORWTBL = 0x19, + IB_ATTR_MULTICASTFORWTBL = 0x1b, + IB_ATTR_LINKSPEEDWIDTHPAIRSTBL = 0x1c, + IB_ATTR_VENDORMADSTBL = 0x1d, + IB_ATTR_SMINFO = 0x20, + IB_ATTR_PORT_INFO_EXT = 0x33, + IB_ATTR_LAST, + + IB_ATTR_MLNX_EXT_PORT_INFO = 0xff90, +}; + +enum SA_ATTR_ID { + IB_SA_ATTR_NOTICE = 0x02, + IB_SA_ATTR_INFORMINFO = 0x03, + IB_SA_ATTR_NODERECORD = 0x11, + IB_SA_ATTR_PORTINFORECORD = 0x12, + IB_SA_ATTR_SL2VLTABLERECORD = 0x13, + IB_SA_ATTR_SWITCHINFORECORD = 0x14, + IB_SA_ATTR_LFTRECORD = 0x15, + IB_SA_ATTR_RFTRECORD = 0x16, + IB_SA_ATTR_MFTRECORD = 0x17, + IB_SA_ATTR_SMINFORECORD = 0x18, + IB_SA_ATTR_LINKRECORD = 0x20, + IB_SA_ATTR_GUIDINFORECORD = 0x30, + IB_SA_ATTR_SERVICERECORD = 0x31, + IB_SA_ATTR_PKEYTABLERECORD = 0x33, + IB_SA_ATTR_PATHRECORD = 0x35, + IB_SA_ATTR_VLARBTABLERECORD = 0x36, + IB_SA_ATTR_MCRECORD = 0x38, + IB_SA_ATTR_MULTIPATH = 0x3a, + IB_SA_ATTR_INFORMINFORECORD = 0xf3, + + IB_SA_ATTR_LAST +}; + +enum GSI_ATTR_ID { + IB_GSI_PORT_SAMPLES_CONTROL = 0x10, + IB_GSI_PORT_SAMPLES_RESULT = 0x11, + IB_GSI_PORT_COUNTERS = 0x12, + IB_GSI_PORT_RCV_ERROR_DETAILS = 0x15, + IB_GSI_PORT_XMIT_DISCARD_DETAILS = 0x16, + IB_GSI_PORT_PORT_OP_RCV_COUNTERS = 0x17, + IB_GSI_PORT_PORT_FLOW_CTL_COUNTERS = 0x18, + IB_GSI_PORT_PORT_VL_OP_PACKETS = 0x19, + IB_GSI_PORT_PORT_VL_OP_DATA = 0x1A, + IB_GSI_PORT_PORT_VL_XMIT_FLOW_CTL_UPDATE_ERRORS = 0x1B, + IB_GSI_PORT_PORT_VL_XMIT_WAIT_COUNTERS = 0x1C, + IB_GSI_PORT_COUNTERS_EXT = 0x1D, + IB_GSI_PORT_EXT_SPEEDS_COUNTERS = 0x1F, + IB_GSI_SW_PORT_VL_CONGESTION = 0x30, + IB_GSI_PORT_RCV_CON_CTRL = 0x31, + IB_GSI_PORT_SL_RCV_FECN = 0x32, + IB_GSI_PORT_SL_RCV_BECN = 0x33, + IB_GSI_PORT_XMIT_CON_CTRL = 0x34, + IB_GSI_PORT_VL_XMIT_TIME_CONG = 0x35, + IB_GSI_PORT_XMIT_DATA_SL = 0x36, + IB_GSI_PORT_RCV_DATA_SL = 0x37, + IB_GSI_ATTR_LAST +}; + +enum BM_ATTR_ID { + IB_BM_ATTR_BKEYINFO = 0x10, + IB_BM_ATTR_WRITE_VPD = 0x20, + IB_BM_ATTR_READ_VPD = 0x21, + IB_BM_ATTR_RESET_IBML = 0x22, + IB_BM_ATTR_SET_MODULE_PM_CONTROL = 0x23, + IB_BM_ATTR_GET_MODULE_PM_CONTROL = 0x24, + IB_BM_ATTR_SET_UNIT_PM_CONTROL = 0x25, + IB_BM_ATTR_GET_UNIT_PM_CONTROL = 0x26, + IB_BM_ATTR_SET_IOC_PM_CONTROL = 0x27, + IB_BM_ATTR_GET_IOC_PM_CONTROL = 0x28, + IB_BM_ATTR_SET_MODULE_STATE = 0x29, + IB_BM_ATTR_SET_MODULE_ATTENTION = 0x2A, + IB_BM_ATTR_GET_MODULE_STATUS = 0x2B, + IB_BM_ATTR_IB2IBML = 0x2C, + IB_BM_ATTR_IB2CME = 0x2D, + IB_BM_ATTR_IB2MME = 0x2E, + IB_BM_ATTR_OEM = 0x2F, + + IB_BM_ATTR_LAST +}; + +enum CC_ATTRI_ID { + IB_CC_ATTR_CONGESTION_INFO = 0x11, + IB_CC_ATTR_CONGESTION_KEY_INFO = 0x12, + IB_CC_ATTR_CONGESTION_LOG = 0x13, + IB_CC_ATTR_SWITCH_CONGESTION_SETTING = 0x14, + IB_CC_ATTR_SWITCH_PORT_CONGESTION_SETTING = 0x15, + IB_CC_ATTR_CA_CONGESTION_SETTING = 0x16, + IB_CC_ATTR_CONGESTION_CONTROL_TABLE = 0x17, + IB_CC_ATTR_TIMESTAMP = 0x18, +}; + +#define IB_VENDOR_OPENIB_PING_CLASS (IB_VENDOR_RANGE2_START_CLASS + 2) +#define IB_VENDOR_OPENIB_SYSSTAT_CLASS (IB_VENDOR_RANGE2_START_CLASS + 3) +#define IB_OPENIB_OUI (0x001405) + +typedef uint8_t ibmad_gid_t[16]; +#ifdef USE_DEPRECATED_IB_GID_T +typedef ibmad_gid_t ib_gid_t __attribute__ ((deprecated)); +#endif + +typedef struct { + int cnt; + uint8_t p[IB_SUBNET_PATH_HOPS_MAX]; + uint16_t drslid; + uint16_t drdlid; +} ib_dr_path_t; + +typedef struct { + unsigned id; + unsigned mod; +} ib_attr_t; + +typedef struct { + int mgtclass; + int method; + ib_attr_t attr; + uint32_t rstatus; /* return status */ + int dataoffs; + int datasz; + uint64_t mkey; + uint64_t trid; /* used for out mad if 
nonzero, return real val */ + uint64_t mask; /* for sa mads */ + unsigned recsz; /* for sa mads (attribute offset) */ + int timeout; + uint32_t oui; /* for vendor range 2 mads */ +} ib_rpc_t; + +typedef struct { + int mgtclass; + int method; + ib_attr_t attr; + uint32_t rstatus; /* return status */ + int dataoffs; + int datasz; + uint64_t mkey; + uint64_t trid; /* used for out mad if nonzero, return real val */ + uint64_t mask; /* for sa mads */ + unsigned recsz; /* for sa mads (attribute offset) */ + int timeout; + uint32_t oui; /* for vendor range 2 mads */ + int error; /* errno */ +} ib_rpc_v1_t; + +typedef struct { + int mgtclass; + int method; + ib_attr_t attr; + uint32_t rstatus; /* return status */ + int dataoffs; + int datasz; + uint64_t mkey; + uint64_t trid; /* used for out mad if nonzero, return real val */ + uint64_t mask; /* for sa mads */ + unsigned recsz; /* for sa mads (attribute offset) */ + int timeout; + uint32_t oui; /* for vendor range 2 mads */ + int error; /* errno */ + uint64_t cckey; +} ib_rpc_cc_t; + +typedef struct portid { + int lid; /* lid or 0 if directed route */ + ib_dr_path_t drpath; + int grh_present; /* flag */ + ibmad_gid_t gid; + uint32_t qp; + uint32_t qkey; + uint8_t sl; + unsigned pkey_idx; +} ib_portid_t; + +typedef void (ib_mad_dump_fn) (char *buf, int bufsz, void *val, int valsz); + +#define IB_FIELD_NAME_LEN 32 + +typedef struct ib_field { + int bitoffs; + int bitlen; + char name[IB_FIELD_NAME_LEN]; + ib_mad_dump_fn *def_dump_fn; +} ib_field_t; + +enum MAD_FIELDS { + IB_NO_FIELD, + + IB_GID_PREFIX_F, + IB_GID_GUID_F, + + /* first MAD word (0-3 bytes) */ + IB_MAD_METHOD_F, + IB_MAD_RESPONSE_F, + IB_MAD_CLASSVER_F, + IB_MAD_MGMTCLASS_F, + IB_MAD_BASEVER_F, + + /* second MAD word (4-7 bytes) */ + IB_MAD_STATUS_F, + + /* DRSMP only */ + IB_DRSMP_HOPCNT_F, + IB_DRSMP_HOPPTR_F, + IB_DRSMP_STATUS_F, + IB_DRSMP_DIRECTION_F, + + /* words 3,4,5,6 (8-23 bytes) */ + IB_MAD_TRID_F, + IB_MAD_ATTRID_F, + IB_MAD_ATTRMOD_F, + + /* word 7,8 (24-31 bytes) */ + IB_MAD_MKEY_F, + + /* word 9 (32-37 bytes) */ + IB_DRSMP_DRDLID_F, + IB_DRSMP_DRSLID_F, + + /* word 10,11 (36-43 bytes) */ + IB_SA_MKEY_F, + + /* word 12 (44-47 bytes) */ + IB_SA_ATTROFFS_F, + + /* word 13,14 (48-55 bytes) */ + IB_SA_COMPMASK_F, + + /* word 13,14 (56-255 bytes) */ + IB_SA_DATA_F, + + /* bytes 64 - 127 */ + IB_SM_DATA_F, + + /* bytes 64 - 256 */ + IB_GS_DATA_F, + + /* bytes 128 - 191 */ + IB_DRSMP_PATH_F, + + /* bytes 192 - 255 */ + IB_DRSMP_RPATH_F, + + /* + * PortInfo fields + */ + IB_PORT_FIRST_F, + IB_PORT_MKEY_F = IB_PORT_FIRST_F, + IB_PORT_GID_PREFIX_F, + IB_PORT_LID_F, + IB_PORT_SMLID_F, + IB_PORT_CAPMASK_F, + IB_PORT_DIAG_F, + IB_PORT_MKEY_LEASE_F, + IB_PORT_LOCAL_PORT_F, + IB_PORT_LINK_WIDTH_ENABLED_F, + IB_PORT_LINK_WIDTH_SUPPORTED_F, + IB_PORT_LINK_WIDTH_ACTIVE_F, + IB_PORT_LINK_SPEED_SUPPORTED_F, + IB_PORT_STATE_F, + IB_PORT_PHYS_STATE_F, + IB_PORT_LINK_DOWN_DEF_F, + IB_PORT_MKEY_PROT_BITS_F, + IB_PORT_LMC_F, + IB_PORT_LINK_SPEED_ACTIVE_F, + IB_PORT_LINK_SPEED_ENABLED_F, + IB_PORT_NEIGHBOR_MTU_F, + IB_PORT_SMSL_F, + IB_PORT_VL_CAP_F, + IB_PORT_INIT_TYPE_F, + IB_PORT_VL_HIGH_LIMIT_F, + IB_PORT_VL_ARBITRATION_HIGH_CAP_F, + IB_PORT_VL_ARBITRATION_LOW_CAP_F, + IB_PORT_INIT_TYPE_REPLY_F, + IB_PORT_MTU_CAP_F, + IB_PORT_VL_STALL_COUNT_F, + IB_PORT_HOQ_LIFE_F, + IB_PORT_OPER_VLS_F, + IB_PORT_PART_EN_INB_F, + IB_PORT_PART_EN_OUTB_F, + IB_PORT_FILTER_RAW_INB_F, + IB_PORT_FILTER_RAW_OUTB_F, + IB_PORT_MKEY_VIOL_F, + IB_PORT_PKEY_VIOL_F, + IB_PORT_QKEY_VIOL_F, + IB_PORT_GUID_CAP_F, + 
IB_PORT_CLIENT_REREG_F, + IB_PORT_MCAST_PKEY_SUPR_ENAB_F, + IB_PORT_SUBN_TIMEOUT_F, + IB_PORT_RESP_TIME_VAL_F, + IB_PORT_LOCAL_PHYS_ERR_F, + IB_PORT_OVERRUN_ERR_F, + IB_PORT_MAX_CREDIT_HINT_F, + IB_PORT_LINK_ROUND_TRIP_F, + IB_PORT_LAST_F, + + /* + * NodeInfo fields + */ + IB_NODE_FIRST_F, + IB_NODE_BASE_VERS_F = IB_NODE_FIRST_F, + IB_NODE_CLASS_VERS_F, + IB_NODE_TYPE_F, + IB_NODE_NPORTS_F, + IB_NODE_SYSTEM_GUID_F, + IB_NODE_GUID_F, + IB_NODE_PORT_GUID_F, + IB_NODE_PARTITION_CAP_F, + IB_NODE_DEVID_F, + IB_NODE_REVISION_F, + IB_NODE_LOCAL_PORT_F, + IB_NODE_VENDORID_F, + IB_NODE_LAST_F, + + /* + * SwitchInfo fields + */ + IB_SW_FIRST_F, + IB_SW_LINEAR_FDB_CAP_F = IB_SW_FIRST_F, + IB_SW_RANDOM_FDB_CAP_F, + IB_SW_MCAST_FDB_CAP_F, + IB_SW_LINEAR_FDB_TOP_F, + IB_SW_DEF_PORT_F, + IB_SW_DEF_MCAST_PRIM_F, + IB_SW_DEF_MCAST_NOT_PRIM_F, + IB_SW_LIFE_TIME_F, + IB_SW_STATE_CHANGE_F, + IB_SW_OPT_SLTOVL_MAPPING_F, + IB_SW_LIDS_PER_PORT_F, + IB_SW_PARTITION_ENFORCE_CAP_F, + IB_SW_PARTITION_ENF_INB_F, + IB_SW_PARTITION_ENF_OUTB_F, + IB_SW_FILTER_RAW_INB_F, + IB_SW_FILTER_RAW_OUTB_F, + IB_SW_ENHANCED_PORT0_F, + IB_SW_MCAST_FDB_TOP_F, + IB_SW_LAST_F, + + /* + * SwitchLinearForwardingTable fields + */ + IB_LINEAR_FORW_TBL_F, + + /* + * SwitchMulticastForwardingTable fields + */ + IB_MULTICAST_FORW_TBL_F, + + /* + * NodeDescription fields + */ + IB_NODE_DESC_F, + + /* + * Notice/Trap fields + */ + IB_NOTICE_IS_GENERIC_F, + IB_NOTICE_TYPE_F, + IB_NOTICE_PRODUCER_F, + IB_NOTICE_TRAP_NUMBER_F, + IB_NOTICE_ISSUER_LID_F, + IB_NOTICE_TOGGLE_F, + IB_NOTICE_COUNT_F, + IB_NOTICE_DATA_DETAILS_F, + IB_NOTICE_DATA_LID_F, + IB_NOTICE_DATA_144_LID_F, + IB_NOTICE_DATA_144_CAPMASK_F, + + /* + * GS Performance + */ + IB_PC_FIRST_F, + IB_PC_PORT_SELECT_F = IB_PC_FIRST_F, + IB_PC_COUNTER_SELECT_F, + IB_PC_ERR_SYM_F, + IB_PC_LINK_RECOVERS_F, + IB_PC_LINK_DOWNED_F, + IB_PC_ERR_RCV_F, + IB_PC_ERR_PHYSRCV_F, + IB_PC_ERR_SWITCH_REL_F, + IB_PC_XMT_DISCARDS_F, + IB_PC_ERR_XMTCONSTR_F, + IB_PC_ERR_RCVCONSTR_F, + IB_PC_COUNTER_SELECT2_F, + IB_PC_ERR_LOCALINTEG_F, + IB_PC_ERR_EXCESS_OVR_F, + IB_PC_VL15_DROPPED_F, + IB_PC_XMT_BYTES_F, + IB_PC_RCV_BYTES_F, + IB_PC_XMT_PKTS_F, + IB_PC_RCV_PKTS_F, + IB_PC_XMT_WAIT_F, + IB_PC_LAST_F, + + /* + * SMInfo + */ + IB_SMINFO_GUID_F, + IB_SMINFO_KEY_F, + IB_SMINFO_ACT_F, + IB_SMINFO_PRIO_F, + IB_SMINFO_STATE_F, + + /* + * SA RMPP + */ + IB_SA_RMPP_VERS_F, + IB_SA_RMPP_TYPE_F, + IB_SA_RMPP_RESP_F, + IB_SA_RMPP_FLAGS_F, + IB_SA_RMPP_STATUS_F, + + /* data1 */ + IB_SA_RMPP_D1_F, + IB_SA_RMPP_SEGNUM_F, + /* data2 */ + IB_SA_RMPP_D2_F, + IB_SA_RMPP_LEN_F, /* DATA: Payload len */ + IB_SA_RMPP_NEWWIN_F, /* ACK: new window last */ + + /* + * SA Multi Path rec + */ + IB_SA_MP_NPATH_F, + IB_SA_MP_NSRC_F, + IB_SA_MP_NDEST_F, + IB_SA_MP_GID0_F, + + /* + * SA Path rec + */ + IB_SA_PR_DGID_F, + IB_SA_PR_SGID_F, + IB_SA_PR_DLID_F, + IB_SA_PR_SLID_F, + IB_SA_PR_NPATH_F, + IB_SA_PR_SL_F, + + /* + * MC Member rec + */ + IB_SA_MCM_MGID_F, + IB_SA_MCM_PORTGID_F, + IB_SA_MCM_QKEY_F, + IB_SA_MCM_MLID_F, + IB_SA_MCM_SL_F, + IB_SA_MCM_MTU_F, + IB_SA_MCM_RATE_F, + IB_SA_MCM_TCLASS_F, + IB_SA_MCM_PKEY_F, + IB_SA_MCM_FLOW_LABEL_F, + IB_SA_MCM_JOIN_STATE_F, + IB_SA_MCM_PROXY_JOIN_F, + + /* + * Service record + */ + IB_SA_SR_ID_F, + IB_SA_SR_GID_F, + IB_SA_SR_PKEY_F, + IB_SA_SR_LEASE_F, + IB_SA_SR_KEY_F, + IB_SA_SR_NAME_F, + IB_SA_SR_DATA_F, + + /* + * ATS SM record - within SA_SR_DATA + */ + IB_ATS_SM_NODE_ADDR_F, + IB_ATS_SM_MAGIC_KEY_F, + IB_ATS_SM_NODE_TYPE_F, + IB_ATS_SM_NODE_NAME_F, + + /* + * SLTOVL MAPPING TABLE + */ + 
IB_SLTOVL_MAPPING_TABLE_F, + + /* + * VL ARBITRATION TABLE + */ + IB_VL_ARBITRATION_TABLE_F, + + /* + * IB vendor class range 2 + */ + IB_VEND2_OUI_F, + IB_VEND2_DATA_F, + + /* + * PortCountersExtended + */ + IB_PC_EXT_FIRST_F, + IB_PC_EXT_PORT_SELECT_F = IB_PC_EXT_FIRST_F, + IB_PC_EXT_COUNTER_SELECT_F, + IB_PC_EXT_XMT_BYTES_F, + IB_PC_EXT_RCV_BYTES_F, + IB_PC_EXT_XMT_PKTS_F, + IB_PC_EXT_RCV_PKTS_F, + IB_PC_EXT_XMT_UPKTS_F, + IB_PC_EXT_RCV_UPKTS_F, + IB_PC_EXT_XMT_MPKTS_F, + IB_PC_EXT_RCV_MPKTS_F, + IB_PC_EXT_LAST_F, + + /* + * GUIDInfo fields + */ + IB_GUID_GUID0_F, /* Obsolete, kept for compatibility + Use IB_GI_GUID0_F going forward */ + + /* + * ClassPortInfo fields + */ + IB_CPI_BASEVER_F, + IB_CPI_CLASSVER_F, + IB_CPI_CAPMASK_F, + IB_CPI_CAPMASK2_F, + IB_CPI_RESP_TIME_VALUE_F, + IB_CPI_REDIRECT_GID_F, + IB_CPI_REDIRECT_TC_F, + IB_CPI_REDIRECT_SL_F, + IB_CPI_REDIRECT_FL_F, + IB_CPI_REDIRECT_LID_F, + IB_CPI_REDIRECT_PKEY_F, + IB_CPI_REDIRECT_QP_F, + IB_CPI_REDIRECT_QKEY_F, + IB_CPI_TRAP_GID_F, + IB_CPI_TRAP_TC_F, + IB_CPI_TRAP_SL_F, + IB_CPI_TRAP_FL_F, + IB_CPI_TRAP_LID_F, + IB_CPI_TRAP_PKEY_F, + IB_CPI_TRAP_HL_F, + IB_CPI_TRAP_QP_F, + IB_CPI_TRAP_QKEY_F, + + /* + * PortXmitDataSL fields + */ + IB_PC_XMT_DATA_SL_FIRST_F, /* for PortSelect and CounterSelect, use IB_PC_PORT_SELECT_F and IB_PC_COUNTER_SELECT_F */ + IB_PC_XMT_DATA_SL0_F = IB_PC_XMT_DATA_SL_FIRST_F, + IB_PC_XMT_DATA_SL1_F, + IB_PC_XMT_DATA_SL2_F, + IB_PC_XMT_DATA_SL3_F, + IB_PC_XMT_DATA_SL4_F, + IB_PC_XMT_DATA_SL5_F, + IB_PC_XMT_DATA_SL6_F, + IB_PC_XMT_DATA_SL7_F, + IB_PC_XMT_DATA_SL8_F, + IB_PC_XMT_DATA_SL9_F, + IB_PC_XMT_DATA_SL10_F, + IB_PC_XMT_DATA_SL11_F, + IB_PC_XMT_DATA_SL12_F, + IB_PC_XMT_DATA_SL13_F, + IB_PC_XMT_DATA_SL14_F, + IB_PC_XMT_DATA_SL15_F, + IB_PC_XMT_DATA_SL_LAST_F, + + /* + * PortRcvDataSL fields + */ + IB_PC_RCV_DATA_SL_FIRST_F, /* for PortSelect and CounterSelect, use IB_PC_PORT_SELECT_F and IB_PC_COUNTER_SELECT_F */ + IB_PC_RCV_DATA_SL0_F = IB_PC_RCV_DATA_SL_FIRST_F, + IB_PC_RCV_DATA_SL1_F, + IB_PC_RCV_DATA_SL2_F, + IB_PC_RCV_DATA_SL3_F, + IB_PC_RCV_DATA_SL4_F, + IB_PC_RCV_DATA_SL5_F, + IB_PC_RCV_DATA_SL6_F, + IB_PC_RCV_DATA_SL7_F, + IB_PC_RCV_DATA_SL8_F, + IB_PC_RCV_DATA_SL9_F, + IB_PC_RCV_DATA_SL10_F, + IB_PC_RCV_DATA_SL11_F, + IB_PC_RCV_DATA_SL12_F, + IB_PC_RCV_DATA_SL13_F, + IB_PC_RCV_DATA_SL14_F, + IB_PC_RCV_DATA_SL15_F, + IB_PC_RCV_DATA_SL_LAST_F, + + /* + * PortXmitDiscardDetails fields + */ + /* for PortSelect and CounterSelect, use IB_PC_PORT_SELECT_F and IB_PC_COUNTER_SELECT_F */ + IB_PC_XMT_INACT_DISC_F, + IB_PC_XMT_NEIGH_MTU_DISC_F, + IB_PC_XMT_SW_LIFE_DISC_F, + IB_PC_XMT_SW_HOL_DISC_F, + IB_PC_XMT_DISC_LAST_F, + + /* + * PortRcvErrorDetails fields + */ + /* for PortSelect and CounterSelect, use IB_PC_PORT_SELECT_F and IB_PC_COUNTER_SELECT_F */ + IB_PC_RCV_LOCAL_PHY_ERR_F, + IB_PC_RCV_MALFORMED_PKT_ERR_F, + IB_PC_RCV_BUF_OVR_ERR_F, + IB_PC_RCV_DLID_MAP_ERR_F, + IB_PC_RCV_VL_MAP_ERR_F, + IB_PC_RCV_LOOPING_ERR_F, + IB_PC_RCV_ERR_LAST_F, + + /* + * PortSamplesControl fields + */ + IB_PSC_OPCODE_F, + IB_PSC_PORT_SELECT_F, + IB_PSC_TICK_F, + IB_PSC_COUNTER_WIDTH_F, + IB_PSC_COUNTER_MASK0_F, + IB_PSC_COUNTER_MASKS1TO9_F, + IB_PSC_COUNTER_MASKS10TO14_F, + IB_PSC_SAMPLE_MECHS_F, + IB_PSC_SAMPLE_STATUS_F, + IB_PSC_OPTION_MASK_F, + IB_PSC_VENDOR_MASK_F, + IB_PSC_SAMPLE_START_F, + IB_PSC_SAMPLE_INTVL_F, + IB_PSC_TAG_F, + IB_PSC_COUNTER_SEL0_F, + IB_PSC_COUNTER_SEL1_F, + IB_PSC_COUNTER_SEL2_F, + IB_PSC_COUNTER_SEL3_F, + IB_PSC_COUNTER_SEL4_F, + IB_PSC_COUNTER_SEL5_F, + IB_PSC_COUNTER_SEL6_F, + 
IB_PSC_COUNTER_SEL7_F, + IB_PSC_COUNTER_SEL8_F, + IB_PSC_COUNTER_SEL9_F, + IB_PSC_COUNTER_SEL10_F, + IB_PSC_COUNTER_SEL11_F, + IB_PSC_COUNTER_SEL12_F, + IB_PSC_COUNTER_SEL13_F, + IB_PSC_COUNTER_SEL14_F, + IB_PSC_SAMPLES_ONLY_OPT_MASK_F, + IB_PSC_LAST_F, + + /* + * GUIDInfo fields + */ + IB_GI_GUID0_F, /* a duplicate of IB_GUID_GUID0_F for backwards + compatibility */ + IB_GI_GUID1_F, + IB_GI_GUID2_F, + IB_GI_GUID3_F, + IB_GI_GUID4_F, + IB_GI_GUID5_F, + IB_GI_GUID6_F, + IB_GI_GUID7_F, + + /* + * GUID Info Record + */ + IB_SA_GIR_LID_F, + IB_SA_GIR_BLOCKNUM_F, + IB_SA_GIR_GUID0_F, + IB_SA_GIR_GUID1_F, + IB_SA_GIR_GUID2_F, + IB_SA_GIR_GUID3_F, + IB_SA_GIR_GUID4_F, + IB_SA_GIR_GUID5_F, + IB_SA_GIR_GUID6_F, + IB_SA_GIR_GUID7_F, + + /* + * More PortInfo fields + */ + IB_PORT_CAPMASK2_F, + IB_PORT_LINK_SPEED_EXT_ACTIVE_F, + IB_PORT_LINK_SPEED_EXT_SUPPORTED_F, + IB_PORT_LINK_SPEED_EXT_ENABLED_F, + IB_PORT_LINK_SPEED_EXT_LAST_F, + + /* + * PortExtendedSpeedsCounters fields + */ + IB_PESC_PORT_SELECT_F, + IB_PESC_COUNTER_SELECT_F, + IB_PESC_SYNC_HDR_ERR_CTR_F, + IB_PESC_UNK_BLOCK_CTR_F, + IB_PESC_ERR_DET_CTR_LANE0_F, + IB_PESC_ERR_DET_CTR_LANE1_F, + IB_PESC_ERR_DET_CTR_LANE2_F, + IB_PESC_ERR_DET_CTR_LANE3_F, + IB_PESC_ERR_DET_CTR_LANE4_F, + IB_PESC_ERR_DET_CTR_LANE5_F, + IB_PESC_ERR_DET_CTR_LANE6_F, + IB_PESC_ERR_DET_CTR_LANE7_F, + IB_PESC_ERR_DET_CTR_LANE8_F, + IB_PESC_ERR_DET_CTR_LANE9_F, + IB_PESC_ERR_DET_CTR_LANE10_F, + IB_PESC_ERR_DET_CTR_LANE11_F, + IB_PESC_FEC_CORR_BLOCK_CTR_LANE0_F, + IB_PESC_FEC_CORR_BLOCK_CTR_LANE1_F, + IB_PESC_FEC_CORR_BLOCK_CTR_LANE2_F, + IB_PESC_FEC_CORR_BLOCK_CTR_LANE3_F, + IB_PESC_FEC_CORR_BLOCK_CTR_LANE4_F, + IB_PESC_FEC_CORR_BLOCK_CTR_LANE5_F, + IB_PESC_FEC_CORR_BLOCK_CTR_LANE6_F, + IB_PESC_FEC_CORR_BLOCK_CTR_LANE7_F, + IB_PESC_FEC_CORR_BLOCK_CTR_LANE8_F, + IB_PESC_FEC_CORR_BLOCK_CTR_LANE9_F, + IB_PESC_FEC_CORR_BLOCK_CTR_LANE10_F, + IB_PESC_FEC_CORR_BLOCK_CTR_LANE11_F, + IB_PESC_FEC_UNCORR_BLOCK_CTR_LANE0_F, + IB_PESC_FEC_UNCORR_BLOCK_CTR_LANE1_F, + IB_PESC_FEC_UNCORR_BLOCK_CTR_LANE2_F, + IB_PESC_FEC_UNCORR_BLOCK_CTR_LANE3_F, + IB_PESC_FEC_UNCORR_BLOCK_CTR_LANE4_F, + IB_PESC_FEC_UNCORR_BLOCK_CTR_LANE5_F, + IB_PESC_FEC_UNCORR_BLOCK_CTR_LANE6_F, + IB_PESC_FEC_UNCORR_BLOCK_CTR_LANE7_F, + IB_PESC_FEC_UNCORR_BLOCK_CTR_LANE8_F, + IB_PESC_FEC_UNCORR_BLOCK_CTR_LANE9_F, + IB_PESC_FEC_UNCORR_BLOCK_CTR_LANE10_F, + IB_PESC_FEC_UNCORR_BLOCK_CTR_LANE11_F, + IB_PESC_LAST_F, + + /* + * PortOpRcvCounters fields + */ + IB_PC_PORT_OP_RCV_COUNTERS_FIRST_F, /* for PortSelect and CounterSelect, use IB_PC_PORT_SELECT_F and IB_PC_COUNTER_SELECT_F */ + IB_PC_PORT_OP_RCV_PKTS_F = IB_PC_PORT_OP_RCV_COUNTERS_FIRST_F, + IB_PC_PORT_OP_RCV_DATA_F, + IB_PC_PORT_OP_RCV_COUNTERS_LAST_F, + + /* + * PortFlowCtlCounters fields + */ + IB_PC_PORT_FLOW_CTL_COUNTERS_FIRST_F, /* for PortSelect and CounterSelect, use IB_PC_PORT_SELECT_F and IB_PC_COUNTER_SELECT_F */ + IB_PC_PORT_XMIT_FLOW_PKTS_F = IB_PC_PORT_FLOW_CTL_COUNTERS_FIRST_F, + IB_PC_PORT_RCV_FLOW_PKTS_F, + IB_PC_PORT_FLOW_CTL_COUNTERS_LAST_F, + + /* + * PortVLOpPackets fields + */ + IB_PC_PORT_VL_OP_PACKETS_FIRST_F, /* for PortSelect and CounterSelect, use IB_PC_PORT_SELECT_F and IB_PC_COUNTER_SELECT_F */ + IB_PC_PORT_VL_OP_PACKETS0_F = IB_PC_PORT_VL_OP_PACKETS_FIRST_F, + IB_PC_PORT_VL_OP_PACKETS1_F, + IB_PC_PORT_VL_OP_PACKETS2_F, + IB_PC_PORT_VL_OP_PACKETS3_F, + IB_PC_PORT_VL_OP_PACKETS4_F, + IB_PC_PORT_VL_OP_PACKETS5_F, + IB_PC_PORT_VL_OP_PACKETS6_F, + IB_PC_PORT_VL_OP_PACKETS7_F, + IB_PC_PORT_VL_OP_PACKETS8_F, + IB_PC_PORT_VL_OP_PACKETS9_F, + 
IB_PC_PORT_VL_OP_PACKETS10_F, + IB_PC_PORT_VL_OP_PACKETS11_F, + IB_PC_PORT_VL_OP_PACKETS12_F, + IB_PC_PORT_VL_OP_PACKETS13_F, + IB_PC_PORT_VL_OP_PACKETS14_F, + IB_PC_PORT_VL_OP_PACKETS15_F, + IB_PC_PORT_VL_OP_PACKETS_LAST_F, + + /* + * PortVLOpData fields + */ + IB_PC_PORT_VL_OP_DATA_FIRST_F, /* for PortSelect and CounterSelect, use IB_PC_PORT_SELECT_F and IB_PC_COUNTER_SELECT_F */ + IB_PC_PORT_VL_OP_DATA0_F = IB_PC_PORT_VL_OP_DATA_FIRST_F, + IB_PC_PORT_VL_OP_DATA1_F, + IB_PC_PORT_VL_OP_DATA2_F, + IB_PC_PORT_VL_OP_DATA3_F, + IB_PC_PORT_VL_OP_DATA4_F, + IB_PC_PORT_VL_OP_DATA5_F, + IB_PC_PORT_VL_OP_DATA6_F, + IB_PC_PORT_VL_OP_DATA7_F, + IB_PC_PORT_VL_OP_DATA8_F, + IB_PC_PORT_VL_OP_DATA9_F, + IB_PC_PORT_VL_OP_DATA10_F, + IB_PC_PORT_VL_OP_DATA11_F, + IB_PC_PORT_VL_OP_DATA12_F, + IB_PC_PORT_VL_OP_DATA13_F, + IB_PC_PORT_VL_OP_DATA14_F, + IB_PC_PORT_VL_OP_DATA15_F, + IB_PC_PORT_VL_OP_DATA_LAST_F, + + /* + * PortVLXmitFlowCtlUpdateErrors fields + */ + IB_PC_PORT_VL_XMIT_FLOW_CTL_UPDATE_ERRORS_FIRST_F, /* for PortSelect and CounterSelect, use IB_PC_PORT_SELECT_F and IB_PC_COUNTER_SELECT_F */ + IB_PC_PORT_VL_XMIT_FLOW_CTL_UPDATE_ERRORS0_F = IB_PC_PORT_VL_XMIT_FLOW_CTL_UPDATE_ERRORS_FIRST_F, + IB_PC_PORT_VL_XMIT_FLOW_CTL_UPDATE_ERRORS1_F, + IB_PC_PORT_VL_XMIT_FLOW_CTL_UPDATE_ERRORS2_F, + IB_PC_PORT_VL_XMIT_FLOW_CTL_UPDATE_ERRORS3_F, + IB_PC_PORT_VL_XMIT_FLOW_CTL_UPDATE_ERRORS4_F, + IB_PC_PORT_VL_XMIT_FLOW_CTL_UPDATE_ERRORS5_F, + IB_PC_PORT_VL_XMIT_FLOW_CTL_UPDATE_ERRORS6_F, + IB_PC_PORT_VL_XMIT_FLOW_CTL_UPDATE_ERRORS7_F, + IB_PC_PORT_VL_XMIT_FLOW_CTL_UPDATE_ERRORS8_F, + IB_PC_PORT_VL_XMIT_FLOW_CTL_UPDATE_ERRORS9_F, + IB_PC_PORT_VL_XMIT_FLOW_CTL_UPDATE_ERRORS10_F, + IB_PC_PORT_VL_XMIT_FLOW_CTL_UPDATE_ERRORS11_F, + IB_PC_PORT_VL_XMIT_FLOW_CTL_UPDATE_ERRORS12_F, + IB_PC_PORT_VL_XMIT_FLOW_CTL_UPDATE_ERRORS13_F, + IB_PC_PORT_VL_XMIT_FLOW_CTL_UPDATE_ERRORS14_F, + IB_PC_PORT_VL_XMIT_FLOW_CTL_UPDATE_ERRORS15_F, + IB_PC_PORT_VL_XMIT_FLOW_CTL_UPDATE_ERRORS_LAST_F, + + /* + * PortVLXmitWaitCounters fields + */ + IB_PC_PORT_VL_XMIT_WAIT_COUNTERS_FIRST_F, /* for PortSelect and CounterSelect, use IB_PC_PORT_SELECT_F and IB_PC_COUNTER_SELECT_F */ + IB_PC_PORT_VL_XMIT_WAIT0_F = IB_PC_PORT_VL_XMIT_WAIT_COUNTERS_FIRST_F, + IB_PC_PORT_VL_XMIT_WAIT1_F, + IB_PC_PORT_VL_XMIT_WAIT2_F, + IB_PC_PORT_VL_XMIT_WAIT3_F, + IB_PC_PORT_VL_XMIT_WAIT4_F, + IB_PC_PORT_VL_XMIT_WAIT5_F, + IB_PC_PORT_VL_XMIT_WAIT6_F, + IB_PC_PORT_VL_XMIT_WAIT7_F, + IB_PC_PORT_VL_XMIT_WAIT8_F, + IB_PC_PORT_VL_XMIT_WAIT9_F, + IB_PC_PORT_VL_XMIT_WAIT10_F, + IB_PC_PORT_VL_XMIT_WAIT11_F, + IB_PC_PORT_VL_XMIT_WAIT12_F, + IB_PC_PORT_VL_XMIT_WAIT13_F, + IB_PC_PORT_VL_XMIT_WAIT14_F, + IB_PC_PORT_VL_XMIT_WAIT15_F, + IB_PC_PORT_VL_XMIT_WAIT_COUNTERS_LAST_F, + + /* + * SwPortVLCongestion fields + */ + IB_PC_SW_PORT_VL_CONGESTION_FIRST_F, /* for PortSelect and CounterSelect, use IB_PC_PORT_SELECT_F and IB_PC_COUNTER_SELECT_F */ + IB_PC_SW_PORT_VL_CONGESTION0_F = IB_PC_SW_PORT_VL_CONGESTION_FIRST_F, + IB_PC_SW_PORT_VL_CONGESTION1_F, + IB_PC_SW_PORT_VL_CONGESTION2_F, + IB_PC_SW_PORT_VL_CONGESTION3_F, + IB_PC_SW_PORT_VL_CONGESTION4_F, + IB_PC_SW_PORT_VL_CONGESTION5_F, + IB_PC_SW_PORT_VL_CONGESTION6_F, + IB_PC_SW_PORT_VL_CONGESTION7_F, + IB_PC_SW_PORT_VL_CONGESTION8_F, + IB_PC_SW_PORT_VL_CONGESTION9_F, + IB_PC_SW_PORT_VL_CONGESTION10_F, + IB_PC_SW_PORT_VL_CONGESTION11_F, + IB_PC_SW_PORT_VL_CONGESTION12_F, + IB_PC_SW_PORT_VL_CONGESTION13_F, + IB_PC_SW_PORT_VL_CONGESTION14_F, + IB_PC_SW_PORT_VL_CONGESTION15_F, + IB_PC_SW_PORT_VL_CONGESTION_LAST_F, + + /* + * PortRcvConCtrl 
fields + */ + IB_PC_RCV_CON_CTRL_FIRST_F, /* for PortSelect and CounterSelect, use IB_PC_PORT_SELECT_F and IB_PC_COUNTER_SELECT_F */ + IB_PC_RCV_CON_CTRL_PKT_RCV_FECN_F = IB_PC_RCV_CON_CTRL_FIRST_F, + IB_PC_RCV_CON_CTRL_PKT_RCV_BECN_F, + IB_PC_RCV_CON_CTRL_LAST_F, + + /* + * PortSLRcvFECN fields + */ + IB_PC_SL_RCV_FECN_FIRST_F, /* for PortSelect and CounterSelect, use IB_PC_PORT_SELECT_F and IB_PC_COUNTER_SELECT_F */ + IB_PC_SL_RCV_FECN0_F = IB_PC_SL_RCV_FECN_FIRST_F, + IB_PC_SL_RCV_FECN1_F, + IB_PC_SL_RCV_FECN2_F, + IB_PC_SL_RCV_FECN3_F, + IB_PC_SL_RCV_FECN4_F, + IB_PC_SL_RCV_FECN5_F, + IB_PC_SL_RCV_FECN6_F, + IB_PC_SL_RCV_FECN7_F, + IB_PC_SL_RCV_FECN8_F, + IB_PC_SL_RCV_FECN9_F, + IB_PC_SL_RCV_FECN10_F, + IB_PC_SL_RCV_FECN11_F, + IB_PC_SL_RCV_FECN12_F, + IB_PC_SL_RCV_FECN13_F, + IB_PC_SL_RCV_FECN14_F, + IB_PC_SL_RCV_FECN15_F, + IB_PC_SL_RCV_FECN_LAST_F, + + /* + * PortSLRcvBECN fields + */ + IB_PC_SL_RCV_BECN_FIRST_F, /* for PortSelect and CounterSelect, use IB_PC_PORT_SELECT_F and IB_PC_COUNTER_SELECT_F */ + IB_PC_SL_RCV_BECN0_F = IB_PC_SL_RCV_BECN_FIRST_F, + IB_PC_SL_RCV_BECN1_F, + IB_PC_SL_RCV_BECN2_F, + IB_PC_SL_RCV_BECN3_F, + IB_PC_SL_RCV_BECN4_F, + IB_PC_SL_RCV_BECN5_F, + IB_PC_SL_RCV_BECN6_F, + IB_PC_SL_RCV_BECN7_F, + IB_PC_SL_RCV_BECN8_F, + IB_PC_SL_RCV_BECN9_F, + IB_PC_SL_RCV_BECN10_F, + IB_PC_SL_RCV_BECN11_F, + IB_PC_SL_RCV_BECN12_F, + IB_PC_SL_RCV_BECN13_F, + IB_PC_SL_RCV_BECN14_F, + IB_PC_SL_RCV_BECN15_F, + IB_PC_SL_RCV_BECN_LAST_F, + + /* + * PortXmitConCtrl fields + */ + IB_PC_XMIT_CON_CTRL_FIRST_F, /* for PortSelect and CounterSelect, use IB_PC_PORT_SELECT_F and IB_PC_COUNTER_SELECT_F */ + IB_PC_XMIT_CON_CTRL_TIME_CONG_F = IB_PC_XMIT_CON_CTRL_FIRST_F, + IB_PC_XMIT_CON_CTRL_LAST_F, + + /* + * PortVLXmitTimeCong fields + */ + IB_PC_VL_XMIT_TIME_CONG_FIRST_F, /* for PortSelect and CounterSelect, use IB_PC_PORT_SELECT_F and IB_PC_COUNTER_SELECT_F */ + IB_PC_VL_XMIT_TIME_CONG0_F = IB_PC_VL_XMIT_TIME_CONG_FIRST_F, + IB_PC_VL_XMIT_TIME_CONG1_F, + IB_PC_VL_XMIT_TIME_CONG2_F, + IB_PC_VL_XMIT_TIME_CONG3_F, + IB_PC_VL_XMIT_TIME_CONG4_F, + IB_PC_VL_XMIT_TIME_CONG5_F, + IB_PC_VL_XMIT_TIME_CONG6_F, + IB_PC_VL_XMIT_TIME_CONG7_F, + IB_PC_VL_XMIT_TIME_CONG8_F, + IB_PC_VL_XMIT_TIME_CONG9_F, + IB_PC_VL_XMIT_TIME_CONG10_F, + IB_PC_VL_XMIT_TIME_CONG11_F, + IB_PC_VL_XMIT_TIME_CONG12_F, + IB_PC_VL_XMIT_TIME_CONG13_F, + IB_PC_VL_XMIT_TIME_CONG14_F, + IB_PC_VL_XMIT_TIME_CONG_LAST_F, + + /* + * Mellanox ExtendedPortInfo fields + */ + IB_MLNX_EXT_PORT_STATE_CHG_ENABLE_F, + IB_MLNX_EXT_PORT_LINK_SPEED_SUPPORTED_F, + IB_MLNX_EXT_PORT_LINK_SPEED_ENABLED_F, + IB_MLNX_EXT_PORT_LINK_SPEED_ACTIVE_F, + IB_MLNX_EXT_PORT_LAST_F, + + /* + * Congestion Control Mad fields + * bytes 24-31 of congestion control mad + */ + IB_CC_CCKEY_F, + + /* + * CongestionInfo fields + */ + IB_CC_CONGESTION_INFO_FIRST_F, + IB_CC_CONGESTION_INFO_F = IB_CC_CONGESTION_INFO_FIRST_F, + IB_CC_CONGESTION_INFO_CONTROL_TABLE_CAP_F, + IB_CC_CONGESTION_INFO_LAST_F, + + /* + * CongestionKeyInfo fields + */ + IB_CC_CONGESTION_KEY_INFO_FIRST_F, + IB_CC_CONGESTION_KEY_INFO_CC_KEY_F = IB_CC_CONGESTION_KEY_INFO_FIRST_F, + IB_CC_CONGESTION_KEY_INFO_CC_KEY_PROTECT_BIT_F, + IB_CC_CONGESTION_KEY_INFO_CC_KEY_LEASE_PERIOD_F, + IB_CC_CONGESTION_KEY_INFO_CC_KEY_VIOLATIONS_F, + IB_CC_CONGESTION_KEY_INFO_LAST_F, + + /* + * CongestionLog (common) fields + */ + IB_CC_CONGESTION_LOG_FIRST_F, + IB_CC_CONGESTION_LOG_LOGTYPE_F = IB_CC_CONGESTION_LOG_FIRST_F, + IB_CC_CONGESTION_LOG_CONGESTION_FLAGS_F, + IB_CC_CONGESTION_LOG_LAST_F, + + /* + * CongestionLog 
(Switch) fields + */ + IB_CC_CONGESTION_LOG_SWITCH_FIRST_F, + IB_CC_CONGESTION_LOG_SWITCH_LOG_EVENTS_COUNTER_F = IB_CC_CONGESTION_LOG_SWITCH_FIRST_F, + IB_CC_CONGESTION_LOG_SWITCH_CURRENT_TIME_STAMP_F, + IB_CC_CONGESTION_LOG_SWITCH_PORTMAP_F, + IB_CC_CONGESTION_LOG_SWITCH_LAST_F, + + /* + * CongestionLogEvent (Switch) fields + */ + IB_CC_CONGESTION_LOG_ENTRY_SWITCH_FIRST_F, + IB_CC_CONGESTION_LOG_ENTRY_SWITCH_SLID_F = IB_CC_CONGESTION_LOG_ENTRY_SWITCH_FIRST_F, + IB_CC_CONGESTION_LOG_ENTRY_SWITCH_DLID_F, + IB_CC_CONGESTION_LOG_ENTRY_SWITCH_SL_F, + IB_CC_CONGESTION_LOG_ENTRY_SWITCH_TIMESTAMP_F, + IB_CC_CONGESTION_LOG_ENTRY_SWITCH_LAST_F, + + /* + * CongestionLog (CA) fields + */ + IB_CC_CONGESTION_LOG_CA_FIRST_F, + IB_CC_CONGESTION_LOG_CA_THRESHOLD_EVENT_COUNTER_F = IB_CC_CONGESTION_LOG_CA_FIRST_F, + IB_CC_CONGESTION_LOG_CA_THRESHOLD_CONGESTION_EVENT_MAP_F, + IB_CC_CONGESTION_LOG_CA_CURRENT_TIMESTAMP_F, + IB_CC_CONGESTION_LOG_CA_LAST_F, + + /* + * CongestionLogEvent (CA) fields + */ + IB_CC_CONGESTION_LOG_ENTRY_CA_FIRST_F, + IB_CC_CONGESTION_LOG_ENTRY_CA_LOCAL_QP_CN_ENTRY_F = IB_CC_CONGESTION_LOG_ENTRY_CA_FIRST_F, + IB_CC_CONGESTION_LOG_ENTRY_CA_SL_CN_ENTRY_F, + IB_CC_CONGESTION_LOG_ENTRY_CA_SERVICE_TYPE_CN_ENTRY_F, + IB_CC_CONGESTION_LOG_ENTRY_CA_REMOTE_QP_NUMBER_CN_ENTRY_F, + IB_CC_CONGESTION_LOG_ENTRY_CA_LOCAL_LID_CN_F, + IB_CC_CONGESTION_LOG_ENTRY_CA_REMOTE_LID_CN_ENTRY_F, + IB_CC_CONGESTION_LOG_ENTRY_CA_TIMESTAMP_CN_ENTRY_F, + IB_CC_CONGESTION_LOG_ENTRY_CA_LAST_F, + + /* + * SwitchCongestionSetting fields + */ + IB_CC_SWITCH_CONGESTION_SETTING_FIRST_F, + IB_CC_SWITCH_CONGESTION_SETTING_CONTROL_MAP_F = IB_CC_SWITCH_CONGESTION_SETTING_FIRST_F, + IB_CC_SWITCH_CONGESTION_SETTING_VICTIM_MASK_F, + IB_CC_SWITCH_CONGESTION_SETTING_CREDIT_MASK_F, + IB_CC_SWITCH_CONGESTION_SETTING_THRESHOLD_F, + IB_CC_SWITCH_CONGESTION_SETTING_PACKET_SIZE_F, + IB_CC_SWITCH_CONGESTION_SETTING_CS_THRESHOLD_F, + IB_CC_SWITCH_CONGESTION_SETTING_CS_RETURN_DELAY_F, + IB_CC_SWITCH_CONGESTION_SETTING_MARKING_RATE_F, + IB_CC_SWITCH_CONGESTION_SETTING_LAST_F, + + /* + * SwitchPortCongestionSettingElement fields + */ + IB_CC_SWITCH_PORT_CONGESTION_SETTING_ELEMENT_FIRST_F, + IB_CC_SWITCH_PORT_CONGESTION_SETTING_ELEMENT_VALID_F = IB_CC_SWITCH_PORT_CONGESTION_SETTING_ELEMENT_FIRST_F, + IB_CC_SWITCH_PORT_CONGESTION_SETTING_ELEMENT_CONTROL_TYPE_F, + IB_CC_SWITCH_PORT_CONGESTION_SETTING_ELEMENT_THRESHOLD_F, + IB_CC_SWITCH_PORT_CONGESTION_SETTING_ELEMENT_PACKET_SIZE_F, + IB_CC_SWITCH_PORT_CONGESTION_SETTING_ELEMENT_CONG_PARM_MARKING_RATE_F, + IB_CC_SWITCH_PORT_CONGESTION_SETTING_ELEMENT_LAST_F, + + /* + * CACongestionSetting fields + */ + IB_CC_CA_CONGESTION_SETTING_FIRST_F, + IB_CC_CA_CONGESTION_SETTING_PORT_CONTROL_F = IB_CC_CA_CONGESTION_SETTING_FIRST_F, + IB_CC_CA_CONGESTION_SETTING_CONTROL_MAP_F, + IB_CC_CA_CONGESTION_SETTING_LAST_F, + + /* + * CACongestionEntry fields + */ + IB_CC_CA_CONGESTION_ENTRY_FIRST_F, + IB_CC_CA_CONGESTION_ENTRY_CCTI_TIMER_F = IB_CC_CA_CONGESTION_ENTRY_FIRST_F, + IB_CC_CA_CONGESTION_ENTRY_CCTI_INCREASE_F, + IB_CC_CA_CONGESTION_ENTRY_TRIGGER_THRESHOLD_F, + IB_CC_CA_CONGESTION_ENTRY_CCTI_MIN_F, + IB_CC_CA_CONGESTION_ENTRY_LAST_F, + + /* + * CongestionControlTable fields + */ + IB_CC_CONGESTION_CONTROL_TABLE_FIRST_F, + IB_CC_CONGESTION_CONTROL_TABLE_CCTI_LIMIT_F = IB_CC_CONGESTION_CONTROL_TABLE_FIRST_F, + IB_CC_CONGESTION_CONTROL_TABLE_LAST_F, + + /* + * CongestionControlTableEntry fields + */ + IB_CC_CONGESTION_CONTROL_TABLE_ENTRY_FIRST_F, + IB_CC_CONGESTION_CONTROL_TABLE_ENTRY_CCT_SHIFT_F = 
IB_CC_CONGESTION_CONTROL_TABLE_ENTRY_FIRST_F, + IB_CC_CONGESTION_CONTROL_TABLE_ENTRY_CCT_MULTIPLIER_F, + IB_CC_CONGESTION_CONTROL_TABLE_ENTRY_LAST_F, + + /* + * Timestamp fields + */ + IB_CC_TIMESTAMP_FIRST_F, + IB_CC_TIMESTAMP_F = IB_CC_TIMESTAMP_FIRST_F, + IB_CC_TIMESTAMP_LAST_F, + + /* + * Node Record + */ + IB_SA_NR_FIRST_F, + IB_SA_NR_LID_F = IB_SA_NR_FIRST_F, + IB_SA_NR_BASEVER_F, + IB_SA_NR_CLASSVER_F, + IB_SA_NR_TYPE_F, + IB_SA_NR_NPORTS_F, + IB_SA_NR_SYSTEM_GUID_F, + IB_SA_NR_GUID_F, + IB_SA_NR_PORT_GUID_F, + IB_SA_NR_PARTITION_CAP_F, + IB_SA_NR_DEVID_F, + IB_SA_NR_REVISION_F, + IB_SA_NR_LOCAL_PORT_F, + IB_SA_NR_VENDORID_F, + IB_SA_NR_NODEDESC_F, + IB_SA_NR_LAST_F, + + /* + * PortSamplesResult fields + */ + IB_PSR_TAG_F, + IB_PSR_SAMPLE_STATUS_F, + IB_PSR_COUNTER0_F, + IB_PSR_COUNTER1_F, + IB_PSR_COUNTER2_F, + IB_PSR_COUNTER3_F, + IB_PSR_COUNTER4_F, + IB_PSR_COUNTER5_F, + IB_PSR_COUNTER6_F, + IB_PSR_COUNTER7_F, + IB_PSR_COUNTER8_F, + IB_PSR_COUNTER9_F, + IB_PSR_COUNTER10_F, + IB_PSR_COUNTER11_F, + IB_PSR_COUNTER12_F, + IB_PSR_COUNTER13_F, + IB_PSR_COUNTER14_F, + IB_PSR_LAST_F, + + /* + * PortInfoExtended fields + */ + IB_PORT_EXT_FIRST_F, + IB_PORT_EXT_CAPMASK_F = IB_PORT_EXT_FIRST_F, + IB_PORT_EXT_FEC_MODE_ACTIVE_F, + IB_PORT_EXT_FDR_FEC_MODE_SUPPORTED_F, + IB_PORT_EXT_FDR_FEC_MODE_ENABLED_F, + IB_PORT_EXT_EDR_FEC_MODE_SUPPORTED_F, + IB_PORT_EXT_EDR_FEC_MODE_ENABLED_F, + IB_PORT_EXT_LAST_F, + + /* + * PortExtendedSpeedsCounters RSFEC active fields + */ + IB_PESC_RSFEC_FIRST_F, + IB_PESC_RSFEC_PORT_SELECT_F = IB_PESC_RSFEC_FIRST_F, + IB_PESC_RSFEC_COUNTER_SELECT_F, + IB_PESC_RSFEC_SYNC_HDR_ERR_CTR_F, + IB_PESC_RSFEC_UNK_BLOCK_CTR_F, + IB_PESC_RSFEC_FEC_CORR_SYMBOL_CTR_LANE0_F, + IB_PESC_RSFEC_FEC_CORR_SYMBOL_CTR_LANE1_F, + IB_PESC_RSFEC_FEC_CORR_SYMBOL_CTR_LANE2_F, + IB_PESC_RSFEC_FEC_CORR_SYMBOL_CTR_LANE3_F, + IB_PESC_RSFEC_FEC_CORR_SYMBOL_CTR_LANE4_F, + IB_PESC_RSFEC_FEC_CORR_SYMBOL_CTR_LANE5_F, + IB_PESC_RSFEC_FEC_CORR_SYMBOL_CTR_LANE6_F, + IB_PESC_RSFEC_FEC_CORR_SYMBOL_CTR_LANE7_F, + IB_PESC_RSFEC_FEC_CORR_SYMBOL_CTR_LANE8_F, + IB_PESC_RSFEC_FEC_CORR_SYMBOL_CTR_LANE9_F, + IB_PESC_RSFEC_FEC_CORR_SYMBOL_CTR_LANE10_F, + IB_PESC_RSFEC_FEC_CORR_SYMBOL_CTR_LANE11_F, + IB_PESC_PORT_FEC_CORR_BLOCK_CTR_F, + IB_PESC_PORT_FEC_UNCORR_BLOCK_CTR_F, + IB_PESC_PORT_FEC_CORR_SYMBOL_CTR_F, + IB_PESC_RSFEC_LAST_F, + + /* + * More PortCountersExtended fields + */ + IB_PC_EXT_COUNTER_SELECT2_F, + IB_PC_EXT_ERR_SYM_F, + IB_PC_EXT_LINK_RECOVERS_F, + IB_PC_EXT_LINK_DOWNED_F, + IB_PC_EXT_ERR_RCV_F, + IB_PC_EXT_ERR_PHYSRCV_F, + IB_PC_EXT_ERR_SWITCH_REL_F, + IB_PC_EXT_XMT_DISCARDS_F, + IB_PC_EXT_ERR_XMTCONSTR_F, + IB_PC_EXT_ERR_RCVCONSTR_F, + IB_PC_EXT_ERR_LOCALINTEG_F, + IB_PC_EXT_ERR_EXCESS_OVR_F, + IB_PC_EXT_VL15_DROPPED_F, + IB_PC_EXT_XMT_WAIT_F, + IB_PC_EXT_QP1_DROP_F, + IB_PC_EXT_ERR_LAST_F, + + /* + * Another PortCounters field + */ + IB_PC_QP1_DROP_F, + + /* + * More PortInfoExtended fields + */ + IB_PORT_EXT_HDR_FEC_MODE_SUPPORTED_F, + IB_PORT_EXT_HDR_FEC_MODE_ENABLED_F, + IB_PORT_EXT_HDR_FEC_MODE_LAST_F, + + IB_FIELD_LAST_ /* must be last */ +}; + +/* + * SA RMPP section + */ +enum RMPP_TYPE_ENUM { + IB_RMPP_TYPE_NONE, + IB_RMPP_TYPE_DATA, + IB_RMPP_TYPE_ACK, + IB_RMPP_TYPE_STOP, + IB_RMPP_TYPE_ABORT, +}; + +enum RMPP_FLAGS_ENUM { + IB_RMPP_FLAG_ACTIVE = 1 << 0, + IB_RMPP_FLAG_FIRST = 1 << 1, + IB_RMPP_FLAG_LAST = 1 << 2, +}; + +typedef struct { + int type; + int flags; + int status; + union { + uint32_t u; + uint32_t segnum; + } d1; + union { + uint32_t u; + uint32_t len; + uint32_t newwin; + 
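/* d2 is RMPP header dword 2: PayloadLength (len) for DATA, NewWindowLast (newwin) for ACK */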
} d2; +} ib_rmpp_hdr_t; + +enum SA_SIZES_ENUM { + SA_HEADER_SZ = 20, +}; + +typedef struct ib_sa_call { + unsigned attrid; + unsigned mod; + uint64_t mask; + unsigned method; + + uint64_t trid; /* used for out mad if nonzero, return real val */ + unsigned recsz; /* return field */ + ib_rmpp_hdr_t rmpp; +} ib_sa_call_t; + +typedef struct ib_vendor_call { + unsigned method; + unsigned mgmt_class; + unsigned attrid; + unsigned mod; + uint32_t oui; + unsigned timeout; + ib_rmpp_hdr_t rmpp; +} ib_vendor_call_t; + +typedef struct ib_bm_call { + unsigned method; + unsigned attrid; + unsigned mod; + unsigned timeout; + uint64_t bkey; +} ib_bm_call_t; + +#define IB_MIN_UCAST_LID 1 +#define IB_MAX_UCAST_LID (0xc000-1) +#define IB_MIN_MCAST_LID 0xc000 +#define IB_MAX_MCAST_LID (0xffff-1) + +#define IB_LID_VALID(lid) ((lid) >= IB_MIN_UCAST_LID && lid <= IB_MAX_UCAST_LID) +#define IB_MLID_VALID(lid) ((lid) >= IB_MIN_MCAST_LID && lid <= IB_MAX_MCAST_LID) + +#define MAD_DEF_RETRIES 3 +#define MAD_DEF_TIMEOUT_MS 1000 + +enum MAD_DEST { + IB_DEST_LID, + IB_DEST_DRPATH, + IB_DEST_GUID, + IB_DEST_DRSLID, + IB_DEST_GID +}; + +enum MAD_NODE_TYPE { + IB_NODE_CA = 1, + IB_NODE_SWITCH, + IB_NODE_ROUTER, + NODE_RNIC, + + IB_NODE_MAX = NODE_RNIC +}; + +/******************************************************************************/ + +/* portid.c */ +char *portid2str(ib_portid_t *portid); +int portid2portnum(ib_portid_t *portid); +int str2drpath(ib_dr_path_t *path, char *routepath, int drslid, int drdlid); +char *drpath2str(ib_dr_path_t *path, char *dstr, size_t dstr_size); + +static inline int ib_portid_set(ib_portid_t * portid, int lid, int qp, int qkey) +{ + portid->lid = lid; + portid->qp = qp; + portid->qkey = qkey; + portid->grh_present = 0; + + return 0; +} + +/* fields.c */ +uint32_t mad_get_field(void *buf, int base_offs, enum MAD_FIELDS field); +void mad_set_field(void *buf, int base_offs, enum MAD_FIELDS field, + uint32_t val); +/* field must be byte aligned */ +uint64_t mad_get_field64(void *buf, int base_offs, enum MAD_FIELDS field); +void mad_set_field64(void *buf, int base_offs, enum MAD_FIELDS field, + uint64_t val); +void mad_set_array(void *buf, int base_offs, enum MAD_FIELDS field, void *val); +void mad_get_array(void *buf, int base_offs, enum MAD_FIELDS field, void *val); +void mad_decode_field(uint8_t *buf, enum MAD_FIELDS field, void *val); +void mad_encode_field(uint8_t *buf, enum MAD_FIELDS field, void *val); +int mad_print_field(enum MAD_FIELDS field, const char *name, void *val); +char *mad_dump_field(enum MAD_FIELDS field, char *buf, int bufsz, void *val); +char *mad_dump_val(enum MAD_FIELDS field, char *buf, int bufsz, void *val); +const char *mad_field_name(enum MAD_FIELDS field); + +/* mad.c */ +void *mad_encode(void *buf, ib_rpc_t *rpc, ib_dr_path_t *drpath, void *data); +uint64_t mad_trid(void); +int mad_build_pkt(void *umad, ib_rpc_t *rpc, ib_portid_t *dport, + ib_rmpp_hdr_t *rmpp, void *data); + +/* New interface */ +void madrpc_show_errors(int set); +int madrpc_set_retries(int retries); +int madrpc_set_timeout(int timeout); +struct ibmad_port *mad_rpc_open_port(char *dev_name, int dev_port, + int *mgmt_classes, int num_classes); +void mad_rpc_close_port(struct ibmad_port *srcport); + +/* + * On redirection, the dport argument is updated with the redirection target, + * so subsequent MADs will not go through the redirection process again but + * reach the target directly. 
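+ *
+ * Illustrative sketch only (hypothetical LID and buffer size; rpc,
+ * payload and srcport assumed to be prepared elsewhere; error handling
+ * elided) of a caller relying on that behaviour:
+ *
+ *	ib_portid_t dport = { 0 };
+ *	uint8_t rcv[256];
+ *
+ *	ib_portid_set(&dport, 5, 1, IB_DEFAULT_QP1_QKEY);
+ *	if (!mad_rpc(srcport, &rpc, &dport, payload, rcv))
+ *		IBWARN("mad_rpc failed");
+ *	(dport now addresses the redirect target, if one was reported)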
+ */ +void *mad_rpc(const struct ibmad_port *srcport, ib_rpc_t *rpc, + ib_portid_t *dport, void *payload, void *rcvdata); + +void *mad_rpc_rmpp(const struct ibmad_port *srcport, ib_rpc_t *rpc, + ib_portid_t *dport, ib_rmpp_hdr_t *rmpp, void *data); +int mad_rpc_portid(struct ibmad_port *srcport); +void mad_rpc_set_retries(struct ibmad_port *port, int retries); +void mad_rpc_set_timeout(struct ibmad_port *port, int timeout); +int mad_rpc_class_agent(struct ibmad_port *srcport, int cls); + +int mad_get_timeout(const struct ibmad_port *srcport, int override_ms); +int mad_get_retries(const struct ibmad_port *srcport); + +/* register.c */ +int mad_register_port_client(int port_id, int mgmt, uint8_t rmpp_version); +int mad_register_client(int mgmt, uint8_t rmpp_version) + __attribute__((deprecated)); +int mad_register_server(int mgmt, uint8_t rmpp_version, + long method_mask[16 / sizeof(long)], uint32_t class_oui) + __attribute__((deprecated)); +/* register.c new interface */ +int mad_register_client_via(int mgmt, uint8_t rmpp_version, + struct ibmad_port *srcport); +int mad_register_server_via(int mgmt, uint8_t rmpp_version, + long method_mask[16 / sizeof(long)], + uint32_t class_oui, struct ibmad_port *srcport); +int mad_class_agent(int mgmt) __attribute__((deprecated)); + +/* serv.c */ +int mad_send(ib_rpc_t *rpc, ib_portid_t *dport, ib_rmpp_hdr_t *rmpp, void *data) + __attribute__((deprecated)); +void *mad_receive(void *umad, int timeout) __attribute__((deprecated)); +int mad_respond(void *umad, ib_portid_t *portid, uint32_t rstatus) + __attribute__((deprecated)); + +/* serv.c new interface */ +int mad_send_via(ib_rpc_t *rpc, ib_portid_t *dport, ib_rmpp_hdr_t *rmpp, + void *data, struct ibmad_port *srcport); +void *mad_receive_via(void *umad, int timeout, struct ibmad_port *srcport); +int mad_respond_via(void *umad, ib_portid_t *portid, uint32_t rstatus, + struct ibmad_port *srcport); +void *mad_alloc(void); +void mad_free(void *umad); + +/* vendor.c */ +uint8_t *ib_vendor_call(void *data, ib_portid_t *portid, ib_vendor_call_t *call) + __attribute__((deprecated)); + +/* vendor.c new interface */ +uint8_t *ib_vendor_call_via(void *data, ib_portid_t *portid, + ib_vendor_call_t *call, struct ibmad_port *srcport); + +static inline int mad_is_vendor_range1(int mgmt) +{ + return mgmt >= 0x9 && mgmt <= 0xf; +} + +static inline int mad_is_vendor_range2(int mgmt) +{ + return mgmt >= 0x30 && mgmt <= 0x4f; +} + +/* rpc.c */ +int madrpc_portid(void) __attribute__((deprecated)); +void *madrpc(ib_rpc_t *rpc, ib_portid_t *dport, void *payload, void *rcvdata) + __attribute__((deprecated)); +void *madrpc_rmpp(ib_rpc_t *rpc, ib_portid_t *dport, ib_rmpp_hdr_t *rmpp, + void *data) __attribute__((deprecated)); +void madrpc_init(char *dev_name, int dev_port, int *mgmt_classes, + int num_classes) __attribute__((deprecated)); +void madrpc_save_mad(void *madbuf, int len) __attribute__((deprecated)); + +/* smp.c */ +uint8_t *smp_query(void *buf, ib_portid_t *id, unsigned attrid, unsigned mod, + unsigned timeout) __attribute__((deprecated)); +uint8_t *smp_set(void *buf, ib_portid_t *id, unsigned attrid, unsigned mod, + unsigned timeout) __attribute__((deprecated)); + +/* smp.c new interface */ +uint8_t *smp_query_via(void *buf, ib_portid_t *id, unsigned attrid, + unsigned mod, unsigned timeout, + const struct ibmad_port *srcport); +uint8_t *smp_set_via(void *buf, ib_portid_t *id, unsigned attrid, unsigned mod, + unsigned timeout, const struct ibmad_port *srcport); +uint8_t *smp_query_status_via(void *rcvbuf, 
ib_portid_t *portid, + unsigned attrid, unsigned mod, unsigned timeout, + int *rstatus, const struct ibmad_port *srcport); +uint8_t *smp_set_status_via(void *data, ib_portid_t *portid, unsigned attrid, + unsigned mod, unsigned timeout, int *rstatus, + const struct ibmad_port *srcport); +void smp_mkey_set(struct ibmad_port *srcport, uint64_t mkey); +uint64_t smp_mkey_get(const struct ibmad_port *srcport); + +/* cc.c */ +void *cc_query_status_via(void *rcvbuf, ib_portid_t *portid, unsigned attrid, + unsigned mod, unsigned timeout, int *rstatus, + const struct ibmad_port *srcport, uint64_t cckey); + +void *cc_config_status_via(void *payload, void *rcvbuf, ib_portid_t *portid, + unsigned attrid, unsigned mod, unsigned timeout, + int *rstatus, const struct ibmad_port *srcport, + uint64_t cckey); + +/* sa.c */ +uint8_t *sa_call(void *rcvbuf, ib_portid_t *portid, ib_sa_call_t *sa, + unsigned timeout) __attribute__((deprecated)); +int ib_path_query(ibmad_gid_t srcgid, ibmad_gid_t destgid, ib_portid_t *sm_id, + void *buf) __attribute__((deprecated)); + +/* sa.c new interface */ +uint8_t *sa_rpc_call(const struct ibmad_port *srcport, void *rcvbuf, + ib_portid_t *portid, ib_sa_call_t *sa, unsigned timeout); +int ib_path_query_via(const struct ibmad_port *srcport, ibmad_gid_t srcgid, + ibmad_gid_t destgid, ib_portid_t *sm_id, void *buf); +/* returns lid */ +int ib_node_query_via(const struct ibmad_port *srcport, uint64_t guid, + ib_portid_t *sm_id, void *buf); + +/* resolve.c */ +int ib_resolve_smlid(ib_portid_t *sm_id, int timeout) + __attribute__((deprecated)); +int ib_resolve_portid_str(ib_portid_t *portid, char *addr_str, + enum MAD_DEST dest, ib_portid_t *sm_id) + __attribute__((deprecated)); +int ib_resolve_self(ib_portid_t *portid, int *portnum, ibmad_gid_t *gid) + __attribute__((deprecated)); + +/* resolve.c new interface */ +int ib_resolve_smlid_via(ib_portid_t *sm_id, int timeout, + const struct ibmad_port *srcport); +int ib_resolve_guid_via(ib_portid_t *portid, uint64_t *guid, ib_portid_t *sm_id, + int timeout, const struct ibmad_port *srcport); +int ib_resolve_gid_via(ib_portid_t *portid, ibmad_gid_t gid, ib_portid_t *sm_id, + int timeout, const struct ibmad_port *srcport); +int ib_resolve_portid_str_via(ib_portid_t *portid, char *addr_str, + enum MAD_DEST dest, ib_portid_t *sm_id, + const struct ibmad_port *srcport); +int ib_resolve_self_via(ib_portid_t *portid, int *portnum, ibmad_gid_t *gid, + const struct ibmad_port *srcport); + +/* gs.c new interface */ +uint8_t *pma_query_via(void *rcvbuf, ib_portid_t *dest, int port, + unsigned timeout, unsigned id, + const struct ibmad_port *srcport); +uint8_t *performance_reset_via(void *rcvbuf, ib_portid_t *dest, int port, + unsigned mask, unsigned timeout, unsigned id, + const struct ibmad_port *srcport); + +/* bm.c */ +uint8_t *bm_call_via(void *data, ib_portid_t *portid, ib_bm_call_t *call, + struct ibmad_port *srcport); + +/* dump.c */ +ib_mad_dump_fn mad_dump_int, mad_dump_uint, mad_dump_hex, mad_dump_rhex, + mad_dump_bitfield, mad_dump_array, mad_dump_string, mad_dump_linkwidth, + mad_dump_linkwidthsup, mad_dump_linkwidthen, mad_dump_linkdowndefstate, + mad_dump_linkspeed, mad_dump_linkspeedsup, mad_dump_linkspeeden, + mad_dump_linkspeedext, mad_dump_linkspeedextsup, + mad_dump_linkspeedexten, mad_dump_portstate, mad_dump_portstates, + mad_dump_physportstate, mad_dump_portcapmask, mad_dump_portcapmask2, + mad_dump_mtu, mad_dump_vlcap, mad_dump_opervls, mad_dump_node_type, + mad_dump_sltovl, mad_dump_vlarbitration, mad_dump_nodedesc, + 
mad_dump_nodeinfo, mad_dump_portinfo, mad_dump_switchinfo, + mad_dump_perfcounters, mad_dump_perfcounters_ext, + mad_dump_perfcounters_xmt_sl, mad_dump_perfcounters_rcv_sl, + mad_dump_perfcounters_xmt_disc, mad_dump_perfcounters_rcv_err, + mad_dump_portsamples_control, mad_dump_port_ext_speeds_counters, + mad_dump_perfcounters_port_op_rcv_counters, + mad_dump_perfcounters_port_flow_ctl_counters, + mad_dump_perfcounters_port_vl_op_packet, + mad_dump_perfcounters_port_vl_op_data, + mad_dump_perfcounters_port_vl_xmit_flow_ctl_update_errors, + mad_dump_perfcounters_port_vl_xmit_wait_counters, + mad_dump_perfcounters_sw_port_vl_congestion, + mad_dump_perfcounters_rcv_con_ctrl, mad_dump_perfcounters_sl_rcv_fecn, + mad_dump_perfcounters_sl_rcv_becn, mad_dump_perfcounters_xmit_con_ctrl, + mad_dump_perfcounters_vl_xmit_time_cong, mad_dump_mlnx_ext_port_info, + mad_dump_cc_congestioninfo, mad_dump_cc_congestionkeyinfo, + mad_dump_cc_congestionlog, mad_dump_cc_congestionlogswitch, + mad_dump_cc_congestionlogentryswitch, mad_dump_cc_congestionlogca, + mad_dump_cc_congestionlogentryca, mad_dump_cc_switchcongestionsetting, + mad_dump_cc_switchportcongestionsettingelement, + mad_dump_cc_cacongestionsetting, mad_dump_cc_cacongestionentry, + mad_dump_cc_congestioncontroltable, + mad_dump_cc_congestioncontroltableentry, mad_dump_cc_timestamp, + mad_dump_classportinfo, mad_dump_portsamples_result, + mad_dump_portinfo_ext, mad_dump_port_ext_speeds_counters_rsfec_active; + +void mad_dump_fields(char *buf, int bufsz, void *val, int valsz, int start, + int end); + +extern int ibdebug; + +#if __BYTE_ORDER == __LITTLE_ENDIAN +#ifndef ntohll +#define ntohll bswap_64 +#endif +#ifndef htonll +#define htonll bswap_64 +#endif +#elif __BYTE_ORDER == __BIG_ENDIAN +#ifndef ntohll +#define ntohll(x) (x) +#endif +#ifndef htonll +#define htonll(x) (x) +#endif +#endif /* __BYTE_ORDER == __BIG_ENDIAN */ + +/* Misc. macros: */ +/** align value \a l to \a size (ceil) */ +#define ALIGN(l, size) (((l) + ((size) - 1)) / (size) * (size)) + +/** printf style warning MACRO, includes name of function and pid */ +#define IBWARN(fmt, ...) fprintf(stderr, "ibwarn: [%d] %s: " fmt "\n", \ +(int)getpid(), __func__, ## __VA_ARGS__) + +#define IBDEBUG(fmt, ...) fprintf(stdout, "ibdebug: [%d] %s: " fmt "\n", \ +(int)getpid(), __func__, ## __VA_ARGS__) + +#define IBVERBOSE(fmt, ...) fprintf(stdout, "[%d] %s: " fmt "\n", \ +(int)getpid(), __func__, ## __VA_ARGS__) + +#define IBPANIC(fmt, ...) do { \ + fprintf(stderr, "ibpanic: [%d] %s: " fmt ": %m\n", \ + (int)getpid(), __func__, ## __VA_ARGS__); \ + exit(-1); \ +} while(0) + +void xdump(FILE *file, const char *msg, void *p, int size); + +#ifdef __cplusplus +} +#endif +#endif /* _MAD_H_ */ diff --git a/libibmad/mad_internal.h b/libibmad/mad_internal.h new file mode 100644 index 0000000..5deac27 --- /dev/null +++ b/libibmad/mad_internal.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef _MAD_INTERNAL_H_ +#define _MAD_INTERNAL_H_ + +#define MAX_CLASS 256 + +struct ibmad_port { + int port_id; /* file descriptor returned by umad_open() */ + int class_agents[MAX_CLASS]; /* class2agent mapper */ + int timeout, retries; + uint64_t smp_mkey; +}; + +extern struct ibmad_port *ibmp; +extern int madrpc_timeout; +extern int madrpc_retries; + +#endif /* _MAD_INTERNAL_H_ */ diff --git a/libibmad/mad_osd.h b/libibmad/mad_osd.h new file mode 100644 index 0000000..061001b --- /dev/null +++ b/libibmad/mad_osd.h @@ -0,0 +1 @@ +#warning "This header is obsolete." diff --git a/libibmad/portid.c b/libibmad/portid.c new file mode 100644 index 0000000..ea02ca9 --- /dev/null +++ b/libibmad/portid.c @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <arpa/inet.h> + +#include <infiniband/mad.h> + +#undef DEBUG +#define DEBUG if (ibdebug) IBWARN + +int portid2portnum(ib_portid_t * portid) +{ + if (portid->lid > 0) + return -1; + + if (portid->drpath.cnt == 0) + return 0; + + return portid->drpath.p[(portid->drpath.cnt - 1)]; +} + +char *portid2str(ib_portid_t * portid) +{ + static char buf[1024] = "local"; + int n = 0; + + if (portid->lid > 0) { + n += sprintf(buf + n, "Lid %d", portid->lid); + if (portid->grh_present) { + char gid[sizeof + "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"]; + if (inet_ntop(AF_INET6, portid->gid, gid, sizeof(gid))) + n += sprintf(buf + n, " Gid %s", gid); + } + if (portid->drpath.cnt) + n += sprintf(buf + n, " "); + else + return buf; + } + n += sprintf(buf + n, "DR path "); + drpath2str(&(portid->drpath), buf + n, sizeof(buf) - n); + + return buf; +} + +int str2drpath(ib_dr_path_t * path, char *routepath, int drslid, int drdlid) +{ + char *s, *str; + char *tmp; + + path->cnt = -1; + + if (!routepath || !(tmp = strdup(routepath))) + goto Exit; + + DEBUG("DR str: %s", routepath); + + str = tmp; + + while (str && *str) { + if ((s = strchr(str, ','))) + *s = 0; + path->p[++path->cnt] = (uint8_t) atoi(str); + if (!s) + break; + str = s + 1; + } + free(tmp); + +Exit: + path->drdlid = drdlid ? drdlid : 0xffff; + path->drslid = drslid ? drslid : 0xffff; + + return path->cnt; +} + +char *drpath2str(ib_dr_path_t * path, char *dstr, size_t dstr_size) +{ + int i = 0; + int rc = snprintf(dstr, dstr_size, "slid %u; dlid %u; %d", + path->drslid, path->drdlid, path->p[0]); + if (rc >= (int)dstr_size) + return dstr; + for (i = 1; i <= path->cnt; i++) { + rc += snprintf(dstr + rc, dstr_size - rc, ",%d", path->p[i]); + if (rc >= (int)dstr_size) + break; + } + return (dstr); +} diff --git a/libibmad/register.c b/libibmad/register.c new file mode 100644 index 0000000..854e462 --- /dev/null +++ b/libibmad/register.c @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> + +#include <infiniband/umad.h> +#include <infiniband/mad.h> + +#include "mad_internal.h" + +#undef DEBUG +#define DEBUG if (ibdebug) IBWARN + +static int mgmt_class_vers(int mgmt_class) +{ + if ((mgmt_class >= IB_VENDOR_RANGE1_START_CLASS && + mgmt_class <= IB_VENDOR_RANGE1_END_CLASS) || + (mgmt_class >= IB_VENDOR_RANGE2_START_CLASS && + mgmt_class <= IB_VENDOR_RANGE2_END_CLASS)) + return 1; + + switch (mgmt_class) { + case IB_SMI_CLASS: + case IB_SMI_DIRECT_CLASS: + return 1; + case IB_SA_CLASS: + return 2; + case IB_PERFORMANCE_CLASS: + return 1; + case IB_DEVICE_MGMT_CLASS: + return 1; + case IB_CC_CLASS: + return 2; + case IB_BOARD_MGMT_CLASS: + return 1; + } + + return 0; +} + +int mad_class_agent(int mgmt) +{ + if (mgmt < 1 || mgmt >= MAX_CLASS) + return -1; + return ibmp->class_agents[mgmt]; +} + +int mad_register_port_client(int port_id, int mgmt, uint8_t rmpp_version) +{ + int vers, agent; + + if ((vers = mgmt_class_vers(mgmt)) <= 0) { + DEBUG("Unknown class %d mgmt_class", mgmt); + return -1; + } + + agent = umad_register(port_id, mgmt, vers, rmpp_version, NULL); + if (agent < 0) + DEBUG("Can't register agent for class %d", mgmt); + + return agent; +} + +int mad_register_client(int mgmt, uint8_t rmpp_version) +{ + return mad_register_client_via(mgmt, rmpp_version, ibmp); +} + +int mad_register_client_via(int mgmt, uint8_t rmpp_version, + struct ibmad_port *srcport) +{ + int agent; + + if (!srcport) + return -1; + + agent = mad_register_port_client(mad_rpc_portid(srcport), mgmt, + rmpp_version); + if (agent < 0) + return agent; + + srcport->class_agents[mgmt] = agent; + return 0; +} + +int mad_register_server(int mgmt, uint8_t rmpp_version, + long method_mask[], uint32_t class_oui) +{ + return mad_register_server_via(mgmt, rmpp_version, method_mask, + class_oui, ibmp); +} + +int mad_register_server_via(int mgmt, uint8_t rmpp_version, + long method_mask[], uint32_t class_oui, + struct ibmad_port *srcport) +{ + long class_method_mask[16 / sizeof(long)]; + uint8_t oui[3]; + int agent, vers; + + if (method_mask) + memcpy(class_method_mask, method_mask, + sizeof class_method_mask); + else + memset(class_method_mask, 0xff, sizeof(class_method_mask)); + + if (!srcport) + return -1; + + if (srcport->class_agents[mgmt] >= 0) { + DEBUG("Class 0x%x already registered %d", + mgmt, srcport->class_agents[mgmt]); + return -1; + } + if ((vers = mgmt_class_vers(mgmt)) <= 0) { + DEBUG("Unknown class 0x%x mgmt_class", mgmt); + return -1; + } + if (mgmt >= IB_VENDOR_RANGE2_START_CLASS && + mgmt <= IB_VENDOR_RANGE2_END_CLASS) { + oui[0] = (class_oui >> 16) & 0xff; + oui[1] = (class_oui >> 8) & 0xff; + oui[2] = class_oui & 0xff; + if ((agent = + umad_register_oui(srcport->port_id, mgmt, rmpp_version, + oui, class_method_mask)) < 0) { + DEBUG("Can't register agent for class %d", mgmt); + return -1; + } + } else + if ((agent = + umad_register(srcport->port_id, mgmt, vers, rmpp_version, + class_method_mask)) < 0) { + DEBUG("Can't register agent for class %d", mgmt); + return -1; + } + + srcport->class_agents[mgmt] = agent; + + return agent; +} diff --git a/libibmad/resolve.c b/libibmad/resolve.c new file mode 100644 index 0000000..2c397eb --- /dev/null +++ b/libibmad/resolve.c @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * Copyright (c) 2009 HNR Consulting. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <arpa/inet.h> +#include <errno.h> + +#include <infiniband/umad.h> +#include <infiniband/mad.h> +#include "mad_internal.h" + +#undef DEBUG +#define DEBUG if (ibdebug) IBWARN + +int ib_resolve_smlid_via(ib_portid_t * sm_id, int timeout, + const struct ibmad_port *srcport) +{ + ib_portid_t self = { 0 }; + uint8_t portinfo[64]; + int lid; + + memset(sm_id, 0, sizeof(*sm_id)); + + if (!smp_query_via(portinfo, &self, IB_ATTR_PORT_INFO, 0, 0, srcport)) + return -1; + + mad_decode_field(portinfo, IB_PORT_SMLID_F, &lid); + if (!IB_LID_VALID(lid)) { + errno = ENXIO; + return -1; + } + mad_decode_field(portinfo, IB_PORT_SMSL_F, &sm_id->sl); + + return ib_portid_set(sm_id, lid, 0, 0); +} + +int ib_resolve_smlid(ib_portid_t * sm_id, int timeout) +{ + return ib_resolve_smlid_via(sm_id, timeout, ibmp); +} + +int ib_resolve_gid_via(ib_portid_t * portid, ibmad_gid_t gid, + ib_portid_t * sm_id, int timeout, + const struct ibmad_port *srcport) +{ + ib_portid_t sm_portid = { 0 }; + char buf[IB_SA_DATA_SIZE] = { 0 }; + + if (!sm_id) + sm_id = &sm_portid; + + if (!IB_LID_VALID(sm_id->lid)) { + if (ib_resolve_smlid_via(sm_id, timeout, srcport) < 0) + return -1; + } + + if ((portid->lid = + ib_path_query_via(srcport, gid, gid, sm_id, buf)) < 0) + return -1; + + return 0; +} + +int ib_resolve_guid_via(ib_portid_t * portid, uint64_t * guid, + ib_portid_t * sm_id, int timeout, + const struct ibmad_port *srcport) +{ + ib_portid_t sm_portid = { 0 }; + uint8_t buf[IB_SA_DATA_SIZE] = { 0 }; + ib_portid_t self = { 0 }; + uint64_t selfguid, prefix; + ibmad_gid_t selfgid; + uint8_t nodeinfo[64]; + + if (!sm_id) + sm_id = &sm_portid; + + if (!IB_LID_VALID(sm_id->lid)) { + if (ib_resolve_smlid_via(sm_id, timeout, srcport) < 0) + return -1; + } + + if (!smp_query_via(nodeinfo, &self, IB_ATTR_NODE_INFO, 0, 0, srcport)) + return -1; + mad_decode_field(nodeinfo, IB_NODE_PORT_GUID_F, &selfguid); + mad_set_field64(selfgid, 0, IB_GID_PREFIX_F, IB_DEFAULT_SUBN_PREFIX); + mad_set_field64(selfgid, 0, IB_GID_GUID_F, selfguid); + + memcpy(&prefix, portid->gid, sizeof(prefix)); + if (!prefix) + mad_set_field64(portid->gid, 0, IB_GID_PREFIX_F, + IB_DEFAULT_SUBN_PREFIX); + if (guid) + mad_set_field64(portid->gid, 0, IB_GID_GUID_F, 
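/* the caller's GUID becomes the low 64 bits of the GID */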
*guid); + + if ((portid->lid = + ib_path_query_via(srcport, selfgid, portid->gid, sm_id, buf)) < 0) + return -1; + + mad_decode_field(buf, IB_SA_PR_SL_F, &portid->sl); + return 0; +} + +int ib_resolve_portid_str_via(ib_portid_t * portid, char *addr_str, + enum MAD_DEST dest_type, ib_portid_t * sm_id, + const struct ibmad_port *srcport) +{ + ibmad_gid_t gid; + uint64_t guid; + int lid; + char *routepath; + ib_portid_t selfportid = { 0 }; + int selfport = 0; + + memset(portid, 0, sizeof *portid); + + switch (dest_type) { + case IB_DEST_LID: + lid = strtol(addr_str, NULL, 0); + if (!IB_LID_VALID(lid)) { + errno = EINVAL; + return -1; + } + return ib_portid_set(portid, lid, 0, 0); + + case IB_DEST_DRPATH: + if (str2drpath(&portid->drpath, addr_str, 0, 0) < 0) { + errno = EINVAL; + return -1; + } + return 0; + + case IB_DEST_GUID: + if (!(guid = strtoull(addr_str, NULL, 0))) { + errno = EINVAL; + return -1; + } + + /* keep guid in portid? */ + return ib_resolve_guid_via(portid, &guid, sm_id, 0, srcport); + + case IB_DEST_DRSLID: + lid = strtol(addr_str, &routepath, 0); + routepath++; + if (!IB_LID_VALID(lid)) { + errno = EINVAL; + return -1; + } + ib_portid_set(portid, lid, 0, 0); + + /* handle DR parsing and set DrSLID to local lid */ + if (ib_resolve_self_via(&selfportid, &selfport, NULL, srcport) < 0) + return -1; + if (str2drpath(&portid->drpath, routepath, selfportid.lid, 0) < + 0) { + errno = EINVAL; + return -1; + } + return 0; + + case IB_DEST_GID: + if (inet_pton(AF_INET6, addr_str, &gid) <= 0) + return -1; + return ib_resolve_gid_via(portid, gid, sm_id, 0, srcport); + default: + IBWARN("bad dest_type %d", dest_type); + errno = EINVAL; + } + + return -1; +} + +int ib_resolve_portid_str(ib_portid_t * portid, char *addr_str, + enum MAD_DEST dest_type, ib_portid_t * sm_id) +{ + return ib_resolve_portid_str_via(portid, addr_str, dest_type, + sm_id, ibmp); +} + +int ib_resolve_self_via(ib_portid_t * portid, int *portnum, ibmad_gid_t * gid, + const struct ibmad_port *srcport) +{ + ib_portid_t self = { 0 }; + uint8_t portinfo[64]; + uint8_t nodeinfo[64]; + uint64_t guid, prefix; + + if (!smp_query_via(nodeinfo, &self, IB_ATTR_NODE_INFO, 0, 0, srcport)) + return -1; + + if (!smp_query_via(portinfo, &self, IB_ATTR_PORT_INFO, 0, 0, srcport)) + return -1; + + mad_decode_field(portinfo, IB_PORT_LID_F, &portid->lid); + mad_decode_field(portinfo, IB_PORT_SMSL_F, &portid->sl); + mad_decode_field(portinfo, IB_PORT_GID_PREFIX_F, &prefix); + mad_decode_field(nodeinfo, IB_NODE_PORT_GUID_F, &guid); + + if (portnum) + mad_decode_field(nodeinfo, IB_NODE_LOCAL_PORT_F, portnum); + if (gid) { + mad_encode_field(*gid, IB_GID_PREFIX_F, &prefix); + mad_encode_field(*gid, IB_GID_GUID_F, &guid); + } + return 0; +} + +int ib_resolve_self(ib_portid_t * portid, int *portnum, ibmad_gid_t * gid) +{ + return ib_resolve_self_via(portid, portnum, gid, ibmp); +} diff --git a/libibmad/rpc.c b/libibmad/rpc.c new file mode 100644 index 0000000..9e3d88e --- /dev/null +++ b/libibmad/rpc.c @@ -0,0 +1,432 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * Copyright (c) 2009 HNR Consulting. All rights reserved. + * Copyright (c) 2011 Mellanox Technologies LTD. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> + +#include <infiniband/umad.h> +#include <infiniband/mad.h> + +#include "mad_internal.h" + +int ibdebug; + +static struct ibmad_port mad_port; +struct ibmad_port *ibmp = &mad_port; + +static int iberrs; + +int madrpc_retries = MAD_DEF_RETRIES; +int madrpc_timeout = MAD_DEF_TIMEOUT_MS; + +static void *save_mad; +static int save_mad_len = 256; + +#undef DEBUG +#define DEBUG if (ibdebug) IBWARN +#define ERRS(fmt, ...) do { \ + if (iberrs || ibdebug) \ + IBWARN(fmt, ## __VA_ARGS__); \ +} while (0) + +#define MAD_TID(mad) (*((uint64_t *)((char *)(mad) + 8))) + +void madrpc_show_errors(int set) +{ + iberrs = set; +} + +void madrpc_save_mad(void *madbuf, int len) +{ + save_mad = madbuf; + save_mad_len = len; +} + +int madrpc_set_retries(int retries) +{ + if (retries > 0) + madrpc_retries = retries; + return madrpc_retries; +} + +int madrpc_set_timeout(int timeout) +{ + madrpc_timeout = timeout; + return 0; +} + +void mad_rpc_set_retries(struct ibmad_port *port, int retries) +{ + port->retries = retries; +} + +void mad_rpc_set_timeout(struct ibmad_port *port, int timeout) +{ + port->timeout = timeout; +} + +int madrpc_portid(void) +{ + return ibmp->port_id; +} + +int mad_rpc_portid(struct ibmad_port *srcport) +{ + return srcport->port_id; +} + +int mad_rpc_class_agent(struct ibmad_port *port, int class) +{ + if (class < 1 || class >= MAX_CLASS) + return -1; + return port->class_agents[class]; +} + +static int +_do_madrpc(int port_id, void *sndbuf, void *rcvbuf, int agentid, int len, + int timeout, int max_retries, int *p_error) +{ + uint32_t trid; /* only low 32 bits - see mad_trid() */ + int retries; + int length, status; + + if (ibdebug > 1) { + IBWARN(">>> sending: len %d pktsz %zu", len, umad_size() + len); + xdump(stderr, "send buf\n", sndbuf, umad_size() + len); + } + + if (save_mad) { + memcpy(save_mad, umad_get_mad(sndbuf), + save_mad_len < len ? 
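/* cap the copy at the caller-provided save buffer size */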
save_mad_len : len); + save_mad = NULL; + } + + if (max_retries <= 0) { + errno = EINVAL; + *p_error = EINVAL; + ERRS("max_retries %d <= 0", max_retries); + return -1; + } + + trid = + (uint32_t) mad_get_field64(umad_get_mad(sndbuf), 0, IB_MAD_TRID_F); + + for (retries = 0; retries < max_retries; retries++) { + if (retries) + ERRS("retry %d (timeout %d ms)", retries, timeout); + + length = len; + if (umad_send(port_id, agentid, sndbuf, length, timeout, 0) < 0) { + IBWARN("send failed; %s", strerror(errno)); + return -1; + } + + /* Use same timeout on receive side just in case */ + /* send packet is lost somewhere. */ + do { + length = len; + if (umad_recv(port_id, rcvbuf, &length, timeout) < 0) { + IBWARN("recv failed: %s", strerror(errno)); + return -1; + } + + if (ibdebug > 2) + umad_addr_dump(umad_get_mad_addr(rcvbuf)); + if (ibdebug > 1) { + IBWARN("rcv buf:"); + xdump(stderr, "rcv buf\n", umad_get_mad(rcvbuf), + IB_MAD_SIZE); + } + } while ((uint32_t) + mad_get_field64(umad_get_mad(rcvbuf), 0, + IB_MAD_TRID_F) != trid); + + status = umad_status(rcvbuf); + if (!status) + return length; /* done */ + if (status == ENOMEM) + return length; + } + + errno = status; + *p_error = ETIMEDOUT; + ERRS("timeout after %d retries, %d ms", retries, timeout * retries); + return -1; +} + +static int redirect_port(ib_portid_t * port, uint8_t * mad) +{ + port->lid = mad_get_field(mad, 64, IB_CPI_REDIRECT_LID_F); + if (!port->lid) { + IBWARN("GID-based redirection is not supported"); + return -1; + } + + port->qp = mad_get_field(mad, 64, IB_CPI_REDIRECT_QP_F); + port->qkey = mad_get_field(mad, 64, IB_CPI_REDIRECT_QKEY_F); + port->sl = (uint8_t) mad_get_field(mad, 64, IB_CPI_REDIRECT_SL_F); + + /* TODO: Reverse map redirection P_Key to P_Key index */ + + if (ibdebug) + IBWARN("redirected to lid %d, qp 0x%x, qkey 0x%x, sl 0x%x", + port->lid, port->qp, port->qkey, port->sl); + + return 0; +} + +void *mad_rpc(const struct ibmad_port *port, ib_rpc_t * rpc, + ib_portid_t * dport, void *payload, void *rcvdata) +{ + int status, len; + uint8_t sndbuf[1024], rcvbuf[1024], *mad; + ib_rpc_v1_t *rpcv1 = (ib_rpc_v1_t *)rpc; + int error = 0; + + if ((rpc->mgtclass & IB_MAD_RPC_VERSION_MASK) == IB_MAD_RPC_VERSION1) + rpcv1->error = 0; + do { + len = 0; + memset(sndbuf, 0, umad_size() + IB_MAD_SIZE); + + if ((len = mad_build_pkt(sndbuf, rpc, dport, NULL, payload)) < 0) + return NULL; + + if ((len = _do_madrpc(port->port_id, sndbuf, rcvbuf, + port->class_agents[rpc->mgtclass & 0xff], + len, mad_get_timeout(port, rpc->timeout), + mad_get_retries(port), &error)) < 0) { + if ((rpc->mgtclass & IB_MAD_RPC_VERSION_MASK) == + IB_MAD_RPC_VERSION1) + rpcv1->error = error; + IBWARN("_do_madrpc failed; dport (%s)", + portid2str(dport)); + return NULL; + } + + mad = umad_get_mad(rcvbuf); + status = mad_get_field(mad, 0, IB_DRSMP_STATUS_F); + + /* check for exact match instead of only the redirect bit; + * that way, weird statuses cause an error, too */ + if (status == IB_MAD_STS_REDIRECT) { + /* update dport for next request and retry */ + /* bail if redirection fails */ + if (redirect_port(dport, mad)) + break; + } else + break; + } while (1); + + if ((rpc->mgtclass & IB_MAD_RPC_VERSION_MASK) == IB_MAD_RPC_VERSION1) + rpcv1->error = error; + rpc->rstatus = status; + + if (status != 0) { + ERRS("MAD completed with error status 0x%x; dport (%s)", + status, portid2str(dport)); + errno = EIO; + return NULL; + } + + if (rcvdata) + memcpy(rcvdata, mad + rpc->dataoffs, rpc->datasz); + + return rcvdata; +} + +void *mad_rpc_rmpp(const struct 
ibmad_port *port, ib_rpc_t * rpc, + ib_portid_t * dport, ib_rmpp_hdr_t * rmpp, void *data) +{ + int status, len; + uint8_t sndbuf[1024], rcvbuf[1024], *mad; + ib_rpc_v1_t *rpcv1 = (ib_rpc_v1_t *)rpc; + int error = 0; + + memset(sndbuf, 0, umad_size() + IB_MAD_SIZE); + + DEBUG("rmpp %p data %p", rmpp, data); + + if ((rpc->mgtclass & IB_MAD_RPC_VERSION_MASK) == IB_MAD_RPC_VERSION1) + rpcv1->error = 0; + if ((len = mad_build_pkt(sndbuf, rpc, dport, rmpp, data)) < 0) + return NULL; + + if ((len = _do_madrpc(port->port_id, sndbuf, rcvbuf, + port->class_agents[rpc->mgtclass & 0xff], + len, mad_get_timeout(port, rpc->timeout), + mad_get_retries(port), &error)) < 0) { + if ((rpc->mgtclass & IB_MAD_RPC_VERSION_MASK) == IB_MAD_RPC_VERSION1) + rpcv1->error = error; + IBWARN("_do_madrpc failed; dport (%s)", portid2str(dport)); + return NULL; + } + + if ((rpc->mgtclass & IB_MAD_RPC_VERSION_MASK) == IB_MAD_RPC_VERSION1) + rpcv1->error = error; + + mad = umad_get_mad(rcvbuf); + + if ((status = mad_get_field(mad, 0, IB_MAD_STATUS_F)) != 0) { + ERRS("MAD completed with error status 0x%x; dport (%s)", + status, portid2str(dport)); + errno = EIO; + return NULL; + } + + if (rmpp) { + rmpp->flags = mad_get_field(mad, 0, IB_SA_RMPP_FLAGS_F); + if ((rmpp->flags & 0x3) && + mad_get_field(mad, 0, IB_SA_RMPP_VERS_F) != 1) { + IBWARN("bad rmpp version"); + return NULL; + } + rmpp->type = mad_get_field(mad, 0, IB_SA_RMPP_TYPE_F); + rmpp->status = mad_get_field(mad, 0, IB_SA_RMPP_STATUS_F); + DEBUG("rmpp type %d status %d", rmpp->type, rmpp->status); + rmpp->d1.u = mad_get_field(mad, 0, IB_SA_RMPP_D1_F); + rmpp->d2.u = mad_get_field(mad, 0, IB_SA_RMPP_D2_F); + } + + if (data) + memcpy(data, mad + rpc->dataoffs, rpc->datasz); + + rpc->recsz = mad_get_field(mad, 0, IB_SA_ATTROFFS_F); + + return data; +} + +void *madrpc(ib_rpc_t * rpc, ib_portid_t * dport, void *payload, void *rcvdata) +{ + return mad_rpc(ibmp, rpc, dport, payload, rcvdata); +} + +void *madrpc_rmpp(ib_rpc_t * rpc, ib_portid_t * dport, ib_rmpp_hdr_t * rmpp, + void *data) +{ + return mad_rpc_rmpp(ibmp, rpc, dport, rmpp, data); +} + +void +madrpc_init(char *dev_name, int dev_port, int *mgmt_classes, int num_classes) +{ + int fd; + + if (umad_init() < 0) + IBPANIC("can't init UMAD library"); + + if ((fd = umad_open_port(dev_name, dev_port)) < 0) + IBPANIC("can't open UMAD port (%s:%d)", + dev_name ? 
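/* dev_name may be NULL, meaning the default device was requested */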
dev_name : "(nil)", dev_port); + + if (num_classes >= MAX_CLASS) + IBPANIC("too many classes %d requested", num_classes); + + ibmp->port_id = fd; + memset(ibmp->class_agents, 0xff, sizeof ibmp->class_agents); + while (num_classes--) { + uint8_t rmpp_version = 0; + int mgmt = *mgmt_classes++; + + if (mgmt == IB_SA_CLASS) + rmpp_version = 1; + if (mad_register_client_via(mgmt, rmpp_version, ibmp) < 0) + IBPANIC("client_register for mgmt class %d failed", + mgmt); + } +} + +struct ibmad_port *mad_rpc_open_port(char *dev_name, int dev_port, + int *mgmt_classes, int num_classes) +{ + struct ibmad_port *p; + int port_id; + + if (num_classes >= MAX_CLASS) { + IBWARN("too many classes %d requested", num_classes); + errno = EINVAL; + return NULL; + } + + if (umad_init() < 0) { + IBWARN("can't init UMAD library"); + errno = ENODEV; + return NULL; + } + + p = malloc(sizeof(*p)); + if (!p) { + errno = ENOMEM; + return NULL; + } + memset(p, 0, sizeof(*p)); + + if ((port_id = umad_open_port(dev_name, dev_port)) < 0) { + IBWARN("can't open UMAD port (%s:%d)", dev_name, dev_port); + if (!errno) + errno = EIO; + free(p); + return NULL; + } + + p->port_id = port_id; + memset(p->class_agents, 0xff, sizeof p->class_agents); + while (num_classes--) { + uint8_t rmpp_version = 0; + int mgmt = *mgmt_classes++; + + if (mgmt == IB_SA_CLASS) + rmpp_version = 1; + if (mgmt < 0 || mgmt >= MAX_CLASS || + mad_register_client_via(mgmt, rmpp_version, p) < 0) { + IBWARN("client_register for mgmt %d failed", mgmt); + if (!errno) + errno = EINVAL; + umad_close_port(port_id); + free(p); + return NULL; + } + } + + return p; +} + +void mad_rpc_close_port(struct ibmad_port *port) +{ + umad_close_port(port->port_id); + free(port); +} diff --git a/libibmad/sa.c b/libibmad/sa.c new file mode 100644 index 0000000..9dfec93 --- /dev/null +++ b/libibmad/sa.c @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <infiniband/mad.h> +#include "mad_internal.h" + +#undef DEBUG +#define DEBUG if (ibdebug) IBWARN + +uint8_t *sa_rpc_call(const struct ibmad_port *ibmad_port, void *rcvbuf, + ib_portid_t * portid, ib_sa_call_t * sa, unsigned timeout) +{ + ib_rpc_t rpc = { 0 }; + uint8_t *p; + + DEBUG("attr 0x%x mod 0x%x route %s", sa->attrid, sa->mod, + portid2str(portid)); + + if (portid->lid <= 0) { + IBWARN("only lid routes are supported"); + return NULL; + } + + rpc.mgtclass = IB_SA_CLASS; + rpc.method = sa->method; + rpc.attr.id = sa->attrid; + rpc.attr.mod = sa->mod; + rpc.mask = sa->mask; + rpc.timeout = timeout; + rpc.datasz = IB_SA_DATA_SIZE; + rpc.dataoffs = IB_SA_DATA_OFFS; + rpc.trid = sa->trid; + + portid->qp = 1; + if (!portid->qkey) + portid->qkey = IB_DEFAULT_QP1_QKEY; + + p = mad_rpc_rmpp(ibmad_port, &rpc, portid, NULL /*&sa->rmpp */ , rcvbuf); /* TODO: RMPP */ + + sa->recsz = rpc.recsz; + + return p; +} + +uint8_t *sa_call(void *rcvbuf, ib_portid_t * portid, ib_sa_call_t * sa, + unsigned timeout) +{ + return sa_rpc_call(ibmp, rcvbuf, portid, sa, timeout); +} + +/* PathRecord */ +#define IB_PR_COMPMASK_DGID (1ull<<2) +#define IB_PR_COMPMASK_SGID (1ull<<3) +#define IB_PR_COMPMASK_DLID (1ull<<4) +#define IB_PR_COMPMASK_SLID (1ull<<5) +#define IB_PR_COMPMASK_RAWTRAFIC (1ull<<6) +#define IB_PR_COMPMASK_RESV0 (1ull<<7) +#define IB_PR_COMPMASK_FLOWLABEL (1ull<<8) +#define IB_PR_COMPMASK_HOPLIMIT (1ull<<9) +#define IB_PR_COMPMASK_TCLASS (1ull<<10) +#define IB_PR_COMPMASK_REVERSIBLE (1ull<<11) +#define IB_PR_COMPMASK_NUMBPATH (1ull<<12) +#define IB_PR_COMPMASK_PKEY (1ull<<13) +#define IB_PR_COMPMASK_RESV1 (1ull<<14) +#define IB_PR_COMPMASK_SL (1ull<<15) +#define IB_PR_COMPMASK_MTUSELEC (1ull<<16) +#define IB_PR_COMPMASK_MTU (1ull<<17) +#define IB_PR_COMPMASK_RATESELEC (1ull<<18) +#define IB_PR_COMPMASK_RATE (1ull<<19) +#define IB_PR_COMPMASK_PKTLIFETIMESELEC (1ull<<20) +#define IB_PR_COMPMASK_PKTLIFETIME (1ull<<21) +#define IB_PR_COMPMASK_PREFERENCE (1ull<<22) + +#define IB_PR_DEF_MASK (IB_PR_COMPMASK_DGID |\ + IB_PR_COMPMASK_SGID) + +int ib_path_query_via(const struct ibmad_port *srcport, ibmad_gid_t srcgid, + ibmad_gid_t destgid, ib_portid_t * sm_id, void *buf) +{ + ib_sa_call_t sa = { 0 }; + uint8_t *p; + int dlid; + + memset(&sa, 0, sizeof sa); + sa.method = IB_MAD_METHOD_GET; + sa.attrid = IB_SA_ATTR_PATHRECORD; + sa.mask = IB_PR_DEF_MASK; + sa.trid = mad_trid(); + + memset(buf, 0, IB_SA_PR_RECSZ); + + mad_encode_field(buf, IB_SA_PR_DGID_F, destgid); + mad_encode_field(buf, IB_SA_PR_SGID_F, srcgid); + + p = sa_rpc_call(srcport, buf, sm_id, &sa, 0); + if (!p) { + IBWARN("sa call path_query failed"); + return -1; + } + + mad_decode_field(p, IB_SA_PR_DLID_F, &dlid); + return dlid; +} + +int ib_path_query(ibmad_gid_t srcgid, ibmad_gid_t destgid, ib_portid_t * sm_id, + void *buf) +{ + return ib_path_query_via(ibmp, srcgid, destgid, sm_id, buf); +} + +/* NodeRecord */ +#define IB_NR_COMPMASK_LID (1ull<<0) +#define IB_NR_COMPMASK_RESERVED1 (1ull<<1) +#define IB_NR_COMPMASK_BASEVERSION (1ull<<2) +#define IB_NR_COMPMASK_CLASSVERSION (1ull<<3) +#define IB_NR_COMPMASK_NODETYPE (1ull<<4) +#define IB_NR_COMPMASK_NUMPORTS (1ull<<5) +#define IB_NR_COMPMASK_SYSIMAGEGUID (1ull<<6) +#define IB_NR_COMPMASK_NODEGUID (1ull<<7) +#define IB_NR_COMPMASK_PORTGUID (1ull<<8) +#define IB_NR_COMPMASK_PARTCAP (1ull<<9) +#define IB_NR_COMPMASK_DEVID (1ull<<10) +#define IB_NR_COMPMASK_REV (1ull<<11) +#define IB_NR_COMPMASK_PORTNUM (1ull<<12) 
+#define IB_NR_COMPMASK_VENDID (1ull<<13) +#define IB_NR_COMPMASK_NODEDESC (1ull<<14) + +#define IB_NR_DEF_MASK IB_NR_COMPMASK_PORTGUID + +int ib_node_query_via(const struct ibmad_port *srcport, uint64_t guid, + ib_portid_t * sm_id, void *buf) +{ + ib_sa_call_t sa = { 0 }; + uint8_t *p; + + memset(&sa, 0, sizeof sa); + sa.method = IB_MAD_METHOD_GET; + sa.attrid = IB_SA_ATTR_NODERECORD; + sa.mask = IB_NR_DEF_MASK; + sa.trid = mad_trid(); + + memset(buf, 0, IB_SA_NR_RECSZ); + + mad_encode_field(buf, IB_SA_NR_PORT_GUID_F, &guid); + + p = sa_rpc_call(srcport, buf, sm_id, &sa, 0); + if (!p) { + IBWARN("sa call node_query failed"); + return -1; + } + + return 0; +} diff --git a/libibmad/serv.c b/libibmad/serv.c new file mode 100644 index 0000000..040bb62 --- /dev/null +++ b/libibmad/serv.c @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> + +#include <infiniband/umad.h> +#include <infiniband/mad.h> + +#include "mad_internal.h" + +#undef DEBUG +#define DEBUG if (ibdebug) IBWARN + +int mad_send(ib_rpc_t * rpc, ib_portid_t * dport, ib_rmpp_hdr_t * rmpp, + void *data) +{ + return mad_send_via(rpc, dport, rmpp, data, ibmp); +} + +int mad_send_via(ib_rpc_t * rpc, ib_portid_t * dport, ib_rmpp_hdr_t * rmpp, + void *data, struct ibmad_port *srcport) +{ + uint8_t pktbuf[1024]; + void *umad = pktbuf; + + memset(pktbuf, 0, umad_size() + IB_MAD_SIZE); + + DEBUG("rmpp %p data %p", rmpp, data); + + if (mad_build_pkt(umad, rpc, dport, rmpp, data) < 0) + return -1; + + if (ibdebug) { + IBWARN("data offs %d sz %d", rpc->dataoffs, rpc->datasz); + xdump(stderr, "mad send data\n", + (char *)umad_get_mad(umad) + rpc->dataoffs, rpc->datasz); + } + + if (umad_send(srcport->port_id, srcport->class_agents[rpc->mgtclass & 0xff], + umad, IB_MAD_SIZE, mad_get_timeout(srcport, rpc->timeout), + 0) < 0) { + IBWARN("send failed; %s", strerror(errno)); + return -1; + } + + return 0; +} + +int mad_respond(void *umad, ib_portid_t * portid, uint32_t rstatus) +{ + return mad_respond_via(umad, portid, rstatus, ibmp); +} + +int mad_respond_via(void *umad, ib_portid_t * portid, uint32_t rstatus, + struct ibmad_port *srcport) +{ + uint8_t *mad = umad_get_mad(umad); + ib_mad_addr_t *mad_addr; + ib_rpc_t rpc = { 0 }; + ib_portid_t rport; + int is_smi; + + if (!portid) { + if (!(mad_addr = umad_get_mad_addr(umad))) { + errno = EINVAL; + return -1; + } + + memset(&rport, 0, sizeof(rport)); + + rport.lid = ntohs(mad_addr->lid); + rport.qp = ntohl(mad_addr->qpn); + rport.qkey = ntohl(mad_addr->qkey); + rport.sl = mad_addr->sl; + + portid = &rport; + } + + DEBUG("dest %s", portid2str(portid)); + + rpc.mgtclass = mad_get_field(mad, 0, IB_MAD_MGMTCLASS_F); + + rpc.method = mad_get_field(mad, 0, IB_MAD_METHOD_F); + if (rpc.method == IB_MAD_METHOD_SET) + rpc.method = IB_MAD_METHOD_GET; + if (rpc.method != IB_MAD_METHOD_SEND) + rpc.method |= IB_MAD_RESPONSE; + + rpc.attr.id = mad_get_field(mad, 0, IB_MAD_ATTRID_F); + rpc.attr.mod = mad_get_field(mad, 0, IB_MAD_ATTRMOD_F); + if (rpc.mgtclass == IB_SA_CLASS) + rpc.recsz = mad_get_field(mad, 0, IB_SA_ATTROFFS_F); + if (mad_is_vendor_range2(rpc.mgtclass)) + rpc.oui = mad_get_field(mad, 0, IB_VEND2_OUI_F); + + rpc.trid = mad_get_field64(mad, 0, IB_MAD_TRID_F); + rpc.rstatus = rstatus; + + /* cleared by default: timeout, datasz, dataoffs, mkey, mask */ + + is_smi = rpc.mgtclass == IB_SMI_CLASS || + rpc.mgtclass == IB_SMI_DIRECT_CLASS; + + if (is_smi) + portid->qp = 0; + else if (!portid->qp) + portid->qp = 1; + + if (!portid->qkey && portid->qp == 1) + portid->qkey = IB_DEFAULT_QP1_QKEY; + + DEBUG + ("qp 0x%x class 0x%x method %d attr 0x%x mod 0x%x datasz %d off %d qkey %x", + portid->qp, rpc.mgtclass, rpc.method, rpc.attr.id, rpc.attr.mod, + rpc.datasz, rpc.dataoffs, portid->qkey); + + if (mad_build_pkt(umad, &rpc, portid, NULL, NULL) < 0) + return -1; + + if (ibdebug > 1) + xdump(stderr, "mad respond pkt\n", mad, IB_MAD_SIZE); + + if (umad_send + (srcport->port_id, srcport->class_agents[rpc.mgtclass], umad, + IB_MAD_SIZE, mad_get_timeout(srcport, rpc.timeout), 0) < 0) { + DEBUG("send failed; %s", strerror(errno)); + return -1; + } + + return 0; +} + +void *mad_receive(void *umad, int timeout) +{ + return mad_receive_via(umad, timeout, ibmp); +} + +void *mad_receive_via(void *umad, int timeout, struct ibmad_port *srcport) +{ + void *mad = 
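/* reuse the caller's buffer when provided, otherwise allocate a fresh umad */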
umad ? umad : umad_alloc(1, umad_size() + IB_MAD_SIZE); + int agent; + int length = IB_MAD_SIZE; + + if ((agent = umad_recv(srcport->port_id, mad, &length, + mad_get_timeout(srcport, timeout))) < 0) { + if (!umad) + umad_free(mad); + DEBUG("recv failed: %s", strerror(errno)); + return NULL; + } + + return mad; +} + +void *mad_alloc(void) +{ + return umad_alloc(1, umad_size() + IB_MAD_SIZE); +} + +void mad_free(void *umad) +{ + umad_free(umad); +} diff --git a/libibmad/smp.c b/libibmad/smp.c new file mode 100644 index 0000000..07d0ad2 --- /dev/null +++ b/libibmad/smp.c @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * Copyright (c) 2011 Mellanox Technologies LTD. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <infiniband/mad.h> +#include "mad_internal.h" + +#undef DEBUG +#define DEBUG if (ibdebug) IBWARN + +void smp_mkey_set(struct ibmad_port *srcport, uint64_t mkey) +{ + srcport->smp_mkey = mkey; +} + +uint64_t smp_mkey_get(const struct ibmad_port *srcport) +{ + return srcport->smp_mkey; +} + +uint8_t *smp_set_status_via(void *data, ib_portid_t * portid, unsigned attrid, + unsigned mod, unsigned timeout, int *rstatus, + const struct ibmad_port *srcport) +{ + ib_rpc_t rpc = { 0 }; + uint8_t *res; + + DEBUG("attr 0x%x mod 0x%x route %s", attrid, mod, portid2str(portid)); + if ((portid->lid <= 0) || + (portid->drpath.drslid == 0xffff) || + (portid->drpath.drdlid == 0xffff)) + rpc.mgtclass = IB_SMI_DIRECT_CLASS; /* direct SMI */ + else + rpc.mgtclass = IB_SMI_CLASS; /* Lid routed SMI */ + + rpc.method = IB_MAD_METHOD_SET; + rpc.attr.id = attrid; + rpc.attr.mod = mod; + rpc.timeout = timeout; + rpc.datasz = IB_SMP_DATA_SIZE; + rpc.dataoffs = IB_SMP_DATA_OFFS; + rpc.mkey = srcport->smp_mkey; + + portid->sl = 0; + portid->qp = 0; + + res = mad_rpc(srcport, &rpc, portid, data, data); + if (rstatus) + *rstatus = rpc.rstatus; + return res; +} + +uint8_t *smp_set_via(void *data, ib_portid_t * portid, unsigned attrid, + unsigned mod, unsigned timeout, + const struct ibmad_port *srcport) +{ + return smp_set_status_via(data, portid, attrid, mod, timeout, NULL, + srcport); +} + +uint8_t *smp_set(void *data, ib_portid_t * portid, unsigned attrid, + unsigned mod, unsigned timeout) +{ + return smp_set_via(data, portid, attrid, mod, timeout, ibmp); +} + +uint8_t *smp_query_status_via(void *rcvbuf, ib_portid_t * portid, + unsigned attrid, unsigned mod, unsigned timeout, + int *rstatus, const struct ibmad_port * srcport) +{ + ib_rpc_t rpc = { 0 }; + uint8_t *res; + + DEBUG("attr 0x%x mod 0x%x route %s", attrid, mod, portid2str(portid)); + rpc.method = IB_MAD_METHOD_GET; + rpc.attr.id = attrid; + rpc.attr.mod = mod; + rpc.timeout = timeout; + rpc.datasz = IB_SMP_DATA_SIZE; + rpc.dataoffs = IB_SMP_DATA_OFFS; + rpc.mkey = srcport->smp_mkey; + + if ((portid->lid <= 0) || + (portid->drpath.drslid == 0xffff) || + (portid->drpath.drdlid == 0xffff)) + rpc.mgtclass = IB_SMI_DIRECT_CLASS; /* direct SMI */ + else + rpc.mgtclass = IB_SMI_CLASS; /* Lid routed SMI */ + + portid->sl = 0; + portid->qp = 0; + + res = mad_rpc(srcport, &rpc, portid, rcvbuf, rcvbuf); + if (rstatus) + *rstatus = rpc.rstatus; + return res; +} + +uint8_t *smp_query_via(void *rcvbuf, ib_portid_t * portid, unsigned attrid, + unsigned mod, unsigned timeout, + const struct ibmad_port * srcport) +{ + return smp_query_status_via(rcvbuf, portid, attrid, mod, timeout, NULL, + srcport); +} + +uint8_t *smp_query(void *rcvbuf, ib_portid_t * portid, unsigned attrid, + unsigned mod, unsigned timeout) +{ + return smp_query_via(rcvbuf, portid, attrid, mod, timeout, ibmp); +} diff --git a/libibmad/vendor.c b/libibmad/vendor.c new file mode 100644 index 0000000..f96ba24 --- /dev/null +++ b/libibmad/vendor.c @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * Copyright (c) 2011 Mellanox Technologies LTD. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> + +#include <infiniband/mad.h> +#include "mad_internal.h" + +#undef DEBUG +#define DEBUG if (ibdebug) IBWARN + +static inline int response_expected(int method) +{ + return method == IB_MAD_METHOD_GET || + method == IB_MAD_METHOD_SET || method == IB_MAD_METHOD_TRAP; +} + +uint8_t *ib_vendor_call(void *data, ib_portid_t * portid, + ib_vendor_call_t * call) +{ + return ib_vendor_call_via(data, portid, call, ibmp); +} + +uint8_t *ib_vendor_call_via(void *data, ib_portid_t * portid, + ib_vendor_call_t * call, + struct ibmad_port * srcport) +{ + ib_rpc_v1_t rpc = { 0 }; + ib_rpc_t *rpcold = (ib_rpc_t *)(void *)&rpc; + int range1 = 0, resp_expected; + void *p_ret; + + DEBUG("route %s data %p", portid2str(portid), data); + if (portid->lid <= 0) + return NULL; /* no direct SMI */ + + if (!(range1 = mad_is_vendor_range1(call->mgmt_class)) && + !(mad_is_vendor_range2(call->mgmt_class))) + return NULL; + + resp_expected = response_expected(call->method); + + rpc.mgtclass = call->mgmt_class | IB_MAD_RPC_VERSION1; + + rpc.method = call->method; + rpc.attr.id = call->attrid; + rpc.attr.mod = call->mod; + rpc.timeout = resp_expected ? call->timeout : 0; + rpc.datasz = + range1 ? IB_VENDOR_RANGE1_DATA_SIZE : IB_VENDOR_RANGE2_DATA_SIZE; + rpc.dataoffs = + range1 ? IB_VENDOR_RANGE1_DATA_OFFS : IB_VENDOR_RANGE2_DATA_OFFS; + + if (!range1) + rpc.oui = call->oui; + + DEBUG + ("class 0x%x method 0x%x attr 0x%x mod 0x%x datasz %d off %d res_ex %d", + rpc.mgtclass, rpc.method, rpc.attr.id, rpc.attr.mod, rpc.datasz, + rpc.dataoffs, resp_expected); + + portid->qp = 1; + if (!portid->qkey) + portid->qkey = IB_DEFAULT_QP1_QKEY; + + if (resp_expected) { + p_ret = mad_rpc_rmpp(srcport, rpcold, portid, NULL, data); /* FIXME: no RMPP for now */ + errno = rpc.error; + return p_ret; + } + + return mad_send_via(rpcold, portid, NULL, data, srcport) < 0 ? 
NULL : data; /* FIXME: no RMPP for now */ +} diff --git a/libibnetdisc/CMakeLists.txt b/libibnetdisc/CMakeLists.txt new file mode 100644 index 0000000..e908bc2 --- /dev/null +++ b/libibnetdisc/CMakeLists.txt @@ -0,0 +1,24 @@ +publish_headers(infiniband + ibnetdisc.h + ibnetdisc_osd.h + ) + +rdma_library(ibnetdisc libibnetdisc.map + # See Documentation/versioning.md + 5 5.0.${PACKAGE_VERSION} + chassis.c + ibnetdisc.c + ibnetdisc_cache.c + query_smp.c + ) +target_link_libraries(ibnetdisc LINK_PRIVATE + ibmad + ibumad + ) +rdma_pkg_config("ibnetdisc" "libibumad libibmad" "") + +rdma_test_executable(testleaks tests/testleaks.c) +target_link_libraries(testleaks LINK_PRIVATE + ibmad + ibnetdisc +) diff --git a/libibnetdisc/chassis.c b/libibnetdisc/chassis.c new file mode 100644 index 0000000..a3ec1d8 --- /dev/null +++ b/libibnetdisc/chassis.c @@ -0,0 +1,1335 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * Copyright (c) 2007 Xsigo Systems Inc. All rights reserved. + * Copyright (c) 2008 Lawrence Livermore National Lab. All rights reserved. + * Copyright (c) 2010 HNR Consulting. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +/*========================================================*/ +/* FABRIC SCANNER SPECIFIC DATA */ +/*========================================================*/ + +#include <stdlib.h> +#include <inttypes.h> + +#include <infiniband/mad.h> + +#include "internal.h" +#include "chassis.h" + +static const char * const ChassisTypeStr[] = +{ "", "ISR9288", "ISR9096", "ISR2012", "ISR2004", "ISR4700", "ISR4200" }; +static const char * const ChassisSlotTypeStr[] = { "", "Line", "Spine", "SRBD" }; + +typedef struct chassis_scan { + ibnd_chassis_t *first_chassis; + ibnd_chassis_t *current_chassis; + ibnd_chassis_t *last_chassis; +} chassis_scan_t; + +const char *ibnd_get_chassis_type(ibnd_node_t * node) +{ + int chassis_type; + + if (!node) { + IBND_DEBUG("node parameter NULL\n"); + return NULL; + } + + if (!node->chassis) + return NULL; + + chassis_type = mad_get_field(node->info, 0, IB_NODE_VENDORID_F); + + switch (chassis_type) + { + case VTR_VENDOR_ID: /* Voltaire chassis */ + { + if (node->ch_type == UNRESOLVED_CT || node->ch_type > ISR4200_CT) + return NULL; + return ChassisTypeStr[node->ch_type]; + } + case MLX_VENDOR_ID: + { + if (node->ch_type_str[0] == '\0') + return NULL; + return node->ch_type_str; + } + default: + { + break; + } + } + return NULL; +} + +char *ibnd_get_chassis_slot_str(ibnd_node_t * node, char *str, size_t size) +{ + int vendor_id; + + if (!node) { + IBND_DEBUG("node parameter NULL\n"); + return NULL; + } + + /* Currently, only if Voltaire or Mellanox chassis */ + vendor_id = mad_get_field(node->info, 0,IB_NODE_VENDORID_F); + + if ((vendor_id != VTR_VENDOR_ID) && (vendor_id != MLX_VENDOR_ID)) + return NULL; + if (!node->chassis) + return NULL; + if (node->ch_slot == UNRESOLVED_CS || node->ch_slot > SRBD_CS) + return NULL; + if (!str) + return NULL; + snprintf(str, size, "%s %d Chip %d", ChassisSlotTypeStr[node->ch_slot], + node->ch_slotnum, node->ch_anafanum); + return str; +} + +static ibnd_chassis_t *find_chassisnum(ibnd_fabric_t * fabric, + unsigned char chassisnum) +{ + ibnd_chassis_t *current; + + for (current = fabric->chassis; current; current = current->next) + if (current->chassisnum == chassisnum) + return current; + + return NULL; +} + +static uint64_t topspin_chassisguid(uint64_t guid) +{ + /* Byte 3 in system image GUID is chassis type, and */ + /* Byte 4 is location ID (slot) so just mask off byte 4 */ + return guid & 0xffffffff00ffffffULL; +} + +int ibnd_is_xsigo_guid(uint64_t guid) +{ + if ((guid & 0xffffff0000000000ULL) == 0x0013970000000000ULL) + return 1; + else + return 0; +} + +static int is_xsigo_leafone(uint64_t guid) +{ + if ((guid & 0xffffffffff000000ULL) == 0x0013970102000000ULL) + return 1; + else + return 0; +} + +int ibnd_is_xsigo_hca(uint64_t guid) +{ + /* NodeType 2 is HCA */ + if ((guid & 0xffffffff00000000ULL) == 0x0013970200000000ULL) + return 1; + else + return 0; +} + +int ibnd_is_xsigo_tca(uint64_t guid) +{ + /* NodeType 3 is TCA */ + if ((guid & 0xffffffff00000000ULL) == 0x0013970300000000ULL) + return 1; + else + return 0; +} + +static int is_xsigo_ca(uint64_t guid) +{ + if (ibnd_is_xsigo_hca(guid) || ibnd_is_xsigo_tca(guid)) + return 1; + else + return 0; +} + +static int is_xsigo_switch(uint64_t guid) +{ + if ((guid & 0xffffffff00000000ULL) == 0x0013970100000000ULL) + return 1; + else + return 0; +} + +static uint64_t xsigo_chassisguid(ibnd_node_t * node) +{ + uint64_t sysimgguid = + mad_get_field64(node->info, 0, IB_NODE_SYSTEM_GUID_F); + uint64_t remote_sysimgguid; + + if (!is_xsigo_ca(sysimgguid)) { + /* Byte 3 is 
NodeType and byte 4 is PortType */ + /* If NodeType is 1 (switch), PortType is masked */ + if (is_xsigo_switch(sysimgguid)) + return sysimgguid & 0xffffffff00ffffffULL; + else + return sysimgguid; + } else { + if (!node->ports || !node->ports[1]) + return 0; + + /* Is there a peer port ? */ + if (!node->ports[1]->remoteport) + return sysimgguid; + + /* If peer port is Leaf 1, use its chassis GUID */ + remote_sysimgguid = + mad_get_field64(node->ports[1]->remoteport->node->info, 0, + IB_NODE_SYSTEM_GUID_F); + if (is_xsigo_leafone(remote_sysimgguid)) + return remote_sysimgguid & 0xffffffff00ffffffULL; + else + return sysimgguid; + } +} + +static uint64_t get_chassisguid(ibnd_node_t * node) +{ + uint32_t vendid = mad_get_field(node->info, 0, IB_NODE_VENDORID_F); + uint64_t sysimgguid = + mad_get_field64(node->info, 0, IB_NODE_SYSTEM_GUID_F); + + if (vendid == TS_VENDOR_ID || vendid == SS_VENDOR_ID) + return topspin_chassisguid(sysimgguid); + else if (vendid == XS_VENDOR_ID || ibnd_is_xsigo_guid(sysimgguid)) + return xsigo_chassisguid(node); + else + return sysimgguid; +} + +static ibnd_chassis_t *find_chassisguid(ibnd_fabric_t * fabric, + ibnd_node_t * node) +{ + ibnd_chassis_t *current; + uint64_t chguid; + + chguid = get_chassisguid(node); + for (current = fabric->chassis; current; current = current->next) + if (current->chassisguid == chguid) + return current; + + return NULL; +} + +uint64_t ibnd_get_chassis_guid(ibnd_fabric_t * fabric, unsigned char chassisnum) +{ + ibnd_chassis_t *chassis; + + if (!fabric) { + IBND_DEBUG("fabric parameter NULL\n"); + return 0; + } + + chassis = find_chassisnum(fabric, chassisnum); + if (chassis) + return chassis->chassisguid; + else + return 0; +} + +static int is_router(ibnd_node_t * n) +{ + uint32_t devid = mad_get_field(n->info, 0, IB_NODE_DEVID_F); + return (devid == VTR_DEVID_IB_FC_ROUTER || + devid == VTR_DEVID_IB_IP_ROUTER); +} + +static int is_spine_9096(ibnd_node_t * n) +{ + uint32_t devid = mad_get_field(n->info, 0, IB_NODE_DEVID_F); + return (devid == VTR_DEVID_SFB4 || devid == VTR_DEVID_SFB4_DDR); +} + +static int is_spine_9288(ibnd_node_t * n) +{ + uint32_t devid = mad_get_field(n->info, 0, IB_NODE_DEVID_F); + return (devid == VTR_DEVID_SFB12 || devid == VTR_DEVID_SFB12_DDR); +} + +static int is_spine_2004(ibnd_node_t * n) +{ + uint32_t devid = mad_get_field(n->info, 0, IB_NODE_DEVID_F); + return (devid == VTR_DEVID_SFB2004); +} + +static int is_spine_2012(ibnd_node_t * n) +{ + uint32_t devid = mad_get_field(n->info, 0, IB_NODE_DEVID_F); + return (devid == VTR_DEVID_SFB2012); +} + +static int is_spine_4700(ibnd_node_t * n) +{ + uint32_t devid = mad_get_field(n->info, 0, IB_NODE_DEVID_F); + return (devid == VTR_DEVID_SFB4700); +} + +static int is_spine_4700x2(ibnd_node_t * n) +{ + uint32_t devid = mad_get_field(n->info, 0, IB_NODE_DEVID_F); + return (devid == VTR_DEVID_SFB4700X2); +} + +static int is_spine_4200(ibnd_node_t * n) +{ + uint32_t devid = mad_get_field(n->info, 0, IB_NODE_DEVID_F); + return (devid == VTR_DEVID_SFB4200); +} + +static int is_spine(ibnd_node_t * n) +{ + return (is_spine_9096(n) || is_spine_9288(n) || + is_spine_2004(n) || is_spine_2012(n) || + is_spine_4700(n) || is_spine_4700x2(n) || + is_spine_4200(n)); +} + +static int is_line_24(ibnd_node_t * n) +{ + uint32_t devid = mad_get_field(n->info, 0, IB_NODE_DEVID_F); + return (devid == VTR_DEVID_SLB24 || + devid == VTR_DEVID_SLB24_DDR || devid == VTR_DEVID_SRB2004); +} + +static int is_line_8(ibnd_node_t * n) +{ + uint32_t devid = mad_get_field(n->info, 0, 
IB_NODE_DEVID_F); + return (devid == VTR_DEVID_SLB8); +} + +static int is_line_2024(ibnd_node_t * n) +{ + uint32_t devid = mad_get_field(n->info, 0, IB_NODE_DEVID_F); + return (devid == VTR_DEVID_SLB2024); +} + +static int is_line_4700(ibnd_node_t * n) +{ + uint32_t devid = mad_get_field(n->info, 0, IB_NODE_DEVID_F); + return (devid == VTR_DEVID_SLB4018); +} + +static int is_line(ibnd_node_t * n) +{ + return (is_line_24(n) || is_line_8(n) || + is_line_2024(n) || is_line_4700(n)); +} + +/* these structs help find Line (Anafa) slot number while using spine portnum */ +static const char line_slot_2_sfb4[37] = { + 0, + 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +static const char anafa_line_slot_2_sfb4[37] = { + 0, + 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, + 1, 1, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +static const char line_slot_2_sfb12[37] = { + 0, + 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, + 10, 10, 11, 11, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +static const char anafa_line_slot_2_sfb12[37] = { + 0, + 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, + 1, 2, 1, 2, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* LB slot = table[spine port] */ +static const char line_slot_2_sfb18[37] = { + 0, + 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, + 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18}; +/* LB asic num = table[spine port] */ +static const char anafa_line_slot_2_sfb18[37] = { + 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 +}; + +/* LB slot = table[spine port] */ +static const char line_slot_2_sfb18x2[37] = { + 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; +/* LB asic num = table[spine port] */ +static const char anafa_line_slot_2_sfb18x2[37] = { + 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* LB slot = table[spine port] */ +static const char line_slot_2_sfb4200[37] = { + 0, + 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, + 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9}; +/* LB asic num = table[spine port] */ +static const char anafa_line_slot_2_sfb4200[37] = { + 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 +}; + +/* IPR FCR modules connectivity while using sFB4 port as reference */ +static const char ipr_slot_2_sfb4_port[37] = { + 0, + 3, 2, 1, 3, 2, 1, 3, 2, 1, 3, 2, 1, 3, 2, 1, 3, 2, 1, + 3, 2, 1, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* these structs help find Spine (Anafa) slot number while using spine portnum */ +static const char spine12_slot_2_slb[37] = { + 0, + 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +static const char anafa_spine12_slot_2_slb[37] = { + 0, + 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +static const char spine4_slot_2_slb[37] = { + 0, + 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +static const char anafa_spine4_slot_2_slb[37] = { + 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* FB slot = table[line port] */ 
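+/* e.g. spine18_slot_2_slb[5] == 3: a spine reached through line-board */
+/* port 5 sits in fabric-board (FB) slot 3; all of the lookup tables */
+/* here follow this same slot = table[port] convention */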
+static const char spine18_slot_2_slb[37] = { + 0, + 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +/* FB asic = table[line port] */ +static const char anafa_spine18_slot_2_slb[37] = { + 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +static const char anafa_spine18x2_slot_2_slb[37] = { + 0, + 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* FB slot = table[line port] */ +static const char sfb4200_slot_2_slb[37] = { + 0, + 1, 1, 1, 1, 0, 0, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +/* FB asic = table[line port] */ +static const char anafa_sfb4200_slot_2_slb[37] = { + 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* reference { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 }; */ + +static int get_sfb_slot(ibnd_node_t * n, ibnd_port_t * lineport) +{ + n->ch_slot = SPINE_CS; + if (is_spine_9096(n)) { + n->ch_type = ISR9096_CT; + n->ch_slotnum = spine4_slot_2_slb[lineport->portnum]; + n->ch_anafanum = anafa_spine4_slot_2_slb[lineport->portnum]; + } else if (is_spine_9288(n)) { + n->ch_type = ISR9288_CT; + n->ch_slotnum = spine12_slot_2_slb[lineport->portnum]; + n->ch_anafanum = anafa_spine12_slot_2_slb[lineport->portnum]; + } else if (is_spine_2012(n)) { + n->ch_type = ISR2012_CT; + n->ch_slotnum = spine12_slot_2_slb[lineport->portnum]; + n->ch_anafanum = anafa_spine12_slot_2_slb[lineport->portnum]; + } else if (is_spine_2004(n)) { + n->ch_type = ISR2004_CT; + n->ch_slotnum = spine4_slot_2_slb[lineport->portnum]; + n->ch_anafanum = anafa_spine4_slot_2_slb[lineport->portnum]; + } else if (is_spine_4700(n)) { + n->ch_type = ISR4700_CT; + n->ch_slotnum = spine18_slot_2_slb[lineport->portnum]; + n->ch_anafanum = anafa_spine18_slot_2_slb[lineport->portnum]; + } else if (is_spine_4700x2(n)) { + n->ch_type = ISR4700_CT; + n->ch_slotnum = spine18_slot_2_slb[lineport->portnum]; + n->ch_anafanum = anafa_spine18x2_slot_2_slb[lineport->portnum]; + } else if (is_spine_4200(n)) { + n->ch_type = ISR4200_CT; + n->ch_slotnum = sfb4200_slot_2_slb[lineport->portnum]; + n->ch_anafanum = anafa_sfb4200_slot_2_slb[lineport->portnum]; + } else { + IBND_ERROR("Unexpected node found: guid 0x%016" PRIx64 "\n", + n->guid); + } + return 0; +} + +static int get_router_slot(ibnd_node_t * n, ibnd_port_t * spineport) +{ + uint64_t guessnum = 0; + + n->ch_found = 1; + + n->ch_slot = SRBD_CS; + if (is_spine_9096(spineport->node)) { + n->ch_type = ISR9096_CT; + n->ch_slotnum = line_slot_2_sfb4[spineport->portnum]; + n->ch_anafanum = ipr_slot_2_sfb4_port[spineport->portnum]; + } else if (is_spine_9288(spineport->node)) { + n->ch_type = ISR9288_CT; + n->ch_slotnum = line_slot_2_sfb12[spineport->portnum]; + /* this is a smart guess based on nodeguids order on sFB-12 module */ + guessnum = spineport->node->guid % 4; + /* module 1 <--> remote anafa 3 */ + /* module 2 <--> remote anafa 2 */ + /* module 3 <--> remote anafa 1 */ + n->ch_anafanum = (guessnum == 3 ? 1 : (guessnum == 1 ? 
3 : 2));
+	} else if (is_spine_2012(spineport->node)) {
+		n->ch_type = ISR2012_CT;
+		n->ch_slotnum = line_slot_2_sfb12[spineport->portnum];
+		/* this is a smart guess based on nodeguids order on sFB-12 module */
+		guessnum = spineport->node->guid % 4;
+		/* module 1 <--> remote anafa 3 */
+		/* module 2 <--> remote anafa 2 */
+		/* module 3 <--> remote anafa 1 */
+		n->ch_anafanum = (guessnum == 3 ? 1 : (guessnum == 1 ? 3 : 2));
+	} else if (is_spine_2004(spineport->node)) {
+		n->ch_type = ISR2004_CT;
+		n->ch_slotnum = line_slot_2_sfb4[spineport->portnum];
+		n->ch_anafanum = ipr_slot_2_sfb4_port[spineport->portnum];
+	} else {
+		IBND_ERROR("Unexpected node found: guid 0x%016" PRIx64 "\n",
+			   spineport->node->guid);
+	}
+	return 0;
+}
+
+static int get_slb_slot(ibnd_node_t * n, ibnd_port_t * spineport)
+{
+	n->ch_slot = LINE_CS;
+	if (is_spine_9096(spineport->node)) {
+		n->ch_type = ISR9096_CT;
+		n->ch_slotnum = line_slot_2_sfb4[spineport->portnum];
+		n->ch_anafanum = anafa_line_slot_2_sfb4[spineport->portnum];
+	} else if (is_spine_9288(spineport->node)) {
+		n->ch_type = ISR9288_CT;
+		n->ch_slotnum = line_slot_2_sfb12[spineport->portnum];
+		n->ch_anafanum = anafa_line_slot_2_sfb12[spineport->portnum];
+	} else if (is_spine_2012(spineport->node)) {
+		n->ch_type = ISR2012_CT;
+		n->ch_slotnum = line_slot_2_sfb12[spineport->portnum];
+		n->ch_anafanum = anafa_line_slot_2_sfb12[spineport->portnum];
+	} else if (is_spine_2004(spineport->node)) {
+		n->ch_type = ISR2004_CT;
+		n->ch_slotnum = line_slot_2_sfb4[spineport->portnum];
+		n->ch_anafanum = anafa_line_slot_2_sfb4[spineport->portnum];
+	} else if (is_spine_4700(spineport->node)) {
+		n->ch_type = ISR4700_CT;
+		n->ch_slotnum = line_slot_2_sfb18[spineport->portnum];
+		n->ch_anafanum = anafa_line_slot_2_sfb18[spineport->portnum];
+	} else if (is_spine_4700x2(spineport->node)) {
+		n->ch_type = ISR4700_CT;
+		n->ch_slotnum = line_slot_2_sfb18x2[spineport->portnum];
+		n->ch_anafanum = anafa_line_slot_2_sfb18x2[spineport->portnum];
+	} else if (is_spine_4200(spineport->node)) {
+		n->ch_type = ISR4200_CT;
+		n->ch_slotnum = line_slot_2_sfb4200[spineport->portnum];
+		n->ch_anafanum = anafa_line_slot_2_sfb4200[spineport->portnum];
+	} else {
+		IBND_ERROR("Unexpected node found: guid 0x%016" PRIx64 "\n",
+			   spineport->node->guid);
+	}
+	return 0;
+}
+
+
+/*
+   This function is called for every Mellanox node in the fabric
+*/
+static int fill_mellanox_chassis_record(ibnd_node_t * node)
+{
+	int p = 0;
+	ibnd_port_t *port;
+
+	char node_desc[IB_SMP_DATA_SIZE];
+	char *system_name;
+	char *system_type;
+	char *system_slot_name;
+	char *node_index;
+	char *iter;
+	int dev_id;
+
+	/*
+	   The node description has the following format:
+
+	   'MF0;<system name>:<system type>/<system slot name>[:board type]/U<node index>'
+
+	   - System slot name in our systems can be L[01-36] or S[01-18]
+	   - Node index is always 1 (we don't have boards with multiple IS4 chips).
+	   - System name is taken from the currently configured host name.
+	   - The board type is optional and we don't set it currently - a leaf
+	     or spine slot can currently hold a single type of board.
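+
+	   - Example (hypothetical names): a node described as
+	     'MF0;ibsw01:IS5600/L05/U1' parses into system name "ibsw01",
+	     system type "IS5600", slot name "L05" (line slot 5) and node
+	     index 1 (chip 1).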
+	 */
+
+	memcpy(node_desc, node->nodedesc, IB_SMP_DATA_SIZE);
+
+	IBND_DEBUG("fill_mellanox_chassis_record: node_desc:%s \n", node_desc);
+
+	if (node->ch_found)	/* this node has already been processed */
+		return 0;
+
+	/* All Mellanox IS4 switches have the same device ID */
+	dev_id = mad_get_field(node->info, 0, IB_NODE_DEVID_F);
+	if (dev_id != MLX_DEVID_IS4)
+		return 0;
+
+	if ((node_desc[0] != 'M') ||
+	    (node_desc[1] != 'F') ||
+	    (node_desc[2] != '0') ||
+	    (node_desc[3] != ';')) {
+		IBND_DEBUG("fill_mellanox_chassis_record: Unsupported node description format:%s \n", node_desc);
+		return 0;
+	}
+
+	/* parse system name */
+	system_name = &node_desc[4];
+	for (iter = system_name; (*iter != ':') && (*iter != '\0'); iter++);
+	if (*iter == '\0') {
+		IBND_DEBUG("fill_mellanox_chassis_record: Unsupported node description format:%s - (get system_name failed) \n", node_desc);
+		return 0;
+	}
+	*iter = '\0';
+	iter++;
+	/* parse system type */
+	system_type = iter;
+	for (; (*iter != '/') && (*iter != '\0'); iter++);
+	if (*iter == '\0') {
+		IBND_DEBUG("fill_mellanox_chassis_record: Unsupported node description format:%s - (get system_type failed) \n", node_desc);
+		return 0;
+	}
+	*iter = '\0';
+	iter++;
+	/* parse system slot name */
+	system_slot_name = iter;
+	for (; (*iter != '/') && (*iter != ':') && (*iter != '\0'); iter++);
+	if (*iter == '\0') {
+		IBND_DEBUG("fill_mellanox_chassis_record: Unsupported node description format:%s - (get system_slot_name failed) \n", node_desc);
+		return 0;
+	}
+	if (*iter == ':') {
+		*iter = '\0';
+		iter++;
+		for (; (*iter != '/') && (*iter != '\0'); iter++);
+		if (*iter == '\0') {
+			IBND_DEBUG("fill_mellanox_chassis_record: Unsupported node description format:%s - (get board type failed) \n", node_desc);
+			return 0;
+		}
+	}
+	*iter = '\0';
+	iter++;
+	node_index = iter;
+	if (node_index[0] != 'U') {
+		IBND_DEBUG("fill_mellanox_chassis_record: Unsupported node description format:%s - (get node index) \n", node_desc);
+		return 0;
+	}
+
+	/* set Chip number (node index) */
+	node->ch_anafanum = (unsigned char) atoi(&node_index[1]);
+	if (node->ch_anafanum != 1) {
+		IBND_DEBUG("Unexpected Chip number:%d \n", node->ch_anafanum);
+	}
+
+	/* set Line or Spine slot number */
+	if (system_slot_name[0] == 'L')
+		node->ch_slot = LINE_CS;
+	else if (system_slot_name[0] == 'S')
+		node->ch_slot = SPINE_CS;
+	else {
+		IBND_DEBUG("fill_mellanox_chassis_record: Unsupported system_slot_name:%s \n", system_slot_name);
+		return 0;
+	}
+
+	/* The switch will be displayed under Line or Spine and not under Chassis switches */
+	node->ch_found = 1;
+
+	node->ch_slotnum = (unsigned char) atoi(&system_slot_name[1]);
+	if ((node->ch_slot == LINE_CS && (node->ch_slotnum > (LINES_MAX_NUM + 1))) ||
+	    (node->ch_slot == SPINE_CS && (node->ch_slotnum > (SPINES_MAX_NUM + 1)))) {
+		IBND_ERROR("fill_mellanox_chassis_record: invalid slot number:%d \n", node->ch_slotnum);
+		node->ch_slotnum = 0;
+		return 0;
+	}
+
+	/* set ch_type_str */
+	strncpy(node->ch_type_str, system_type, sizeof(node->ch_type_str) - 1);
+
+	/* Line ports 1-18 are mapped to external ports 1-18 */
+	if (node->ch_slot == LINE_CS) {
+		for (p = 1; p <= node->numports && p <= 18; p++) {
+			port = node->ports[p];
+			if (!port)
+				continue;
+			port->ext_portnum = p;
+		}
+	}
+
+	return 0;
+}
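+
+/*
+ * Once ch_slot/ch_slotnum have been filled in above, group_nodes() below
+ * uses insert_mellanox_line_and_spine() to hang the board on the proper
+ * linenode[]/spinenode[] slot of its chassis.
+ */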
+
+static int insert_mellanox_line_and_spine(ibnd_node_t * node, ibnd_chassis_t * chassis)
+{
+	if (node->ch_slot == LINE_CS) {
+
+		if (chassis->linenode[node->ch_slotnum])
+			return 0;	/* already filled slot */
+
+		chassis->linenode[node->ch_slotnum] = node;
+	} else if (node->ch_slot == SPINE_CS) {
+
+		if (chassis->spinenode[node->ch_slotnum])
+			return 0;	/* already filled slot */
+
+		chassis->spinenode[node->ch_slotnum] = node;
+	} else
+		return 0;
+
+	node->chassis = chassis;
+
+	return 0;
+}
+
+
+/* forward declare this */
+static void voltaire_portmap(ibnd_port_t * port);
+/*
+   This function is called for every Voltaire node in the fabric.
+   It could be optimized further, but the time overhead is very small
+   and this is only a diagnostics utility.
+*/
+static int fill_voltaire_chassis_record(ibnd_node_t * node)
+{
+	int p = 0;
+	ibnd_port_t *port;
+	ibnd_node_t *remnode = NULL;
+
+	if (node->ch_found)	/* this node has already been processed */
+		return 0;
+	node->ch_found = 1;
+
+	/* a node is a router only in the case of a unique lid */
+	/* (which is the lid of the chassis router port) */
+	/* in such a case node->ports is actually the requested port... */
+	if (is_router(node))
+		/* find the remote node */
+		for (p = 1; p <= node->numports; p++) {
+			port = node->ports[p];
+			if (port && is_spine(port->remoteport->node))
+				get_router_slot(node, port->remoteport);
+		}
+	else if (is_spine(node)) {
+		int is_4700x2 = is_spine_4700x2(node);
+
+		for (p = 1; p <= node->numports; p++) {
+			port = node->ports[p];
+			if (!port || !port->remoteport)
+				continue;
+
+			/*
+			 * Skip ISR4700 double density fabric board ports 19-36
+			 * as they are chassis external ports
+			 */
+			if (is_4700x2 && (port->portnum > 18))
+				continue;
+
+			remnode = port->remoteport->node;
+			if (remnode->type != IB_NODE_SWITCH) {
+				if (!remnode->ch_found)
+					get_router_slot(remnode, port);
+				continue;
+			}
+			if (!node->ch_type)
+				/* we assume here that remoteport belongs to a line */
+				get_sfb_slot(node, port->remoteport);
+
+			/* we could break here, but we need to find out if more routers are connected */
+		}
+
+	} else if (is_line(node)) {
+		int is_4700_line = is_line_4700(node);
+
+		for (p = 1; p <= node->numports; p++) {
+			port = node->ports[p];
+			if (!port || !port->remoteport)
+				continue;
+
+			if ((is_4700_line && (port->portnum > 18)) ||
+			    (!is_4700_line && (port->portnum > 12)))
+				continue;
+
+			/* we assume here that remoteport belongs to a spine */
+			get_slb_slot(node, port->remoteport);
+			break;
+		}
+	}
+
+	/* for each port of this node, map the external ports */
+	for (p = 1; p <= node->numports; p++) {
+		port = node->ports[p];
+		if (!port)
+			continue;
+		voltaire_portmap(port);
+	}
+
+	return 0;
+}
+
+static int get_line_index(ibnd_node_t * node)
+{
+	int retval;
+
+	if (is_line_4700(node))
+		retval = node->ch_slotnum;
+	else
+		retval = 3 * (node->ch_slotnum - 1) + node->ch_anafanum;
+
+	if (retval > LINES_MAX_NUM || retval < 1) {
+		printf("%s: retval = %d\n", __FUNCTION__, retval);
+		IBND_ERROR("Internal error\n");
+		return -1;
+	}
+	return retval;
+}
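+
+/*
+ * e.g. a non-4700 line board in slot 2 whose anafa number is 2 lands at
+ * linenode index 3 * (2 - 1) + 2 == 5.
+ */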
+
+static int get_spine_index(ibnd_node_t * node)
+{
+	int retval;
+
+	if (is_spine_9288(node) || is_spine_2012(node))
+		retval = 3 * (node->ch_slotnum - 1) + node->ch_anafanum;
+	else if (is_spine_4700(node) || is_spine_4700x2(node))
+		retval = 2 * (node->ch_slotnum - 1) + node->ch_anafanum;
+	else
+		retval = node->ch_slotnum;
+
+	if (retval > SPINES_MAX_NUM || retval < 1) {
+		IBND_ERROR("Internal error\n");
+		return -1;
+	}
+	return retval;
+}
+
+static int insert_line_router(ibnd_node_t * node, ibnd_chassis_t * chassis)
+{
+	int i = get_line_index(node);
+
+	if (i < 0)
+		return i;
+
+	if (chassis->linenode[i])
+		return 0;	/* already filled slot */
+
+	chassis->linenode[i] = node;
+	node->chassis = chassis;
+	return 0;
+}
+
+static int insert_spine(ibnd_node_t * node, ibnd_chassis_t * chassis)
+{
+	int i = get_spine_index(node);
+
+	if (i < 0)
+		return i;
+
+	if (chassis->spinenode[i])
+		return 0;	/* already filled slot */
+
+	chassis->spinenode[i] = node;
+	node->chassis = chassis;
+	return 0;
+}
+
+static int pass_on_lines_catch_spines(ibnd_chassis_t * chassis)
+{
+	ibnd_node_t *node, *remnode;
+	ibnd_port_t *port;
+	int i, p;
+
+	for (i = 1; i <= LINES_MAX_NUM; i++) {
+		int is_4700_line;
+
+		node = chassis->linenode[i];
+
+		if (!(node && is_line(node)))
+			continue;	/* empty slot or router */
+
+		is_4700_line = is_line_4700(node);
+
+		for (p = 1; p <= node->numports; p++) {
+
+			port = node->ports[p];
+			if (!port || !port->remoteport)
+				continue;
+
+			if ((is_4700_line && (port->portnum > 18)) ||
+			    (!is_4700_line && (port->portnum > 12)))
+				continue;
+
+			remnode = port->remoteport->node;
+
+			if (!remnode->ch_found)
+				continue;	/* some error - spine not initialized? FIXME */
+			if (insert_spine(remnode, chassis))
+				return -1;
+		}
+	}
+	return 0;
+}
+
+static int pass_on_spines_catch_lines(ibnd_chassis_t * chassis)
+{
+	ibnd_node_t *node, *remnode;
+	ibnd_port_t *port;
+	int i, p;
+
+	for (i = 1; i <= SPINES_MAX_NUM; i++) {
+		int is_4700x2;
+
+		node = chassis->spinenode[i];
+		if (!node)
+			continue;	/* empty slot */
+
+		is_4700x2 = is_spine_4700x2(node);
+
+		for (p = 1; p <= node->numports; p++) {
+			port = node->ports[p];
+			if (!port || !port->remoteport)
+				continue;
+
+			/*
+			 * ISR4700 double density fabric board ports 19-36 are
+			 * chassis external ports, so skip them
+			 */
+			if (is_4700x2 && (port->portnum > 18))
+				continue;
+
+			remnode = port->remoteport->node;
+
+			if (!remnode->ch_found)
+				continue;	/* some error - line/router not initialized? FIXME */
+
+			if (insert_line_router(remnode, chassis))
+				return -1;
+		}
+	}
+	return 0;
+}
+
+/*
+   Simplistic interpolation algorithm...
+   but there is no way around it - it has to stay consistent with
+   VoltaireSM/NMS
+*/
+static void pass_on_spines_interpolate_chguid(ibnd_chassis_t * chassis)
+{
+	ibnd_node_t *node;
+	int i;
+
+	for (i = 1; i <= SPINES_MAX_NUM; i++) {
+		node = chassis->spinenode[i];
+		if (!node)
+			continue;	/* skip the empty slots */
+
+		/* take the first guid minus one to be consistent with the SM */
+		chassis->chassisguid = node->guid - 1;
+		break;
+	}
+}
+
+/*
+   This function fills the chassis structure with all the nodes
+   in that chassis; a chassis structure describes one standalone
+   chassis
+*/
+static int build_chassis(ibnd_node_t * node, ibnd_chassis_t * chassis)
+{
+	int p = 0;
+	ibnd_node_t *remnode = NULL;
+	ibnd_port_t *port = NULL;
+
+	/* we get here with node = chassis_spine */
+	if (insert_spine(node, chassis))
+		return -1;
+
+	/* loop: pass over all ports of the node */
+	for (p = 1; p <= node->numports; p++) {
+
+		port = node->ports[p];
+		if (!port || !port->remoteport)
+			continue;
+
+		/*
+		 * ISR4700 double density fabric board ports 19-36 are
+		 * chassis external ports, so skip them
+		 */
+		if (is_spine_4700x2(node) && (port->portnum > 18))
+			continue;
+
+		remnode = port->remoteport->node;
+
+		if (!remnode->ch_found)
+			continue;	/* some error - line or router not initialized? FIXME */
+
+		insert_line_router(remnode, chassis);
+	}
+
+	if (pass_on_lines_catch_spines(chassis))
+		return -1;
+	/* this pass is needed to catch routers, since routers are connected */
+	/* only to spines in slot 1 or 4 and we could miss them the first time */
+	if (pass_on_spines_catch_lines(chassis))
+		return -1;
+
+	/* two additional passes are needed to overcome a problem of pure "in-chassis" */
+	/* connectivity - the extra passes ensure that all related chips/modules */
+	/* are inserted into the chassis */
+	if (pass_on_lines_catch_spines(chassis))
+		return -1;
+	if (pass_on_spines_catch_lines(chassis))
+		return -1;
+	pass_on_spines_interpolate_chguid(chassis);
+
+	return 0;
+}
+
+/*========================================================*/
+/*         INTERNAL TO EXTERNAL PORT MAPPING              */
+/*========================================================*/
+
+/*
+Description: On the ISR9288/9096 the external port indexing does not
+             match the internal (anafa) port indexes. Use this map to
+             translate the data you get from the OpenIB diagnostics
+             (smpquery, ibroute, ibtracert, etc.)
+
+Module : sLB-24
+                anafa 1                 anafa 2
+ext port | 13 14 15 16 17 18 | 19 20 21 22 23 24
+int port | 22 23 24 18 17 16 | 22 23 24 18 17 16
+ext port | 1  2  3  4  5  6  | 7  8  9  10 11 12
+int port | 19 20 21 15 14 13 | 19 20 21 15 14 13
+------------------------------------------------
+
+Module : sLB-8
+                anafa 1                 anafa 2
+ext port | 13 14 15 16 17 18 | 19 20 21 22 23 24
+int port | 24 23 22 18 17 16 | 24 23 22 18 17 16
+ext port | 1  2  3  4  5  6  | 7  8  9  10 11 12
+int port | 21 20 19 15 14 13 | 21 20 19 15 14 13
+
+----------->
+                anafa 1                 anafa 2
+ext port | -  -  5  -  -  6  | -  -  7  -  -  8
+int port | 24 23 22 18 17 16 | 24 23 22 18 17 16
+ext port | -  -  1  -  -  2  | -  -  3  -  -  4
+int port | 21 20 19 15 14 13 | 21 20 19 15 14 13
+------------------------------------------------
+
+Module : sLB-2024
+
+ext port    | 13 14 15 16 17 18 19 20 21 22 23 24
+A1 int port | 13 14 15 16 17 18 19 20 21 22 23 24
+ext port    | 1  2  3  4  5  6  7  8  9  10 11 12
+A2 int port | 13 14 15 16 17 18 19 20 21 22 23 24
+---------------------------------------------------
+
+Module : sLB-4018
+
+int port | 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
+ext port | 1  2  3  4  5  6  7  8  9  10 11 12 13 14 15 16 17 18
+---------------------------------------------------
+
+Module : sFB-4700X2
+
+   12X port -> 3 x 4X ports:
+
+A1 int port | 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
+   ext port | 7  7  7  8  8  8  9  9  9  10 10 10 11 11 11 12 12 12
+A2 int port | 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
+   ext port | 1  1  1  2  2  2  3  3  3  4  4  4  5  5  5  6  6  6
+
+*/
+
+static int int2ext_map_slb24[2][25] = {
+	{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 5, 4, 18, 17, 16, 1, 2, 3,
+	 13, 14, 15},
+	{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 11, 10, 24, 23, 22, 7, 8, 9,
+	 19, 20, 21}
+};
+
+static int int2ext_map_slb8[2][25] = {
+	{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 6, 6, 6, 1, 1, 1, 5, 5,
+	 5},
+	{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 8, 8, 8, 3, 3, 3, 7, 7,
+	 7}
+};
+
+static int int2ext_map_slb2024[2][25] = {
+	{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 14, 15, 16, 17, 18, 19, 20,
+	 21, 22, 23, 24},
+	{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+	 11, 12}
+};
+
+static int int2ext_map_slb4018[37] = {
+	0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
+};
+
+static int int2ext_map_sfb4700x2[2][37] = {
+	{0,
+	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12},
+	{0,
+	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6}
+};
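+
+/*
+ * Reading example for the tables above: on an sLB-24 line board,
+ * anafa 2 (chipnum 1) internal port 13 maps to external port
+ * int2ext_map_slb24[1][13] == 12, matching the sLB-24 table
+ * (anafa 2: ext port 12 <-> int port 13).
+ */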
+
+/* reference { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 }; */
+
+/* map internal ports to external ports if appropriate */
+static void voltaire_portmap(ibnd_port_t * port)
+{
+	int portnum = port->portnum;
+	int chipnum = 0;
+	ibnd_node_t *node = port->node;
+	int is_4700_line = is_line_4700(node);
+	int is_4700x2_spine = is_spine_4700x2(node);
+
+	if (!node->ch_found || (!is_line(node) && !is_4700x2_spine)) {
+		port->ext_portnum = 0;
+		return;
+	}
+
+	if (((is_4700_line || is_4700x2_spine) &&
+	     (portnum < 19 || portnum > 36)) ||
+	    ((!is_4700_line && !is_4700x2_spine) &&
+	     (portnum < 13 || portnum > 24))) {
+		port->ext_portnum = 0;
+		return;
+	}
+
+	if (port->node->ch_anafanum < 1 || port->node->ch_anafanum > 2) {
+		port->ext_portnum = 0;
+		return;
+	}
+
+	chipnum = port->node->ch_anafanum - 1;
+
+	if (is_line_24(node))
+		port->ext_portnum = int2ext_map_slb24[chipnum][portnum];
+	else if (is_line_2024(node))
+		port->ext_portnum = int2ext_map_slb2024[chipnum][portnum];
+	/* sLB-4018: only one asic per LB */
+	else if (is_4700_line)
+		port->ext_portnum = int2ext_map_slb4018[portnum];
+	/* sFB-4700X2 4X port */
+	else if (is_4700x2_spine)
+		port->ext_portnum = int2ext_map_sfb4700x2[chipnum][portnum];
+	else
+		port->ext_portnum = int2ext_map_slb8[chipnum][portnum];
+}
+
+static int add_chassis(chassis_scan_t * chassis_scan)
+{
+	if (!(chassis_scan->current_chassis =
+	      calloc(1, sizeof(ibnd_chassis_t)))) {
+		IBND_ERROR("OOM: failed to allocate chassis object\n");
+		return -1;
+	}
+
+	if (chassis_scan->first_chassis == NULL) {
+		chassis_scan->first_chassis = chassis_scan->current_chassis;
+		chassis_scan->last_chassis = chassis_scan->current_chassis;
+	} else {
+		chassis_scan->last_chassis->next =
+		    chassis_scan->current_chassis;
+		chassis_scan->last_chassis = chassis_scan->current_chassis;
+	}
+	return 0;
+}
+
+static void add_node_to_chassis(ibnd_chassis_t * chassis, ibnd_node_t * node)
+{
+	node->chassis = chassis;
+	node->next_chassis_node = chassis->nodes;
+	chassis->nodes = node;
+}
+
+/*
+   Main grouping function
+   Algorithm:
+   1. pass over every Voltaire node
+   2. catch the spine chip of each Voltaire chassis
+      2.1 build/interpolate the chassis around this chip
+      2.2 go to 1.
+   3. pass over the non-Voltaire nodes and open a chassis record per
+      SystemImageGUID
+   4. group the non-Voltaire nodes into those chassis by SystemImageGUID
+   Returns:
+   0 on success, -1 on failure
+*/
+int group_nodes(ibnd_fabric_t * fabric)
+{
+	ibnd_node_t *node;
+	int chassisnum = 0;
+	ibnd_chassis_t *chassis;
+	ibnd_chassis_t *ch, *ch_next;
+	chassis_scan_t chassis_scan;
+	int vendor_id;
+
+	chassis_scan.first_chassis = NULL;
+	chassis_scan.current_chassis = NULL;
+	chassis_scan.last_chassis = NULL;
+
+	/* first pass over the switches: build for every Voltaire node */
+	/* an appropriate chassis record (slotnum and position) */
+	/* according to the internal connectivity */
+	/* not very efficient, but the code is clear, so... 
*/ + for (node = fabric->switches; node; node = node->type_next) { + + vendor_id = mad_get_field(node->info, 0,IB_NODE_VENDORID_F); + + if (vendor_id == VTR_VENDOR_ID + && fill_voltaire_chassis_record(node)) + goto cleanup; + else if (vendor_id == MLX_VENDOR_ID + && fill_mellanox_chassis_record(node)) + goto cleanup; + + } + + /* separate every Voltaire chassis from each other and build linked list of them */ + /* algorithm: catch spine and find all surrounding nodes */ + for (node = fabric->switches; node; node = node->type_next) { + if (mad_get_field(node->info, 0, + IB_NODE_VENDORID_F) != VTR_VENDOR_ID) + continue; + if (!node->ch_found + || (node->chassis && node->chassis->chassisnum) + || !is_spine(node)) + continue; + if (add_chassis(&chassis_scan)) + goto cleanup; + chassis_scan.current_chassis->chassisnum = ++chassisnum; + if (build_chassis(node, chassis_scan.current_chassis)) + goto cleanup; + } + + /* now make pass on nodes for chassis which are not Voltaire */ + /* grouped by common SystemImageGUID */ + for (node = fabric->nodes; node; node = node->next) { + if (mad_get_field(node->info, 0, + IB_NODE_VENDORID_F) == VTR_VENDOR_ID) + continue; + if (mad_get_field64(node->info, 0, IB_NODE_SYSTEM_GUID_F)) { + chassis = find_chassisguid(fabric, node); + if (chassis) + chassis->nodecount++; + else { + /* Possible new chassis */ + if (add_chassis(&chassis_scan)) + goto cleanup; + chassis_scan.current_chassis->chassisguid = + get_chassisguid(node); + chassis_scan.current_chassis->nodecount = 1; + if (!fabric->chassis) + fabric->chassis = chassis_scan.first_chassis; + } + } + } + + /* now, make another pass to see which nodes are part of chassis */ + /* (defined as chassis->nodecount > 1) */ + for (node = fabric->nodes; node; node = node->next) { + + vendor_id = mad_get_field(node->info, 0,IB_NODE_VENDORID_F); + + if (vendor_id == VTR_VENDOR_ID) + continue; + if (mad_get_field64(node->info, 0, IB_NODE_SYSTEM_GUID_F)) { + chassis = find_chassisguid(fabric, node); + if (chassis && chassis->nodecount > 1) { + if (!chassis->chassisnum) + chassis->chassisnum = ++chassisnum; + if (!node->ch_found) { + node->ch_found = 1; + add_node_to_chassis(chassis, node); + } + else if (vendor_id == MLX_VENDOR_ID){ + insert_mellanox_line_and_spine(node, chassis); + } + } + } + } + + fabric->chassis = chassis_scan.first_chassis; + return 0; + +cleanup: + ch = chassis_scan.first_chassis; + while (ch) { + ch_next = ch->next; + free(ch); + ch = ch_next; + } + fabric->chassis = NULL; + return -1; +} diff --git a/libibnetdisc/chassis.h b/libibnetdisc/chassis.h new file mode 100644 index 0000000..7a91be3 --- /dev/null +++ b/libibnetdisc/chassis.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2004-2007 Voltaire Inc. All rights reserved. + * Copyright (c) 2007 Xsigo Systems Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef _CHASSIS_H_ +#define _CHASSIS_H_ + +#include <infiniband/ibnetdisc.h> + +#include "internal.h" + +/*========================================================*/ +/* CHASSIS RECOGNITION SPECIFIC DATA */ +/*========================================================*/ + +/* Device IDs */ +#define VTR_DEVID_IB_FC_ROUTER 0x5a00 +#define VTR_DEVID_IB_IP_ROUTER 0x5a01 +#define VTR_DEVID_ISR9600_SPINE 0x5a02 +#define VTR_DEVID_ISR9600_LEAF 0x5a03 +#define VTR_DEVID_HCA1 0x5a04 +#define VTR_DEVID_HCA2 0x5a44 +#define VTR_DEVID_HCA3 0x6278 +#define VTR_DEVID_SW_6IB4 0x5a05 +#define VTR_DEVID_ISR9024 0x5a06 +#define VTR_DEVID_ISR9288 0x5a07 +#define VTR_DEVID_SLB24 0x5a09 +#define VTR_DEVID_SFB12 0x5a08 +#define VTR_DEVID_SFB4 0x5a0b +#define VTR_DEVID_ISR9024_12 0x5a0c +#define VTR_DEVID_SLB8 0x5a0d +#define VTR_DEVID_RLX_SWITCH_BLADE 0x5a20 +#define VTR_DEVID_ISR9024_DDR 0x5a31 +#define VTR_DEVID_SFB12_DDR 0x5a32 +#define VTR_DEVID_SFB4_DDR 0x5a33 +#define VTR_DEVID_SLB24_DDR 0x5a34 +#define VTR_DEVID_SFB2012 0x5a37 +#define VTR_DEVID_SLB2024 0x5a38 +#define VTR_DEVID_ISR2012 0x5a39 +#define VTR_DEVID_SFB2004 0x5a40 +#define VTR_DEVID_ISR2004 0x5a41 +#define VTR_DEVID_SRB2004 0x5a42 +#define VTR_DEVID_SLB4018 0x5a5b +#define VTR_DEVID_SFB4700 0x5a5c +#define VTR_DEVID_SFB4700X2 0x5a5d +#define VTR_DEVID_SFB4200 0x5a60 + +#define MLX_DEVID_IS4 0xbd36 + +/* Vendor IDs (for chassis based systems) */ +#define VTR_VENDOR_ID 0x8f1 /* Voltaire */ +#define MLX_VENDOR_ID 0x2c9 /* Mellanox */ +#define TS_VENDOR_ID 0x5ad /* Cisco */ +#define SS_VENDOR_ID 0x66a /* InfiniCon */ +#define XS_VENDOR_ID 0x1397 /* Xsigo */ + +enum ibnd_chassis_type { + UNRESOLVED_CT, ISR9288_CT, ISR9096_CT, ISR2012_CT, ISR2004_CT, + ISR4700_CT, ISR4200_CT +}; +enum ibnd_chassis_slot_type { UNRESOLVED_CS, LINE_CS, SPINE_CS, SRBD_CS }; + +int group_nodes(struct ibnd_fabric *fabric); + +#endif /* _CHASSIS_H_ */ diff --git a/libibnetdisc/ibnetdisc.c b/libibnetdisc/ibnetdisc.c new file mode 100644 index 0000000..cc76e99 --- /dev/null +++ b/libibnetdisc/ibnetdisc.c @@ -0,0 +1,1019 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * Copyright (c) 2007 Xsigo Systems Inc. All rights reserved. + * Copyright (c) 2008 Lawrence Livermore National Laboratory + * Copyright (c) 2010-2011 Mellanox Technologies LTD. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> +#include <inttypes.h> + +#include <infiniband/umad.h> +#include <infiniband/mad.h> +#include <util/iba_types.h> + +#include <infiniband/ibnetdisc.h> + +#include "internal.h" +#include "chassis.h" + +#define container_of(ptr, type, member) \ + ((type *)((uint8_t *)(ptr)-offsetof(type, member))) + +/* forward declarations */ +struct ni_cbdata +{ + ibnd_node_t *node; + int port_num; +}; +static int query_node_info(smp_engine_t * engine, ib_portid_t * portid, + struct ni_cbdata * cbdata); +static int query_port_info(smp_engine_t * engine, ib_portid_t * portid, + ibnd_node_t * node, int portnum); + +static int recv_switch_info(smp_engine_t * engine, ibnd_smp_t * smp, + uint8_t * mad, void *cb_data) +{ + uint8_t *switch_info = mad + IB_SMP_DATA_OFFS; + ibnd_node_t *node = cb_data; + memcpy(node->switchinfo, switch_info, sizeof(node->switchinfo)); + mad_decode_field(node->switchinfo, IB_SW_ENHANCED_PORT0_F, + &node->smaenhsp0); + return 0; +} + +static int query_switch_info(smp_engine_t * engine, ib_portid_t * portid, + ibnd_node_t * node) +{ + node->smaenhsp0 = 0; /* assume base SP0 */ + return issue_smp(engine, portid, IB_ATTR_SWITCH_INFO, 0, + recv_switch_info, node); +} + +static int add_port_to_dpath(ib_dr_path_t * path, int nextport) +{ + if (path->cnt > sizeof(path->p) - 2) + return -1; + ++path->cnt; + path->p[path->cnt] = (uint8_t) nextport; + return path->cnt; +} + +static int retract_dpath(smp_engine_t * engine, ib_portid_t * portid) +{ + ibnd_scan_t *scan = engine->user_data; + f_internal_t *f_int = scan->f_int; + + if (scan->cfg->max_hops && + f_int->fabric.maxhops_discovered > scan->cfg->max_hops) + return 0; + + /* this may seem wrong but the only time we would retract the path is + * if the user specified a CA for the DR path and we are retracting + * from that to find the node it is connected to. 
This counts as a + * positive hop discovered + */ + f_int->fabric.maxhops_discovered++; + portid->drpath.p[portid->drpath.cnt] = 0; + portid->drpath.cnt--; + return 1; +} + +static int extend_dpath(smp_engine_t * engine, ib_portid_t * portid, + int nextport) +{ + ibnd_scan_t *scan = engine->user_data; + f_internal_t *f_int = scan->f_int; + + if (scan->cfg->max_hops && + f_int->fabric.maxhops_discovered > scan->cfg->max_hops) + return 0; + + if (portid->lid) { + /* If we were LID routed we need to set up the drslid */ + portid->drpath.drslid = (uint16_t) scan->selfportid.lid; + portid->drpath.drdlid = 0xFFFF; + } + + if (add_port_to_dpath(&portid->drpath, nextport) < 0) { + IBND_ERROR("add port %d to DR path failed; %s\n", nextport, + portid2str(portid)); + return -1; + } + + if (((unsigned) portid->drpath.cnt - scan->initial_hops) > + f_int->fabric.maxhops_discovered) + f_int->fabric.maxhops_discovered++; + + return 1; +} + +static int recv_node_desc(smp_engine_t * engine, ibnd_smp_t * smp, + uint8_t * mad, void *cb_data) +{ + uint8_t *node_desc = mad + IB_SMP_DATA_OFFS; + ibnd_node_t *node = cb_data; + memcpy(node->nodedesc, node_desc, sizeof(node->nodedesc)); + return 0; +} + +static int query_node_desc(smp_engine_t * engine, ib_portid_t * portid, + ibnd_node_t * node) +{ + return issue_smp(engine, portid, IB_ATTR_NODE_DESC, 0, + recv_node_desc, node); +} + +static void debug_port(ib_portid_t * portid, ibnd_port_t * port) +{ + char width[64], speed[64]; + int iwidth; + int ispeed, fdr10, espeed; + uint8_t *info; + uint32_t cap_mask; + + iwidth = mad_get_field(port->info, 0, IB_PORT_LINK_WIDTH_ACTIVE_F); + ispeed = mad_get_field(port->info, 0, IB_PORT_LINK_SPEED_ACTIVE_F); + fdr10 = mad_get_field(port->ext_info, 0, + IB_MLNX_EXT_PORT_LINK_SPEED_ACTIVE_F); + + if (port->node->type == IB_NODE_SWITCH) + info = (uint8_t *)&port->node->ports[0]->info; + else + info = (uint8_t *)&port->info; + cap_mask = mad_get_field(info, 0, IB_PORT_CAPMASK_F); + if (cap_mask & be32toh(IB_PORT_CAP_HAS_EXT_SPEEDS)) + espeed = mad_get_field(port->info, 0, IB_PORT_LINK_SPEED_EXT_ACTIVE_F); + else + espeed = 0; + IBND_DEBUG + ("portid %s portnum %d: base lid %d state %d physstate %d %s %s %s %s\n", + portid2str(portid), port->portnum, port->base_lid, + mad_get_field(port->info, 0, IB_PORT_STATE_F), + mad_get_field(port->info, 0, IB_PORT_PHYS_STATE_F), + mad_dump_val(IB_PORT_LINK_WIDTH_ACTIVE_F, width, 64, &iwidth), + mad_dump_val(IB_PORT_LINK_SPEED_ACTIVE_F, speed, 64, &ispeed), + (fdr10 & FDR10) ? 
"FDR10" : "", + mad_dump_val(IB_PORT_LINK_SPEED_EXT_ACTIVE_F, speed, 64, &espeed)); +} + +static int is_mlnx_ext_port_info_supported(ibnd_port_t * port) +{ + uint16_t devid = (uint16_t) mad_get_field(port->node->info, 0, IB_NODE_DEVID_F); + uint32_t vendorid = (uint32_t) mad_get_field(port->node->info, 0, IB_NODE_VENDORID_F); + + if ((devid >= 0xc738 && devid <= 0xc73b) || + devid == 0xc839 || devid == 0xcb20 || devid == 0xcf08 || + devid == 0xcf09 || devid == 0xd2f0 || + ((vendorid == 0x119f) && + /* Bull SwitchX */ + (devid == 0x1b02 || devid == 0x1b50 || + /* Bull SwitchIB and SwitchIB2 */ + devid == 0x1ba0 || + (devid >= 0x1bd0 && devid <= 0x1bd5) || + /* Bull Quantum */ + devid == 0x1bf0))) + return 1; + if ((devid >= 0x1003 && devid <= 0x101b) || (devid == 0xa2d2) || + ((vendorid == 0x119f) && + /* Bull ConnectX3 */ + (devid == 0x1b33 || devid == 0x1b73 || + devid == 0x1b40 || devid == 0x1b41 || + devid == 0x1b60 || devid == 0x1b61 || + /* Bull ConnectIB */ + devid == 0x1b83 || + devid == 0x1b93 || devid == 0x1b94 || + /* Bull ConnectX4, Sequana HDR and HDR100 */ + devid == 0x1bb4 || devid == 0x1bb5 || + (devid >= 0x1bc4 && devid <= 0x1bc6)))) + return 1; + return 0; +} + +int mlnx_ext_port_info_err(smp_engine_t * engine, ibnd_smp_t * smp, + uint8_t * mad, void *cb_data) +{ + f_internal_t *f_int = ((ibnd_scan_t *) engine->user_data)->f_int; + ibnd_node_t *node = cb_data; + ibnd_port_t *port; + uint8_t port_num, local_port; + + port_num = (uint8_t) mad_get_field(mad, 0, IB_MAD_ATTRMOD_F); + port = node->ports[port_num]; + if (!port) { + IBND_ERROR("Failed to find 0x%" PRIx64 " port %u\n", + node->guid, port_num); + return -1; + } + + local_port = (uint8_t) mad_get_field(port->info, 0, IB_PORT_LOCAL_PORT_F); + debug_port(&smp->path, port); + + if (port_num && mad_get_field(port->info, 0, IB_PORT_PHYS_STATE_F) + == IB_PORT_PHYS_STATE_LINKUP + && ((node->type == IB_NODE_SWITCH && port_num != local_port) || + (node == f_int->fabric.from_node && port_num == f_int->fabric.from_portnum))) { + int rc = 0; + ib_portid_t path = smp->path; + + if (node->type != IB_NODE_SWITCH && + node == f_int->fabric.from_node && + path.drpath.cnt > 1) + rc = retract_dpath(engine, &path); + else { + /* we can't proceed through an HCA with DR */ + if (path.lid == 0 || node->type == IB_NODE_SWITCH) + rc = extend_dpath(engine, &path, port_num); + } + + if (rc > 0) { + struct ni_cbdata * cbdata = malloc(sizeof(*cbdata)); + cbdata->node = node; + cbdata->port_num = port_num; + query_node_info(engine, &path, cbdata); + } + } + + return 0; +} + +static int recv_mlnx_ext_port_info(smp_engine_t * engine, ibnd_smp_t * smp, + uint8_t * mad, void *cb_data) +{ + f_internal_t *f_int = ((ibnd_scan_t *) engine->user_data)->f_int; + ibnd_node_t *node = cb_data; + ibnd_port_t *port; + uint8_t *ext_port_info = mad + IB_SMP_DATA_OFFS; + uint8_t port_num, local_port; + + port_num = (uint8_t) mad_get_field(mad, 0, IB_MAD_ATTRMOD_F); + port = node->ports[port_num]; + if (!port) { + IBND_ERROR("Failed to find 0x%" PRIx64 " port %u\n", + node->guid, port_num); + return -1; + } + + memcpy(port->ext_info, ext_port_info, sizeof(port->ext_info)); + local_port = (uint8_t) mad_get_field(port->info, 0, IB_PORT_LOCAL_PORT_F); + debug_port(&smp->path, port); + + if (port_num && mad_get_field(port->info, 0, IB_PORT_PHYS_STATE_F) + == IB_PORT_PHYS_STATE_LINKUP + && ((node->type == IB_NODE_SWITCH && port_num != local_port) || + (node == f_int->fabric.from_node && port_num == f_int->fabric.from_portnum))) { + int rc = 0; + ib_portid_t path = 
smp->path; + + if (node->type != IB_NODE_SWITCH && + node == f_int->fabric.from_node && + path.drpath.cnt > 1) + rc = retract_dpath(engine, &path); + else { + /* we can't proceed through an HCA with DR */ + if (path.lid == 0 || node->type == IB_NODE_SWITCH) + rc = extend_dpath(engine, &path, port_num); + } + + if (rc > 0) { + struct ni_cbdata * cbdata = malloc(sizeof(*cbdata)); + cbdata->node = node; + cbdata->port_num = port_num; + query_node_info(engine, &path, cbdata); + } + } + + return 0; +} + +static int query_mlnx_ext_port_info(smp_engine_t * engine, ib_portid_t * portid, + ibnd_node_t * node, int portnum) +{ + IBND_DEBUG("Query MLNX Extended Port Info; %s (0x%" PRIx64 "):%d\n", + portid2str(portid), node->guid, portnum); + return issue_smp(engine, portid, IB_ATTR_MLNX_EXT_PORT_INFO, portnum, + recv_mlnx_ext_port_info, node); +} + +static int recv_port_info(smp_engine_t * engine, ibnd_smp_t * smp, + uint8_t * mad, void *cb_data) +{ + ibnd_scan_t *scan = (ibnd_scan_t *)engine->user_data; + f_internal_t *f_int = scan->f_int; + ibnd_node_t *node = cb_data; + ibnd_port_t *port; + uint8_t *port_info = mad + IB_SMP_DATA_OFFS; + uint8_t port_num, local_port; + int phystate, ispeed, espeed; + uint8_t *info; + uint32_t cap_mask; + + port_num = (uint8_t) mad_get_field(mad, 0, IB_MAD_ATTRMOD_F); + local_port = (uint8_t) mad_get_field(port_info, 0, IB_PORT_LOCAL_PORT_F); + + /* this may have been created before */ + port = node->ports[port_num]; + if (!port) { + port = node->ports[port_num] = calloc(1, sizeof(*port)); + if (!port) { + IBND_ERROR("Failed to allocate 0x%" PRIx64 " port %u\n", + node->guid, port_num); + return -1; + } + port->guid = + mad_get_field64(node->info, 0, IB_NODE_PORT_GUID_F); + } + + memcpy(port->info, port_info, sizeof(port->info)); + port->node = node; + port->portnum = port_num; + port->ext_portnum = 0; + port->base_lid = (uint16_t) mad_get_field(port->info, 0, IB_PORT_LID_F); + port->lmc = (uint8_t) mad_get_field(port->info, 0, IB_PORT_LMC_F); + + if (port_num == 0) { + node->smalid = port->base_lid; + node->smalmc = port->lmc; + } else if (node->type == IB_NODE_SWITCH) { + port->base_lid = node->smalid; + port->lmc = node->smalmc; + } + + int rc1 = add_to_portguid_hash(port, f_int->fabric.portstbl); + if (rc1) + IBND_ERROR("Error Occurred when trying" + " to insert new port guid 0x%016" PRIx64 " to DB\n", + port->guid); + + add_to_portlid_hash(port, f_int); + + if ((scan->cfg->flags & IBND_CONFIG_MLX_EPI) + && is_mlnx_ext_port_info_supported(port)) { + phystate = mad_get_field(port->info, 0, IB_PORT_PHYS_STATE_F); + ispeed = mad_get_field(port->info, 0, IB_PORT_LINK_SPEED_ACTIVE_F); + if (port->node->type == IB_NODE_SWITCH) + info = (uint8_t *)&port->node->ports[0]->info; + else + info = (uint8_t *)&port->info; + cap_mask = mad_get_field(info, 0, IB_PORT_CAPMASK_F); + if (cap_mask & be32toh(IB_PORT_CAP_HAS_EXT_SPEEDS)) + espeed = mad_get_field(port->info, 0, IB_PORT_LINK_SPEED_EXT_ACTIVE_F); + else + espeed = 0; + + if (phystate == IB_PORT_PHYS_STATE_LINKUP && + ispeed == IB_LINK_SPEED_ACTIVE_10 && + espeed == IB_LINK_SPEED_EXT_ACTIVE_NONE) { /* LinkUp/QDR */ + query_mlnx_ext_port_info(engine, &smp->path, + node, port_num); + return 0; + } + } + + debug_port(&smp->path, port); + + if (port_num && mad_get_field(port->info, 0, IB_PORT_PHYS_STATE_F) + == IB_PORT_PHYS_STATE_LINKUP + && ((node->type == IB_NODE_SWITCH && port_num != local_port) || + (node == f_int->fabric.from_node && port_num == f_int->fabric.from_portnum))) { + + int rc = 0; + ib_portid_t path = 
smp->path; + + if (node->type != IB_NODE_SWITCH && + node == f_int->fabric.from_node && + path.drpath.cnt > 1) + rc = retract_dpath(engine, &path); + else { + /* we can't proceed through an HCA with DR */ + if (path.lid == 0 || node->type == IB_NODE_SWITCH) + rc = extend_dpath(engine, &path, port_num); + } + + if (rc > 0) { + struct ni_cbdata * cbdata = malloc(sizeof(*cbdata)); + cbdata->node = node; + cbdata->port_num = port_num; + query_node_info(engine, &path, cbdata); + } + } + + return 0; +} + +static int recv_port0_info(smp_engine_t * engine, ibnd_smp_t * smp, + uint8_t * mad, void *cb_data) +{ + ibnd_node_t *node = cb_data; + int i, status; + + status = recv_port_info(engine, smp, mad, cb_data); + /* Query PortInfo on switch external/physical ports */ + for (i = 1; i <= node->numports; i++) + query_port_info(engine, &smp->path, node, i); + + return status; +} + +static int query_port_info(smp_engine_t * engine, ib_portid_t * portid, + ibnd_node_t * node, int portnum) +{ + IBND_DEBUG("Query Port Info; %s (0x%" PRIx64 "):%d\n", + portid2str(portid), node->guid, portnum); + return issue_smp(engine, portid, IB_ATTR_PORT_INFO, portnum, + portnum ? recv_port_info : recv_port0_info, node); +} + +static ibnd_node_t *create_node(smp_engine_t * engine, ib_portid_t * path, + uint8_t * node_info) +{ + f_internal_t *f_int = ((ibnd_scan_t *) engine->user_data)->f_int; + ibnd_node_t *rc = calloc(1, sizeof(*rc)); + if (!rc) { + IBND_ERROR("OOM: node creation failed\n"); + return NULL; + } + + /* decode just a couple of fields for quicker reference. */ + mad_decode_field(node_info, IB_NODE_GUID_F, &rc->guid); + mad_decode_field(node_info, IB_NODE_TYPE_F, &rc->type); + mad_decode_field(node_info, IB_NODE_NPORTS_F, &rc->numports); + + rc->ports = calloc(rc->numports + 1, sizeof(*rc->ports)); + if (!rc->ports) { + free(rc); + IBND_ERROR("OOM: Failed to allocate the ports array\n"); + return NULL; + } + + rc->path_portid = *path; + memcpy(rc->info, node_info, sizeof(rc->info)); + + int rc1 = add_to_nodeguid_hash(rc, f_int->fabric.nodestbl); + if (rc1) + IBND_ERROR("Error Occurred when trying" + " to insert new node guid 0x%016" PRIx64 " to DB\n", + rc->guid); + + /* add this to the all nodes list */ + rc->next = f_int->fabric.nodes; + f_int->fabric.nodes = rc; + + add_to_type_list(rc, f_int); + + return rc; +} + +static void link_ports(ibnd_node_t * node, ibnd_port_t * port, + ibnd_node_t * remotenode, ibnd_port_t * remoteport) +{ + IBND_DEBUG("linking: 0x%" PRIx64 " %p->%p:%u and 0x%" PRIx64 + " %p->%p:%u\n", node->guid, node, port, port->portnum, + remotenode->guid, remotenode, remoteport, + remoteport->portnum); + if (port->remoteport) + port->remoteport->remoteport = NULL; + if (remoteport->remoteport) + remoteport->remoteport->remoteport = NULL; + port->remoteport = remoteport; + remoteport->remoteport = port; +} + +static void dump_endnode(ib_portid_t *path, const char *prompt, + ibnd_node_t *node, ibnd_port_t *port) +{ + char type[64]; + mad_dump_node_type(type, sizeof(type), &node->type, sizeof(int)); + printf("%s -> %s %s {%016" PRIx64 "} portnum %d lid %d-%d \"%s\"\n", + portid2str(path), prompt, type, node->guid, + node->type == IB_NODE_SWITCH ? 
0 : port->portnum,
+	       port->base_lid, port->base_lid + (1 << port->lmc) - 1,
+	       node->nodedesc);
+}
+
+static int recv_node_info(smp_engine_t * engine, ibnd_smp_t * smp,
+			  uint8_t * mad, void *cb_data)
+{
+	ibnd_scan_t *scan = engine->user_data;
+	f_internal_t *f_int = scan->f_int;
+	uint8_t *node_info = mad + IB_SMP_DATA_OFFS;
+	struct ni_cbdata *ni_cbdata = (struct ni_cbdata *)cb_data;
+	ibnd_node_t *rem_node = NULL;
+	int rem_port_num = 0;
+	ibnd_node_t *node;
+	int node_is_new = 0;
+	uint64_t node_guid = mad_get_field64(node_info, 0, IB_NODE_GUID_F);
+	uint64_t port_guid = mad_get_field64(node_info, 0, IB_NODE_PORT_GUID_F);
+	int port_num = mad_get_field(node_info, 0, IB_NODE_LOCAL_PORT_F);
+	ibnd_port_t *port = NULL;
+
+	if (ni_cbdata) {
+		rem_node = ni_cbdata->node;
+		rem_port_num = ni_cbdata->port_num;
+		free(ni_cbdata);
+	}
+
+	node = ibnd_find_node_guid(&f_int->fabric, node_guid);
+	if (!node) {
+		node = create_node(engine, &smp->path, node_info);
+		if (!node)
+			return -1;
+		node_is_new = 1;
+	}
+	IBND_DEBUG("Found %s node GUID 0x%" PRIx64 " (%s)\n",
+		   node_is_new ? "new" : "old", node->guid,
+		   portid2str(&smp->path));
+
+	port = node->ports[port_num];
+	if (!port) {
+		/* If we have not seen this port before create a shell for it */
+		port = node->ports[port_num] = calloc(1, sizeof(*port));
+		if (!port)
+			return -1;
+		port->node = node;
+		port->portnum = port_num;
+	}
+	port->guid = port_guid;
+
+	if (scan->cfg->show_progress)
+		dump_endnode(&smp->path, node_is_new ? "new" : "known",
+			     node, port);
+
+	if (rem_node == NULL) {	/* this is the start node */
+		f_int->fabric.from_node = node;
+		f_int->fabric.from_portnum = port_num;
+	} else {
+		/* link ports... */
+		if (!rem_node->ports[rem_port_num]) {
+			IBND_ERROR("Internal Error; "
+				   "Node(%p) 0x%" PRIx64
+				   " Port %d no port created!?!?!?\n\n",
+				   rem_node, rem_node->guid, rem_port_num);
+			return -1;
+		}
+
+		link_ports(node, port, rem_node, rem_node->ports[rem_port_num]);
+	}
+
+	if (node_is_new) {
+		query_node_desc(engine, &smp->path, node);
+
+		if (node->type == IB_NODE_SWITCH) {
+			query_switch_info(engine, &smp->path, node);
+			/* Query PortInfo on Switch Port 0 first */
+			query_port_info(engine, &smp->path, node, 0);
+		}
+	}
+
+	if (node->type != IB_NODE_SWITCH)
+		query_port_info(engine, &smp->path, node, port_num);
+
+	return 0;
+}
+
+static int query_node_info(smp_engine_t * engine, ib_portid_t * portid,
+			   struct ni_cbdata * cbdata)
+{
+	IBND_DEBUG("Query Node Info; %s\n", portid2str(portid));
+	return issue_smp(engine, portid, IB_ATTR_NODE_INFO, 0,
+			 recv_node_info, (void *)cbdata);
+}
+
+ibnd_node_t *ibnd_find_node_guid(ibnd_fabric_t * fabric, uint64_t guid)
+{
+	int hash = HASHGUID(guid) % HTSZ;
+	ibnd_node_t *node;
+
+	if (!fabric) {
+		IBND_DEBUG("fabric parameter NULL\n");
+		return NULL;
+	}
+
+	for (node = fabric->nodestbl[hash]; node; node = node->htnext)
+		if (node->guid == guid)
+			return node;
+
+	return NULL;
+}
+
+ibnd_node_t *ibnd_find_node_dr(ibnd_fabric_t * fabric, char *dr_str)
+{
+	ibnd_port_t *rc = ibnd_find_port_dr(fabric, dr_str);
+	/* ibnd_find_port_dr() returns NULL on failure; guard the deref */
+	return rc ? rc->node : NULL;
+}
+
+int add_to_nodeguid_hash(ibnd_node_t * node, ibnd_node_t * hash[])
+{
+	int rc = 0;
+	ibnd_node_t *tblnode;
+	int hash_idx = HASHGUID(node->guid) % HTSZ;
+
+	for (tblnode = hash[hash_idx]; tblnode; tblnode = tblnode->htnext) {
+		if (tblnode == node) {
+			IBND_ERROR("Duplicate Node: Node with guid 0x%016"
+				   PRIx64 " already exists in nodes DB\n",
+				   node->guid);
+			return 1;
+		}
+	}
+	node->htnext = hash[hash_idx];
+	hash[hash_idx] = node;
+	return rc;
+}
+
+int
add_to_portguid_hash(ibnd_port_t * port, ibnd_port_t * hash[]) +{ + int rc = 0; + ibnd_port_t *tblport; + int hash_idx = HASHGUID(port->guid) % HTSZ; + + for (tblport = hash[hash_idx]; tblport; tblport = tblport->htnext) { + if (tblport == port) { + IBND_ERROR("Duplicate Port: Port with guid 0x%016" + PRIx64 " already exists in ports DB\n", + port->guid); + return 1; + } + } + port->htnext = hash[hash_idx]; + hash[hash_idx] = port; + return rc; +} + +struct lid2guid_item { + cl_map_item_t cl_map; + ibnd_port_t *port; +}; + +void create_lid2guid(f_internal_t *f_int) +{ + cl_qmap_init(&f_int->lid2guid); +} + +void destroy_lid2guid(f_internal_t *f_int) +{ + cl_map_item_t *item; + + for (item = cl_qmap_head(&f_int->lid2guid); item != cl_qmap_end(&f_int->lid2guid); + item = cl_qmap_head(&f_int->lid2guid)) { + cl_qmap_remove_item(&f_int->lid2guid, item); + free(container_of(item, struct lid2guid_item, cl_map)); + } +} + +void add_to_portlid_hash(ibnd_port_t * port, f_internal_t *f_int) +{ + uint16_t base_lid = port->base_lid; + uint16_t lid_mask = ((1 << port->lmc) -1); + uint16_t lid = 0; + /* 0 < valid lid <= 0xbfff */ + if (base_lid > 0 && base_lid <= 0xbfff) { + /* We add the port for all lids + * so it is easier to find any "random" lid specified */ + for (lid = base_lid; lid <= (base_lid + lid_mask); lid++) { + struct lid2guid_item *item; + + item = malloc(sizeof(*item)); + if (item) { + item->port = port; + cl_qmap_insert(&f_int->lid2guid, lid, + &item->cl_map); + } + } + } +} + +void add_to_type_list(ibnd_node_t * node, f_internal_t * f_int) +{ + ibnd_fabric_t *fabric = &f_int->fabric; + switch (node->type) { + case IB_NODE_CA: + node->type_next = fabric->ch_adapters; + fabric->ch_adapters = node; + break; + case IB_NODE_SWITCH: + node->type_next = fabric->switches; + fabric->switches = node; + break; + case IB_NODE_ROUTER: + node->type_next = fabric->routers; + fabric->routers = node; + break; + } +} + +static int set_config(struct ibnd_config *config, struct ibnd_config *cfg) +{ + if (!config) + return (-EINVAL); + + if (cfg) + memcpy(config, cfg, sizeof(*config)); + + if (!config->max_smps) + config->max_smps = DEFAULT_MAX_SMP_ON_WIRE; + if (!config->timeout_ms) + config->timeout_ms = DEFAULT_TIMEOUT; + if (!config->retries) + config->retries = DEFAULT_RETRIES; + + return (0); +} + +f_internal_t *allocate_fabric_internal(void) +{ + f_internal_t *f = calloc(1, sizeof(*f)); + if (f) + create_lid2guid(f); + + return (f); +} + +ibnd_fabric_t *ibnd_discover_fabric(char * ca_name, int ca_port, + ib_portid_t * from, + struct ibnd_config *cfg) +{ + struct ibnd_config config = { 0 }; + f_internal_t *f_int = NULL; + ib_portid_t my_portid = { 0 }; + smp_engine_t engine; + ibnd_scan_t scan; + struct ibmad_port *ibmad_port; + int nc = 2; + int mc[2] = { IB_SMI_CLASS, IB_SMI_DIRECT_CLASS }; + + /* If not specified start from "my" port */ + if (!from) + from = &my_portid; + + if (set_config(&config, cfg)) { + IBND_ERROR("Invalid ibnd_config\n"); + return NULL; + } + + f_int = allocate_fabric_internal(); + if (!f_int) { + IBND_ERROR("OOM: failed to calloc ibnd_fabric_t\n"); + return NULL; + } + + memset(&scan.selfportid, 0, sizeof(scan.selfportid)); + scan.f_int = f_int; + scan.cfg = &config; + scan.initial_hops = from->drpath.cnt; + + ibmad_port = mad_rpc_open_port(ca_name, ca_port, mc, nc); + if (!ibmad_port) { + IBND_ERROR("can't open MAD port (%s:%d)\n", ca_name, ca_port); + return (NULL); + } + mad_rpc_set_timeout(ibmad_port, cfg->timeout_ms); + mad_rpc_set_retries(ibmad_port, cfg->retries); + 
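+	/*
+	 * Note: this libibmad RPC port is used only to program the mkey
+	 * and resolve the local port id (selfportid) just below; it is
+	 * closed again before the scan itself, which runs over the
+	 * umad-based smp_engine instead.
+	 */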
smp_mkey_set(ibmad_port, cfg->mkey); + + if (ib_resolve_self_via(&scan.selfportid, + NULL, NULL, ibmad_port) < 0) { + IBND_ERROR("Failed to resolve self\n"); + mad_rpc_close_port(ibmad_port); + return NULL; + } + mad_rpc_close_port(ibmad_port); + + if (smp_engine_init(&engine, ca_name, ca_port, &scan, &config)) { + free(f_int); + return (NULL); + } + + IBND_DEBUG("from %s\n", portid2str(from)); + + if (!query_node_info(&engine, from, NULL)) + if (process_mads(&engine) != 0) + goto error; + + f_int->fabric.total_mads_used = engine.total_smps; + f_int->fabric.maxhops_discovered += scan.initial_hops; + + if (group_nodes(&f_int->fabric)) + goto error; + + smp_engine_destroy(&engine); + return (ibnd_fabric_t *)f_int; +error: + smp_engine_destroy(&engine); + ibnd_destroy_fabric(&f_int->fabric); + return NULL; +} + +void destroy_node(ibnd_node_t * node) +{ + int p = 0; + + if (node->ports) { + for (p = 0; p <= node->numports; p++) + free(node->ports[p]); + free(node->ports); + } + free(node); +} + +void ibnd_destroy_fabric(ibnd_fabric_t * fabric) +{ + ibnd_node_t *node = NULL; + ibnd_node_t *next = NULL; + ibnd_chassis_t *ch, *ch_next; + + if (!fabric) + return; + + ch = fabric->chassis; + while (ch) { + ch_next = ch->next; + free(ch); + ch = ch_next; + } + node = fabric->nodes; + while (node) { + next = node->next; + destroy_node(node); + node = next; + } + destroy_lid2guid((f_internal_t *)fabric); + free(fabric); +} + +void ibnd_iter_nodes(ibnd_fabric_t * fabric, ibnd_iter_node_func_t func, + void *user_data) +{ + ibnd_node_t *cur = NULL; + + if (!fabric) { + IBND_DEBUG("fabric parameter NULL\n"); + return; + } + + if (!func) { + IBND_DEBUG("func parameter NULL\n"); + return; + } + + for (cur = fabric->nodes; cur; cur = cur->next) + func(cur, user_data); +} + +void ibnd_iter_nodes_type(ibnd_fabric_t * fabric, ibnd_iter_node_func_t func, + int node_type, void *user_data) +{ + ibnd_node_t *list = NULL; + ibnd_node_t *cur = NULL; + + if (!fabric) { + IBND_DEBUG("fabric parameter NULL\n"); + return; + } + + if (!func) { + IBND_DEBUG("func parameter NULL\n"); + return; + } + + switch (node_type) { + case IB_NODE_SWITCH: + list = fabric->switches; + break; + case IB_NODE_CA: + list = fabric->ch_adapters; + break; + case IB_NODE_ROUTER: + list = fabric->routers; + break; + default: + IBND_DEBUG("Invalid node_type specified %d\n", node_type); + break; + } + + for (cur = list; cur; cur = cur->type_next) + func(cur, user_data); +} + +ibnd_port_t *ibnd_find_port_lid(ibnd_fabric_t * fabric, + uint16_t lid) +{ + f_internal_t *f = (f_internal_t *)fabric; + + return container_of(cl_qmap_get(&f->lid2guid, lid), + struct lid2guid_item, cl_map) + ->port; +} + +ibnd_port_t *ibnd_find_port_guid(ibnd_fabric_t * fabric, uint64_t guid) +{ + int hash = HASHGUID(guid) % HTSZ; + ibnd_port_t *port; + + if (!fabric) { + IBND_DEBUG("fabric parameter NULL\n"); + return NULL; + } + + for (port = fabric->portstbl[hash]; port; port = port->htnext) + if (port->guid == guid) + return port; + + return NULL; +} + +ibnd_port_t *ibnd_find_port_dr(ibnd_fabric_t * fabric, char *dr_str) +{ + int i = 0; + ibnd_node_t *cur_node; + ibnd_port_t *rc = NULL; + ib_dr_path_t path; + + if (!fabric) { + IBND_DEBUG("fabric parameter NULL\n"); + return NULL; + } + + if (!dr_str) { + IBND_DEBUG("dr_str parameter NULL\n"); + return NULL; + } + + cur_node = fabric->from_node; + + if (str2drpath(&path, dr_str, 0, 0) == -1) + return NULL; + + for (i = 0; i <= path.cnt; i++) { + ibnd_port_t *remote_port = NULL; + if (path.p[i] == 0) + continue; + if 
(!cur_node->ports) + return NULL; + + remote_port = cur_node->ports[path.p[i]]->remoteport; + if (!remote_port) + return NULL; + + rc = remote_port; + cur_node = remote_port->node; + } + + return rc; +} + +void ibnd_iter_ports(ibnd_fabric_t * fabric, ibnd_iter_port_func_t func, + void *user_data) +{ + int i = 0; + ibnd_port_t *cur = NULL; + + if (!fabric) { + IBND_DEBUG("fabric parameter NULL\n"); + return; + } + + if (!func) { + IBND_DEBUG("func parameter NULL\n"); + return; + } + + for (i = 0; i<HTSZ; i++) + for (cur = fabric->portstbl[i]; cur; cur = cur->htnext) + func(cur, user_data); +} diff --git a/libibnetdisc/ibnetdisc.h b/libibnetdisc/ibnetdisc.h new file mode 100644 index 0000000..51fcbe0 --- /dev/null +++ b/libibnetdisc/ibnetdisc.h @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2009 Voltaire, Inc. All rights reserved. + * Copyright (c) 2008 Lawrence Livermore National Lab. All rights reserved. + * Copyright (c) 2010-2011 Mellanox Technologies LTD. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef _IBNETDISC_H_ +#define _IBNETDISC_H_ + +#include <stdio.h> +#include <infiniband/mad.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct ibnd_chassis; /* forward declare */ +struct ibnd_port; /* forward declare */ + +#define CHASSIS_TYPE_SIZE 20 + +/** ========================================================================= + * Node + */ +typedef struct ibnd_node { + struct ibnd_node *next; /* all node list in fabric */ + + ib_portid_t path_portid; /* path from "from_node" */ + /* NOTE: this is not valid on a fabric + * read from a cache file */ + uint16_t smalid; + uint8_t smalmc; + + /* quick cache of switchinfo below */ + int smaenhsp0; + /* use libibmad decoder functions for switchinfo */ + uint8_t switchinfo[IB_SMP_DATA_SIZE]; + + /* quick cache of info below */ + uint64_t guid; + int type; + int numports; + /* use libibmad decoder functions for info */ + uint8_t info[IB_SMP_DATA_SIZE]; + + char nodedesc[IB_SMP_DATA_SIZE]; + + struct ibnd_port **ports; /* array of ports, indexed by port number + ports[1] == port 1, + ports[2] == port 2, + etc... + Any port in the array MAY BE NULL! 
+				   Most notable is non-switches have no
+				   port 0 therefore node.ports[0] == NULL
+				   for those nodes */
+
+	/* chassis info */
+	struct ibnd_node *next_chassis_node;	/* next node in ibnd_chassis_t->nodes */
+	struct ibnd_chassis *chassis;	/* if != NULL the chassis this node belongs to */
+	unsigned char ch_type;
+	char ch_type_str[CHASSIS_TYPE_SIZE];
+	unsigned char ch_anafanum;
+	unsigned char ch_slotnum;
+	unsigned char ch_slot;
+
+	/* internal use only */
+	unsigned char ch_found;
+	struct ibnd_node *htnext;	/* hash table list */
+	struct ibnd_node *type_next;	/* next based on type */
+} ibnd_node_t;
+
+/** =========================================================================
+ * Port
+ */
+typedef struct ibnd_port {
+	uint64_t guid;
+	int portnum;
+	int ext_portnum;	/* optional if != 0 external port num */
+	ibnd_node_t *node;	/* node this port belongs to */
+	struct ibnd_port *remoteport;	/* null if SMA, or does not exist */
+	/* quick cache of info below */
+	uint16_t base_lid;
+	uint8_t lmc;
+	/* use libibmad decoder functions for info */
+	uint8_t info[IB_SMP_DATA_SIZE];
+	uint8_t ext_info[IB_SMP_DATA_SIZE];
+
+	/* internal use only */
+	struct ibnd_port *htnext;
+} ibnd_port_t;
+
+/** =========================================================================
+ * Chassis
+ */
+typedef struct ibnd_chassis {
+	struct ibnd_chassis *next;
+	uint64_t chassisguid;
+	unsigned char chassisnum;
+
+	/* generic grouping by SystemImageGUID */
+	unsigned char nodecount;
+	ibnd_node_t *nodes;
+
+	/* specific to voltaire type nodes */
+#define SPINES_MAX_NUM 18
+#define LINES_MAX_NUM 36
+	ibnd_node_t *spinenode[SPINES_MAX_NUM + 1];
+	ibnd_node_t *linenode[LINES_MAX_NUM + 1];
+} ibnd_chassis_t;
+
+#define HTSZ 137
+
+/* define config flags */
+#define IBND_CONFIG_MLX_EPI (1 << 0)
+
+typedef struct ibnd_config {
+	unsigned max_smps;
+	unsigned show_progress;
+	unsigned max_hops;
+	unsigned debug;
+	unsigned timeout_ms;
+	unsigned retries;
+	uint32_t flags;
+	uint64_t mkey;
+	uint8_t pad[44];
+} ibnd_config_t;
+
+/** =========================================================================
+ * Fabric
+ * Main fabric object which is returned and represents the data discovered
+ */
+typedef struct ibnd_fabric {
+	/* the node the discover was initiated from
+	 * "from" parameter in ibnd_discover_fabric
+	 * or by default the node you are running on
+	 */
+	ibnd_node_t *from_node;
+	int from_portnum;
+
+	/* NULL terminated list of all nodes in the fabric */
+	ibnd_node_t *nodes;
+	/* NULL terminated list of all chassis found in the fabric */
+	ibnd_chassis_t *chassis;
+	unsigned maxhops_discovered;
+	unsigned total_mads_used;
+
+	/* internal use only */
+	ibnd_node_t *nodestbl[HTSZ];
+	ibnd_port_t *portstbl[HTSZ];
+	ibnd_node_t *switches;
+	ibnd_node_t *ch_adapters;
+	ibnd_node_t *routers;
+} ibnd_fabric_t;
+
+/** =========================================================================
+ * Initialization (fabric operations)
+ */
+
+ibnd_fabric_t *ibnd_discover_fabric(char *ca_name, int ca_port,
+				    ib_portid_t *from,
+				    struct ibnd_config *config);
+	/**
+	 * ca_name: (optional) name of the CA to use
+	 * ca_port: (optional) CA port to use
+	 * from: (optional) specify the node to start scanning from.
+ * If NULL start from the CA/CA port specified + * config: (optional) additional config options for the scan + */ +void ibnd_destroy_fabric(ibnd_fabric_t *fabric); + +ibnd_fabric_t *ibnd_load_fabric(const char *file, unsigned int flags); + +int ibnd_cache_fabric(ibnd_fabric_t *fabric, const char *file, + unsigned int flags); + +#define IBND_CACHE_FABRIC_FLAG_DEFAULT 0x0000 +#define IBND_CACHE_FABRIC_FLAG_NO_OVERWRITE 0x0001 + +/** ========================================================================= + * Node operations + */ +ibnd_node_t *ibnd_find_node_guid(ibnd_fabric_t *fabric, uint64_t guid); +ibnd_node_t *ibnd_find_node_dr(ibnd_fabric_t *fabric, char *dr_str); + +typedef void (*ibnd_iter_node_func_t) (ibnd_node_t * node, void *user_data); +void ibnd_iter_nodes(ibnd_fabric_t *fabric, ibnd_iter_node_func_t func, + void *user_data); +void ibnd_iter_nodes_type(ibnd_fabric_t *fabric, ibnd_iter_node_func_t func, + int node_type, void *user_data); + +/** ========================================================================= + * Port operations + */ +ibnd_port_t *ibnd_find_port_guid(ibnd_fabric_t *fabric, uint64_t guid); +ibnd_port_t *ibnd_find_port_dr(ibnd_fabric_t *fabric, char *dr_str); +ibnd_port_t *ibnd_find_port_lid(ibnd_fabric_t *fabric, uint16_t lid); + +typedef void (*ibnd_iter_port_func_t) (ibnd_port_t * port, void *user_data); +void ibnd_iter_ports(ibnd_fabric_t *fabric, ibnd_iter_port_func_t func, + void *user_data); + +/** ========================================================================= + * Chassis queries + */ +uint64_t ibnd_get_chassis_guid(ibnd_fabric_t *fabric, unsigned char chassisnum); +const char *ibnd_get_chassis_type(ibnd_node_t *node); +char *ibnd_get_chassis_slot_str(ibnd_node_t *node, char *str, size_t size); + +int ibnd_is_xsigo_guid(uint64_t guid); +int ibnd_is_xsigo_tca(uint64_t guid); +int ibnd_is_xsigo_hca(uint64_t guid); + +#ifdef __cplusplus +} +#endif + +#endif /* _IBNETDISC_H_ */ diff --git a/libibnetdisc/ibnetdisc_cache.c b/libibnetdisc/ibnetdisc_cache.c new file mode 100644 index 0000000..605582f --- /dev/null +++ b/libibnetdisc/ibnetdisc_cache.c @@ -0,0 +1,963 @@ +/* + * Copyright (c) 2004-2007 Voltaire Inc. All rights reserved. + * Copyright (c) 2007 Xsigo Systems Inc. All rights reserved. + * Copyright (c) 2008 Lawrence Livermore National Laboratory + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> +#include <fcntl.h> +#include <string.h> +#include <errno.h> +#include <inttypes.h> + +#include <infiniband/ibnetdisc.h> + +#include "internal.h" +#include "chassis.h" + +/* For this caching lib, we always cache little endian */ + +/* Cache format + * + * Bytes 1-4 - magic number + * Bytes 5-8 - version number + * Bytes 9-12 - node count + * Bytes 13-16 - port count + * Bytes 17-24 - "from node" guid + * Bytes 25-28 - maxhops discovered + * Bytes X-Y - nodes (variable length) + * Bytes X-Y - ports (variable length) + * + * Nodes are cached as + * + * 2 bytes - smalid + * 1 byte - smalmc + * 1 byte - smaenhsp0 flag + * IB_SMP_DATA_SIZE bytes - switchinfo + * 8 bytes - guid + * 1 byte - type + * 1 byte - numports + * IB_SMP_DATA_SIZE bytes - info + * IB_SMP_DATA_SIZE bytes - nodedesc + * 1 byte - number of ports stored + * 8 bytes - portguid A + * 1 byte - port num A + * 8 bytes - portguid B + * 1 byte - port num B + * ... etc., depending on number of ports stored + * + * Ports are cached as + * + * 8 bytes - guid + * 1 byte - portnum + * 1 byte - external portnum + * 2 bytes - base lid + * 1 byte - lmc + * IB_SMP_DATA_SIZE bytes - info + * 8 bytes - node guid port "owned" by + * 1 byte - flag indicating if remote port exists + * 8 bytes - port guid remotely connected to + * 1 byte - port num remotely connected to + */ + +/* Structs that hold cache info temporarily before + * the real structs can be reconstructed. 
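+ *
+ * Loading is therefore two-pass: _load_node()/_load_port() first parse
+ * every record into these cache structs (keyed by GUID), and
+ * _rebuild_nodes()/_rebuild_ports() then stitch the real
+ * ibnd_node_t/ibnd_port_t pointers (ports arrays, remoteport links)
+ * back together. As a worked example of the on-disk size: a CA with
+ * ports 1 and 2 populated is stored as IBND_NODE_CACHE_HEADER_LEN +
+ * 2 * IBND_PORT_CACHE_KEY_LEN bytes.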
+ */ + +typedef struct ibnd_port_cache_key { + uint64_t guid; + uint8_t portnum; +} ibnd_port_cache_key_t; + +typedef struct ibnd_node_cache { + ibnd_node_t *node; + uint8_t ports_stored_count; + ibnd_port_cache_key_t *port_cache_keys; + struct ibnd_node_cache *next; + struct ibnd_node_cache *htnext; + int node_stored_to_fabric; +} ibnd_node_cache_t; + +typedef struct ibnd_port_cache { + ibnd_port_t *port; + uint64_t node_guid; + uint8_t remoteport_flag; + ibnd_port_cache_key_t remoteport_cache_key; + struct ibnd_port_cache *next; + struct ibnd_port_cache *htnext; + int port_stored_to_fabric; +} ibnd_port_cache_t; + +typedef struct ibnd_fabric_cache { + f_internal_t *f_int; + uint64_t from_node_guid; + ibnd_node_cache_t *nodes_cache; + ibnd_port_cache_t *ports_cache; + ibnd_node_cache_t *nodescachetbl[HTSZ]; + ibnd_port_cache_t *portscachetbl[HTSZ]; +} ibnd_fabric_cache_t; + +#define IBND_FABRIC_CACHE_BUFLEN 4096 +#define IBND_FABRIC_CACHE_MAGIC 0x8FE7832B +#define IBND_FABRIC_CACHE_VERSION 0x00000001 + +#define IBND_FABRIC_CACHE_COUNT_OFFSET 8 + +#define IBND_FABRIC_CACHE_HEADER_LEN (28) +#define IBND_NODE_CACHE_HEADER_LEN (15 + IB_SMP_DATA_SIZE*3) +#define IBND_PORT_CACHE_KEY_LEN (8 + 1) +#define IBND_PORT_CACHE_LEN (31 + IB_SMP_DATA_SIZE) + +static ssize_t ibnd_read(int fd, void *buf, size_t count) +{ + size_t count_done = 0; + ssize_t ret; + + while ((count - count_done) > 0) { + ret = read(fd, ((char *) buf) + count_done, count - count_done); + if (ret < 0) { + if (errno == EINTR) + continue; + else { + IBND_DEBUG("read: %s\n", strerror(errno)); + return -1; + } + } + if (!ret) + break; + count_done += ret; + } + + if (count_done != count) { + IBND_DEBUG("read: read short\n"); + return -1; + } + + return count_done; +} + +static size_t _unmarshall8(uint8_t * inbuf, uint8_t * num) +{ + (*num) = inbuf[0]; + + return (sizeof(*num)); +} + +static size_t _unmarshall16(uint8_t * inbuf, uint16_t * num) +{ + (*num) = ((uint16_t) inbuf[1] << 8) | inbuf[0]; + + return (sizeof(*num)); +} + +static size_t _unmarshall32(uint8_t * inbuf, uint32_t * num) +{ + (*num) = (uint32_t) inbuf[0]; + (*num) |= ((uint32_t) inbuf[1] << 8); + (*num) |= ((uint32_t) inbuf[2] << 16); + (*num) |= ((uint32_t) inbuf[3] << 24); + + return (sizeof(*num)); +} + +static size_t _unmarshall64(uint8_t * inbuf, uint64_t * num) +{ + (*num) = (uint64_t) inbuf[0]; + (*num) |= ((uint64_t) inbuf[1] << 8); + (*num) |= ((uint64_t) inbuf[2] << 16); + (*num) |= ((uint64_t) inbuf[3] << 24); + (*num) |= ((uint64_t) inbuf[4] << 32); + (*num) |= ((uint64_t) inbuf[5] << 40); + (*num) |= ((uint64_t) inbuf[6] << 48); + (*num) |= ((uint64_t) inbuf[7] << 56); + + return (sizeof(*num)); +} + +static size_t _unmarshall_buf(const void *inbuf, void *outbuf, unsigned int len) +{ + memcpy(outbuf, inbuf, len); + + return len; +} + +static int _load_header_info(int fd, ibnd_fabric_cache_t * fabric_cache, + unsigned int *node_count, unsigned int *port_count) +{ + uint8_t buf[IBND_FABRIC_CACHE_BUFLEN]; + uint32_t magic = 0; + uint32_t version = 0; + size_t offset = 0; + uint32_t tmp32; + + if (ibnd_read(fd, buf, IBND_FABRIC_CACHE_HEADER_LEN) < 0) + return -1; + + offset += _unmarshall32(buf + offset, &magic); + + if (magic != IBND_FABRIC_CACHE_MAGIC) { + IBND_DEBUG("invalid fabric cache file\n"); + return -1; + } + + offset += _unmarshall32(buf + offset, &version); + + if (version != IBND_FABRIC_CACHE_VERSION) { + IBND_DEBUG("invalid fabric cache version\n"); + return -1; + } + + offset += _unmarshall32(buf + offset, node_count); + offset += 
_unmarshall32(buf + offset, port_count); + + offset += _unmarshall64(buf + offset, &fabric_cache->from_node_guid); + offset += _unmarshall32(buf + offset, &tmp32); + fabric_cache->f_int->fabric.maxhops_discovered = tmp32; + + return 0; +} + +static void _destroy_ibnd_node_cache(ibnd_node_cache_t * node_cache) +{ + free(node_cache->port_cache_keys); + if (!node_cache->node_stored_to_fabric && node_cache->node) + destroy_node(node_cache->node); + free(node_cache); +} + +static void _destroy_ibnd_fabric_cache(ibnd_fabric_cache_t * fabric_cache) +{ + ibnd_node_cache_t *node_cache; + ibnd_node_cache_t *node_cache_next; + ibnd_port_cache_t *port_cache; + ibnd_port_cache_t *port_cache_next; + + if (!fabric_cache) + return; + + node_cache = fabric_cache->nodes_cache; + while (node_cache) { + node_cache_next = node_cache->next; + + _destroy_ibnd_node_cache(node_cache); + + node_cache = node_cache_next; + } + + port_cache = fabric_cache->ports_cache; + while (port_cache) { + port_cache_next = port_cache->next; + + if (!port_cache->port_stored_to_fabric && port_cache->port) + free(port_cache->port); + free(port_cache); + + port_cache = port_cache_next; + } + + free(fabric_cache); +} + +static void store_node_cache(ibnd_node_cache_t * node_cache, + ibnd_fabric_cache_t * fabric_cache) +{ + int hash_indx = HASHGUID(node_cache->node->guid) % HTSZ; + + node_cache->next = fabric_cache->nodes_cache; + fabric_cache->nodes_cache = node_cache; + + node_cache->htnext = fabric_cache->nodescachetbl[hash_indx]; + fabric_cache->nodescachetbl[hash_indx] = node_cache; +} + +static int _load_node(int fd, ibnd_fabric_cache_t * fabric_cache) +{ + uint8_t buf[IBND_FABRIC_CACHE_BUFLEN]; + ibnd_node_cache_t *node_cache = NULL; + ibnd_node_t *node = NULL; + size_t offset = 0; + uint8_t tmp8; + + node_cache = (ibnd_node_cache_t *) malloc(sizeof(ibnd_node_cache_t)); + if (!node_cache) { + IBND_DEBUG("OOM: node_cache\n"); + return -1; + } + memset(node_cache, '\0', sizeof(ibnd_node_cache_t)); + + node = (ibnd_node_t *) malloc(sizeof(ibnd_node_t)); + if (!node) { + IBND_DEBUG("OOM: node\n"); + free(node_cache); + return -1; + } + memset(node, '\0', sizeof(ibnd_node_t)); + + node_cache->node = node; + + if (ibnd_read(fd, buf, IBND_NODE_CACHE_HEADER_LEN) < 0) + goto cleanup; + + offset += _unmarshall16(buf + offset, &node->smalid); + offset += _unmarshall8(buf + offset, &node->smalmc); + offset += _unmarshall8(buf + offset, &tmp8); + node->smaenhsp0 = tmp8; + offset += _unmarshall_buf(buf + offset, node->switchinfo, + IB_SMP_DATA_SIZE); + offset += _unmarshall64(buf + offset, &node->guid); + offset += _unmarshall8(buf + offset, &tmp8); + node->type = tmp8; + offset += _unmarshall8(buf + offset, &tmp8); + node->numports = tmp8; + offset += _unmarshall_buf(buf + offset, node->info, IB_SMP_DATA_SIZE); + offset += _unmarshall_buf(buf + offset, node->nodedesc, + IB_SMP_DATA_SIZE); + + offset += _unmarshall8(buf + offset, &node_cache->ports_stored_count); + + if (node_cache->ports_stored_count) { + unsigned int tomalloc = 0; + unsigned int toread = 0; + unsigned int i; + + tomalloc = + sizeof(ibnd_port_cache_key_t) * + node_cache->ports_stored_count; + + toread = + IBND_PORT_CACHE_KEY_LEN * node_cache->ports_stored_count; + + node_cache->port_cache_keys = + (ibnd_port_cache_key_t *) malloc(tomalloc); + if (!node_cache->port_cache_keys) { + IBND_DEBUG("OOM: node_cache port_cache_keys\n"); + goto cleanup; + } + + if (ibnd_read(fd, buf, toread) < 0) + goto cleanup; + + offset = 0; + + for (i = 0; i < node_cache->ports_stored_count; i++) { 
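+			/* each stored key is 8 bytes of port GUID followed
+			 * by a 1 byte port number, IBND_PORT_CACHE_KEY_LEN
+			 * bytes in total */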
+ offset += + _unmarshall64(buf + offset, + &node_cache->port_cache_keys[i].guid); + offset += + _unmarshall8(buf + offset, + &node_cache-> + port_cache_keys[i].portnum); + } + } + + store_node_cache(node_cache, fabric_cache); + + return 0; + +cleanup: + _destroy_ibnd_node_cache(node_cache); + return -1; +} + +static void store_port_cache(ibnd_port_cache_t * port_cache, + ibnd_fabric_cache_t * fabric_cache) +{ + int hash_indx = HASHGUID(port_cache->port->guid) % HTSZ; + + port_cache->next = fabric_cache->ports_cache; + fabric_cache->ports_cache = port_cache; + + port_cache->htnext = fabric_cache->portscachetbl[hash_indx]; + fabric_cache->portscachetbl[hash_indx] = port_cache; +} + +static int _load_port(int fd, ibnd_fabric_cache_t * fabric_cache) +{ + uint8_t buf[IBND_FABRIC_CACHE_BUFLEN]; + ibnd_port_cache_t *port_cache = NULL; + ibnd_port_t *port = NULL; + size_t offset = 0; + uint8_t tmp8; + + port_cache = (ibnd_port_cache_t *) malloc(sizeof(ibnd_port_cache_t)); + if (!port_cache) { + IBND_DEBUG("OOM: port_cache\n"); + return -1; + } + memset(port_cache, '\0', sizeof(ibnd_port_cache_t)); + + port = (ibnd_port_t *) malloc(sizeof(ibnd_port_t)); + if (!port) { + IBND_DEBUG("OOM: port\n"); + free(port_cache); + return -1; + } + memset(port, '\0', sizeof(ibnd_port_t)); + + port_cache->port = port; + + if (ibnd_read(fd, buf, IBND_PORT_CACHE_LEN) < 0) + goto cleanup; + + offset += _unmarshall64(buf + offset, &port->guid); + offset += _unmarshall8(buf + offset, &tmp8); + port->portnum = tmp8; + offset += _unmarshall8(buf + offset, &tmp8); + port->ext_portnum = tmp8; + offset += _unmarshall16(buf + offset, &port->base_lid); + offset += _unmarshall8(buf + offset, &port->lmc); + offset += _unmarshall_buf(buf + offset, port->info, IB_SMP_DATA_SIZE); + offset += _unmarshall64(buf + offset, &port_cache->node_guid); + offset += _unmarshall8(buf + offset, &port_cache->remoteport_flag); + offset += + _unmarshall64(buf + offset, &port_cache->remoteport_cache_key.guid); + offset += + _unmarshall8(buf + offset, + &port_cache->remoteport_cache_key.portnum); + + store_port_cache(port_cache, fabric_cache); + + return 0; + +cleanup: + free(port); + free(port_cache); + return -1; +} + +static ibnd_port_cache_t *_find_port(ibnd_fabric_cache_t * fabric_cache, + ibnd_port_cache_key_t * port_cache_key) +{ + int hash_indx = HASHGUID(port_cache_key->guid) % HTSZ; + ibnd_port_cache_t *port_cache; + + for (port_cache = fabric_cache->portscachetbl[hash_indx]; + port_cache; port_cache = port_cache->htnext) { + if (port_cache->port->guid == port_cache_key->guid + && port_cache->port->portnum == port_cache_key->portnum) + return port_cache; + } + + return NULL; +} + +static ibnd_node_cache_t *_find_node(ibnd_fabric_cache_t * fabric_cache, + uint64_t guid) +{ + int hash_indx = HASHGUID(guid) % HTSZ; + ibnd_node_cache_t *node_cache; + + for (node_cache = fabric_cache->nodescachetbl[hash_indx]; + node_cache; node_cache = node_cache->htnext) { + if (node_cache->node->guid == guid) + return node_cache; + } + + return NULL; +} + +static int _fill_port(ibnd_fabric_cache_t * fabric_cache, ibnd_node_t * node, + ibnd_port_cache_key_t * port_cache_key) +{ + ibnd_port_cache_t *port_cache; + + if (!(port_cache = _find_port(fabric_cache, port_cache_key))) { + IBND_DEBUG("Cache invalid: cannot find port\n"); + return -1; + } + + if (port_cache->port_stored_to_fabric) { + IBND_DEBUG("Cache invalid: duplicate port discovered\n"); + return -1; + } + + node->ports[port_cache->port->portnum] = port_cache->port; + 
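+	/* mark the port consumed: duplicate cache references are rejected
+	 * above and _destroy_ibnd_fabric_cache() will skip freeing it */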
port_cache->port_stored_to_fabric++; + + /* achu: needed if user wishes to re-cache a loaded fabric. + * Otherwise, mostly unnecessary to do this. + */ + int rc = add_to_portguid_hash(port_cache->port, + fabric_cache->f_int->fabric.portstbl); + if (rc) { + IBND_DEBUG("Error Occurred when trying" + " to insert new port guid 0x%016" PRIx64 " to DB\n", + port_cache->port->guid); + } + return 0; +} + +static int _rebuild_nodes(ibnd_fabric_cache_t * fabric_cache) +{ + ibnd_node_cache_t *node_cache; + ibnd_node_cache_t *node_cache_next; + + node_cache = fabric_cache->nodes_cache; + while (node_cache) { + ibnd_node_t *node; + int i; + + node_cache_next = node_cache->next; + + node = node_cache->node; + + /* Insert node into appropriate data structures */ + + node->next = fabric_cache->f_int->fabric.nodes; + fabric_cache->f_int->fabric.nodes = node; + + int rc = add_to_nodeguid_hash(node_cache->node, + fabric_cache-> + f_int-> + fabric.nodestbl); + if (rc) { + IBND_DEBUG("Error Occurred when trying" + " to insert new node guid 0x%016" PRIx64 " to DB\n", + node_cache->node->guid); + } + + add_to_type_list(node_cache->node, fabric_cache->f_int); + + node_cache->node_stored_to_fabric++; + + /* Rebuild node ports array */ + + if (!(node->ports = + calloc(sizeof(*node->ports), node->numports + 1))) { + IBND_DEBUG("OOM: node->ports\n"); + return -1; + } + + for (i = 0; i < node_cache->ports_stored_count; i++) { + if (_fill_port(fabric_cache, node, + &node_cache->port_cache_keys[i]) < 0) + return -1; + } + + node_cache = node_cache_next; + } + + return 0; +} + +static int _rebuild_ports(ibnd_fabric_cache_t * fabric_cache) +{ + ibnd_port_cache_t *port_cache; + ibnd_port_cache_t *port_cache_next; + + port_cache = fabric_cache->ports_cache; + while (port_cache) { + ibnd_node_cache_t *node_cache; + ibnd_port_cache_t *remoteport_cache; + ibnd_port_t *port; + + port_cache_next = port_cache->next; + + port = port_cache->port; + + if (!(node_cache = + _find_node(fabric_cache, port_cache->node_guid))) { + IBND_DEBUG("Cache invalid: cannot find node\n"); + return -1; + } + + port->node = node_cache->node; + + if (port_cache->remoteport_flag) { + if (!(remoteport_cache = _find_port(fabric_cache, + &port_cache->remoteport_cache_key))) + { + IBND_DEBUG + ("Cache invalid: cannot find remote port\n"); + return -1; + } + + port->remoteport = remoteport_cache->port; + } else + port->remoteport = NULL; + + add_to_portlid_hash(port, fabric_cache->f_int); + port_cache = port_cache_next; + } + + return 0; +} + +ibnd_fabric_t *ibnd_load_fabric(const char *file, unsigned int flags) +{ + unsigned int node_count = 0; + unsigned int port_count = 0; + ibnd_fabric_cache_t *fabric_cache = NULL; + f_internal_t *f_int = NULL; + ibnd_node_cache_t *node_cache = NULL; + int fd = -1; + unsigned int i; + + if (!file) { + IBND_DEBUG("file parameter NULL\n"); + return NULL; + } + + if ((fd = open(file, O_RDONLY)) < 0) { + IBND_DEBUG("open: %s\n", strerror(errno)); + return NULL; + } + + fabric_cache = + (ibnd_fabric_cache_t *) malloc(sizeof(ibnd_fabric_cache_t)); + if (!fabric_cache) { + IBND_DEBUG("OOM: fabric_cache\n"); + goto cleanup; + } + memset(fabric_cache, '\0', sizeof(ibnd_fabric_cache_t)); + + f_int = allocate_fabric_internal(); + if (!f_int) { + IBND_DEBUG("OOM: fabric\n"); + goto cleanup; + } + + fabric_cache->f_int = f_int; + + if (_load_header_info(fd, fabric_cache, &node_count, &port_count) < 0) + goto cleanup; + + for (i = 0; i < node_count; i++) { + if (_load_node(fd, fabric_cache) < 0) + goto cleanup; + } + + for (i = 0; i 
< port_count; i++) { + if (_load_port(fd, fabric_cache) < 0) + goto cleanup; + } + + /* Special case - find from node */ + if (!(node_cache = + _find_node(fabric_cache, fabric_cache->from_node_guid))) { + IBND_DEBUG("Cache invalid: cannot find from node\n"); + goto cleanup; + } + f_int->fabric.from_node = node_cache->node; + + if (_rebuild_nodes(fabric_cache) < 0) + goto cleanup; + + if (_rebuild_ports(fabric_cache) < 0) + goto cleanup; + + if (group_nodes(&f_int->fabric)) + goto cleanup; + + _destroy_ibnd_fabric_cache(fabric_cache); + close(fd); + return (ibnd_fabric_t *)&f_int->fabric; + +cleanup: + ibnd_destroy_fabric((ibnd_fabric_t *)f_int); + _destroy_ibnd_fabric_cache(fabric_cache); + close(fd); + return NULL; +} + +static ssize_t ibnd_write(int fd, const void *buf, size_t count) +{ + size_t count_done = 0; + ssize_t ret; + + while ((count - count_done) > 0) { + ret = write(fd, ((char *) buf) + count_done, count - count_done); + if (ret < 0) { + if (errno == EINTR) + continue; + else { + IBND_DEBUG("write: %s\n", strerror(errno)); + return -1; + } + } + count_done += ret; + } + return count_done; +} + +static size_t _marshall8(uint8_t * outbuf, uint8_t num) +{ + outbuf[0] = num; + + return (sizeof(num)); +} + +static size_t _marshall16(uint8_t * outbuf, uint16_t num) +{ + outbuf[0] = num & 0x00FF; + outbuf[1] = (num & 0xFF00) >> 8; + + return (sizeof(num)); +} + +static size_t _marshall32(uint8_t * outbuf, uint32_t num) +{ + outbuf[0] = num & 0x000000FF; + outbuf[1] = (num & 0x0000FF00) >> 8; + outbuf[2] = (num & 0x00FF0000) >> 16; + outbuf[3] = (num & 0xFF000000) >> 24; + + return (sizeof(num)); +} + +static size_t _marshall64(uint8_t * outbuf, uint64_t num) +{ + outbuf[0] = (uint8_t) num; + outbuf[1] = (uint8_t) (num >> 8); + outbuf[2] = (uint8_t) (num >> 16); + outbuf[3] = (uint8_t) (num >> 24); + outbuf[4] = (uint8_t) (num >> 32); + outbuf[5] = (uint8_t) (num >> 40); + outbuf[6] = (uint8_t) (num >> 48); + outbuf[7] = (uint8_t) (num >> 56); + + return (sizeof(num)); +} + +static size_t _marshall_buf(void *outbuf, const void *inbuf, unsigned int len) +{ + memcpy(outbuf, inbuf, len); + + return len; +} + +static int _cache_header_info(int fd, ibnd_fabric_t * fabric) +{ + uint8_t buf[IBND_FABRIC_CACHE_BUFLEN]; + size_t offset = 0; + + /* Store magic number, version, and other important info */ + /* For this caching lib, we always assume cached as little endian */ + + offset += _marshall32(buf + offset, IBND_FABRIC_CACHE_MAGIC); + offset += _marshall32(buf + offset, IBND_FABRIC_CACHE_VERSION); + /* save space for node count */ + offset += _marshall32(buf + offset, 0); + /* save space for port count */ + offset += _marshall32(buf + offset, 0); + offset += _marshall64(buf + offset, fabric->from_node->guid); + offset += _marshall32(buf + offset, fabric->maxhops_discovered); + + if (ibnd_write(fd, buf, offset) < 0) + return -1; + + return 0; +} + +static int _cache_header_counts(int fd, unsigned int node_count, + unsigned int port_count) +{ + uint8_t buf[IBND_FABRIC_CACHE_BUFLEN]; + size_t offset = 0; + + offset += _marshall32(buf + offset, node_count); + offset += _marshall32(buf + offset, port_count); + + if (lseek(fd, IBND_FABRIC_CACHE_COUNT_OFFSET, SEEK_SET) < 0) { + IBND_DEBUG("lseek: %s\n", strerror(errno)); + return -1; + } + + if (ibnd_write(fd, buf, offset) < 0) + return -1; + + return 0; +} + +static int _cache_node(int fd, ibnd_node_t * node) +{ + uint8_t buf[IBND_FABRIC_CACHE_BUFLEN]; + size_t offset = 0; + size_t ports_stored_offset = 0; + uint8_t ports_stored_count = 0; + 
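+	/*
+	 * buf is comfortably sized for the largest possible record: the
+	 * fixed part is IBND_NODE_CACHE_HEADER_LEN bytes and even 255
+	 * stored port keys add only 255 * IBND_PORT_CACHE_KEY_LEN bytes,
+	 * well under IBND_FABRIC_CACHE_BUFLEN.
+	 */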
int i; + + offset += _marshall16(buf + offset, node->smalid); + offset += _marshall8(buf + offset, node->smalmc); + offset += _marshall8(buf + offset, (uint8_t) node->smaenhsp0); + offset += _marshall_buf(buf + offset, node->switchinfo, + IB_SMP_DATA_SIZE); + offset += _marshall64(buf + offset, node->guid); + offset += _marshall8(buf + offset, (uint8_t) node->type); + offset += _marshall8(buf + offset, (uint8_t) node->numports); + offset += _marshall_buf(buf + offset, node->info, IB_SMP_DATA_SIZE); + offset += _marshall_buf(buf + offset, node->nodedesc, IB_SMP_DATA_SIZE); + /* need to come back later and store number of stored ports + * because port entries can be NULL or (in the case of switches) + * there is an additional port 0 not accounted for in numports. + */ + ports_stored_offset = offset; + offset += sizeof(uint8_t); + + for (i = 0; i <= node->numports; i++) { + if (node->ports[i]) { + offset += _marshall64(buf + offset, + node->ports[i]->guid); + offset += _marshall8(buf + offset, + (uint8_t) node->ports[i]->portnum); + ports_stored_count++; + } + } + + /* go back and store number of port keys stored */ + _marshall8(buf + ports_stored_offset, ports_stored_count); + + if (ibnd_write(fd, buf, offset) < 0) + return -1; + + return 0; +} + +static int _cache_port(int fd, ibnd_port_t * port) +{ + uint8_t buf[IBND_FABRIC_CACHE_BUFLEN]; + size_t offset = 0; + + offset += _marshall64(buf + offset, port->guid); + offset += _marshall8(buf + offset, (uint8_t) port->portnum); + offset += _marshall8(buf + offset, (uint8_t) port->ext_portnum); + offset += _marshall16(buf + offset, port->base_lid); + offset += _marshall8(buf + offset, port->lmc); + offset += _marshall_buf(buf + offset, port->info, IB_SMP_DATA_SIZE); + offset += _marshall64(buf + offset, port->node->guid); + if (port->remoteport) { + offset += _marshall8(buf + offset, 1); + offset += _marshall64(buf + offset, port->remoteport->guid); + offset += _marshall8(buf + offset, (uint8_t) port->remoteport->portnum); + } else { + offset += _marshall8(buf + offset, 0); + offset += _marshall64(buf + offset, 0); + offset += _marshall8(buf + offset, 0); + } + + if (ibnd_write(fd, buf, offset) < 0) + return -1; + + return 0; +} + +int ibnd_cache_fabric(ibnd_fabric_t * fabric, const char *file, + unsigned int flags) +{ + struct stat statbuf; + ibnd_node_t *node = NULL; + ibnd_node_t *node_next = NULL; + unsigned int node_count = 0; + ibnd_port_t *port = NULL; + ibnd_port_t *port_next = NULL; + unsigned int port_count = 0; + int fd; + int i; + + if (!fabric) { + IBND_DEBUG("fabric parameter NULL\n"); + return -1; + } + + if (!file) { + IBND_DEBUG("file parameter NULL\n"); + return -1; + } + + if (!(flags & IBND_CACHE_FABRIC_FLAG_NO_OVERWRITE)) { + if (!stat(file, &statbuf)) { + if (unlink(file) < 0) { + IBND_DEBUG("error removing '%s': %s\n", + file, strerror(errno)); + return -1; + } + } + } + else { + if (!stat(file, &statbuf)) { + IBND_DEBUG("file '%s' already exists\n", file); + return -1; + } + } + + if ((fd = open(file, O_CREAT | O_EXCL | O_WRONLY, 0644)) < 0) { + IBND_DEBUG("open: %s\n", strerror(errno)); + return -1; + } + + if (_cache_header_info(fd, fabric) < 0) + goto cleanup; + + node = fabric->nodes; + while (node) { + node_next = node->next; + + if (_cache_node(fd, node) < 0) + goto cleanup; + + node_count++; + node = node_next; + } + + for (i = 0; i < HTSZ; i++) { + port = fabric->portstbl[i]; + while (port) { + port_next = port->htnext; + + if (_cache_port(fd, port) < 0) + goto cleanup; + + port_count++; + port = port_next; + } + 
} + + if (_cache_header_counts(fd, node_count, port_count) < 0) + goto cleanup; + + if (close(fd) < 0) { + IBND_DEBUG("close: %s\n", strerror(errno)); + goto cleanup; + } + + return 0; + +cleanup: + unlink(file); + close(fd); + return -1; +} diff --git a/libibnetdisc/ibnetdisc_osd.h b/libibnetdisc/ibnetdisc_osd.h new file mode 100644 index 0000000..061001b --- /dev/null +++ b/libibnetdisc/ibnetdisc_osd.h @@ -0,0 +1 @@ +#warning "This header is obsolete." diff --git a/libibnetdisc/internal.h b/libibnetdisc/internal.h new file mode 100644 index 0000000..7b5d4e8 --- /dev/null +++ b/libibnetdisc/internal.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2008 Lawrence Livermore National Laboratory + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +/** ========================================================================= + * Define the internal data structures. + */ + +#ifndef _INTERNAL_H_ +#define _INTERNAL_H_ + +#include <infiniband/ibnetdisc.h> +#include <util/cl_qmap.h> + +#define IBND_DEBUG(fmt, ...) \ + if (ibdebug) { \ + printf("%s:%u; " fmt, __FILE__, __LINE__, ## __VA_ARGS__); \ + } +#define IBND_ERROR(fmt, ...) 
\ + fprintf(stderr, "%s:%u; " fmt, __FILE__, __LINE__, ## __VA_ARGS__) + +/* HASH table defines */ +#define HASHGUID(guid) ((uint32_t)(((uint32_t)(guid) * 101) ^ ((uint32_t)((guid) >> 32) * 103))) + +#define MAXHOPS 63 + +#define DEFAULT_MAX_SMP_ON_WIRE 2 +#define DEFAULT_TIMEOUT 1000 +#define DEFAULT_RETRIES 3 + +typedef struct f_internal { + ibnd_fabric_t fabric; + cl_qmap_t lid2guid; +} f_internal_t; +f_internal_t *allocate_fabric_internal(void); +void create_lid2guid(f_internal_t *f_int); +void destroy_lid2guid(f_internal_t *f_int); +void add_to_portlid_hash(ibnd_port_t * port, f_internal_t *f_int); + +typedef struct ibnd_scan { + ib_portid_t selfportid; + f_internal_t *f_int; + struct ibnd_config *cfg; + unsigned initial_hops; +} ibnd_scan_t; + +typedef struct ibnd_smp ibnd_smp_t; +typedef struct smp_engine smp_engine_t; +typedef int (*smp_comp_cb_t) (smp_engine_t * engine, ibnd_smp_t * smp, + uint8_t * mad_resp, void *cb_data); +struct ibnd_smp { + cl_map_item_t on_wire; + struct ibnd_smp *qnext; + smp_comp_cb_t cb; + void *cb_data; + ib_portid_t path; + ib_rpc_t rpc; +}; + +struct smp_engine { + int umad_fd; + int smi_agent; + int smi_dir_agent; + ibnd_smp_t *smp_queue_head; + ibnd_smp_t *smp_queue_tail; + void *user_data; + cl_qmap_t smps_on_wire; + struct ibnd_config *cfg; + unsigned total_smps; +}; + +int smp_engine_init(smp_engine_t * engine, char * ca_name, int ca_port, + void *user_data, ibnd_config_t *cfg); +int issue_smp(smp_engine_t * engine, ib_portid_t * portid, + unsigned attrid, unsigned mod, smp_comp_cb_t cb, void *cb_data); +int process_mads(smp_engine_t * engine); +void smp_engine_destroy(smp_engine_t * engine); + +int add_to_nodeguid_hash(ibnd_node_t * node, ibnd_node_t * hash[]); + +int add_to_portguid_hash(ibnd_port_t * port, ibnd_port_t * hash[]); + +void add_to_type_list(ibnd_node_t * node, f_internal_t * fabric); + +void destroy_node(ibnd_node_t * node); + +int mlnx_ext_port_info_err(smp_engine_t *engine, ibnd_smp_t *smp, uint8_t *mad, + void *cb_data); + +#endif /* _INTERNAL_H_ */ diff --git a/libibnetdisc/libibnetdisc.map b/libibnetdisc/libibnetdisc.map new file mode 100644 index 0000000..f1b7229 --- /dev/null +++ b/libibnetdisc/libibnetdisc.map @@ -0,0 +1,22 @@ +IBNETDISC_1.0 { + global: + ibnd_discover_fabric; + ibnd_destroy_fabric; + ibnd_load_fabric; + ibnd_cache_fabric; + ibnd_find_node_guid; + ibnd_find_node_dr; + ibnd_is_xsigo_guid; + ibnd_is_xsigo_tca; + ibnd_is_xsigo_hca; + ibnd_get_chassis_guid; + ibnd_get_chassis_type; + ibnd_get_chassis_slot_str; + ibnd_iter_nodes; + ibnd_iter_nodes_type; + ibnd_find_port_guid; + ibnd_find_port_dr; + ibnd_find_port_lid; + ibnd_iter_ports; + local: *; +}; diff --git a/libibnetdisc/man/CMakeLists.txt b/libibnetdisc/man/CMakeLists.txt new file mode 100644 index 0000000..01457dd --- /dev/null +++ b/libibnetdisc/man/CMakeLists.txt @@ -0,0 +1,14 @@ +rdma_man_pages( + ibnd_discover_fabric.3 + ibnd_find_node_guid.3 + ibnd_iter_nodes.3 + ) + +rdma_alias_man_pages( + ibnd_discover_fabric.3 ibnd_debug.3 + ibnd_discover_fabric.3 ibnd_destroy_fabric.3 + ibnd_discover_fabric.3 ibnd_set_max_smps_on_wire.3 + ibnd_discover_fabric.3 ibnd_show_progress.3 + ibnd_find_node_guid.3 ibnd_find_node_dr.3 + ibnd_iter_nodes.3 ibnd_iter_nodes_type.3 + ) diff --git a/libibnetdisc/man/ibnd_discover_fabric.3 b/libibnetdisc/man/ibnd_discover_fabric.3 new file mode 100644 index 0000000..2c09da7 --- /dev/null +++ b/libibnetdisc/man/ibnd_discover_fabric.3 @@ -0,0 +1,65 @@ +.TH IBND_DISCOVER_FABRIC 3 "July 25, 2008" "OpenIB" "OpenIB Programmer's 
Manual" +.SH "NAME" +ibnd_discover_fabric, ibnd_destroy_fabric, ibnd_debug ibnd_show_progress \- initialize ibnetdiscover library. +.SH "SYNOPSIS" +.nf +.B #include <infiniband/ibnetdisc.h> +.sp +.BI "ibnd_fabric_t *ibnd_discover_fabric(struct ibmad_port *ibmad_port, int timeout_ms, ib_portid_t *from, int hops)" +.BI "void ibnd_destroy_fabric(ibnd_fabric_t *fabric)" +.BI "void ibnd_debug(int i)" +.BI "void ibnd_show_progress(int i)" +.BI "int ibnd_set_max_smps_on_wire(int i)" +.SH "DESCRIPTION" +.B ibnd_discover_fabric() +Discover the fabric connected to the port specified by ibmad_port, using a timeout specified. The "from" and "hops" parameters are optional and allow one to scan part of a fabric by specifying a node "from" and a number of hops away from that node to scan, "hops". This gives the user a "sub-fabric" which is "centered" anywhere they chose. + +ibmad_port must be opened with at least IB_SMI_CLASS and IB_SMI_DIRECT_CLASS +classes for ibnd_discover_fabric to work. + +.B ibnd_destroy_fabric() +free all memory and resources associated with the fabric. + +.B ibnd_debug() +Set the debug level to be printed as library operations take place. + +.B ibnd_show_progress() +Indicate that the library should print debug output which shows it's progress +through the fabric. + +.B ibnd_set_max_smps_on_wire() +Set the number of SMP\'s which will be issued on the wire simultaneously. + +.SH "RETURN VALUE" +.B ibnd_discover_fabric() +return NULL on failure, otherwise a valid ibnd_fabric_t object. + +.B ibnd_destory_fabric(), ibnd_debug() +NONE + +.B ibnd_set_max_smps_on_wire() +The previous value is returned + +.SH "EXAMPLES" + +.B Discover the entire fabric connected to device "mthca0", port 1. + + int mgmt_classes[2] = {IB_SMI_CLASS, IB_SMI_DIRECT_CLASS}; + struct ibmad_port *ibmad_port = mad_rpc_open_port(ca, ca_port, mgmt_classes, 2); + ibnd_fabric_t *fabric = ibnd_discover_fabric(ibmad_port, 100, NULL, 0); + ... + ibnd_destroy_fabric(fabric); + mad_rpc_close_port(ibmad_port); + +.B Discover only a single node and those nodes connected to it. + + ... + str2drpath(&(port_id.drpath), from, 0, 0); + ... + ibnd_discover_fabric(ibmad_port, 100, &port_id, 1); + ... +.SH "SEE ALSO" + libibmad, mad_rpc_open_port +.SH "AUTHORS" +.TP +Ira Weiny <weiny2@llnl.gov> diff --git a/libibnetdisc/man/ibnd_find_node_guid.3 b/libibnetdisc/man/ibnd_find_node_guid.3 new file mode 100644 index 0000000..2d0cb63 --- /dev/null +++ b/libibnetdisc/man/ibnd_find_node_guid.3 @@ -0,0 +1,21 @@ +.TH IBND_FIND_NODE_GUID 3 "July 25, 2008" "OpenIB" "OpenIB Programmer's Manual" +.SH "NAME" +ibnd_find_node_guid, ibnd_find_node_dr \- given a fabric object find the node object within it which matches the guid or directed route specified. +.SH "SYNOPSIS" +.nf +.B #include <infiniband/ibnetdisc.h> +.sp +.BI "ibnd_node_t *ibnd_find_node_guid(ibnd_fabric_t *fabric, uint64_t guid)" +.BI "ibnd_node_t *ibnd_find_node_dr(ibnd_fabric_t *fabric, char *dr_str)" +.SH "DESCRIPTION" +.B ibnd_find_node_guid() +Given a fabric object and a guid, return the ibnd_node_t object with that node guid. +.B ibnd_find_node_dr() +Given a fabric object and a directed route, return the ibnd_node_t object with +that directed route. +.SH "RETURN VALUE" +.B ibnd_find_node_guid(), ibnd_find_node_dr() +return NULL on failure, otherwise a valid ibnd_node_t object. 
+.SH "AUTHORS" +.TP +Ira Weiny <weiny2@llnl.gov> diff --git a/libibnetdisc/man/ibnd_iter_nodes.3 b/libibnetdisc/man/ibnd_iter_nodes.3 new file mode 100644 index 0000000..469f07b --- /dev/null +++ b/libibnetdisc/man/ibnd_iter_nodes.3 @@ -0,0 +1,20 @@ +.TH IBND_ITER_NODES 3 "July 25, 2008" "OpenIB" "OpenIB Programmer's Manual" +.SH "NAME" +ibnd_iter_nodes, ibnd_iter_nodes_type \- given a fabric object and a function itterate over the nodes in the fabric. +.SH "SYNOPSIS" +.nf +.B #include <infiniband/ibnetdisc.h> +.sp +.BI "void ibnd_iter_nodes(ibnd_fabric_t *fabric, ibnd_iter_func_t func, void *user_data)" +.BI "void ibnd_iter_nodes_type(ibnd_fabric_t *fabric, ibnd_iter_func_t func, ibnd_node_type_t type, void *user_data)" +.SH "DESCRIPTION" +.B ibnd_iter_nodes() +Itterate through all the nodes in the fabric and call "func" on them. +.B ibnd_iter_nodes_type() +The same as ibnd_iter_nodes except to limit the iteration to the nodes with the specified type. +.SH "RETURN VALUE" +.B ibnd_iter_nodes(), ibnd_iter_nodes_type() +NONE +.SH "AUTHORS" +.TP +Ira Weiny <weiny2@llnl.gov> diff --git a/libibnetdisc/query_smp.c b/libibnetdisc/query_smp.c new file mode 100644 index 0000000..47693bc --- /dev/null +++ b/libibnetdisc/query_smp.c @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2010 Lawrence Livermore National Laboratory + * Copyright (c) 2011 Mellanox Technologies LTD. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include <errno.h> +#include <infiniband/ibnetdisc.h> +#include <infiniband/umad.h> +#include "internal.h" + +static void queue_smp(smp_engine_t * engine, ibnd_smp_t * smp) +{ + smp->qnext = NULL; + if (!engine->smp_queue_head) { + engine->smp_queue_head = smp; + engine->smp_queue_tail = smp; + } else { + engine->smp_queue_tail->qnext = smp; + engine->smp_queue_tail = smp; + } +} + +static ibnd_smp_t *get_smp(smp_engine_t * engine) +{ + ibnd_smp_t *head = engine->smp_queue_head; + ibnd_smp_t *tail = engine->smp_queue_tail; + ibnd_smp_t *rc = head; + if (head) { + if (tail == head) + engine->smp_queue_tail = NULL; + engine->smp_queue_head = head->qnext; + } + return rc; +} + +static int send_smp(ibnd_smp_t * smp, smp_engine_t * engine) +{ + int rc = 0; + uint8_t umad[1024]; + ib_rpc_t *rpc = &smp->rpc; + int agent = 0; + + memset(umad, 0, umad_size() + IB_MAD_SIZE); + + if (rpc->mgtclass == IB_SMI_CLASS) { + agent = engine->smi_agent; + } else if (rpc->mgtclass == IB_SMI_DIRECT_CLASS) { + agent = engine->smi_dir_agent; + } else { + IBND_ERROR("Invalid class for RPC\n"); + return (-EIO); + } + + if ((rc = mad_build_pkt(umad, &smp->rpc, &smp->path, NULL, NULL)) + < 0) { + IBND_ERROR("mad_build_pkt failed; %d\n", rc); + return rc; + } + + if ((rc = umad_send(engine->umad_fd, agent, umad, IB_MAD_SIZE, + engine->cfg->timeout_ms, engine->cfg->retries)) < 0) { + IBND_ERROR("send failed; %d\n", rc); + return rc; + } + + return 0; +} + +static int process_smp_queue(smp_engine_t * engine) +{ + int rc = 0; + ibnd_smp_t *smp; + while (cl_qmap_count(&engine->smps_on_wire) + < engine->cfg->max_smps) { + smp = get_smp(engine); + if (!smp) + return 0; + + if ((rc = send_smp(smp, engine)) != 0) { + free(smp); + return rc; + } + cl_qmap_insert(&engine->smps_on_wire, (uint32_t) smp->rpc.trid, + (cl_map_item_t *) smp); + engine->total_smps++; + } + return 0; +} + +int issue_smp(smp_engine_t * engine, ib_portid_t * portid, + unsigned attrid, unsigned mod, smp_comp_cb_t cb, void *cb_data) +{ + ibnd_smp_t *smp = calloc(1, sizeof *smp); + if (!smp) { + IBND_ERROR("OOM\n"); + return -ENOMEM; + } + + smp->cb = cb; + smp->cb_data = cb_data; + smp->path = *portid; + smp->rpc.method = IB_MAD_METHOD_GET; + smp->rpc.attr.id = attrid; + smp->rpc.attr.mod = mod; + smp->rpc.timeout = engine->cfg->timeout_ms; + smp->rpc.datasz = IB_SMP_DATA_SIZE; + smp->rpc.dataoffs = IB_SMP_DATA_OFFS; + smp->rpc.trid = mad_trid(); + smp->rpc.mkey = engine->cfg->mkey; + + if (portid->lid <= 0 || portid->drpath.drslid == 0xffff || + portid->drpath.drdlid == 0xffff) + smp->rpc.mgtclass = IB_SMI_DIRECT_CLASS; /* direct SMI */ + else + smp->rpc.mgtclass = IB_SMI_CLASS; /* Lid routed SMI */ + + portid->sl = 0; + portid->qp = 0; + + queue_smp(engine, smp); + return process_smp_queue(engine); +} + +static int process_one_recv(smp_engine_t * engine) +{ + int rc = 0; + int status = 0; + ibnd_smp_t *smp; + uint8_t *mad; + uint32_t trid; + uint8_t umad[sizeof(struct ib_user_mad) + IB_MAD_SIZE]; + int length = umad_size() + IB_MAD_SIZE; + + memset(umad, 0, sizeof(umad)); + + /* wait for the next message */ + if ((rc = umad_recv(engine->umad_fd, umad, &length, + -1)) < 0) { + IBND_ERROR("umad_recv failed: %d\n", rc); + return -1; + } + + mad = umad_get_mad(umad); + trid = (uint32_t) mad_get_field64(mad, 0, IB_MAD_TRID_F); + + smp = (ibnd_smp_t *) cl_qmap_remove(&engine->smps_on_wire, trid); + if ((cl_map_item_t *) smp == cl_qmap_end(&engine->smps_on_wire)) { + IBND_ERROR("Failed to find matching smp for trid (%x)\n", trid); + return -1; + } + 
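+	/*
+	 * The matching request has just been removed from the wire table,
+	 * so at least one wire slot is free again: refill the wire from
+	 * the pending queue before handling this response.
+	 */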
+ rc = process_smp_queue(engine); + if (rc) + goto error; + + if ((status = umad_status(umad))) { + IBND_ERROR("umad (%s Attr 0x%x:%u) bad status %d; %s\n", + portid2str(&smp->path), smp->rpc.attr.id, + smp->rpc.attr.mod, status, strerror(status)); + if (smp->rpc.attr.id == IB_ATTR_MLNX_EXT_PORT_INFO) + rc = mlnx_ext_port_info_err(engine, smp, mad, + smp->cb_data); + } else if ((status = mad_get_field(mad, 0, IB_DRSMP_STATUS_F))) { + IBND_ERROR("mad (%s Attr 0x%x:%u) bad status 0x%x\n", + portid2str(&smp->path), smp->rpc.attr.id, + smp->rpc.attr.mod, status); + if (smp->rpc.attr.id == IB_ATTR_MLNX_EXT_PORT_INFO) + rc = mlnx_ext_port_info_err(engine, smp, mad, + smp->cb_data); + } else + rc = smp->cb(engine, smp, mad, smp->cb_data); + +error: + free(smp); + return rc; +} + +int smp_engine_init(smp_engine_t * engine, char * ca_name, int ca_port, + void *user_data, ibnd_config_t *cfg) +{ + memset(engine, 0, sizeof(*engine)); + + if (umad_init() < 0) { + IBND_ERROR("umad_init failed\n"); + return -EIO; + } + + engine->umad_fd = umad_open_port(ca_name, ca_port); + if (engine->umad_fd < 0) { + IBND_ERROR("can't open UMAD port (%s:%d)\n", ca_name, ca_port); + return -EIO; + } + + if ((engine->smi_agent = umad_register(engine->umad_fd, + IB_SMI_CLASS, 1, 0, NULL)) < 0) { + IBND_ERROR("Failed to register SMI agent on (%s:%d)\n", + ca_name, ca_port); + goto eio_close; + } + + if ((engine->smi_dir_agent = umad_register(engine->umad_fd, + IB_SMI_DIRECT_CLASS, 1, 0, NULL)) < 0) { + IBND_ERROR("Failed to register SMI_DIRECT agent on (%s:%d)\n", + ca_name, ca_port); + goto eio_close; + } + + engine->user_data = user_data; + cl_qmap_init(&engine->smps_on_wire); + engine->cfg = cfg; + return (0); + +eio_close: + umad_close_port(engine->umad_fd); + return (-EIO); +} + +void smp_engine_destroy(smp_engine_t * engine) +{ + cl_map_item_t *item; + ibnd_smp_t *smp; + + /* remove queued smps */ + smp = get_smp(engine); + if (smp) + IBND_ERROR("outstanding SMP's\n"); + for ( /* */ ; smp; smp = get_smp(engine)) + free(smp); + + /* remove smps from the wire queue */ + item = cl_qmap_head(&engine->smps_on_wire); + if (item != cl_qmap_end(&engine->smps_on_wire)) + IBND_ERROR("outstanding SMP's on wire\n"); + for ( /* */ ; item != cl_qmap_end(&engine->smps_on_wire); + item = cl_qmap_head(&engine->smps_on_wire)) { + cl_qmap_remove_item(&engine->smps_on_wire, item); + free(item); + } + + umad_close_port(engine->umad_fd); +} + +int process_mads(smp_engine_t * engine) +{ + int rc; + while (!cl_is_qmap_empty(&engine->smps_on_wire)) + if ((rc = process_one_recv(engine)) != 0) + return rc; + return 0; +} diff --git a/libibnetdisc/tests/testleaks.c b/libibnetdisc/tests/testleaks.c new file mode 100644 index 0000000..f1e1985 --- /dev/null +++ b/libibnetdisc/tests/testleaks.c @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2004-2007 Voltaire Inc. All rights reserved. + * Copyright (c) 2007 Xsigo Systems Inc. All rights reserved. + * Copyright (c) 2008 Lawrence Livermore National Lab. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <time.h>
+#include <string.h>
+#include <getopt.h>
+#include <errno.h>
+#include <inttypes.h>
+
+#include <infiniband/ibnetdisc.h>
+
+static const char *argv0 = "iblinkinfotest";
+static FILE *f;
+
+static void usage(void)
+{
+	fprintf(stderr,
+		"Usage: %s [-hclp -D <direct route> -C <ca_name> -P <ca_port>]\n"
+		"   Report link speed and connection for each port of each switch which is active\n"
+		"   -h This help message\n"
+		"   -i <iters> Number of iterations to run (default -1 == infinite)\n"
+		"   -f <dr_path> specify node to start \"from\"\n"
+		"   -n <hops> Number of hops to include away from specified node\n"
+		"   -t <timeout_ms> timeout for any single fabric query\n"
+		"   -s show errors\n"
+		"   -C <ca_name> use selected Channel Adapter name for queries\n"
+		"   -P <ca_port> use selected channel adapter port for queries\n"
+		"   --debug print debug messages\n", argv0);
+	exit(-1);
+}
+
+int main(int argc, char **argv)
+{
+	struct ibnd_config config = { 0 };
+	int rc = 0;
+	char *ca = NULL;
+	int ca_port = 0;
+	ibnd_fabric_t *fabric = NULL;
+	char *from = NULL;
+	ib_portid_t port_id;
+	int iters = -1;
+
+	static char const str_opts[] = "S:D:n:C:P:t:shuf:i:";
+	static const struct option long_opts[] = {
+		{"S", 1, NULL, 'S'},
+		{"D", 1, NULL, 'D'},
+		{"num-hops", 1, NULL, 'n'},
+		{"ca-name", 1, NULL, 'C'},
+		{"ca-port", 1, NULL, 'P'},
+		{"timeout", 1, NULL, 't'},
+		{"show", 0, NULL, 's'},
+		{"help", 0, NULL, 'h'},
+		{"usage", 0, NULL, 'u'},
+		{"debug", 0, NULL, 2},
+		{"from", 1, NULL, 'f'},
+		{"iters", 1, NULL, 'i'},
+		{}
+	};
+
+	f = stdout;
+
+	argv0 = argv[0];
+
+	while (1) {
+		int ch = getopt_long(argc, argv, str_opts, long_opts, NULL);
+		if (ch == -1)
+			break;
+		switch (ch) {
+		case 2:
+			config.debug++;
+			break;
+		case 'f':
+			from = strdup(optarg);
+			break;
+		case 'C':
+			ca = strdup(optarg);
+			break;
+		case 'P':
+			ca_port = strtoul(optarg, NULL, 0);
+			break;
+		case 'n':
+			config.max_hops = strtoul(optarg, NULL, 0);
+			break;
+		case 'i':
+			iters = (int)strtol(optarg, NULL, 0);
+			break;
+		case 't':
+			config.timeout_ms = strtoul(optarg, NULL, 0);
+			break;
+		default:
+			usage();
+			break;
+		}
+	}
+	argc -= optind;
+	argv += optind;
+
+	while (iters == -1 ||
iters-- > 0) { + if (from) { + /* only scan part of the fabric */ + str2drpath(&(port_id.drpath), from, 0, 0); + if ((fabric = ibnd_discover_fabric(ca, ca_port, + &port_id, &config)) + == NULL) { + fprintf(stderr, "discover failed\n"); + rc = 1; + goto close_port; + } + } else if ((fabric = ibnd_discover_fabric(ca, ca_port, NULL, + &config)) == NULL) { + fprintf(stderr, "discover failed\n"); + rc = 1; + goto close_port; + } + + ibnd_destroy_fabric(fabric); + } + +close_port: + exit(rc); +} diff --git a/libibumad/CMakeLists.txt b/libibumad/CMakeLists.txt new file mode 100644 index 0000000..9d0a425 --- /dev/null +++ b/libibumad/CMakeLists.txt @@ -0,0 +1,19 @@ +publish_headers(infiniband + umad.h + umad_cm.h + umad_sa.h + umad_sa_mcm.h + umad_sm.h + umad_str.h + umad_types.h + ) + +rdma_library(ibumad libibumad.map + # See Documentation/versioning.md + 3 3.1.${PACKAGE_VERSION} + sysfs.c + umad.c + umad_str.c + ) + +rdma_pkg_config("ibumad" "" "") diff --git a/libibumad/libibumad.map b/libibumad/libibumad.map new file mode 100644 index 0000000..9f08d29 --- /dev/null +++ b/libibumad/libibumad.map @@ -0,0 +1,47 @@ +/* Do not change this file without reading Documentation/versioning.md */ +IBUMAD_1.0 { + global: + umad_init; + umad_done; + umad_get_cas_names; + umad_get_ca_portguids; + umad_open_port; + umad_get_ca; + umad_release_ca; + umad_get_port; + umad_release_port; + umad_close_port; + umad_get_mad; + umad_get_issm_path; + umad_size; + umad_set_grh; + umad_set_pkey; + umad_get_pkey; + umad_set_addr; + umad_set_addr_net; + umad_send; + umad_recv; + umad_poll; + umad_get_fd; + umad_register; + umad_register2; + umad_register_oui; + umad_unregister; + umad_status; + umad_get_mad_addr; + umad_debug; + umad_addr_dump; + umad_dump; + umad_class_str; + umad_method_str; + umad_common_mad_status_str; + umad_sa_mad_status_str; + umad_attribute_str; + local: *; +}; + +IBUMAD_1.1 { + global: + umad_free_ca_device_list; + umad_get_ca_device_list; +} IBUMAD_1.0; diff --git a/libibumad/man/CMakeLists.txt b/libibumad/man/CMakeLists.txt new file mode 100644 index 0000000..185584a --- /dev/null +++ b/libibumad/man/CMakeLists.txt @@ -0,0 +1,42 @@ +rdma_man_pages( + umad_addr_dump.3 + umad_alloc.3 + umad_class_str.3 + umad_close_port.3 + umad_debug.3 + umad_dump.3 + umad_free.3 + umad_get_ca.3 + umad_get_ca_portguids.3 + umad_get_cas_names.3 + umad_get_fd.3 + umad_get_issm_path.3 + umad_get_mad.3 + umad_get_mad_addr.3 + umad_get_pkey.3 + umad_get_port.3 + umad_init.3.md + umad_open_port.3 + umad_poll.3 + umad_recv.3 + umad_register.3 + umad_register2.3 + umad_register_oui.3 + umad_send.3 + umad_set_addr.3 + umad_set_addr_net.3 + umad_set_grh.3 + umad_set_grh_net.3 + umad_set_pkey.3 + umad_size.3 + umad_status.3 + umad_unregister.3 + ) +rdma_alias_man_pages( + umad_class_str.3 umad_attribute_str.3 + umad_class_str.3 umad_mad_status_str.3 + umad_class_str.3 umad_method_str.3 + umad_get_ca.3 umad_release_ca.3 + umad_get_port.3 umad_release_port.3 + umad_init.3 umad_done.3 + ) diff --git a/libibumad/man/umad_addr_dump.3 b/libibumad/man/umad_addr_dump.3 new file mode 100644 index 0000000..d082c37 --- /dev/null +++ b/libibumad/man/umad_addr_dump.3 @@ -0,0 +1,46 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH UMAD_ADDR_DUMP 3 "May 21, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.SH "NAME" +umad_addr_dump \- dump addr structure to stderr +.SH "SYNOPSIS" +.nf +.B #include <infiniband/umad.h> +.sp +.BI "void umad_addr_dump(ib_mad_addr_t " "*addr"); 
+.fi +.SH "DESCRIPTION" +.B umad_addr_dump() +dumps the given +.I addr\fR +to stderr. +The argument +.I addr +is an +.I ib_mad_addr_t +struct, as specified in <infiniband/umad.h>. +.PP +.nf +typedef struct ib_mad_addr { +.in +8 +uint32_t qpn; +uint32_t qkey; +uint16_t lid; +uint8_t sl; +uint8_t path_bits; +uint8_t grh_present; +uint8_t gid_index; +uint8_t hop_limit; +uint8_t traffic_class; +uint8_t gid[16]; +uint32_t flow_label; +.in -8 +} ib_mad_addr_t; +.fi +.SH "RETURN VALUE" +.B umad_addr_dump() +returns no value. +.SH "AUTHOR" +.TP +Hal Rosenstock <halr@voltaire.com> diff --git a/libibumad/man/umad_alloc.3 b/libibumad/man/umad_alloc.3 new file mode 100644 index 0000000..b5ef752 --- /dev/null +++ b/libibumad/man/umad_alloc.3 @@ -0,0 +1,34 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH UMAD_ALLOC 3 "May 21, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.SH "NAME" +umad_alloc \- allocate memory for umad buffers +.SH "SYNOPSIS" +.nf +.B #include <infiniband/umad.h> +.sp +.BI "void * umad_alloc(int " "num" ", size_t " "size"); +.fi +.SH "DESCRIPTION" +.B umad_alloc() +allocates memory for an array of +.I num\fR +umad buffers of +.I size +bytes\fR. +Note that +.I size\fR +should include the +.B umad_size() +plus the length (MAD_BLOCK_SIZE for normal MADs or the length returned from +.B umad_recv() +for RMPP MADs). +.SH "RETURN VALUE" +.B umad_alloc() +returns NULL if out of memory. +.SH "SEE ALSO" +.BR umad_free (3) +.SH "AUTHOR" +.TP +Hal Rosenstock <halr@voltaire.com> diff --git a/libibumad/man/umad_class_str.3 b/libibumad/man/umad_class_str.3 new file mode 100644 index 0000000..9adb0fd --- /dev/null +++ b/libibumad/man/umad_class_str.3 @@ -0,0 +1,46 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH UMAD_CLASS_STR 3 "Feb 15, 2013" "OpenIB" "OpenIB Programmer\'s Manual" +.SH "NAME" +umad_*_str \- class of functions to return string representations of enums + +.SH "SYNOPSIS" +.nf +.B #include <infiniband/umad_str.h> +.sp +.BI "const char * umad_class_str(uint8_t mgmt_class)" +.BI "const char * umad_method_str(uint8_t mgmt_class, uint8_t method)" +.BI "const char * umad_attribute_str(uint8_t mgmt_class, be16_t attr_id)" + +.BI "const char * umad_common_mad_status_str(be16_t status)" +.BI "const char * umad_sa_mad_status_str(be16_t status)" + +.SH "DESCRIPTION" + +.B "const char * umad_class_str(uint8_t mgmt_class)" +Return string value of management class enum + +.B "const char * umad_method_str(uint8_t mgmt_class, uint8_t method)" +Return string value of the method for the mgmt_class specified + +.B "const char * umad_attribute_str(uint8_t mgmt_class, be16_t attr_id)" +Return string value of attribute specified in attr_id based on mgmt_class specified. + +.B "const char * umad_common_mad_status_str(be16_t status)" +Return string value for common MAD status values + +.B "const char * umad_sa_mad_status_str(be16_t status)" +Return string value for SA MAD status values + + +.B NOTE: +Not all classes are supported. + +.SH "RETURN VALUE" + +Returns a string representations of the fields specified. 
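+
+.SH "EXAMPLE"
+An illustrative sketch (not part of the original page): decode the class and
+method of a received MAD for logging. The
+.I hdr
+pointer is assumed to come from umad_get_mad() on a received buffer, using
+struct umad_hdr from <infiniband/umad_types.h>.
+.nf
+    struct umad_hdr *hdr = umad_get_mad(umad);
+
+    fprintf(stderr, "%s %s\\n", umad_class_str(hdr->mgmt_class),
+            umad_method_str(hdr->mgmt_class, hdr->method));
+.fi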
+ +.SH "AUTHOR" +.TP +Ira Weiny <weiny2@llnl.gov> diff --git a/libibumad/man/umad_close_port.3 b/libibumad/man/umad_close_port.3 new file mode 100644 index 0000000..341c2d2 --- /dev/null +++ b/libibumad/man/umad_close_port.3 @@ -0,0 +1,27 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH UMAD_OPEN_PORT 3 "May 11, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.SH "NAME" +umad_close_port \- close InfiniBand device port for umad access +.SH "SYNOPSIS" +.nf +.B #include <infiniband/umad.h> +.sp +.BI "int umad_close_port(int " "portid" ); +.fi +.SH "DESCRIPTION" +.B umad_close_port() +closes the port specified by the handle +.I portid\fR. +.SH "RETURN VALUE" +.B umad_close_port() +returns 0 on success, and a negative value on error. +-EINVAL is returned if the +.I portid\fR +is not a handle to a valid (open) port. +.SH "SEE ALSO" +.BR umad_open_port (3) +.SH "AUTHOR" +.TP +Hal Rosenstock <halr@voltaire.com> diff --git a/libibumad/man/umad_debug.3 b/libibumad/man/umad_debug.3 new file mode 100644 index 0000000..224d5c0 --- /dev/null +++ b/libibumad/man/umad_debug.3 @@ -0,0 +1,30 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH UMAD_DEBUG 3 "May 21, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.SH "NAME" +umad_debug \- set debug level +.SH "SYNOPSIS" +.nf +.B #include <infiniband/umad.h> +.sp +.BI "int umad_debug(int " "level" ); +.fi +.SH "DESCRIPTION" +.B umad_debug() +sets the umad library internal debug level to +.I level\fR. +The following +debug levels are supported: 0 - no debug (the default), +1 - basic debug information, 2 - verbose debug information. Negative values are +ignored in terms of set. Note that the current debug level can +be queried by passing a negative value as +.I level\fR. +.SH "RETURN VALUE" +.B umad_debug() +returns the actual debug level. +.SH "AUTHORS" +.TP +Hal Rosenstock <halr@voltaire.com> +.TP +Dotan Barak <dotanb@mellanox.co.il> diff --git a/libibumad/man/umad_dump.3 b/libibumad/man/umad_dump.3 new file mode 100644 index 0000000..c01d51b --- /dev/null +++ b/libibumad/man/umad_dump.3 @@ -0,0 +1,23 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH UMAD_DUMP 3 "May 17, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.SH "NAME" +umad_dump \- dump umad buffer to stderr +.SH "SYNOPSIS" +.nf +.B #include <infiniband/umad.h> +.sp +.BI "void umad_dump(void " "*umad"); +.fi +.SH "DESCRIPTION" +.B umad_dump() +dumps the given +.I umad\fR +buffer to stderr. +.SH "RETURN VALUE" +.B umad_dump() +returns no value. +.SH "AUTHOR" +.TP +Hal Rosenstock <halr@voltaire.com> diff --git a/libibumad/man/umad_free.3 b/libibumad/man/umad_free.3 new file mode 100644 index 0000000..e347317 --- /dev/null +++ b/libibumad/man/umad_free.3 @@ -0,0 +1,24 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH UMAD_FREE 3 "May 17, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.SH "NAME" +umad_free \- frees memory of umad buffers +.SH "SYNOPSIS" +.nf +.B #include <infiniband/umad.h> +.sp +.BI "void umad_free(void " "*umad"); +.fi +.SH "DESCRIPTION" +.B umad_free() +frees memory previously allocated with +.B umad_alloc()\fR. +.SH "RETURN VALUE" +.B umad_free() +returns no value. 
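+.SH "EXAMPLE"
+An illustrative sketch (not part of the original page): allocate a single
+receive buffer for a normal MAD, then release it. The 256 is MAD_BLOCK_SIZE,
+the normal MAD length described in umad_alloc(3).
+.nf
+    void *umad = umad_alloc(1, umad_size() + 256);
+
+    if (umad) {
+        /* ... use the buffer, e.g. with umad_recv() ... */
+        umad_free(umad);
+    }
+.fi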
+.SH "SEE ALSO" +.BR umad_alloc (3) +.SH "AUTHOR" +.TP +Hal Rosenstock <halr@voltaire.com> diff --git a/libibumad/man/umad_free_ca_device_list.3.md b/libibumad/man/umad_free_ca_device_list.3.md new file mode 100644 index 0000000..c12f6e3 --- /dev/null +++ b/libibumad/man/umad_free_ca_device_list.3.md @@ -0,0 +1,43 @@ + +--- +date: "May 1, 2018" +footer: "OpenIB" +header: "OpenIB Programmer's Manual" +layout: page +license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md' +section: 3 +title: UMAD_FREE_CA_DEVICE_LIST +--- + +# NAME + +umad_free_ca_device_list - free InfiniBand devices name list + +# SYNOPSIS + +```c +#include <infiniband/umad.h> + +void umad_free_ca_device_list(struct umad_device_node *head); +``` + +# DESCRIPTION + +**umad_free_ca_device_list()** frees the *struct umad_device_node* +list and its values that allocated with umad_get_ca_namelist(). +The argument head is list of *struct umad_device_node* filled with +local IB devices(CAs) names. + +# RETURN VALUE + +**umad_free_ca_device_list()** returns no value. + +# SEE ALSO + +**umad_get_ca_device_list** + +# AUTHORS + +Vladimir Koushnir <vladimirk@mellanox.com>, +Hal Rosenstock <hal@mellanox.com>, +Haim Boozaglo <haimbo@mellanox.com> diff --git a/libibumad/man/umad_get_ca.3 b/libibumad/man/umad_get_ca.3 new file mode 100644 index 0000000..760b6b6 --- /dev/null +++ b/libibumad/man/umad_get_ca.3 @@ -0,0 +1,66 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH UMAD_GET_CA 3 "May 21, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.SH "NAME" +umad_get_ca, umad_release_ca \- get and release InfiniBand device port attributes +.SH "SYNOPSIS" +.nf +.B #include <infiniband/umad.h> +.sp +.BI "int umad_get_ca(char " "*ca_name" ", umad_ca_t " "*ca" ); +.sp +.BI "int umad_release_ca(umad_ca_t " "*ca" ); +.fi +.SH "DESCRIPTION" +.B umad_get_ca() +gets the attributes of the InfiniBand device +.I ca_name\fR. +It fills +the +.I ca +structure with the device attributes specified by +the +.I ca_name +or with the default device attributes if +.I ca_name +is NULL. +.B umad_release_ca() +should be called before the +.I ca +structure is deallocated. +The argument +.I ca +is an +.I umad_ca_t +struct, as specified in <infiniband/umad.h>. +.PP +.nf +typedef struct umad_ca { +.in +8 +char ca_name[UMAD_CA_NAME_LEN]; /* Name of the device */ +uint node_type; /* Type of the device */ +int numports; /* Number of physical ports */ +char fw_ver[20]; /* FW version */ +char ca_type[40]; /* CA type (e.g. MT23108, etc.) */ +char hw_ver[20]; /* Hardware version */ +uint64_t node_guid; /* Node GUID */ +uint64_t system_guid; /* System image GUID */ +umad_port_t *ports[UMAD_CA_MAX_PORTS]; /* Array of device port properties */ +.in -8 +} umad_ca_t; +.fi +.PP +.B umad_release_ca() +releases the resources that were allocated in the function +.B umad_get_ca()\fR. +.SH "RETURN VALUE" +.B umad_get_ca() +and +.B umad_release_ca() +return 0 on success, and a negative value on error. 
+.SH "AUTHORS" +.TP +Hal Rosenstock <halr@voltaire.com> +.TP +Dotan Barak <dotanb@mellanox.co.il> diff --git a/libibumad/man/umad_get_ca_device_list.3.md b/libibumad/man/umad_get_ca_device_list.3.md new file mode 100644 index 0000000..8ed649b --- /dev/null +++ b/libibumad/man/umad_get_ca_device_list.3.md @@ -0,0 +1,63 @@ + +--- +date: "May 1, 2018" +footer: "OpenIB" +header: "OpenIB Programmer's Manual" +layout: page +license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md' +section: 3 +title: UMAD_GET_CA_DEVICE_LIST +--- + +# NAME + +umad_get_ca_device_list - get list of available InfiniBand device names. + +# SYNOPSIS + +```c +#include <infiniband/umad.h> + +struct umad_device_node *umad_get_ca_device_list(void); +``` + +# DESCRIPTION + +**umad_get_ca_device_list()** fills the cas list of *struct umad_device_node* +with local IB devices (CAs) names. + +*struct umad_device_node* is defined as follows: + +```c +struct umad_device_node { + struct umad_device_node *next; + const char *ca_name; +}; +``` + +# RETURN VALUE + +**umad_get_ca_device_list()** returns list of *struct umad_device_node* filled +with local IB devices(CAs) names. +In case of empty list (zero elements), NULL is returned and +*errno* is not set. +On error, NULL is returned and *errno* is set appropriately. +The last value of the list is NULL in order to indicate the number of +entries filled. + +# ERRORS + +**umad_get_ca_device_list()** can fail with the following errors: + +**ENOMEM** + +# SEE ALSO + +**umad_get_ca_portguids**(3), **umad_open_port**(3), +**umad_free_ca_device_list** + +# AUTHORS + +Vladimir Koushnir <vladimirk@mellanox.com>, +Hal Rosenstock <hal@mellanox.com>, +Haim Boozaglo <haimbo@mellanox.com> diff --git a/libibumad/man/umad_get_ca_portguids.3 b/libibumad/man/umad_get_ca_portguids.3 new file mode 100644 index 0000000..c2a5592 --- /dev/null +++ b/libibumad/man/umad_get_ca_portguids.3 @@ -0,0 +1,43 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH UMAD_GET_CA_PORTGUIDS 3 "August 8, 2016" "OpenIB" "OpenIB Programmer\'s Manual" +.SH "NAME" +umad_get_ca_portguids \- get the InfiniBand device ports GUIDs +.SH "SYNOPSIS" +.nf +.B #include <infiniband/umad.h> +.sp +.BI "int umad_get_ca_portguids(char " "*ca_name" ", __be64 " "*portguids" ", int " "max" ); +.fi +.SH "DESCRIPTION" +.B umad_get_ca_portguids() +fills the +.I portguids\fR +array with up to +.I max +port GUIDs belonging the specified IB device +.I ca_name +, or to the default IB device if +.I ca_name +is NULL. +The argument +.I portguids +is an array of +.I max +uint64_t entries. +.SH "RETURN VALUE" +On success, +.B umad_get_ca_portguids() +returns a non-negative value equal to the number of port GUIDs actually filled. +Not all filled entries may be valid. Invalid entries will be 0. +For example, on a CA node with only one port, this function returns a value of 2. +In this case, the value at index 0 will be invalid as it is reserved for switches. +On failure, a negative value is returned. 
+.SH "SEE ALSO" +.BR umad_get_cas_names (3) +.SH "AUTHORS" +.TP +Hal Rosenstock <halr@voltaire.com> +.TP +Dotan Barak <dotanb@mellanox.co.il> diff --git a/libibumad/man/umad_get_cas_names.3 b/libibumad/man/umad_get_cas_names.3 new file mode 100644 index 0000000..0366c16 --- /dev/null +++ b/libibumad/man/umad_get_cas_names.3 @@ -0,0 +1,38 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH UMAD_GET_CAS_NAMES 3 "May 21, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.SH "NAME" +umad_get_cas_names \- get list of available InfiniBand device names +.SH "SYNOPSIS" +.nf +.B #include <infiniband/umad.h> +.sp +.BI "int umad_get_cas_names(char " "cas[][UMAD_CA_NAME_LEN]" ", int " "max" ); +.fi +.SH "DESCRIPTION" +.B umad_get_cas_names() +fills the +.I cas +array with up to +.I max +local IB devices (CAs) names. +The argument +.I cas +is a character array with +.I max +entries, each with +.B UMAD_CA_NAME_LEN +characters. +.SH "RETURN VALUE" +.B umad_get_cas_names() +returns a non-negative value equal to the number of entries filled, +or \-1 on errors. +.SH "SEE ALSO" +.BR umad_get_ca_portguids (3), +.BR umad_open_port (3) +.SH "AUTHORS" +.TP +Hal Rosenstock <halr@voltaire.com> +.TP +Dotan Barak <dotanb@mellanox.co.il> diff --git a/libibumad/man/umad_get_fd.3 b/libibumad/man/umad_get_fd.3 new file mode 100644 index 0000000..5fe5311 --- /dev/null +++ b/libibumad/man/umad_get_fd.3 @@ -0,0 +1,26 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH UMAD_GET_FD 3 "May 17, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.SH "NAME" +umad_get_fd \- get the umad fd for the requested port +.SH "SYNOPSIS" +.nf +.B #include <infiniband/umad.h> +.sp +.BI "int umad_get_fd(int " "portid" ); +.fi +.SH "DESCRIPTION" +.B umad_get_fd() +returns the umad fd for the port specified by +.I portid\fR. +.SH "RETURN VALUE" +.B umad_get_fd() +returns the fd for the +.I portid\fR +requested or -EINVAL if +.I portid\fR +is invalid. +.SH "AUTHOR" +.TP +Hal Rosenstock <halr@voltaire.com> diff --git a/libibumad/man/umad_get_issm_path.3 b/libibumad/man/umad_get_issm_path.3 new file mode 100644 index 0000000..4abef18 --- /dev/null +++ b/libibumad/man/umad_get_issm_path.3 @@ -0,0 +1,39 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH UMAD_GET_ISSM_PATH 3 "Oct 18, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.SH "NAME" +umad_get_issm_path \- get path of issm device +.SH "SYNOPSIS" +.nf +.B #include <infiniband/umad.h> +.sp +.BI "int umad_get_issm_path(char " "*ca_name" ", int " "portnum", char *path, int max); +.fi +.SH "DESCRIPTION" +.B umad_get_issm_path() +resolves path to issm device (which used for setting/clearing PortInfo:CapMask IsSM bit) for +.I portnum +of the IB device +.I ca_name +, it stores resolved path in +.I path +array which cannot exceed +.I max +bytes in length (including NULL terminator). +.fi +Opening issm device sets PortInfo:CapMask IsSM bit and closing clears it. 
+.fi
+.SH "RETURN VALUE"
+.B umad_get_issm_path()
+returns 0 on success and a negative value on error as follows:
+ -ENODEV IB device can\'t be resolved
+ -EINVAL port is not valid (bad
+.I portnum\fR
+or no umad device)
+.SH "SEE ALSO"
+.BR umad_open_port (3),
+.BR umad_get_port (3)
+.SH "AUTHOR"
+.TP
+Sasha Khapyorsky <sashak@voltaire.com>
diff --git a/libibumad/man/umad_get_mad.3 b/libibumad/man/umad_get_mad.3
new file mode 100644
index 0000000..ac56c48
--- /dev/null
+++ b/libibumad/man/umad_get_mad.3
@@ -0,0 +1,25 @@
+.\" -*- nroff -*-
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.\"
+.TH UMAD_GET_MAD 3 "May 21, 2007" "OpenIB" "OpenIB Programmer\'s Manual"
+.SH "NAME"
+umad_get_mad \- get the MAD pointer of a umad buffer
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/umad.h>
+.sp
+.BI "void * umad_get_mad(void " "*umad");
+.fi
+.SH "DESCRIPTION"
+.B umad_get_mad()
+returns a pointer to the MAD contained within the
+.I umad\fR
+buffer.
+.SH "RETURN VALUE"
+.B umad_get_mad()
+returns a pointer to the MAD contained within the supplied
+.I umad\fR
+buffer.
+.SH "AUTHOR"
+.TP
+Hal Rosenstock <halr@voltaire.com>
diff --git a/libibumad/man/umad_get_mad_addr.3 b/libibumad/man/umad_get_mad_addr.3
new file mode 100644
index 0000000..4a92b7b
--- /dev/null
+++ b/libibumad/man/umad_get_mad_addr.3
@@ -0,0 +1,43 @@
+.\" -*- nroff -*-
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.\"
+.TH UMAD_GET_MAD_ADDR 3 "May 21, 2007" "OpenIB" "OpenIB Programmer\'s Manual"
+.SH "NAME"
+umad_get_mad_addr \- get the address of the ib_mad_addr from a umad buffer
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/umad.h>
+.sp
+.BI "ib_mad_addr_t * umad_get_mad_addr(void " "*umad");
+.fi
+.SH "DESCRIPTION"
+.B umad_get_mad_addr()
+returns a pointer to the ib_mad_addr struct within the specified
+.I umad\fR
+buffer.
+.SH "RETURN VALUE"
+The return value
+is a pointer to an
+.I ib_mad_addr_t
+struct, as specified in <infiniband/umad.h>.
+.PP
+.nf
+typedef struct ib_mad_addr {
+.in +8
+uint32_t qpn;
+uint32_t qkey;
+uint16_t lid;
+uint8_t sl;
+uint8_t path_bits;
+uint8_t grh_present;
+uint8_t gid_index;
+uint8_t hop_limit;
+uint8_t traffic_class;
+uint8_t gid[16];
+uint32_t flow_label;
+.in -8
+} ib_mad_addr_t;
+.fi
+.SH "AUTHOR"
+.TP
+Hal Rosenstock <halr@voltaire.com>
diff --git a/libibumad/man/umad_get_pkey.3 b/libibumad/man/umad_get_pkey.3
new file mode 100644
index 0000000..b9dd1be
--- /dev/null
+++ b/libibumad/man/umad_get_pkey.3
@@ -0,0 +1,24 @@
+.\" -*- nroff -*-
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.\"
+.TH UMAD_GET_PKEY 3 "Jan 15, 2008" "OpenIB" "OpenIB Programmer\'s Manual"
+.SH "NAME"
+umad_get_pkey \- get pkey index from umad buffer
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/umad.h>
+.sp
+.BI "int umad_get_pkey(void " "*umad");
+.fi
+.SH "DESCRIPTION"
+.B umad_get_pkey()
+gets the pkey index from the specified
+.I umad\fR
+buffer.
+.SH "RETURN VALUE"
+.B umad_get_pkey()
+returns the value of the pkey index (or zero if the pkey index is not
+supported by the user_mad interface).
+.SH "AUTHOR" +.TP +Sasha Khapyorsky <sashak@voltaire.com> diff --git a/libibumad/man/umad_get_port.3 b/libibumad/man/umad_get_port.3 new file mode 100644 index 0000000..44dbfb0 --- /dev/null +++ b/libibumad/man/umad_get_port.3 @@ -0,0 +1,83 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH UMAD_GET_PORT 3 "May 21, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.SH "NAME" +umad_get_port, umad_release_port \- open and close an InfiniBand port +.SH "SYNOPSIS" +.nf +.B #include <infiniband/umad.h> +.sp +.BI "int umad_get_port(char " "*ca_name" ", int " "portnum" ", umad_port_t " "*port" ); +.sp +.BI "int umad_release_port(umad_port_t " "*port" ); +.fi +.SH "DESCRIPTION" +.B umad_get_port() +fills the +.I port +structure with the IB port attributes specified by +.I ca_name +and +.I portnum +, or the default port if +.I ca_name +is NULL and +.I portnum +is zero. If only one of +.I ca_name +and +.I portnum +are specified, the other is used as a filter. +For example, passing a NULL +.I ca_name +and 2 for the +.I portnum +means get a port from any of the local IB devices, as long as it is +the second port. +Note that the library may use some reference scheme to support port caching +therefore +.B umad_release_port() +should be called before the +.I port +structure can be deallocated. +The argument +.I port +is an +.B umad_port_t +struct, as specified in <infiniband/umad.h>. +.PP +.nf +typedef struct umad_port { +.in +8 +char ca_name[UMAD_CA_NAME_LEN]; /* Name of the device */ +int portnum; /* Physical port number */ +uint base_lid; /* Base port LID */ +uint lmc; /* LMC of LID */ +uint sm_lid; /* SM LID */ +uint sm_sl; /* SM service level */ +uint state; /* Logical port state */ +uint phys_state; /* Physical port state */ +uint rate; /* Port link bit rate */ +uint64_t capmask; /* Port capabilities */ +uint64_t gid_prefix; /* Gid prefix of this port */ +uint64_t port_guid; /* GUID of this port */ +.in -8 +} umad_port_t; +.fi +.PP +.B umad_release_port() +releases the resources that were allocated by the +.B umad_get_port() +function for the specified IB +.I port\fR. +.SH "RETURN VALUE" +.B umad_get_port() +and +.B umad_release_port() +return 0 on success, and a negative value on error. +.SH "AUTHORS" +.TP +Hal Rosenstock <halr@voltaire.com> +.TP +Dotan Barak <dotanb@mellanox.co.il> diff --git a/libibumad/man/umad_init.3.md b/libibumad/man/umad_init.3.md new file mode 100644 index 0000000..ebfc389 --- /dev/null +++ b/libibumad/man/umad_init.3.md @@ -0,0 +1,48 @@ + +--- +date: "May 21, 2007" +footer: "OpenIB" +header: "OpenIB Programmer's Manual" +layout: page +license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md' +section: 3 +title: UMAD_INIT +--- + +# NAME + +umad_init, umad_done - perform library initialization and finalization + +# SYNOPSIS + +```c +#include <infiniband/umad.h> + +int umad_init(void); + +int umad_done(void); +``` + +# DESCRIPTION + +**umad_init()** and **umad_done()** do nothing. + +# RETURN VALUE + +Always 0. + +# COMPATIBILITY + +Versions prior to release 18 of the library require **umad_init()** to be +called prior to using any other library functions. Old versions could return a +failure code of -1 from **umad_init()**. + +For compatibility, applications should continue to call **umad_init()**, and +check the return code, prior to calling other **umad_** functions. If +**umad_init()** returns an error, then no further use of the umad library +should be attempted. 
+ +# AUTHORS + +Dotan Barak <dotanb@mellanox.co.il>, +Hal Rosenstock <halr@voltaire.com> diff --git a/libibumad/man/umad_open_port.3 b/libibumad/man/umad_open_port.3 new file mode 100644 index 0000000..bd7026b --- /dev/null +++ b/libibumad/man/umad_open_port.3 @@ -0,0 +1,39 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH UMAD_OPEN_PORT 3 "May 21, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.SH "NAME" +umad_open_port \- open InfiniBand device port for umad access +.SH "SYNOPSIS" +.nf +.B #include <infiniband/umad.h> +.sp +.BI "int umad_open_port(char " "*ca_name" ", int " "portnum" ); +.fi +.SH "DESCRIPTION" +.B umad_open_port() +opens the port +.I portnum +of the IB device +.I ca_name +for umad access. The port is selected by the library if not all parameters +are provided (see +.B umad_get_port() +for details). +.fi +.SH "RETURN VALUE" +.B umad_open_port() +returns 0 or an unique positive value of umad device descriptor on success, and a negative value on error as follows: + -EOPNOTSUPP ABI version doesn\'t match + -ENODEV IB device can\'t be resolved + -EINVAL port is not valid (bad +.I portnum\fR +or no umad device) + -EIO umad device for this port can\'t be opened +.SH "SEE ALSO" +.BR umad_close_port (3), +.BR umad_get_cas_names (3), +.BR umad_get_port (3) +.SH "AUTHOR" +.TP +Hal Rosenstock <halr@voltaire.com> diff --git a/libibumad/man/umad_poll.3 b/libibumad/man/umad_poll.3 new file mode 100644 index 0000000..57b7a65 --- /dev/null +++ b/libibumad/man/umad_poll.3 @@ -0,0 +1,41 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH UMAD_POLL 3 "October 23, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.SH "NAME" +umad_poll \- poll umad +.SH "SYNOPSIS" +.nf +.B #include <infiniband/umad.h> +.sp +.BI "int umad_poll(int " "portid" ", int " "timeout_ms"); +.fi +.SH "DESCRIPTION" +.B umad_poll() +waits up to +.I timeout_ms\fR +milliseconds for a packet to be received from the port specified by +.I portid\fR. +Once a packet is ready to be read, the function +returns 0. After that the packet can be read using +.B umad_recv(). +Otherwise, \-ETIMEDOUT is returned. Note that successfully polling a port +does not guarantee that the subsequent +.B umad_recv() +will be non blocking when several threads are using +the same port. Instead, use a +.I timeout_ms\fR +parameter of zero to +.B umad_recv() +to ensure a non-blocking read. +.SH "RETURN VALUE" +.B umad_poll() +returns 0 on success, and a negative value on error as follows: + -EINVAL invalid port handle or agentid + -ETIMEDOUT poll operation timed out + -EIO poll operation failed +.SH "SEE ALSO" +.BR umad_recv (3) +.SH "AUTHOR" +.TP +Hal Rosenstock <halr@voltaire.com> diff --git a/libibumad/man/umad_recv.3 b/libibumad/man/umad_recv.3 new file mode 100644 index 0000000..93eec99 --- /dev/null +++ b/libibumad/man/umad_recv.3 @@ -0,0 +1,69 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH UMAD_RECV 3 "May 11, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.SH "NAME" +umad_recv \- receive umad +.SH "SYNOPSIS" +.nf +.B #include <infiniband/umad.h> +.sp +.BI "int umad_recv(int " "portid" ", void " "*umad" ", int " "*length" ", int " "timeout_ms"); +.fi +.SH "DESCRIPTION" +.B umad_recv() +waits up to +.I timeout_ms\fR +milliseconds for an incoming MAD message to be received from the port specified by +.I portid\fR. 
+
+A MAD "message" consists of a single MAD packet
+.I or
+a coalesced multipacket RMPP transmission. In the RMPP case the header of the
+first RMPP packet is returned as the header of the buffer and the buffer data
+contains the coalesced data section of each subsequent RMPP MAD packet within
+the transmission. Thus, only the first RMPP header is copied to user space
+from the kernel.
+
+The message is copied to the
+.I umad\fR
+buffer if there is sufficient room and the received
+.I length\fR is indicated.
+If the buffer is not large enough, the size of the umad
+buffer needed is returned in
+.I length\fR.
+A negative
+.I timeout_ms\fR
+makes the function block until a packet is received. A
+.I timeout_ms\fR
+parameter of zero indicates a non-blocking read.
+
+.B Note
+.I length
+is a pointer to the length of the
+.B data
+portion of the umad buffer. This means that
+.I umad
+must point to a buffer at least umad_size() +
+.I *length
+bytes long.
+
+.B Note also
+that
+.I *length\fR
+must be >= 256 bytes. This length allows for at least a single MAD packet to
+be returned.
+
+.SH "RETURN VALUE"
+.B umad_recv()
+returns the agentid on success; on error, errno is set and a negative value is
+returned as follows:
+ -EINVAL invalid port handle or agentid or *length is less than the minimum supported
+ -EIO receive operation failed
+ -EWOULDBLOCK non-blocking read can't be fulfilled
+ -ENOSPC The provided buffer is not long enough for the complete message.
+.SH "SEE ALSO"
+.BR umad_poll (3)
+.SH "AUTHOR"
+.TP
+Hal Rosenstock <halr@voltaire.com>
diff --git a/libibumad/man/umad_register.3 b/libibumad/man/umad_register.3
new file mode 100644
index 0000000..58b88f3
--- /dev/null
+++ b/libibumad/man/umad_register.3
@@ -0,0 +1,37 @@
+.\" -*- nroff -*-
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.\"
+.TH UMAD_REGISTER 3 "May 11, 2007" "OpenIB" "OpenIB Programmer\'s Manual"
+.SH "NAME"
+umad_register \- register the specified management class and version for port
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/umad.h>
+.sp
+.BI "int umad_register(int " "portid" ", int " "mgmt_class" ", int " "mgmt_version" " , uint8_t " "rmpp_version" ", long " "method_mask[16/sizeof(long)]");
+.fi
+.SH "DESCRIPTION"
+.B umad_register()
+registers the specified management class, management version,
+and whether RMPP is being used for the port specified by the
+.I portid\fR
+parameter. If the
+.I method_mask\fR
+array is provided, the caller is registered as a replier (server) for the
+methods having their corresponding bit on in the
+.I method_mask\fR.
+If
+.I method_mask\fR
+is NULL, the caller is registered as a MAD client, meaning that it can
+only receive replies on MADs that it sent (solicited MADs).
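+.SH "EXAMPLE"
+An illustrative sketch (not part of the original page): build a
+.I method_mask\fR
+that registers the caller as a replier for the Get method (0x01) of the
+Performance Management class (0x04). The class and method numbers are
+assumptions for illustration only; portid is assumed to be an open port.
+.nf
+    long method_mask[16 / sizeof(long)] = { 0 };
+    unsigned get = 0x01;
+    int agentid;
+
+    method_mask[get / (8 * sizeof(long))] |=
+        1UL << (get % (8 * sizeof(long)));
+    agentid = umad_register(portid, 0x04, 1, 0, method_mask);
+.fi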
+.SH "RETURN VALUE" +.B umad_register() +returns non-negative agent id number on success, and a negative value on error as follows: + -EINVAL invalid port handle + -EPERM registration failed +.SH "SEE ALSO" +.BR umad_register_oui(3), +.BR umad_unregister (3) +.SH "AUTHOR" +.TP +Hal Rosenstock <halr@voltaire.com> diff --git a/libibumad/man/umad_register2.3 b/libibumad/man/umad_register2.3 new file mode 100644 index 0000000..74e8794 --- /dev/null +++ b/libibumad/man/umad_register2.3 @@ -0,0 +1,81 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH UMAD_REGISTER2 3 "March 25, 2014" "OpenIB" "OpenIB Programmer\'s Manual" +.SH "NAME" +umad_register2 \- register the specified management class and version for port +.SH "SYNOPSIS" +.nf +.B #include <infiniband/umad.h> +.sp +.BI "int umad_register2(int " "port_fd" ", struct umad_reg_attr *" "attr" ", uint32_t *" "agent_id"); +.fi +.SH "DESCRIPTION" +.B umad_register2() +registers for a MAD agent using the provided registration attributes + +.I port_fd\fR +the port on which to register the agent + +.I attr\fR +The registration attributes as defined by the structure passed. See below for details of this structure. + +.I agent_id\fR +returned on success. agent_id identifies the kernel MAD agent a MAD is received by or to be sent by. agent_id is returned in the umad header "struct ib_user_mad" on recv and specified in umad_send when sending. + + +.SH "REGISTRATION ATTRIBUTE STRUCTURE" +.nf +struct umad_reg_attr { +.in +8 +uint8_t mgmt_class; +uint8_t mgmt_class_version; +uint32_t flags; +uint64_t method_mask[2]; +uint32_t oui; +uint8_t rmpp_version; +.in -8 +}; + +.I mgmt_class\fR +Management class to register for. + +.I mgmt_class_version\fR +Management class version to register for. + +.I flags\fR +Registration flags. If a flag specified is not supported by the kernel, +an error is returned, and the supported flags are returned in this field. + +.P +Current flags are: +.in +8 +UMAD_USER_RMPP -- flag to indicate the kernel should not process +RMPP packets. All RMPP packets will be treated like individual +MADs. The user is responsible for implementing the RMPP +protocol. +.in -8 + +.I method_mask\fR +A bit mask which indicates which unsolicited methods this agent should +receive. Setting this array to 0 will result in the agent only +receiving response MADs for which a request was sent. + +.I oui\fR +The oui (in host order) to use for vendor classes 0x30 - 0x4f. +Otherwise ignored. + +.I rmpp_version\fR +If the class supports RMPP and kernel RMPP is enabled (the default) +indicate which rmpp_version to use. + + +.SH "RETURN VALUE" +.B umad_register2() +returns 0 on success and +ERRNO on failure. 
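+
+.SH "EXAMPLE"
+An illustrative sketch (not part of the original page): register for the
+Subnet Administration class (0x03), class version 2, with kernel RMPP
+(rmpp_version 1), receiving only responses. The numeric values are
+assumptions for illustration only; port_fd is assumed to be an open port.
+.nf
+    struct umad_reg_attr attr = {
+        .mgmt_class = 0x03,
+        .mgmt_class_version = 2,
+        .rmpp_version = 1,
+    };
+    uint32_t agent_id;
+
+    if (umad_register2(port_fd, &attr, &agent_id) != 0)
+        fprintf(stderr, "umad_register2 failed\\n");
+.fi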
+
+.SH "SEE ALSO"
+.BR umad_unregister (3)
+.SH "AUTHOR"
+.TP
+Ira Weiny <ira.weiny@intel.com>
diff --git a/libibumad/man/umad_register_oui.3 b/libibumad/man/umad_register_oui.3
new file mode 100644
index 0000000..19430a9
--- /dev/null
+++ b/libibumad/man/umad_register_oui.3
@@ -0,0 +1,38 @@
+.\" -*- nroff -*-
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.\"
+.TH UMAD_REGISTER_OUI 3 "May 17, 2007" "OpenIB" "OpenIB Programmer\'s Manual"
+.SH "NAME"
+umad_register_oui \- register the specified class in vendor range 2 for port
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/umad.h>
+.sp
+.BI "int umad_register_oui(int " "portid" ", int " "mgmt_class" ", uint8_t " "rmpp_version" ", uint8_t " "oui[3]" ", uint32_t " "method_mask[4]");
+.fi
+.SH "DESCRIPTION"
+.B umad_register_oui()
+registers the specified class in vendor range 2, the specified
+.I oui\fR,
+and whether RMPP is being used for the port specified by the
+.I portid\fR
+handle. If the
+.I method_mask\fR
+array is provided, the caller is registered as a replier (server) for the
+methods having their corresponding bit on in the
+.I method_mask\fR.
+If
+.I method_mask\fR
+is NULL, the caller is registered as a MAD client, meaning that it can
+only receive replies on MADs that it sent (solicited MADs).
+.SH "RETURN VALUE"
+.B umad_register_oui()
+returns a non-negative agent id number on success, and a negative value on error as follows:
+ -EINVAL invalid port handle or class is not in the vendor class 2 range
+ -EPERM registration failed
+.SH "SEE ALSO"
+.BR umad_register (3),
+.BR umad_unregister (3)
+.SH "AUTHOR"
+.TP
+Hal Rosenstock <halr@voltaire.com>
diff --git a/libibumad/man/umad_send.3 b/libibumad/man/umad_send.3
new file mode 100644
index 0000000..59af2cb
--- /dev/null
+++ b/libibumad/man/umad_send.3
@@ -0,0 +1,49 @@
+.\" -*- nroff -*-
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.\"
+.TH UMAD_SEND 3 "May 11, 2007" "OpenIB" "OpenIB Programmer\'s Manual"
+.SH "NAME"
+umad_send \- send umad
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/umad.h>
+.sp
+.BI "int umad_send(int " "portid" ", int " "agentid" ", void " "*umad" ", int " "length" ", int " "timeout_ms" ", int " "retries");
+.fi
+.SH "DESCRIPTION"
+.B umad_send()
+sends
+.I length\fR
+bytes from the specified
+.I umad\fR
+buffer through the port specified by
+.I portid\fR,
+using the agent specified by
+.I agentid\fR.
+
+The buffer can contain an RMPP transmission which is larger than a single MAD
+packet when the agentid specifies a class which utilizes RMPP and the header
+flags indicate RMPP is active. NOTE: currently only RMPPFlags.Active is
+meaningful in the header in user space. All other RMPP fields are ignored.
+The data section of the buffer will be sent in multiple RMPP MAD packets with
+headers built for the user.
+
+.I timeout_ms\fR
+controls the solicited MADs behavior as follows:
+a zero value means not solicited. A positive value makes the kernel indicate
+a timeout in milliseconds. If the reply is not received within the specified
+value, the original buffer is returned in the read channel with the status
+field set (to non zero). A negative
+.I timeout_ms\fR
+makes the kernel wait forever for the reply.
+.I retries\fR
+indicates the number of times the MAD will be retried before giving up.
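+.SH "EXAMPLE"
+An illustrative sketch (not part of the original page): send one solicited
+256-byte MAD and let the kernel wait up to one second for the reply,
+retrying three times. portid, agentid and a filled umad buffer are assumed
+to exist already.
+.nf
+    if (umad_send(portid, agentid, umad, 256, 1000, 3) < 0)
+        fprintf(stderr, "umad_send failed\\n");
+.fi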
+.SH "RETURN VALUE" +.B umad_send() +returns 0 on success; on error, errno is set and a negative value is returned +as follows: + -EINVAL invalid port handle or agentid + -EIO send operation failed +.SH "AUTHOR" +.TP +Hal Rosenstock <halr@voltaire.com> diff --git a/libibumad/man/umad_set_addr.3 b/libibumad/man/umad_set_addr.3 new file mode 100644 index 0000000..03ac862 --- /dev/null +++ b/libibumad/man/umad_set_addr.3 @@ -0,0 +1,34 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH UMAD_SET_ADDR 3 "May 17, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.SH "NAME" +umad_set_addr \- set MAD address fields within umad buffer using host ordering +.SH "SYNOPSIS" +.nf +.B #include <infiniband/umad.h> +.sp +.BI "int umad_set_addr(void " "*umad" ", int " "dlid" ", int " "dqp" ", int " "sl" ", int " "qkey"); +.fi +.SH "DESCRIPTION" +.B umad_set_addr() +sets the MAD address fields within the specified +.I umad\fR +buffer using the provided host ordered fields. +.I dlid\fR +is the destination LID. +.I dqp\fR +is the destination QP (queue pair). +.I sl\fR +is the SL (service level). +.I qkey\fR +is the Q_Key (queue key). +.SH "RETURN VALUE" +.B umad_set_addr() +returns 0 on success, and a negative value on errors. Currently, there +are no errors indicated. +.SH "SEE ALSO" +.BR umad_set_addr_net (3) +.SH "AUTHOR" +.TP +Hal Rosenstock <halr@voltaire.com> diff --git a/libibumad/man/umad_set_addr_net.3 b/libibumad/man/umad_set_addr_net.3 new file mode 100644 index 0000000..b395252 --- /dev/null +++ b/libibumad/man/umad_set_addr_net.3 @@ -0,0 +1,34 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH UMAD_SET_ADDR_NET 3 "May 21, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.SH "NAME" +umad_set_addr_net \- set MAD address fields within umad buffer using network ordering +.SH "SYNOPSIS" +.nf +.B #include <infiniband/umad.h> +.sp +.BI "int umad_set_addr_net(void " "*umad" ", __be16 " "dlid" ", __be32 " "dqp" ", int " "sl" ", __be32 " "qkey"); +.fi +.SH "DESCRIPTION" +.B umad_set_addr_net() +sets the MAD address fields within the specified +.I umad\fR +buffer using the provided network ordered fields. +.I dlid\fR +is the destination LID. +.I dqp\fR +is the destination QP (queue pair). +.I sl\fR +is the SL (service level). +.I qkey\fR +is the Q_Key (queue key). +.SH "RETURN VALUE" +.B umad_set_addr_net() +returns 0 on success, and a negative value on errors. Currently, there +are no errors indicated. +.SH "SEE ALSO" +.BR umad_set_addr (3) +.SH "AUTHOR" +.TP +Hal Rosenstock <halr@voltaire.com> diff --git a/libibumad/man/umad_set_grh.3 b/libibumad/man/umad_set_grh.3 new file mode 100644 index 0000000..4ff52ec --- /dev/null +++ b/libibumad/man/umad_set_grh.3 @@ -0,0 +1,76 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH UMAD_SET_GRH 3 "May 24, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.SH "NAME" +umad_set_grh \- set GRH fields within umad buffer using host ordering +.SH "SYNOPSIS" +.nf +.B #include <infiniband/umad.h> +.sp +.BI "int umad_set_grh(void " "*umad" ", void " "*mad_addr"); +.fi +.SH "DESCRIPTION" +.B umad_set_grh() +sets the GRH fields (grh_present, gid, hop_limit, traffic_class, flow_label) +within the specified +.I umad\fR +buffer based on the +.I mad_addr\fR +supplied. The provided +.I mad_addr\fR +fields are expected to be in host order. +If the +.I mad_addr\fR +pointer supplied is NULL, no GRH is set. 
+The argument +.I mad_addr +is a pointer to an +.I ib_mad_addr_t +struct, as specified in +.I <infiniband/umad.h>. +The argument +.I umad +is a pointer to an +.I ib_user_mad_t +struct, as specified in +.I <infiniband/umad.h>. +.PP +.nf +typedef struct ib_mad_addr { +.in +8 +uint32_t qpn; +uint32_t qkey; +uint16_t lid; +uint8_t sl; +uint8_t path_bits; +uint8_t grh_present; +uint8_t gid_index; +uint8_t hop_limit; +uint8_t traffic_class; +uint8_t gid[16]; +uint32_t flow_label; +.in -8 +} ib_mad_addr_t; +.PP +typedef struct ib_user_mad { +.in +8 +uint32_t agent_id; +uint32_t status; +uint32_t timeout_ms; +uint32_t retries; +uint32_t length; +ib_mad_addr_t addr; +uint8_t data[0]; +.in -8 +} ib_user_mad_t; +.fi +.SH "RETURN VALUE" +.B umad_set_grh() +returns 0 on success, and a negative value on errors. Currently, there +are no errors indicated. +.SH "SEE ALSO" +.BR umad_set_grh_net (3) +.SH "AUTHOR" +.TP +Hal Rosenstock <halr@voltaire.com> diff --git a/libibumad/man/umad_set_grh_net.3 b/libibumad/man/umad_set_grh_net.3 new file mode 100644 index 0000000..802b575 --- /dev/null +++ b/libibumad/man/umad_set_grh_net.3 @@ -0,0 +1,77 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH UMAD_SET_GRH_NET 3 "May 24, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.SH "NAME" +umad_set_grh_net \- set GRH fields within umad buffer using network ordering +.SH "SYNOPSIS" +.nf +.B #include <infiniband/umad.h> +.sp +.BI "int umad_set_grh_net(void " "*umad" ", void " "*mad_addr"); +.fi +.SH "DESCRIPTION" +.B umad_set_grh_net() +sets the GRH fields (grh_present, gid, hop_limit, traffic_class, flow_label) +within the specified +.I umad\fR +buffer based on the +.I mad_addr\fR +supplied. The provided +.I mad_addr\fR +fields are expected to be in network order. +If the +.I mad_addr\fR +pointer supplied is NULL, no GRH is set. +The argument +.I mad_addr +is a pointer to an +.I ib_mad_addr_t +struct, as specified in <infiniband/umad.h>. +The argument +.I umad +is a pointer to an +.I ib_user_mad_t +struct, as specified in +.I <infiniband/umad.h>. +.PP +.nf +typedef struct ib_mad_addr { +.in +8 +uint32_t qpn; +uint32_t qkey; +uint16_t lid; +uint8_t sl; +uint8_t path_bits; +uint8_t grh_present; +uint8_t gid_index; +uint8_t hop_limit; +uint8_t traffic_class; +uint8_t gid[16]; +uint32_t flow_label; +.in -8 +} ib_mad_addr_t; +.PP +typedef struct ib_user_mad { +.in +8 +uint32_t agent_id; +uint32_t status; +uint32_t timeout_ms; +uint32_t retries; +uint32_t length; +ib_mad_addr_t addr; +uint8_t data[0]; +.in -8 +} ib_user_mad_t; +.fi +.SH "RETURN VALUE" +.B umad_set_grh_net() +returns 0 on success, and a negative value on errors. Currently, there +are no errors indicated. +.SH "KNOWN BUGS" +Not implemented. +.SH "SEE ALSO" +.BR umad_set_grh (3) +.SH "AUTHOR" +.TP +Hal Rosenstock <halr@voltaire.com> diff --git a/libibumad/man/umad_set_pkey.3 b/libibumad/man/umad_set_pkey.3 new file mode 100644 index 0000000..23e858e --- /dev/null +++ b/libibumad/man/umad_set_pkey.3 @@ -0,0 +1,23 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH UMAD_SET_PKEY 3 "June 20, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.SH "NAME" +umad_set_pkey \- set pkey index within umad buffer +.SH "SYNOPSIS" +.nf +.B #include <infiniband/umad.h> +.sp +.BI "int umad_set_pkey(void " "*umad" ", int " "pkey_index"); +.fi +.SH "DESCRIPTION" +.B umad_set_pkey() +sets the pkey index within the specified +.I umad\fR +buffer. 
+.SH "RETURN VALUE" +.B umad_set_pkey() +returns 0 on success, and a negative value on an error. +.SH "AUTHOR" +.TP +Hal Rosenstock <halr@voltaire.com> diff --git a/libibumad/man/umad_size.3 b/libibumad/man/umad_size.3 new file mode 100644 index 0000000..74737cc --- /dev/null +++ b/libibumad/man/umad_size.3 @@ -0,0 +1,21 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH UMAD_SIZE 3 "May 21, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.SH "NAME" +umad_size \- get the size of umad buffer +.SH "SYNOPSIS" +.nf +.B #include <infiniband/umad.h> +.sp +.BI "size_t umad_size(void); +.fi +.SH "DESCRIPTION" +.B umad_size() +returns the size of umad buffer (in bytes). +.SH "RETURN VALUE" +.B umad_size() +returns the size of umad buffer (in bytes). +.SH "AUTHOR" +.TP +Hal Rosenstock <halr@voltaire.com> diff --git a/libibumad/man/umad_status.3 b/libibumad/man/umad_status.3 new file mode 100644 index 0000000..fd5430a --- /dev/null +++ b/libibumad/man/umad_status.3 @@ -0,0 +1,27 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH UMAD_STATUS 3 "May 17, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.SH "NAME" +umad_status \- get the status of a umad buffer +.SH "SYNOPSIS" +.nf +.B #include <infiniband/umad.h> +.sp +.BI "int umad_status(void " "*umad" ); +.fi +.SH "DESCRIPTION" +.B umad_status() +get the internal +.I umad\fR +status field. +.SH "RETURN VALUE" +After a packet is received, +.B umad_status() +returns 0 on a successful receive, or a non zero status. +ETIMEDOUT means that the packet had +a send-timeout indication. In this case, the transaction ID will be +set to the TID of the original request. +.SH "AUTHOR" +.TP +Hal Rosenstock <halr@voltaire.com> diff --git a/libibumad/man/umad_unregister.3 b/libibumad/man/umad_unregister.3 new file mode 100644 index 0000000..785d22d --- /dev/null +++ b/libibumad/man/umad_unregister.3 @@ -0,0 +1,31 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH UMAD_UNREGISTER 3 "May 21, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.SH "NAME" +umad_unregister \- unregister umad agent +.SH "SYNOPSIS" +.nf +.B #include <infiniband/umad.h> +.sp +.BI "int umad_unregister(int " "portid" ", int " "agentid"); +.fi +.SH "DESCRIPTION" +.B umad_unregister() +unregisters the specified +.I agentid\fR +previously registered using +.B umad_register() +or +.B umad_register_oui()\fR. +.SH "RETURN VALUE" +.B umad_unregister() +returns 0 on success and negative value on error as follows: + -EINVAL invalid port handle or agentid + * (kernel error codes) +.SH "SEE ALSO" +.BR umad_register (3), +.BR umad_register_oui (3) +.SH "AUTHOR" +.TP +Hal Rosenstock <halr@voltaire.com> diff --git a/libibumad/sysfs.c b/libibumad/sysfs.c new file mode 100644 index 0000000..4584726 --- /dev/null +++ b/libibumad/sysfs.c @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2004-2008 Voltaire Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <config.h> + +#include <endian.h> +#include <inttypes.h> +#include <string.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include "sysfs.h" + +static int ret_code(void) +{ + int e = errno; + + if (e > 0) + return -e; + return e; +} + +int sys_read_string(const char *dir_name, const char *file_name, char *str, int max_len) +{ + char path[256], *s; + int fd, r; + + snprintf(path, sizeof(path), "%s/%s", dir_name, file_name); + + if ((fd = open(path, O_RDONLY)) < 0) + return ret_code(); + + if ((r = read(fd, (void *)str, max_len)) < 0) { + int e = errno; + close(fd); + errno = e; + return ret_code(); + } + + str[(r < max_len) ? 
r : max_len - 1] = 0; + + if ((s = strrchr(str, '\n'))) + *s = 0; + + close(fd); + return 0; +} + +int sys_read_guid(const char *dir_name, const char *file_name, __be64 *net_guid) +{ + char buf[32], *str, *s; + uint64_t guid; + int r, i; + + if ((r = sys_read_string(dir_name, file_name, buf, sizeof(buf))) < 0) + return r; + + guid = 0; + + for (s = buf, i = 0; i < 4; i++) { + if (!(str = strsep(&s, ": \t\n"))) + return -EINVAL; + guid = (guid << 16) | (strtoul(str, NULL, 16) & 0xffff); + } + + *net_guid = htobe64(guid); + + return 0; +} + +int sys_read_gid(const char *dir_name, const char *file_name, + union umad_gid *gid) +{ + char buf[64], *str, *s; + __be16 *ugid = (__be16 *) gid; + int r, i; + + if ((r = sys_read_string(dir_name, file_name, buf, sizeof(buf))) < 0) + return r; + + for (s = buf, i = 0; i < 8; i++) { + if (!(str = strsep(&s, ": \t\n"))) + return -EINVAL; + ugid[i] = htobe16(strtoul(str, NULL, 16) & 0xffff); + } + + return 0; +} + +int sys_read_uint64(const char *dir_name, const char *file_name, uint64_t * u) +{ + char buf[32]; + int r; + + if ((r = sys_read_string(dir_name, file_name, buf, sizeof(buf))) < 0) + return r; + + *u = strtoull(buf, NULL, 0); + + return 0; +} + +int sys_read_uint(const char *dir_name, const char *file_name, unsigned *u) +{ + char buf[32]; + int r; + + if ((r = sys_read_string(dir_name, file_name, buf, sizeof(buf))) < 0) + return r; + + *u = strtoul(buf, NULL, 0); + + return 0; +} diff --git a/libibumad/sysfs.h b/libibumad/sysfs.h new file mode 100644 index 0000000..bad092e --- /dev/null +++ b/libibumad/sysfs.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2008 Voltaire Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#ifndef _UMAD_SYSFS_H +#define _UMAD_SYSFS_H + +#include <stdint.h> +#include <linux/types.h> +#include <infiniband/umad.h> + +extern int sys_read_string(const char *dir_name, const char *file_name, char *str, int len); +extern int sys_read_guid(const char *dir_name, const char *file_name, __be64 * net_guid); +extern int sys_read_gid(const char *dir_name, const char *file_name, + union umad_gid *gid); +extern int sys_read_uint64(const char *dir_name, const char *file_name, uint64_t * u); +extern int sys_read_uint(const char *dir_name, const char *file_name, unsigned *u); + +#endif /* _UMAD_SYSFS_H */ diff --git a/libibumad/tests/CMakeLists.txt b/libibumad/tests/CMakeLists.txt new file mode 100644 index 0000000..2092b81 --- /dev/null +++ b/libibumad/tests/CMakeLists.txt @@ -0,0 +1,10 @@ +rdma_test_executable(umad_reg2 umad_reg2_compat.c) +target_link_libraries(umad_reg2 LINK_PRIVATE ibumad) + +rdma_test_executable(umad_register2 umad_register2.c) +target_link_libraries(umad_register2 LINK_PRIVATE ibumad) + +rdma_test_executable(umad_sa_mcm_rereg_test umad_sa_mcm_rereg_test.c) +target_link_libraries(umad_sa_mcm_rereg_test LINK_PRIVATE ibumad) + +rdma_test_executable(umad_compile_test umad_compile_test.c) diff --git a/libibumad/tests/umad_compile_test.c b/libibumad/tests/umad_compile_test.c new file mode 100644 index 0000000..bb6bb29 --- /dev/null +++ b/libibumad/tests/umad_compile_test.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2017 Mellanox Technologies LTD. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <config.h> + +#include <stddef.h> +#include <endian.h> +#include <ccan/build_assert.h> +#include <infiniband/umad.h> +#include <infiniband/umad_types.h> +#include <infiniband/umad_sm.h> +#include <infiniband/umad_sa.h> +#include <infiniband/umad_cm.h> + +int main(int argc, char *argv[]) +{ +#ifndef __CHECKER__ + /* + * Hide these checks for sparse because these checks fail with + * older versions of sparse. 
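+	 * (The assertion below only verifies that union umad_gid is
+	 * 4-byte aligned, so that it can safely overlay GID fields that
+	 * sit at 4-byte offsets inside the wire-format structures checked
+	 * here.)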
+ */ + BUILD_ASSERT(__alignof__(union umad_gid) == 4); +#endif + + /* umad_types.h structure checks */ + BUILD_ASSERT(sizeof(struct umad_hdr) == 24); + BUILD_ASSERT(sizeof(struct umad_rmpp_hdr) == 12); + BUILD_ASSERT(sizeof(struct umad_packet) == 256); + BUILD_ASSERT(sizeof(struct umad_rmpp_packet) == 256); + BUILD_ASSERT(sizeof(struct umad_dm_packet) == 256); + BUILD_ASSERT(sizeof(struct umad_vendor_packet) == 256); + BUILD_ASSERT(sizeof(struct umad_class_port_info) == 72); + BUILD_ASSERT(offsetof(struct umad_class_port_info, redirgid) == 8); + BUILD_ASSERT(offsetof(struct umad_class_port_info, trapgid) == 40); + + /* umad_sm.h structure check */ + BUILD_ASSERT(sizeof(struct umad_smp) == 256); + + /* umad_sa.h structure check */ + BUILD_ASSERT(sizeof(struct umad_sa_packet) == 256); + + return 0; +} diff --git a/libibumad/tests/umad_reg2_compat.c b/libibumad/tests/umad_reg2_compat.c new file mode 100644 index 0000000..92d1df0 --- /dev/null +++ b/libibumad/tests/umad_reg2_compat.c @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2014 Intel Corporation, All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ */
+
+#include <config.h>
+
+#include <string.h>
+#include <stdio.h>
+#include <inttypes.h>
+
+#include <infiniband/umad.h>
+
+#define UNLIKELY_MGMT_CLASS 0x2F
+#define UNLIKELY_RMPP_MGMT_CLASS 0x4F
+
+static int test_failures = 0;
+
+/** =========================================================================
+ * Stolen from OpenSM's register
+ */
+static int set_bit(int nr, void *method_mask)
+{
+	long mask, *addr = method_mask;
+	int retval;
+
+	addr += nr / (8 * sizeof(long));
+	mask = (1UL) << (nr % (8 * sizeof(long)));
+	retval = (mask & *addr) != 0;
+	*addr |= mask;
+	return retval;
+}
+
+static void set_bit64(int b, uint64_t *buf)
+{
+	uint64_t mask;
+	uint64_t *addr = buf;
+
+	addr += b >> 6;
+	mask = 1ULL << (b & 0x3f);
+	*addr |= mask;
+}
+
+static void dump_reg_attr(struct umad_reg_attr *reg_attr)
+{
+	printf("\nmgmt_class %u\n"
+	       "mgmt_class_version %u\n"
+	       "flags 0x%08x\n"
+	       "method_mask 0x%016"PRIx64" %016"PRIx64"\n"
+	       "oui 0x%06x\n"
+	       "rmpp_version %u\n\n",
+	       reg_attr->mgmt_class,
+	       reg_attr->mgmt_class_version,
+	       reg_attr->flags,
+	       reg_attr->method_mask[1], reg_attr->method_mask[0],
+	       reg_attr->oui,
+	       reg_attr->rmpp_version);
+}
+
+static int open_test_device(void)
+{
+	int fd = umad_open_port(NULL, 0);
+	if (fd < 0) {
+		printf("\n *****\nOpen Port Failure... Aborting\n");
+		printf(" Ensure you have an HCA to test against.\n");
+		exit(0);
+	}
+	return fd;
+}
+
+static void test_register(void)
+{
+	int agent_id;
+	long method_mask[16 / sizeof(long)];
+	uint32_t class_oui = 0x001405; /* OPENIB_OUI */
+	uint8_t oui[3];
+	int fd;
+
+	printf("\n old register test ... ");
+
+	fd = open_test_device();
+
+	memset(&method_mask, 0, sizeof(method_mask));
+	set_bit( 1, &method_mask);
+	set_bit(63, &method_mask);
+	set_bit(64, &method_mask);
+
+	// equal to this with the new register
+	//reg_attr.method_mask[0] = 0x8000000000000002ULL;
+	//reg_attr.method_mask[1] = 0x0000000000000001ULL;
+
+	agent_id = umad_register(fd, UNLIKELY_MGMT_CLASS, 0x1, 0x00, method_mask);
+	if (agent_id < 0) {
+		printf("\n umad_register Failure, agent_id %d\n", agent_id);
+		printf("\n umad_register(fd, 0x2F, 0x1, 0x00, method_mask);\n");
+		test_failures++;
+	} else {
+		printf(" PASS\n");
+		umad_unregister(fd, agent_id);
+	}
+
+	printf("\n old register_oui test ... ");
+
+	oui[0] = (class_oui >> 16) & 0xff;
+	oui[1] = (class_oui >> 8) & 0xff;
+	oui[2] = class_oui & 0xff;
+
+	agent_id = umad_register_oui(fd, UNLIKELY_RMPP_MGMT_CLASS, 0x1, oui, method_mask);
+	if (agent_id < 0) {
+		printf("\n umad_register_oui Failure, agent_id %d\n", agent_id);
+		printf("\n umad_register_oui(fd, 0x4F, 0x1, oui, method_mask);\n");
+		test_failures++;
+	} else {
+		printf(" PASS\n");
+		umad_unregister(fd, agent_id);
+	}
+
+	umad_close_port(fd);
+}
+
+
+static void test_fall_back(void)
+{
+	int rc = 0;
+	struct umad_reg_attr reg_attr;
+	uint32_t agent_id;
+	int fd;
+
+	fd = open_test_device();
+
+	memset(&reg_attr, 0, sizeof(reg_attr));
+	reg_attr.mgmt_class = UNLIKELY_MGMT_CLASS;
+	reg_attr.mgmt_class_version = 0x1;
+	reg_attr.oui = 0x001405; /* OPENIB_OUI */
+
+	//reg_attr.method_mask[0] = 0x8000000000000002ULL;
+	//reg_attr.method_mask[1] = 0x0000000000000001ULL;
+
+	set_bit64( 1, (uint64_t *)&reg_attr.method_mask);
+	set_bit64(63, (uint64_t *)&reg_attr.method_mask);
+	set_bit64(64, (uint64_t *)&reg_attr.method_mask);
+
+	printf("\n umad_register2 fall back (set_bit) ... ");
+	rc = umad_register2(fd, &reg_attr, &agent_id);
+	if (rc != 0) {
+		printf("\n umad_register2 failed to fall back. 
rc = %d\n", rc); + dump_reg_attr(®_attr); + test_failures++; + } else { + printf(" PASS\n"); + umad_unregister(fd, agent_id); + } + + reg_attr.method_mask[0] = 0x8000000000000002ULL; + reg_attr.method_mask[1] = 0x0000000000000001ULL; + + printf("\n umad_register2 fall back ... "); + rc = umad_register2(fd, ®_attr, &agent_id); + if (rc != 0) { + printf("\n umad_register2 failed to fall back. rc = %d\n", rc); + dump_reg_attr(®_attr); + test_failures++; + } else { + printf(" PASS\n"); + umad_unregister(fd, agent_id); + } + + umad_close_port(fd); + +} + +int main(int argc, char *argv[]) +{ + //umad_debug(1); + + printf("\n *****\nStart compatibility tests\n"); + + test_register(); + test_fall_back(); + printf("\n *******************\n"); + printf(" umad_reg2_compat had %d failures\n", test_failures); + printf(" *******************\n"); + return test_failures; +} diff --git a/libibumad/tests/umad_register2.c b/libibumad/tests/umad_register2.c new file mode 100644 index 0000000..477bd29 --- /dev/null +++ b/libibumad/tests/umad_register2.c @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2014 Intel Corporation, All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <config.h> + +#include <string.h> +#include <stdio.h> +#include <inttypes.h> +#include <errno.h> +#include <sys/ioctl.h> + +#include <infiniband/umad.h> + +#define UNLIKELY_MGMT_CLASS 0x2F +#define UNLIKELY_RMPP_MGMT_CLASS 0x4F + +struct ib_user_mad_reg_req2 { + uint32_t id; + uint32_t qpn; + uint8_t mgmt_class; + uint8_t mgmt_class_version; + uint16_t res; + uint32_t flags; + uint64_t method_mask[2]; + uint32_t oui; + uint8_t rmpp_version; + uint8_t reserved[3]; +}; + +static int test_failures = 0; + +static void dump_reg_attr(struct umad_reg_attr *reg_attr) +{ + printf("\nmgmt_class %u\n" + "mgmt_class_version %u\n" + "flags 0x%08x\n" + "method_mask 0x%016"PRIx64" %016"PRIx64"\n" + "oui 0x%06x\n" + "rmpp_version %u\n\n", + reg_attr->mgmt_class, + reg_attr->mgmt_class_version, + reg_attr->flags, + reg_attr->method_mask[1], reg_attr->method_mask[0], + reg_attr->oui, + reg_attr->rmpp_version); +} + +static int open_test_device(void) +{ + int fd = umad_open_port(NULL, 0); + if (fd < 0) { + printf("\n *****\nOpen Port Failure... 
Aborting\n"); + printf(" Ensure you have an HCA to test against.\n"); + exit(0); + } + return fd; +} + +static void test_fail(void) +{ + int rc = 0; + struct umad_reg_attr reg_attr; + uint32_t agent_id; + uint32_t agent_id2; + int fd; + + printf("\n *****\nBegin invalid tests\n"); + + fd = open_test_device(); + + memset(®_attr, 0, sizeof(reg_attr)); + reg_attr.mgmt_class = UNLIKELY_MGMT_CLASS; + reg_attr.mgmt_class_version = 0x1; + reg_attr.flags = 0x80000000; + printf("\n invalid register flags ... "); + rc = umad_register2(fd, ®_attr, &agent_id); + if (rc == 0) { + printf("\n umad_register2 registered invalid flags. rc = %d\n", + rc); + dump_reg_attr(®_attr); + test_failures++; + goto out; + } else { + printf(" PASS\n"); + umad_unregister(fd, agent_id); + } + + memset(®_attr, 0, sizeof(reg_attr)); + reg_attr.mgmt_class = 0x03; + reg_attr.mgmt_class_version = 0x2; + reg_attr.rmpp_version = 0x02; + printf("\n invalid rmpp_version ... "); + rc = umad_register2(fd, ®_attr, &agent_id); + if (rc == 0) { + printf("\n umad_register2 registered an invalid rmpp_version. rc = %d\n", + rc); + dump_reg_attr(®_attr); + test_failures++; + goto out; + } else { + printf(" PASS\n"); + umad_unregister(fd, agent_id); + } + + memset(®_attr, 0, sizeof(reg_attr)); + reg_attr.mgmt_class = UNLIKELY_RMPP_MGMT_CLASS; + reg_attr.oui = 0x0100066a; + printf("\n invalid oui ... "); + rc = umad_register2(fd, ®_attr, &agent_id); + if (rc == 0) { + printf("\n umad_register2 registered an invalid oui. rc = %d\n", + rc); + dump_reg_attr(®_attr); + test_failures++; + goto out; + } else { + printf(" PASS\n"); + umad_unregister(fd, agent_id); + } + + /* The following 2 registrations attempt to register the same OUI 2 + * times. The second one is supposed to fail with the same method + * mask. + */ + printf("\n duplicate oui ... "); + memset(®_attr, 0, sizeof(reg_attr)); + reg_attr.mgmt_class = UNLIKELY_RMPP_MGMT_CLASS; + reg_attr.mgmt_class_version = 0x1; + reg_attr.rmpp_version = 0x00; + reg_attr.oui = 0x00066a; + reg_attr.method_mask[0] = 0x80000000000000DEULL; + reg_attr.method_mask[1] = 0xAD00000000000001ULL; + rc = umad_register2(fd, ®_attr, &agent_id); + if (rc != 0) { + printf("\n umad_register2 Failed to register an oui for the duplicate test. rc = %d\n", + rc); + dump_reg_attr(®_attr); + test_failures++; + goto out; + } + + memset(®_attr, 0, sizeof(reg_attr)); + reg_attr.mgmt_class = UNLIKELY_RMPP_MGMT_CLASS; + reg_attr.mgmt_class_version = 0x1; + reg_attr.rmpp_version = 0x00; + reg_attr.oui = 0x00066a; + reg_attr.method_mask[0] = 0x80000000000000DEULL; + reg_attr.method_mask[1] = 0xAD00000000000001ULL; + rc = umad_register2(fd, ®_attr, &agent_id2); + if (rc == 0) { + printf("\n umad_register2 registered a duplicate oui. rc = %d\n", + rc); + dump_reg_attr(®_attr); + test_failures++; + goto out; + } else { + printf(" PASS\n"); + umad_unregister(fd, agent_id); + umad_unregister(fd, agent_id2); + } + + umad_close_port(fd); +out: + printf("\n *****\nEnd invalid tests\n"); +} + +static void test_oui(void) +{ + int rc = 0; + struct umad_reg_attr reg_attr; + uint32_t agent_id; + int fd; + + printf("\n *****\nStart valid oui tests\n"); + + fd = open_test_device(); + + printf("\n valid oui ... 
"); + memset(®_attr, 0, sizeof(reg_attr)); + reg_attr.mgmt_class = UNLIKELY_RMPP_MGMT_CLASS; + reg_attr.mgmt_class_version = 0x1; + reg_attr.rmpp_version = 0x00; + reg_attr.oui = 0x00066a; + reg_attr.method_mask[0] = 0x80000000000000DEULL; + reg_attr.method_mask[1] = 0xAD00000000000001ULL; + rc = umad_register2(fd, ®_attr, &agent_id); + if (rc != 0) { + printf("\n umad_register2 failed oui 0x%x. rc = %d\n", + reg_attr.oui, rc); + dump_reg_attr(®_attr); + test_failures++; + goto out; + } else { + printf(" PASS\n"); + umad_unregister(fd, agent_id); + } + + printf("\n valid oui with flags ... "); + memset(®_attr, 0, sizeof(reg_attr)); + reg_attr.mgmt_class = UNLIKELY_RMPP_MGMT_CLASS; + reg_attr.mgmt_class_version = 0x1; + reg_attr.rmpp_version = 0x00; + reg_attr.flags = 0x01; + /* Use Intel OUI for testing */ + reg_attr.oui = 0x00066a; + rc = umad_register2(fd, ®_attr, &agent_id); + if (rc != 0) { + printf("\n umad_register2 failed oui 0x%x with flags 0x%x. rc = %d\n", + reg_attr.oui, reg_attr.flags, rc); + dump_reg_attr(®_attr); + test_failures++; + goto out; + } else { + printf(" PASS\n"); + umad_unregister(fd, agent_id); + } + + umad_close_port(fd); + +out: + printf("\n End valid oui tests\n *****\n"); +} + +static void check_register2_support(void) +{ + struct ib_user_mad_reg_req2 req; + int fd; + + fd = open_test_device(); + + memset(&req, 0, sizeof(req)); + req.mgmt_class = UNLIKELY_MGMT_CLASS; + req.mgmt_class_version = 0x1; + req.qpn = 0x1; + + if (ioctl(fd, IB_USER_MAD_REGISTER_AGENT2, (void *)&req) != 0) { + if (errno == ENOTTY || errno == EINVAL) { + printf("\n *****\nKernel does not support the new ioctl. Aborting tests\n"); + exit(0); + } + } + + umad_close_port(fd); +} + +int main(int argc, char *argv[]) +{ + //umad_debug(1); + check_register2_support(); + test_fail(); + test_oui(); + printf("\n *******************\n"); + printf(" umad_register2 had %d failures\n", test_failures); + printf(" *******************\n"); + return test_failures; +} diff --git a/libibumad/tests/umad_sa_mcm_rereg_test.c b/libibumad/tests/umad_sa_mcm_rereg_test.c new file mode 100644 index 0000000..4330d9f --- /dev/null +++ b/libibumad/tests/umad_sa_mcm_rereg_test.c @@ -0,0 +1,583 @@ +/* + * Copyright (c) 2017 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2006-2009 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <unistd.h>
+#include <arpa/inet.h>
+
+#include <infiniband/umad.h>
+#include <infiniband/umad_sa_mcm.h>
+
+#define info(fmt, ...) fprintf(stderr, "INFO: " fmt, ## __VA_ARGS__)
+#define err(fmt, ...) fprintf(stderr, "ERR: " fmt, ## __VA_ARGS__)
+#ifdef NOISY_DEBUG
+#define dbg(fmt, ...) fprintf(stderr, "DBG: " fmt, ## __VA_ARGS__)
+#else
+#define dbg(fmt, ...) {}
+#endif
+
+#define DEFAULT_TIMEOUT 100	/* milliseconds */
+#define MAX_PORT_GUIDS 64
+
+/* Use null MGID to request SA assigned MGID */
+static const uint8_t null_mgid[16] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static int create, join, leave;
+static uint8_t rate = 0xff, mtu = 0xff, sl = 0xff;
+static umad_port_t umad_port;
+
+struct guid_trid {
+	uint8_t gid[16];
+	__be64 guid;
+	uint64_t trid[2];
+};
+
+static void build_user_mad_addr(uint8_t *umad)
+{
+	umad_set_addr(umad, umad_port.sm_lid, 1, umad_port.sm_sl, UMAD_QKEY);
+
+	/*
+	 * The following 2 umad calls are redundant
+	 * as the umad buffer was originally cleared to zero by calloc()
+	 */
+	umad_set_grh(umad, NULL);
+	umad_set_pkey(umad, 0);	/* just pkey index 0 for now !!! */
+}
+
+static void build_mcm_rec(struct umad_sa_packet *sa, uint8_t method,
+			  const uint8_t mgid[], const uint8_t port_gid[],
+			  uint64_t tid, int creat)
+{
+	struct umad_sa_mcmember_record *mcm;
+
+	memset(sa, 0, sizeof(*sa));
+
+	sa->mad_hdr.base_version = UMAD_BASE_VERSION;
+	sa->mad_hdr.mgmt_class = UMAD_CLASS_SUBN_ADM;
+	sa->mad_hdr.class_version = UMAD_SA_CLASS_VERSION;
+	sa->mad_hdr.method = method;
+	sa->mad_hdr.tid = htobe64(tid);
+	sa->mad_hdr.attr_id = htons(UMAD_SA_ATTR_MCMEMBER_REC);
+	if (creat)
+		sa->comp_mask = htobe64(UMAD_SA_MCM_COMP_MASK_MGID |
+					UMAD_SA_MCM_COMP_MASK_PORT_GID |
+					UMAD_SA_MCM_COMP_MASK_QKEY |
+					UMAD_SA_MCM_COMP_MASK_TCLASS |
+					UMAD_SA_MCM_COMP_MASK_PKEY |
+					UMAD_SA_MCM_COMP_MASK_SL |
+					UMAD_SA_MCM_COMP_MASK_FLOW_LABEL |
+					UMAD_SA_MCM_COMP_MASK_JOIN_STATE);
+	else
+		sa->comp_mask = htobe64(UMAD_SA_MCM_COMP_MASK_MGID |
+					UMAD_SA_MCM_COMP_MASK_PORT_GID |
+					UMAD_SA_MCM_COMP_MASK_JOIN_STATE);
+
+	mcm = (struct umad_sa_mcmember_record *) sa->data;
+	memcpy(mcm->mgid, mgid, sizeof(mcm->mgid));
+	memcpy(mcm->portgid, port_gid, sizeof(mcm->portgid));
+	umad_sa_mcm_set_join_state(mcm, UMAD_SA_MCM_JOIN_STATE_FULL_MEMBER);
+	if (creat) {
+		mcm->qkey = htonl(0xb1b);
+		/* assume full default partition (in index 0) */
+		mcm->pkey = htons(0xffff);
+		if (rate != 0xff) {
+			sa->comp_mask |=
+			    htobe64(UMAD_SA_MCM_COMP_MASK_RATE_SEL |
+				    UMAD_SA_MCM_COMP_MASK_RATE);
+			mcm->rate = (UMAD_SA_SELECTOR_EXACTLY <<
+				     UMAD_SA_SELECTOR_SHIFT) |
+				    (rate & UMAD_SA_RATE_MTU_PKT_LIFE_MASK);
+		}
+		if (mtu != 0xff) {
+			sa->comp_mask |= htobe64(UMAD_SA_MCM_COMP_MASK_MTU_SEL |
+						 UMAD_SA_MCM_COMP_MASK_MTU);
+			mcm->mtu = (UMAD_SA_SELECTOR_EXACTLY <<
+				    UMAD_SA_SELECTOR_SHIFT) |
+				   (mtu & UMAD_SA_RATE_MTU_PKT_LIFE_MASK);
+		}
+		if (sl != 0xff) {
+			sa->comp_mask |= htobe64(UMAD_SA_MCM_COMP_MASK_SL);
+			mcm->sl_flow_hop =
+			    umad_sa_mcm_set_sl_flow_hop(sl, 0, 0);
+		}
+	}
+}
+
+static int mcm_send(int portid, int agentid, uint8_t *umad, int len, int tmo,
+		    uint8_t method, const uint8_t mgid[],
+		    struct guid_trid *entry, int 
creat) +{ + struct umad_sa_packet *sa = umad_get_mad(umad); + + build_mcm_rec(sa, method, mgid, entry->gid, entry->trid[0], creat); + if (umad_send(portid, agentid, umad, len, tmo, 0) < 0) { + err("umad_send %s failed: %s\n", + (method == UMAD_METHOD_GET) ? "query" : "non query", + strerror(errno)); + return -1; + } + dbg("umad_send %d: tid = 0x%" PRIx64 "\n", method, + be64toh(sa->mad_hdr.tid)); + + return 0; +} + +static int rereg_port_gid(int portid, int agentid, + uint8_t *umad, int len, int tmo, + const uint8_t mgid[], struct guid_trid *entry) +{ + struct umad_sa_packet *sa = umad_get_mad(umad); + + build_mcm_rec(sa, UMAD_SA_METHOD_DELETE, mgid, + entry->gid, entry->trid[0], 0); + if (umad_send(portid, agentid, umad, len, tmo, 0) < 0) { + err("umad_send leave failed: %s\n", strerror(errno)); + return -1; + } + dbg("umad_send leave: tid = 0x%" PRIx64 "\n", be64toh(sa->mad_hdr.tid)); + entry->trid[0] = be64toh(sa->mad_hdr.tid); /* for agent ID */ + + sa->mad_hdr.method = UMAD_METHOD_SET; + sa->mad_hdr.tid = htobe64(entry->trid[1]); + if (umad_send(portid, agentid, umad, len, tmo, 0) < 0) { + err("umad_send join failed: %s\n", strerror(errno)); + return -1; + } + dbg("umad_send join: tid = 0x%" PRIx64 "\n", be64toh(sa->mad_hdr.tid)); + entry->trid[1] = be64toh(sa->mad_hdr.tid); /* for agent ID */ + + return 0; +} + +static int rereg_send_all(int portid, int agentid, int tmo, + const uint8_t mgid[], struct guid_trid *list, + unsigned int cnt) +{ + uint8_t *umad; + int len = sizeof(struct umad_hdr) + UMAD_LEN_DATA; + unsigned int i, sent = 0; + int ret; + + info("%s... cnt = %u\n", __func__, cnt); + + umad = calloc(1, len + umad_size()); + if (!umad) { + err("cannot alloc mem for umad: %s\n", strerror(errno)); + return -1; + } + build_user_mad_addr(umad); + + for (i = 0; i < cnt; i++) { + ret = rereg_port_gid(portid, agentid, umad, len, tmo, + mgid, &list[i]); + if (ret < 0) { + err("%s: rereg_port_gid guid 0x%016" PRIx64 + " failed\n", __func__, be64toh(list[i].guid)); + continue; + } + sent++; + } + + info("%s: sent %u of %u requests\n", __func__, sent * 2, cnt * 2); + + free(umad); + + return 0; +} + +static int mcm_recv(int portid, uint8_t *umad, int length, int tmo) +{ + int ret, retry = 0; + int len = length; +#ifdef NOISY_DEBUG + struct umad_hdr *mad; +#endif + + while ((ret = umad_recv(portid, umad, &len, tmo)) < 0 && + errno == ETIMEDOUT) { + if (retry++ > 3) + return 0; + } + if (ret < 0) { + err("umad_recv %d failed: %s\n", ret, strerror(errno)); + return -1; + } + +#ifdef NOISY_DEBUG + mad = umad_get_mad(umad); +#endif + dbg("umad_recv (retries %d), tid = 0x%" PRIx64 + ": len = %d, status = %d\n", retry, + be64toh(mad->tid), len, umad_status(umad)); + + return 1; +} + +static int rereg_recv_all(int portid, int agentid, int tmo, + const uint8_t mgid[], struct guid_trid *list, + unsigned int cnt) +{ + uint8_t *umad; + struct umad_hdr *mad; + int len = sizeof(struct umad_hdr) + UMAD_LEN_DATA; + uint64_t trid; + unsigned int n, i, j; + uint16_t status; + uint8_t method; + + info("%s...\n", __func__); + + umad = calloc(1, len + umad_size()); + if (!umad) { + err("cannot alloc mem for umad: %s\n", strerror(errno)); + return -1; + } + mad = umad_get_mad(umad); + + n = 0; + while (mcm_recv(portid, umad, len, tmo) > 0) { + dbg("%s: done %d\n", __func__, n); + n++; + + method = mad->method; + status = ntohs(mad->status); + trid = be64toh(mad->tid); + + if (status) + dbg("MAD status 0x%x, method 0x%x\n", status, method); + + if (status && + (method == UMAD_METHOD_GET_RESP || + method == 
UMAD_SA_METHOD_DELETE_RESP)) {
+			for (i = 0; i < cnt; i++) {
+				for (j = 0; j < 2; j++)
+					if (trid == list[i].trid[j])
+						break;
+				if (j < 2)
+					break;
+			}
+			if (i == cnt) {
+				err("cannot find trid 0x%" PRIx64
+				    ", status 0x%x, method 0x%x\n",
+				    trid, status, method);
+				continue;
+			}
+			info("guid 0x%016" PRIx64
+			     ": status 0x%x, method 0x%x. Retrying\n",
+			     be64toh(list[i].guid), status, method);
+			rereg_port_gid(portid, agentid, umad, len, tmo,
+				       mgid, &list[i]);
+		}
+	}
+
+	info("%s: got %u responses\n", __func__, n);
+
+	free(umad);
+	return 0;
+}
+
+static int query_all(int portid, int agentid, int tmo, uint8_t method,
+		     const uint8_t mgid[], struct guid_trid *list,
+		     int creat, unsigned int cnt)
+{
+	uint8_t *umad;
+	struct umad_hdr *mad;
+	int len = sizeof(struct umad_hdr) + UMAD_LEN_DATA;
+	unsigned int i, sent = 0;
+	int ret;
+	uint16_t status;
+	uint8_t mcgid[16];
+
+	info("%s...\n", __func__);
+
+	memcpy(mcgid, mgid, 16);
+
+	umad = calloc(1, len + umad_size());
+	if (!umad) {
+		err("cannot alloc mem for umad: %s\n", strerror(errno));
+		return -1;
+	}
+	build_user_mad_addr(umad);
+	mad = umad_get_mad(umad);
+
+	for (i = 0; i < cnt; i++) {
+		ret = mcm_send(portid, agentid, umad, len, tmo,
+			       method, mcgid, &list[i], creat);
+		if (ret < 0) {
+			err("%s: mcm_send failed\n", __func__);
+			continue;
+		}
+		sent++;
+
+		ret = mcm_recv(portid, umad, len, tmo);
+		if (ret < 0) {
+			err("%s: mcm_recv failed\n", __func__);
+			continue;
+		}
+
+		status = ntohs(mad->status);
+		if (status)
+			info(
+			    "guid 0x%016" PRIx64 ": status 0x%x, method 0x%x\n",
+			    be64toh(list[i].guid), status, mad->method);
+		else if (creat && i == 0) {
+			if (memcmp(mgid, null_mgid, 16) == 0) {
+				struct umad_sa_packet *sa = (void *) mad;
+				struct umad_sa_mcmember_record *mcm;
+
+				mcm = (struct umad_sa_mcmember_record *)
+				      sa->data;
+				memcpy(mcgid, mcm->mgid, 16);
+			}
+		}
+	}
+
+	info("%s: %u of %u queried\n", __func__, sent, cnt);
+
+	free(umad);
+	return 0;
+}
+
+static int test_port(const char *guid_file, int portid, int agentid, int tmo,
+		     const uint8_t mgid[])
+{
+	char line[256];
+	FILE *f;
+	uint8_t port_gid[16];
+	uint64_t guidho;
+	__be64 prefix, guid;
+	uint64_t trid;
+	struct guid_trid *list;
+	int i = 0, j;
+
+	list = calloc(MAX_PORT_GUIDS, sizeof(*list));
+	if (!list) {
+		err("cannot alloc mem for guid/trid list: %s\n",
+		    strerror(errno));
+		return -1;
+	}
+
+	f = fopen(guid_file, "r");
+	if (!f) {
+		err("cannot open %s: %s\n", guid_file, strerror(errno));
+		free(list);
+		return -1;
+	}
+
+	trid = 0x12345678;	/* starting tid */
+	prefix = umad_port.gid_prefix;
+
+	while (fgets(line, sizeof(line), f)) {
+		guidho = strtoull(line, NULL, 0);
+		guid = htobe64(guidho);
+		memcpy(&port_gid[0], &prefix, 8);
+		memcpy(&port_gid[8], &guid, 8);
+
+		list[i].guid = guid;
+		memcpy(list[i].gid, port_gid, sizeof(list[i].gid));
+		for (j = 0; j < 2; j++)
+			list[i].trid[j] = trid++;
+
+		if (++i >= MAX_PORT_GUIDS)
+			break;
+	}
+	fclose(f);
+
+	if (create)
+		query_all(portid, agentid, tmo, UMAD_METHOD_SET,
+			  mgid, list, 1, i);
+	else if (join)
+		query_all(portid, agentid, tmo, UMAD_METHOD_SET,
+			  mgid, list, 0, i);
+	else if (leave)
+		query_all(portid, agentid, tmo, UMAD_SA_METHOD_DELETE,
+			  mgid, list, 0, i);
+	else {
+		/* no operation specified - default to rereg */
+		rereg_send_all(portid, agentid, tmo, mgid, list, i);
+		rereg_recv_all(portid, agentid, tmo, mgid, list, i);
+
+		query_all(portid, agentid, tmo, UMAD_METHOD_GET,
+			  mgid, list, 0, i);
+	}
+
+	free(list);
+	return 0;
+}
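+
+/*
+ * The guid file parsed by test_port() above is expected to hold one port
+ * GUID per line, parsed with strtoull() base 0 (so 0x-prefixed hex, octal,
+ * or decimal); the GUID values below are illustrative only:
+ *
+ *	0x0002c90300a178f1
+ *	0x0002c90300a178f2
+ */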
+static void show_usage(const char *prog_name)
+{
+	fprintf(stderr,
+		"%s [-C <ca_name>] [-P <ca_port>] [-F <port_guid_file>] [-t <timeout_ms>] [-g <mgid>] [-c] [-j] [-l] [-r <rate>] [-m <mtu>] [-s <sl>] [-h]\n",
+		prog_name);
+	fprintf(stderr,
+		"    -C <ca_name>         use the specified ca_name\n");
+	fprintf(stderr,
+		"    -P <ca_port>         use the specified ca_port\n");
+	fprintf(stderr,
+		"    -F <port_guid_file>  use the specified port_guid_file\n");
+	fprintf(stderr,
+		"                         defaults to port_guids.list\n");
+	fprintf(stderr,
+		"    -t <timeout_ms>      override the default timeout of 100 milliseconds\n");
+	fprintf(stderr,
+		"    -g <mgid>            MGID of MC group in IPv6 format\n");
+	fprintf(stderr,
+		"                         defaults to IPv4 broadcast group if not specified\n");
+	fprintf(stderr,
+		"                         To create SA assigned group, use either :: or 0:0:0:0:0:0:0:0\n");
+	fprintf(stderr, "    -c                   create MC group with ports\n");
+	fprintf(stderr, "    -j                   join ports to MC group\n");
+	fprintf(stderr,
+		"    -l                   remove ports from MC group (leave)\n");
+	fprintf(stderr,
+		"                         operation defaults to reregister ports if none of -c, -j, -l is specified\n\n");
+	fprintf(stderr,
+		"    -r <rate>            Encoded rate value (for create)\n");
+	fprintf(stderr,
+		"    -m <mtu>             Encoded mtu value (for create)\n");
+	fprintf(stderr, "    -s <sl>              SL (for create)\n");
+	fprintf(stderr, "    -h                   show this usage message\n");
+}
+
+int main(int argc, char **argv)
+{
+	char *ibd_ca = NULL;
+	int ibd_ca_port = 0;
+	const char *guid_file = "port_guids.list";
+	int tmo = DEFAULT_TIMEOUT;
+	int c, portid, agentid;
+	const char *prog_name;
+	const char *const optstring = "F:C:P:t:g:cjlr:m:s:h";
+	/* IPoIB broadcast group (for full default pkey) */
+	uint8_t mgid[16] = {
+		0xff, 0x12, 0x40, 0x1b, 0xff, 0xff, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff
+	};
+
+	prog_name = argv[0];
+	while ((c = getopt(argc, argv, optstring)) != -1) {
+		switch (c) {
+		case 'C':
+			ibd_ca = optarg;
+			break;
+		case 'P':
+			ibd_ca_port = strtoul(optarg, NULL, 0);
+			break;
+		case 'F':
+			guid_file = optarg;
+			break;
+		case 't':
+			tmo = atoi(optarg);
+			break;
+		case 'g':
+			if (inet_pton(AF_INET6, optarg, &mgid) <= 0) {
+				fprintf(stderr, "mgid could not be parsed\n");
+				exit(EXIT_FAILURE);
+			}
+			break;
+		case 'c':
+			create = 1;
+			break;
+		case 'j':
+			join = 1;
+			break;
+		case 'l':
+			leave = 1;
+			break;
+		case 'r':
+			rate = atoi(optarg);
+			break;
+		case 'm':
+			mtu = atoi(optarg);
+			break;
+		case 's':
+			sl = atoi(optarg);
+			break;
+		case 'h':
+			show_usage(prog_name);
+			exit(EXIT_SUCCESS);
+			break;
+		default:
+			fprintf(stderr, "Unrecognized option: -%c\n", optopt);
+			show_usage(prog_name);
+			exit(EXIT_FAILURE);
+			break;
+		}
+	}
+
+	if (umad_get_port(ibd_ca, ibd_ca_port, &umad_port) < 0) {
+		if (ibd_ca == NULL)
+			err(
+			    "umad_get_port failed for first IB CA port %d: %s\n",
+			    ibd_ca_port, strerror(errno));
+		else
+			err("umad_get_port failed for CA %s port %d: %s\n",
+			    ibd_ca, ibd_ca_port, strerror(errno));
+		umad_done();
+		return -1;
+	}
+	info("using %s port %d guid 0x%016" PRIx64 "\n",
+	     umad_port.ca_name, umad_port.portnum,
+	     be64toh(umad_port.port_guid));
+
+	portid = umad_open_port(umad_port.ca_name, umad_port.portnum);
+	if (portid < 0) {
+		err("umad_open_port failed: %s\n", strerror(errno));
+		umad_release_port(&umad_port);
+		umad_done();
+		return -1;
+	}
+
+	agentid = umad_register(portid, UMAD_CLASS_SUBN_ADM,
+				UMAD_SA_CLASS_VERSION, 0, NULL);
+	if (agentid < 0) {
+		err("umad_register failed: %s\n", strerror(errno));
+		umad_release_port(&umad_port);
+		umad_close_port(portid);
+		umad_done();
+		return -1;
+	}
+
+	test_port(guid_file, portid, agentid, tmo, mgid);
+
+	umad_release_port(&umad_port);
+	
umad_unregister(portid, agentid); + umad_close_port(portid); + umad_done(); + + return 0; +} diff --git a/libibumad/umad.c b/libibumad/umad.c new file mode 100644 index 0000000..fc242a6 --- /dev/null +++ b/libibumad/umad.c @@ -0,0 +1,1276 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <config.h> + +#include <sys/poll.h> +#include <unistd.h> +#include <string.h> +#include <stdio.h> +#include <errno.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <sys/ioctl.h> +#include <dirent.h> +#include <ctype.h> +#include <inttypes.h> +#include <util/compiler.h> + +#include <infiniband/umad.h> + +#define IB_OPENIB_OUI (0x001405) + +#include <valgrind/memcheck.h> +#include "sysfs.h" + +typedef struct ib_user_mad_reg_req { + uint32_t id; + uint32_t method_mask[4]; + uint8_t qpn; + uint8_t mgmt_class; + uint8_t mgmt_class_version; + uint8_t oui[3]; + uint8_t rmpp_version; +} ib_user_mad_reg_req_t; + +struct ib_user_mad_reg_req2 { + uint32_t id; + uint32_t qpn; + uint8_t mgmt_class; + uint8_t mgmt_class_version; + uint16_t res; + uint32_t flags; + uint64_t method_mask[2]; + uint32_t oui; + uint8_t rmpp_version; + uint8_t reserved[3]; +}; + +#define IBWARN(fmt, args...) 
fprintf(stderr, "ibwarn: [%d] %s: " fmt "\n", getpid(), __func__, ## args) + +#define TRACE if (umaddebug) IBWARN +#define DEBUG if (umaddebug) IBWARN + +static int umaddebug = 0; + +#define UMAD_DEV_FILE_SZ 256 + +static const char *def_ca_name = "mthca0"; +static int def_ca_port = 1; + +static unsigned new_user_mad_api; + +static unsigned int get_abi_version(void) +{ + static unsigned int abi_version; + + if (abi_version != 0) + return abi_version & 0x7FFFFFFF; + + if (sys_read_uint(IB_UMAD_ABI_DIR, IB_UMAD_ABI_FILE, &abi_version) < + 0) { + IBWARN("can't read ABI version from %s/%s (%m): is ib_umad module loaded?", + IB_UMAD_ABI_DIR, IB_UMAD_ABI_FILE); + abi_version = (1U) << 31; + return 0; + } + + if (abi_version < IB_UMAD_ABI_VERSION) { + abi_version = (1U) << 31; + return 0; + } + + return abi_version; +} + +/************************************* + * Port + */ +static int find_cached_ca(const char *ca_name, umad_ca_t * ca) +{ + return 0; /* caching not implemented yet */ +} + +static int put_ca(umad_ca_t * ca) +{ + return 0; /* caching not implemented yet */ +} + +static int release_port(umad_port_t * port) +{ + free(port->pkeys); + port->pkeys = NULL; + port->pkeys_size = 0; + return 0; +} + +static int check_for_digit_name(const struct dirent *dent) +{ + const char *p = dent->d_name; + while (*p && isdigit(*p)) + p++; + return *p ? 0 : 1; +} + +static int get_port(const char *ca_name, const char *dir, int portnum, umad_port_t * port) +{ + char port_dir[256]; + union umad_gid gid; + struct dirent **namelist = NULL; + int i, len, num_pkeys = 0; + uint32_t capmask; + + strncpy(port->ca_name, ca_name, sizeof port->ca_name - 1); + port->portnum = portnum; + port->pkeys = NULL; + + len = snprintf(port_dir, sizeof(port_dir), "%s/%d", dir, portnum); + if (len < 0 || len > sizeof(port_dir)) + goto clean; + + if (sys_read_uint(port_dir, SYS_PORT_LMC, &port->lmc) < 0) + goto clean; + if (sys_read_uint(port_dir, SYS_PORT_SMLID, &port->sm_lid) < 0) + goto clean; + if (sys_read_uint(port_dir, SYS_PORT_SMSL, &port->sm_sl) < 0) + goto clean; + if (sys_read_uint(port_dir, SYS_PORT_LID, &port->base_lid) < 0) + goto clean; + if (sys_read_uint(port_dir, SYS_PORT_STATE, &port->state) < 0) + goto clean; + if (sys_read_uint(port_dir, SYS_PORT_PHY_STATE, &port->phys_state) < 0) + goto clean; + sys_read_uint(port_dir, SYS_PORT_RATE, &port->rate); + if (sys_read_uint(port_dir, SYS_PORT_CAPMASK, &capmask) < 0) + goto clean; + + if (sys_read_string(port_dir, SYS_PORT_LINK_LAYER, + port->link_layer, UMAD_CA_NAME_LEN) < 0) + /* assume IB by default */ + sprintf(port->link_layer, "IB"); + + port->capmask = htobe32(capmask); + + if (sys_read_gid(port_dir, SYS_PORT_GID, &gid) < 0) + goto clean; + + port->gid_prefix = gid.global.subnet_prefix; + port->port_guid = gid.global.interface_id; + + snprintf(port_dir + len, sizeof(port_dir) - len, "/pkeys"); + num_pkeys = scandir(port_dir, &namelist, check_for_digit_name, NULL); + if (num_pkeys <= 0) { + IBWARN("no pkeys found for %s:%u (at dir %s)...", + port->ca_name, port->portnum, port_dir); + goto clean; + } + port->pkeys = calloc(num_pkeys, sizeof(port->pkeys[0])); + if (!port->pkeys) { + IBWARN("get_port: calloc failed: %s", strerror(errno)); + goto clean; + } + for (i = 0; i < num_pkeys; i++) { + unsigned idx, val; + idx = strtoul(namelist[i]->d_name, NULL, 0); + sys_read_uint(port_dir, namelist[i]->d_name, &val); + port->pkeys[idx] = val; + free(namelist[i]); + } + port->pkeys_size = num_pkeys; + free(namelist); + namelist = NULL; + port_dir[len] = '\0'; + + /* 
FIXME: handle gids */ + + return 0; + +clean: + if (namelist) { + for (i = 0; i < num_pkeys; i++) + free(namelist[i]); + free(namelist); + } + if (port->pkeys) + free(port->pkeys); + return -EIO; +} + +static int release_ca(umad_ca_t * ca) +{ + int i; + + for (i = 0; i <= ca->numports; i++) { + if (!ca->ports[i]) + continue; + release_port(ca->ports[i]); + free(ca->ports[i]); + ca->ports[i] = NULL; + } + return 0; +} + +/* + * if *port > 0, check ca[port] state. Otherwise set *port to + * the first port that is active, and if such is not found, to + * the first port that is link up and if none are linkup, then + * the first port that is not disabled. Otherwise return -1. + */ +static int resolve_ca_port(const char *ca_name, int *port) +{ + umad_ca_t ca; + int active = -1, up = -1; + int i, ret = 0; + + TRACE("checking ca '%s'", ca_name); + + if (umad_get_ca(ca_name, &ca) < 0) + return -1; + + if (ca.node_type == 2) { + *port = 0; /* switch sma port 0 */ + ret = 1; + goto Exit; + } + + if (*port > 0) { /* check only the port the user wants */ + if (*port > ca.numports) { + ret = -1; + goto Exit; + } + if (!ca.ports[*port]) { + ret = -1; + goto Exit; + } + if (strcmp(ca.ports[*port]->link_layer, "InfiniBand") && + strcmp(ca.ports[*port]->link_layer, "IB")) { + ret = -1; + goto Exit; + } + if (ca.ports[*port]->state == 4) { + ret = 1; + goto Exit; + } + if (ca.ports[*port]->phys_state != 3) + goto Exit; + ret = -1; + goto Exit; + } + + for (i = 0; i <= ca.numports; i++) { + DEBUG("checking port %d", i); + if (!ca.ports[i]) + continue; + if (strcmp(ca.ports[i]->link_layer, "InfiniBand") && + strcmp(ca.ports[i]->link_layer, "IB")) + continue; + if (up < 0 && ca.ports[i]->phys_state == 5) + up = *port = i; + if (ca.ports[i]->state == 4) { + active = *port = i; + DEBUG("found active port %d", i); + break; + } + } + + if (active == -1 && up == -1) { /* no active or linkup port found */ + for (i = 0; i <= ca.numports; i++) { + DEBUG("checking port %d", i); + if (!ca.ports[i]) + continue; + if (ca.ports[i]->phys_state != 3) { + up = *port = i; + break; + } + } + } + + if (active >= 0) { + ret = 1; + goto Exit; + } + if (up >= 0) { + ret = 0; + goto Exit; + } + ret = -1; +Exit: + release_ca(&ca); + return ret; +} + +static int resolve_ca_name(const char *ca_in, int *best_port, + char **ca_name) +{ + struct umad_device_node *device_list; + struct umad_device_node *node; + struct umad_device_node *phys_found = NULL; + const char *name_found; + int port_found = 0, port, port_type; + + *ca_name = NULL; + if (ca_in && (!best_port || *best_port)) { + *ca_name = strdup(ca_in); + if (!(*ca_name)) + return -1; + return 0; + } + + if (ca_in) { + if (resolve_ca_port(ca_in, best_port) < 0) + return -1; + *ca_name = strdup(ca_in); + if (!(*ca_name)) + return -1; + return 0; + } + + /* Get the list of CA names */ + device_list = umad_get_ca_device_list(); + if (!device_list) + return -1; + + /* Find the first existing CA with an active port */ + for (node = device_list; node; node = node->next) { + name_found = node->ca_name; + + TRACE("checking ca '%s'", name_found); + + port = best_port ? 
*best_port : 0; + port_type = resolve_ca_port(name_found, &port); + if (port_type < 0) + continue; + + DEBUG("found ca %s with port %d type %d", + name_found, port, port_type); + + if (port_type > 0) { + if (best_port) + *best_port = port; + DEBUG("found ca %s with active port %d", + name_found, port); + *ca_name = strdup(name_found); + umad_free_ca_device_list(device_list); + if (!(*ca_name)) + return -1; + return 0; + } + + if (!phys_found) { + phys_found = node; + port_found = port; + } + } + + DEBUG("phys found on %s port %d", + phys_found ? phys_found->ca_name : NULL, + port_found); + + if (phys_found) { + name_found = phys_found->ca_name; + DEBUG("phys found on %s port %d", + phys_found ? name_found : NULL, + port_found); + if (best_port) + *best_port = port_found; + *ca_name = strdup(name_found); + umad_free_ca_device_list(device_list); + if (!(*ca_name)) + return -1; + return 0; + } + + umad_free_ca_device_list(device_list); + + if (best_port) + *best_port = def_ca_port; + + *ca_name = strdup(def_ca_name); + if (!(*ca_name)) + return -1; + return 0; +} + +static int get_ca(const char *ca_name, umad_ca_t * ca) +{ + DIR *dir; + char dir_name[256]; + struct dirent **namelist; + int r, i, ret; + int portnum; + + ca->numports = 0; + memset(ca->ports, 0, sizeof ca->ports); + strncpy(ca->ca_name, ca_name, sizeof(ca->ca_name) - 1); + + snprintf(dir_name, sizeof(dir_name), "%s/%s", SYS_INFINIBAND, + ca->ca_name); + + if ((r = sys_read_uint(dir_name, SYS_NODE_TYPE, &ca->node_type)) < 0) + return r; + if (sys_read_string(dir_name, SYS_CA_FW_VERS, ca->fw_ver, + sizeof ca->fw_ver) < 0) + ca->fw_ver[0] = '\0'; + if (sys_read_string(dir_name, SYS_CA_HW_VERS, ca->hw_ver, + sizeof ca->hw_ver) < 0) + ca->hw_ver[0] = '\0'; + if ((r = sys_read_string(dir_name, SYS_CA_TYPE, ca->ca_type, + sizeof ca->ca_type)) < 0) + ca->ca_type[0] = '\0'; + if ((r = sys_read_guid(dir_name, SYS_CA_NODE_GUID, &ca->node_guid)) < 0) + return r; + if ((r = + sys_read_guid(dir_name, SYS_CA_SYS_GUID, &ca->system_guid)) < 0) + return r; + + snprintf(dir_name, sizeof(dir_name), "%s/%s/%s", + SYS_INFINIBAND, ca->ca_name, SYS_CA_PORTS_DIR); + + if (!(dir = opendir(dir_name))) + return -ENOENT; + + if ((r = scandir(dir_name, &namelist, NULL, alphasort)) < 0) { + ret = errno < 0 ? 
errno : -EIO; + goto error; + } + + ret = 0; + for (i = 0; i < r; i++) { + portnum = 0; + if (!strcmp(".", namelist[i]->d_name) || + !strcmp("..", namelist[i]->d_name)) + continue; + if (strcmp("0", namelist[i]->d_name) && + ((portnum = atoi(namelist[i]->d_name)) <= 0 || + portnum >= UMAD_CA_MAX_PORTS)) { + ret = -EIO; + goto clean; + } + if (!(ca->ports[portnum] = + calloc(1, sizeof(*ca->ports[portnum])))) { + ret = -ENOMEM; + goto clean; + } + if (get_port(ca_name, dir_name, portnum, ca->ports[portnum]) < + 0) { + free(ca->ports[portnum]); + ca->ports[portnum] = NULL; + ret = -EIO; + goto clean; + } + if (ca->numports < portnum) + ca->numports = portnum; + } + + for (i = 0; i < r; i++) + free(namelist[i]); + free(namelist); + + closedir(dir); + put_ca(ca); + return 0; + +clean: + for (i = 0; i < r; i++) + free(namelist[i]); + free(namelist); +error: + closedir(dir); + release_ca(ca); + + return ret; +} + +static int umad_id_to_dev(int umad_id, char *dev, unsigned *port) +{ + char path[256]; + int r; + + snprintf(path, sizeof(path), SYS_INFINIBAND_MAD "/umad%d/", umad_id); + + if ((r = + sys_read_string(path, SYS_IB_MAD_DEV, dev, UMAD_CA_NAME_LEN)) < 0) + return r; + + if ((r = sys_read_uint(path, SYS_IB_MAD_PORT, port)) < 0) + return r; + + return 0; +} + +static int dev_to_umad_id(const char *dev, unsigned port) +{ + char umad_dev[UMAD_CA_NAME_LEN]; + unsigned umad_port; + int id; + + for (id = 0; id < UMAD_MAX_PORTS; id++) { + if (umad_id_to_dev(id, umad_dev, &umad_port) < 0) + continue; + if (strncmp(dev, umad_dev, UMAD_CA_NAME_LEN)) + continue; + if (port != umad_port) + continue; + + DEBUG("mapped %s %d to %d", dev, port, id); + return id; + } + + return -1; /* not found */ +} + +/******************************* + * Public interface + */ + +int umad_init(void) +{ + TRACE("umad_init"); + return 0; +} + +int umad_done(void) +{ + TRACE("umad_done"); + /* FIXME - verify that all ports are closed */ + return 0; +} + +static unsigned is_ib_type(const char *ca_name) +{ + char dir_name[256]; + unsigned type; + + snprintf(dir_name, sizeof(dir_name), "%s/%s", SYS_INFINIBAND, ca_name); + + if (sys_read_uint(dir_name, SYS_NODE_TYPE, &type) < 0) + return 0; + + return type >= 1 && type <= 3 ? 1 : 0; +} + +int umad_get_cas_names(char cas[][UMAD_CA_NAME_LEN], int max) +{ + struct dirent **namelist; + int n, i, j = 0; + + TRACE("max %d", max); + + n = scandir(SYS_INFINIBAND, &namelist, NULL, alphasort); + if (n > 0) { + for (i = 0; i < n; i++) { + if (strcmp(namelist[i]->d_name, ".") && + strcmp(namelist[i]->d_name, "..") && + strlen(namelist[i]->d_name) < UMAD_CA_NAME_LEN) { + if (j < max && is_ib_type(namelist[i]->d_name)) + strcpy(cas[j++], namelist[i]->d_name); + } + free(namelist[i]); + } + DEBUG("return %d cas", j); + } else { + /* Is this still needed ? */ + strncpy((char *)cas, def_ca_name, UMAD_CA_NAME_LEN); + DEBUG("return 1 ca"); + j = 1; + } + if (n >= 0) + free(namelist); + return j; +} + +int umad_get_ca_portguids(const char *ca_name, __be64 *portguids, int max) +{ + umad_ca_t ca; + int ports = 0, i, result; + char *found_ca_name; + + TRACE("ca name %s max port guids %d", ca_name, max); + if (resolve_ca_name(ca_name, NULL, &found_ca_name) < 0) { + result = -ENODEV; + goto exit; + } + + if (umad_get_ca(found_ca_name, &ca) < 0) { + result = -1; + goto exit; + } + + if (portguids) { + if (ca.numports + 1 > max) { + result = -ENOMEM; + goto clean; + } + + for (i = 0; i <= ca.numports; i++) + portguids[ports++] = ca.ports[i] ? 
+ ca.ports[i]->port_guid : htobe64(0); + } + + DEBUG("%s: %d ports", found_ca_name, ports); + + result = ports; +clean: + release_ca(&ca); +exit: + free(found_ca_name); + + return result; +} + +int umad_get_issm_path(const char *ca_name, int portnum, char path[], int max) +{ + int umad_id, result; + char *found_ca_name; + + TRACE("ca %s port %d", ca_name, portnum); + + if (resolve_ca_name(ca_name, &portnum, &found_ca_name) < 0) { + result = -ENODEV; + goto exit; + } + + umad_id = dev_to_umad_id(found_ca_name, portnum); + if (umad_id < 0) { + result = -EINVAL; + goto exit; + } + + snprintf(path, max, "%s/issm%u", RDMA_CDEV_DIR, umad_id); + + result = 0; +exit: + free(found_ca_name); + + return result; +} + +int umad_open_port(const char *ca_name, int portnum) +{ + char dev_file[UMAD_DEV_FILE_SZ]; + int umad_id, fd, result; + unsigned int abi_version = get_abi_version(); + char *found_ca_name = NULL; + + TRACE("ca %s port %d", ca_name, portnum); + + if (!abi_version) { + result = -EOPNOTSUPP; + goto exit; + } + + if (resolve_ca_name(ca_name, &portnum, &found_ca_name) < 0) { + result = -ENODEV; + goto exit; + } + + DEBUG("opening %s port %d", found_ca_name, portnum); + + umad_id = dev_to_umad_id(found_ca_name, portnum); + if (umad_id < 0) { + result = -EINVAL; + goto exit; + } + + snprintf(dev_file, sizeof(dev_file), "%s/umad%d", + RDMA_CDEV_DIR, umad_id); + + if ((fd = open(dev_file, O_RDWR | O_NONBLOCK)) < 0) { + DEBUG("open %s failed: %s", dev_file, strerror(errno)); + result = -EIO; + goto exit; + } + + if (abi_version > 5 || !ioctl(fd, IB_USER_MAD_ENABLE_PKEY, NULL)) + new_user_mad_api = 1; + else + new_user_mad_api = 0; + + DEBUG("opened %s fd %d portid %d", dev_file, fd, umad_id); + + result = fd; +exit: + free(found_ca_name); + + return result; +} + +int umad_get_ca(const char *ca_name, umad_ca_t *ca) +{ + int r = 0; + char *found_ca_name; + + TRACE("ca_name %s", ca_name); + if (resolve_ca_name(ca_name, NULL, &found_ca_name) < 0) { + r = -ENODEV; + goto exit; + } + + if (find_cached_ca(found_ca_name, ca) > 0) + goto exit; + + r = get_ca(found_ca_name, ca); + if (r < 0) + goto exit; + + DEBUG("opened %s", found_ca_name); +exit: + free(found_ca_name); + + return r; +} + +int umad_release_ca(umad_ca_t * ca) +{ + int r; + + TRACE("ca_name %s", ca->ca_name); + if (!ca) + return -ENODEV; + + if ((r = release_ca(ca)) < 0) + return r; + + DEBUG("releasing %s", ca->ca_name); + return 0; +} + +int umad_get_port(const char *ca_name, int portnum, umad_port_t *port) +{ + char dir_name[256]; + char *found_ca_name; + int result; + + TRACE("ca_name %s portnum %d", ca_name, portnum); + + if (resolve_ca_name(ca_name, &portnum, &found_ca_name) < 0) { + result = -ENODEV; + goto exit; + } + + snprintf(dir_name, sizeof(dir_name), "%s/%s/%s", + SYS_INFINIBAND, found_ca_name, SYS_CA_PORTS_DIR); + + result = get_port(found_ca_name, dir_name, portnum, port); +exit: + free(found_ca_name); + + return result; +} + +int umad_release_port(umad_port_t * port) +{ + int r; + + TRACE("port %s:%d", port->ca_name, port->portnum); + if (!port) + return -ENODEV; + + if ((r = release_port(port)) < 0) + return r; + + DEBUG("releasing %s:%d", port->ca_name, port->portnum); + return 0; +} + +int umad_close_port(int fd) +{ + close(fd); + DEBUG("closed fd %d", fd); + return 0; +} + +void *umad_get_mad(void *umad) +{ + return new_user_mad_api ? ((struct ib_user_mad *)umad)->data : + (void *)&((struct ib_user_mad *)umad)->addr.pkey_index; +} + +size_t umad_size(void) +{ + return new_user_mad_api ? 
sizeof(struct ib_user_mad) : + sizeof(struct ib_user_mad) - 8; +} + +int umad_set_grh(void *umad, void *mad_addr) +{ + struct ib_user_mad *mad = umad; + struct ib_mad_addr *addr = mad_addr; + + if (mad_addr) { + mad->addr.grh_present = 1; + mad->addr.ib_gid = addr->ib_gid; + /* The definition for umad_set_grh requires that the input be + * in host order */ + mad->addr.flow_label = htobe32((__force uint32_t)addr->flow_label); + mad->addr.hop_limit = addr->hop_limit; + mad->addr.traffic_class = addr->traffic_class; + } else + mad->addr.grh_present = 0; + return 0; +} + +int umad_set_pkey(void *umad, int pkey_index) +{ + struct ib_user_mad *mad = umad; + + if (new_user_mad_api) + mad->addr.pkey_index = pkey_index; + + return 0; +} + +int umad_get_pkey(void *umad) +{ + struct ib_user_mad *mad = umad; + + if (new_user_mad_api) + return mad->addr.pkey_index; + + return 0; +} + +int umad_set_addr(void *umad, int dlid, int dqp, int sl, int qkey) +{ + struct ib_user_mad *mad = umad; + + TRACE("umad %p dlid %u dqp %d sl %d, qkey %x", + umad, dlid, dqp, sl, qkey); + mad->addr.qpn = htobe32(dqp); + mad->addr.lid = htobe16(dlid); + mad->addr.qkey = htobe32(qkey); + mad->addr.sl = sl; + + return 0; +} + +int umad_set_addr_net(void *umad, __be16 dlid, __be32 dqp, int sl, __be32 qkey) +{ + struct ib_user_mad *mad = umad; + + TRACE("umad %p dlid %u dqp %d sl %d qkey %x", + umad, be16toh(dlid), be32toh(dqp), sl, be32toh(qkey)); + mad->addr.qpn = dqp; + mad->addr.lid = dlid; + mad->addr.qkey = qkey; + mad->addr.sl = sl; + + return 0; +} + +int umad_send(int fd, int agentid, void *umad, int length, + int timeout_ms, int retries) +{ + struct ib_user_mad *mad = umad; + int n; + + TRACE("fd %d agentid %d umad %p timeout %u", + fd, agentid, umad, timeout_ms); + errno = 0; + + mad->timeout_ms = timeout_ms; + mad->retries = retries; + mad->agent_id = agentid; + + if (umaddebug > 1) + umad_dump(mad); + + n = write(fd, mad, length + umad_size()); + if (n == length + umad_size()) + return 0; + + DEBUG("write returned %d != sizeof umad %zu + length %d (%m)", + n, umad_size(), length); + if (!errno) + errno = EIO; + return -EIO; +} + +static int dev_poll(int fd, int timeout_ms) +{ + struct pollfd ufds; + int n; + + ufds.fd = fd; + ufds.events = POLLIN; + + if ((n = poll(&ufds, 1, timeout_ms)) == 1) + return 0; + + if (n == 0) + return -ETIMEDOUT; + + return -EIO; +} + +int umad_recv(int fd, void *umad, int *length, int timeout_ms) +{ + struct ib_user_mad *mad = umad; + int n; + + errno = 0; + TRACE("fd %d umad %p timeout %u", fd, umad, timeout_ms); + + if (!umad || !length) { + errno = EINVAL; + return -EINVAL; + } + + if (timeout_ms && (n = dev_poll(fd, timeout_ms)) < 0) { + if (!errno) + errno = -n; + return n; + } + + n = read(fd, umad, umad_size() + *length); + + VALGRIND_MAKE_MEM_DEFINED(umad, umad_size() + *length); + + if ((n >= 0) && (n <= umad_size() + *length)) { + DEBUG("mad received by agent %d length %d", mad->agent_id, n); + if (n > umad_size()) + *length = n - umad_size(); + else + *length = 0; + return mad->agent_id; + } + + if (n == -EWOULDBLOCK) { + if (!errno) + errno = EWOULDBLOCK; + return n; + } + + DEBUG("read returned %zu > sizeof umad %zu + length %d (%m)", + mad->length - umad_size(), umad_size(), *length); + + *length = mad->length - umad_size(); + if (!errno) + errno = EIO; + return -errno; +} + +int umad_poll(int fd, int timeout_ms) +{ + TRACE("fd %d timeout %u", fd, timeout_ms); + return dev_poll(fd, timeout_ms); +} + +int umad_get_fd(int fd) +{ + TRACE("fd %d", fd); + return fd; +} + +int 
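+/*
+ * Registration for a vendor range 2 class (0x30-0x4f); the 3-byte OUI
+ * selects the vendor within the class.  An illustrative call, assuming
+ * an already-opened port fd (the values are examples only):
+ *
+ *	uint8_t oui[3] = { 0x00, 0x14, 0x05 };	// the OpenIB OUI
+ *	int agent = umad_register_oui(fd, 0x30, 0, oui, NULL);
+ */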
umad_register_oui(int fd, int mgmt_class, uint8_t rmpp_version, + uint8_t oui[3], long method_mask[]) +{ + struct ib_user_mad_reg_req req; + + TRACE("fd %d mgmt_class %u rmpp_version %d oui 0x%x%x%x method_mask %p", + fd, mgmt_class, (int)rmpp_version, (int)oui[0], (int)oui[1], + (int)oui[2], method_mask); + + if (mgmt_class < 0x30 || mgmt_class > 0x4f) { + DEBUG("mgmt class %d not in vendor range 2", mgmt_class); + return -EINVAL; + } + + req.qpn = 1; + req.mgmt_class = mgmt_class; + req.mgmt_class_version = 1; + memcpy(req.oui, oui, sizeof req.oui); + req.rmpp_version = rmpp_version; + + if (method_mask) + memcpy(req.method_mask, method_mask, sizeof req.method_mask); + else + memset(req.method_mask, 0, sizeof req.method_mask); + + VALGRIND_MAKE_MEM_DEFINED(&req, sizeof req); + + if (!ioctl(fd, IB_USER_MAD_REGISTER_AGENT, (void *)&req)) { + DEBUG + ("fd %d registered to use agent %d qp %d class 0x%x oui %p", + fd, req.id, req.qpn, req.mgmt_class, oui); + return req.id; /* return agentid */ + } + + DEBUG("fd %d registering qp %d class 0x%x version %d oui %p failed: %m", + fd, req.qpn, req.mgmt_class, req.mgmt_class_version, oui); + return -EPERM; +} + +int umad_register(int fd, int mgmt_class, int mgmt_version, + uint8_t rmpp_version, long method_mask[]) +{ + struct ib_user_mad_reg_req req; + __be32 oui = htobe32(IB_OPENIB_OUI); + int qp; + + TRACE + ("fd %d mgmt_class %u mgmt_version %u rmpp_version %d method_mask %p", + fd, mgmt_class, mgmt_version, rmpp_version, method_mask); + + req.qpn = qp = (mgmt_class == 0x1 || mgmt_class == 0x81) ? 0 : 1; + req.mgmt_class = mgmt_class; + req.mgmt_class_version = mgmt_version; + req.rmpp_version = rmpp_version; + + if (method_mask) + memcpy(req.method_mask, method_mask, sizeof req.method_mask); + else + memset(req.method_mask, 0, sizeof req.method_mask); + + memcpy(&req.oui, (char *)&oui + 1, sizeof req.oui); + + VALGRIND_MAKE_MEM_DEFINED(&req, sizeof req); + + if (!ioctl(fd, IB_USER_MAD_REGISTER_AGENT, (void *)&req)) { + DEBUG("fd %d registered to use agent %d qp %d", fd, req.id, qp); + return req.id; /* return agentid */ + } + + DEBUG("fd %d registering qp %d class 0x%x version %d failed: %m", + fd, qp, mgmt_class, mgmt_version); + return -EPERM; +} + +int umad_register2(int port_fd, struct umad_reg_attr *attr, uint32_t *agent_id) +{ + struct ib_user_mad_reg_req2 req; + int rc; + + if (!attr || !agent_id) + return EINVAL; + + TRACE("fd %d mgmt_class %u mgmt_class_version %u flags 0x%08x " + "method_mask 0x%016" PRIx64 " %016" PRIx64 + "oui 0x%06x rmpp_version %u ", + port_fd, attr->mgmt_class, attr->mgmt_class_version, + attr->flags, attr->method_mask[0], attr->method_mask[1], + attr->oui, attr->rmpp_version); + + if (attr->mgmt_class >= 0x30 && attr->mgmt_class <= 0x4f && + ((attr->oui & 0x00ffffff) == 0 || (attr->oui & 0xff000000) != 0)) { + DEBUG("mgmt class %d is in vendor range 2 but oui (0x%08x) is invalid", + attr->mgmt_class, attr->oui); + return EINVAL; + } + + memset(&req, 0, sizeof(req)); + + req.mgmt_class = attr->mgmt_class; + req.mgmt_class_version = attr->mgmt_class_version; + req.qpn = (attr->mgmt_class == 0x1 || attr->mgmt_class == 0x81) ? 
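+	    /* Subnet management classes (0x01 LID-routed, 0x81 directed
+	     * route) are served by the SMI on QP0; all other classes go
+	     * to the GSI on QP1. */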
0 : 1; + req.flags = attr->flags; + memcpy(req.method_mask, attr->method_mask, sizeof req.method_mask); + req.oui = attr->oui; + req.rmpp_version = attr->rmpp_version; + + VALGRIND_MAKE_MEM_DEFINED(&req, sizeof req); + + if ((rc = ioctl(port_fd, IB_USER_MAD_REGISTER_AGENT2, (void *)&req)) == 0) { + DEBUG("fd %d registered to use agent %d qp %d class 0x%x oui 0x%06x", + port_fd, req.id, req.qpn, req.mgmt_class, attr->oui); + *agent_id = req.id; + return 0; + } + + if (errno == ENOTTY || errno == EINVAL) { + + TRACE("no kernel support for registration flags"); + req.flags = 0; + + if (attr->flags == 0) { + struct ib_user_mad_reg_req req_v1; + + TRACE("attempting original register ioctl"); + + memset(&req_v1, 0, sizeof(req_v1)); + req_v1.mgmt_class = req.mgmt_class; + req_v1.mgmt_class_version = req.mgmt_class_version; + req_v1.qpn = req.qpn; + req_v1.rmpp_version = req.rmpp_version; + req_v1.oui[0] = (req.oui & 0xff0000) >> 16; + req_v1.oui[1] = (req.oui & 0x00ff00) >> 8; + req_v1.oui[2] = req.oui & 0x0000ff; + + memcpy(req_v1.method_mask, req.method_mask, sizeof req_v1.method_mask); + + if ((rc = ioctl(port_fd, IB_USER_MAD_REGISTER_AGENT, + (void *)&req_v1)) == 0) { + DEBUG("fd %d registered to use agent %d qp %d class 0x%x oui 0x%06x", + port_fd, req_v1.id, req_v1.qpn, req_v1.mgmt_class, attr->oui); + *agent_id = req_v1.id; + return 0; + } + } + } + + rc = errno; + attr->flags = req.flags; + + DEBUG("fd %d registering qp %d class 0x%x version %d " + "oui 0x%06x failed flags returned 0x%x : %m", + port_fd, req.qpn, req.mgmt_class, req.mgmt_class_version, + attr->oui, req.flags); + + return rc; +} + +int umad_unregister(int fd, int agentid) +{ + TRACE("fd %d unregistering agent %d", fd, agentid); + return ioctl(fd, IB_USER_MAD_UNREGISTER_AGENT, &agentid); +} + +int umad_status(void *umad) +{ + struct ib_user_mad *mad = umad; + + return mad->status; +} + +ib_mad_addr_t *umad_get_mad_addr(void *umad) +{ + struct ib_user_mad *mad = umad; + + return &mad->addr; +} + +int umad_debug(int level) +{ + if (level >= 0) + umaddebug = level; + return umaddebug; +} + +void umad_addr_dump(ib_mad_addr_t * addr) +{ +#define HEX(x) ((x) < 10 ? 
'0' + (x) : 'a' + ((x) -10)) + char gid_str[64]; + int i; + + for (i = 0; i < sizeof addr->gid; i++) { + gid_str[i * 2] = HEX(addr->gid[i] >> 4); + gid_str[i * 2 + 1] = HEX(addr->gid[i] & 0xf); + } + gid_str[i * 2] = 0; + IBWARN("qpn %d qkey 0x%x lid %u sl %d\n" + "grh_present %d gid_index %d hop_limit %d traffic_class %d flow_label 0x%x pkey_index 0x%x\n" + "Gid 0x%s", + be32toh(addr->qpn), be32toh(addr->qkey), be16toh(addr->lid), addr->sl, + addr->grh_present, (int)addr->gid_index, (int)addr->hop_limit, + (int)addr->traffic_class, addr->flow_label, addr->pkey_index, + gid_str); +} + +void umad_dump(void *umad) +{ + struct ib_user_mad *mad = umad; + + IBWARN("agent id %d status %x timeout %d", + mad->agent_id, mad->status, mad->timeout_ms); + umad_addr_dump(&mad->addr); +} + +struct umad_device_node *umad_get_ca_device_list(void) +{ + DIR *dir; + struct dirent *entry; + struct umad_device_node *head = NULL; + struct umad_device_node *tail; + struct umad_device_node *node; + char *ca_name; + size_t cas_num = 0; + size_t d_name_size; + int errsv = 0; + + dir = opendir(SYS_INFINIBAND); + if (!dir) { + if (errno == ENOENT) + errno = 0; + return NULL; + } + + while ((entry = readdir(dir))) { + if ((strcmp(entry->d_name, ".") == 0) || + (strcmp(entry->d_name, "..") == 0)) + continue; + + if (!is_ib_type(entry->d_name)) + continue; + + d_name_size = strlen(entry->d_name) + 1; + node = calloc(1, sizeof(struct umad_device_node) + d_name_size); + if (!node) { + errsv = ENOMEM; + umad_free_ca_device_list(head); + head = NULL; + goto exit; + } + + if (!head) + head = node; + else + tail->next = node; + tail = node; + + ca_name = (char *)(node + 1); + strncpy(ca_name, entry->d_name, d_name_size); + node->ca_name = ca_name; + + cas_num++; + } + + DEBUG("return %zu cas", cas_num); +exit: + closedir(dir); + errno = errsv; + + return head; +} + +void umad_free_ca_device_list(struct umad_device_node *head) +{ + struct umad_device_node *node; + struct umad_device_node *next; + + for (node = head; node; node = next) { + next = node->next; + free(node); + } +} diff --git a/libibumad/umad.h b/libibumad/umad.h new file mode 100644 index 0000000..ee2af2f --- /dev/null +++ b/libibumad/umad.h @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#ifndef _UMAD_H +#define _UMAD_H + +#include <endian.h> +#include <stdint.h> +#include <stdlib.h> +#include <arpa/inet.h> +#include <linux/types.h> /* __be16, __be32 and __be64 */ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef __be16 __attribute__((deprecated)) be16_t; +typedef __be32 __attribute__((deprecated)) be32_t; +typedef __be64 __attribute__((deprecated)) be64_t; + +/* + * A GID data structure that may be used in definitions of on-the-wire data + * structures. Do not cast umad_gid pointers to ibv_gid pointers because the + * alignment of these two data structures is different. + */ +union umad_gid { + uint8_t raw[16]; + __be16 raw_be16[8]; + struct { + __be64 subnet_prefix; + __be64 interface_id; + } global; +} __attribute__((aligned(4))) __attribute__((packed)); + +#define UMAD_MAX_DEVICES 32 +#define UMAD_ANY_PORT 0 +typedef struct ib_mad_addr { + __be32 qpn; + __be32 qkey; + __be16 lid; + uint8_t sl; + uint8_t path_bits; + uint8_t grh_present; + uint8_t gid_index; + uint8_t hop_limit; + uint8_t traffic_class; + union { + uint8_t gid[16]; /* network-byte order */ + union umad_gid ib_gid; + }; + __be32 flow_label; + uint16_t pkey_index; + uint8_t reserved[6]; +} ib_mad_addr_t; + +typedef struct ib_user_mad { + uint32_t agent_id; + uint32_t status; + uint32_t timeout_ms; + uint32_t retries; + uint32_t length; + ib_mad_addr_t addr; + uint8_t data[0]; +} ib_user_mad_t; + +#define IB_UMAD_ABI_VERSION 5 +#define IB_UMAD_ABI_DIR "/sys/class/infiniband_mad" +#define IB_UMAD_ABI_FILE "abi_version" + +#define IB_IOCTL_MAGIC 0x1b + +#define IB_USER_MAD_REGISTER_AGENT _IOWR(IB_IOCTL_MAGIC, 1, \ + struct ib_user_mad_reg_req) +#define IB_USER_MAD_UNREGISTER_AGENT _IOW(IB_IOCTL_MAGIC, 2, uint32_t) +#define IB_USER_MAD_ENABLE_PKEY _IO(IB_IOCTL_MAGIC, 3) +#define IB_USER_MAD_REGISTER_AGENT2 _IOWR(IB_IOCTL_MAGIC, 4, \ + struct ib_user_mad_reg_req2) + +#define UMAD_CA_NAME_LEN 20 +#define UMAD_CA_MAX_PORTS 10 /* 0 - 9 */ +#define UMAD_CA_MAX_AGENTS 32 + +#define SYS_INFINIBAND "/sys/class/infiniband" + +#define SYS_INFINIBAND_MAD "/sys/class/infiniband_mad" +#define SYS_IB_MAD_PORT "port" +#define SYS_IB_MAD_DEV "ibdev" + +#define UMAD_MAX_PORTS 64 + +#define SYS_CA_PORTS_DIR "ports" + +#define SYS_NODE_TYPE "node_type" +#define SYS_CA_FW_VERS "fw_ver" +#define SYS_CA_HW_VERS "hw_rev" +#define SYS_CA_TYPE "hca_type" +#define SYS_CA_NODE_GUID "node_guid" +#define SYS_CA_SYS_GUID "sys_image_guid" + +#define SYS_PORT_LMC "lid_mask_count" +#define SYS_PORT_SMLID "sm_lid" +#define SYS_PORT_SMSL "sm_sl" +#define SYS_PORT_LID "lid" +#define SYS_PORT_STATE "state" +#define SYS_PORT_PHY_STATE "phys_state" +#define SYS_PORT_CAPMASK "cap_mask" +#define SYS_PORT_RATE "rate" +#define SYS_PORT_GUID "port_guid" +#define SYS_PORT_GID "gids/0" +#define SYS_PORT_LINK_LAYER "link_layer" + +typedef struct umad_port { + char ca_name[UMAD_CA_NAME_LEN]; + int portnum; + unsigned base_lid; + unsigned lmc; + unsigned sm_lid; + unsigned sm_sl; + unsigned state; + unsigned phys_state; + unsigned rate; + __be32 capmask; + __be64 gid_prefix; + __be64 port_guid; + unsigned pkeys_size; + uint16_t *pkeys; + char link_layer[UMAD_CA_NAME_LEN]; +} umad_port_t; + +typedef struct umad_ca { + char ca_name[UMAD_CA_NAME_LEN]; + unsigned node_type; + 
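+	/* numports is the highest instantiated port number; ports[] below
+	 * is indexed by physical port number (slot 0 is used only for a
+	 * switch management port), so unused slots remain NULL */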
int numports; + char fw_ver[20]; + char ca_type[40]; + char hw_ver[20]; + __be64 node_guid; + __be64 system_guid; + umad_port_t *ports[UMAD_CA_MAX_PORTS]; +} umad_ca_t; + +struct umad_device_node { + struct umad_device_node *next; /* next umad device node */ + const char *ca_name; /* ca name */ +}; + +int umad_init(void); +int umad_done(void); + +int umad_get_cas_names(char cas[][UMAD_CA_NAME_LEN], int max); +int umad_get_ca_portguids(const char *ca_name, __be64 *portguids, int max); + +int umad_get_ca(const char *ca_name, umad_ca_t * ca); +int umad_release_ca(umad_ca_t * ca); +int umad_get_port(const char *ca_name, int portnum, umad_port_t * port); +int umad_release_port(umad_port_t * port); + +int umad_get_issm_path(const char *ca_name, int portnum, char path[], int max); + +int umad_open_port(const char *ca_name, int portnum); +int umad_close_port(int portid); + +void *umad_get_mad(void *umad); +size_t umad_size(void); +int umad_status(void *umad); + +ib_mad_addr_t *umad_get_mad_addr(void *umad); +int umad_set_grh_net(void *umad, void *mad_addr); +int umad_set_grh(void *umad, void *mad_addr); +int umad_set_addr_net(void *umad, __be16 dlid, __be32 dqp, int sl, __be32 qkey); +int umad_set_addr(void *umad, int dlid, int dqp, int sl, int qkey); +int umad_set_pkey(void *umad, int pkey_index); +int umad_get_pkey(void *umad); + +int umad_send(int portid, int agentid, void *umad, int length, + int timeout_ms, int retries); +int umad_recv(int portid, void *umad, int *length, int timeout_ms); +int umad_poll(int portid, int timeout_ms); +int umad_get_fd(int portid); + +int umad_register(int portid, int mgmt_class, int mgmt_version, + uint8_t rmpp_version, long method_mask[16 / sizeof(long)]); +int umad_register_oui(int portid, int mgmt_class, uint8_t rmpp_version, + uint8_t oui[3], long method_mask[16 / sizeof(long)]); +int umad_unregister(int portid, int agentid); +struct umad_device_node *umad_get_ca_device_list(void); +void umad_free_ca_device_list(struct umad_device_node *head); + +enum { + UMAD_USER_RMPP = (1 << 0) +}; + +struct umad_reg_attr { + uint8_t mgmt_class; + uint8_t mgmt_class_version; + uint32_t flags; + uint64_t method_mask[2]; + uint32_t oui; + uint8_t rmpp_version; +}; + +int umad_register2(int port_fd, struct umad_reg_attr *attr, + uint32_t *agent_id); + +int umad_debug(int level); +void umad_addr_dump(ib_mad_addr_t * addr); +void umad_dump(void *umad); + +static inline void *umad_alloc(int num, size_t size) +{ /* alloc array of umad buffers */ + return calloc(num, size); +} + +static inline void umad_free(void *umad) +{ + free(umad); +} + +/* Users should use the glibc functions directly, not these wrappers */ +#ifndef ntohll +#undef ntohll +static inline __attribute__((deprecated)) uint64_t ntohll(uint64_t x) { return be64toh(x); } +#define ntohll ntohll +#endif +#ifndef htonll +#undef htonll +static inline __attribute__((deprecated)) uint64_t htonll(uint64_t x) { return htobe64(x); } +#define htonll htonll +#endif + +#ifdef __cplusplus +} +#endif +#endif /* _UMAD_H */ diff --git a/libibumad/umad_cm.h b/libibumad/umad_cm.h new file mode 100644 index 0000000..dd077f7 --- /dev/null +++ b/libibumad/umad_cm.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2010 Intel Corporation. All rights reserved. + * Copyright (c) 2014 Mellanox Technologies LTD. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _UMAD_CM_H +#define _UMAD_CM_H + +#include <infiniband/umad_types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* Communication management attributes */ +enum { + UMAD_CM_ATTR_REQ = 0x0010, + UMAD_CM_ATTR_MRA = 0x0011, + UMAD_CM_ATTR_REJ = 0x0012, + UMAD_CM_ATTR_REP = 0x0013, + UMAD_CM_ATTR_RTU = 0x0014, + UMAD_CM_ATTR_DREQ = 0x0015, + UMAD_CM_ATTR_DREP = 0x0016, + UMAD_CM_ATTR_SIDR_REQ = 0x0017, + UMAD_CM_ATTR_SIDR_REP = 0x0018, + UMAD_CM_ATTR_LAP = 0x0019, + UMAD_CM_ATTR_APR = 0x001A, + UMAD_CM_ATTR_SAP = 0x001B, + UMAD_CM_ATTR_SPR = 0x001C, +}; + +#ifdef __cplusplus +} +#endif +#endif /* _UMAD_CM_H */ diff --git a/libibumad/umad_sa.h b/libibumad/umad_sa.h new file mode 100644 index 0000000..b07774b --- /dev/null +++ b/libibumad/umad_sa.h @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * Copyright (c) 2006, 2010 Intel Corporation. All rights reserved. + * Copyright (c) 2014 Mellanox Technologies LTD. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#ifndef _UMAD_SA_H +#define _UMAD_SA_H + +#include <infiniband/umad_types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* SA specific methods */ +enum { + UMAD_SA_CLASS_VERSION = 2, /* IB spec version 1.1/1.2 */ + + UMAD_SA_METHOD_GET_TABLE = 0x12, + UMAD_SA_METHOD_GET_TABLE_RESP = 0x92, + UMAD_SA_METHOD_DELETE = 0x15, + UMAD_SA_METHOD_DELETE_RESP = 0x95, + UMAD_SA_METHOD_GET_MULTI = 0x14, + UMAD_SA_METHOD_GET_MULTI_RESP = 0x94, + UMAD_SA_METHOD_GET_TRACE_TABLE = 0x13 +}; + +enum { + UMAD_SA_STATUS_SUCCESS = 0, + UMAD_SA_STATUS_NO_RESOURCES = 1, + UMAD_SA_STATUS_REQ_INVALID = 2, + UMAD_SA_STATUS_NO_RECORDS = 3, + UMAD_SA_STATUS_TOO_MANY_RECORDS = 4, + UMAD_SA_STATUS_INVALID_GID = 5, + UMAD_SA_STATUS_INSUF_COMPS = 6, + UMAD_SA_STATUS_REQ_DENIED = 7, + UMAD_SA_STATUS_PRI_SUGGESTED = 8 +}; + +/* SA attributes */ +enum { + UMAD_SA_ATTR_NODE_REC = 0x0011, + UMAD_SA_ATTR_PORT_INFO_REC = 0x0012, + UMAD_SA_ATTR_SLVL_REC = 0x0013, + UMAD_SA_ATTR_SWITCH_INFO_REC = 0x0014, + UMAD_SA_ATTR_LINEAR_FT_REC = 0x0015, + UMAD_SA_ATTR_RANDOM_FT_REC = 0x0016, + UMAD_SA_ATTR_MCAST_FT_REC = 0x0017, + UMAD_SA_ATTR_SM_INFO_REC = 0x0018, + UMAD_SA_ATTR_LINK_SPD_WIDTH_TABLE_REC = 0x0019, + UMAD_SA_ATTR_INFORM_INFO_REC = 0x00F3, + UMAD_SA_ATTR_LINK_REC = 0x0020, + UMAD_SA_ATTR_GUID_INFO_REC = 0x0030, + UMAD_SA_ATTR_SERVICE_REC = 0x0031, + UMAD_SA_ATTR_PKEY_TABLE_REC = 0x0033, + UMAD_SA_ATTR_PATH_REC = 0x0035, + UMAD_SA_ATTR_VL_ARB_REC = 0x0036, + UMAD_SA_ATTR_MCMEMBER_REC = 0x0038, + UMAD_SA_ATTR_TRACE_REC = 0x0039, + UMAD_SA_ATTR_MULTI_PATH_REC = 0x003A, + UMAD_SA_ATTR_SERVICE_ASSOC_REC = 0x003B, + UMAD_SA_ATTR_HIERARCHY_INFO_REC = 0x003C, + UMAD_SA_ATTR_CABLE_INFO_REC = 0x003D, + UMAD_SA_ATTR_PORT_INFO_EXT_REC = 0x003E +}; + +enum { + UMAD_LEN_SA_DATA = 200 +}; + +/* CM bits */ +enum { + UMAD_SA_CAP_MASK_IS_SUBNET_OPT_REC_SUP = (1 << 8), + UMAD_SA_CAP_MASK_IS_UD_MCAST_SUP = (1 << 9), + UMAD_SA_CAP_MASK_IS_MULTIPATH_SUP = (1 << 10), + UMAD_SA_CAP_MASK_IS_REINIT_SUP = (1 << 11), + UMAD_SA_CAP_MASK_IS_GID_SCOPED_MULTIPATH_SUP = (1 << 12), + UMAD_SA_CAP_MASK_IS_PORTINFO_CAP_MASK_MATCH_SUP = (1 << 13), + UMAD_SA_CAP_MASK_IS_LINK_SPEED_WIDTH_PAIRS_REC_SUP = (1 << 14), + UMAD_SA_CAP_MASK_IS_PA_SERVICES_SUP = (1 << 15) +}; +/* CM2 bits */ +enum { + UMAD_SA_CAP_MASK2_IS_UNPATH_REPATH_SUP = (1 << 0), + UMAD_SA_CAP_MASK2_IS_QOS_SUP = (1 << 1), + UMAD_SA_CAP_MASK2_IS_REV_PATH_PKEY_MEM_BIT_SUP = (1 << 2), + UMAD_SA_CAP_MASK2_IS_MCAST_TOP_SUP = (1 << 3), + UMAD_SA_CAP_MASK2_IS_HIERARCHY_INFO_SUP = (1 << 4), + UMAD_SA_CAP_MASK2_IS_ADDITIONAL_GUID_SUP = (1 << 5), + UMAD_SA_CAP_MASK2_IS_FULL_PORTINFO_REC_SUP = (1 << 6), + UMAD_SA_CAP_MASK2_IS_EXT_SPEEDS_SUP = (1 << 7), + UMAD_SA_CAP_MASK2_IS_MCAST_SERVICE_REC_SUP = (1 << 8), + UMAD_SA_CAP_MASK2_IS_CABLE_INFO_REC_SUP = (1 << 9), + UMAD_SA_CAP_MASK2_IS_PORT_INFO_CAPMASK2_MATCH_SUP = (1 << 10), + UMAD_SA_CAP_MASK2_IS_PORT_INFO_EXT_REC_SUP = (1 << 11) +}; + +/* + * Shared by SA MCMemberRecord, PathRecord, and MultiPathRecord + */ +enum { + UMAD_SA_SELECTOR_GREATER_THAN = 0, + UMAD_SA_SELECTOR_LESS_THAN = 1, + UMAD_SA_SELECTOR_EXACTLY = 2, + UMAD_SA_SELECTOR_LARGEST_AVAIL = 3, /* rate & MTU */ + UMAD_SA_SELECTOR_SMALLEST_AVAIL = 3 /* packet lifetime */ +}; + +#define UMAD_SA_SELECTOR_SHIFT 6 +#define 
UMAD_SA_RATE_MTU_PKT_LIFE_MASK 0x3f +#define UMAD_SA_SELECTOR_MASK 0x3 + +/* + * sm_key is not aligned on an 8-byte boundary, so is defined as a byte array + */ +struct umad_sa_packet { + struct umad_hdr mad_hdr; + struct umad_rmpp_hdr rmpp_hdr; + uint8_t sm_key[8]; /* network-byte order */ + __be16 attr_offset; + __be16 reserved; + __be64 comp_mask; + uint8_t data[UMAD_LEN_SA_DATA]; /* network-byte order */ +}; + +static inline uint8_t +umad_sa_get_rate_mtu_or_life(uint8_t rate_mtu_or_life) +{ + return (rate_mtu_or_life & UMAD_SA_RATE_MTU_PKT_LIFE_MASK); +} + +static inline uint8_t +umad_sa_set_rate_mtu_or_life(uint8_t selector, uint8_t rate_mtu_or_life) +{ + return (((selector & UMAD_SA_SELECTOR_MASK) << UMAD_SA_SELECTOR_SHIFT) | + (rate_mtu_or_life & UMAD_SA_RATE_MTU_PKT_LIFE_MASK)); +} + +#ifdef __cplusplus +} +#endif +#endif /* _UMAD_SA_H */ diff --git a/libibumad/umad_sa_mcm.h b/libibumad/umad_sa_mcm.h new file mode 100644 index 0000000..afe7978 --- /dev/null +++ b/libibumad/umad_sa_mcm.h @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2017 Mellanox Technologies LTD. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#ifndef _UMAD_SA_MCM_H +#define _UMAD_SA_MCM_H + +#include <infiniband/umad_types.h> +#include <infiniband/umad_sa.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* Component mask bits for MCMemberRecord */ +enum { + UMAD_SA_MCM_COMP_MASK_MGID = (1ULL << 0), + UMAD_SA_MCM_COMP_MASK_PORT_GID = (1ULL << 1), + UMAD_SA_MCM_COMP_MASK_QKEY = (1ULL << 2), + UMAD_SA_MCM_COMP_MASK_MLID = (1ULL << 3), + UMAD_SA_MCM_COMP_MASK_MTU_SEL = (1ULL << 4), + UMAD_SA_MCM_COMP_MASK_MTU = (1ULL << 5), + UMAD_SA_MCM_COMP_MASK_TCLASS = (1ULL << 6), + UMAD_SA_MCM_COMP_MASK_PKEY = (1ULL << 7), + UMAD_SA_MCM_COMP_MASK_RATE_SEL = (1ULL << 8), + UMAD_SA_MCM_COMP_MASK_RATE = (1ULL << 9), + UMAD_SA_MCM_COMP_MASK_LIFE_TIME_SEL = (1ULL << 10), + UMAD_SA_MCM_COMP_MASK_LIFE_TIME = (1ULL << 11), + UMAD_SA_MCM_COMP_MASK_SL = (1ULL << 12), + UMAD_SA_MCM_COMP_MASK_FLOW_LABEL = (1ULL << 13), + UMAD_SA_MCM_COMP_MASK_HOP_LIMIT = (1ULL << 14), + UMAD_SA_MCM_COMP_MASK_SCOPE = (1ULL << 15), + UMAD_SA_MCM_COMP_MASK_JOIN_STATE = (1ULL << 16), + UMAD_SA_MCM_COMP_MASK_PROXY_JOIN = (1ULL << 17) +}; + +enum { + UMAD_SA_MCM_JOIN_STATE_FULL_MEMBER = (1 << 0), + UMAD_SA_MCM_JOIN_STATE_NON_MEMBER = (1 << 1), + UMAD_SA_MCM_JOIN_STATE_SEND_ONLY_NON_MEMBER = (1 << 2), + UMAD_SA_MCM_JOIN_STATE_SEND_ONLY_FULL_MEMBER = (1 << 3) +}; + +enum { + UMAD_SA_MCM_ADDR_SCOPE_LINK_LOCAL = 0x2, + UMAD_SA_MCM_ADDR_SCOPE_SITE_LOCAL = 0x5, + UMAD_SA_MCM_ADDR_SCOPE_ORG_LOCAL = 0x8, + UMAD_SA_MCM_ADDR_SCOPE_GLOBAL = 0xE, +}; + +struct umad_sa_mcmember_record { + uint8_t mgid[16]; /* network-byte order */ + uint8_t portgid[16]; /* network-byte order */ + __be32 qkey; + __be16 mlid; + uint8_t mtu; /* 2 bit selector included */ + uint8_t tclass; + __be16 pkey; + uint8_t rate; /* 2 bit selector included */ + uint8_t pkt_life; /* 2 bit selector included */ + __be32 sl_flow_hop; /* SL: 4 bits, FlowLabel: 20 bits, */ + /* HopLimit: 8 bits */ + uint8_t scope_state; /* Scope: 4 bits, JoinState: 4 bits */ + uint8_t proxy_join; /* ProxyJoin: 1 bit (computed by SA) */ + uint8_t reserved[2]; + uint8_t pad[4]; /* SA records are multiple of 8 bytes */ +}; + +static inline void +umad_sa_mcm_get_sl_flow_hop(__be32 sl_flow_hop, uint8_t * const p_sl, + uint32_t * const p_flow_lbl, uint8_t * const p_hop) +{ + uint32_t tmp; + + tmp = ntohl(sl_flow_hop); + if (p_hop) + *p_hop = (uint8_t) tmp; + + tmp >>= 8; + if (p_flow_lbl) + *p_flow_lbl = (uint32_t) (tmp & 0xfffff); + + tmp >>= 20; + if (p_sl) + *p_sl = (uint8_t) tmp; +} + +static inline __be32 +umad_sa_mcm_set_sl_flow_hop(uint8_t sl, uint32_t flow_label, uint8_t hop_limit) +{ + uint32_t tmp; + + tmp = (sl << 28) | ((flow_label & 0xfffff) << 8) | hop_limit; + return htonl(tmp); +} + +static inline void +umad_sa_mcm_get_scope_state(const uint8_t scope_state, uint8_t * const p_scope, + uint8_t * const p_state) +{ + uint8_t tmp_scope_state; + + if (p_state) + *p_state = (uint8_t) (scope_state & 0x0f); + + tmp_scope_state = scope_state >> 4; + + if (p_scope) + *p_scope = (uint8_t) (tmp_scope_state & 0x0f); +} + +static inline uint8_t +umad_sa_mcm_set_scope_state(const uint8_t scope, const uint8_t state) +{ + uint8_t scope_state; + + scope_state = scope; + scope_state = scope_state << 4; + scope_state = scope_state | state; + return scope_state; +} + +static inline void +umad_sa_mcm_set_join_state(struct umad_sa_mcmember_record *p_mc_rec, + const uint8_t state) +{ + /* keep the scope as it is */ + p_mc_rec->scope_state = (p_mc_rec->scope_state & 0xf0) | (0x0f & state); +} + +static inline int +umad_sa_mcm_get_proxy_join(struct 
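+/* ProxyJoin is carried in the top bit of the proxy_join byte (the
+ * remaining bits are reserved), hence the 0x80 mask below. */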
umad_sa_mcmember_record *p_mc_rec) +{ + return ((p_mc_rec->proxy_join & 0x80) == 0x80); +} + +#ifdef __cplusplus +} +#endif +#endif /* _UMAD_SA_MCM_H */ diff --git a/libibumad/umad_sm.h b/libibumad/umad_sm.h new file mode 100644 index 0000000..5a326e5 --- /dev/null +++ b/libibumad/umad_sm.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2004-2014 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004 Voltaire Corporation. All rights reserved. + * Copyright (c) 2013 Oracle and/or its affiliates. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _UMAD_SM_H +#define _UMAD_SM_H + +#include <infiniband/umad_types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +enum { + UMAD_SMP_DIRECTION = 0x8000, +}; + +/* Subnet management attributes */ +enum { + UMAD_SM_ATTR_NODE_DESC = 0x0010, + UMAD_SM_ATTR_NODE_INFO = 0x0011, + UMAD_SM_ATTR_SWITCH_INFO = 0x0012, + UMAD_SM_ATTR_GUID_INFO = 0x0014, + UMAD_SM_ATTR_PORT_INFO = 0x0015, + UMAD_SM_ATTR_PKEY_TABLE = 0x0016, + UMAD_SM_ATTR_SLVL_TABLE = 0x0017, + UMAD_SM_ATTR_VL_ARB_TABLE = 0x0018, + UMAD_SM_ATTR_LINEAR_FT = 0x0019, + UMAD_SM_ATTR_RANDOM_FT = 0x001A, + UMAD_SM_ATTR_MCAST_FT = 0x001B, + UMAD_SM_ATTR_LINK_SPD_WIDTH_TABLE = 0x001C, + UMAD_SM_ATTR_VENDOR_MADS_TABLE = 0x001D, + UMAD_SM_ATTR_HIERARCHY_INFO = 0x001E, + UMAD_SM_ATTR_SM_INFO = 0x0020, + UMAD_SM_ATTR_VENDOR_DIAG = 0x0030, + UMAD_SM_ATTR_LED_INFO = 0x0031, + UMAD_SM_ATTR_CABLE_INFO = 0x0032, + UMAD_SM_ATTR_PORT_INFO_EXT = 0x0033, + UMAD_SM_ATTR_VENDOR_MASK = 0xFF00, + UMAD_SM_ATTR_MLNX_EXT_PORT_INFO = 0xFF90 +}; + +enum { + UMAD_SM_GID_IN_SERVICE_TRAP = 64, + UMAD_SM_GID_OUT_OF_SERVICE_TRAP = 65, + UMAD_SM_MGID_CREATED_TRAP = 66, + UMAD_SM_MGID_DESTROYED_TRAP = 67, + UMAD_SM_UNPATH_TRAP = 68, + UMAD_SM_REPATH_TRAP = 69, + UMAD_SM_LINK_STATE_CHANGED_TRAP = 128, + UMAD_SM_LINK_INTEGRITY_THRESHOLD_TRAP = 129, + UMAD_SM_BUFFER_OVERRUN_THRESHOLD_TRAP = 130, + UMAD_SM_WATCHDOG_TIMER_EXPIRED_TRAP = 131, + UMAD_SM_LOCAL_CHANGES_TRAP = 144, + UMAD_SM_SYS_IMG_GUID_CHANGED_TRAP = 145, + UMAD_SM_BAD_MKEY_TRAP = 256, + UMAD_SM_BAD_PKEY_TRAP = 257, + UMAD_SM_BAD_QKEY_TRAP = 258, + UMAD_SM_BAD_SWITCH_PKEY_TRAP = 259 +}; + +enum { + UMAD_LEN_SMP_DATA = 64, + UMAD_SMP_MAX_HOPS = 64 +}; + +struct umad_smp { + uint8_t base_version; + uint8_t mgmt_class; + uint8_t class_version; + uint8_t method; + __be16 status; + uint8_t hop_ptr; + uint8_t hop_cnt; + __be64 tid; + __be16 attr_id; + __be16 resv; + __be32 attr_mod; + __be64 mkey; + __be16 dr_slid; + __be16 dr_dlid; + uint8_t reserved[28]; + uint8_t data[UMAD_LEN_SMP_DATA]; + uint8_t initial_path[UMAD_SMP_MAX_HOPS]; + uint8_t return_path[UMAD_SMP_MAX_HOPS]; +}; + +#ifdef __cplusplus +} +#endif +#endif /* _UMAD_SM_H */ diff --git a/libibumad/umad_str.c b/libibumad/umad_str.c new file mode 100644 index 0000000..4c001fd --- /dev/null +++ b/libibumad/umad_str.c @@ -0,0 +1,352 @@ +/* + * Copyright (c) 2004, 2005, 2010 Intel Corporation. All rights reserved. + * Copyright (c) 2013 Lawrence Livermore National Security. All rights reserved. + * Copyright (c) 2014 Mellanox Technologies LTD. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <endian.h> +#include <stdio.h> +#include <infiniband/umad.h> +#include <infiniband/umad_types.h> +#include <infiniband/umad_sm.h> +#include <infiniband/umad_sa.h> +#include <infiniband/umad_cm.h> +#include "umad_str.h" + +const char * umad_class_str(uint8_t mgmt_class) +{ + switch (mgmt_class) { + case UMAD_CLASS_SUBN_LID_ROUTED: + case UMAD_CLASS_SUBN_DIRECTED_ROUTE: + return("Subn"); + case UMAD_CLASS_SUBN_ADM: + return("SubnAdm"); + case UMAD_CLASS_PERF_MGMT: + return("Perf"); + case UMAD_CLASS_BM: + return("BM"); + case UMAD_CLASS_DEVICE_MGMT: + return("DevMgt"); + case UMAD_CLASS_CM: + return("ComMgt"); + case UMAD_CLASS_SNMP: + return("SNMP"); + case UMAD_CLASS_DEVICE_ADM: + return("DevAdm"); + case UMAD_CLASS_BOOT_MGMT: + return("BootMgt"); + case UMAD_CLASS_BIS: + return("BIS"); + case UMAD_CLASS_CONG_MGMT: + return("CongestionManagment"); + default: + break; + } + + if ((UMAD_CLASS_VENDOR_RANGE1_START <= mgmt_class + && mgmt_class <= UMAD_CLASS_VENDOR_RANGE1_END) + || (UMAD_CLASS_VENDOR_RANGE2_START <= mgmt_class + && mgmt_class <= UMAD_CLASS_VENDOR_RANGE2_END)) + return("Vendor"); + + if (UMAD_CLASS_APPLICATION_START <= mgmt_class + && mgmt_class <= UMAD_CLASS_APPLICATION_END) { + return("Application"); + } + return ("<unknown>"); +} + +static const char * umad_common_method_str(uint8_t method) +{ + switch(method) { + case UMAD_METHOD_GET: + return ("Get"); + case UMAD_METHOD_SET: + return ("Set"); + case UMAD_METHOD_GET_RESP: + return ("GetResp"); + case UMAD_METHOD_SEND: + return ("Send"); + case UMAD_METHOD_TRAP: + return ("Trap"); + case UMAD_METHOD_REPORT: + return ("Report"); + case UMAD_METHOD_REPORT_RESP: + return ("ReportResp"); + case UMAD_METHOD_TRAP_REPRESS: + return ("TrapRepress"); + default: + return ("<unknown"); + } +} + +static const char * umad_sa_method_str(uint8_t method) +{ + switch(method) { + case UMAD_SA_METHOD_GET_TABLE: + return ("GetTable"); + case UMAD_SA_METHOD_GET_TABLE_RESP: + return ("GetTableResp"); + case UMAD_SA_METHOD_DELETE: + return ("Delete"); + case UMAD_SA_METHOD_DELETE_RESP: + return ("DeleteResp"); + case UMAD_SA_METHOD_GET_MULTI: + return ("GetMulti"); + case UMAD_SA_METHOD_GET_MULTI_RESP: + return ("GetMultiResp"); + case UMAD_SA_METHOD_GET_TRACE_TABLE: + return ("GetTraceTable"); + default: + return (umad_common_method_str(method)); + } +} + +const char * umad_method_str(uint8_t mgmt_class, uint8_t method) +{ + if (mgmt_class == UMAD_CLASS_SUBN_ADM) + return(umad_sa_method_str(method)); + + return (umad_common_method_str(method)); +} + +const char * umad_common_mad_status_str(__be16 _status) +{ + uint16_t status = be16toh(_status); + + if (status & UMAD_STATUS_BUSY) + return ("Busy"); + + if (status & UMAD_STATUS_REDIRECT) + return ("Redirection required"); + + switch(status & UMAD_STATUS_INVALID_FIELD_MASK) { + case UMAD_STATUS_BAD_VERSION: + return ("Bad Version"); + case UMAD_STATUS_METHOD_NOT_SUPPORTED: + return ("Method not supported"); + case UMAD_STATUS_ATTR_NOT_SUPPORTED: + return ("Method/Attribute combo not 
supported"); + case UMAD_STATUS_INVALID_ATTR_VALUE: + return ("Invalid attribute/modifier field"); + } + return ("Success"); +} + +const char * umad_sa_mad_status_str(__be16 _status) +{ + uint16_t status = be16toh(_status); + switch((status & UMAD_STATUS_CLASS_MASK) >> 8) { + case UMAD_SA_STATUS_SUCCESS: + return ("Success"); + case UMAD_SA_STATUS_NO_RESOURCES: + return ("No Resources"); + case UMAD_SA_STATUS_REQ_INVALID: + return ("Request Invalid"); + case UMAD_SA_STATUS_NO_RECORDS: + return ("No Records"); + case UMAD_SA_STATUS_TOO_MANY_RECORDS: + return ("Too Many Records"); + case UMAD_SA_STATUS_INVALID_GID: + return ("Invalid GID"); + case UMAD_SA_STATUS_INSUF_COMPS: + return ("Insufficient Components"); + case UMAD_SA_STATUS_REQ_DENIED: + return ("Request Denied"); + case UMAD_SA_STATUS_PRI_SUGGESTED: + return ("Priority Suggested"); + } + return ("Undefined Error"); +} + +static const char *umad_common_attr_str(__be16 attr_id) +{ + switch(be16toh(attr_id)) { + case UMAD_ATTR_CLASS_PORT_INFO: + return "Class Port Info"; + case UMAD_ATTR_NOTICE: + return "Notice"; + case UMAD_ATTR_INFORM_INFO: + return "Inform Info"; + default: + return "<unknown>"; + } +} + +static const char * umad_sm_attr_str(__be16 attr_id) +{ + switch(be16toh(attr_id)) { + case UMAD_SM_ATTR_NODE_DESC: + return ("NodeDescription"); + case UMAD_SM_ATTR_NODE_INFO: + return ("NodeInfo"); + case UMAD_SM_ATTR_SWITCH_INFO: + return ("SwitchInfo"); + case UMAD_SM_ATTR_GUID_INFO: + return ("GUIDInfo"); + case UMAD_SM_ATTR_PORT_INFO: + return ("PortInfo"); + case UMAD_SM_ATTR_PKEY_TABLE: + return ("P_KeyTable"); + case UMAD_SM_ATTR_SLVL_TABLE: + return ("SLtoVLMappingTable"); + case UMAD_SM_ATTR_VL_ARB_TABLE: + return ("VLArbitrationTable"); + case UMAD_SM_ATTR_LINEAR_FT: + return ("LinearForwardingTable"); + case UMAD_SM_ATTR_RANDOM_FT: + return ("RandomForwardingTable"); + case UMAD_SM_ATTR_MCAST_FT: + return ("MulticastForwardingTable"); + case UMAD_SM_ATTR_SM_INFO: + return ("SMInfo"); + case UMAD_SM_ATTR_VENDOR_DIAG: + return ("VendorDiag"); + case UMAD_SM_ATTR_LED_INFO: + return ("LedInfo"); + case UMAD_SM_ATTR_LINK_SPD_WIDTH_TABLE: + return ("LinkSpeedWidthPairsTable"); + case UMAD_SM_ATTR_VENDOR_MADS_TABLE: + return ("VendorSpecificMadsTable"); + case UMAD_SM_ATTR_HIERARCHY_INFO: + return ("HierarchyInfo"); + case UMAD_SM_ATTR_CABLE_INFO: + return ("CableInfo"); + case UMAD_SM_ATTR_PORT_INFO_EXT: + return ("PortInfoExtended"); + default: + return (umad_common_attr_str(attr_id)); + } +} + +static const char * umad_sa_attr_str(__be16 attr_id) +{ + switch(be16toh(attr_id)) { + case UMAD_SA_ATTR_NODE_REC: + return ("NodeRecord"); + case UMAD_SA_ATTR_PORT_INFO_REC: + return ("PortInfoRecord"); + case UMAD_SA_ATTR_SLVL_REC: + return ("SLtoVLMappingTableRecord"); + case UMAD_SA_ATTR_SWITCH_INFO_REC: + return ("SwitchInfoRecord"); + case UMAD_SA_ATTR_LINEAR_FT_REC: + return ("LinearForwardingTableRecord"); + case UMAD_SA_ATTR_RANDOM_FT_REC: + return ("RandomForwardingTableRecord"); + case UMAD_SA_ATTR_MCAST_FT_REC: + return ("MulticastForwardingTableRecord"); + case UMAD_SA_ATTR_SM_INFO_REC: + return ("SMInfoRecord"); + case UMAD_SA_ATTR_INFORM_INFO_REC: + return ("InformInfoRecord"); + case UMAD_SA_ATTR_LINK_REC: + return ("LinkRecord"); + case UMAD_SA_ATTR_GUID_INFO_REC: + return ("GuidInfoRecord"); + case UMAD_SA_ATTR_SERVICE_REC: + return ("ServiceRecord"); + case UMAD_SA_ATTR_PKEY_TABLE_REC: + return ("P_KeyTableRecord"); + case UMAD_SA_ATTR_PATH_REC: + return ("PathRecord"); + case UMAD_SA_ATTR_VL_ARB_REC: + return 
("VLArbitrationTableRecord"); + case UMAD_SA_ATTR_MCMEMBER_REC: + return ("MCMemberRecord"); + case UMAD_SA_ATTR_TRACE_REC: + return ("TraceRecord"); + case UMAD_SA_ATTR_MULTI_PATH_REC: + return ("MultiPathRecord"); + case UMAD_SA_ATTR_SERVICE_ASSOC_REC: + return ("ServiceAssociationRecord"); + case UMAD_SA_ATTR_LINK_SPD_WIDTH_TABLE_REC: + return ("LinkSpeedWidthPairsTableRecord"); + case UMAD_SA_ATTR_HIERARCHY_INFO_REC: + return ("HierarchyInfoRecord"); + case UMAD_SA_ATTR_CABLE_INFO_REC: + return ("CableInfoRecord"); + case UMAD_SA_ATTR_PORT_INFO_EXT_REC: + return ("PortInfoExtendedRecord"); + default: + return (umad_common_attr_str(attr_id)); + } +} + +static const char * umad_cm_attr_str(__be16 attr_id) +{ + switch(be16toh(attr_id)) { + case UMAD_CM_ATTR_REQ: + return "ConnectRequest"; + case UMAD_CM_ATTR_MRA: + return "MsgRcptAck"; + case UMAD_CM_ATTR_REJ: + return "ConnectReject"; + case UMAD_CM_ATTR_REP: + return "ConnectReply"; + case UMAD_CM_ATTR_RTU: + return "ReadyToUse"; + case UMAD_CM_ATTR_DREQ: + return "DisconnectRequest"; + case UMAD_CM_ATTR_DREP: + return "DisconnectReply"; + case UMAD_CM_ATTR_SIDR_REQ: + return "ServiceIDResReq"; + case UMAD_CM_ATTR_SIDR_REP: + return "ServiceIDResReqResp"; + case UMAD_CM_ATTR_LAP: + return "LoadAlternatePath"; + case UMAD_CM_ATTR_APR: + return "AlternatePathResponse"; + case UMAD_CM_ATTR_SAP: + return "SuggestAlternatePath"; + case UMAD_CM_ATTR_SPR: + return "SuggestPathResponse"; + default: + return (umad_common_attr_str(attr_id)); + } +} + +const char * umad_attribute_str(uint8_t mgmt_class, __be16 attr_id) +{ + switch (mgmt_class) { + case UMAD_CLASS_SUBN_LID_ROUTED: + case UMAD_CLASS_SUBN_DIRECTED_ROUTE: + return(umad_sm_attr_str(attr_id)); + case UMAD_CLASS_SUBN_ADM: + return(umad_sa_attr_str(attr_id)); + case UMAD_CLASS_CM: + return(umad_cm_attr_str(attr_id)); + } + + return (umad_common_attr_str(attr_id)); +} diff --git a/libibumad/umad_str.h b/libibumad/umad_str.h new file mode 100644 index 0000000..ddd6867 --- /dev/null +++ b/libibumad/umad_str.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2004, 2005, 2010 Intel Corporation. All rights reserved. + * Copyright (c) 2013 Lawrence Livermore National Security. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#ifndef _UMAD_STR_H +#define _UMAD_STR_H + +#include <infiniband/umad.h> + +#ifdef __cplusplus +extern "C" { +#endif + +const char * umad_class_str(uint8_t mgmt_class); +const char * umad_method_str(uint8_t mgmt_class, uint8_t method); +const char * umad_attribute_str(uint8_t mgmt_class, __be16 attr_id); + +const char * umad_common_mad_status_str(__be16 status); +const char * umad_sa_mad_status_str(__be16 status); + +#ifdef __cplusplus +} +#endif +#endif /* _UMAD_STR_H */ diff --git a/libibumad/umad_types.h b/libibumad/umad_types.h new file mode 100644 index 0000000..bb0d57f --- /dev/null +++ b/libibumad/umad_types.h @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004, 2010 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004-2006 Voltaire Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#ifndef _UMAD_TYPES_H +#define _UMAD_TYPES_H + +#include <stdint.h> +#include <infiniband/umad.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define UMAD_BASE_VERSION 1 +#define UMAD_QKEY 0x80010000 + +/* Management classes */ +enum { + UMAD_CLASS_SUBN_LID_ROUTED = 0x01, + UMAD_CLASS_SUBN_DIRECTED_ROUTE = 0x81, + UMAD_CLASS_SUBN_ADM = 0x03, + UMAD_CLASS_PERF_MGMT = 0x04, + UMAD_CLASS_BM = 0x05, + UMAD_CLASS_DEVICE_MGMT = 0x06, + UMAD_CLASS_CM = 0x07, + UMAD_CLASS_SNMP = 0x08, + UMAD_CLASS_VENDOR_RANGE1_START = 0x09, + UMAD_CLASS_VENDOR_RANGE1_END = 0x0F, + UMAD_CLASS_APPLICATION_START = 0x10, + UMAD_CLASS_DEVICE_ADM = UMAD_CLASS_APPLICATION_START, + UMAD_CLASS_BOOT_MGMT = 0x11, + UMAD_CLASS_BIS = 0x12, + UMAD_CLASS_CONG_MGMT = 0x21, + UMAD_CLASS_APPLICATION_END = 0x2F, + UMAD_CLASS_VENDOR_RANGE2_START = 0x30, + UMAD_CLASS_VENDOR_RANGE2_END = 0x4F +}; + +/* Management methods */ +enum { + UMAD_METHOD_GET = 0x01, + UMAD_METHOD_SET = 0x02, + UMAD_METHOD_GET_RESP = 0x81, + UMAD_METHOD_SEND = 0x03, + UMAD_METHOD_TRAP = 0x05, + UMAD_METHOD_REPORT = 0x06, + UMAD_METHOD_REPORT_RESP = 0x86, + UMAD_METHOD_TRAP_REPRESS = 0x07, + UMAD_METHOD_RESP_MASK = 0x80 +}; + +enum { + UMAD_STATUS_SUCCESS = 0x0000, + UMAD_STATUS_BUSY = 0x0001, + UMAD_STATUS_REDIRECT = 0x0002, + + /* Invalid fields, bits 2-4 */ + UMAD_STATUS_BAD_VERSION = (1 << 2), + UMAD_STATUS_METHOD_NOT_SUPPORTED = (2 << 2), + UMAD_STATUS_ATTR_NOT_SUPPORTED = (3 << 2), + UMAD_STATUS_INVALID_ATTR_VALUE = (7 << 2), + + UMAD_STATUS_INVALID_FIELD_MASK = 0x001C, + UMAD_STATUS_CLASS_MASK = 0xFF00 +}; + +/* Attributes common to multiple classes */ +enum { + UMAD_ATTR_CLASS_PORT_INFO = 0x0001, + UMAD_ATTR_NOTICE = 0x0002, + UMAD_ATTR_INFORM_INFO = 0x0003 +}; + +/* RMPP information */ +#define UMAD_RMPP_VERSION 1 +enum { + UMAD_RMPP_FLAG_ACTIVE = 1, +}; + +enum { + UMAD_LEN_DATA = 232, + UMAD_LEN_RMPP_DATA = 220, + UMAD_LEN_DM_DATA = 192, + UMAD_LEN_VENDOR_DATA = 216, +}; + +struct umad_hdr { + uint8_t base_version; + uint8_t mgmt_class; + uint8_t class_version; + uint8_t method; + __be16 status; + __be16 class_specific; + __be64 tid; + __be16 attr_id; + __be16 resv; + __be32 attr_mod; +}; + +struct umad_rmpp_hdr { + uint8_t rmpp_version; + uint8_t rmpp_type; + uint8_t rmpp_rtime_flags; + uint8_t rmpp_status; + __be32 seg_num; + __be32 paylen_newwin; +}; + +struct umad_packet { + struct umad_hdr mad_hdr; + uint8_t data[UMAD_LEN_DATA]; /* network-byte order */ +}; + +struct umad_rmpp_packet { + struct umad_hdr mad_hdr; + struct umad_rmpp_hdr rmpp_hdr; + uint8_t data[UMAD_LEN_RMPP_DATA]; /* network-byte order */ +}; + +struct umad_dm_packet { + struct umad_hdr mad_hdr; + uint8_t reserved[40]; + uint8_t data[UMAD_LEN_DM_DATA]; /* network-byte order */ +}; + +struct umad_vendor_packet { + struct umad_hdr mad_hdr; + struct umad_rmpp_hdr rmpp_hdr; + uint8_t reserved; + uint8_t oui[3]; /* network-byte order */ + uint8_t data[UMAD_LEN_VENDOR_DATA]; /* network-byte order */ +}; + +enum { + UMAD_OPENIB_OUI = 0x001405 +}; + +enum { + UMAD_CLASS_RESP_TIME_MASK = 0x1F +}; +struct umad_class_port_info { + uint8_t base_ver; + uint8_t class_ver; + __be16 cap_mask; + __be32 cap_mask2_resp_time; + union { + uint8_t redir_gid[16] __attribute__((deprecated)); /* network byte order */ + union umad_gid redirgid; + }; + __be32 redir_tc_sl_fl; + __be16 redir_lid; + __be16 redir_pkey; + __be32 redir_qp; + __be32 redir_qkey; + union { + uint8_t trap_gid[16] __attribute__((deprecated)); /* network byte order */ + union umad_gid trapgid; + }; + __be32 trap_tc_sl_fl; 
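+	/* the trap_* fields mirror the redir_* block above: they describe
+	 * where Trap() MADs for this management class are delivered */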
+ __be16 trap_lid; + __be16 trap_pkey; + __be32 trap_hl_qp; + __be32 trap_qkey; +}; +static inline uint32_t +umad_class_cap_mask2(struct umad_class_port_info *cpi) +{ + return (be32toh(cpi->cap_mask2_resp_time) >> 5); +} +static inline uint8_t +umad_class_resp_time(struct umad_class_port_info *cpi) +{ + return (uint8_t)(be32toh(cpi->cap_mask2_resp_time) + & UMAD_CLASS_RESP_TIME_MASK); +} + +#ifdef __cplusplus +} +#endif +#endif /* _UMAD_TYPES_H */ diff --git a/libibverbs/CMakeLists.txt b/libibverbs/CMakeLists.txt new file mode 100644 index 0000000..4328548 --- /dev/null +++ b/libibverbs/CMakeLists.txt @@ -0,0 +1,88 @@ +publish_headers(infiniband + arch.h + opcode.h + sa-kern-abi.h + sa.h + verbs.h + verbs_api.h + tm_types.h + ) + +publish_internal_headers(infiniband + cmd_ioctl.h + cmd_write.h + driver.h + kern-abi.h + marshall.h + ) + +configure_file("libibverbs.map.in" + "${CMAKE_CURRENT_BINARY_DIR}/libibverbs.map" @ONLY) + +rdma_library(ibverbs "${CMAKE_CURRENT_BINARY_DIR}/libibverbs.map" + # See Documentation/versioning.md + 1 1.8.${PACKAGE_VERSION} + all_providers.c + cmd.c + cmd_ah.c + cmd_counters.c + cmd_cq.c + cmd_device.c + cmd_dm.c + cmd_fallback.c + cmd_flow.c + cmd_flow_action.c + cmd_ioctl.c + cmd_mr.c + cmd_mw.c + cmd_pd.c + cmd_rwq_ind.c + cmd_xrcd.c + compat-1_0.c + device.c + dummy_ops.c + dynamic_driver.c + enum_strs.c + ibdev_nl.c + init.c + marshall.c + memory.c + neigh.c + static_driver.c + sysfs.c + verbs.c + ) +target_link_libraries(ibverbs LINK_PRIVATE + ${NL_LIBRARIES} + ${CMAKE_THREAD_LIBS_INIT} + ${CMAKE_DL_LIBS} + kern-abi + ) + +function(ibverbs_finalize) + if (ENABLE_STATIC) + # In static mode the .pc file lists all of the providers for static + # linking. The user should set RDMA_STATIC_PROVIDERS to select which ones + # to include. + list(LENGTH RDMA_PROVIDER_LIST LEN) + math(EXPR LEN ${LEN}-1) + foreach(I RANGE 0 ${LEN} 2) + list(GET RDMA_PROVIDER_LIST ${I} PROVIDER_NAME) + math(EXPR I ${I}+1) + list(GET RDMA_PROVIDER_LIST ${I} LIB_NAME) + math(EXPR I ${I}+1) + + set(PROVIDER_LIBS "${PROVIDER_LIBS} -l${LIB_NAME}") + set(FOR_EACH_PROVIDER "${FOR_EACH_PROVIDER} FOR_PROVIDER(${PROVIDER_NAME})") + endforeach() + + if (NOT NL_KIND EQUAL 0) + set(REQUIRES "libnl-3.0, libnl-route-3.0") + endif() + rdma_pkg_config("ibverbs" "${REQUIRES}" "${PROVIDER_LIBS} -libverbs ${CMAKE_THREAD_LIBS_INIT}") + + file(WRITE ${BUILD_INCLUDE}/infiniband/all_providers.h "#define FOR_EACH_PROVIDER() ${FOR_EACH_PROVIDER}") + else() + rdma_pkg_config("ibverbs" "" "${CMAKE_THREAD_LIBS_INIT}") + endif() +endfunction() diff --git a/libibverbs/all_providers.c b/libibverbs/all_providers.c new file mode 100644 index 0000000..78adac9 --- /dev/null +++ b/libibverbs/all_providers.c @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2018 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef _STATIC_LIBRARY_BUILD_ +#define RDMA_STATIC_PROVIDERS none +#include <infiniband/verbs.h> +#include <infiniband/driver.h> +#include <infiniband/all_providers.h> + +/* When static linking this object will be included in the final link only if + * something refers to the 'verbs_provider_all' symbol. It in turn brings all + * the providers into the link as well. Otherwise the static linker will not + * include this. It is important this is the only thing in this file. + */ +#define FOR_PROVIDER(x) &verbs_provider_ ## x, +static const struct verbs_device_ops *all_providers[] = { + FOR_EACH_PROVIDER() + NULL +}; + +const struct verbs_device_ops verbs_provider_all = { + .static_providers = all_providers, +}; + +#endif diff --git a/libibverbs/arch.h b/libibverbs/arch.h new file mode 100644 index 0000000..bcbece8 --- /dev/null +++ b/libibverbs/arch.h @@ -0,0 +1,51 @@ +/* + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef INFINIBAND_ARCH_H +#define INFINIBAND_ARCH_H + +#include <stdint.h> +#include <endian.h> + +#warning "This header is obsolete." 
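+
+/*
+ * Illustrative migration sketch (not part of this header): callers that
+ * still rely on the wrappers below can call the <endian.h> conversion
+ * functions directly, e.g.
+ *
+ *	uint64_t wire = htobe64(host_val);	replaces htonll(host_val)
+ *	uint64_t host = be64toh(wire);		replaces ntohll(wire)
+ */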
+ +#ifndef ntohll +#undef htonll +#undef ntohll +/* Users should use the glibc functions directly, not these wrappers */ +static inline __attribute__((deprecated)) uint64_t htonll(uint64_t x) { return htobe64(x); } +static inline __attribute__((deprecated)) uint64_t ntohll(uint64_t x) { return be64toh(x); } +#define htonll htonll +#define ntohll ntohll +#endif + +/* Barrier macros are no longer provided by libibverbs */ + +#endif /* INFINIBAND_ARCH_H */ diff --git a/libibverbs/cmd.c b/libibverbs/cmd.c new file mode 100644 index 0000000..728d884 --- /dev/null +++ b/libibverbs/cmd.c @@ -0,0 +1,1914 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <config.h> + +#include <stdio.h> +#include <unistd.h> +#include <stdlib.h> +#include <errno.h> +#include <alloca.h> +#include <string.h> + +#include <infiniband/cmd_write.h> +#include "ibverbs.h" +#include <ccan/minmax.h> + +bool verbs_allow_disassociate_destroy; + +static void copy_query_dev_fields(struct ibv_device_attr *device_attr, + struct ib_uverbs_query_device_resp *resp, + uint64_t *raw_fw_ver) +{ + *raw_fw_ver = resp->fw_ver; + device_attr->node_guid = resp->node_guid; + device_attr->sys_image_guid = resp->sys_image_guid; + device_attr->max_mr_size = resp->max_mr_size; + device_attr->page_size_cap = resp->page_size_cap; + device_attr->vendor_id = resp->vendor_id; + device_attr->vendor_part_id = resp->vendor_part_id; + device_attr->hw_ver = resp->hw_ver; + device_attr->max_qp = resp->max_qp; + device_attr->max_qp_wr = resp->max_qp_wr; + device_attr->device_cap_flags = resp->device_cap_flags; + device_attr->max_sge = resp->max_sge; + device_attr->max_sge_rd = resp->max_sge_rd; + device_attr->max_cq = resp->max_cq; + device_attr->max_cqe = resp->max_cqe; + device_attr->max_mr = resp->max_mr; + device_attr->max_pd = resp->max_pd; + device_attr->max_qp_rd_atom = resp->max_qp_rd_atom; + device_attr->max_ee_rd_atom = resp->max_ee_rd_atom; + device_attr->max_res_rd_atom = resp->max_res_rd_atom; + device_attr->max_qp_init_rd_atom = resp->max_qp_init_rd_atom; + device_attr->max_ee_init_rd_atom = resp->max_ee_init_rd_atom; + device_attr->atomic_cap = resp->atomic_cap; + device_attr->max_ee = resp->max_ee; + device_attr->max_rdd = resp->max_rdd; + device_attr->max_mw = resp->max_mw; + device_attr->max_raw_ipv6_qp = resp->max_raw_ipv6_qp; + device_attr->max_raw_ethy_qp = resp->max_raw_ethy_qp; + device_attr->max_mcast_grp = resp->max_mcast_grp; + device_attr->max_mcast_qp_attach = resp->max_mcast_qp_attach; + device_attr->max_total_mcast_qp_attach = resp->max_total_mcast_qp_attach; + device_attr->max_ah = resp->max_ah; + device_attr->max_fmr = resp->max_fmr; + device_attr->max_map_per_fmr = resp->max_map_per_fmr; + device_attr->max_srq = resp->max_srq; + device_attr->max_srq_wr = resp->max_srq_wr; + device_attr->max_srq_sge = resp->max_srq_sge; + device_attr->max_pkeys = resp->max_pkeys; + device_attr->local_ca_ack_delay = resp->local_ca_ack_delay; + device_attr->phys_port_cnt = resp->phys_port_cnt; +} + +int ibv_cmd_query_device(struct ibv_context *context, + struct ibv_device_attr *device_attr, + uint64_t *raw_fw_ver, + struct ibv_query_device *cmd, size_t cmd_size) +{ + struct ib_uverbs_query_device_resp resp; + int ret; + + ret = execute_cmd_write(context, IB_USER_VERBS_CMD_QUERY_DEVICE, cmd, + cmd_size, &resp, sizeof(resp)); + if (ret) + return ret; + + memset(device_attr->fw_ver, 0, sizeof device_attr->fw_ver); + copy_query_dev_fields(device_attr, &resp, raw_fw_ver); + + return 0; +} + +int ibv_cmd_query_device_ex(struct ibv_context *context, + const struct ibv_query_device_ex_input *input, + struct ibv_device_attr_ex *attr, size_t attr_size, + uint64_t *raw_fw_ver, + struct ibv_query_device_ex *cmd, + size_t cmd_size, + struct ib_uverbs_ex_query_device_resp *resp, + size_t resp_size) +{ + int err; + + if (input && input->comp_mask) + return EINVAL; + + if (attr_size < offsetof(struct ibv_device_attr_ex, comp_mask) + + sizeof(attr->comp_mask)) + return EINVAL; + + cmd->comp_mask = 0; + cmd->reserved = 0; + memset(attr->orig_attr.fw_ver, 0, sizeof(attr->orig_attr.fw_ver)); + memset(&attr->comp_mask, 0, attr_size - sizeof(attr->orig_attr)); + + err = 
execute_cmd_write_ex(context, IB_USER_VERBS_EX_CMD_QUERY_DEVICE, + cmd, cmd_size, resp, resp_size); + if (err) + return err; + + copy_query_dev_fields(&attr->orig_attr, &resp->base, raw_fw_ver); + /* Report back supported comp_mask bits. For now no comp_mask bit is + * defined */ + attr->comp_mask = resp->comp_mask & 0; + if (attr_size >= offsetof(struct ibv_device_attr_ex, odp_caps) + + sizeof(attr->odp_caps)) { + if (resp->response_length >= + offsetof(struct ib_uverbs_ex_query_device_resp, odp_caps) + + sizeof(resp->odp_caps)) { + attr->odp_caps.general_caps = resp->odp_caps.general_caps; + attr->odp_caps.per_transport_caps.rc_odp_caps = + resp->odp_caps.per_transport_caps.rc_odp_caps; + attr->odp_caps.per_transport_caps.uc_odp_caps = + resp->odp_caps.per_transport_caps.uc_odp_caps; + attr->odp_caps.per_transport_caps.ud_odp_caps = + resp->odp_caps.per_transport_caps.ud_odp_caps; + } + } + + if (attr_size >= offsetof(struct ibv_device_attr_ex, + completion_timestamp_mask) + + sizeof(attr->completion_timestamp_mask)) { + if (resp->response_length >= + offsetof(struct ib_uverbs_ex_query_device_resp, timestamp_mask) + + sizeof(resp->timestamp_mask)) + attr->completion_timestamp_mask = resp->timestamp_mask; + } + + if (attr_size >= offsetof(struct ibv_device_attr_ex, hca_core_clock) + + sizeof(attr->hca_core_clock)) { + if (resp->response_length >= + offsetof(struct ib_uverbs_ex_query_device_resp, hca_core_clock) + + sizeof(resp->hca_core_clock)) + attr->hca_core_clock = resp->hca_core_clock; + } + + if (attr_size >= offsetof(struct ibv_device_attr_ex, device_cap_flags_ex) + + sizeof(attr->device_cap_flags_ex)) { + if (resp->response_length >= + offsetof(struct ib_uverbs_ex_query_device_resp, device_cap_flags_ex) + + sizeof(resp->device_cap_flags_ex)) + attr->device_cap_flags_ex = resp->device_cap_flags_ex; + } + + if (attr_size >= offsetof(struct ibv_device_attr_ex, rss_caps) + + sizeof(attr->rss_caps)) { + if (resp->response_length >= + offsetof(struct ib_uverbs_ex_query_device_resp, rss_caps) + + sizeof(resp->rss_caps)) { + attr->rss_caps.supported_qpts = resp->rss_caps.supported_qpts; + attr->rss_caps.max_rwq_indirection_tables = resp->rss_caps.max_rwq_indirection_tables; + attr->rss_caps.max_rwq_indirection_table_size = resp->rss_caps.max_rwq_indirection_table_size; + } + } + + if (attr_size >= offsetof(struct ibv_device_attr_ex, max_wq_type_rq) + + sizeof(attr->max_wq_type_rq)) { + if (resp->response_length >= + offsetof(struct ib_uverbs_ex_query_device_resp, max_wq_type_rq) + + sizeof(resp->max_wq_type_rq)) + attr->max_wq_type_rq = resp->max_wq_type_rq; + } + + if (attr_size >= offsetof(struct ibv_device_attr_ex, raw_packet_caps) + + sizeof(attr->raw_packet_caps)) { + if (resp->response_length >= + offsetof(struct ib_uverbs_ex_query_device_resp, raw_packet_caps) + + sizeof(resp->raw_packet_caps)) + attr->raw_packet_caps = resp->raw_packet_caps; + } + + if (attr_size >= offsetof(struct ibv_device_attr_ex, tm_caps) + + sizeof(attr->tm_caps)) { + if (resp->response_length >= + offsetof(struct ib_uverbs_ex_query_device_resp, tm_caps) + + sizeof(resp->tm_caps)) { + attr->tm_caps.max_rndv_hdr_size = + resp->tm_caps.max_rndv_hdr_size; + attr->tm_caps.max_num_tags = + resp->tm_caps.max_num_tags; + attr->tm_caps.flags = resp->tm_caps.flags; + attr->tm_caps.max_ops = + resp->tm_caps.max_ops; + attr->tm_caps.max_sge = + resp->tm_caps.max_sge; + } + } + + if (attr_size >= offsetof(struct ibv_device_attr_ex, cq_mod_caps) + + sizeof(attr->cq_mod_caps)) { + if (resp->response_length >= + 
offsetof(struct ib_uverbs_ex_query_device_resp, cq_moderation_caps) + + sizeof(resp->cq_moderation_caps)) { + attr->cq_mod_caps.max_cq_count = resp->cq_moderation_caps.max_cq_moderation_count; + attr->cq_mod_caps.max_cq_period = resp->cq_moderation_caps.max_cq_moderation_period; + } + } + + if (attr_size >= offsetof(struct ibv_device_attr_ex, max_dm_size) + + sizeof(attr->max_dm_size)) { + if (resp->response_length >= + offsetof(struct ib_uverbs_ex_query_device_resp, max_dm_size) + + sizeof(resp->max_dm_size)) { + attr->max_dm_size = resp->max_dm_size; + } + } + + if (attr_size >= offsetof(struct ibv_device_attr_ex, xrc_odp_caps) + + sizeof(attr->xrc_odp_caps)) { + if (resp->response_length >= + offsetof(struct ib_uverbs_ex_query_device_resp, xrc_odp_caps) + + sizeof(resp->xrc_odp_caps)) { + attr->xrc_odp_caps = resp->xrc_odp_caps; + } + } + + return 0; +} + +int ibv_cmd_alloc_pd(struct ibv_context *context, struct ibv_pd *pd, + struct ibv_alloc_pd *cmd, size_t cmd_size, + struct ib_uverbs_alloc_pd_resp *resp, size_t resp_size) +{ + int ret; + + ret = execute_cmd_write(context, IB_USER_VERBS_CMD_ALLOC_PD, cmd, + cmd_size, resp, resp_size); + if (ret) + return ret; + + pd->handle = resp->pd_handle; + pd->context = context; + + return 0; +} + +int ibv_cmd_open_xrcd(struct ibv_context *context, struct verbs_xrcd *xrcd, + int vxrcd_size, + struct ibv_xrcd_init_attr *attr, + struct ibv_open_xrcd *cmd, size_t cmd_size, + struct ib_uverbs_open_xrcd_resp *resp, size_t resp_size) +{ + int ret; + + if (attr->comp_mask >= IBV_XRCD_INIT_ATTR_RESERVED) + return EOPNOTSUPP; + + if (!(attr->comp_mask & IBV_XRCD_INIT_ATTR_FD) || + !(attr->comp_mask & IBV_XRCD_INIT_ATTR_OFLAGS)) + return EINVAL; + + cmd->fd = attr->fd; + cmd->oflags = attr->oflags; + ret = execute_cmd_write(context, IB_USER_VERBS_CMD_OPEN_XRCD, cmd, + cmd_size, resp, resp_size); + if (ret) + return ret; + + xrcd->xrcd.context = context; + xrcd->comp_mask = 0; + if (vext_field_avail(struct verbs_xrcd, handle, vxrcd_size)) { + xrcd->comp_mask = VERBS_XRCD_HANDLE; + xrcd->handle = resp->xrcd_handle; + } + + return 0; +} + +int ibv_cmd_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + uint64_t hca_va, int access, + struct verbs_mr *vmr, struct ibv_reg_mr *cmd, + size_t cmd_size, + struct ib_uverbs_reg_mr_resp *resp, size_t resp_size) +{ + int ret; + + cmd->start = (uintptr_t) addr; + cmd->length = length; + /* On demand access and entire address space means implicit. + * In that case set the value in the command to what kernel expects. 
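+	 * A length of SIZE_MAX with a NULL addr selects the entire address
+	 * space, which the kernel ABI expresses as UINT64_MAX.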
+ */ + if (access & IBV_ACCESS_ON_DEMAND) { + if (length == SIZE_MAX && addr) { + errno = EINVAL; + return EINVAL; + } + if (length == SIZE_MAX) + cmd->length = UINT64_MAX; + } + + cmd->hca_va = hca_va; + cmd->pd_handle = pd->handle; + cmd->access_flags = access; + + ret = execute_cmd_write(pd->context, IB_USER_VERBS_CMD_REG_MR, cmd, + cmd_size, resp, resp_size); + if (ret) + return ret; + + vmr->ibv_mr.handle = resp->mr_handle; + vmr->ibv_mr.lkey = resp->lkey; + vmr->ibv_mr.rkey = resp->rkey; + vmr->ibv_mr.context = pd->context; + vmr->mr_type = IBV_MR_TYPE_MR; + + return 0; +} + +int ibv_cmd_rereg_mr(struct verbs_mr *vmr, uint32_t flags, void *addr, + size_t length, uint64_t hca_va, int access, + struct ibv_pd *pd, struct ibv_rereg_mr *cmd, + size_t cmd_sz, struct ib_uverbs_rereg_mr_resp *resp, + size_t resp_sz) +{ + int ret; + + cmd->mr_handle = vmr->ibv_mr.handle; + cmd->flags = flags; + cmd->start = (uintptr_t)addr; + cmd->length = length; + cmd->hca_va = hca_va; + cmd->pd_handle = (flags & IBV_REREG_MR_CHANGE_PD) ? pd->handle : 0; + cmd->access_flags = access; + + ret = execute_cmd_write(vmr->ibv_mr.context, IB_USER_VERBS_CMD_REREG_MR, + cmd, cmd_sz, resp, resp_sz); + if (ret) + return ret; + + vmr->ibv_mr.lkey = resp->lkey; + vmr->ibv_mr.rkey = resp->rkey; + if (flags & IBV_REREG_MR_CHANGE_PD) + vmr->ibv_mr.context = pd->context; + + return 0; +} + +int ibv_cmd_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type, + struct ibv_mw *mw, struct ibv_alloc_mw *cmd, + size_t cmd_size, + struct ib_uverbs_alloc_mw_resp *resp, size_t resp_size) +{ + int ret; + + cmd->pd_handle = pd->handle; + cmd->mw_type = type; + memset(cmd->reserved, 0, sizeof(cmd->reserved)); + + ret = execute_cmd_write(pd->context, IB_USER_VERBS_CMD_ALLOC_MW, cmd, + cmd_size, resp, resp_size); + if (ret) + return ret; + + mw->context = pd->context; + mw->pd = pd; + mw->rkey = resp->rkey; + mw->handle = resp->mw_handle; + mw->type = type; + + return 0; +} + +int ibv_cmd_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc) +{ + struct ibv_poll_cq cmd; + struct ib_uverbs_poll_cq_resp *resp; + int i; + int rsize; + int ret; + + rsize = sizeof *resp + ne * sizeof(struct ib_uverbs_wc); + resp = malloc(rsize); + if (!resp) + return -1; + + cmd.cq_handle = ibcq->handle; + cmd.ne = ne; + + ret = execute_cmd_write_no_uhw(ibcq->context, IB_USER_VERBS_CMD_POLL_CQ, + &cmd, sizeof(cmd), resp, rsize); + if (ret) { + ret = -1; + goto out; + } + + for (i = 0; i < resp->count; i++) { + wc[i].wr_id = resp->wc[i].wr_id; + wc[i].status = resp->wc[i].status; + wc[i].opcode = resp->wc[i].opcode; + wc[i].vendor_err = resp->wc[i].vendor_err; + wc[i].byte_len = resp->wc[i].byte_len; + wc[i].imm_data = resp->wc[i].ex.imm_data; + wc[i].qp_num = resp->wc[i].qp_num; + wc[i].src_qp = resp->wc[i].src_qp; + wc[i].wc_flags = resp->wc[i].wc_flags; + wc[i].pkey_index = resp->wc[i].pkey_index; + wc[i].slid = resp->wc[i].slid; + wc[i].sl = resp->wc[i].sl; + wc[i].dlid_path_bits = resp->wc[i].dlid_path_bits; + } + + ret = resp->count; + +out: + free(resp); + return ret; +} + +int ibv_cmd_req_notify_cq(struct ibv_cq *ibcq, int solicited_only) +{ + struct ibv_req_notify_cq req; + + req.core_payload = (struct ib_uverbs_req_notify_cq){ + .cq_handle = ibcq->handle, + .solicited_only = !!solicited_only, + }; + return execute_cmd_write_req(ibcq->context, + IB_USER_VERBS_CMD_REQ_NOTIFY_CQ, &req, + sizeof(req)); +} + +int ibv_cmd_resize_cq(struct ibv_cq *cq, int cqe, + struct ibv_resize_cq *cmd, size_t cmd_size, + struct ib_uverbs_resize_cq_resp *resp, size_t 
resp_size)
+{
+	int ret;
+
+	cmd->cq_handle = cq->handle;
+	cmd->cqe = cqe;
+
+	ret = execute_cmd_write(cq->context, IB_USER_VERBS_CMD_RESIZE_CQ, cmd,
+				cmd_size, resp, resp_size);
+	if (ret)
+		return ret;
+
+	cq->cqe = resp->cqe;
+
+	return 0;
+}
+
+int ibv_cmd_create_srq(struct ibv_pd *pd,
+		       struct ibv_srq *srq, struct ibv_srq_init_attr *attr,
+		       struct ibv_create_srq *cmd, size_t cmd_size,
+		       struct ib_uverbs_create_srq_resp *resp, size_t resp_size)
+{
+	int ret;
+
+	cmd->user_handle = (uintptr_t) srq;
+	cmd->pd_handle = pd->handle;
+	cmd->max_wr = attr->attr.max_wr;
+	cmd->max_sge = attr->attr.max_sge;
+	cmd->srq_limit = attr->attr.srq_limit;
+
+	ret = execute_cmd_write(pd->context, IB_USER_VERBS_CMD_CREATE_SRQ, cmd,
+				cmd_size, resp, resp_size);
+	if (ret)
+		return ret;
+
+	srq->handle = resp->srq_handle;
+	srq->context = pd->context;
+
+	if (abi_ver > 5) {
+		attr->attr.max_wr = resp->max_wr;
+		attr->attr.max_sge = resp->max_sge;
+	} else {
+		struct ibv_create_srq_resp_v5 *resp_v5 =
+			(struct ibv_create_srq_resp_v5 *) resp;
+
+		memmove((void *) resp + sizeof *resp,
+			(void *) resp_v5 + sizeof *resp_v5,
+			resp_size - sizeof *resp);
+	}
+
+	return 0;
+}
+
+int ibv_cmd_create_srq_ex(struct ibv_context *context,
+			  struct verbs_srq *srq, int vsrq_sz,
+			  struct ibv_srq_init_attr_ex *attr_ex,
+			  struct ibv_create_xsrq *cmd, size_t cmd_size,
+			  struct ib_uverbs_create_srq_resp *resp, size_t resp_size)
+{
+	struct verbs_xrcd *vxrcd = NULL;
+	int ret;
+
+	if (attr_ex->comp_mask >= IBV_SRQ_INIT_ATTR_RESERVED)
+		return EOPNOTSUPP;
+
+	if (!(attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_PD))
+		return EINVAL;
+
+	cmd->user_handle = (uintptr_t) srq;
+	cmd->pd_handle = attr_ex->pd->handle;
+	cmd->max_wr = attr_ex->attr.max_wr;
+	cmd->max_sge = attr_ex->attr.max_sge;
+	cmd->srq_limit = attr_ex->attr.srq_limit;
+
+	cmd->srq_type = (attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_TYPE) ?
+			attr_ex->srq_type : IBV_SRQT_BASIC;
+	if (attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_XRCD) {
+		if (!(attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_CQ))
+			return EINVAL;
+
+		vxrcd = container_of(attr_ex->xrcd, struct verbs_xrcd, xrcd);
+		cmd->xrcd_handle = vxrcd->handle;
+		cmd->cq_handle = attr_ex->cq->handle;
+	} else if (attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_TM) {
+		if (cmd->srq_type != IBV_SRQT_TM)
+			return EINVAL;
+		if (!(attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_CQ) ||
+		    !attr_ex->tm_cap.max_num_tags)
+			return EINVAL;
+
+		cmd->cq_handle = attr_ex->cq->handle;
+		cmd->max_num_tags = attr_ex->tm_cap.max_num_tags;
+	} else if (cmd->srq_type != IBV_SRQT_BASIC) {
+		return EINVAL;
+	}
+
+	ret = execute_cmd_write(context, IB_USER_VERBS_CMD_CREATE_XSRQ, cmd,
+				cmd_size, resp, resp_size);
+	if (ret)
+		return ret;
+
+	srq->srq.handle = resp->srq_handle;
+	srq->srq.context = context;
+	srq->srq.srq_context = attr_ex->srq_context;
+	srq->srq.pd = attr_ex->pd;
+	srq->srq.events_completed = 0;
+	pthread_mutex_init(&srq->srq.mutex, NULL);
+	pthread_cond_init(&srq->srq.cond, NULL);
+
+	/*
+	 * Check that the last field is available.
+	 * If it is, then all the others exist as well.
+	 */
+	if (vext_field_avail(struct verbs_srq, srq_num, vsrq_sz)) {
+		srq->comp_mask = IBV_SRQ_INIT_ATTR_TYPE;
+		srq->srq_type = (attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_TYPE) ?
+				attr_ex->srq_type : IBV_SRQT_BASIC;
+		if (srq->srq_type == IBV_SRQT_XRC) {
+			srq->comp_mask |= VERBS_SRQ_NUM;
+			srq->srq_num = resp->srqn;
+		}
+		if (attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_XRCD) {
+			srq->comp_mask |= VERBS_SRQ_XRCD;
+			srq->xrcd = vxrcd;
+		}
+		if (attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_CQ) {
+			srq->comp_mask |= VERBS_SRQ_CQ;
+			srq->cq = attr_ex->cq;
+		}
+	}
+
+	attr_ex->attr.max_wr = resp->max_wr;
+	attr_ex->attr.max_sge = resp->max_sge;
+
+	return 0;
+}
+
+
+static int ibv_cmd_modify_srq_v3(struct ibv_srq *srq,
+				 struct ibv_srq_attr *srq_attr,
+				 int srq_attr_mask,
+				 struct ibv_modify_srq *new_cmd,
+				 size_t new_cmd_size)
+{
+	struct ibv_modify_srq_v3 *cmd;
+	size_t cmd_size;
+
+	cmd_size = sizeof *cmd + new_cmd_size - sizeof *new_cmd;
+	cmd = alloca(cmd_size);
+	memcpy(cmd + 1, new_cmd + 1, new_cmd_size - sizeof *new_cmd);
+
+	cmd->core_payload = (struct ib_uverbs_modify_srq_v3){
+		.srq_handle = srq->handle,
+		.attr_mask = srq_attr_mask,
+		.max_wr = srq_attr->max_wr,
+		.srq_limit = srq_attr->srq_limit,
+	};
+
+	return execute_cmd_write_req(
+		srq->context, IB_USER_VERBS_CMD_MODIFY_SRQ_V3, cmd, cmd_size);
+}
+
+int ibv_cmd_modify_srq(struct ibv_srq *srq,
+		       struct ibv_srq_attr *srq_attr,
+		       int srq_attr_mask,
+		       struct ibv_modify_srq *cmd, size_t cmd_size)
+{
+	if (abi_ver == 3)
+		return ibv_cmd_modify_srq_v3(srq, srq_attr, srq_attr_mask,
+					     cmd, cmd_size);
+
+	cmd->srq_handle = srq->handle;
+	cmd->attr_mask = srq_attr_mask;
+	cmd->max_wr = srq_attr->max_wr;
+	cmd->srq_limit = srq_attr->srq_limit;
+
+	return execute_cmd_write_req(srq->context, IB_USER_VERBS_CMD_MODIFY_SRQ,
+				     cmd, cmd_size);
+}
+
+int ibv_cmd_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr,
+		      struct ibv_query_srq *cmd, size_t cmd_size)
+{
+	struct ib_uverbs_query_srq_resp resp;
+	int ret;
+
+	cmd->srq_handle = srq->handle;
+	cmd->reserved = 0;
+
+	ret = execute_cmd_write(srq->context, IB_USER_VERBS_CMD_QUERY_SRQ, cmd,
+				cmd_size, &resp, sizeof(resp));
+	if (ret)
+		return ret;
+
+	srq_attr->max_wr = resp.max_wr;
+	srq_attr->max_sge = resp.max_sge;
+	srq_attr->srq_limit = resp.srq_limit;
+
+	return 0;
+}
+
+int ibv_cmd_destroy_srq(struct ibv_srq *srq)
+{
+	struct ibv_destroy_srq req;
+	struct ib_uverbs_destroy_srq_resp resp;
+	int ret;
+
+	req.core_payload = (struct ib_uverbs_destroy_srq){
+		.srq_handle = srq->handle,
+	};
+
+	ret = execute_cmd_write(srq->context, IB_USER_VERBS_CMD_DESTROY_SRQ,
+				&req, sizeof(req), &resp, sizeof(resp));
+	if (verbs_is_destroy_err(&ret))
+		return ret;
+
+	pthread_mutex_lock(&srq->mutex);
+	while (srq->events_completed != resp.events_reported)
+		pthread_cond_wait(&srq->cond, &srq->mutex);
+	pthread_mutex_unlock(&srq->mutex);
+
+	return 0;
+}
+
+static int create_qp_ex_common(struct verbs_qp *qp,
+			       struct ibv_qp_init_attr_ex *qp_attr,
+			       struct verbs_xrcd *vxrcd,
+			       struct ib_uverbs_create_qp *cmd)
+{
+	cmd->user_handle = (uintptr_t)qp;
+
+	if (qp_attr->comp_mask & IBV_QP_INIT_ATTR_XRCD) {
+		vxrcd = container_of(qp_attr->xrcd, struct verbs_xrcd, xrcd);
+		cmd->pd_handle = vxrcd->handle;
+	} else {
+		if (!(qp_attr->comp_mask & IBV_QP_INIT_ATTR_PD))
+			return EINVAL;
+
+		cmd->pd_handle = qp_attr->pd->handle;
+		if (qp_attr->comp_mask & IBV_QP_INIT_ATTR_IND_TABLE) {
+			if (cmd->max_recv_wr || cmd->max_recv_sge ||
+			    cmd->recv_cq_handle || qp_attr->srq)
+				return EINVAL;
+
+			/* send_cq is optional */
+			if (qp_attr->cap.max_send_wr)
+				cmd->send_cq_handle = qp_attr->send_cq->handle;
+		} else {
+			cmd->send_cq_handle = qp_attr->send_cq->handle;
+
+			if (qp_attr->qp_type !=
IBV_QPT_XRC_SEND) { + cmd->recv_cq_handle = qp_attr->recv_cq->handle; + cmd->srq_handle = qp_attr->srq ? qp_attr->srq->handle : + 0; + } + } + } + + cmd->max_send_wr = qp_attr->cap.max_send_wr; + cmd->max_recv_wr = qp_attr->cap.max_recv_wr; + cmd->max_send_sge = qp_attr->cap.max_send_sge; + cmd->max_recv_sge = qp_attr->cap.max_recv_sge; + cmd->max_inline_data = qp_attr->cap.max_inline_data; + cmd->sq_sig_all = qp_attr->sq_sig_all; + cmd->qp_type = qp_attr->qp_type; + cmd->is_srq = !!qp_attr->srq; + cmd->reserved = 0; + + return 0; +} + +static void create_qp_handle_resp_common(struct ibv_context *context, + struct verbs_qp *qp, + struct ibv_qp_init_attr_ex *qp_attr, + struct ib_uverbs_create_qp_resp *resp, + struct verbs_xrcd *vxrcd, + int vqp_sz) +{ + if (abi_ver > 3) { + qp_attr->cap.max_recv_sge = resp->max_recv_sge; + qp_attr->cap.max_send_sge = resp->max_send_sge; + qp_attr->cap.max_recv_wr = resp->max_recv_wr; + qp_attr->cap.max_send_wr = resp->max_send_wr; + qp_attr->cap.max_inline_data = resp->max_inline_data; + } + + qp->qp.handle = resp->qp_handle; + qp->qp.qp_num = resp->qpn; + qp->qp.context = context; + qp->qp.qp_context = qp_attr->qp_context; + qp->qp.pd = qp_attr->pd; + qp->qp.send_cq = qp_attr->send_cq; + qp->qp.recv_cq = qp_attr->recv_cq; + qp->qp.srq = qp_attr->srq; + qp->qp.qp_type = qp_attr->qp_type; + qp->qp.state = IBV_QPS_RESET; + qp->qp.events_completed = 0; + pthread_mutex_init(&qp->qp.mutex, NULL); + pthread_cond_init(&qp->qp.cond, NULL); + + qp->comp_mask = 0; + if (vext_field_avail(struct verbs_qp, xrcd, vqp_sz) && + (qp_attr->comp_mask & IBV_QP_INIT_ATTR_XRCD)) { + qp->comp_mask |= VERBS_QP_XRCD; + qp->xrcd = vxrcd; + } +} + +enum { + CREATE_QP_EX2_SUP_CREATE_FLAGS = IBV_QP_CREATE_BLOCK_SELF_MCAST_LB | + IBV_QP_CREATE_SCATTER_FCS | + IBV_QP_CREATE_CVLAN_STRIPPING | + IBV_QP_CREATE_SOURCE_QPN | + IBV_QP_CREATE_PCI_WRITE_END_PADDING, +}; + +int ibv_cmd_create_qp_ex2(struct ibv_context *context, + struct verbs_qp *qp, int vqp_sz, + struct ibv_qp_init_attr_ex *qp_attr, + struct ibv_create_qp_ex *cmd, + size_t cmd_size, + struct ib_uverbs_ex_create_qp_resp *resp, + size_t resp_size) +{ + struct verbs_xrcd *vxrcd = NULL; + int err; + + if (!check_comp_mask(qp_attr->comp_mask, + IBV_QP_INIT_ATTR_PD | + IBV_QP_INIT_ATTR_XRCD | + IBV_QP_INIT_ATTR_CREATE_FLAGS | + IBV_QP_INIT_ATTR_MAX_TSO_HEADER | + IBV_QP_INIT_ATTR_IND_TABLE | + IBV_QP_INIT_ATTR_RX_HASH | + IBV_QP_INIT_ATTR_SEND_OPS_FLAGS)) + return EINVAL; + + memset(&cmd->core_payload, 0, sizeof(cmd->core_payload)); + + err = create_qp_ex_common(qp, qp_attr, vxrcd, + ibv_create_qp_ex_to_reg(cmd)); + if (err) + return err; + + if (qp_attr->comp_mask & IBV_QP_INIT_ATTR_CREATE_FLAGS) { + if (qp_attr->create_flags & ~CREATE_QP_EX2_SUP_CREATE_FLAGS) + return EINVAL; + cmd->create_flags = qp_attr->create_flags; + + if (qp_attr->create_flags & IBV_QP_CREATE_SOURCE_QPN) + cmd->source_qpn = qp_attr->source_qpn; + } + + if (qp_attr->comp_mask & IBV_QP_INIT_ATTR_IND_TABLE) { + cmd->rwq_ind_tbl_handle = qp_attr->rwq_ind_tbl->ind_tbl_handle; + cmd->comp_mask = IB_UVERBS_CREATE_QP_MASK_IND_TABLE; + } + + err = execute_cmd_write_ex(context, IB_USER_VERBS_EX_CMD_CREATE_QP, + cmd, cmd_size, resp, resp_size); + if (err) + return err; + + create_qp_handle_resp_common(context, qp, qp_attr, &resp->base, vxrcd, + vqp_sz); + + return 0; +} + +int ibv_cmd_create_qp_ex(struct ibv_context *context, + struct verbs_qp *qp, int vqp_sz, + struct ibv_qp_init_attr_ex *attr_ex, + struct ibv_create_qp *cmd, size_t cmd_size, + struct 
ib_uverbs_create_qp_resp *resp, size_t resp_size) +{ + struct verbs_xrcd *vxrcd = NULL; + int err; + + if (!check_comp_mask(attr_ex->comp_mask, + IBV_QP_INIT_ATTR_PD | + IBV_QP_INIT_ATTR_XRCD | + IBV_QP_INIT_ATTR_SEND_OPS_FLAGS)) + return EOPNOTSUPP; + + err = create_qp_ex_common(qp, attr_ex, vxrcd, + &cmd->core_payload); + if (err) + return err; + + err = execute_cmd_write(context, IB_USER_VERBS_CMD_CREATE_QP, cmd, + cmd_size, resp, resp_size); + if (err) + return err; + + if (abi_ver == 4) { + struct ibv_create_qp_resp_v4 *resp_v4 = + (struct ibv_create_qp_resp_v4 *)resp; + + memmove((void *)resp + sizeof *resp, + (void *)resp_v4 + sizeof *resp_v4, + resp_size - sizeof *resp); + } else if (abi_ver <= 3) { + struct ibv_create_qp_resp_v3 *resp_v3 = + (struct ibv_create_qp_resp_v3 *)resp; + + memmove((void *)resp + sizeof *resp, + (void *)resp_v3 + sizeof *resp_v3, + resp_size - sizeof *resp); + } + + create_qp_handle_resp_common(context, qp, attr_ex, resp, vxrcd, vqp_sz); + + return 0; +} + +int ibv_cmd_create_qp(struct ibv_pd *pd, + struct ibv_qp *qp, struct ibv_qp_init_attr *attr, + struct ibv_create_qp *cmd, size_t cmd_size, + struct ib_uverbs_create_qp_resp *resp, size_t resp_size) +{ + int ret; + + cmd->user_handle = (uintptr_t) qp; + cmd->pd_handle = pd->handle; + cmd->send_cq_handle = attr->send_cq->handle; + cmd->recv_cq_handle = attr->recv_cq->handle; + cmd->srq_handle = attr->srq ? attr->srq->handle : 0; + cmd->max_send_wr = attr->cap.max_send_wr; + cmd->max_recv_wr = attr->cap.max_recv_wr; + cmd->max_send_sge = attr->cap.max_send_sge; + cmd->max_recv_sge = attr->cap.max_recv_sge; + cmd->max_inline_data = attr->cap.max_inline_data; + cmd->sq_sig_all = attr->sq_sig_all; + cmd->qp_type = attr->qp_type; + cmd->is_srq = !!attr->srq; + cmd->reserved = 0; + + ret = execute_cmd_write(pd->context, IB_USER_VERBS_CMD_CREATE_QP, cmd, + cmd_size, resp, resp_size); + if (ret) + return ret; + + qp->handle = resp->qp_handle; + qp->qp_num = resp->qpn; + qp->context = pd->context; + + if (abi_ver > 3) { + attr->cap.max_recv_sge = resp->max_recv_sge; + attr->cap.max_send_sge = resp->max_send_sge; + attr->cap.max_recv_wr = resp->max_recv_wr; + attr->cap.max_send_wr = resp->max_send_wr; + attr->cap.max_inline_data = resp->max_inline_data; + } + + if (abi_ver == 4) { + struct ibv_create_qp_resp_v4 *resp_v4 = + (struct ibv_create_qp_resp_v4 *) resp; + + memmove((void *) resp + sizeof *resp, + (void *) resp_v4 + sizeof *resp_v4, + resp_size - sizeof *resp); + } else if (abi_ver <= 3) { + struct ibv_create_qp_resp_v3 *resp_v3 = + (struct ibv_create_qp_resp_v3 *) resp; + + memmove((void *) resp + sizeof *resp, + (void *) resp_v3 + sizeof *resp_v3, + resp_size - sizeof *resp); + } + + return 0; +} + +int ibv_cmd_open_qp(struct ibv_context *context, struct verbs_qp *qp, + int vqp_sz, + struct ibv_qp_open_attr *attr, + struct ibv_open_qp *cmd, size_t cmd_size, + struct ib_uverbs_create_qp_resp *resp, size_t resp_size) +{ + struct verbs_xrcd *xrcd; + int ret; + + if (attr->comp_mask >= IBV_QP_OPEN_ATTR_RESERVED) + return EOPNOTSUPP; + + if (!(attr->comp_mask & IBV_QP_OPEN_ATTR_XRCD) || + !(attr->comp_mask & IBV_QP_OPEN_ATTR_NUM) || + !(attr->comp_mask & IBV_QP_OPEN_ATTR_TYPE)) + return EINVAL; + + xrcd = container_of(attr->xrcd, struct verbs_xrcd, xrcd); + cmd->user_handle = (uintptr_t) qp; + cmd->pd_handle = xrcd->handle; + cmd->qpn = attr->qp_num; + cmd->qp_type = attr->qp_type; + + ret = execute_cmd_write(context, IB_USER_VERBS_CMD_OPEN_QP, cmd, + cmd_size, resp, resp_size); + if (ret) + return ret; + + 
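+	/* Success: mirror the kernel's handle and the caller's open
+	 * attributes into the user-space QP object.
+	 */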
qp->qp.handle = resp->qp_handle; + qp->qp.context = context; + qp->qp.qp_context = attr->qp_context; + qp->qp.pd = NULL; + qp->qp.send_cq = NULL; + qp->qp.recv_cq = NULL; + qp->qp.srq = NULL; + qp->qp.qp_num = attr->qp_num; + qp->qp.qp_type = attr->qp_type; + qp->qp.state = IBV_QPS_UNKNOWN; + qp->qp.events_completed = 0; + pthread_mutex_init(&qp->qp.mutex, NULL); + pthread_cond_init(&qp->qp.cond, NULL); + qp->comp_mask = 0; + if (vext_field_avail(struct verbs_qp, xrcd, vqp_sz)) { + qp->comp_mask = VERBS_QP_XRCD; + qp->xrcd = xrcd; + } + + return 0; +} + +int ibv_cmd_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr, + struct ibv_query_qp *cmd, size_t cmd_size) +{ + struct ib_uverbs_query_qp_resp resp; + int ret; + + /* + * Starting with IBV_QP_RATE_LIMIT the attribute must go through the + * _ex path. + */ + if (attr_mask & ~(IBV_QP_RATE_LIMIT - 1)) + return EOPNOTSUPP; + + cmd->qp_handle = qp->handle; + cmd->attr_mask = attr_mask; + + ret = execute_cmd_write(qp->context, IB_USER_VERBS_CMD_QUERY_QP, cmd, + cmd_size, &resp, sizeof(resp)); + if (ret) + return ret; + + attr->qkey = resp.qkey; + attr->rq_psn = resp.rq_psn; + attr->sq_psn = resp.sq_psn; + attr->dest_qp_num = resp.dest_qp_num; + attr->qp_access_flags = resp.qp_access_flags; + attr->pkey_index = resp.pkey_index; + attr->alt_pkey_index = resp.alt_pkey_index; + attr->qp_state = resp.qp_state; + attr->cur_qp_state = resp.cur_qp_state; + attr->path_mtu = resp.path_mtu; + attr->path_mig_state = resp.path_mig_state; + attr->sq_draining = resp.sq_draining; + attr->max_rd_atomic = resp.max_rd_atomic; + attr->max_dest_rd_atomic = resp.max_dest_rd_atomic; + attr->min_rnr_timer = resp.min_rnr_timer; + attr->port_num = resp.port_num; + attr->timeout = resp.timeout; + attr->retry_cnt = resp.retry_cnt; + attr->rnr_retry = resp.rnr_retry; + attr->alt_port_num = resp.alt_port_num; + attr->alt_timeout = resp.alt_timeout; + attr->cap.max_send_wr = resp.max_send_wr; + attr->cap.max_recv_wr = resp.max_recv_wr; + attr->cap.max_send_sge = resp.max_send_sge; + attr->cap.max_recv_sge = resp.max_recv_sge; + attr->cap.max_inline_data = resp.max_inline_data; + + memcpy(attr->ah_attr.grh.dgid.raw, resp.dest.dgid, 16); + attr->ah_attr.grh.flow_label = resp.dest.flow_label; + attr->ah_attr.dlid = resp.dest.dlid; + attr->ah_attr.grh.sgid_index = resp.dest.sgid_index; + attr->ah_attr.grh.hop_limit = resp.dest.hop_limit; + attr->ah_attr.grh.traffic_class = resp.dest.traffic_class; + attr->ah_attr.sl = resp.dest.sl; + attr->ah_attr.src_path_bits = resp.dest.src_path_bits; + attr->ah_attr.static_rate = resp.dest.static_rate; + attr->ah_attr.is_global = resp.dest.is_global; + attr->ah_attr.port_num = resp.dest.port_num; + + memcpy(attr->alt_ah_attr.grh.dgid.raw, resp.alt_dest.dgid, 16); + attr->alt_ah_attr.grh.flow_label = resp.alt_dest.flow_label; + attr->alt_ah_attr.dlid = resp.alt_dest.dlid; + attr->alt_ah_attr.grh.sgid_index = resp.alt_dest.sgid_index; + attr->alt_ah_attr.grh.hop_limit = resp.alt_dest.hop_limit; + attr->alt_ah_attr.grh.traffic_class = resp.alt_dest.traffic_class; + attr->alt_ah_attr.sl = resp.alt_dest.sl; + attr->alt_ah_attr.src_path_bits = resp.alt_dest.src_path_bits; + attr->alt_ah_attr.static_rate = resp.alt_dest.static_rate; + attr->alt_ah_attr.is_global = resp.alt_dest.is_global; + attr->alt_ah_attr.port_num = resp.alt_dest.port_num; + + init_attr->qp_context = qp->qp_context; + init_attr->send_cq = qp->send_cq; + init_attr->recv_cq = qp->recv_cq; + init_attr->srq = qp->srq; + 
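+	/* init_attr is rebuilt from the cached ibv_qp fields plus the
+	 * capability values the kernel just returned.
+	 */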
init_attr->qp_type = qp->qp_type; + init_attr->cap.max_send_wr = resp.max_send_wr; + init_attr->cap.max_recv_wr = resp.max_recv_wr; + init_attr->cap.max_send_sge = resp.max_send_sge; + init_attr->cap.max_recv_sge = resp.max_recv_sge; + init_attr->cap.max_inline_data = resp.max_inline_data; + init_attr->sq_sig_all = resp.sq_sig_all; + + return 0; +} + +static void copy_modify_qp_fields(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ib_uverbs_modify_qp *cmd) +{ + cmd->qp_handle = qp->handle; + cmd->attr_mask = attr_mask; + + if (attr_mask & IBV_QP_STATE) + cmd->qp_state = attr->qp_state; + if (attr_mask & IBV_QP_CUR_STATE) + cmd->cur_qp_state = attr->cur_qp_state; + if (attr_mask & IBV_QP_EN_SQD_ASYNC_NOTIFY) + cmd->en_sqd_async_notify = attr->en_sqd_async_notify; + if (attr_mask & IBV_QP_ACCESS_FLAGS) + cmd->qp_access_flags = attr->qp_access_flags; + if (attr_mask & IBV_QP_PKEY_INDEX) + cmd->pkey_index = attr->pkey_index; + if (attr_mask & IBV_QP_PORT) + cmd->port_num = attr->port_num; + if (attr_mask & IBV_QP_QKEY) + cmd->qkey = attr->qkey; + + if (attr_mask & IBV_QP_AV) { + memcpy(cmd->dest.dgid, attr->ah_attr.grh.dgid.raw, 16); + cmd->dest.flow_label = attr->ah_attr.grh.flow_label; + cmd->dest.dlid = attr->ah_attr.dlid; + cmd->dest.reserved = 0; + cmd->dest.sgid_index = attr->ah_attr.grh.sgid_index; + cmd->dest.hop_limit = attr->ah_attr.grh.hop_limit; + cmd->dest.traffic_class = attr->ah_attr.grh.traffic_class; + cmd->dest.sl = attr->ah_attr.sl; + cmd->dest.src_path_bits = attr->ah_attr.src_path_bits; + cmd->dest.static_rate = attr->ah_attr.static_rate; + cmd->dest.is_global = attr->ah_attr.is_global; + cmd->dest.port_num = attr->ah_attr.port_num; + } + + if (attr_mask & IBV_QP_PATH_MTU) + cmd->path_mtu = attr->path_mtu; + if (attr_mask & IBV_QP_TIMEOUT) + cmd->timeout = attr->timeout; + if (attr_mask & IBV_QP_RETRY_CNT) + cmd->retry_cnt = attr->retry_cnt; + if (attr_mask & IBV_QP_RNR_RETRY) + cmd->rnr_retry = attr->rnr_retry; + if (attr_mask & IBV_QP_RQ_PSN) + cmd->rq_psn = attr->rq_psn; + if (attr_mask & IBV_QP_MAX_QP_RD_ATOMIC) + cmd->max_rd_atomic = attr->max_rd_atomic; + + if (attr_mask & IBV_QP_ALT_PATH) { + cmd->alt_pkey_index = attr->alt_pkey_index; + cmd->alt_port_num = attr->alt_port_num; + cmd->alt_timeout = attr->alt_timeout; + + memcpy(cmd->alt_dest.dgid, attr->alt_ah_attr.grh.dgid.raw, 16); + cmd->alt_dest.flow_label = attr->alt_ah_attr.grh.flow_label; + cmd->alt_dest.dlid = attr->alt_ah_attr.dlid; + cmd->alt_dest.reserved = 0; + cmd->alt_dest.sgid_index = attr->alt_ah_attr.grh.sgid_index; + cmd->alt_dest.hop_limit = attr->alt_ah_attr.grh.hop_limit; + cmd->alt_dest.traffic_class = + attr->alt_ah_attr.grh.traffic_class; + cmd->alt_dest.sl = attr->alt_ah_attr.sl; + cmd->alt_dest.src_path_bits = attr->alt_ah_attr.src_path_bits; + cmd->alt_dest.static_rate = attr->alt_ah_attr.static_rate; + cmd->alt_dest.is_global = attr->alt_ah_attr.is_global; + cmd->alt_dest.port_num = attr->alt_ah_attr.port_num; + } + + if (attr_mask & IBV_QP_MIN_RNR_TIMER) + cmd->min_rnr_timer = attr->min_rnr_timer; + if (attr_mask & IBV_QP_SQ_PSN) + cmd->sq_psn = attr->sq_psn; + if (attr_mask & IBV_QP_MAX_DEST_RD_ATOMIC) + cmd->max_dest_rd_atomic = attr->max_dest_rd_atomic; + if (attr_mask & IBV_QP_PATH_MIG_STATE) + cmd->path_mig_state = attr->path_mig_state; + if (attr_mask & IBV_QP_DEST_QPN) + cmd->dest_qp_num = attr->dest_qp_num; + + cmd->reserved[0] = cmd->reserved[1] = 0; +} + +int ibv_cmd_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct 
ibv_modify_qp *cmd, size_t cmd_size) +{ + /* + * Starting with IBV_QP_RATE_LIMIT the attribute must go through the + * _ex path. + */ + if (attr_mask & ~(IBV_QP_RATE_LIMIT - 1)) + return EOPNOTSUPP; + + copy_modify_qp_fields(qp, attr, attr_mask, &cmd->core_payload); + + return execute_cmd_write_req(qp->context, IB_USER_VERBS_CMD_MODIFY_QP, + cmd, cmd_size); +} + +int ibv_cmd_modify_qp_ex(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, struct ibv_modify_qp_ex *cmd, + size_t cmd_size, + struct ib_uverbs_ex_modify_qp_resp *resp, + size_t resp_size) +{ + copy_modify_qp_fields(qp, attr, attr_mask, &cmd->base); + + if (attr_mask & IBV_QP_RATE_LIMIT) { + if (cmd_size >= offsetof(struct ibv_modify_qp_ex, rate_limit) + + sizeof(cmd->rate_limit)) + cmd->rate_limit = attr->rate_limit; + else + return EINVAL; + } + + return execute_cmd_write_ex(qp->context, IB_USER_VERBS_EX_CMD_MODIFY_QP, + cmd, cmd_size, resp, resp_size); +} + +int ibv_cmd_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr) +{ + struct ibv_post_send *cmd; + struct ib_uverbs_post_send_resp resp; + struct ibv_send_wr *i; + struct ib_uverbs_send_wr *n, *tmp; + struct ibv_sge *s; + unsigned wr_count = 0; + unsigned sge_count = 0; + int cmd_size; + int ret; + + for (i = wr; i; i = i->next) { + wr_count++; + sge_count += i->num_sge; + } + + cmd_size = sizeof *cmd + wr_count * sizeof *n + sge_count * sizeof *s; + cmd = alloca(cmd_size); + + cmd->qp_handle = ibqp->handle; + cmd->wr_count = wr_count; + cmd->sge_count = sge_count; + cmd->wqe_size = sizeof *n; + + n = (struct ib_uverbs_send_wr *) ((void *) cmd + sizeof *cmd); + s = (struct ibv_sge *) (n + wr_count); + + tmp = n; + for (i = wr; i; i = i->next) { + tmp->wr_id = i->wr_id; + tmp->num_sge = i->num_sge; + tmp->opcode = i->opcode; + tmp->send_flags = i->send_flags; + tmp->ex.imm_data = i->imm_data; + if (ibqp->qp_type == IBV_QPT_UD) { + tmp->wr.ud.ah = i->wr.ud.ah->handle; + tmp->wr.ud.remote_qpn = i->wr.ud.remote_qpn; + tmp->wr.ud.remote_qkey = i->wr.ud.remote_qkey; + } else { + switch (i->opcode) { + case IBV_WR_RDMA_WRITE: + case IBV_WR_RDMA_WRITE_WITH_IMM: + case IBV_WR_RDMA_READ: + tmp->wr.rdma.remote_addr = + i->wr.rdma.remote_addr; + tmp->wr.rdma.rkey = i->wr.rdma.rkey; + break; + case IBV_WR_ATOMIC_CMP_AND_SWP: + case IBV_WR_ATOMIC_FETCH_AND_ADD: + tmp->wr.atomic.remote_addr = + i->wr.atomic.remote_addr; + tmp->wr.atomic.compare_add = + i->wr.atomic.compare_add; + tmp->wr.atomic.swap = i->wr.atomic.swap; + tmp->wr.atomic.rkey = i->wr.atomic.rkey; + break; + default: + break; + } + } + + if (tmp->num_sge) { + memcpy(s, i->sg_list, tmp->num_sge * sizeof *s); + s += tmp->num_sge; + } + + tmp++; + } + + resp.bad_wr = 0; + ret = execute_cmd_write_no_uhw(ibqp->context, + IB_USER_VERBS_CMD_POST_SEND, cmd, + cmd_size, &resp, sizeof(resp)); + + wr_count = resp.bad_wr; + if (wr_count) { + i = wr; + while (--wr_count) + i = i->next; + *bad_wr = i; + } else if (ret) + *bad_wr = wr; + + return ret; +} + +int ibv_cmd_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct ibv_post_recv *cmd; + struct ib_uverbs_post_recv_resp resp; + struct ibv_recv_wr *i; + struct ib_uverbs_recv_wr *n, *tmp; + struct ibv_sge *s; + unsigned wr_count = 0; + unsigned sge_count = 0; + int cmd_size; + int ret; + + for (i = wr; i; i = i->next) { + wr_count++; + sge_count += i->num_sge; + } + + cmd_size = sizeof *cmd + wr_count * sizeof *n + sge_count * sizeof *s; + cmd = alloca(cmd_size); + + cmd->qp_handle = ibqp->handle; + 
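+	/* Layout of the command buffer: the header, then wr_count
+	 * kernel-format work requests, then all scatter/gather entries
+	 * packed after them.
+	 */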
cmd->wr_count = wr_count; + cmd->sge_count = sge_count; + cmd->wqe_size = sizeof *n; + + n = (struct ib_uverbs_recv_wr *) ((void *) cmd + sizeof *cmd); + s = (struct ibv_sge *) (n + wr_count); + + tmp = n; + for (i = wr; i; i = i->next) { + tmp->wr_id = i->wr_id; + tmp->num_sge = i->num_sge; + + if (tmp->num_sge) { + memcpy(s, i->sg_list, tmp->num_sge * sizeof *s); + s += tmp->num_sge; + } + + tmp++; + } + + resp.bad_wr = 0; + ret = execute_cmd_write_no_uhw(ibqp->context, + IB_USER_VERBS_CMD_POST_RECV, cmd, + cmd_size, &resp, sizeof(resp)); + + wr_count = resp.bad_wr; + if (wr_count) { + i = wr; + while (--wr_count) + i = i->next; + *bad_wr = i; + } else if (ret) + *bad_wr = wr; + + return ret; +} + +int ibv_cmd_post_srq_recv(struct ibv_srq *srq, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct ibv_post_srq_recv *cmd; + struct ib_uverbs_post_srq_recv_resp resp; + struct ibv_recv_wr *i; + struct ib_uverbs_recv_wr *n, *tmp; + struct ibv_sge *s; + unsigned wr_count = 0; + unsigned sge_count = 0; + int cmd_size; + int ret; + + for (i = wr; i; i = i->next) { + wr_count++; + sge_count += i->num_sge; + } + + cmd_size = sizeof *cmd + wr_count * sizeof *n + sge_count * sizeof *s; + cmd = alloca(cmd_size); + + cmd->srq_handle = srq->handle; + cmd->wr_count = wr_count; + cmd->sge_count = sge_count; + cmd->wqe_size = sizeof *n; + + n = (struct ib_uverbs_recv_wr *) ((void *) cmd + sizeof *cmd); + s = (struct ibv_sge *) (n + wr_count); + + tmp = n; + for (i = wr; i; i = i->next) { + tmp->wr_id = i->wr_id; + tmp->num_sge = i->num_sge; + + if (tmp->num_sge) { + memcpy(s, i->sg_list, tmp->num_sge * sizeof *s); + s += tmp->num_sge; + } + + tmp++; + } + + resp.bad_wr = 0; + ret = execute_cmd_write_no_uhw(srq->context, + IB_USER_VERBS_CMD_POST_SRQ_RECV, cmd, + cmd_size, &resp, sizeof(resp)); + + wr_count = resp.bad_wr; + if (wr_count) { + i = wr; + while (--wr_count) + i = i->next; + *bad_wr = i; + } else if (ret) + *bad_wr = wr; + + return ret; +} + +int ibv_cmd_create_ah(struct ibv_pd *pd, struct ibv_ah *ah, + struct ibv_ah_attr *attr, + struct ib_uverbs_create_ah_resp *resp, + size_t resp_size) +{ + struct ibv_create_ah cmd; + int ret; + + cmd.user_handle = (uintptr_t) ah; + cmd.pd_handle = pd->handle; + cmd.reserved = 0; + cmd.attr.dlid = attr->dlid; + cmd.attr.sl = attr->sl; + cmd.attr.src_path_bits = attr->src_path_bits; + cmd.attr.static_rate = attr->static_rate; + cmd.attr.is_global = attr->is_global; + cmd.attr.port_num = attr->port_num; + cmd.attr.reserved = 0; + cmd.attr.grh.flow_label = attr->grh.flow_label; + cmd.attr.grh.sgid_index = attr->grh.sgid_index; + cmd.attr.grh.hop_limit = attr->grh.hop_limit; + cmd.attr.grh.traffic_class = attr->grh.traffic_class; + cmd.attr.grh.reserved = 0; + memcpy(cmd.attr.grh.dgid, attr->grh.dgid.raw, 16); + + ret = execute_cmd_write(pd->context, IB_USER_VERBS_CMD_CREATE_AH, &cmd, + sizeof(cmd), resp, resp_size); + if (ret) + return ret; + + ah->handle = resp->ah_handle; + ah->context = pd->context; + + return 0; +} + +int ibv_cmd_destroy_qp(struct ibv_qp *qp) +{ + struct ibv_destroy_qp req; + struct ib_uverbs_destroy_qp_resp resp; + int ret; + + req.core_payload = (struct ib_uverbs_destroy_qp){ + .qp_handle = qp->handle, + }; + + ret = execute_cmd_write(qp->context, IB_USER_VERBS_CMD_DESTROY_QP, &req, + sizeof(req), &resp, sizeof(resp)); + if (verbs_is_destroy_err(&ret)) + return ret; + + pthread_mutex_lock(&qp->mutex); + while (qp->events_completed != resp.events_reported) + pthread_cond_wait(&qp->cond, &qp->mutex); + 
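+	/* All async events the kernel reported for this QP have now been
+	 * acknowledged; the caller may safely free it.
+	 */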
pthread_mutex_unlock(&qp->mutex); + + return 0; +} + +int ibv_cmd_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid) +{ + struct ibv_attach_mcast req; + + req.core_payload = (struct ib_uverbs_attach_mcast){ + .qp_handle = qp->handle, + .mlid = lid, + }; + memcpy(req.gid, gid->raw, sizeof(req.gid)); + return execute_cmd_write_req( + qp->context, IB_USER_VERBS_CMD_ATTACH_MCAST, &req, sizeof(req)); +} + +int ibv_cmd_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid) +{ + struct ibv_detach_mcast req; + int ret; + + req.core_payload = (struct ib_uverbs_detach_mcast){ + .qp_handle = qp->handle, + .mlid = lid, + }; + memcpy(req.gid, gid->raw, sizeof(req.gid)); + ret = execute_cmd_write_req(qp->context, IB_USER_VERBS_CMD_DETACH_MCAST, + &req, sizeof(req)); + if (verbs_is_destroy_err(&ret)) + return ret; + + return 0; +} + +static int buffer_is_zero(char *addr, ssize_t size) +{ + return addr[0] == 0 && !memcmp(addr, addr + 1, size - 1); +} + +static int get_filters_size(struct ibv_flow_spec *ib_spec, + struct ibv_kern_spec *kern_spec, + int *ib_filter_size, int *kern_filter_size, + enum ibv_flow_spec_type type) +{ + void *ib_spec_filter_mask; + int curr_kern_filter_size; + int min_filter_size; + + *ib_filter_size = (ib_spec->hdr.size - sizeof(ib_spec->hdr)) / 2; + + switch (type) { + case IBV_FLOW_SPEC_IPV4_EXT: + min_filter_size = + offsetof(struct ib_uverbs_flow_ipv4_filter, flags) + + sizeof(kern_spec->ipv4_ext.mask.flags); + curr_kern_filter_size = min_filter_size; + ib_spec_filter_mask = (void *)&ib_spec->ipv4_ext.val + + *ib_filter_size; + break; + case IBV_FLOW_SPEC_IPV6: + min_filter_size = + offsetof(struct ib_uverbs_flow_ipv6_filter, hop_limit) + + sizeof(kern_spec->ipv6.mask.hop_limit); + curr_kern_filter_size = min_filter_size; + ib_spec_filter_mask = (void *)&ib_spec->ipv6.val + + *ib_filter_size; + break; + case IBV_FLOW_SPEC_VXLAN_TUNNEL: + min_filter_size = + offsetof(struct ib_uverbs_flow_tunnel_filter, + tunnel_id) + + sizeof(kern_spec->tunnel.mask.tunnel_id); + curr_kern_filter_size = min_filter_size; + ib_spec_filter_mask = (void *)&ib_spec->tunnel.val + + *ib_filter_size; + break; + default: + return EINVAL; + } + + if (*ib_filter_size < min_filter_size) + return EINVAL; + + if (*ib_filter_size > curr_kern_filter_size && + !buffer_is_zero(ib_spec_filter_mask + curr_kern_filter_size, + *ib_filter_size - curr_kern_filter_size)) + return EOPNOTSUPP; + + *kern_filter_size = min_t(int, curr_kern_filter_size, *ib_filter_size); + + return 0; +} + +static int ib_spec_to_kern_spec(struct ibv_flow_spec *ib_spec, + struct ibv_kern_spec *kern_spec) +{ + int kern_filter_size; + int ib_filter_size; + int ret; + + kern_spec->hdr.type = ib_spec->hdr.type; + + switch (kern_spec->hdr.type) { + case IBV_FLOW_SPEC_ETH: + case IBV_FLOW_SPEC_ETH | IBV_FLOW_SPEC_INNER: + kern_spec->eth.size = sizeof(struct ib_uverbs_flow_spec_eth); + memcpy(&kern_spec->eth.val, &ib_spec->eth.val, + sizeof(struct ibv_flow_eth_filter)); + memcpy(&kern_spec->eth.mask, &ib_spec->eth.mask, + sizeof(struct ibv_flow_eth_filter)); + break; + case IBV_FLOW_SPEC_IPV4: + case IBV_FLOW_SPEC_IPV4 | IBV_FLOW_SPEC_INNER: + kern_spec->ipv4.size = sizeof(struct ibv_kern_spec_ipv4); + memcpy(&kern_spec->ipv4.val, &ib_spec->ipv4.val, + sizeof(struct ibv_flow_ipv4_filter)); + memcpy(&kern_spec->ipv4.mask, &ib_spec->ipv4.mask, + sizeof(struct ibv_flow_ipv4_filter)); + break; + case IBV_FLOW_SPEC_IPV4_EXT: + case IBV_FLOW_SPEC_IPV4_EXT | IBV_FLOW_SPEC_INNER: + ret = get_filters_size(ib_spec, 
kern_spec, + &ib_filter_size, &kern_filter_size, + IBV_FLOW_SPEC_IPV4_EXT); + if (ret) + return ret; + + kern_spec->hdr.type = IBV_FLOW_SPEC_IPV4 | + (IBV_FLOW_SPEC_INNER & ib_spec->hdr.type); + kern_spec->ipv4_ext.size = sizeof(struct + ib_uverbs_flow_spec_ipv4); + memcpy(&kern_spec->ipv4_ext.val, &ib_spec->ipv4_ext.val, + kern_filter_size); + memcpy(&kern_spec->ipv4_ext.mask, (void *)&ib_spec->ipv4_ext.val + + ib_filter_size, kern_filter_size); + break; + case IBV_FLOW_SPEC_IPV6: + case IBV_FLOW_SPEC_IPV6 | IBV_FLOW_SPEC_INNER: + ret = get_filters_size(ib_spec, kern_spec, + &ib_filter_size, &kern_filter_size, + IBV_FLOW_SPEC_IPV6); + if (ret) + return ret; + + kern_spec->ipv6.size = sizeof(struct ib_uverbs_flow_spec_ipv6); + memcpy(&kern_spec->ipv6.val, &ib_spec->ipv6.val, + kern_filter_size); + memcpy(&kern_spec->ipv6.mask, (void *)&ib_spec->ipv6.val + + ib_filter_size, kern_filter_size); + break; + case IBV_FLOW_SPEC_ESP: + case IBV_FLOW_SPEC_ESP | IBV_FLOW_SPEC_INNER: + kern_spec->esp.size = sizeof(struct ib_uverbs_flow_spec_esp); + memcpy(&kern_spec->esp.val, &ib_spec->esp.val, + sizeof(struct ib_uverbs_flow_spec_esp_filter)); + memcpy(&kern_spec->esp.mask, (void *)&ib_spec->esp.mask, + sizeof(struct ib_uverbs_flow_spec_esp_filter)); + break; + case IBV_FLOW_SPEC_TCP: + case IBV_FLOW_SPEC_UDP: + case IBV_FLOW_SPEC_TCP | IBV_FLOW_SPEC_INNER: + case IBV_FLOW_SPEC_UDP | IBV_FLOW_SPEC_INNER: + kern_spec->tcp_udp.size = sizeof(struct ib_uverbs_flow_spec_tcp_udp); + memcpy(&kern_spec->tcp_udp.val, &ib_spec->tcp_udp.val, + sizeof(struct ibv_flow_tcp_udp_filter)); + memcpy(&kern_spec->tcp_udp.mask, &ib_spec->tcp_udp.mask, + sizeof(struct ibv_flow_tcp_udp_filter)); + break; + case IBV_FLOW_SPEC_GRE: + kern_spec->gre.size = sizeof(struct ib_uverbs_flow_spec_gre); + memcpy(&kern_spec->gre.val, &ib_spec->gre.val, + sizeof(struct ibv_flow_gre_filter)); + memcpy(&kern_spec->gre.mask, &ib_spec->gre.mask, + sizeof(struct ibv_flow_gre_filter)); + break; + case IBV_FLOW_SPEC_MPLS: + case IBV_FLOW_SPEC_MPLS | IBV_FLOW_SPEC_INNER: + kern_spec->mpls.size = sizeof(struct ib_uverbs_flow_spec_mpls); + memcpy(&kern_spec->mpls.val, &ib_spec->mpls.val, + sizeof(struct ibv_flow_mpls_filter)); + memcpy(&kern_spec->mpls.mask, &ib_spec->mpls.mask, + sizeof(struct ibv_flow_mpls_filter)); + break; + case IBV_FLOW_SPEC_VXLAN_TUNNEL: + ret = get_filters_size(ib_spec, kern_spec, + &ib_filter_size, &kern_filter_size, + IBV_FLOW_SPEC_VXLAN_TUNNEL); + if (ret) + return ret; + + kern_spec->tunnel.size = sizeof(struct ib_uverbs_flow_spec_tunnel); + memcpy(&kern_spec->tunnel.val, &ib_spec->tunnel.val, + kern_filter_size); + memcpy(&kern_spec->tunnel.mask, (void *)&ib_spec->tunnel.val + + ib_filter_size, kern_filter_size); + break; + case IBV_FLOW_SPEC_ACTION_TAG: + kern_spec->flow_tag.size = + sizeof(struct ib_uverbs_flow_spec_action_tag); + kern_spec->flow_tag.tag_id = ib_spec->flow_tag.tag_id; + break; + case IBV_FLOW_SPEC_ACTION_DROP: + kern_spec->drop.size = sizeof(struct ib_uverbs_flow_spec_action_drop); + break; + case IBV_FLOW_SPEC_ACTION_HANDLE: { + const struct verbs_flow_action *vaction = + container_of((const struct ibv_flow_action *)ib_spec->handle.action, + const struct verbs_flow_action, action); + kern_spec->handle.size = sizeof(struct ib_uverbs_flow_spec_action_handle); + kern_spec->handle.handle = vaction->handle; + break; + } + case IBV_FLOW_SPEC_ACTION_COUNT: { + const struct verbs_counters *vcounters = + container_of(ib_spec->flow_count.counters, + const struct verbs_counters, counters); + 
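+		/* The kernel identifies the counters object by its handle,
+		 * not by the user-space pointer.
+		 */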
kern_spec->flow_count.size = + sizeof(struct ib_uverbs_flow_spec_action_count); + kern_spec->flow_count.handle = vcounters->handle; + break; + } + default: + return EINVAL; + } + return 0; +} + +int ibv_cmd_create_flow(struct ibv_qp *qp, + struct ibv_flow *flow_id, + struct ibv_flow_attr *flow_attr, + void *ucmd, + size_t ucmd_size) +{ + struct ibv_create_flow *cmd; + struct ib_uverbs_create_flow_resp resp; + size_t cmd_size; + size_t written_size; + int i, err; + void *kern_spec; + void *ib_spec; + + cmd_size = sizeof(*cmd) + (flow_attr->num_of_specs * + sizeof(struct ibv_kern_spec)); + cmd = alloca(cmd_size + ucmd_size); + memset(cmd, 0, cmd_size + ucmd_size); + + cmd->qp_handle = qp->handle; + + cmd->flow_attr.type = flow_attr->type; + cmd->flow_attr.priority = flow_attr->priority; + cmd->flow_attr.num_of_specs = flow_attr->num_of_specs; + cmd->flow_attr.port = flow_attr->port; + cmd->flow_attr.flags = flow_attr->flags; + + kern_spec = cmd + 1; + ib_spec = flow_attr + 1; + for (i = 0; i < flow_attr->num_of_specs; i++) { + err = ib_spec_to_kern_spec(ib_spec, kern_spec); + if (err) { + errno = err; + return err; + } + cmd->flow_attr.size += + ((struct ibv_kern_spec *)kern_spec)->hdr.size; + kern_spec += ((struct ibv_kern_spec *)kern_spec)->hdr.size; + ib_spec += ((struct ibv_flow_spec *)ib_spec)->hdr.size; + } + + written_size = sizeof(*cmd) + cmd->flow_attr.size; + if (ucmd) { + memcpy((char *)cmd + written_size, ucmd, ucmd_size); + written_size += ucmd_size; + } + + err = execute_cmd_write_ex_full(qp->context, + IB_USER_VERBS_EX_CMD_CREATE_FLOW, cmd, + written_size - ucmd_size, written_size, + &resp, sizeof(resp), sizeof(resp)); + if (err) + return err; + + flow_id->context = qp->context; + flow_id->handle = resp.flow_handle; + return 0; +} + +int ibv_cmd_create_wq(struct ibv_context *context, + struct ibv_wq_init_attr *wq_init_attr, + struct ibv_wq *wq, + struct ibv_create_wq *cmd, + size_t cmd_size, + struct ib_uverbs_ex_create_wq_resp *resp, + size_t resp_size) +{ + int err; + + if (wq_init_attr->comp_mask >= IBV_WQ_INIT_ATTR_RESERVED) + return EINVAL; + + cmd->user_handle = (uintptr_t)wq; + cmd->pd_handle = wq_init_attr->pd->handle; + cmd->cq_handle = wq_init_attr->cq->handle; + cmd->wq_type = wq_init_attr->wq_type; + cmd->max_sge = wq_init_attr->max_sge; + cmd->max_wr = wq_init_attr->max_wr; + cmd->comp_mask = 0; + + if (wq_init_attr->comp_mask & IBV_WQ_INIT_ATTR_FLAGS) { + if (wq_init_attr->create_flags & ~(IBV_WQ_FLAGS_RESERVED - 1)) + return EOPNOTSUPP; + cmd->create_flags = wq_init_attr->create_flags; + } + + err = execute_cmd_write_ex(context, IB_USER_VERBS_EX_CMD_CREATE_WQ, + cmd, cmd_size, resp, resp_size); + if (err) + return err; + + if (resp->response_length < sizeof(*resp)) + return EINVAL; + + wq->handle = resp->wq_handle; + wq_init_attr->max_wr = resp->max_wr; + wq_init_attr->max_sge = resp->max_sge; + wq->wq_num = resp->wqn; + wq->context = context; + wq->cq = wq_init_attr->cq; + wq->pd = wq_init_attr->pd; + wq->wq_type = wq_init_attr->wq_type; + + return 0; +} + +int ibv_cmd_modify_wq(struct ibv_wq *wq, struct ibv_wq_attr *attr, + struct ibv_modify_wq *cmd, size_t cmd_size) +{ + int err; + + if (attr->attr_mask >= IBV_WQ_ATTR_RESERVED) + return EINVAL; + + memset(cmd, 0, sizeof(*cmd)); + + cmd->curr_wq_state = attr->curr_wq_state; + cmd->wq_state = attr->wq_state; + if (attr->attr_mask & IBV_WQ_ATTR_FLAGS) { + if (attr->flags_mask & ~(IBV_WQ_FLAGS_RESERVED - 1)) + return EOPNOTSUPP; + cmd->flags = attr->flags; + cmd->flags_mask = attr->flags_mask; + } + 
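+	/* wq_handle and attr_mask are always sent; the optional fields
+	 * above are filled in only when their mask bit is set.
+	 */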
cmd->wq_handle = wq->handle; + cmd->attr_mask = attr->attr_mask; + + err = execute_cmd_write_ex_req( + wq->context, IB_USER_VERBS_EX_CMD_MODIFY_WQ, cmd, cmd_size); + if (err) + return err; + + if (attr->attr_mask & IBV_WQ_ATTR_STATE) + wq->state = attr->wq_state; + + return 0; +} + +int ibv_cmd_destroy_wq(struct ibv_wq *wq) +{ + struct ibv_destroy_wq req; + struct ib_uverbs_ex_destroy_wq_resp resp; + int ret; + + req.core_payload = (struct ib_uverbs_ex_destroy_wq){ + .wq_handle = wq->handle, + }; + + ret = execute_cmd_write_ex(wq->context, IB_USER_VERBS_EX_CMD_DESTROY_WQ, + &req, sizeof(req), &resp, sizeof(resp)); + if (verbs_is_destroy_err(&ret)) + return ret; + + if (resp.response_length < sizeof(resp)) + return EINVAL; + + pthread_mutex_lock(&wq->mutex); + while (wq->events_completed != resp.events_reported) + pthread_cond_wait(&wq->cond, &wq->mutex); + pthread_mutex_unlock(&wq->mutex); + + return 0; +} + +int ibv_cmd_create_rwq_ind_table(struct ibv_context *context, + struct ibv_rwq_ind_table_init_attr *init_attr, + struct ibv_rwq_ind_table *rwq_ind_table, + struct ib_uverbs_ex_create_rwq_ind_table_resp *resp, + size_t resp_size) +{ + struct ibv_create_rwq_ind_table *cmd; + int err; + unsigned int i; + unsigned int num_tbl_entries; + size_t cmd_size; + + if (init_attr->comp_mask >= IBV_CREATE_IND_TABLE_RESERVED) + return EINVAL; + + num_tbl_entries = 1 << init_attr->log_ind_tbl_size; + + /* The entire message must be size aligned to 8 bytes. */ + cmd_size = sizeof(*cmd) + num_tbl_entries * sizeof(cmd->wq_handles[0]); + cmd_size = (cmd_size + 7) / 8 * 8; + cmd = alloca(cmd_size); + memset(cmd, 0, cmd_size); + + for (i = 0; i < num_tbl_entries; i++) + cmd->wq_handles[i] = init_attr->ind_tbl[i]->handle; + + cmd->log_ind_tbl_size = init_attr->log_ind_tbl_size; + cmd->comp_mask = 0; + + err = execute_cmd_write_ex_full(context, + IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL, + cmd, cmd_size, cmd_size, resp, + sizeof(*resp), resp_size); + if (err) + return err; + + if (resp->response_length < sizeof(*resp)) + return EINVAL; + + rwq_ind_table->ind_tbl_handle = resp->ind_tbl_handle; + rwq_ind_table->ind_tbl_num = resp->ind_tbl_num; + rwq_ind_table->context = context; + return 0; +} + +int ibv_cmd_modify_cq(struct ibv_cq *cq, + struct ibv_modify_cq_attr *attr, + struct ibv_modify_cq *cmd, + size_t cmd_size) +{ + + if (attr->attr_mask >= IBV_CQ_ATTR_RESERVED) + return EINVAL; + + cmd->cq_handle = cq->handle; + cmd->attr_mask = attr->attr_mask; + cmd->attr.cq_count = attr->moderate.cq_count; + cmd->attr.cq_period = attr->moderate.cq_period; + cmd->reserved = 0; + + return execute_cmd_write_ex_req( + cq->context, IB_USER_VERBS_EX_CMD_MODIFY_CQ, cmd, cmd_size); +} diff --git a/libibverbs/cmd_ah.c b/libibverbs/cmd_ah.c new file mode 100644 index 0000000..76d4ba1 --- /dev/null +++ b/libibverbs/cmd_ah.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2018 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <infiniband/cmd_write.h> + +int ibv_cmd_destroy_ah(struct ibv_ah *ah) +{ + DECLARE_FBCMD_BUFFER(cmdb, UVERBS_OBJECT_AH, + UVERBS_METHOD_AH_DESTROY, 1, NULL); + int ret; + + fill_attr_in_obj(cmdb, UVERBS_ATTR_DESTROY_AH_HANDLE, ah->handle); + + switch (execute_ioctl_fallback(ah->context, destroy_ah, cmdb, &ret)) { + case TRY_WRITE: { + struct ibv_destroy_ah req; + + req.core_payload = (struct ib_uverbs_destroy_ah){ + .ah_handle = ah->handle, + }; + ret = execute_cmd_write_req(ah->context, + IB_USER_VERBS_CMD_DESTROY_AH, &req, + sizeof(req)); + break; + } + + default: + break; + } + + if (verbs_is_destroy_err(&ret)) + return ret; + return 0; +} diff --git a/libibverbs/cmd_counters.c b/libibverbs/cmd_counters.c new file mode 100644 index 0000000..8964fed --- /dev/null +++ b/libibverbs/cmd_counters.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2018 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <infiniband/cmd_ioctl.h> +#include <rdma/ib_user_ioctl_cmds.h> +#include <infiniband/driver.h> +#include <infiniband/cmd_write.h> + +int ibv_cmd_create_counters(struct ibv_context *context, + struct ibv_counters_init_attr *init_attr, + struct verbs_counters *vcounters, + struct ibv_command_buffer *link) +{ + DECLARE_COMMAND_BUFFER_LINK(cmd, UVERBS_OBJECT_COUNTERS, + UVERBS_METHOD_COUNTERS_CREATE, + 1, + link); + struct ib_uverbs_attr *handle = + fill_attr_out_obj(cmd, UVERBS_ATTR_CREATE_COUNTERS_HANDLE); + int ret; + + if (!check_comp_mask(init_attr->comp_mask, 0)) + return EOPNOTSUPP; + + ret = execute_ioctl(context, cmd); + if (ret) + return ret; + + vcounters->counters.context = context; + vcounters->handle = read_attr_obj(UVERBS_ATTR_CREATE_COUNTERS_HANDLE, handle); + + return 0; +} + +int ibv_cmd_destroy_counters(struct verbs_counters *vcounters) +{ + DECLARE_COMMAND_BUFFER(cmd, UVERBS_OBJECT_COUNTERS, + UVERBS_METHOD_COUNTERS_DESTROY, + 1); + int ret; + + fill_attr_in_obj(cmd, UVERBS_ATTR_DESTROY_COUNTERS_HANDLE, vcounters->handle); + ret = execute_ioctl(vcounters->counters.context, cmd); + if (verbs_is_destroy_err(&ret)) + return ret; + + return 0; +} + +int ibv_cmd_read_counters(struct verbs_counters *vcounters, + uint64_t *counters_value, + uint32_t ncounters, + uint32_t flags, + struct ibv_command_buffer *link) +{ + DECLARE_COMMAND_BUFFER_LINK(cmd, UVERBS_OBJECT_COUNTERS, + UVERBS_METHOD_COUNTERS_READ, + 3, + link); + + fill_attr_in_obj(cmd, UVERBS_ATTR_READ_COUNTERS_HANDLE, vcounters->handle); + fill_attr_out_ptr_array(cmd, UVERBS_ATTR_READ_COUNTERS_BUFF, counters_value, + ncounters); + fill_attr_in_uint32(cmd, UVERBS_ATTR_READ_COUNTERS_FLAGS, flags); + + return execute_ioctl(vcounters->counters.context, cmd); +} diff --git a/libibverbs/cmd_cq.c b/libibverbs/cmd_cq.c new file mode 100644 index 0000000..542daa7 --- /dev/null +++ b/libibverbs/cmd_cq.c @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2018 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <infiniband/cmd_write.h> + +static int ibv_icmd_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, int comp_vector, + uint32_t flags, struct ibv_cq *cq, + struct ibv_command_buffer *link) +{ + DECLARE_FBCMD_BUFFER(cmdb, UVERBS_OBJECT_CQ, UVERBS_METHOD_CQ_CREATE, 7, link); + struct ib_uverbs_attr *handle; + uint32_t resp_cqe; + int ret; + + cq->context = context; + + handle = fill_attr_out_obj(cmdb, UVERBS_ATTR_CREATE_CQ_HANDLE); + fill_attr_out_ptr(cmdb, UVERBS_ATTR_CREATE_CQ_RESP_CQE, &resp_cqe); + + fill_attr_in_uint32(cmdb, UVERBS_ATTR_CREATE_CQ_CQE, cqe); + fill_attr_in_uint64(cmdb, UVERBS_ATTR_CREATE_CQ_USER_HANDLE, (uintptr_t)cq); + if (channel) + fill_attr_in_fd(cmdb, UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL, channel->fd); + fill_attr_in_uint32(cmdb, UVERBS_ATTR_CREATE_CQ_COMP_VECTOR, comp_vector); + + if (flags) { + fallback_require_ex(cmdb); + fill_attr_in_uint32(cmdb, UVERBS_ATTR_CREATE_CQ_FLAGS, flags); + } + + switch (execute_ioctl_fallback(cq->context, create_cq, cmdb, &ret)) { + case TRY_WRITE: { + DECLARE_LEGACY_UHW_BUFS(link, IB_USER_VERBS_CMD_CREATE_CQ); + + *req = (struct ib_uverbs_create_cq){ + .user_handle = (uintptr_t)cq, + .cqe = cqe, + .comp_vector = comp_vector, + .comp_channel = channel ? channel->fd : -1, + }; + + ret = execute_write_bufs( + cq->context, IB_USER_VERBS_CMD_CREATE_CQ, req, resp); + if (ret) + return ret; + + cq->handle = resp->cq_handle; + cq->cqe = resp->cqe; + + return 0; + } + case TRY_WRITE_EX: { + DECLARE_LEGACY_UHW_BUFS_EX(link, + IB_USER_VERBS_EX_CMD_CREATE_CQ); + + *req = (struct ib_uverbs_ex_create_cq){ + .user_handle = (uintptr_t)cq, + .cqe = cqe, + .comp_vector = comp_vector, + .comp_channel = channel ? channel->fd : -1, + .flags = flags, + }; + + ret = execute_write_bufs_ex( + cq->context, IB_USER_VERBS_EX_CMD_CREATE_CQ, req, resp); + if (ret) + return ret; + + cq->handle = resp->base.cq_handle; + cq->cqe = resp->base.cqe; + + return 0; + } + + case ERROR: + return ret; + + case SUCCESS: + break; + } + + cq->handle = read_attr_obj(UVERBS_ATTR_CREATE_CQ_HANDLE, handle); + cq->cqe = resp_cqe; + + return 0; +} + +int ibv_cmd_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, int comp_vector, + struct ibv_cq *cq, struct ibv_create_cq *cmd, + size_t cmd_size, struct ib_uverbs_create_cq_resp *resp, + size_t resp_size) +{ + DECLARE_CMD_BUFFER_COMPAT(cmdb, UVERBS_OBJECT_CQ, + UVERBS_METHOD_CQ_CREATE, cmd, cmd_size, resp, + resp_size); + + return ibv_icmd_create_cq(context, cqe, channel, comp_vector, 0, cq, + cmdb); +} + +int ibv_cmd_create_cq_ex(struct ibv_context *context, + struct ibv_cq_init_attr_ex *cq_attr, + struct ibv_cq_ex *cq, + struct ibv_create_cq_ex *cmd, + size_t cmd_size, + struct ib_uverbs_ex_create_cq_resp *resp, + size_t resp_size) +{ + DECLARE_CMD_BUFFER_COMPAT(cmdb, UVERBS_OBJECT_CQ, + UVERBS_METHOD_CQ_CREATE, cmd, cmd_size, resp, + resp_size); + uint32_t flags = 0; + + if (!check_comp_mask(cq_attr->comp_mask, IBV_CQ_INIT_ATTR_MASK_FLAGS)) + return EOPNOTSUPP; + + if (cq_attr->wc_flags & IBV_WC_EX_WITH_COMPLETION_TIMESTAMP) + flags |= IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION; + + if (cq_attr->flags & IBV_CREATE_CQ_ATTR_IGNORE_OVERRUN) + flags |= IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN; + + return ibv_icmd_create_cq(context, cq_attr->cqe, cq_attr->channel, + cq_attr->comp_vector, flags, + ibv_cq_ex_to_cq(cq), cmdb); +} + +int ibv_cmd_destroy_cq(struct ibv_cq *cq) +{ + DECLARE_FBCMD_BUFFER(cmdb, UVERBS_OBJECT_CQ, UVERBS_METHOD_CQ_DESTROY, 2, + NULL); + struct 
ib_uverbs_destroy_cq_resp resp; + int ret; + + fill_attr_out_ptr(cmdb, UVERBS_ATTR_DESTROY_CQ_RESP, &resp); + fill_attr_in_obj(cmdb, UVERBS_ATTR_DESTROY_CQ_HANDLE, cq->handle); + + switch (execute_ioctl_fallback(cq->context, destroy_cq, cmdb, &ret)) { + case TRY_WRITE: { + struct ibv_destroy_cq req; + + req.core_payload = (struct ib_uverbs_destroy_cq){ + .cq_handle = cq->handle, + }; + + ret = execute_cmd_write(cq->context, + IB_USER_VERBS_CMD_DESTROY_CQ, &req, + sizeof(req), &resp, sizeof(resp)); + break; + } + + default: + break; + } + + if (verbs_is_destroy_err(&ret)) + return ret; + + pthread_mutex_lock(&cq->mutex); + while (cq->comp_events_completed != resp.comp_events_reported || + cq->async_events_completed != resp.async_events_reported) + pthread_cond_wait(&cq->cond, &cq->mutex); + pthread_mutex_unlock(&cq->mutex); + + return 0; +} diff --git a/libibverbs/cmd_device.c b/libibverbs/cmd_device.c new file mode 100644 index 0000000..4de59c0 --- /dev/null +++ b/libibverbs/cmd_device.c @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2018 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <infiniband/cmd_write.h> + +static void copy_query_port_resp_to_port_attr(struct ibv_port_attr *port_attr, + struct ib_uverbs_query_port_resp *resp) +{ + port_attr->state = resp->state; + port_attr->max_mtu = resp->max_mtu; + port_attr->active_mtu = resp->active_mtu; + port_attr->gid_tbl_len = resp->gid_tbl_len; + port_attr->port_cap_flags = resp->port_cap_flags; + port_attr->max_msg_sz = resp->max_msg_sz; + port_attr->bad_pkey_cntr = resp->bad_pkey_cntr; + port_attr->qkey_viol_cntr = resp->qkey_viol_cntr; + port_attr->pkey_tbl_len = resp->pkey_tbl_len; + port_attr->lid = resp->lid; + port_attr->sm_lid = resp->sm_lid; + port_attr->lmc = resp->lmc; + port_attr->max_vl_num = resp->max_vl_num; + port_attr->sm_sl = resp->sm_sl; + port_attr->subnet_timeout = resp->subnet_timeout; + port_attr->init_type_reply = resp->init_type_reply; + port_attr->active_width = resp->active_width; + port_attr->active_speed = resp->active_speed; + port_attr->phys_state = resp->phys_state; + port_attr->link_layer = resp->link_layer; + port_attr->flags = resp->flags; +} + +int ibv_cmd_query_port(struct ibv_context *context, uint8_t port_num, + struct ibv_port_attr *port_attr, + struct ibv_query_port *cmd, size_t cmd_size) +{ + DECLARE_FBCMD_BUFFER(cmdb, UVERBS_OBJECT_DEVICE, + UVERBS_METHOD_QUERY_PORT, 2, NULL); + int ret; + struct ib_uverbs_query_port_resp_ex resp_ex = {}; + + fill_attr_const_in(cmdb, UVERBS_ATTR_QUERY_PORT_PORT_NUM, port_num); + fill_attr_out_ptr(cmdb, UVERBS_ATTR_QUERY_PORT_RESP, &resp_ex); + + switch (execute_ioctl_fallback(context, query_port, cmdb, &ret)) { + case TRY_WRITE: { + struct ib_uverbs_query_port_resp resp; + + cmd->port_num = port_num; + memset(cmd->reserved, 0, sizeof(cmd->reserved)); + memset(&resp, 0, sizeof(resp)); + + ret = execute_cmd_write(context, + IB_USER_VERBS_CMD_QUERY_PORT, cmd, + cmd_size, &resp, sizeof(resp)); + if (ret) + return ret; + + copy_query_port_resp_to_port_attr(port_attr, &resp); + break; + } + case SUCCESS: + copy_query_port_resp_to_port_attr(port_attr, + &resp_ex.legacy_resp); + port_attr->port_cap_flags2 = resp_ex.port_cap_flags2; + break; + default: + return ret; + }; + + return 0; +} + +static int cmd_alloc_async_fd(struct ibv_context *context) +{ + DECLARE_COMMAND_BUFFER(cmdb, UVERBS_OBJECT_ASYNC_EVENT, + UVERBS_METHOD_ASYNC_EVENT_ALLOC, 1); + struct ib_uverbs_attr *handle; + int ret; + + handle = fill_attr_out_fd(cmdb, UVERBS_ATTR_ASYNC_EVENT_ALLOC_FD_HANDLE, + 0); + + ret = execute_ioctl(context, cmdb); + if (ret) + return ret; + + context->async_fd = + read_attr_fd(UVERBS_ATTR_ASYNC_EVENT_ALLOC_FD_HANDLE, handle); + return 0; +} + +static int cmd_get_context(struct verbs_context *context_ex, + struct ibv_command_buffer *link) +{ + DECLARE_FBCMD_BUFFER(cmdb, UVERBS_OBJECT_DEVICE, + UVERBS_METHOD_GET_CONTEXT, 2, link); + + struct ibv_context *context = &context_ex->context; + struct verbs_device *verbs_device; + uint64_t core_support; + uint32_t num_comp_vectors; + int ret; + + fill_attr_out_ptr(cmdb, UVERBS_ATTR_GET_CONTEXT_NUM_COMP_VECTORS, + &num_comp_vectors); + fill_attr_out_ptr(cmdb, UVERBS_ATTR_GET_CONTEXT_CORE_SUPPORT, + &core_support); + + /* Using free_context cmd_name as alloc context is not in + * verbs_context_ops while free_context is and doesn't use ioctl + */ + switch (execute_ioctl_fallback(context, free_context, cmdb, &ret)) { + case TRY_WRITE: { + DECLARE_LEGACY_UHW_BUFS(link, IB_USER_VERBS_CMD_GET_CONTEXT); + + ret = execute_write_bufs(context, IB_USER_VERBS_CMD_GET_CONTEXT, + req, resp); + if (ret) + return ret; 
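+		/*
+		 * The legacy write response carries the async event fd
+		 * directly; the ioctl path has no such field and allocates
+		 * the fd separately in the SUCCESS case below.
+		 */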
+ + context->async_fd = resp->async_fd; + context->num_comp_vectors = resp->num_comp_vectors; + + return 0; + } + case SUCCESS: + ret = cmd_alloc_async_fd(context); + if (ret) + return ret; + break; + default: + return ret; + }; + + context->num_comp_vectors = num_comp_vectors; + verbs_device = verbs_get_device(context->device); + verbs_device->core_support = core_support; + return 0; +} + +int ibv_cmd_get_context(struct verbs_context *context_ex, + struct ibv_get_context *cmd, size_t cmd_size, + struct ib_uverbs_get_context_resp *resp, + size_t resp_size) +{ + DECLARE_CMD_BUFFER_COMPAT(cmdb, UVERBS_OBJECT_DEVICE, + UVERBS_METHOD_GET_CONTEXT, cmd, cmd_size, + resp, resp_size); + + return cmd_get_context(context_ex, cmdb); +} diff --git a/libibverbs/cmd_dm.c b/libibverbs/cmd_dm.c new file mode 100644 index 0000000..86b1331 --- /dev/null +++ b/libibverbs/cmd_dm.c @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2018 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <infiniband/cmd_write.h> + +int ibv_cmd_alloc_dm(struct ibv_context *ctx, + const struct ibv_alloc_dm_attr *dm_attr, + struct verbs_dm *dm, + struct ibv_command_buffer *link) +{ + DECLARE_COMMAND_BUFFER_LINK(cmdb, UVERBS_OBJECT_DM, + UVERBS_METHOD_DM_ALLOC, 3, link); + struct ib_uverbs_attr *handle; + int ret; + + handle = fill_attr_out_obj(cmdb, UVERBS_ATTR_ALLOC_DM_HANDLE); + fill_attr_in_uint64(cmdb, UVERBS_ATTR_ALLOC_DM_LENGTH, + dm_attr->length); + fill_attr_in_uint32(cmdb, UVERBS_ATTR_ALLOC_DM_ALIGNMENT, + dm_attr->log_align_req); + + ret = execute_ioctl(ctx, cmdb); + if (ret) + return errno; + + dm->handle = read_attr_obj(UVERBS_ATTR_ALLOC_DM_HANDLE, handle); + dm->dm.context = ctx; + + return 0; +} + +int ibv_cmd_free_dm(struct verbs_dm *dm) +{ + DECLARE_COMMAND_BUFFER(cmdb, UVERBS_OBJECT_DM, UVERBS_METHOD_DM_FREE, + 1); + int ret; + + fill_attr_in_obj(cmdb, UVERBS_ATTR_FREE_DM_HANDLE, dm->handle); + + ret = execute_ioctl(dm->dm.context, cmdb); + if (verbs_is_destroy_err(&ret)) + return ret; + + return 0; +} + +int ibv_cmd_reg_dm_mr(struct ibv_pd *pd, struct verbs_dm *dm, + uint64_t offset, size_t length, + unsigned int access, struct verbs_mr *vmr, + struct ibv_command_buffer *link) +{ + DECLARE_COMMAND_BUFFER_LINK(cmdb, UVERBS_OBJECT_MR, UVERBS_METHOD_DM_MR_REG, + 8, link); + struct ib_uverbs_attr *handle; + uint32_t lkey, rkey; + int ret; + + /* + * DM MRs are always 0 based since the mmap pointer, if it exists, is + * hidden from the user. + */ + if (!(access & IBV_ACCESS_ZERO_BASED)) { + errno = EINVAL; + return errno; + } + + handle = fill_attr_out_obj(cmdb, UVERBS_ATTR_REG_DM_MR_HANDLE); + fill_attr_out_ptr(cmdb, UVERBS_ATTR_REG_DM_MR_RESP_LKEY, &lkey); + fill_attr_out_ptr(cmdb, UVERBS_ATTR_REG_DM_MR_RESP_RKEY, &rkey); + + fill_attr_in_obj(cmdb, UVERBS_ATTR_REG_DM_MR_PD_HANDLE, pd->handle); + fill_attr_in_obj(cmdb, UVERBS_ATTR_REG_DM_MR_DM_HANDLE, dm->handle); + fill_attr_in_uint64(cmdb, UVERBS_ATTR_REG_DM_MR_OFFSET, offset); + fill_attr_in_uint64(cmdb, UVERBS_ATTR_REG_DM_MR_LENGTH, length); + fill_attr_in_uint32(cmdb, UVERBS_ATTR_REG_DM_MR_ACCESS_FLAGS, access); + + ret = execute_ioctl(pd->context, cmdb); + if (ret) + return errno; + + vmr->ibv_mr.handle = + read_attr_obj(UVERBS_ATTR_REG_DM_MR_HANDLE, handle); + vmr->ibv_mr.context = pd->context; + vmr->ibv_mr.lkey = lkey; + vmr->ibv_mr.rkey = rkey; + vmr->ibv_mr.length = length; + vmr->ibv_mr.pd = pd; + vmr->ibv_mr.addr = NULL; + vmr->mr_type = IBV_MR_TYPE_MR; + + return 0; +} diff --git a/libibverbs/cmd_fallback.c b/libibverbs/cmd_fallback.c new file mode 100644 index 0000000..46c09f3 --- /dev/null +++ b/libibverbs/cmd_fallback.c @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2018 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <infiniband/cmd_ioctl.h> +#include <infiniband/cmd_write.h> +#include "ibverbs.h" + +#include <util/compiler.h> +#include <ccan/build_assert.h> + +#include <unistd.h> +#include <valgrind/memcheck.h> + +/* + * Check if the command buffer provided by the driver includes anything that + * is not compatible with the legacy interface. If so, then + * _execute_ioctl_fallback indicates it handled the call and sets the error + * code + */ +enum write_fallback _check_legacy(struct ibv_command_buffer *cmdb, int *ret) +{ + struct ib_uverbs_attr *cur; + bool fallback_require_ex = cmdb->fallback_require_ex; + bool fallback_ioctl_only = cmdb->fallback_ioctl_only; + + for (cmdb = cmdb->next; cmdb; cmdb = cmdb->next) { + for (cur = cmdb->hdr.attrs; cur != cmdb->next_attr; cur++) { + if (cur->attr_id != UVERBS_ATTR_UHW_IN && + cur->attr_id != UVERBS_ATTR_UHW_OUT && + cur->flags & UVERBS_ATTR_F_MANDATORY) + goto not_supp; + } + fallback_require_ex |= cmdb->fallback_require_ex; + fallback_ioctl_only |= cmdb->fallback_ioctl_only; + } + + if (fallback_ioctl_only) + goto not_supp; + + if (fallback_require_ex) + return TRY_WRITE_EX; + return TRY_WRITE; + +not_supp: + errno = EOPNOTSUPP; + *ret = EOPNOTSUPP; + return ERROR; +} + +/* + * Used to support callers that have a fallback to the old write ABI + * interface. + */ +enum write_fallback _execute_ioctl_fallback(struct ibv_context *ctx, + unsigned int cmd_bit, + struct ibv_command_buffer *cmdb, + int *ret) +{ + struct verbs_ex_private *priv = get_priv(ctx); + + if (bitmap_test_bit(priv->unsupported_ioctls, cmd_bit)) + return _check_legacy(cmdb, ret); + + *ret = execute_ioctl(ctx, cmdb); + + if (likely(*ret == 0)) + return SUCCESS; + + if (*ret == ENOTTY) { + /* ENOTTY means the ioctl framework is entirely absent */ + bitmap_fill(priv->unsupported_ioctls, VERBS_OPS_NUM); + return _check_legacy(cmdb, ret); + } + + if (*ret == EPROTONOSUPPORT) { + /* + * EPROTONOSUPPORT means we have the ioctl framework but this + * specific method is not supported + */ + bitmap_set_bit(priv->unsupported_ioctls, cmd_bit); + return _check_legacy(cmdb, ret); + } + + return ERROR; +} + +/* + * Within the command implementation we get a pointer to the request and + * response buffers for the legacy interface. 
This pointer is either allocated + * on the stack (if the driver didn't provide a UHW) or arranged to be + * directly before the UHW memory (see _write_set_uhw) + */ +void *_write_get_req(struct ibv_command_buffer *link, + struct ib_uverbs_cmd_hdr *onstack, size_t size) +{ + struct ib_uverbs_cmd_hdr *hdr; + + size += sizeof(*hdr); + + if (link->uhw_in_idx != _UHW_NO_INDEX) { + struct ib_uverbs_attr *uhw = &link->hdr.attrs[link->uhw_in_idx]; + + assert(uhw->attr_id == UVERBS_ATTR_UHW_IN); + assert(link->uhw_in_headroom_dwords * 4 >= size); + hdr = (void *)((uintptr_t)uhw->data - size); + hdr->in_words = __check_divide(size + uhw->len, 4); + } else { + hdr = onstack; + hdr->in_words = __check_divide(size, 4); + } + + return hdr + 1; +} + +void *_write_get_req_ex(struct ibv_command_buffer *link, struct ex_hdr *onstack, + size_t size) +{ + struct ex_hdr *hdr; + size_t full_size = size + sizeof(*hdr); + + if (link->uhw_in_idx != _UHW_NO_INDEX) { + struct ib_uverbs_attr *uhw = &link->hdr.attrs[link->uhw_in_idx]; + + assert(uhw->attr_id == UVERBS_ATTR_UHW_IN); + assert(link->uhw_in_headroom_dwords * 4 >= full_size); + hdr = (void *)((uintptr_t)uhw->data - full_size); + hdr->ex_hdr.provider_in_words = __check_divide(uhw->len, 8); + } else { + hdr = onstack; + hdr->ex_hdr.provider_in_words = 0; + } + + return hdr + 1; +} + +void *_write_get_resp(struct ibv_command_buffer *link, + struct ib_uverbs_cmd_hdr *hdr, void *onstack, + size_t resp_size) +{ + void *resp_start; + + if (link->uhw_out_idx != _UHW_NO_INDEX) { + struct ib_uverbs_attr *uhw = + &link->hdr.attrs[link->uhw_out_idx]; + + assert(uhw->attr_id == UVERBS_ATTR_UHW_OUT); + assert(link->uhw_out_headroom_dwords * 4 >= resp_size); + resp_start = (void *)((uintptr_t)uhw->data - resp_size); + hdr->out_words = __check_divide(resp_size + uhw->len, 4); + } else { + resp_start = onstack; + hdr->out_words = __check_divide(resp_size, 4); + } + + return resp_start; +} + +void *_write_get_resp_ex(struct ibv_command_buffer *link, + struct ex_hdr *hdr, void *onstack, + size_t resp_size) +{ + void *resp_start; + + if (link->uhw_out_idx != _UHW_NO_INDEX) { + struct ib_uverbs_attr *uhw = + &link->hdr.attrs[link->uhw_out_idx]; + + assert(uhw->attr_id == UVERBS_ATTR_UHW_OUT); + assert(link->uhw_out_headroom_dwords * 4 >= resp_size); + resp_start = (void *)((uintptr_t)uhw->data - resp_size); + hdr->ex_hdr.provider_out_words = __check_divide(uhw->len, 8); + } else { + resp_start = onstack; + hdr->ex_hdr.provider_out_words = 0; + } + + return resp_start; +} + +static int ioctl_write(struct ibv_context *ctx, unsigned int write_method, + const void *req, size_t core_req_size, size_t req_size, + void *resp, size_t core_resp_size, size_t resp_size) +{ + DECLARE_COMMAND_BUFFER(cmdb, UVERBS_OBJECT_DEVICE, + UVERBS_METHOD_INVOKE_WRITE, 5); + + fill_attr_const_in(cmdb, UVERBS_ATTR_WRITE_CMD, write_method); + + if (core_req_size) + fill_attr_in(cmdb, UVERBS_ATTR_CORE_IN, req, core_req_size); + if (core_resp_size) + fill_attr_out(cmdb, UVERBS_ATTR_CORE_OUT, resp, core_resp_size); + + if (req_size - core_req_size) + fill_attr_in(cmdb, UVERBS_ATTR_UHW_IN, req + core_req_size, + req_size - core_req_size); + if (resp_size - core_resp_size) + fill_attr_out(cmdb, UVERBS_ATTR_UHW_OUT, resp + core_resp_size, + resp_size - core_resp_size); + + return execute_ioctl(ctx, cmdb); +} + +int _execute_cmd_write(struct ibv_context *ctx, unsigned int write_method, + struct ib_uverbs_cmd_hdr *req, size_t core_req_size, + size_t req_size, void *resp, size_t core_resp_size, + size_t resp_size) 
+{ + struct verbs_ex_private *priv = get_priv(ctx); + + if (!VERBS_WRITE_ONLY && (VERBS_IOCTL_ONLY || priv->use_ioctl_write)) + return ioctl_write(ctx, write_method, req + 1, + core_req_size - sizeof(*req), + req_size - sizeof(*req), resp, + core_resp_size, resp_size); + + req->command = write_method; + req->in_words = __check_divide(req_size, 4); + req->out_words = __check_divide(resp_size, 4); + + if (write(ctx->cmd_fd, req, req_size) != req_size) + return errno; + + if (resp) + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +/* + * req_size is the total length of the ex_hdr, core payload and driver data. + * core_req_size is the total length of the ex_hdr and core_payload. + */ +int _execute_cmd_write_ex(struct ibv_context *ctx, unsigned int write_method, + struct ex_hdr *req, size_t core_req_size, + size_t req_size, void *resp, size_t core_resp_size, + size_t resp_size) +{ + struct verbs_ex_private *priv = get_priv(ctx); + + if (!VERBS_WRITE_ONLY && (VERBS_IOCTL_ONLY || priv->use_ioctl_write)) + return ioctl_write( + ctx, IB_USER_VERBS_CMD_FLAG_EXTENDED | write_method, + req + 1, core_req_size - sizeof(*req), + req_size - sizeof(*req), resp, core_resp_size, + resp_size); + + req->hdr.command = IB_USER_VERBS_CMD_FLAG_EXTENDED | write_method; + req->hdr.in_words = + __check_divide(core_req_size - sizeof(struct ex_hdr), 8); + req->hdr.out_words = __check_divide(core_resp_size, 8); + req->ex_hdr.provider_in_words = + __check_divide(req_size - core_req_size, 8); + req->ex_hdr.provider_out_words = + __check_divide(resp_size - core_resp_size, 8); + req->ex_hdr.response = ioctl_ptr_to_u64(resp); + req->ex_hdr.cmd_hdr_reserved = 0; + + /* + * Users assumes the stack buffer is zeroed before passing to the + * kernel for writing. New kernels with the ioctl path do this + * automatically for us. + */ + if (resp) + memset(resp, 0, resp_size); + + if (write(ctx->cmd_fd, req, req_size) != req_size) + return errno; + + if (resp) + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} diff --git a/libibverbs/cmd_flow.c b/libibverbs/cmd_flow.c new file mode 100644 index 0000000..2cb09b9 --- /dev/null +++ b/libibverbs/cmd_flow.c @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2018 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <infiniband/cmd_write.h>
+
+int ibv_cmd_destroy_flow(struct ibv_flow *flow_id)
+{
+	DECLARE_FBCMD_BUFFER(cmdb, UVERBS_OBJECT_FLOW,
+			     UVERBS_METHOD_FLOW_DESTROY, 1, NULL);
+	int ret;
+
+	fill_attr_in_obj(cmdb, UVERBS_ATTR_DESTROY_FLOW_HANDLE,
+			 flow_id->handle);
+
+	switch (execute_ioctl_fallback(flow_id->context, destroy_flow, cmdb,
+				       &ret)) {
+	case TRY_WRITE: {
+		struct ibv_destroy_flow req;
+
+		req.core_payload = (struct ib_uverbs_destroy_flow){
+			.flow_handle = flow_id->handle,
+		};
+		ret = execute_cmd_write_ex_req(
+			flow_id->context, IB_USER_VERBS_EX_CMD_DESTROY_FLOW,
+			&req, sizeof(req));
+		break;
+	}
+
+	default:
+		break;
+	}
+
+	if (verbs_is_destroy_err(&ret))
+		return ret;
+	return 0;
+}
diff --git a/libibverbs/cmd_flow_action.c b/libibverbs/cmd_flow_action.c
new file mode 100644
index 0000000..0bb8149
--- /dev/null
+++ b/libibverbs/cmd_flow_action.c
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2018 Mellanox Technologies, Ltd. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ + +#include <infiniband/cmd_ioctl.h> +#include <rdma/ib_user_ioctl_cmds.h> +#include <infiniband/driver.h> +#include <infiniband/cmd_write.h> + +static void scrub_esp_encap(struct ibv_flow_action_esp_encap *esp_encap) +{ + scrub_ptr_attr(esp_encap->val_ptr); + scrub_ptr_attr(esp_encap->next_ptr); +} + +static int copy_flow_action_esp(struct ibv_flow_action_esp_attr *esp, + struct ibv_command_buffer *cmd) +{ + if (esp->comp_mask & IBV_FLOW_ACTION_ESP_MASK_ESN) + fill_attr_in(cmd, UVERBS_ATTR_FLOW_ACTION_ESP_ESN, + &esp->esn, sizeof(esp->esn)); + + if (esp->keymat_ptr) + fill_attr_in_enum(cmd, UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT, + esp->keymat_proto, + esp->keymat_ptr, esp->keymat_len); + if (esp->replay_ptr) + fill_attr_in_enum(cmd, UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY, + esp->replay_proto, + esp->replay_ptr, esp->replay_len); + if (esp->esp_encap) { + scrub_esp_encap(esp->esp_encap); + fill_attr_in_ptr(cmd, UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP, + esp->esp_encap); + } + if (esp->esp_attr) + fill_attr_in_ptr(cmd, UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS, + esp->esp_attr); + + return 0; +} + +#define FLOW_ACTION_ESP_ATTRS_NUM 6 +int ibv_cmd_create_flow_action_esp(struct ibv_context *ctx, + struct ibv_flow_action_esp_attr *attr, + struct verbs_flow_action *flow_action, + struct ibv_command_buffer *driver) +{ + DECLARE_COMMAND_BUFFER_LINK(cmd, UVERBS_OBJECT_FLOW_ACTION, + UVERBS_METHOD_FLOW_ACTION_ESP_CREATE, + FLOW_ACTION_ESP_ATTRS_NUM, + driver); + struct ib_uverbs_attr *handle = fill_attr_out_obj( + cmd, UVERBS_ATTR_CREATE_FLOW_ACTION_ESP_HANDLE); + int ret; + + ret = copy_flow_action_esp(attr, cmd); + if (ret) + return ret; + + ret = execute_ioctl(ctx, cmd); + if (ret) + return errno; + + flow_action->action.context = ctx; + flow_action->type = IBV_FLOW_ACTION_ESP; + flow_action->handle = read_attr_obj( + UVERBS_ATTR_CREATE_FLOW_ACTION_ESP_HANDLE, handle); + + return 0; +} + +int ibv_cmd_modify_flow_action_esp(struct verbs_flow_action *flow_action, + struct ibv_flow_action_esp_attr *attr, + struct ibv_command_buffer *driver) +{ + DECLARE_COMMAND_BUFFER_LINK(cmd, UVERBS_OBJECT_FLOW_ACTION, + UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY, + FLOW_ACTION_ESP_ATTRS_NUM, driver); + int ret; + + fill_attr_in_obj(cmd, UVERBS_ATTR_MODIFY_FLOW_ACTION_ESP_HANDLE, + flow_action->handle); + + ret = copy_flow_action_esp(attr, cmd); + if (ret) + return ret; + + return execute_ioctl(flow_action->action.context, cmd); +} + +int ibv_cmd_destroy_flow_action(struct verbs_flow_action *action) +{ + DECLARE_COMMAND_BUFFER(cmd, UVERBS_OBJECT_FLOW_ACTION, + UVERBS_METHOD_FLOW_ACTION_DESTROY, 1); + int ret; + + fill_attr_in_obj(cmd, UVERBS_ATTR_DESTROY_FLOW_ACTION_HANDLE, + action->handle); + ret = execute_ioctl(action->action.context, cmd); + if (verbs_is_destroy_err(&ret)) + return ret; + + return 0; +} + diff --git a/libibverbs/cmd_ioctl.c b/libibverbs/cmd_ioctl.c new file mode 100644 index 0000000..2a46c49 --- /dev/null +++ b/libibverbs/cmd_ioctl.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2018 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <infiniband/cmd_ioctl.h> +#include <infiniband/cmd_write.h> +#include "ibverbs.h" + +#include <sys/ioctl.h> +#include <infiniband/driver.h> + +#include <rdma/ib_user_ioctl_cmds.h> +#include <valgrind/memcheck.h> + +/* Number of attrs in this and all the link'd buffers */ +unsigned int __ioctl_final_num_attrs(unsigned int num_attrs, + struct ibv_command_buffer *link) +{ + for (; link; link = link->next) + num_attrs += link->next_attr - link->hdr.attrs; + + return num_attrs; +} + +/* Linearize the link'd buffers into this one */ +static void prepare_attrs(struct ibv_command_buffer *cmd) +{ + struct ib_uverbs_attr *end = cmd->next_attr; + struct ibv_command_buffer *link; + + for (link = cmd->next; link; link = link->next) { + struct ib_uverbs_attr *cur; + + assert(cmd->hdr.object_id == link->hdr.object_id); + assert(cmd->hdr.method_id == link->hdr.method_id); + + /* + * Keep track of where the uhw_in lands in the final array if + * we copy it from a link + */ + if (!VERBS_IOCTL_ONLY && link->uhw_in_idx != _UHW_NO_INDEX) { + assert(cmd->uhw_in_idx == _UHW_NO_INDEX); + cmd->uhw_in_idx = + link->uhw_in_idx + (end - cmd->hdr.attrs); + } + + for (cur = link->hdr.attrs; cur != link->next_attr; cur++) + *end++ = *cur; + + assert(end <= cmd->last_attr); + } + + cmd->hdr.num_attrs = end - cmd->hdr.attrs; + + /* + * We keep the in UHW uninlined until directly before sending to + * support the compat path. 
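+	 * Here "uninlined" means the payload is still referenced by pointer;
+	 * the kernel ABI requires payloads of 8 bytes or less to be copied
+	 * directly into the attr's .data field, which is done just below.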
See _fill_attr_in_uhw + */ + if (!VERBS_IOCTL_ONLY && cmd->uhw_in_idx != _UHW_NO_INDEX) { + struct ib_uverbs_attr *uhw = &cmd->hdr.attrs[cmd->uhw_in_idx]; + + assert(uhw->attr_id == UVERBS_ATTR_UHW_IN); + + if (uhw->len <= sizeof(uhw->data)) + memcpy(&uhw->data, (void *)(uintptr_t)uhw->data, + uhw->len); + } +} + +static void finalize_attr(struct ib_uverbs_attr *attr) +{ + /* Only matches UVERBS_ATTR_TYPE_PTR_OUT */ + if (attr->flags & UVERBS_ATTR_F_VALID_OUTPUT && attr->len) + VALGRIND_MAKE_MEM_DEFINED((void *)(uintptr_t)attr->data, + attr->len); +} + +/* + * Copy the link'd attrs back to their source and make all output buffers safe + * for VALGRIND + */ +static void finalize_attrs(struct ibv_command_buffer *cmd) +{ + struct ibv_command_buffer *link; + struct ib_uverbs_attr *end; + + for (end = cmd->hdr.attrs; end != cmd->next_attr; end++) + finalize_attr(end); + + for (link = cmd->next; link; link = link->next) { + struct ib_uverbs_attr *cur; + + for (cur = link->hdr.attrs; cur != link->next_attr; cur++) { + finalize_attr(end); + *cur = *end++; + } + } +} + +int execute_ioctl(struct ibv_context *context, struct ibv_command_buffer *cmd) +{ + struct verbs_context *vctx = verbs_get_ctx(context); + + /* + * One of the fill functions was given input that cannot be marshaled + */ + if (unlikely(cmd->buffer_error)) { + errno = EINVAL; + return errno; + } + + prepare_attrs(cmd); + cmd->hdr.length = sizeof(cmd->hdr) + + sizeof(cmd->hdr.attrs[0]) * cmd->hdr.num_attrs; + cmd->hdr.reserved1 = 0; + cmd->hdr.reserved2 = 0; + cmd->hdr.driver_id = vctx->priv->driver_id; + + if (ioctl(context->cmd_fd, RDMA_VERBS_IOCTL, &cmd->hdr)) + return errno; + + finalize_attrs(cmd); + + return 0; +} + +/* + * The compat scheme for UHW IN requires a pointer in .data, however the + * kernel protocol requires pointers < 8 to be inlined into .data. We defer + * that transformation until directly before the ioctl. + */ +static inline struct ib_uverbs_attr * +_fill_attr_in_uhw(struct ibv_command_buffer *cmd, uint16_t attr_id, + const void *data, size_t len) +{ + struct ib_uverbs_attr *attr = _ioctl_next_attr(cmd, attr_id); + + if (unlikely(len > UINT16_MAX)) + cmd->buffer_error = 1; + + attr->len = len; + attr->data = ioctl_ptr_to_u64(data); + + return attr; +} + +/* + * This helper is used in the driver compat wrappers to build the + * command buffer from the legacy input pointers format. + */ +void _write_set_uhw(struct ibv_command_buffer *cmdb, const void *req, + size_t core_req_size, size_t req_size, void *resp, + size_t core_resp_size, size_t resp_size) +{ + if (req && core_req_size < req_size) { + if (VERBS_IOCTL_ONLY) + cmdb->uhw_in_idx = + fill_attr_in(cmdb, UVERBS_ATTR_UHW_IN, + (uint8_t *)req + core_req_size, + req_size - core_req_size) - + cmdb->hdr.attrs; + else + cmdb->uhw_in_idx = + _fill_attr_in_uhw(cmdb, UVERBS_ATTR_UHW_IN, + (uint8_t *)req + + core_req_size, + req_size - core_req_size) - + cmdb->hdr.attrs; + cmdb->uhw_in_headroom_dwords = __check_divide(core_req_size, 4); + } + + + if (resp && core_resp_size < resp_size) { + cmdb->uhw_out_idx = + fill_attr_out(cmdb, UVERBS_ATTR_UHW_OUT, + (uint8_t *)resp + core_resp_size, + resp_size - core_resp_size) - + cmdb->hdr.attrs; + cmdb->uhw_out_headroom_dwords = + __check_divide(core_resp_size, 4); + } +} diff --git a/libibverbs/cmd_ioctl.h b/libibverbs/cmd_ioctl.h new file mode 100644 index 0000000..5587b86 --- /dev/null +++ b/libibverbs/cmd_ioctl.h @@ -0,0 +1,409 @@ +/* + * Copyright (c) 2018 Mellanox Technologies, Ltd. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __INFINIBAND_VERBS_IOCTL_H +#define __INFINIBAND_VERBS_IOCTL_H + +#include <config.h> + +#include <stdint.h> +#include <assert.h> +#include <rdma/rdma_user_ioctl_cmds.h> +#include <infiniband/verbs.h> +#include <ccan/container_of.h> +#include <util/compiler.h> + +static inline uint64_t ioctl_ptr_to_u64(const void *ptr) +{ + if (sizeof(ptr) == sizeof(uint64_t)) + return (uintptr_t)ptr; + + /* + * Some CPU architectures require sign extension when converting from + * a 32 bit to 64 bit pointer. This should match the kernel + * implementation of compat_ptr() for the architecture. + */ +#if defined(__tilegx__) + return (int64_t)(intptr_t)ptr; +#else + return (uintptr_t)ptr; +#endif +} + +static inline void _scrub_ptr_attr(void **ptr) +{ +#if UINTPTR_MAX == UINT64_MAX + /* Do nothing */ +#else + RDMA_UAPI_PTR(void *, data) *scrub_data; + + scrub_data = container_of(ptr, typeof(*scrub_data), data); + scrub_data->data_data_u64 = ioctl_ptr_to_u64(scrub_data->data); +#endif +} + +#define scrub_ptr_attr(ptr) _scrub_ptr_attr((void **)(&ptr)) + +/* + * The command buffer is organized as a linked list of blocks of attributes. + * Each stack frame allocates its block and then calls up toward to core code + * which will do the ioctl. The frame that does the ioctl calls the special + * FINAL variant which will allocate enough space to linearize the attribute + * buffer for the kernel. + * + * The current range of attributes to fill is next_attr -> last_attr. + */ +struct ibv_command_buffer { + struct ibv_command_buffer *next; + struct ib_uverbs_attr *next_attr; + struct ib_uverbs_attr *last_attr; + /* + * Used by the legacy write interface to keep track of where the UHW + * buffer is located and the 'headroom' space that the common code + * uses to construct the command header and common command struct + * directly before the drivers' UHW. 
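+	 * _write_get_req() and _write_get_resp() later carve the legacy
+	 * command headers out of this headroom, immediately in front of the
+	 * driver data.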
+ */
+	uint8_t uhw_in_idx;
+	uint8_t uhw_out_idx;
+	uint8_t uhw_in_headroom_dwords;
+	uint8_t uhw_out_headroom_dwords;
+
+	uint8_t buffer_error:1;
+	/*
+	 * These flags control what execute_ioctl_fallback does if the kernel
+	 * does not support ioctl
+	 */
+	uint8_t fallback_require_ex:1;
+	uint8_t fallback_ioctl_only:1;
+	struct ib_uverbs_ioctl_hdr hdr;
+};
+
+enum {_UHW_NO_INDEX = 0xFF};
+
+/*
+ * Constructing an array of ibv_command_buffer is a reasonable way to expand
+ * the VLA in hdr.attrs on the stack and also allocate some internal state in
+ * a single contiguous stack memory region. It will over-allocate the region in
+ * some cases, but this approach allows the number of elements to be dynamic,
+ * and not fixed as a compile time constant.
+ */
+#define _IOCTL_NUM_CMDB(_num_attrs)                                           \
+	((sizeof(struct ibv_command_buffer) +                                 \
+	  sizeof(struct ib_uverbs_attr) * (_num_attrs) +                      \
+	  sizeof(struct ibv_command_buffer) - 1) /                            \
+	 sizeof(struct ibv_command_buffer))
+
+unsigned int __ioctl_final_num_attrs(unsigned int num_attrs,
+				     struct ibv_command_buffer *link);
+
+/* If the user doesn't provide a link then don't create a VLA */
+#define _ioctl_final_num_attrs(_num_attrs, _link)                             \
+	((__builtin_constant_p(!(_link)) && !(_link))                         \
+		 ? (_num_attrs)                                               \
+		 : __ioctl_final_num_attrs(_num_attrs, _link))
+
+#define _COMMAND_BUFFER_INIT(_hdr, _object_id, _method_id, _num_attrs, _link) \
+	((struct ibv_command_buffer){                                         \
+		.hdr =                                                        \
+			{                                                     \
+				.object_id = (_object_id),                    \
+				.method_id = (_method_id),                    \
+			},                                                    \
+		.next = _link,                                                \
+		.uhw_in_idx = _UHW_NO_INDEX,                                  \
+		.uhw_out_idx = _UHW_NO_INDEX,                                 \
+		.next_attr = (_hdr).attrs,                                    \
+		.last_attr = (_hdr).attrs + _num_attrs})
+
+/*
+ * C99 does not permit an initializer for VLAs, so this function does the init
+ * instead. It is called in the wonky way so that DECLARE_COMMAND_BUFFER can
+ * still be a 'variable', and so we don't require C11 mode.
+ */
+static inline int _ioctl_init_cmdb(struct ibv_command_buffer *cmd,
+				   uint16_t object_id, uint16_t method_id,
+				   size_t num_attrs,
+				   struct ibv_command_buffer *link)
+{
+	*cmd = _COMMAND_BUFFER_INIT(cmd->hdr, object_id, method_id, num_attrs,
+				    link);
+	return 0;
+}
+
+/*
+ * Construct an IOCTL command buffer on the stack with enough space for
+ * _num_attrs elements. _num_attrs does not have to be a compile time constant.
+ * _link is a previous COMMAND_BUFFER in the call chain.
+ */
+#ifndef __CHECKER__
+#define DECLARE_COMMAND_BUFFER_LINK(_name, _object_id, _method_id, _num_attrs,\
+				    _link)                                    \
+	const unsigned int __##_name##total =                                 \
+		_ioctl_final_num_attrs(_num_attrs, _link);                    \
+	struct ibv_command_buffer _name[_IOCTL_NUM_CMDB(__##_name##total)];   \
+	int __attribute__((unused)) __##_name##dummy = _ioctl_init_cmdb(      \
+		_name, _object_id, _method_id, __##_name##total, _link)
+#else
+/*
+ * sparse enforces kernel rules which forbid VLAs. Make the VLA into a static
+ * array when running sparse. Don't actually run the sparse compile result.
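+ * The fixed bound of 10 entries is arbitrary; it only has to satisfy
+ * sparse's no-VLA rule, since the sparse build output is never run.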
+ */ +#define DECLARE_COMMAND_BUFFER_LINK(_name, _object_id, _method_id, _num_attrs, \ + _link) \ + struct ibv_command_buffer _name[10]; \ + int __attribute__((unused)) __##_name##dummy = \ + _ioctl_init_cmdb(_name, _object_id, _method_id, 10, _link) +#endif + +#define DECLARE_COMMAND_BUFFER(_name, _object_id, _method_id, _num_attrs) \ + DECLARE_COMMAND_BUFFER_LINK(_name, _object_id, _method_id, _num_attrs, \ + NULL) + +int execute_ioctl(struct ibv_context *context, struct ibv_command_buffer *cmd); + +static inline struct ib_uverbs_attr * +_ioctl_next_attr(struct ibv_command_buffer *cmd, uint16_t attr_id) +{ + struct ib_uverbs_attr *attr; + + assert(cmd->next_attr < cmd->last_attr); + attr = cmd->next_attr++; + + *attr = (struct ib_uverbs_attr){ + .attr_id = attr_id, + /* + * All attributes default to mandatory. Wrapper the fill_* + * call in attr_optional() to make it optional. + */ + .flags = UVERBS_ATTR_F_MANDATORY, + }; + + return attr; +} + +/* + * This construction is insane, an expression with a side effect that returns + * from the calling function, but it is a non-invasive way to get the compiler + * to elide the IOCTL support in the backwards compat command functions + * without disturbing native ioctl support. + * + * A command function will set last_attr on the stack to NULL, and if it is + * coded properly, the compiler will prove that last_attr is never changed and + * elide the function. Unfortunately this penalizes native ioctl uses with the + * extra if overhead. + * + * For this reason, _ioctl_next_attr must never be called outside a fill + * function. + */ +#if VERBS_WRITE_ONLY +#define _ioctl_next_attr(cmd, attr_id) \ + ({ \ + if (!((cmd)->last_attr)) \ + return NULL; \ + _ioctl_next_attr(cmd, attr_id); \ + }) +#endif + +/* Make the attribute optional. 
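+ * By default every attribute is mandatory (see _ioctl_next_attr); clearing
+ * UVERBS_ATTR_F_MANDATORY lets a kernel that does not recognize the
+ * attribute ignore it instead of failing the method. Typical use wraps a
+ * fill helper, e.g. (with some_attr_id standing in for a real attr id):
+ *   attr_optional(fill_attr_in_uint32(cmd, some_attr_id, val));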
*/ +static inline struct ib_uverbs_attr *attr_optional(struct ib_uverbs_attr *attr) +{ + attr->flags &= ~UVERBS_ATTR_F_MANDATORY; + return attr; +} + +/* Send attributes of kernel type UVERBS_ATTR_TYPE_IDR */ +static inline struct ib_uverbs_attr * +fill_attr_in_obj(struct ibv_command_buffer *cmd, uint16_t attr_id, uint32_t idr) +{ + struct ib_uverbs_attr *attr = _ioctl_next_attr(cmd, attr_id); + + /* UVERBS_ATTR_TYPE_IDR uses a 64 bit value for the idr # */ + attr->data = idr; + return attr; +} + +static inline struct ib_uverbs_attr * +fill_attr_out_obj(struct ibv_command_buffer *cmd, uint16_t attr_id) +{ + return fill_attr_in_obj(cmd, attr_id, 0); +} + +static inline uint32_t read_attr_obj(uint16_t attr_id, + struct ib_uverbs_attr *attr) +{ + assert(attr->attr_id == attr_id); + return attr->data; +} + +/* Send attributes of kernel type UVERBS_ATTR_TYPE_PTR_IN */ +static inline struct ib_uverbs_attr * +fill_attr_in(struct ibv_command_buffer *cmd, uint16_t attr_id, const void *data, + size_t len) +{ + struct ib_uverbs_attr *attr = _ioctl_next_attr(cmd, attr_id); + + if (unlikely(len > UINT16_MAX)) + cmd->buffer_error = 1; + + attr->len = len; + if (len <= sizeof(uint64_t)) + memcpy(&attr->data, data, len); + else + attr->data = ioctl_ptr_to_u64(data); + + return attr; +} + +#define fill_attr_in_ptr(cmd, attr_id, ptr) \ + fill_attr_in(cmd, attr_id, ptr, sizeof(*ptr)) + +/* Send attributes of various inline kernel types */ + +static inline struct ib_uverbs_attr * +fill_attr_in_uint64(struct ibv_command_buffer *cmd, uint16_t attr_id, + uint64_t data) +{ + struct ib_uverbs_attr *attr = _ioctl_next_attr(cmd, attr_id); + + attr->len = sizeof(data); + attr->data = data; + + return attr; +} + +#define fill_attr_const_in(cmd, attr_id, _data) \ + fill_attr_in_uint64(cmd, attr_id, _data) + +static inline struct ib_uverbs_attr * +fill_attr_in_uint32(struct ibv_command_buffer *cmd, uint16_t attr_id, + uint32_t data) +{ + struct ib_uverbs_attr *attr = _ioctl_next_attr(cmd, attr_id); + + attr->len = sizeof(data); + memcpy(&attr->data, &data, sizeof(data)); + + return attr; +} + +static inline struct ib_uverbs_attr * +fill_attr_in_fd(struct ibv_command_buffer *cmd, uint16_t attr_id, int fd) +{ + struct ib_uverbs_attr *attr; + + if (fd == -1) + return NULL; + + attr = _ioctl_next_attr(cmd, attr_id); + /* UVERBS_ATTR_TYPE_FD uses a 64 bit value for the idr # */ + attr->data = fd; + return attr; +} + +static inline struct ib_uverbs_attr * +fill_attr_out_fd(struct ibv_command_buffer *cmd, uint16_t attr_id, int fd) +{ + struct ib_uverbs_attr *attr = _ioctl_next_attr(cmd, attr_id); + + attr->data = 0; + return attr; +} + +static inline int read_attr_fd(uint16_t attr_id, struct ib_uverbs_attr *attr) +{ + assert(attr->attr_id == attr_id); + /* The kernel cannot fail to create a FD here, it never returns -1 */ + return attr->data; +} + +/* Send attributes of kernel type UVERBS_ATTR_TYPE_PTR_OUT */ +static inline struct ib_uverbs_attr * +fill_attr_out(struct ibv_command_buffer *cmd, uint16_t attr_id, void *data, + size_t len) +{ + struct ib_uverbs_attr *attr = _ioctl_next_attr(cmd, attr_id); + + if (unlikely(len > UINT16_MAX)) + cmd->buffer_error = 1; + + attr->len = len; + attr->data = ioctl_ptr_to_u64(data); + + return attr; +} + +#define fill_attr_out_ptr(cmd, attr_id, ptr) \ + fill_attr_out(cmd, attr_id, ptr, sizeof(*(ptr))) + +/* If size*nelems overflows size_t this returns SIZE_MAX */ +static inline size_t _array_len(size_t size, size_t nelems) +{ + if (size != 0 && + SIZE_MAX / size <= nelems) + return 
SIZE_MAX; + return size * nelems; +} + +#define fill_attr_out_ptr_array(cmd, attr_id, ptr, nelems) \ + fill_attr_out(cmd, attr_id, ptr, _array_len(sizeof(*ptr), nelems)) + +#define fill_attr_in_ptr_array(cmd, attr_id, ptr, nelems) \ + fill_attr_in(cmd, attr_id, ptr, _array_len(sizeof(*ptr), nelems)) + +static inline size_t __check_divide(size_t val, unsigned int div) +{ + assert(val % div == 0); + return val / div; +} + +static inline struct ib_uverbs_attr * +fill_attr_in_enum(struct ibv_command_buffer *cmd, uint16_t attr_id, + uint8_t elem_id, const void *data, size_t len) +{ + struct ib_uverbs_attr *attr; + + attr = fill_attr_in(cmd, attr_id, data, len); + attr->attr_data.enum_data.elem_id = elem_id; + + return attr; +} + +/* Send attributes of kernel type UVERBS_ATTR_TYPE_IDRS_ARRAY */ +static inline struct ib_uverbs_attr * +fill_attr_in_objs_arr(struct ibv_command_buffer *cmd, uint16_t attr_id, + const uint32_t *idrs_arr, size_t nelems) +{ + return fill_attr_in(cmd, attr_id, idrs_arr, + _array_len(sizeof(*idrs_arr), nelems)); +} + +#endif diff --git a/libibverbs/cmd_mr.c b/libibverbs/cmd_mr.c new file mode 100644 index 0000000..cb729b6 --- /dev/null +++ b/libibverbs/cmd_mr.c @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2018 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <infiniband/cmd_ioctl.h> +#include <rdma/ib_user_ioctl_cmds.h> +#include <infiniband/driver.h> +#include <infiniband/cmd_write.h> + +int ibv_cmd_advise_mr(struct ibv_pd *pd, + enum ibv_advise_mr_advice advice, + uint32_t flags, + struct ibv_sge *sg_list, + uint32_t num_sge) +{ + DECLARE_COMMAND_BUFFER(cmd, UVERBS_OBJECT_MR, + UVERBS_METHOD_ADVISE_MR, + 4); + + fill_attr_in_obj(cmd, UVERBS_ATTR_ADVISE_MR_PD_HANDLE, pd->handle); + fill_attr_const_in(cmd, UVERBS_ATTR_ADVISE_MR_ADVICE, advice); + fill_attr_in_uint32(cmd, UVERBS_ATTR_ADVISE_MR_FLAGS, flags); + fill_attr_in_ptr_array(cmd, UVERBS_ATTR_ADVISE_MR_SGE_LIST, + sg_list, num_sge); + + return execute_ioctl(pd->context, cmd); +} + +int ibv_cmd_dereg_mr(struct verbs_mr *vmr) +{ + DECLARE_FBCMD_BUFFER(cmdb, UVERBS_OBJECT_MR, UVERBS_METHOD_MR_DESTROY, + 1, NULL); + int ret; + + fill_attr_in_obj(cmdb, UVERBS_ATTR_DESTROY_MR_HANDLE, + vmr->ibv_mr.handle); + + switch (execute_ioctl_fallback(vmr->ibv_mr.context, dereg_mr, cmdb, + &ret)) { + case TRY_WRITE: { + struct ibv_dereg_mr req; + + req.core_payload = (struct ib_uverbs_dereg_mr){ + .mr_handle = vmr->ibv_mr.handle, + }; + ret = execute_cmd_write_req(vmr->ibv_mr.context, + IB_USER_VERBS_CMD_DEREG_MR, &req, + sizeof(req)); + break; + } + + default: + break; + } + + if (verbs_is_destroy_err(&ret)) + return ret; + return 0; +} diff --git a/libibverbs/cmd_mw.c b/libibverbs/cmd_mw.c new file mode 100644 index 0000000..387d4db --- /dev/null +++ b/libibverbs/cmd_mw.c @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2018 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <infiniband/cmd_write.h> + +int ibv_cmd_dealloc_mw(struct ibv_mw *mw) +{ + DECLARE_FBCMD_BUFFER(cmdb, UVERBS_OBJECT_MW, UVERBS_METHOD_MW_DESTROY, + 1, NULL); + int ret; + + fill_attr_in_obj(cmdb, UVERBS_ATTR_DESTROY_MW_HANDLE, + mw->handle); + + switch (execute_ioctl_fallback(mw->context, dealloc_mw, cmdb, &ret)) { + case TRY_WRITE: { + struct ibv_dealloc_mw req; + + req.core_payload = (struct ib_uverbs_dealloc_mw){ + .mw_handle = mw->handle, + }; + ret = execute_cmd_write_req(mw->context, + IB_USER_VERBS_CMD_DEALLOC_MW, &req, + sizeof(req)); + break; + } + + default: + break; + } + + if (verbs_is_destroy_err(&ret)) + return ret; + return 0; +} diff --git a/libibverbs/cmd_pd.c b/libibverbs/cmd_pd.c new file mode 100644 index 0000000..d1e237b --- /dev/null +++ b/libibverbs/cmd_pd.c @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2018 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <infiniband/cmd_write.h> + +int ibv_cmd_dealloc_pd(struct ibv_pd *pd) +{ + DECLARE_FBCMD_BUFFER(cmdb, UVERBS_OBJECT_PD, UVERBS_METHOD_PD_DESTROY, + 1, NULL); + int ret; + + fill_attr_in_obj(cmdb, UVERBS_ATTR_DESTROY_PD_HANDLE, + pd->handle); + + switch (execute_ioctl_fallback(pd->context, dealloc_pd, cmdb, &ret)) { + case TRY_WRITE: { + struct ibv_dealloc_pd req; + + req.core_payload = (struct ib_uverbs_dealloc_pd){ + .pd_handle = pd->handle, + }; + ret = execute_cmd_write_req(pd->context, + IB_USER_VERBS_CMD_DEALLOC_PD, &req, + sizeof(req)); + break; + } + + default: + break; + } + + if (verbs_is_destroy_err(&ret)) + return ret; + return 0; +} diff --git a/libibverbs/cmd_rwq_ind.c b/libibverbs/cmd_rwq_ind.c new file mode 100644 index 0000000..78163b8 --- /dev/null +++ b/libibverbs/cmd_rwq_ind.c @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2018 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  - Redistributions of source code must retain the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <infiniband/cmd_write.h>
+
+int ibv_cmd_destroy_rwq_ind_table(struct ibv_rwq_ind_table *rwq_ind_table)
+{
+	DECLARE_FBCMD_BUFFER(cmdb, UVERBS_OBJECT_RWQ_IND_TBL,
+			     UVERBS_METHOD_RWQ_IND_TBL_DESTROY, 1, NULL);
+	int ret;
+
+	fill_attr_in_obj(cmdb, UVERBS_ATTR_DESTROY_RWQ_IND_TBL_HANDLE,
+			 rwq_ind_table->ind_tbl_handle);
+
+	switch (execute_ioctl_fallback(rwq_ind_table->context,
+				       destroy_rwq_ind_table, cmdb, &ret)) {
+	case TRY_WRITE: {
+		struct ibv_destroy_rwq_ind_table req;
+
+		req.core_payload = (struct ib_uverbs_ex_destroy_rwq_ind_table){
+			.ind_tbl_handle = rwq_ind_table->ind_tbl_handle,
+		};
+		ret = execute_cmd_write_ex_req(
+			rwq_ind_table->context,
+			IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL, &req,
+			sizeof(req));
+		break;
+	}
+
+	default:
+		break;
+	}
+
+	if (verbs_is_destroy_err(&ret))
+		return ret;
+	return 0;
+}
diff --git a/libibverbs/cmd_write.h b/libibverbs/cmd_write.h
new file mode 100644
index 0000000..495cad8
--- /dev/null
+++ b/libibverbs/cmd_write.h
@@ -0,0 +1,341 @@
+/*
+ * Copyright (c) 2018 Mellanox Technologies, Ltd. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  - Redistributions of source code must retain the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __INFINIBAND_VERBS_WRITE_H
+#define __INFINIBAND_VERBS_WRITE_H
+
+#include <infiniband/cmd_ioctl.h>
+#include <infiniband/driver.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_user_ioctl_cmds.h>
+
+#include <stdbool.h>
+
+void *_write_get_req(struct ibv_command_buffer *link,
+		     struct ib_uverbs_cmd_hdr *onstack, size_t size);
+void *_write_get_req_ex(struct ibv_command_buffer *link, struct ex_hdr *onstack,
+			size_t size);
+void *_write_get_resp(struct ibv_command_buffer *link,
+		      struct ib_uverbs_cmd_hdr *hdr, void *onstack,
+		      size_t resp_size);
+void *_write_get_resp_ex(struct ibv_command_buffer *link,
+			 struct ex_hdr *hdr, void *onstack,
+			 size_t resp_size);
+
+/*
+ * This macro creates 'req' and 'resp' pointers in the local stack frame that
+ * point to the core code write command structures patterned off _enum.
+ *
+ * This should be done before calling execute_write_bufs
+ */
+#define DECLARE_LEGACY_UHW_BUFS(_link, _enum) \
+	IBV_ABI_REQ(_enum) __req_onstack; \
+	IBV_KABI_RESP(_enum) __resp_onstack; \
+	IBV_KABI_REQ(_enum) *req = \
+		_write_get_req(_link, &__req_onstack.hdr, sizeof(*req)); \
+	IBV_KABI_RESP(_enum) *resp = ({ \
+		void *_resp = _write_get_resp( \
+			_link, \
+			&container_of(req, IBV_ABI_REQ(_enum), core_payload) \
+				->hdr, \
+			&__resp_onstack, sizeof(*resp)); \
+		_resp; \
+	})
+
+#define DECLARE_LEGACY_UHW_BUFS_EX(_link, _enum) \
+	IBV_ABI_REQ(_enum) __req_onstack; \
+	IBV_KABI_RESP(_enum) __resp_onstack; \
+	IBV_KABI_REQ(_enum) *req = \
+		_write_get_req_ex(_link, &__req_onstack.hdr, sizeof(*req)); \
+	IBV_KABI_RESP(_enum) *resp = _write_get_resp_ex( \
+		_link, \
+		&container_of(req, IBV_ABI_REQ(_enum), core_payload)->hdr, \
+		&__resp_onstack, sizeof(*resp))
+
+/*
+ * This macro is used to implement the compatibility command call wrappers.
+ * Compatibility calls do not accept a command_buffer, and cannot use the new
+ * attribute id mechanism. They accept the legacy kern-abi.h structs that have
+ * the embedded header.
+ */
+void _write_set_uhw(struct ibv_command_buffer *cmdb, const void *req,
+		    size_t core_req_size, size_t req_size, void *resp,
+		    size_t core_resp_size, size_t resp_size);
+#define DECLARE_CMD_BUFFER_COMPAT(_name, _object_id, _method_id, cmd, \
+				  cmd_size, resp, resp_size) \
+	DECLARE_COMMAND_BUFFER(_name, _object_id, _method_id, 2); \
+	_write_set_uhw(_name, cmd, sizeof(*cmd), cmd_size, resp, \
+		       sizeof(*resp), resp_size)
+
+/*
+ * The fallback scheme keeps track of which ioctls succeed in a per-context
+ * bitmap. If ENOTTY or EPROTONOSUPPORT is seen then the ioctl is never
+ * retried.
+ *
+ * cmd_name should be the name of the function op from verbs_context_ops
+ * that is being implemented.
+ */
+#define _CMD_BIT(cmd_name) \
+	(offsetof(struct verbs_context_ops, cmd_name) / sizeof(void *))
+
+enum write_fallback { TRY_WRITE, TRY_WRITE_EX, ERROR, SUCCESS };
+
+/*
+ * This bitmask indicates the required behavior of execute_ioctl_fallback when
+ * the ioctl is not supported. It is a priority list where the highest set bit
+ * takes precedence. This approach simplifies the typical required control
+ * flow of the user.
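+ *
+ * As a sketch only, mirroring the destroy commands earlier in this patch
+ * (e.g. ibv_cmd_dereg_mr in cmd_mr.c); 'some_op' stands in for whatever
+ * verbs_context_ops member is being implemented:
+ *
+ *	switch (execute_ioctl_fallback(ctx, some_op, cmdb, &ret)) {
+ *	case TRY_WRITE:
+ *		ret = execute_cmd_write_req(ctx, ..., &req, sizeof(req));
+ *		break;
+ *	default:
+ *		break;
+ *	}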
+ */ +static inline void fallback_require_ex(struct ibv_command_buffer *cmdb) +{ + cmdb->fallback_require_ex = 1; +} + +static inline void fallback_require_ioctl(struct ibv_command_buffer *cmdb) +{ + cmdb->fallback_ioctl_only = 1; +} + +enum write_fallback _check_legacy(struct ibv_command_buffer *cmdb, int *ret); + +enum write_fallback _execute_ioctl_fallback(struct ibv_context *ctx, + unsigned int cmd_bit, + struct ibv_command_buffer *cmdb, + int *ret); + +#define execute_ioctl_fallback(ctx, cmd_name, cmdb, ret) \ + _execute_ioctl_fallback(ctx, _CMD_BIT(cmd_name), cmdb, ret) + +/* + * For write() only commands that have fixed core structures and may take uhw + * driver data. The last arguments are the same ones passed into the typical + * ibv_cmd_* function. execute_cmd_write deduces the length of the core + * structure based on the KABI struct linked to the enum op code. + */ +int _execute_cmd_write(struct ibv_context *ctx, unsigned int write_method, + struct ib_uverbs_cmd_hdr *req, size_t core_req_size, + size_t req_size, void *resp, size_t core_resp_size, + size_t resp_size); +#define execute_cmd_write(ctx, enum, cmd, cmd_size, resp, resp_size) \ + ({ \ + (cmd)->core_payload.response = ioctl_ptr_to_u64(resp); \ + _execute_cmd_write( \ + ctx, enum, \ + &(cmd)->hdr + check_type(cmd, IBV_ABI_REQ(enum) *), \ + sizeof(*(cmd)), cmd_size, \ + resp + check_type(resp, IBV_KABI_RESP(enum) *), \ + sizeof(*(resp)), resp_size); \ + }) + +/* For write() commands that have no response */ +#define execute_cmd_write_req(ctx, enum, cmd, cmd_size) \ + ({ \ + static_assert(sizeof(IBV_KABI_RESP(enum)) == 0, \ + "Method has a response!"); \ + _execute_cmd_write( \ + ctx, enum, \ + &(cmd)->hdr + check_type(cmd, IBV_ABI_REQ(enum) *), \ + sizeof(*(cmd)), cmd_size, NULL, 0, 0); \ + }) + +/* + * Execute a write command that does not have a uhw component. The cmd_size + * and resp_size are the lengths of the core structure. This version is only + * needed if the core structure ends in a flex array, as the internal sizeof() + * in execute_cmd_write() will give the wrong size. + */ +#define execute_cmd_write_no_uhw(ctx, enum, cmd, cmd_size, resp, resp_size) \ + ({ \ + (cmd)->core_payload.response = ioctl_ptr_to_u64(resp); \ + _execute_cmd_write( \ + ctx, enum, \ + &(cmd)->hdr + check_type(cmd, IBV_ABI_REQ(enum) *), \ + cmd_size, cmd_size, \ + resp + check_type(resp, IBV_KABI_RESP(enum) *), \ + resp_size, resp_size); \ + }) + +/* + * For users of DECLARE_LEGACY_UHW_BUFS, in this case the machinery has + * already stored the full req/resp length in the hdr. + */ +#define execute_write_bufs(ctx, enum, req, resp) \ + ({ \ + IBV_ABI_REQ(enum) *_hdr = \ + container_of(req, IBV_ABI_REQ(enum), core_payload); \ + execute_cmd_write(ctx, enum, _hdr, _hdr->hdr.in_words * 4, \ + resp, _hdr->hdr.out_words * 4); \ + }) + +/* + * For write() commands that use the _ex protocol. _full allows the caller to + * specify all 4 sizes directly. This version is used when the core structs + * end in a flex array. The normal and req versions are similar to write() and + * deduce the length of the core struct from the enum. 
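+ *
+ * A hedged sketch of the intended selection (ENUM, req and resp stand in
+ * for whatever KABI op the caller implements):
+ *
+ *	execute_cmd_write_ex(ctx, ENUM, &req, sizeof(req),
+ *			     &resp, sizeof(resp));
+ *
+ * versus a core struct that ends in a flex array, where all four sizes
+ * must be spelled out explicitly:
+ *
+ *	execute_cmd_write_ex_full(ctx, ENUM, &req, core_req_sz, req_sz,
+ *				  &resp, core_resp_sz, resp_sz);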
+ */ +int _execute_cmd_write_ex(struct ibv_context *ctx, unsigned int write_method, + struct ex_hdr *req, size_t core_req_size, + size_t req_size, void *resp, size_t core_resp_size, + size_t resp_size); +#define execute_cmd_write_ex_full(ctx, enum, cmd, core_cmd_size, cmd_size, \ + resp, core_resp_size, resp_size) \ + _execute_cmd_write_ex( \ + ctx, enum, &(cmd)->hdr + check_type(cmd, IBV_ABI_REQ(enum) *), \ + core_cmd_size, cmd_size, \ + resp + check_type(resp, IBV_KABI_RESP(enum) *), \ + core_resp_size, resp_size) +#define execute_cmd_write_ex(ctx, enum, cmd, cmd_size, resp, resp_size) \ + execute_cmd_write_ex_full(ctx, enum, cmd, sizeof(*(cmd)), cmd_size, \ + resp, sizeof(*(resp)), resp_size) +#define execute_cmd_write_ex_req(ctx, enum, cmd, cmd_size) \ + ({ \ + static_assert(sizeof(IBV_KABI_RESP(enum)) == 0, \ + "Method has a response!"); \ + _execute_cmd_write_ex( \ + ctx, enum, \ + &(cmd)->hdr + check_type(cmd, IBV_ABI_REQ(enum) *), \ + sizeof(*(cmd)), cmd_size, NULL, 0, 0); \ + }) + +/* For users of DECLARE_LEGACY_UHW_BUFS_EX */ +#define execute_write_bufs_ex(ctx, enum, req, resp) \ + ({ \ + IBV_ABI_REQ(enum) *_hdr = \ + container_of(req, IBV_ABI_REQ(enum), core_payload); \ + execute_cmd_write_ex( \ + ctx, enum, _hdr, \ + sizeof(*_hdr) + \ + _hdr->hdr.ex_hdr.provider_in_words * 8, \ + resp, \ + sizeof(*(resp)) + \ + _hdr->hdr.ex_hdr.provider_out_words * 8); \ + }) + +/* + * These two macros are used only with execute_ioctl_fallback - they allow the + * IOCTL code to be elided by the compiler when disabled. + */ +#define DECLARE_FBCMD_BUFFER DECLARE_COMMAND_BUFFER_LINK + +/* + * Munge the macros above to remove certain paths during compilation based on + * the cmake flag. + */ +#if VERBS_IOCTL_ONLY +static inline enum write_fallback +_execute_ioctl_only(struct ibv_context *context, struct ibv_command_buffer *cmd, + int *ret) +{ + *ret = execute_ioctl(context, cmd); + if (*ret) + return ERROR; + + return SUCCESS; +} + +#undef execute_ioctl_fallback +#define execute_ioctl_fallback(ctx, cmd_name, cmdb, ret) \ + _execute_ioctl_only(ctx, cmdb, ret) + +#undef execute_write_bufs +static inline int execute_write_bufs(struct ibv_context *ctx, + unsigned int write_command, void *req, + void *resp) +{ + return ENOSYS; +} + +#undef execute_write_bufs_ex +static inline int execute_write_bufs_ex(struct ibv_context *ctx, + unsigned int write_command, void *req, + void *resp) +{ + return ENOSYS; +} + +#endif + +#if VERBS_WRITE_ONLY +static inline enum write_fallback +_execute_write_only(struct ibv_context *context, struct ibv_command_buffer *cmd, + int *ret) +{ + /* + * write only still has the command buffer, and the command buffer + * carries the fallback guidance that we need to inspect. This is + * written in this odd way so the compiler knows that SUCCESS is not a + * possible return and optimizes accordingly. 
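+	 *
+	 * Since no branch below can yield SUCCESS, the ioctl execution
+	 * paths in the callers become provably unreachable in a write-only
+	 * build and the compiler can discard them.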
+ */ + switch (_check_legacy(cmd, ret)) { + case TRY_WRITE: + return TRY_WRITE; + case TRY_WRITE_EX: + return TRY_WRITE_EX; + default: + return ERROR; + } +} + +#undef execute_ioctl_fallback +#define execute_ioctl_fallback(ctx, cmd_name, cmdb, ret) \ + _execute_write_only(ctx, cmdb, ret) + +#undef DECLARE_FBCMD_BUFFER +#define DECLARE_FBCMD_BUFFER(_name, _object_id, _method_id, _num_attrs, _link) \ + struct ibv_command_buffer _name[1] = { \ + { \ + .next = _link, \ + .uhw_in_idx = _UHW_NO_INDEX, \ + .uhw_out_idx = _UHW_NO_INDEX, \ + }, \ + } + +#endif + +extern bool verbs_allow_disassociate_destroy; + +/* + * Return true if 'ret' indicates that a destroy operation has failed + * and the function should exit. If the kernel destroy failure is being + * ignored then this will set ret to 0, so the calling function appears to succeed. + */ +static inline bool verbs_is_destroy_err(int *ret) +{ + if (*ret == EIO && verbs_allow_disassociate_destroy) { + *ret = 0; + return true; + } + + return *ret != 0; +} + +#endif diff --git a/libibverbs/cmd_xrcd.c b/libibverbs/cmd_xrcd.c new file mode 100644 index 0000000..52d2952 --- /dev/null +++ b/libibverbs/cmd_xrcd.c @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2018 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <infiniband/cmd_write.h> + +int ibv_cmd_close_xrcd(struct verbs_xrcd *xrcd) +{ + DECLARE_FBCMD_BUFFER(cmdb, UVERBS_OBJECT_XRCD, + UVERBS_METHOD_XRCD_DESTROY, 1, NULL); + int ret; + + fill_attr_in_obj(cmdb, UVERBS_ATTR_DESTROY_XRCD_HANDLE, xrcd->handle); + + switch (execute_ioctl_fallback(xrcd->xrcd.context, close_xrcd, cmdb, + &ret)) { + case TRY_WRITE: { + struct ibv_close_xrcd req; + + req.core_payload = (struct ib_uverbs_close_xrcd){ + .xrcd_handle = xrcd->handle, + }; + ret = execute_cmd_write_req(xrcd->xrcd.context, + IB_USER_VERBS_CMD_CLOSE_XRCD, &req, + sizeof(req)); + break; + } + + default: + break; + } + + if (verbs_is_destroy_err(&ret)) + return ret; + return 0; +} diff --git a/libibverbs/compat-1_0.c b/libibverbs/compat-1_0.c new file mode 100644 index 0000000..695f89d --- /dev/null +++ b/libibverbs/compat-1_0.c @@ -0,0 +1,987 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <config.h> + +#include <string.h> +#include <stddef.h> +#include <stdlib.h> +#include <unistd.h> +#include <alloca.h> + +#include <util/symver.h> +#include "ibverbs.h" + +struct ibv_pd_1_0 { + struct ibv_context_1_0 *context; + uint32_t handle; + + struct ibv_pd *real_pd; +}; + +struct ibv_mr_1_0 { + struct ibv_context_1_0 *context; + struct ibv_pd_1_0 *pd; + uint32_t handle; + uint32_t lkey; + uint32_t rkey; + + struct ibv_mr *real_mr; +}; + +struct ibv_srq_1_0 { + struct ibv_context_1_0 *context; + void *srq_context; + struct ibv_pd_1_0 *pd; + uint32_t handle; + + pthread_mutex_t mutex; + pthread_cond_t cond; + uint32_t events_completed; + + struct ibv_srq *real_srq; +}; + +struct ibv_qp_init_attr_1_0 { + void *qp_context; + struct ibv_cq_1_0 *send_cq; + struct ibv_cq_1_0 *recv_cq; + struct ibv_srq_1_0 *srq; + struct ibv_qp_cap cap; + enum ibv_qp_type qp_type; + int sq_sig_all; +}; + +struct ibv_send_wr_1_0 { + struct ibv_send_wr_1_0 *next; + uint64_t wr_id; + struct ibv_sge *sg_list; + int num_sge; + enum ibv_wr_opcode opcode; + int send_flags; + __be32 imm_data; + union { + struct { + uint64_t remote_addr; + uint32_t rkey; + } rdma; + struct { + uint64_t remote_addr; + uint64_t compare_add; + uint64_t swap; + uint32_t rkey; + } atomic; + struct { + struct ibv_ah_1_0 *ah; + uint32_t remote_qpn; + uint32_t remote_qkey; + } ud; + } wr; +}; + +struct ibv_recv_wr_1_0 { + struct ibv_recv_wr_1_0 *next; + uint64_t wr_id; + struct ibv_sge *sg_list; + int num_sge; +}; + +struct ibv_qp_1_0 { + struct ibv_context_1_0 *context; + void *qp_context; + struct ibv_pd_1_0 *pd; + struct ibv_cq_1_0 *send_cq; + struct ibv_cq_1_0 *recv_cq; + struct ibv_srq_1_0 *srq; + uint32_t handle; + uint32_t qp_num; + enum ibv_qp_state state; + enum ibv_qp_type qp_type; + + pthread_mutex_t mutex; + pthread_cond_t cond; + uint32_t events_completed; + + struct ibv_qp *real_qp; +}; + +struct ibv_cq_1_0 { + struct ibv_context_1_0 *context; + void *cq_context; + uint32_t handle; + int cqe; + + pthread_mutex_t mutex; + pthread_cond_t cond; + uint32_t comp_events_completed; + uint32_t async_events_completed; + + struct ibv_cq *real_cq; +}; + +struct ibv_ah_1_0 { + struct ibv_context_1_0 *context; + struct 
ibv_pd_1_0 *pd; + uint32_t handle; + + struct ibv_ah *real_ah; +}; + +struct ibv_device_1_0 { + void *obsolete_sysfs_dev; + void *obsolete_sysfs_ibdev; + struct ibv_device *real_device; /* was obsolete driver member */ + struct _ibv_device_ops _ops; +}; + +struct ibv_context_ops_1_0 { + int (*query_device)(struct ibv_context *context, + struct ibv_device_attr *device_attr); + int (*query_port)(struct ibv_context *context, uint8_t port_num, + struct ibv_port_attr *port_attr); + struct ibv_pd * (*alloc_pd)(struct ibv_context *context); + int (*dealloc_pd)(struct ibv_pd *pd); + struct ibv_mr * (*reg_mr)(struct ibv_pd *pd, void *addr, size_t length, + int access); + int (*dereg_mr)(struct ibv_mr *mr); + struct ibv_cq * (*create_cq)(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector); + int (*poll_cq)(struct ibv_cq_1_0 *cq, int num_entries, + struct ibv_wc *wc); + int (*req_notify_cq)(struct ibv_cq_1_0 *cq, + int solicited_only); + void (*cq_event)(struct ibv_cq *cq); + int (*resize_cq)(struct ibv_cq *cq, int cqe); + int (*destroy_cq)(struct ibv_cq *cq); + struct ibv_srq * (*create_srq)(struct ibv_pd *pd, + struct ibv_srq_init_attr *srq_init_attr); + int (*modify_srq)(struct ibv_srq *srq, + struct ibv_srq_attr *srq_attr, + int srq_attr_mask); + int (*query_srq)(struct ibv_srq *srq, + struct ibv_srq_attr *srq_attr); + int (*destroy_srq)(struct ibv_srq *srq); + int (*post_srq_recv)(struct ibv_srq_1_0 *srq, + struct ibv_recv_wr_1_0 *recv_wr, + struct ibv_recv_wr_1_0 **bad_recv_wr); + struct ibv_qp * (*create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *attr); + int (*query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr); + int (*modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask); + int (*destroy_qp)(struct ibv_qp *qp); + int (*post_send)(struct ibv_qp_1_0 *qp, + struct ibv_send_wr_1_0 *wr, + struct ibv_send_wr_1_0 **bad_wr); + int (*post_recv)(struct ibv_qp_1_0 *qp, + struct ibv_recv_wr_1_0 *wr, + struct ibv_recv_wr_1_0 **bad_wr); + struct ibv_ah * (*create_ah)(struct ibv_pd *pd, struct ibv_ah_attr *attr); + int (*destroy_ah)(struct ibv_ah *ah); + int (*attach_mcast)(struct ibv_qp *qp, union ibv_gid *gid, + uint16_t lid); + int (*detach_mcast)(struct ibv_qp *qp, union ibv_gid *gid, + uint16_t lid); +}; + +struct ibv_context_1_0 { + struct ibv_device_1_0 *device; + struct ibv_context_ops_1_0 ops; + int cmd_fd; + int async_fd; + int num_comp_vectors; + + struct ibv_context *real_context; /* was abi_compat member */ +}; + +typedef struct ibv_device *(*ibv_driver_init_func_1_1)(const char *uverbs_sys_path, + int abi_version); + +COMPAT_SYMVER_FUNC(ibv_get_device_list, 1_0, "IBVERBS_1.0", + struct ibv_device_1_0 **, + int *num) +{ + struct ibv_device **real_list; + struct ibv_device_1_0 **l; + int i, n; + + real_list = ibv_get_device_list(&n); + if (!real_list) + return NULL; + + l = calloc(n + 2, sizeof (struct ibv_device_1_0 *)); + if (!l) + goto free_device_list; + + l[0] = (void *) real_list; + + for (i = 0; i < n; ++i) { + l[i + 1] = calloc(1, sizeof (struct ibv_device_1_0)); + if (!l[i + 1]) + goto fail; + l[i + 1]->real_device = real_list[i]; + } + + if (num) + *num = n; + + return l + 1; + +fail: + for (i = 1; i <= n; ++i) + if (l[i]) + free(l[i]); + free(l); + +free_device_list: + ibv_free_device_list(real_list); + return NULL; +} + +COMPAT_SYMVER_FUNC(ibv_free_device_list, 1_0, "IBVERBS_1.0", + void, + struct ibv_device_1_0 **list) +{ + struct ibv_device_1_0 **l = list; + 
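+	/*
+	 * The 1.0 shim returned l + 1 from ibv_get_device_list() above and
+	 * stashed the real device list in slot 0, so the shadow entries
+	 * start at *l and the real list is recovered from list[-1] below.
+	 */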
+ while (*l) { + free(*l); + ++l; + } + + ibv_free_device_list((void *) list[-1]); + free(list - 1); +} + +COMPAT_SYMVER_FUNC(ibv_get_device_name, 1_0, "IBVERBS_1.0", + const char *, + struct ibv_device_1_0 *device) +{ + return ibv_get_device_name(device->real_device); +} + +COMPAT_SYMVER_FUNC(ibv_get_device_guid, 1_0, "IBVERBS_1.0", + __be64, + struct ibv_device_1_0 *device) +{ + return ibv_get_device_guid(device->real_device); +} + +static int poll_cq_wrapper_1_0(struct ibv_cq_1_0 *cq, int num_entries, + struct ibv_wc *wc) +{ + return cq->context->real_context->ops.poll_cq(cq->real_cq, num_entries, wc); +} + +static int req_notify_cq_wrapper_1_0(struct ibv_cq_1_0 *cq, int sol_only) +{ + return cq->context->real_context->ops.req_notify_cq(cq->real_cq, sol_only); +} + +static int post_srq_recv_wrapper_1_0(struct ibv_srq_1_0 *srq, struct ibv_recv_wr_1_0 *wr, + struct ibv_recv_wr_1_0 **bad_wr) +{ + struct ibv_recv_wr_1_0 *w; + struct ibv_recv_wr *real_wr, *head_wr = NULL, *tail_wr = NULL, *real_bad_wr; + int ret; + + for (w = wr; w; w = w->next) { + real_wr = alloca(sizeof *real_wr); + real_wr->wr_id = w->wr_id; + real_wr->sg_list = w->sg_list; + real_wr->num_sge = w->num_sge; + real_wr->next = NULL; + if (tail_wr) + tail_wr->next = real_wr; + else + head_wr = real_wr; + + tail_wr = real_wr; + } + + ret = srq->context->real_context->ops.post_srq_recv(srq->real_srq, head_wr, + &real_bad_wr); + + if (ret) { + for (real_wr = head_wr, w = wr; + real_wr; + real_wr = real_wr->next, w = w->next) + if (real_wr == real_bad_wr) { + *bad_wr = w; + break; + } + } + + return ret; +} + +static int post_send_wrapper_1_0(struct ibv_qp_1_0 *qp, struct ibv_send_wr_1_0 *wr, + struct ibv_send_wr_1_0 **bad_wr) +{ + struct ibv_send_wr_1_0 *w; + struct ibv_send_wr *real_wr, *head_wr = NULL, *tail_wr = NULL, *real_bad_wr; + int is_ud = qp->qp_type == IBV_QPT_UD; + int ret; + + for (w = wr; w; w = w->next) { + real_wr = alloca(sizeof *real_wr); + real_wr->wr_id = w->wr_id; + real_wr->next = NULL; + +#define TEST_SIZE_2_POINT(f1, f2) \ + ((offsetof(struct ibv_send_wr, f1) - offsetof(struct ibv_send_wr, f2)) \ + == offsetof(struct ibv_send_wr_1_0, f1) - offsetof(struct ibv_send_wr_1_0, f2)) +#define TEST_SIZE_TO_END(f1) \ + ((sizeof(struct ibv_send_wr) - offsetof(struct ibv_send_wr, f1)) == \ + (sizeof(struct ibv_send_wr_1_0) - offsetof(struct ibv_send_wr_1_0, f1))) + + if (TEST_SIZE_TO_END (sg_list)) + memcpy(&real_wr->sg_list, &w->sg_list, sizeof *real_wr + - offsetof(struct ibv_send_wr, sg_list)); + else if (TEST_SIZE_2_POINT (imm_data, sg_list) && + TEST_SIZE_TO_END (wr)) { + /* we have alignment up to wr, but padding between + * imm_data and wr, and we know wr itself is the + * same size */ + memcpy(&real_wr->sg_list, &w->sg_list, + offsetof(struct ibv_send_wr, imm_data) - + offsetof(struct ibv_send_wr, sg_list) + + sizeof real_wr->imm_data); + memcpy(&real_wr->wr, &w->wr, sizeof real_wr->wr); + } else { + real_wr->sg_list = w->sg_list; + real_wr->num_sge = w->num_sge; + real_wr->opcode = w->opcode; + real_wr->send_flags = w->send_flags; + real_wr->imm_data = w->imm_data; + if (TEST_SIZE_TO_END (wr)) + memcpy(&real_wr->wr, &w->wr, + sizeof real_wr->wr); + else { + real_wr->wr.atomic.remote_addr = + w->wr.atomic.remote_addr; + real_wr->wr.atomic.compare_add = + w->wr.atomic.compare_add; + real_wr->wr.atomic.swap = + w->wr.atomic.swap; + real_wr->wr.atomic.rkey = + w->wr.atomic.rkey; + } + } + + if (is_ud) + real_wr->wr.ud.ah = w->wr.ud.ah->real_ah; + + if (tail_wr) + tail_wr->next = real_wr; + else + head_wr = 
real_wr; + + tail_wr = real_wr; + } + + ret = qp->context->real_context->ops.post_send(qp->real_qp, head_wr, + &real_bad_wr); + + if (ret) { + for (real_wr = head_wr, w = wr; + real_wr; + real_wr = real_wr->next, w = w->next) + if (real_wr == real_bad_wr) { + *bad_wr = w; + break; + } + } + + return ret; +} + +static int post_recv_wrapper_1_0(struct ibv_qp_1_0 *qp, struct ibv_recv_wr_1_0 *wr, + struct ibv_recv_wr_1_0 **bad_wr) +{ + struct ibv_recv_wr_1_0 *w; + struct ibv_recv_wr *real_wr, *head_wr = NULL, *tail_wr = NULL, *real_bad_wr; + int ret; + + for (w = wr; w; w = w->next) { + real_wr = alloca(sizeof *real_wr); + real_wr->wr_id = w->wr_id; + real_wr->sg_list = w->sg_list; + real_wr->num_sge = w->num_sge; + real_wr->next = NULL; + if (tail_wr) + tail_wr->next = real_wr; + else + head_wr = real_wr; + + tail_wr = real_wr; + } + + ret = qp->context->real_context->ops.post_recv(qp->real_qp, head_wr, + &real_bad_wr); + + if (ret) { + for (real_wr = head_wr, w = wr; + real_wr; + real_wr = real_wr->next, w = w->next) + if (real_wr == real_bad_wr) { + *bad_wr = w; + break; + } + } + + return ret; +} + +COMPAT_SYMVER_FUNC(ibv_open_device, 1_0, "IBVERBS_1.0", + struct ibv_context_1_0 *, + struct ibv_device_1_0 *device) +{ + struct ibv_context *real_ctx; + struct ibv_context_1_0 *ctx; + + ctx = malloc(sizeof *ctx); + if (!ctx) + return NULL; + + real_ctx = ibv_open_device(device->real_device); + if (!real_ctx) { + free(ctx); + return NULL; + } + + ctx->device = device; + ctx->real_context = real_ctx; + + ctx->ops.poll_cq = poll_cq_wrapper_1_0; + ctx->ops.req_notify_cq = req_notify_cq_wrapper_1_0; + ctx->ops.post_send = post_send_wrapper_1_0; + ctx->ops.post_recv = post_recv_wrapper_1_0; + ctx->ops.post_srq_recv = post_srq_recv_wrapper_1_0; + + return ctx; +} + +COMPAT_SYMVER_FUNC(ibv_close_device, 1_0, "IBVERBS_1.0", + int, + struct ibv_context_1_0 *context) +{ + int ret; + + ret = ibv_close_device(context->real_context); + if (ret) + return ret; + + free(context); + return 0; +} + +COMPAT_SYMVER_FUNC(ibv_get_async_event, 1_0, "IBVERBS_1.0", + int, + struct ibv_context_1_0 *context, + struct ibv_async_event *event) +{ + int ret; + + ret = ibv_get_async_event(context->real_context, event); + if (ret) + return ret; + + switch (event->event_type) { + case IBV_EVENT_CQ_ERR: + event->element.cq = event->element.cq->cq_context; + break; + + case IBV_EVENT_QP_FATAL: + case IBV_EVENT_QP_REQ_ERR: + case IBV_EVENT_QP_ACCESS_ERR: + case IBV_EVENT_COMM_EST: + case IBV_EVENT_SQ_DRAINED: + case IBV_EVENT_PATH_MIG: + case IBV_EVENT_PATH_MIG_ERR: + case IBV_EVENT_QP_LAST_WQE_REACHED: + event->element.qp = event->element.qp->qp_context; + break; + + case IBV_EVENT_SRQ_ERR: + case IBV_EVENT_SRQ_LIMIT_REACHED: + event->element.srq = event->element.srq->srq_context; + break; + + default: + break; + } + + return ret; +} + +COMPAT_SYMVER_FUNC(ibv_ack_async_event, 1_0, "IBVERBS_1.0", + void, + struct ibv_async_event *event) +{ + struct ibv_async_event real_event = *event; + + switch (event->event_type) { + case IBV_EVENT_CQ_ERR: + real_event.element.cq = + ((struct ibv_cq_1_0 *) event->element.cq)->real_cq; + break; + + case IBV_EVENT_QP_FATAL: + case IBV_EVENT_QP_REQ_ERR: + case IBV_EVENT_QP_ACCESS_ERR: + case IBV_EVENT_COMM_EST: + case IBV_EVENT_SQ_DRAINED: + case IBV_EVENT_PATH_MIG: + case IBV_EVENT_PATH_MIG_ERR: + case IBV_EVENT_QP_LAST_WQE_REACHED: + real_event.element.qp = + ((struct ibv_qp_1_0 *) event->element.qp)->real_qp; + break; + + case IBV_EVENT_SRQ_ERR: + case IBV_EVENT_SRQ_LIMIT_REACHED: + 
real_event.element.srq = + ((struct ibv_srq_1_0 *) event->element.srq)->real_srq; + break; + + default: + break; + } + + ibv_ack_async_event(&real_event); +} + +COMPAT_SYMVER_FUNC(ibv_query_device, 1_0, "IBVERBS_1.0", + int, + struct ibv_context_1_0 *context, + struct ibv_device_attr *device_attr) +{ + return ibv_query_device(context->real_context, device_attr); +} + +COMPAT_SYMVER_FUNC(ibv_query_port, 1_0, "IBVERBS_1.0", + int, + struct ibv_context_1_0 *context, + uint8_t port_num, + struct ibv_port_attr *port_attr) +{ + return ibv_query_port(context->real_context, port_num, port_attr); +} + +COMPAT_SYMVER_FUNC(ibv_query_gid, 1_0, "IBVERBS_1.0", + int, + struct ibv_context_1_0 *context, + uint8_t port_num, int index, + union ibv_gid *gid) +{ + return ibv_query_gid(context->real_context, port_num, index, gid); +} + +COMPAT_SYMVER_FUNC(ibv_query_pkey, 1_0, "IBVERBS_1.0", + int, + struct ibv_context_1_0 *context, + uint8_t port_num, int index, + __be16 *pkey) +{ + return ibv_query_pkey(context->real_context, port_num, index, pkey); +} + +COMPAT_SYMVER_FUNC(ibv_alloc_pd, 1_0, "IBVERBS_1.0", + struct ibv_pd_1_0 *, + struct ibv_context_1_0 *context) +{ + struct ibv_pd *real_pd; + struct ibv_pd_1_0 *pd; + + pd = malloc(sizeof *pd); + if (!pd) + return NULL; + + real_pd = ibv_alloc_pd(context->real_context); + if (!real_pd) { + free(pd); + return NULL; + } + + pd->context = context; + pd->real_pd = real_pd; + + return pd; +} + +COMPAT_SYMVER_FUNC(ibv_dealloc_pd, 1_0, "IBVERBS_1.0", + int, + struct ibv_pd_1_0 *pd) +{ + int ret; + + ret = ibv_dealloc_pd(pd->real_pd); + if (ret) + return ret; + + free(pd); + return 0; +} + +COMPAT_SYMVER_FUNC(ibv_reg_mr, 1_0, "IBVERBS_1.0", + struct ibv_mr_1_0 *, + struct ibv_pd_1_0 *pd, void *addr, size_t length, + int access) +{ + struct ibv_mr *real_mr; + struct ibv_mr_1_0 *mr; + + mr = malloc(sizeof *mr); + if (!mr) + return NULL; + + real_mr = ibv_reg_mr(pd->real_pd, addr, length, access); + if (!real_mr) { + free(mr); + return NULL; + } + + mr->context = pd->context; + mr->pd = pd; + mr->lkey = real_mr->lkey; + mr->rkey = real_mr->rkey; + mr->real_mr = real_mr; + + return mr; +} + +COMPAT_SYMVER_FUNC(ibv_dereg_mr, 1_0, "IBVERBS_1.0", + int, + struct ibv_mr_1_0 *mr) +{ + int ret; + + ret = ibv_dereg_mr(mr->real_mr); + if (ret) + return ret; + + free(mr); + return 0; +} + +COMPAT_SYMVER_FUNC(ibv_create_cq, 1_0, "IBVERBS_1.0", + struct ibv_cq_1_0 *, + struct ibv_context_1_0 *context, int cqe, void *cq_context, + struct ibv_comp_channel *channel, int comp_vector) +{ + struct ibv_cq *real_cq; + struct ibv_cq_1_0 *cq; + + cq = malloc(sizeof *cq); + if (!cq) + return NULL; + + real_cq = ibv_create_cq(context->real_context, cqe, cq_context, + channel, comp_vector); + if (!real_cq) { + free(cq); + return NULL; + } + + cq->context = context; + cq->cq_context = cq_context; + cq->cqe = cqe; + cq->real_cq = real_cq; + + real_cq->cq_context = cq; + + return cq; +} + +COMPAT_SYMVER_FUNC(ibv_resize_cq, 1_0, "IBVERBS_1.0", + int, + struct ibv_cq_1_0 *cq, int cqe) +{ + return ibv_resize_cq(cq->real_cq, cqe); +} + +COMPAT_SYMVER_FUNC(ibv_destroy_cq, 1_0, "IBVERBS_1.0", + int, + struct ibv_cq_1_0 *cq) +{ + int ret; + + ret = ibv_destroy_cq(cq->real_cq); + if (ret) + return ret; + + free(cq); + return 0; +} + +COMPAT_SYMVER_FUNC(ibv_get_cq_event, 1_0, "IBVERBS_1.0", + int, + struct ibv_comp_channel *channel, + struct ibv_cq_1_0 **cq, + void **cq_context) +{ + struct ibv_cq *real_cq; + void *cq_ptr; + int ret; + + ret = ibv_get_cq_event(channel, &real_cq, &cq_ptr); + if (ret) + 
return ret; + + *cq = cq_ptr; + *cq_context = (*cq)->cq_context; + + return 0; +} + +COMPAT_SYMVER_FUNC(ibv_ack_cq_events, 1_0, "IBVERBS_1.0", + void, + struct ibv_cq_1_0 *cq, + unsigned int nevents) +{ + ibv_ack_cq_events(cq->real_cq, nevents); +} + +COMPAT_SYMVER_FUNC(ibv_create_srq, 1_0, "IBVERBS_1.0", + struct ibv_srq_1_0 *, + struct ibv_pd_1_0 *pd, + struct ibv_srq_init_attr *srq_init_attr) +{ + struct ibv_srq *real_srq; + struct ibv_srq_1_0 *srq; + + srq = malloc(sizeof *srq); + if (!srq) + return NULL; + + real_srq = ibv_create_srq(pd->real_pd, srq_init_attr); + if (!real_srq) { + free(srq); + return NULL; + } + + srq->context = pd->context; + srq->srq_context = srq_init_attr->srq_context; + srq->pd = pd; + srq->real_srq = real_srq; + + real_srq->srq_context = srq; + + return srq; +} + +COMPAT_SYMVER_FUNC(ibv_modify_srq, 1_0, "IBVERBS_1.0", + int, + struct ibv_srq_1_0 *srq, + struct ibv_srq_attr *srq_attr, + int srq_attr_mask) +{ + return ibv_modify_srq(srq->real_srq, srq_attr, srq_attr_mask); +} + +COMPAT_SYMVER_FUNC(ibv_query_srq, 1_0, "IBVERBS_1.0", + int, + struct ibv_srq_1_0 *srq, + struct ibv_srq_attr *srq_attr) +{ + return ibv_query_srq(srq->real_srq, srq_attr); +} + +COMPAT_SYMVER_FUNC(ibv_destroy_srq, 1_0, "IBVERBS_1.0", + int, + struct ibv_srq_1_0 *srq) +{ + int ret; + + ret = ibv_destroy_srq(srq->real_srq); + if (ret) + return ret; + + free(srq); + return 0; +} + +COMPAT_SYMVER_FUNC(ibv_create_qp, 1_0, "IBVERBS_1.0", + struct ibv_qp_1_0 *, + struct ibv_pd_1_0 *pd, + struct ibv_qp_init_attr_1_0 *qp_init_attr) +{ + struct ibv_qp *real_qp; + struct ibv_qp_1_0 *qp; + struct ibv_qp_init_attr real_init_attr; + + qp = malloc(sizeof *qp); + if (!qp) + return NULL; + + real_init_attr.qp_context = qp_init_attr->qp_context; + real_init_attr.send_cq = qp_init_attr->send_cq->real_cq; + real_init_attr.recv_cq = qp_init_attr->recv_cq->real_cq; + real_init_attr.srq = qp_init_attr->srq ? 
+					qp_init_attr->srq->real_srq : NULL;
+	real_init_attr.cap = qp_init_attr->cap;
+	real_init_attr.qp_type = qp_init_attr->qp_type;
+	real_init_attr.sq_sig_all = qp_init_attr->sq_sig_all;
+
+	real_qp = ibv_create_qp(pd->real_pd, &real_init_attr);
+	if (!real_qp) {
+		free(qp);
+		return NULL;
+	}
+
+	qp->context = pd->context;
+	qp->qp_context = qp_init_attr->qp_context;
+	qp->pd = pd;
+	qp->send_cq = qp_init_attr->send_cq;
+	qp->recv_cq = qp_init_attr->recv_cq;
+	qp->srq = qp_init_attr->srq;
+	qp->qp_type = qp_init_attr->qp_type;
+	qp->qp_num = real_qp->qp_num;
+	qp->real_qp = real_qp;
+
+	qp_init_attr->cap = real_init_attr.cap;
+
+	real_qp->qp_context = qp;
+
+	return qp;
+}
+
+COMPAT_SYMVER_FUNC(ibv_query_qp, 1_0, "IBVERBS_1.0",
+		   int,
+		   struct ibv_qp_1_0 *qp, struct ibv_qp_attr *attr,
+		   int attr_mask, struct ibv_qp_init_attr_1_0 *init_attr)
+{
+	struct ibv_qp_init_attr real_init_attr;
+	int ret;
+
+	ret = ibv_query_qp(qp->real_qp, attr, attr_mask, &real_init_attr);
+	if (ret)
+		return ret;
+
+	init_attr->qp_context = qp->qp_context;
+	init_attr->send_cq = real_init_attr.send_cq->cq_context;
+	init_attr->recv_cq = real_init_attr.recv_cq->cq_context;
+	init_attr->srq = real_init_attr.srq ?
+			 real_init_attr.srq->srq_context : NULL;
+	init_attr->qp_type = real_init_attr.qp_type;
+	init_attr->cap = real_init_attr.cap;
+	init_attr->sq_sig_all = real_init_attr.sq_sig_all;
+
+	return 0;
+}
+
+COMPAT_SYMVER_FUNC(ibv_modify_qp, 1_0, "IBVERBS_1.0",
+		   int,
+		   struct ibv_qp_1_0 *qp,
+		   struct ibv_qp_attr *attr,
+		   int attr_mask)
+{
+	return ibv_modify_qp(qp->real_qp, attr, attr_mask);
+}
+
+COMPAT_SYMVER_FUNC(ibv_destroy_qp, 1_0, "IBVERBS_1.0",
+		   int,
+		   struct ibv_qp_1_0 *qp)
+{
+	int ret;
+
+	ret = ibv_destroy_qp(qp->real_qp);
+	if (ret)
+		return ret;
+
+	free(qp);
+	return 0;
+}
+
+COMPAT_SYMVER_FUNC(ibv_create_ah, 1_0, "IBVERBS_1.0",
+		   struct ibv_ah_1_0 *,
+		   struct ibv_pd_1_0 *pd, struct ibv_ah_attr *attr)
+{
+	struct ibv_ah *real_ah;
+	struct ibv_ah_1_0 *ah;
+
+	ah = malloc(sizeof *ah);
+	if (!ah)
+		return NULL;
+
+	real_ah = ibv_create_ah(pd->real_pd, attr);
+	if (!real_ah) {
+		free(ah);
+		return NULL;
+	}
+
+	ah->context = pd->context;
+	ah->pd = pd;
+	ah->real_ah = real_ah;
+
+	return ah;
+}
+
+COMPAT_SYMVER_FUNC(ibv_destroy_ah, 1_0, "IBVERBS_1.0",
+		   int,
+		   struct ibv_ah_1_0 *ah)
+{
+	int ret;
+
+	ret = ibv_destroy_ah(ah->real_ah);
+	if (ret)
+		return ret;
+
+	free(ah);
+	return 0;
+}
+
+COMPAT_SYMVER_FUNC(ibv_attach_mcast, 1_0, "IBVERBS_1.0",
+		   int,
+		   struct ibv_qp_1_0 *qp, union ibv_gid *gid, uint16_t lid)
+{
+	return ibv_attach_mcast(qp->real_qp, gid, lid);
+}
+
+COMPAT_SYMVER_FUNC(ibv_detach_mcast, 1_0, "IBVERBS_1.0",
+		   int,
+		   struct ibv_qp_1_0 *qp, union ibv_gid *gid, uint16_t lid)
+{
+	return ibv_detach_mcast(qp->real_qp, gid, lid);
+}
+
+COMPAT_SYMVER_FUNC(ibv_register_driver, 1_1, "IBVERBS_1.1",
+		   void,
+		   const char *name, ibv_driver_init_func_1_1 init_func)
+{
+	/* The driver interface is private as of rdma-core 13. This stub is
+	 * left to preserve dynamic-link compatibility with old libfabrics
+	 * usnic providers which use this function only to suppress a fprintf
+	 * in old versions of libibverbs. */
+}
diff --git a/libibverbs/device.c b/libibverbs/device.c
new file mode 100644
index 0000000..bc7df1b
--- /dev/null
+++ b/libibverbs/device.c
@@ -0,0 +1,498 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#define _GNU_SOURCE +#include <config.h> + +#include <endian.h> +#include <stdio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> +#include <alloca.h> +#include <errno.h> + +#include <rdma/ib_user_ioctl_cmds.h> +#include <util/symver.h> +#include <util/util.h> +#include "ibverbs.h" + +static pthread_mutex_t dev_list_lock = PTHREAD_MUTEX_INITIALIZER; +static struct list_head device_list = LIST_HEAD_INIT(device_list); + +LATEST_SYMVER_FUNC(ibv_get_device_list, 1_1, "IBVERBS_1.1", + struct ibv_device **, + int *num) +{ + struct ibv_device **l = NULL; + struct verbs_device *device; + static bool initialized; + int num_devices; + int i = 0; + + if (num) + *num = 0; + + pthread_mutex_lock(&dev_list_lock); + if (!initialized) { + if (ibverbs_init()) + goto out; + initialized = true; + } + + num_devices = ibverbs_get_device_list(&device_list); + if (num_devices < 0) { + errno = -num_devices; + goto out; + } + + l = calloc(num_devices + 1, sizeof (struct ibv_device *)); + if (!l) { + errno = ENOMEM; + goto out; + } + + list_for_each(&device_list, device, entry) { + l[i] = &device->device; + ibverbs_device_hold(l[i]); + i++; + } + if (num) + *num = num_devices; +out: + pthread_mutex_unlock(&dev_list_lock); + return l; +} + +LATEST_SYMVER_FUNC(ibv_free_device_list, 1_1, "IBVERBS_1.1", + void, + struct ibv_device **list) +{ + int i; + + for (i = 0; list[i]; i++) + ibverbs_device_put(list[i]); + free(list); +} + +LATEST_SYMVER_FUNC(ibv_get_device_name, 1_1, "IBVERBS_1.1", + const char *, + struct ibv_device *device) +{ + return device->name; +} + +LATEST_SYMVER_FUNC(ibv_get_device_guid, 1_1, "IBVERBS_1.1", + __be64, + struct ibv_device *device) +{ + struct verbs_sysfs_dev *sysfs_dev = verbs_get_device(device)->sysfs; + char attr[24]; + uint64_t guid = 0; + uint16_t parts[4]; + int i; + + pthread_mutex_lock(&dev_list_lock); + if (sysfs_dev->flags & VSYSFS_READ_NODE_GUID) { + guid = sysfs_dev->node_guid; + pthread_mutex_unlock(&dev_list_lock); + return htobe64(guid); + } + pthread_mutex_unlock(&dev_list_lock); + + if (ibv_read_ibdev_sysfs_file(attr, sizeof(attr), sysfs_dev, + "node_guid") < 0) + return 0; + + if (sscanf(attr, "%hx:%hx:%hx:%hx", + parts, parts + 1, parts + 2, parts + 3) != 4) + return 0; + + for (i = 
0; i < 4; ++i)
+		guid = (guid << 16) | parts[i];
+
+	pthread_mutex_lock(&dev_list_lock);
+	sysfs_dev->node_guid = guid;
+	sysfs_dev->flags |= VSYSFS_READ_NODE_GUID;
+	pthread_mutex_unlock(&dev_list_lock);
+
+	return htobe64(guid);
+}
+
+int ibv_get_fw_ver(char *value, size_t len, struct verbs_sysfs_dev *sysfs_dev)
+{
+	/*
+	 * NOTE: This can only be called by a driver inside the dev_list_lock,
+	 * i.e. during context setup or some other path that holds the lock.
+	 */
+	assert(pthread_mutex_trylock(&dev_list_lock) != 0);
+
+	if (!(sysfs_dev->flags & VSYSFS_READ_FW_VER)) {
+		if (ibv_read_ibdev_sysfs_file(sysfs_dev->fw_ver,
+					      sizeof(sysfs_dev->fw_ver),
+					      sysfs_dev, "fw_ver") <= 0)
+			return -1;
+		sysfs_dev->flags |= VSYSFS_READ_FW_VER;
+	}
+	if (!check_snprintf(value, len, "%s", sysfs_dev->fw_ver))
+		return -1;
+	return 0;
+}
+
+void verbs_init_cq(struct ibv_cq *cq, struct ibv_context *context,
+		   struct ibv_comp_channel *channel,
+		   void *cq_context)
+{
+	cq->context = context;
+	cq->channel = channel;
+
+	if (cq->channel) {
+		pthread_mutex_lock(&context->mutex);
+		++cq->channel->refcnt;
+		pthread_mutex_unlock(&context->mutex);
+	}
+
+	cq->cq_context = cq_context;
+	cq->comp_events_completed  = 0;
+	cq->async_events_completed = 0;
+	pthread_mutex_init(&cq->mutex, NULL);
+	pthread_cond_init(&cq->cond, NULL);
+}
+
+static struct ibv_cq_ex *
+__lib_ibv_create_cq_ex(struct ibv_context *context,
+		       struct ibv_cq_init_attr_ex *cq_attr)
+{
+	struct ibv_cq_ex *cq;
+
+	if (cq_attr->wc_flags & ~IBV_CREATE_CQ_SUP_WC_FLAGS) {
+		errno = EOPNOTSUPP;
+		return NULL;
+	}
+
+	cq = get_ops(context)->create_cq_ex(context, cq_attr);
+
+	if (cq)
+		verbs_init_cq(ibv_cq_ex_to_cq(cq), context,
+			      cq_attr->channel, cq_attr->cq_context);
+
+	return cq;
+}
+
+static bool has_ioctl_write(struct ibv_context *ctx)
+{
+	int rc;
+	DECLARE_COMMAND_BUFFER(cmdb, UVERBS_OBJECT_DEVICE,
+			       UVERBS_METHOD_INVOKE_WRITE, 1);
+
+	if (VERBS_IOCTL_ONLY)
+		return true;
+	if (VERBS_WRITE_ONLY)
+		return false;
+
+	/*
+	 * This command should return ENOSPC since the request length is too
+	 * small.
+	 */
+	fill_attr_const_in(cmdb, UVERBS_ATTR_WRITE_CMD,
+			   IB_USER_VERBS_CMD_QUERY_DEVICE);
+	rc = execute_ioctl(ctx, cmdb);
+	if (rc == EPROTONOSUPPORT)
+		return false;
+	if (rc == ENOTTY)
+		return false;
+	return true;
+}
+
+/*
+ * Ownership of cmd_fd is transferred into this function, and it will either
+ * be released during the matching call to verbs_uninit_context or during the
+ * failure path of this function.
+ */
+int verbs_init_context(struct verbs_context *context_ex,
+		       struct ibv_device *device, int cmd_fd,
+		       uint32_t driver_id)
+{
+	struct ibv_context *context = &context_ex->context;
+
+	ibverbs_device_hold(device);
+
+	context->device = device;
+	context->cmd_fd = cmd_fd;
+	context->async_fd = -1;
+	pthread_mutex_init(&context->mutex, NULL);
+
+	context_ex->context.abi_compat = __VERBS_ABI_IS_EXTENDED;
+	context_ex->sz = sizeof(*context_ex);
+
+	/*
+	 * In order to maintain backward/forward binary compatibility
+	 * with apps compiled against libibverbs-1.1.8 that use the
+	 * flow steering addition, we need to set the two
+	 * ABI_placeholder entries to match the driver set flow
+	 * entries. This is because apps compiled against
+	 * libibverbs-1.1.8 use an inline ibv_create_flow and
+	 * ibv_destroy_flow function that looks in the placeholder
+	 * spots for the proper entry points. For apps compiled
+	 * against libibverbs-1.1.9 and later, the inline functions
+	 * will be looking in the right place.
+ */ + context_ex->ABI_placeholder1 = + (void (*)(void))context_ex->ibv_create_flow; + context_ex->ABI_placeholder2 = + (void (*)(void))context_ex->ibv_destroy_flow; + + context_ex->priv = calloc(1, sizeof(*context_ex->priv)); + if (!context_ex->priv) { + errno = ENOMEM; + close(cmd_fd); + return -1; + } + + context_ex->priv->driver_id = driver_id; + verbs_set_ops(context_ex, &verbs_dummy_ops); + context_ex->priv->use_ioctl_write = has_ioctl_write(context); + + return 0; +} + +/* + * Allocate and initialize a context structure. This is called to create the + * driver wrapper, and context_offset is the number of bytes into the wrapper + * structure where the verbs_context starts. + */ +void *_verbs_init_and_alloc_context(struct ibv_device *device, int cmd_fd, + size_t alloc_size, + struct verbs_context *context_offset, + uint32_t driver_id) +{ + void *drv_context; + struct verbs_context *context; + + drv_context = calloc(1, alloc_size); + if (!drv_context) { + errno = ENOMEM; + close(cmd_fd); + return NULL; + } + + context = drv_context + (uintptr_t)context_offset; + + if (verbs_init_context(context, device, cmd_fd, driver_id)) + goto err_free; + + return drv_context; + +err_free: + free(drv_context); + return NULL; +} + +static void set_lib_ops(struct verbs_context *vctx) +{ + vctx->create_cq_ex = __lib_ibv_create_cq_ex; + + /* + * The compat symver entry point behaves identically to what used to + * be pointed to by _compat_query_port. + */ +#undef ibv_query_port + vctx->context.ops._compat_query_port = ibv_query_port; + vctx->query_port = __lib_query_port; +} + +struct ibv_context *verbs_open_device(struct ibv_device *device, void *private_data) +{ + struct verbs_device *verbs_device = verbs_get_device(device); + int cmd_fd; + struct verbs_context *context_ex; + + /* + * We'll only be doing writes, but we need O_RDWR in case the + * provider needs to mmap() the file. 
+ */ + cmd_fd = open_cdev(verbs_device->sysfs->sysfs_name, + verbs_device->sysfs->sysfs_cdev); + if (cmd_fd < 0) + return NULL; + + /* + * cmd_fd ownership is transferred into alloc_context, if it fails + * then it closes cmd_fd and returns NULL + */ + context_ex = verbs_device->ops->alloc_context(device, cmd_fd, private_data); + if (!context_ex) + return NULL; + + set_lib_ops(context_ex); + + return &context_ex->context; +} + +LATEST_SYMVER_FUNC(ibv_open_device, 1_1, "IBVERBS_1.1", + struct ibv_context *, + struct ibv_device *device) +{ + return verbs_open_device(device, NULL); +} + +void verbs_uninit_context(struct verbs_context *context_ex) +{ + free(context_ex->priv); + close(context_ex->context.cmd_fd); + close(context_ex->context.async_fd); + ibverbs_device_put(context_ex->context.device); +} + +LATEST_SYMVER_FUNC(ibv_close_device, 1_1, "IBVERBS_1.1", + int, + struct ibv_context *context) +{ + const struct verbs_context_ops *ops = get_ops(context); + + ops->free_context(context); + return 0; +} + +LATEST_SYMVER_FUNC(ibv_get_async_event, 1_1, "IBVERBS_1.1", + int, + struct ibv_context *context, + struct ibv_async_event *event) +{ + struct ib_uverbs_async_event_desc ev; + + if (read(context->async_fd, &ev, sizeof ev) != sizeof ev) + return -1; + + event->event_type = ev.event_type; + + switch (event->event_type) { + case IBV_EVENT_CQ_ERR: + event->element.cq = (void *) (uintptr_t) ev.element; + break; + + case IBV_EVENT_QP_FATAL: + case IBV_EVENT_QP_REQ_ERR: + case IBV_EVENT_QP_ACCESS_ERR: + case IBV_EVENT_COMM_EST: + case IBV_EVENT_SQ_DRAINED: + case IBV_EVENT_PATH_MIG: + case IBV_EVENT_PATH_MIG_ERR: + case IBV_EVENT_QP_LAST_WQE_REACHED: + event->element.qp = (void *) (uintptr_t) ev.element; + break; + + case IBV_EVENT_SRQ_ERR: + case IBV_EVENT_SRQ_LIMIT_REACHED: + event->element.srq = (void *) (uintptr_t) ev.element; + break; + + case IBV_EVENT_WQ_FATAL: + event->element.wq = (void *) (uintptr_t) ev.element; + break; + default: + event->element.port_num = ev.element; + break; + } + + get_ops(context)->async_event(context, event); + + return 0; +} + +LATEST_SYMVER_FUNC(ibv_ack_async_event, 1_1, "IBVERBS_1.1", + void, + struct ibv_async_event *event) +{ + switch (event->event_type) { + case IBV_EVENT_CQ_ERR: + { + struct ibv_cq *cq = event->element.cq; + + pthread_mutex_lock(&cq->mutex); + ++cq->async_events_completed; + pthread_cond_signal(&cq->cond); + pthread_mutex_unlock(&cq->mutex); + + return; + } + + case IBV_EVENT_QP_FATAL: + case IBV_EVENT_QP_REQ_ERR: + case IBV_EVENT_QP_ACCESS_ERR: + case IBV_EVENT_COMM_EST: + case IBV_EVENT_SQ_DRAINED: + case IBV_EVENT_PATH_MIG: + case IBV_EVENT_PATH_MIG_ERR: + case IBV_EVENT_QP_LAST_WQE_REACHED: + { + struct ibv_qp *qp = event->element.qp; + + pthread_mutex_lock(&qp->mutex); + ++qp->events_completed; + pthread_cond_signal(&qp->cond); + pthread_mutex_unlock(&qp->mutex); + + return; + } + + case IBV_EVENT_SRQ_ERR: + case IBV_EVENT_SRQ_LIMIT_REACHED: + { + struct ibv_srq *srq = event->element.srq; + + pthread_mutex_lock(&srq->mutex); + ++srq->events_completed; + pthread_cond_signal(&srq->cond); + pthread_mutex_unlock(&srq->mutex); + + return; + } + + case IBV_EVENT_WQ_FATAL: + { + struct ibv_wq *wq = event->element.wq; + + pthread_mutex_lock(&wq->mutex); + ++wq->events_completed; + pthread_cond_signal(&wq->cond); + pthread_mutex_unlock(&wq->mutex); + + return; + } + + default: + return; + } +} diff --git a/libibverbs/driver.h b/libibverbs/driver.h new file mode 100644 index 0000000..a0e6f89 --- /dev/null +++ b/libibverbs/driver.h @@ -0,0 
+1,678 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef INFINIBAND_DRIVER_H +#define INFINIBAND_DRIVER_H + +#include <stdatomic.h> +#include <infiniband/verbs.h> +#include <infiniband/kern-abi.h> +#include <infiniband/cmd_ioctl.h> +#include <ccan/list.h> +#include <config.h> +#include <stdbool.h> +#include <rdma/rdma_user_ioctl_cmds.h> +#include <infiniband/cmd_ioctl.h> +#include <sys/types.h> + +struct verbs_device; + +enum verbs_xrcd_mask { + VERBS_XRCD_HANDLE = 1 << 0, + VERBS_XRCD_RESERVED = 1 << 1 +}; + +struct verbs_xrcd { + struct ibv_xrcd xrcd; + uint32_t comp_mask; + uint32_t handle; +}; + +enum verbs_srq_mask { + VERBS_SRQ_TYPE = 1 << 0, + VERBS_SRQ_XRCD = 1 << 1, + VERBS_SRQ_CQ = 1 << 2, + VERBS_SRQ_NUM = 1 << 3, + VERBS_SRQ_RESERVED = 1 << 4 +}; + +struct verbs_srq { + struct ibv_srq srq; + uint32_t comp_mask; + enum ibv_srq_type srq_type; + struct verbs_xrcd *xrcd; + struct ibv_cq *cq; + uint32_t srq_num; +}; + +enum verbs_qp_mask { + VERBS_QP_XRCD = 1 << 0, + VERBS_QP_EX = 1 << 1, +}; + +enum ibv_gid_type { + IBV_GID_TYPE_IB_ROCE_V1, + IBV_GID_TYPE_ROCE_V2, +}; + +enum ibv_mr_type { + IBV_MR_TYPE_MR, + IBV_MR_TYPE_NULL_MR, +}; + +struct verbs_mr { + struct ibv_mr ibv_mr; + enum ibv_mr_type mr_type; +}; + +static inline struct verbs_mr *verbs_get_mr(struct ibv_mr *mr) +{ + return container_of(mr, struct verbs_mr, ibv_mr); +} + +struct verbs_qp { + union { + struct ibv_qp qp; + struct ibv_qp_ex qp_ex; + }; + uint32_t comp_mask; + struct verbs_xrcd *xrcd; +}; +static_assert(offsetof(struct ibv_qp_ex, qp_base) == 0, "Invalid qp layout"); + +enum ibv_flow_action_type { + IBV_FLOW_ACTION_UNSPECIFIED, + IBV_FLOW_ACTION_ESP = 1, +}; + +struct verbs_flow_action { + struct ibv_flow_action action; + uint32_t handle; + enum ibv_flow_action_type type; +}; + +struct verbs_dm { + struct ibv_dm dm; + uint32_t handle; +}; + +enum { + VERBS_MATCH_SENTINEL = 0, + VERBS_MATCH_PCI = 1, + VERBS_MATCH_MODALIAS = 2, + VERBS_MATCH_DRIVER_ID = 3, +}; + +struct verbs_match_ent { + void *driver_data; + union { + const char 
*modalias; + uint64_t driver_id; + } u; + uint16_t vendor; + uint16_t device; + uint8_t kind; +}; +#define VERBS_DRIVER_ID(_id) \ + { \ + .u.driver_id = (_id), .kind = VERBS_MATCH_DRIVER_ID, \ + } +/* Note: New drivers should only use VERBS_DRIVER_ID, the below are for legacy + * drivers + */ +#define VERBS_PCI_MATCH(_vendor, _device, _data) \ + { \ + .driver_data = (void *)(_data), \ + .vendor = (_vendor), \ + .device = (_device), \ + .kind = VERBS_MATCH_PCI, \ + } + +#define VERBS_MODALIAS_MATCH(_mod_str, _data) \ + { \ + .driver_data = (void *)(_data), \ + .u.modalias = (_mod_str), \ + .kind = VERBS_MATCH_MODALIAS, \ + } + +/* Matching on the IB device name is STRONGLY discouraged. This will only + * match if there is no device/modalias file available, and it will eventually + * be disabled entirely if the kernel supports renaming. Use is strongly + * discouraged. + */ +#define VERBS_NAME_MATCH(_name_prefix, _data) \ + { \ + .driver_data = (_data), \ + .u.modalias = "rdma_device:*N" _name_prefix "*", \ + .kind = VERBS_MATCH_MODALIAS, \ + } + +enum { + VSYSFS_READ_MODALIAS = 1 << 0, + VSYSFS_READ_NODE_GUID = 1 << 1, + VSYSFS_READ_FW_VER = 1 << 2, +}; + +/* A rdma device detected in sysfs */ +struct verbs_sysfs_dev { + struct list_node entry; + void *provider_data; + const struct verbs_match_ent *match; + unsigned int flags; + char sysfs_name[IBV_SYSFS_NAME_MAX]; + dev_t sysfs_cdev; + char ibdev_name[IBV_SYSFS_NAME_MAX]; + char ibdev_path[IBV_SYSFS_PATH_MAX]; + char modalias[512]; + char fw_ver[64]; + uint64_t node_guid; + uint32_t driver_id; + enum ibv_node_type node_type; + int ibdev_idx; + uint32_t abi_ver; + struct timespec time_created; +}; + +/* Must change the PRIVATE IBVERBS_PRIVATE_ symbol if this is changed */ +struct verbs_device_ops { + const char *name; + + uint32_t match_min_abi_version; + uint32_t match_max_abi_version; + const struct verbs_match_ent *match_table; + const struct verbs_device_ops **static_providers; + + bool (*match_device)(struct verbs_sysfs_dev *sysfs_dev); + + struct verbs_context *(*alloc_context)(struct ibv_device *device, + int cmd_fd, + void *private_data); + + struct verbs_device *(*alloc_device)(struct verbs_sysfs_dev *sysfs_dev); + void (*uninit_device)(struct verbs_device *device); +}; + +/* Must change the PRIVATE IBVERBS_PRIVATE_ symbol if this is changed */ +struct verbs_device { + struct ibv_device device; /* Must be first */ + const struct verbs_device_ops *ops; + atomic_int refcount; + struct list_node entry; + struct verbs_sysfs_dev *sysfs; + uint64_t core_support; +}; + +struct verbs_counters { + struct ibv_counters counters; + uint32_t handle; +}; + +/* + * Must change the PRIVATE IBVERBS_PRIVATE_ symbol if this is changed. This is + * the union of every op the driver can support. If new elements are added to + * this structure then verbs_dummy_ops must also be updated. + * + * Keep sorted. 
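+ *
+ * Providers fill in only the ops they implement; verbs_set_ops() skips
+ * NULL members, so any op a provider leaves unset keeps the
+ * verbs_dummy_ops entry installed by verbs_init_context() and fails
+ * cleanly with EOPNOTSUPP.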
+ */ +struct verbs_context_ops { + int (*advise_mr)(struct ibv_pd *pd, + enum ibv_advise_mr_advice advice, + uint32_t flags, + struct ibv_sge *sg_list, + uint32_t num_sges); + struct ibv_dm *(*alloc_dm)(struct ibv_context *context, + struct ibv_alloc_dm_attr *attr); + struct ibv_mw *(*alloc_mw)(struct ibv_pd *pd, enum ibv_mw_type type); + struct ibv_mr *(*alloc_null_mr)(struct ibv_pd *pd); + struct ibv_pd *(*alloc_parent_domain)( + struct ibv_context *context, + struct ibv_parent_domain_init_attr *attr); + struct ibv_pd *(*alloc_pd)(struct ibv_context *context); + struct ibv_td *(*alloc_td)(struct ibv_context *context, + struct ibv_td_init_attr *init_attr); + void (*async_event)(struct ibv_context *context, struct ibv_async_event *event); + int (*attach_counters_point_flow)(struct ibv_counters *counters, + struct ibv_counter_attach_attr *attr, + struct ibv_flow *flow); + int (*attach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid, + uint16_t lid); + int (*bind_mw)(struct ibv_qp *qp, struct ibv_mw *mw, + struct ibv_mw_bind *mw_bind); + int (*close_xrcd)(struct ibv_xrcd *xrcd); + void (*cq_event)(struct ibv_cq *cq); + struct ibv_ah *(*create_ah)(struct ibv_pd *pd, + struct ibv_ah_attr *attr); + struct ibv_counters *(*create_counters)(struct ibv_context *context, + struct ibv_counters_init_attr *init_attr); + struct ibv_cq *(*create_cq)(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector); + struct ibv_cq_ex *(*create_cq_ex)( + struct ibv_context *context, + struct ibv_cq_init_attr_ex *init_attr); + struct ibv_flow *(*create_flow)(struct ibv_qp *qp, + struct ibv_flow_attr *flow_attr); + struct ibv_flow_action *(*create_flow_action_esp)(struct ibv_context *context, + struct ibv_flow_action_esp_attr *attr); + struct ibv_qp *(*create_qp)(struct ibv_pd *pd, + struct ibv_qp_init_attr *attr); + struct ibv_qp *(*create_qp_ex)( + struct ibv_context *context, + struct ibv_qp_init_attr_ex *qp_init_attr_ex); + struct ibv_rwq_ind_table *(*create_rwq_ind_table)( + struct ibv_context *context, + struct ibv_rwq_ind_table_init_attr *init_attr); + struct ibv_srq *(*create_srq)(struct ibv_pd *pd, + struct ibv_srq_init_attr *srq_init_attr); + struct ibv_srq *(*create_srq_ex)( + struct ibv_context *context, + struct ibv_srq_init_attr_ex *srq_init_attr_ex); + struct ibv_wq *(*create_wq)(struct ibv_context *context, + struct ibv_wq_init_attr *wq_init_attr); + int (*dealloc_mw)(struct ibv_mw *mw); + int (*dealloc_pd)(struct ibv_pd *pd); + int (*dealloc_td)(struct ibv_td *td); + int (*dereg_mr)(struct verbs_mr *vmr); + int (*destroy_ah)(struct ibv_ah *ah); + int (*destroy_counters)(struct ibv_counters *counters); + int (*destroy_cq)(struct ibv_cq *cq); + int (*destroy_flow)(struct ibv_flow *flow); + int (*destroy_flow_action)(struct ibv_flow_action *action); + int (*destroy_qp)(struct ibv_qp *qp); + int (*destroy_rwq_ind_table)(struct ibv_rwq_ind_table *rwq_ind_table); + int (*destroy_srq)(struct ibv_srq *srq); + int (*destroy_wq)(struct ibv_wq *wq); + int (*detach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid, + uint16_t lid); + void (*free_context)(struct ibv_context *context); + int (*free_dm)(struct ibv_dm *dm); + int (*get_srq_num)(struct ibv_srq *srq, uint32_t *srq_num); + int (*modify_cq)(struct ibv_cq *cq, struct ibv_modify_cq_attr *attr); + int (*modify_flow_action_esp)(struct ibv_flow_action *action, + struct ibv_flow_action_esp_attr *attr); + int (*modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask); + int (*modify_qp_rate_limit)(struct 
ibv_qp *qp,
+				    struct ibv_qp_rate_limit_attr *attr);
+	int (*modify_srq)(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr,
+			  int srq_attr_mask);
+	int (*modify_wq)(struct ibv_wq *wq, struct ibv_wq_attr *wq_attr);
+	struct ibv_qp *(*open_qp)(struct ibv_context *context,
+				  struct ibv_qp_open_attr *attr);
+	struct ibv_xrcd *(*open_xrcd)(
+		struct ibv_context *context,
+		struct ibv_xrcd_init_attr *xrcd_init_attr);
+	int (*poll_cq)(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc);
+	int (*post_recv)(struct ibv_qp *qp, struct ibv_recv_wr *wr,
+			 struct ibv_recv_wr **bad_wr);
+	int (*post_send)(struct ibv_qp *qp, struct ibv_send_wr *wr,
+			 struct ibv_send_wr **bad_wr);
+	int (*post_srq_ops)(struct ibv_srq *srq, struct ibv_ops_wr *op,
+			    struct ibv_ops_wr **bad_op);
+	int (*post_srq_recv)(struct ibv_srq *srq, struct ibv_recv_wr *recv_wr,
+			     struct ibv_recv_wr **bad_recv_wr);
+	int (*query_device)(struct ibv_context *context,
+			    struct ibv_device_attr *device_attr);
+	int (*query_device_ex)(struct ibv_context *context,
+			       const struct ibv_query_device_ex_input *input,
+			       struct ibv_device_attr_ex *attr,
+			       size_t attr_size);
+	int (*query_port)(struct ibv_context *context, uint8_t port_num,
+			  struct ibv_port_attr *port_attr);
+	int (*query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr,
+			int attr_mask, struct ibv_qp_init_attr *init_attr);
+	int (*query_rt_values)(struct ibv_context *context,
+			       struct ibv_values_ex *values);
+	int (*query_srq)(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr);
+	int (*read_counters)(struct ibv_counters *counters,
+			     uint64_t *counters_value,
+			     uint32_t ncounters,
+			     uint32_t flags);
+	struct ibv_mr *(*reg_dm_mr)(struct ibv_pd *pd, struct ibv_dm *dm,
+				    uint64_t dm_offset, size_t length,
+				    unsigned int access);
+	struct ibv_mr *(*reg_mr)(struct ibv_pd *pd, void *addr, size_t length,
+				 uint64_t hca_va, int access);
+	int (*req_notify_cq)(struct ibv_cq *cq, int solicited_only);
+	int (*rereg_mr)(struct verbs_mr *vmr, int flags, struct ibv_pd *pd,
+			void *addr, size_t length, int access);
+	int (*resize_cq)(struct ibv_cq *cq, int cqe);
+};
+
+static inline struct verbs_device *
+verbs_get_device(const struct ibv_device *dev)
+{
+	return container_of(dev, struct verbs_device, device);
+}
+
+typedef struct verbs_device *(*verbs_driver_init_func)(const char *uverbs_sys_path,
+						       int abi_version);
+
+/* Wire the IBVERBS_PRIVATE version number into the verbs_register_driver
+ * symbol name. This guarantees we link to the correct set of symbols even if
+ * statically linking or using a dynamic linker with symbol versioning turned
+ * off.
+ */
+#define ___make_verbs_register_driver(x) verbs_register_driver_ ## x
+#define __make_verbs_register_driver(x) ___make_verbs_register_driver(x)
+#define verbs_register_driver __make_verbs_register_driver(IBVERBS_PABI_VERSION)
+
+void verbs_register_driver(const struct verbs_device_ops *ops);
+
+/*
+ * Macro for providers to use to supply verbs_device_ops to the core code.
+ * This creates a global symbol for the provider structure to be used by the
+ * ibv_static_providers() machinery, and a global constructor for the dlopen
+ * machinery.
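+ *
+ * Typical provider usage (the provider name "foo" is illustrative only):
+ *
+ *	static const struct verbs_device_ops foo_dev_ops = { ... };
+ *	PROVIDER_DRIVER(foo, foo_dev_ops);
+ *
+ * which emits a verbs_provider_foo alias for the static-provider table
+ * and a constructor that calls verbs_register_driver(&foo_dev_ops) when
+ * the provider shared object is dlopen()ed.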
+ */ +#define PROVIDER_DRIVER(provider_name, drv_struct) \ + extern const struct verbs_device_ops verbs_provider_##provider_name \ + __attribute__((alias(stringify(drv_struct)))); \ + static __attribute__((constructor)) void drv##__register_driver(void) \ + { \ + verbs_register_driver(&drv_struct); \ + } + +void *_verbs_init_and_alloc_context(struct ibv_device *device, int cmd_fd, + size_t alloc_size, + struct verbs_context *context_offset, + uint32_t driver_id); + +#define verbs_init_and_alloc_context(ibdev, cmd_fd, drv_ctx_ptr, ctx_memb, \ + driver_id) \ + ((typeof(drv_ctx_ptr))_verbs_init_and_alloc_context( \ + ibdev, cmd_fd, sizeof(*drv_ctx_ptr), \ + &((typeof(drv_ctx_ptr))NULL)->ctx_memb, (driver_id))) + +int verbs_init_context(struct verbs_context *context_ex, + struct ibv_device *device, int cmd_fd, + uint32_t driver_id); +void verbs_uninit_context(struct verbs_context *context); +void verbs_set_ops(struct verbs_context *vctx, + const struct verbs_context_ops *ops); + +void verbs_init_cq(struct ibv_cq *cq, struct ibv_context *context, + struct ibv_comp_channel *channel, + void *cq_context); + +struct ibv_context *verbs_open_device(struct ibv_device *device, + void *private_data); +int ibv_cmd_get_context(struct verbs_context *context, + struct ibv_get_context *cmd, size_t cmd_size, + struct ib_uverbs_get_context_resp *resp, size_t resp_size); +int ibv_cmd_query_device(struct ibv_context *context, + struct ibv_device_attr *device_attr, + uint64_t *raw_fw_ver, + struct ibv_query_device *cmd, size_t cmd_size); +int ibv_cmd_create_flow_action_esp(struct ibv_context *ctx, + struct ibv_flow_action_esp_attr *attr, + struct verbs_flow_action *flow_action, + struct ibv_command_buffer *driver); +int ibv_cmd_modify_flow_action_esp(struct verbs_flow_action *flow_action, + struct ibv_flow_action_esp_attr *attr, + struct ibv_command_buffer *driver); +int ibv_cmd_query_device_ex(struct ibv_context *context, + const struct ibv_query_device_ex_input *input, + struct ibv_device_attr_ex *attr, size_t attr_size, + uint64_t *raw_fw_ver, + struct ibv_query_device_ex *cmd, + size_t cmd_size, + struct ib_uverbs_ex_query_device_resp *resp, + size_t resp_size); +int ibv_cmd_query_port(struct ibv_context *context, uint8_t port_num, + struct ibv_port_attr *port_attr, + struct ibv_query_port *cmd, size_t cmd_size); +int ibv_cmd_alloc_pd(struct ibv_context *context, struct ibv_pd *pd, + struct ibv_alloc_pd *cmd, size_t cmd_size, + struct ib_uverbs_alloc_pd_resp *resp, size_t resp_size); +int ibv_cmd_dealloc_pd(struct ibv_pd *pd); +int ibv_cmd_open_xrcd(struct ibv_context *context, struct verbs_xrcd *xrcd, + int vxrcd_size, + struct ibv_xrcd_init_attr *attr, + struct ibv_open_xrcd *cmd, size_t cmd_size, + struct ib_uverbs_open_xrcd_resp *resp, size_t resp_size); +int ibv_cmd_close_xrcd(struct verbs_xrcd *xrcd); +int ibv_cmd_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + uint64_t hca_va, int access, + struct verbs_mr *vmr, struct ibv_reg_mr *cmd, + size_t cmd_size, + struct ib_uverbs_reg_mr_resp *resp, size_t resp_size); +int ibv_cmd_rereg_mr(struct verbs_mr *vmr, uint32_t flags, void *addr, + size_t length, uint64_t hca_va, int access, + struct ibv_pd *pd, struct ibv_rereg_mr *cmd, + size_t cmd_sz, struct ib_uverbs_rereg_mr_resp *resp, + size_t resp_sz); +int ibv_cmd_dereg_mr(struct verbs_mr *vmr); +int ibv_cmd_advise_mr(struct ibv_pd *pd, + enum ibv_advise_mr_advice advice, + uint32_t flags, + struct ibv_sge *sg_list, + uint32_t num_sge); +int ibv_cmd_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type, + 
struct ibv_mw *mw, struct ibv_alloc_mw *cmd, + size_t cmd_size, + struct ib_uverbs_alloc_mw_resp *resp, size_t resp_size); +int ibv_cmd_dealloc_mw(struct ibv_mw *mw); +int ibv_cmd_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector, struct ibv_cq *cq, + struct ibv_create_cq *cmd, size_t cmd_size, + struct ib_uverbs_create_cq_resp *resp, size_t resp_size); +int ibv_cmd_create_cq_ex(struct ibv_context *context, + struct ibv_cq_init_attr_ex *cq_attr, + struct ibv_cq_ex *cq, + struct ibv_create_cq_ex *cmd, + size_t cmd_size, + struct ib_uverbs_ex_create_cq_resp *resp, + size_t resp_size); +int ibv_cmd_poll_cq(struct ibv_cq *cq, int ne, struct ibv_wc *wc); +int ibv_cmd_req_notify_cq(struct ibv_cq *cq, int solicited_only); +int ibv_cmd_resize_cq(struct ibv_cq *cq, int cqe, + struct ibv_resize_cq *cmd, size_t cmd_size, + struct ib_uverbs_resize_cq_resp *resp, size_t resp_size); +int ibv_cmd_destroy_cq(struct ibv_cq *cq); +int ibv_cmd_modify_cq(struct ibv_cq *cq, + struct ibv_modify_cq_attr *attr, + struct ibv_modify_cq *cmd, + size_t cmd_size); + +int ibv_cmd_create_srq(struct ibv_pd *pd, + struct ibv_srq *srq, struct ibv_srq_init_attr *attr, + struct ibv_create_srq *cmd, size_t cmd_size, + struct ib_uverbs_create_srq_resp *resp, size_t resp_size); +int ibv_cmd_create_srq_ex(struct ibv_context *context, + struct verbs_srq *srq, int vsrq_sz, + struct ibv_srq_init_attr_ex *attr_ex, + struct ibv_create_xsrq *cmd, size_t cmd_size, + struct ib_uverbs_create_srq_resp *resp, size_t resp_size); +int ibv_cmd_modify_srq(struct ibv_srq *srq, + struct ibv_srq_attr *srq_attr, + int srq_attr_mask, + struct ibv_modify_srq *cmd, size_t cmd_size); +int ibv_cmd_query_srq(struct ibv_srq *srq, + struct ibv_srq_attr *srq_attr, + struct ibv_query_srq *cmd, size_t cmd_size); +int ibv_cmd_destroy_srq(struct ibv_srq *srq); + +int ibv_cmd_create_qp(struct ibv_pd *pd, + struct ibv_qp *qp, struct ibv_qp_init_attr *attr, + struct ibv_create_qp *cmd, size_t cmd_size, + struct ib_uverbs_create_qp_resp *resp, size_t resp_size); +int ibv_cmd_create_qp_ex(struct ibv_context *context, + struct verbs_qp *qp, int vqp_sz, + struct ibv_qp_init_attr_ex *attr_ex, + struct ibv_create_qp *cmd, size_t cmd_size, + struct ib_uverbs_create_qp_resp *resp, size_t resp_size); +int ibv_cmd_create_qp_ex2(struct ibv_context *context, + struct verbs_qp *qp, int vqp_sz, + struct ibv_qp_init_attr_ex *qp_attr, + struct ibv_create_qp_ex *cmd, + size_t cmd_size, + struct ib_uverbs_ex_create_qp_resp *resp, + size_t resp_size); +int ibv_cmd_open_qp(struct ibv_context *context, + struct verbs_qp *qp, int vqp_sz, + struct ibv_qp_open_attr *attr, + struct ibv_open_qp *cmd, size_t cmd_size, + struct ib_uverbs_create_qp_resp *resp, size_t resp_size); +int ibv_cmd_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *qp_attr, + int attr_mask, + struct ibv_qp_init_attr *qp_init_attr, + struct ibv_query_qp *cmd, size_t cmd_size); +int ibv_cmd_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_modify_qp *cmd, size_t cmd_size); +int ibv_cmd_modify_qp_ex(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, struct ibv_modify_qp_ex *cmd, + size_t cmd_size, + struct ib_uverbs_ex_modify_qp_resp *resp, + size_t resp_size); +int ibv_cmd_destroy_qp(struct ibv_qp *qp); +int ibv_cmd_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr); +int ibv_cmd_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); +int 
ibv_cmd_post_srq_recv(struct ibv_srq *srq, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); +int ibv_cmd_create_ah(struct ibv_pd *pd, struct ibv_ah *ah, + struct ibv_ah_attr *attr, + struct ib_uverbs_create_ah_resp *resp, + size_t resp_size); +int ibv_cmd_destroy_ah(struct ibv_ah *ah); +int ibv_cmd_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); +int ibv_cmd_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); + +int ibv_cmd_create_flow(struct ibv_qp *qp, + struct ibv_flow *flow_id, + struct ibv_flow_attr *flow_attr, + void *ucmd, + size_t ucmd_size); +int ibv_cmd_destroy_flow(struct ibv_flow *flow_id); +int ibv_cmd_create_wq(struct ibv_context *context, + struct ibv_wq_init_attr *wq_init_attr, + struct ibv_wq *wq, + struct ibv_create_wq *cmd, + size_t cmd_size, + struct ib_uverbs_ex_create_wq_resp *resp, + size_t resp_size); + +int ibv_cmd_destroy_flow_action(struct verbs_flow_action *action); +int ibv_cmd_modify_wq(struct ibv_wq *wq, struct ibv_wq_attr *attr, + struct ibv_modify_wq *cmd, size_t cmd_size); +int ibv_cmd_destroy_wq(struct ibv_wq *wq); +int ibv_cmd_create_rwq_ind_table(struct ibv_context *context, + struct ibv_rwq_ind_table_init_attr *init_attr, + struct ibv_rwq_ind_table *rwq_ind_table, + struct ib_uverbs_ex_create_rwq_ind_table_resp *resp, + size_t resp_size); +int ibv_cmd_destroy_rwq_ind_table(struct ibv_rwq_ind_table *rwq_ind_table); +int ibv_cmd_create_counters(struct ibv_context *context, + struct ibv_counters_init_attr *init_attr, + struct verbs_counters *vcounters, + struct ibv_command_buffer *link); +int ibv_cmd_destroy_counters(struct verbs_counters *vcounters); +int ibv_cmd_read_counters(struct verbs_counters *vcounters, + uint64_t *counters_value, + uint32_t ncounters, + uint32_t flags, + struct ibv_command_buffer *link); +int ibv_dontfork_range(void *base, size_t size); +int ibv_dofork_range(void *base, size_t size); +int ibv_cmd_alloc_dm(struct ibv_context *ctx, + const struct ibv_alloc_dm_attr *dm_attr, + struct verbs_dm *dm, + struct ibv_command_buffer *link); +int ibv_cmd_free_dm(struct verbs_dm *dm); +int ibv_cmd_reg_dm_mr(struct ibv_pd *pd, struct verbs_dm *dm, + uint64_t offset, size_t length, + unsigned int access, struct verbs_mr *vmr, + struct ibv_command_buffer *link); + +/* + * sysfs helper functions + */ +const char *ibv_get_sysfs_path(void); + +int ibv_read_sysfs_file(const char *dir, const char *file, + char *buf, size_t size); +int ibv_read_sysfs_file_at(int dirfd, const char *file, char *buf, size_t size); +int ibv_read_ibdev_sysfs_file(char *buf, size_t size, + struct verbs_sysfs_dev *sysfs_dev, + const char *fnfmt, ...) 
+ __attribute__((format(printf, 4, 5))); +int ibv_get_fw_ver(char *value, size_t len, struct verbs_sysfs_dev *sysfs_dev); + +static inline int verbs_get_srq_num(struct ibv_srq *srq, uint32_t *srq_num) +{ + struct verbs_srq *vsrq = container_of(srq, struct verbs_srq, srq); + if (vsrq->comp_mask & VERBS_SRQ_NUM) { + *srq_num = vsrq->srq_num; + return 0; + } + return EOPNOTSUPP; +} + +static inline bool check_comp_mask(uint64_t input, uint64_t supported) +{ + return (input & ~supported) == 0; +} + +int ibv_query_gid_type(struct ibv_context *context, uint8_t port_num, + unsigned int index, enum ibv_gid_type *type); + +static inline int +ibv_check_alloc_parent_domain(struct ibv_parent_domain_init_attr *attr) +{ + /* A valid protection domain must be set */ + if (!attr->pd) { + errno = EINVAL; + return -1; + } + + return 0; +} + +/* + * Initialize the ibv_pd which is being used as a parent_domain. From the + * perspective of the core code the new ibv_pd is completely interchangeable + * with the passed contained_pd. + */ +static inline void ibv_initialize_parent_domain(struct ibv_pd *parent_domain, + struct ibv_pd *contained_pd) +{ + parent_domain->context = contained_pd->context; + parent_domain->handle = contained_pd->handle; +} + +#endif /* INFINIBAND_DRIVER_H */ diff --git a/libibverbs/dummy_ops.c b/libibverbs/dummy_ops.c new file mode 100644 index 0000000..32fec71 --- /dev/null +++ b/libibverbs/dummy_ops.c @@ -0,0 +1,651 @@ +/* + * Copyright (c) 2017 Mellanox Technologies, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include <infiniband/driver.h> +#include "ibverbs.h" +#include <errno.h> + +static int advise_mr(struct ibv_pd *pd, + enum ibv_advise_mr_advice advice, + uint32_t flags, + struct ibv_sge *sg_list, + uint32_t num_sges) +{ + return EOPNOTSUPP; +} + +static struct ibv_dm *alloc_dm(struct ibv_context *context, + struct ibv_alloc_dm_attr *attr) +{ + errno = EOPNOTSUPP; + return NULL; +} + +static struct ibv_mw *alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type) +{ + errno = EOPNOTSUPP; + return NULL; +} + +static struct ibv_mr *alloc_null_mr(struct ibv_pd *pd) +{ + errno = EOPNOTSUPP; + return NULL; +} + +static struct ibv_pd * +alloc_parent_domain(struct ibv_context *context, + struct ibv_parent_domain_init_attr *attr) +{ + errno = EOPNOTSUPP; + return NULL; +} + +static struct ibv_pd *alloc_pd(struct ibv_context *context) +{ + errno = EOPNOTSUPP; + return NULL; +} + +static struct ibv_td *alloc_td(struct ibv_context *context, + struct ibv_td_init_attr *init_attr) +{ + errno = EOPNOTSUPP; + return NULL; +} + +static void async_event(struct ibv_context *context, + struct ibv_async_event *event) +{ +} + +static int attach_counters_point_flow(struct ibv_counters *counters, + struct ibv_counter_attach_attr *attr, + struct ibv_flow *flow) +{ + return EOPNOTSUPP; +} + +static int attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, + uint16_t lid) +{ + return EOPNOTSUPP; +} + +static int bind_mw(struct ibv_qp *qp, struct ibv_mw *mw, + struct ibv_mw_bind *mw_bind) +{ + return EOPNOTSUPP; +} + +static int close_xrcd(struct ibv_xrcd *xrcd) +{ + return EOPNOTSUPP; +} + +static void cq_event(struct ibv_cq *cq) +{ +} + +static struct ibv_ah *create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) +{ + errno = EOPNOTSUPP; + return NULL; +} + +static struct ibv_counters *create_counters(struct ibv_context *context, + struct ibv_counters_init_attr *init_attr) +{ + errno = EOPNOTSUPP; + return NULL; +} + +static struct ibv_cq *create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector) +{ + errno = EOPNOTSUPP; + return NULL; +} + +static struct ibv_cq_ex *create_cq_ex(struct ibv_context *context, + struct ibv_cq_init_attr_ex *init_attr) +{ + errno = EOPNOTSUPP; + return NULL; +} + +static struct ibv_flow *create_flow(struct ibv_qp *qp, + struct ibv_flow_attr *flow_attr) +{ + errno = EOPNOTSUPP; + return NULL; +} + +static struct ibv_flow_action *create_flow_action_esp(struct ibv_context *context, + struct ibv_flow_action_esp_attr *attr) +{ + errno = EOPNOTSUPP; + return NULL; +} + +static struct ibv_qp *create_qp(struct ibv_pd *pd, + struct ibv_qp_init_attr *attr) +{ + errno = EOPNOTSUPP; + return NULL; +} + +static struct ibv_qp *create_qp_ex(struct ibv_context *context, + struct ibv_qp_init_attr_ex *qp_init_attr_ex) +{ + errno = EOPNOTSUPP; + return NULL; +} + +static struct ibv_rwq_ind_table * +create_rwq_ind_table(struct ibv_context *context, + struct ibv_rwq_ind_table_init_attr *init_attr) +{ + errno = EOPNOTSUPP; + return NULL; +} + +static struct ibv_srq *create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *srq_init_attr) +{ + errno = EOPNOTSUPP; + return NULL; +} + +static struct ibv_srq * +create_srq_ex(struct ibv_context *context, + struct ibv_srq_init_attr_ex *srq_init_attr_ex) +{ + errno = EOPNOTSUPP; + return NULL; +} + +static struct ibv_wq *create_wq(struct ibv_context *context, + struct ibv_wq_init_attr *wq_init_attr) +{ + errno = EOPNOTSUPP; + return NULL; +} + +static int dealloc_mw(struct ibv_mw *mw) +{ + return EOPNOTSUPP; +} + 
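+/*
+ * Note on conventions: ops in this file that return int report
+ * EOPNOTSUPP as the return value, while ops that return a pointer set
+ * errno = EOPNOTSUPP and return NULL, matching the failure idioms
+ * callers of the corresponding verbs already use.
+ */
+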
+static int dealloc_pd(struct ibv_pd *pd) +{ + return EOPNOTSUPP; +} + +static int dealloc_td(struct ibv_td *td) +{ + return EOPNOTSUPP; +} + +static int dereg_mr(struct verbs_mr *vmr) +{ + return EOPNOTSUPP; +} + +static int destroy_ah(struct ibv_ah *ah) +{ + return EOPNOTSUPP; +} + +static int destroy_counters(struct ibv_counters *counters) +{ + return EOPNOTSUPP; +} + +static int destroy_cq(struct ibv_cq *cq) +{ + return EOPNOTSUPP; +} + +static int destroy_flow(struct ibv_flow *flow) +{ + return EOPNOTSUPP; +} + +static int destroy_flow_action(struct ibv_flow_action *action) +{ + return EOPNOTSUPP; +} + +static int destroy_qp(struct ibv_qp *qp) +{ + return EOPNOTSUPP; +} + +static int destroy_rwq_ind_table(struct ibv_rwq_ind_table *rwq_ind_table) +{ + return EOPNOTSUPP; +} + +static int destroy_srq(struct ibv_srq *srq) +{ + return EOPNOTSUPP; +} + +static int destroy_wq(struct ibv_wq *wq) +{ + return EOPNOTSUPP; +} + +static int detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, + uint16_t lid) +{ + return EOPNOTSUPP; +} + +static void free_context(struct ibv_context *ctx) +{ + return; +} + +static int free_dm(struct ibv_dm *dm) +{ + return EOPNOTSUPP; +} + +static int get_srq_num(struct ibv_srq *srq, uint32_t *srq_num) +{ + return EOPNOTSUPP; +} + +static int modify_cq(struct ibv_cq *cq, struct ibv_modify_cq_attr *attr) +{ + return EOPNOTSUPP; +} + +static int modify_flow_action_esp(struct ibv_flow_action *action, + struct ibv_flow_action_esp_attr *attr) +{ + return EOPNOTSUPP; +} + +static int modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask) +{ + return EOPNOTSUPP; +} + +static int modify_qp_rate_limit(struct ibv_qp *qp, + struct ibv_qp_rate_limit_attr *attr) +{ + return EOPNOTSUPP; +} + +static int modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr, + int srq_attr_mask) +{ + return EOPNOTSUPP; +} + +static int modify_wq(struct ibv_wq *wq, struct ibv_wq_attr *wq_attr) +{ + return EOPNOTSUPP; +} + +static struct ibv_qp *open_qp(struct ibv_context *context, + struct ibv_qp_open_attr *attr) +{ + errno = EOPNOTSUPP; + return NULL; +} + +static struct ibv_xrcd *open_xrcd(struct ibv_context *context, + struct ibv_xrcd_init_attr *xrcd_init_attr) +{ + errno = EOPNOTSUPP; + return NULL; +} + +static int poll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc) +{ + return EOPNOTSUPP; +} + +static int post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + return EOPNOTSUPP; +} + +static int post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr) +{ + return EOPNOTSUPP; +} + +static int post_srq_ops(struct ibv_srq *srq, struct ibv_ops_wr *op, + struct ibv_ops_wr **bad_op) +{ + return EOPNOTSUPP; +} + +static int post_srq_recv(struct ibv_srq *srq, struct ibv_recv_wr *recv_wr, + struct ibv_recv_wr **bad_recv_wr) +{ + return EOPNOTSUPP; +} + +static int query_device(struct ibv_context *context, + struct ibv_device_attr *device_attr) +{ + return EOPNOTSUPP; +} + +/* Provide a generic implementation for all providers that don't implement + * query_device_ex. 
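+ * The fallback rejects unknown comp_mask bits, requires the caller's
+ * buffer to be at least large enough for the classic attributes, then
+ * zeroes and fills attr->orig_attr by delegating to ibv_query_device().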
+ */ +static int query_device_ex(struct ibv_context *context, + const struct ibv_query_device_ex_input *input, + struct ibv_device_attr_ex *attr, size_t attr_size) +{ + if (input && input->comp_mask) + return EINVAL; + + if (attr_size < sizeof(attr->orig_attr)) + return EOPNOTSUPP; + + memset(&attr->orig_attr, 0, sizeof(attr->orig_attr)); + + return ibv_query_device(context, &attr->orig_attr); +} + +static int query_port(struct ibv_context *context, uint8_t port_num, + struct ibv_port_attr *port_attr) +{ + return EOPNOTSUPP; +} + +static int query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, + struct ibv_qp_init_attr *init_attr) +{ + return EOPNOTSUPP; +} + +static int query_rt_values(struct ibv_context *context, + struct ibv_values_ex *values) +{ + return EOPNOTSUPP; +} + +static int query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr) +{ + return EOPNOTSUPP; +} + +static int read_counters(struct ibv_counters *counters, + uint64_t *counters_value, + uint32_t ncounters, + uint32_t flags) +{ + return EOPNOTSUPP; +} + +static struct ibv_mr *reg_dm_mr(struct ibv_pd *pd, struct ibv_dm *dm, + uint64_t dm_offset, size_t length, + unsigned int access) +{ + errno = EOPNOTSUPP; + return NULL; +} + +static struct ibv_mr *reg_mr(struct ibv_pd *pd, void *addr, size_t length, + uint64_t hca_va, int access) +{ + errno = EOPNOTSUPP; + return NULL; +} + +static int req_notify_cq(struct ibv_cq *cq, int solicited_only) +{ + return EOPNOTSUPP; +} + +static int rereg_mr(struct verbs_mr *vmr, int flags, struct ibv_pd *pd, + void *addr, size_t length, int access) +{ + errno = EOPNOTSUPP; + return IBV_REREG_MR_ERR_INPUT; +} + +static int resize_cq(struct ibv_cq *cq, int cqe) +{ + return EOPNOTSUPP; +} + +/* + * Ops in verbs_dummy_ops simply return an EOPNOTSUPP error code when called, or + * do nothing. They are placed in the ops structures if the provider does not + * provide an op for the function. + * + * NOTE: This deliberately does not use named initializers to trigger a + * '-Wmissing-field-initializers' warning if the struct is changed without + * changing this. + * + * Keep sorted. + */ +const struct verbs_context_ops verbs_dummy_ops = { + advise_mr, + alloc_dm, + alloc_mw, + alloc_null_mr, + alloc_parent_domain, + alloc_pd, + alloc_td, + async_event, + attach_counters_point_flow, + attach_mcast, + bind_mw, + close_xrcd, + cq_event, + create_ah, + create_counters, + create_cq, + create_cq_ex, + create_flow, + create_flow_action_esp, + create_qp, + create_qp_ex, + create_rwq_ind_table, + create_srq, + create_srq_ex, + create_wq, + dealloc_mw, + dealloc_pd, + dealloc_td, + dereg_mr, + destroy_ah, + destroy_counters, + destroy_cq, + destroy_flow, + destroy_flow_action, + destroy_qp, + destroy_rwq_ind_table, + destroy_srq, + destroy_wq, + detach_mcast, + free_context, + free_dm, + get_srq_num, + modify_cq, + modify_flow_action_esp, + modify_qp, + modify_qp_rate_limit, + modify_srq, + modify_wq, + open_qp, + open_xrcd, + poll_cq, + post_recv, + post_send, + post_srq_ops, + post_srq_recv, + query_device, + query_device_ex, + query_port, + query_qp, + query_rt_values, + query_srq, + read_counters, + reg_dm_mr, + reg_mr, + req_notify_cq, + rereg_mr, + resize_cq, +}; + +/* + * Set the ops in a context. If the function pointer in op is NULL then it is + * not set. This allows the providers to call the function multiple times in + * order to have variations of the ops for different HW configurations. 
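+ *
+ * For example (a hypothetical provider flow), a provider may call
+ * verbs_set_ops() once with its common ops and then again with a struct
+ * holding only the data-path ops for the detected HW generation;
+ * members left NULL in the second call keep their earlier assignment.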
+ */ +void verbs_set_ops(struct verbs_context *vctx, + const struct verbs_context_ops *ops) +{ + struct verbs_ex_private *priv = vctx->priv; + struct ibv_context_ops *ctx = &vctx->context.ops; + + /* + * We retain the function pointer for now, just as 'just-in-case' ABI + * compatibility. If any ever get changed incompatibly they should be + * set to NULL instead. + */ +#define SET_PRIV_OP(ptr, name) \ + do { \ + if (ops->name) { \ + priv->ops.name = ops->name; \ + (ptr)->_compat_##name = (void *)ops->name; \ + } \ + } while (0) + + /* Same as SET_PRIV_OP but without the compatibility pointer */ +#define SET_PRIV_OP_IC(ptr, name) \ + do { \ + if (ops->name) \ + priv->ops.name = ops->name; \ + } while (0) + +#define SET_OP(ptr, name) \ + do { \ + if (ops->name) { \ + priv->ops.name = ops->name; \ + (ptr)->name = ops->name; \ + } \ + } while (0) + +#define SET_OP2(ptr, iname, name) \ + do { \ + if (ops->name) { \ + priv->ops.name = ops->name; \ + (ptr)->iname = ops->name; \ + } \ + } while (0) + + SET_OP(vctx, advise_mr); + SET_OP(vctx, alloc_dm); + SET_OP(ctx, alloc_mw); + SET_OP(vctx, alloc_null_mr); + SET_PRIV_OP(ctx, alloc_pd); + SET_OP(vctx, alloc_parent_domain); + SET_OP(vctx, alloc_td); + SET_OP(vctx, attach_counters_point_flow); + SET_OP(vctx, create_counters); + SET_PRIV_OP(ctx, async_event); + SET_PRIV_OP(ctx, attach_mcast); + SET_OP(ctx, bind_mw); + SET_OP(vctx, close_xrcd); + SET_PRIV_OP(ctx, cq_event); + SET_PRIV_OP(ctx, create_ah); + SET_PRIV_OP(ctx, create_cq); + SET_PRIV_OP_IC(vctx, create_cq_ex); + SET_OP2(vctx, ibv_create_flow, create_flow); + SET_OP(vctx, create_flow_action_esp); + SET_PRIV_OP(ctx, create_qp); + SET_OP(vctx, create_qp_ex); + SET_OP(vctx, create_rwq_ind_table); + SET_PRIV_OP(ctx, create_srq); + SET_OP(vctx, create_srq_ex); + SET_OP(vctx, create_wq); + SET_OP(ctx, dealloc_mw); + SET_PRIV_OP(ctx, dealloc_pd); + SET_OP(vctx, dealloc_td); + SET_OP(vctx, destroy_counters); + SET_PRIV_OP(ctx, dereg_mr); + SET_PRIV_OP(ctx, destroy_ah); + SET_PRIV_OP(ctx, destroy_cq); + SET_OP2(vctx, ibv_destroy_flow, destroy_flow); + SET_OP(vctx, destroy_flow_action); + SET_PRIV_OP(ctx, destroy_qp); + SET_OP(vctx, destroy_rwq_ind_table); + SET_PRIV_OP(ctx, destroy_srq); + SET_OP(vctx, destroy_wq); + SET_PRIV_OP(ctx, detach_mcast); + SET_PRIV_OP_IC(ctx, free_context); + SET_OP(vctx, free_dm); + SET_OP(vctx, get_srq_num); + SET_OP(vctx, modify_cq); + SET_OP(vctx, modify_flow_action_esp); + SET_PRIV_OP(ctx, modify_qp); + SET_OP(vctx, modify_qp_rate_limit); + SET_PRIV_OP(ctx, modify_srq); + SET_OP(vctx, modify_wq); + SET_OP(vctx, open_qp); + SET_OP(vctx, open_xrcd); + SET_OP(ctx, poll_cq); + SET_OP(ctx, post_recv); + SET_OP(ctx, post_send); + SET_OP(vctx, post_srq_ops); + SET_OP(ctx, post_srq_recv); + SET_PRIV_OP(ctx, query_device); + SET_OP(vctx, query_device_ex); + SET_PRIV_OP_IC(ctx, query_port); + SET_PRIV_OP(ctx, query_qp); + SET_OP(vctx, query_rt_values); + SET_OP(vctx, read_counters); + SET_PRIV_OP(ctx, query_srq); + SET_OP(vctx, reg_dm_mr); + SET_PRIV_OP(ctx, reg_mr); + SET_OP(ctx, req_notify_cq); + SET_PRIV_OP(ctx, rereg_mr); + SET_PRIV_OP(ctx, resize_cq); + +#undef SET_OP +#undef SET_OP2 +} diff --git a/libibverbs/dynamic_driver.c b/libibverbs/dynamic_driver.c new file mode 100644 index 0000000..7fa4233 --- /dev/null +++ b/libibverbs/dynamic_driver.c @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2018 Mellanox Technologies, Ltd. 
All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _STATIC_LIBRARY_BUILD_ +#define _GNU_SOURCE + +#include <dlfcn.h> +#include <stdio.h> +#include <dirent.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> + +#include <ccan/list.h> + +#include "ibverbs.h" + +struct ibv_driver_name { + struct list_node entry; + char *name; +}; + +static LIST_HEAD(driver_name_list); + +static void read_config_file(const char *path) +{ + FILE *conf; + char *line = NULL; + char *config; + char *field; + size_t buflen = 0; + ssize_t len; + + conf = fopen(path, "r" STREAM_CLOEXEC); + if (!conf) { + fprintf(stderr, PFX "Warning: couldn't read config file %s.\n", + path); + return; + } + + while ((len = getline(&line, &buflen, conf)) != -1) { + config = line + strspn(line, "\t "); + if (config[0] == '\n' || config[0] == '#') + continue; + + field = strsep(&config, "\n\t "); + + if (strcmp(field, "driver") == 0 && config != NULL) { + struct ibv_driver_name *driver_name; + + config += strspn(config, "\t "); + field = strsep(&config, "\n\t "); + + driver_name = malloc(sizeof(*driver_name)); + if (!driver_name) { + fprintf(stderr, + PFX + "Warning: couldn't allocate driver name '%s'.\n", + field); + continue; + } + + driver_name->name = strdup(field); + if (!driver_name->name) { + fprintf(stderr, + PFX + "Warning: couldn't allocate driver name '%s'.\n", + field); + free(driver_name); + continue; + } + + list_add(&driver_name_list, &driver_name->entry); + } else + fprintf(stderr, + PFX + "Warning: ignoring bad config directive '%s' in file '%s'.\n", + field, path); + } + + if (line) + free(line); + fclose(conf); +} + +static void read_config(void) +{ + DIR *conf_dir; + struct dirent *dent; + char *path; + + conf_dir = opendir(IBV_CONFIG_DIR); + if (!conf_dir) { + fprintf(stderr, + PFX "Warning: couldn't open config directory '%s'.\n", + IBV_CONFIG_DIR); + return; + } + + while ((dent = readdir(conf_dir))) { + struct stat buf; + + if (asprintf(&path, "%s/%s", IBV_CONFIG_DIR, dent->d_name) < + 0) { + fprintf(stderr, + PFX + "Warning: couldn't read config file %s/%s.\n", + IBV_CONFIG_DIR, dent->d_name); + goto out; + } + + if (stat(path, &buf)) { + fprintf(stderr, + PFX + "Warning: couldn't stat config file 
'%s'.\n", + path); + goto next; + } + + if (!S_ISREG(buf.st_mode)) + goto next; + + read_config_file(path); +next: + free(path); + } + +out: + closedir(conf_dir); +} + +static void load_driver(const char *name) +{ + char *so_name; + void *dlhandle; + + /* If the name is an absolute path then open that path after appending + * the trailer suffix + */ + if (name[0] == '/') { + if (asprintf(&so_name, "%s" VERBS_PROVIDER_SUFFIX, name) < 0) + goto out_asprintf; + dlhandle = dlopen(so_name, RTLD_NOW); + if (!dlhandle) + goto out_dlopen; + free(so_name); + return; + } + + /* If configured with a provider plugin path then try that next */ + if (sizeof(VERBS_PROVIDER_DIR) > 1) { + if (asprintf(&so_name, + VERBS_PROVIDER_DIR "/lib%s" VERBS_PROVIDER_SUFFIX, + name) < 0) + goto out_asprintf; + dlhandle = dlopen(so_name, RTLD_NOW); + free(so_name); + if (dlhandle) + return; + } + + /* Otherwise use the system library search path. This is the historical + * behavior of libibverbs + */ + if (asprintf(&so_name, "lib%s" VERBS_PROVIDER_SUFFIX, name) < 0) + goto out_asprintf; + dlhandle = dlopen(so_name, RTLD_NOW); + if (!dlhandle) + goto out_dlopen; + free(so_name); + return; + +out_asprintf: + fprintf(stderr, PFX "Warning: couldn't load driver '%s'.\n", name); + return; +out_dlopen: + fprintf(stderr, PFX "Warning: couldn't load driver '%s': %s\n", so_name, + dlerror()); + free(so_name); +} + +void load_drivers(void) +{ + struct ibv_driver_name *name, *next_name; + const char *env; + char *list, *env_name; + + read_config(); + + /* Only use drivers passed in through the calling user's environment + * if we're not running setuid. + */ + if (getuid() == geteuid()) { + if ((env = getenv("RDMAV_DRIVERS"))) { + list = strdupa(env); + while ((env_name = strsep(&list, ":;"))) + load_driver(env_name); + } else if ((env = getenv("IBV_DRIVERS"))) { + list = strdupa(env); + while ((env_name = strsep(&list, ":;"))) + load_driver(env_name); + } + } + + list_for_each_safe (&driver_name_list, name, next_name, entry) { + load_driver(name->name); + free(name->name); + free(name); + } +} +#endif diff --git a/libibverbs/enum_strs.c b/libibverbs/enum_strs.c new file mode 100644 index 0000000..3549f26 --- /dev/null +++ b/libibverbs/enum_strs.c @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2008 Lawrence Livermore National Laboratory + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <infiniband/verbs.h>
+
+const char *ibv_node_type_str(enum ibv_node_type node_type)
+{
+	static const char *const node_type_str[] = {
+		[IBV_NODE_CA] = "InfiniBand channel adapter",
+		[IBV_NODE_SWITCH] = "InfiniBand switch",
+		[IBV_NODE_ROUTER] = "InfiniBand router",
+		[IBV_NODE_RNIC] = "iWARP NIC",
+		[IBV_NODE_USNIC] = "usNIC",
+		[IBV_NODE_USNIC_UDP] = "usNIC UDP",
+		[IBV_NODE_UNSPECIFIED] = "unspecified",
+	};
+
+	if (node_type < IBV_NODE_CA || node_type > IBV_NODE_UNSPECIFIED)
+		return "unknown";
+
+	return node_type_str[node_type];
+}
+
+const char *ibv_port_state_str(enum ibv_port_state port_state)
+{
+	static const char *const port_state_str[] = {
+		[IBV_PORT_NOP] = "no state change (NOP)",
+		[IBV_PORT_DOWN] = "down",
+		[IBV_PORT_INIT] = "init",
+		[IBV_PORT_ARMED] = "armed",
+		[IBV_PORT_ACTIVE] = "active",
+		[IBV_PORT_ACTIVE_DEFER] = "active defer"
+	};
+
+	if (port_state < IBV_PORT_NOP || port_state > IBV_PORT_ACTIVE_DEFER)
+		return "unknown";
+
+	return port_state_str[port_state];
+}
+
+const char *ibv_event_type_str(enum ibv_event_type event)
+{
+	static const char *const event_type_str[] = {
+		[IBV_EVENT_CQ_ERR] = "CQ error",
+		[IBV_EVENT_QP_FATAL] = "local work queue catastrophic error",
+		[IBV_EVENT_QP_REQ_ERR] = "invalid request local work queue error",
+		[IBV_EVENT_QP_ACCESS_ERR] = "local access violation work queue error",
+		[IBV_EVENT_COMM_EST] = "communication established",
+		[IBV_EVENT_SQ_DRAINED] = "send queue drained",
+		[IBV_EVENT_PATH_MIG] = "path migrated",
+		[IBV_EVENT_PATH_MIG_ERR] = "path migration request error",
+		[IBV_EVENT_DEVICE_FATAL] = "local catastrophic error",
+		[IBV_EVENT_PORT_ACTIVE] = "port active",
+		[IBV_EVENT_PORT_ERR] = "port error",
+		[IBV_EVENT_LID_CHANGE] = "LID change",
+		[IBV_EVENT_PKEY_CHANGE] = "P_Key change",
+		[IBV_EVENT_SM_CHANGE] = "SM change",
+		[IBV_EVENT_SRQ_ERR] = "SRQ catastrophic error",
+		[IBV_EVENT_SRQ_LIMIT_REACHED] = "SRQ limit reached",
+		[IBV_EVENT_QP_LAST_WQE_REACHED] = "last WQE reached",
+		[IBV_EVENT_CLIENT_REREGISTER] = "client reregistration",
+		[IBV_EVENT_GID_CHANGE] = "GID table change",
+		[IBV_EVENT_WQ_FATAL] = "WQ fatal"
+	};
+
+	/* The table above ends at IBV_EVENT_WQ_FATAL, so bound the lookup
+	 * there; bounding at IBV_EVENT_GID_CHANGE would wrongly report
+	 * "unknown" for WQ fatal events.
+	 */
+	if (event < IBV_EVENT_CQ_ERR || event > IBV_EVENT_WQ_FATAL)
+		return "unknown";
+
+	return event_type_str[event];
+}
+
+const char *ibv_wc_status_str(enum ibv_wc_status status)
+{
+	static const char *const wc_status_str[] = {
+		[IBV_WC_SUCCESS] = "success",
+		[IBV_WC_LOC_LEN_ERR] = "local length error",
+		[IBV_WC_LOC_QP_OP_ERR] = "local QP operation error",
+		[IBV_WC_LOC_EEC_OP_ERR] = "local EE context operation error",
+		[IBV_WC_LOC_PROT_ERR] = "local protection error",
+		[IBV_WC_WR_FLUSH_ERR] = "Work Request Flushed Error",
+		[IBV_WC_MW_BIND_ERR] = "memory management operation error",
+		[IBV_WC_BAD_RESP_ERR] = "bad response error",
+		[IBV_WC_LOC_ACCESS_ERR] = "local access error",
+		[IBV_WC_REM_INV_REQ_ERR] = "remote invalid request error",
+		[IBV_WC_REM_ACCESS_ERR] = "remote access error",
+		[IBV_WC_REM_OP_ERR] = "remote operation error",
+		[IBV_WC_RETRY_EXC_ERR] = "transport retry counter exceeded",
+		[IBV_WC_RNR_RETRY_EXC_ERR] = "RNR retry counter exceeded",
+		[IBV_WC_LOC_RDD_VIOL_ERR] = "local RDD violation error",
+		[IBV_WC_REM_INV_RD_REQ_ERR] = "remote invalid RD request",
+
[IBV_WC_REM_ABORT_ERR] = "aborted error", + [IBV_WC_INV_EECN_ERR] = "invalid EE context number", + [IBV_WC_INV_EEC_STATE_ERR] = "invalid EE context state", + [IBV_WC_FATAL_ERR] = "fatal error", + [IBV_WC_RESP_TIMEOUT_ERR] = "response timeout error", + [IBV_WC_GENERAL_ERR] = "general error", + [IBV_WC_TM_ERR] = "TM error", + [IBV_WC_TM_RNDV_INCOMPLETE] = "TM software rendezvous", + }; + + if (status < IBV_WC_SUCCESS || status > IBV_WC_TM_RNDV_INCOMPLETE) + return "unknown"; + + return wc_status_str[status]; +} diff --git a/libibverbs/examples/CMakeLists.txt b/libibverbs/examples/CMakeLists.txt new file mode 100644 index 0000000..dc4c497 --- /dev/null +++ b/libibverbs/examples/CMakeLists.txt @@ -0,0 +1,28 @@ +# Shared example files +add_library(ibverbs_tools STATIC + pingpong.c + ) + +rdma_executable(ibv_asyncwatch asyncwatch.c) +target_link_libraries(ibv_asyncwatch LINK_PRIVATE ibverbs) + +rdma_executable(ibv_devices device_list.c) +target_link_libraries(ibv_devices LINK_PRIVATE ibverbs) + +rdma_executable(ibv_devinfo devinfo.c) +target_link_libraries(ibv_devinfo LINK_PRIVATE ibverbs) + +rdma_executable(ibv_rc_pingpong rc_pingpong.c) +target_link_libraries(ibv_rc_pingpong LINK_PRIVATE ibverbs ibverbs_tools) + +rdma_executable(ibv_srq_pingpong srq_pingpong.c) +target_link_libraries(ibv_srq_pingpong LINK_PRIVATE ibverbs ibverbs_tools) + +rdma_executable(ibv_uc_pingpong uc_pingpong.c) +target_link_libraries(ibv_uc_pingpong LINK_PRIVATE ibverbs ibverbs_tools) + +rdma_executable(ibv_ud_pingpong ud_pingpong.c) +target_link_libraries(ibv_ud_pingpong LINK_PRIVATE ibverbs ibverbs_tools) + +rdma_executable(ibv_xsrq_pingpong xsrq_pingpong.c) +target_link_libraries(ibv_xsrq_pingpong LINK_PRIVATE ibverbs ibverbs_tools) diff --git a/libibverbs/examples/asyncwatch.c b/libibverbs/examples/asyncwatch.c new file mode 100644 index 0000000..724796e --- /dev/null +++ b/libibverbs/examples/asyncwatch.c @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#define _GNU_SOURCE +#include <config.h> + +#include <stdio.h> +#include <endian.h> +#include <getopt.h> +#include <string.h> + +#include <util/compiler.h> +#include <infiniband/verbs.h> + +static const char *event_name_str(enum ibv_event_type event_type) +{ + switch (event_type) { + case IBV_EVENT_DEVICE_FATAL: + return "IBV_EVENT_DEVICE_FATAL"; + case IBV_EVENT_PORT_ACTIVE: + return "IBV_EVENT_PORT_ACTIVE"; + case IBV_EVENT_PORT_ERR: + return "IBV_EVENT_PORT_ERR"; + case IBV_EVENT_LID_CHANGE: + return "IBV_EVENT_LID_CHANGE"; + case IBV_EVENT_PKEY_CHANGE: + return "IBV_EVENT_PKEY_CHANGE"; + case IBV_EVENT_SM_CHANGE: + return "IBV_EVENT_SM_CHANGE"; + case IBV_EVENT_CLIENT_REREGISTER: + return "IBV_EVENT_CLIENT_REREGISTER"; + case IBV_EVENT_GID_CHANGE: + return "IBV_EVENT_GID_CHANGE"; + + case IBV_EVENT_CQ_ERR: + case IBV_EVENT_QP_FATAL: + case IBV_EVENT_QP_REQ_ERR: + case IBV_EVENT_QP_ACCESS_ERR: + case IBV_EVENT_COMM_EST: + case IBV_EVENT_SQ_DRAINED: + case IBV_EVENT_PATH_MIG: + case IBV_EVENT_PATH_MIG_ERR: + case IBV_EVENT_SRQ_ERR: + case IBV_EVENT_SRQ_LIMIT_REACHED: + case IBV_EVENT_QP_LAST_WQE_REACHED: + default: + return "unexpected"; + } +} + +static void usage(const char *argv0) +{ + printf("Usage:\n"); + printf(" %s start an asyncwatch process\n", argv0); + printf("\n"); + printf("Options:\n"); + printf(" -d, --ib-dev=<dev> use IB device <dev> (default first device found)\n"); + printf(" -h, --help print a help text and exit\n"); +} + +int main(int argc, char *argv[]) +{ + struct ibv_device **dev_list; + struct ibv_context *context; + struct ibv_async_event event; + char *ib_devname = NULL; + int i = 0; + + /* Force line-buffering in case stdout is redirected */ + setvbuf(stdout, NULL, _IOLBF, 0); + + while (1) { + int ret = 1; + int c; + static struct option long_options[] = { + { .name = "ib-dev", .has_arg = 1, .val = 'd' }, + { .name = "help", .has_arg = 0, .val = 'h' }, + {} + }; + + c = getopt_long(argc, argv, "d:h", long_options, NULL); + if (c == -1) + break; + switch (c) { + case 'd': + ib_devname = strdupa(optarg); + break; + case 'h': + ret = 0; + SWITCH_FALLTHROUGH; + default: + usage(argv[0]); + return ret; + } + } + dev_list = ibv_get_device_list(NULL); + if (!dev_list) { + perror("Failed to get IB devices list"); + return 1; + } + if (ib_devname) { + for (; dev_list[i]; ++i) { + if (!strcmp(ibv_get_device_name(dev_list[i]), ib_devname)) + break; + } + } + + if (!dev_list[i]) { + fprintf(stderr, "IB device %s not found\n", + ib_devname ? ib_devname : ""); + return 1; + } + + context = ibv_open_device(dev_list[i]); + if (!context) { + fprintf(stderr, "Couldn't get context for %s\n", + ibv_get_device_name(dev_list[i])); + return 1; + } + + printf("%s: async event FD %d\n", + ibv_get_device_name(dev_list[i]), context->async_fd); + + while (1) { + if (ibv_get_async_event(context, &event)) + return 1; + + printf(" event_type %s (%d), port %d\n", + event_name_str(event.event_type), + event.event_type, event.element.port_num); + + ibv_ack_async_event(&event); + } + + return 0; +} diff --git a/libibverbs/examples/device_list.c b/libibverbs/examples/device_list.c new file mode 100644 index 0000000..f5602ca --- /dev/null +++ b/libibverbs/examples/device_list.c @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <config.h> + +#include <stdio.h> + +#include <endian.h> + +#include <infiniband/verbs.h> + +int main(int argc, char *argv[]) +{ + struct ibv_device **dev_list; + int num_devices, i; + + dev_list = ibv_get_device_list(&num_devices); + if (!dev_list) { + perror("Failed to get IB devices list"); + return 1; + } + + printf(" %-16s\t node GUID\n", "device"); + printf(" %-16s\t----------------\n", "------"); + + for (i = 0; i < num_devices; ++i) { + printf(" %-16s\t%016llx\n", + ibv_get_device_name(dev_list[i]), + (unsigned long long) be64toh(ibv_get_device_guid(dev_list[i]))); + } + + ibv_free_device_list(dev_list); + + return 0; +} diff --git a/libibverbs/examples/devinfo.c b/libibverbs/examples/devinfo.c new file mode 100644 index 0000000..f10eb2d --- /dev/null +++ b/libibverbs/examples/devinfo.c @@ -0,0 +1,783 @@ +/* + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <config.h> + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <getopt.h> +#include <endian.h> +#include <inttypes.h> +#include <arpa/inet.h> + +#include <infiniband/verbs.h> +#include <infiniband/driver.h> + +static int verbose; + +static int null_gid(union ibv_gid *gid) +{ + return !(gid->raw[8] | gid->raw[9] | gid->raw[10] | gid->raw[11] | + gid->raw[12] | gid->raw[13] | gid->raw[14] | gid->raw[15]); +} + +static const char *guid_str(__be64 _node_guid, char *str) +{ + uint64_t node_guid = be64toh(_node_guid); + sprintf(str, "%04x:%04x:%04x:%04x", + (unsigned) (node_guid >> 48) & 0xffff, + (unsigned) (node_guid >> 32) & 0xffff, + (unsigned) (node_guid >> 16) & 0xffff, + (unsigned) (node_guid >> 0) & 0xffff); + return str; +} + +static const char *transport_str(enum ibv_transport_type transport) +{ + switch (transport) { + case IBV_TRANSPORT_IB: return "InfiniBand"; + case IBV_TRANSPORT_IWARP: return "iWARP"; + case IBV_TRANSPORT_USNIC: return "usNIC"; + case IBV_TRANSPORT_USNIC_UDP: return "usNIC UDP"; + case IBV_TRANSPORT_UNSPECIFIED: return "unspecified"; + default: return "invalid transport"; + } +} + +static const char *port_state_str(enum ibv_port_state pstate) +{ + switch (pstate) { + case IBV_PORT_DOWN: return "PORT_DOWN"; + case IBV_PORT_INIT: return "PORT_INIT"; + case IBV_PORT_ARMED: return "PORT_ARMED"; + case IBV_PORT_ACTIVE: return "PORT_ACTIVE"; + default: return "invalid state"; + } +} + +static const char *port_phy_state_str(uint8_t phys_state) +{ + switch (phys_state) { + case 1: return "SLEEP"; + case 2: return "POLLING"; + case 3: return "DISABLED"; + case 4: return "PORT_CONFIGURATION TRAINNING"; + case 5: return "LINK_UP"; + case 6: return "LINK_ERROR_RECOVERY"; + case 7: return "PHY TEST"; + default: return "invalid physical state"; + } +} + +static const char *atomic_cap_str(enum ibv_atomic_cap atom_cap) +{ + switch (atom_cap) { + case IBV_ATOMIC_NONE: return "ATOMIC_NONE"; + case IBV_ATOMIC_HCA: return "ATOMIC_HCA"; + case IBV_ATOMIC_GLOB: return "ATOMIC_GLOB"; + default: return "invalid atomic capability"; + } +} + +static const char *mtu_str(enum ibv_mtu max_mtu) +{ + switch (max_mtu) { + case IBV_MTU_256: return "256"; + case IBV_MTU_512: return "512"; + case IBV_MTU_1024: return "1024"; + case IBV_MTU_2048: return "2048"; + case IBV_MTU_4096: return "4096"; + default: return "invalid MTU"; + } +} + +static const char *width_str(uint8_t width) +{ + switch (width) { + case 1: return "1"; + case 2: return "4"; + case 4: return "8"; + case 8: return "12"; + case 16: return "2"; + default: return "invalid width"; + } +} + +static const char *speed_str(uint8_t speed) +{ + switch (speed) { + case 1: return "2.5 Gbps"; + case 2: return "5.0 Gbps"; + + case 4: /* fall through */ + case 8: return "10.0 Gbps"; + + case 16: return "14.0 Gbps"; + case 32: return "25.0 Gbps"; + case 64: return "50.0 Gbps"; + default: return "invalid speed"; + } +} + +static const char *vl_str(uint8_t vl_num) +{ + switch (vl_num) { + case 1: return "1"; + case 2: return "2"; + case 3: return "4"; + case 4: return "8"; + case 5: return "15"; + default: return "invalid value"; + } +} + +#define DEVINFO_INVALID_GID_TYPE 2 +static const char *gid_type_str(enum ibv_gid_type type) +{ + switch (type) { + case IBV_GID_TYPE_IB_ROCE_V1: return "RoCE v1"; + case IBV_GID_TYPE_ROCE_V2: return "RoCE v2"; + default: return "Invalid gid type"; + } +} + +static void print_formated_gid(union ibv_gid *gid, int i, + enum ibv_gid_type type, int ll) +{ + 
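	/* A RoCE v1 GID is shown as its raw 16 bytes; a RoCE v2 GID
+	 * encodes an IPv6 address, so inet_ntop() renders it in the
+	 * usual presentation form. The GID type suffix is printed
+	 * only for an Ethernet link layer. */
+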
char gid_str[INET6_ADDRSTRLEN] = {}; + char str[20] = {}; + + if (ll == IBV_LINK_LAYER_ETHERNET) + sprintf(str, ", %s", gid_type_str(type)); + + if (type == IBV_GID_TYPE_IB_ROCE_V1) + printf("\t\t\tGID[%3d]:\t\t%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x%s\n", + i, gid->raw[0], gid->raw[1], gid->raw[2], + gid->raw[3], gid->raw[4], gid->raw[5], gid->raw[6], + gid->raw[7], gid->raw[8], gid->raw[9], gid->raw[10], + gid->raw[11], gid->raw[12], gid->raw[13], gid->raw[14], + gid->raw[15], str); + + if (type == IBV_GID_TYPE_ROCE_V2) { + inet_ntop(AF_INET6, gid->raw, gid_str, sizeof(gid_str)); + printf("\t\t\tGID[%3d]:\t\t%s%s\n", i, gid_str, str); + } +} + +static int print_all_port_gids(struct ibv_context *ctx, + struct ibv_port_attr *port_attr, + uint8_t port_num) +{ + enum ibv_gid_type type; + union ibv_gid gid; + int tbl_len; + int rc = 0; + int i; + + tbl_len = port_attr->gid_tbl_len; + for (i = 0; i < tbl_len; i++) { + rc = ibv_query_gid(ctx, port_num, i, &gid); + if (rc) { + fprintf(stderr, "Failed to query gid to port %d, index %d\n", + port_num, i); + return rc; + } + + rc = ibv_query_gid_type(ctx, port_num, i, &type); + if (rc) { + rc = 0; + type = DEVINFO_INVALID_GID_TYPE; + } + if (!null_gid(&gid)) + print_formated_gid(&gid, i, type, + port_attr->link_layer); + } + return rc; +} + +static const char *link_layer_str(uint8_t link_layer) +{ + switch (link_layer) { + case IBV_LINK_LAYER_UNSPECIFIED: + case IBV_LINK_LAYER_INFINIBAND: + return "InfiniBand"; + case IBV_LINK_LAYER_ETHERNET: + return "Ethernet"; + default: + return "Unknown"; + } +} + +static void print_device_cap_flags(uint32_t dev_cap_flags) +{ + uint32_t unknown_flags = ~(IBV_DEVICE_RESIZE_MAX_WR | + IBV_DEVICE_BAD_PKEY_CNTR | + IBV_DEVICE_BAD_QKEY_CNTR | + IBV_DEVICE_RAW_MULTI | + IBV_DEVICE_AUTO_PATH_MIG | + IBV_DEVICE_CHANGE_PHY_PORT | + IBV_DEVICE_UD_AV_PORT_ENFORCE | + IBV_DEVICE_CURR_QP_STATE_MOD | + IBV_DEVICE_SHUTDOWN_PORT | + IBV_DEVICE_INIT_TYPE | + IBV_DEVICE_PORT_ACTIVE_EVENT | + IBV_DEVICE_SYS_IMAGE_GUID | + IBV_DEVICE_RC_RNR_NAK_GEN | + IBV_DEVICE_SRQ_RESIZE | + IBV_DEVICE_N_NOTIFY_CQ | + IBV_DEVICE_MEM_WINDOW | + IBV_DEVICE_UD_IP_CSUM | + IBV_DEVICE_XRC | + IBV_DEVICE_MEM_MGT_EXTENSIONS | + IBV_DEVICE_MEM_WINDOW_TYPE_2A | + IBV_DEVICE_MEM_WINDOW_TYPE_2B | + IBV_DEVICE_RC_IP_CSUM | + IBV_DEVICE_RAW_IP_CSUM | + IBV_DEVICE_MANAGED_FLOW_STEERING); + + if (dev_cap_flags & IBV_DEVICE_RESIZE_MAX_WR) + printf("\t\t\t\t\tRESIZE_MAX_WR\n"); + if (dev_cap_flags & IBV_DEVICE_BAD_PKEY_CNTR) + printf("\t\t\t\t\tBAD_PKEY_CNTR\n"); + if (dev_cap_flags & IBV_DEVICE_BAD_QKEY_CNTR) + printf("\t\t\t\t\tBAD_QKEY_CNTR\n"); + if (dev_cap_flags & IBV_DEVICE_RAW_MULTI) + printf("\t\t\t\t\tRAW_MULTI\n"); + if (dev_cap_flags & IBV_DEVICE_AUTO_PATH_MIG) + printf("\t\t\t\t\tAUTO_PATH_MIG\n"); + if (dev_cap_flags & IBV_DEVICE_CHANGE_PHY_PORT) + printf("\t\t\t\t\tCHANGE_PHY_PORT\n"); + if (dev_cap_flags & IBV_DEVICE_UD_AV_PORT_ENFORCE) + printf("\t\t\t\t\tUD_AV_PORT_ENFORCE\n"); + if (dev_cap_flags & IBV_DEVICE_CURR_QP_STATE_MOD) + printf("\t\t\t\t\tCURR_QP_STATE_MOD\n"); + if (dev_cap_flags & IBV_DEVICE_SHUTDOWN_PORT) + printf("\t\t\t\t\tSHUTDOWN_PORT\n"); + if (dev_cap_flags & IBV_DEVICE_INIT_TYPE) + printf("\t\t\t\t\tINIT_TYPE\n"); + if (dev_cap_flags & IBV_DEVICE_PORT_ACTIVE_EVENT) + printf("\t\t\t\t\tPORT_ACTIVE_EVENT\n"); + if (dev_cap_flags & IBV_DEVICE_SYS_IMAGE_GUID) + printf("\t\t\t\t\tSYS_IMAGE_GUID\n"); + if (dev_cap_flags & IBV_DEVICE_RC_RNR_NAK_GEN) + printf("\t\t\t\t\tRC_RNR_NAK_GEN\n"); + if 
(dev_cap_flags & IBV_DEVICE_SRQ_RESIZE) + printf("\t\t\t\t\tSRQ_RESIZE\n"); + if (dev_cap_flags & IBV_DEVICE_N_NOTIFY_CQ) + printf("\t\t\t\t\tN_NOTIFY_CQ\n"); + if (dev_cap_flags & IBV_DEVICE_MEM_WINDOW) + printf("\t\t\t\t\tMEM_WINDOW\n"); + if (dev_cap_flags & IBV_DEVICE_UD_IP_CSUM) + printf("\t\t\t\t\tUD_IP_CSUM\n"); + if (dev_cap_flags & IBV_DEVICE_XRC) + printf("\t\t\t\t\tXRC\n"); + if (dev_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) + printf("\t\t\t\t\tMEM_MGT_EXTENSIONS\n"); + if (dev_cap_flags & IBV_DEVICE_MEM_WINDOW_TYPE_2A) + printf("\t\t\t\t\tMEM_WINDOW_TYPE_2A\n"); + if (dev_cap_flags & IBV_DEVICE_MEM_WINDOW_TYPE_2B) + printf("\t\t\t\t\tMEM_WINDOW_TYPE_2B\n"); + if (dev_cap_flags & IBV_DEVICE_RC_IP_CSUM) + printf("\t\t\t\t\tRC_IP_CSUM\n"); + if (dev_cap_flags & IBV_DEVICE_RAW_IP_CSUM) + printf("\t\t\t\t\tRAW_IP_CSUM\n"); + if (dev_cap_flags & IBV_DEVICE_MANAGED_FLOW_STEERING) + printf("\t\t\t\t\tMANAGED_FLOW_STEERING\n"); + if (dev_cap_flags & unknown_flags) + printf("\t\t\t\t\tUnknown flags: 0x%" PRIX32 "\n", + dev_cap_flags & unknown_flags); +} + +static void print_odp_trans_caps(uint32_t trans) +{ + uint32_t unknown_transport_caps = ~(IBV_ODP_SUPPORT_SEND | + IBV_ODP_SUPPORT_RECV | + IBV_ODP_SUPPORT_WRITE | + IBV_ODP_SUPPORT_READ | + IBV_ODP_SUPPORT_ATOMIC | + IBV_ODP_SUPPORT_SRQ_RECV); + + if (!trans) { + printf("\t\t\t\t\tNO SUPPORT\n"); + } else { + if (trans & IBV_ODP_SUPPORT_SEND) + printf("\t\t\t\t\tSUPPORT_SEND\n"); + if (trans & IBV_ODP_SUPPORT_RECV) + printf("\t\t\t\t\tSUPPORT_RECV\n"); + if (trans & IBV_ODP_SUPPORT_WRITE) + printf("\t\t\t\t\tSUPPORT_WRITE\n"); + if (trans & IBV_ODP_SUPPORT_READ) + printf("\t\t\t\t\tSUPPORT_READ\n"); + if (trans & IBV_ODP_SUPPORT_ATOMIC) + printf("\t\t\t\t\tSUPPORT_ATOMIC\n"); + if (trans & IBV_ODP_SUPPORT_SRQ_RECV) + printf("\t\t\t\t\tSUPPORT_SRQ\n"); + if (trans & unknown_transport_caps) + printf("\t\t\t\t\tUnknown flags: 0x%" PRIX32 "\n", + trans & unknown_transport_caps); + } +} + +static void print_odp_caps(const struct ibv_device_attr_ex *device_attr) +{ + uint64_t unknown_general_caps = ~(IBV_ODP_SUPPORT | + IBV_ODP_SUPPORT_IMPLICIT); + const struct ibv_odp_caps *caps = &device_attr->odp_caps; + + /* general odp caps */ + printf("\tgeneral_odp_caps:\n"); + if (caps->general_caps & IBV_ODP_SUPPORT) + printf("\t\t\t\t\tODP_SUPPORT\n"); + if (caps->general_caps & IBV_ODP_SUPPORT_IMPLICIT) + printf("\t\t\t\t\tODP_SUPPORT_IMPLICIT\n"); + if (caps->general_caps & unknown_general_caps) + printf("\t\t\t\t\tUnknown flags: 0x%" PRIX64 "\n", + caps->general_caps & unknown_general_caps); + + /* RC transport */ + printf("\trc_odp_caps:\n"); + print_odp_trans_caps(caps->per_transport_caps.rc_odp_caps); + printf("\tuc_odp_caps:\n"); + print_odp_trans_caps(caps->per_transport_caps.uc_odp_caps); + printf("\tud_odp_caps:\n"); + print_odp_trans_caps(caps->per_transport_caps.ud_odp_caps); + printf("\txrc_odp_caps:\n"); + print_odp_trans_caps(device_attr->xrc_odp_caps); +} + +static void print_device_cap_flags_ex(uint64_t device_cap_flags_ex) +{ + uint64_t ex_flags = device_cap_flags_ex & 0xffffffff00000000ULL; + uint64_t unknown_flags = ~(IBV_DEVICE_RAW_SCATTER_FCS | + IBV_DEVICE_PCI_WRITE_END_PADDING); + + if (ex_flags & IBV_DEVICE_RAW_SCATTER_FCS) + printf("\t\t\t\t\tRAW_SCATTER_FCS\n"); + if (ex_flags & IBV_DEVICE_PCI_WRITE_END_PADDING) + printf("\t\t\t\t\tPCI_WRITE_END_PADDING\n"); + if (ex_flags & unknown_flags) + printf("\t\t\t\t\tUnknown flags: 0x%" PRIX64 "\n", + ex_flags & unknown_flags); +} + +static void print_tm_caps(const struct 
ibv_tm_caps *caps) +{ + if (caps->max_num_tags) { + printf("\tmax_rndv_hdr_size:\t\t%u\n", + caps->max_rndv_hdr_size); + printf("\tmax_num_tags:\t\t\t%u\n", caps->max_num_tags); + printf("\tmax_ops:\t\t\t%u\n", caps->max_ops); + printf("\tmax_sge:\t\t\t%u\n", caps->max_sge); + printf("\tflags:\n"); + if (caps->flags & IBV_TM_CAP_RC) + printf("\t\t\t\t\tIBV_TM_CAP_RC\n"); + } else { + printf("\ttag matching not supported\n"); + } +} + +static void print_tso_caps(const struct ibv_tso_caps *caps) +{ + uint32_t unknown_general_caps = ~(1 << IBV_QPT_RAW_PACKET | + 1 << IBV_QPT_UD); + printf("\ttso_caps:\n"); + printf("\tmax_tso:\t\t\t%d\n", caps->max_tso); + + if (caps->max_tso) { + printf("\tsupported_qp:\n"); + if (ibv_is_qpt_supported(caps->supported_qpts, IBV_QPT_RAW_PACKET)) + printf("\t\t\t\t\tSUPPORT_RAW_PACKET\n"); + if (ibv_is_qpt_supported(caps->supported_qpts, IBV_QPT_UD)) + printf("\t\t\t\t\tSUPPORT_UD\n"); + if (caps->supported_qpts & unknown_general_caps) + printf("\t\t\t\t\tUnknown flags: 0x%" PRIX32 "\n", + caps->supported_qpts & unknown_general_caps); + } +} + +static void print_rss_caps(const struct ibv_rss_caps *caps) +{ + uint32_t unknown_general_caps = ~(1 << IBV_QPT_RAW_PACKET | + 1 << IBV_QPT_UD); + printf("\trss_caps:\n"); + printf("\t\tmax_rwq_indirection_tables:\t\t\t%u\n", caps->max_rwq_indirection_tables); + printf("\t\tmax_rwq_indirection_table_size:\t\t\t%u\n", caps->max_rwq_indirection_table_size); + printf("\t\trx_hash_function:\t\t\t\t0x%x\n", caps->rx_hash_function); + printf("\t\trx_hash_fields_mask:\t\t\t\t0x%" PRIX64 "\n", caps->rx_hash_fields_mask); + + if (caps->supported_qpts) { + printf("\t\tsupported_qp:\n"); + if (ibv_is_qpt_supported(caps->supported_qpts, IBV_QPT_RAW_PACKET)) + printf("\t\t\t\t\tSUPPORT_RAW_PACKET\n"); + if (ibv_is_qpt_supported(caps->supported_qpts, IBV_QPT_UD)) + printf("\t\t\t\t\tSUPPORT_UD\n"); + if (caps->supported_qpts & unknown_general_caps) + printf("\t\t\t\t\tUnknown flags: 0x%" PRIX32 "\n", + caps->supported_qpts & unknown_general_caps); + } +} + +static void print_cq_moderation_caps(const struct ibv_cq_moderation_caps *cq_caps) +{ + if (!cq_caps->max_cq_count || !cq_caps->max_cq_period) + return; + + printf("\n\tcq moderation caps:\n"); + printf("\t\tmax_cq_count:\t%u\n", cq_caps->max_cq_count); + printf("\t\tmax_cq_period:\t%u us\n\n", cq_caps->max_cq_period); +} + +static void print_packet_pacing_caps(const struct ibv_packet_pacing_caps *caps) +{ + uint32_t unknown_general_caps = ~(1 << IBV_QPT_RAW_PACKET | + 1 << IBV_QPT_UD); + printf("\tpacket_pacing_caps:\n"); + printf("\t\tqp_rate_limit_min:\t%ukbps\n", caps->qp_rate_limit_min); + printf("\t\tqp_rate_limit_max:\t%ukbps\n", caps->qp_rate_limit_max); + + if (caps->qp_rate_limit_max) { + printf("\t\tsupported_qp:\n"); + if (ibv_is_qpt_supported(caps->supported_qpts, IBV_QPT_RAW_PACKET)) + printf("\t\t\t\t\tSUPPORT_RAW_PACKET\n"); + if (ibv_is_qpt_supported(caps->supported_qpts, IBV_QPT_UD)) + printf("\t\t\t\t\tSUPPORT_UD\n"); + if (caps->supported_qpts & unknown_general_caps) + printf("\t\t\t\t\tUnknown flags: 0x%" PRIX32 "\n", + caps->supported_qpts & unknown_general_caps); + } +} + +static void print_raw_packet_caps(uint32_t raw_packet_caps) +{ + printf("\traw packet caps:\n"); + if (raw_packet_caps & IBV_RAW_PACKET_CAP_CVLAN_STRIPPING) + printf("\t\t\t\t\tC-VLAN stripping offload\n"); + if (raw_packet_caps & IBV_RAW_PACKET_CAP_SCATTER_FCS) + printf("\t\t\t\t\tScatter FCS offload\n"); + if (raw_packet_caps & IBV_RAW_PACKET_CAP_IP_CSUM) + printf("\t\t\t\t\tIP csum 
offload\n"); + if (raw_packet_caps & IBV_RAW_PACKET_CAP_DELAY_DROP) + printf("\t\t\t\t\tDelay drop\n"); +} + +static int print_hca_cap(struct ibv_device *ib_dev, uint8_t ib_port) +{ + struct ibv_context *ctx; + struct ibv_device_attr_ex device_attr; + struct ibv_port_attr port_attr; + int rc = 0; + uint8_t port; + char buf[256]; + + ctx = ibv_open_device(ib_dev); + if (!ctx) { + fprintf(stderr, "Failed to open device\n"); + rc = 1; + goto cleanup; + } + if (ibv_query_device_ex(ctx, NULL, &device_attr)) { + fprintf(stderr, "Failed to query device props\n"); + rc = 2; + goto cleanup; + } + if (ib_port && ib_port > device_attr.orig_attr.phys_port_cnt) { + fprintf(stderr, "Invalid port requested for device\n"); + /* rc = 3 is taken by failure to clean up */ + rc = 4; + goto cleanup; + } + + printf("hca_id:\t%s\n", ibv_get_device_name(ib_dev)); + printf("\ttransport:\t\t\t%s (%d)\n", + transport_str(ib_dev->transport_type), ib_dev->transport_type); + if (strlen(device_attr.orig_attr.fw_ver)) + printf("\tfw_ver:\t\t\t\t%s\n", device_attr.orig_attr.fw_ver); + printf("\tnode_guid:\t\t\t%s\n", guid_str(device_attr.orig_attr.node_guid, buf)); + printf("\tsys_image_guid:\t\t\t%s\n", guid_str(device_attr.orig_attr.sys_image_guid, buf)); + printf("\tvendor_id:\t\t\t0x%04x\n", device_attr.orig_attr.vendor_id); + printf("\tvendor_part_id:\t\t\t%d\n", device_attr.orig_attr.vendor_part_id); + printf("\thw_ver:\t\t\t\t0x%X\n", device_attr.orig_attr.hw_ver); + + if (ibv_read_sysfs_file(ib_dev->ibdev_path, "board_id", buf, sizeof buf) > 0) + printf("\tboard_id:\t\t\t%s\n", buf); + + printf("\tphys_port_cnt:\t\t\t%d\n", device_attr.orig_attr.phys_port_cnt); + + if (verbose) { + printf("\tmax_mr_size:\t\t\t0x%llx\n", + (unsigned long long) device_attr.orig_attr.max_mr_size); + printf("\tpage_size_cap:\t\t\t0x%llx\n", + (unsigned long long) device_attr.orig_attr.page_size_cap); + printf("\tmax_qp:\t\t\t\t%d\n", device_attr.orig_attr.max_qp); + printf("\tmax_qp_wr:\t\t\t%d\n", device_attr.orig_attr.max_qp_wr); + printf("\tdevice_cap_flags:\t\t0x%08x\n", device_attr.orig_attr.device_cap_flags); + print_device_cap_flags(device_attr.orig_attr.device_cap_flags); + printf("\tmax_sge:\t\t\t%d\n", device_attr.orig_attr.max_sge); + printf("\tmax_sge_rd:\t\t\t%d\n", device_attr.orig_attr.max_sge_rd); + printf("\tmax_cq:\t\t\t\t%d\n", device_attr.orig_attr.max_cq); + printf("\tmax_cqe:\t\t\t%d\n", device_attr.orig_attr.max_cqe); + printf("\tmax_mr:\t\t\t\t%d\n", device_attr.orig_attr.max_mr); + printf("\tmax_pd:\t\t\t\t%d\n", device_attr.orig_attr.max_pd); + printf("\tmax_qp_rd_atom:\t\t\t%d\n", device_attr.orig_attr.max_qp_rd_atom); + printf("\tmax_ee_rd_atom:\t\t\t%d\n", device_attr.orig_attr.max_ee_rd_atom); + printf("\tmax_res_rd_atom:\t\t%d\n", device_attr.orig_attr.max_res_rd_atom); + printf("\tmax_qp_init_rd_atom:\t\t%d\n", device_attr.orig_attr.max_qp_init_rd_atom); + printf("\tmax_ee_init_rd_atom:\t\t%d\n", device_attr.orig_attr.max_ee_init_rd_atom); + printf("\tatomic_cap:\t\t\t%s (%d)\n", + atomic_cap_str(device_attr.orig_attr.atomic_cap), device_attr.orig_attr.atomic_cap); + printf("\tmax_ee:\t\t\t\t%d\n", device_attr.orig_attr.max_ee); + printf("\tmax_rdd:\t\t\t%d\n", device_attr.orig_attr.max_rdd); + printf("\tmax_mw:\t\t\t\t%d\n", device_attr.orig_attr.max_mw); + printf("\tmax_raw_ipv6_qp:\t\t%d\n", device_attr.orig_attr.max_raw_ipv6_qp); + printf("\tmax_raw_ethy_qp:\t\t%d\n", device_attr.orig_attr.max_raw_ethy_qp); + printf("\tmax_mcast_grp:\t\t\t%d\n", device_attr.orig_attr.max_mcast_grp); + 
printf("\tmax_mcast_qp_attach:\t\t%d\n", device_attr.orig_attr.max_mcast_qp_attach); + printf("\tmax_total_mcast_qp_attach:\t%d\n", + device_attr.orig_attr.max_total_mcast_qp_attach); + printf("\tmax_ah:\t\t\t\t%d\n", device_attr.orig_attr.max_ah); + printf("\tmax_fmr:\t\t\t%d\n", device_attr.orig_attr.max_fmr); + if (device_attr.orig_attr.max_fmr) + printf("\tmax_map_per_fmr:\t\t%d\n", device_attr.orig_attr.max_map_per_fmr); + printf("\tmax_srq:\t\t\t%d\n", device_attr.orig_attr.max_srq); + if (device_attr.orig_attr.max_srq) { + printf("\tmax_srq_wr:\t\t\t%d\n", device_attr.orig_attr.max_srq_wr); + printf("\tmax_srq_sge:\t\t\t%d\n", device_attr.orig_attr.max_srq_sge); + } + printf("\tmax_pkeys:\t\t\t%d\n", device_attr.orig_attr.max_pkeys); + printf("\tlocal_ca_ack_delay:\t\t%d\n", device_attr.orig_attr.local_ca_ack_delay); + + print_odp_caps(&device_attr); + if (device_attr.completion_timestamp_mask) + printf("\tcompletion timestamp_mask:\t\t\t0x%016" PRIx64 "\n", + device_attr.completion_timestamp_mask); + else + printf("\tcompletion_timestamp_mask not supported\n"); + + if (device_attr.hca_core_clock) + printf("\thca_core_clock:\t\t\t%" PRIu64 "kHZ\n", device_attr.hca_core_clock); + else + printf("\tcore clock not supported\n"); + + if (device_attr.raw_packet_caps) + print_raw_packet_caps(device_attr.raw_packet_caps); + + printf("\tdevice_cap_flags_ex:\t\t0x%" PRIX64 "\n", device_attr.device_cap_flags_ex); + print_device_cap_flags_ex(device_attr.device_cap_flags_ex); + print_tso_caps(&device_attr.tso_caps); + print_rss_caps(&device_attr.rss_caps); + printf("\tmax_wq_type_rq:\t\t\t%u\n", device_attr.max_wq_type_rq); + print_packet_pacing_caps(&device_attr.packet_pacing_caps); + print_tm_caps(&device_attr.tm_caps); + print_cq_moderation_caps(&device_attr.cq_mod_caps); + + if (device_attr.max_dm_size) + printf("\tmaximum available device memory:\t%" PRIu64"Bytes\n\n", + device_attr.max_dm_size); + } + + for (port = 1; port <= device_attr.orig_attr.phys_port_cnt; ++port) { + /* if in the command line the user didn't ask for info about this port */ + if ((ib_port) && (port != ib_port)) + continue; + + rc = ibv_query_port(ctx, port, &port_attr); + if (rc) { + fprintf(stderr, "Failed to query port %u props\n", port); + goto cleanup; + } + printf("\t\tport:\t%d\n", port); + printf("\t\t\tstate:\t\t\t%s (%d)\n", + port_state_str(port_attr.state), port_attr.state); + printf("\t\t\tmax_mtu:\t\t%s (%d)\n", + mtu_str(port_attr.max_mtu), port_attr.max_mtu); + printf("\t\t\tactive_mtu:\t\t%s (%d)\n", + mtu_str(port_attr.active_mtu), port_attr.active_mtu); + printf("\t\t\tsm_lid:\t\t\t%d\n", port_attr.sm_lid); + printf("\t\t\tport_lid:\t\t%d\n", port_attr.lid); + printf("\t\t\tport_lmc:\t\t0x%02x\n", port_attr.lmc); + printf("\t\t\tlink_layer:\t\t%s\n", + link_layer_str(port_attr.link_layer)); + + if (verbose) { + printf("\t\t\tmax_msg_sz:\t\t0x%x\n", port_attr.max_msg_sz); + printf("\t\t\tport_cap_flags:\t\t0x%08x\n", port_attr.port_cap_flags); + printf("\t\t\tport_cap_flags2:\t0x%04x\n", port_attr.port_cap_flags2); + printf("\t\t\tmax_vl_num:\t\t%s (%d)\n", + vl_str(port_attr.max_vl_num), port_attr.max_vl_num); + printf("\t\t\tbad_pkey_cntr:\t\t0x%x\n", port_attr.bad_pkey_cntr); + printf("\t\t\tqkey_viol_cntr:\t\t0x%x\n", port_attr.qkey_viol_cntr); + printf("\t\t\tsm_sl:\t\t\t%d\n", port_attr.sm_sl); + printf("\t\t\tpkey_tbl_len:\t\t%d\n", port_attr.pkey_tbl_len); + printf("\t\t\tgid_tbl_len:\t\t%d\n", port_attr.gid_tbl_len); + printf("\t\t\tsubnet_timeout:\t\t%d\n", port_attr.subnet_timeout); + 
printf("\t\t\tinit_type_reply:\t%d\n", port_attr.init_type_reply); + printf("\t\t\tactive_width:\t\t%sX (%d)\n", + width_str(port_attr.active_width), port_attr.active_width); + printf("\t\t\tactive_speed:\t\t%s (%d)\n", + speed_str(port_attr.active_speed), port_attr.active_speed); + if (ib_dev->transport_type == IBV_TRANSPORT_IB) + printf("\t\t\tphys_state:\t\t%s (%d)\n", + port_phy_state_str(port_attr.phys_state), port_attr.phys_state); + + if (print_all_port_gids(ctx, &port_attr, port)) + goto cleanup; + } + printf("\n"); + } +cleanup: + if (ctx) + if (ibv_close_device(ctx)) { + fprintf(stderr, "Failed to close device"); + rc = 3; + } + return rc; +} + +static void usage(const char *argv0) +{ + printf("Usage: %s print the ca attributes\n", argv0); + printf("\n"); + printf("Options:\n"); + printf(" -d, --ib-dev=<dev> use IB device <dev> (default first device found)\n"); + printf(" -i, --ib-port=<port> use port <port> of IB device (default all ports)\n"); + printf(" -l, --list print only the IB devices names\n"); + printf(" -v, --verbose print all the attributes of the IB device(s)\n"); +} + +int main(int argc, char *argv[]) +{ + char *ib_devname = NULL; + int ret = 0; + struct ibv_device **dev_list, **orig_dev_list; + int num_of_hcas; + int ib_port = 0; + + /* parse command line options */ + while (1) { + int c; + static struct option long_options[] = { + { .name = "ib-dev", .has_arg = 1, .val = 'd' }, + { .name = "ib-port", .has_arg = 1, .val = 'i' }, + { .name = "list", .has_arg = 0, .val = 'l' }, + { .name = "verbose", .has_arg = 0, .val = 'v' }, + { } + }; + + c = getopt_long(argc, argv, "d:i:lv", long_options, NULL); + if (c == -1) + break; + + switch (c) { + case 'd': + ib_devname = strdup(optarg); + break; + + case 'i': + ib_port = strtol(optarg, NULL, 0); + if (ib_port <= 0) { + usage(argv[0]); + return 1; + } + break; + + case 'v': + verbose = 1; + break; + + case 'l': + dev_list = orig_dev_list = ibv_get_device_list(&num_of_hcas); + if (!dev_list) { + perror("Failed to get IB devices list"); + return -1; + } + + printf("%d HCA%s found:\n", num_of_hcas, + num_of_hcas != 1 ? "s" : ""); + + while (*dev_list) { + printf("\t%s\n", ibv_get_device_name(*dev_list)); + ++dev_list; + } + + printf("\n"); + + ibv_free_device_list(orig_dev_list); + + return 0; + + default: + usage(argv[0]); + return -1; + } + } + + dev_list = orig_dev_list = ibv_get_device_list(NULL); + if (!dev_list) { + perror("Failed to get IB devices list"); + return -1; + } + + if (ib_devname) { + while (*dev_list) { + if (!strcmp(ibv_get_device_name(*dev_list), ib_devname)) + break; + ++dev_list; + } + + if (!*dev_list) { + fprintf(stderr, "IB device '%s' wasn't found\n", ib_devname); + return -1; + } + + ret |= print_hca_cap(*dev_list, ib_port); + } else { + if (!*dev_list) { + fprintf(stderr, "No IB devices found\n"); + return -1; + } + + while (*dev_list) { + ret |= print_hca_cap(*dev_list, ib_port); + ++dev_list; + } + } + + if (ib_devname) + free(ib_devname); + + ibv_free_device_list(orig_dev_list); + + return ret; +} diff --git a/libibverbs/examples/pingpong.c b/libibverbs/examples/pingpong.c new file mode 100644 index 0000000..da08e82 --- /dev/null +++ b/libibverbs/examples/pingpong.c @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2006 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "pingpong.h" +#include <endian.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +enum ibv_mtu pp_mtu_to_enum(int mtu) +{ + switch (mtu) { + case 256: return IBV_MTU_256; + case 512: return IBV_MTU_512; + case 1024: return IBV_MTU_1024; + case 2048: return IBV_MTU_2048; + case 4096: return IBV_MTU_4096; + default: return 0; + } +} + +int pp_get_port_info(struct ibv_context *context, int port, + struct ibv_port_attr *attr) +{ + return ibv_query_port(context, port, attr); +} + +void wire_gid_to_gid(const char *wgid, union ibv_gid *gid) +{ + char tmp[9]; + __be32 v32; + int i; + uint32_t tmp_gid[4]; + + for (tmp[8] = 0, i = 0; i < 4; ++i) { + memcpy(tmp, wgid + i * 8, 8); + sscanf(tmp, "%x", &v32); + tmp_gid[i] = be32toh(v32); + } + memcpy(gid, tmp_gid, sizeof(*gid)); +} + +void gid_to_wire_gid(const union ibv_gid *gid, char wgid[]) +{ + uint32_t tmp_gid[4]; + int i; + + memcpy(tmp_gid, gid, sizeof(tmp_gid)); + for (i = 0; i < 4; ++i) + sprintf(&wgid[i * 8], "%08x", htobe32(tmp_gid[i])); +} diff --git a/libibverbs/examples/pingpong.h b/libibverbs/examples/pingpong.h new file mode 100644 index 0000000..8dc5dd0 --- /dev/null +++ b/libibverbs/examples/pingpong.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2006 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IBV_PINGPONG_H +#define IBV_PINGPONG_H + +#include <infiniband/verbs.h> + +enum ibv_mtu pp_mtu_to_enum(int mtu); +int pp_get_port_info(struct ibv_context *context, int port, + struct ibv_port_attr *attr); +void wire_gid_to_gid(const char *wgid, union ibv_gid *gid); +void gid_to_wire_gid(const union ibv_gid *gid, char wgid[]); + +#endif /* IBV_PINGPONG_H */ diff --git a/libibverbs/examples/rc_pingpong.c b/libibverbs/examples/rc_pingpong.c new file mode 100644 index 0000000..9781c4f --- /dev/null +++ b/libibverbs/examples/rc_pingpong.c @@ -0,0 +1,1204 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#define _GNU_SOURCE +#include <config.h> + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <netdb.h> +#include <malloc.h> +#include <getopt.h> +#include <arpa/inet.h> +#include <time.h> +#include <inttypes.h> + +#include "pingpong.h" + +#include <ccan/minmax.h> + +enum { + PINGPONG_RECV_WRID = 1, + PINGPONG_SEND_WRID = 2, +}; + +static int page_size; +static int use_odp; +static int implicit_odp; +static int prefetch_mr; +static int use_ts; +static int validate_buf; +static int use_dm; +static int use_new_send; + +struct pingpong_context { + struct ibv_context *context; + struct ibv_comp_channel *channel; + struct ibv_pd *pd; + struct ibv_mr *mr; + struct ibv_dm *dm; + union { + struct ibv_cq *cq; + struct ibv_cq_ex *cq_ex; + } cq_s; + struct ibv_qp *qp; + struct ibv_qp_ex *qpx; + char *buf; + int size; + int send_flags; + int rx_depth; + int pending; + struct ibv_port_attr portinfo; + uint64_t completion_timestamp_mask; +}; + +static struct ibv_cq *pp_cq(struct pingpong_context *ctx) +{ + return use_ts ? ibv_cq_ex_to_cq(ctx->cq_s.cq_ex) : + ctx->cq_s.cq; +} + +struct pingpong_dest { + int lid; + int qpn; + int psn; + union ibv_gid gid; +}; + +static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn, + enum ibv_mtu mtu, int sl, + struct pingpong_dest *dest, int sgid_idx) +{ + struct ibv_qp_attr attr = { + .qp_state = IBV_QPS_RTR, + .path_mtu = mtu, + .dest_qp_num = dest->qpn, + .rq_psn = dest->psn, + .max_dest_rd_atomic = 1, + .min_rnr_timer = 12, + .ah_attr = { + .is_global = 0, + .dlid = dest->lid, + .sl = sl, + .src_path_bits = 0, + .port_num = port + } + }; + + if (dest->gid.global.interface_id) { + attr.ah_attr.is_global = 1; + attr.ah_attr.grh.hop_limit = 1; + attr.ah_attr.grh.dgid = dest->gid; + attr.ah_attr.grh.sgid_index = sgid_idx; + } + if (ibv_modify_qp(ctx->qp, &attr, + IBV_QP_STATE | + IBV_QP_AV | + IBV_QP_PATH_MTU | + IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN | + IBV_QP_MAX_DEST_RD_ATOMIC | + IBV_QP_MIN_RNR_TIMER)) { + fprintf(stderr, "Failed to modify QP to RTR\n"); + return 1; + } + + attr.qp_state = IBV_QPS_RTS; + attr.timeout = 14; + attr.retry_cnt = 7; + attr.rnr_retry = 7; + attr.sq_psn = my_psn; + attr.max_rd_atomic = 1; + if (ibv_modify_qp(ctx->qp, &attr, + IBV_QP_STATE | + IBV_QP_TIMEOUT | + IBV_QP_RETRY_CNT | + IBV_QP_RNR_RETRY | + IBV_QP_SQ_PSN | + IBV_QP_MAX_QP_RD_ATOMIC)) { + fprintf(stderr, "Failed to modify QP to RTS\n"); + return 1; + } + + return 0; +} + +static struct pingpong_dest *pp_client_exch_dest(const char *servername, int port, + const struct pingpong_dest *my_dest) +{ + struct addrinfo *res, *t; + struct addrinfo hints = { + .ai_family = AF_UNSPEC, + .ai_socktype = SOCK_STREAM + }; + char *service; + char msg[sizeof "0000:000000:000000:00000000000000000000000000000000"]; + int n; + int sockfd = -1; + struct pingpong_dest *rem_dest = NULL; + char gid[33]; + + if (asprintf(&service, "%d", port) < 0) + return NULL; + + n = getaddrinfo(servername, service, &hints, &res); + + if (n < 0) { + fprintf(stderr, "%s for %s:%d\n", gai_strerror(n), servername, port); + free(service); + return NULL; + } + + for (t = res; t; t = t->ai_next) { + sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); + if (sockfd >= 0) { + if (!connect(sockfd, t->ai_addr, t->ai_addrlen)) + break; + close(sockfd); + sockfd = -1; + } + } + + freeaddrinfo(res); + free(service); + + if (sockfd < 0) { + fprintf(stderr, "Couldn't connect to 
%s:%d\n", servername, port); + return NULL; + } + + gid_to_wire_gid(&my_dest->gid, gid); + sprintf(msg, "%04x:%06x:%06x:%s", my_dest->lid, my_dest->qpn, + my_dest->psn, gid); + if (write(sockfd, msg, sizeof msg) != sizeof msg) { + fprintf(stderr, "Couldn't send local address\n"); + goto out; + } + + if (read(sockfd, msg, sizeof msg) != sizeof msg || + write(sockfd, "done", sizeof "done") != sizeof "done") { + perror("client read/write"); + fprintf(stderr, "Couldn't read/write remote address\n"); + goto out; + } + + rem_dest = malloc(sizeof *rem_dest); + if (!rem_dest) + goto out; + + sscanf(msg, "%x:%x:%x:%s", &rem_dest->lid, &rem_dest->qpn, + &rem_dest->psn, gid); + wire_gid_to_gid(gid, &rem_dest->gid); + +out: + close(sockfd); + return rem_dest; +} + +static struct pingpong_dest *pp_server_exch_dest(struct pingpong_context *ctx, + int ib_port, enum ibv_mtu mtu, + int port, int sl, + const struct pingpong_dest *my_dest, + int sgid_idx) +{ + struct addrinfo *res, *t; + struct addrinfo hints = { + .ai_flags = AI_PASSIVE, + .ai_family = AF_UNSPEC, + .ai_socktype = SOCK_STREAM + }; + char *service; + char msg[sizeof "0000:000000:000000:00000000000000000000000000000000"]; + int n; + int sockfd = -1, connfd; + struct pingpong_dest *rem_dest = NULL; + char gid[33]; + + if (asprintf(&service, "%d", port) < 0) + return NULL; + + n = getaddrinfo(NULL, service, &hints, &res); + + if (n < 0) { + fprintf(stderr, "%s for port %d\n", gai_strerror(n), port); + free(service); + return NULL; + } + + for (t = res; t; t = t->ai_next) { + sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); + if (sockfd >= 0) { + n = 1; + + setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof n); + + if (!bind(sockfd, t->ai_addr, t->ai_addrlen)) + break; + close(sockfd); + sockfd = -1; + } + } + + freeaddrinfo(res); + free(service); + + if (sockfd < 0) { + fprintf(stderr, "Couldn't listen to port %d\n", port); + return NULL; + } + + listen(sockfd, 1); + connfd = accept(sockfd, NULL, NULL); + close(sockfd); + if (connfd < 0) { + fprintf(stderr, "accept() failed\n"); + return NULL; + } + + n = read(connfd, msg, sizeof msg); + if (n != sizeof msg) { + perror("server read"); + fprintf(stderr, "%d/%d: Couldn't read remote address\n", n, (int) sizeof msg); + goto out; + } + + rem_dest = malloc(sizeof *rem_dest); + if (!rem_dest) + goto out; + + sscanf(msg, "%x:%x:%x:%s", &rem_dest->lid, &rem_dest->qpn, + &rem_dest->psn, gid); + wire_gid_to_gid(gid, &rem_dest->gid); + + if (pp_connect_ctx(ctx, ib_port, my_dest->psn, mtu, sl, rem_dest, + sgid_idx)) { + fprintf(stderr, "Couldn't connect to remote QP\n"); + free(rem_dest); + rem_dest = NULL; + goto out; + } + + + gid_to_wire_gid(&my_dest->gid, gid); + sprintf(msg, "%04x:%06x:%06x:%s", my_dest->lid, my_dest->qpn, + my_dest->psn, gid); + if (write(connfd, msg, sizeof msg) != sizeof msg || + read(connfd, msg, sizeof msg) != sizeof "done") { + fprintf(stderr, "Couldn't send/recv local address\n"); + free(rem_dest); + rem_dest = NULL; + goto out; + } + + +out: + close(connfd); + return rem_dest; +} + +static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, + int rx_depth, int port, + int use_event) +{ + struct pingpong_context *ctx; + int access_flags = IBV_ACCESS_LOCAL_WRITE; + + ctx = calloc(1, sizeof *ctx); + if (!ctx) + return NULL; + + ctx->size = size; + ctx->send_flags = IBV_SEND_SIGNALED; + ctx->rx_depth = rx_depth; + + ctx->buf = memalign(page_size, size); + if (!ctx->buf) { + fprintf(stderr, "Couldn't allocate work buf.\n"); + goto clean_ctx; 
+ } + + /* FIXME memset(ctx->buf, 0, size); */ + memset(ctx->buf, 0x7b, size); + + ctx->context = ibv_open_device(ib_dev); + if (!ctx->context) { + fprintf(stderr, "Couldn't get context for %s\n", + ibv_get_device_name(ib_dev)); + goto clean_buffer; + } + + if (use_event) { + ctx->channel = ibv_create_comp_channel(ctx->context); + if (!ctx->channel) { + fprintf(stderr, "Couldn't create completion channel\n"); + goto clean_device; + } + } else + ctx->channel = NULL; + + ctx->pd = ibv_alloc_pd(ctx->context); + if (!ctx->pd) { + fprintf(stderr, "Couldn't allocate PD\n"); + goto clean_comp_channel; + } + + if (use_odp || use_ts || use_dm) { + const uint32_t rc_caps_mask = IBV_ODP_SUPPORT_SEND | + IBV_ODP_SUPPORT_RECV; + struct ibv_device_attr_ex attrx; + + if (ibv_query_device_ex(ctx->context, NULL, &attrx)) { + fprintf(stderr, "Couldn't query device for its features\n"); + goto clean_pd; + } + + if (use_odp) { + if (!(attrx.odp_caps.general_caps & IBV_ODP_SUPPORT) || + (attrx.odp_caps.per_transport_caps.rc_odp_caps & rc_caps_mask) != rc_caps_mask) { + fprintf(stderr, "The device isn't ODP capable or does not support RC send and receive with ODP\n"); + goto clean_pd; + } + if (implicit_odp && + !(attrx.odp_caps.general_caps & IBV_ODP_SUPPORT_IMPLICIT)) { + fprintf(stderr, "The device doesn't support implicit ODP\n"); + goto clean_pd; + } + access_flags |= IBV_ACCESS_ON_DEMAND; + } + + if (use_ts) { + if (!attrx.completion_timestamp_mask) { + fprintf(stderr, "The device isn't completion timestamp capable\n"); + goto clean_pd; + } + ctx->completion_timestamp_mask = attrx.completion_timestamp_mask; + } + + if (use_dm) { + struct ibv_alloc_dm_attr dm_attr = {}; + + if (!attrx.max_dm_size) { + fprintf(stderr, "Device doesn't support dm allocation\n"); + goto clean_pd; + } + + if (attrx.max_dm_size < size) { + fprintf(stderr, "Device memory is insufficient\n"); + goto clean_pd; + } + + dm_attr.length = size; + ctx->dm = ibv_alloc_dm(ctx->context, &dm_attr); + if (!ctx->dm) { + fprintf(stderr, "Dev mem allocation failed\n"); + goto clean_pd; + } + + access_flags |= IBV_ACCESS_ZERO_BASED; + } + } + + if (implicit_odp) { + ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, access_flags); + } else { + ctx->mr = use_dm ? ibv_reg_dm_mr(ctx->pd, ctx->dm, 0, + size, access_flags) : + ibv_reg_mr(ctx->pd, ctx->buf, size, access_flags); + } + + if (!ctx->mr) { + fprintf(stderr, "Couldn't register MR\n"); + goto clean_dm; + } + + if (prefetch_mr) { + struct ibv_sge sg_list; + int ret; + + sg_list.lkey = ctx->mr->lkey; + sg_list.addr = (uintptr_t)ctx->buf; + sg_list.length = size; + + ret = ibv_advise_mr(ctx->pd, IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE, + IB_UVERBS_ADVISE_MR_FLAG_FLUSH, + &sg_list, 1); + + if (ret) + fprintf(stderr, "Couldn't prefetch MR(%d). 
Continue anyway\n", ret); + } + + if (use_ts) { + struct ibv_cq_init_attr_ex attr_ex = { + .cqe = rx_depth + 1, + .cq_context = NULL, + .channel = ctx->channel, + .comp_vector = 0, + .wc_flags = IBV_WC_EX_WITH_COMPLETION_TIMESTAMP + }; + + ctx->cq_s.cq_ex = ibv_create_cq_ex(ctx->context, &attr_ex); + } else { + ctx->cq_s.cq = ibv_create_cq(ctx->context, rx_depth + 1, NULL, + ctx->channel, 0); + } + + if (!pp_cq(ctx)) { + fprintf(stderr, "Couldn't create CQ\n"); + goto clean_mr; + } + + { + struct ibv_qp_attr attr; + struct ibv_qp_init_attr init_attr = { + .send_cq = pp_cq(ctx), + .recv_cq = pp_cq(ctx), + .cap = { + .max_send_wr = 1, + .max_recv_wr = rx_depth, + .max_send_sge = 1, + .max_recv_sge = 1 + }, + .qp_type = IBV_QPT_RC + }; + + if (use_new_send) { + struct ibv_qp_init_attr_ex init_attr_ex = {}; + + init_attr_ex.send_cq = pp_cq(ctx); + init_attr_ex.recv_cq = pp_cq(ctx); + init_attr_ex.cap.max_send_wr = 1; + init_attr_ex.cap.max_recv_wr = rx_depth; + init_attr_ex.cap.max_send_sge = 1; + init_attr_ex.cap.max_recv_sge = 1; + init_attr_ex.qp_type = IBV_QPT_RC; + + init_attr_ex.comp_mask |= IBV_QP_INIT_ATTR_PD | + IBV_QP_INIT_ATTR_SEND_OPS_FLAGS; + init_attr_ex.pd = ctx->pd; + init_attr_ex.send_ops_flags = IBV_QP_EX_WITH_SEND; + + ctx->qp = ibv_create_qp_ex(ctx->context, &init_attr_ex); + } else { + ctx->qp = ibv_create_qp(ctx->pd, &init_attr); + } + + if (!ctx->qp) { + fprintf(stderr, "Couldn't create QP\n"); + goto clean_cq; + } + + if (use_new_send) + ctx->qpx = ibv_qp_to_qp_ex(ctx->qp); + + ibv_query_qp(ctx->qp, &attr, IBV_QP_CAP, &init_attr); + if (init_attr.cap.max_inline_data >= size && !use_dm) + ctx->send_flags |= IBV_SEND_INLINE; + } + + { + struct ibv_qp_attr attr = { + .qp_state = IBV_QPS_INIT, + .pkey_index = 0, + .port_num = port, + .qp_access_flags = 0 + }; + + if (ibv_modify_qp(ctx->qp, &attr, + IBV_QP_STATE | + IBV_QP_PKEY_INDEX | + IBV_QP_PORT | + IBV_QP_ACCESS_FLAGS)) { + fprintf(stderr, "Failed to modify QP to INIT\n"); + goto clean_qp; + } + } + + return ctx; + +clean_qp: + ibv_destroy_qp(ctx->qp); + +clean_cq: + ibv_destroy_cq(pp_cq(ctx)); + +clean_mr: + ibv_dereg_mr(ctx->mr); + +clean_dm: + if (ctx->dm) + ibv_free_dm(ctx->dm); + +clean_pd: + ibv_dealloc_pd(ctx->pd); + +clean_comp_channel: + if (ctx->channel) + ibv_destroy_comp_channel(ctx->channel); + +clean_device: + ibv_close_device(ctx->context); + +clean_buffer: + free(ctx->buf); + +clean_ctx: + free(ctx); + + return NULL; +} + +static int pp_close_ctx(struct pingpong_context *ctx) +{ + if (ibv_destroy_qp(ctx->qp)) { + fprintf(stderr, "Couldn't destroy QP\n"); + return 1; + } + + if (ibv_destroy_cq(pp_cq(ctx))) { + fprintf(stderr, "Couldn't destroy CQ\n"); + return 1; + } + + if (ibv_dereg_mr(ctx->mr)) { + fprintf(stderr, "Couldn't deregister MR\n"); + return 1; + } + + if (ctx->dm) { + if (ibv_free_dm(ctx->dm)) { + fprintf(stderr, "Couldn't free DM\n"); + return 1; + } + } + + if (ibv_dealloc_pd(ctx->pd)) { + fprintf(stderr, "Couldn't deallocate PD\n"); + return 1; + } + + if (ctx->channel) { + if (ibv_destroy_comp_channel(ctx->channel)) { + fprintf(stderr, "Couldn't destroy completion channel\n"); + return 1; + } + } + + if (ibv_close_device(ctx->context)) { + fprintf(stderr, "Couldn't release context\n"); + return 1; + } + + free(ctx->buf); + free(ctx); + + return 0; +} + +static int pp_post_recv(struct pingpong_context *ctx, int n) +{ + struct ibv_sge list = { + .addr = use_dm ? 
0 : (uintptr_t) ctx->buf, + .length = ctx->size, + .lkey = ctx->mr->lkey + }; + struct ibv_recv_wr wr = { + .wr_id = PINGPONG_RECV_WRID, + .sg_list = &list, + .num_sge = 1, + }; + struct ibv_recv_wr *bad_wr; + int i; + + for (i = 0; i < n; ++i) + if (ibv_post_recv(ctx->qp, &wr, &bad_wr)) + break; + + return i; +} + +static int pp_post_send(struct pingpong_context *ctx) +{ + struct ibv_sge list = { + .addr = use_dm ? 0 : (uintptr_t) ctx->buf, + .length = ctx->size, + .lkey = ctx->mr->lkey + }; + struct ibv_send_wr wr = { + .wr_id = PINGPONG_SEND_WRID, + .sg_list = &list, + .num_sge = 1, + .opcode = IBV_WR_SEND, + .send_flags = ctx->send_flags, + }; + struct ibv_send_wr *bad_wr; + + if (use_new_send) { + ibv_wr_start(ctx->qpx); + + ctx->qpx->wr_id = PINGPONG_SEND_WRID; + ctx->qpx->wr_flags = ctx->send_flags; + + ibv_wr_send(ctx->qpx); + ibv_wr_set_sge(ctx->qpx, list.lkey, list.addr, list.length); + + return ibv_wr_complete(ctx->qpx); + } else { + return ibv_post_send(ctx->qp, &wr, &bad_wr); + } +} + +struct ts_params { + uint64_t comp_recv_max_time_delta; + uint64_t comp_recv_min_time_delta; + uint64_t comp_recv_total_time_delta; + uint64_t comp_recv_prev_time; + int last_comp_with_ts; + unsigned int comp_with_time_iters; +}; + +static inline int parse_single_wc(struct pingpong_context *ctx, int *scnt, + int *rcnt, int *routs, int iters, + uint64_t wr_id, enum ibv_wc_status status, + uint64_t completion_timestamp, + struct ts_params *ts) +{ + if (status != IBV_WC_SUCCESS) { + fprintf(stderr, "Failed status %s (%d) for wr_id %d\n", + ibv_wc_status_str(status), + status, (int)wr_id); + return 1; + } + + switch ((int)wr_id) { + case PINGPONG_SEND_WRID: + ++(*scnt); + break; + + case PINGPONG_RECV_WRID: + if (--(*routs) <= 1) { + *routs += pp_post_recv(ctx, ctx->rx_depth - *routs); + if (*routs < ctx->rx_depth) { + fprintf(stderr, + "Couldn't post receive (%d)\n", + *routs); + return 1; + } + } + + ++(*rcnt); + if (use_ts) { + if (ts->last_comp_with_ts) { + uint64_t delta; + + /* checking whether the clock was wrapped around */ + if (completion_timestamp >= ts->comp_recv_prev_time) + delta = completion_timestamp - ts->comp_recv_prev_time; + else + delta = ctx->completion_timestamp_mask - ts->comp_recv_prev_time + + completion_timestamp + 1; + + ts->comp_recv_max_time_delta = max(ts->comp_recv_max_time_delta, delta); + ts->comp_recv_min_time_delta = min(ts->comp_recv_min_time_delta, delta); + ts->comp_recv_total_time_delta += delta; + ts->comp_with_time_iters++; + } + + ts->comp_recv_prev_time = completion_timestamp; + ts->last_comp_with_ts = 1; + } else { + ts->last_comp_with_ts = 0; + } + + break; + + default: + fprintf(stderr, "Completion for unknown wr_id %d\n", + (int)wr_id); + return 1; + } + + ctx->pending &= ~(int)wr_id; + if (*scnt < iters && !ctx->pending) { + if (pp_post_send(ctx)) { + fprintf(stderr, "Couldn't post send\n"); + return 1; + } + ctx->pending = PINGPONG_RECV_WRID | + PINGPONG_SEND_WRID; + } + + return 0; +} + +static void usage(const char *argv0) +{ + printf("Usage:\n"); + printf(" %s start a server and wait for connection\n", argv0); + printf(" %s <host> connect to server at <host>\n", argv0); + printf("\n"); + printf("Options:\n"); + printf(" -p, --port=<port> listen on/connect to port <port> (default 18515)\n"); + printf(" -d, --ib-dev=<dev> use IB device <dev> (default first device found)\n"); + printf(" -i, --ib-port=<port> use port <port> of IB device (default 1)\n"); + printf(" -s, --size=<size> size of message to exchange (default 4096)\n"); + printf(" -m, 
--mtu=<size> path MTU (default 1024)\n"); + printf(" -r, --rx-depth=<dep> number of receives to post at a time (default 500)\n"); + printf(" -n, --iters=<iters> number of exchanges (default 1000)\n"); + printf(" -l, --sl=<sl> service level value\n"); + printf(" -e, --events sleep on CQ events (default poll)\n"); + printf(" -g, --gid-idx=<gid index> local port gid index\n"); + printf(" -o, --odp use on demand paging\n"); + printf(" -O, --iodp use implicit on demand paging\n"); + printf(" -P, --prefetch prefetch an ODP MR\n"); + printf(" -t, --ts get CQE with timestamp\n"); + printf(" -c, --chk validate received buffer\n"); + printf(" -j, --dm use device memory\n"); + printf(" -N, --new_send use new post send WR API\n"); +} + +int main(int argc, char *argv[]) +{ + struct ibv_device **dev_list; + struct ibv_device *ib_dev; + struct pingpong_context *ctx; + struct pingpong_dest my_dest; + struct pingpong_dest *rem_dest; + struct timeval start, end; + char *ib_devname = NULL; + char *servername = NULL; + unsigned int port = 18515; + int ib_port = 1; + unsigned int size = 4096; + enum ibv_mtu mtu = IBV_MTU_1024; + unsigned int rx_depth = 500; + unsigned int iters = 1000; + int use_event = 0; + int routs; + int rcnt, scnt; + int num_cq_events = 0; + int sl = 0; + int gidx = -1; + char gid[33]; + struct ts_params ts; + + srand48(getpid() * time(NULL)); + + while (1) { + int c; + + static struct option long_options[] = { + { .name = "port", .has_arg = 1, .val = 'p' }, + { .name = "ib-dev", .has_arg = 1, .val = 'd' }, + { .name = "ib-port", .has_arg = 1, .val = 'i' }, + { .name = "size", .has_arg = 1, .val = 's' }, + { .name = "mtu", .has_arg = 1, .val = 'm' }, + { .name = "rx-depth", .has_arg = 1, .val = 'r' }, + { .name = "iters", .has_arg = 1, .val = 'n' }, + { .name = "sl", .has_arg = 1, .val = 'l' }, + { .name = "events", .has_arg = 0, .val = 'e' }, + { .name = "gid-idx", .has_arg = 1, .val = 'g' }, + { .name = "odp", .has_arg = 0, .val = 'o' }, + { .name = "iodp", .has_arg = 0, .val = 'O' }, + { .name = "prefetch", .has_arg = 0, .val = 'P' }, + { .name = "ts", .has_arg = 0, .val = 't' }, + { .name = "chk", .has_arg = 0, .val = 'c' }, + { .name = "dm", .has_arg = 0, .val = 'j' }, + { .name = "new_send", .has_arg = 0, .val = 'N' }, + {} + }; + + c = getopt_long(argc, argv, "p:d:i:s:m:r:n:l:eg:oOPtcjN", + long_options, NULL); + + if (c == -1) + break; + + switch (c) { + case 'p': + port = strtoul(optarg, NULL, 0); + if (port > 65535) { + usage(argv[0]); + return 1; + } + break; + + case 'd': + ib_devname = strdupa(optarg); + break; + + case 'i': + ib_port = strtol(optarg, NULL, 0); + if (ib_port < 1) { + usage(argv[0]); + return 1; + } + break; + + case 's': + size = strtoul(optarg, NULL, 0); + break; + + case 'm': + mtu = pp_mtu_to_enum(strtol(optarg, NULL, 0)); + if (mtu == 0) { + usage(argv[0]); + return 1; + } + break; + + case 'r': + rx_depth = strtoul(optarg, NULL, 0); + break; + + case 'n': + iters = strtoul(optarg, NULL, 0); + break; + + case 'l': + sl = strtol(optarg, NULL, 0); + break; + + case 'e': + ++use_event; + break; + + case 'g': + gidx = strtol(optarg, NULL, 0); + break; + + case 'o': + use_odp = 1; + break; + case 'P': + prefetch_mr = 1; + break; + case 'O': + use_odp = 1; + implicit_odp = 1; + break; + case 't': + use_ts = 1; + break; + case 'c': + validate_buf = 1; + break; + + case 'j': + use_dm = 1; + break; + + case 'N': + use_new_send = 1; + break; + + default: + usage(argv[0]); + return 1; + } + } + + if (optind == argc - 1) + servername = strdupa(argv[optind]); + else 
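/* more than one positional argument is an error */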
if (optind < argc) { + usage(argv[0]); + return 1; + } + + if (use_odp && use_dm) { + fprintf(stderr, "DM memory region can't be on demand\n"); + return 1; + } + + if (!use_odp && prefetch_mr) { + fprintf(stderr, "prefetch is valid only with on-demand memory region\n"); + return 1; + } + + if (use_ts) { + ts.comp_recv_max_time_delta = 0; + ts.comp_recv_min_time_delta = 0xffffffff; + ts.comp_recv_total_time_delta = 0; + ts.comp_recv_prev_time = 0; + ts.last_comp_with_ts = 0; + ts.comp_with_time_iters = 0; + } + + page_size = sysconf(_SC_PAGESIZE); + + dev_list = ibv_get_device_list(NULL); + if (!dev_list) { + perror("Failed to get IB devices list"); + return 1; + } + + if (!ib_devname) { + ib_dev = *dev_list; + if (!ib_dev) { + fprintf(stderr, "No IB devices found\n"); + return 1; + } + } else { + int i; + for (i = 0; dev_list[i]; ++i) + if (!strcmp(ibv_get_device_name(dev_list[i]), ib_devname)) + break; + ib_dev = dev_list[i]; + if (!ib_dev) { + fprintf(stderr, "IB device %s not found\n", ib_devname); + return 1; + } + } + + ctx = pp_init_ctx(ib_dev, size, rx_depth, ib_port, use_event); + if (!ctx) + return 1; + + routs = pp_post_recv(ctx, ctx->rx_depth); + if (routs < ctx->rx_depth) { + fprintf(stderr, "Couldn't post receive (%d)\n", routs); + return 1; + } + + if (use_event) + if (ibv_req_notify_cq(pp_cq(ctx), 0)) { + fprintf(stderr, "Couldn't request CQ notification\n"); + return 1; + } + + + if (pp_get_port_info(ctx->context, ib_port, &ctx->portinfo)) { + fprintf(stderr, "Couldn't get port info\n"); + return 1; + } + + my_dest.lid = ctx->portinfo.lid; + if (ctx->portinfo.link_layer != IBV_LINK_LAYER_ETHERNET && + !my_dest.lid) { + fprintf(stderr, "Couldn't get local LID\n"); + return 1; + } + + if (gidx >= 0) { + if (ibv_query_gid(ctx->context, ib_port, gidx, &my_dest.gid)) { + fprintf(stderr, "can't read sgid of index %d\n", gidx); + return 1; + } + } else + memset(&my_dest.gid, 0, sizeof my_dest.gid); + + my_dest.qpn = ctx->qp->qp_num; + my_dest.psn = lrand48() & 0xffffff; + inet_ntop(AF_INET6, &my_dest.gid, gid, sizeof gid); + printf(" local address: LID 0x%04x, QPN 0x%06x, PSN 0x%06x, GID %s\n", + my_dest.lid, my_dest.qpn, my_dest.psn, gid); + + + if (servername) + rem_dest = pp_client_exch_dest(servername, port, &my_dest); + else + rem_dest = pp_server_exch_dest(ctx, ib_port, mtu, port, sl, + &my_dest, gidx); + + if (!rem_dest) + return 1; + + inet_ntop(AF_INET6, &rem_dest->gid, gid, sizeof gid); + printf(" remote address: LID 0x%04x, QPN 0x%06x, PSN 0x%06x, GID %s\n", + rem_dest->lid, rem_dest->qpn, rem_dest->psn, gid); + + if (servername) + if (pp_connect_ctx(ctx, ib_port, my_dest.psn, mtu, sl, rem_dest, + gidx)) + return 1; + + ctx->pending = PINGPONG_RECV_WRID; + + if (servername) { + if (validate_buf) + for (int i = 0; i < size; i += page_size) + ctx->buf[i] = i / page_size % sizeof(char); + + if (use_dm) + if (ibv_memcpy_to_dm(ctx->dm, 0, (void *)ctx->buf, size)) { + fprintf(stderr, "Copy to dm buffer failed\n"); + return 1; + } + + if (pp_post_send(ctx)) { + fprintf(stderr, "Couldn't post send\n"); + return 1; + } + ctx->pending |= PINGPONG_SEND_WRID; + } + + if (gettimeofday(&start, NULL)) { + perror("gettimeofday"); + return 1; + } + + rcnt = scnt = 0; + while (rcnt < iters || scnt < iters) { + int ret; + + if (use_event) { + struct ibv_cq *ev_cq; + void *ev_ctx; + + if (ibv_get_cq_event(ctx->channel, &ev_cq, &ev_ctx)) { + fprintf(stderr, "Failed to get cq_event\n"); + return 1; + } + + ++num_cq_events; + + if (ev_cq != pp_cq(ctx)) { + fprintf(stderr, "CQ event for 
unknown CQ %p\n", ev_cq); + return 1; + } + + if (ibv_req_notify_cq(pp_cq(ctx), 0)) { + fprintf(stderr, "Couldn't request CQ notification\n"); + return 1; + } + } + + if (use_ts) { + struct ibv_poll_cq_attr attr = {}; + + do { + ret = ibv_start_poll(ctx->cq_s.cq_ex, &attr); + } while (!use_event && ret == ENOENT); + + if (ret) { + fprintf(stderr, "poll CQ failed %d\n", ret); + return ret; + } + ret = parse_single_wc(ctx, &scnt, &rcnt, &routs, + iters, + ctx->cq_s.cq_ex->wr_id, + ctx->cq_s.cq_ex->status, + ibv_wc_read_completion_ts(ctx->cq_s.cq_ex), + &ts); + if (ret) { + ibv_end_poll(ctx->cq_s.cq_ex); + return ret; + } + ret = ibv_next_poll(ctx->cq_s.cq_ex); + if (!ret) + ret = parse_single_wc(ctx, &scnt, &rcnt, &routs, + iters, + ctx->cq_s.cq_ex->wr_id, + ctx->cq_s.cq_ex->status, + ibv_wc_read_completion_ts(ctx->cq_s.cq_ex), + &ts); + ibv_end_poll(ctx->cq_s.cq_ex); + if (ret && ret != ENOENT) { + fprintf(stderr, "poll CQ failed %d\n", ret); + return ret; + } + } else { + int ne, i; + struct ibv_wc wc[2]; + + do { + ne = ibv_poll_cq(pp_cq(ctx), 2, wc); + if (ne < 0) { + fprintf(stderr, "poll CQ failed %d\n", ne); + return 1; + } + } while (!use_event && ne < 1); + + for (i = 0; i < ne; ++i) { + ret = parse_single_wc(ctx, &scnt, &rcnt, &routs, + iters, + wc[i].wr_id, + wc[i].status, + 0, &ts); + if (ret) { + fprintf(stderr, "parse WC failed %d\n", ne); + return 1; + } + } + } + } + + if (gettimeofday(&end, NULL)) { + perror("gettimeofday"); + return 1; + } + + { + float usec = (end.tv_sec - start.tv_sec) * 1000000 + + (end.tv_usec - start.tv_usec); + long long bytes = (long long) size * iters * 2; + + printf("%lld bytes in %.2f seconds = %.2f Mbit/sec\n", + bytes, usec / 1000000., bytes * 8. / usec); + printf("%d iters in %.2f seconds = %.2f usec/iter\n", + iters, usec / 1000000., usec / iters); + + if (use_ts && ts.comp_with_time_iters) { + printf("Max receive completion clock cycles = %" PRIu64 "\n", + ts.comp_recv_max_time_delta); + printf("Min receive completion clock cycles = %" PRIu64 "\n", + ts.comp_recv_min_time_delta); + printf("Average receive completion clock cycles = %f\n", + (double)ts.comp_recv_total_time_delta / ts.comp_with_time_iters); + } + + if ((!servername) && (validate_buf)) { + if (use_dm) + if (ibv_memcpy_from_dm(ctx->buf, ctx->dm, 0, size)) { + fprintf(stderr, "Copy from DM buffer failed\n"); + return 1; + } + for (int i = 0; i < size; i += page_size) + if (ctx->buf[i] != i / page_size % sizeof(char)) + printf("invalid data in page %d\n", + i / page_size); + } + } + + ibv_ack_cq_events(pp_cq(ctx), num_cq_events); + + if (pp_close_ctx(ctx)) + return 1; + + ibv_free_device_list(dev_list); + free(rem_dest); + + return 0; +} diff --git a/libibverbs/examples/srq_pingpong.c b/libibverbs/examples/srq_pingpong.c new file mode 100644 index 0000000..55d7eae --- /dev/null +++ b/libibverbs/examples/srq_pingpong.c @@ -0,0 +1,1034 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#define _GNU_SOURCE +#include <config.h> + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <netdb.h> +#include <malloc.h> +#include <getopt.h> +#include <arpa/inet.h> +#include <time.h> + +#include "pingpong.h" + +enum { + PINGPONG_RECV_WRID = 1, + PINGPONG_SEND_WRID = 2, + + MAX_QP = 256, +}; + +static int page_size; +static int validate_buf; +static int use_odp; + +struct pingpong_context { + struct ibv_context *context; + struct ibv_comp_channel *channel; + struct ibv_pd *pd; + struct ibv_mr *mr; + struct ibv_cq *cq; + struct ibv_srq *srq; + struct ibv_qp *qp[MAX_QP]; + char *buf; + int size; + int send_flags; + int num_qp; + int rx_depth; + int pending[MAX_QP]; + struct ibv_port_attr portinfo; +}; + +struct pingpong_dest { + int lid; + int qpn; + int psn; + union ibv_gid gid; +}; + +static int pp_connect_ctx(struct pingpong_context *ctx, int port, enum ibv_mtu mtu, + int sl, const struct pingpong_dest *my_dest, + const struct pingpong_dest *dest, int sgid_idx) +{ + int i; + + for (i = 0; i < ctx->num_qp; ++i) { + struct ibv_qp_attr attr = { + .qp_state = IBV_QPS_RTR, + .path_mtu = mtu, + .dest_qp_num = dest[i].qpn, + .rq_psn = dest[i].psn, + .max_dest_rd_atomic = 1, + .min_rnr_timer = 12, + .ah_attr = { + .is_global = 0, + .dlid = dest[i].lid, + .sl = sl, + .src_path_bits = 0, + .port_num = port + } + }; + + if (dest->gid.global.interface_id) { + attr.ah_attr.is_global = 1; + attr.ah_attr.grh.hop_limit = 1; + attr.ah_attr.grh.dgid = dest->gid; + attr.ah_attr.grh.sgid_index = sgid_idx; + } + if (ibv_modify_qp(ctx->qp[i], &attr, + IBV_QP_STATE | + IBV_QP_AV | + IBV_QP_PATH_MTU | + IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN | + IBV_QP_MAX_DEST_RD_ATOMIC | + IBV_QP_MIN_RNR_TIMER)) { + fprintf(stderr, "Failed to modify QP[%d] to RTR\n", i); + return 1; + } + + attr.qp_state = IBV_QPS_RTS; + attr.timeout = 14; + attr.retry_cnt = 7; + attr.rnr_retry = 7; + attr.sq_psn = my_dest[i].psn; + attr.max_rd_atomic = 1; + if (ibv_modify_qp(ctx->qp[i], &attr, + IBV_QP_STATE | + IBV_QP_TIMEOUT | + IBV_QP_RETRY_CNT | + IBV_QP_RNR_RETRY | + IBV_QP_SQ_PSN | + IBV_QP_MAX_QP_RD_ATOMIC)) { + fprintf(stderr, "Failed to modify QP[%d] to RTS\n", i); + return 1; + } + } + + return 
0; +} + +static struct pingpong_dest *pp_client_exch_dest(const char *servername, int port, + const struct pingpong_dest *my_dest) +{ + struct addrinfo *res, *t; + struct addrinfo hints = { + .ai_family = AF_UNSPEC, + .ai_socktype = SOCK_STREAM + }; + char *service; + char msg[sizeof "0000:000000:000000:00000000000000000000000000000000"]; + int n; + int r; + int i; + int sockfd = -1; + struct pingpong_dest *rem_dest = NULL; + char gid[33]; + + if (asprintf(&service, "%d", port) < 0) + return NULL; + + n = getaddrinfo(servername, service, &hints, &res); + + if (n < 0) { + fprintf(stderr, "%s for %s:%d\n", gai_strerror(n), servername, port); + free(service); + return NULL; + } + + for (t = res; t; t = t->ai_next) { + sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); + if (sockfd >= 0) { + if (!connect(sockfd, t->ai_addr, t->ai_addrlen)) + break; + close(sockfd); + sockfd = -1; + } + } + + freeaddrinfo(res); + free(service); + + if (sockfd < 0) { + fprintf(stderr, "Couldn't connect to %s:%d\n", servername, port); + return NULL; + } + + for (i = 0; i < MAX_QP; ++i) { + gid_to_wire_gid(&my_dest[i].gid, gid); + sprintf(msg, "%04x:%06x:%06x:%s", my_dest[i].lid, + my_dest[i].qpn, my_dest[i].psn, gid); + if (write(sockfd, msg, sizeof msg) != sizeof msg) { + fprintf(stderr, "Couldn't send local address\n"); + goto out; + } + } + + rem_dest = malloc(MAX_QP * sizeof *rem_dest); + if (!rem_dest) + goto out; + + for (i = 0; i < MAX_QP; ++i) { + n = 0; + while (n < sizeof msg) { + r = read(sockfd, msg + n, sizeof msg - n); + if (r < 0) { + perror("client read"); + fprintf(stderr, "%d/%d: Couldn't read remote address [%d]\n", + n, (int) sizeof msg, i); + goto out; + } + n += r; + } + + sscanf(msg, "%x:%x:%x:%s", &rem_dest[i].lid, &rem_dest[i].qpn, + &rem_dest[i].psn, gid); + wire_gid_to_gid(gid, &rem_dest[i].gid); + } + + if (write(sockfd, "done", sizeof "done") != sizeof "done") { + perror("client write"); + goto out; + } +out: + close(sockfd); + return rem_dest; +} + +static struct pingpong_dest *pp_server_exch_dest(struct pingpong_context *ctx, + int ib_port, enum ibv_mtu mtu, + int port, int sl, + const struct pingpong_dest *my_dest, + int sgid_idx) +{ + struct addrinfo *res, *t; + struct addrinfo hints = { + .ai_flags = AI_PASSIVE, + .ai_family = AF_UNSPEC, + .ai_socktype = SOCK_STREAM + }; + char *service; + char msg[sizeof "0000:000000:000000:00000000000000000000000000000000"]; + int n; + int r; + int i; + int sockfd = -1, connfd; + struct pingpong_dest *rem_dest = NULL; + char gid[33]; + + if (asprintf(&service, "%d", port) < 0) + return NULL; + + n = getaddrinfo(NULL, service, &hints, &res); + + if (n < 0) { + fprintf(stderr, "%s for port %d\n", gai_strerror(n), port); + free(service); + return NULL; + } + + for (t = res; t; t = t->ai_next) { + sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); + if (sockfd >= 0) { + n = 1; + + setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof n); + + if (!bind(sockfd, t->ai_addr, t->ai_addrlen)) + break; + close(sockfd); + sockfd = -1; + } + } + + freeaddrinfo(res); + free(service); + + if (sockfd < 0) { + fprintf(stderr, "Couldn't listen to port %d\n", port); + return NULL; + } + + listen(sockfd, 1); + connfd = accept(sockfd, NULL, NULL); + close(sockfd); + if (connfd < 0) { + fprintf(stderr, "accept() failed\n"); + return NULL; + } + + rem_dest = malloc(MAX_QP * sizeof *rem_dest); + if (!rem_dest) + goto out; + + for (i = 0; i < MAX_QP; ++i) { + n = 0; + while (n < sizeof msg) { + r = read(connfd, msg + n, sizeof msg - n); + if 
(r < 0) { + perror("server read"); + fprintf(stderr, "%d/%d: Couldn't read remote address [%d]\n", + n, (int) sizeof msg, i); + goto out; + } + n += r; + } + + sscanf(msg, "%x:%x:%x:%s", &rem_dest[i].lid, &rem_dest[i].qpn, + &rem_dest[i].psn, gid); + wire_gid_to_gid(gid, &rem_dest[i].gid); + } + + if (pp_connect_ctx(ctx, ib_port, mtu, sl, my_dest, rem_dest, + sgid_idx)) { + fprintf(stderr, "Couldn't connect to remote QP\n"); + free(rem_dest); + rem_dest = NULL; + goto out; + } + + for (i = 0; i < MAX_QP; ++i) { + gid_to_wire_gid(&my_dest[i].gid, gid); + sprintf(msg, "%04x:%06x:%06x:%s", my_dest[i].lid, + my_dest[i].qpn, my_dest[i].psn, gid); + if (write(connfd, msg, sizeof msg) != sizeof msg) { + fprintf(stderr, "Couldn't send local address\n"); + free(rem_dest); + rem_dest = NULL; + goto out; + } + } + + if (read(connfd, msg, sizeof msg) != sizeof "done") { + perror("client write"); + free(rem_dest); + rem_dest = NULL; + goto out; + } + +out: + close(connfd); + return rem_dest; +} + +static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, + int num_qp, int rx_depth, int port, + int use_event) +{ + struct pingpong_context *ctx; + int i; + int access_flags = IBV_ACCESS_LOCAL_WRITE; + + ctx = calloc(1, sizeof *ctx); + if (!ctx) + return NULL; + + ctx->size = size; + ctx->send_flags = IBV_SEND_SIGNALED; + ctx->num_qp = num_qp; + ctx->rx_depth = rx_depth; + + ctx->buf = memalign(page_size, size); + if (!ctx->buf) { + fprintf(stderr, "Couldn't allocate work buf.\n"); + goto clean_ctx; + } + + memset(ctx->buf, 0, size); + + ctx->context = ibv_open_device(ib_dev); + if (!ctx->context) { + fprintf(stderr, "Couldn't get context for %s\n", + ibv_get_device_name(ib_dev)); + goto clean_buffer; + } + if (use_odp) { + struct ibv_device_attr_ex attrx; + const uint32_t rc_caps_mask = IBV_ODP_SUPPORT_SEND | + IBV_ODP_SUPPORT_SRQ_RECV; + + if (ibv_query_device_ex(ctx->context, NULL, &attrx)) { + fprintf(stderr, "Couldn't query device for its features\n"); + goto clean_device; + } + if (!(attrx.odp_caps.general_caps & IBV_ODP_SUPPORT) || + (attrx.odp_caps.per_transport_caps.rc_odp_caps & rc_caps_mask) != rc_caps_mask) { + fprintf(stderr, "The device isn't ODP capable or does not support RC send, receive and srq with ODP\n"); + goto clean_device; + } + access_flags |= IBV_ACCESS_ON_DEMAND; + } + + + if (use_event) { + ctx->channel = ibv_create_comp_channel(ctx->context); + if (!ctx->channel) { + fprintf(stderr, "Couldn't create completion channel\n"); + goto clean_device; + } + } else + ctx->channel = NULL; + + ctx->pd = ibv_alloc_pd(ctx->context); + if (!ctx->pd) { + fprintf(stderr, "Couldn't allocate PD\n"); + goto clean_comp_channel; + } + + ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, size, access_flags); + if (!ctx->mr) { + fprintf(stderr, "Couldn't register MR\n"); + goto clean_pd; + } + + ctx->cq = ibv_create_cq(ctx->context, rx_depth + num_qp, NULL, + ctx->channel, 0); + if (!ctx->cq) { + fprintf(stderr, "Couldn't create CQ\n"); + goto clean_mr; + } + + { + struct ibv_srq_init_attr attr = { + .attr = { + .max_wr = rx_depth, + .max_sge = 1 + } + }; + + ctx->srq = ibv_create_srq(ctx->pd, &attr); + if (!ctx->srq) { + fprintf(stderr, "Couldn't create SRQ\n"); + goto clean_cq; + } + } + + for (i = 0; i < num_qp; ++i) { + struct ibv_qp_attr attr; + struct ibv_qp_init_attr init_attr = { + .send_cq = ctx->cq, + .recv_cq = ctx->cq, + .srq = ctx->srq, + .cap = { + .max_send_wr = 1, + .max_send_sge = 1, + }, + .qp_type = IBV_QPT_RC + }; + + ctx->qp[i] = ibv_create_qp(ctx->pd, &init_attr); + 
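+		/* ibv_query_qp() below reports the caps actually granted;
+		 * if the device can inline the whole message, request
+		 * IBV_SEND_INLINE on sends */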
if (!ctx->qp[i]) { + fprintf(stderr, "Couldn't create QP[%d]\n", i); + goto clean_qps; + } + ibv_query_qp(ctx->qp[i], &attr, IBV_QP_CAP, &init_attr); + if (init_attr.cap.max_inline_data >= size) { + ctx->send_flags |= IBV_SEND_INLINE; + } + } + + for (i = 0; i < num_qp; ++i) { + struct ibv_qp_attr attr = { + .qp_state = IBV_QPS_INIT, + .pkey_index = 0, + .port_num = port, + .qp_access_flags = 0 + }; + + if (ibv_modify_qp(ctx->qp[i], &attr, + IBV_QP_STATE | + IBV_QP_PKEY_INDEX | + IBV_QP_PORT | + IBV_QP_ACCESS_FLAGS)) { + fprintf(stderr, "Failed to modify QP[%d] to INIT\n", i); + goto clean_qps_full; + } + } + + return ctx; + +clean_qps_full: + i = num_qp; + +clean_qps: + for (--i; i >= 0; --i) + ibv_destroy_qp(ctx->qp[i]); + + ibv_destroy_srq(ctx->srq); + +clean_cq: + ibv_destroy_cq(ctx->cq); + +clean_mr: + ibv_dereg_mr(ctx->mr); + +clean_pd: + ibv_dealloc_pd(ctx->pd); + +clean_comp_channel: + if (ctx->channel) + ibv_destroy_comp_channel(ctx->channel); + +clean_device: + ibv_close_device(ctx->context); + +clean_buffer: + free(ctx->buf); + +clean_ctx: + free(ctx); + + return NULL; +} + +static int pp_close_ctx(struct pingpong_context *ctx, int num_qp) +{ + int i; + + for (i = 0; i < num_qp; ++i) { + if (ibv_destroy_qp(ctx->qp[i])) { + fprintf(stderr, "Couldn't destroy QP[%d]\n", i); + return 1; + } + } + + if (ibv_destroy_srq(ctx->srq)) { + fprintf(stderr, "Couldn't destroy SRQ\n"); + return 1; + } + + if (ibv_destroy_cq(ctx->cq)) { + fprintf(stderr, "Couldn't destroy CQ\n"); + return 1; + } + + if (ibv_dereg_mr(ctx->mr)) { + fprintf(stderr, "Couldn't deregister MR\n"); + return 1; + } + + if (ibv_dealloc_pd(ctx->pd)) { + fprintf(stderr, "Couldn't deallocate PD\n"); + return 1; + } + + if (ctx->channel) { + if (ibv_destroy_comp_channel(ctx->channel)) { + fprintf(stderr, "Couldn't destroy completion channel\n"); + return 1; + } + } + + if (ibv_close_device(ctx->context)) { + fprintf(stderr, "Couldn't release context\n"); + return 1; + } + + free(ctx->buf); + free(ctx); + + return 0; +} + +static int pp_post_recv(struct pingpong_context *ctx, int n) +{ + struct ibv_sge list = { + .addr = (uintptr_t) ctx->buf, + .length = ctx->size, + .lkey = ctx->mr->lkey + }; + struct ibv_recv_wr wr = { + .wr_id = PINGPONG_RECV_WRID, + .sg_list = &list, + .num_sge = 1, + }; + struct ibv_recv_wr *bad_wr; + int i; + + for (i = 0; i < n; ++i) + if (ibv_post_srq_recv(ctx->srq, &wr, &bad_wr)) + break; + + return i; +} + +static int pp_post_send(struct pingpong_context *ctx, int qp_index) +{ + struct ibv_sge list = { + .addr = (uintptr_t) ctx->buf, + .length = ctx->size, + .lkey = ctx->mr->lkey + }; + struct ibv_send_wr wr = { + .wr_id = PINGPONG_SEND_WRID, + .sg_list = &list, + .num_sge = 1, + .opcode = IBV_WR_SEND, + .send_flags = ctx->send_flags, + }; + struct ibv_send_wr *bad_wr; + + return ibv_post_send(ctx->qp[qp_index], &wr, &bad_wr); +} + +static int find_qp(int qpn, struct pingpong_context *ctx, int num_qp) +{ + int i; + + for (i = 0; i < num_qp; ++i) + if (ctx->qp[i]->qp_num == qpn) + return i; + + return -1; +} + +static void usage(const char *argv0) +{ + printf("Usage:\n"); + printf(" %s start a server and wait for connection\n", argv0); + printf(" %s <host> connect to server at <host>\n", argv0); + printf("\n"); + printf("Options:\n"); + printf(" -p, --port=<port> listen on/connect to port <port> (default 18515)\n"); + printf(" -d, --ib-dev=<dev> use IB device <dev> (default first device found)\n"); + printf(" -i, --ib-port=<port> use port <port> of IB device (default 1)\n"); + printf(" -s, 
--size=<size> size of message to exchange (default 4096)\n"); + printf(" -m, --mtu=<size> path MTU (default 1024)\n"); + printf(" -q, --num-qp=<num> number of QPs to use (default 16)\n"); + printf(" -r, --rx-depth=<dep> number of receives to post at a time (default 500)\n"); + printf(" -n, --iters=<iters> number of exchanges per QP(default 1000)\n"); + printf(" -l, --sl=<sl> service level value\n"); + printf(" -e, --events sleep on CQ events (default poll)\n"); + printf(" -g, --gid-idx=<gid index> local port gid index\n"); + printf(" -o, --odp use on demand paging\n"); + printf(" -c, --chk validate received buffer\n"); +} + +int main(int argc, char *argv[]) +{ + struct ibv_device **dev_list; + struct ibv_device *ib_dev; + struct ibv_wc *wc; + struct pingpong_context *ctx; + struct pingpong_dest my_dest[MAX_QP]; + struct pingpong_dest *rem_dest; + struct timeval start, end; + char *ib_devname = NULL; + char *servername = NULL; + unsigned int port = 18515; + int ib_port = 1; + unsigned int size = 4096; + enum ibv_mtu mtu = IBV_MTU_1024; + unsigned int num_qp = 16; + unsigned int rx_depth = 500; + unsigned int iters = 1000; + int use_event = 0; + int routs; + int rcnt, scnt; + int num_wc; + int i; + int num_cq_events = 0; + int sl = 0; + int gidx = -1; + char gid[33]; + + srand48(getpid() * time(NULL)); + + while (1) { + int c; + + static struct option long_options[] = { + { .name = "port", .has_arg = 1, .val = 'p' }, + { .name = "ib-dev", .has_arg = 1, .val = 'd' }, + { .name = "ib-port", .has_arg = 1, .val = 'i' }, + { .name = "size", .has_arg = 1, .val = 's' }, + { .name = "mtu", .has_arg = 1, .val = 'm' }, + { .name = "num-qp", .has_arg = 1, .val = 'q' }, + { .name = "rx-depth", .has_arg = 1, .val = 'r' }, + { .name = "iters", .has_arg = 1, .val = 'n' }, + { .name = "sl", .has_arg = 1, .val = 'l' }, + { .name = "events", .has_arg = 0, .val = 'e' }, + { .name = "odp", .has_arg = 0, .val = 'o' }, + { .name = "gid-idx", .has_arg = 1, .val = 'g' }, + { .name = "chk", .has_arg = 0, .val = 'c' }, + {} + }; + + c = getopt_long(argc, argv, "p:d:i:s:m:q:r:n:l:eog:c", + long_options, NULL); + if (c == -1) + break; + + switch (c) { + case 'p': + port = strtoul(optarg, NULL, 0); + if (port > 65535) { + usage(argv[0]); + return 1; + } + break; + + case 'd': + ib_devname = strdupa(optarg); + break; + + case 'i': + ib_port = strtol(optarg, NULL, 0); + if (ib_port < 1) { + usage(argv[0]); + return 1; + } + break; + + case 's': + size = strtoul(optarg, NULL, 0); + if (size < 1) { + usage(argv[0]); + return 1; + } + break; + + case 'm': + mtu = pp_mtu_to_enum(strtol(optarg, NULL, 0)); + if (mtu == 0) { + usage(argv[0]); + return 1; + } + break; + + case 'q': + num_qp = strtoul(optarg, NULL, 0); + break; + + case 'r': + rx_depth = strtoul(optarg, NULL, 0); + break; + + case 'n': + iters = strtoul(optarg, NULL, 0); + break; + + case 'l': + sl = strtol(optarg, NULL, 0); + break; + + case 'e': + ++use_event; + break; + + case 'g': + gidx = strtol(optarg, NULL, 0); + break; + + case 'o': + use_odp = 1; + break; + + case 'c': + validate_buf = 1; + break; + + default: + usage(argv[0]); + return 1; + } + } + + if (optind == argc - 1) + servername = strdupa(argv[optind]); + else if (optind < argc) { + usage(argv[0]); + return 1; + } + + if (num_qp > rx_depth) { + fprintf(stderr, "rx_depth %d is too small for %d QPs -- " + "must have at least one receive per QP.\n", + rx_depth, num_qp); + return 1; + } + + if (num_qp >= MAX_QP) { + fprintf(stderr, "num_qp %d must be less than %d\n", num_qp, + MAX_QP - 1); + return 
1; + } + + num_wc = num_qp + rx_depth; + wc = alloca(num_wc * sizeof *wc); + + page_size = sysconf(_SC_PAGESIZE); + + dev_list = ibv_get_device_list(NULL); + if (!dev_list) { + perror("Failed to get IB devices list"); + return 1; + } + + if (!ib_devname) { + ib_dev = *dev_list; + if (!ib_dev) { + fprintf(stderr, "No IB devices found\n"); + return 1; + } + } else { + for (i = 0; dev_list[i]; ++i) + if (!strcmp(ibv_get_device_name(dev_list[i]), ib_devname)) + break; + ib_dev = dev_list[i]; + if (!ib_dev) { + fprintf(stderr, "IB device %s not found\n", ib_devname); + return 1; + } + } + + ctx = pp_init_ctx(ib_dev, size, num_qp, rx_depth, ib_port, use_event); + if (!ctx) + return 1; + + routs = pp_post_recv(ctx, ctx->rx_depth); + if (routs < ctx->rx_depth) { + fprintf(stderr, "Couldn't post receive (%d)\n", routs); + return 1; + } + + if (use_event) + if (ibv_req_notify_cq(ctx->cq, 0)) { + fprintf(stderr, "Couldn't request CQ notification\n"); + return 1; + } + + memset(my_dest, 0, sizeof my_dest); + + if (pp_get_port_info(ctx->context, ib_port, &ctx->portinfo)) { + fprintf(stderr, "Couldn't get port info\n"); + return 1; + } + for (i = 0; i < num_qp; ++i) { + my_dest[i].qpn = ctx->qp[i]->qp_num; + my_dest[i].psn = lrand48() & 0xffffff; + my_dest[i].lid = ctx->portinfo.lid; + if (ctx->portinfo.link_layer != IBV_LINK_LAYER_ETHERNET + && !my_dest[i].lid) { + fprintf(stderr, "Couldn't get local LID\n"); + return 1; + } + + if (gidx >= 0) { + if (ibv_query_gid(ctx->context, ib_port, gidx, + &my_dest[i].gid)) { + fprintf(stderr, "Could not get local gid for " + "gid index %d\n", gidx); + return 1; + } + } else + memset(&my_dest[i].gid, 0, sizeof my_dest[i].gid); + + inet_ntop(AF_INET6, &my_dest[i].gid, gid, sizeof gid); + printf(" local address: LID 0x%04x, QPN 0x%06x, PSN 0x%06x, " + "GID %s\n", my_dest[i].lid, my_dest[i].qpn, + my_dest[i].psn, gid); + } + + if (servername) + rem_dest = pp_client_exch_dest(servername, port, my_dest); + else + rem_dest = pp_server_exch_dest(ctx, ib_port, mtu, port, sl, + my_dest, gidx); + + if (!rem_dest) + return 1; + + inet_ntop(AF_INET6, &rem_dest->gid, gid, sizeof gid); + + for (i = 0; i < num_qp; ++i) { + inet_ntop(AF_INET6, &rem_dest[i].gid, gid, sizeof gid); + printf(" remote address: LID 0x%04x, QPN 0x%06x, PSN 0x%06x, " + "GID %s\n", rem_dest[i].lid, rem_dest[i].qpn, + rem_dest[i].psn, gid); + } + + if (servername) + if (pp_connect_ctx(ctx, ib_port, mtu, sl, my_dest, rem_dest, + gidx)) + return 1; + + if (servername) { + if (validate_buf) + for (i = 0; i < size; i += page_size) + ctx->buf[i] = i / page_size % sizeof(char); + + for (i = 0; i < num_qp; ++i) { + if (pp_post_send(ctx, i)) { + fprintf(stderr, "Couldn't post send\n"); + return 1; + } + ctx->pending[i] = PINGPONG_SEND_WRID | PINGPONG_RECV_WRID; + } + } else + for (i = 0; i < num_qp; ++i) + ctx->pending[i] = PINGPONG_RECV_WRID; + + if (gettimeofday(&start, NULL)) { + perror("gettimeofday"); + return 1; + } + + rcnt = scnt = 0; + while (rcnt < iters || scnt < iters) { + if (use_event) { + struct ibv_cq *ev_cq; + void *ev_ctx; + + if (ibv_get_cq_event(ctx->channel, &ev_cq, &ev_ctx)) { + fprintf(stderr, "Failed to get cq_event\n"); + return 1; + } + + ++num_cq_events; + + if (ev_cq != ctx->cq) { + fprintf(stderr, "CQ event for unknown CQ %p\n", ev_cq); + return 1; + } + + if (ibv_req_notify_cq(ctx->cq, 0)) { + fprintf(stderr, "Couldn't request CQ notification\n"); + return 1; + } + } + + { + int ne, qp_ind; + + do { + ne = ibv_poll_cq(ctx->cq, num_wc, wc); + if (ne < 0) { + fprintf(stderr, "poll CQ 
failed %d\n", ne); + return 1; + } + } while (!use_event && ne < 1); + + for (i = 0; i < ne; ++i) { + if (wc[i].status != IBV_WC_SUCCESS) { + fprintf(stderr, "Failed status %s (%d) for wr_id %d\n", + ibv_wc_status_str(wc[i].status), + wc[i].status, (int) wc[i].wr_id); + return 1; + } + + qp_ind = find_qp(wc[i].qp_num, ctx, num_qp); + if (qp_ind < 0) { + fprintf(stderr, "Couldn't find QPN %06x\n", + wc[i].qp_num); + return 1; + } + + switch ((int) wc[i].wr_id) { + case PINGPONG_SEND_WRID: + ++scnt; + break; + + case PINGPONG_RECV_WRID: + if (--routs <= num_qp) { + routs += pp_post_recv(ctx, ctx->rx_depth - routs); + if (routs < ctx->rx_depth) { + fprintf(stderr, + "Couldn't post receive (%d)\n", + routs); + return 1; + } + } + + ++rcnt; + break; + + default: + fprintf(stderr, "Completion for unknown wr_id %d\n", + (int) wc[i].wr_id); + return 1; + } + + ctx->pending[qp_ind] &= ~(int) wc[i].wr_id; + if (scnt < iters && !ctx->pending[qp_ind]) { + if (pp_post_send(ctx, qp_ind)) { + fprintf(stderr, "Couldn't post send\n"); + return 1; + } + ctx->pending[qp_ind] = PINGPONG_RECV_WRID | + PINGPONG_SEND_WRID; + } + + } + } + } + + if (gettimeofday(&end, NULL)) { + perror("gettimeofday"); + return 1; + } + + { + float usec = (end.tv_sec - start.tv_sec) * 1000000 + + (end.tv_usec - start.tv_usec); + long long bytes = (long long) size * iters * 2; + + printf("%lld bytes in %.2f seconds = %.2f Mbit/sec\n", + bytes, usec / 1000000., bytes * 8. / usec); + printf("%d iters in %.2f seconds = %.2f usec/iter\n", + iters, usec / 1000000., usec / iters); + + if ((!servername) && (validate_buf)) { + for (i = 0; i < size; i += page_size) + if (ctx->buf[i] != i / page_size % sizeof(char)) + printf("invalid data in page %d\n", + i / page_size); + } + } + + ibv_ack_cq_events(ctx->cq, num_cq_events); + + if (pp_close_ctx(ctx, num_qp)) + return 1; + + ibv_free_device_list(dev_list); + free(rem_dest); + + return 0; +} diff --git a/libibverbs/examples/uc_pingpong.c b/libibverbs/examples/uc_pingpong.c new file mode 100644 index 0000000..16202d9 --- /dev/null +++ b/libibverbs/examples/uc_pingpong.c @@ -0,0 +1,872 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#define _GNU_SOURCE +#include <config.h> + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <netdb.h> +#include <malloc.h> +#include <getopt.h> +#include <arpa/inet.h> +#include <time.h> + +#include "pingpong.h" + +enum { + PINGPONG_RECV_WRID = 1, + PINGPONG_SEND_WRID = 2, +}; + +static int page_size; +static int validate_buf; + +struct pingpong_context { + struct ibv_context *context; + struct ibv_comp_channel *channel; + struct ibv_pd *pd; + struct ibv_mr *mr; + struct ibv_cq *cq; + struct ibv_qp *qp; + char *buf; + int size; + int send_flags; + int rx_depth; + int pending; + struct ibv_port_attr portinfo; +}; + +struct pingpong_dest { + int lid; + int qpn; + int psn; + union ibv_gid gid; +}; + +static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn, + enum ibv_mtu mtu, int sl, + struct pingpong_dest *dest, int sgid_idx) +{ + struct ibv_qp_attr attr = { + .qp_state = IBV_QPS_RTR, + .path_mtu = mtu, + .dest_qp_num = dest->qpn, + .rq_psn = dest->psn, + .ah_attr = { + .is_global = 0, + .dlid = dest->lid, + .sl = sl, + .src_path_bits = 0, + .port_num = port + } + }; + + if (dest->gid.global.interface_id) { + attr.ah_attr.is_global = 1; + attr.ah_attr.grh.hop_limit = 1; + attr.ah_attr.grh.dgid = dest->gid; + attr.ah_attr.grh.sgid_index = sgid_idx; + } + if (ibv_modify_qp(ctx->qp, &attr, + IBV_QP_STATE | + IBV_QP_AV | + IBV_QP_PATH_MTU | + IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN)) { + fprintf(stderr, "Failed to modify QP to RTR\n"); + return 1; + } + + attr.qp_state = IBV_QPS_RTS; + attr.sq_psn = my_psn; + if (ibv_modify_qp(ctx->qp, &attr, + IBV_QP_STATE | + IBV_QP_SQ_PSN)) { + fprintf(stderr, "Failed to modify QP to RTS\n"); + return 1; + } + + return 0; +} + +static struct pingpong_dest *pp_client_exch_dest(const char *servername, int port, + const struct pingpong_dest *my_dest) +{ + struct addrinfo *res, *t; + struct addrinfo hints = { + .ai_family = AF_UNSPEC, + .ai_socktype = SOCK_STREAM + }; + char *service; + char msg[sizeof "0000:000000:000000:00000000000000000000000000000000"]; + int n; + int sockfd = -1; + struct pingpong_dest *rem_dest = NULL; + char gid[33]; + + if (asprintf(&service, "%d", port) < 0) + return NULL; + + n = getaddrinfo(servername, service, &hints, &res); + + if (n < 0) { + fprintf(stderr, "%s for %s:%d\n", gai_strerror(n), servername, port); + free(service); + return NULL; + } + + for (t = res; t; t = t->ai_next) { + sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); + if (sockfd >= 0) { + if (!connect(sockfd, t->ai_addr, t->ai_addrlen)) + break; + close(sockfd); + sockfd = -1; + } + } + + freeaddrinfo(res); + free(service); + + if (sockfd < 0) { + fprintf(stderr, "Couldn't connect to %s:%d\n", servername, port); + return NULL; + } + + gid_to_wire_gid(&my_dest->gid, gid); + sprintf(msg, "%04x:%06x:%06x:%s", my_dest->lid, my_dest->qpn, + my_dest->psn, gid); + if (write(sockfd, msg, sizeof msg) != sizeof msg) { + fprintf(stderr, "Couldn't send local address\n"); + goto out; + } + + if (read(sockfd, msg, sizeof msg) != sizeof msg || + write(sockfd, "done", sizeof "done") != sizeof "done") { + perror("client read/write"); + fprintf(stderr, "Couldn't read/write remote address\n"); + goto out; + } + + + rem_dest = malloc(sizeof *rem_dest); + if (!rem_dest) + goto out; + + sscanf(msg, "%x:%x:%x:%s", &rem_dest->lid, &rem_dest->qpn, + &rem_dest->psn, gid); + wire_gid_to_gid(gid, &rem_dest->gid); + +out: + close(sockfd); + 
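+	/* rem_dest is still NULL here if any step of the exchange failed */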
return rem_dest; +} + +static struct pingpong_dest *pp_server_exch_dest(struct pingpong_context *ctx, + int ib_port, enum ibv_mtu mtu, + int port, int sl, + const struct pingpong_dest *my_dest, + int sgid_idx) +{ + struct addrinfo *res, *t; + struct addrinfo hints = { + .ai_flags = AI_PASSIVE, + .ai_family = AF_UNSPEC, + .ai_socktype = SOCK_STREAM + }; + char *service; + char msg[sizeof "0000:000000:000000:00000000000000000000000000000000"]; + int n; + int sockfd = -1, connfd; + struct pingpong_dest *rem_dest = NULL; + char gid[33]; + + if (asprintf(&service, "%d", port) < 0) + return NULL; + + n = getaddrinfo(NULL, service, &hints, &res); + + if (n < 0) { + fprintf(stderr, "%s for port %d\n", gai_strerror(n), port); + free(service); + return NULL; + } + + for (t = res; t; t = t->ai_next) { + sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); + if (sockfd >= 0) { + n = 1; + + setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof n); + + if (!bind(sockfd, t->ai_addr, t->ai_addrlen)) + break; + close(sockfd); + sockfd = -1; + } + } + + freeaddrinfo(res); + free(service); + + if (sockfd < 0) { + fprintf(stderr, "Couldn't listen to port %d\n", port); + return NULL; + } + + listen(sockfd, 1); + connfd = accept(sockfd, NULL, NULL); + close(sockfd); + if (connfd < 0) { + fprintf(stderr, "accept() failed\n"); + return NULL; + } + + n = read(connfd, msg, sizeof msg); + if (n != sizeof msg) { + perror("server read"); + fprintf(stderr, "%d/%d: Couldn't read remote address\n", n, (int) sizeof msg); + goto out; + } + + rem_dest = malloc(sizeof *rem_dest); + if (!rem_dest) + goto out; + + sscanf(msg, "%x:%x:%x:%s", &rem_dest->lid, &rem_dest->qpn, + &rem_dest->psn, gid); + wire_gid_to_gid(gid, &rem_dest->gid); + + if (pp_connect_ctx(ctx, ib_port, my_dest->psn, mtu, sl, rem_dest, + sgid_idx)) { + fprintf(stderr, "Couldn't connect to remote QP\n"); + free(rem_dest); + rem_dest = NULL; + goto out; + } + + + gid_to_wire_gid(&my_dest->gid, gid); + sprintf(msg, "%04x:%06x:%06x:%s", my_dest->lid, my_dest->qpn, + my_dest->psn, gid); + if (write(connfd, msg, sizeof msg) != sizeof msg || + read(connfd, msg, sizeof msg) != sizeof "done") { + fprintf(stderr, "Couldn't send/recv local address\n"); + free(rem_dest); + rem_dest = NULL; + goto out; + } + +out: + close(connfd); + return rem_dest; +} + +static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, + int rx_depth, int port, + int use_event) +{ + struct pingpong_context *ctx; + + ctx = calloc(1, sizeof *ctx); + if (!ctx) + return NULL; + + ctx->size = size; + ctx->send_flags = IBV_SEND_SIGNALED; + ctx->rx_depth = rx_depth; + + ctx->buf = memalign(page_size, size); + if (!ctx->buf) { + fprintf(stderr, "Couldn't allocate work buf.\n"); + goto clean_ctx; + } + + /* FIXME memset(ctx->buf, 0, size); */ + memset(ctx->buf, 0x7b, size); + + ctx->context = ibv_open_device(ib_dev); + if (!ctx->context) { + fprintf(stderr, "Couldn't get context for %s\n", + ibv_get_device_name(ib_dev)); + goto clean_buffer; + } + + if (use_event) { + ctx->channel = ibv_create_comp_channel(ctx->context); + if (!ctx->channel) { + fprintf(stderr, "Couldn't create completion channel\n"); + goto clean_device; + } + } else + ctx->channel = NULL; + + ctx->pd = ibv_alloc_pd(ctx->context); + if (!ctx->pd) { + fprintf(stderr, "Couldn't allocate PD\n"); + goto clean_comp_channel; + } + + ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, size, IBV_ACCESS_LOCAL_WRITE); + if (!ctx->mr) { + fprintf(stderr, "Couldn't register MR\n"); + goto clean_pd; + } + + ctx->cq = 
ibv_create_cq(ctx->context, rx_depth + 1, NULL, + ctx->channel, 0); + if (!ctx->cq) { + fprintf(stderr, "Couldn't create CQ\n"); + goto clean_mr; + } + + { + struct ibv_qp_attr attr; + struct ibv_qp_init_attr init_attr = { + .send_cq = ctx->cq, + .recv_cq = ctx->cq, + .cap = { + .max_send_wr = 1, + .max_recv_wr = rx_depth, + .max_send_sge = 1, + .max_recv_sge = 1 + }, + .qp_type = IBV_QPT_UC + }; + + ctx->qp = ibv_create_qp(ctx->pd, &init_attr); + if (!ctx->qp) { + fprintf(stderr, "Couldn't create QP\n"); + goto clean_cq; + } + ibv_query_qp(ctx->qp, &attr, IBV_QP_CAP, &init_attr); + if (init_attr.cap.max_inline_data >= size) { + ctx->send_flags |= IBV_SEND_INLINE; + } + } + + { + struct ibv_qp_attr attr = { + .qp_state = IBV_QPS_INIT, + .pkey_index = 0, + .port_num = port, + .qp_access_flags = 0 + }; + + if (ibv_modify_qp(ctx->qp, &attr, + IBV_QP_STATE | + IBV_QP_PKEY_INDEX | + IBV_QP_PORT | + IBV_QP_ACCESS_FLAGS)) { + fprintf(stderr, "Failed to modify QP to INIT\n"); + goto clean_qp; + } + } + + return ctx; + +clean_qp: + ibv_destroy_qp(ctx->qp); + +clean_cq: + ibv_destroy_cq(ctx->cq); + +clean_mr: + ibv_dereg_mr(ctx->mr); + +clean_pd: + ibv_dealloc_pd(ctx->pd); + +clean_comp_channel: + if (ctx->channel) + ibv_destroy_comp_channel(ctx->channel); + +clean_device: + ibv_close_device(ctx->context); + +clean_buffer: + free(ctx->buf); + +clean_ctx: + free(ctx); + + return NULL; +} + +static int pp_close_ctx(struct pingpong_context *ctx) +{ + if (ibv_destroy_qp(ctx->qp)) { + fprintf(stderr, "Couldn't destroy QP\n"); + return 1; + } + + if (ibv_destroy_cq(ctx->cq)) { + fprintf(stderr, "Couldn't destroy CQ\n"); + return 1; + } + + if (ibv_dereg_mr(ctx->mr)) { + fprintf(stderr, "Couldn't deregister MR\n"); + return 1; + } + + if (ibv_dealloc_pd(ctx->pd)) { + fprintf(stderr, "Couldn't deallocate PD\n"); + return 1; + } + + if (ctx->channel) { + if (ibv_destroy_comp_channel(ctx->channel)) { + fprintf(stderr, "Couldn't destroy completion channel\n"); + return 1; + } + } + + if (ibv_close_device(ctx->context)) { + fprintf(stderr, "Couldn't release context\n"); + return 1; + } + + free(ctx->buf); + free(ctx); + + return 0; +} + +static int pp_post_recv(struct pingpong_context *ctx, int n) +{ + struct ibv_sge list = { + .addr = (uintptr_t) ctx->buf, + .length = ctx->size, + .lkey = ctx->mr->lkey + }; + struct ibv_recv_wr wr = { + .wr_id = PINGPONG_RECV_WRID, + .sg_list = &list, + .num_sge = 1, + }; + struct ibv_recv_wr *bad_wr; + int i; + + for (i = 0; i < n; ++i) + if (ibv_post_recv(ctx->qp, &wr, &bad_wr)) + break; + + return i; +} + +static int pp_post_send(struct pingpong_context *ctx) +{ + struct ibv_sge list = { + .addr = (uintptr_t) ctx->buf, + .length = ctx->size, + .lkey = ctx->mr->lkey + }; + struct ibv_send_wr wr = { + .wr_id = PINGPONG_SEND_WRID, + .sg_list = &list, + .num_sge = 1, + .opcode = IBV_WR_SEND, + .send_flags = ctx->send_flags, + }; + struct ibv_send_wr *bad_wr; + + return ibv_post_send(ctx->qp, &wr, &bad_wr); +} + +static void usage(const char *argv0) +{ + printf("Usage:\n"); + printf(" %s start a server and wait for connection\n", argv0); + printf(" %s <host> connect to server at <host>\n", argv0); + printf("\n"); + printf("Options:\n"); + printf(" -p, --port=<port> listen on/connect to port <port> (default 18515)\n"); + printf(" -d, --ib-dev=<dev> use IB device <dev> (default first device found)\n"); + printf(" -i, --ib-port=<port> use port <port> of IB device (default 1)\n"); + printf(" -s, --size=<size> size of message to exchange (default 4096)\n"); + printf(" -m, 
--mtu=<size> path MTU (default 1024)\n"); + printf(" -r, --rx-depth=<dep> number of receives to post at a time (default 500)\n"); + printf(" -n, --iters=<iters> number of exchanges (default 1000)\n"); + printf(" -l, --sl=<sl> service level value\n"); + printf(" -e, --events sleep on CQ events (default poll)\n"); + printf(" -g, --gid-idx=<gid index> local port gid index\n"); + printf(" -c, --chk validate received buffer\n"); +} + +int main(int argc, char *argv[]) +{ + struct ibv_device **dev_list; + struct ibv_device *ib_dev; + struct pingpong_context *ctx; + struct pingpong_dest my_dest; + struct pingpong_dest *rem_dest; + struct timeval start, end; + char *ib_devname = NULL; + char *servername = NULL; + unsigned int port = 18515; + int ib_port = 1; + unsigned int size = 4096; + enum ibv_mtu mtu = IBV_MTU_1024; + unsigned int rx_depth = 500; + unsigned int iters = 1000; + int use_event = 0; + int routs; + int rcnt, scnt; + int num_cq_events = 0; + int sl = 0; + int gidx = -1; + char gid[33]; + + srand48(getpid() * time(NULL)); + + while (1) { + int c; + + static struct option long_options[] = { + { .name = "port", .has_arg = 1, .val = 'p' }, + { .name = "ib-dev", .has_arg = 1, .val = 'd' }, + { .name = "ib-port", .has_arg = 1, .val = 'i' }, + { .name = "size", .has_arg = 1, .val = 's' }, + { .name = "mtu", .has_arg = 1, .val = 'm' }, + { .name = "rx-depth", .has_arg = 1, .val = 'r' }, + { .name = "iters", .has_arg = 1, .val = 'n' }, + { .name = "sl", .has_arg = 1, .val = 'l' }, + { .name = "events", .has_arg = 0, .val = 'e' }, + { .name = "gid-idx", .has_arg = 1, .val = 'g' }, + { .name = "chk", .has_arg = 0, .val = 'c' }, + {} + }; + + c = getopt_long(argc, argv, "p:d:i:s:m:r:n:l:eg:c", + long_options, NULL); + if (c == -1) + break; + + switch (c) { + case 'p': + port = strtoul(optarg, NULL, 0); + if (port > 65535) { + usage(argv[0]); + return 1; + } + break; + + case 'd': + ib_devname = strdupa(optarg); + break; + + case 'i': + ib_port = strtol(optarg, NULL, 0); + if (ib_port < 1) { + usage(argv[0]); + return 1; + } + break; + + case 's': + size = strtoul(optarg, NULL, 0); + break; + + case 'm': + mtu = pp_mtu_to_enum(strtol(optarg, NULL, 0)); + if (mtu == 0) { + usage(argv[0]); + return 1; + } + break; + + case 'r': + rx_depth = strtoul(optarg, NULL, 0); + break; + + case 'n': + iters = strtoul(optarg, NULL, 0); + break; + + case 'l': + sl = strtol(optarg, NULL, 0); + break; + + case 'e': + ++use_event; + break; + + case 'g': + gidx = strtol(optarg, NULL, 0); + break; + + case 'c': + validate_buf = 1; + break; + + default: + usage(argv[0]); + return 1; + } + } + + if (optind == argc - 1) + servername = strdupa(argv[optind]); + else if (optind < argc) { + usage(argv[0]); + return 1; + } + + page_size = sysconf(_SC_PAGESIZE); + + dev_list = ibv_get_device_list(NULL); + if (!dev_list) { + perror("Failed to get IB devices list"); + return 1; + } + + if (!ib_devname) { + ib_dev = *dev_list; + if (!ib_dev) { + fprintf(stderr, "No IB devices found\n"); + return 1; + } + } else { + int i; + for (i = 0; dev_list[i]; ++i) + if (!strcmp(ibv_get_device_name(dev_list[i]), ib_devname)) + break; + ib_dev = dev_list[i]; + if (!ib_dev) { + fprintf(stderr, "IB device %s not found\n", ib_devname); + return 1; + } + } + + ctx = pp_init_ctx(ib_dev, size, rx_depth, ib_port, use_event); + if (!ctx) + return 1; + + routs = pp_post_recv(ctx, ctx->rx_depth); + if (routs < ctx->rx_depth) { + fprintf(stderr, "Couldn't post receive (%d)\n", routs); + return 1; + } + + if (use_event) + if (ibv_req_notify_cq(ctx->cq, 
0)) { + fprintf(stderr, "Couldn't request CQ notification\n"); + return 1; + } + + + if (pp_get_port_info(ctx->context, ib_port, &ctx->portinfo)) { + fprintf(stderr, "Couldn't get port info\n"); + return 1; + } + + my_dest.lid = ctx->portinfo.lid; + if (ctx->portinfo.link_layer != IBV_LINK_LAYER_ETHERNET && + !my_dest.lid) { + fprintf(stderr, "Couldn't get local LID\n"); + return 1; + } + + if (gidx >= 0) { + if (ibv_query_gid(ctx->context, ib_port, gidx, &my_dest.gid)) { + fprintf(stderr, "can't read sgid of index %d\n", gidx); + return 1; + } + } else + memset(&my_dest.gid, 0, sizeof my_dest.gid); + + my_dest.qpn = ctx->qp->qp_num; + my_dest.psn = lrand48() & 0xffffff; + inet_ntop(AF_INET6, &my_dest.gid, gid, sizeof gid); + printf(" local address: LID 0x%04x, QPN 0x%06x, PSN 0x%06x, GID %s\n", + my_dest.lid, my_dest.qpn, my_dest.psn, gid); + + + if (servername) + rem_dest = pp_client_exch_dest(servername, port, &my_dest); + else + rem_dest = pp_server_exch_dest(ctx, ib_port, mtu, port, sl, + &my_dest, gidx); + + if (!rem_dest) + return 1; + + inet_ntop(AF_INET6, &rem_dest->gid, gid, sizeof gid); + printf(" remote address: LID 0x%04x, QPN 0x%06x, PSN 0x%06x, GID %s\n", + rem_dest->lid, rem_dest->qpn, rem_dest->psn, gid); + + if (servername) + if (pp_connect_ctx(ctx, ib_port, my_dest.psn, mtu, sl, rem_dest, + gidx)) + return 1; + + ctx->pending = PINGPONG_RECV_WRID; + + if (servername) { + if (validate_buf) + for (int i = 0; i < size; i += page_size) + ctx->buf[i] = i / page_size % sizeof(char); + + if (pp_post_send(ctx)) { + fprintf(stderr, "Couldn't post send\n"); + return 1; + } + ctx->pending |= PINGPONG_SEND_WRID; + } + + if (gettimeofday(&start, NULL)) { + perror("gettimeofday"); + return 1; + } + + rcnt = scnt = 0; + while (rcnt < iters || scnt < iters) { + if (use_event) { + struct ibv_cq *ev_cq; + void *ev_ctx; + + if (ibv_get_cq_event(ctx->channel, &ev_cq, &ev_ctx)) { + fprintf(stderr, "Failed to get cq_event\n"); + return 1; + } + + ++num_cq_events; + + if (ev_cq != ctx->cq) { + fprintf(stderr, "CQ event for unknown CQ %p\n", ev_cq); + return 1; + } + + if (ibv_req_notify_cq(ctx->cq, 0)) { + fprintf(stderr, "Couldn't request CQ notification\n"); + return 1; + } + } + + { + struct ibv_wc wc[2]; + int ne, i; + + do { + ne = ibv_poll_cq(ctx->cq, 2, wc); + if (ne < 0) { + fprintf(stderr, "poll CQ failed %d\n", ne); + return 1; + } + + } while (!use_event && ne < 1); + + for (i = 0; i < ne; ++i) { + if (wc[i].status != IBV_WC_SUCCESS) { + fprintf(stderr, "Failed status %s (%d) for wr_id %d\n", + ibv_wc_status_str(wc[i].status), + wc[i].status, (int) wc[i].wr_id); + return 1; + } + + switch ((int) wc[i].wr_id) { + case PINGPONG_SEND_WRID: + ++scnt; + break; + + case PINGPONG_RECV_WRID: + if (--routs <= 1) { + routs += pp_post_recv(ctx, ctx->rx_depth - routs); + if (routs < ctx->rx_depth) { + fprintf(stderr, + "Couldn't post receive (%d)\n", + routs); + return 1; + } + } + + ++rcnt; + break; + + default: + fprintf(stderr, "Completion for unknown wr_id %d\n", + (int) wc[i].wr_id); + return 1; + } + + ctx->pending &= ~(int) wc[i].wr_id; + if (scnt < iters && !ctx->pending) { + if (pp_post_send(ctx)) { + fprintf(stderr, "Couldn't post send\n"); + return 1; + } + ctx->pending = PINGPONG_RECV_WRID | + PINGPONG_SEND_WRID; + } + } + } + } + + if (gettimeofday(&end, NULL)) { + perror("gettimeofday"); + return 1; + } + + { + float usec = (end.tv_sec - start.tv_sec) * 1000000 + + (end.tv_usec - start.tv_usec); + long long bytes = (long long) size * iters * 2; + + printf("%lld bytes in %.2f 
seconds = %.2f Mbit/sec\n", + bytes, usec / 1000000., bytes * 8. / usec); + printf("%d iters in %.2f seconds = %.2f usec/iter\n", + iters, usec / 1000000., usec / iters); + + if ((!servername) && (validate_buf)) { + for (int i = 0; i < size; i += page_size) + if (ctx->buf[i] != i / page_size % sizeof(char)) + printf("invalid data in page %d\n", + i / page_size); + } + } + + ibv_ack_cq_events(ctx->cq, num_cq_events); + + if (pp_close_ctx(ctx)) + return 1; + + ibv_free_device_list(dev_list); + free(rem_dest); + + return 0; +} diff --git a/libibverbs/examples/ud_pingpong.c b/libibverbs/examples/ud_pingpong.c new file mode 100644 index 0000000..4b0e8af --- /dev/null +++ b/libibverbs/examples/ud_pingpong.c @@ -0,0 +1,879 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#define _GNU_SOURCE +#include <config.h> + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <netdb.h> +#include <malloc.h> +#include <getopt.h> +#include <arpa/inet.h> +#include <time.h> + +#include "pingpong.h" + +enum { + PINGPONG_RECV_WRID = 1, + PINGPONG_SEND_WRID = 2, +}; + +static int page_size; +static int validate_buf; + +struct pingpong_context { + struct ibv_context *context; + struct ibv_comp_channel *channel; + struct ibv_pd *pd; + struct ibv_mr *mr; + struct ibv_cq *cq; + struct ibv_qp *qp; + struct ibv_ah *ah; + char *buf; + int size; + int send_flags; + int rx_depth; + int pending; + struct ibv_port_attr portinfo; +}; + +struct pingpong_dest { + int lid; + int qpn; + int psn; + union ibv_gid gid; +}; + +static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn, + int sl, struct pingpong_dest *dest, int sgid_idx) +{ + struct ibv_ah_attr ah_attr = { + .is_global = 0, + .dlid = dest->lid, + .sl = sl, + .src_path_bits = 0, + .port_num = port + }; + struct ibv_qp_attr attr = { + .qp_state = IBV_QPS_RTR + }; + + if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE)) { + fprintf(stderr, "Failed to modify QP to RTR\n"); + return 1; + } + + attr.qp_state = IBV_QPS_RTS; + attr.sq_psn = my_psn; + + if (ibv_modify_qp(ctx->qp, &attr, + IBV_QP_STATE | + IBV_QP_SQ_PSN)) { + fprintf(stderr, "Failed to modify QP to RTS\n"); + return 1; + } + + if (dest->gid.global.interface_id) { + ah_attr.is_global = 1; + ah_attr.grh.hop_limit = 1; + ah_attr.grh.dgid = dest->gid; + ah_attr.grh.sgid_index = sgid_idx; + } + + ctx->ah = ibv_create_ah(ctx->pd, &ah_attr); + if (!ctx->ah) { + fprintf(stderr, "Failed to create AH\n"); + return 1; + } + + return 0; +} + +static struct pingpong_dest *pp_client_exch_dest(const char *servername, int port, + const struct pingpong_dest *my_dest) +{ + struct addrinfo *res, *t; + struct addrinfo hints = { + .ai_family = AF_UNSPEC, + .ai_socktype = SOCK_STREAM + }; + char *service; + char msg[sizeof "0000:000000:000000:00000000000000000000000000000000"]; + int n; + int sockfd = -1; + struct pingpong_dest *rem_dest = NULL; + char gid[33]; + + if (asprintf(&service, "%d", port) < 0) + return NULL; + + n = getaddrinfo(servername, service, &hints, &res); + + if (n < 0) { + fprintf(stderr, "%s for %s:%d\n", gai_strerror(n), servername, port); + free(service); + return NULL; + } + + for (t = res; t; t = t->ai_next) { + sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); + if (sockfd >= 0) { + if (!connect(sockfd, t->ai_addr, t->ai_addrlen)) + break; + close(sockfd); + sockfd = -1; + } + } + + freeaddrinfo(res); + free(service); + + if (sockfd < 0) { + fprintf(stderr, "Couldn't connect to %s:%d\n", servername, port); + return NULL; + } + + gid_to_wire_gid(&my_dest->gid, gid); + sprintf(msg, "%04x:%06x:%06x:%s", my_dest->lid, my_dest->qpn, + my_dest->psn, gid); + if (write(sockfd, msg, sizeof msg) != sizeof msg) { + fprintf(stderr, "Couldn't send local address\n"); + goto out; + } + + if (read(sockfd, msg, sizeof msg) != sizeof msg || + write(sockfd, "done", sizeof "done") != sizeof "done") { + perror("client read/write"); + fprintf(stderr, "Couldn't read/write remote address\n"); + goto out; + } + + rem_dest = malloc(sizeof *rem_dest); + if (!rem_dest) + goto out; + + sscanf(msg, "%x:%x:%x:%s", &rem_dest->lid, &rem_dest->qpn, + &rem_dest->psn, gid); + wire_gid_to_gid(gid, &rem_dest->gid); + +out: + close(sockfd); + return rem_dest; +} + 
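+/*
+ * Server side of the out-of-band TCP exchange: accept a single
+ * connection, read the client's LID/QPN/PSN/GID, bring the local QP
+ * to RTS via pp_connect_ctx(), then write back our own address block.
+ */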
+static struct pingpong_dest *pp_server_exch_dest(struct pingpong_context *ctx, + int ib_port, int port, int sl, + const struct pingpong_dest *my_dest, + int sgid_idx) +{ + struct addrinfo *res, *t; + struct addrinfo hints = { + .ai_flags = AI_PASSIVE, + .ai_family = AF_UNSPEC, + .ai_socktype = SOCK_STREAM + }; + char *service; + char msg[sizeof "0000:000000:000000:00000000000000000000000000000000"]; + int n; + int sockfd = -1, connfd; + struct pingpong_dest *rem_dest = NULL; + char gid[33]; + + if (asprintf(&service, "%d", port) < 0) + return NULL; + + n = getaddrinfo(NULL, service, &hints, &res); + + if (n < 0) { + fprintf(stderr, "%s for port %d\n", gai_strerror(n), port); + free(service); + return NULL; + } + + for (t = res; t; t = t->ai_next) { + sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); + if (sockfd >= 0) { + n = 1; + + setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof n); + + if (!bind(sockfd, t->ai_addr, t->ai_addrlen)) + break; + close(sockfd); + sockfd = -1; + } + } + + freeaddrinfo(res); + free(service); + + if (sockfd < 0) { + fprintf(stderr, "Couldn't listen to port %d\n", port); + return NULL; + } + + listen(sockfd, 1); + connfd = accept(sockfd, NULL, NULL); + close(sockfd); + if (connfd < 0) { + fprintf(stderr, "accept() failed\n"); + return NULL; + } + + n = read(connfd, msg, sizeof msg); + if (n != sizeof msg) { + perror("server read"); + fprintf(stderr, "%d/%d: Couldn't read remote address\n", n, (int) sizeof msg); + goto out; + } + + rem_dest = malloc(sizeof *rem_dest); + if (!rem_dest) + goto out; + + sscanf(msg, "%x:%x:%x:%s", &rem_dest->lid, &rem_dest->qpn, + &rem_dest->psn, gid); + wire_gid_to_gid(gid, &rem_dest->gid); + + if (pp_connect_ctx(ctx, ib_port, my_dest->psn, sl, rem_dest, + sgid_idx)) { + fprintf(stderr, "Couldn't connect to remote QP\n"); + free(rem_dest); + rem_dest = NULL; + goto out; + } + + gid_to_wire_gid(&my_dest->gid, gid); + sprintf(msg, "%04x:%06x:%06x:%s", my_dest->lid, my_dest->qpn, + my_dest->psn, gid); + if (write(connfd, msg, sizeof msg) != sizeof msg || + read(connfd, msg, sizeof msg) != sizeof "done") { + fprintf(stderr, "Couldn't send/recv local address\n"); + free(rem_dest); + rem_dest = NULL; + goto out; + } +out: + close(connfd); + return rem_dest; +} + +static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, + int rx_depth, int port, + int use_event) +{ + struct pingpong_context *ctx; + + ctx = malloc(sizeof *ctx); + if (!ctx) + return NULL; + + ctx->size = size; + ctx->send_flags = IBV_SEND_SIGNALED; + ctx->rx_depth = rx_depth; + + ctx->buf = memalign(page_size, size + 40); + if (!ctx->buf) { + fprintf(stderr, "Couldn't allocate work buf.\n"); + goto clean_ctx; + } + + /* FIXME memset(ctx->buf, 0, size + 40); */ + memset(ctx->buf, 0x7b, size + 40); + + ctx->context = ibv_open_device(ib_dev); + if (!ctx->context) { + fprintf(stderr, "Couldn't get context for %s\n", + ibv_get_device_name(ib_dev)); + goto clean_buffer; + } + + { + struct ibv_port_attr port_info = {}; + int mtu; + + if (ibv_query_port(ctx->context, port, &port_info)) { + fprintf(stderr, "Unable to query port info for port %d\n", port); + goto clean_device; + } + mtu = 1 << (port_info.active_mtu + 7); + if (size > mtu) { + fprintf(stderr, "Requested size larger than port MTU (%d)\n", mtu); + goto clean_device; + } + } + + if (use_event) { + ctx->channel = ibv_create_comp_channel(ctx->context); + if (!ctx->channel) { + fprintf(stderr, "Couldn't create completion channel\n"); + goto clean_device; + } + } else + 
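+		/* polling mode: completions are reaped with ibv_poll_cq(),
+		 * so no completion channel is needed */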
ctx->channel = NULL; + + ctx->pd = ibv_alloc_pd(ctx->context); + if (!ctx->pd) { + fprintf(stderr, "Couldn't allocate PD\n"); + goto clean_comp_channel; + } + + ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, size + 40, IBV_ACCESS_LOCAL_WRITE); + if (!ctx->mr) { + fprintf(stderr, "Couldn't register MR\n"); + goto clean_pd; + } + + ctx->cq = ibv_create_cq(ctx->context, rx_depth + 1, NULL, + ctx->channel, 0); + if (!ctx->cq) { + fprintf(stderr, "Couldn't create CQ\n"); + goto clean_mr; + } + + { + struct ibv_qp_attr attr; + struct ibv_qp_init_attr init_attr = { + .send_cq = ctx->cq, + .recv_cq = ctx->cq, + .cap = { + .max_send_wr = 1, + .max_recv_wr = rx_depth, + .max_send_sge = 1, + .max_recv_sge = 1 + }, + .qp_type = IBV_QPT_UD, + }; + + ctx->qp = ibv_create_qp(ctx->pd, &init_attr); + if (!ctx->qp) { + fprintf(stderr, "Couldn't create QP\n"); + goto clean_cq; + } + + ibv_query_qp(ctx->qp, &attr, IBV_QP_CAP, &init_attr); + if (init_attr.cap.max_inline_data >= size) { + ctx->send_flags |= IBV_SEND_INLINE; + } + } + + { + struct ibv_qp_attr attr = { + .qp_state = IBV_QPS_INIT, + .pkey_index = 0, + .port_num = port, + .qkey = 0x11111111 + }; + + if (ibv_modify_qp(ctx->qp, &attr, + IBV_QP_STATE | + IBV_QP_PKEY_INDEX | + IBV_QP_PORT | + IBV_QP_QKEY)) { + fprintf(stderr, "Failed to modify QP to INIT\n"); + goto clean_qp; + } + } + + return ctx; + +clean_qp: + ibv_destroy_qp(ctx->qp); + +clean_cq: + ibv_destroy_cq(ctx->cq); + +clean_mr: + ibv_dereg_mr(ctx->mr); + +clean_pd: + ibv_dealloc_pd(ctx->pd); + +clean_comp_channel: + if (ctx->channel) + ibv_destroy_comp_channel(ctx->channel); + +clean_device: + ibv_close_device(ctx->context); + +clean_buffer: + free(ctx->buf); + +clean_ctx: + free(ctx); + + return NULL; +} + +static int pp_close_ctx(struct pingpong_context *ctx) +{ + if (ibv_destroy_qp(ctx->qp)) { + fprintf(stderr, "Couldn't destroy QP\n"); + return 1; + } + + if (ibv_destroy_cq(ctx->cq)) { + fprintf(stderr, "Couldn't destroy CQ\n"); + return 1; + } + + if (ibv_dereg_mr(ctx->mr)) { + fprintf(stderr, "Couldn't deregister MR\n"); + return 1; + } + + if (ibv_destroy_ah(ctx->ah)) { + fprintf(stderr, "Couldn't destroy AH\n"); + return 1; + } + + if (ibv_dealloc_pd(ctx->pd)) { + fprintf(stderr, "Couldn't deallocate PD\n"); + return 1; + } + + if (ctx->channel) { + if (ibv_destroy_comp_channel(ctx->channel)) { + fprintf(stderr, "Couldn't destroy completion channel\n"); + return 1; + } + } + + if (ibv_close_device(ctx->context)) { + fprintf(stderr, "Couldn't release context\n"); + return 1; + } + + free(ctx->buf); + free(ctx); + + return 0; +} + +static int pp_post_recv(struct pingpong_context *ctx, int n) +{ + struct ibv_sge list = { + .addr = (uintptr_t) ctx->buf, + .length = ctx->size + 40, + .lkey = ctx->mr->lkey + }; + struct ibv_recv_wr wr = { + .wr_id = PINGPONG_RECV_WRID, + .sg_list = &list, + .num_sge = 1, + }; + struct ibv_recv_wr *bad_wr; + int i; + + for (i = 0; i < n; ++i) + if (ibv_post_recv(ctx->qp, &wr, &bad_wr)) + break; + + return i; +} + +static int pp_post_send(struct pingpong_context *ctx, uint32_t qpn) +{ + struct ibv_sge list = { + .addr = (uintptr_t) ctx->buf + 40, + .length = ctx->size, + .lkey = ctx->mr->lkey + }; + struct ibv_send_wr wr = { + .wr_id = PINGPONG_SEND_WRID, + .sg_list = &list, + .num_sge = 1, + .opcode = IBV_WR_SEND, + .send_flags = ctx->send_flags, + .wr = { + .ud = { + .ah = ctx->ah, + .remote_qpn = qpn, + .remote_qkey = 0x11111111 + } + } + }; + struct ibv_send_wr *bad_wr; + + return ibv_post_send(ctx->qp, &wr, &bad_wr); +} + +static void usage(const char 
*argv0)
+{
+	printf("Usage:\n");
+	printf("  %s            start a server and wait for connection\n", argv0);
+	printf("  %s <host>     connect to server at <host>\n", argv0);
+	printf("\n");
+	printf("Options:\n");
+	printf("  -p, --port=<port>      listen on/connect to port <port> (default 18515)\n");
+	printf("  -d, --ib-dev=<dev>     use IB device <dev> (default first device found)\n");
+	printf("  -i, --ib-port=<port>   use port <port> of IB device (default 1)\n");
+	printf("  -s, --size=<size>      size of message to exchange (default 2048)\n");
+	printf("  -r, --rx-depth=<dep>   number of receives to post at a time (default 500)\n");
+	printf("  -n, --iters=<iters>    number of exchanges (default 1000)\n");
+	printf("  -l, --sl=<SL>          send messages with service level <SL> (default 0)\n");
+	printf("  -e, --events           sleep on CQ events (default poll)\n");
+	printf("  -g, --gid-idx=<gid index> local port gid index\n");
+	printf("  -c, --chk              validate received buffer\n");
+}
+
+int main(int argc, char *argv[])
+{
+	struct ibv_device **dev_list;
+	struct ibv_device *ib_dev;
+	struct pingpong_context *ctx;
+	struct pingpong_dest my_dest;
+	struct pingpong_dest *rem_dest;
+	struct timeval start, end;
+	char *ib_devname = NULL;
+	char *servername = NULL;
+	unsigned int port = 18515;
+	int ib_port = 1;
+	unsigned int size = 2048;	/* keep in sync with the usage text above */
+	unsigned int rx_depth = 500;
+	unsigned int iters = 1000;
+	int use_event = 0;
+	int routs;
+	int rcnt, scnt;
+	int num_cq_events = 0;
+	int sl = 0;
+	int gidx = -1;
+	char gid[33];
+
+	srand48(getpid() * time(NULL));
+
+	while (1) {
+		int c;
+
+		static struct option long_options[] = {
+			{ .name = "port", .has_arg = 1, .val = 'p' },
+			{ .name = "ib-dev", .has_arg = 1, .val = 'd' },
+			{ .name = "ib-port", .has_arg = 1, .val = 'i' },
+			{ .name = "size", .has_arg = 1, .val = 's' },
+			{ .name = "rx-depth", .has_arg = 1, .val = 'r' },
+			{ .name = "iters", .has_arg = 1, .val = 'n' },
+			{ .name = "sl", .has_arg = 1, .val = 'l' },
+			{ .name = "events", .has_arg = 0, .val = 'e' },
+			{ .name = "gid-idx", .has_arg = 1, .val = 'g' },
+			{ .name = "chk", .has_arg = 0, .val = 'c' },
+			{}
+		};
+
+		c = getopt_long(argc, argv, "p:d:i:s:r:n:l:eg:c", long_options,
+				NULL);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'p':
+			port = strtol(optarg, NULL, 0);
+			if (port > 65535) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+
+		case 'd':
+			ib_devname = strdupa(optarg);
+			break;
+
+		case 'i':
+			ib_port = strtol(optarg, NULL, 0);
+			if (ib_port < 1) {
+				usage(argv[0]);
+				return 1;
+			}
+			break;
+
+		case 's':
+			size = strtoul(optarg, NULL, 0);
+			break;
+
+		case 'r':
+			rx_depth = strtoul(optarg, NULL, 0);
+			break;
+
+		case 'n':
+			iters = strtoul(optarg, NULL, 0);
+			break;
+
+		case 'l':
+			sl = strtol(optarg, NULL, 0);
+			break;
+
+		case 'e':
+			++use_event;
+			break;
+
+		case 'g':
+			gidx = strtol(optarg, NULL, 0);
+			break;
+
+		case 'c':
+			validate_buf = 1;
+			break;
+
+		default:
+			usage(argv[0]);
+			return 1;
+		}
+	}
+
+	if (optind == argc - 1)
+		servername = strdupa(argv[optind]);
+	else if (optind < argc) {
+		usage(argv[0]);
+		return 1;
+	}
+
+	page_size = sysconf(_SC_PAGESIZE);
+
+	dev_list = ibv_get_device_list(NULL);
+	if (!dev_list) {
+		perror("Failed to get IB devices list");
+		return 1;
+	}
+
+	if (!ib_devname) {
+		ib_dev = *dev_list;
+		if (!ib_dev) {
+			fprintf(stderr, "No IB devices found\n");
+			return 1;
+		}
+	} else {
+		int i;
+		for (i = 0; dev_list[i]; ++i)
+			if (!strcmp(ibv_get_device_name(dev_list[i]), ib_devname))
+				break;
+		ib_dev = dev_list[i];
+		if (!ib_dev) {
+			fprintf(stderr, "IB device %s not found\n", ib_devname);
+			return 1;
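+			/*
+			 * Note that dev_list must outlive this lookup: ib_dev
+			 * points into it and is only consumed by
+			 * ibv_open_device() inside pp_init_ctx().  The usual
+			 * lifetime pattern (sketch) is:
+			 *
+			 *	dev_list = ibv_get_device_list(NULL);
+			 *	context = ibv_open_device(dev_list[i]);
+			 *	ibv_free_device_list(dev_list);
+			 *
+			 * Contexts opened from the list stay valid after the
+			 * list itself is freed, which is why this program can
+			 * defer ibv_free_device_list() to the end of main().
+			 */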
} + } + + ctx = pp_init_ctx(ib_dev, size, rx_depth, ib_port, use_event); + if (!ctx) + return 1; + + routs = pp_post_recv(ctx, ctx->rx_depth); + if (routs < ctx->rx_depth) { + fprintf(stderr, "Couldn't post receive (%d)\n", routs); + return 1; + } + + if (use_event) + if (ibv_req_notify_cq(ctx->cq, 0)) { + fprintf(stderr, "Couldn't request CQ notification\n"); + return 1; + } + + if (pp_get_port_info(ctx->context, ib_port, &ctx->portinfo)) { + fprintf(stderr, "Couldn't get port info\n"); + return 1; + } + my_dest.lid = ctx->portinfo.lid; + + my_dest.qpn = ctx->qp->qp_num; + my_dest.psn = lrand48() & 0xffffff; + + if (gidx >= 0) { + if (ibv_query_gid(ctx->context, ib_port, gidx, &my_dest.gid)) { + fprintf(stderr, "Could not get local gid for gid index " + "%d\n", gidx); + return 1; + } + } else + memset(&my_dest.gid, 0, sizeof my_dest.gid); + + inet_ntop(AF_INET6, &my_dest.gid, gid, sizeof gid); + printf(" local address: LID 0x%04x, QPN 0x%06x, PSN 0x%06x: GID %s\n", + my_dest.lid, my_dest.qpn, my_dest.psn, gid); + + if (servername) + rem_dest = pp_client_exch_dest(servername, port, &my_dest); + else + rem_dest = pp_server_exch_dest(ctx, ib_port, port, sl, + &my_dest, gidx); + + if (!rem_dest) + return 1; + + inet_ntop(AF_INET6, &rem_dest->gid, gid, sizeof gid); + printf(" remote address: LID 0x%04x, QPN 0x%06x, PSN 0x%06x, GID %s\n", + rem_dest->lid, rem_dest->qpn, rem_dest->psn, gid); + + if (servername) + if (pp_connect_ctx(ctx, ib_port, my_dest.psn, sl, rem_dest, + gidx)) + return 1; + + ctx->pending = PINGPONG_RECV_WRID; + + if (servername) { + if (validate_buf) + for (int i = 0; i < size; i += page_size) + ctx->buf[i + 40] = i / page_size % sizeof(char); + + if (pp_post_send(ctx, rem_dest->qpn)) { + fprintf(stderr, "Couldn't post send\n"); + return 1; + } + ctx->pending |= PINGPONG_SEND_WRID; + } + + if (gettimeofday(&start, NULL)) { + perror("gettimeofday"); + return 1; + } + + rcnt = scnt = 0; + while (rcnt < iters || scnt < iters) { + if (use_event) { + struct ibv_cq *ev_cq; + void *ev_ctx; + + if (ibv_get_cq_event(ctx->channel, &ev_cq, &ev_ctx)) { + fprintf(stderr, "Failed to get cq_event\n"); + return 1; + } + + ++num_cq_events; + + if (ev_cq != ctx->cq) { + fprintf(stderr, "CQ event for unknown CQ %p\n", ev_cq); + return 1; + } + + if (ibv_req_notify_cq(ctx->cq, 0)) { + fprintf(stderr, "Couldn't request CQ notification\n"); + return 1; + } + } + + { + struct ibv_wc wc[2]; + int ne, i; + + do { + ne = ibv_poll_cq(ctx->cq, 2, wc); + if (ne < 0) { + fprintf(stderr, "poll CQ failed %d\n", ne); + return 1; + } + } while (!use_event && ne < 1); + + for (i = 0; i < ne; ++i) { + if (wc[i].status != IBV_WC_SUCCESS) { + fprintf(stderr, "Failed status %s (%d) for wr_id %d\n", + ibv_wc_status_str(wc[i].status), + wc[i].status, (int) wc[i].wr_id); + return 1; + } + + switch ((int) wc[i].wr_id) { + case PINGPONG_SEND_WRID: + ++scnt; + break; + + case PINGPONG_RECV_WRID: + if (--routs <= 1) { + routs += pp_post_recv(ctx, ctx->rx_depth - routs); + if (routs < ctx->rx_depth) { + fprintf(stderr, + "Couldn't post receive (%d)\n", + routs); + return 1; + } + } + + ++rcnt; + break; + + default: + fprintf(stderr, "Completion for unknown wr_id %d\n", + (int) wc[i].wr_id); + return 1; + } + + ctx->pending &= ~(int) wc[i].wr_id; + if (scnt < iters && !ctx->pending) { + if (pp_post_send(ctx, rem_dest->qpn)) { + fprintf(stderr, "Couldn't post send\n"); + return 1; + } + ctx->pending = PINGPONG_RECV_WRID | + PINGPONG_SEND_WRID; + } + } + } + } + + if (gettimeofday(&end, NULL)) { + perror("gettimeofday"); + 
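+		/*
+		 * Note on the loop above: completions are demultiplexed by
+		 * wr_id.  PINGPONG_SEND_WRID advances scnt, while
+		 * PINGPONG_RECV_WRID advances rcnt and refills the receive
+		 * queue before it can run dry -- a UD message that arrives
+		 * with no receive posted is silently dropped, so the credit
+		 * count routs is topped back up to rx_depth as soon as it
+		 * falls to one:
+		 *
+		 *	if (--routs <= 1)
+		 *		routs += pp_post_recv(ctx, ctx->rx_depth - routs);
+		 */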
return 1; + } + + { + float usec = (end.tv_sec - start.tv_sec) * 1000000 + + (end.tv_usec - start.tv_usec); + long long bytes = (long long) size * iters * 2; + + printf("%lld bytes in %.2f seconds = %.2f Mbit/sec\n", + bytes, usec / 1000000., bytes * 8. / usec); + printf("%d iters in %.2f seconds = %.2f usec/iter\n", + iters, usec / 1000000., usec / iters); + + if ((!servername) && (validate_buf)) { + for (int i = 0; i < size; i += page_size) + if (ctx->buf[i + 40] != + i / page_size % sizeof(char)) + printf("invalid data in page %d\n", + i / page_size); + } + } + + ibv_ack_cq_events(ctx->cq, num_cq_events); + + if (pp_close_ctx(ctx)) + return 1; + + ibv_free_device_list(dev_list); + free(rem_dest); + + return 0; +} diff --git a/libibverbs/examples/xsrq_pingpong.c b/libibverbs/examples/xsrq_pingpong.c new file mode 100644 index 0000000..cc86d4d --- /dev/null +++ b/libibverbs/examples/xsrq_pingpong.c @@ -0,0 +1,1048 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2011 Intel Corporation, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#define _GNU_SOURCE +#include <config.h> + +#include <stdio.h> +#include <fcntl.h> +#include <errno.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <netdb.h> +#include <malloc.h> +#include <getopt.h> +#include <arpa/inet.h> +#include <time.h> + +#include "pingpong.h" + +#define MSG_FORMAT "%04x:%06x:%06x:%06x:%06x:%32s" +#define MSG_SIZE 66 +#define MSG_SSCAN "%x:%x:%x:%x:%x:%s" +#define ADDR_FORMAT \ + "%8s: LID %04x, QPN RECV %06x SEND %06x, PSN %06x, SRQN %06x, GID %s\n" +#define TERMINATION_FORMAT "%s" +#define TERMINATION_MSG_SIZE 4 +#define TERMINATION_MSG "END" +static int page_size; +static int use_odp; + +struct pingpong_dest { + union ibv_gid gid; + int lid; + int recv_qpn; + int send_qpn; + int recv_psn; + int send_psn; + int srqn; + int pp_cnt; + int sockfd; +}; + +struct pingpong_context { + struct ibv_context *context; + struct ibv_comp_channel *channel; + struct ibv_pd *pd; + struct ibv_mr *mr; + struct ibv_cq *send_cq; + struct ibv_cq *recv_cq; + struct ibv_srq *srq; + struct ibv_xrcd *xrcd; + struct ibv_qp **recv_qp; + struct ibv_qp **send_qp; + struct pingpong_dest *rem_dest; + void *buf; + int lid; + int sl; + enum ibv_mtu mtu; + int ib_port; + int fd; + int size; + int num_clients; + int num_tests; + int use_event; + int gidx; +}; + +static struct pingpong_context ctx; + + +static int open_device(char *ib_devname) +{ + struct ibv_device **dev_list; + int i = 0; + + dev_list = ibv_get_device_list(NULL); + if (!dev_list) { + fprintf(stderr, "Failed to get IB devices list"); + return -1; + } + + if (ib_devname) { + for (; dev_list[i]; ++i) { + if (!strcmp(ibv_get_device_name(dev_list[i]), ib_devname)) + break; + } + } + if (!dev_list[i]) { + fprintf(stderr, "IB device %s not found\n", + ib_devname ? 
ib_devname : ""); + return -1; + } + + ctx.context = ibv_open_device(dev_list[i]); + if (!ctx.context) { + fprintf(stderr, "Couldn't get context for %s\n", + ibv_get_device_name(dev_list[i])); + return -1; + } + + ibv_free_device_list(dev_list); + return 0; +} + +static int create_qps(void) +{ + struct ibv_qp_init_attr_ex init; + struct ibv_qp_attr mod; + int i; + + for (i = 0; i < ctx.num_clients; ++i) { + + memset(&init, 0, sizeof init); + init.qp_type = IBV_QPT_XRC_RECV; + init.comp_mask = IBV_QP_INIT_ATTR_XRCD; + init.xrcd = ctx.xrcd; + + ctx.recv_qp[i] = ibv_create_qp_ex(ctx.context, &init); + if (!ctx.recv_qp[i]) { + fprintf(stderr, "Couldn't create recv QP[%d] errno %d\n", + i, errno); + return 1; + } + + mod.qp_state = IBV_QPS_INIT; + mod.pkey_index = 0; + mod.port_num = ctx.ib_port; + mod.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ; + + if (ibv_modify_qp(ctx.recv_qp[i], &mod, + IBV_QP_STATE | IBV_QP_PKEY_INDEX | + IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { + fprintf(stderr, "Failed to modify recv QP[%d] to INIT\n", i); + return 1; + } + + memset(&init, 0, sizeof init); + init.qp_type = IBV_QPT_XRC_SEND; + init.send_cq = ctx.send_cq; + init.cap.max_send_wr = ctx.num_clients * ctx.num_tests; + init.cap.max_send_sge = 1; + init.comp_mask = IBV_QP_INIT_ATTR_PD; + init.pd = ctx.pd; + + ctx.send_qp[i] = ibv_create_qp_ex(ctx.context, &init); + if (!ctx.send_qp[i]) { + fprintf(stderr, "Couldn't create send QP[%d] errno %d\n", + i, errno); + return 1; + } + + mod.qp_state = IBV_QPS_INIT; + mod.pkey_index = 0; + mod.port_num = ctx.ib_port; + mod.qp_access_flags = 0; + + if (ibv_modify_qp(ctx.send_qp[i], &mod, + IBV_QP_STATE | IBV_QP_PKEY_INDEX | + IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { + fprintf(stderr, "Failed to modify send QP[%d] to INIT\n", i); + return 1; + } + } + + return 0; +} + +static int pp_init_ctx(char *ib_devname) +{ + struct ibv_srq_init_attr_ex attr; + struct ibv_xrcd_init_attr xrcd_attr; + struct ibv_port_attr port_attr; + int access_flags = IBV_ACCESS_LOCAL_WRITE; + + ctx.recv_qp = calloc(ctx.num_clients, sizeof *ctx.recv_qp); + ctx.send_qp = calloc(ctx.num_clients, sizeof *ctx.send_qp); + ctx.rem_dest = calloc(ctx.num_clients, sizeof *ctx.rem_dest); + if (!ctx.recv_qp || !ctx.send_qp || !ctx.rem_dest) + return 1; + + if (open_device(ib_devname)) { + fprintf(stderr, "Failed to open device\n"); + return 1; + } + + if (use_odp) { + struct ibv_device_attr_ex attrx; + const uint32_t xrc_caps_mask = IBV_ODP_SUPPORT_SEND | + IBV_ODP_SUPPORT_SRQ_RECV; + + if (ibv_query_device_ex(ctx.context, NULL, &attrx)) { + fprintf(stderr, "Couldn't query device for its features\n"); + return 1; + } + if (!(attrx.odp_caps.general_caps & IBV_ODP_SUPPORT) || + (attrx.xrc_odp_caps & xrc_caps_mask) != xrc_caps_mask) { + fprintf(stderr, "The device isn't ODP capable or does not support XRC send, receive and srq with ODP\n"); + return 1; + } + access_flags |= IBV_ACCESS_ON_DEMAND; + } + + if (pp_get_port_info(ctx.context, ctx.ib_port, &port_attr)) { + fprintf(stderr, "Failed to get port info\n"); + return 1; + } + + ctx.lid = port_attr.lid; + if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET && !ctx.lid) { + fprintf(stderr, "Couldn't get local LID\n"); + return 1; + } + + ctx.buf = memalign(page_size, ctx.size); + if (!ctx.buf) { + fprintf(stderr, "Couldn't allocate work buf.\n"); + return 1; + } + + memset(ctx.buf, 0, ctx.size); + + if (ctx.use_event) { + ctx.channel = ibv_create_comp_channel(ctx.context); + if (!ctx.channel) { + fprintf(stderr, "Couldn't create completion 
channel\n"); + return 1; + } + } + + ctx.pd = ibv_alloc_pd(ctx.context); + if (!ctx.pd) { + fprintf(stderr, "Couldn't allocate PD\n"); + return 1; + } + + ctx.mr = ibv_reg_mr(ctx.pd, ctx.buf, ctx.size, access_flags); + if (!ctx.mr) { + fprintf(stderr, "Couldn't register MR\n"); + return 1; + } + + ctx.fd = open("/tmp/xrc_domain", O_RDONLY | O_CREAT, S_IRUSR | S_IRGRP); + if (ctx.fd < 0) { + fprintf(stderr, + "Couldn't create the file for the XRC Domain " + "but not stopping %d\n", errno); + ctx.fd = -1; + } + + memset(&xrcd_attr, 0, sizeof xrcd_attr); + xrcd_attr.comp_mask = IBV_XRCD_INIT_ATTR_FD | IBV_XRCD_INIT_ATTR_OFLAGS; + xrcd_attr.fd = ctx.fd; + xrcd_attr.oflags = O_CREAT; + ctx.xrcd = ibv_open_xrcd(ctx.context, &xrcd_attr); + if (!ctx.xrcd) { + fprintf(stderr, "Couldn't Open the XRC Domain %d\n", errno); + return 1; + } + + ctx.recv_cq = ibv_create_cq(ctx.context, ctx.num_clients, &ctx.recv_cq, + ctx.channel, 0); + if (!ctx.recv_cq) { + fprintf(stderr, "Couldn't create recv CQ\n"); + return 1; + } + + if (ctx.use_event) { + if (ibv_req_notify_cq(ctx.recv_cq, 0)) { + fprintf(stderr, "Couldn't request CQ notification\n"); + return 1; + } + } + + ctx.send_cq = ibv_create_cq(ctx.context, ctx.num_clients, NULL, NULL, 0); + if (!ctx.send_cq) { + fprintf(stderr, "Couldn't create send CQ\n"); + return 1; + } + + memset(&attr, 0, sizeof attr); + attr.attr.max_wr = ctx.num_clients; + attr.attr.max_sge = 1; + attr.comp_mask = IBV_SRQ_INIT_ATTR_TYPE | IBV_SRQ_INIT_ATTR_XRCD | + IBV_SRQ_INIT_ATTR_CQ | IBV_SRQ_INIT_ATTR_PD; + attr.srq_type = IBV_SRQT_XRC; + attr.xrcd = ctx.xrcd; + attr.cq = ctx.recv_cq; + attr.pd = ctx.pd; + + ctx.srq = ibv_create_srq_ex(ctx.context, &attr); + if (!ctx.srq) { + fprintf(stderr, "Couldn't create SRQ\n"); + return 1; + } + + if (create_qps()) + return 1; + + return 0; +} + +static int recv_termination_ack(int index) +{ + char msg[TERMINATION_MSG_SIZE]; + int n = 0, r; + int sockfd = ctx.rem_dest[index].sockfd; + + while (n < TERMINATION_MSG_SIZE) { + r = read(sockfd, msg + n, TERMINATION_MSG_SIZE - n); + if (r < 0) { + perror("client read"); + fprintf(stderr, + "%d/%d: Couldn't read remote termination ack\n", + n, TERMINATION_MSG_SIZE); + return 1; + } + n += r; + } + + if (strcmp(msg, TERMINATION_MSG)) { + fprintf(stderr, "Invalid termination ack was accepted\n"); + return 1; + } + + return 0; +} + +static int send_termination_ack(int index) +{ + char msg[TERMINATION_MSG_SIZE]; + int sockfd = ctx.rem_dest[index].sockfd; + + sprintf(msg, TERMINATION_FORMAT, TERMINATION_MSG); + + if (write(sockfd, msg, TERMINATION_MSG_SIZE) != TERMINATION_MSG_SIZE) { + fprintf(stderr, "Couldn't send termination ack\n"); + return 1; + } + + return 0; +} + +static int pp_client_termination(void) +{ + if (send_termination_ack(0)) + return 1; + if (recv_termination_ack(0)) + return 1; + + return 0; +} + +static int pp_server_termination(void) +{ + int i; + + for (i = 0; i < ctx.num_clients; i++) { + if (recv_termination_ack(i)) + return 1; + } + + for (i = 0; i < ctx.num_clients; i++) { + if (send_termination_ack(i)) + return 1; + } + + return 0; +} + +static int send_local_dest(int sockfd, int index) +{ + char msg[MSG_SIZE]; + char gid[33]; + uint32_t srq_num; + union ibv_gid local_gid; + + if (ctx.gidx >= 0) { + if (ibv_query_gid(ctx.context, ctx.ib_port, ctx.gidx, + &local_gid)) { + fprintf(stderr, "can't read sgid of index %d\n", + ctx.gidx); + return -1; + } + } else { + memset(&local_gid, 0, sizeof(local_gid)); + } + + ctx.rem_dest[index].recv_psn = lrand48() & 0xffffff; + if 
(ibv_get_srq_num(ctx.srq, &srq_num)) { + fprintf(stderr, "Couldn't get SRQ num\n"); + return -1; + } + + inet_ntop(AF_INET6, &local_gid, gid, sizeof(gid)); + printf(ADDR_FORMAT, "local", ctx.lid, ctx.recv_qp[index]->qp_num, + ctx.send_qp[index]->qp_num, ctx.rem_dest[index].recv_psn, + srq_num, gid); + + gid_to_wire_gid(&local_gid, gid); + sprintf(msg, MSG_FORMAT, ctx.lid, ctx.recv_qp[index]->qp_num, + ctx.send_qp[index]->qp_num, ctx.rem_dest[index].recv_psn, + srq_num, gid); + + if (write(sockfd, msg, MSG_SIZE) != MSG_SIZE) { + fprintf(stderr, "Couldn't send local address\n"); + return -1; + } + + return 0; +} + +static int recv_remote_dest(int sockfd, int index) +{ + struct pingpong_dest *rem_dest; + char msg[MSG_SIZE]; + char gid[33]; + int n = 0, r; + + while (n < MSG_SIZE) { + r = read(sockfd, msg + n, MSG_SIZE - n); + if (r < 0) { + perror("client read"); + fprintf(stderr, + "%d/%d: Couldn't read remote address [%d]\n", + n, MSG_SIZE, index); + return -1; + } + n += r; + } + + rem_dest = &ctx.rem_dest[index]; + sscanf(msg, MSG_SSCAN, &rem_dest->lid, &rem_dest->recv_qpn, + &rem_dest->send_qpn, &rem_dest->send_psn, &rem_dest->srqn, gid); + + wire_gid_to_gid(gid, &rem_dest->gid); + inet_ntop(AF_INET6, &rem_dest->gid, gid, sizeof(gid)); + printf(ADDR_FORMAT, "remote", rem_dest->lid, rem_dest->recv_qpn, + rem_dest->send_qpn, rem_dest->send_psn, rem_dest->srqn, + gid); + + rem_dest->sockfd = sockfd; + return 0; +} + +static void set_ah_attr(struct ibv_ah_attr *attr, struct pingpong_context *myctx, + int index) +{ + attr->is_global = 1; + attr->grh.hop_limit = 5; + attr->grh.dgid = myctx->rem_dest[index].gid; + attr->grh.sgid_index = myctx->gidx; +} + +static int connect_qps(int index) +{ + struct ibv_qp_attr attr; + + memset(&attr, 0, sizeof attr); + attr.qp_state = IBV_QPS_RTR; + attr.dest_qp_num = ctx.rem_dest[index].send_qpn; + attr.path_mtu = ctx.mtu; + attr.rq_psn = ctx.rem_dest[index].send_psn; + attr.min_rnr_timer = 12; + attr.ah_attr.dlid = ctx.rem_dest[index].lid; + attr.ah_attr.sl = ctx.sl; + attr.ah_attr.port_num = ctx.ib_port; + + if (ctx.rem_dest[index].gid.global.interface_id) + set_ah_attr(&attr.ah_attr, &ctx, index); + + if (ibv_modify_qp(ctx.recv_qp[index], &attr, + IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | + IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | + IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER)) { + fprintf(stderr, "Failed to modify recv QP[%d] to RTR\n", index); + return 1; + } + + memset(&attr, 0, sizeof attr); + attr.qp_state = IBV_QPS_RTS; + attr.timeout = 14; + attr.sq_psn = ctx.rem_dest[index].recv_psn; + + if (ibv_modify_qp(ctx.recv_qp[index], &attr, + IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_SQ_PSN)) { + fprintf(stderr, "Failed to modify recv QP[%d] to RTS\n", index); + return 1; + } + + memset(&attr, 0, sizeof attr); + attr.qp_state = IBV_QPS_RTR; + attr.dest_qp_num = ctx.rem_dest[index].recv_qpn; + attr.path_mtu = ctx.mtu; + attr.rq_psn = ctx.rem_dest[index].send_psn; + attr.ah_attr.dlid = ctx.rem_dest[index].lid; + attr.ah_attr.sl = ctx.sl; + attr.ah_attr.port_num = ctx.ib_port; + + if (ctx.rem_dest[index].gid.global.interface_id) + set_ah_attr(&attr.ah_attr, &ctx, index); + + if (ibv_modify_qp(ctx.send_qp[index], &attr, + IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | + IBV_QP_DEST_QPN | IBV_QP_RQ_PSN)) { + fprintf(stderr, "Failed to modify send QP[%d] to RTR\n", index); + return 1; + } + + memset(&attr, 0, sizeof attr); + attr.qp_state = IBV_QPS_RTS; + attr.timeout = 14; + attr.retry_cnt = 7; + attr.rnr_retry = 7; + attr.sq_psn = ctx.rem_dest[index].recv_psn; + + 
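+	/*
+	 * Fourth and last transition in this function.  Each XRC connection
+	 * walks the usual RESET -> INIT -> RTR -> RTS ladder twice, once for
+	 * the TGT (recv) QP and once for the INI (send) QP; only the
+	 * attribute masks differ.  Roughly (plus IBV_QP_STATE on every call):
+	 *
+	 *	TGT RTR: AV | PATH_MTU | DEST_QPN | RQ_PSN |
+	 *		 MAX_DEST_RD_ATOMIC | MIN_RNR_TIMER
+	 *	TGT RTS: TIMEOUT | SQ_PSN
+	 *	INI RTR: AV | PATH_MTU | DEST_QPN | RQ_PSN
+	 *	INI RTS: TIMEOUT | RETRY_CNT | RNR_RETRY | SQ_PSN |
+	 *		 MAX_QP_RD_ATOMIC
+	 */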
if (ibv_modify_qp(ctx.send_qp[index], &attr, + IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_SQ_PSN | + IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_MAX_QP_RD_ATOMIC)) { + fprintf(stderr, "Failed to modify send QP[%d] to RTS\n", index); + return 1; + } + + return 0; +} + +static int pp_client_connect(const char *servername, int port) +{ + struct addrinfo *res, *t; + char *service; + int ret; + int sockfd = -1; + struct addrinfo hints = { + .ai_family = AF_UNSPEC, + .ai_socktype = SOCK_STREAM + }; + + if (asprintf(&service, "%d", port) < 0) + return 1; + + ret = getaddrinfo(servername, service, &hints, &res); + if (ret < 0) { + fprintf(stderr, "%s for %s:%d\n", gai_strerror(ret), servername, port); + free(service); + return 1; + } + + for (t = res; t; t = t->ai_next) { + sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); + if (sockfd >= 0) { + if (!connect(sockfd, t->ai_addr, t->ai_addrlen)) + break; + close(sockfd); + sockfd = -1; + } + } + + freeaddrinfo(res); + free(service); + + if (sockfd < 0) { + fprintf(stderr, "Couldn't connect to %s:%d\n", servername, port); + return 1; + } + + if (send_local_dest(sockfd, 0)) { + close(sockfd); + return 1; + } + + if (recv_remote_dest(sockfd, 0)) + return 1; + + if (connect_qps(0)) + return 1; + + return 0; +} + +static int pp_server_connect(int port) +{ + struct addrinfo *res, *t; + char *service; + int ret, i, n; + int sockfd = -1, connfd; + struct addrinfo hints = { + .ai_flags = AI_PASSIVE, + .ai_family = AF_UNSPEC, + .ai_socktype = SOCK_STREAM + }; + + if (asprintf(&service, "%d", port) < 0) + return 1; + + ret = getaddrinfo(NULL, service, &hints, &res); + if (ret < 0) { + fprintf(stderr, "%s for port %d\n", gai_strerror(ret), port); + free(service); + return 1; + } + + for (t = res; t; t = t->ai_next) { + sockfd = socket(t->ai_family, t->ai_socktype, t->ai_protocol); + if (sockfd >= 0) { + n = 1; + setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof n); + if (!bind(sockfd, t->ai_addr, t->ai_addrlen)) + break; + close(sockfd); + sockfd = -1; + } + } + + freeaddrinfo(res); + free(service); + + if (sockfd < 0) { + fprintf(stderr, "Couldn't listen to port %d\n", port); + return 1; + } + + listen(sockfd, ctx.num_clients); + + for (i = 0; i < ctx.num_clients; i++) { + connfd = accept(sockfd, NULL, NULL); + if (connfd < 0) { + fprintf(stderr, "accept() failed for client %d\n", i); + return 1; + } + + if (recv_remote_dest(connfd, i)) + return 1; + + if (send_local_dest(connfd, i)) + return 1; + + if (connect_qps(i)) + return 1; + } + + close(sockfd); + return 0; +} + + +static int pp_close_ctx(void) +{ + int i; + + for (i = 0; i < ctx.num_clients; ++i) { + + if (ibv_destroy_qp(ctx.send_qp[i])) { + fprintf(stderr, "Couldn't destroy INI QP[%d]\n", i); + return 1; + } + + if (ibv_destroy_qp(ctx.recv_qp[i])) { + fprintf(stderr, "Couldn't destroy TGT QP[%d]\n", i); + return 1; + } + + if (ctx.rem_dest[i].sockfd) + close(ctx.rem_dest[i].sockfd); + } + + if (ibv_destroy_srq(ctx.srq)) { + fprintf(stderr, "Couldn't destroy SRQ\n"); + return 1; + } + + if (ctx.xrcd && ibv_close_xrcd(ctx.xrcd)) { + fprintf(stderr, "Couldn't close the XRC Domain\n"); + return 1; + } + if (ctx.fd >= 0 && close(ctx.fd)) { + fprintf(stderr, "Couldn't close the file for the XRC Domain\n"); + return 1; + } + + if (ibv_destroy_cq(ctx.send_cq)) { + fprintf(stderr, "Couldn't destroy send CQ\n"); + return 1; + } + + if (ibv_destroy_cq(ctx.recv_cq)) { + fprintf(stderr, "Couldn't destroy recv CQ\n"); + return 1; + } + + if (ibv_dereg_mr(ctx.mr)) { + fprintf(stderr, "Couldn't 
deregister MR\n");
+		return 1;
+	}
+
+	if (ibv_dealloc_pd(ctx.pd)) {
+		fprintf(stderr, "Couldn't deallocate PD\n");
+		return 1;
+	}
+
+	if (ctx.channel) {
+		if (ibv_destroy_comp_channel(ctx.channel)) {
+			fprintf(stderr,
+				"Couldn't destroy completion channel\n");
+			return 1;
+		}
+	}
+
+	if (ibv_close_device(ctx.context)) {
+		fprintf(stderr, "Couldn't release context\n");
+		return 1;
+	}
+
+	free(ctx.buf);
+	free(ctx.rem_dest);
+	free(ctx.send_qp);
+	free(ctx.recv_qp);
+	return 0;
+}
+
+static int pp_post_recv(int cnt)
+{
+	struct ibv_sge sge;
+	struct ibv_recv_wr wr, *bad_wr;
+
+	sge.addr = (uintptr_t) ctx.buf;
+	sge.length = ctx.size;
+	sge.lkey = ctx.mr->lkey;
+
+	wr.next = NULL;
+	wr.wr_id = (uintptr_t) &ctx;
+	wr.sg_list = &sge;
+	wr.num_sge = 1;
+
+	while (cnt--) {
+		if (ibv_post_srq_recv(ctx.srq, &wr, &bad_wr)) {
+			fprintf(stderr, "Failed to post receive to SRQ\n");
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Send to each client round robin on each set of xrc send/recv qp.
+ * Generate a completion on the last send.
+ */
+static int pp_post_send(int index)
+{
+	struct ibv_sge sge;
+	struct ibv_send_wr wr, *bad_wr;
+	int qpi;
+
+	sge.addr = (uintptr_t) ctx.buf;
+	sge.length = ctx.size;
+	sge.lkey = ctx.mr->lkey;
+
+	wr.wr_id = (uintptr_t) index;
+	wr.next = NULL;
+	wr.sg_list = &sge;
+	wr.num_sge = 1;
+	wr.opcode = IBV_WR_SEND;
+	wr.qp_type.xrc.remote_srqn = ctx.rem_dest[index].srqn;
+
+	qpi = (index + ctx.rem_dest[index].pp_cnt) % ctx.num_clients;
+	wr.send_flags = (++ctx.rem_dest[index].pp_cnt >= ctx.num_tests) ?
+			IBV_SEND_SIGNALED : 0;
+
+	return ibv_post_send(ctx.send_qp[qpi], &wr, &bad_wr);
+}
+
+static int find_qp(int qpn)
+{
+	int i;
+
+	if (ctx.num_clients == 1)
+		return 0;
+
+	for (i = 0; i < ctx.num_clients; ++i)
+		if (ctx.recv_qp[i]->qp_num == qpn)
+			return i;
+
+	fprintf(stderr, "Unable to find qp %x\n", qpn);
+	return 0;
+}
+
+static int get_cq_event(void)
+{
+	struct ibv_cq *ev_cq;
+	void *ev_ctx;
+
+	if (ibv_get_cq_event(ctx.channel, &ev_cq, &ev_ctx)) {
+		fprintf(stderr, "Failed to get cq_event\n");
+		return 1;
+	}
+
+	if (ev_cq != ctx.recv_cq) {
+		fprintf(stderr, "CQ event for unknown CQ %p\n", ev_cq);
+		return 1;
+	}
+
+	if (ibv_req_notify_cq(ctx.recv_cq, 0)) {
+		fprintf(stderr, "Couldn't request CQ notification\n");
+		return 1;
+	}
+
+	return 0;
+}
+
+static void init(void)
+{
+	srand48(getpid() * time(NULL));
+
+	ctx.size = 4096;
+	ctx.ib_port = 1;
+	ctx.num_clients = 1;
+	ctx.num_tests = 5;
+	ctx.mtu = IBV_MTU_2048;	/* keep in sync with the usage text: default 2048 */
+	ctx.sl = 0;
+	ctx.gidx = -1;
+}
+
+static void usage(const char *argv0)
+{
+	printf("Usage:\n");
+	printf("  %s            start a server and wait for connection\n", argv0);
+	printf("  %s <host>     connect to server at <host>\n", argv0);
+	printf("\n");
+	printf("Options:\n");
+	printf("  -p, --port=<port>      listen on/connect to port <port> (default 18515)\n");
+	printf("  -d, --ib-dev=<dev>     use IB device <dev> (default first device found)\n");
+	printf("  -i, --ib-port=<port>   use port <port> of IB device (default 1)\n");
+	printf("  -s, --size=<size>      size of message to exchange (default 4096)\n");
+	printf("  -m, --mtu=<size>       path MTU (default 2048)\n");
+	printf("  -c, --clients=<n>      number of clients (on server only, default 1)\n");
+	printf("  -n, --num_tests=<n>    number of tests per client (default 5)\n");
+	printf("  -l, --sl=<sl>          service level value\n");
+	printf("  -e, --events           sleep on CQ events (default poll)\n");
+	printf("  -o, --odp              use on demand paging\n");
+	printf("  -g, --gid-idx=<gid index> local port gid index\n");
+}
+
+int main(int argc, char *argv[])
+{
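+	/*
+	 * Overall flow: parse options, pp_init_ctx() builds the verbs
+	 * resources (PD, MR, XRC domain, SRQ and one send/recv QP pair per
+	 * client), pp_client_connect()/pp_server_connect() exchange the
+	 * pingpong_dest details over a TCP socket, and the loop below then
+	 * bounces messages until num_clients * num_tests completions have
+	 * been reaped, followed by a termination ack on both sides.  A
+	 * typical invocation (hypothetical host and device names):
+	 *
+	 *	server:  xsrq_pingpong -d mlx5_0 -c 2
+	 *	client:  xsrq_pingpong -d mlx5_0 server1
+	 */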
char *ib_devname = NULL; + char *servername = NULL; + int port = 18515; + int i, total, cnt = 0; + int ne, qpi, num_cq_events = 0; + struct ibv_wc wc; + + init(); + while (1) { + int c; + + static struct option long_options[] = { + { .name = "port", .has_arg = 1, .val = 'p' }, + { .name = "ib-dev", .has_arg = 1, .val = 'd' }, + { .name = "ib-port", .has_arg = 1, .val = 'i' }, + { .name = "size", .has_arg = 1, .val = 's' }, + { .name = "mtu", .has_arg = 1, .val = 'm' }, + { .name = "clients", .has_arg = 1, .val = 'c' }, + { .name = "num_tests", .has_arg = 1, .val = 'n' }, + { .name = "sl", .has_arg = 1, .val = 'l' }, + { .name = "events", .has_arg = 0, .val = 'e' }, + { .name = "odp", .has_arg = 0, .val = 'o' }, + { .name = "gid-idx", .has_arg = 1, .val = 'g' }, + {} + }; + + c = getopt_long(argc, argv, "p:d:i:s:m:n:l:eog:c:", long_options, + NULL); + if (c == -1) + break; + + switch (c) { + case 'p': + port = strtol(optarg, NULL, 0); + if (port < 0 || port > 65535) { + usage(argv[0]); + return 1; + } + break; + case 'd': + ib_devname = strdupa(optarg); + break; + case 'i': + ctx.ib_port = strtol(optarg, NULL, 0); + if (ctx.ib_port < 0) { + usage(argv[0]); + return 1; + } + break; + case 's': + ctx.size = strtol(optarg, NULL, 0); + break; + case 'm': + ctx.mtu = pp_mtu_to_enum(strtol(optarg, NULL, 0)); + if (ctx.mtu == 0) { + usage(argv[0]); + return 1; + } + break; + case 'c': + ctx.num_clients = strtol(optarg, NULL, 0); + break; + case 'n': + ctx.num_tests = strtol(optarg, NULL, 0); + break; + case 'l': + ctx.sl = strtol(optarg, NULL, 0); + break; + case 'g': + ctx.gidx = strtol(optarg, NULL, 0); + break; + case 'e': + ctx.use_event = 1; + break; + case 'o': + use_odp = 1; + break; + default: + usage(argv[0]); + return 1; + } + } + + if (optind == argc - 1) { + servername = strdupa(argv[optind]); + ctx.num_clients = 1; + } else if (optind < argc) { + usage(argv[0]); + return 1; + } + + page_size = sysconf(_SC_PAGESIZE); + + if (pp_init_ctx(ib_devname)) + return 1; + + if (pp_post_recv(ctx.num_clients)) { + fprintf(stderr, "Couldn't post receives\n"); + return 1; + } + + if (servername) { + if (pp_client_connect(servername, port)) + return 1; + } else { + if (pp_server_connect(port)) + return 1; + + for (i = 0; i < ctx.num_clients; i++) + pp_post_send(i); + } + + total = ctx.num_clients * ctx.num_tests; + while (cnt < total) { + if (ctx.use_event) { + if (get_cq_event()) + return 1; + + ++num_cq_events; + } + + do { + ne = ibv_poll_cq(ctx.recv_cq, 1, &wc); + if (ne < 0) { + fprintf(stderr, "Error polling cq %d\n", ne); + return 1; + } else if (ne == 0) { + break; + } + + if (wc.status) { + fprintf(stderr, "Work completion error %d\n", wc.status); + return 1; + } + + pp_post_recv(ne); + qpi = find_qp(wc.qp_num); + if (ctx.rem_dest[qpi].pp_cnt < ctx.num_tests) + pp_post_send(qpi); + cnt += ne; + } while (ne > 0); + } + + for (cnt = 0; cnt < ctx.num_clients; cnt += ne) { + ne = ibv_poll_cq(ctx.send_cq, 1, &wc); + if (ne < 0) { + fprintf(stderr, "Error polling cq %d\n", ne); + return 1; + } + } + + if (ctx.use_event) + ibv_ack_cq_events(ctx.recv_cq, num_cq_events); + + /* Process should get an ack from the daemon to close its resources to + * make sure latest daemon's response sent via its target QP destined + * to an XSRQ created by another client won't be lost. + * Failure to do so may cause the other client to wait for that sent + * message forever. See comment on pp_post_send. 
+ */ + if (servername) { + if (pp_client_termination()) + return 1; + } else if (pp_server_termination()) { + return 1; + } + + if (pp_close_ctx()) + return 1; + + printf("success\n"); + return 0; +} diff --git a/libibverbs/ibdev_nl.c b/libibverbs/ibdev_nl.c new file mode 100644 index 0000000..b459f0b --- /dev/null +++ b/libibverbs/ibdev_nl.c @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2019, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include <util/rdma_nl.h> + +#include <dirent.h> +#include <fcntl.h> +#include <unistd.h> +#include <sys/sysmacros.h> + +#include <ccan/list.h> +#include <util/util.h> +#include <infiniband/driver.h> + +#include "ibverbs.h" + +/* Determine the name of the uverbsX class for the sysfs_dev using sysfs. */ +static int find_uverbs_sysfs(struct verbs_sysfs_dev *sysfs_dev) +{ + char path[IBV_SYSFS_PATH_MAX]; + struct dirent *dent; + DIR *class_dir; + int ret = ENOENT; + + if (!check_snprintf(path, sizeof(path), "%s/device/infiniband_verbs", + sysfs_dev->ibdev_path)) + return ENOMEM; + + class_dir = opendir(path); + if (!class_dir) + return ENOSYS; + + while ((dent = readdir(class_dir))) { + int uv_dirfd; + bool failed; + + if (dent->d_name[0] == '.') + continue; + + uv_dirfd = openat(dirfd(class_dir), dent->d_name, + O_RDONLY | O_DIRECTORY | O_CLOEXEC); + if (uv_dirfd == -1) + break; + failed = setup_sysfs_uverbs(uv_dirfd, dent->d_name, sysfs_dev); + close(uv_dirfd); + if (!failed) + ret = 0; + break; + } + closedir(class_dir); + return ret; +} + +static int find_uverbs_nl_cb(struct nl_msg *msg, void *data) +{ + struct verbs_sysfs_dev *sysfs_dev = data; + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + uint64_t cdev64; + int ret; + + ret = nlmsg_parse(nlmsg_hdr(msg), 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + rdmanl_policy); + if (ret < 0) + return ret; + if (!tb[RDMA_NLDEV_ATTR_CHARDEV] || !tb[RDMA_NLDEV_ATTR_CHARDEV_ABI] || + !tb[RDMA_NLDEV_ATTR_CHARDEV_NAME]) + return NLE_PARSE_ERR; + + /* + * The global uverbs abi is 6 for the request string 'uverbs'. We + * don't expect to ever have to change the ABI version for uverbs + * again. 
+ */ + abi_ver = 6; + + /* + * The top 32 bits of CHARDEV_ABI are reserved for a future use, + * current kernels set them to 0 + */ + sysfs_dev->abi_ver = nla_get_u64(tb[RDMA_NLDEV_ATTR_CHARDEV_ABI]); + if (tb[RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID]) + sysfs_dev->driver_id = + nla_get_u32(tb[RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID]); + else + sysfs_dev->driver_id = RDMA_DRIVER_UNKNOWN; + + /* Convert from huge_encode_dev to whatever glibc uses */ + cdev64 = nla_get_u64(tb[RDMA_NLDEV_ATTR_CHARDEV]); + sysfs_dev->sysfs_cdev = + makedev((cdev64 & 0xfff00) >> 8, + (cdev64 & 0xff) | ((cdev64 >> 12) & 0xfff00)); + + if (!check_snprintf(sysfs_dev->sysfs_name, + sizeof(sysfs_dev->sysfs_name), "%s", + nla_get_string(tb[RDMA_NLDEV_ATTR_CHARDEV_NAME]))) + return NLE_PARSE_ERR; + return 0; +} + +/* Ask the kernel for the uverbs char device information */ +static int find_uverbs_nl(struct nl_sock *nl, struct verbs_sysfs_dev *sysfs_dev) +{ + if (rdmanl_get_chardev(nl, sysfs_dev->ibdev_idx, "uverbs", + find_uverbs_nl_cb, sysfs_dev)) + return -1; + if (!sysfs_dev->sysfs_name[0]) + return -1; + return 0; +} + +static int find_sysfs_devs_nl_cb(struct nl_msg *msg, void *data) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct list_head *sysfs_list = data; + struct verbs_sysfs_dev *sysfs_dev; + int ret; + + ret = nlmsg_parse(nlmsg_hdr(msg), 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + rdmanl_policy); + if (ret < 0) + return ret; + if (!tb[RDMA_NLDEV_ATTR_DEV_NAME] || + !tb[RDMA_NLDEV_ATTR_DEV_NODE_TYPE] || + !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || + !tb[RDMA_NLDEV_ATTR_NODE_GUID]) + return NLE_PARSE_ERR; + + sysfs_dev = calloc(1, sizeof(*sysfs_dev)); + if (!sysfs_dev) + return NLE_NOMEM; + + sysfs_dev->ibdev_idx = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + sysfs_dev->node_guid = nla_get_u64(tb[RDMA_NLDEV_ATTR_NODE_GUID]); + sysfs_dev->flags |= VSYSFS_READ_NODE_GUID; + if (!check_snprintf(sysfs_dev->ibdev_name, + sizeof(sysfs_dev->ibdev_name), "%s", + nla_get_string(tb[RDMA_NLDEV_ATTR_DEV_NAME]))) + goto err; + if (!check_snprintf( + sysfs_dev->ibdev_path, sizeof(sysfs_dev->ibdev_path), + "%s/class/infiniband/%s", ibv_get_sysfs_path(), + sysfs_dev->ibdev_name)) + goto err; + if (tb[RDMA_NLDEV_ATTR_FW_VERSION]) { + if (!check_snprintf( + sysfs_dev->fw_ver, sizeof(sysfs_dev->fw_ver), "%s", + nla_get_string(tb[RDMA_NLDEV_ATTR_FW_VERSION]))) + goto err; + sysfs_dev->flags |= VSYSFS_READ_FW_VER; + } + sysfs_dev->node_type = decode_knode_type( + nla_get_u8(tb[RDMA_NLDEV_ATTR_DEV_NODE_TYPE])); + + /* + * We don't need to check the cdev as netlink only shows us devices in + * this namespace + */ + + list_add(sysfs_list, &sysfs_dev->entry); + return NL_OK; + +err: + free(sysfs_dev); + return NLE_PARSE_ERR; +} + +/* Fetch the list of IB devices and uverbs from netlink */ +int find_sysfs_devs_nl(struct list_head *tmp_sysfs_dev_list) +{ + struct verbs_sysfs_dev *dev, *dev_tmp; + struct nl_sock *nl; + + nl = rdmanl_socket_alloc(); + if (!nl) + return -EOPNOTSUPP; + + if (rdmanl_get_devices(nl, find_sysfs_devs_nl_cb, tmp_sysfs_dev_list)) + goto err; + + list_for_each_safe (tmp_sysfs_dev_list, dev, dev_tmp, entry) { + if (find_uverbs_nl(nl, dev) && find_uverbs_sysfs(dev)) { + list_del(&dev->entry); + free(dev); + } + } + + nl_socket_free(nl); + return 0; + +err: + list_for_each_safe (tmp_sysfs_dev_list, dev, dev_tmp, entry) { + list_del(&dev->entry); + free(dev); + } + nl_socket_free(nl); + return EINVAL; +} diff --git a/libibverbs/ibverbs.h b/libibverbs/ibverbs.h new file mode 100644 index 0000000..4b9b88f --- /dev/null +++ b/libibverbs/ibverbs.h 
@@ -0,0 +1,93 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_VERBS_H +#define IB_VERBS_H + +#include <pthread.h> + +#include <infiniband/driver.h> +#include <ccan/bitmap.h> + +#define INIT __attribute__((constructor)) + +#define PFX "libibverbs: " +#define VERBS_OPS_NUM (sizeof(struct verbs_context_ops) / sizeof(void *)) + +struct ibv_abi_compat_v2 { + struct ibv_comp_channel channel; + pthread_mutex_t in_use; +}; + +extern int abi_ver; +extern const struct verbs_context_ops verbs_dummy_ops; + +int ibverbs_get_device_list(struct list_head *list); +int ibverbs_init(void); +void ibverbs_device_put(struct ibv_device *dev); +void ibverbs_device_hold(struct ibv_device *dev); +int __lib_query_port(struct ibv_context *context, uint8_t port_num, + struct ibv_port_attr *port_attr, size_t port_attr_len); +int setup_sysfs_uverbs(int uv_dirfd, const char *uverbs, + struct verbs_sysfs_dev *sysfs_dev); + +#ifdef _STATIC_LIBRARY_BUILD_ +static inline void load_drivers(void) +{ +} +#else +void load_drivers(void); +#endif + +struct verbs_ex_private { + BITMAP_DECLARE(unsupported_ioctls, VERBS_OPS_NUM); + uint32_t driver_id; + bool use_ioctl_write; + struct verbs_context_ops ops; +}; + +static inline struct verbs_ex_private *get_priv(struct ibv_context *ctx) +{ + return container_of(ctx, struct verbs_context, context)->priv; +} + +static inline const struct verbs_context_ops *get_ops(struct ibv_context *ctx) +{ + return &get_priv(ctx)->ops; +} + +enum ibv_node_type decode_knode_type(unsigned int knode_type); + +int find_sysfs_devs_nl(struct list_head *tmp_sysfs_dev_list); + +#endif /* IB_VERBS_H */ diff --git a/libibverbs/init.c b/libibverbs/init.c new file mode 100644 index 0000000..7bac5af --- /dev/null +++ b/libibverbs/init.c @@ -0,0 +1,644 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#define _GNU_SOURCE +#include <config.h> + +#include <stdlib.h> +#include <string.h> +#include <glob.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/time.h> +#include <sys/resource.h> +#include <fcntl.h> +#include <dirent.h> +#include <errno.h> +#include <assert.h> +#include <fnmatch.h> +#include <sys/sysmacros.h> + +#include <rdma/rdma_netlink.h> + +#include <util/util.h> +#include "ibverbs.h" +#include <infiniband/cmd_write.h> + +int abi_ver; + +struct ibv_driver { + struct list_node entry; + const struct verbs_device_ops *ops; +}; + +static LIST_HEAD(driver_list); + +static int try_access_device(const struct verbs_sysfs_dev *sysfs_dev) +{ + struct stat cdev_stat; + char *devpath; + int ret; + + if (asprintf(&devpath, RDMA_CDEV_DIR"/%s", + sysfs_dev->sysfs_name) < 0) + return ENOMEM; + + ret = stat(devpath, &cdev_stat); + free(devpath); + return ret; +} + +enum ibv_node_type decode_knode_type(unsigned int knode_type) +{ + switch (knode_type) { + case RDMA_NODE_IB_CA: + return IBV_NODE_CA; + case RDMA_NODE_IB_SWITCH: + return IBV_NODE_SWITCH; + case RDMA_NODE_IB_ROUTER: + return IBV_NODE_ROUTER; + case RDMA_NODE_RNIC: + return IBV_NODE_RNIC; + case RDMA_NODE_USNIC: + return IBV_NODE_USNIC; + case RDMA_NODE_USNIC_UDP: + return IBV_NODE_USNIC_UDP; + case RDMA_NODE_UNSPECIFIED: + return IBV_NODE_UNSPECIFIED; + } + return IBV_NODE_UNKNOWN; +} + +int setup_sysfs_uverbs(int uv_dirfd, const char *uverbs, + struct verbs_sysfs_dev *sysfs_dev) +{ + unsigned int major; + unsigned int minor; + struct stat buf; + char value[32]; + + if (!check_snprintf(sysfs_dev->sysfs_name, + sizeof(sysfs_dev->sysfs_name), "%s", uverbs)) + return -1; + + if (stat(sysfs_dev->ibdev_path, &buf)) + return -1; + sysfs_dev->time_created = buf.st_mtim; + + if (ibv_read_sysfs_file_at(uv_dirfd, "dev", value, + sizeof(value)) < 0) + return -1; + if (sscanf(value, "%u:%u", &major, &minor) != 2) + return -1; + sysfs_dev->sysfs_cdev = makedev(major, minor); + + if (ibv_read_sysfs_file_at(uv_dirfd, "abi_version", value, + sizeof(value)) > 0) + sysfs_dev->abi_ver = strtoul(value, NULL, 10); + + return 0; +} + +static int setup_sysfs_dev(int dirfd, const char *uverbs, + struct list_head *tmp_sysfs_dev_list) +{ + struct verbs_sysfs_dev *sysfs_dev = 
NULL; + char value[32]; + int uv_dirfd; + + sysfs_dev = calloc(1, sizeof(*sysfs_dev)); + if (!sysfs_dev) + return ENOMEM; + + sysfs_dev->ibdev_idx = -1; + + uv_dirfd = openat(dirfd, uverbs, O_RDONLY | O_DIRECTORY | O_CLOEXEC); + if (uv_dirfd == -1) + goto err_alloc; + + if (ibv_read_sysfs_file_at(uv_dirfd, "ibdev", sysfs_dev->ibdev_name, + sizeof(sysfs_dev->ibdev_name)) < 0) + goto err_fd; + + if (!check_snprintf( + sysfs_dev->ibdev_path, sizeof(sysfs_dev->ibdev_path), + "%s/class/infiniband/%s", ibv_get_sysfs_path(), + sysfs_dev->ibdev_name)) + goto err_fd; + + if (setup_sysfs_uverbs(uv_dirfd, uverbs, sysfs_dev)) + goto err_fd; + + if (ibv_read_ibdev_sysfs_file(value, sizeof(value), sysfs_dev, + "node_type") <= 0) + sysfs_dev->node_type = IBV_NODE_UNKNOWN; + else + sysfs_dev->node_type = + decode_knode_type(strtoul(value, NULL, 10)); + + if (try_access_device(sysfs_dev)) + goto err_fd; + + close(uv_dirfd); + list_add(tmp_sysfs_dev_list, &sysfs_dev->entry); + return 0; + +err_fd: + close(uv_dirfd); +err_alloc: + free(sysfs_dev); + return 0; +} + +static int find_sysfs_devs(struct list_head *tmp_sysfs_dev_list) +{ + struct verbs_sysfs_dev *dev, *dev_tmp; + char class_path[IBV_SYSFS_PATH_MAX]; + DIR *class_dir; + struct dirent *dent; + int ret = 0; + + if (!check_snprintf(class_path, sizeof(class_path), + "%s/class/infiniband_verbs", ibv_get_sysfs_path())) + return ENOMEM; + + class_dir = opendir(class_path); + if (!class_dir) + return ENOSYS; + + while ((dent = readdir(class_dir))) { + if (dent->d_name[0] == '.') + continue; + + ret = setup_sysfs_dev(dirfd(class_dir), dent->d_name, + tmp_sysfs_dev_list); + if (ret) + break; + } + closedir(class_dir); + + if (ret) { + list_for_each_safe (tmp_sysfs_dev_list, dev, dev_tmp, entry) { + list_del(&dev->entry); + free(dev); + } + } + return ret; +} + +void verbs_register_driver(const struct verbs_device_ops *ops) +{ + struct ibv_driver *driver; + + driver = malloc(sizeof *driver); + if (!driver) { + fprintf(stderr, + PFX "Warning: couldn't allocate driver for %s\n", + ops->name); + return; + } + + driver->ops = ops; + + list_add_tail(&driver_list, &driver->entry); +} + +/* Match a single modalias value */ +static bool match_modalias(const struct verbs_match_ent *ent, const char *value) +{ + char pci_ma[100]; + + switch (ent->kind) { + case VERBS_MATCH_MODALIAS: + return fnmatch(ent->u.modalias, value, 0) == 0; + case VERBS_MATCH_PCI: + snprintf(pci_ma, sizeof(pci_ma), "pci:v%08Xd%08Xsv*", + ent->vendor, ent->device); + return fnmatch(pci_ma, value, 0) == 0; + default: + return false; + } +} + +/* Search a null terminated table of verbs_match_ent's and return the one + * that matches the device the verbs sysfs device is bound to or NULL. 
+ */ +static const struct verbs_match_ent * +match_modalias_device(const struct verbs_device_ops *ops, + struct verbs_sysfs_dev *sysfs_dev) +{ + const struct verbs_match_ent *i; + + if (!(sysfs_dev->flags & VSYSFS_READ_MODALIAS)) { + sysfs_dev->flags |= VSYSFS_READ_MODALIAS; + if (ibv_read_ibdev_sysfs_file( + sysfs_dev->modalias, sizeof(sysfs_dev->modalias), + sysfs_dev, "device/modalias") <= 0) { + sysfs_dev->modalias[0] = 0; + return NULL; + } + } + + for (i = ops->match_table; i->kind != VERBS_MATCH_SENTINEL; i++) + if (match_modalias(i, sysfs_dev->modalias)) + return i; + + return NULL; +} + +/* Match the device name itself */ +static const struct verbs_match_ent * +match_name(const struct verbs_device_ops *ops, + struct verbs_sysfs_dev *sysfs_dev) +{ + char name_ma[100]; + const struct verbs_match_ent *i; + + if (!check_snprintf(name_ma, sizeof(name_ma), + "rdma_device:N%s", sysfs_dev->ibdev_name)) + return NULL; + + for (i = ops->match_table; i->kind != VERBS_MATCH_SENTINEL; i++) + if (match_modalias(i, name_ma)) + return i; + + return NULL; +} + +/* Match the driver id we get from netlink */ +static const struct verbs_match_ent * +match_driver_id(const struct verbs_device_ops *ops, + struct verbs_sysfs_dev *sysfs_dev) +{ + const struct verbs_match_ent *i; + + if (sysfs_dev->driver_id == RDMA_DRIVER_UNKNOWN) + return NULL; + + for (i = ops->match_table; i->kind != VERBS_MATCH_SENTINEL; i++) + if (i->kind == VERBS_MATCH_DRIVER_ID && + i->u.driver_id == sysfs_dev->driver_id) + return i; + return NULL; +} + +/* True if the provider matches the selected rdma sysfs device */ +static bool match_device(const struct verbs_device_ops *ops, + struct verbs_sysfs_dev *sysfs_dev) +{ + if (ops->match_table) { + sysfs_dev->match = match_driver_id(ops, sysfs_dev); + if (!sysfs_dev->match) + sysfs_dev->match = match_name(ops, sysfs_dev); + if (!sysfs_dev->match) + sysfs_dev->match = + match_modalias_device(ops, sysfs_dev); + } + + if (ops->match_device) { + /* If a matching function is provided then it is called + * unconditionally after the table match above, it is + * responsible for determining if the device matches based on + * the match pointer and any other internal information. 
+ */ + if (!ops->match_device(sysfs_dev)) + return false; + } else { + /* With no match function, we must have a table match */ + if (!sysfs_dev->match) + return false; + } + + if (sysfs_dev->abi_ver < ops->match_min_abi_version || + sysfs_dev->abi_ver > ops->match_max_abi_version) { + fprintf(stderr, PFX + "Warning: Driver %s does not support the kernel ABI of %u (supports %u to %u) for device %s\n", + ops->name, sysfs_dev->abi_ver, + ops->match_min_abi_version, + ops->match_max_abi_version, + sysfs_dev->ibdev_path); + return false; + } + return true; +} + +static struct verbs_device *try_driver(const struct verbs_device_ops *ops, + struct verbs_sysfs_dev *sysfs_dev) +{ + struct verbs_device *vdev; + struct ibv_device *dev; + + if (!match_device(ops, sysfs_dev)) + return NULL; + + vdev = ops->alloc_device(sysfs_dev); + if (!vdev) { + fprintf(stderr, PFX "Fatal: couldn't allocate device for %s\n", + sysfs_dev->ibdev_path); + return NULL; + } + + vdev->ops = ops; + + atomic_init(&vdev->refcount, 1); + dev = &vdev->device; + assert(dev->_ops._dummy1 == NULL); + assert(dev->_ops._dummy2 == NULL); + + dev->node_type = sysfs_dev->node_type; + switch (sysfs_dev->node_type) { + case IBV_NODE_CA: + case IBV_NODE_SWITCH: + case IBV_NODE_ROUTER: + dev->transport_type = IBV_TRANSPORT_IB; + break; + case IBV_NODE_RNIC: + dev->transport_type = IBV_TRANSPORT_IWARP; + break; + case IBV_NODE_USNIC: + dev->transport_type = IBV_TRANSPORT_USNIC; + break; + case IBV_NODE_USNIC_UDP: + dev->transport_type = IBV_TRANSPORT_USNIC_UDP; + break; + case IBV_NODE_UNSPECIFIED: + dev->transport_type = IBV_TRANSPORT_UNSPECIFIED; + break; + default: + dev->transport_type = IBV_TRANSPORT_UNKNOWN; + break; + } + + strcpy(dev->dev_name, sysfs_dev->sysfs_name); + if (!check_snprintf(dev->dev_path, sizeof(dev->dev_path), + "%s/class/infiniband_verbs/%s", + ibv_get_sysfs_path(), sysfs_dev->sysfs_name)) + goto err; + strcpy(dev->name, sysfs_dev->ibdev_name); + strcpy(dev->ibdev_path, sysfs_dev->ibdev_path); + vdev->sysfs = sysfs_dev; + + return vdev; + +err: + ops->uninit_device(vdev); + return NULL; +} + +static struct verbs_device *try_drivers(struct verbs_sysfs_dev *sysfs_dev) +{ + struct ibv_driver *driver; + struct verbs_device *dev; + + /* + * Matching by driver_id takes priority over other match types, do it + * first. 
+ */ + if (sysfs_dev->driver_id != RDMA_DRIVER_UNKNOWN) { + list_for_each (&driver_list, driver, entry) { + if (match_driver_id(driver->ops, sysfs_dev)) { + dev = try_driver(driver->ops, sysfs_dev); + if (dev) + return dev; + } + } + } + + list_for_each(&driver_list, driver, entry) { + dev = try_driver(driver->ops, sysfs_dev); + if (dev) + return dev; + } + + return NULL; +} + +static int check_abi_version(void) +{ + char value[8]; + + if (abi_ver) + return 0; + + if (ibv_read_sysfs_file(ibv_get_sysfs_path(), + "class/infiniband_verbs/abi_version", value, + sizeof(value)) < 0) { + return ENOSYS; + } + + abi_ver = strtol(value, NULL, 10); + + if (abi_ver < IB_USER_VERBS_MIN_ABI_VERSION || + abi_ver > IB_USER_VERBS_MAX_ABI_VERSION) { + fprintf(stderr, PFX "Fatal: kernel ABI version %d " + "doesn't match library version %d.\n", + abi_ver, IB_USER_VERBS_MAX_ABI_VERSION); + return ENOSYS; + } + + return 0; +} + +static void check_memlock_limit(void) +{ + struct rlimit rlim; + + if (!geteuid()) + return; + + if (getrlimit(RLIMIT_MEMLOCK, &rlim)) { + fprintf(stderr, PFX "Warning: getrlimit(RLIMIT_MEMLOCK) failed."); + return; + } + + if (rlim.rlim_cur <= 32768) + fprintf(stderr, PFX "Warning: RLIMIT_MEMLOCK is %llu bytes.\n" + " This will severely limit memory registrations.\n", + (unsigned long long)rlim.rlim_cur); +} + +static int same_sysfs_dev(struct verbs_sysfs_dev *sysfs1, + struct verbs_sysfs_dev *sysfs2) +{ + if (strcmp(sysfs1->sysfs_name, sysfs2->sysfs_name) != 0) + return 0; + + /* In netlink mode the idx is a globally unique ID */ + if (sysfs1->ibdev_idx != sysfs2->ibdev_idx) + return 0; + + if (sysfs1->ibdev_idx == -1 && + ts_cmp(&sysfs1->time_created, &sysfs2->time_created, !=)) + return 0; + + return 1; +} + +/* Match every ibv_sysfs_dev in the sysfs_list to a driver and add a new entry + * to device_list. Once matched to a driver the entry in sysfs_list is + * removed. + */ +static void try_all_drivers(struct list_head *sysfs_list, + struct list_head *device_list, + unsigned int *num_devices) +{ + struct verbs_sysfs_dev *sysfs_dev; + struct verbs_sysfs_dev *tmp; + struct verbs_device *vdev; + + list_for_each_safe(sysfs_list, sysfs_dev, tmp, entry) { + vdev = try_drivers(sysfs_dev); + if (vdev) { + list_del(&sysfs_dev->entry); + /* Ownership of sysfs_dev moves into vdev->sysfs */ + list_add(device_list, &vdev->entry); + (*num_devices)++; + } + } +} + +int ibverbs_get_device_list(struct list_head *device_list) +{ + LIST_HEAD(sysfs_list); + struct verbs_sysfs_dev *sysfs_dev, *next_dev; + struct verbs_device *vdev, *tmp; + static int drivers_loaded; + unsigned int num_devices = 0; + int ret; + + ret = find_sysfs_devs_nl(&sysfs_list); + if (ret) { + ret = find_sysfs_devs(&sysfs_list); + if (ret) + return -ret; + } + + if (!list_empty(&sysfs_list)) { + ret = check_abi_version(); + if (ret) + return -ret; + } + + /* Remove entries from the sysfs_list that are already preset in the + * device_list, and remove entries from the device_list that are not + * present in the sysfs_list. 
+ */ + list_for_each_safe(device_list, vdev, tmp, entry) { + struct verbs_sysfs_dev *old_sysfs = NULL; + + list_for_each(&sysfs_list, sysfs_dev, entry) { + if (same_sysfs_dev(vdev->sysfs, sysfs_dev)) { + old_sysfs = sysfs_dev; + break; + } + } + + if (old_sysfs) { + list_del(&old_sysfs->entry); + free(old_sysfs); + num_devices++; + } else { + list_del(&vdev->entry); + ibverbs_device_put(&vdev->device); + } + } + + try_all_drivers(&sysfs_list, device_list, &num_devices); + + if (list_empty(&sysfs_list) || drivers_loaded) + goto out; + + load_drivers(); + drivers_loaded = 1; + + try_all_drivers(&sysfs_list, device_list, &num_devices); + +out: + /* Anything left in sysfs_list was not assoicated with a + * driver. + */ + list_for_each_safe(&sysfs_list, sysfs_dev, next_dev, entry) { + if (getenv("IBV_SHOW_WARNINGS")) { + fprintf(stderr, PFX + "Warning: no userspace device-specific driver found for %s\n", + sysfs_dev->ibdev_name); + } + free(sysfs_dev); + } + + return num_devices; +} + +int ibverbs_init(void) +{ + char *env_value; + + if (getenv("RDMAV_FORK_SAFE") || getenv("IBV_FORK_SAFE")) + if (ibv_fork_init()) + fprintf(stderr, PFX "Warning: fork()-safety requested " + "but init failed\n"); + + /* Backward compatibility for the mlx4 driver env */ + env_value = getenv("MLX4_DEVICE_FATAL_CLEANUP"); + if (env_value) + verbs_allow_disassociate_destroy = strcmp(env_value, "0") != 0; + + if (getenv("RDMAV_ALLOW_DISASSOC_DESTROY")) + verbs_allow_disassociate_destroy = true; + + if (!ibv_get_sysfs_path()) + return -errno; + + check_memlock_limit(); + + return 0; +} + +void ibverbs_device_hold(struct ibv_device *dev) +{ + struct verbs_device *verbs_device = verbs_get_device(dev); + + atomic_fetch_add(&verbs_device->refcount, 1); +} + +void ibverbs_device_put(struct ibv_device *dev) +{ + struct verbs_device *verbs_device = verbs_get_device(dev); + + if (atomic_fetch_sub(&verbs_device->refcount, 1) == 1) { + free(verbs_device->sysfs); + if (verbs_device->ops->uninit_device) + verbs_device->ops->uninit_device(verbs_device); + } +} diff --git a/libibverbs/kern-abi.h b/libibverbs/kern-abi.h new file mode 100644 index 0000000..dc2f33d --- /dev/null +++ b/libibverbs/kern-abi.h @@ -0,0 +1,311 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef KERN_ABI_H +#define KERN_ABI_H + +#include <linux/types.h> +#include <assert.h> +#include <ccan/container_of.h> + +#include <rdma/ib_user_verbs.h> +#include <kernel-abi/ib_user_verbs.h> + +/* + * The minimum and maximum kernel ABI that we can handle. + */ +#define IB_USER_VERBS_MIN_ABI_VERSION 3 +#define IB_USER_VERBS_MAX_ABI_VERSION 6 + +struct ex_hdr { + struct ib_uverbs_cmd_hdr hdr; + struct ib_uverbs_ex_cmd_hdr ex_hdr; +}; + +/* + * These macros expand to type names that refer to the ABI structure type + * associated with the given enum string. + */ +#define IBV_ABI_REQ(_enum) _ABI_REQ_STRUCT_##_enum +#define IBV_KABI_REQ(_enum) _KABI_REQ_STRUCT_##_enum +#define IBV_KABI_RESP(_enum) _KABI_RESP_STRUCT_##_enum + +#define IBV_ABI_ALIGN(_enum) _ABI_ALIGN_##_enum + +/* + * Historically the code had copied the data in the kernel headers, modified + * it and placed them in structs. To avoid recoding eveything we continue to + * preserve the same struct layout, with the kernel struct 'loose' inside the + * modified userspace struct. + * + * This is automated with the make_abi_structs.py script which produces the + * _STRUCT_xx macro that produces a tagless version of the kernel struct. The + * tagless struct produces a layout that matches the original code. + */ +#define DECLARE_CMDX(_enum, _name, _kabi, _kabi_resp) \ + struct _name { \ + struct ib_uverbs_cmd_hdr hdr; \ + union { \ + _STRUCT_##_kabi; \ + struct _kabi core_payload; \ + }; \ + }; \ + typedef struct _name IBV_ABI_REQ(_enum); \ + typedef struct _kabi IBV_KABI_REQ(_enum); \ + typedef struct _kabi_resp IBV_KABI_RESP(_enum); \ + enum { IBV_ABI_ALIGN(_enum) = 4 }; \ + static_assert(sizeof(struct _kabi_resp) % 4 == 0, \ + "Bad resp alignment"); \ + static_assert(_enum != -1, "Bad enum"); \ + static_assert(sizeof(struct _name) == \ + sizeof(struct ib_uverbs_cmd_hdr) + \ + sizeof(struct _kabi), \ + "Bad size") + +#define DECLARE_CMD(_enum, _name, _kabi) \ + DECLARE_CMDX(_enum, _name, _kabi, _kabi##_resp) + +#define DECLARE_CMD_EXX(_enum, _name, _kabi, _kabi_resp) \ + struct _name { \ + struct ex_hdr hdr; \ + union { \ + _STRUCT_##_kabi; \ + struct _kabi core_payload; \ + }; \ + }; \ + typedef struct _name IBV_ABI_REQ(_enum); \ + typedef struct _kabi IBV_KABI_REQ(_enum); \ + typedef struct _kabi_resp IBV_KABI_RESP(_enum); \ + enum { IBV_ABI_ALIGN(_enum) = 8 }; \ + static_assert(_enum != -1, "Bad enum"); \ + static_assert(sizeof(struct _kabi) % 8 == 0, "Bad req alignment"); \ + static_assert(sizeof(struct _kabi_resp) % 8 == 0, \ + "Bad resp alignment"); \ + static_assert(sizeof(struct _name) == \ + sizeof(struct ex_hdr) + sizeof(struct _kabi), \ + "Bad size"); \ + static_assert(sizeof(struct _name) % 8 == 0, "Bad alignment") +#define DECLARE_CMD_EX(_enum, _name, _kabi) \ + DECLARE_CMD_EXX(_enum, _name, _kabi, _kabi##_resp) + +/* Drivers may use 'empty' for _kabi to signal no struct */ +struct empty {}; +#define _STRUCT_empty struct {} + +/* + * Define the ABI struct for use by the driver. The internal cmd APIs require + * this layout. The driver specifies the enum # they wish to define for and + * the base name, and the macros figure out the rest correctly. 
+ * + * The static asserts check that the layout produced by the wrapper struct has + * no implicit padding in strange places, specifically between the core + * structure and the driver structure and between the driver structure and the + * end of the struct. + * + * Implicit padding can arise in various cases where the structs are not sizes + * to a multiple of 8 bytes. + */ +#define DECLARE_DRV_CMD(_name, _enum, _kabi_req, _kabi_resp) \ + struct _name { \ + IBV_ABI_REQ(_enum) ibv_cmd; \ + union { \ + _STRUCT_##_kabi_req; \ + struct _kabi_req drv_payload; \ + }; \ + }; \ + struct _name##_resp { \ + IBV_KABI_RESP(_enum) ibv_resp; \ + union { \ + _STRUCT_##_kabi_resp; \ + struct _kabi_resp drv_payload; \ + }; \ + }; \ + static_assert(sizeof(IBV_KABI_REQ(_enum)) % \ + __alignof__(struct _kabi_req) == \ + 0, \ + "Bad kabi req struct length"); \ + static_assert(sizeof(struct _name) == \ + sizeof(IBV_ABI_REQ(_enum)) + \ + sizeof(struct _kabi_req), \ + "Bad req size"); \ + static_assert(sizeof(struct _name) % IBV_ABI_ALIGN(_enum) == 0, \ + "Bad kabi req alignment"); \ + static_assert(sizeof(IBV_KABI_RESP(_enum)) % \ + __alignof__(struct _kabi_resp) == \ + 0, \ + "Bad kabi resp struct length"); \ + static_assert(sizeof(struct _name##_resp) == \ + sizeof(IBV_KABI_RESP(_enum)) + \ + sizeof(struct _kabi_resp), \ + "Bad resp size"); \ + static_assert(sizeof(struct _name##_resp) % IBV_ABI_ALIGN(_enum) == 0, \ + "Bad kabi resp alignment"); + +DECLARE_CMD(IB_USER_VERBS_CMD_ALLOC_MW, ibv_alloc_mw, ib_uverbs_alloc_mw); +DECLARE_CMD(IB_USER_VERBS_CMD_ALLOC_PD, ibv_alloc_pd, ib_uverbs_alloc_pd); +DECLARE_CMDX(IB_USER_VERBS_CMD_ATTACH_MCAST, ibv_attach_mcast, ib_uverbs_attach_mcast, empty); +DECLARE_CMDX(IB_USER_VERBS_CMD_CLOSE_XRCD, ibv_close_xrcd, ib_uverbs_close_xrcd, empty); +DECLARE_CMD(IB_USER_VERBS_CMD_CREATE_AH, ibv_create_ah, ib_uverbs_create_ah); +DECLARE_CMD(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL, ibv_create_comp_channel, ib_uverbs_create_comp_channel); +DECLARE_CMD(IB_USER_VERBS_CMD_CREATE_CQ, ibv_create_cq, ib_uverbs_create_cq); +DECLARE_CMD(IB_USER_VERBS_CMD_CREATE_QP, ibv_create_qp, ib_uverbs_create_qp); +DECLARE_CMD(IB_USER_VERBS_CMD_CREATE_SRQ, ibv_create_srq, ib_uverbs_create_srq); +DECLARE_CMDX(IB_USER_VERBS_CMD_CREATE_XSRQ, ibv_create_xsrq, ib_uverbs_create_xsrq, ib_uverbs_create_srq_resp); +DECLARE_CMDX(IB_USER_VERBS_CMD_DEALLOC_MW, ibv_dealloc_mw, ib_uverbs_dealloc_mw, empty); +DECLARE_CMDX(IB_USER_VERBS_CMD_DEALLOC_PD, ibv_dealloc_pd, ib_uverbs_dealloc_pd, empty); +DECLARE_CMDX(IB_USER_VERBS_CMD_DEREG_MR, ibv_dereg_mr, ib_uverbs_dereg_mr, empty); +DECLARE_CMDX(IB_USER_VERBS_CMD_DESTROY_AH, ibv_destroy_ah, ib_uverbs_destroy_ah, empty); +DECLARE_CMD(IB_USER_VERBS_CMD_DESTROY_CQ, ibv_destroy_cq, ib_uverbs_destroy_cq); +DECLARE_CMD(IB_USER_VERBS_CMD_DESTROY_QP, ibv_destroy_qp, ib_uverbs_destroy_qp); +DECLARE_CMD(IB_USER_VERBS_CMD_DESTROY_SRQ, ibv_destroy_srq, ib_uverbs_destroy_srq); +DECLARE_CMDX(IB_USER_VERBS_CMD_DETACH_MCAST, ibv_detach_mcast, ib_uverbs_detach_mcast, empty); +DECLARE_CMD(IB_USER_VERBS_CMD_GET_CONTEXT, ibv_get_context, ib_uverbs_get_context); +DECLARE_CMDX(IB_USER_VERBS_CMD_MODIFY_QP, ibv_modify_qp, ib_uverbs_modify_qp, empty); +DECLARE_CMDX(IB_USER_VERBS_CMD_MODIFY_SRQ, ibv_modify_srq, ib_uverbs_modify_srq, empty); +DECLARE_CMDX(IB_USER_VERBS_CMD_OPEN_QP, ibv_open_qp, ib_uverbs_open_qp, ib_uverbs_create_qp_resp); +DECLARE_CMD(IB_USER_VERBS_CMD_OPEN_XRCD, ibv_open_xrcd, ib_uverbs_open_xrcd); +DECLARE_CMD(IB_USER_VERBS_CMD_POLL_CQ, ibv_poll_cq, ib_uverbs_poll_cq); 
+DECLARE_CMD(IB_USER_VERBS_CMD_POST_RECV, ibv_post_recv, ib_uverbs_post_recv); +DECLARE_CMD(IB_USER_VERBS_CMD_POST_SEND, ibv_post_send, ib_uverbs_post_send); +DECLARE_CMD(IB_USER_VERBS_CMD_POST_SRQ_RECV, ibv_post_srq_recv, ib_uverbs_post_srq_recv); +DECLARE_CMD(IB_USER_VERBS_CMD_QUERY_DEVICE, ibv_query_device, ib_uverbs_query_device); +DECLARE_CMD(IB_USER_VERBS_CMD_QUERY_PORT, ibv_query_port, ib_uverbs_query_port); +DECLARE_CMD(IB_USER_VERBS_CMD_QUERY_QP, ibv_query_qp, ib_uverbs_query_qp); +DECLARE_CMD(IB_USER_VERBS_CMD_QUERY_SRQ, ibv_query_srq, ib_uverbs_query_srq); +DECLARE_CMD(IB_USER_VERBS_CMD_REG_MR, ibv_reg_mr, ib_uverbs_reg_mr); +DECLARE_CMDX(IB_USER_VERBS_CMD_REQ_NOTIFY_CQ, ibv_req_notify_cq, ib_uverbs_req_notify_cq, empty); +DECLARE_CMD(IB_USER_VERBS_CMD_REREG_MR, ibv_rereg_mr, ib_uverbs_rereg_mr); +DECLARE_CMD(IB_USER_VERBS_CMD_RESIZE_CQ, ibv_resize_cq, ib_uverbs_resize_cq); + +DECLARE_CMD_EX(IB_USER_VERBS_EX_CMD_CREATE_CQ, ibv_create_cq_ex, ib_uverbs_ex_create_cq); +DECLARE_CMD_EX(IB_USER_VERBS_EX_CMD_CREATE_FLOW, ibv_create_flow, ib_uverbs_create_flow); +DECLARE_CMD_EX(IB_USER_VERBS_EX_CMD_CREATE_QP, ibv_create_qp_ex, ib_uverbs_ex_create_qp); +DECLARE_CMD_EX(IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL, ibv_create_rwq_ind_table, ib_uverbs_ex_create_rwq_ind_table); +DECLARE_CMD_EX(IB_USER_VERBS_EX_CMD_CREATE_WQ, ibv_create_wq, ib_uverbs_ex_create_wq); +DECLARE_CMD_EXX(IB_USER_VERBS_EX_CMD_DESTROY_FLOW, ibv_destroy_flow, ib_uverbs_destroy_flow, empty); +DECLARE_CMD_EXX(IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL, ibv_destroy_rwq_ind_table, ib_uverbs_ex_destroy_rwq_ind_table, empty); +DECLARE_CMD_EX(IB_USER_VERBS_EX_CMD_DESTROY_WQ, ibv_destroy_wq, ib_uverbs_ex_destroy_wq); +DECLARE_CMD_EXX(IB_USER_VERBS_EX_CMD_MODIFY_CQ, ibv_modify_cq, ib_uverbs_ex_modify_cq, empty); +DECLARE_CMD_EX(IB_USER_VERBS_EX_CMD_MODIFY_QP, ibv_modify_qp_ex, ib_uverbs_ex_modify_qp); +DECLARE_CMD_EXX(IB_USER_VERBS_EX_CMD_MODIFY_WQ, ibv_modify_wq, ib_uverbs_ex_modify_wq, empty); +DECLARE_CMD_EX(IB_USER_VERBS_EX_CMD_QUERY_DEVICE, ibv_query_device_ex, ib_uverbs_ex_query_device); + +/* + * Both ib_uverbs_create_qp and ib_uverbs_ex_create_qp start with the same + * structure, this function converts the ex version into the normal version + */ +static inline struct ib_uverbs_create_qp * +ibv_create_qp_ex_to_reg(struct ibv_create_qp_ex *cmd_ex) +{ + /* + * user_handle is the start in both places, note that the ex + * does not have response located in the same place, so response + * cannot be touched. + */ + return container_of(&cmd_ex->user_handle, struct ib_uverbs_create_qp, + user_handle); +} + +/* + * This file contains copied data from the kernel's include/uapi/rdma/ib_user_verbs.h, + * now included above. + * + * Whenever possible use the definition from the kernel header and avoid + * copying from that header into this file. 
+ */ + +struct ibv_kern_ipv4_filter { + __u32 src_ip; + __u32 dst_ip; +}; + +struct ibv_kern_spec_ipv4 { + __u32 type; + __u16 size; + __u16 reserved; + struct ibv_kern_ipv4_filter val; + struct ibv_kern_ipv4_filter mask; +}; + +struct ibv_kern_spec { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct ib_uverbs_flow_spec_eth eth; + struct ibv_kern_spec_ipv4 ipv4; + struct ib_uverbs_flow_spec_ipv4 ipv4_ext; + struct ib_uverbs_flow_spec_esp esp; + struct ib_uverbs_flow_spec_tcp_udp tcp_udp; + struct ib_uverbs_flow_spec_ipv6 ipv6; + struct ib_uverbs_flow_spec_gre gre; + struct ib_uverbs_flow_spec_tunnel tunnel; + struct ib_uverbs_flow_spec_mpls mpls; + struct ib_uverbs_flow_spec_action_tag flow_tag; + struct ib_uverbs_flow_spec_action_drop drop; + struct ib_uverbs_flow_spec_action_handle handle; + struct ib_uverbs_flow_spec_action_count flow_count; + }; +}; + +struct ib_uverbs_modify_srq_v3 { + __u32 srq_handle; + __u32 attr_mask; + __u32 max_wr; + __u32 max_sge; + __u32 srq_limit; + __u32 reserved; +}; +#define _STRUCT_ib_uverbs_modify_srq_v3 +enum { IB_USER_VERBS_CMD_MODIFY_SRQ_V3 = IB_USER_VERBS_CMD_MODIFY_SRQ }; +DECLARE_CMDX(IB_USER_VERBS_CMD_MODIFY_SRQ_V3, ibv_modify_srq_v3, ib_uverbs_modify_srq_v3, empty); + +struct ibv_create_qp_resp_v3 { + __u32 qp_handle; + __u32 qpn; +}; + +struct ibv_create_qp_resp_v4 { + __u32 qp_handle; + __u32 qpn; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; +}; + +struct ibv_create_srq_resp_v5 { + __u32 srq_handle; +}; + +#endif /* KERN_ABI_H */ diff --git a/libibverbs/libibverbs.map.in b/libibverbs/libibverbs.map.in new file mode 100644 index 0000000..5280cfe --- /dev/null +++ b/libibverbs/libibverbs.map.in @@ -0,0 +1,204 @@ +/* Do not change this file without reading Documentation/versioning.md */ +IBVERBS_1.0 { + global: + ibv_get_device_list; + ibv_free_device_list; + ibv_get_device_name; + ibv_get_device_guid; + ibv_open_device; + ibv_close_device; + ibv_get_async_event; + ibv_ack_async_event; + ibv_query_device; + ibv_query_device_ex; + ibv_query_port; + ibv_query_gid; + ibv_query_pkey; + ibv_alloc_pd; + ibv_dealloc_pd; + ibv_reg_mr; + ibv_dereg_mr; + ibv_create_comp_channel; + ibv_destroy_comp_channel; + ibv_create_cq; + ibv_resize_cq; + ibv_destroy_cq; + ibv_get_cq_event; + ibv_ack_cq_events; + ibv_create_srq; + ibv_modify_srq; + ibv_query_srq; + ibv_destroy_srq; + ibv_create_qp; + ibv_query_qp; + ibv_modify_qp; + ibv_destroy_qp; + ibv_create_ah; + ibv_destroy_ah; + ibv_attach_mcast; + ibv_detach_mcast; + ibv_rate_to_mult; + mult_to_ibv_rate; + + /* These historical symbols are now private to libibverbs, but used by + other rdma-core libraries. Do not change them. 
*/ + ibv_copy_path_rec_from_kern; + ibv_copy_path_rec_to_kern; + ibv_copy_qp_attr_from_kern; + ibv_get_sysfs_path; + ibv_read_sysfs_file; + + local: *; +}; + +IBVERBS_1.1 { + global: + ibv_ack_async_event; + ibv_ack_cq_events; + ibv_alloc_pd; + ibv_attach_mcast; + ibv_close_device; + ibv_create_ah; + ibv_create_ah_from_wc; + ibv_create_cq; + ibv_create_qp; + ibv_create_srq; + ibv_dealloc_pd; + ibv_dereg_mr; + ibv_destroy_ah; + ibv_destroy_cq; + ibv_destroy_qp; + ibv_destroy_srq; + ibv_detach_mcast; + ibv_dofork_range; + ibv_dontfork_range; + ibv_event_type_str; + ibv_fork_init; + ibv_free_device_list; + ibv_get_async_event; + ibv_get_cq_event; + ibv_get_device_guid; + ibv_get_device_list; + ibv_get_device_name; + ibv_init_ah_from_wc; + ibv_modify_qp; + ibv_modify_srq; + ibv_node_type_str; + ibv_open_device; + ibv_port_state_str; + ibv_query_device; + ibv_query_gid; + ibv_query_pkey; + ibv_query_port; + ibv_query_qp; + ibv_query_srq; + ibv_rate_to_mbps; + ibv_reg_mr; + ibv_register_driver; + ibv_rereg_mr; + ibv_resize_cq; + ibv_resolve_eth_l2_from_gid; + ibv_wc_status_str; + mbps_to_ibv_rate; + + /* These historical symbols are now private to libibverbs, but used by + other rdma-core libraries. Do not change them. */ + ibv_copy_ah_attr_from_kern; +} IBVERBS_1.0; + +IBVERBS_1.5 { + global: + ibv_get_pkey_index; +} IBVERBS_1.1; + +IBVERBS_1.6 { + global: + ibv_qp_to_qp_ex; +} IBVERBS_1.5; + +IBVERBS_1.7 { + global: + ibv_reg_mr_iova; +} IBVERBS_1.6; + +IBVERBS_1.8 { + global: + ibv_reg_mr_iova2; +} IBVERBS_1.7; + +/* If any symbols in this stanza change ABI then the entire staza gets a new symbol + version. See the top level CMakeLists.txt for this setting. */ + +IBVERBS_PRIVATE_@IBVERBS_PABI_VERSION@ { + global: + /* These historical symbols are now private to libibverbs */ + __ioctl_final_num_attrs; + _verbs_init_and_alloc_context; + execute_ioctl; + ibv_cmd_advise_mr; + ibv_cmd_alloc_dm; + ibv_cmd_alloc_mw; + ibv_cmd_alloc_pd; + ibv_cmd_attach_mcast; + ibv_cmd_close_xrcd; + ibv_cmd_create_ah; + ibv_cmd_create_counters; + ibv_cmd_create_cq; + ibv_cmd_create_cq_ex; + ibv_cmd_create_flow; + ibv_cmd_create_flow_action_esp; + ibv_cmd_create_qp; + ibv_cmd_create_qp_ex2; + ibv_cmd_create_qp_ex; + ibv_cmd_create_rwq_ind_table; + ibv_cmd_create_srq; + ibv_cmd_create_srq_ex; + ibv_cmd_create_wq; + ibv_cmd_dealloc_mw; + ibv_cmd_dealloc_pd; + ibv_cmd_dereg_mr; + ibv_cmd_destroy_ah; + ibv_cmd_destroy_counters; + ibv_cmd_destroy_cq; + ibv_cmd_destroy_flow; + ibv_cmd_destroy_flow_action; + ibv_cmd_destroy_qp; + ibv_cmd_destroy_rwq_ind_table; + ibv_cmd_destroy_srq; + ibv_cmd_destroy_wq; + ibv_cmd_detach_mcast; + ibv_cmd_free_dm; + ibv_cmd_get_context; + ibv_cmd_modify_flow_action_esp; + ibv_cmd_modify_qp; + ibv_cmd_modify_qp_ex; + ibv_cmd_modify_srq; + ibv_cmd_modify_wq; + ibv_cmd_open_qp; + ibv_cmd_open_xrcd; + ibv_cmd_poll_cq; + ibv_cmd_post_recv; + ibv_cmd_post_send; + ibv_cmd_post_srq_recv; + ibv_cmd_query_device; + ibv_cmd_query_device_ex; + ibv_cmd_query_port; + ibv_cmd_query_qp; + ibv_cmd_query_srq; + ibv_cmd_read_counters; + ibv_cmd_reg_dm_mr; + ibv_cmd_reg_mr; + ibv_cmd_req_notify_cq; + ibv_cmd_rereg_mr; + ibv_cmd_resize_cq; + ibv_get_fw_ver; + ibv_query_gid_type; + ibv_read_ibdev_sysfs_file; + verbs_allow_disassociate_destroy; + verbs_open_device; + verbs_register_driver_@IBVERBS_PABI_VERSION@; + verbs_set_ops; + verbs_uninit_context; + verbs_init_cq; + ibv_cmd_modify_cq; +}; diff --git a/libibverbs/man/CMakeLists.txt b/libibverbs/man/CMakeLists.txt new file mode 100644 index 0000000..e1d5edf 
--- /dev/null +++ b/libibverbs/man/CMakeLists.txt @@ -0,0 +1,125 @@ +rdma_man_pages( + ibv_advise_mr.3.md + ibv_alloc_dm.3 + ibv_alloc_mw.3 + ibv_alloc_null_mr.3.md + ibv_alloc_parent_domain.3 + ibv_alloc_pd.3 + ibv_alloc_td.3 + ibv_asyncwatch.1 + ibv_attach_counters_point_flow.3.md + ibv_attach_mcast.3.md + ibv_bind_mw.3 + ibv_create_ah.3 + ibv_create_ah_from_wc.3 + ibv_create_comp_channel.3 + ibv_create_counters.3.md + ibv_create_cq.3 + ibv_create_cq_ex.3 + ibv_modify_cq.3 + ibv_create_flow.3 + ibv_create_flow_action.3.md + ibv_create_qp.3 + ibv_create_qp_ex.3 + ibv_create_rwq_ind_table.3 + ibv_create_srq.3 + ibv_create_srq_ex.3 + ibv_create_wq.3 + ibv_devices.1 + ibv_devinfo.1 + ibv_event_type_str.3.md + ibv_fork_init.3.md + ibv_get_async_event.3 + ibv_get_cq_event.3 + ibv_get_device_guid.3.md + ibv_get_device_list.3.md + ibv_get_device_name.3.md + ibv_get_pkey_index.3.md + ibv_get_srq_num.3.md + ibv_inc_rkey.3.md + ibv_modify_qp.3 + ibv_modify_qp_rate_limit.3 + ibv_modify_srq.3 + ibv_modify_wq.3 + ibv_open_device.3 + ibv_open_qp.3 + ibv_open_xrcd.3 + ibv_poll_cq.3 + ibv_post_recv.3 + ibv_post_send.3 + ibv_post_srq_ops.3 + ibv_post_srq_recv.3 + ibv_query_device.3 + ibv_query_device_ex.3 + ibv_query_gid.3.md + ibv_query_pkey.3.md + ibv_query_port.3 + ibv_query_qp.3 + ibv_query_rt_values_ex.3 + ibv_query_srq.3 + ibv_rate_to_mbps.3.md + ibv_rate_to_mult.3.md + ibv_rc_pingpong.1 + ibv_read_counters.3.md + ibv_reg_mr.3 + ibv_req_notify_cq.3.md + ibv_rereg_mr.3.md + ibv_resize_cq.3.md + ibv_srq_pingpong.1 + ibv_uc_pingpong.1 + ibv_ud_pingpong.1 + ibv_wr_post.3.md + ibv_xsrq_pingpong.1 + ) +rdma_alias_man_pages( + ibv_alloc_dm.3 ibv_free_dm.3 + ibv_alloc_dm.3 ibv_reg_dm_mr.3 + ibv_alloc_dm.3 ibv_memcpy_to_dm.3 + ibv_alloc_dm.3 ibv_memcpy_from_dm.3 + ibv_alloc_mw.3 ibv_dealloc_mw.3 + ibv_alloc_pd.3 ibv_dealloc_pd.3 + ibv_alloc_td.3 ibv_dealloc_td.3 + ibv_attach_mcast.3 ibv_detach_mcast.3 + ibv_create_ah.3 ibv_destroy_ah.3 + ibv_create_ah_from_wc.3 ibv_init_ah_from_wc.3 + ibv_create_comp_channel.3 ibv_destroy_comp_channel.3 + ibv_create_counters.3 ibv_destroy_counters.3 + ibv_create_cq.3 ibv_destroy_cq.3 + ibv_create_flow.3 ibv_destroy_flow.3 + ibv_create_flow_action.3 ibv_destroy_flow_action.3 + ibv_create_flow_action.3 ibv_modify_flow_action.3 + ibv_create_qp.3 ibv_destroy_qp.3 + ibv_create_rwq_ind_table.3 ibv_destroy_rwq_ind_table.3 + ibv_create_srq.3 ibv_destroy_srq.3 + ibv_create_wq.3 ibv_destroy_wq.3 + ibv_event_type_str.3 ibv_node_type_str.3 + ibv_event_type_str.3 ibv_port_state_str.3 + ibv_get_async_event.3 ibv_ack_async_event.3 + ibv_get_cq_event.3 ibv_ack_cq_events.3 + ibv_get_device_list.3 ibv_free_device_list.3 + ibv_open_device.3 ibv_close_device.3 + ibv_open_xrcd.3 ibv_close_xrcd.3 + ibv_rate_to_mbps.3 mbps_to_ibv_rate.3 + ibv_rate_to_mult.3 mult_to_ibv_rate.3 + ibv_reg_mr.3 ibv_dereg_mr.3 + ibv_wr_post.3 ibv_wr_abort.3 + ibv_wr_post.3 ibv_wr_complete.3 + ibv_wr_post.3 ibv_wr_start.3 + ibv_wr_post.3 ibv_wr_atomic_cmp_swp.3 + ibv_wr_post.3 ibv_wr_atomic_fetch_add.3 + ibv_wr_post.3 ibv_wr_bind_mw.3 + ibv_wr_post.3 ibv_wr_local_inv.3 + ibv_wr_post.3 ibv_wr_rdma_read.3 + ibv_wr_post.3 ibv_wr_rdma_write.3 + ibv_wr_post.3 ibv_wr_rdma_write_imm.3 + ibv_wr_post.3 ibv_wr_send.3 + ibv_wr_post.3 ibv_wr_send_imm.3 + ibv_wr_post.3 ibv_wr_send_inv.3 + ibv_wr_post.3 ibv_wr_send_tso.3 + ibv_wr_post.3 ibv_wr_set_inline_data.3 + ibv_wr_post.3 ibv_wr_set_inline_data_list.3 + ibv_wr_post.3 ibv_wr_set_sge.3 + ibv_wr_post.3 ibv_wr_set_sge_list.3 + ibv_wr_post.3 ibv_wr_set_ud_addr.3 + ibv_wr_post.3 
ibv_wr_set_xrc_srqn.3
+	)
diff --git a/libibverbs/man/ibv_advise_mr.3.md b/libibverbs/man/ibv_advise_mr.3.md
new file mode 100644
index 0000000..5794b68
--- /dev/null
+++ b/libibverbs/man/ibv_advise_mr.3.md
@@ -0,0 +1,127 @@
+---
+date: 2018-10-19
+footer: libibverbs
+header: "Libibverbs Programmer's Manual"
+layout: page
+license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md'
+section: 3
+title: IBV_ADVISE_MR
+---
+
+# NAME
+
+ibv_advise_mr - Gives advice or directions to the kernel about an
+address range belonging to a memory region (MR).
+
+# SYNOPSIS
+
+```c
+#include <infiniband/verbs.h>
+
+int ibv_advise_mr(struct ibv_pd *pd,
+                  enum ibv_advise_mr_advice advice,
+                  uint32_t flags,
+                  struct ibv_sge *sg_list,
+                  uint32_t num_sge)
+```
+
+# DESCRIPTION
+
+**ibv_advise_mr()** gives advice or directions to the kernel about an
+address range belonging to a memory region (MR).
+Applications that are aware of future access patterns can use this verb
+to leverage that knowledge and improve system or application performance.
+
+**Conventional advice values**
+
+*IBV_ADVISE_MR_ADVICE_PREFETCH*
+:	Pre-fetch a range of an on-demand paging MR.
+	Make pages present with read-only permission before the actual IO is conducted.
+	This provides a way to reduce latency by overlapping paging-in with
+	either compute time or IO to other ranges.
+
+*IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE*
+:	Like IBV_ADVISE_MR_ADVICE_PREFETCH but with read-access
+	and write-access permission to the fetched memory.
+
+# ARGUMENTS
+
+*pd*
+:	The protection domain (PD) associated with the MR.
+
+*advice*
+:	The requested advice value (as listed above).
+
+*flags*
+:	Describes the properties of the advise operation.
+
+	**Conventional flag values**
+
+	*IBV_ADVISE_MR_FLAG_FLUSH*
+	:	Request a synchronized operation; return to the caller
+		only after the operation is completed.
+
+*sg_list*
+:	Pointer to the s/g array.
+	When using the IBV_ADVISE_MR_ADVICE_PREFETCH advice, all the lkeys of all
+	the scatter/gather elements (SGEs) must be associated with ODP MRs
+	(MRs that were registered with IBV_ACCESS_ON_DEMAND).
+
+*num_sge*
+:	Number of elements in the s/g array.
+
+# RETURN VALUE
+
+**ibv_advise_mr()** returns 0 when the call was successful, or the value
+of errno on failure (which indicates the failure reason).
+
+*EOPNOTSUPP*
+:	libibverbs or the provider driver doesn't support the ibv_advise_mr() verb
+	(ENOSYS may sometimes be returned by old versions of libibverbs).
+
+*ENOTSUP*
+:	The advise operation isn't supported.
+
+*EFAULT*
+:	In one of the following cases:
+	o The requested range is out of the MR bounds, or parts of
+	  it are not part of the process address space.
+	o One of the lkeys provided in the scatter/gather list is invalid or
+	  lacks the required write access.
+
+*EINVAL*
+:	In one of the following cases:
+	o The PD is invalid.
+	o The flags are invalid.
+
+# NOTES
+
+An application may pre-fetch any address range within an ODP MR when using the
+**IBV_ADVISE_MR_ADVICE_PREFETCH** or **IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE** advice.
+Semantically, this operation is best-effort. That means the kernel does not
+guarantee that underlying pages are updated in the HCA or that the pre-fetched
+pages remain resident.
+
+When using **IBV_ADVISE_MR_ADVICE_PREFETCH** or **IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE**
+advice, the operation is carried out in the following stages:
+ o Page in the user pages to memory (pages aren't pinned).
+ o Get the DMA mapping of these user pages.
+ o Post the underlying page translations to the HCA.
+
+If **IBV_ADVISE_MR_FLAG_FLUSH** is specified then the underlying pages are
+guaranteed to be updated in the HCA before returning success.
+Otherwise the driver can choose to postpone posting the new translations
+to the HCA.
+When performing a local RDMA access operation it is recommended to use the
+**IBV_ADVISE_MR_FLAG_FLUSH** flag with one of the pre-fetch advices to
+increase the probability that the page translations are valid in the HCA
+and avoid future page faults.
+
+# SEE ALSO
+
+**ibv_reg_mr**(3),
+**ibv_rereg_mr**(3),
+**ibv_dereg_mr**(3)
+
+# AUTHOR
+
+Aviad Yehezkel <aviadye@mellanox.com>
+
diff --git a/libibverbs/man/ibv_alloc_dm.3 b/libibverbs/man/ibv_alloc_dm.3
new file mode 100644
index 0000000..28fa488
--- /dev/null
+++ b/libibverbs/man/ibv_alloc_dm.3
@@ -0,0 +1,116 @@
+.\" -*- nroff -*-
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.\"
+.TH IBV_ALLOC_DM 3 2017-07-25 libibverbs "Libibverbs Programmer's Manual"
+.SH "NAME"
+ibv_alloc_dm, ibv_free_dm, ibv_memcpy_to/from_dm \- allocate or free a device memory buffer (DM) and perform memory copy to or
+from it
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/verbs.h>
+.sp
+.BI "struct ibv_dm *ibv_alloc_dm(struct ibv_context " "*context",
+.BI "                            struct ibv_alloc_dm_attr " "*attr");
+.sp
+.BI "int ibv_free_dm(struct ibv_dm " "*dm");
+.fi
+.SH "DESCRIPTION"
+.B ibv_alloc_dm()
+allocates a device memory buffer for the RDMA device context
+.I context\fR.
+The argument
+.I attr
+is a pointer to an ibv_alloc_dm_attr struct, as defined in <infiniband/verbs.h>.
+.PP
+.B ibv_free_dm()
+frees the device memory buffer
+.I dm\fR.
+.PP
+.nf
+struct ibv_alloc_dm_attr {
+.in +8
+size_t length; /* Length of desired device memory buffer */
+uint32_t log_align_req; /* Log base 2 of address alignment requirement */
+uint32_t comp_mask; /* Compatibility mask that defines which of the following variables are valid */
+.in -8
+};
+
+Address alignment may be required in cases where RDMA atomic operations will be performed using the device memory.
+.PP
+In such cases, the user may specify the device memory start address alignment using the log_align_req parameter
+in the allocation attributes struct.
+.PP
+.SH "Accessing allocated device memory"
+.nf
+In order to perform write or read access to an allocated device memory buffer, a user may use the ibv_memcpy_to_dm
+and ibv_memcpy_from_dm calls respectively.
+.sp
+.BI "int ibv_memcpy_to_dm(struct ibv_dm " "*dm" ", uint64_t " "dm_offset",
+.BI "                     void " "*host_addr" ", size_t " "length" ");
+.sp
+.BI "int ibv_memcpy_from_dm(void " "*host_addr" ", struct ibv_dm " "*dm" ",
+.BI "                       uint64_t " "dm_offset" ", size_t " "length" ");
+.sp
+.I dm_offset
+is the byte offset from the beginning of the allocated device memory buffer to access.
+.sp
+.I host_addr
+is the host memory buffer address to access.
+.sp
+.I length
+is the copy length in bytes.
+.sp
+.fi
+.SH "Device memory registration"
+.nf
+A user may register the allocated device memory as a memory region and use the lkey/rkey inside an sge when posting a receive
+or send work request. This type of MR is defined as zero based and therefore any reference to it (specifically in an sge)
+is done with a byte offset from the beginning of the region.
+.sp
+This type of registration is done using ibv_reg_dm_mr.
+.sp
+.BI "struct ibv_mr* ibv_reg_dm_mr(struct ibv_pd " "*pd" ", struct ibv_dm " "*dm" ", uint64_t " "dm_offset",
+.BI "                             size_t " "length" ", uint32_t " "access");
+.sp
+.I pd
+is the protection domain associated with this registration.
+.sp
+.I dm
+is the device memory buffer associated with this registration.
+.sp
+.I dm_offset
+is the byte offset from the beginning of the allocated device memory buffer to register.
+.sp
+.I length
+is the memory length to register.
+.sp
+.I access
+is the set of MR access flags (use enum ibv_access_flags). For this type of registration, the user must set the IBV_ACCESS_ZERO_BASED
+flag.
+
+.SH "RETURN VALUE"
+.B ibv_alloc_dm()
+returns a pointer to an ibv_dm struct, or NULL if the request fails.
+.PP
+.B ibv_free_dm()
+returns 0 on success, or the value of errno on failure (which indicates the failure reason).
+.PP
+.B ibv_reg_dm_mr()
+returns a pointer to an ibv_mr struct on success, or NULL if the request fails.
+.PP
+.B ibv_memcpy_to_dm()/ibv_memcpy_from_dm()
+return 0 on success, or the failure reason value on failure.
+.SH "NOTES"
+.B ibv_alloc_dm()
+may fail if the device has no free device memory left; the maximum amount that can be allocated is given by the
+.I max_dm_size\fR attribute in the
+.I ibv_device_attr_ex\fR struct.
+.B ibv_free_dm()
+may fail if any other resource (such as an MR) is still associated with the DM being
+freed.
+.SH "SEE ALSO"
+.BR ibv_query_device_ex (3),
+.SH "AUTHORS"
+.TP
+Ariel Levkovich <lariel@mellanox.com>
diff --git a/libibverbs/man/ibv_alloc_mw.3 b/libibverbs/man/ibv_alloc_mw.3
new file mode 100644
index 0000000..86acaca
--- /dev/null
+++ b/libibverbs/man/ibv_alloc_mw.3
@@ -0,0 +1,55 @@
+.\" -*- nroff -*-
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.\"
+.TH IBV_ALLOC_MW 3 2016-02-02 libibverbs "Libibverbs Programmer's Manual"
+.SH "NAME"
+ibv_alloc_mw, ibv_dealloc_mw \- allocate or deallocate a memory window (MW)
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/verbs.h>
+.sp
+.BI "struct ibv_mw *ibv_alloc_mw(struct ibv_pd " "*pd" ,
+.BI "                            enum ibv_mw_type " "type");
+.sp
+.BI "int ibv_dealloc_mw(struct ibv_mw " "*mw" );
+.fi
+.SH "DESCRIPTION"
+.B ibv_alloc_mw()
+allocates a memory window (MW) associated with the protection domain
+.I pd\fR.
+The MW's type (1 or 2A/2B) is
+.I type\fR.
+.PP
+The MW is created unbound. For it to be useful, the MW must be bound, through either ibv_bind_mw (type 1) or a special WR (type 2).
+Once bound, the memory window allows RDMA (remote) access to a subset of the MR to which it was bound,
+until it is invalidated by: the ibv_bind_mw verb with zero length for type 1,
+an IBV_WR_LOCAL_INV/IBV_WR_SEND_WITH_INV WR opcode for type 2, or deallocation.
+.PP
+.B ibv_dealloc_mw()
+unbinds the MW if it was previously bound, and deallocates the MW
+.I mw\fR.
+.SH "RETURN VALUE"
+.B ibv_alloc_mw()
+returns a pointer to the allocated MW, or NULL if the request fails.
+The remote key (\fBR_Key\fR)
+field
+.B rkey
+is used by remote processes to perform Atomic and RDMA operations. This key will be changed during bind operations. The remote process places this
+.B rkey
+as the rkey field of struct ibv_send_wr passed to the ibv_post_send function.
+.PP
+.B ibv_dealloc_mw()
+returns 0 on success, or the value of errno on failure (which indicates the failure reason).
+.SH "NOTES"
+.B ibv_dereg_mr()
+fails if any memory window is still bound to this MR.
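+.SH "EXAMPLE"
+A minimal sketch of allocating a type 1 MW and posting a bind to an existing
+MR (not taken from a real application; \fIpd\fR, \fIqp\fR, \fImr\fR, \fIbuf\fR,
+\fIbuf_len\fR and the publish_rkey() helper are placeholders, and error
+handling is omitted):
+.PP
+.nf
+struct ibv_mw *mw = ibv_alloc_mw(pd, IBV_MW_TYPE_1);
+
+struct ibv_mw_bind mw_bind = {
+	.wr_id = 1,
+	.send_flags = IBV_SEND_SIGNALED,
+	.bind_info = {
+		.mr = mr,
+		.addr = (uint64_t)(uintptr_t)buf,
+		.length = buf_len,
+		/* Remote write access requires local write access on the MR */
+		.mw_access_flags = IBV_ACCESS_REMOTE_WRITE,
+	},
+};
+
+if (ibv_bind_mw(qp, mw, &mw_bind) == 0)
+	/* mw->rkey now holds the new R_Key; hand it to the remote side
+	 * only after the bind's CQE reports success */
+	publish_rkey(mw->rkey); /* placeholder */
+.fi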
+.SH "SEE ALSO" +.BR ibv_alloc_pd (3), +.BR ibv_post_send (3), +.BR ibv_bind_mw (3), +.BR ibv_reg_mr (3), +.SH "AUTHORS" +.TP +Majd Dibbiny <majd@mellanox.com> +.TP +Yishai Hadas <yishaih@mellanox.com> diff --git a/libibverbs/man/ibv_alloc_null_mr.3.md b/libibverbs/man/ibv_alloc_null_mr.3.md new file mode 100644 index 0000000..e0e341f --- /dev/null +++ b/libibverbs/man/ibv_alloc_null_mr.3.md @@ -0,0 +1,59 @@ +--- +date: 2018-6-1 +footer: libibverbs +header: "Libibverbs Programmer's Manual" +layout: page +license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md' +section: 3 +title: ibv_alloc_null_mr +--- + +# NAME + +ibv_alloc_null_mr - allocate a null memory region (MR) + +# SYNOPSIS + +```c +#include <infiniband/verbs.h> + +struct ibv_mr *ibv_alloc_null_mr(struct ibv_pd *pd); +``` + + +# DESCRIPTION + +**ibv_alloc_null_mr()** allocates a null memory region (MR) that is associated with the protection +domain *pd*. + +A null MR discards all data written to it, and always returns 0 on +read. It has the maximum length and only the lkey is valid, the MR is not +exposed as an rkey. + +A device should implement the null MR in a way that bypasses PCI +transfers, internally discarding or sourcing 0 data. This provides a +way to avoid PCI bus transfers by using a scatter/gather list in +commands if applications do not intend to access the data, or need +data to be 0 filled. + +Specifically upon **ibv_post_send()** the device skips PCI read cycles and +upon **ibv_post_recv()** the device skips PCI write cycles which finally +improves performance. + +**ibv_dereg_mr()** deregisters the MR. +The use of ibv_rereg_mr() or ibv_bind_mw() +with this MR is invalid. + +# RETURN VALUE + +**ibv_alloc_null_mr()** returns a pointer to the allocated MR, or NULL if the request fails. + +# SEE ALSO + +**ibv_reg_mr**(3), +**ibv_dereg_mr**(3), + +# AUTHOR + +Yonatan Cohen <yonatanc@mellanox.com> + diff --git a/libibverbs/man/ibv_alloc_parent_domain.3 b/libibverbs/man/ibv_alloc_parent_domain.3 new file mode 100644 index 0000000..6e2f356 --- /dev/null +++ b/libibverbs/man/ibv_alloc_parent_domain.3 @@ -0,0 +1,128 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH IBV_ALLOC_PARENT_DOMAIN 3 2017-11-06 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_alloc_parent_domain(), ibv_dealloc_pd() \- allocate and deallocate the parent domain object +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs.h> +.sp +.BI "struct ibv_pd *ibv_alloc_parent_domain(struct ibv_context "*context" ", struct ibv_parent_domain_init_attr " "*attr"); +.sp +.SH "DESCRIPTION" +.B ibv_alloc_parent_domain() +allocates a parent domain object for the RDMA device context +.I context\fR. +.sp +The parent domain object extends the normal protection domain with additional +objects, such as a thread domain. +.sp +A parent domain is completely interchangeable with the +.I +struct ibv_pd +used to create it, and can be used as an input argument to any function accepting a +.I +struct ibv_pd. +.sp +The behavior of each verb may be different if the verb is passed a parent +domain +.I +struct ibv_pd +that contains a +.I +struct ibv_td pointer\fR. +For instance the verb my choose to share resources +between objects using the same thread domain. The exact behavior is provider +dependent. 
+.sp +The +.I attr +argument specifies the following: +.PP +.nf +enum ibv_parent_domain_init_attr_mask { +.in +8 +IBV_PARENT_DOMAIN_INIT_ATTR_ALLOCATORS = 1 << 0, +IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT = 1 << 1, +.in -8 +}; + +struct ibv_parent_domain_init_attr { +.in +8 +struct ibv_pd *pd; /* referance to a protection domain, can't be NULL */ +struct ibv_td *td; /* referance to a thread domain, or NULL */ +uint32_t comp_mask; +void *(*alloc)(struct ibv_pd *pd, void *pd_context, size_t size, + size_t alignment, uint64_t resource_type); +void (*free)(struct ibv_pd *pd, void *pd_context, void *ptr, + uint64_t resource_type); +void *pd_context; +.in -8 +}; +.fi +.PP +.sp +.B ibv_dealloc_pd() +will deallocate the parent domain as its exposed as an ibv_pd +.I pd\fR. +All resources created with the parent domain +should be destroyed prior to deallocating the parent domain\fR. +.SH "ARGUMENTS" +.B pd +Reference to the protection domain that this parent domain uses. +.PP +.B td +An optional thread domain that the parent domain uses. +.PP +.B comp_mask +Bit-mask of optional fields in the ibv_parent_domain_init_attr struct. +.PP +.B alloc +Custom memory allocation function for this parent domain. Provider +memory allocations will use this function to allocate the needed memory. +The allocation function is passed the parent domain +.B pd +and the user-specified context +.B pd_context. +In addition, the callback receives the +.B size +and the +.B alignment +of the requested buffer, as well a vendor-specific +.B resource_type +, which is derived from the rdma_driver_id enum (upper 32 bits) and a vendor +specific resource code. +The function returns the pointer to the allocated buffer, or NULL to +designate an error. It may also return +.B IBV_ALLOCATOR_USE_DEFAULT +asking the callee to allocate the buffer using the default allocator. + +The callback makes sure the allocated buffer is initialized with zeros. It is +also the responsibility of the callback to make sure the memory cannot be +COWed, e.g. by using madvise(MADV_DONTFORK) or by allocating anonymous shared +memory. +.PP +.B free +Callback to free memory buffers that were allocated using a successful +alloc(). +.PP +.B pd_context +A pointer for additional user-specific data to be associated with this +parent domain. The pointer is passed back to the custom allocator functions. +.SH "RETURN VALUE" +.B ibv_alloc_parent_domain() +returns a pointer to the allocated struct +.I ibv_pd +object, or NULL if the request fails (and sets errno to indicate the failure reason). +.sp +.SH "SEE ALSO" +.BR ibv_alloc_parent_domain (3), +.BR ibv_dealloc_pd (3), +.BR ibv_alloc_pd (3), +.BR ibv_alloc_td (3) +.SH "AUTHORS" +.TP +Alex Rosenbaum <alexr@mellanox.com> +.TP +Yishai Hadas <yishaih@mellanox.com> diff --git a/libibverbs/man/ibv_alloc_pd.3 b/libibverbs/man/ibv_alloc_pd.3 new file mode 100644 index 0000000..cc475f4 --- /dev/null +++ b/libibverbs/man/ibv_alloc_pd.3 @@ -0,0 +1,41 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH IBV_ALLOC_PD 3 2006-10-31 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_alloc_pd, ibv_dealloc_pd \- allocate or deallocate a protection domain (PDs) +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs.h> +.sp +.BI "struct ibv_pd *ibv_alloc_pd(struct ibv_context " "*context" ); +.sp +.BI "int ibv_dealloc_pd(struct ibv_pd " "*pd" ); +.fi +.SH "DESCRIPTION" +.B ibv_alloc_pd() +allocates a PD for the RDMA device context +.I context\fR. 
+.PP +.B ibv_dealloc_pd() +deallocates the PD +.I pd\fR. +.SH "RETURN VALUE" +.B ibv_alloc_pd() +returns a pointer to the allocated PD, or NULL if the request fails. +.PP +.B ibv_dealloc_pd() +returns 0 on success, or the value of errno on failure (which indicates the failure reason). +.SH "NOTES" +.B ibv_dealloc_pd() +may fail if any other resource is still associated with the PD being +freed. +.SH "SEE ALSO" +.BR ibv_reg_mr (3), +.BR ibv_create_srq (3), +.BR ibv_create_qp (3), +.BR ibv_create_ah (3), +.BR ibv_create_ah_from_wc (3) +.SH "AUTHORS" +.TP +Dotan Barak <dotanba@gmail.com> diff --git a/libibverbs/man/ibv_alloc_td.3 b/libibverbs/man/ibv_alloc_td.3 new file mode 100644 index 0000000..4c92016 --- /dev/null +++ b/libibverbs/man/ibv_alloc_td.3 @@ -0,0 +1,61 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH IBV_ALLOC_TD 3 2017-11-06 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_alloc_td(), ibv_dealloc_td() \- allocate and deallocate thread domain object +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs.h> +.sp +.BI "struct ibv_td *ibv_alloc_td(struct ibv_context " "*context" , +.BI " struct ibv_td_init_attr " "*init_attr" ); +.sp +.BI "int ibv_dealloc_td(struct ibv_td " "*td"); +.fi +.SH "DESCRIPTION" +.B ibv_alloc_td() +allocates a thread domain object for the RDMA device context +.I context\fR. +.sp +The thread domain object defines how the verbs libraries and provider will use +locks and additional hardware capabilities to achieve best performance for +handling multi-thread or single-thread protection. An application assigns +verbs resources to a thread domain when it creates a verbs object. +.sp +If the +.I +ibv_td +object is specified then any objects created under this thread domain will +disable internal locking designed to protect against concurrent access to that +object from multiple user threads. By default all verbs objects are safe for +multi-threaded access, whether or not a thread domain is specified. +.sp +A +.I struct ibv_td +can be added to a parent domain via +.B ibv_alloc_parent_domain() +and then the parent domain can be used to create verbs objects. +.sp +.B ibv_dealloc_td() +will deallocate the thread domain +.I td\fR. +All resources created with the +.I td +should be destroyed prior to deallocating the +.I td\fR. +.SH "RETURN VALUE" +.B ibv_alloc_td() +returns a pointer to the allocated struct +.I ibv_td +object, or NULL if the request fails (and sets errno to indicate the failure reason). +.sp +.B ibv_dealloc_td() +returns 0 on success, or the value of errno on failure (which indicates the failure reason). +.SH "SEE ALSO" +.BR ibv_alloc_parent_domain (3), +.SH "AUTHORS" +.TP +Alex Rosenbaum <alexr@mellanox.com> +.TP +Yishai Hadas <yishaih@mellanox.com> diff --git a/libibverbs/man/ibv_asyncwatch.1 b/libibverbs/man/ibv_asyncwatch.1 new file mode 100644 index 0000000..ae10117 --- /dev/null +++ b/libibverbs/man/ibv_asyncwatch.1 @@ -0,0 +1,31 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH IBV_ASYNCWATCH 1 "August 30, 2005" "libibverbs" "USER COMMANDS" + +.SH NAME +ibv_asyncwatch \- display asynchronous events + +.SH SYNOPSIS +.B ibv_asyncwatch +[\-d device] [-h] + +.SH DESCRIPTION +.PP +Display asynchronous events forwarded to userspace for an RDMA device. 
+ +.SH OPTIONS + +.PP +.TP +\fB\-d\fR, \fB\-\-ib\-dev\fR=\fIDEVICE\fR +use IB device \fIDEVICE\fR (default first device found) +.TP +\fB\-h\fR, \fB\-\-help\fR=\fIDEVICE\fR +Print a help text and exit. + +.SH AUTHORS +.TP +Roland Dreier +.RI < rolandd@cisco.com > +.TP +Eran Ben Elisha +.RI < eranbe@mellanox.com > diff --git a/libibverbs/man/ibv_attach_counters_point_flow.3.md b/libibverbs/man/ibv_attach_counters_point_flow.3.md new file mode 100644 index 0000000..f376dc3 --- /dev/null +++ b/libibverbs/man/ibv_attach_counters_point_flow.3.md @@ -0,0 +1,134 @@ +--- +date: 2018-04-02 +footer: libibverbs +header: "Libibverbs Programmer's Manual" +layout: page +license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md' +section: 3 +title: ibv_attach_counters_point_flow +--- +# NAME + +**ibv_attach_counters_point_flow** - attach individual counter definition to +a flow object + +# SYNOPSIS + +```c +#include <infiniband/verbs.h> + +int ibv_attach_counters_point_flow(struct ibv_counters *counters, + struct ibv_counter_attach_attr *counter_attach_attr, + struct ibv_flow *flow); +``` + +# DESCRIPTION + +Attach counters point are a family of APIs to attach individual counter +description definition to a verb object at a specific index location. + +Counters object will start collecting values after it is bound to the verb object +resource. + +A static attach can be created when NULL is provided instead of the reference +to the verbs object (e.g.: in case of flow providing NULL instead of *flow*). +In this case, this counters object will only start collecting values after it is +bound to the verbs resource, for flow this is when referencing the counters handle +when creating a flow with **ibv_create_flow**(). + +Once an ibv_counters is bound statically to a verbs resource, no additional attach +is allowed till the counter object is not bound to any verb object. + +The argument counter_desc specifies which counter value should be collected. It +is defined in verbs.h as one of the enum ibv_counter_description options. + +Supported capabilities of specific counter_desc values per verbs object can be +tested by checking the return value for success or ENOTSUP errno. + +Attaching a counters handle to multiple objects of the same type will accumulate +the values into a single index. e.g.: creating several ibv_flow(s) with the same +ibv_counters handle will collect the values from all relevant flows into the +relevant index location when reading the values from **ibv_read_counters**(), +setting the index more than once with different or same counter_desc will +aggregate the values from all relevant counters into the relevant index +location. + +The runtime values of counters can be read from the hardware by calling +**ibv_read_counters**(). + +# ARGUMENTS + +*counters* +: Existing counters to attach new counter point on. + +*counter_attach_attr* +: An ibv_counter_attach_attr struct, as defined in verbs.h. + +*flow* +: Existing flow to attach a new counters point on (in static mode +it must be NULL). + +## *counter_attach_attr* Argument + +```c +struct ibv_counter_attach_attr { + enum ibv_counter_description counter_desc; + uint32_t index; + uint32_t comp_mask; +}; +``` + +## *counter_desc* Argument + +```c +enum ibv_counter_description { + IBV_COUNTER_PACKETS, + IBV_COUNTER_BYTES, +}; +``` + +*index* +: Desired location of the specific counter at the counters object. + +*comp_mask* +: Bitmask specifying what fields in the structure are valid. 
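+
+As an illustration, a sketch of attaching the packet count of an existing flow
+to index 0 of a counters handle (*counters* and *flow* are assumed to already
+exist; error handling is omitted):
+
+```c
+struct ibv_counter_attach_attr attach_attr = {
+	.counter_desc = IBV_COUNTER_PACKETS,
+	.index = 0,
+};
+
+/* Values accumulate into index 0; read them back with ibv_read_counters() */
+int ret = ibv_attach_counters_point_flow(counters, &attach_attr, flow);
+```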
+
+# RETURN VALUE
+
+**ibv_attach_counters_point_flow**() returns 0 on success, or the value of
+errno on failure (which indicates the failure reason).
+
+# ERRORS
+
+EINVAL
+:	invalid argument(s) passed
+
+ENOTSUP
+:	*counter_desc* is not supported on the requested object
+
+EBUSY
+:	the counters object is already bound to a flow; additional attach calls
+	are not allowed (valid for static attach only)
+
+ENOMEM
+:	not enough memory
+
+# NOTES
+
+Counter values in each index location are cleared upon creation when calling
+**ibv_create_counters**().
+Attaching counters points will only increase these values accordingly.
+
+# EXAMPLE
+
+An example of use of **ibv_attach_counters_point_flow**() is shown in
+**ibv_read_counters**.
+
+# SEE ALSO
+
+**ibv_create_counters**, **ibv_destroy_counters**,
+**ibv_read_counters**, **ibv_create_flow**
+
+# AUTHORS
+
+Raed Salem <raeds@mellanox.com>
+
+Alex Rosenbaum <alexr@mellanox.com>
diff --git a/libibverbs/man/ibv_attach_mcast.3.md b/libibverbs/man/ibv_attach_mcast.3.md
new file mode 100644
index 0000000..42fb738
--- /dev/null
+++ b/libibverbs/man/ibv_attach_mcast.3.md
@@ -0,0 +1,57 @@
+---
+date: 2006-10-31
+footer: libibverbs
+header: "Libibverbs Programmer's Manual"
+layout: page
+license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md'
+section: 3
+title: IBV_ATTACH_MCAST
+---
+
+# NAME
+
+ibv_attach_mcast, ibv_detach_mcast - attach and detach a queue pair (QP)
+to/from a multicast group
+
+# SYNOPSIS
+
+```c
+#include <infiniband/verbs.h>
+
+int ibv_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid);
+
+int ibv_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid);
+```
+
+# DESCRIPTION
+
+**ibv_attach_mcast()** attaches the QP *qp* to the multicast group having MGID
+*gid* and MLID *lid*.
+
+**ibv_detach_mcast()** detaches the QP *qp* from the multicast group having MGID
+*gid* and MLID *lid*.
+
+# RETURN VALUE
+
+**ibv_attach_mcast()** and **ibv_detach_mcast()** return 0 on success, or the
+value of errno on failure (which indicates the failure reason).
+
+# NOTES
+
+Only QPs of Transport Service Type **IBV_QPT_UD** may be attached to multicast
+groups.
+
+If a QP is attached to the same multicast group multiple times, the QP will
+still receive a single copy of a multicast message.
+
+In order to receive multicast messages, a join request for the multicast group
+must be sent to the subnet administrator (SA), so that the fabric's multicast
+routing is configured to deliver messages to the local port.
+
+# SEE ALSO
+
+**ibv_create_qp**(3)
+
+# AUTHOR
+
+Dotan Barak <dotanba@gmail.com>
diff --git a/libibverbs/man/ibv_bind_mw.3 b/libibverbs/man/ibv_bind_mw.3
new file mode 100644
index 0000000..af309d0
--- /dev/null
+++ b/libibverbs/man/ibv_bind_mw.3
@@ -0,0 +1,93 @@
+.\" -*- nroff -*-
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.\"
+.TH IBV_BIND_MW 3 2016-02-02 libibverbs "Libibverbs Programmer's Manual"
+.SH "NAME"
+ibv_bind_mw \- post a request to bind a type 1 memory window to a memory region
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/verbs.h>
+.sp
+.BI "int ibv_bind_mw(struct ibv_qp " "*qp" ", struct ibv_mw " "*mw" ",
+.BI "                struct ibv_mw_bind " "*mw_bind" ");
+.fi
+.SH "DESCRIPTION"
+.B ibv_bind_mw()
+posts to the queue pair
+.I qp
+a request to bind the memory window
+.I mw
+according to the details in
+.I mw_bind\fR.
+.PP
+The argument
+.I mw_bind
+is an ibv_mw_bind struct, as defined in <infiniband/verbs.h>.
+.PP +.nf +struct ibv_mw_bind { +.in +8 +uint64_t wr_id; /* User defined WR ID */ +int send_flags; /* Use ibv_send_flags */ +struct ibv_mw_bind_info bind_info; /* MW bind information */ +.in -8 +} +.fi +.PP +.nf +struct ibv_mw_bind_info { +.in +8 +struct ibv_mr *mr; /* The MR to bind the MW to */ +uint64_t addr; /* The address the MW should start at */ +uint64_t length; /* The length (in bytes) the MW should span */ +int mw_access_flags; /* Access flags to the MW. Use ibv_access_flags */ +.in -8 +}; +.fi +.PP +The QP Transport Service Type must be either UC, RC or XRC_SEND for bind operations. +.PP +The attribute send_flags describes the properties of the \s-1WR\s0. It is either 0 or the bitwise \s-1OR\s0 of one or more of the following flags: +.PP +.TP +.B IBV_SEND_FENCE \fR Set the fence indicator. +.TP +.B IBV_SEND_SIGNALED \fR Set the completion notification indicator. Relevant only if QP was created with sq_sig_all=0 +.PP +The mw_access_flags define the allowed access to the MW after the bind +completes successfully. It is either 0 or the bitwise \s-1OR\s0 of one +or more of the following flags: +.TP +.B IBV_ACCESS_REMOTE_WRITE \fR Enable Remote Write Access. Requires local write access to the MR. +.TP +.B IBV_ACCESS_REMOTE_READ\fR Enable Remote Read Access +.TP +.B IBV_ACCESS_REMOTE_ATOMIC\fR Enable Remote Atomic Operation Access (if supported). Requires local write access to the MR. +.TP +.B IBV_ACCESS_ZERO_BASED\fR If set, the address set on the 'remote_addr' field on the WR will be an offset from the MW's start address. +.SH "RETURN VALUE" +.B ibv_bind_mw() +returns 0 on success, or the value of errno on failure (which +indicates the failure reason). In case of a success, the R_key of the +memory window after the bind is returned in the mw_bind->mw->rkey field. +.SH "NOTES" +The bind does not complete when the function return - it is merely +posted to the QP. The user should keep a copy of the old R_key, and +fix the mw structure if the subsequent CQE for the bind operation +indicates a failure. The user may safely send the R_key using a send +request on the same QP, (based on QP ordering rules: a send after a bind +request on the same QP are always ordered), but must not transfer it to the +remote in any other manner before reading a successful CQE. +.PP +Note that for type 2 MW, one should directly post bind WR to the QP, +using ibv_post_send. +.SH "SEE ALSO" +.BR ibv_alloc_mw (3), +.BR ibv_post_send (3), +.BR ibv_poll_cq (3) +.BR ibv_reg_mr (3), +.SH "AUTHORS" +.TP +Majd Dibbiny <majd@mellanox.com> +.TP +Yishai Hadas <yishaih@mellanox.com> diff --git a/libibverbs/man/ibv_create_ah.3 b/libibverbs/man/ibv_create_ah.3 new file mode 100644 index 0000000..47c78f8 --- /dev/null +++ b/libibverbs/man/ibv_create_ah.3 @@ -0,0 +1,69 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH IBV_CREATE_AH 3 2006-10-31 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_create_ah, ibv_destroy_ah \- create or destroy an address handle (AH) +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs.h> +.sp +.BI "struct ibv_ah *ibv_create_ah(struct ibv_pd " "*pd" ", +.BI " struct ibv_ah_attr " "*attr" "); +.sp +.BI "int ibv_destroy_ah(struct ibv_ah " "*ah" "); +.fi +.SH "DESCRIPTION" +.B ibv_create_ah() +creates an address handle (AH) associated with the protection domain +.I pd\fR. +The argument +.I attr +is an ibv_ah_attr struct, as defined in <infiniband/verbs.h>. 
+.PP +.nf +struct ibv_ah_attr { +.in +8 +struct ibv_global_route grh; /* Global Routing Header (GRH) attributes */ +uint16_t dlid; /* Destination LID */ +uint8_t sl; /* Service Level */ +uint8_t src_path_bits; /* Source path bits */ +uint8_t static_rate; /* Maximum static rate */ +uint8_t is_global; /* GRH attributes are valid */ +uint8_t port_num; /* Physical port number */ +.in -8 +}; +.sp +.nf +struct ibv_global_route { +.in +8 +union ibv_gid dgid; /* Destination GID or MGID */ +uint32_t flow_label; /* Flow label */ +uint8_t sgid_index; /* Source GID index */ +uint8_t hop_limit; /* Hop limit */ +uint8_t traffic_class; /* Traffic class */ +.in -8 +}; +.fi +.sp +.PP +.B ibv_destroy_ah() +destroys the AH +.I ah\fR. +.SH "RETURN VALUE" +.B ibv_create_ah() +returns a pointer to the created AH, or NULL if the request fails. +.SH "NOTES" +If port flag IBV_QPF_GRH_REQUIRED is set then +.B ibv_create_ah() +must be created with definition of 'struct ibv_ah_attr { .is_global = 1; .grh = {...}; }'. +.PP +.B ibv_destroy_ah() +returns 0 on success, or the value of errno on failure (which indicates the failure reason). +.SH "SEE ALSO" +.BR ibv_alloc_pd (3), +.BR ibv_init_ah_from_wc (3), +.BR ibv_create_ah_from_wc (3) +.SH "AUTHORS" +.TP +Dotan Barak <dotanba@gmail.com> diff --git a/libibverbs/man/ibv_create_ah_from_wc.3 b/libibverbs/man/ibv_create_ah_from_wc.3 new file mode 100644 index 0000000..ec7d29f --- /dev/null +++ b/libibverbs/man/ibv_create_ah_from_wc.3 @@ -0,0 +1,64 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH IBV_CREATE_AH_FROM_WC 3 2006-10-31 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_init_ah_from_wc, ibv_create_ah_from_wc \- initialize or create an +address handle (AH) from a work completion +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs.h> +.sp +.BI "int ibv_init_ah_from_wc(struct ibv_context " "*context" ", uint8_t " "port_num" , +.BI " struct ibv_wc " "*wc" ", struct ibv_grh " "*grh" , +.BI " struct ibv_ah_attr " "*ah_attr" ); +.sp +.BI "struct ibv_ah *ibv_create_ah_from_wc(struct ibv_pd " "*pd" , +.BI " struct ibv_wc " "*wc" , +.BI " struct ibv_grh " "*grh" , +.BI " uint8_t " "port_num" ); +.fi +.SH "DESCRIPTION" +.B ibv_init_ah_from_wc() +initializes the address handle (AH) attribute structure +.I ah_attr +for the RDMA device context +.I context +using the port number +.I port_num\fR, +using attributes from the work completion +.I wc +and the Global Routing Header (GRH) structure +.I grh\fR. +.PP +.B ibv_create_ah_from_wc() +creates an AH associated with the protection domain +.I pd +using the port number +.I port_num\fR, +using attributes from the work completion +.I wc +and the Global Routing Header (GRH) structure +.I grh\fR. +.SH "RETURN VALUE" +.B ibv_init_ah_from_wc() +returns 0 on success, and \-1 on error. +.PP +.B ibv_create_ah_from_wc() +returns a pointer to the created AH, or NULL if the request fails. +.SH "NOTES" +The filled structure +.I ah_attr +returned from +.B ibv_init_ah_from_wc() +can be used to create a new AH using +.B ibv_create_ah()\fR. 
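+.SH "EXAMPLE"
+A typical use is a UD responder creating an AH toward the sender of a
+received datagram. A minimal sketch (\fIpd\fR, \fIcq\fR, \fIrecv_buf\fR and
+\fIport_num\fR are assumed to exist; error handling is omitted):
+.PP
+.nf
+struct ibv_wc wc;
+
+if (ibv_poll_cq(cq, 1, &wc) == 1 && wc.status == IBV_WC_SUCCESS) {
+	/* On UD QPs a received GRH, when present (IBV_WC_GRH is set in
+	 * wc.wc_flags), occupies the first 40 bytes of the receive buffer */
+	struct ibv_grh *grh = (struct ibv_grh *)recv_buf;
+	struct ibv_ah *ah = ibv_create_ah_from_wc(pd, &wc, grh, port_num);
+
+	/* ah can now be used to address replies back to the sender */
+}
+.fi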
+.SH "SEE ALSO" +.BR ibv_open_device (3), +.BR ibv_alloc_pd (3), +.BR ibv_create_ah (3), +.BR ibv_destroy_ah (3), +.BR ibv_poll_cq (3) +.SH "AUTHORS" +.TP +Dotan Barak <dotanba@gmail.com> diff --git a/libibverbs/man/ibv_create_comp_channel.3 b/libibverbs/man/ibv_create_comp_channel.3 new file mode 100644 index 0000000..7460978 --- /dev/null +++ b/libibverbs/man/ibv_create_comp_channel.3 @@ -0,0 +1,51 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH IBV_CREATE_COMP_CHANNEL 3 2006-10-31 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_create_comp_channel, ibv_destroy_comp_channel \- create or +destroy a completion event channel +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs.h> +.sp +.BI "struct ibv_comp_channel *ibv_create_comp_channel(struct ibv_context +.BI " " "*context" ); +.sp +.BI "int ibv_destroy_comp_channel(struct ibv_comp_channel " "*channel" ); +.fi +.SH "DESCRIPTION" +.B ibv_create_comp_channel() +creates a completion event channel for the RDMA device context +.I context\fR. +.PP +.B ibv_destroy_comp_channel() +destroys the completion event channel +.I channel\fR. +.SH "RETURN VALUE" +.B ibv_create_comp_channel() +returns a pointer to the created completion event channel, or NULL if the request fails. +.PP +.B ibv_destroy_comp_channel() +returns 0 on success, or the value of errno on failure (which indicates the failure reason). +.SH "NOTES" +A "completion channel" is an abstraction introduced by libibverbs that +does not exist in the InfiniBand Architecture verbs specification or +RDMA Protocol Verbs Specification. A completion channel is +essentially file descriptor that is used to deliver completion +notifications to a userspace process. When a completion event is +generated for a completion queue (CQ), the event is delivered via the +completion channel attached to that CQ. This may be useful to steer +completion events to different threads by using multiple completion +channels. +.PP +.B ibv_destroy_comp_channel() +fails if any CQs are still associated with the completion event +channel being destroyed. +.SH "SEE ALSO" +.BR ibv_open_device (3), +.BR ibv_create_cq (3), +.BR ibv_get_cq_event (3) +.SH "AUTHORS" +.TP +Dotan Barak <dotanba@gmail.com> diff --git a/libibverbs/man/ibv_create_counters.3.md b/libibverbs/man/ibv_create_counters.3.md new file mode 100644 index 0000000..fd830f8 --- /dev/null +++ b/libibverbs/man/ibv_create_counters.3.md @@ -0,0 +1,98 @@ +--- +date: 2018-04-02 +footer: libibverbs +header: "Libibverbs Programmer's Manual" +layout: page +license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md' +section: 3 +title: ibv_create_counters +tagline: Verbs +--- + +# NAME + +**ibv_create_counters**, **ibv_destroy_counters** - Create or destroy a counters handle + +# SYNOPSIS + +```c +#include <infiniband/verbs.h> + +struct ibv_counters * +ibv_create_counters(struct ibv_context *context, + struct ibv_counters_init_attr *init_attr); + +int ibv_destroy_counters(struct ibv_counters *counters); +``` + +# DESCRIPTION + +**ibv_create_counters**() creates a new counters handle for the RDMA device +context. + +An ibv_counters handle can be attached to a verbs resource (e.g.: QP, WQ, Flow) +statically when these are created. + +For example attach an ibv_counters statically to a Flow (struct ibv_flow) during +creation of a new Flow by calling **ibv_create_flow()**. + +Counters are cleared upon creation and values will be monotonically increasing. 
+
+**ibv_destroy_counters**() releases the counters handle; the user should
+detach the counters object before destroying it.
+
+# ARGUMENTS
+
+*context*
+:	RDMA device context to create the counters on.
+
+*init_attr*
+:	An ibv_counters_init_attr struct, as defined in verbs.h.
+
+## *init_attr* Argument
+
+```c
+struct ibv_counters_init_attr {
+	int comp_mask;
+};
+```
+
+*comp_mask*
+:	Bitmask specifying what fields in the structure are valid.
+
+# RETURN VALUE
+
+**ibv_create_counters**() returns a pointer to the allocated ibv_counters
+object, or NULL if the request fails (and sets errno to indicate the failure
+reason).
+
+**ibv_destroy_counters**() returns 0 on success, or the value of errno on
+failure (which indicates the failure reason).
+
+# ERRORS
+
+EOPNOTSUPP
+:	**ibv_create_counters**() is not currently supported on this device
+	(ENOSYS may sometimes be returned by old versions of libibverbs).
+
+ENOMEM
+:	**ibv_create_counters**() could not create the ibv_counters object for
+	lack of memory.
+
+EINVAL
+:	An invalid parameter was supplied to **ibv_destroy_counters**().
+
+# EXAMPLE
+
+An example of the use of ibv_counters is shown in **ibv_read_counters**.
+
+# SEE ALSO
+
+**ibv_attach_counters_point_flow**, **ibv_read_counters**,
+**ibv_create_flow**
+
+# AUTHORS
+
+Raed Salem <raeds@mellanox.com>
+
+Alex Rosenbaum <alexr@mellanox.com>
+
diff --git a/libibverbs/man/ibv_create_cq.3 b/libibverbs/man/ibv_create_cq.3
new file mode 100644
index 0000000..98ea4d2
--- /dev/null
+++ b/libibverbs/man/ibv_create_cq.3
@@ -0,0 +1,59 @@
+.\" -*- nroff -*-
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.\"
+.TH IBV_CREATE_CQ 3 2006-10-31 libibverbs "Libibverbs Programmer's Manual"
+.SH "NAME"
+ibv_create_cq, ibv_destroy_cq \- create or destroy a completion queue (CQ)
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/verbs.h>
+.sp
+.BI "struct ibv_cq *ibv_create_cq(struct ibv_context " "*context" ", int " "cqe" ,
+.BI "                             void " "*cq_context" ,
+.BI "                             struct ibv_comp_channel " "*channel" ,
+.BI "                             int " "comp_vector" );
+.sp
+.BI "int ibv_destroy_cq(struct ibv_cq " "*cq" );
+.fi
+.SH "DESCRIPTION"
+.B ibv_create_cq()
+creates a completion queue (CQ) with at least
+.I cqe
+entries for the RDMA device context
+.I context\fR.
+The pointer
+.I cq_context
+will be used to set the user context pointer of the CQ structure. The argument
+.I channel
+is optional; if not NULL, the completion channel
+.I channel
+will be used to return completion events. The CQ will use the
+completion vector
+.I comp_vector
+for signaling completion events; it must be at least zero and less than
+.I context\fR->num_comp_vectors.
+.PP
+.B ibv_destroy_cq()
+destroys the CQ
+.I cq\fR.
+.SH "RETURN VALUE"
+.B ibv_create_cq()
+returns a pointer to the CQ, or NULL if the request fails.
+.PP
+.B ibv_destroy_cq()
+returns 0 on success, or the value of errno on failure (which indicates the failure reason).
+.SH "NOTES"
+.B ibv_create_cq()
+may create a CQ with size greater than or equal to the requested
+size. Check the cqe attribute in the returned CQ for the actual size.
+.PP
+.B ibv_destroy_cq()
+fails if any queue pair is still associated with this CQ.
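+.SH "EXAMPLE"
+A minimal sketch (not part of the original page; ctx is an assumed open
+device context, error handling abbreviated) creating and destroying a CQ
+that is polled in software, without a completion channel:
+.PP
+.nf
+struct ibv_cq *cq;
+
+cq = ibv_create_cq(ctx, 256 /* min. cqe */, NULL /* cq_context */,
+                   NULL /* no channel */, 0 /* comp_vector */);
+if (!cq)
+        /* handle error */;
+
+/* ... post work requests, then ibv_poll_cq(cq, ...) ... */
+
+if (ibv_destroy_cq(cq))
+        /* handle error */;
+.fi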
+.SH "SEE ALSO" +.BR ibv_resize_cq (3), +.BR ibv_req_notify_cq (3), +.BR ibv_ack_cq_events (3), +.BR ibv_create_qp (3) +.SH "AUTHORS" +.TP +Dotan Barak <dotanba@gmail.com> diff --git a/libibverbs/man/ibv_create_cq_ex.3 b/libibverbs/man/ibv_create_cq_ex.3 new file mode 100644 index 0000000..0f05693 --- /dev/null +++ b/libibverbs/man/ibv_create_cq_ex.3 @@ -0,0 +1,184 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH IBV_CREATE_CQ_EX 3 2016-05-08 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_create_cq_ex \- create a completion queue (CQ) +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs.h> +.sp +.BI "struct ibv_cq_ex *ibv_create_cq_ex(struct ibv_context " "*context" ", +.BI " struct ibv_cq_init_attr_ex " "*cq_attr" ); +.fi +.SH "DESCRIPTION" +.B ibv_create_cq_ex() +creates a completion queue (CQ) for RDMA device context +.I context\fR. +The argument +.I cq_attr +is a pointer to struct ibv_cq_init_attr_ex as defined in <infiniband/verbs.h>. +.PP +.nf +struct ibv_cq_init_attr_ex { +.in +8 +int cqe; /* Minimum number of entries required for CQ */ +void *cq_context; /* Consumer-supplied context returned for completion events */ +struct ibv_comp_channel *channel; /* Completion channel where completion events will be queued. May be NULL if completion events will not be used. */ +int comp_vector; /* Completion vector used to signal completion events. Must be >= 0 and < context->num_comp_vectors. */ +uint64_t wc_flags; /* The wc_flags that should be returned in ibv_poll_cq_ex. Or'ed bit of enum ibv_wc_flags_ex. */ +uint32_t comp_mask; /* compatibility mask (extended verb). */ +uint32_t flags /* One or more flags from enum ibv_create_cq_attr_flags */ +struct ibv_pd *parent_domain; /* Parent domain to be used by this CQ */ +.in -8 +}; + +enum ibv_wc_flags_ex { + IBV_WC_EX_WITH_BYTE_LEN = 1 << 0, /* Require byte len in WC */ + IBV_WC_EX_WITH_IMM = 1 << 1, /* Require immediate in WC */ + IBV_WC_EX_WITH_QP_NUM = 1 << 2, /* Require QP number in WC */ + IBV_WC_EX_WITH_SRC_QP = 1 << 3, /* Require source QP in WC */ + IBV_WC_EX_WITH_SLID = 1 << 4, /* Require slid in WC */ + IBV_WC_EX_WITH_SL = 1 << 5, /* Require sl in WC */ + IBV_WC_EX_WITH_DLID_PATH_BITS = 1 << 6, /* Require dlid path bits in WC */ + IBV_WC_EX_WITH_COMPLETION_TIMESTAMP = 1 << 7, /* Require completion device timestamp in WC /* + IBV_WC_EX_WITH_CVLAN = 1 << 8, /* Require VLAN info in WC */ + IBV_WC_EX_WITH_FLOW_TAG = 1 << 9, /* Require flow tag in WC */ + IBV_WC_EX_WITH_COMPLETION_TIMESTAMP_WALLCLOCK = 1 << 11, /* Require completion wallclock timestamp in WC */ +}; + +enum ibv_cq_init_attr_mask { + IBV_CQ_INIT_ATTR_MASK_FLAGS = 1 << 0, + IBV_CQ_INIT_ATTR_MASK_PD = 1 << 1, +}; + +enum ibv_create_cq_attr_flags { + IBV_CREATE_CQ_ATTR_SINGLE_THREADED = 1 << 0, /* This CQ is used from a single threaded, thus no locking is required */ + IBV_CREATE_CQ_ATTR_IGNORE_OVERRUN = 1 << 1, /* This CQ will not pass to error state if overrun, CQE always will be written to next entry. + * An application must be designed to avoid ever overflowing the CQ, otherwise CQEs might be lost. + */ +}; + +.SH "Polling an extended CQ" +In order to poll an extended CQ efficiently, a user could use the following functions. + +.TP +.B Completion iterator functions + +.BI "int ibv_start_poll(struct ibv_cq_ex " "*cq" ", struct ibv_poll_cq_attr " "*attr") +.br +Start polling a batch of work completions. +.I attr +is given in order to make this function +easily extensible in the future. 
+This function either returns 0 on success or an error code
+otherwise. When no completions are available on the CQ, ENOENT is returned, but the CQ remains
+in a valid state. On success, the completion's attributes can be queried using the query
+functions described below. If an error code is given, end_poll shouldn't be called.
+
+.BI "int ibv_next_poll(struct ibv_cq_ex " "*cq")
+.br
+This function is called in order to get the next work completion. It has to be called after
+.I start_poll
+and before
+.I end_poll
+are called. This function either returns 0 on success or an error code
+otherwise. When no completions are available on the CQ, ENOENT is returned, but the CQ remains
+in a valid state. On success, the completion's attributes can be queried using the query
+functions described below. If an error code is given, end_poll should still be called,
+indicating this is the end of the polled batch.
+
+.BI "void ibv_end_poll(struct ibv_cq_ex " "*cq")
+.br
+This function indicates the end of a polling batch of work completions. After calling this function, the user should start a new batch
+by calling
+.I start_poll.
+
+.TP
+.B Polling fields in the completion
+The members and functions below are used to query the current completion. The current completion is the completion which the iterator points to (start_poll and next_poll advance this iterator). Only fields that the user requested via wc_flags in ibv_create_cq_ex can be queried. In addition, some fields are only valid in certain opcodes and status codes.
+
+.BI "uint64_t wr_id - Can be accessed directly from struct ibv_cq_ex".
+
+.BI "enum ibv_wc_status status - Can be accessed directly from struct ibv_cq_ex".
+
+.BI "enum ibv_wc_opcode ibv_wc_read_opcode(struct ibv_cq_ex " "*cq"); \c
+ Get the opcode from the current completion.
+
+.BI "uint32_t ibv_wc_read_vendor_err(struct ibv_cq_ex " "*cq"); \c
+ Get the vendor error from the current completion.
+
+.BI "uint32_t ibv_wc_read_byte_len(struct ibv_cq_ex " "*cq"); \c
+ Get the payload length from the current completion.
+
+.BI "__be32 ibv_wc_read_imm_data(struct ibv_cq_ex " "*cq"); \c
+ Get the immediate data field from the current completion.
+
+.BI "uint32_t ibv_wc_read_invalidated_rkey(struct ibv_cq_ex " "*cq"); \c
+ Get the rkey invalidated by the SEND_INVAL from the current completion.
+
+.BI "uint32_t ibv_wc_read_qp_num(struct ibv_cq_ex " "*cq"); \c
+ Get the QP number field from the current completion.
+
+.BI "uint32_t ibv_wc_read_src_qp(struct ibv_cq_ex " "*cq"); \c
+ Get the source QP number field from the current completion.
+
+.BI "int ibv_wc_read_wc_flags(struct ibv_cq_ex " "*cq"); \c
+ Get the wc_flags field from the current completion.
+
+.BI "uint16_t ibv_wc_read_pkey_index(struct ibv_cq_ex " "*cq"); \c
+ Get the pkey index field from the current completion.
+
+.BI "uint32_t ibv_wc_read_slid(struct ibv_cq_ex " "*cq"); \c
+ Get the slid field from the current completion.
+
+.BI "uint8_t ibv_wc_read_sl(struct ibv_cq_ex " "*cq"); \c
+ Get the sl field from the current completion.
+
+.BI "uint8_t ibv_wc_read_dlid_path_bits(struct ibv_cq_ex " "*cq"); \c
+ Get the dlid_path_bits field from the current completion.
+
+.BI "uint64_t ibv_wc_read_completion_ts(struct ibv_cq_ex " "*cq"); \c
+ Get the completion timestamp from the current completion in HCA clock units.
+
+.BI "uint64_t ibv_wc_read_completion_wallclock_ns(struct ibv_cq_ex " *cq ");
+Get the completion timestamp from the current completion and convert it
+from HCA clock units to wall clock nanoseconds.
+ +.BI "uint16_t ibv_wc_read_cvlan(struct ibv_cq_ex " "*cq"); \c + Get the CVLAN field from the current completion. + +.BI "uint32_t ibv_wc_read_flow_tag(struct ibv_cq_ex " "*cq"); \c + Get flow tag from the current completion. + +.BI "void ibv_wc_read_tm_info(struct ibv_cq_ex " *cq "," +.BI "struct ibv_wc_tm_info " *tm_info "); \c + Get tag matching info from the current completion. +.nf +struct ibv_wc_tm_info { +.in +8 +uint64_t tag; /* tag from TMH */ +uint32_t priv; /* opaque user data from TMH */ +.in -8 +}; + +.SH "RETURN VALUE" +.B ibv_create_cq_ex() +returns a pointer to the CQ, or NULL if the request fails. +.SH "NOTES" +.B ibv_create_cq_ex() +may create a CQ with size greater than or equal to the requested +size. Check the cqe attribute in the returned CQ for the actual size. +.PP +CQ should be destroyed with ibv_destroy_cq. +.PP +.SH "SEE ALSO" +.BR ibv_create_cq (3), +.BR ibv_destroy_cq (3), +.BR ibv_resize_cq (3), +.BR ibv_req_notify_cq (3), +.BR ibv_ack_cq_events (3), +.BR ibv_create_qp (3), +.BR ibv_alloc_parent_domain (3) +.SH "AUTHORS" +.TP +Matan Barak <matanb@mellanox.com> diff --git a/libibverbs/man/ibv_create_flow.3 b/libibverbs/man/ibv_create_flow.3 new file mode 100644 index 0000000..7cbb6e3 --- /dev/null +++ b/libibverbs/man/ibv_create_flow.3 @@ -0,0 +1,249 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH IBV_CREATE_FLOW 3 2016-03-15 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_create_flow, ibv_destroy_flow \- create or destroy flow steering rules +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs.h> +.sp +.BI "struct ibv_flow *ibv_create_flow(struct ibv_qp " "*qp" , +.BI " struct ibv_flow_attr " "*flow_attr"); +.BI "int ibv_destroy_flow(struct ibv_flow " "*flow_id"); +.sp +.fi +.SH "DESCRIPTION" +.SS ibv_create_flow() +allows a user application QP +.I qp +to be attached into a specified flow +.I flow +which is defined in +.I <infiniband/verbs.h> +.PP +.nf +struct ibv_flow_attr { +.in +8 +uint32_t comp_mask; /* Future extendibility */ +enum ibv_flow_attr_type type; /* Rule type - see below */ +uint16_t size; /* Size of command */ +uint16_t priority; /* Rule priority - see below */ +uint8_t num_of_specs; /* Number of ibv_flow_spec_xxx */ +uint8_t port; /* The uplink port number */ +uint32_t flags; /* Extra flags for rule - see below */ +/* Following are the optional layers according to user request + * struct ibv_flow_spec_xxx + * struct ibv_flow_spec_yyy + */ +.in -8 +}; +.sp +.nf +enum ibv_flow_attr_type { +.in +8 +IBV_FLOW_ATTR_NORMAL = 0x0, /* Steering according to rule specifications */ +IBV_FLOW_ATTR_ALL_DEFAULT = 0x1, /* Default unicast and multicast rule - receive all Eth traffic which isn't steered to any QP */ +IBV_FLOW_ATTR_MC_DEFAULT = 0x2, /* Default multicast rule - receive all Eth multicast traffic which isn't steered to any QP */ +IBV_FLOW_ATTR_SNIFFER = 0x3, /* Sniffer rule - receive all port traffic */ +.in -8 +}; +.sp +.nf +enum ibv_flow_flags { +.in +8 +IBV_FLOW_ATTR_FLAGS_DONT_TRAP = 1 << 1, /* Rule doesn't trap received packets, allowing them to match lower prioritized rules */ +IBV_FLOW_ATTR_FLAGS_EGRESS = 1 << 2, /* Match sent packets against EGRESS rules and carry associated actions if required */ +.in -8 +}; +.fi +.nf +.br + +enum ibv_flow_spec_type { +.in +8 +IBV_FLOW_SPEC_ETH = 0x20, /* Flow specification of L2 header */ +IBV_FLOW_SPEC_IPV4 = 0x30, /* Flow specification of IPv4 header */ +IBV_FLOW_SPEC_IPV6 = 0x31, /* Flow specification of IPv6 header */ +IBV_FLOW_SPEC_IPV4_EXT = 
+IBV_FLOW_SPEC_ESP = 0x34, /* Flow specification of ESP (IPSec) header */
+IBV_FLOW_SPEC_TCP = 0x40, /* Flow specification of TCP header */
+IBV_FLOW_SPEC_UDP = 0x41, /* Flow specification of UDP header */
+IBV_FLOW_SPEC_VXLAN_TUNNEL = 0x50, /* Flow specification of VXLAN header */
+IBV_FLOW_SPEC_GRE = 0x51, /* Flow specification of GRE header */
+IBV_FLOW_SPEC_MPLS = 0x60, /* Flow specification of MPLS header */
+IBV_FLOW_SPEC_INNER = 0x100, /* Flag making L2/L3/L4 specifications apply to the inner header */
+IBV_FLOW_SPEC_ACTION_TAG = 0x1000, /* Action tagging matched packet */
+IBV_FLOW_SPEC_ACTION_DROP = 0x1001, /* Action dropping matched packet */
+IBV_FLOW_SPEC_ACTION_HANDLE = 0x1002, /* Carry out an action created by ibv_create_flow_action_xxxx verb */
+IBV_FLOW_SPEC_ACTION_COUNT = 0x1003, /* Action count matched packet with an ibv_counters handle */
+.in -8
+};
+.br
+
+Flow specification general structure:
+.BR
+struct ibv_flow_spec_xxx {
+.in +8
+enum ibv_flow_spec_type type;
+uint16_t size; /* Flow specification size = sizeof(struct ibv_flow_spec_xxx) */
+struct ibv_flow_xxx_filter val;
+struct ibv_flow_xxx_filter mask; /* Defines which bits from the filter value are applicable when looking for a match in the incoming packet */
+.in -8
+};
+.PP
+Each spec struct holds the relevant network layer parameters for matching. To enforce the match, the user sets a mask for each parameter.
+.br
+Packets coming from the wire are matched against the flow specification. If a match is found, the associated flow actions are executed on the packet.
+.br
+In ingress flows, the QP parameter is treated as another action of scattering the packet to the respective QP.
+.br
+If a bit is set in the mask, the corresponding bit in the value must be matched.
+.br
+Note that most vendors support either full mask (all "1"s) or zero mask (all "0"s).
+.br
+.B Network parameters in the relevant network structs should be given in network order (big endian).
+
+.SS Flow domains and priority
+Flow steering defines the concept of domain and priority. Each domain represents an application that can attach a flow.
+Domains are prioritized. A higher priority domain will always supersede a lower priority domain when their flow specifications overlap.
+.br
+.B IB verbs have the highest priority domain.
+.br
+In addition to the domain, there is a priority within each of the domains.
+A lower priority numeric value (higher priority) takes precedence over matching rules with a higher numeric priority value (lower priority).
+It is important to note that the priority value of a flow spec is used not only to establish the precedence of conflicting flow matches
+but also as a way to abstract the order in which flow specs are tested for matches. Flows with higher priorities will be tested before flows with lower priorities.
+
+.SS Rules definition ordering
+An application can provide the ibv_flow_spec_xxx rules in an unordered scheme. In this case, each spec should be well
+defined and match a specific network header layer.
+In some cases, when certain flow spec types are present in the spec list, it is required to provide the list in an
+ordered manner so that the position of that flow spec type in the protocol stack is strictly defined.
+When a spec type that requires ordering resides in the inner network protocol stack (in tunnel
+protocols), the ordering should be applied to the inner network specs, combined with the inner spec
+indication using the IBV_FLOW_SPEC_INNER flag.
+For example: an MPLS spec which attempts to match an MPLS tag in the inner network should have the
+IBV_FLOW_SPEC_INNER flag set, and so should the rest of the inner network specs. On top of that, all the inner network specs should be provided in
+an ordered manner.
+This is essential to represent many of the encapsulation tunnel protocols.
+.br
+
+The flow spec types which require this sort of ordering are:
+.br
+.B 1. IBV_FLOW_SPEC_MPLS -
+.br
+Since the MPLS header can appear at several locations in the protocol stack and can also be
+encapsulated on top of different layers, it is required to place this spec according to its exact location in the
+protocol stack.
+.br
+.SS ibv_destroy_flow()
+destroys the flow
+.I flow_id\fR.
+.SH "RETURN VALUE"
+.B ibv_create_flow()
+returns a pointer to the flow, or NULL if the request fails. In case of an error, errno is updated.
+.PP
+.B ibv_destroy_flow()
+returns 0 on success, or the value of errno on failure (which indicates the failure reason).
+.SH "ERRORS"
+.SS EINVAL
+.B ibv_create_flow()
+flow specification, QP or priority are invalid
+.PP
+.B ibv_destroy_flow()
+flow_id is invalid
+.SS ENOMEM
+Couldn't create/destroy the flow, not enough memory
+.SS ENXIO
+Device managed flow steering isn't currently supported
+.SS EPERM
+No permission to add the flow steering rule
+.SH "NOTES"
+1. These verbs are available only for devices supporting
+.br
+   IBV_DEVICE_MANAGED_FLOW_STEERING and only for QPs of Transport Service Type
+.BR IBV_QPT_UD
+or
+.BR IBV_QPT_RAW_PACKET
+.br
+2. The user must memset the spec struct to zero before using it.
+.br
+3. The ether_type field in ibv_flow_eth_filter is the ethertype following the last VLAN tag of the packet.
+.br
+4. Only rule type IBV_FLOW_ATTR_NORMAL supports the IBV_FLOW_ATTR_FLAGS_DONT_TRAP flag.
+.br
+5. No specifications are needed for the IBV_FLOW_ATTR_SNIFFER rule type.
+.br
+6. When the IBV_FLOW_ATTR_FLAGS_EGRESS flag is set, the qp parameter is used only as a means to get the device.
+.br
+.PP
+.SH EXAMPLE
+.br
+The flow_attr below defines a rule at priority 0 that matches a destination
+MAC address and a source IPv4 address. For that, L2 and L3 specs are used.
+.br
+If this rule is hit, i.e. the
+received packet has destination MAC 66:11:22:33:44:55 and source IP 0x0B86C806,
+the packet is steered to its attached QP.
+.sp
+.nf
+struct raw_eth_flow_attr {
+.in +8
+struct ibv_flow_attr attr;
+struct ibv_flow_spec_eth spec_eth;
+struct ibv_flow_spec_ipv4 spec_ipv4;
+.in -8
+} __attribute__((packed));
+.sp
+.nf
+struct raw_eth_flow_attr flow_attr = {
+.in +8
+        .attr = {
+                .comp_mask = 0,
+                .type = IBV_FLOW_ATTR_NORMAL,
+                .size = sizeof(flow_attr),
+                .priority = 0,
+                .num_of_specs = 2,
+                .port = 1,
+                .flags = 0,
+        },
+        .spec_eth = {
+                .type = IBV_FLOW_SPEC_ETH,
+                .size = sizeof(struct ibv_flow_spec_eth),
+                .val = {
+                        .dst_mac = {0x66, 0x11, 0x22, 0x33, 0x44, 0x55},
+                        .src_mac = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+                        .ether_type = 0,
+                        .vlan_tag = 0,
+                },
+                .mask = {
+                        .dst_mac = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
+                        .src_mac = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
+                        .ether_type = 0,
+                        .vlan_tag = 0,
+                }
+        },
+        .spec_ipv4 = {
+                .type = IBV_FLOW_SPEC_IPV4,
+                .size = sizeof(struct ibv_flow_spec_ipv4),
+                .val = {
+                        .src_ip = 0x0B86C806,
+                        .dst_ip = 0,
+                },
+                .mask = {
+                        .src_ip = 0xFFFFFFFF,
+                        .dst_ip = 0,
+                }
+        }
+.in -8
+};
+.sp
+.nf
+.SH "AUTHORS"
+.TP
+Hadar Hen Zion <hadarh@mellanox.com>
+.TP
+Matan Barak <matanb@mellanox.com>
+.TP
+Yishai Hadas <yishaih@mellanox.com>
+.TP
+Maor Gottlieb <maorg@mellanox.com>
diff --git a/libibverbs/man/ibv_create_flow_action.3.md b/libibverbs/man/ibv_create_flow_action.3.md
new file mode 100644
index 0000000..f0736ca
--- /dev/null
+++ b/libibverbs/man/ibv_create_flow_action.3.md
@@ -0,0 +1,338 @@
+---
+layout: page
+title: ibv_flow_action_esp
+section: 3
+tagline: Verbs
+---
+
+# NAME
+
+ibv_flow_action_esp - Flow action esp for verbs
+
+# SYNOPSIS
+
+```c
+#include <infiniband/verbs.h>
+
+struct ibv_flow_action *
+ibv_create_flow_action_esp(struct ibv_context *ctx,
+                           struct ibv_flow_action_esp *esp);
+int
+ibv_modify_flow_action_esp(struct ibv_flow_action *action,
+                           struct ibv_flow_action_esp *esp);
+
+int ibv_destroy_flow_action(struct ibv_flow_action *action);
+```
+
+# DESCRIPTION
+
+An IPSEC ESP flow steering action allows a flow steering rule to decrypt or
+encrypt a packet after matching. Each action contains the necessary
+information for this operation in the *esp* argument.
+
+After the crypto operation the packet will continue to be processed by flow
+steering rules until it reaches a final action of discard or delivery.
+
+After the action is created, it should be associated with a *struct
+ibv_flow_attr* using a *struct ibv_flow_spec_action_handle* flow specification.
+Each action can be associated with multiple flows, and *ibv_modify_flow_action_esp*
+will alter all associated flows simultaneously.
+
+# ARGUMENTS
+
+*ctx*
+:	RDMA device context to create the action on.
+
+*esp*
+:	ESP parameters and key material for the action.
+
+*action*
+:	Existing action to modify ESP parameters.
+
+## *esp* Argument
+
+```c
+struct ibv_flow_action_esp {
+	struct ibv_flow_action_esp_attr *esp_attr;
+
+	/* See Key Material */
+	uint16_t keymat_proto;
+	uint16_t keymat_len;
+	void *keymat_ptr;
+
+	/* See Replay Protection */
+	uint16_t replay_proto;
+	uint16_t replay_len;
+	void *replay_ptr;
+
+	struct ibv_flow_action_esp_encap *esp_encap;
+
+	uint32_t comp_mask;
+	uint32_t esn;
+};
+```
+
+*comp_mask*
+:	Bitmask specifying what fields in the structure are valid.
+
+*esn*
+:	The starting value of the ESP extended sequence number.
+	Valid only if *IBV_FLOW_ACTION_ESP_MASK_ESN* is set in *comp_mask*.
+
+	The 32 bits of *esn* will be used to compute the full 64 bit ESN
+	required for the AAD construction.
+
+	When in *IB_UVERBS_FLOW_ACTION_ESP_FLAGS_INLINE_CRYPTO* mode, the
+	implementation will automatically track rollover of the lower 32 bits
+	of the ESN. However, an update of the window is required once every
+	2^31 sequences.
+
+	When in *IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD* mode this value is
+	automatically incremented and it is also used for anti-replay checks.
+
+*esp_attr*
+:	See *ESP Attributes*. May be NULL on modify.
+
+*keymat_proto*, *keymat_len*, *keymat_ptr*
+:	Describe the key material and encryption standard to use. May be NULL on
+	modify.
+
+*replay_proto*, *replay_len*, *replay_ptr*
+:	Describe the replay protection scheme used to manage sequence numbers and
+	prevent replay attacks. This field is only valid in full offload mode.
+	May be NULL on modify.
+
+*esp_encap*
+:	Describe the encapsulation of ESP packets such as the IP tunnel and/or
+	UDP encapsulation. This field is only valid in full offload mode.
+	May be NULL on modify.
+
+## ESP attributes
+
+```c
+struct ibv_flow_action_esp_attr {
+	uint32_t spi;
+	uint32_t seq;
+	uint32_t tfc_pad;
+	uint32_t flags;
+	uint64_t hard_limit_pkts;
+};
+```
+
+*flags*
+:	A bitwise OR of the various *IB_UVERBS_FLOW_ACTION_ESP_FLAGS* described below.
+
+	*IB_UVERBS_FLOW_ACTION_ESP_FLAGS_DECRYPT*, *IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT*
+	:	The action will decrypt or encrypt a packet using the provided
+		keying material.
+
+		The implementation may require that encrypt is only used with an
+		egress flow steering rule, and that decrypt is only used with an
+		ingress flow steering rule.
+
+## Full Offload Mode
+
+When the *esp_attr* flag *IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD* is set the
+ESP header and trailer are added and removed automatically during the cipher
+operation. In this case the *esn* and *spi* are used to populate and check
+the ESP header, and any information from the *keymat* (e.g. an IV) is placed in
+the headers and otherwise handled automatically.
+
+For decrypt the hardware will perform anti-replay.
+
+Decryption failure will cause the packet to be dropped.
+
+This action must be combined with the flow steering that identifies the
+packets protected by the SA defined in this action.
+
+The following members of the esp_attr are used only in full offload mode:
+
+*spi*
+:	The value for the ESP Security Parameters Index.
+	It is only used for *IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD*.
+
+*seq*
+:	The initial lower 32 bits of the sequence number.
+	This is the value of the ESP sequence number.
+	It is only used for *IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD*.
+
+*tfc_pad*
+:	The length of Traffic Flow Confidentiality Padding as specified by
+	RFC4303. If it is set to zero no additional padding is added.
+	It is only used for *IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD*.
+
+*hard_limit_pkts*
+:	The hard lifetime of the SA measured in number of packets,
+	as specified by RFC4301.
+	After this limit is reached the action will drop future packets
+	to prevent breaking the crypto.
+	It is only used for *IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD*.
+
+## Inline Crypto Mode
+
+When the *esp_attr* flag *IB_UVERBS_FLOW_ACTION_ESP_FLAGS_INLINE_CRYPTO* is set
+the user must provide packets with additional headers.
+
+For encrypt the packet must contain a fully populated IPSEC packet except the
+data payload is left un-encrypted and there is no IPsec trailer.
+If the IV must be unpredictable, then a flag should indicate the transformation
+such as *IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ*.
+
+*IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ* means that the IV is incremented
+sequentially. If the IV algorithm is supported by HW, then it could provide
+support for LSO offload with ESP inline crypto.
+
+Finally, the IV used to encrypt the packet replaces the IV field provided, the
+payload is encrypted and authenticated, a trailer with padding is added and
+the ICV is added as well.
+
+For decrypt the packet is authenticated and decrypted in-place, resulting in a
+decrypted IPSEC packet with no trailer. The result of decryption and
+authentication can be retrieved from an extended CQ via the
+*ibv_wc_read_XXX(3)* functions.
+
+This mode must be combined with the flow steering including
+*IBV_FLOW_SPEC_IPV4* and *IBV_FLOW_SPEC_ESP* to match the outer packet headers
+to ensure that the action is only applied to IPSEC packets with the correct
+identifiers.
+
+For inline crypto, we have some special requirements to maintain a stateless
+ESN while maintaining the same parameters as software. The system supports
+offloading a portion of the IPSEC flow, enabling a single flow to be split
+between multiple NICs.
+
+### Determining the ESN for Ingress Packets
+
+We require a "modify" command once every 2^31 packets. This
+modify command allows the implementation in HW to be stateless, as follows:
+
+```
+      ESN 1          ESN 2          ESN 3
+|-------------*-------------|-------------*-------------|-------------*
+^             ^             ^             ^             ^             ^
+```
+
+^ - marks where a command is invoked to update the SA ESN state machine.
+
+| - marks the start of the ESN scope (0 to 2^32-1). At this point the SA ESN "new_window" bit is moved to zero and the ESN is incremented.
+
+* - marks the middle of the ESN scope (2^31). At this point the SA ESN "new_window" bit is moved to one.
+
+For decryption the implementation uses the following state machine to determine the ESN:
+
+```c
+if (!overlap) {
+	use esn // regardless of packet.seq
+} else { // new_window
+	if (packet.seq >= 2^31)
+		use esn
+	else // packet.seq < 2^31
+		use esn+1
+}
+```
+
+This mechanism is controlled by the *esp_attr* flag:
+
+*IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW*
+:	This flag is only used to provide stateless ESN support for inline crypto.
+	It is used only for *IB_UVERBS_FLOW_ACTION_ESP_FLAGS_INLINE_CRYPTO* and
+	*IBV_FLOW_ACTION_ESP_MASK_ESN*.
+
+	Setting this flag indicates that the bottom of the replay window is
+	between 2^31 and 2^32.
+
+## Key Material for AES GCM (*IBV_ACTION_ESP_KEYMAT_AES_GCM*)
+
+The AES GCM crypto algorithm as defined by RFC4106. This struct is to be
+provided in *keymat_ptr* when *keymat_proto* is set to
+*IBV_ACTION_ESP_KEYMAT_AES_GCM*.
+
+```c
+struct ibv_flow_action_esp_aes_keymat_aes_gcm {
+	uint64_t iv;
+	uint32_t iv_algo; /* Use enum ib_uverbs_flow_action_esp_aes_gcm_keymat_iv_algo */
+
+	uint32_t salt;
+	uint32_t icv_len;
+
+	uint32_t key_len;
+	uint32_t aes_key[256 / 32];
+};
+```
+
+*iv*
+:	The starting value for the initialization vector used only with
+	*IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD* encryption as defined in
+	RFC4106. This field is ignored for
+	*IB_UVERBS_FLOW_ACTION_ESP_FLAGS_INLINE_CRYPTO*.
+
+	For a given key, the IV MUST NOT be reused.
+
+*iv_algo*
+:	The algorithm used to transform/generate new IVs with
+	*IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD* encryption.
+
+	The only supported value is *IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ* to
+	generate sequential IVs.
+
+*salt*
+:	The salt as defined by RFC4106.
+
+*icv_len*
+:	The length of the Integrity Check Value in bytes as defined by RFC4106.
+
+*aes_key*, *key_len*
+:	The cipher key data. It must be either 16, 24 or 32 bytes, as defined
+	by RFC4106.
+
+## Bitmap Replay Protection (*IBV_FLOW_ACTION_ESP_REPLAY_BMP*)
+
+A shifting bitmap is used to identify which packets have already been
+transmitted. Each bit in the bitmap represents a packet; it is set if a packet
+with this ESP sequence number has been received and it passed authentication.
+If a packet with the same sequence is received, then the bit is already set,
+causing replay protection to drop the packet. The bitmap represents a window
+of *size* sequence numbers. If a newer sequence number is received, then the
+bitmap will shift to represent this as in RFC6479. The replay window cannot
+shift more than 2^31 sequence numbers forward.
+
+This struct is to be provided in *replay_ptr* when *replay_proto* is set to
+*IBV_FLOW_ACTION_ESP_REPLAY_BMP*. In this mode *replay_ptr* and *replay_len*
+should point to a struct ibv_flow_action_esp_replay_bmp containing:
+
+*size*
+:	The size of the bitmap.
+
+## ESP Encapsulation
+
+An *esp_encap* specification is required when the *esp_attr* flag
+*IB_UVERBS_FLOW_ACTION_ESP_FLAGS_TUNNEL* is set. It is used to provide the fields
+for the encapsulation header that is added to or removed from packets.
+Tunnel and Transport mode are defined as in RFC4301.
+UDP encapsulation of ESP can be specified by providing the appropriate UDP header.
+
+This setting is only used in *IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD*
+mode.
+
+```c
+struct ibv_flow_action_esp_encap {
+	void *val; /* pointer to struct ibv_flow_xxxx_filter */
+	struct ibv_flow_action_esp_encap *next_ptr;
+	uint16_t len; /* Length of mask and pointer (separately) */
+	uint16_t type; /* Use flow_spec enum */
+};
+```
+
+Each link in the list specifies a network header in the same manner as the flow steering API.
+The header should be selected from a supported header in 'enum ibv_flow_spec_type'.
+
+# RETURN VALUE
+
+Upon success *ibv_create_flow_action_esp* will return a new *struct
+ibv_flow_action* object; on error NULL will be returned and errno will be set.
+
+Upon success *ibv_modify_flow_action_esp* will return 0. On error the value of
+errno will be returned. If *ibv_modify_flow_action_esp* fails, it is guaranteed
+that the previous action parameters still hold. If it succeeds, there is some
+point in time before which the old parameters are applied to all packets and
+from which the new ones are applied.
+
+# SEE ALSO
+
+*ibv_create_flow(3)*, *ibv_destroy_flow_action(3)*, *RFC 4106*
diff --git a/libibverbs/man/ibv_create_qp.3 b/libibverbs/man/ibv_create_qp.3
new file mode 100644
index 0000000..1cdf247
--- /dev/null
+++ b/libibverbs/man/ibv_create_qp.3
@@ -0,0 +1,89 @@
+.\" -*- nroff -*-
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.\"
+.TH IBV_CREATE_QP 3 2006-10-31 libibverbs "Libibverbs Programmer's Manual"
+.SH "NAME"
+ibv_create_qp, ibv_destroy_qp \- create or destroy a queue pair (QP)
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/verbs.h>
+.sp
+.BI "struct ibv_qp *ibv_create_qp(struct ibv_pd " "*pd" ,
+.BI "                             struct ibv_qp_init_attr " "*qp_init_attr" );
+.sp
+.BI "int ibv_destroy_qp(struct ibv_qp " "*qp" );
+.fi
+.SH "DESCRIPTION"
+.B ibv_create_qp()
+creates a queue pair (QP) associated with the protection domain
+.I pd\fR.
+The argument
+.I qp_init_attr
+is an ibv_qp_init_attr struct, as defined in <infiniband/verbs.h>.
+.PP
+.nf
+struct ibv_qp_init_attr {
+.in +8
+void *qp_context; /* Associated context of the QP */
+struct ibv_cq *send_cq; /* CQ to be associated with the Send Queue (SQ) */
+struct ibv_cq *recv_cq; /* CQ to be associated with the Receive Queue (RQ) */
+struct ibv_srq *srq; /* SRQ handle if QP is to be associated with an SRQ, otherwise NULL */
+struct ibv_qp_cap cap; /* QP capabilities */
+enum ibv_qp_type qp_type; /* QP Transport Service Type: IBV_QPT_RC, IBV_QPT_UC, IBV_QPT_UD, IBV_QPT_RAW_PACKET or IBV_QPT_DRIVER */
+int sq_sig_all; /* If set, each Work Request (WR) submitted to the SQ generates a completion entry */
+.in -8
+};
+.sp
+.nf
+struct ibv_qp_cap {
+.in +8
+uint32_t max_send_wr; /* Requested max number of outstanding WRs in the SQ */
+uint32_t max_recv_wr; /* Requested max number of outstanding WRs in the RQ */
+uint32_t max_send_sge; /* Requested max number of scatter/gather (s/g) elements in a WR in the SQ */
+uint32_t max_recv_sge; /* Requested max number of s/g elements in a WR in the RQ */
+uint32_t max_inline_data;/* Requested max number of data (bytes) that can be posted inline to the SQ, otherwise 0 */
+.in -8
+};
+.fi
+.PP
+The function
+.B ibv_create_qp()
+will update the
+.I qp_init_attr\fB\fR->cap
+struct with the actual \s-1QP\s0 values of the QP that was created;
+the values will be greater than or equal to the values requested.
+.PP
+.B ibv_destroy_qp()
+destroys the QP
+.I qp\fR.
+.SH "RETURN VALUE"
+.B ibv_create_qp()
+returns a pointer to the created QP, or NULL if the request fails.
+Check the QP number (\fBqp_num\fR) in the returned QP.
+.PP
+.B ibv_destroy_qp()
+returns 0 on success, or the value of errno on failure (which indicates the failure reason).
+.SH "NOTES"
+.B ibv_create_qp()
+will fail if it is asked to create a QP of a type other than
+.B IBV_QPT_RC
+or
+.B IBV_QPT_UD
+associated with an SRQ.
+.PP
+The attributes max_recv_wr and max_recv_sge are ignored by
+.B ibv_create_qp()
+if the QP is to be associated with an SRQ.
+.PP
+.B ibv_destroy_qp()
+fails if the QP is attached to a multicast group.
+.PP
+.B IBV_QPT_DRIVER
+does not represent a specific service and is used for vendor-specific QP logic.
+.SH "SEE ALSO"
+.BR ibv_alloc_pd (3),
+.BR ibv_modify_qp (3),
+.BR ibv_query_qp (3)
+.SH "AUTHORS"
+.TP
+Dotan Barak <dotanba@gmail.com>
diff --git a/libibverbs/man/ibv_create_qp_ex.3 b/libibverbs/man/ibv_create_qp_ex.3
new file mode 100644
index 0000000..277e9fa
--- /dev/null
+++ b/libibverbs/man/ibv_create_qp_ex.3
@@ -0,0 +1,158 @@
+.\" -*- nroff -*-
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.\"
+.TH IBV_CREATE_QP_EX 3 2013-06-26 libibverbs "Libibverbs Programmer's Manual"
+.SH "NAME"
+ibv_create_qp_ex, ibv_destroy_qp \- create or destroy a queue pair (QP)
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/verbs.h>
+.sp
+.BI "struct ibv_qp *ibv_create_qp_ex(struct ibv_context " "*context" ,
+.BI "                                struct ibv_qp_init_attr_ex " "*qp_init_attr" );
+.sp
+.BI "int ibv_destroy_qp(struct ibv_qp " "*qp" );
+.fi
+.SH "DESCRIPTION"
+.B ibv_create_qp_ex()
+creates a queue pair (QP) for the RDMA device context
+.I context\fR.
+The argument
+.I qp_init_attr
+is an ibv_qp_init_attr_ex struct, as defined in <infiniband/verbs.h>.
+.PP
+.nf
+struct ibv_qp_init_attr_ex {
+.in +8
+void *qp_context; /* Associated context of the QP */
+struct ibv_cq *send_cq; /* CQ to be associated with the Send Queue (SQ) */
+struct ibv_cq *recv_cq; /* CQ to be associated with the Receive Queue (RQ) */
+struct ibv_srq *srq; /* SRQ handle if QP is to be associated with an SRQ, otherwise NULL */
+struct ibv_qp_cap cap; /* QP capabilities */
+enum ibv_qp_type qp_type; /* QP Transport Service Type: IBV_QPT_RC, IBV_QPT_UC, IBV_QPT_UD, IBV_QPT_RAW_PACKET or IBV_QPT_DRIVER */
+int sq_sig_all; /* If set, each Work Request (WR) submitted to the SQ generates a completion entry */
+uint32_t comp_mask; /* Identifies valid fields */
+struct ibv_pd *pd; /* PD to be associated with the QP */
+struct ibv_xrcd *xrcd; /* XRC domain to be associated with the target QP */
+enum ibv_qp_create_flags create_flags; /* Creation flags for this QP */
+uint16_t max_tso_header; /* Maximum TSO header size */
+struct ibv_rwq_ind_table *rwq_ind_tbl; /* Indirection table to be associated with the QP */
+struct ibv_rx_hash_conf rx_hash_conf; /* RX hash configuration to be used */
+uint32_t source_qpn; /* Source QP number; the creation flag IBV_QP_CREATE_SOURCE_QPN should be set, see NOTES below */
+uint64_t send_ops_flags; /* Select which QP send ops will be defined in struct ibv_qp_ex. Use enum ibv_qp_create_send_ops_flags */
+.in -8
+};
+.sp
+.nf
+struct ibv_qp_cap {
+.in +8
+uint32_t max_send_wr; /* Requested max number of outstanding WRs in the SQ */
+uint32_t max_recv_wr; /* Requested max number of outstanding WRs in the RQ */
+uint32_t max_send_sge; /* Requested max number of scatter/gather (s/g) elements in a WR in the SQ */
+uint32_t max_recv_sge; /* Requested max number of s/g elements in a WR in the RQ */
+uint32_t max_inline_data;/* Requested max number of data (bytes) that can be posted inline to the SQ, otherwise 0 */
+.in -8
+};
+.nf
+enum ibv_qp_create_flags {
+.in +8
+IBV_QP_CREATE_BLOCK_SELF_MCAST_LB = 1 << 1, /* Prevent self multicast loopback */
+IBV_QP_CREATE_SCATTER_FCS = 1 << 8, /* FCS field will be scattered to host memory */
+IBV_QP_CREATE_CVLAN_STRIPPING = 1 << 9, /* CVLAN field will be stripped from incoming packets */
+IBV_QP_CREATE_SOURCE_QPN = 1 << 10, /* The created QP will use the source_qpn as its wire QP number */
+IBV_QP_CREATE_PCI_WRITE_END_PADDING = 1 << 11, /* Incoming packets will be padded to cacheline size */
+.in -8
+};
+.fi
+.nf
+struct ibv_rx_hash_conf {
+.in +8
+uint8_t rx_hash_function; /* RX hash function, use enum ibv_rx_hash_function_flags */
+uint8_t rx_hash_key_len; /* RX hash key length */
+uint8_t *rx_hash_key; /* RX hash key data */
+uint64_t rx_hash_fields_mask; /* RX fields that should participate in the hashing, use enum ibv_rx_hash_fields */
+.in -8
+};
+.fi
+.nf
+enum ibv_rx_hash_fields {
+.in +8
+IBV_RX_HASH_SRC_IPV4 = 1 << 0,
+IBV_RX_HASH_DST_IPV4 = 1 << 1,
+IBV_RX_HASH_SRC_IPV6 = 1 << 2,
+IBV_RX_HASH_DST_IPV6 = 1 << 3,
+IBV_RX_HASH_SRC_PORT_TCP = 1 << 4,
+IBV_RX_HASH_DST_PORT_TCP = 1 << 5,
+IBV_RX_HASH_SRC_PORT_UDP = 1 << 6,
+IBV_RX_HASH_DST_PORT_UDP = 1 << 7,
+IBV_RX_HASH_IPSEC_SPI = 1 << 8,
+/* When using a tunneling protocol, e.g. VXLAN, there is an inner (encapsulated) packet and an outer one.
+ * To apply RSS on the inner packet, set the following field together with one of the L3/L4 fields above.
+ */
+IBV_RX_HASH_INNER = (1UL << 31),
+.in -8
+};
+.fi
+.nf
+enum ibv_qp_create_send_ops_flags {
+.in +8
+IBV_QP_EX_WITH_RDMA_WRITE = 1 << 0,
+IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM = 1 << 1,
+IBV_QP_EX_WITH_SEND = 1 << 2,
+IBV_QP_EX_WITH_SEND_WITH_IMM = 1 << 3,
+IBV_QP_EX_WITH_RDMA_READ = 1 << 4,
+IBV_QP_EX_WITH_ATOMIC_CMP_AND_SWP = 1 << 5,
+IBV_QP_EX_WITH_ATOMIC_FETCH_AND_ADD = 1 << 6,
+IBV_QP_EX_WITH_LOCAL_INV = 1 << 7,
+IBV_QP_EX_WITH_BIND_MW = 1 << 8,
+IBV_QP_EX_WITH_SEND_WITH_INV = 1 << 9,
+IBV_QP_EX_WITH_TSO = 1 << 10,
+.in -8
+};
+.fi
+
+.PP
+The function
+.B ibv_create_qp_ex()
+will update the
+.I qp_init_attr\fB\fR->cap
+struct with the actual \s-1QP\s0 values of the QP that was created;
+the values will be greater than or equal to the values requested.
+.PP
+.B ibv_destroy_qp()
+destroys the QP
+.I qp\fR.
+.SH "RETURN VALUE"
+.B ibv_create_qp_ex()
+returns a pointer to the created QP, or NULL if the request fails.
+Check the QP number (\fBqp_num\fR) in the returned QP.
+.PP
+.B ibv_destroy_qp()
+returns 0 on success, or the value of errno on failure (which indicates the failure reason).
+.SH "NOTES"
+.PP
+The attributes max_recv_wr and max_recv_sge are ignored by
+.B ibv_create_qp_ex()
+if the QP is to be associated with an SRQ.
+.PP
+The attribute source_qpn is supported only on UD QPs; without flow steering, RX is not possible.
+.PP
+Use
+.B ibv_qp_to_qp_ex()
+to get the
+.I ibv_qp_ex
+for accessing the send ops iterator interface, when the QP create attribute IBV_QP_INIT_ATTR_SEND_OPS_FLAGS is used.
+.PP
+.B ibv_destroy_qp()
+fails if the QP is attached to a multicast group.
+.PP
+.B IBV_QPT_DRIVER
+does not represent a specific service and is used for vendor-specific QP logic.
+.SH "SEE ALSO"
+.BR ibv_alloc_pd (3),
+.BR ibv_modify_qp (3),
+.BR ibv_query_qp (3),
+.BR ibv_create_rwq_ind_table (3)
+.SH "AUTHORS"
+.TP
+Yishai Hadas <yishaih@mellanox.com>
diff --git a/libibverbs/man/ibv_create_rwq_ind_table.3 b/libibverbs/man/ibv_create_rwq_ind_table.3
new file mode 100644
index 0000000..176a5f1
--- /dev/null
+++ b/libibverbs/man/ibv_create_rwq_ind_table.3
@@ -0,0 +1,59 @@
+.\" -*- nroff -*-
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.\"
+.TH CREATE_RWQ_IND_TBL 3 2016-07-27 libibverbs "Libibverbs Programmer's Manual"
+.SH "NAME"
+ibv_create_rwq_ind_table, ibv_destroy_rwq_ind_table \- create or destroy a Receive Work Queue Indirection Table (RWQ IND TBL).
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/verbs.h>
+.sp
+.BI "struct ibv_rwq_ind_table *ibv_create_rwq_ind_table(struct ibv_context " "*context,"
+.BI "                                                    struct ibv_rwq_ind_table_init_attr " "*init_attr" );
+.sp
+.BI "int ibv_destroy_rwq_ind_table(struct ibv_rwq_ind_table " "*rwq_ind_table" );
+.fi
+.SH "DESCRIPTION"
+.B ibv_create_rwq_ind_table()
+creates a RWQ IND TBL associated with the ibv_context
+.I context\fR.
+The argument
+.I init_attr
+is an ibv_rwq_ind_table_init_attr struct, as defined in <infiniband/verbs.h>.
+.PP
+.nf
+struct ibv_rwq_ind_table_init_attr {
+.in +8
+uint32_t log_ind_tbl_size; /* Log, base 2, of indirection table size */
+struct ibv_wq **ind_tbl; /* Each entry is a pointer to a Receive Work Queue */
+uint32_t comp_mask; /* Identifies valid fields. Use ibv_ind_table_init_attr_mask */
+.in -8
+};
+.fi
+.PP
+The function
+.B ibv_create_rwq_ind_table()
+will create a RWQ IND TBL that holds a table of Receive Work Queues.
+For further usage of the created object, see
+.I NOTES
+below.
+.PP
+.B ibv_destroy_rwq_ind_table()
+destroys the RWQ IND TBL
+.I rwq_ind_table\fR.
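+.PP
+As an illustration only (not part of the original page; the array wqs and
+the context ctx are assumed to exist, error handling omitted), an
+indirection table over four Work Queues created with ibv_create_wq() could
+be built as follows:
+.nf
+struct ibv_wq *wqs[4]; /* filled by four ibv_create_wq() calls */
+struct ibv_rwq_ind_table_init_attr init_attr = {
+        .log_ind_tbl_size = 2, /* 1 << 2 == 4 entries */
+        .ind_tbl = wqs,
+        .comp_mask = 0,
+};
+struct ibv_rwq_ind_table *ind_tbl;
+
+ind_tbl = ibv_create_rwq_ind_table(ctx, &init_attr);
+.fi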
+.SH "RETURN VALUE" +.B ibv_create_rwq_ind_table() +returns a pointer to the created RWQ IND TBL, or NULL if the request fails. +.PP +.B ibv_destroy_rwq_ind_table() +returns 0 on success, or the value of errno on failure (which indicates the failure reason). +.SH "NOTES" +The created object should be used as part of +.I ibv_create_qp_ex() +to enable dispatching of incoming packets based on some RX hash configuration. +.SH "SEE ALSO" +.BR ibv_create_wq (3), +.BR ibv_modify_wq (3), +.BR ibv_create_qp_ex (3), +.SH "AUTHORS" +.TP +Yishai Hadas <yishaih@mellanox.com> diff --git a/libibverbs/man/ibv_create_srq.3 b/libibverbs/man/ibv_create_srq.3 new file mode 100644 index 0000000..46ab8dd --- /dev/null +++ b/libibverbs/man/ibv_create_srq.3 @@ -0,0 +1,68 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH IBV_CREATE_SRQ 3 2006-10-31 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_create_srq, ibv_destroy_srq \- create or destroy a shared receive queue (SRQ) +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs.h> +.sp +.BI "struct ibv_srq *ibv_create_srq(struct ibv_pd " "*pd" ", struct " +.BI " ibv_srq_init_attr " "*srq_init_attr" ); +.sp +.BI "int ibv_destroy_srq(struct ibv_srq " "*srq" ); +.fi +.SH "DESCRIPTION" +.B ibv_create_srq() +creates a shared receive queue (SRQ) associated with the protection domain +.I pd\fR. +The argument +.I srq_init_attr +is an ibv_srq_init_attr struct, as defined in <infiniband/verbs.h>. +.PP +.nf +struct ibv_srq_init_attr { +.in +8 +void *srq_context; /* Associated context of the SRQ */ +struct ibv_srq_attr attr; /* SRQ attributes */ +.in -8 +}; +.sp +.nf +struct ibv_srq_attr { +.in +8 +uint32_t max_wr; /* Requested max number of outstanding work requests (WRs) in the SRQ */ +uint32_t max_sge; /* Requested max number of scatter elements per WR */ +uint32_t srq_limit; /* The limit value of the SRQ (irrelevant for ibv_create_srq) */ +.in -8 +}; +.fi +.PP +The function +.B ibv_create_srq() +will update the +.I srq_init_attr +struct with the original values of the SRQ that was created; the +values of max_wr and max_sge will be greater than or equal to the +values requested. +.PP +.B ibv_destroy_srq() +destroys the SRQ +.I srq\fR. +.SH "RETURN VALUE" +.B ibv_create_srq() +returns a pointer to the created SRQ, or NULL if the request fails. +.PP +.B ibv_destroy_srq() +returns 0 on success, or the value of errno on failure (which indicates the failure reason). +.SH "NOTES" +.B ibv_destroy_srq() +fails if any queue pair is still associated with this SRQ. 
+.SH "SEE ALSO" +.BR ibv_alloc_pd (3), +.BR ibv_modify_srq (3), +.BR ibv_query_srq (3) +.SH "AUTHORS" +.TP +Dotan Barak <dotanba@gmail.com> diff --git a/libibverbs/man/ibv_create_srq_ex.3 b/libibverbs/man/ibv_create_srq_ex.3 new file mode 100644 index 0000000..97529ae --- /dev/null +++ b/libibverbs/man/ibv_create_srq_ex.3 @@ -0,0 +1,83 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH IBV_CREATE_SRQ_EX 3 2013-06-26 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_create_srq_ex, ibv_destroy_srq \- create or destroy a shared receive queue (SRQ) +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs.h> +.sp +.BI "struct ibv_srq *ibv_create_srq_ex(struct ibv_context " "*context" ", struct " +.BI " ibv_srq_init_attr_ex " "*srq_init_attr_ex" ); +.sp +.BI "int ibv_destroy_srq(struct ibv_srq " "*srq" ); +.fi +.SH "DESCRIPTION" +.B ibv_create_srq_ex() +creates a shared receive queue (SRQ) supporting both basic and xrc modes. +The argument +.I srq_init_attr_ex +is an ibv_srq_init_attr_ex struct, as defined in <infiniband/verbs.h>. +.PP +.nf +struct ibv_srq_init_attr_ex { +.in +8 +void *srq_context; /* Associated context of the SRQ */ +struct ibv_srq_attr attr; /* SRQ attributes */ +uint32_t comp_mask; /* Identifies valid fields */ +enum ibv_srq_type srq_type; /* Basic / XRC / tag matching */ +struct ibv_pd *pd; /* PD associated with the SRQ */ +struct ibv_xrcd *xrcd; /* XRC domain to associate with the SRQ */ +struct ibv_cq *cq; /* CQ to associate with the SRQ for XRC mode */ +struct ibv_tm_cap tm_cap; /* Tag matching attributes */ +.in -8 +}; +.sp +.nf +struct ibv_srq_attr { +.in +8 +uint32_t max_wr; /* Requested max number of outstanding work requests (WRs) in the SRQ */ +uint32_t max_sge; /* Requested max number of scatter elements per WR */ +uint32_t srq_limit; /* The limit value of the SRQ */ +.in -8 +}; +.sp +.nf +struct ibv_tm_cap { +.in +8 +uint32_t max_num_tags; /* Tag matching list size */ +uint32_t max_ops; /* Number of outstanding tag list operations */ +.in -8 +}; +.sp +.nf +.fi +.PP +The function +.B ibv_create_srq_ex() +will update the +.I srq_init_attr_ex +struct with the original values of the SRQ that was created; the +values of max_wr and max_sge will be greater than or equal to the +values requested. +.PP +.B ibv_destroy_srq() +destroys the SRQ +.I srq\fR. +.SH "RETURN VALUE" +.B ibv_create_srq_ex() +returns a pointer to the created SRQ, or NULL if the request fails. +.PP +.B ibv_destroy_srq() +returns 0 on success, or the value of errno on failure (which indicates the failure reason). +.SH "NOTES" +.B ibv_destroy_srq() +fails if any queue pair is still associated with this SRQ. +.SH "SEE ALSO" +.BR ibv_alloc_pd (3), +.BR ibv_modify_srq (3), +.BR ibv_query_srq (3) +.SH "AUTHORS" +.TP +Yishai Hadas <yishaih@mellanox.com> diff --git a/libibverbs/man/ibv_create_wq.3 b/libibverbs/man/ibv_create_wq.3 new file mode 100644 index 0000000..10fe965 --- /dev/null +++ b/libibverbs/man/ibv_create_wq.3 @@ -0,0 +1,74 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH IBV_CREATE_WQ 3 2016-07-27 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_create_wq, ibv_destroy_wq \- create or destroy a Work Queue (WQ). 
+.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs_exp.h> +.sp +.BI "struct ibv_wq *ibv_create_wq(struct ibv_context " "*context," +.BI " struct ibv_wq_init_attr " "*wq_init_attr" ); +.sp +.BI "int ibv_destroy_wq(struct ibv_wq " "*wq" ); +.fi +.SH "DESCRIPTION" +.B ibv_create_wq() +creates a WQ associated with the ibv_context +.I context\fR. +The argument +.I wq_init_attr +is an ibv_wq_init_attr struct, as defined in <infiniband/verbs.h>. +.PP +.nf +struct ibv_wq_init_attr { +.in +8 +void *wq_context; /* Associated context of the WQ */ +enum ibv_wq_type wq_type; /* WQ type */ +uint32_t max_wr; /* Requested max number of outstanding WRs in the WQ */ +uint32_t max_sge; /* Requested max number of scatter/gather (s/g) elements per WR in the WQ */ +struct ibv_pd *pd; /* PD to be associated with the WQ */ +struct ibv_cq *cq; /* CQ to be associated with the WQ */ +uint32_t comp_mask; /* Identifies valid fields. Use ibv_wq_init_attr_mask */ +uint32_t create_flags /* Creation flags for this WQ, use enum ibv_wq_flags */ +.in -8 +}; + +.sp +.nf +enum ibv_wq_flags { +.in +8 +IBV_WQ_FLAGS_CVLAN_STRIPPING = 1 << 0, /* CVLAN field will be stripped from incoming packets */ +IBV_WQ_FLAGS_SCATTER_FCS = 1 << 1, /* FCS field will be scattered to host memory */ +IBV_WQ_FLAGS_DELAY_DROP = 1 << 2, /* Packets won't be dropped immediately if no receive WQEs */ +IBV_WQ_FLAGS_PCI_WRITE_END_PADDING = 1 << 3, /* Incoming packets will be padded to cacheline size */ +IBV_WQ_FLAGS_RESERVED = 1 << 4, +.in -8 +}; +.nf +.fi +.PP +The function +.B ibv_create_wq() +will update the +.I wq_init_attr\fB\fR->max_wr +and +.I wq_init_attr\fB\fR->max_sge +fields with the actual \s-1WQ\s0 values of the WQ that was created; +the values will be greater than or equal to the values requested. +.PP +.B ibv_destroy_wq() +destroys the WQ +.I wq\fR. +.SH "RETURN VALUE" +.B ibv_create_wq() +returns a pointer to the created WQ, or NULL if the request fails. +.PP +.B ibv_destroy_wq() +returns 0 on success, or the value of errno on failure (which indicates the failure reason). +.SH "SEE ALSO" +.BR ibv_modify_wq (3), +.SH "AUTHORS" +.TP +Yishai Hadas <yishaih@mellanox.com> diff --git a/libibverbs/man/ibv_devices.1 b/libibverbs/man/ibv_devices.1 new file mode 100644 index 0000000..ffd57f8 --- /dev/null +++ b/libibverbs/man/ibv_devices.1 @@ -0,0 +1,20 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH IBV_DEVICES 1 "August 30, 2005" "libibverbs" "USER COMMANDS" + +.SH NAME +ibv_devices \- list RDMA devices + +.SH SYNOPSIS +.B ibv_devices + +.SH DESCRIPTION +.PP +List RDMA devices available for use from userspace. + +.SH SEE ALSO +.BR ibv_devinfo (1) + +.SH AUTHORS +.TP +Roland Dreier +.RI < rolandd@cisco.com > diff --git a/libibverbs/man/ibv_devinfo.1 b/libibverbs/man/ibv_devinfo.1 new file mode 100644 index 0000000..b5e9a58 --- /dev/null +++ b/libibverbs/man/ibv_devinfo.1 @@ -0,0 +1,40 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH IBV_DEVINFO 1 "August 30, 2005" "libibverbs" "USER COMMANDS" + +.SH NAME +ibv_devinfo \- query RDMA devices + +.SH SYNOPSIS +.B ibv_devinfo +[\-d device] [\-i port] [\-l] [\-v] + +.SH DESCRIPTION +.PP +Print information about RDMA devices available for use from userspace. 
+
+.SH OPTIONS
+
+.PP
+.TP
+\fB\-d\fR, \fB\-\-ib\-dev\fR=\fIDEVICE\fR
+use IB device \fIDEVICE\fR (default first device found)
+
+.TP
+\fB\-i\fR, \fB\-\-ib\-port\fR=\fIPORT\fR
+query port \fIPORT\fR (default all ports)
+
+.TP
+\fB\-l\fR, \fB\-\-list\fR
+only list names of RDMA devices
+
+.TP
+\fB\-v\fR, \fB\-\-verbose\fR
+print all available information about RDMA devices
+
+.SH SEE ALSO
+.BR ibv_devices (1)
+
+.SH AUTHORS
+.TP
+Dotan Barak
+.RI < dotanba@gmail.com >
+.TP
+Roland Dreier
+.RI < rolandd@cisco.com >
diff --git a/libibverbs/man/ibv_event_type_str.3.md b/libibverbs/man/ibv_event_type_str.3.md
new file mode 100644
index 0000000..35e61a3
--- /dev/null
+++ b/libibverbs/man/ibv_event_type_str.3.md
@@ -0,0 +1,49 @@
+---
+date: 2006-10-31
+footer: libibverbs
+header: "Libibverbs Programmer's Manual"
+layout: page
+license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md'
+section: 3
+title: IBV_EVENT_TYPE_STR
+---
+
+# NAME
+
+ibv_event_type_str - Return string describing event_type enum value
+
+ibv_node_type_str - Return string describing node_type enum value
+
+ibv_port_state_str - Return string describing port_state enum value
+
+# SYNOPSIS
+
+```c
+#include <infiniband/verbs.h>
+
+const char *ibv_event_type_str(enum ibv_event_type event_type);
+
+const char *ibv_node_type_str(enum ibv_node_type node_type);
+
+const char *ibv_port_state_str(enum ibv_port_state port_state);
+```
+
+# DESCRIPTION
+
+**ibv_node_type_str()** returns a string describing the node type enum value
+*node_type*.
+
+**ibv_port_state_str()** returns a string describing the port state enum value
+*port_state*.
+
+**ibv_event_type_str()** returns a string describing the event type enum value
+*event_type*.
+
+# RETURN VALUE
+
+These functions return a constant string that describes the enum value passed
+as their argument.
+
+# AUTHOR
+
+Roland Dreier <rolandd@cisco.com>
diff --git a/libibverbs/man/ibv_fork_init.3.md b/libibverbs/man/ibv_fork_init.3.md
new file mode 100644
index 0000000..5b2a564
--- /dev/null
+++ b/libibverbs/man/ibv_fork_init.3.md
@@ -0,0 +1,69 @@
+---
+date: 2006-10-31
+footer: libibverbs
+header: "Libibverbs Programmer's Manual"
+layout: page
+license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md'
+section: 3
+title: IBV_FORK_INIT
+---
+
+# NAME
+
+ibv_fork_init - initialize libibverbs to support fork()
+
+# SYNOPSIS
+
+```c
+#include <infiniband/verbs.h>
+
+int ibv_fork_init(void);
+```
+
+# DESCRIPTION
+
+**ibv_fork_init()** initializes libibverbs's data structures to handle
+**fork()** function calls correctly and avoid data corruption, whether
+**fork()** is called explicitly or implicitly (such as in **system()**).
+
+It is not necessary to use this function if all parent process threads are
+always blocked until all child processes end or change address spaces via an
+**exec()** operation.
+
+# RETURN VALUE
+
+**ibv_fork_init()** returns 0 on success, or the value of errno on failure
+(which indicates the failure reason).
+
+# NOTES
+
+**ibv_fork_init()** works on Linux kernels supporting the **MADV_DONTFORK**
+flag for **madvise()** (2.6.17 and higher).
+
+Setting the environment variable **RDMAV_FORK_SAFE** or **IBV_FORK_SAFE** has
+the same effect as calling **ibv_fork_init()**.
+
+Setting the environment variable **RDMAV_HUGEPAGES_SAFE** tells the library to
+check the underlying page size used by the kernel for memory regions.
This is +required if an application uses huge pages either directly or indirectly via a +library such as libhugetlbfs. + +Calling **ibv_fork_init()** will reduce performance due to an extra system +call for every memory registration, and the additional memory allocated to +track memory regions. The precise performance impact depends on the workload +and usually will not be significant. + +Setting **RDMAV_HUGEPAGES_SAFE** adds further overhead to all memory +registrations. + +# SEE ALSO + +**exec**(3), +**fork**(2), +**ibv_get_device_list**(3), +**system**(3), +**wait**(2) + +# AUTHOR + +Dotan Barak <dotanba@gmail.com> diff --git a/libibverbs/man/ibv_get_async_event.3 b/libibverbs/man/ibv_get_async_event.3 new file mode 100644 index 0000000..85ce6e1 --- /dev/null +++ b/libibverbs/man/ibv_get_async_event.3 @@ -0,0 +1,165 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH IBV_GET_ASYNC_EVENT 3 2006-10-31 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_get_async_event, ibv_ack_async_event \- get or acknowledge asynchronous events +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs.h> +.sp +.BI "int ibv_get_async_event(struct ibv_context " "*context" , +.BI " struct ibv_async_event " "*event" ); +.sp +.BI "void ibv_ack_async_event(struct ibv_async_event " "*event" ); +.fi +.SH "DESCRIPTION" +.B ibv_get_async_event() +waits for the next async event of the RDMA device context +.I context +and returns it through the pointer +.I event\fR, +which is an ibv_async_event struct, as defined in <infiniband/verbs.h>. +.PP +.nf +struct ibv_async_event { +.in +8 +union { +.in +8 +struct ibv_cq *cq; /* CQ that got the event */ +struct ibv_qp *qp; /* QP that got the event */ +struct ibv_srq *srq; /* SRQ that got the event */ +int port_num; /* port number that got the event */ +.in -8 +} element; +enum ibv_event_type event_type; /* type of the event */ +.in -8 +}; +.fi +.PP +One member of the element union will be valid, depending on the +event_type member of the structure. 
event_type will be one of the
+following events:
+.PP
+.I QP events:
+.TP
+.B IBV_EVENT_QP_FATAL \fR Error occurred on a QP and it transitioned to error state
+.TP
+.B IBV_EVENT_QP_REQ_ERR \fR Invalid Request Local Work Queue Error
+.TP
+.B IBV_EVENT_QP_ACCESS_ERR \fR Local access violation error
+.TP
+.B IBV_EVENT_COMM_EST \fR Communication was established on a QP
+.TP
+.B IBV_EVENT_SQ_DRAINED \fR Send Queue was drained of outstanding messages in progress
+.TP
+.B IBV_EVENT_PATH_MIG \fR A connection has migrated to the alternate path
+.TP
+.B IBV_EVENT_PATH_MIG_ERR \fR A connection failed to migrate to the alternate path
+.TP
+.B IBV_EVENT_QP_LAST_WQE_REACHED \fR Last WQE Reached on a QP associated with an SRQ
+.PP
+.I CQ events:
+.TP
+.B IBV_EVENT_CQ_ERR \fR CQ is in error (CQ overrun)
+.PP
+.I SRQ events:
+.TP
+.B IBV_EVENT_SRQ_ERR \fR Error occurred on an SRQ
+.TP
+.B IBV_EVENT_SRQ_LIMIT_REACHED \fR SRQ limit was reached
+.PP
+.I Port events:
+.TP
+.B IBV_EVENT_PORT_ACTIVE \fR Link became active on a port
+.TP
+.B IBV_EVENT_PORT_ERR \fR Link became unavailable on a port
+.TP
+.B IBV_EVENT_LID_CHANGE \fR LID was changed on a port
+.TP
+.B IBV_EVENT_PKEY_CHANGE \fR P_Key table was changed on a port
+.TP
+.B IBV_EVENT_SM_CHANGE \fR SM was changed on a port
+.TP
+.B IBV_EVENT_CLIENT_REREGISTER \fR SM sent a CLIENT_REREGISTER request to a port
+.TP
+.B IBV_EVENT_GID_CHANGE \fR GID table was changed on a port
+.PP
+.I CA events:
+.TP
+.B IBV_EVENT_DEVICE_FATAL \fR CA is in FATAL state
+.PP
+.B ibv_ack_async_event()
+acknowledges the async event
+.I event\fR.
+.SH "RETURN VALUE"
+.B ibv_get_async_event()
+returns 0 on success, and \-1 on error.
+.PP
+.B ibv_ack_async_event()
+returns no value.
+.SH "NOTES"
+All async events that
+.B ibv_get_async_event()
+returns must be acknowledged using
+.B ibv_ack_async_event()\fR.
+To avoid races, destroying an object (CQ, SRQ or QP) will wait for all
+affiliated events for the object to be acknowledged; this avoids an
+application retrieving an affiliated event after the corresponding
+object has already been destroyed.
+.PP
+.B ibv_get_async_event()
+is a blocking function. If multiple threads call this function
+simultaneously, then when an async event occurs, only one thread will
+receive it, and it is not possible to predict which thread will
+receive it.
+.SH "EXAMPLES"
+The following code example demonstrates one possible way to work with async events in non-blocking mode.
+It performs the following steps:
+.PP
+1. Set the async event queue's work mode to non-blocking
+.br
+2. Poll the queue until it has an async event
+.br
+3.
Get the async event and ack it +.PP +.nf +/* change the blocking mode of the async event queue */ +flags = fcntl(ctx->async_fd, F_GETFL); +rc = fcntl(ctx->async_fd, F_SETFL, flags | O_NONBLOCK); +if (rc < 0) { + fprintf(stderr, "Failed to change file descriptor of async event queue\en"); + return 1; +} + +/* + * poll the queue until it has an event and sleep ms_timeout + * milliseconds between any iteration + */ +my_pollfd.fd = ctx->async_fd; +my_pollfd.events = POLLIN; +my_pollfd.revents = 0; + +do { + rc = poll(&my_pollfd, 1, ms_timeout); +} while (rc == 0); +if (rc < 0) { + fprintf(stderr, "poll failed\en"); + return 1; +} + +/* Get the async event */ +if (ibv_get_async_event(ctx, &async_event)) { + fprintf(stderr, "Failed to get async_event\en"); + return 1; +} + +/* Ack the event */ +ibv_ack_async_event(&async_event); + +.fi +.SH "SEE ALSO" +.BR ibv_open_device (3) +.SH "AUTHORS" +.TP +Dotan Barak <dotanba@gmail.com> diff --git a/libibverbs/man/ibv_get_cq_event.3 b/libibverbs/man/ibv_get_cq_event.3 new file mode 100644 index 0000000..a1a42e6 --- /dev/null +++ b/libibverbs/man/ibv_get_cq_event.3 @@ -0,0 +1,186 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH IBV_GET_CQ_EVENT 3 2006-10-31 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_get_cq_event, ibv_ack_cq_events \- get and acknowledge completion queue (CQ) events + +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs.h> +.sp +.BI "int ibv_get_cq_event(struct ibv_comp_channel " "*channel" , +.BI " struct ibv_cq " "**cq" ", void " "**cq_context" ); +.sp +.BI "void ibv_ack_cq_events(struct ibv_cq " "*cq" ", unsigned int " "nevents" ); +.fi + +.SH "DESCRIPTION" +.B ibv_get_cq_event() +waits for the next completion event in the completion event channel +.I channel\fR. +Fills the arguments +.I cq +with the CQ that got the event and +.I cq_context +with the CQ's context\fR. +.PP +.B ibv_ack_cq_events() +acknowledges +.I nevents +events on the CQ +.I cq\fR. + +.SH "RETURN VALUE" +.B ibv_get_cq_event() +returns 0 on success, and \-1 on error. +.PP +.B ibv_ack_cq_events() +returns no value. +.SH "NOTES" +All completion events that +.B ibv_get_cq_event() +returns must be acknowledged using +.B ibv_ack_cq_events()\fR. +To avoid races, destroying a CQ will wait for all completion events to +be acknowledged; this guarantees a one-to-one correspondence between +acks and successful gets. +.PP +Calling +.B ibv_ack_cq_events() +may be relatively expensive in the datapath, since it must take a +mutex. Therefore it may be better to amortize this cost by +keeping a count of the number of events needing acknowledgement and +acking several completion events in one call to +.B ibv_ack_cq_events()\fR. +.SH "EXAMPLES" +The following code example demonstrates one possible way to work with +completion events. It performs the following steps: +.PP +Stage I: Preparation +.br +1. Creates a CQ +.br +2. Requests for notification upon a new (first) completion event +.PP +Stage II: Completion Handling Routine +.br +3. Wait for the completion event and ack it +.br +4. Request for notification upon the next completion event +.br +5. Empty the CQ +.PP +Note that an extra event may be triggered without having a +corresponding completion entry in the CQ. This occurs if a completion +entry is added to the CQ between Step 4 and Step 5, and the CQ is then +emptied (polled) in Step 5. 
+.PP
+.nf
+cq = ibv_create_cq(ctx, 1, ev_ctx, channel, 0);
+if (!cq) {
+        fprintf(stderr, "Failed to create CQ\en");
+        return 1;
+}
+.PP
+/* Request notification before any completion can be created */
+if (ibv_req_notify_cq(cq, 0)) {
+        fprintf(stderr, "Couldn't request CQ notification\en");
+        return 1;
+}
+.PP
+\&.
+\&.
+\&.
+.PP
+/* Wait for the completion event */
+if (ibv_get_cq_event(channel, &ev_cq, &ev_ctx)) {
+        fprintf(stderr, "Failed to get cq_event\en");
+        return 1;
+}
+
+/* Ack the event */
+ibv_ack_cq_events(ev_cq, 1);
+.PP
+/* Request notification upon the next completion event */
+if (ibv_req_notify_cq(ev_cq, 0)) {
+        fprintf(stderr, "Couldn't request CQ notification\en");
+        return 1;
+}
+.PP
+/* Empty the CQ: poll all of the completions from the CQ (if any exist) */
+do {
+        ne = ibv_poll_cq(cq, 1, &wc);
+        if (ne < 0) {
+                fprintf(stderr, "Failed to poll completions from the CQ\en");
+                return 1;
+        }
+
+        /* there may be an extra event with no completion in the CQ */
+        if (ne == 0)
+                continue;
+.PP
+        if (wc.status != IBV_WC_SUCCESS) {
+                fprintf(stderr, "Completion with status 0x%x was found\en", wc.status);
+                return 1;
+        }
+} while (ne);
+.fi
+
+The following code example demonstrates one possible way to work with
+completion events in non-blocking mode. It performs the following
+steps:
+.PP
+1. Set the completion event channel to non-blocking mode
+.br
+2. Poll the channel until it has a completion event
+.br
+3. Get the completion event and ack it
+.PP
+.nf
+/* change the blocking mode of the completion channel */
+flags = fcntl(channel->fd, F_GETFL);
+rc = fcntl(channel->fd, F_SETFL, flags | O_NONBLOCK);
+if (rc < 0) {
+        fprintf(stderr, "Failed to change file descriptor of completion event channel\en");
+        return 1;
+}
+
+
+/*
+ * poll the channel until it has an event and sleep ms_timeout
+ * milliseconds between any iteration
+ */
+my_pollfd.fd      = channel->fd;
+my_pollfd.events  = POLLIN;
+my_pollfd.revents = 0;
+
+do {
+        rc = poll(&my_pollfd, 1, ms_timeout);
+} while (rc == 0);
+if (rc < 0) {
+        fprintf(stderr, "poll failed\en");
+        return 1;
+}
+ev_cq = cq;
+
+/* Wait for the completion event */
+if (ibv_get_cq_event(channel, &ev_cq, &ev_ctx)) {
+        fprintf(stderr, "Failed to get cq_event\en");
+        return 1;
+}
+
+/* Ack the event */
+ibv_ack_cq_events(ev_cq, 1);
+
+.fi
+.SH "SEE ALSO"
+.BR ibv_create_comp_channel (3),
+.BR ibv_create_cq (3),
+.BR ibv_req_notify_cq (3),
+.BR ibv_poll_cq (3)
+
+.SH "AUTHORS"
+.TP
+Dotan Barak
+.RI < dotanba@gmail.com >
diff --git a/libibverbs/man/ibv_get_device_guid.3.md b/libibverbs/man/ibv_get_device_guid.3.md
new file mode 100644
index 0000000..683900f
--- /dev/null
+++ b/libibverbs/man/ibv_get_device_guid.3.md
@@ -0,0 +1,41 @@
+---
+date: 2006-10-31
+footer: libibverbs
+header: "Libibverbs Programmer's Manual"
+layout: page
+license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md'
+section: 3
+title: IBV_GET_DEVICE_GUID
+---
+
+# NAME
+
+ibv_get_device_guid - get an RDMA device's GUID
+
+# SYNOPSIS
+
+```c
+#include <infiniband/verbs.h>
+
+uint64_t ibv_get_device_guid(struct ibv_device *device);
+```
+
+# DESCRIPTION
+
+**ibv_get_device_guid()** returns the Global Unique IDentifier (GUID) of the
+RDMA device *device*.
+
+# RETURN VALUE
+
+**ibv_get_device_guid()** returns the GUID of the device in network byte
+order.
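+
+# EXAMPLES
+
+A minimal sketch that prints a device's GUID; the **be64toh()** conversion
+(a glibc **endian.h** helper) and the error-free flow are illustrative
+assumptions, not part of the verbs API:
+
+```c
+#include <endian.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#include <infiniband/verbs.h>
+
+static void print_guid(struct ibv_device *device)
+{
+	/* The GUID is reported in network byte order */
+	uint64_t guid = ibv_get_device_guid(device);
+
+	printf("%s: GUID 0x%016" PRIx64 "\n",
+	       ibv_get_device_name(device), be64toh(guid));
+}
+```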
+ +# SEE ALSO + +**ibv_get_device_list**(3), +**ibv_get_device_name**(3), +**ibv_open_device**(3) + +# AUTHOR + +Dotan Barak <dotanba@gmail.com> diff --git a/libibverbs/man/ibv_get_device_list.3.md b/libibverbs/man/ibv_get_device_list.3.md new file mode 100644 index 0000000..3d222f6 --- /dev/null +++ b/libibverbs/man/ibv_get_device_list.3.md @@ -0,0 +1,95 @@ +--- +date: 2006-10-31 +footer: libibverbs +header: "Libibverbs Programmer's Manual" +layout: page +license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md' +section: 3 +title: IBV_GET_DEVICE_LIST +--- + +# NAME + +ibv_get_device_list, ibv_free_device_list - get and release list of available +RDMA devices + +# SYNOPSIS + +```c +#include <infiniband/verbs.h> + +struct ibv_device **ibv_get_device_list(int *num_devices); + +void ibv_free_device_list(struct ibv_device **list); +``` + +# DESCRIPTION + +**ibv_get_device_list()** returns a NULL-terminated array of RDMA devices +currently available. The argument *num_devices* is optional; if not NULL, it +is set to the number of devices returned in the array. + +**ibv_free_device_list()** frees the array of devices *list* returned by +**ibv_get_device_list()**. + +# RETURN VALUE + +**ibv_get_device_list()** returns the array of available RDMA devices, or sets +*errno* and returns NULL if the request fails. If no devices are found then +*num_devices* is set to 0, and non-NULL is returned. + +**ibv_free_device_list()** returns no value. + +# ERRORS + +**EPERM** +: Permission denied. + +**ENOSYS** +: No kernel support for RDMA. + +**ENOMEM** +: Insufficient memory to complete the operation. + + +# NOTES + +Client code should open all the devices it intends to use with +**ibv_open_device()** before calling **ibv_free_device_list()**. Once it frees +the array with **ibv_free_device_list()**, it will be able to use only the +open devices; pointers to unopened devices will no longer be valid. + +Setting the environment variable **IBV_SHOW_WARNINGS** will cause warnings to +be emitted to stderr if a kernel verbs device is discovered, but no +corresponding userspace driver can be found for it. + +# STATIC LINKING + +If **libibverbs** is statically linked to the application then all provider +drivers must also be statically linked. The library will not load dynamic +providers when static linking is used. + +To link the providers set the **RDMA_STATIC_PROVIDERS** define to the comma +separated list of desired providers when compiling the application. The +special keyword 'all' will statically link all supported **libibverbs** +providers. + +This is intended to be used along with **pkg-config(1)** to setup the proper +flags for **libibverbs** linking. + +If this is not done then **ibv_get_device_list** will always return an empty +list. + +Using only dynamic linking for **libibverbs** applications is strongly +recommended. 
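+
+# EXAMPLES
+
+A minimal sketch of the usual enumeration pattern; the error handling and
+output format are illustrative only:
+
+```c
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <infiniband/verbs.h>
+
+int main(void)
+{
+	int num_devices;
+	struct ibv_device **list = ibv_get_device_list(&num_devices);
+
+	if (!list) {
+		perror("ibv_get_device_list");
+		return EXIT_FAILURE;
+	}
+
+	for (int i = 0; i < num_devices; ++i)
+		printf("%s\n", ibv_get_device_name(list[i]));
+
+	/* Open any devices that will be used before freeing the list */
+	ibv_free_device_list(list);
+	return EXIT_SUCCESS;
+}
+```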
+
+# SEE ALSO
+
+**ibv_fork_init**(3),
+**ibv_get_device_guid**(3),
+**ibv_get_device_name**(3),
+**ibv_open_device**(3)
+
+# AUTHOR
+
+Dotan Barak <dotanba@gmail.com>
diff --git a/libibverbs/man/ibv_get_device_name.3.md b/libibverbs/man/ibv_get_device_name.3.md
new file mode 100644
index 0000000..5703562
--- /dev/null
+++ b/libibverbs/man/ibv_get_device_name.3.md
@@ -0,0 +1,41 @@
+---
+date: 2006-10-31
+footer: libibverbs
+header: "Libibverbs Programmer's Manual"
+layout: page
+license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md'
+section: 3
+title: IBV_GET_DEVICE_NAME
+---
+
+# NAME
+
+ibv_get_device_name - get an RDMA device's name
+
+# SYNOPSIS
+
+```c
+#include <infiniband/verbs.h>
+
+const char *ibv_get_device_name(struct ibv_device *device);
+```
+
+# DESCRIPTION
+
+**ibv_get_device_name()** returns a human-readable name associated with the
+RDMA device *device*.
+
+# RETURN VALUE
+
+**ibv_get_device_name()** returns a pointer to the device name, or NULL if the
+request fails.
+
+# SEE ALSO
+
+**ibv_get_device_guid**(3),
+**ibv_get_device_list**(3),
+**ibv_open_device**(3)
+
+# AUTHOR
+
+Dotan Barak <dotanba@gmail.com>
diff --git a/libibverbs/man/ibv_get_pkey_index.3.md b/libibverbs/man/ibv_get_pkey_index.3.md
new file mode 100644
index 0000000..729f182
--- /dev/null
+++ b/libibverbs/man/ibv_get_pkey_index.3.md
@@ -0,0 +1,48 @@
+---
+date: 2018-07-16
+footer: libibverbs
+header: "Libibverbs Programmer's Manual"
+layout: page
+license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md'
+section: 3
+title: IBV_GET_PKEY_INDEX
+---
+
+# NAME
+
+ibv_get_pkey_index - obtain the index in the P_Key table of a P_Key
+
+# SYNOPSIS
+
+```c
+#include <infiniband/verbs.h>
+
+int ibv_get_pkey_index(struct ibv_context *context,
+                       uint8_t port_num,
+                       __be16 pkey);
+```
+
+# DESCRIPTION
+
+Every InfiniBand HCA maintains a P_Key table for each of its ports that is
+indexed by an integer and with a P_Key in each element. Certain InfiniBand
+data structures that work with P_Keys expect a P_Key index, e.g. **struct
+ibv_qp_attr** and **struct ib_mad_addr**. Hence the function
+**ibv_get_pkey_index()**, which accepts a P_Key in network byte order and
+returns its index in the P_Key table.
+
+# RETURN VALUE
+
+**ibv_get_pkey_index()** returns the P_Key index on success, and -1 on error.
+
+# SEE ALSO
+
+**ibv_open_device**(3),
+**ibv_query_device**(3),
+**ibv_query_gid**(3),
+**ibv_query_pkey**(3),
+**ibv_query_port**(3)
+
+# AUTHOR
+
+Bart Van Assche <bvanassche@acm.org>
diff --git a/libibverbs/man/ibv_get_srq_num.3.md b/libibverbs/man/ibv_get_srq_num.3.md
new file mode 100644
index 0000000..f015b9e
--- /dev/null
+++ b/libibverbs/man/ibv_get_srq_num.3.md
@@ -0,0 +1,45 @@
+---
+date: 2013-06-26
+footer: libibverbs
+header: "Libibverbs Programmer's Manual"
+layout: page
+license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md'
+section: 3
+title: IBV_GET_SRQ_NUM
+---
+
+# NAME
+
+ibv_get_srq_num - return the SRQ number associated with the given shared
+receive queue (SRQ)
+
+# SYNOPSIS
+
+```c
+#include <infiniband/verbs.h>
+
+int ibv_get_srq_num(struct ibv_srq *srq, uint32_t *srq_num);
+```
+
+# DESCRIPTION
+
+**ibv_get_srq_num()** retrieves the SRQ number associated with the given
+shared receive queue. The argument *srq* is an ibv_srq struct, as defined in
+<infiniband/verbs.h>. *srq_num* is an output parameter that holds the returned
+SRQ number.
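+
+A minimal usage sketch; the SRQ *srq* is assumed to have been created already
+(e.g. with **ibv_create_srq_ex()**), and the error handling is illustrative:
+
+```c
+uint32_t srq_num;
+
+if (ibv_get_srq_num(srq, &srq_num))
+	fprintf(stderr, "Failed to get the SRQ number\n");
+else
+	printf("SRQ number: 0x%x\n", srq_num);
+```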
+ + +# RETURN VALUE + +**ibv_get_srq_num()** returns 0 on success, or the value of errno on failure +(which indicates the failure reason). + +# SEE ALSO + +**ibv_alloc_pd**(3), +**ibv_create_srq_ex**(3), +**ibv_modify_srq**(3) + +# AUTHOR + +Yishai Hadas <yishaih@mellanox.com> diff --git a/libibverbs/man/ibv_inc_rkey.3.md b/libibverbs/man/ibv_inc_rkey.3.md new file mode 100644 index 0000000..44ba8e1 --- /dev/null +++ b/libibverbs/man/ibv_inc_rkey.3.md @@ -0,0 +1,43 @@ +--- +date: 2015-01-29 +footer: libibverbs +header: "Libibverbs Programmer's Manual" +layout: page +license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md' +section: 3 +title: IBV_INC_RKEY +--- + +# NAME + +ibv_inc_rkey - creates a new rkey from the given one + +# SYNOPSIS + +```c +#include <infiniband/verbs.h> + +uint32_t ibv_inc_rkey(uint32_t rkey); +``` + +# DESCRIPTION + +**ibv_inc_rkey()** Increases the 8 LSB of *rkey* and returns the new value. + + +# RETURN VALUE + +**ibv_inc_rkey()** returns the new rkey. + +# NOTES + + +The verb generates a new rkey that is different from the previous one on its +tag part but has the same index (bits 0xffffff00). A use case for this verb +can be to create a new rkey from a Memory window's rkey when binding it to a +Memory region. + +# AUTHORS + +Majd Dibbiny <majd@mellanox.com>, +Yishai Hadas <yishaih@mellanox.com> diff --git a/libibverbs/man/ibv_modify_cq.3 b/libibverbs/man/ibv_modify_cq.3 new file mode 100644 index 0000000..d15d2cd --- /dev/null +++ b/libibverbs/man/ibv_modify_cq.3 @@ -0,0 +1,48 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH IBV_MODIFY_CQ 3 2017-10-20 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_modify_cq \- modify a completion queue (CQ) +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs.h> +.sp +.BI "int ibv_modify_cq(struct ibv_cq " *cq ", struct ibv_modify_cq_attr "*cq_attr "); +.sp +.fi +.SH "DESCRIPTION" +.B ibv_modify_cq() +modify a CQ +.I cq\fR. +The argument +.I cq_attr +is an ibv_modify_cq_attr struct, as defined in <infiniband/verbs.h>. +.PP +.nf +struct ibv_moderate_cq { +.in +8 +uint16_t cq_count; /* number of completions per event */ +uint16_t cq_period; /* in micro seconds */ +.in -8 +}; + +struct ibv_modify_cq_attr { +.in +8 + uint32_t attr_mask; + struct ibv_moderate_cq moderate; +.in -8 +}; +.fi +.PP +The function +.B ibv_modify_cq() +will modify the CQ, based on the given +.I cq_attr\fB\fR->attr_mask +.SH "RETURN VALUE" +returns 0 on success, or the value of errno on failure (which indicates the failure reason). +.SH "SEE ALSO" +.BR ibv_create_cq (3) +.SH "AUTHORS" +.TP +Yonatan Cohen <yonatanc@mellanox.com> diff --git a/libibverbs/man/ibv_modify_qp.3 b/libibverbs/man/ibv_modify_qp.3 new file mode 100644 index 0000000..fd85964 --- /dev/null +++ b/libibverbs/man/ibv_modify_qp.3 @@ -0,0 +1,187 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH IBV_MODIFY_QP 3 2006-10-31 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_modify_qp \- modify the attributes of a queue pair (QP) +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs.h> +.sp +.BI "int ibv_modify_qp(struct ibv_qp " "*qp" ", struct ibv_qp_attr " "*attr" , +.BI " int " "attr_mask" ); +.fi +.SH "DESCRIPTION" +.B ibv_modify_qp() +modifies the attributes of QP +.I qp +with the attributes in +.I attr +according to the mask +.I attr_mask\fR. 
+The argument \fIattr\fR is an ibv_qp_attr struct, as defined in <infiniband/verbs.h>. +.PP +.nf +struct ibv_qp_attr { +.in +8 +enum ibv_qp_state qp_state; /* Move the QP to this state */ +enum ibv_qp_state cur_qp_state; /* Assume this is the current QP state */ +enum ibv_mtu path_mtu; /* Path MTU (valid only for RC/UC QPs) */ +enum ibv_mig_state path_mig_state; /* Path migration state (valid if HCA supports APM) */ +uint32_t qkey; /* Q_Key for the QP (valid only for UD QPs) */ +uint32_t rq_psn; /* PSN for receive queue (valid only for RC/UC QPs) */ +uint32_t sq_psn; /* PSN for send queue (valid only for RC/UC QPs) */ +uint32_t dest_qp_num; /* Destination QP number (valid only for RC/UC QPs) */ +int qp_access_flags; /* Mask of enabled remote access operations (valid only for RC/UC QPs) */ +struct ibv_qp_cap cap; /* QP capabilities (valid if HCA supports QP resizing) */ +struct ibv_ah_attr ah_attr; /* Primary path address vector (valid only for RC/UC QPs) */ +struct ibv_ah_attr alt_ah_attr; /* Alternate path address vector (valid only for RC/UC QPs) */ +uint16_t pkey_index; /* Primary P_Key index */ +uint16_t alt_pkey_index; /* Alternate P_Key index */ +uint8_t en_sqd_async_notify; /* Enable SQD.drained async notification (Valid only if qp_state is SQD) */ +uint8_t sq_draining; /* Is the QP draining? Irrelevant for ibv_modify_qp() */ +uint8_t max_rd_atomic; /* Number of outstanding RDMA reads & atomic operations on the destination QP (valid only for RC QPs) */ +uint8_t max_dest_rd_atomic; /* Number of responder resources for handling incoming RDMA reads & atomic operations (valid only for RC QPs) */ +uint8_t min_rnr_timer; /* Minimum RNR NAK timer (valid only for RC QPs) */ +uint8_t port_num; /* Primary port number */ +uint8_t timeout; /* Local ack timeout for primary path (valid only for RC QPs) */ +uint8_t retry_cnt; /* Retry count (valid only for RC QPs) */ +uint8_t rnr_retry; /* RNR retry (valid only for RC QPs) */ +uint8_t alt_port_num; /* Alternate port number */ +uint8_t alt_timeout; /* Local ack timeout for alternate path (valid only for RC QPs) */ +uint32_t rate_limit; /* Rate limit in kbps for packet pacing */ +.in -8 +}; +.fi +.PP +For details on struct ibv_qp_cap see the description of +.B ibv_create_qp()\fR. +For details on struct ibv_ah_attr see the description of +.B ibv_create_ah()\fR. +.PP +The argument +.I attr_mask +specifies the QP attributes to be modified. 
+The argument is either 0 or the bitwise OR of one or more of the following flags:
+.PP
+.TP
+.B IBV_QP_STATE \fR Modify qp_state
+.TP
+.B IBV_QP_CUR_STATE \fR Set cur_qp_state
+.TP
+.B IBV_QP_EN_SQD_ASYNC_NOTIFY \fR Set en_sqd_async_notify
+.TP
+.B IBV_QP_ACCESS_FLAGS \fR Set qp_access_flags
+.TP
+.B IBV_QP_PKEY_INDEX \fR Set pkey_index
+.TP
+.B IBV_QP_PORT \fR Set port_num
+.TP
+.B IBV_QP_QKEY \fR Set qkey
+.TP
+.B IBV_QP_AV \fR Set ah_attr
+.TP
+.B IBV_QP_PATH_MTU \fR Set path_mtu
+.TP
+.B IBV_QP_TIMEOUT \fR Set timeout
+.TP
+.B IBV_QP_RETRY_CNT \fR Set retry_cnt
+.TP
+.B IBV_QP_RNR_RETRY \fR Set rnr_retry
+.TP
+.B IBV_QP_RQ_PSN \fR Set rq_psn
+.TP
+.B IBV_QP_MAX_QP_RD_ATOMIC \fR Set max_rd_atomic
+.TP
+.B IBV_QP_ALT_PATH \fR Set the alternative path via: alt_ah_attr, alt_pkey_index, alt_port_num, alt_timeout
+.TP
+.B IBV_QP_MIN_RNR_TIMER \fR Set min_rnr_timer
+.TP
+.B IBV_QP_SQ_PSN \fR Set sq_psn
+.TP
+.B IBV_QP_MAX_DEST_RD_ATOMIC \fR Set max_dest_rd_atomic
+.TP
+.B IBV_QP_PATH_MIG_STATE \fR Set path_mig_state
+.TP
+.B IBV_QP_CAP \fR Set cap
+.TP
+.B IBV_QP_DEST_QPN \fR Set dest_qp_num
+.TP
+.B IBV_QP_RATE_LIMIT \fR Set rate_limit
+.SH "RETURN VALUE"
+.B ibv_modify_qp()
+returns 0 on success, or the value of errno on failure (which indicates the failure reason).
+.SH "NOTES"
+If any of the modify attributes or the modify mask are invalid, none
+of the attributes will be modified (including the QP state).
+.PP
+Not all devices support resizing QPs. To check if a device supports it, check if the
+.B IBV_DEVICE_RESIZE_MAX_WR
+bit is set in the device capabilities flags.
+.PP
+Not all devices support alternate paths. To check if a device supports it, check if the
+.B IBV_DEVICE_AUTO_PATH_MIG
+bit is set in the device capabilities flags.
+.PP
+The following tables indicate for each QP Transport Service Type, the
+minimum list of attributes that must be changed upon transitioning QP
+state from: Reset \-\-> Init \-\-> RTR \-\-> RTS.
+.PP
+.nf
+For QP Transport Service Type \fB IBV_QPT_UD\fR:
+.sp
+Next state     Required attributes
+\-\-\-\-\-\-\-\-\-\-     \-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-
+Init           \fB IBV_QP_STATE, IBV_QP_PKEY_INDEX, IBV_QP_PORT, \fR
+               \fB IBV_QP_QKEY \fR
+RTR            \fB IBV_QP_STATE \fR
+RTS            \fB IBV_QP_STATE, IBV_QP_SQ_PSN \fR
+.fi
+.PP
+.nf
+For QP Transport Service Type \fB IBV_QPT_UC\fR:
+.sp
+Next state     Required attributes
+\-\-\-\-\-\-\-\-\-\-     \-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-
+Init           \fB IBV_QP_STATE, IBV_QP_PKEY_INDEX, IBV_QP_PORT, \fR
+               \fB IBV_QP_ACCESS_FLAGS \fR
+RTR            \fB IBV_QP_STATE, IBV_QP_AV, IBV_QP_PATH_MTU, \fR
+               \fB IBV_QP_DEST_QPN, IBV_QP_RQ_PSN \fR
+RTS            \fB IBV_QP_STATE, IBV_QP_SQ_PSN \fR
+.fi
+.PP
+.nf
+For QP Transport Service Type \fB IBV_QPT_RC\fR:
+.sp
+Next state     Required attributes
+\-\-\-\-\-\-\-\-\-\-     \-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-
+Init           \fB IBV_QP_STATE, IBV_QP_PKEY_INDEX, IBV_QP_PORT, \fR
+               \fB IBV_QP_ACCESS_FLAGS \fR
+RTR            \fB IBV_QP_STATE, IBV_QP_AV, IBV_QP_PATH_MTU, \fR
+               \fB IBV_QP_DEST_QPN, IBV_QP_RQ_PSN, \fR
+               \fB IBV_QP_MAX_DEST_RD_ATOMIC, IBV_QP_MIN_RNR_TIMER \fR
+RTS            \fB IBV_QP_STATE, IBV_QP_SQ_PSN, IBV_QP_MAX_QP_RD_ATOMIC, \fR
+               \fB IBV_QP_RETRY_CNT, IBV_QP_RNR_RETRY, IBV_QP_TIMEOUT \fR
+.fi
+.PP
+.nf
+For QP Transport Service Type \fB IBV_QPT_RAW_PACKET\fR:
+.sp
+Next state     Required attributes
+\-\-\-\-\-\-\-\-\-\-     \-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-
+Init           \fB IBV_QP_STATE, IBV_QP_PORT\fR
+RTR            \fB IBV_QP_STATE\fR
+RTS            \fB IBV_QP_STATE\fR
+.fi
+.PP
+If the port flag IBV_QPF_GRH_REQUIRED is set then
+ah_attr and alt_ah_attr
+must be passed with a definition of 'struct ibv_ah_attr { .is_global = 1; .grh = {...}; }'.
+.PP
+.SH "SEE ALSO"
+.BR ibv_create_qp (3),
+.BR ibv_destroy_qp (3),
+.BR ibv_query_qp (3),
+.BR ibv_create_ah (3)
+.SH "AUTHORS"
+.TP
+Dotan Barak <dotanba@gmail.com>
diff --git a/libibverbs/man/ibv_modify_qp_rate_limit.3 b/libibverbs/man/ibv_modify_qp_rate_limit.3
new file mode 100644
index 0000000..87228ba
--- /dev/null
+++ b/libibverbs/man/ibv_modify_qp_rate_limit.3
@@ -0,0 +1,68 @@
+.\" -*- nroff -*-
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.\"
+.TH IBV_MODIFY_QP_RATE_LIMIT 3 2018-01-09 libibverbs "Libibverbs Programmer's Manual"
+.SH "NAME"
+ibv_modify_qp_rate_limit \- modify the send rate limit attributes of a queue pair (QP)
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/verbs.h>
+.sp
+.BI "int ibv_modify_qp_rate_limit(struct ibv_qp " "*qp" ", struct ibv_qp_rate_limit_attr " "*attr");
+.fi
+.SH "DESCRIPTION"
+.B ibv_modify_qp_rate_limit()
+modifies the send rate limiting packet pacing attributes of QP
+.I qp
+with the attributes in
+.I attr\fR.
+The argument \fIattr\fR is an ibv_qp_rate_limit_attr struct, as defined in <infiniband/verbs.h>.
+.PP
+The
+.I rate_limit
+defines the maximum send rate of this QP as long as the link is not blocked and there are work requests in the send queue.
+.PP
+Finer control for shaping the rate limit of a QP is achieved by defining
+.I max_burst_sz\fR,
+the maximum size in bytes of a single burst, and
+.I typical_pkt_sz\fR,
+the typical packet size in bytes. These allow the device to adjust the inter-burst gap delay required to correctly shape the scheduling of sends to the wire in order to meet the requested application requirements.
+.PP +Setting a value of 0 for +.I max_burst_sz +or +.I typical_pkt_sz +will use the devices defaults. +.I typical_pkt_sz +will default to the port's MTU value. +.PP +.nf +struct ibv_qp_rate_limit_attr { +.in +8 +uint32_t rate_limit; /* kbps */ +uint32_t max_burst_sz; /* bytes */ +uint16_t typical_pkt_sz; /* bytes */ +.in -8 +}; +.fi +.PP +.SH "RETURN VALUE" +.B ibv_modify_qp_rate_limit() +returns 0 on success, or the value of errno on failure (which indicates the failure reason). +.SH "ERRORS" +.SS EINVAL +Invalid arguments. +.SS EOPNOTSUPP +Function is not implemented for this device. +(ENOSYS may sometimes be returned by old versions of libibverbs). +.PP +.SH "SEE ALSO" +.BR ibv_create_qp (3), +.BR ibv_destroy_qp (3), +.BR ibv_modify_qp (3), +.BR ibv_query_qp (3) +.SH "AUTHORS" +.TP +Alex Rosenbaum <alexr@mellanox.com> +.TP +Bodong Wang <bodong@mellanox.com> diff --git a/libibverbs/man/ibv_modify_srq.3 b/libibverbs/man/ibv_modify_srq.3 new file mode 100644 index 0000000..5233922 --- /dev/null +++ b/libibverbs/man/ibv_modify_srq.3 @@ -0,0 +1,64 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH IBV_MODIFY_SRQ 3 2006-10-31 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_modify_srq \- modify attributes of a shared receive queue (SRQ) +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs.h> +.sp +.BI "int ibv_modify_srq(struct ibv_srq " "*srq" , +.BI " struct ibv_srq_attr " "*srq_attr" , +.BI " int " "srq_attr_mask" ); +.fi +.SH "DESCRIPTION" +.B ibv_modify_srq() +modifies the attributes of SRQ +.I srq +with the attributes in +.I srq_attr +according to the mask +.I srq_attr_mask\fR. +The argument \fIsrq_attr\fR is an ibv_srq_attr struct, as defined in <infiniband/verbs.h>. +.PP +.nf +struct ibv_srq_attr { +.in +8 +uint32_t max_wr; /* maximum number of outstanding work requests (WRs) in the SRQ */ +uint32_t max_sge; /* number of scatter elements per WR (irrelevant for ibv_modify_srq) */ +uint32_t srq_limit; /* the limit value of the SRQ */ +.in -8 +}; +.fi +.PP +The argument +.I srq_attr_mask +specifies the SRQ attributes to be modified. +The argument is either 0 or the bitwise OR of one or more of the following flags: +.PP +.TP +.B IBV_SRQ_MAX_WR \fR Resize the SRQ +.TP +.B IBV_SRQ_LIMIT \fR Set the SRQ limit +.SH "RETURN VALUE" +.B ibv_modify_srq() +returns 0 on success, or the value of errno on failure (which indicates the failure reason). +.SH "NOTES" +If any of the modify attributes is invalid, none of the attributes will be modified. +.PP +Not all devices support resizing SRQs. To check if a device supports it, check if the +.B IBV_DEVICE_SRQ_RESIZE +bit is set in the device capabilities flags. +.PP +Modifying the srq_limit arms the SRQ to produce an +.B IBV_EVENT_SRQ_LIMIT_REACHED +"low watermark" asynchronous event once the number of WRs in the SRQ drops below srq_limit. +.SH "SEE ALSO" +.BR ibv_query_device (3), +.BR ibv_create_srq (3), +.BR ibv_destroy_srq (3), +.BR ibv_query_srq (3) +.SH "AUTHORS" +.TP +Dotan Barak <dotanba@gmail.com> diff --git a/libibverbs/man/ibv_modify_wq.3 b/libibverbs/man/ibv_modify_wq.3 new file mode 100644 index 0000000..1972ec2 --- /dev/null +++ b/libibverbs/man/ibv_modify_wq.3 @@ -0,0 +1,46 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH IBV_MODIFY_WQ 3 2016-07-27 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_modify_wq \- Modify a Work Queue (WQ). 
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/verbs.h>
+.sp
+.BI "int ibv_modify_wq(struct ibv_wq " "*wq,"
+.BI "                  struct ibv_wq_attr " "*wq_attr" );
+.sp
+.fi
+.SH "DESCRIPTION"
+.B ibv_modify_wq()
+modifies the WQ
+.I wq\fR.
+The argument
+.I wq_attr
+is an ibv_wq_attr struct, as defined in <infiniband/verbs.h>.
+.PP
+.nf
+struct ibv_wq_attr {
+.in +8
+uint32_t attr_mask; /* Use enum ibv_wq_attr_mask */
+enum ibv_wq_state wq_state; /* Move to this state */
+enum ibv_wq_state curr_wq_state; /* Assume this is the current state */
+uint32_t flags; /* Flags values to modify, use enum ibv_wq_flags */
+uint32_t flags_mask; /* Which flags to modify, use enum ibv_wq_flags */
+.in -8
+};
+.fi
+.PP
+The function
+.B ibv_modify_wq()
+will modify the WQ based on the given
+.I wq_attr\fB\fR->attr_mask
+.SH "RETURN VALUE"
+.B ibv_modify_wq()
+returns 0 on success, or the value of errno on failure (which indicates the failure reason).
+.SH "SEE ALSO"
+.BR ibv_create_wq (3),
+.BR ibv_destroy_wq (3),
+.SH "AUTHORS"
+.TP
+Yishai Hadas <yishaih@mellanox.com>
diff --git a/libibverbs/man/ibv_open_device.3 b/libibverbs/man/ibv_open_device.3
new file mode 100644
index 0000000..3a12d2d
--- /dev/null
+++ b/libibverbs/man/ibv_open_device.3
@@ -0,0 +1,52 @@
+.\" -*- nroff -*-
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.\"
+.TH IBV_OPEN_DEVICE 3 2006-10-31 libibverbs "Libibverbs Programmer's Manual"
+.SH "NAME"
+ibv_open_device, ibv_close_device \- open and close an RDMA device context
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/verbs.h>
+.sp
+.BI "struct ibv_context *ibv_open_device(struct ibv_device " "*device" ");
+.sp
+.BI "int ibv_close_device(struct ibv_context " "*context" ");
+.fi
+.SH "DESCRIPTION"
+.B ibv_open_device()
+opens the device
+.I device
+and creates a context for further use.
+.PP
+.B ibv_close_device()
+closes the device context
+.I context\fR.
+.SH "RETURN VALUE"
+.B ibv_open_device()
+returns a pointer to the allocated device context, or NULL if the request fails.
+.PP
+.B ibv_close_device()
+returns 0 on success, \-1 on failure.
+.SH "NOTES"
+.B ibv_close_device()
+does not release all the resources allocated using context
+.I context\fR.
+To avoid resource leaks, the user should release all associated
+resources before closing a context.
+
+Setting the environment variable \fBRDMAV_ALLOW_DISASSOC_DESTROY\fR tells the
+library to treat an EIO from destroy commands as success, since the kernel
+resources were already released. This prevents memory leaks in user space
+upon device disassociation. Applications using this flag cannot
+call ibv_get_cq_event or ibv_get_async_event concurrently with any call to an
+object destruction function.
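+.SH "EXAMPLES"
+A minimal open/close sketch; picking the first device and the bare error
+handling are illustrative only:
+.PP
+.nf
+struct ibv_device **list = ibv_get_device_list(NULL);
+struct ibv_context *ctx;
+
+if (!list || !list[0])
+        return 1;       /* no RDMA devices found */
+
+ctx = ibv_open_device(list[0]);
+ibv_free_device_list(list);
+if (!ctx)
+        return 1;
+
+/* ... use the device context ... */
+
+if (ibv_close_device(ctx))
+        return 1;
+.fi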
+
+.SH "SEE ALSO"
+.BR ibv_get_device_list (3),
+.BR ibv_query_device (3),
+.BR ibv_query_port (3),
+.BR ibv_query_gid (3),
+.BR ibv_query_pkey (3)
+.SH "AUTHORS"
+.TP
+Dotan Barak <dotanba@gmail.com>
diff --git a/libibverbs/man/ibv_open_qp.3 b/libibverbs/man/ibv_open_qp.3
new file mode 100644
index 0000000..e25e933
--- /dev/null
+++ b/libibverbs/man/ibv_open_qp.3
@@ -0,0 +1,52 @@
+.\" -*- nroff -*-
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.\"
+.TH IBV_OPEN_QP 3 2011-08-12 libibverbs "Libibverbs Programmer's Manual"
+.SH "NAME"
+ibv_open_qp \- open a shareable queue pair (QP)
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/verbs.h>
+.sp
+.BI "struct ibv_qp *ibv_open_qp(struct ibv_context " "*context" ,
+.BI "                           struct ibv_qp_open_attr " "*qp_open_attr" );
+.fi
+.SH "DESCRIPTION"
+.B ibv_open_qp()
+opens an existing queue pair (QP) associated with the extended protection domain
+.I xrcd\fR.
+The argument
+.I qp_open_attr
+is an ibv_qp_open_attr struct, as defined in <infiniband/verbs.h>.
+.PP
+.nf
+struct ibv_qp_open_attr {
+.in +8
+uint32_t comp_mask; /* Identifies valid fields */
+uint32_t qp_num; /* QP number */
+struct ibv_xrcd *xrcd; /* XRC domain */
+void *qp_context; /* User defined opaque value */
+enum ibv_qp_type qp_type; /* QP transport service type */
+.in -8
+};
+.fi
+.PP
+.B ibv_destroy_qp()
+closes the opened QP
+.I qp
+and destroys the underlying QP if it has no other references.
+.SH "RETURN VALUE"
+.B ibv_open_qp()
+returns a pointer to the opened QP, or NULL if the request fails.
+Check the QP number (\fBqp_num\fR) in the returned QP.
+.SH "NOTES"
+.B ibv_open_qp()
+will fail if it is asked to open a QP that does not exist within
+the xrcd with the specified qp_num and qp_type.
+.SH "SEE ALSO"
+.BR ibv_alloc_pd (3),
+.BR ibv_create_qp (3),
+.BR ibv_create_qp_ex (3),
+.BR ibv_modify_qp (3),
+.BR ibv_query_qp (3)
+.SH "AUTHORS"
+.TP
+Sean Hefty <sean.hefty@intel.com>
diff --git a/libibverbs/man/ibv_open_xrcd.3 b/libibverbs/man/ibv_open_xrcd.3
new file mode 100644
index 0000000..239af2d
--- /dev/null
+++ b/libibverbs/man/ibv_open_xrcd.3
@@ -0,0 +1,77 @@
+.\" -*- nroff -*-
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.\"
+.TH IBV_OPEN_XRCD 3 2011-06-17 libibverbs "Libibverbs Programmer's Manual"
+.SH "NAME"
+ibv_open_xrcd, ibv_close_xrcd \- open or close an XRC protection domain (XRCD)
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/verbs.h>
+.sp
+.BI "struct ibv_xrcd *ibv_open_xrcd(struct ibv_context " "*context" ","
+.BI "                               struct ibv_xrcd_init_attr " "*xrcd_init_attr" );
+.sp
+.BI "int ibv_close_xrcd(struct ibv_xrcd " "*xrcd" );
+.fi
+.SH "DESCRIPTION"
+.B ibv_open_xrcd()
+opens an XRC domain for the RDMA device context
+.I context\fR.
+The argument
+.I xrcd_init_attr
+is an ibv_xrcd_init_attr struct, as defined in <infiniband/verbs.h>.
+.PP
+.nf
+struct ibv_xrcd_init_attr {
+.in +8
+uint32_t comp_mask; /* Identifies valid fields */
+int fd;
+int oflag;
+.in -8
+};
+.fi
+.PP
+.I fd
+is the file descriptor to associate with the XRCD.
+.I oflag
+describes the desired creation attributes. It is a bitwise OR of zero or more
+of the following flags:
+.PP
+.TP
+.B O_CREAT
+Indicates that an XRCD should be created and associated with the inode referenced
+by the given fd. If the XRCD exists, this flag has no effect except as noted under
+.BR O_EXCL
+below.\fR
+.TP
+.B O_EXCL
+If
+.BR O_EXCL
+and
+.BR O_CREAT
+are set, open will fail if an XRCD associated with the inode exists.
+.PP
+If
+.I fd
+equals -1, no inode is associated with the XRCD.
To indicate that XRCD should be created, use +.I oflag += +.B O_CREAT\fR. +.PP +.B ibv_close_xrcd() +closes the XRCD +.I xrcd\fR. +If this is the last reference, the XRCD will be destroyed. +.SH "RETURN VALUE" +.B ibv_open_xrcd() +returns a pointer to the opened XRCD, or NULL if the request fails. +.PP +.B ibv_close_xrcd() +returns 0 on success, or the value of errno on failure (which indicates the +failure reason). +.SH "NOTES" +.B ibv_close_xrcd() +may fail if any other resource is still associated with the XRCD being closed. +.SH "SEE ALSO" +.BR ibv_create_srq_ex (3), +.BR ibv_create_qp_ex (3), +.SH "AUTHORS" +.TP +Sean Hefty <sean.hefty@intel.com> diff --git a/libibverbs/man/ibv_poll_cq.3 b/libibverbs/man/ibv_poll_cq.3 new file mode 100644 index 0000000..957fd15 --- /dev/null +++ b/libibverbs/man/ibv_poll_cq.3 @@ -0,0 +1,92 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH IBV_POLL_CQ 3 2006-10-31 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_poll_cq \- poll a completion queue (CQ) +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs.h> +.sp +.BI "int ibv_poll_cq(struct ibv_cq " "*cq" ", int " "num_entries" , +.BI " struct ibv_wc " "*wc" ); +.fi +.SH "DESCRIPTION" +.B ibv_poll_cq() +polls the CQ +.I cq +for work completions and returns the first +.I num_entries +(or all available completions if the CQ contains fewer than this number) in the array +.I wc\fR. +The argument +.I wc +is a pointer to an array of ibv_wc structs, as defined in <infiniband/verbs.h>. +.PP +.nf +struct ibv_wc { +.in +8 +uint64_t wr_id; /* ID of the completed Work Request (WR) */ +enum ibv_wc_status status; /* Status of the operation */ +enum ibv_wc_opcode opcode; /* Operation type specified in the completed WR */ +uint32_t vendor_err; /* Vendor error syndrome */ +uint32_t byte_len; /* Number of bytes transferred */ +union { +.in +8 +__be32 imm_data; /* Immediate data (in network byte order) */ +uint32_t invalidated_rkey; /* Local RKey that was invalidated */ +.in -8 +}; +uint32_t qp_num; /* Local QP number of completed WR */ +uint32_t src_qp; /* Source QP number (remote QP number) of completed WR (valid only for UD QPs) */ +int wc_flags; /* Flags of the completed WR */ +uint16_t pkey_index; /* P_Key index (valid only for GSI QPs) */ +uint16_t slid; /* Source LID */ +uint8_t sl; /* Service Level */ +uint8_t dlid_path_bits; /* DLID path bits (not applicable for multicast messages) */ +.in -8 +}; +.sp +.fi +.PP +The attribute wc_flags describes the properties of the work completion. +It is either 0 or the bitwise OR of one or more of the following flags: +.PP +.TP +.B IBV_WC_GRH \fR GRH is present (valid only for UD QPs) +.TP +.B IBV_WC_WITH_IMM \fR Immediate data value is valid +.TP +.B IBV_WC_WITH_INV \fR Invalidated RKey data value is valid (cannot be combined with IBV_WC_WITH_IMM) +.TP +.B IBV_WC_IP_CSUM_OK \fR TCP/UDP checksum over IPv4 and IPv4 header checksum are +verified. +Valid only when \fBdevice_cap_flags\fR in device_attr indicates current QP is +supported by checksum offload. +.PP +Not all +.I wc +attributes are always valid. If the completion status is other than +.B IBV_WC_SUCCESS\fR, +only the following attributes are valid: wr_id, status, qp_num, and vendor_err. +.SH "RETURN VALUE" +On success, +.B ibv_poll_cq() +returns a non-negative value equal to the number of completions +found. On failure, a negative value is returned. +.SH "NOTES" +.PP +Each polled completion is removed from the CQ and cannot be returned to it. 
+.PP
+The user should consume work completions at a rate that prevents the CQ
+from overrunning. In case of a CQ overrun, the async event
+.B IBV_EVENT_CQ_ERR
+will be triggered, and the CQ cannot be used.
+.PP
+IBV_WC_DRIVER1 will be reported as a response to IBV_WR_DRIVER1 opcode.
+.SH "SEE ALSO"
+.BR ibv_post_send (3),
+.BR ibv_post_recv (3)
+.SH "AUTHORS"
+.TP
+Dotan Barak <dotanba@gmail.com>
diff --git a/libibverbs/man/ibv_post_recv.3 b/libibverbs/man/ibv_post_recv.3
new file mode 100644
index 0000000..affca74
--- /dev/null
+++ b/libibverbs/man/ibv_post_recv.3
@@ -0,0 +1,77 @@
+.\" -*- nroff -*-
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.\"
+.TH IBV_POST_RECV 3 2006-10-31 libibverbs "Libibverbs Programmer's Manual"
+.SH "NAME"
+ibv_post_recv \- post a list of work requests (WRs) to a receive queue
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/verbs.h>
+.sp
+.BI "int ibv_post_recv(struct ibv_qp " "*qp" ", struct ibv_recv_wr " "*wr" ,
+.BI "                  struct ibv_recv_wr " "**bad_wr" );
+.fi
+.SH "DESCRIPTION"
+.B ibv_post_recv()
+posts the linked list of work requests (WRs) starting with
+.I wr
+to the receive queue of the queue pair
+.I qp\fR.
+It stops processing WRs from this list at the first failure (that can
+be detected immediately while requests are being posted), and returns
+this failing WR through
+.I bad_wr\fR.
+.PP
+The argument
+.I wr
+is an ibv_recv_wr struct, as defined in <infiniband/verbs.h>.
+.PP
+.nf
+struct ibv_recv_wr {
+.in +8
+uint64_t wr_id; /* User defined WR ID */
+struct ibv_recv_wr *next; /* Pointer to next WR in list, NULL if last WR */
+struct ibv_sge *sg_list; /* Pointer to the s/g array */
+int num_sge; /* Size of the s/g array */
+.in -8
+};
+.sp
+.nf
+struct ibv_sge {
+.in +8
+uint64_t addr; /* Start address of the local memory buffer */
+uint32_t length; /* Length of the buffer */
+uint32_t lkey; /* Key of the local Memory Region */
+.in -8
+};
+.fi
+.SH "RETURN VALUE"
+.B ibv_post_recv()
+returns 0 on success, or the value of errno on failure (which indicates the failure reason).
+.SH "NOTES"
+The buffers used by a WR can only be safely reused after the WR has been
+fully executed and a work completion has been retrieved
+from the corresponding completion queue (CQ).
+.PP
+If the QP
+.I qp
+is associated with a shared receive queue, you must use the function
+.B ibv_post_srq_recv()\fR,
+and not
+.B ibv_post_recv()\fR,
+since the QP's own receive queue will not be used.
+.PP
+If a WR is being posted to a UD QP, the Global Routing Header (GRH) of
+the incoming message will be placed in the first 40 bytes of the
+buffer(s) in the scatter list. If no GRH is present in the incoming
+message, then the first bytes will be undefined. This means that in
+all cases, the actual data of the incoming message will start at an
+offset of 40 bytes into the buffer(s) in the scatter list.
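+.SH "EXAMPLES"
+A minimal sketch that posts a single receive buffer; the QP \fIqp\fR, the
+registered memory region \fImr\fR, its buffer \fIbuf\fR and the constant
+BUF_SIZE are assumed to exist:
+.PP
+.nf
+struct ibv_sge sge = {
+        .addr   = (uintptr_t) buf,
+        .length = BUF_SIZE,
+        .lkey   = mr->lkey,
+};
+struct ibv_recv_wr wr = {
+        .wr_id   = 1,
+        .sg_list = &sge,
+        .num_sge = 1,
+};
+struct ibv_recv_wr *bad_wr;
+
+if (ibv_post_recv(qp, &wr, &bad_wr))
+        fprintf(stderr, "Failed to post WR %llu\en",
+                (unsigned long long) bad_wr->wr_id);
+.fi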
+.SH "SEE ALSO" +.BR ibv_create_qp (3), +.BR ibv_post_send (3), +.BR ibv_post_srq_recv (3), +.BR ibv_poll_cq (3) +.SH "AUTHORS" +.TP +Dotan Barak <dotanba@gmail.com> diff --git a/libibverbs/man/ibv_post_send.3 b/libibverbs/man/ibv_post_send.3 new file mode 100644 index 0000000..4fb99f7 --- /dev/null +++ b/libibverbs/man/ibv_post_send.3 @@ -0,0 +1,183 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH IBV_POST_SEND 3 2006-10-31 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_post_send \- post a list of work requests (WRs) to a send queue +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs.h> +.sp +.BI "int ibv_post_send(struct ibv_qp " "*qp" ", struct ibv_send_wr " "*wr" , +.BI " struct ibv_send_wr " "**bad_wr" ); +.fi +.SH "DESCRIPTION" +.B ibv_post_send() +posts the linked list of work requests (WRs) starting with +.I wr +to the send queue of the queue pair +.I qp\fR. +It stops processing WRs from this list at the first failure (that can +be detected immediately while requests are being posted), and returns +this failing WR through +.I bad_wr\fR. +.PP +The argument +.I wr +is an ibv_send_wr struct, as defined in <infiniband/verbs.h>. +.PP +.nf +struct ibv_send_wr { +.in +8 +uint64_t wr_id; /* User defined WR ID */ +struct ibv_send_wr *next; /* Pointer to next WR in list, NULL if last WR */ +struct ibv_sge *sg_list; /* Pointer to the s/g array */ +int num_sge; /* Size of the s/g array */ +enum ibv_wr_opcode opcode; /* Operation type */ +int send_flags; /* Flags of the WR properties */ +union { +.in +8 +__be32 imm_data; /* Immediate data (in network byte order) */ +uint32_t invalidate_rkey; /* Remote rkey to invalidate */ +.in -8 +}; +union { +.in +8 +struct { +.in +8 +uint64_t remote_addr; /* Start address of remote memory buffer */ +uint32_t rkey; /* Key of the remote Memory Region */ +.in -8 +} rdma; +struct { +.in +8 +uint64_t remote_addr; /* Start address of remote memory buffer */ +uint64_t compare_add; /* Compare operand */ +uint64_t swap; /* Swap operand */ +uint32_t rkey; /* Key of the remote Memory Region */ +.in -8 +} atomic; +struct { +.in +8 +struct ibv_ah *ah; /* Address handle (AH) for the remote node address */ +uint32_t remote_qpn; /* QP number of the destination QP */ +uint32_t remote_qkey; /* Q_Key number of the destination QP */ +.in -8 +} ud; +.in -8 +} wr; +union { +.in +8 +struct { +.in +8 +uint32_t remote_srqn; /* Number of the remote SRQ */ +.in -8 +} xrc; +.in -8 +} qp_type; +union { +.in +8 +struct { +.in +8 +struct ibv_mw *mw; /* Memory window (MW) of type 2 to bind */ +uint32_t rkey; /* The desired new rkey of the MW */ +struct ibv_mw_bind_info bind_info; /* MW additional bind information */ +.in -8 +} bind_mw; +struct { +.in +8 +void *hdr; /* Pointer address of inline header */ +uint16_t hdr_sz; /* Inline header size */ +uint16_t mss; /* Maximum segment size for each TSO fragment */ +.in -8 +} tso; +.in -8 +}; +.in -8 +}; +.fi +.sp +.nf +struct ibv_mw_bind_info { +.in +8 +struct ibv_mr *mr; /* The Memory region (MR) to bind the MW to */ +uint64_t addr; /* The address the MW should start at */ +uint64_t length; /* The length (in bytes) the MW should span */ +int mw_access_flags; /* Access flags to the MW. 
Use ibv_access_flags */
+.in -8
+};
+.fi
+.sp
+.nf
+struct ibv_sge {
+.in +8
+uint64_t addr; /* Start address of the local memory buffer or number of bytes from the
+ start of the MR for MRs which are IBV_ZERO_BASED */
+uint32_t length; /* Length of the buffer */
+uint32_t lkey; /* Key of the local Memory Region */
+.in -8
+};
+.fi
+.PP
+Each QP Transport Service Type supports a specific set of opcodes, as shown in the following table:
+.PP
+.nf
+OPCODE                      | IBV_QPT_UD | IBV_QPT_UC | IBV_QPT_RC | IBV_QPT_XRC_SEND | IBV_QPT_RAW_PACKET
+\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-+\-\-\-\-\-\-\-\-\-\-\-\-+\-\-\-\-\-\-\-\-\-\-\-\-+\-\-\-\-\-\-\-\-\-\-\-\-+\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-+\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-
+IBV_WR_SEND                 |     X      |     X      |     X      |         X        |         X
+IBV_WR_SEND_WITH_IMM        |     X      |     X      |     X      |         X        |
+IBV_WR_RDMA_WRITE           |            |     X      |     X      |         X        |
+IBV_WR_RDMA_WRITE_WITH_IMM  |            |     X      |     X      |         X        |
+IBV_WR_RDMA_READ            |            |            |     X      |         X        |
+IBV_WR_ATOMIC_CMP_AND_SWP   |            |            |     X      |         X        |
+IBV_WR_ATOMIC_FETCH_AND_ADD |            |            |     X      |         X        |
+IBV_WR_LOCAL_INV            |            |     X      |     X      |         X        |
+IBV_WR_BIND_MW              |            |     X      |     X      |         X        |
+IBV_WR_SEND_WITH_INV        |            |     X      |     X      |         X        |
+IBV_WR_TSO                  |     X      |            |            |                  |         X
+.fi
+.PP
+The attribute send_flags describes the properties of the \s-1WR\s0. It is either 0 or the bitwise \s-1OR\s0 of one or more of the following flags:
+.PP
+.TP
+.B IBV_SEND_FENCE \fR Set the fence indicator. Valid only for QPs with Transport Service Type \fBIBV_QPT_RC
+.TP
+.B IBV_SEND_SIGNALED \fR Set the completion notification indicator. Relevant only if QP was created with sq_sig_all=0
+.TP
+.B IBV_SEND_SOLICITED \fR Set the solicited event indicator. Valid only for Send and RDMA Write with immediate
+.TP
+.B IBV_SEND_INLINE \fR Send data in given gather list as inline data
+in a send WQE. Valid only for Send and RDMA Write. The L_Key will not be checked.
+.TP
+.B IBV_SEND_IP_CSUM \fR Offload the IPv4 and TCP/UDP checksum calculation.
+Valid only when \fBdevice_cap_flags\fR in device_attr indicates current QP is
+supported by checksum offload.
+.SH "RETURN VALUE"
+.B ibv_post_send()
+returns 0 on success, or the value of errno on failure (which indicates the failure reason).
+.SH "NOTES"
+The user should not alter or destroy AHs associated with WRs until the
+request is fully executed and a work completion has been retrieved
+from the corresponding completion queue (CQ) to avoid unexpected
+behavior.
+.PP
+The buffers used by a WR can only be safely reused after the WR has been
+fully executed and a work completion has been retrieved
+from the corresponding completion queue (CQ). However, if the
+IBV_SEND_INLINE flag was set, the buffer can be reused immediately
+after the call returns.
+.PP
+IBV_WR_DRIVER1 is an opcode that should be used to issue a specific driver operation.
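+.SH "EXAMPLES"
+A minimal sketch that posts a single signaled send; \fIqp\fR, \fImr\fR,
+\fIbuf\fR and MSG_SIZE are assumed to have been set up already:
+.PP
+.nf
+struct ibv_sge sge = {
+        .addr   = (uintptr_t) buf,
+        .length = MSG_SIZE,
+        .lkey   = mr->lkey,
+};
+struct ibv_send_wr wr = {
+        .wr_id      = 1,
+        .sg_list    = &sge,
+        .num_sge    = 1,
+        .opcode     = IBV_WR_SEND,
+        .send_flags = IBV_SEND_SIGNALED,
+};
+struct ibv_send_wr *bad_wr;
+
+if (ibv_post_send(qp, &wr, &bad_wr))
+        fprintf(stderr, "Failed to post send WR\en");
+.fi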
+.SH "SEE ALSO"
+.BR ibv_create_qp (3),
+.BR ibv_create_ah (3),
+.BR ibv_post_recv (3),
+.BR ibv_post_srq_recv (3),
+.BR ibv_poll_cq (3)
+.SH "AUTHORS"
+.TP
+Dotan Barak <dotanba@gmail.com>
+.TP
+Majd Dibbiny <majd@mellanox.com>
+.TP
+Yishai Hadas <yishaih@mellanox.com>
diff --git a/libibverbs/man/ibv_post_srq_ops.3 b/libibverbs/man/ibv_post_srq_ops.3
new file mode 100644
index 0000000..6bab0c0
--- /dev/null
+++ b/libibverbs/man/ibv_post_srq_ops.3
@@ -0,0 +1,100 @@
+.\" -*- nroff -*-
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.\"
+.TH IBV_POST_SRQ_OPS 3 2017-03-26 libibverbs "Libibverbs Programmer's Manual"
+.SH "NAME"
+ibv_post_srq_ops \- perform configuration manipulations on a special
+shared receive queue (SRQ)
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/verbs.h>
+.sp
+.BI "int ibv_post_srq_ops(struct ibv_srq " "*srq" ", struct ibv_ops_wr " "*wr" ,
+.BI "                     struct ibv_ops_wr " "**bad_wr" );
+.fi
+.SH "DESCRIPTION"
+The
+.B ibv_post_srq_ops()
+function performs a series of offload configuration manipulations on special types of SRQ
+.I srq\fR. Currently it is used to configure tag matching SRQs. The series of
+configuration operations is defined by a linked list of struct ibv_ops_wr elements starting from
+.I wr\fR.
+.PP
+.nf
+struct ibv_ops_wr {
+.in +8
+uint64_t wr_id; /* User defined WR ID */
+/* Pointer to next WR in list, NULL if last WR */
+struct ibv_ops_wr *next;
+enum ibv_ops_wr_opcode opcode; /* From enum ibv_ops_wr_opcode */
+int flags; /* From enum ibv_ops_flags */
+struct {
+.in +8
+/* Number of unexpected messages
+ * handled by SW */
+uint32_t unexpected_cnt;
+/* Input parameter for the DEL opcode
+ * and output parameter for the ADD opcode */
+uint32_t handle;
+struct {
+.in +8
+uint64_t recv_wr_id; /* User defined WR ID for TM_RECV */
+struct ibv_sge *sg_list; /* Pointer to the s/g array */
+int num_sge; /* Size of the s/g array */
+uint64_t tag;
+uint64_t mask; /* Incoming message considered matching if
+ TMH.tag & entry.mask == entry.tag */
+.in -8
+} add;
+.in -8
+} tm;
+.in -8
+};
+.fi
+.PP
+The first part of struct ibv_ops_wr follows the layout of ibv_send_wr.
+The opcode defines the operation to perform; the currently supported values
+are IBV_WR_TAG_ADD, IBV_WR_TAG_DEL and IBV_WR_TAG_SYNC. See below for a
+detailed description.
+.PP
+To allow reliable data delivery, the TM SRQ maintains a special low-level
+synchronization primitive: phase synchronization. Receive-side message
+handling comprises two concurrent activities: posting tagged buffers by
+software and receiving incoming messages by hardware. This process is
+considered coherent only if all unexpected messages received by hardware
+are completely processed in software. To pass the number of processed
+unexpected messages to hardware, the unexpected_cnt field should be used
+and the IBV_OPS_TM_SYNC flag should be set.
+.PP
+To request a WC for tag list operations, the IBV_OPS_SIGNALED flag should be
+passed. In this case a WC will be generated on the TM SRQ's CQ, and the
+provided wr_id will identify the WC.
+.PP
+The opcode IBV_WR_TAG_ADD adds a tag entry to the tag matching list.
+A tag entry consists of an SGE list, tag & mask (matching parameters),
+and a user specified opaque wr_id (passed via the recv_wr_id field), and is
+uniquely identified by a handle (returned by the driver).
+The size of the tag matching list is limited by max_num_tags, and the
+SGE list size by max_sge.
+.PP
+The opcode IBV_WR_TAG_DEL removes a previously added tag entry.
+The handle field should be set to the value returned by a previously
+performed IBV_WR_TAG_ADD operation.
+The operation may fail due to concurrent tag consumption; in this case the
+IBV_WC_TM_ERR status will be returned in the WC.
+.PP
+The opcode IBV_WR_TAG_SYNC may be used if no changes to the matching list are
+required, just to update the unexpected messages counter.
+.PP
+The IBV_WC_TM_SYNC_REQ flag returned in a list operation WC indicates that
+counter synchronization is required. This flag may also be returned by an
+unexpected receive WC, asking for an IBV_WR_TAG_SYNC operation to keep TM
+coherence consistent.
+.SH "RETURN VALUE"
+.B ibv_post_srq_ops()
+returns 0 on success, or the value of errno on failure (which indicates the
+failure reason).
+.SH "SEE ALSO"
+.BR ibv_create_srq_ex (3),
+.SH "AUTHORS"
+.TP
+Artemy Kovalyov <artemyko@mellanox.com>
diff --git a/libibverbs/man/ibv_post_srq_recv.3 b/libibverbs/man/ibv_post_srq_recv.3
new file mode 100644
index 0000000..51d1516
--- /dev/null
+++ b/libibverbs/man/ibv_post_srq_recv.3
@@ -0,0 +1,69 @@
+.\" -*- nroff -*-
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.\"
+.TH IBV_POST_SRQ_RECV 3 2006-10-31 libibverbs "Libibverbs Programmer's Manual"
+.SH "NAME"
+ibv_post_srq_recv \- post a list of work requests (WRs) to a shared receive queue (SRQ)
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/verbs.h>
+.sp
+.BI "int ibv_post_srq_recv(struct ibv_srq " "*srq" ", struct ibv_recv_wr " "*wr" ,
+.BI "                      struct ibv_recv_wr " "**bad_wr" );
+.fi
+.SH "DESCRIPTION"
+.B ibv_post_srq_recv()
+posts the linked list of work requests (WRs) starting with
+.I wr
+to the shared receive queue (SRQ)
+.I srq\fR.
+It stops processing WRs from this list at the first failure (that can
+be detected immediately while requests are being posted), and returns
+this failing WR through
+.I bad_wr\fR.
+.PP
+The argument
+.I wr
+is an ibv_recv_wr struct, as defined in <infiniband/verbs.h>.
+.PP
+.nf
+struct ibv_recv_wr {
+.in +8
+uint64_t wr_id; /* User defined WR ID */
+struct ibv_recv_wr *next; /* Pointer to next WR in list, NULL if last WR */
+struct ibv_sge *sg_list; /* Pointer to the s/g array */
+int num_sge; /* Size of the s/g array */
+.in -8
+};
+.sp
+.nf
+struct ibv_sge {
+.in +8
+uint64_t addr; /* Start address of the local memory buffer */
+uint32_t length; /* Length of the buffer */
+uint32_t lkey; /* Key of the local Memory Region */
+.in -8
+};
+.fi
+.SH "RETURN VALUE"
+.B ibv_post_srq_recv()
+returns 0 on success, or the value of errno on failure (which indicates the failure reason).
+.SH "NOTES"
+The buffers used by a WR can only be safely reused after the WR has been
+fully executed and a work completion has been retrieved
+from the corresponding completion queue (CQ).
+.PP
+If a WR is being posted to a UD QP, the Global Routing Header (GRH) of
+the incoming message will be placed in the first 40 bytes of the
+buffer(s) in the scatter list. If no GRH is present in the incoming
+message, then the first bytes will be undefined. This means that in
+all cases, the actual data of the incoming message will start at an
+offset of 40 bytes into the buffer(s) in the scatter list.
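+.SH "EXAMPLES"
+A minimal sketch that pre-posts a chain of N receive WRs to an SRQ in one
+call; the arrays \fIwr\fR and \fIsge\fR, the registered buffer \fIbuf\fR of
+N * BUF_SIZE bytes and its MR \fImr\fR are assumed to exist:
+.PP
+.nf
+for (i = 0; i < N; ++i) {
+        sge[i].addr   = (uintptr_t) buf + i * BUF_SIZE;
+        sge[i].length = BUF_SIZE;
+        sge[i].lkey   = mr->lkey;
+
+        wr[i].wr_id   = i;
+        wr[i].next    = (i == N - 1) ? NULL : &wr[i + 1];
+        wr[i].sg_list = &sge[i];
+        wr[i].num_sge = 1;
+}
+
+if (ibv_post_srq_recv(srq, &wr[0], &bad_wr))
+        fprintf(stderr, "Failed to post WR %llu\en",
+                (unsigned long long) bad_wr->wr_id);
+.fi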
+
+.SH "SEE ALSO"
+.BR ibv_create_qp (3),
+.BR ibv_post_send (3),
+.BR ibv_post_recv (3),
+.BR ibv_poll_cq (3)
+.SH "AUTHORS"
+.TP
+Dotan Barak <dotanba@gmail.com>
diff --git a/libibverbs/man/ibv_query_device.3 b/libibverbs/man/ibv_query_device.3
new file mode 100644
index 0000000..94dc8a5
--- /dev/null
+++ b/libibverbs/man/ibv_query_device.3
@@ -0,0 +1,85 @@
+.\" -*- nroff -*-
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.\"
+.TH IBV_QUERY_DEVICE 3 2006-10-31 libibverbs "Libibverbs Programmer's Manual"
+.SH "NAME"
+ibv_query_device \- query an RDMA device's attributes
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/verbs.h>
+.sp
+.BI "int ibv_query_device(struct ibv_context " "*context",
+.BI "                     struct ibv_device_attr " "*device_attr" );
+.fi
+.SH "DESCRIPTION"
+.B ibv_query_device()
+returns the attributes of the device with context
+.I context\fR.
+The argument
+.I device_attr
+is a pointer to an ibv_device_attr struct, as defined in <infiniband/verbs.h>.
+.PP
+.nf
+struct ibv_device_attr {
+.in +8
+char fw_ver[64]; /* FW version */
+uint64_t node_guid; /* Node GUID (in network byte order) */
+uint64_t sys_image_guid; /* System image GUID (in network byte order) */
+uint64_t max_mr_size; /* Largest contiguous block that can be registered */
+uint64_t page_size_cap; /* Supported memory shift sizes */
+uint32_t vendor_id; /* Vendor ID, per IEEE */
+uint32_t vendor_part_id; /* Vendor supplied part ID */
+uint32_t hw_ver; /* Hardware version */
+int max_qp; /* Maximum number of supported QPs */
+int max_qp_wr; /* Maximum number of outstanding WR on any work queue */
+unsigned int device_cap_flags; /* HCA capabilities mask */
+int max_sge; /* Maximum number of s/g per WR for SQ & RQ of QP for non RDMA Read operations */
+int max_sge_rd; /* Maximum number of s/g per WR for RDMA Read operations */
+int max_cq; /* Maximum number of supported CQs */
+int max_cqe; /* Maximum number of CQE capacity per CQ */
+int max_mr; /* Maximum number of supported MRs */
+int max_pd; /* Maximum number of supported PDs */
+int max_qp_rd_atom; /* Maximum number of RDMA Read & Atomic operations that can be outstanding per QP */
+int max_ee_rd_atom; /* Maximum number of RDMA Read & Atomic operations that can be outstanding per EEC */
+int max_res_rd_atom; /* Maximum number of resources used for RDMA Read & Atomic operations by this HCA as the Target */
+int max_qp_init_rd_atom; /* Maximum depth per QP for initiation of RDMA Read & Atomic operations */
+int max_ee_init_rd_atom; /* Maximum depth per EEC for initiation of RDMA Read & Atomic operations */
+enum ibv_atomic_cap atomic_cap; /* Atomic operations support level */
+int max_ee; /* Maximum number of supported EE contexts */
+int max_rdd; /* Maximum number of supported RD domains */
+int max_mw; /* Maximum number of supported MWs */
+int max_raw_ipv6_qp; /* Maximum number of supported raw IPv6 datagram QPs */
+int max_raw_ethy_qp; /* Maximum number of supported Ethertype datagram QPs */
+int max_mcast_grp; /* Maximum number of supported multicast groups */
+int max_mcast_qp_attach; /* Maximum number of QPs per multicast group which can be attached */
+int max_total_mcast_qp_attach;/* Maximum number of QPs which can be attached to multicast groups */
+int max_ah; /* Maximum number of supported address handles */
+int max_fmr; /* Maximum number of supported FMRs */
+int max_map_per_fmr; /* Maximum number of (re)maps per FMR before an unmap operation is required */
+int max_srq; /* Maximum number of supported SRQs */
+int
int max_srq_wr; /* Maximum number of WRs per SRQ */
int max_srq_sge; /* Maximum number of s/g per SRQ */
uint16_t max_pkeys; /* Maximum number of partitions */
uint8_t local_ca_ack_delay; /* Local CA ack delay */
uint8_t phys_port_cnt; /* Number of physical ports */
.in -8
};
.fi
.SH "RETURN VALUE"
.B ibv_query_device()
returns 0 on success, or the value of errno on failure (which indicates the failure reason).
.SH "NOTES"
The maximum values returned by this function are the upper limits of
supported resources by the device. However, it may not be possible to
use these maximum values, since the actual number of any resource that
can be created may be limited by the machine configuration, the amount
of host memory, user permissions, and the amount of resources already
in use by other users/processes.
.SH "SEE ALSO"
.BR ibv_open_device (3),
.BR ibv_query_port (3),
.BR ibv_query_pkey (3),
.BR ibv_query_gid (3)
.SH "AUTHORS"
.TP
Dotan Barak <dotanba@gmail.com>
diff --git a/libibverbs/man/ibv_query_device_ex.3 b/libibverbs/man/ibv_query_device_ex.3
new file mode 100644
index 0000000..2baccc7
--- /dev/null
+++ b/libibverbs/man/ibv_query_device_ex.3
@@ -0,0 +1,178 @@
.\" -*- nroff -*-
.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
.\"
.TH IBV_QUERY_DEVICE_EX 3 2014-12-17 libibverbs "Libibverbs Programmer's Manual"
.SH "NAME"
ibv_query_device_ex \- query an RDMA device's attributes including extended
device properties.
.SH "SYNOPSIS"
.nf
.B #include <infiniband/verbs.h>
.sp
.BI "int ibv_query_device_ex(struct ibv_context " "*context",
.BI "                        struct ibv_query_device_ex_input " "*input",
.BI "                        struct ibv_device_attr_ex " "*attr" );
.fi
.SH "DESCRIPTION"
.B ibv_query_device_ex()
returns the attributes of the device with context
.I context\fR.
The argument
.I input
is a pointer to an ibv_query_device_ex_input structure, used for future extensions.
The argument
.I attr
is a pointer to an ibv_device_attr_ex struct, as defined in <infiniband/verbs.h>.
.PP
.nf
struct ibv_device_attr_ex {
.in +8
struct ibv_device_attr orig_attr;
uint32_t comp_mask; /* Compatibility mask that defines which of the following variables are valid */
struct ibv_odp_caps odp_caps; /* On-Demand Paging capabilities */
uint64_t completion_timestamp_mask; /* Completion timestamp mask (0 = unsupported) */
uint64_t hca_core_clock; /* The frequency (in kHz) of the HCA (0 = unsupported) */
uint64_t device_cap_flags_ex; /* Extended device capability flags */
struct ibv_tso_caps tso_caps; /* TCP segmentation offload capabilities */
struct ibv_rss_caps rss_caps; /* RSS capabilities */
uint32_t max_wq_type_rq; /* Max number of Work Queues of type RQ */
struct ibv_packet_pacing_caps packet_pacing_caps; /* Packet pacing capabilities */
uint32_t raw_packet_caps; /* Raw packet capabilities, use enum ibv_raw_packet_caps */
struct ibv_tm_caps tm_caps; /* Tag matching capabilities */
struct ibv_cq_moderation_caps cq_mod_caps; /* CQ moderation max capabilities */
uint64_t max_dm_size; /* Max Device Memory size (in bytes) available for allocation */
struct ibv_pci_atomic_caps atomic_caps; /* PCI atomic operations capabilities, use enum ibv_pci_atomic_op_size */
uint32_t xrc_odp_caps; /* Mask with enum ibv_odp_transport_cap_bits to know which operations are supported. */
.in -8
};

struct ibv_odp_caps {
        uint64_t general_odp_caps;    /* Mask with enum ibv_odp_general_cap_bits */
        struct {
                uint32_t rc_odp_caps; /* Mask with enum ibv_odp_transport_cap_bits to know which operations are supported. */
                uint32_t uc_odp_caps; /* Mask with enum ibv_odp_transport_cap_bits to know which operations are supported. */
                uint32_t ud_odp_caps; /* Mask with enum ibv_odp_transport_cap_bits to know which operations are supported. */
        } per_transport_caps;
};

enum ibv_odp_general_cap_bits {
        IBV_ODP_SUPPORT = 1 << 0,          /* On demand paging is supported */
        IBV_ODP_SUPPORT_IMPLICIT = 1 << 1, /* Implicit on demand paging is supported */
};

enum ibv_odp_transport_cap_bits {
        IBV_ODP_SUPPORT_SEND     = 1 << 0, /* Send operations support on-demand paging */
        IBV_ODP_SUPPORT_RECV     = 1 << 1, /* Receive operations support on-demand paging */
        IBV_ODP_SUPPORT_WRITE    = 1 << 2, /* RDMA-Write operations support on-demand paging */
        IBV_ODP_SUPPORT_READ     = 1 << 3, /* RDMA-Read operations support on-demand paging */
        IBV_ODP_SUPPORT_ATOMIC   = 1 << 4, /* RDMA-Atomic operations support on-demand paging */
        IBV_ODP_SUPPORT_SRQ_RECV = 1 << 5, /* SRQ receive operations support on-demand paging */
};

struct ibv_tso_caps {
        uint32_t max_tso;        /* Maximum payload size in bytes supported for segmentation by the TSO engine */
        uint32_t supported_qpts; /* Bitmap showing which QP types are supported by TSO operation */
};

struct ibv_rss_caps {
        uint32_t supported_qpts;                 /* Bitmap showing which QP types support RSS */
        uint32_t max_rwq_indirection_tables;     /* Max receive work queue indirection tables */
        uint32_t max_rwq_indirection_table_size; /* Max receive work queue indirection table size */
        uint64_t rx_hash_fields_mask;            /* Mask with enum ibv_rx_hash_fields to know which incoming packet fields can participate in the RX hash */
        uint8_t  rx_hash_function;               /* Mask with enum ibv_rx_hash_function_flags to know which hash functions are supported */
};

struct ibv_packet_pacing_caps {
        uint32_t qp_rate_limit_min; /* Minimum rate limit in kbps */
        uint32_t qp_rate_limit_max; /* Maximum rate limit in kbps */
        uint32_t supported_qpts;    /* Bitmap showing which QP types are supported. */
};
enum ibv_raw_packet_caps {
.in +8
IBV_RAW_PACKET_CAP_CVLAN_STRIPPING = 1 << 0, /* CVLAN stripping is supported */
IBV_RAW_PACKET_CAP_SCATTER_FCS = 1 << 1, /* FCS scattering is supported */
IBV_RAW_PACKET_CAP_IP_CSUM = 1 << 2, /* IP CSUM offload is supported */
.in -8
};

enum ibv_tm_cap_flags {
.in +8
IBV_TM_CAP_RC = 1 << 0, /* Support tag matching on RC transport */
.in -8
};

struct ibv_tm_caps {
.in +8
uint32_t max_rndv_hdr_size; /* Max size of rendezvous request header */
uint32_t max_num_tags; /* Max number of tagged buffers in a TM-SRQ matching list */
uint32_t flags; /* From enum ibv_tm_cap_flags */
uint32_t max_ops; /* Max number of outstanding list operations */
uint32_t max_sge; /* Max number of SGEs in a tagged buffer */
.in -8
};

struct ibv_cq_moderation_caps {
        uint16_t max_cq_count;
        uint16_t max_cq_period;
};

enum ibv_pci_atomic_op_size {
.in +8
IBV_PCI_ATOMIC_OPERATION_4_BYTE_SIZE_SUP = 1 << 0,
IBV_PCI_ATOMIC_OPERATION_8_BYTE_SIZE_SUP = 1 << 1,
IBV_PCI_ATOMIC_OPERATION_16_BYTE_SIZE_SUP = 1 << 2,
.in -8
};

struct ibv_pci_atomic_caps {
.in +8
uint16_t fetch_add; /* Supported sizes for an atomic fetch and add operation, use enum ibv_pci_atomic_op_size */
uint16_t swap; /* Supported sizes for an atomic unconditional swap operation, use enum ibv_pci_atomic_op_size */
uint16_t compare_swap; /* Supported sizes for an atomic compare and swap operation, use enum ibv_pci_atomic_op_size */
.in -8
};
.fi

Extended device capability flags (device_cap_flags_ex):
.br
.TP 7
IBV_DEVICE_PCI_WRITE_END_PADDING

Indicates the device has support for padding PCI writes to a full cache line.

Padding packets to full cache lines reduces the amount of traffic
required at the memory controller at the expense of creating more
traffic on the PCI-E port.

Workloads that have a high CPU memory load and low PCI-E utilization
will benefit from this feature, while workloads that have a high PCI-E
utilization and small packets will be harmed.

For instance, with a 128 byte cache line size, the transfer of any
packet smaller than 128 bytes will require a full 128 byte transfer on
PCI-E, potentially doubling the required PCI-E bandwidth.

This feature can be enabled on a QP or WQ basis via the
IBV_QP_CREATE_PCI_WRITE_END_PADDING or IBV_WQ_FLAGS_PCI_WRITE_END_PADDING
flags.

.SH "RETURN VALUE"
.B ibv_query_device_ex()
returns 0 on success, or the value of errno on failure (which indicates the failure reason).
.SH "NOTES"
The maximum values returned by this function are the upper limits of
supported resources by the device. However, it may not be possible to
use these maximum values, since the actual number of any resource that
can be created may be limited by the machine configuration, the amount
of host memory, user permissions, and the amount of resources already
in use by other users/processes.
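.SH "EXAMPLE"
A minimal sketch of querying the extended attributes and testing two of the
capability fields documented above; it assumes an open device context
.I ctx
and omits further use of the results.
.PP
.nf
struct ibv_device_attr_ex attr;

memset(&attr, 0, sizeof(attr));
if (ibv_query_device_ex(ctx, NULL, &attr))
        return;        /* errno holds the failure reason */

if (attr.tm_caps.flags & IBV_TM_CAP_RC)
        printf("tag matching on RC is supported\en");

if (attr.completion_timestamp_mask)
        printf("completion timestamps are supported\en");
.fi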
+.SH "SEE ALSO" +.BR ibv_query_device (3), +.BR ibv_open_device (3), +.BR ibv_query_port (3), +.BR ibv_query_pkey (3), +.BR ibv_query_gid (3) +.SH "AUTHORS" +.TP +Majd Dibbiny <majd@mellanox.com> diff --git a/libibverbs/man/ibv_query_gid.3.md b/libibverbs/man/ibv_query_gid.3.md new file mode 100644 index 0000000..e1c67bb --- /dev/null +++ b/libibverbs/man/ibv_query_gid.3.md @@ -0,0 +1,44 @@ +--- +date: 2006-10-31 +footer: libibverbs +header: "Libibverbs Programmer's Manual" +layout: page +license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md' +section: 3 +title: IBV_QUERY_GID +--- + +# NAME + +ibv_query_gid - query an InfiniBand port's GID table + +# SYNOPSIS + +```c +#include <infiniband/verbs.h> + +int ibv_query_gid(struct ibv_context *context, + uint8_t port_num, + int index, + union ibv_gid *gid); +``` + +# DESCRIPTION + +**ibv_query_gid()** returns the GID value in entry *index* of port *port_num* +for device context *context* through the pointer *gid*. + +# RETURN VALUE + +**ibv_query_gid()** returns 0 on success, and -1 on error. + +# SEE ALSO + +**ibv_open_device**(3), +**ibv_query_device**(3), +**ibv_query_pkey**(3), +**ibv_query_port**(3) + +# AUTHOR + +Dotan Barak <dotanba@gmail.com> diff --git a/libibverbs/man/ibv_query_pkey.3.md b/libibverbs/man/ibv_query_pkey.3.md new file mode 100644 index 0000000..8c8e71b --- /dev/null +++ b/libibverbs/man/ibv_query_pkey.3.md @@ -0,0 +1,45 @@ +--- +date: 2006-10-31 +footer: libibverbs +header: "Libibverbs Programmer's Manual" +layout: page +license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md' +section: 3 +title: IBV_QUERY_PKEY +--- + +# NAME + +ibv_query_pkey - query an InfiniBand port's P_Key table + +# SYNOPSIS + +```c +#include <infiniband/verbs.h> + +int ibv_query_pkey(struct ibv_context *context, + uint8_t port_num, + int index, + uint16_t *pkey); +``` + +# DESCRIPTION + +**ibv_query_pkey()** returns the P_Key value (in network byte order) in entry +*index* of port *port_num* for device context *context* through the pointer +*pkey*. + +# RETURN VALUE + +**ibv_query_pkey()** returns 0 on success, and -1 on error. + +# SEE ALSO + +**ibv_open_device**(3), +**ibv_query_device**(3), +**ibv_query_gid**(3), +**ibv_query_port**(3) + +# AUTHOR + +Dotan Barak <dotanba@gmail.com> diff --git a/libibverbs/man/ibv_query_port.3 b/libibverbs/man/ibv_query_port.3 new file mode 100644 index 0000000..6d077a3 --- /dev/null +++ b/libibverbs/man/ibv_query_port.3 @@ -0,0 +1,71 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH IBV_QUERY_PORT 3 2006-10-31 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_query_port \- query an RDMA port's attributes +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs.h> +.sp +.BI "int ibv_query_port(struct ibv_context " "*context" ", uint8_t " "port_num" , +.BI " struct ibv_port_attr " "*port_attr" "); +.fi +.SH "DESCRIPTION" +.B ibv_query_port() +returns the attributes of port +.I port_num +for device context +.I context +through the pointer +.I port_attr\fR. +The argument +.I port_attr +is an ibv_port_attr struct, as defined in <infiniband/verbs.h>. 
.PP
.nf
struct ibv_port_attr {
.in +8
enum ibv_port_state state; /* Logical port state */
enum ibv_mtu max_mtu; /* Max MTU supported by port */
enum ibv_mtu active_mtu; /* Actual MTU */
int gid_tbl_len; /* Length of source GID table */
uint32_t port_cap_flags; /* Port capabilities */
uint32_t max_msg_sz; /* Maximum message size */
uint32_t bad_pkey_cntr; /* Bad P_Key counter */
uint32_t qkey_viol_cntr; /* Q_Key violation counter */
uint16_t pkey_tbl_len; /* Length of partition table */
uint16_t lid; /* Base port LID */
uint16_t sm_lid; /* SM LID */
uint8_t lmc; /* LMC of LID */
uint8_t max_vl_num; /* Maximum number of VLs */
uint8_t sm_sl; /* SM service level */
uint8_t subnet_timeout; /* Subnet propagation delay */
uint8_t init_type_reply; /* Type of initialization performed by SM */
uint8_t active_width; /* Currently active link width */
uint8_t active_speed; /* Currently active link speed */
uint8_t phys_state; /* Physical port state */
uint8_t link_layer; /* Link layer protocol of the port */
uint8_t flags; /* Port flags */
uint16_t port_cap_flags2; /* Additional port capabilities */
.in -8
};
.sp
Possible values for the link_layer field are IBV_LINK_LAYER_INFINIBAND,
IBV_LINK_LAYER_ETHERNET, or IBV_LINK_LAYER_UNSPECIFIED.
.sp
Supported port flags:
IBV_QPF_GRH_REQUIRED - When this flag is set, applications must create all AHs
with GRH configured.
.sp
.fi
.SH "RETURN VALUE"
.B ibv_query_port()
returns 0 on success, or the value of errno on failure (which indicates the failure reason).
.SH "SEE ALSO"
.BR ibv_create_qp (3),
.BR ibv_destroy_qp (3),
.BR ibv_query_qp (3),
.BR ibv_create_ah (3)
.SH "AUTHORS"
.TP
Dotan Barak <dotanba@gmail.com>
diff --git a/libibverbs/man/ibv_query_qp.3 b/libibverbs/man/ibv_query_qp.3
new file mode 100644
index 0000000..907bc56
--- /dev/null
+++ b/libibverbs/man/ibv_query_qp.3
@@ -0,0 +1,91 @@
.\" -*- nroff -*-
.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
.\"
.TH IBV_QUERY_QP 3 2006-10-31 libibverbs "Libibverbs Programmer's Manual"
.SH "NAME"
ibv_query_qp \- get the attributes of a queue pair (QP)
.SH "SYNOPSIS"
.nf
.B #include <infiniband/verbs.h>
.sp
.BI "int ibv_query_qp(struct ibv_qp " "*qp" ", struct ibv_qp_attr " "*attr" ,
.BI "                 int " "attr_mask" ,
.BI "                 struct ibv_qp_init_attr " "*init_attr" );
.fi
.SH "DESCRIPTION"
.B ibv_query_qp()
gets the attributes specified in
.I attr_mask
for the QP
.I qp
and returns them through the pointers
.I attr
and
.I init_attr\fR.
The argument
.I attr
is an ibv_qp_attr struct, as defined in <infiniband/verbs.h>.
+.PP +.nf +struct ibv_qp_attr { +.in +8 +enum ibv_qp_state qp_state; /* Current QP state */ +enum ibv_qp_state cur_qp_state; /* Current QP state - irrelevant for ibv_query_qp */ +enum ibv_mtu path_mtu; /* Path MTU (valid only for RC/UC QPs) */ +enum ibv_mig_state path_mig_state; /* Path migration state (valid if HCA supports APM) */ +uint32_t qkey; /* Q_Key of the QP (valid only for UD QPs) */ +uint32_t rq_psn; /* PSN for receive queue (valid only for RC/UC QPs) */ +uint32_t sq_psn; /* PSN for send queue (valid only for RC/UC QPs) */ +uint32_t dest_qp_num; /* Destination QP number (valid only for RC/UC QPs) */ +int qp_access_flags; /* Mask of enabled remote access operations (valid only for RC/UC QPs) */ +struct ibv_qp_cap cap; /* QP capabilities */ +struct ibv_ah_attr ah_attr; /* Primary path address vector (valid only for RC/UC QPs) */ +struct ibv_ah_attr alt_ah_attr; /* Alternate path address vector (valid only for RC/UC QPs) */ +uint16_t pkey_index; /* Primary P_Key index */ +uint16_t alt_pkey_index; /* Alternate P_Key index */ +uint8_t en_sqd_async_notify; /* Enable SQD.drained async notification - irrelevant for ibv_query_qp */ +uint8_t sq_draining; /* Is the QP draining? (Valid only if qp_state is SQD) */ +uint8_t max_rd_atomic; /* Number of outstanding RDMA reads & atomic operations on the destination QP (valid only for RC QPs) */ +uint8_t max_dest_rd_atomic; /* Number of responder resources for handling incoming RDMA reads & atomic operations (valid only for RC QPs) */ +uint8_t min_rnr_timer; /* Minimum RNR NAK timer (valid only for RC QPs) */ +uint8_t port_num; /* Primary port number */ +uint8_t timeout; /* Local ack timeout for primary path (valid only for RC QPs) */ +uint8_t retry_cnt; /* Retry count (valid only for RC QPs) */ +uint8_t rnr_retry; /* RNR retry (valid only for RC QPs) */ +uint8_t alt_port_num; /* Alternate port number */ +uint8_t alt_timeout; /* Local ack timeout for alternate path (valid only for RC QPs) */ +.in -8 +}; +.fi +.PP +For details on struct ibv_qp_cap see the description of +.B ibv_create_qp()\fR. +For details on struct ibv_ah_attr see the description of +.B ibv_create_ah()\fR. +.SH "RETURN VALUE" +.B ibv_query_qp() +returns 0 on success, or the value of errno on failure (which indicates the failure reason). +.SH "NOTES" +The argument +.I attr_mask +is a hint that specifies the minimum list of attributes to retrieve. +Some RDMA devices may return extra attributes not requested, for +example if the value can be returned cheaply. This has the same +form as in +.B ibv_modify_qp()\fR. +.PP +Attribute values are valid if they have been set using +.B ibv_modify_qp()\fR. +The exact list of valid attributes depends on the QP state. +.PP +Multiple calls to +.B ibv_query_qp() +may yield some differences in the values returned for the following attributes: qp_state, path_mig_state, sq_draining, ah_attr (if APM is enabled). 
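.SH "EXAMPLE"
A minimal sketch of checking whether a QP
.I qp
(assumed to already exist) has reached the RTS state:
.PP
.nf
struct ibv_qp_attr attr;
struct ibv_qp_init_attr init_attr;

if (!ibv_query_qp(qp, &attr, IBV_QP_STATE, &init_attr) &&
    attr.qp_state == IBV_QPS_RTS)
        printf("QP %u is ready to send\en", qp->qp_num);
.fi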
+.SH "SEE ALSO" +.BR ibv_create_qp (3), +.BR ibv_destroy_qp (3), +.BR ibv_modify_qp (3), +.BR ibv_create_ah (3) +.SH "AUTHORS" +.TP +Dotan Barak <dotanba@gmail.com> diff --git a/libibverbs/man/ibv_query_rt_values_ex.3 b/libibverbs/man/ibv_query_rt_values_ex.3 new file mode 100644 index 0000000..8ff5b66 --- /dev/null +++ b/libibverbs/man/ibv_query_rt_values_ex.3 @@ -0,0 +1,51 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH IBV_QUERY_RT_VALUES_EX 3 2016-2-20 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_query_rt_values_ex \- query an RDMA device for some real time values +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs.h> +.sp +.BI "int ibv_query_rt_values_ex(struct ibv_context " "*context", +.BI " struct ibv_values_ex " "*values" ); +.fi +.SH "DESCRIPTION" +.B ibv_query_rt_values_ex() +returns certain real time values of a device +.I context\fR. +The argument +.I attr +is a pointer to an ibv_device_attr_ex struct, as defined in <infiniband/verbs.h>. +.PP +.nf +struct ibv_values_ex { +.in +8 +uint32_t comp_mask; /* Compatibility mask that defines the query/queried fields [in/out] */ +struct timespec raw_clock; /* HW raw clock */ +.in -8 +}; + +enum ibv_values_mask { + IBV_VALUES_MASK_RAW_CLOCK = 1 << 0, /* HW raw clock */ +}; + +.fi +.SH "RETURN VALUE" +.B ibv_query_rt_values_ex() +returns 0 on success, or the value of errno on failure (which indicates the failure reason). +.SH "NOTES" +This extension verb only calls the provider, the provider has to query this value somehow and mark +the queried values in the comp_mask field. +.SH "SEE ALSO" +.BR ibv_query_device (3), +.BR ibv_open_device (3), +.BR ibv_query_port (3), +.BR ibv_query_pkey (3), +.BR ibv_query_gid (3) +.SH "AUTHORS" +.TP +Matan Barak <matanb@mellanox.com> +.TP +Yishai Hadas <yishaih@mellanox.com> diff --git a/libibverbs/man/ibv_query_srq.3 b/libibverbs/man/ibv_query_srq.3 new file mode 100644 index 0000000..8a35ce0 --- /dev/null +++ b/libibverbs/man/ibv_query_srq.3 @@ -0,0 +1,45 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.\" +.TH IBV_QUERY_SRQ 3 2006-10-31 libibverbs "Libibverbs Programmer's Manual" +.SH "NAME" +ibv_query_srq \- get the attributes of a shared receive queue (SRQ) +.SH "SYNOPSIS" +.nf +.B #include <infiniband/verbs.h> +.sp +.BI "int ibv_query_srq(struct ibv_srq " "*srq" ", struct ibv_srq_attr " "*srq_attr" ); +.fi +.SH "DESCRIPTION" +.B ibv_query_srq() +gets the attributes of the SRQ +.I srq +and returns them through the pointer +.I srq_attr\fR. +The argument +.I srq_attr +is an ibv_srq_attr struct, as defined in <infiniband/verbs.h>. +.PP +.nf +struct ibv_srq_attr { +.in +8 +uint32_t max_wr; /* maximum number of outstanding work requests (WRs) in the SRQ */ +uint32_t max_sge; /* maximum number of scatter elements per WR */ +uint32_t srq_limit; /* the limit value of the SRQ */ +.in -8 +}; +.fi +.SH "RETURN VALUE" +.B ibv_query_srq() +returns 0 on success, or the value of errno on failure (which indicates the failure reason). +.SH "NOTES" +If the value returned for srq_limit is 0, then the SRQ limit reached +("low watermark") event is not (or no longer) armed, and no +asynchronous events will be generated until the event is rearmed. 
+.SH "SEE ALSO" +.BR ibv_create_srq (3), +.BR ibv_destroy_srq (3), +.BR ibv_modify_srq (3) +.SH "AUTHORS" +.TP +Dotan Barak <dotanba@gmail.com> diff --git a/libibverbs/man/ibv_rate_to_mbps.3.md b/libibverbs/man/ibv_rate_to_mbps.3.md new file mode 100644 index 0000000..edfb4ff --- /dev/null +++ b/libibverbs/man/ibv_rate_to_mbps.3.md @@ -0,0 +1,50 @@ +--- +date: 2012-03-31 +footer: libibverbs +header: "Libibverbs Programmer's Manual" +layout: page +license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md' +section: 3 +title: IBV_RATE_TO_MBPS +--- + +# NAME + +ibv_rate_to_mbps - convert IB rate enumeration to Mbit/sec + +mbps_to_ibv_rate - convert Mbit/sec to an IB rate enumeration + +# SYNOPSIS + +```c +#include <infiniband/verbs.h> + +int ibv_rate_to_mbps(enum ibv_rate rate); + +enum ibv_rate mbps_to_ibv_rate(int mbps); +``` + +# DESCRIPTION + +**ibv_rate_to_mbps()** converts the IB transmission rate enumeration *rate* to +a number of Mbit/sec. For example, if *rate* is **IBV_RATE_5_GBPS**, the +value 5000 will be returned (5 Gbit/sec = 5000 Mbit/sec). + +**mbps_to_ibv_rate()** converts the number of Mbit/sec *mult* to an IB +transmission rate enumeration. For example, if *mult* is 5000, the rate +enumeration **IBV_RATE_5_GBPS** will be returned. + +# RETURN VALUE + +**ibv_rate_to_mbps()** returns the number of Mbit/sec. + +**mbps_to_ibv_rate()** returns the enumeration representing the IB +transmission rate. + +# SEE ALSO + +**ibv_query_port**(3) + +# AUTHOR + +Dotan Barak <dotanb@dev.mellanox.co.il> diff --git a/libibverbs/man/ibv_rate_to_mult.3.md b/libibverbs/man/ibv_rate_to_mult.3.md new file mode 100644 index 0000000..0833809 --- /dev/null +++ b/libibverbs/man/ibv_rate_to_mult.3.md @@ -0,0 +1,52 @@ +--- +date: 2006-10-31 +footer: libibverbs +header: "Libibverbs Programmer's Manual" +layout: page +license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md' +section: 3 +title: IBV_RATE_TO_MULT +--- + +# NAME + +ibv_rate_to_mult - convert IB rate enumeration to multiplier of 2.5 Gbit/sec + +mult_to_ibv_rate - convert multiplier of 2.5 Gbit/sec to an IB rate +enumeration + +# SYNOPSIS + +```c +#include <infiniband/verbs.h> + +int ibv_rate_to_mult(enum ibv_rate rate); + +enum ibv_rate mult_to_ibv_rate(int mult); +``` + +# DESCRIPTION + +**ibv_rate_to_mult()** converts the IB transmission rate enumeration *rate* to +a multiple of 2.5 Gbit/sec (the base rate). For example, if *rate* is +**IBV_RATE_5_GBPS**, the value 2 will be returned (5 Gbit/sec = 2 * 2.5 +Gbit/sec). + +**mult_to_ibv_rate()** converts the multiplier value (of 2.5 Gbit/sec) *mult* +to an IB transmission rate enumeration. For example, if *mult* is 2, the rate +enumeration **IBV_RATE_5_GBPS** will be returned. + +# RETURN VALUE + +**ibv_rate_to_mult()** returns the multiplier of the base rate 2.5 Gbit/sec. + +**mult_to_ibv_rate()** returns the enumeration representing the IB +transmission rate. 
+ +# SEE ALSO + +**ibv_query_port**(3) + +# AUTHOR + +Dotan Barak <dotanba@gmail.com> diff --git a/libibverbs/man/ibv_rc_pingpong.1 b/libibverbs/man/ibv_rc_pingpong.1 new file mode 100644 index 0000000..92554c0 --- /dev/null +++ b/libibverbs/man/ibv_rc_pingpong.1 @@ -0,0 +1,92 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH IBV_RC_PINGPONG 1 "August 30, 2005" "libibverbs" "USER COMMANDS" + +.SH NAME +ibv_rc_pingpong \- simple InfiniBand RC transport test + +.SH SYNOPSIS +.B ibv_rc_pingpong +[\-p port] [\-d device] [\-i ib port] [\-s size] [\-m size] +[\-r rx depth] [\-n iters] [\-l sl] [\-e] [\-g gid index] +[\-o] [\-P] [\-t] [\-j] [\-N] \fBHOSTNAME\fR + +.B ibv_rc_pingpong +[\-p port] [\-d device] [\-i ib port] [\-s size] [\-m size] +[\-r rx depth] [\-n iters] [\-l sl] [\-e] [\-g gid index] +[\-o] [\-P] [\-t] [\-j] [\-N] + +.SH DESCRIPTION +.PP +Run a simple ping-pong test over InfiniBand via the reliable +connected (RC) transport. + +.SH OPTIONS + +.PP +.TP +\fB\-p\fR, \fB\-\-port\fR=\fIPORT\fR +use TCP port \fIPORT\fR for initial synchronization (default 18515) +.TP +\fB\-d\fR, \fB\-\-ib\-dev\fR=\fIDEVICE\fR +use IB device \fIDEVICE\fR (default first device found) +.TP +\fB\-i\fR, \fB\-\-ib\-port\fR=\fIPORT\fR +use IB port \fIPORT\fR (default port 1) +.TP +\fB\-s\fR, \fB\-\-size\fR=\fISIZE\fR +ping-pong messages of size \fISIZE\fR (default 4096) +.TP +\fB\-m\fR, \fB\-\-mtu\fR=\fISIZE\fR +path MTU \fISIZE\fR (default 1024) +.TP +\fB\-r\fR, \fB\-\-rx\-depth\fR=\fIDEPTH\fR +post \fIDEPTH\fR receives at a time (default 1000) +.TP +\fB\-n\fR, \fB\-\-iters\fR=\fIITERS\fR +perform \fIITERS\fR message exchanges (default 1000) +.TP +\fB\-l\fR, \fB\-\-sl\fR=\fISL\fR +use \fISL\fR as the service level value of the QP (default 0) +.TP +\fB\-e\fR, \fB\-\-events\fR +sleep while waiting for work completion events (default is to poll for +completions) +.TP +\fB\-g\fR, \fB\-\-gid-idx\fR=\fIGIDINDEX\fR +local port \fIGIDINDEX\fR +.TP +\fB\-o\fR, \fB\-\-odp\fR +use on demand paging +.TP +\fB\-P\fR, \fB\-\-prefetch=\fR +prefetch an ODP MR +.TP +\fB\-t\fR, \fB\-\-ts\fR +get CQE with timestamp +.TP +\fB\-c\fR, \fB\-\-chk\fR +validate received buffer +.TP +\fB\-j\fR, \fB\-\-dm\fR +use device memory +.TP +\fB\-N\fR, \fB\-\-new_send\fR +use new post send WR API + +.SH SEE ALSO +.BR ibv_uc_pingpong (1), +.BR ibv_ud_pingpong (1), +.BR ibv_srq_pingpong (1), +.BR ibv_xsrq_pingpong (1) + +.SH AUTHORS +.TP +Roland Dreier +.RI < rolandd@cisco.com > + +.SH BUGS +The network synchronization between client and server instances is +weak, and does not prevent incompatible options from being used on the +two instances. The method used for retrieving work completions is not +strictly correct, and race conditions may cause failures on some +systems. 
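.SH EXAMPLES
A typical invocation, first on the server and then on the client; the device
name mlx5_0 is only an example and should be replaced by a device present on
the system:
.PP
.nf
server$ ibv_rc_pingpong -d mlx5_0 -g 0
client$ ibv_rc_pingpong -d mlx5_0 -g 0 servername
.fi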
diff --git a/libibverbs/man/ibv_read_counters.3.md b/libibverbs/man/ibv_read_counters.3.md
new file mode 100644
index 0000000..74407b1
--- /dev/null
+++ b/libibverbs/man/ibv_read_counters.3.md
@@ -0,0 +1,179 @@
---
date: 2018-04-02
footer: libibverbs
header: "Libibverbs Programmer's Manual"
layout: page
license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md'
section: 3
title: ibv_read_counters
---

# NAME

**ibv_read_counters** - Read counter values

# SYNOPSIS

```c
#include <infiniband/verbs.h>

int ibv_read_counters(struct ibv_counters *counters,
                      uint64_t *counters_value,
                      uint32_t ncounters,
                      uint32_t flags);
```

# DESCRIPTION

**ibv_read_counters**() returns the values of the chosen counters into the
*counters_value* array, which can hold up to *ncounters* values.
The values are filled according to the configuration defined by the
user in the **ibv_attach_counters_point_xxx** functions.

# ARGUMENTS

*counters*
:	Counters object to read.

*counters_value*
:	Output buffer to hold the read result.

*ncounters*
:	Number of counters to fill.

*flags*
:	Use enum ibv_read_counters_flags.

## *flags* Argument

IBV_READ_COUNTERS_ATTR_PREFER_CACHED
:	Prefer reading the values from the driver cache; otherwise a volatile
	hardware access is performed, which is the default.

# RETURN VALUE

**ibv_read_counters**() returns 0 on success, or the value of errno on failure
(which indicates the failure reason).

# EXAMPLE

Example: Statically attach counters to a new flow

This example demonstrates the use of counters which are attached statically
with the creation of a new flow.
The counters are read from hardware periodically, and finally all resources
are released.

```c
/* create counters object and define its counters points */
/* create simple L2 flow with hardcoded MAC, and a count action */
/* read counters periodically, every 1sec, until loop ends */
/* assumes user prepared a RAW_PACKET QP as input */
/* only limited error checking in run time for code simplicity */

#include <errno.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <infiniband/verbs.h>

/* the below MACs should be replaced by the user */
#define FLOW_SPEC_ETH_MAC_VAL { \
	.dst_mac = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05}, \
	.src_mac = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, \
	.ether_type = 0, .vlan_tag = 0, }
#define FLOW_SPEC_ETH_MAC_MASK { \
	.dst_mac = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}, \
	.src_mac = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}, \
	.ether_type = 0, .vlan_tag = 0, }

void example_create_flow_with_counters_on_raw_qp(struct ibv_qp *qp) {
	int idx = 0;
	int loop = 10;
	int ret;
	struct ibv_flow *flow = NULL;
	struct ibv_counters *counters = NULL;
	struct ibv_counters_init_attr init_attr = {0};
	struct ibv_counter_attach_attr attach_attr = {0};

	/* create single counters handle */
	counters = ibv_create_counters(qp->context, &init_attr);

	/* define counters points */
	attach_attr.counter_desc = IBV_COUNTER_PACKETS;
	attach_attr.index = idx++;
	ret = ibv_attach_counters_point_flow(counters, &attach_attr, NULL);
	if (ret == ENOTSUP) {
		fprintf(stderr, "Attaching IBV_COUNTER_PACKETS to flow is not supported\n");
		exit(1);
	}
	attach_attr.counter_desc = IBV_COUNTER_BYTES;
	attach_attr.index = idx++;
	ret = ibv_attach_counters_point_flow(counters, &attach_attr, NULL);
	if (ret == ENOTSUP) {
		fprintf(stderr, "Attaching IBV_COUNTER_BYTES to flow is not supported\n");
		exit(1);
	}

	/* define a new flow attr that includes the counters handle */
	struct raw_eth_flow_attr {
		struct ibv_flow_attr attr;
		struct ibv_flow_spec_eth spec_eth;
		struct ibv_flow_spec_counter_action spec_count;
	} flow_attr = {
		.attr = {
			.comp_mask = 0,
			.type = IBV_FLOW_ATTR_NORMAL,
			.size = sizeof(flow_attr),
			.priority = 0,
			.num_of_specs = 2, /* ETH + COUNT */
			.port = 1,
			.flags = 0,
		},
		.spec_eth = {
			.type = IBV_FLOW_SPEC_ETH,
			.size = sizeof(struct ibv_flow_spec_eth),
			.val = FLOW_SPEC_ETH_MAC_VAL,
			.mask = FLOW_SPEC_ETH_MAC_MASK,
		},
		.spec_count = {
			.type = IBV_FLOW_SPEC_ACTION_COUNT,
			.size = sizeof(struct ibv_flow_spec_counter_action),
			/* attach this counters handle to the newly created ibv_flow */
			.counters = counters,
		}
	};

	/* create the flow */
	flow = ibv_create_flow(qp, &flow_attr.attr);

	/* allocate array for counters value reading */
	uint64_t *counters_value = malloc(sizeof(uint64_t) * idx);

	/* periodical read and print of flow counters */
	while (--loop) {
		sleep(1);

		/* read hardware counters values */
		ibv_read_counters(counters, counters_value, idx,
				  IBV_READ_COUNTERS_ATTR_PREFER_CACHED);

		printf("PACKETS = %" PRIu64 ", BYTES = %" PRIu64 "\n",
		       counters_value[0], counters_value[1]);
	}

	/* all done, release all */
	free(counters_value);

	/* destroy flow and detach counters */
	ibv_destroy_flow(flow);

	/* destroy counters handle */
	ibv_destroy_counters(counters);
}
```

# SEE ALSO

**ibv_create_counters**(3), **ibv_destroy_counters**(3),
**ibv_attach_counters_point_flow**(3), **ibv_create_flow**(3)

# AUTHORS

Raed Salem <raeds@mellanox.com>

Alex Rosenbaum <alexr@mellanox.com>
diff --git a/libibverbs/man/ibv_reg_mr.3 b/libibverbs/man/ibv_reg_mr.3
new file mode 100644
index 0000000..2bfc955
--- /dev/null
+++ b/libibverbs/man/ibv_reg_mr.3
@@ -0,0 +1,102 @@
.\" -*- nroff -*-
.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
.\"
.TH IBV_REG_MR 3 2006-10-31 libibverbs "Libibverbs Programmer's Manual"
.SH "NAME"
ibv_reg_mr, ibv_reg_mr_iova, ibv_dereg_mr \- register or deregister a memory region (MR)
.SH "SYNOPSIS"
.nf
.B #include <infiniband/verbs.h>
.sp
.BI "struct ibv_mr *ibv_reg_mr(struct ibv_pd " "*pd" ", void " "*addr" ,
.BI "                          size_t " "length" ", int " "access" );
.sp
.BI "struct ibv_mr *ibv_reg_mr_iova(struct ibv_pd " "*pd" ", void " "*addr" ,
.BI "                               size_t " "length" ", uint64_t " "hca_va" ,
.BI "                               int " "access" );
.sp
.BI "int ibv_dereg_mr(struct ibv_mr " "*mr" );
.fi
.SH "DESCRIPTION"
.B ibv_reg_mr()
registers a memory region (MR) associated with the protection domain
.I pd\fR.
The MR's starting address is
.I addr
and its size is
.I length\fR.
The argument
.I access
describes the desired memory protection attributes; it is either 0 or the bitwise OR of one or more of the following flags:
.PP
.TP
.B IBV_ACCESS_LOCAL_WRITE\fR Enable Local Write Access
.TP
.B IBV_ACCESS_REMOTE_WRITE\fR Enable Remote Write Access
.TP
.B IBV_ACCESS_REMOTE_READ\fR Enable Remote Read Access
.TP
.B IBV_ACCESS_REMOTE_ATOMIC\fR Enable Remote Atomic Operation Access (if supported)
.TP
.B IBV_ACCESS_MW_BIND\fR Enable Memory Window Binding
.TP
.B IBV_ACCESS_ZERO_BASED\fR Use byte offset from beginning of MR to access this MR, instead of a pointer address
.TP
.B IBV_ACCESS_ON_DEMAND\fR Create an on-demand paging MR
.TP
.B IBV_ACCESS_HUGETLB\fR Huge pages are guaranteed to be used for this MR, applicable with IBV_ACCESS_ON_DEMAND in explicit mode only
.TP
.B IBV_ACCESS_RELAXED_ORDERING\fR Allow system to reorder accesses to the MR to improve performance
.PP
If
.B IBV_ACCESS_REMOTE_WRITE
or
.B IBV_ACCESS_REMOTE_ATOMIC
is set, then
.B IBV_ACCESS_LOCAL_WRITE
must be set too.
.PP
Local read access is always enabled for the MR.
.PP
To create an implicit ODP MR, IBV_ACCESS_ON_DEMAND should be set, addr should be 0 and length should be SIZE_MAX.
.PP
If
.B IBV_ACCESS_HUGETLB
is set, the application acknowledges that all pages of this MR are huge pages,
and it promises never to do anything that would break them.
.PP
.B ibv_reg_mr_iova()
is the same as the normal
.B ibv_reg_mr()\fR,
except that the user is allowed to specify the virtual base address of the MR
when accessed through a lkey or rkey. The offset in the memory region is
computed as 'addr + (iova - hca_va)'. Specifying 0 for hca_va has the same
effect as IBV_ACCESS_ZERO_BASED.
.PP
.B ibv_dereg_mr()
deregisters the MR
.I mr\fR.
.SH "RETURN VALUE"
.B ibv_reg_mr() / ibv_reg_mr_iova()
returns a pointer to the registered MR, or NULL if the request fails.
The local key (\fBL_Key\fR) field
.B lkey
is used as the lkey field of struct ibv_sge when posting buffers with
ibv_post_* verbs, and the remote key (\fBR_Key\fR)
field
.B rkey
is used by remote processes to perform Atomic and RDMA operations. The remote process places this
.B rkey
as the rkey field of struct ibv_send_wr passed to the ibv_post_send function.
.PP
.B ibv_dereg_mr()
returns 0 on success, or the value of errno on failure (which indicates the failure reason).
.SH "NOTES"
.B ibv_dereg_mr()
fails if any memory window is still bound to this MR.
.SH "SEE ALSO"
.BR ibv_alloc_pd (3),
.BR ibv_post_send (3),
.BR ibv_post_recv (3),
.BR ibv_post_srq_recv (3)
.SH "AUTHORS"
.TP
Dotan Barak <dotanba@gmail.com>
diff --git a/libibverbs/man/ibv_req_notify_cq.3.md b/libibverbs/man/ibv_req_notify_cq.3.md
new file mode 100644
index 0000000..b1b57b4
--- /dev/null
+++ b/libibverbs/man/ibv_req_notify_cq.3.md
@@ -0,0 +1,55 @@
---
date: 2006-10-31
footer: libibverbs
header: "Libibverbs Programmer's Manual"
layout: page
license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md'
section: 3
title: IBV_REQ_NOTIFY_CQ
---

# NAME

ibv_req_notify_cq - request completion notification on a completion queue (CQ)

# SYNOPSIS

```c
#include <infiniband/verbs.h>

int ibv_req_notify_cq(struct ibv_cq *cq, int solicited_only);
```

# DESCRIPTION

**ibv_req_notify_cq()** requests a completion notification on the completion
queue (CQ) *cq*.

Upon the addition of a new CQ entry (CQE) to *cq*, a completion event will be
added to the completion channel associated with the CQ. If the argument
*solicited_only* is zero, a completion event is generated for any new CQE. If
*solicited_only* is non-zero, an event is only generated for a new CQE
that is considered "solicited." A CQE is solicited if it is a receive
completion for a message with the Solicited Event header bit set, or if the
status is not successful. All other successful receive completions, and all
successful send completions, are unsolicited.

# RETURN VALUE

**ibv_req_notify_cq()** returns 0 on success, or the value of errno on failure
(which indicates the failure reason).

# NOTES

The request for notification is "one shot." Only one completion event will be
generated for each call to **ibv_req_notify_cq()**.

# SEE ALSO

**ibv_create_comp_channel**(3),
**ibv_create_cq**(3),
**ibv_get_cq_event**(3)

# AUTHOR

Dotan Barak <dotanba@gmail.com>
diff --git a/libibverbs/man/ibv_rereg_mr.3.md b/libibverbs/man/ibv_rereg_mr.3.md
new file mode 100644
index 0000000..e6b0098
--- /dev/null
+++ b/libibverbs/man/ibv_rereg_mr.3.md
@@ -0,0 +1,91 @@
---
date: 2016-03-13
footer: libibverbs
header: "Libibverbs Programmer's Manual"
layout: page
license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md'
section: 3
title: IBV_REREG_MR
---

# NAME

ibv_rereg_mr - re-register a memory region (MR)

# SYNOPSIS

```c
#include <infiniband/verbs.h>

int ibv_rereg_mr(struct ibv_mr *mr,
                 int flags,
                 struct ibv_pd *pd,
                 void *addr,
                 size_t length,
                 int access);
```

# DESCRIPTION

**ibv_rereg_mr()** modifies the attributes of an existing memory region (MR)
*mr*. Conceptually, this call performs a memory region deregistration
followed by a registration. Where possible, resources are reused
instead of deallocated and reallocated.

*flags* is a bit-mask used to indicate which of the following properties of
the memory region are being modified. Flags should be a combination (bit
field) of:

**IBV_REREG_MR_CHANGE_TRANSLATION**
:	Change translation (location and length)

**IBV_REREG_MR_CHANGE_PD**
:	Change protection domain

**IBV_REREG_MR_CHANGE_ACCESS**
:	Change access flags

When **IBV_REREG_MR_CHANGE_PD** is used, *pd* represents the new PD this MR
should be registered to.

When **IBV_REREG_MR_CHANGE_TRANSLATION** is used, *addr* represents the
virtual address (user-space pointer) of the new MR, while *length* represents
its length.

The access and other flags are represented in the field *access*. This field
describes the desired memory protection attributes; it is either 0 or the
bitwise OR of one or more of ibv_access_flags.

# RETURN VALUE

**ibv_rereg_mr()** returns 0 on success; otherwise an error has occurred and
the returned value is described by *enum ibv_rereg_mr_err_code*, as listed
below.

IBV_REREG_MR_ERR_INPUT - Old MR is valid, an input error was detected by
libibverbs.

IBV_REREG_MR_ERR_DONT_FORK_NEW - Old MR is valid, failed via don't fork on new
address range.

IBV_REREG_MR_ERR_DO_FORK_OLD - New MR is valid, failed via do fork on old
address range.

IBV_REREG_MR_ERR_CMD - MR shouldn't be used, command error.

IBV_REREG_MR_ERR_CMD_AND_DO_FORK_NEW - MR shouldn't be used, command error,
invalid fork state on new address range.


# NOTES

Even on a failure, the user still needs to call ibv_dereg_mr on this MR.
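
# EXAMPLE

A minimal sketch of moving an existing MR *mr* to a new buffer while keeping
its PD; *new_buf* and *new_len* are illustrative names, not part of the API:

```c
int ret = ibv_rereg_mr(mr, IBV_REREG_MR_CHANGE_TRANSLATION,
                       NULL /* PD unchanged */, new_buf, new_len,
                       IBV_ACCESS_LOCAL_WRITE);
if (ret)
        fprintf(stderr, "rereg failed: %d\n", ret); /* MR may still need ibv_dereg_mr() */
```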
+ +# SEE ALSO + +**ibv_dereg_mr**(3), +**ibv_reg_mr**(3) + +# AUTHORS + +Matan Barak <matanb@mellanox.com>, +Yishai Hadas <yishaih@mellanox.com> diff --git a/libibverbs/man/ibv_resize_cq.3.md b/libibverbs/man/ibv_resize_cq.3.md new file mode 100644 index 0000000..6582d12 --- /dev/null +++ b/libibverbs/man/ibv_resize_cq.3.md @@ -0,0 +1,48 @@ +--- +date: 2006-10-31 +footer: libibverbs +header: "Libibverbs Programmer's Manual" +layout: page +license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md' +section: 3 +title: IBV_RESIZE_CQ +--- + +# NAME + +ibv_resize_cq - resize a completion queue (CQ) + +# SYNOPSIS + +```c +#include <infiniband/verbs.h> + +int ibv_resize_cq(struct ibv_cq *cq, int cqe); +``` + +# DESCRIPTION + +**ibv_resize_cq()** resizes the completion queue (CQ) *cq* to have at least +*cqe* entries. *cqe* must be at least the number of unpolled entries in the CQ +*cq*. If *cqe* is a valid value less than the current CQ size, +**ibv_resize_cq()** may not do anything, since this function is only +guaranteed to resize the CQ to a size at least as big as the requested size. + +# RETURN VALUE + +**ibv_resize_cq()** returns 0 on success, or the value of errno on failure +(which indicates the failure reason). + +# NOTES + +**ibv_resize_cq()** may assign a CQ size greater than or equal to the +requested size. The cqe member of *cq* will be updated to the actual size. + +# SEE ALSO + +**ibv_create_cq**(3), +**ibv_destroy_cq**(3) + +# AUTHOR + +Dotan Barak <dotanba@gmail.com> diff --git a/libibverbs/man/ibv_srq_pingpong.1 b/libibverbs/man/ibv_srq_pingpong.1 new file mode 100644 index 0000000..c0b028e --- /dev/null +++ b/libibverbs/man/ibv_srq_pingpong.1 @@ -0,0 +1,81 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH IBV_SRQ_PINGPONG 1 "August 30, 2005" "libibverbs" "USER COMMANDS" + +.SH NAME +ibv_srq_pingpong \- simple InfiniBand shared receive queue test + +.SH SYNOPSIS +.B ibv_srq_pingpong +[\-p port] [\-d device] [\-i ib port] [\-s size] [\-m size] +[\-q num QPs] [\-r rx depth] [\-n iters] [\-l sl] [\-e] +[\-g gid index] \fBHOSTNAME\fR + +.B ibv_srq_pingpong +[\-p port] [\-d device] [\-i ib port] [\-s size] [\-m size] +[\-q num QPs] [\-r rx depth] [\-n iters] [\-l sl] [\-e] +[\-g gid index] + +.SH DESCRIPTION +.PP +Run a simple ping-pong test over InfiniBand via the reliable +connected (RC) transport, using multiple queue pairs (QPs) and a +single shared receive queue (SRQ). 
+ +.SH OPTIONS + +.PP +.TP +\fB\-p\fR, \fB\-\-port\fR=\fIPORT\fR +use TCP port \fIPORT\fR for initial synchronization (default 18515) +.TP +\fB\-d\fR, \fB\-\-ib\-dev\fR=\fIDEVICE\fR +use IB device \fIDEVICE\fR (default first device found) +.TP +\fB\-i\fR, \fB\-\-ib\-port\fR=\fIPORT\fR +use IB port \fIPORT\fR (default port 1) +.TP +\fB\-s\fR, \fB\-\-size\fR=\fISIZE\fR +ping-pong messages of size \fISIZE\fR (default 4096) +.TP +\fB\-m\fR, \fB\-\-mtu\fR=\fISIZE\fR +path MTU \fISIZE\fR (default 1024) +.TP +\fB\-q\fR, \fB\-\-num\-qp\fR=\fINUM\fR +use \fINUM\fR queue pairs for test (default 16) +.TP +\fB\-r\fR, \fB\-\-rx\-depth\fR=\fIDEPTH\fR +post \fIDEPTH\fR receives at a time (default 1000) +.TP +\fB\-n\fR, \fB\-\-iters\fR=\fIITERS\fR +perform \fIITERS\fR message exchanges (default 1000) +.TP +\fB\-l\fR, \fB\-\-sl\fR=\fISL\fR +use \fISL\fR as the service level value of the QPs (default 0) +.TP +\fB\-e\fR, \fB\-\-events\fR +sleep while waiting for work completion events (default is to poll for +completions) +.TP +\fB\-g\fR, \fB\-\-gid-idx\fR=\fIGIDINDEX\fR +local port \fIGIDINDEX\fR +.TP +\fB\-c\fR, \fB\-\-chk\fR +validate received buffer + +.SH SEE ALSO +.BR ibv_rc_pingpong (1), +.BR ibv_uc_pingpong (1), +.BR ibv_ud_pingpong (1), +.BR ibv_xsrq_pingpong (1) + +.SH AUTHORS +.TP +Roland Dreier +.RI < rolandd@cisco.com > + +.SH BUGS +The network synchronization between client and server instances is +weak, and does not prevent incompatible options from being used on the +two instances. The method used for retrieving work completions is not +strictly correct, and race conditions may cause failures on some +systems. diff --git a/libibverbs/man/ibv_uc_pingpong.1 b/libibverbs/man/ibv_uc_pingpong.1 new file mode 100644 index 0000000..128715f --- /dev/null +++ b/libibverbs/man/ibv_uc_pingpong.1 @@ -0,0 +1,76 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH IBV_UC_PINGPONG 1 "August 30, 2005" "libibverbs" "USER COMMANDS" + +.SH NAME +ibv_uc_pingpong \- simple InfiniBand UC transport test + +.SH SYNOPSIS +.B ibv_uc_pingpong +[\-p port] [\-d device] [\-i ib port] [\-s size] [\-m size] +[\-r rx depth] [\-n iters] [\-l sl] [\-e] [\-g gid index] +\fBHOSTNAME\fR + +.B ibv_uc_pingpong +[\-p port] [\-d device] [\-i ib port] [\-s size] [\-m size] +[\-r rx depth] [\-n iters] [\-l sl] [\-e] [\-g gid index] + +.SH DESCRIPTION +.PP +Run a simple ping-pong test over InfiniBand via the unreliable +connected (UC) transport. 
+ +.SH OPTIONS + +.PP +.TP +\fB\-p\fR, \fB\-\-port\fR=\fIPORT\fR +use TCP port \fIPORT\fR for initial synchronization (default 18515) +.TP +\fB\-d\fR, \fB\-\-ib\-dev\fR=\fIDEVICE\fR +use IB device \fIDEVICE\fR (default first device found) +.TP +\fB\-i\fR, \fB\-\-ib\-port\fR=\fIPORT\fR +use IB port \fIPORT\fR (default port 1) +.TP +\fB\-s\fR, \fB\-\-size\fR=\fISIZE\fR +ping-pong messages of size \fISIZE\fR (default 4096) +.TP +\fB\-m\fR, \fB\-\-mtu\fR=\fISIZE\fR +path MTU \fISIZE\fR (default 1024) +.TP +\fB\-r\fR, \fB\-\-rx\-depth\fR=\fIDEPTH\fR +post \fIDEPTH\fR receives at a time (default 1000) +.TP +\fB\-n\fR, \fB\-\-iters\fR=\fIITERS\fR +perform \fIITERS\fR message exchanges (default 1000) +.TP +\fB\-l\fR, \fB\-\-sl\fR=\fISL\fR +use \fISL\fR as the service level value of the QP (default 0) +.TP +\fB\-e\fR, \fB\-\-events\fR +sleep while waiting for work completion events (default is to poll for +completions) +.TP +\fB\-g\fR, \fB\-\-gid-idx\fR=\fIGIDINDEX\fR +local port \fIGIDINDEX\fR +.TP +\fB\-c\fR, \fB\-\-chk\fR +validate received buffer + +.SH SEE ALSO +.BR ibv_rc_pingpong (1), +.BR ibv_ud_pingpong (1), +.BR ibv_srq_pingpong (1), +.BR ibv_xsrq_pingpong (1) + +.SH AUTHORS +.TP +Roland Dreier +.RI < rolandd@cisco.com > + +.SH BUGS +The network synchronization between client and server instances is +weak, and does not prevent incompatible options from being used on the +two instances. The method used for retrieving work completions is not +strictly correct, and race conditions may cause failures on some +systems. diff --git a/libibverbs/man/ibv_ud_pingpong.1 b/libibverbs/man/ibv_ud_pingpong.1 new file mode 100644 index 0000000..8642acf --- /dev/null +++ b/libibverbs/man/ibv_ud_pingpong.1 @@ -0,0 +1,72 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH IBV_UD_PINGPONG 1 "August 30, 2005" "libibverbs" "USER COMMANDS" + +.SH NAME +ibv_ud_pingpong \- simple InfiniBand UD transport test + +.SH SYNOPSIS +.B ibv_ud_pingpong +[\-p port] [\-d device] [\-i ib port] [\-s size] [\-r rx depth] +[\-n iters] [\-l sl] [\-e] [\-g gid index] \fBHOSTNAME\fR + +.B ibv_ud_pingpong +[\-p port] [\-d device] [\-i ib port] [\-s size] [\-r rx depth] +[\-n iters] [\-l sl] [\-e] [\-g gid index] + +.SH DESCRIPTION +.PP +Run a simple ping-pong test over InfiniBand via the unreliable +datagram (UD) transport. 
+ +.SH OPTIONS + +.PP +.TP +\fB\-p\fR, \fB\-\-port\fR=\fIPORT\fR +use TCP port \fIPORT\fR for initial synchronization (default 18515) +.TP +\fB\-d\fR, \fB\-\-ib\-dev\fR=\fIDEVICE\fR +use IB device \fIDEVICE\fR (default first device found) +.TP +\fB\-i\fR, \fB\-\-ib\-port\fR=\fIPORT\fR +use IB port \fIPORT\fR (default port 1) +.TP +\fB\-s\fR, \fB\-\-size\fR=\fISIZE\fR +ping-pong messages of size \fISIZE\fR (default 2048) +.TP +\fB\-r\fR, \fB\-\-rx\-depth\fR=\fIDEPTH\fR +post \fIDEPTH\fR receives at a time (default 500) +.TP +\fB\-n\fR, \fB\-\-iters\fR=\fIITERS\fR +perform \fIITERS\fR message exchanges (default 1000) +.TP +\fB\-l\fR, \fB\-\-sl\fR=\fISL\fR +send messages with service level \fISL\fR (default 0) +.TP +\fB\-e\fR, \fB\-\-events\fR +sleep while waiting for work completion events (default is to poll for +completions) +.TP +\fB\-g\fR, \fB\-\-gid-idx\fR=\fIGIDINDEX\fR +local port \fIGIDINDEX\fR +.TP +\fB\-c\fR, \fB\-\-chk\fR +validate received buffer + +.SH SEE ALSO +.BR ibv_rc_pingpong (1), +.BR ibv_uc_pingpong (1), +.BR ibv_srq_pingpong (1), +.BR ibv_xsrq_pingpong (1) + +.SH AUTHORS +.TP +Roland Dreier +.RI < rolandd@cisco.com > + +.SH BUGS +The network synchronization between client and server instances is +weak, and does not prevent incompatible options from being used on the +two instances. The method used for retrieving work completions is not +strictly correct, and race conditions may cause failures on some +systems. diff --git a/libibverbs/man/ibv_wr_post.3.md b/libibverbs/man/ibv_wr_post.3.md new file mode 100644 index 0000000..ab7fc5f --- /dev/null +++ b/libibverbs/man/ibv_wr_post.3.md @@ -0,0 +1,333 @@ +--- +date: 2018-11-27 +footer: libibverbs +header: "Libibverbs Programmer's Manual" +layout: page +license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md' +section: 3 +title: IBV_WR API +--- + +# NAME + +ibv_wr_abort, ibv_wr_complete, ibv_wr_start - Manage regions allowed to post work + +ibv_wr_atomic_cmp_swp, ibv_wr_atomic_fetch_add - Post remote atomic operation work requests + +ibv_wr_bind_mw, ibv_wr_local_inv - Post work requests for memory windows + +ibv_wr_rdma_read, ibv_wr_rdma_write, ibv_wr_rdma_write_imm - Post RDMA work requests + +ibv_wr_send, ibv_wr_send_imm, ibv_wr_send_inv - Post send work requests + +ibv_wr_send_tso - Post segmentation offload work requests + +ibv_wr_set_inline_data, ibv_wr_set_inline_data_list - Attach inline data to the last work request + +ibv_wr_set_sge, ibv_wr_set_sge_list - Attach data to the last work request + +ibv_wr_set_ud_addr - Attach UD addressing info to the last work request + +ibv_wr_set_xrc_srqn - Attach an XRC SRQN to the last work request + +# SYNOPSIS + +```c +#include <infiniband/verbs.h> + +void ibv_wr_abort(struct ibv_qp_ex *qp); +int ibv_wr_complete(struct ibv_qp_ex *qp); +void ibv_wr_start(struct ibv_qp_ex *qp); + +void ibv_wr_atomic_cmp_swp(struct ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr, uint64_t compare, + uint64_t swap); +void ibv_wr_atomic_fetch_add(struct ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr, uint64_t add); + +void ibv_wr_bind_mw(struct ibv_qp_ex *qp, struct ibv_mw *mw, uint32_t rkey, + const struct ibv_mw_bind_info *bind_info); +void ibv_wr_local_inv(struct ibv_qp_ex *qp, uint32_t invalidate_rkey); + +void ibv_wr_rdma_read(struct ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr); +void ibv_wr_rdma_write(struct ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr); +void ibv_wr_rdma_write_imm(struct ibv_qp_ex *qp, uint32_t rkey, + uint64_t 
remote_addr, __be32 imm_data);

void ibv_wr_send(struct ibv_qp_ex *qp);
void ibv_wr_send_imm(struct ibv_qp_ex *qp, __be32 imm_data);
void ibv_wr_send_inv(struct ibv_qp_ex *qp, uint32_t invalidate_rkey);
void ibv_wr_send_tso(struct ibv_qp_ex *qp, void *hdr, uint16_t hdr_sz,
                     uint16_t mss);

void ibv_wr_set_inline_data(struct ibv_qp_ex *qp, void *addr, size_t length);
void ibv_wr_set_inline_data_list(struct ibv_qp_ex *qp, size_t num_buf,
                                 const struct ibv_data_buf *buf_list);
void ibv_wr_set_sge(struct ibv_qp_ex *qp, uint32_t lkey, uint64_t addr,
                    uint32_t length);
void ibv_wr_set_sge_list(struct ibv_qp_ex *qp, size_t num_sge,
                         const struct ibv_sge *sg_list);

void ibv_wr_set_ud_addr(struct ibv_qp_ex *qp, struct ibv_ah *ah,
                        uint32_t remote_qpn, uint32_t remote_qkey);
void ibv_wr_set_xrc_srqn(struct ibv_qp_ex *qp, uint32_t remote_srqn);
```

# DESCRIPTION

The verbs work request API (ibv_wr_\*) allows efficient posting of work to a send
queue using function calls instead of the struct based *ibv_post_send()*
scheme. This approach is designed to minimize CPU branching and locking during
the posting process.

This API is intended to be used to access additional functionality beyond
what is provided by *ibv_post_send()*.

Batches of WRs posted through *ibv_post_send()* and batches posted through
this API may interleave only if they are not posted within each other's
critical regions. (In this API a critical region is formed by
*ibv_wr_start()* and *ibv_wr_complete()*/*ibv_wr_abort()*.)

# USAGE

To use these APIs the QP must be created using ibv_create_qp_ex() which allows
setting the **IBV_QP_INIT_ATTR_SEND_OPS_FLAGS** in *comp_mask*. The
*send_ops_flags* should be set to the OR of the work request types that will
be posted to the QP.

If the QP does not support all the requested work request types then QP
creation will fail.

Posting work requests to the QP is done within the critical region formed by
*ibv_wr_start()* and *ibv_wr_complete()*/*ibv_wr_abort()* (see CONCURRENCY below).

Each work request is created by calling a WR builder function (see the table
column WR builder below) to start creating the work request, followed by
allowed/required setter functions described below.

The WR builder and setter combination can be called multiple times to
efficiently post multiple work requests within a single critical region.

Each WR builder will use the *wr_id* member of *struct ibv_qp_ex* to set the
value to be returned in the completion. Some operations will also use the
*wr_flags* member to influence operation (see Flags below). These values
should be set before invoking the WR builder function.

For example, a simple send could be formed as follows:

```C
qpx->wr_id = 1;
ibv_wr_send(qpx);
ibv_wr_set_sge(qpx, lkey, (uintptr_t)&data, sizeof(data));
```

The section WORK REQUESTS describes the various WR builders and setters in
detail.

Posting work is completed by calling *ibv_wr_complete()* or *ibv_wr_abort()*.
No work is submitted to the queue until *ibv_wr_complete()* returns
success. *ibv_wr_abort()* will discard all work prepared since *ibv_wr_start()*.

# WORK REQUESTS

Many of the operations match the opcodes available for *ibv_post_send()*. Each
operation has a WR builder function, a list of allowed setters, and a flag bit
to request the operation with *send_ops_flags* in *struct
ibv_qp_init_attr_ex* (see the EXAMPLE below).

| Operation            | WR builder                | QP Type Supported                | setters  |
|----------------------|---------------------------|----------------------------------|----------|
| ATOMIC_CMP_AND_SWP   | ibv_wr_atomic_cmp_swp()   | RC, XRC_SEND                     | DATA, QP |
| ATOMIC_FETCH_AND_ADD | ibv_wr_atomic_fetch_add() | RC, XRC_SEND                     | DATA, QP |
| BIND_MW              | ibv_wr_bind_mw()          | UC, RC, XRC_SEND                 | NONE     |
| LOCAL_INV            | ibv_wr_local_inv()        | UC, RC, XRC_SEND                 | NONE     |
| RDMA_READ            | ibv_wr_rdma_read()        | RC, XRC_SEND                     | DATA, QP |
| RDMA_WRITE           | ibv_wr_rdma_write()       | UC, RC, XRC_SEND                 | DATA, QP |
| RDMA_WRITE_WITH_IMM  | ibv_wr_rdma_write_imm()   | UC, RC, XRC_SEND                 | DATA, QP |
| SEND                 | ibv_wr_send()             | UD, UC, RC, XRC_SEND, RAW_PACKET | DATA, QP |
| SEND_WITH_IMM        | ibv_wr_send_imm()         | UD, UC, RC, XRC_SEND             | DATA, QP |
| SEND_WITH_INV        | ibv_wr_send_inv()         | UC, RC, XRC_SEND                 | DATA, QP |
| TSO                  | ibv_wr_send_tso()         | UD, RAW_PACKET                   | DATA, QP |


## Atomic operations

Atomic operations are only atomic so long as all writes to memory go only
through the same RDMA hardware. It is not atomic with writes performed by the
CPU, or by other RDMA hardware in the system.

*ibv_wr_atomic_cmp_swp()*
:	If the remote 64 bit memory location specified by *rkey* and *remote_addr*
	equals *compare* then set it to *swap*.

*ibv_wr_atomic_fetch_add()*
:	Add *add* to the 64 bit memory location specified by *rkey* and *remote_addr*.

## Memory Windows

Memory window type 2 operations (see the man page for ibv_alloc_mw).

*ibv_wr_bind_mw()*
:	Bind an MW type 2 specified by **mw**, set a new **rkey** and set its
	properties by **bind_info**.

*ibv_wr_local_inv()*
:	Invalidate an MW type 2 which is associated with **rkey**.

## RDMA

*ibv_wr_rdma_read()*
:	Read from the remote memory location specified by *rkey* and
	*remote_addr*. The number of bytes to read, and the local location to
	store the data, is determined by the DATA buffers set after this call.

*ibv_wr_rdma_write()*, *ibv_wr_rdma_write_imm()*
:	Write to the remote memory location specified by *rkey* and
	*remote_addr*. The number of bytes to write, and the local location to get
	the data, is determined by the DATA buffers set after this call.

	The _imm version causes the remote side to get an IBV_WC_RECV_RDMA_WITH_IMM
	completion containing the 32 bits of immediate data.

## Message Send

*ibv_wr_send()*, *ibv_wr_send_imm()*
:	Send a message. The number of bytes to send, and the local location to get
	the data, is determined by the DATA buffers set after this call.

	The _imm version causes the remote side to get an IBV_WC_RECV completion
	with IBV_WC_WITH_IMM set, containing the 32 bits of immediate data.

*ibv_wr_send_inv()*
:	The data transfer is the same as for *ibv_wr_send()*, however the remote
	side will invalidate the MR specified by *invalidate_rkey* before
	delivering a completion.

*ibv_wr_send_tso()*
:	Produce multiple SEND messages using TCP Segmentation Offload. The SGE
	points to a TCP Stream buffer which will be segmented into
	MSS size SENDs. The hdr includes the entire network headers up to and
	including the TCP header and is prefixed before each segment.

## QP Specific setters

Certain QP types require each post to be accompanied by additional setters;
these setters are mandatory for any operation listing a QP setter in the above
table.

*UD* QPs
:	*ibv_wr_set_ud_addr()* must be called to set the destination address of
	the work.

*XRC_SEND* QPs
:	*ibv_wr_set_xrc_srqn()* must be called to set the destination SRQN field.
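
For example, a UD send could be formed as follows (a sketch assuming *qpx*,
*ah*, *remote_qpn*, *remote_qkey*, *lkey*, *buf* and *len* already exist):

```C
qpx->wr_id = 1;
ibv_wr_send(qpx);                                     /* WR builder */
ibv_wr_set_ud_addr(qpx, ah, remote_qpn, remote_qkey); /* mandatory QP setter for UD */
ibv_wr_set_sge(qpx, lkey, (uintptr_t)buf, len);       /* DATA setter */
```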
+
+## DATA transfer setters
+
+For work that transfers data, one of the following setters should be called
+once after the WR builder:
+
+*ibv_wr_set_sge()*
+:   Transfer data to/from a single buffer given by the lkey, addr and
+    length. This is equivalent to *ibv_wr_set_sge_list()* with a single
+    element.
+
+*ibv_wr_set_sge_list()*
+:   Transfer data to/from a list of buffers, logically concatenated
+    together. Each buffer is specified by an element in an array of *struct
+    ibv_sge*.
+
+Inline setters will copy the send data during the setter call and allow the
+caller to immediately re-use the buffer. This behavior is identical to the
+IBV_SEND_INLINE flag. Generally this copy is done in a way that optimizes
+SEND latency and is suitable for small messages. The provider will limit the
+amount of data it can support in a single operation. This limit is requested
+in the *max_inline_data* member of *struct ibv_qp_init_attr*. Inline setters
+are valid only for SEND and RDMA_WRITE.
+
+*ibv_wr_set_inline_data()*
+:   Copy send data from a single buffer given by the addr and length.
+    This is equivalent to *ibv_wr_set_inline_data_list()* with a single
+    element.
+
+*ibv_wr_set_inline_data_list()*
+:   Copy send data from a list of buffers, logically concatenated
+    together. Each buffer is specified by an element in an array of *struct
+    ibv_data_buf*.
+
+## Flags
+
+A bit mask of flags may be specified in *wr_flags* to control the behavior of
+the work request.
+
+**IBV_SEND_FENCE**
+:   Do not start this work request until prior work has completed.
+
+**IBV_SEND_IP_CSUM**
+:   Offload the IPv4 and TCP/UDP checksum calculation.
+
+**IBV_SEND_SIGNALED**
+:   A completion will be generated in the completion queue for the operation.
+
+**IBV_SEND_SOLICITED**
+:   Set the solicited bit in the RDMA packet. This informs the other side to
+    generate a completion event upon receiving the RDMA operation.
+
+# CONCURRENCY
+
+The provider will perform locking to ensure that *ibv_wr_start()* and
+*ibv_wr_complete()*/*ibv_wr_abort()* form a per-QP critical section where no
+other threads can enter.
+
+If an *ibv_td* is provided during QP creation then no locking will be
+performed and it is up to the caller to ensure that only one thread can be
+within the critical region at a time.
+
+# RETURN VALUE
+
+Applications should use this API in a way that does not create failures. The
+individual APIs do not return a failure indication to avoid branching.
+
+If a failure is detected during operation, for instance due to an invalid
+argument, then *ibv_wr_complete()* will return failure and the entire posting
+will be aborted.
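+
+As an illustration of the resulting pattern (a sketch only; *build_wrs()* is
+a hypothetical application helper that prepares the WR batch and returns
+non-zero on an application-level problem):
+
+```c
+ibv_wr_start(qpx);
+if (build_wrs(qpx)) {
+	/* Do not post: discard everything prepared since
+	 * ibv_wr_start(). */
+	ibv_wr_abort(qpx);
+} else {
+	/* Any deferred failure is reported here; on error the
+	 * entire batch has been discarded by the provider. */
+	ret = ibv_wr_complete(qpx);
+}
+```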
+
+# EXAMPLE
+
+```c
+/* create an RC QP and specify the required send opcodes */
+qp_init_attr_ex.qp_type = IBV_QPT_RC;
+qp_init_attr_ex.comp_mask |= IBV_QP_INIT_ATTR_SEND_OPS_FLAGS;
+qp_init_attr_ex.send_ops_flags |= IBV_QP_EX_WITH_RDMA_WRITE;
+qp_init_attr_ex.send_ops_flags |= IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM;
+
+struct ibv_qp *qp = ibv_create_qp_ex(ctx, &qp_init_attr_ex);
+struct ibv_qp_ex *qpx = ibv_qp_to_qp_ex(qp);
+
+ibv_wr_start(qpx);
+
+/* create 1st WRITE WR entry */
+qpx->wr_id = my_wr_id_1;
+ibv_wr_rdma_write(qpx, rkey, remote_addr_1);
+ibv_wr_set_sge(qpx, lkey, local_addr_1, length_1);
+
+/* create 2nd WRITE_WITH_IMM WR entry */
+qpx->wr_id = my_wr_id_2;
+qpx->wr_flags = IBV_SEND_SIGNALED;
+ibv_wr_rdma_write_imm(qpx, rkey, remote_addr_2, htonl(0x1234));
+ibv_wr_set_sge(qpx, lkey, local_addr_2, length_2);
+
+/* Begin processing WRs */
+ret = ibv_wr_complete(qpx);
+```
+
+# SEE ALSO
+
+**ibv_post_send**(3), **ibv_create_qp_ex**(3).
+
+# AUTHOR
+
+Jason Gunthorpe <jgg@mellanox.com>
+Guy Levi <guyle@mellanox.com>
diff --git a/libibverbs/man/ibv_xsrq_pingpong.1 b/libibverbs/man/ibv_xsrq_pingpong.1
new file mode 100644
index 0000000..b225d9b
--- /dev/null
+++ b/libibverbs/man/ibv_xsrq_pingpong.1
@@ -0,0 +1,75 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH IBV_XSRQ_PINGPONG 1 "May 24, 2016" "libibverbs" "USER COMMANDS"
+
+.SH NAME
+ibv_xsrq_pingpong \- simple InfiniBand shared receive queue test
+
+.SH SYNOPSIS
+.B ibv_xsrq_pingpong
+[\-p port] [\-d device] [\-i ib port] [\-s size] [\-m mtu] [\-c clients]
+[\-n num_tests] [\-l sl] [\-e] [\-g gid index] \fBHOSTNAME\fR
+
+.B ibv_xsrq_pingpong
+[\-p port] [\-d device] [\-i ib port] [\-s size] [\-m mtu] [\-c clients]
+[\-n num_tests] [\-l sl] [\-e] [\-g gid index]
+
+.SH DESCRIPTION
+.PP
+Run a simple ping-pong test over InfiniBand via the extended reliable
+connected (XRC) transport service, using a shared receive queue (SRQ).
+
+.SH OPTIONS
+
+.PP
+.TP
+\fB\-p\fR, \fB\-\-port\fR=\fIPORT\fR
+use TCP port \fIPORT\fR for initial synchronization (default 18515)
+.TP
+\fB\-d\fR, \fB\-\-ib\-dev\fR=\fIDEVICE\fR
+use IB device \fIDEVICE\fR (default first device found)
+.TP
+\fB\-i\fR, \fB\-\-ib\-port\fR=\fIPORT\fR
+use IB port \fIPORT\fR (default port 1)
+.TP
+\fB\-s\fR, \fB\-\-size\fR=\fISIZE\fR
+ping-pong messages of size \fISIZE\fR (default 4096)
+.TP
+\fB\-m\fR, \fB\-\-mtu\fR=\fIMTU\fR
+use path mtu of size \fIMTU\fR (default 2048)
+.TP
+\fB\-c\fR, \fB\-\-clients\fR=\fICLIENTS\fR
+number of clients \fICLIENTS\fR (on server only, default 1)
+.TP
+\fB\-n\fR, \fB\-\-num\-tests\fR=\fINUM_TESTS\fR
+perform \fINUM_TESTS\fR tests per client (default 5)
+.TP
+\fB\-l\fR, \fB\-\-sl\fR=\fISL\fR
+use \fISL\fR as the service level value (default 0)
+.TP
+\fB\-e\fR, \fB\-\-events\fR
+sleep while waiting for work completion events (default is to poll for
+completions)
+.TP
+\fB\-g\fR, \fB\-\-gid\-idx\fR=\fIGIDINDEX\fR
+use \fIGIDINDEX\fR as the GID index of the local port
+
+.SH SEE ALSO
+.BR ibv_rc_pingpong (1),
+.BR ibv_uc_pingpong (1),
+.BR ibv_ud_pingpong (1),
+.BR ibv_srq_pingpong (1)
+
+.SH AUTHORS
+.TP
+Roland Dreier
+.RI < roland@purestorage.com >
+.TP
+Jarod Wilson
+.RI < jarod@redhat.com >
+
+.SH BUGS
+The network synchronization between client and server instances is
+weak, and does not prevent incompatible options from being used on the
+two instances. The method used for retrieving work completions is not
+strictly correct, and race conditions may cause failures on some
+systems.
diff --git a/libibverbs/marshall.c b/libibverbs/marshall.c new file mode 100644 index 0000000..20c2ba9 --- /dev/null +++ b/libibverbs/marshall.c @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <config.h> + +#include <string.h> + +#include <infiniband/marshall.h> + +void ibv_copy_ah_attr_from_kern(struct ibv_ah_attr *dst, + struct ib_uverbs_ah_attr *src) +{ + memcpy(dst->grh.dgid.raw, src->grh.dgid, sizeof dst->grh.dgid); + dst->grh.flow_label = src->grh.flow_label; + dst->grh.sgid_index = src->grh.sgid_index; + dst->grh.hop_limit = src->grh.hop_limit; + dst->grh.traffic_class = src->grh.traffic_class; + + dst->dlid = src->dlid; + dst->sl = src->sl; + dst->src_path_bits = src->src_path_bits; + dst->static_rate = src->static_rate; + dst->is_global = src->is_global; + dst->port_num = src->port_num; +} + +void ibv_copy_qp_attr_from_kern(struct ibv_qp_attr *dst, + struct ib_uverbs_qp_attr *src) +{ + dst->cur_qp_state = src->cur_qp_state; + dst->path_mtu = src->path_mtu; + dst->path_mig_state = src->path_mig_state; + dst->qkey = src->qkey; + dst->rq_psn = src->rq_psn; + dst->sq_psn = src->sq_psn; + dst->dest_qp_num = src->dest_qp_num; + dst->qp_access_flags = src->qp_access_flags; + + dst->cap.max_send_wr = src->max_send_wr; + dst->cap.max_recv_wr = src->max_recv_wr; + dst->cap.max_send_sge = src->max_send_sge; + dst->cap.max_recv_sge = src->max_recv_sge; + dst->cap.max_inline_data = src->max_inline_data; + + ibv_copy_ah_attr_from_kern(&dst->ah_attr, &src->ah_attr); + ibv_copy_ah_attr_from_kern(&dst->alt_ah_attr, &src->alt_ah_attr); + + dst->pkey_index = src->pkey_index; + dst->alt_pkey_index = src->alt_pkey_index; + dst->en_sqd_async_notify = src->en_sqd_async_notify; + dst->sq_draining = src->sq_draining; + dst->max_rd_atomic = src->max_rd_atomic; + dst->max_dest_rd_atomic = src->max_dest_rd_atomic; + dst->min_rnr_timer = src->min_rnr_timer; + dst->port_num = src->port_num; + dst->timeout = src->timeout; + dst->retry_cnt = src->retry_cnt; + dst->rnr_retry = src->rnr_retry; + dst->alt_port_num = src->alt_port_num; + dst->alt_timeout = src->alt_timeout; +} + +void ibv_copy_path_rec_from_kern(struct ibv_sa_path_rec *dst, + struct 
ib_user_path_rec *src) +{ + memcpy(dst->dgid.raw, src->dgid, sizeof dst->dgid); + memcpy(dst->sgid.raw, src->sgid, sizeof dst->sgid); + + dst->dlid = src->dlid; + dst->slid = src->slid; + dst->raw_traffic = src->raw_traffic; + dst->flow_label = src->flow_label; + dst->hop_limit = src->hop_limit; + dst->traffic_class = src->traffic_class; + dst->reversible = src->reversible; + dst->numb_path = src->numb_path; + dst->pkey = src->pkey; + dst->sl = src->sl; + dst->mtu_selector = src->mtu_selector; + dst->mtu = src->mtu; + dst->rate_selector = src->rate_selector; + dst->rate = src->rate; + dst->packet_life_time = src->packet_life_time; + dst->preference = src->preference; + dst->packet_life_time_selector = src->packet_life_time_selector; +} + +void ibv_copy_path_rec_to_kern(struct ib_user_path_rec *dst, + struct ibv_sa_path_rec *src) +{ + memcpy(dst->dgid, src->dgid.raw, sizeof src->dgid); + memcpy(dst->sgid, src->sgid.raw, sizeof src->sgid); + + dst->dlid = src->dlid; + dst->slid = src->slid; + dst->raw_traffic = src->raw_traffic; + dst->flow_label = src->flow_label; + dst->hop_limit = src->hop_limit; + dst->traffic_class = src->traffic_class; + dst->reversible = src->reversible; + dst->numb_path = src->numb_path; + dst->pkey = src->pkey; + dst->sl = src->sl; + dst->mtu_selector = src->mtu_selector; + dst->mtu = src->mtu; + dst->rate_selector = src->rate_selector; + dst->rate = src->rate; + dst->packet_life_time = src->packet_life_time; + dst->preference = src->preference; + dst->packet_life_time_selector = src->packet_life_time_selector; +} diff --git a/libibverbs/marshall.h b/libibverbs/marshall.h new file mode 100644 index 0000000..723f4f4 --- /dev/null +++ b/libibverbs/marshall.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef INFINIBAND_MARSHALL_H +#define INFINIBAND_MARSHALL_H + +#include <infiniband/verbs.h> +#include <infiniband/sa.h> +#include <infiniband/kern-abi.h> +#include <rdma/ib_user_sa.h> + +#ifdef __cplusplus +extern "C" { +#endif + +void ibv_copy_qp_attr_from_kern(struct ibv_qp_attr *dst, + struct ib_uverbs_qp_attr *src); + +void ibv_copy_ah_attr_from_kern(struct ibv_ah_attr *dst, + struct ib_uverbs_ah_attr *src); + +void ibv_copy_path_rec_from_kern(struct ibv_sa_path_rec *dst, + struct ib_user_path_rec *src); + +void ibv_copy_path_rec_to_kern(struct ib_user_path_rec *dst, + struct ibv_sa_path_rec *src); + +#ifdef __cplusplus +} +#endif + +#endif /* INFINIBAND_MARSHALL_H */ diff --git a/libibverbs/memory.c b/libibverbs/memory.c new file mode 100644 index 0000000..2b1c1ae --- /dev/null +++ b/libibverbs/memory.c @@ -0,0 +1,704 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <config.h> + +#include <errno.h> +#include <sys/mman.h> +#include <unistd.h> +#include <stdlib.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <dirent.h> +#include <limits.h> +#include <inttypes.h> + +#include "ibverbs.h" + +struct ibv_mem_node { + enum { + IBV_RED, + IBV_BLACK + } color; + struct ibv_mem_node *parent; + struct ibv_mem_node *left, *right; + uintptr_t start, end; + int refcnt; +}; + +static struct ibv_mem_node *mm_root; +static pthread_mutex_t mm_mutex = PTHREAD_MUTEX_INITIALIZER; +static int page_size; +static int huge_page_enabled; +static int too_late; + +static unsigned long smaps_page_size(FILE *file) +{ + int n; + unsigned long size = page_size; + char buf[1024]; + + while (fgets(buf, sizeof(buf), file) != NULL) { + if (!strstr(buf, "KernelPageSize:")) + continue; + + n = sscanf(buf, "%*s %lu", &size); + if (n < 1) + continue; + + /* page size is printed in Kb */ + size = size * 1024; + + break; + } + + return size; +} + +static unsigned long get_page_size(void *base) +{ + unsigned long ret = page_size; + pid_t pid; + FILE *file; + char buf[1024]; + + pid = getpid(); + snprintf(buf, sizeof(buf), "/proc/%d/smaps", pid); + + file = fopen(buf, "r" STREAM_CLOEXEC); + if (!file) + goto out; + + while (fgets(buf, sizeof(buf), file) != NULL) { + int n; + uintptr_t range_start, range_end; + + n = sscanf(buf, "%" SCNxPTR "-%" SCNxPTR, &range_start, &range_end); + + if (n < 2) + continue; + + if ((uintptr_t) base >= range_start && (uintptr_t) base < range_end) { + ret = smaps_page_size(file); + break; + } + } + + fclose(file); + +out: + return ret; +} + +int ibv_fork_init(void) +{ + void *tmp, *tmp_aligned; + int ret; + unsigned long size; + + if (getenv("RDMAV_HUGEPAGES_SAFE")) + huge_page_enabled = 1; + + if (mm_root) + return 0; + + if (too_late) + return EINVAL; + + page_size = sysconf(_SC_PAGESIZE); + if (page_size < 0) + return errno; + + if (posix_memalign(&tmp, page_size, page_size)) + return ENOMEM; + + if (huge_page_enabled) { + size = get_page_size(tmp); + tmp_aligned = (void *) ((uintptr_t) tmp & ~(size - 1)); + } else { + size = page_size; + tmp_aligned = tmp; + } + + ret = madvise(tmp_aligned, size, MADV_DONTFORK) || + madvise(tmp_aligned, size, MADV_DOFORK); + + free(tmp); + + if (ret) + return ENOSYS; + + mm_root = malloc(sizeof *mm_root); + if (!mm_root) + return ENOMEM; + + mm_root->parent = NULL; + mm_root->left = NULL; + mm_root->right = NULL; + mm_root->color = IBV_BLACK; + mm_root->start = 0; + mm_root->end = UINTPTR_MAX; + mm_root->refcnt = 0; + + return 0; +} + +static struct ibv_mem_node *__mm_prev(struct ibv_mem_node *node) +{ + if (node->left) { + node = node->left; + while (node->right) + node = node->right; + } else { + while (node->parent && node == node->parent->left) + node = node->parent; + + node = node->parent; + } + + return node; +} + +static struct ibv_mem_node *__mm_next(struct ibv_mem_node *node) +{ + if (node->right) { + node = node->right; + while (node->left) + node = node->left; + } else { + while (node->parent && node == node->parent->right) + node = node->parent; + + node = node->parent; + } + + return node; +} + +static void __mm_rotate_right(struct ibv_mem_node *node) +{ + struct ibv_mem_node *tmp; + + tmp = node->left; + + node->left = tmp->right; + if (node->left) + node->left->parent = node; + + if (node->parent) { + if (node->parent->right == node) + node->parent->right = tmp; + else + node->parent->left = tmp; + } else + mm_root = tmp; + + tmp->parent = node->parent; + + tmp->right 
= node; + node->parent = tmp; +} + +static void __mm_rotate_left(struct ibv_mem_node *node) +{ + struct ibv_mem_node *tmp; + + tmp = node->right; + + node->right = tmp->left; + if (node->right) + node->right->parent = node; + + if (node->parent) { + if (node->parent->right == node) + node->parent->right = tmp; + else + node->parent->left = tmp; + } else + mm_root = tmp; + + tmp->parent = node->parent; + + tmp->left = node; + node->parent = tmp; +} + +#if 0 +static int verify(struct ibv_mem_node *node) +{ + int hl, hr; + + if (!node) + return 1; + + hl = verify(node->left); + hr = verify(node->left); + + if (!hl || !hr) + return 0; + if (hl != hr) + return 0; + + if (node->color == IBV_RED) { + if (node->left && node->left->color != IBV_BLACK) + return 0; + if (node->right && node->right->color != IBV_BLACK) + return 0; + return hl; + } + + return hl + 1; +} +#endif + +static void __mm_add_rebalance(struct ibv_mem_node *node) +{ + struct ibv_mem_node *parent, *gp, *uncle; + + while (node->parent && node->parent->color == IBV_RED) { + parent = node->parent; + gp = node->parent->parent; + + if (parent == gp->left) { + uncle = gp->right; + + if (uncle && uncle->color == IBV_RED) { + parent->color = IBV_BLACK; + uncle->color = IBV_BLACK; + gp->color = IBV_RED; + + node = gp; + } else { + if (node == parent->right) { + __mm_rotate_left(parent); + node = parent; + parent = node->parent; + } + + parent->color = IBV_BLACK; + gp->color = IBV_RED; + + __mm_rotate_right(gp); + } + } else { + uncle = gp->left; + + if (uncle && uncle->color == IBV_RED) { + parent->color = IBV_BLACK; + uncle->color = IBV_BLACK; + gp->color = IBV_RED; + + node = gp; + } else { + if (node == parent->left) { + __mm_rotate_right(parent); + node = parent; + parent = node->parent; + } + + parent->color = IBV_BLACK; + gp->color = IBV_RED; + + __mm_rotate_left(gp); + } + } + } + + mm_root->color = IBV_BLACK; +} + +static void __mm_add(struct ibv_mem_node *new) +{ + struct ibv_mem_node *node, *parent = NULL; + + node = mm_root; + while (node) { + parent = node; + if (node->start < new->start) + node = node->right; + else + node = node->left; + } + + if (parent->start < new->start) + parent->right = new; + else + parent->left = new; + + new->parent = parent; + new->left = NULL; + new->right = NULL; + + new->color = IBV_RED; + __mm_add_rebalance(new); +} + +static void __mm_remove(struct ibv_mem_node *node) +{ + struct ibv_mem_node *child, *parent, *sib, *tmp; + int nodecol; + + if (node->left && node->right) { + tmp = node->left; + while (tmp->right) + tmp = tmp->right; + + nodecol = tmp->color; + child = tmp->left; + tmp->color = node->color; + + if (tmp->parent != node) { + parent = tmp->parent; + parent->right = tmp->left; + if (tmp->left) + tmp->left->parent = parent; + + tmp->left = node->left; + node->left->parent = tmp; + } else + parent = tmp; + + tmp->right = node->right; + node->right->parent = tmp; + + tmp->parent = node->parent; + if (node->parent) { + if (node->parent->left == node) + node->parent->left = tmp; + else + node->parent->right = tmp; + } else + mm_root = tmp; + } else { + nodecol = node->color; + + child = node->left ? 
node->left : node->right; + parent = node->parent; + + if (child) + child->parent = parent; + if (parent) { + if (parent->left == node) + parent->left = child; + else + parent->right = child; + } else + mm_root = child; + } + + free(node); + + if (nodecol == IBV_RED) + return; + + while ((!child || child->color == IBV_BLACK) && child != mm_root) { + if (parent->left == child) { + sib = parent->right; + + if (sib->color == IBV_RED) { + parent->color = IBV_RED; + sib->color = IBV_BLACK; + __mm_rotate_left(parent); + sib = parent->right; + } + + if ((!sib->left || sib->left->color == IBV_BLACK) && + (!sib->right || sib->right->color == IBV_BLACK)) { + sib->color = IBV_RED; + child = parent; + parent = child->parent; + } else { + if (!sib->right || sib->right->color == IBV_BLACK) { + if (sib->left) + sib->left->color = IBV_BLACK; + sib->color = IBV_RED; + __mm_rotate_right(sib); + sib = parent->right; + } + + sib->color = parent->color; + parent->color = IBV_BLACK; + if (sib->right) + sib->right->color = IBV_BLACK; + __mm_rotate_left(parent); + child = mm_root; + break; + } + } else { + sib = parent->left; + + if (sib->color == IBV_RED) { + parent->color = IBV_RED; + sib->color = IBV_BLACK; + __mm_rotate_right(parent); + sib = parent->left; + } + + if ((!sib->left || sib->left->color == IBV_BLACK) && + (!sib->right || sib->right->color == IBV_BLACK)) { + sib->color = IBV_RED; + child = parent; + parent = child->parent; + } else { + if (!sib->left || sib->left->color == IBV_BLACK) { + if (sib->right) + sib->right->color = IBV_BLACK; + sib->color = IBV_RED; + __mm_rotate_left(sib); + sib = parent->left; + } + + sib->color = parent->color; + parent->color = IBV_BLACK; + if (sib->left) + sib->left->color = IBV_BLACK; + __mm_rotate_right(parent); + child = mm_root; + break; + } + } + } + + if (child) + child->color = IBV_BLACK; +} + +static struct ibv_mem_node *__mm_find_start(uintptr_t start, uintptr_t end) +{ + struct ibv_mem_node *node = mm_root; + + while (node) { + if (node->start <= start && node->end >= start) + break; + + if (node->start < start) + node = node->right; + else + node = node->left; + } + + return node; +} + +static struct ibv_mem_node *merge_ranges(struct ibv_mem_node *node, + struct ibv_mem_node *prev) +{ + prev->end = node->end; + prev->refcnt = node->refcnt; + __mm_remove(node); + + return prev; +} + +static struct ibv_mem_node *split_range(struct ibv_mem_node *node, + uintptr_t cut_line) +{ + struct ibv_mem_node *new_node = NULL; + + new_node = malloc(sizeof *new_node); + if (!new_node) + return NULL; + new_node->start = cut_line; + new_node->end = node->end; + new_node->refcnt = node->refcnt; + node->end = cut_line - 1; + __mm_add(new_node); + + return new_node; +} + +static struct ibv_mem_node *get_start_node(uintptr_t start, uintptr_t end, + int inc) +{ + struct ibv_mem_node *node, *tmp = NULL; + + node = __mm_find_start(start, end); + if (node->start < start) + node = split_range(node, start); + else { + tmp = __mm_prev(node); + if (tmp && tmp->refcnt == node->refcnt + inc) + node = merge_ranges(node, tmp); + } + return node; +} + +/* + * This function is called if madvise() fails to undo merging/splitting + * operations performed on the node. + */ +static struct ibv_mem_node *undo_node(struct ibv_mem_node *node, + uintptr_t start, int inc) +{ + struct ibv_mem_node *tmp = NULL; + + /* + * This condition can be true only if we merged this + * node with the previous one, so we need to split them. 
+ */ + if (start > node->start) { + tmp = split_range(node, start); + if (tmp) { + node->refcnt += inc; + node = tmp; + } else + return NULL; + } + + tmp = __mm_prev(node); + if (tmp && tmp->refcnt == node->refcnt) + node = merge_ranges(node, tmp); + + tmp = __mm_next(node); + if (tmp && tmp->refcnt == node->refcnt) + node = merge_ranges(tmp, node); + + return node; +} + +static int ibv_madvise_range(void *base, size_t size, int advice) +{ + uintptr_t start, end; + struct ibv_mem_node *node, *tmp; + int inc; + int rolling_back = 0; + int ret = 0; + unsigned long range_page_size; + + if (!size || !base) + return 0; + + if (huge_page_enabled) + range_page_size = get_page_size(base); + else + range_page_size = page_size; + + start = (uintptr_t) base & ~(range_page_size - 1); + end = ((uintptr_t) (base + size + range_page_size - 1) & + ~(range_page_size - 1)) - 1; + + pthread_mutex_lock(&mm_mutex); +again: + inc = advice == MADV_DONTFORK ? 1 : -1; + + node = get_start_node(start, end, inc); + if (!node) { + ret = -1; + goto out; + } + + while (node && node->start <= end) { + if (node->end > end) { + if (!split_range(node, end + 1)) { + ret = -1; + goto out; + } + } + + if ((inc == -1 && node->refcnt == 1) || + (inc == 1 && node->refcnt == 0)) { + /* + * If this is the first time through the loop, + * and we merged this node with the previous + * one, then we only want to do the madvise() + * on start ... node->end (rather than + * starting at node->start). + * + * Otherwise we end up doing madvise() on + * bigger region than we're being asked to, + * and that may lead to a spurious failure. + */ + if (start > node->start) + ret = madvise((void *) start, node->end - start + 1, + advice); + else + ret = madvise((void *) node->start, + node->end - node->start + 1, + advice); + if (ret) { + node = undo_node(node, start, inc); + + if (rolling_back || !node) + goto out; + + /* madvise failed, roll back previous changes */ + rolling_back = 1; + advice = advice == MADV_DONTFORK ? 
+ MADV_DOFORK : MADV_DONTFORK; + tmp = __mm_prev(node); + if (!tmp || start > tmp->end) + goto out; + end = tmp->end; + goto again; + } + } + + node->refcnt += inc; + node = __mm_next(node); + } + + if (node) { + tmp = __mm_prev(node); + if (tmp && node->refcnt == tmp->refcnt) + node = merge_ranges(node, tmp); + } + +out: + if (rolling_back) + ret = -1; + + pthread_mutex_unlock(&mm_mutex); + + return ret; +} + +int ibv_dontfork_range(void *base, size_t size) +{ + if (mm_root) + return ibv_madvise_range(base, size, MADV_DONTFORK); + else { + too_late = 1; + return 0; + } +} + +int ibv_dofork_range(void *base, size_t size) +{ + if (mm_root) + return ibv_madvise_range(base, size, MADV_DOFORK); + else { + too_late = 1; + return 0; + } +} diff --git a/libibverbs/neigh.c b/libibverbs/neigh.c new file mode 100644 index 0000000..fa3cbf5 --- /dev/null +++ b/libibverbs/neigh.c @@ -0,0 +1,837 @@ +/* Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md + */ + +#include "config.h" +#include <linux/netlink.h> +#include <linux/rtnetlink.h> +#include <endian.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> + +#if HAVE_WORKING_IF_H +#include <net/if.h> +#endif + +#include <netlink/route/rtnl.h> +#include <netlink/route/link.h> +#include <netlink/route/route.h> +#include <netlink/route/neighbour.h> + +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/timerfd.h> +#include <errno.h> +#include <unistd.h> +#include <ifaddrs.h> +#include <netdb.h> +#include <assert.h> + +#if !HAVE_WORKING_IF_H +/* We need this decl from net/if.h but old systems do not let use co-include + net/if.h and netlink/route/link.h */ +extern unsigned int if_nametoindex(__const char *__ifname) __THROW; +#endif + +/* for PFX */ +#include "ibverbs.h" +#include <ccan/minmax.h> + +#include "neigh.h" + +#include <netlink/route/link/vlan.h> + +union sktaddr { + struct sockaddr s; + struct sockaddr_in s4; + struct sockaddr_in6 s6; +}; + +struct skt { + union sktaddr sktaddr; + socklen_t len; +}; + +static int set_link_port(union sktaddr *s, __be16 port, int oif) +{ + switch (s->s.sa_family) { + case AF_INET: + s->s4.sin_port = port; + break; + case AF_INET6: + s->s6.sin6_port = port; + s->s6.sin6_scope_id = oif; + break; + default: + return -EINVAL; + } + + return 0; +} + +static bool cmp_address(const struct sockaddr *s1, + const struct sockaddr *s2) +{ + if (s1->sa_family != s2->sa_family) + return false; + + switch (s1->sa_family) { + case AF_INET: + return ((struct sockaddr_in *)s1)->sin_addr.s_addr == + ((struct sockaddr_in *)s2)->sin_addr.s_addr; + case AF_INET6: + return !memcmp( + ((struct sockaddr_in6 *)s1)->sin6_addr.s6_addr, + ((struct sockaddr_in6 *)s2)->sin6_addr.s6_addr, + sizeof(((struct sockaddr_in6 *)s1)->sin6_addr.s6_addr)); + default: + return false; + } +} + +static int get_ifindex(const struct sockaddr *s) +{ + struct ifaddrs *ifaddr, *ifa; + int name2index = -ENODEV; + + if (-1 == getifaddrs(&ifaddr)) + return errno; + + for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) { + if (ifa->ifa_addr == NULL) + continue; + + if (cmp_address(ifa->ifa_addr, s)) { + name2index = if_nametoindex(ifa->ifa_name); + break; + } + } + + freeifaddrs(ifaddr); + + return name2index; +} + +static struct nl_addr *get_neigh_mac(struct get_neigh_handler *neigh_handler) +{ + struct rtnl_neigh *neigh; + struct nl_addr *ll_addr = NULL; + + /* future optimization - if link local address - parse address and + * return mac now instead of doing so after the routing CB. 
This + * is of course referred to GIDs */ + neigh = rtnl_neigh_get(neigh_handler->neigh_cache, + neigh_handler->oif, + neigh_handler->dst); + if (neigh == NULL) + return NULL; + + ll_addr = rtnl_neigh_get_lladdr(neigh); + if (NULL != ll_addr) + ll_addr = nl_addr_clone(ll_addr); + + rtnl_neigh_put(neigh); + return ll_addr; +} + +static void get_neigh_cb_event(struct nl_object *obj, void *arg) +{ + struct get_neigh_handler *neigh_handler = + (struct get_neigh_handler *)arg; + /* assumed serilized callback (no parallel execution of function) */ + if (nl_object_match_filter( + obj, + (struct nl_object *)neigh_handler->filter_neigh)) { + struct rtnl_neigh *neigh = (struct rtnl_neigh *)obj; + /* check that we didn't set it already */ + if (neigh_handler->found_ll_addr == NULL) { + if (rtnl_neigh_get_lladdr(neigh) == NULL) + return; + + neigh_handler->found_ll_addr = + nl_addr_clone(rtnl_neigh_get_lladdr(neigh)); + } + } +} + +static int get_neigh_cb(struct nl_msg *msg, void *arg) +{ + struct get_neigh_handler *neigh_handler = + (struct get_neigh_handler *)arg; + + if (nl_msg_parse(msg, &get_neigh_cb_event, neigh_handler) < 0) + errno = ENOMSG; + + return NL_OK; +} + +static void set_neigh_filter(struct get_neigh_handler *neigh_handler, + struct rtnl_neigh *filter) { + neigh_handler->filter_neigh = filter; +} + +static struct rtnl_neigh *create_filter_neigh_for_dst(struct nl_addr *dst_addr, + int oif) +{ + struct rtnl_neigh *filter_neigh; + + filter_neigh = rtnl_neigh_alloc(); + if (filter_neigh == NULL) + return NULL; + + rtnl_neigh_set_ifindex(filter_neigh, oif); + rtnl_neigh_set_dst(filter_neigh, dst_addr); + + return filter_neigh; +} + +#define PORT_DISCARD htobe16(9) +#define SEND_PAYLOAD "H" + +static int create_socket(struct get_neigh_handler *neigh_handler, + struct skt *addr_dst, int *psock_fd) +{ + int err; + struct skt addr_src; + int sock_fd; + + memset(addr_dst, 0, sizeof(*addr_dst)); + memset(&addr_src, 0, sizeof(addr_src)); + addr_src.len = sizeof(addr_src.sktaddr); + + err = nl_addr_fill_sockaddr(neigh_handler->src, + &addr_src.sktaddr.s, + &addr_src.len); + if (err) { + errno = EADDRNOTAVAIL; + return -1; + } + + addr_dst->len = sizeof(addr_dst->sktaddr); + err = nl_addr_fill_sockaddr(neigh_handler->dst, + &addr_dst->sktaddr.s, + &addr_dst->len); + if (err) { + errno = EADDRNOTAVAIL; + return -1; + } + + err = set_link_port(&addr_dst->sktaddr, PORT_DISCARD, + neigh_handler->oif); + if (err) + return -1; + + sock_fd = socket(addr_dst->sktaddr.s.sa_family, + SOCK_DGRAM | SOCK_CLOEXEC, 0); + if (sock_fd == -1) + return -1; + err = bind(sock_fd, &addr_src.sktaddr.s, addr_src.len); + if (err) { + close(sock_fd); + return -1; + } + + *psock_fd = sock_fd; + + return 0; +} + +#define NUM_OF_RETRIES 10 +#define NUM_OF_TRIES ((NUM_OF_RETRIES) + 1) +#if NUM_OF_TRIES < 1 +#error "neigh: invalid value of NUM_OF_RETRIES" +#endif +static int create_timer(struct get_neigh_handler *neigh_handler) +{ + int user_timeout = neigh_handler->timeout/NUM_OF_TRIES; + struct timespec timeout = { + .tv_sec = user_timeout / 1000, + .tv_nsec = (user_timeout % 1000) * 1000000 + }; + struct itimerspec timer_time = {.it_value = timeout}; + int timer_fd; + + timer_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK | TFD_CLOEXEC); + if (timer_fd == -1) + return timer_fd; + + if (neigh_handler->timeout) { + if (NUM_OF_TRIES <= 1) + bzero(&timer_time.it_interval, + sizeof(timer_time.it_interval)); + else + timer_time.it_interval = timeout; + if (timerfd_settime(timer_fd, 0, &timer_time, NULL)) { + close(timer_fd); + 
return -1; + } + } + + return timer_fd; +} + +#define UDP_SOCKET_MAX_SENDTO 100000ULL +static int try_send_to(int sock_fd, void *buff, size_t buf_size, + struct skt *addr_dst) +{ + uint64_t max_count = UDP_SOCKET_MAX_SENDTO; + int err; + + do { + err = sendto(sock_fd, buff, buf_size, 0, + &addr_dst->sktaddr.s, + addr_dst->len); + if (err > 0) + err = 0; + } while (-1 == err && EADDRNOTAVAIL == errno && --max_count); + + return err; +} + +static struct nl_addr *process_get_neigh_mac( + struct get_neigh_handler *neigh_handler) +{ + int err; + struct nl_addr *ll_addr = get_neigh_mac(neigh_handler); + struct rtnl_neigh *neigh_filter; + fd_set fdset; + int sock_fd; + int fd; + int nfds; + int timer_fd; + int ret; + struct skt addr_dst; + char buff[sizeof(SEND_PAYLOAD)] = SEND_PAYLOAD; + int retries = 0; + + if (NULL != ll_addr) + return ll_addr; + + err = nl_socket_add_membership(neigh_handler->sock, + RTNLGRP_NEIGH); + if (err < 0) + return NULL; + + neigh_filter = create_filter_neigh_for_dst(neigh_handler->dst, + neigh_handler->oif); + if (neigh_filter == NULL) + return NULL; + + set_neigh_filter(neigh_handler, neigh_filter); + + nl_socket_disable_seq_check(neigh_handler->sock); + nl_socket_modify_cb(neigh_handler->sock, NL_CB_VALID, NL_CB_CUSTOM, + &get_neigh_cb, neigh_handler); + + fd = nl_socket_get_fd(neigh_handler->sock); + + err = create_socket(neigh_handler, &addr_dst, &sock_fd); + + if (err) + return NULL; + + err = try_send_to(sock_fd, buff, sizeof(buff), &addr_dst); + if (err) + goto close_socket; + + timer_fd = create_timer(neigh_handler); + if (timer_fd < 0) + goto close_socket; + + nfds = max(fd, timer_fd) + 1; + + while (1) { + FD_ZERO(&fdset); + FD_SET(fd, &fdset); + FD_SET(timer_fd, &fdset); + + /* wait for an incoming message on the netlink socket */ + ret = select(nfds, &fdset, NULL, NULL, NULL); + if (ret == -1) { + goto select_err; + } else if (ret) { + if (FD_ISSET(fd, &fdset)) { + nl_recvmsgs_default(neigh_handler->sock); + if (neigh_handler->found_ll_addr) + break; + } else { + nl_cache_refill(neigh_handler->sock, + neigh_handler->neigh_cache); + ll_addr = get_neigh_mac(neigh_handler); + if (NULL != ll_addr) { + break; + } else if (FD_ISSET(timer_fd, &fdset) && + retries < NUM_OF_RETRIES) { + try_send_to(sock_fd, buff, sizeof(buff), + &addr_dst); + } + } + + if (FD_ISSET(timer_fd, &fdset)) { + uint64_t read_val; + ssize_t __attribute__((unused)) rc; + + rc = + read(timer_fd, &read_val, sizeof(read_val)); + assert(rc == sizeof(read_val)); + if (++retries >= NUM_OF_TRIES) { + if (!errno) + errno = EDESTADDRREQ; + break; + } + } + } + } +select_err: + close(timer_fd); +close_socket: + close(sock_fd); + return ll_addr ? ll_addr : neigh_handler->found_ll_addr; +} + +static int get_mcast_mac_ipv4(struct nl_addr *dst, struct nl_addr **ll_addr) +{ + uint8_t mac_addr[6] = {0x01, 0x00, 0x5E}; + uint32_t addr = be32toh(*(__be32 *)nl_addr_get_binary_addr(dst)); + + mac_addr[5] = addr & 0xFF; + addr >>= 8; + mac_addr[4] = addr & 0xFF; + addr >>= 8; + mac_addr[3] = addr & 0x7F; + + *ll_addr = nl_addr_build(AF_LLC, mac_addr, sizeof(mac_addr)); + + return *ll_addr == NULL ? -EINVAL : 0; +} + +static int get_mcast_mac_ipv6(struct nl_addr *dst, struct nl_addr **ll_addr) +{ + uint8_t mac_addr[6] = {0x33, 0x33}; + + memcpy(mac_addr + 2, (uint8_t *)nl_addr_get_binary_addr(dst) + 12, 4); + + *ll_addr = nl_addr_build(AF_LLC, mac_addr, sizeof(mac_addr)); + + return *ll_addr == NULL ? 
-EINVAL : 0; +} + +static int get_link_local_mac_ipv6(struct nl_addr *dst, + struct nl_addr **ll_addr) +{ + uint8_t mac_addr[6]; + + memcpy(mac_addr + 3, (uint8_t *)nl_addr_get_binary_addr(dst) + 13, 3); + memcpy(mac_addr, (uint8_t *)nl_addr_get_binary_addr(dst) + 8, 3); + mac_addr[0] ^= 2; + + *ll_addr = nl_addr_build(AF_LLC, mac_addr, sizeof(mac_addr)); + return *ll_addr == NULL ? -EINVAL : 0; +} + +static const struct encoded_l3_addr { + short family; + uint8_t prefix_bits; + const uint8_t data[16]; + int (*getter)(struct nl_addr *dst, struct nl_addr **ll_addr); +} encoded_prefixes[] = { + {.family = AF_INET, + .prefix_bits = 4, + .data = {0xe0}, + .getter = &get_mcast_mac_ipv4}, + {.family = AF_INET6, + .prefix_bits = 8, + .data = {0xff}, + .getter = &get_mcast_mac_ipv6}, + {.family = AF_INET6, + .prefix_bits = 64, + .data = {0xfe, 0x80}, + .getter = get_link_local_mac_ipv6}, +}; + +static int nl_addr_cmp_prefix_msb(void *addr1, int len1, void *addr2, int len2) +{ + int len = min(len1, len2); + int bytes = len / 8; + int d = memcmp(addr1, addr2, bytes); + + if (d == 0) { + int mask = ((1UL << (len % 8)) - 1UL) << (8 - len); + + d = (((uint8_t *)addr1)[bytes] & mask) - + (((uint8_t *)addr2)[bytes] & mask); + } + + return d; +} + +static int handle_encoded_mac(struct nl_addr *dst, struct nl_addr **ll_addr) +{ + uint32_t family = nl_addr_get_family(dst); + struct nl_addr *prefix = NULL; + int i; + int ret = 1; + + for (i = 0; + i < sizeof(encoded_prefixes)/sizeof(encoded_prefixes[0]) && + ret; prefix = NULL, i++) { + if (encoded_prefixes[i].family != family) + continue; + + prefix = nl_addr_build( + family, (void *)encoded_prefixes[i].data, + min_t(size_t, encoded_prefixes[i].prefix_bits / 8 + + !!(encoded_prefixes[i].prefix_bits % 8), + sizeof(encoded_prefixes[i].data))); + + if (prefix == NULL) + return -ENOMEM; + nl_addr_set_prefixlen(prefix, + encoded_prefixes[i].prefix_bits); + + if (nl_addr_cmp_prefix_msb(nl_addr_get_binary_addr(dst), + nl_addr_get_prefixlen(dst), + nl_addr_get_binary_addr(prefix), + nl_addr_get_prefixlen(prefix))) + continue; + + ret = encoded_prefixes[i].getter(dst, ll_addr); + nl_addr_put(prefix); + } + + return ret; +} + +static void get_route_cb_parser(struct nl_object *obj, void *arg) +{ + struct get_neigh_handler *neigh_handler = + (struct get_neigh_handler *)arg; + + struct rtnl_route *route = (struct rtnl_route *)obj; + struct nl_addr *gateway = NULL; + struct nl_addr *src = rtnl_route_get_pref_src(route); + int oif; + int type = rtnl_route_get_type(route); + struct rtnl_link *link; + + struct rtnl_nexthop *nh = rtnl_route_nexthop_n(route, 0); + + if (nh != NULL) + gateway = rtnl_route_nh_get_gateway(nh); + oif = rtnl_route_nh_get_ifindex(nh); + + if (gateway) { + nl_addr_put(neigh_handler->dst); + neigh_handler->dst = nl_addr_clone(gateway); + } + + if (RTN_BLACKHOLE == type || + RTN_UNREACHABLE == type || + RTN_PROHIBIT == type || + RTN_THROW == type) { + errno = ENETUNREACH; + goto err; + } + + if (!neigh_handler->src && src) + neigh_handler->src = nl_addr_clone(src); + + if (neigh_handler->oif < 0 && oif > 0) + neigh_handler->oif = oif; + + /* Link Local */ + if (RTN_LOCAL == type) { + struct nl_addr *lladdr; + + link = rtnl_link_get(neigh_handler->link_cache, + neigh_handler->oif); + + if (link == NULL) + goto err; + + lladdr = rtnl_link_get_addr(link); + + if (lladdr == NULL) + goto err_link; + + neigh_handler->found_ll_addr = nl_addr_clone(lladdr); + rtnl_link_put(link); + } else { + handle_encoded_mac( + neigh_handler->dst, + 
&neigh_handler->found_ll_addr); + } + + return; + +err_link: + rtnl_link_put(link); +err: + if (neigh_handler->src) { + nl_addr_put(neigh_handler->src); + neigh_handler->src = NULL; + } +} + +static int get_route_cb(struct nl_msg *msg, void *arg) +{ + struct get_neigh_handler *neigh_handler = + (struct get_neigh_handler *)arg; + int err; + + err = nl_msg_parse(msg, &get_route_cb_parser, neigh_handler); + if (err < 0) { + errno = ENOMSG; + return err; + } + + if (!neigh_handler->dst || !neigh_handler->src || + neigh_handler->oif <= 0) { + errno = EINVAL; + return -1; + } + + if (NULL != neigh_handler->found_ll_addr) + goto found; + + neigh_handler->found_ll_addr = + process_get_neigh_mac(neigh_handler); + +found: + return neigh_handler->found_ll_addr ? 0 : -1; +} + +int neigh_get_oif_from_src(struct get_neigh_handler *neigh_handler) +{ + int oif = -ENODEV; + struct addrinfo *src_info; + int err; + + err = nl_addr_info(neigh_handler->src, &src_info); + if (err) { + if (!errno) + errno = ENXIO; + return oif; + } + + oif = get_ifindex(src_info->ai_addr); + if (oif <= 0) + goto free; + +free: + freeaddrinfo(src_info); + return oif; +} + +int neigh_init_resources(struct get_neigh_handler *neigh_handler, int timeout) +{ + int err; + + neigh_handler->sock = nl_socket_alloc(); + if (neigh_handler->sock == NULL) { + errno = ENOSYS; + return -ENOSYS; + } + + err = nl_connect(neigh_handler->sock, NETLINK_ROUTE); + if (err < 0) + goto free_socket; + + err = rtnl_link_alloc_cache(neigh_handler->sock, AF_UNSPEC, + &neigh_handler->link_cache); + if (err) { + err = -1; + errno = ENOMEM; + goto free_socket; + } + + nl_cache_mngt_provide(neigh_handler->link_cache); + + err = rtnl_route_alloc_cache(neigh_handler->sock, AF_UNSPEC, 0, + &neigh_handler->route_cache); + if (err) { + err = -1; + errno = ENOMEM; + goto free_link_cache; + } + + nl_cache_mngt_provide(neigh_handler->route_cache); + + err = rtnl_neigh_alloc_cache(neigh_handler->sock, + &neigh_handler->neigh_cache); + if (err) { + err = -ENOMEM; + goto free_route_cache; + } + + nl_cache_mngt_provide(neigh_handler->neigh_cache); + + /* init structure */ + neigh_handler->timeout = timeout; + neigh_handler->oif = -1; + neigh_handler->filter_neigh = NULL; + neigh_handler->found_ll_addr = NULL; + neigh_handler->dst = NULL; + neigh_handler->src = NULL; + neigh_handler->vid = -1; + + return 0; + +free_route_cache: + nl_cache_mngt_unprovide(neigh_handler->route_cache); + nl_cache_free(neigh_handler->route_cache); + neigh_handler->route_cache = NULL; +free_link_cache: + nl_cache_mngt_unprovide(neigh_handler->link_cache); + nl_cache_free(neigh_handler->link_cache); + neigh_handler->link_cache = NULL; +free_socket: + nl_socket_free(neigh_handler->sock); + neigh_handler->sock = NULL; + return err; +} + +uint16_t neigh_get_vlan_id_from_dev(struct get_neigh_handler *neigh_handler) +{ + struct rtnl_link *link; + int vid = 0xffff; + + link = rtnl_link_get(neigh_handler->link_cache, neigh_handler->oif); + if (link == NULL) { + errno = EINVAL; + return vid; + } + + if (rtnl_link_is_vlan(link)) + vid = rtnl_link_vlan_get_id(link); + rtnl_link_put(link); + return vid >= 0 && vid <= 0xfff ? 
vid : 0xffff; +} + +void neigh_set_vlan_id(struct get_neigh_handler *neigh_handler, uint16_t vid) +{ + if (vid <= 0xfff) + neigh_handler->vid = vid; +} + +int neigh_set_dst(struct get_neigh_handler *neigh_handler, + int family, void *buf, size_t size) +{ + neigh_handler->dst = nl_addr_build(family, buf, size); + return neigh_handler->dst == NULL; +} + +int neigh_set_src(struct get_neigh_handler *neigh_handler, + int family, void *buf, size_t size) +{ + neigh_handler->src = nl_addr_build(family, buf, size); + return neigh_handler->src == NULL; +} + +void neigh_set_oif(struct get_neigh_handler *neigh_handler, int oif) +{ + neigh_handler->oif = oif; +} + +int neigh_get_ll(struct get_neigh_handler *neigh_handler, void *addr_buff, + int addr_size) { + int neigh_len; + + if (neigh_handler->found_ll_addr == NULL) + return -EINVAL; + + neigh_len = nl_addr_get_len(neigh_handler->found_ll_addr); + + if (neigh_len > addr_size) + return -EINVAL; + + memcpy(addr_buff, nl_addr_get_binary_addr(neigh_handler->found_ll_addr), + neigh_len); + + return neigh_len; +} + +void neigh_free_resources(struct get_neigh_handler *neigh_handler) +{ + /* Should be released first because it's holding a reference to dst */ + if (neigh_handler->filter_neigh != NULL) { + rtnl_neigh_put(neigh_handler->filter_neigh); + neigh_handler->filter_neigh = NULL; + } + + if (neigh_handler->src != NULL) { + nl_addr_put(neigh_handler->src); + neigh_handler->src = NULL; + } + + if (neigh_handler->dst != NULL) { + nl_addr_put(neigh_handler->dst); + neigh_handler->dst = NULL; + } + + if (neigh_handler->found_ll_addr != NULL) { + nl_addr_put(neigh_handler->found_ll_addr); + neigh_handler->found_ll_addr = NULL; + } + + if (neigh_handler->neigh_cache != NULL) { + nl_cache_mngt_unprovide(neigh_handler->neigh_cache); + nl_cache_free(neigh_handler->neigh_cache); + neigh_handler->neigh_cache = NULL; + } + + if (neigh_handler->route_cache != NULL) { + nl_cache_mngt_unprovide(neigh_handler->route_cache); + nl_cache_free(neigh_handler->route_cache); + neigh_handler->route_cache = NULL; + } + + if (neigh_handler->link_cache != NULL) { + nl_cache_mngt_unprovide(neigh_handler->link_cache); + nl_cache_free(neigh_handler->link_cache); + neigh_handler->link_cache = NULL; + } + + if (neigh_handler->sock != NULL) { + nl_socket_free(neigh_handler->sock); + neigh_handler->sock = NULL; + } +} + +int process_get_neigh(struct get_neigh_handler *neigh_handler) +{ + struct nl_msg *m; + struct rtmsg rmsg = { + .rtm_family = nl_addr_get_family(neigh_handler->dst), + .rtm_dst_len = nl_addr_get_prefixlen(neigh_handler->dst), + }; + int err; + + m = nlmsg_alloc_simple(RTM_GETROUTE, 0); + + if (m == NULL) + return -ENOMEM; + + nlmsg_append(m, &rmsg, sizeof(rmsg), NLMSG_ALIGNTO); + + NLA_PUT_ADDR(m, RTA_DST, neigh_handler->dst); + + if (neigh_handler->oif > 0) + NLA_PUT_U32(m, RTA_OIF, neigh_handler->oif); + + err = nl_send_auto(neigh_handler->sock, m); + nlmsg_free(m); + if (err < 0) + return err; + + nl_socket_modify_cb(neigh_handler->sock, NL_CB_VALID, + NL_CB_CUSTOM, &get_route_cb, neigh_handler); + + err = nl_recvmsgs_default(neigh_handler->sock); + + return err; + +nla_put_failure: + nlmsg_free(m); + return -ENOMEM; +} diff --git a/libibverbs/neigh.h b/libibverbs/neigh.h new file mode 100644 index 0000000..b1812d1 --- /dev/null +++ b/libibverbs/neigh.h @@ -0,0 +1,41 @@ +/* Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md + */ + +#ifndef _NEIGH_H_ +#define _NEIGH_H_ + +#include <stddef.h> +#include <stdint.h> +#include "config.h" 
+#include <netlink/object-api.h> + +struct get_neigh_handler { + struct nl_sock *sock; + struct nl_cache *link_cache; + struct nl_cache *neigh_cache; + struct nl_cache *route_cache; + int32_t oif; + int vid; + struct rtnl_neigh *filter_neigh; + struct nl_addr *found_ll_addr; + struct nl_addr *dst; + struct nl_addr *src; + uint64_t timeout; +}; + +int process_get_neigh(struct get_neigh_handler *neigh_handler); +void neigh_free_resources(struct get_neigh_handler *neigh_handler); +void neigh_set_vlan_id(struct get_neigh_handler *neigh_handler, uint16_t vid); +uint16_t neigh_get_vlan_id_from_dev(struct get_neigh_handler *neigh_handler); +int neigh_init_resources(struct get_neigh_handler *neigh_handler, int timeout); + +int neigh_set_src(struct get_neigh_handler *neigh_handler, + int family, void *buf, size_t size); +void neigh_set_oif(struct get_neigh_handler *neigh_handler, int oif); +int neigh_set_dst(struct get_neigh_handler *neigh_handler, + int family, void *buf, size_t size); +int neigh_get_oif_from_src(struct get_neigh_handler *neigh_handler); +int neigh_get_ll(struct get_neigh_handler *neigh_handler, void *addr_buf, + int addr_size); + +#endif diff --git a/libibverbs/opcode.h b/libibverbs/opcode.h new file mode 100644 index 0000000..fd4bc96 --- /dev/null +++ b/libibverbs/opcode.h @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef INFINIBAND_OPCODE_H +#define INFINIBAND_OPCODE_H + +/* + * This macro cleans up the definitions of constants for BTH opcodes. + * It is used to define constants such as IBV_OPCODE_UD_SEND_ONLY, + * which becomes IBV_OPCODE_UD + IBV_OPCODE_SEND_ONLY, and this gives + * the correct value. + * + * In short, user code should use the constants defined using the + * macro rather than worrying about adding together other constants. 
+*/ +#define IBV_OPCODE(transport, op) \ + IBV_OPCODE_ ## transport ## _ ## op = \ + IBV_OPCODE_ ## transport + IBV_OPCODE_ ## op + +enum { + /* transport types -- just used to define real constants */ + IBV_OPCODE_RC = 0x00, + IBV_OPCODE_UC = 0x20, + IBV_OPCODE_RD = 0x40, + IBV_OPCODE_UD = 0x60, + + /* operations -- just used to define real constants */ + IBV_OPCODE_SEND_FIRST = 0x00, + IBV_OPCODE_SEND_MIDDLE = 0x01, + IBV_OPCODE_SEND_LAST = 0x02, + IBV_OPCODE_SEND_LAST_WITH_IMMEDIATE = 0x03, + IBV_OPCODE_SEND_ONLY = 0x04, + IBV_OPCODE_SEND_ONLY_WITH_IMMEDIATE = 0x05, + IBV_OPCODE_RDMA_WRITE_FIRST = 0x06, + IBV_OPCODE_RDMA_WRITE_MIDDLE = 0x07, + IBV_OPCODE_RDMA_WRITE_LAST = 0x08, + IBV_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE = 0x09, + IBV_OPCODE_RDMA_WRITE_ONLY = 0x0a, + IBV_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE = 0x0b, + IBV_OPCODE_RDMA_READ_REQUEST = 0x0c, + IBV_OPCODE_RDMA_READ_RESPONSE_FIRST = 0x0d, + IBV_OPCODE_RDMA_READ_RESPONSE_MIDDLE = 0x0e, + IBV_OPCODE_RDMA_READ_RESPONSE_LAST = 0x0f, + IBV_OPCODE_RDMA_READ_RESPONSE_ONLY = 0x10, + IBV_OPCODE_ACKNOWLEDGE = 0x11, + IBV_OPCODE_ATOMIC_ACKNOWLEDGE = 0x12, + IBV_OPCODE_COMPARE_SWAP = 0x13, + IBV_OPCODE_FETCH_ADD = 0x14, + + /* real constants follow -- see comment about above IBV_OPCODE() + macro for more details */ + + /* RC */ + IBV_OPCODE(RC, SEND_FIRST), + IBV_OPCODE(RC, SEND_MIDDLE), + IBV_OPCODE(RC, SEND_LAST), + IBV_OPCODE(RC, SEND_LAST_WITH_IMMEDIATE), + IBV_OPCODE(RC, SEND_ONLY), + IBV_OPCODE(RC, SEND_ONLY_WITH_IMMEDIATE), + IBV_OPCODE(RC, RDMA_WRITE_FIRST), + IBV_OPCODE(RC, RDMA_WRITE_MIDDLE), + IBV_OPCODE(RC, RDMA_WRITE_LAST), + IBV_OPCODE(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE), + IBV_OPCODE(RC, RDMA_WRITE_ONLY), + IBV_OPCODE(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE), + IBV_OPCODE(RC, RDMA_READ_REQUEST), + IBV_OPCODE(RC, RDMA_READ_RESPONSE_FIRST), + IBV_OPCODE(RC, RDMA_READ_RESPONSE_MIDDLE), + IBV_OPCODE(RC, RDMA_READ_RESPONSE_LAST), + IBV_OPCODE(RC, RDMA_READ_RESPONSE_ONLY), + IBV_OPCODE(RC, ACKNOWLEDGE), + IBV_OPCODE(RC, ATOMIC_ACKNOWLEDGE), + IBV_OPCODE(RC, COMPARE_SWAP), + IBV_OPCODE(RC, FETCH_ADD), + + /* UC */ + IBV_OPCODE(UC, SEND_FIRST), + IBV_OPCODE(UC, SEND_MIDDLE), + IBV_OPCODE(UC, SEND_LAST), + IBV_OPCODE(UC, SEND_LAST_WITH_IMMEDIATE), + IBV_OPCODE(UC, SEND_ONLY), + IBV_OPCODE(UC, SEND_ONLY_WITH_IMMEDIATE), + IBV_OPCODE(UC, RDMA_WRITE_FIRST), + IBV_OPCODE(UC, RDMA_WRITE_MIDDLE), + IBV_OPCODE(UC, RDMA_WRITE_LAST), + IBV_OPCODE(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE), + IBV_OPCODE(UC, RDMA_WRITE_ONLY), + IBV_OPCODE(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE), + + /* RD */ + IBV_OPCODE(RD, SEND_FIRST), + IBV_OPCODE(RD, SEND_MIDDLE), + IBV_OPCODE(RD, SEND_LAST), + IBV_OPCODE(RD, SEND_LAST_WITH_IMMEDIATE), + IBV_OPCODE(RD, SEND_ONLY), + IBV_OPCODE(RD, SEND_ONLY_WITH_IMMEDIATE), + IBV_OPCODE(RD, RDMA_WRITE_FIRST), + IBV_OPCODE(RD, RDMA_WRITE_MIDDLE), + IBV_OPCODE(RD, RDMA_WRITE_LAST), + IBV_OPCODE(RD, RDMA_WRITE_LAST_WITH_IMMEDIATE), + IBV_OPCODE(RD, RDMA_WRITE_ONLY), + IBV_OPCODE(RD, RDMA_WRITE_ONLY_WITH_IMMEDIATE), + IBV_OPCODE(RD, RDMA_READ_REQUEST), + IBV_OPCODE(RD, RDMA_READ_RESPONSE_FIRST), + IBV_OPCODE(RD, RDMA_READ_RESPONSE_MIDDLE), + IBV_OPCODE(RD, RDMA_READ_RESPONSE_LAST), + IBV_OPCODE(RD, RDMA_READ_RESPONSE_ONLY), + IBV_OPCODE(RD, ACKNOWLEDGE), + IBV_OPCODE(RD, ATOMIC_ACKNOWLEDGE), + IBV_OPCODE(RD, COMPARE_SWAP), + IBV_OPCODE(RD, FETCH_ADD), + + /* UD */ + IBV_OPCODE(UD, SEND_ONLY), + IBV_OPCODE(UD, SEND_ONLY_WITH_IMMEDIATE) +}; + +#endif /* INFINIBAND_OPCODE_H */ diff --git a/libibverbs/sa-kern-abi.h b/libibverbs/sa-kern-abi.h new 
file mode 100644 index 0000000..134aecc --- /dev/null +++ b/libibverbs/sa-kern-abi.h @@ -0,0 +1,41 @@ +/* + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef INFINIBAND_SA_KERN_ABI_H +#define INFINIBAND_SA_KERN_ABI_H + +#warning "This header is obsolete, use rdma/ib_user_sa.h instead" + +#include <rdma/ib_user_sa.h> + +#define ib_kern_path_rec ib_user_path_rec +#define ibv_kern_path_rec ib_user_path_rec + +#endif diff --git a/libibverbs/sa.h b/libibverbs/sa.h new file mode 100644 index 0000000..e7f96dd --- /dev/null +++ b/libibverbs/sa.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef INFINIBAND_SA_H +#define INFINIBAND_SA_H + +#include <infiniband/verbs.h> +#include <linux/types.h> + +struct ibv_sa_path_rec { + /* reserved */ + /* reserved */ + union ibv_gid dgid; + union ibv_gid sgid; + __be16 dlid; + __be16 slid; + int raw_traffic; + /* reserved */ + __be32 flow_label; + uint8_t hop_limit; + uint8_t traffic_class; + int reversible; + uint8_t numb_path; + __be16 pkey; + /* reserved */ + uint8_t sl; + uint8_t mtu_selector; + uint8_t mtu; + uint8_t rate_selector; + uint8_t rate; + uint8_t packet_life_time_selector; + uint8_t packet_life_time; + uint8_t preference; +}; + +struct ibv_sa_mcmember_rec { + union ibv_gid mgid; + union ibv_gid port_gid; + uint32_t qkey; + uint16_t mlid; + uint8_t mtu_selector; + uint8_t mtu; + uint8_t traffic_class; + uint16_t pkey; + uint8_t rate_selector; + uint8_t rate; + uint8_t packet_life_time_selector; + uint8_t packet_life_time; + uint8_t sl; + uint32_t flow_label; + uint8_t hop_limit; + uint8_t scope; + uint8_t join_state; + int proxy_join; +}; + +struct ibv_sa_service_rec { + uint64_t id; + union ibv_gid gid; + uint16_t pkey; + /* uint16_t resv; */ + uint32_t lease; + uint8_t key[16]; + uint8_t name[64]; + uint8_t data8[16]; + uint16_t data16[8]; + uint32_t data32[4]; + uint64_t data64[2]; +}; + +#define IBV_PATH_RECORD_REVERSIBLE 0x80 + +struct ibv_path_record { + __be64 service_id; + union ibv_gid dgid; + union ibv_gid sgid; + __be16 dlid; + __be16 slid; + __be32 flowlabel_hoplimit; /* resv-31:28 flow label-27:8 hop limit-7:0*/ + uint8_t tclass; + uint8_t reversible_numpath; /* reversible-7:7 num path-6:0 */ + __be16 pkey; + __be16 qosclass_sl; /* qos class-15:4 sl-3:0 */ + uint8_t mtu; /* mtu selector-7:6 mtu-5:0 */ + uint8_t rate; /* rate selector-7:6 rate-5:0 */ + uint8_t packetlifetime; /* lifetime selector-7:6 lifetime-5:0 */ + uint8_t preference; + uint8_t reserved[6]; +}; + +#define IBV_PATH_FLAG_GMP (1<<0) +#define IBV_PATH_FLAG_PRIMARY (1<<1) +#define IBV_PATH_FLAG_ALTERNATE (1<<2) +#define IBV_PATH_FLAG_OUTBOUND (1<<3) +#define IBV_PATH_FLAG_INBOUND (1<<4) +#define IBV_PATH_FLAG_INBOUND_REVERSE (1<<5) +#define IBV_PATH_FLAG_BIDIRECTIONAL (IBV_PATH_FLAG_OUTBOUND | \ + IBV_PATH_FLAG_INBOUND_REVERSE) + +struct ibv_path_data { + uint32_t flags; + uint32_t reserved; + struct ibv_path_record path; +}; + +#endif /* INFINIBAND_SA_H */ diff --git a/libibverbs/static_driver.c b/libibverbs/static_driver.c new file mode 100644 index 0000000..b6ee32d --- /dev/null +++ b/libibverbs/static_driver.c @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2018 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
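
The packed fields of struct ibv_path_record above follow the bit-layout comments beside them; a small sketch of accessors one might write against that layout (the helper names are ours, not part of the installed API):

#include <endian.h>
#include <stdint.h>
#include <infiniband/sa.h>

/* flowlabel_hoplimit: resv-31:28 flow label-27:8 hop limit-7:0 */
static inline uint32_t path_rec_flow_label(const struct ibv_path_record *p)
{
	return (be32toh(p->flowlabel_hoplimit) >> 8) & 0xFFFFF;
}

static inline uint8_t path_rec_hop_limit(const struct ibv_path_record *p)
{
	return be32toh(p->flowlabel_hoplimit) & 0xFF;
}

/* reversible_numpath: reversible-7:7 num path-6:0 */
static inline int path_rec_reversible(const struct ibv_path_record *p)
{
	return !!(p->reversible_numpath & IBV_PATH_RECORD_REVERSIBLE);
}

static inline uint8_t path_rec_num_path(const struct ibv_path_record *p)
{
	return p->reversible_numpath & 0x7F;
}
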
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef _STATIC_LIBRARY_BUILD_
+#define RDMA_STATIC_PROVIDERS none
+#include <infiniband/verbs.h>
+#include <infiniband/driver.h>
+
+const struct verbs_device_ops verbs_provider_none;
+
+void ibv_static_providers(void *unused, ...)
+{
+	/*
+	 * We do not need to do anything with the VA_ARGs since we continue to
+	 * rely on the constructor attribute and simply referencing the
+	 * verbs_provider_X symbol will be enough to trigger the constructor.
+	 *
+	 * This would need to actually check and do the registration for
+	 * specialty cases like LTO or section-gc which may not work with the
+	 * constructor scheme.
+	 */
+}
+
+#endif
diff --git a/libibverbs/sysfs.c b/libibverbs/sysfs.c
new file mode 100644
index 0000000..8ba4472
--- /dev/null
+++ b/libibverbs/sysfs.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#define _GNU_SOURCE
+#include <config.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <string.h>
+#include <stdarg.h>
+
+#include "ibverbs.h"
+
+static const char *sysfs_path;
+
+const char *ibv_get_sysfs_path(void)
+{
+	const char *env = NULL;
+
+	if (sysfs_path)
+		return sysfs_path;
+
+	/*
+	 * Only use the path passed in through the calling user's
+	 * environment if we're not running SUID.
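
static_driver.c above only matters for the static build: static linking loses the shared-library constructor trick, so the application must name the providers it needs, which makes the corresponding verbs_provider_* symbols referenced and pulls their constructors in. A sketch of the usage this enables, with mlx5 as an arbitrary example provider:

/* app.c -- built against the static rdma-core libraries */
#define RDMA_STATIC_PROVIDERS mlx5
#include <infiniband/verbs.h>

int main(void)
{
	int n;
	struct ibv_device **list = ibv_get_device_list(&n);

	/* ... use the devices ... */
	ibv_free_device_list(list);
	return 0;
}
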
+ */ + if (getuid() == geteuid()) + env = getenv("SYSFS_PATH"); + + if (env) { + int len; + char *dup; + + sysfs_path = dup = strndup(env, IBV_SYSFS_PATH_MAX); + len = strlen(dup); + while (len > 0 && dup[len - 1] == '/') { + --len; + dup[len] = '\0'; + } + } else + sysfs_path = "/sys"; + + return sysfs_path; +} + +int ibv_read_sysfs_file_at(int dirfd, const char *file, char *buf, size_t size) +{ + ssize_t len; + int fd; + + fd = openat(dirfd, file, O_RDONLY | O_CLOEXEC); + if (fd < 0) + return -1; + + len = read(fd, buf, size); + + close(fd); + + if (len > 0) { + if (buf[len - 1] == '\n') + buf[--len] = '\0'; + else if (len < size) + buf[len] = '\0'; + else + /* We would have to truncate the contents to NULL + * terminate, so we are going to fail no matter + * what we do, either right now or later when + * we pass around an unterminated string. Fail now. + */ + return -1; + } + + return len; +} + +int ibv_read_sysfs_file(const char *dir, const char *file, + char *buf, size_t size) +{ + char *path; + int res; + + if (asprintf(&path, "%s/%s", dir, file) < 0) + return -1; + + res = ibv_read_sysfs_file_at(AT_FDCWD, path, buf, size); + free(path); + return res; +} + +int ibv_read_ibdev_sysfs_file(char *buf, size_t size, + struct verbs_sysfs_dev *sysfs_dev, + const char *fnfmt, ...) +{ + char *path; + va_list va; + int res; + + va_start(va, fnfmt); + if (vasprintf(&path, fnfmt, va) < 0) { + va_end(va); + return -1; + } + va_end(va); + + res = ibv_read_sysfs_file(sysfs_dev->ibdev_path, path, buf, size); + free(path); + return res; +} diff --git a/libibverbs/tm_types.h b/libibverbs/tm_types.h new file mode 100644 index 0000000..f1b302a --- /dev/null +++ b/libibverbs/tm_types.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2017 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
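
The readers above give callers a firm contract: on success the buffer is NUL-terminated with any trailing newline stripped, and a value that cannot be NUL-terminated fails immediately rather than propagating an unterminated string. A sketch of the calling pattern inside the library (these helpers are internal, and the mlx5_0 path is hypothetical):

#include <stdio.h>
#include "ibverbs.h"	/* internal header declaring the sysfs helpers */

static void print_node_desc(void)
{
	char buf[64];
	int len = ibv_read_sysfs_file("/sys/class/infiniband/mlx5_0",
				      "node_desc", buf, sizeof(buf));

	/* len excludes the stripped newline; buf is NUL-terminated. */
	if (len >= 0)
		printf("node_desc = '%s' (%d bytes)\n", buf, len);
}
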
+ * + */ +#ifndef _TM_TYPES_H +#define _TM_TYPES_H + +#include <linux/types.h> +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +enum ibv_tmh_op { + IBV_TMH_NO_TAG = 0, + IBV_TMH_RNDV = 1, + IBV_TMH_FIN = 2, + IBV_TMH_EAGER = 3, +}; + +struct ibv_tmh { + uint8_t opcode; /* from enum ibv_tmh_op */ + uint8_t reserved[3]; /* must be zero */ + __be32 app_ctx; /* opaque user data */ + __be64 tag; +}; + +struct ibv_rvh { + __be64 va; + __be32 rkey; + __be32 len; +}; + +#ifdef __cplusplus +} +#endif +#endif /* _TM_TYPES_H */ diff --git a/libibverbs/verbs.c b/libibverbs/verbs.c new file mode 100644 index 0000000..629f24c --- /dev/null +++ b/libibverbs/verbs.c @@ -0,0 +1,1082 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
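
Tying the tag-matching wire formats above together: a sender using the eager protocol prepends a struct ibv_tmh to its payload, with multi-byte fields in big-endian order; a rendezvous message additionally carries a struct ibv_rvh naming the source buffer. A minimal sketch for the eager case (the tag value and app_ctx are illustrative):

#include <endian.h>
#include <string.h>
#include <infiniband/tm_types.h>

/* Build an eager tag-matching header in front of the payload;
 * the designated initializer leaves the reserved bytes zero. */
static size_t build_eager_tmh(void *buf, uint64_t tag)
{
	struct ibv_tmh tmh = {
		.opcode = IBV_TMH_EAGER,
		.app_ctx = htobe32(0),	/* opaque user data */
		.tag = htobe64(tag),
	};

	memcpy(buf, &tmh, sizeof(tmh));
	return sizeof(tmh);
}
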
+ */ + +#define _GNU_SOURCE +#include <config.h> + +#include <endian.h> +#include <stdio.h> +#include <unistd.h> +#include <stdlib.h> +#include <errno.h> +#include <string.h> +#include <linux/ip.h> +#include <dirent.h> +#include <netinet/in.h> + +#include <util/compiler.h> +#include <util/symver.h> +#include <infiniband/cmd_write.h> + +#include "ibverbs.h" +#include <net/if.h> +#include <net/if_arp.h> +#include "neigh.h" + +#undef ibv_query_port + +int __attribute__((const)) ibv_rate_to_mult(enum ibv_rate rate) +{ + switch (rate) { + case IBV_RATE_2_5_GBPS: return 1; + case IBV_RATE_5_GBPS: return 2; + case IBV_RATE_10_GBPS: return 4; + case IBV_RATE_20_GBPS: return 8; + case IBV_RATE_30_GBPS: return 12; + case IBV_RATE_40_GBPS: return 16; + case IBV_RATE_60_GBPS: return 24; + case IBV_RATE_80_GBPS: return 32; + case IBV_RATE_120_GBPS: return 48; + case IBV_RATE_28_GBPS: return 11; + case IBV_RATE_50_GBPS: return 20; + case IBV_RATE_400_GBPS: return 160; + case IBV_RATE_600_GBPS: return 240; + default: return -1; + } +} + +enum ibv_rate __attribute__((const)) mult_to_ibv_rate(int mult) +{ + switch (mult) { + case 1: return IBV_RATE_2_5_GBPS; + case 2: return IBV_RATE_5_GBPS; + case 4: return IBV_RATE_10_GBPS; + case 8: return IBV_RATE_20_GBPS; + case 12: return IBV_RATE_30_GBPS; + case 16: return IBV_RATE_40_GBPS; + case 24: return IBV_RATE_60_GBPS; + case 32: return IBV_RATE_80_GBPS; + case 48: return IBV_RATE_120_GBPS; + case 11: return IBV_RATE_28_GBPS; + case 20: return IBV_RATE_50_GBPS; + case 160: return IBV_RATE_400_GBPS; + case 240: return IBV_RATE_600_GBPS; + default: return IBV_RATE_MAX; + } +} + +int __attribute__((const)) ibv_rate_to_mbps(enum ibv_rate rate) +{ + switch (rate) { + case IBV_RATE_2_5_GBPS: return 2500; + case IBV_RATE_5_GBPS: return 5000; + case IBV_RATE_10_GBPS: return 10000; + case IBV_RATE_20_GBPS: return 20000; + case IBV_RATE_30_GBPS: return 30000; + case IBV_RATE_40_GBPS: return 40000; + case IBV_RATE_60_GBPS: return 60000; + case IBV_RATE_80_GBPS: return 80000; + case IBV_RATE_120_GBPS: return 120000; + case IBV_RATE_14_GBPS: return 14062; + case IBV_RATE_56_GBPS: return 56250; + case IBV_RATE_112_GBPS: return 112500; + case IBV_RATE_168_GBPS: return 168750; + case IBV_RATE_25_GBPS: return 25781; + case IBV_RATE_100_GBPS: return 103125; + case IBV_RATE_200_GBPS: return 206250; + case IBV_RATE_300_GBPS: return 309375; + case IBV_RATE_28_GBPS: return 28125; + case IBV_RATE_50_GBPS: return 53125; + case IBV_RATE_400_GBPS: return 425000; + case IBV_RATE_600_GBPS: return 637500; + default: return -1; + } +} + +enum ibv_rate __attribute__((const)) mbps_to_ibv_rate(int mbps) +{ + switch (mbps) { + case 2500: return IBV_RATE_2_5_GBPS; + case 5000: return IBV_RATE_5_GBPS; + case 10000: return IBV_RATE_10_GBPS; + case 20000: return IBV_RATE_20_GBPS; + case 30000: return IBV_RATE_30_GBPS; + case 40000: return IBV_RATE_40_GBPS; + case 60000: return IBV_RATE_60_GBPS; + case 80000: return IBV_RATE_80_GBPS; + case 120000: return IBV_RATE_120_GBPS; + case 14062: return IBV_RATE_14_GBPS; + case 56250: return IBV_RATE_56_GBPS; + case 112500: return IBV_RATE_112_GBPS; + case 168750: return IBV_RATE_168_GBPS; + case 25781: return IBV_RATE_25_GBPS; + case 103125: return IBV_RATE_100_GBPS; + case 206250: return IBV_RATE_200_GBPS; + case 309375: return IBV_RATE_300_GBPS; + case 28125: return IBV_RATE_28_GBPS; + case 53125: return IBV_RATE_50_GBPS; + case 425000: return IBV_RATE_400_GBPS; + case 637500: return IBV_RATE_600_GBPS; + default: return IBV_RATE_MAX; + } +} + 
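
The four rate converters above are pure lookup tables: multiples are counted in units of the 2.5 Gbit/s base rate, and the Mbit/s values follow the actual signalling rates (hence 103125 for nominal 100 Gbit/s). A quick sanity-check sketch:

#include <assert.h>
#include <infiniband/verbs.h>

static void rate_sanity(void)
{
	/* 40 Gbit/s is 16 x 2.5 Gbit/s. */
	assert(ibv_rate_to_mult(IBV_RATE_40_GBPS) == 16);
	assert(mult_to_ibv_rate(16) == IBV_RATE_40_GBPS);

	/* Nominal 100 Gbit/s signals at 103125 Mbit/s. */
	assert(ibv_rate_to_mbps(IBV_RATE_100_GBPS) == 103125);
	assert(mbps_to_ibv_rate(103125) == IBV_RATE_100_GBPS);

	/* Unknown inputs map to -1 / IBV_RATE_MAX. */
	assert(mult_to_ibv_rate(7) == IBV_RATE_MAX);
}
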
+LATEST_SYMVER_FUNC(ibv_query_device, 1_1, "IBVERBS_1.1", + int, + struct ibv_context *context, + struct ibv_device_attr *device_attr) +{ + return get_ops(context)->query_device(context, device_attr); +} + +int __lib_query_port(struct ibv_context *context, uint8_t port_num, + struct ibv_port_attr *port_attr, size_t port_attr_len) +{ + /* Don't expose this mess to the provider, provide a large enough + * temporary buffer if the user buffer is too small. + */ + if (port_attr_len < sizeof(struct ibv_port_attr)) { + struct ibv_port_attr tmp_attr = {}; + int rc; + + rc = get_ops(context)->query_port(context, port_num, + &tmp_attr); + if (rc) + return rc; + + memcpy(port_attr, &tmp_attr, port_attr_len); + return 0; + } + + memset(port_attr, 0, port_attr_len); + return get_ops(context)->query_port(context, port_num, port_attr); +} + +struct _compat_ibv_port_attr { + enum ibv_port_state state; + enum ibv_mtu max_mtu; + enum ibv_mtu active_mtu; + int gid_tbl_len; + uint32_t port_cap_flags; + uint32_t max_msg_sz; + uint32_t bad_pkey_cntr; + uint32_t qkey_viol_cntr; + uint16_t pkey_tbl_len; + uint16_t lid; + uint16_t sm_lid; + uint8_t lmc; + uint8_t max_vl_num; + uint8_t sm_sl; + uint8_t subnet_timeout; + uint8_t init_type_reply; + uint8_t active_width; + uint8_t active_speed; + uint8_t phys_state; + uint8_t link_layer; + uint8_t flags; +}; + +LATEST_SYMVER_FUNC(ibv_query_port, 1_1, "IBVERBS_1.1", + int, + struct ibv_context *context, uint8_t port_num, + struct _compat_ibv_port_attr *port_attr) +{ + return __lib_query_port(context, port_num, + (struct ibv_port_attr *)port_attr, + sizeof(*port_attr)); +} + +LATEST_SYMVER_FUNC(ibv_query_gid, 1_1, "IBVERBS_1.1", + int, + struct ibv_context *context, uint8_t port_num, + int index, union ibv_gid *gid) +{ + struct verbs_device *verbs_device = verbs_get_device(context->device); + char attr[41]; + uint16_t val; + int i; + + if (ibv_read_ibdev_sysfs_file(attr, sizeof(attr), verbs_device->sysfs, + "ports/%d/gids/%d", port_num, index) < 0) + return -1; + + for (i = 0; i < 8; ++i) { + if (sscanf(attr + i * 5, "%hx", &val) != 1) + return -1; + gid->raw[i * 2 ] = val >> 8; + gid->raw[i * 2 + 1] = val & 0xff; + } + + return 0; +} + +LATEST_SYMVER_FUNC(ibv_query_pkey, 1_1, "IBVERBS_1.1", + int, + struct ibv_context *context, uint8_t port_num, + int index, __be16 *pkey) +{ + struct verbs_device *verbs_device = verbs_get_device(context->device); + char attr[8]; + uint16_t val; + + if (ibv_read_ibdev_sysfs_file(attr, sizeof(attr), verbs_device->sysfs, + "ports/%d/pkeys/%d", port_num, index) < 0) + return -1; + + if (sscanf(attr, "%hx", &val) != 1) + return -1; + + *pkey = htobe16(val); + return 0; +} + +LATEST_SYMVER_FUNC(ibv_get_pkey_index, 1_5, "IBVERBS_1.5", + int, + struct ibv_context *context, uint8_t port_num, __be16 pkey) +{ + __be16 pkey_i; + int i, ret; + + for (i = 0; ; i++) { + ret = ibv_query_pkey(context, port_num, i, &pkey_i); + if (ret < 0) + return ret; + if (pkey == pkey_i) + return i; + } +} + +LATEST_SYMVER_FUNC(ibv_alloc_pd, 1_1, "IBVERBS_1.1", + struct ibv_pd *, + struct ibv_context *context) +{ + struct ibv_pd *pd; + + pd = get_ops(context)->alloc_pd(context); + if (pd) + pd->context = context; + + return pd; +} + +LATEST_SYMVER_FUNC(ibv_dealloc_pd, 1_1, "IBVERBS_1.1", + int, + struct ibv_pd *pd) +{ + return get_ops(pd->context)->dealloc_pd(pd); +} + +#undef ibv_reg_mr +LATEST_SYMVER_FUNC(ibv_reg_mr, 1_1, "IBVERBS_1.1", + struct ibv_mr *, + struct ibv_pd *pd, void *addr, + size_t length, int access) +{ + struct ibv_mr *mr; + + if 
(ibv_dontfork_range(addr, length)) + return NULL; + + mr = get_ops(pd->context)->reg_mr(pd, addr, length, (uintptr_t) addr, + access); + if (mr) { + mr->context = pd->context; + mr->pd = pd; + mr->addr = addr; + mr->length = length; + } else + ibv_dofork_range(addr, length); + + return mr; +} + +#undef ibv_reg_mr_iova +struct ibv_mr *ibv_reg_mr_iova(struct ibv_pd *pd, void *addr, size_t length, + uint64_t iova, int access) +{ + struct ibv_mr *mr; + + if (ibv_dontfork_range(addr, length)) + return NULL; + + mr = get_ops(pd->context)->reg_mr(pd, addr, length, iova, access); + if (mr) { + mr->context = pd->context; + mr->pd = pd; + mr->addr = addr; + mr->length = length; + } else + ibv_dofork_range(addr, length); + + return mr; +} + +struct ibv_mr *ibv_reg_mr_iova2(struct ibv_pd *pd, void *addr, size_t length, + uint64_t iova, unsigned int access) +{ + struct verbs_device *device = verbs_get_device(pd->context->device); + + if (!(device->core_support & IB_UVERBS_CORE_SUPPORT_OPTIONAL_MR_ACCESS)) + access &= ~IBV_ACCESS_OPTIONAL_RANGE; + + return ibv_reg_mr_iova(pd, addr, length, iova, access); +} + +LATEST_SYMVER_FUNC(ibv_rereg_mr, 1_1, "IBVERBS_1.1", + int, + struct ibv_mr *mr, int flags, + struct ibv_pd *pd, void *addr, + size_t length, int access) +{ + int dofork_onfail = 0; + int err; + void *old_addr; + size_t old_len; + + if (verbs_get_mr(mr)->mr_type != IBV_MR_TYPE_MR) { + errno = EINVAL; + return IBV_REREG_MR_ERR_INPUT; + } + + if (flags & ~IBV_REREG_MR_FLAGS_SUPPORTED) { + errno = EINVAL; + return IBV_REREG_MR_ERR_INPUT; + } + + if ((flags & IBV_REREG_MR_CHANGE_TRANSLATION) && + (!length || !addr)) { + errno = EINVAL; + return IBV_REREG_MR_ERR_INPUT; + } + + if (access && !(flags & IBV_REREG_MR_CHANGE_ACCESS)) { + errno = EINVAL; + return IBV_REREG_MR_ERR_INPUT; + } + + if (flags & IBV_REREG_MR_CHANGE_TRANSLATION) { + err = ibv_dontfork_range(addr, length); + if (err) + return IBV_REREG_MR_ERR_DONT_FORK_NEW; + dofork_onfail = 1; + } + + old_addr = mr->addr; + old_len = mr->length; + err = get_ops(mr->context)->rereg_mr(verbs_get_mr(mr), + flags, pd, addr, + length, access); + if (!err) { + if (flags & IBV_REREG_MR_CHANGE_PD) + mr->pd = pd; + if (flags & IBV_REREG_MR_CHANGE_TRANSLATION) { + mr->addr = addr; + mr->length = length; + err = ibv_dofork_range(old_addr, old_len); + if (err) + return IBV_REREG_MR_ERR_DO_FORK_OLD; + } + } else { + err = IBV_REREG_MR_ERR_CMD; + if (dofork_onfail) { + if (ibv_dofork_range(addr, length)) + err = IBV_REREG_MR_ERR_CMD_AND_DO_FORK_NEW; + } + } + + return err; +} + +LATEST_SYMVER_FUNC(ibv_dereg_mr, 1_1, "IBVERBS_1.1", + int, + struct ibv_mr *mr) +{ + int ret; + void *addr = mr->addr; + size_t length = mr->length; + enum ibv_mr_type type = verbs_get_mr(mr)->mr_type; + + ret = get_ops(mr->context)->dereg_mr(verbs_get_mr(mr)); + if (!ret && type == IBV_MR_TYPE_MR) + ibv_dofork_range(addr, length); + + return ret; +} + +struct ibv_comp_channel *ibv_create_comp_channel(struct ibv_context *context) +{ + struct ibv_create_comp_channel req; + struct ib_uverbs_create_comp_channel_resp resp; + struct ibv_comp_channel *channel; + + channel = malloc(sizeof *channel); + if (!channel) + return NULL; + + req.core_payload = (struct ib_uverbs_create_comp_channel){}; + if (execute_cmd_write(context, IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL, + &req, sizeof(req), &resp, sizeof(resp))) { + free(channel); + return NULL; + } + + channel->context = context; + channel->fd = resp.fd; + channel->refcnt = 0; + + return channel; +} + +int ibv_destroy_comp_channel(struct 
ibv_comp_channel *channel) +{ + struct ibv_context *context; + int ret; + + context = channel->context; + pthread_mutex_lock(&context->mutex); + + if (channel->refcnt) { + ret = EBUSY; + goto out; + } + + close(channel->fd); + free(channel); + ret = 0; + +out: + pthread_mutex_unlock(&context->mutex); + + return ret; +} + +LATEST_SYMVER_FUNC(ibv_create_cq, 1_1, "IBVERBS_1.1", + struct ibv_cq *, + struct ibv_context *context, int cqe, void *cq_context, + struct ibv_comp_channel *channel, int comp_vector) +{ + struct ibv_cq *cq; + + cq = get_ops(context)->create_cq(context, cqe, channel, comp_vector); + + if (cq) + verbs_init_cq(cq, context, channel, cq_context); + + return cq; +} + +LATEST_SYMVER_FUNC(ibv_resize_cq, 1_1, "IBVERBS_1.1", + int, + struct ibv_cq *cq, int cqe) +{ + return get_ops(cq->context)->resize_cq(cq, cqe); +} + +LATEST_SYMVER_FUNC(ibv_destroy_cq, 1_1, "IBVERBS_1.1", + int, + struct ibv_cq *cq) +{ + struct ibv_comp_channel *channel = cq->channel; + int ret; + + ret = get_ops(cq->context)->destroy_cq(cq); + + if (channel) { + if (!ret) { + pthread_mutex_lock(&channel->context->mutex); + --channel->refcnt; + pthread_mutex_unlock(&channel->context->mutex); + } + } + + return ret; +} + +LATEST_SYMVER_FUNC(ibv_get_cq_event, 1_1, "IBVERBS_1.1", + int, + struct ibv_comp_channel *channel, + struct ibv_cq **cq, void **cq_context) +{ + struct ib_uverbs_comp_event_desc ev; + + if (read(channel->fd, &ev, sizeof ev) != sizeof ev) + return -1; + + *cq = (struct ibv_cq *) (uintptr_t) ev.cq_handle; + *cq_context = (*cq)->cq_context; + + get_ops((*cq)->context)->cq_event(*cq); + + return 0; +} + +LATEST_SYMVER_FUNC(ibv_ack_cq_events, 1_1, "IBVERBS_1.1", + void, + struct ibv_cq *cq, unsigned int nevents) +{ + pthread_mutex_lock(&cq->mutex); + cq->comp_events_completed += nevents; + pthread_cond_signal(&cq->cond); + pthread_mutex_unlock(&cq->mutex); +} + +LATEST_SYMVER_FUNC(ibv_create_srq, 1_1, "IBVERBS_1.1", + struct ibv_srq *, + struct ibv_pd *pd, + struct ibv_srq_init_attr *srq_init_attr) +{ + struct ibv_srq *srq; + + srq = get_ops(pd->context)->create_srq(pd, srq_init_attr); + if (srq) { + srq->context = pd->context; + srq->srq_context = srq_init_attr->srq_context; + srq->pd = pd; + srq->events_completed = 0; + pthread_mutex_init(&srq->mutex, NULL); + pthread_cond_init(&srq->cond, NULL); + } + + return srq; +} + +LATEST_SYMVER_FUNC(ibv_modify_srq, 1_1, "IBVERBS_1.1", + int, + struct ibv_srq *srq, + struct ibv_srq_attr *srq_attr, + int srq_attr_mask) +{ + return get_ops(srq->context)->modify_srq(srq, srq_attr, srq_attr_mask); +} + +LATEST_SYMVER_FUNC(ibv_query_srq, 1_1, "IBVERBS_1.1", + int, + struct ibv_srq *srq, struct ibv_srq_attr *srq_attr) +{ + return get_ops(srq->context)->query_srq(srq, srq_attr); +} + +LATEST_SYMVER_FUNC(ibv_destroy_srq, 1_1, "IBVERBS_1.1", + int, + struct ibv_srq *srq) +{ + return get_ops(srq->context)->destroy_srq(srq); +} + +LATEST_SYMVER_FUNC(ibv_create_qp, 1_1, "IBVERBS_1.1", + struct ibv_qp *, + struct ibv_pd *pd, + struct ibv_qp_init_attr *qp_init_attr) +{ + struct ibv_qp *qp = get_ops(pd->context)->create_qp(pd, qp_init_attr); + + if (qp) { + qp->context = pd->context; + qp->qp_context = qp_init_attr->qp_context; + qp->pd = pd; + qp->send_cq = qp_init_attr->send_cq; + qp->recv_cq = qp_init_attr->recv_cq; + qp->srq = qp_init_attr->srq; + qp->qp_type = qp_init_attr->qp_type; + qp->state = IBV_QPS_RESET; + qp->events_completed = 0; + pthread_mutex_init(&qp->mutex, NULL); + pthread_cond_init(&qp->cond, NULL); + } + + return qp; +} + +struct ibv_qp_ex 
*ibv_qp_to_qp_ex(struct ibv_qp *qp) +{ + struct verbs_qp *vqp = (struct verbs_qp *)qp; + + if (vqp->comp_mask & VERBS_QP_EX) + return &vqp->qp_ex; + return NULL; +} + +LATEST_SYMVER_FUNC(ibv_query_qp, 1_1, "IBVERBS_1.1", + int, + struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr) +{ + int ret; + + ret = get_ops(qp->context)->query_qp(qp, attr, attr_mask, init_attr); + if (ret) + return ret; + + if (attr_mask & IBV_QP_STATE) + qp->state = attr->qp_state; + + return 0; +} + +LATEST_SYMVER_FUNC(ibv_modify_qp, 1_1, "IBVERBS_1.1", + int, + struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask) +{ + int ret; + + ret = get_ops(qp->context)->modify_qp(qp, attr, attr_mask); + if (ret) + return ret; + + if (attr_mask & IBV_QP_STATE) + qp->state = attr->qp_state; + + return 0; +} + +LATEST_SYMVER_FUNC(ibv_destroy_qp, 1_1, "IBVERBS_1.1", + int, + struct ibv_qp *qp) +{ + return get_ops(qp->context)->destroy_qp(qp); +} + +LATEST_SYMVER_FUNC(ibv_create_ah, 1_1, "IBVERBS_1.1", + struct ibv_ah *, + struct ibv_pd *pd, struct ibv_ah_attr *attr) +{ + struct ibv_ah *ah = get_ops(pd->context)->create_ah(pd, attr); + + if (ah) { + ah->context = pd->context; + ah->pd = pd; + } + + return ah; +} + +/* GID types as appear in sysfs, no change is expected as of ABI + * compatibility. + */ +#define V1_TYPE "IB/RoCE v1" +#define V2_TYPE "RoCE v2" +int ibv_query_gid_type(struct ibv_context *context, uint8_t port_num, + unsigned int index, enum ibv_gid_type *type) +{ + struct verbs_device *verbs_device = verbs_get_device(context->device); + char buff[11]; + + /* Reset errno so that we can rely on its value upon any error flow in + * ibv_read_sysfs_file. + */ + errno = 0; + if (ibv_read_ibdev_sysfs_file(buff, sizeof(buff), verbs_device->sysfs, + "ports/%d/gid_attrs/types/%d", port_num, + index) <= 0) { + char *dir_path; + DIR *dir; + + if (errno == EINVAL) { + /* In IB, this file doesn't exist and the kernel sets + * errno to -EINVAL. + */ + *type = IBV_GID_TYPE_IB_ROCE_V1; + return 0; + } + if (asprintf(&dir_path, "%s/%s/%d/%s/", + verbs_device->sysfs->ibdev_path, "ports", port_num, + "gid_attrs") < 0) + return -1; + dir = opendir(dir_path); + free(dir_path); + if (!dir) { + if (errno == ENOENT) + /* Assuming that if gid_attrs doesn't exist, + * we have an old kernel and all GIDs are + * IB/RoCE v1 + */ + *type = IBV_GID_TYPE_IB_ROCE_V1; + else + return -1; + } else { + closedir(dir); + errno = EFAULT; + return -1; + } + } else { + if (!strcmp(buff, V1_TYPE)) { + *type = IBV_GID_TYPE_IB_ROCE_V1; + } else if (!strcmp(buff, V2_TYPE)) { + *type = IBV_GID_TYPE_ROCE_V2; + } else { + errno = ENOTSUP; + return -1; + } + } + + return 0; +} + +static int ibv_find_gid_index(struct ibv_context *context, uint8_t port_num, + union ibv_gid *gid, enum ibv_gid_type gid_type) +{ + enum ibv_gid_type sgid_type = 0; + union ibv_gid sgid; + int i = 0, ret; + + do { + ret = ibv_query_gid(context, port_num, i, &sgid); + if (!ret) { + ret = ibv_query_gid_type(context, port_num, i, + &sgid_type); + } + i++; + } while (!ret && (memcmp(&sgid, gid, sizeof(*gid)) || + (gid_type != sgid_type))); + + return ret ? 
ret : i - 1; +} + +static inline void map_ipv4_addr_to_ipv6(__be32 ipv4, struct in6_addr *ipv6) +{ + ipv6->s6_addr32[0] = 0; + ipv6->s6_addr32[1] = 0; + ipv6->s6_addr32[2] = htobe32(0x0000FFFF); + ipv6->s6_addr32[3] = ipv4; +} + +static inline __sum16 ipv4_calc_hdr_csum(uint16_t *data, unsigned int num_hwords) +{ + unsigned int i = 0; + uint32_t sum = 0; + + for (i = 0; i < num_hwords; i++) + sum += *(data++); + + sum = (sum & 0xffff) + (sum >> 16); + + return (__force __sum16)~sum; +} + +static inline int get_grh_header_version(struct ibv_grh *grh) +{ + int ip6h_version = (be32toh(grh->version_tclass_flow) >> 28) & 0xf; + struct iphdr *ip4h = (struct iphdr *)((void *)grh + 20); + struct iphdr ip4h_checked; + + if (ip6h_version != 6) { + if (ip4h->version == 4) + return 4; + errno = EPROTONOSUPPORT; + return -1; + } + /* version may be 6 or 4 */ + if (ip4h->ihl != 5) /* IPv4 header length must be 5 for RoCE v2. */ + return 6; + /* + * Verify checksum. + * We can't write on scattered buffers so we have to copy to temp + * buffer. + */ + memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked)); + /* Need to set the checksum field (check) to 0 before re-calculating + * the checksum. + */ + ip4h_checked.check = 0; + ip4h_checked.check = ipv4_calc_hdr_csum((uint16_t *)&ip4h_checked, 10); + /* if IPv4 header checksum is OK, believe it */ + if (ip4h->check == ip4h_checked.check) + return 4; + return 6; +} + +static inline void set_ah_attr_generic_fields(struct ibv_ah_attr *ah_attr, + struct ibv_wc *wc, + struct ibv_grh *grh, + uint8_t port_num) +{ + uint32_t flow_class; + + flow_class = be32toh(grh->version_tclass_flow); + ah_attr->grh.flow_label = flow_class & 0xFFFFF; + ah_attr->dlid = wc->slid; + ah_attr->sl = wc->sl; + ah_attr->src_path_bits = wc->dlid_path_bits; + ah_attr->port_num = port_num; +} + +static inline int set_ah_attr_by_ipv4(struct ibv_context *context, + struct ibv_ah_attr *ah_attr, + struct iphdr *ip4h, uint8_t port_num) +{ + union ibv_gid sgid; + int ret; + + /* No point searching multicast GIDs in GID table */ + if (IN_CLASSD(be32toh(ip4h->daddr))) { + errno = EINVAL; + return -1; + } + + map_ipv4_addr_to_ipv6(ip4h->daddr, (struct in6_addr *)&sgid); + ret = ibv_find_gid_index(context, port_num, &sgid, + IBV_GID_TYPE_ROCE_V2); + if (ret < 0) + return ret; + + map_ipv4_addr_to_ipv6(ip4h->saddr, + (struct in6_addr *)&ah_attr->grh.dgid); + ah_attr->grh.sgid_index = (uint8_t) ret; + ah_attr->grh.hop_limit = ip4h->ttl; + ah_attr->grh.traffic_class = ip4h->tos; + + return 0; +} + +#define IB_NEXT_HDR 0x1b +static inline int set_ah_attr_by_ipv6(struct ibv_context *context, + struct ibv_ah_attr *ah_attr, + struct ibv_grh *grh, uint8_t port_num) +{ + uint32_t flow_class; + uint32_t sgid_type; + int ret; + + /* No point searching multicast GIDs in GID table */ + if (grh->dgid.raw[0] == 0xFF) { + errno = EINVAL; + return -1; + } + + ah_attr->grh.dgid = grh->sgid; + if (grh->next_hdr == IPPROTO_UDP) { + sgid_type = IBV_GID_TYPE_ROCE_V2; + } else if (grh->next_hdr == IB_NEXT_HDR) { + sgid_type = IBV_GID_TYPE_IB_ROCE_V1; + } else { + errno = EPROTONOSUPPORT; + return -1; + } + + ret = ibv_find_gid_index(context, port_num, &grh->dgid, + sgid_type); + if (ret < 0) + return ret; + + ah_attr->grh.sgid_index = (uint8_t) ret; + flow_class = be32toh(grh->version_tclass_flow); + ah_attr->grh.hop_limit = grh->hop_limit; + ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF; + + return 0; +} + +int ibv_init_ah_from_wc(struct ibv_context *context, uint8_t port_num, + struct ibv_wc *wc, struct ibv_grh 
*grh, + struct ibv_ah_attr *ah_attr) +{ + int version; + int ret = 0; + + memset(ah_attr, 0, sizeof *ah_attr); + set_ah_attr_generic_fields(ah_attr, wc, grh, port_num); + + if (wc->wc_flags & IBV_WC_GRH) { + ah_attr->is_global = 1; + version = get_grh_header_version(grh); + + if (version == 4) + ret = set_ah_attr_by_ipv4(context, ah_attr, + (struct iphdr *)((void *)grh + 20), + port_num); + else if (version == 6) + ret = set_ah_attr_by_ipv6(context, ah_attr, grh, + port_num); + else + ret = -1; + } + + return ret; +} + +struct ibv_ah *ibv_create_ah_from_wc(struct ibv_pd *pd, struct ibv_wc *wc, + struct ibv_grh *grh, uint8_t port_num) +{ + struct ibv_ah_attr ah_attr; + int ret; + + ret = ibv_init_ah_from_wc(pd->context, port_num, wc, grh, &ah_attr); + if (ret) + return NULL; + + return ibv_create_ah(pd, &ah_attr); +} + +LATEST_SYMVER_FUNC(ibv_destroy_ah, 1_1, "IBVERBS_1.1", + int, + struct ibv_ah *ah) +{ + return get_ops(ah->context)->destroy_ah(ah); +} + +LATEST_SYMVER_FUNC(ibv_attach_mcast, 1_1, "IBVERBS_1.1", + int, + struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid) +{ + return get_ops(qp->context)->attach_mcast(qp, gid, lid); +} + +LATEST_SYMVER_FUNC(ibv_detach_mcast, 1_1, "IBVERBS_1.1", + int, + struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid) +{ + return get_ops(qp->context)->detach_mcast(qp, gid, lid); +} + +static inline int ipv6_addr_v4mapped(const struct in6_addr *a) +{ + return IN6_IS_ADDR_V4MAPPED(&a->s6_addr32) || + /* IPv4 encoded multicast addresses */ + (a->s6_addr32[0] == htobe32(0xff0e0000) && + ((a->s6_addr32[1] | + (a->s6_addr32[2] ^ htobe32(0x0000ffff))) == 0UL)); +} + +struct peer_address { + void *address; + uint32_t size; +}; + +static inline int create_peer_from_gid(int family, void *raw_gid, + struct peer_address *peer_address) +{ + switch (family) { + case AF_INET: + peer_address->address = raw_gid + 12; + peer_address->size = 4; + break; + case AF_INET6: + peer_address->address = raw_gid; + peer_address->size = 16; + break; + default: + return -1; + } + + return 0; +} + +#define NEIGH_GET_DEFAULT_TIMEOUT_MS 3000 +int ibv_resolve_eth_l2_from_gid(struct ibv_context *context, + struct ibv_ah_attr *attr, + uint8_t eth_mac[ETHERNET_LL_SIZE], + uint16_t *vid) +{ + int dst_family; + int src_family; + int oif; + struct get_neigh_handler neigh_handler; + union ibv_gid sgid; + int ether_len; + struct peer_address src; + struct peer_address dst; + uint16_t ret_vid; + int ret = -EINVAL; + int err; + + err = ibv_query_gid(context, attr->port_num, + attr->grh.sgid_index, &sgid); + + if (err) + return err; + + err = neigh_init_resources(&neigh_handler, + NEIGH_GET_DEFAULT_TIMEOUT_MS); + + if (err) + return err; + + dst_family = ipv6_addr_v4mapped((struct in6_addr *)attr->grh.dgid.raw) ? + AF_INET : AF_INET6; + src_family = ipv6_addr_v4mapped((struct in6_addr *)sgid.raw) ? 
+ AF_INET : AF_INET6; + + if (create_peer_from_gid(dst_family, attr->grh.dgid.raw, &dst)) + goto free_resources; + + if (create_peer_from_gid(src_family, &sgid.raw, &src)) + goto free_resources; + + if (neigh_set_dst(&neigh_handler, dst_family, dst.address, + dst.size)) + goto free_resources; + + if (neigh_set_src(&neigh_handler, src_family, src.address, + src.size)) + goto free_resources; + + oif = neigh_get_oif_from_src(&neigh_handler); + + if (oif > 0) + neigh_set_oif(&neigh_handler, oif); + else + goto free_resources; + + ret = -EHOSTUNREACH; + + /* blocking call */ + if (process_get_neigh(&neigh_handler)) + goto free_resources; + + if (vid) { + ret_vid = neigh_get_vlan_id_from_dev(&neigh_handler); + + if (ret_vid <= 0xfff) + neigh_set_vlan_id(&neigh_handler, ret_vid); + } + + /* We are using only Ethernet here */ + ether_len = neigh_get_ll(&neigh_handler, + eth_mac, + sizeof(uint8_t) * ETHERNET_LL_SIZE); + + if (ether_len <= 0) + goto free_resources; + + if (vid) + *vid = ret_vid; + + ret = 0; + +free_resources: + neigh_free_resources(&neigh_handler); + + return ret; +} diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h new file mode 100644 index 0000000..288985d --- /dev/null +++ b/libibverbs/verbs.h @@ -0,0 +1,3352 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2004, 2011-2012 Intel Corporation. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
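
ibv_resolve_eth_l2_from_gid() above is the RoCE address-resolution helper: given a populated ibv_ah_attr it consults the kernel neighbour tables (blocking in process_get_neigh()) and returns the destination MAC and, optionally, the VLAN id. A calling sketch, assuming attr was filled in earlier, for example by ibv_init_ah_from_wc():

#include <stdint.h>
#include <infiniband/verbs.h>

static int fill_l2(struct ibv_context *ctx, struct ibv_ah_attr *attr)
{
	uint8_t dmac[ETHERNET_LL_SIZE];	/* 6-byte Ethernet MAC */
	uint16_t vid;
	int err;

	/* Blocking call: walks routing/neighbour state for the dgid. */
	err = ibv_resolve_eth_l2_from_gid(ctx, attr, dmac, &vid);
	if (err)
		return err;	/* e.g. -EINVAL or -EHOSTUNREACH above */

	/* dmac/vid are now usable for a raw packet or provider AH. */
	return 0;
}
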
+ */
+
+#ifndef INFINIBAND_VERBS_H
+#define INFINIBAND_VERBS_H
+
+#include <stdint.h>
+#include <pthread.h>
+#include <stddef.h>
+#include <errno.h>
+#include <string.h>
+#include <linux/types.h>
+#include <stdint.h>
+#include <infiniband/verbs_api.h>
+
+#ifdef __cplusplus
+#include <limits>
+#endif
+
+#if __GNUC__ >= 3
+# define __attribute_const __attribute__((const))
+#else
+# define __attribute_const
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+union ibv_gid {
+	uint8_t raw[16];
+	struct {
+		__be64 subnet_prefix;
+		__be64 interface_id;
+	} global;
+};
+
+#define vext_field_avail(type, fld, sz) (offsetof(type, fld) < (sz))
+
+#ifdef __cplusplus
+#define __VERBS_ABI_IS_EXTENDED ((void *)std::numeric_limits<uintptr_t>::max())
+#else
+#define __VERBS_ABI_IS_EXTENDED ((void *)UINTPTR_MAX)
+#endif
+
+enum ibv_node_type {
+	IBV_NODE_UNKNOWN = -1,
+	IBV_NODE_CA = 1,
+	IBV_NODE_SWITCH,
+	IBV_NODE_ROUTER,
+	IBV_NODE_RNIC,
+	IBV_NODE_USNIC,
+	IBV_NODE_USNIC_UDP,
+	IBV_NODE_UNSPECIFIED,
+};
+
+enum ibv_transport_type {
+	IBV_TRANSPORT_UNKNOWN = -1,
+	IBV_TRANSPORT_IB = 0,
+	IBV_TRANSPORT_IWARP,
+	IBV_TRANSPORT_USNIC,
+	IBV_TRANSPORT_USNIC_UDP,
+	IBV_TRANSPORT_UNSPECIFIED,
+};
+
+enum ibv_device_cap_flags {
+	IBV_DEVICE_RESIZE_MAX_WR = 1,
+	IBV_DEVICE_BAD_PKEY_CNTR = 1 << 1,
+	IBV_DEVICE_BAD_QKEY_CNTR = 1 << 2,
+	IBV_DEVICE_RAW_MULTI = 1 << 3,
+	IBV_DEVICE_AUTO_PATH_MIG = 1 << 4,
+	IBV_DEVICE_CHANGE_PHY_PORT = 1 << 5,
+	IBV_DEVICE_UD_AV_PORT_ENFORCE = 1 << 6,
+	IBV_DEVICE_CURR_QP_STATE_MOD = 1 << 7,
+	IBV_DEVICE_SHUTDOWN_PORT = 1 << 8,
+	IBV_DEVICE_INIT_TYPE = 1 << 9,
+	IBV_DEVICE_PORT_ACTIVE_EVENT = 1 << 10,
+	IBV_DEVICE_SYS_IMAGE_GUID = 1 << 11,
+	IBV_DEVICE_RC_RNR_NAK_GEN = 1 << 12,
+	IBV_DEVICE_SRQ_RESIZE = 1 << 13,
+	IBV_DEVICE_N_NOTIFY_CQ = 1 << 14,
+	IBV_DEVICE_MEM_WINDOW = 1 << 17,
+	IBV_DEVICE_UD_IP_CSUM = 1 << 18,
+	IBV_DEVICE_XRC = 1 << 20,
+	IBV_DEVICE_MEM_MGT_EXTENSIONS = 1 << 21,
+	IBV_DEVICE_MEM_WINDOW_TYPE_2A = 1 << 23,
+	IBV_DEVICE_MEM_WINDOW_TYPE_2B = 1 << 24,
+	IBV_DEVICE_RC_IP_CSUM = 1 << 25,
+	IBV_DEVICE_RAW_IP_CSUM = 1 << 26,
+	IBV_DEVICE_MANAGED_FLOW_STEERING = 1 << 29
+};
+
+/*
+ * Can't extend beyond the ibv_device_cap_flags enum, as on some
+ * systems/compilers the enum range is limited to 4 bytes.
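
Because the enum above stops below bit 32, the two defines that follow (IBV_DEVICE_RAW_SCATTER_FCS and IBV_DEVICE_PCI_WRITE_END_PADDING) are only meaningful against the 64-bit device_cap_flags_ex filled in by ibv_query_device_ex(); testing them against the 32-bit device_cap_flags can never match. A sketch:

#include <stdio.h>
#include <infiniband/verbs.h>

static void check_ext_caps(struct ibv_context *ctx)
{
	struct ibv_device_attr_ex attr;

	if (ibv_query_device_ex(ctx, NULL, &attr))
		return;

	/* Bits >= 32 live only in the 64-bit extended mask. */
	if (attr.device_cap_flags_ex & IBV_DEVICE_RAW_SCATTER_FCS)
		printf("raw scatter FCS supported\n");
	if (attr.device_cap_flags_ex & IBV_DEVICE_PCI_WRITE_END_PADDING)
		printf("PCI write end padding supported\n");
}
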
+ */ +#define IBV_DEVICE_RAW_SCATTER_FCS (1ULL << 34) +#define IBV_DEVICE_PCI_WRITE_END_PADDING (1ULL << 36) + +enum ibv_atomic_cap { + IBV_ATOMIC_NONE, + IBV_ATOMIC_HCA, + IBV_ATOMIC_GLOB +}; + +struct ibv_alloc_dm_attr { + size_t length; + uint32_t log_align_req; + uint32_t comp_mask; +}; + +struct ibv_dm { + struct ibv_context *context; + int (*memcpy_to_dm)(struct ibv_dm *dm, uint64_t dm_offset, + const void *host_addr, size_t length); + int (*memcpy_from_dm)(void *host_addr, struct ibv_dm *dm, + uint64_t dm_offset, size_t length); + uint32_t comp_mask; +}; + +struct ibv_device_attr { + char fw_ver[64]; + __be64 node_guid; + __be64 sys_image_guid; + uint64_t max_mr_size; + uint64_t page_size_cap; + uint32_t vendor_id; + uint32_t vendor_part_id; + uint32_t hw_ver; + int max_qp; + int max_qp_wr; + unsigned int device_cap_flags; + int max_sge; + int max_sge_rd; + int max_cq; + int max_cqe; + int max_mr; + int max_pd; + int max_qp_rd_atom; + int max_ee_rd_atom; + int max_res_rd_atom; + int max_qp_init_rd_atom; + int max_ee_init_rd_atom; + enum ibv_atomic_cap atomic_cap; + int max_ee; + int max_rdd; + int max_mw; + int max_raw_ipv6_qp; + int max_raw_ethy_qp; + int max_mcast_grp; + int max_mcast_qp_attach; + int max_total_mcast_qp_attach; + int max_ah; + int max_fmr; + int max_map_per_fmr; + int max_srq; + int max_srq_wr; + int max_srq_sge; + uint16_t max_pkeys; + uint8_t local_ca_ack_delay; + uint8_t phys_port_cnt; +}; + +/* An extensible input struct for possible future extensions of the + * ibv_query_device_ex verb. */ +struct ibv_query_device_ex_input { + uint32_t comp_mask; +}; + +enum ibv_odp_transport_cap_bits { + IBV_ODP_SUPPORT_SEND = 1 << 0, + IBV_ODP_SUPPORT_RECV = 1 << 1, + IBV_ODP_SUPPORT_WRITE = 1 << 2, + IBV_ODP_SUPPORT_READ = 1 << 3, + IBV_ODP_SUPPORT_ATOMIC = 1 << 4, + IBV_ODP_SUPPORT_SRQ_RECV = 1 << 5, +}; + +struct ibv_odp_caps { + uint64_t general_caps; + struct { + uint32_t rc_odp_caps; + uint32_t uc_odp_caps; + uint32_t ud_odp_caps; + } per_transport_caps; +}; + +enum ibv_odp_general_caps { + IBV_ODP_SUPPORT = 1 << 0, + IBV_ODP_SUPPORT_IMPLICIT = 1 << 1, +}; + +struct ibv_tso_caps { + uint32_t max_tso; + uint32_t supported_qpts; +}; + +/* RX Hash function flags */ +enum ibv_rx_hash_function_flags { + IBV_RX_HASH_FUNC_TOEPLITZ = 1 << 0, +}; + +/* + * RX Hash fields enable to set which incoming packet's field should + * participates in RX Hash. Each flag represent certain packet's field, + * when the flag is set the field that is represented by the flag will + * participate in RX Hash calculation. + * Note: *IPV4 and *IPV6 flags can't be enabled together on the same QP + * and *TCP and *UDP flags can't be enabled together on the same QP. 
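
A concrete reading of the restriction above, for the enum that follows: one QP may hash on IPv4 addresses together with TCP ports, but must not mix IPv4 with IPv6 fields, or TCP with UDP ports. A sketch of a conforming configuration, using struct ibv_rx_hash_conf as defined later in this header (the Toeplitz key is supplied by the caller):

#include <stdint.h>
#include <infiniband/verbs.h>

/* Valid mask: IPv4 source/destination plus TCP ports. Combining
 * *_IPV4 with *_IPV6, or *_TCP with *_UDP, would be rejected. */
static struct ibv_rx_hash_conf make_hash_conf(uint8_t *key, uint8_t key_len)
{
	struct ibv_rx_hash_conf conf = {
		.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
		.rx_hash_key_len = key_len,
		.rx_hash_key = key,
		.rx_hash_fields_mask = IBV_RX_HASH_SRC_IPV4 |
				       IBV_RX_HASH_DST_IPV4 |
				       IBV_RX_HASH_SRC_PORT_TCP |
				       IBV_RX_HASH_DST_PORT_TCP,
	};

	return conf;
}
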
+*/ +enum ibv_rx_hash_fields { + IBV_RX_HASH_SRC_IPV4 = 1 << 0, + IBV_RX_HASH_DST_IPV4 = 1 << 1, + IBV_RX_HASH_SRC_IPV6 = 1 << 2, + IBV_RX_HASH_DST_IPV6 = 1 << 3, + IBV_RX_HASH_SRC_PORT_TCP = 1 << 4, + IBV_RX_HASH_DST_PORT_TCP = 1 << 5, + IBV_RX_HASH_SRC_PORT_UDP = 1 << 6, + IBV_RX_HASH_DST_PORT_UDP = 1 << 7, + IBV_RX_HASH_IPSEC_SPI = 1 << 8, + IBV_RX_HASH_INNER = (1UL << 31), +}; + +struct ibv_rss_caps { + uint32_t supported_qpts; + uint32_t max_rwq_indirection_tables; + uint32_t max_rwq_indirection_table_size; + uint64_t rx_hash_fields_mask; /* enum ibv_rx_hash_fields */ + uint8_t rx_hash_function; /* enum ibv_rx_hash_function_flags */ +}; + +struct ibv_packet_pacing_caps { + uint32_t qp_rate_limit_min; + uint32_t qp_rate_limit_max; /* In kbps */ + uint32_t supported_qpts; +}; + +enum ibv_raw_packet_caps { + IBV_RAW_PACKET_CAP_CVLAN_STRIPPING = 1 << 0, + IBV_RAW_PACKET_CAP_SCATTER_FCS = 1 << 1, + IBV_RAW_PACKET_CAP_IP_CSUM = 1 << 2, + IBV_RAW_PACKET_CAP_DELAY_DROP = 1 << 3, +}; + +enum ibv_tm_cap_flags { + IBV_TM_CAP_RC = 1 << 0, +}; + +struct ibv_tm_caps { + /* Max size of rendezvous request header */ + uint32_t max_rndv_hdr_size; + /* Max number of tagged buffers in a TM-SRQ matching list */ + uint32_t max_num_tags; + /* From enum ibv_tm_cap_flags */ + uint32_t flags; + /* Max number of outstanding list operations */ + uint32_t max_ops; + /* Max number of SGEs in a tagged buffer */ + uint32_t max_sge; +}; + +struct ibv_cq_moderation_caps { + uint16_t max_cq_count; + uint16_t max_cq_period; /* in micro seconds */ +}; + +enum ibv_pci_atomic_op_size { + IBV_PCI_ATOMIC_OPERATION_4_BYTE_SIZE_SUP = 1 << 0, + IBV_PCI_ATOMIC_OPERATION_8_BYTE_SIZE_SUP = 1 << 1, + IBV_PCI_ATOMIC_OPERATION_16_BYTE_SIZE_SUP = 1 << 2, +}; + +/* + * Bitmask for supported operation sizes + * Use enum ibv_pci_atomic_op_size + */ +struct ibv_pci_atomic_caps { + uint16_t fetch_add; + uint16_t swap; + uint16_t compare_swap; +}; + +struct ibv_device_attr_ex { + struct ibv_device_attr orig_attr; + uint32_t comp_mask; + struct ibv_odp_caps odp_caps; + uint64_t completion_timestamp_mask; + uint64_t hca_core_clock; + uint64_t device_cap_flags_ex; + struct ibv_tso_caps tso_caps; + struct ibv_rss_caps rss_caps; + uint32_t max_wq_type_rq; + struct ibv_packet_pacing_caps packet_pacing_caps; + uint32_t raw_packet_caps; /* Use ibv_raw_packet_caps */ + struct ibv_tm_caps tm_caps; + struct ibv_cq_moderation_caps cq_mod_caps; + uint64_t max_dm_size; + struct ibv_pci_atomic_caps pci_atomic_caps; + uint32_t xrc_odp_caps; +}; + +enum ibv_mtu { + IBV_MTU_256 = 1, + IBV_MTU_512 = 2, + IBV_MTU_1024 = 3, + IBV_MTU_2048 = 4, + IBV_MTU_4096 = 5 +}; + +enum ibv_port_state { + IBV_PORT_NOP = 0, + IBV_PORT_DOWN = 1, + IBV_PORT_INIT = 2, + IBV_PORT_ARMED = 3, + IBV_PORT_ACTIVE = 4, + IBV_PORT_ACTIVE_DEFER = 5 +}; + +enum { + IBV_LINK_LAYER_UNSPECIFIED, + IBV_LINK_LAYER_INFINIBAND, + IBV_LINK_LAYER_ETHERNET, +}; + +enum ibv_port_cap_flags { + IBV_PORT_SM = 1 << 1, + IBV_PORT_NOTICE_SUP = 1 << 2, + IBV_PORT_TRAP_SUP = 1 << 3, + IBV_PORT_OPT_IPD_SUP = 1 << 4, + IBV_PORT_AUTO_MIGR_SUP = 1 << 5, + IBV_PORT_SL_MAP_SUP = 1 << 6, + IBV_PORT_MKEY_NVRAM = 1 << 7, + IBV_PORT_PKEY_NVRAM = 1 << 8, + IBV_PORT_LED_INFO_SUP = 1 << 9, + IBV_PORT_SYS_IMAGE_GUID_SUP = 1 << 11, + IBV_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12, + IBV_PORT_EXTENDED_SPEEDS_SUP = 1 << 14, + IBV_PORT_CAP_MASK2_SUP = 1 << 15, + IBV_PORT_CM_SUP = 1 << 16, + IBV_PORT_SNMP_TUNNEL_SUP = 1 << 17, + IBV_PORT_REINIT_SUP = 1 << 18, + IBV_PORT_DEVICE_MGMT_SUP = 1 << 19, + IBV_PORT_VENDOR_CLASS_SUP = 
1 << 20, + IBV_PORT_DR_NOTICE_SUP = 1 << 21, + IBV_PORT_CAP_MASK_NOTICE_SUP = 1 << 22, + IBV_PORT_BOOT_MGMT_SUP = 1 << 23, + IBV_PORT_LINK_LATENCY_SUP = 1 << 24, + IBV_PORT_CLIENT_REG_SUP = 1 << 25, + IBV_PORT_IP_BASED_GIDS = 1 << 26 +}; + +enum ibv_port_cap_flags2 { + IBV_PORT_SET_NODE_DESC_SUP = 1 << 0, + IBV_PORT_INFO_EXT_SUP = 1 << 1, + IBV_PORT_VIRT_SUP = 1 << 2, + IBV_PORT_SWITCH_PORT_STATE_TABLE_SUP = 1 << 3, + IBV_PORT_LINK_WIDTH_2X_SUP = 1 << 4, + IBV_PORT_LINK_SPEED_HDR_SUP = 1 << 5, +}; + +struct ibv_port_attr { + enum ibv_port_state state; + enum ibv_mtu max_mtu; + enum ibv_mtu active_mtu; + int gid_tbl_len; + uint32_t port_cap_flags; + uint32_t max_msg_sz; + uint32_t bad_pkey_cntr; + uint32_t qkey_viol_cntr; + uint16_t pkey_tbl_len; + uint16_t lid; + uint16_t sm_lid; + uint8_t lmc; + uint8_t max_vl_num; + uint8_t sm_sl; + uint8_t subnet_timeout; + uint8_t init_type_reply; + uint8_t active_width; + uint8_t active_speed; + uint8_t phys_state; + uint8_t link_layer; + uint8_t flags; + uint16_t port_cap_flags2; +}; + +enum ibv_event_type { + IBV_EVENT_CQ_ERR, + IBV_EVENT_QP_FATAL, + IBV_EVENT_QP_REQ_ERR, + IBV_EVENT_QP_ACCESS_ERR, + IBV_EVENT_COMM_EST, + IBV_EVENT_SQ_DRAINED, + IBV_EVENT_PATH_MIG, + IBV_EVENT_PATH_MIG_ERR, + IBV_EVENT_DEVICE_FATAL, + IBV_EVENT_PORT_ACTIVE, + IBV_EVENT_PORT_ERR, + IBV_EVENT_LID_CHANGE, + IBV_EVENT_PKEY_CHANGE, + IBV_EVENT_SM_CHANGE, + IBV_EVENT_SRQ_ERR, + IBV_EVENT_SRQ_LIMIT_REACHED, + IBV_EVENT_QP_LAST_WQE_REACHED, + IBV_EVENT_CLIENT_REREGISTER, + IBV_EVENT_GID_CHANGE, + IBV_EVENT_WQ_FATAL, +}; + +struct ibv_async_event { + union { + struct ibv_cq *cq; + struct ibv_qp *qp; + struct ibv_srq *srq; + struct ibv_wq *wq; + int port_num; + } element; + enum ibv_event_type event_type; +}; + +enum ibv_wc_status { + IBV_WC_SUCCESS, + IBV_WC_LOC_LEN_ERR, + IBV_WC_LOC_QP_OP_ERR, + IBV_WC_LOC_EEC_OP_ERR, + IBV_WC_LOC_PROT_ERR, + IBV_WC_WR_FLUSH_ERR, + IBV_WC_MW_BIND_ERR, + IBV_WC_BAD_RESP_ERR, + IBV_WC_LOC_ACCESS_ERR, + IBV_WC_REM_INV_REQ_ERR, + IBV_WC_REM_ACCESS_ERR, + IBV_WC_REM_OP_ERR, + IBV_WC_RETRY_EXC_ERR, + IBV_WC_RNR_RETRY_EXC_ERR, + IBV_WC_LOC_RDD_VIOL_ERR, + IBV_WC_REM_INV_RD_REQ_ERR, + IBV_WC_REM_ABORT_ERR, + IBV_WC_INV_EECN_ERR, + IBV_WC_INV_EEC_STATE_ERR, + IBV_WC_FATAL_ERR, + IBV_WC_RESP_TIMEOUT_ERR, + IBV_WC_GENERAL_ERR, + IBV_WC_TM_ERR, + IBV_WC_TM_RNDV_INCOMPLETE, +}; +const char *ibv_wc_status_str(enum ibv_wc_status status); + +enum ibv_wc_opcode { + IBV_WC_SEND, + IBV_WC_RDMA_WRITE, + IBV_WC_RDMA_READ, + IBV_WC_COMP_SWAP, + IBV_WC_FETCH_ADD, + IBV_WC_BIND_MW, + IBV_WC_LOCAL_INV, + IBV_WC_TSO, +/* + * Set value of IBV_WC_RECV so consumers can test if a completion is a + * receive by testing (opcode & IBV_WC_RECV). 
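
In practice the comment above means a completion handler can classify receive-side completions with a single bit test instead of enumerating opcodes, as in this sketch:

#include <infiniband/verbs.h>

static void handle_wc(const struct ibv_wc *wc)
{
	if (wc->opcode & IBV_WC_RECV) {
		/* IBV_WC_RECV, IBV_WC_RECV_RDMA_WITH_IMM, IBV_WC_TM_* ... */
	} else {
		/* send-side completion: SEND, RDMA_WRITE, RDMA_READ, ... */
	}
}
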
+ */ + IBV_WC_RECV = 1 << 7, + IBV_WC_RECV_RDMA_WITH_IMM, + + IBV_WC_TM_ADD, + IBV_WC_TM_DEL, + IBV_WC_TM_SYNC, + IBV_WC_TM_RECV, + IBV_WC_TM_NO_TAG, + IBV_WC_DRIVER1, +}; + +enum { + IBV_WC_IP_CSUM_OK_SHIFT = 2 +}; + +enum ibv_create_cq_wc_flags { + IBV_WC_EX_WITH_BYTE_LEN = 1 << 0, + IBV_WC_EX_WITH_IMM = 1 << 1, + IBV_WC_EX_WITH_QP_NUM = 1 << 2, + IBV_WC_EX_WITH_SRC_QP = 1 << 3, + IBV_WC_EX_WITH_SLID = 1 << 4, + IBV_WC_EX_WITH_SL = 1 << 5, + IBV_WC_EX_WITH_DLID_PATH_BITS = 1 << 6, + IBV_WC_EX_WITH_COMPLETION_TIMESTAMP = 1 << 7, + IBV_WC_EX_WITH_CVLAN = 1 << 8, + IBV_WC_EX_WITH_FLOW_TAG = 1 << 9, + IBV_WC_EX_WITH_TM_INFO = 1 << 10, + IBV_WC_EX_WITH_COMPLETION_TIMESTAMP_WALLCLOCK = 1 << 11, +}; + +enum { + IBV_WC_STANDARD_FLAGS = IBV_WC_EX_WITH_BYTE_LEN | + IBV_WC_EX_WITH_IMM | + IBV_WC_EX_WITH_QP_NUM | + IBV_WC_EX_WITH_SRC_QP | + IBV_WC_EX_WITH_SLID | + IBV_WC_EX_WITH_SL | + IBV_WC_EX_WITH_DLID_PATH_BITS +}; + +enum { + IBV_CREATE_CQ_SUP_WC_FLAGS = IBV_WC_STANDARD_FLAGS | + IBV_WC_EX_WITH_COMPLETION_TIMESTAMP | + IBV_WC_EX_WITH_CVLAN | + IBV_WC_EX_WITH_FLOW_TAG | + IBV_WC_EX_WITH_TM_INFO | + IBV_WC_EX_WITH_COMPLETION_TIMESTAMP_WALLCLOCK +}; + +enum ibv_wc_flags { + IBV_WC_GRH = 1 << 0, + IBV_WC_WITH_IMM = 1 << 1, + IBV_WC_IP_CSUM_OK = 1 << IBV_WC_IP_CSUM_OK_SHIFT, + IBV_WC_WITH_INV = 1 << 3, + IBV_WC_TM_SYNC_REQ = 1 << 4, + IBV_WC_TM_MATCH = 1 << 5, + IBV_WC_TM_DATA_VALID = 1 << 6, +}; + +struct ibv_wc { + uint64_t wr_id; + enum ibv_wc_status status; + enum ibv_wc_opcode opcode; + uint32_t vendor_err; + uint32_t byte_len; + /* When (wc_flags & IBV_WC_WITH_IMM): Immediate data in network byte order. + * When (wc_flags & IBV_WC_WITH_INV): Stores the invalidated rkey. + */ + union { + __be32 imm_data; + uint32_t invalidated_rkey; + }; + uint32_t qp_num; + uint32_t src_qp; + unsigned int wc_flags; + uint16_t pkey_index; + uint16_t slid; + uint8_t sl; + uint8_t dlid_path_bits; +}; + +enum ibv_access_flags { + IBV_ACCESS_LOCAL_WRITE = 1, + IBV_ACCESS_REMOTE_WRITE = (1<<1), + IBV_ACCESS_REMOTE_READ = (1<<2), + IBV_ACCESS_REMOTE_ATOMIC = (1<<3), + IBV_ACCESS_MW_BIND = (1<<4), + IBV_ACCESS_ZERO_BASED = (1<<5), + IBV_ACCESS_ON_DEMAND = (1<<6), + IBV_ACCESS_HUGETLB = (1<<7), + IBV_ACCESS_RELAXED_ORDERING = IBV_ACCESS_OPTIONAL_FIRST, +}; + +struct ibv_mw_bind_info { + struct ibv_mr *mr; + uint64_t addr; + uint64_t length; + unsigned int mw_access_flags; /* use ibv_access_flags */ +}; + +struct ibv_pd { + struct ibv_context *context; + uint32_t handle; +}; + +struct ibv_td_init_attr { + uint32_t comp_mask; +}; + +struct ibv_td { + struct ibv_context *context; +}; + +enum ibv_xrcd_init_attr_mask { + IBV_XRCD_INIT_ATTR_FD = 1 << 0, + IBV_XRCD_INIT_ATTR_OFLAGS = 1 << 1, + IBV_XRCD_INIT_ATTR_RESERVED = 1 << 2 +}; + +struct ibv_xrcd_init_attr { + uint32_t comp_mask; + int fd; + int oflags; +}; + +struct ibv_xrcd { + struct ibv_context *context; +}; + +enum ibv_rereg_mr_flags { + IBV_REREG_MR_CHANGE_TRANSLATION = (1 << 0), + IBV_REREG_MR_CHANGE_PD = (1 << 1), + IBV_REREG_MR_CHANGE_ACCESS = (1 << 2), + IBV_REREG_MR_KEEP_VALID = (1 << 3), + IBV_REREG_MR_FLAGS_SUPPORTED = ((IBV_REREG_MR_KEEP_VALID << 1) - 1) +}; + +struct ibv_mr { + struct ibv_context *context; + struct ibv_pd *pd; + void *addr; + size_t length; + uint32_t handle; + uint32_t lkey; + uint32_t rkey; +}; + +enum ibv_mw_type { + IBV_MW_TYPE_1 = 1, + IBV_MW_TYPE_2 = 2 +}; + +struct ibv_mw { + struct ibv_context *context; + struct ibv_pd *pd; + uint32_t rkey; + uint32_t handle; + enum ibv_mw_type type; +}; + +struct ibv_global_route { + union 
ibv_gid dgid; + uint32_t flow_label; + uint8_t sgid_index; + uint8_t hop_limit; + uint8_t traffic_class; +}; + +struct ibv_grh { + __be32 version_tclass_flow; + __be16 paylen; + uint8_t next_hdr; + uint8_t hop_limit; + union ibv_gid sgid; + union ibv_gid dgid; +}; + +enum ibv_rate { + IBV_RATE_MAX = 0, + IBV_RATE_2_5_GBPS = 2, + IBV_RATE_5_GBPS = 5, + IBV_RATE_10_GBPS = 3, + IBV_RATE_20_GBPS = 6, + IBV_RATE_30_GBPS = 4, + IBV_RATE_40_GBPS = 7, + IBV_RATE_60_GBPS = 8, + IBV_RATE_80_GBPS = 9, + IBV_RATE_120_GBPS = 10, + IBV_RATE_14_GBPS = 11, + IBV_RATE_56_GBPS = 12, + IBV_RATE_112_GBPS = 13, + IBV_RATE_168_GBPS = 14, + IBV_RATE_25_GBPS = 15, + IBV_RATE_100_GBPS = 16, + IBV_RATE_200_GBPS = 17, + IBV_RATE_300_GBPS = 18, + IBV_RATE_28_GBPS = 19, + IBV_RATE_50_GBPS = 20, + IBV_RATE_400_GBPS = 21, + IBV_RATE_600_GBPS = 22, +}; + +/** + * ibv_rate_to_mult - Convert the IB rate enum to a multiple of the + * base rate of 2.5 Gbit/sec. For example, IBV_RATE_5_GBPS will be + * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec. + * @rate: rate to convert. + */ +int __attribute_const ibv_rate_to_mult(enum ibv_rate rate); + +/** + * mult_to_ibv_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate enum. + * @mult: multiple to convert. + */ +enum ibv_rate __attribute_const mult_to_ibv_rate(int mult); + +/** + * ibv_rate_to_mbps - Convert the IB rate enum to Mbit/sec. + * For example, IBV_RATE_5_GBPS will return the value 5000. + * @rate: rate to convert. + */ +int __attribute_const ibv_rate_to_mbps(enum ibv_rate rate); + +/** + * mbps_to_ibv_rate - Convert a Mbit/sec value to an IB rate enum. + * @mbps: value to convert. + */ +enum ibv_rate __attribute_const mbps_to_ibv_rate(int mbps) __attribute_const; + +struct ibv_ah_attr { + struct ibv_global_route grh; + uint16_t dlid; + uint8_t sl; + uint8_t src_path_bits; + uint8_t static_rate; + uint8_t is_global; + uint8_t port_num; +}; + +enum ibv_srq_attr_mask { + IBV_SRQ_MAX_WR = 1 << 0, + IBV_SRQ_LIMIT = 1 << 1 +}; + +struct ibv_srq_attr { + uint32_t max_wr; + uint32_t max_sge; + uint32_t srq_limit; +}; + +struct ibv_srq_init_attr { + void *srq_context; + struct ibv_srq_attr attr; +}; + +enum ibv_srq_type { + IBV_SRQT_BASIC, + IBV_SRQT_XRC, + IBV_SRQT_TM, +}; + +enum ibv_srq_init_attr_mask { + IBV_SRQ_INIT_ATTR_TYPE = 1 << 0, + IBV_SRQ_INIT_ATTR_PD = 1 << 1, + IBV_SRQ_INIT_ATTR_XRCD = 1 << 2, + IBV_SRQ_INIT_ATTR_CQ = 1 << 3, + IBV_SRQ_INIT_ATTR_TM = 1 << 4, + IBV_SRQ_INIT_ATTR_RESERVED = 1 << 5, +}; + +struct ibv_tm_cap { + uint32_t max_num_tags; + uint32_t max_ops; +}; + +struct ibv_srq_init_attr_ex { + void *srq_context; + struct ibv_srq_attr attr; + + uint32_t comp_mask; + enum ibv_srq_type srq_type; + struct ibv_pd *pd; + struct ibv_xrcd *xrcd; + struct ibv_cq *cq; + struct ibv_tm_cap tm_cap; +}; + +enum ibv_wq_type { + IBV_WQT_RQ +}; + +enum ibv_wq_init_attr_mask { + IBV_WQ_INIT_ATTR_FLAGS = 1 << 0, + IBV_WQ_INIT_ATTR_RESERVED = 1 << 1, +}; + +enum ibv_wq_flags { + IBV_WQ_FLAGS_CVLAN_STRIPPING = 1 << 0, + IBV_WQ_FLAGS_SCATTER_FCS = 1 << 1, + IBV_WQ_FLAGS_DELAY_DROP = 1 << 2, + IBV_WQ_FLAGS_PCI_WRITE_END_PADDING = 1 << 3, + IBV_WQ_FLAGS_RESERVED = 1 << 4, +}; + +struct ibv_wq_init_attr { + void *wq_context; + enum ibv_wq_type wq_type; + uint32_t max_wr; + uint32_t max_sge; + struct ibv_pd *pd; + struct ibv_cq *cq; + uint32_t comp_mask; /* Use ibv_wq_init_attr_mask */ + uint32_t create_flags; /* use ibv_wq_flags */ +}; + +enum ibv_wq_state { + IBV_WQS_RESET, + IBV_WQS_RDY, + IBV_WQS_ERR, + IBV_WQS_UNKNOWN +}; + +enum ibv_wq_attr_mask { + 
IBV_WQ_ATTR_STATE = 1 << 0, + IBV_WQ_ATTR_CURR_STATE = 1 << 1, + IBV_WQ_ATTR_FLAGS = 1 << 2, + IBV_WQ_ATTR_RESERVED = 1 << 3, +}; + +struct ibv_wq_attr { + /* enum ibv_wq_attr_mask */ + uint32_t attr_mask; + /* Move the WQ to this state */ + enum ibv_wq_state wq_state; + /* Assume this is the current WQ state */ + enum ibv_wq_state curr_wq_state; + uint32_t flags; /* Use ibv_wq_flags */ + uint32_t flags_mask; /* Use ibv_wq_flags */ +}; + +/* + * Receive Work Queue Indirection Table. + * It's used in order to distribute incoming packets between different + * Receive Work Queues. Associating Receive WQs with different CPU cores + * allows one to workload the traffic between different CPU cores. + * The Indirection Table can contain only WQs of type IBV_WQT_RQ. +*/ +struct ibv_rwq_ind_table { + struct ibv_context *context; + int ind_tbl_handle; + int ind_tbl_num; + uint32_t comp_mask; +}; + +enum ibv_ind_table_init_attr_mask { + IBV_CREATE_IND_TABLE_RESERVED = (1 << 0) +}; + +/* + * Receive Work Queue Indirection Table attributes + */ +struct ibv_rwq_ind_table_init_attr { + uint32_t log_ind_tbl_size; + /* Each entry is a pointer to a Receive Work Queue */ + struct ibv_wq **ind_tbl; + uint32_t comp_mask; +}; + +enum ibv_qp_type { + IBV_QPT_RC = 2, + IBV_QPT_UC, + IBV_QPT_UD, + IBV_QPT_RAW_PACKET = 8, + IBV_QPT_XRC_SEND = 9, + IBV_QPT_XRC_RECV, + IBV_QPT_DRIVER = 0xff, +}; + +struct ibv_qp_cap { + uint32_t max_send_wr; + uint32_t max_recv_wr; + uint32_t max_send_sge; + uint32_t max_recv_sge; + uint32_t max_inline_data; +}; + +struct ibv_qp_init_attr { + void *qp_context; + struct ibv_cq *send_cq; + struct ibv_cq *recv_cq; + struct ibv_srq *srq; + struct ibv_qp_cap cap; + enum ibv_qp_type qp_type; + int sq_sig_all; +}; + +enum ibv_qp_init_attr_mask { + IBV_QP_INIT_ATTR_PD = 1 << 0, + IBV_QP_INIT_ATTR_XRCD = 1 << 1, + IBV_QP_INIT_ATTR_CREATE_FLAGS = 1 << 2, + IBV_QP_INIT_ATTR_MAX_TSO_HEADER = 1 << 3, + IBV_QP_INIT_ATTR_IND_TABLE = 1 << 4, + IBV_QP_INIT_ATTR_RX_HASH = 1 << 5, + IBV_QP_INIT_ATTR_SEND_OPS_FLAGS = 1 << 6, +}; + +enum ibv_qp_create_flags { + IBV_QP_CREATE_BLOCK_SELF_MCAST_LB = 1 << 1, + IBV_QP_CREATE_SCATTER_FCS = 1 << 8, + IBV_QP_CREATE_CVLAN_STRIPPING = 1 << 9, + IBV_QP_CREATE_SOURCE_QPN = 1 << 10, + IBV_QP_CREATE_PCI_WRITE_END_PADDING = 1 << 11, +}; + +enum ibv_qp_create_send_ops_flags { + IBV_QP_EX_WITH_RDMA_WRITE = 1 << 0, + IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM = 1 << 1, + IBV_QP_EX_WITH_SEND = 1 << 2, + IBV_QP_EX_WITH_SEND_WITH_IMM = 1 << 3, + IBV_QP_EX_WITH_RDMA_READ = 1 << 4, + IBV_QP_EX_WITH_ATOMIC_CMP_AND_SWP = 1 << 5, + IBV_QP_EX_WITH_ATOMIC_FETCH_AND_ADD = 1 << 6, + IBV_QP_EX_WITH_LOCAL_INV = 1 << 7, + IBV_QP_EX_WITH_BIND_MW = 1 << 8, + IBV_QP_EX_WITH_SEND_WITH_INV = 1 << 9, + IBV_QP_EX_WITH_TSO = 1 << 10, +}; + +struct ibv_rx_hash_conf { + /* enum ibv_rx_hash_function_flags */ + uint8_t rx_hash_function; + uint8_t rx_hash_key_len; + uint8_t *rx_hash_key; + /* enum ibv_rx_hash_fields */ + uint64_t rx_hash_fields_mask; +}; + +struct ibv_qp_init_attr_ex { + void *qp_context; + struct ibv_cq *send_cq; + struct ibv_cq *recv_cq; + struct ibv_srq *srq; + struct ibv_qp_cap cap; + enum ibv_qp_type qp_type; + int sq_sig_all; + + uint32_t comp_mask; + struct ibv_pd *pd; + struct ibv_xrcd *xrcd; + uint32_t create_flags; + uint16_t max_tso_header; + struct ibv_rwq_ind_table *rwq_ind_tbl; + struct ibv_rx_hash_conf rx_hash_conf; + uint32_t source_qpn; + /* See enum ibv_qp_create_send_ops_flags */ + uint64_t send_ops_flags; +}; + +enum ibv_qp_open_attr_mask { + IBV_QP_OPEN_ATTR_NUM = 1 << 
0, + IBV_QP_OPEN_ATTR_XRCD = 1 << 1, + IBV_QP_OPEN_ATTR_CONTEXT = 1 << 2, + IBV_QP_OPEN_ATTR_TYPE = 1 << 3, + IBV_QP_OPEN_ATTR_RESERVED = 1 << 4 +}; + +struct ibv_qp_open_attr { + uint32_t comp_mask; + uint32_t qp_num; + struct ibv_xrcd *xrcd; + void *qp_context; + enum ibv_qp_type qp_type; +}; + +enum ibv_qp_attr_mask { + IBV_QP_STATE = 1 << 0, + IBV_QP_CUR_STATE = 1 << 1, + IBV_QP_EN_SQD_ASYNC_NOTIFY = 1 << 2, + IBV_QP_ACCESS_FLAGS = 1 << 3, + IBV_QP_PKEY_INDEX = 1 << 4, + IBV_QP_PORT = 1 << 5, + IBV_QP_QKEY = 1 << 6, + IBV_QP_AV = 1 << 7, + IBV_QP_PATH_MTU = 1 << 8, + IBV_QP_TIMEOUT = 1 << 9, + IBV_QP_RETRY_CNT = 1 << 10, + IBV_QP_RNR_RETRY = 1 << 11, + IBV_QP_RQ_PSN = 1 << 12, + IBV_QP_MAX_QP_RD_ATOMIC = 1 << 13, + IBV_QP_ALT_PATH = 1 << 14, + IBV_QP_MIN_RNR_TIMER = 1 << 15, + IBV_QP_SQ_PSN = 1 << 16, + IBV_QP_MAX_DEST_RD_ATOMIC = 1 << 17, + IBV_QP_PATH_MIG_STATE = 1 << 18, + IBV_QP_CAP = 1 << 19, + IBV_QP_DEST_QPN = 1 << 20, + /* These bits were supported on older kernels, but never exposed from + libibverbs: + _IBV_QP_SMAC = 1 << 21, + _IBV_QP_ALT_SMAC = 1 << 22, + _IBV_QP_VID = 1 << 23, + _IBV_QP_ALT_VID = 1 << 24, + */ + IBV_QP_RATE_LIMIT = 1 << 25, +}; + +enum ibv_qp_state { + IBV_QPS_RESET, + IBV_QPS_INIT, + IBV_QPS_RTR, + IBV_QPS_RTS, + IBV_QPS_SQD, + IBV_QPS_SQE, + IBV_QPS_ERR, + IBV_QPS_UNKNOWN +}; + +enum ibv_mig_state { + IBV_MIG_MIGRATED, + IBV_MIG_REARM, + IBV_MIG_ARMED +}; + +struct ibv_qp_attr { + enum ibv_qp_state qp_state; + enum ibv_qp_state cur_qp_state; + enum ibv_mtu path_mtu; + enum ibv_mig_state path_mig_state; + uint32_t qkey; + uint32_t rq_psn; + uint32_t sq_psn; + uint32_t dest_qp_num; + unsigned int qp_access_flags; + struct ibv_qp_cap cap; + struct ibv_ah_attr ah_attr; + struct ibv_ah_attr alt_ah_attr; + uint16_t pkey_index; + uint16_t alt_pkey_index; + uint8_t en_sqd_async_notify; + uint8_t sq_draining; + uint8_t max_rd_atomic; + uint8_t max_dest_rd_atomic; + uint8_t min_rnr_timer; + uint8_t port_num; + uint8_t timeout; + uint8_t retry_cnt; + uint8_t rnr_retry; + uint8_t alt_port_num; + uint8_t alt_timeout; + uint32_t rate_limit; +}; + +struct ibv_qp_rate_limit_attr { + uint32_t rate_limit; /* in kbps */ + uint32_t max_burst_sz; /* total burst size in bytes */ + uint16_t typical_pkt_sz; /* typical send packet size in bytes */ + uint32_t comp_mask; +}; + +enum ibv_wr_opcode { + IBV_WR_RDMA_WRITE, + IBV_WR_RDMA_WRITE_WITH_IMM, + IBV_WR_SEND, + IBV_WR_SEND_WITH_IMM, + IBV_WR_RDMA_READ, + IBV_WR_ATOMIC_CMP_AND_SWP, + IBV_WR_ATOMIC_FETCH_AND_ADD, + IBV_WR_LOCAL_INV, + IBV_WR_BIND_MW, + IBV_WR_SEND_WITH_INV, + IBV_WR_TSO, + IBV_WR_DRIVER1, +}; + +enum ibv_send_flags { + IBV_SEND_FENCE = 1 << 0, + IBV_SEND_SIGNALED = 1 << 1, + IBV_SEND_SOLICITED = 1 << 2, + IBV_SEND_INLINE = 1 << 3, + IBV_SEND_IP_CSUM = 1 << 4 +}; + +struct ibv_data_buf { + void *addr; + size_t length; +}; + +struct ibv_sge { + uint64_t addr; + uint32_t length; + uint32_t lkey; +}; + +struct ibv_send_wr { + uint64_t wr_id; + struct ibv_send_wr *next; + struct ibv_sge *sg_list; + int num_sge; + enum ibv_wr_opcode opcode; + unsigned int send_flags; + /* When opcode is *_WITH_IMM: Immediate data in network byte order. 
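+	 * (For example, a sender that stores htobe32(0x1234) here is seen
+	 * by the receiver as be32toh(wc.imm_data) == 0x1234.)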
+ * When opcode is *_INV: Stores the rkey to invalidate + */ + union { + __be32 imm_data; + uint32_t invalidate_rkey; + }; + union { + struct { + uint64_t remote_addr; + uint32_t rkey; + } rdma; + struct { + uint64_t remote_addr; + uint64_t compare_add; + uint64_t swap; + uint32_t rkey; + } atomic; + struct { + struct ibv_ah *ah; + uint32_t remote_qpn; + uint32_t remote_qkey; + } ud; + } wr; + union { + struct { + uint32_t remote_srqn; + } xrc; + } qp_type; + union { + struct { + struct ibv_mw *mw; + uint32_t rkey; + struct ibv_mw_bind_info bind_info; + } bind_mw; + struct { + void *hdr; + uint16_t hdr_sz; + uint16_t mss; + } tso; + }; +}; + +struct ibv_recv_wr { + uint64_t wr_id; + struct ibv_recv_wr *next; + struct ibv_sge *sg_list; + int num_sge; +}; + +enum ibv_ops_wr_opcode { + IBV_WR_TAG_ADD, + IBV_WR_TAG_DEL, + IBV_WR_TAG_SYNC, +}; + +enum ibv_ops_flags { + IBV_OPS_SIGNALED = 1 << 0, + IBV_OPS_TM_SYNC = 1 << 1, +}; + +struct ibv_ops_wr { + uint64_t wr_id; + struct ibv_ops_wr *next; + enum ibv_ops_wr_opcode opcode; + int flags; + struct { + uint32_t unexpected_cnt; + uint32_t handle; + struct { + uint64_t recv_wr_id; + struct ibv_sge *sg_list; + int num_sge; + uint64_t tag; + uint64_t mask; + } add; + } tm; +}; + +struct ibv_mw_bind { + uint64_t wr_id; + unsigned int send_flags; + struct ibv_mw_bind_info bind_info; +}; + +struct ibv_srq { + struct ibv_context *context; + void *srq_context; + struct ibv_pd *pd; + uint32_t handle; + + pthread_mutex_t mutex; + pthread_cond_t cond; + uint32_t events_completed; +}; + +/* + * Work Queue. QP can be created without internal WQs "packaged" inside it, + * this QP can be configured to use "external" WQ object as its + * receive/send queue. + * WQ associated (many to one) with Completion Queue it owns WQ properties + * (PD, WQ size etc). + * WQ of type IBV_WQT_RQ: + * - Contains receive WQEs, in this case its PD serves as scatter as well. + * - Exposes post receive function to be used to post a list of work + * requests (WRs) to its receive queue. 
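+ *
+ * A typical RSS pipeline built from these objects (illustrative outline,
+ * error handling omitted):
+ *   1. ibv_create_wq()            - one IBV_WQT_RQ WQ per core
+ *   2. ibv_modify_wq()            - move each WQ to IBV_WQS_RDY
+ *   3. ibv_create_rwq_ind_table() - group the WQs into a table
+ *   4. ibv_create_qp_ex()         - create an RX-hash QP with
+ *      IBV_QP_INIT_ATTR_IND_TABLE | IBV_QP_INIT_ATTR_RX_HASH set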
+ */ +struct ibv_wq { + struct ibv_context *context; + void *wq_context; + struct ibv_pd *pd; + struct ibv_cq *cq; + uint32_t wq_num; + uint32_t handle; + enum ibv_wq_state state; + enum ibv_wq_type wq_type; + int (*post_recv)(struct ibv_wq *current, + struct ibv_recv_wr *recv_wr, + struct ibv_recv_wr **bad_recv_wr); + pthread_mutex_t mutex; + pthread_cond_t cond; + uint32_t events_completed; + uint32_t comp_mask; +}; + +struct ibv_qp { + struct ibv_context *context; + void *qp_context; + struct ibv_pd *pd; + struct ibv_cq *send_cq; + struct ibv_cq *recv_cq; + struct ibv_srq *srq; + uint32_t handle; + uint32_t qp_num; + enum ibv_qp_state state; + enum ibv_qp_type qp_type; + + pthread_mutex_t mutex; + pthread_cond_t cond; + uint32_t events_completed; +}; + +struct ibv_qp_ex { + struct ibv_qp qp_base; + uint64_t comp_mask; + + uint64_t wr_id; + /* bitmask from enum ibv_send_flags */ + unsigned int wr_flags; + + void (*wr_atomic_cmp_swp)(struct ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr, uint64_t compare, + uint64_t swap); + void (*wr_atomic_fetch_add)(struct ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr, uint64_t add); + void (*wr_bind_mw)(struct ibv_qp_ex *qp, struct ibv_mw *mw, + uint32_t rkey, + const struct ibv_mw_bind_info *bind_info); + void (*wr_local_inv)(struct ibv_qp_ex *qp, uint32_t invalidate_rkey); + void (*wr_rdma_read)(struct ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr); + void (*wr_rdma_write)(struct ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr); + void (*wr_rdma_write_imm)(struct ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr, __be32 imm_data); + + void (*wr_send)(struct ibv_qp_ex *qp); + void (*wr_send_imm)(struct ibv_qp_ex *qp, __be32 imm_data); + void (*wr_send_inv)(struct ibv_qp_ex *qp, uint32_t invalidate_rkey); + void (*wr_send_tso)(struct ibv_qp_ex *qp, void *hdr, uint16_t hdr_sz, + uint16_t mss); + + void (*wr_set_ud_addr)(struct ibv_qp_ex *qp, struct ibv_ah *ah, + uint32_t remote_qpn, uint32_t remote_qkey); + void (*wr_set_xrc_srqn)(struct ibv_qp_ex *qp, uint32_t remote_srqn); + + void (*wr_set_inline_data)(struct ibv_qp_ex *qp, void *addr, + size_t length); + void (*wr_set_inline_data_list)(struct ibv_qp_ex *qp, size_t num_buf, + const struct ibv_data_buf *buf_list); + void (*wr_set_sge)(struct ibv_qp_ex *qp, uint32_t lkey, uint64_t addr, + uint32_t length); + void (*wr_set_sge_list)(struct ibv_qp_ex *qp, size_t num_sge, + const struct ibv_sge *sg_list); + + void (*wr_start)(struct ibv_qp_ex *qp); + int (*wr_complete)(struct ibv_qp_ex *qp); + void (*wr_abort)(struct ibv_qp_ex *qp); +}; + +struct ibv_qp_ex *ibv_qp_to_qp_ex(struct ibv_qp *qp); + +static inline void ibv_wr_atomic_cmp_swp(struct ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr, uint64_t compare, + uint64_t swap) +{ + qp->wr_atomic_cmp_swp(qp, rkey, remote_addr, compare, swap); +} + +static inline void ibv_wr_atomic_fetch_add(struct ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr, uint64_t add) +{ + qp->wr_atomic_fetch_add(qp, rkey, remote_addr, add); +} + +static inline void ibv_wr_bind_mw(struct ibv_qp_ex *qp, struct ibv_mw *mw, + uint32_t rkey, + const struct ibv_mw_bind_info *bind_info) +{ + qp->wr_bind_mw(qp, mw, rkey, bind_info); +} + +static inline void ibv_wr_local_inv(struct ibv_qp_ex *qp, + uint32_t invalidate_rkey) +{ + qp->wr_local_inv(qp, invalidate_rkey); +} + +static inline void ibv_wr_rdma_read(struct ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr) +{ + qp->wr_rdma_read(qp, rkey, remote_addr); +} + +static inline void 
ibv_wr_rdma_write(struct ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr) +{ + qp->wr_rdma_write(qp, rkey, remote_addr); +} + +static inline void ibv_wr_rdma_write_imm(struct ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr, __be32 imm_data) +{ + qp->wr_rdma_write_imm(qp, rkey, remote_addr, imm_data); +} + +static inline void ibv_wr_send(struct ibv_qp_ex *qp) +{ + qp->wr_send(qp); +} + +static inline void ibv_wr_send_imm(struct ibv_qp_ex *qp, __be32 imm_data) +{ + qp->wr_send_imm(qp, imm_data); +} + +static inline void ibv_wr_send_inv(struct ibv_qp_ex *qp, + uint32_t invalidate_rkey) +{ + qp->wr_send_inv(qp, invalidate_rkey); +} + +static inline void ibv_wr_send_tso(struct ibv_qp_ex *qp, void *hdr, + uint16_t hdr_sz, uint16_t mss) +{ + qp->wr_send_tso(qp, hdr, hdr_sz, mss); +} + +static inline void ibv_wr_set_ud_addr(struct ibv_qp_ex *qp, struct ibv_ah *ah, + uint32_t remote_qpn, uint32_t remote_qkey) +{ + qp->wr_set_ud_addr(qp, ah, remote_qpn, remote_qkey); +} + +static inline void ibv_wr_set_xrc_srqn(struct ibv_qp_ex *qp, + uint32_t remote_srqn) +{ + qp->wr_set_xrc_srqn(qp, remote_srqn); +} + +static inline void ibv_wr_set_inline_data(struct ibv_qp_ex *qp, void *addr, + size_t length) +{ + qp->wr_set_inline_data(qp, addr, length); +} + +static inline void ibv_wr_set_inline_data_list(struct ibv_qp_ex *qp, + size_t num_buf, + const struct ibv_data_buf *buf_list) +{ + qp->wr_set_inline_data_list(qp, num_buf, buf_list); +} + +static inline void ibv_wr_set_sge(struct ibv_qp_ex *qp, uint32_t lkey, + uint64_t addr, uint32_t length) +{ + qp->wr_set_sge(qp, lkey, addr, length); +} + +static inline void ibv_wr_set_sge_list(struct ibv_qp_ex *qp, size_t num_sge, + const struct ibv_sge *sg_list) +{ + qp->wr_set_sge_list(qp, num_sge, sg_list); +} + +static inline void ibv_wr_start(struct ibv_qp_ex *qp) +{ + qp->wr_start(qp); +} + +static inline int ibv_wr_complete(struct ibv_qp_ex *qp) +{ + return qp->wr_complete(qp); +} + +static inline void ibv_wr_abort(struct ibv_qp_ex *qp) +{ + qp->wr_abort(qp); +} + +struct ibv_comp_channel { + struct ibv_context *context; + int fd; + int refcnt; +}; + +struct ibv_cq { + struct ibv_context *context; + struct ibv_comp_channel *channel; + void *cq_context; + uint32_t handle; + int cqe; + + pthread_mutex_t mutex; + pthread_cond_t cond; + uint32_t comp_events_completed; + uint32_t async_events_completed; +}; + +struct ibv_poll_cq_attr { + uint32_t comp_mask; +}; + +struct ibv_wc_tm_info { + uint64_t tag; /* tag from TMH */ + uint32_t priv; /* opaque user data from TMH */ +}; + +struct ibv_cq_ex { + struct ibv_context *context; + struct ibv_comp_channel *channel; + void *cq_context; + uint32_t handle; + int cqe; + + pthread_mutex_t mutex; + pthread_cond_t cond; + uint32_t comp_events_completed; + uint32_t async_events_completed; + + uint32_t comp_mask; + enum ibv_wc_status status; + uint64_t wr_id; + int (*start_poll)(struct ibv_cq_ex *current, + struct ibv_poll_cq_attr *attr); + int (*next_poll)(struct ibv_cq_ex *current); + void (*end_poll)(struct ibv_cq_ex *current); + enum ibv_wc_opcode (*read_opcode)(struct ibv_cq_ex *current); + uint32_t (*read_vendor_err)(struct ibv_cq_ex *current); + uint32_t (*read_byte_len)(struct ibv_cq_ex *current); + __be32 (*read_imm_data)(struct ibv_cq_ex *current); + uint32_t (*read_qp_num)(struct ibv_cq_ex *current); + uint32_t (*read_src_qp)(struct ibv_cq_ex *current); + unsigned int (*read_wc_flags)(struct ibv_cq_ex *current); + uint32_t (*read_slid)(struct ibv_cq_ex *current); + uint8_t (*read_sl)(struct ibv_cq_ex 
*current); + uint8_t (*read_dlid_path_bits)(struct ibv_cq_ex *current); + uint64_t (*read_completion_ts)(struct ibv_cq_ex *current); + uint16_t (*read_cvlan)(struct ibv_cq_ex *current); + uint32_t (*read_flow_tag)(struct ibv_cq_ex *current); + void (*read_tm_info)(struct ibv_cq_ex *current, + struct ibv_wc_tm_info *tm_info); + uint64_t (*read_completion_wallclock_ns)(struct ibv_cq_ex *current); +}; + +static inline struct ibv_cq *ibv_cq_ex_to_cq(struct ibv_cq_ex *cq) +{ + return (struct ibv_cq *)cq; +} + +enum ibv_cq_attr_mask { + IBV_CQ_ATTR_MODERATE = 1 << 0, + IBV_CQ_ATTR_RESERVED = 1 << 1, +}; + +struct ibv_moderate_cq { + uint16_t cq_count; + uint16_t cq_period; /* in micro seconds */ +}; + +struct ibv_modify_cq_attr { + uint32_t attr_mask; + struct ibv_moderate_cq moderate; +}; + +static inline int ibv_start_poll(struct ibv_cq_ex *cq, + struct ibv_poll_cq_attr *attr) +{ + return cq->start_poll(cq, attr); +} + +static inline int ibv_next_poll(struct ibv_cq_ex *cq) +{ + return cq->next_poll(cq); +} + +static inline void ibv_end_poll(struct ibv_cq_ex *cq) +{ + cq->end_poll(cq); +} + +static inline enum ibv_wc_opcode ibv_wc_read_opcode(struct ibv_cq_ex *cq) +{ + return cq->read_opcode(cq); +} + +static inline uint32_t ibv_wc_read_vendor_err(struct ibv_cq_ex *cq) +{ + return cq->read_vendor_err(cq); +} + +static inline uint32_t ibv_wc_read_byte_len(struct ibv_cq_ex *cq) +{ + return cq->read_byte_len(cq); +} + +static inline __be32 ibv_wc_read_imm_data(struct ibv_cq_ex *cq) +{ + return cq->read_imm_data(cq); +} + +static inline uint32_t ibv_wc_read_invalidated_rkey(struct ibv_cq_ex *cq) +{ +#ifdef __CHECKER__ + return (__attribute__((force)) uint32_t)cq->read_imm_data(cq); +#else + return cq->read_imm_data(cq); +#endif +} + +static inline uint32_t ibv_wc_read_qp_num(struct ibv_cq_ex *cq) +{ + return cq->read_qp_num(cq); +} + +static inline uint32_t ibv_wc_read_src_qp(struct ibv_cq_ex *cq) +{ + return cq->read_src_qp(cq); +} + +static inline unsigned int ibv_wc_read_wc_flags(struct ibv_cq_ex *cq) +{ + return cq->read_wc_flags(cq); +} + +static inline uint32_t ibv_wc_read_slid(struct ibv_cq_ex *cq) +{ + return cq->read_slid(cq); +} + +static inline uint8_t ibv_wc_read_sl(struct ibv_cq_ex *cq) +{ + return cq->read_sl(cq); +} + +static inline uint8_t ibv_wc_read_dlid_path_bits(struct ibv_cq_ex *cq) +{ + return cq->read_dlid_path_bits(cq); +} + +static inline uint64_t ibv_wc_read_completion_ts(struct ibv_cq_ex *cq) +{ + return cq->read_completion_ts(cq); +} + +static inline uint64_t ibv_wc_read_completion_wallclock_ns(struct ibv_cq_ex *cq) +{ + return cq->read_completion_wallclock_ns(cq); +} + +static inline uint16_t ibv_wc_read_cvlan(struct ibv_cq_ex *cq) +{ + return cq->read_cvlan(cq); +} + +static inline uint32_t ibv_wc_read_flow_tag(struct ibv_cq_ex *cq) +{ + return cq->read_flow_tag(cq); +} + +static inline void ibv_wc_read_tm_info(struct ibv_cq_ex *cq, + struct ibv_wc_tm_info *tm_info) +{ + cq->read_tm_info(cq, tm_info); +} + +static inline int ibv_post_wq_recv(struct ibv_wq *wq, + struct ibv_recv_wr *recv_wr, + struct ibv_recv_wr **bad_recv_wr) +{ + return wq->post_recv(wq, recv_wr, bad_recv_wr); +} + +struct ibv_ah { + struct ibv_context *context; + struct ibv_pd *pd; + uint32_t handle; +}; + +enum ibv_flow_flags { + /* First bit is deprecated and can't be used */ + IBV_FLOW_ATTR_FLAGS_DONT_TRAP = 1 << 1, + IBV_FLOW_ATTR_FLAGS_EGRESS = 1 << 2, +}; + +enum ibv_flow_attr_type { + /* steering according to rule specifications */ + IBV_FLOW_ATTR_NORMAL = 0x0, + /* default unicast and 
multicast rule - + * receive all Eth traffic which isn't steered to any QP + */ + IBV_FLOW_ATTR_ALL_DEFAULT = 0x1, + /* default multicast rule - + * receive all Eth multicast traffic which isn't steered to any QP + */ + IBV_FLOW_ATTR_MC_DEFAULT = 0x2, + /* sniffer rule - receive all port traffic */ + IBV_FLOW_ATTR_SNIFFER = 0x3, +}; + +enum ibv_flow_spec_type { + IBV_FLOW_SPEC_ETH = 0x20, + IBV_FLOW_SPEC_IPV4 = 0x30, + IBV_FLOW_SPEC_IPV6 = 0x31, + IBV_FLOW_SPEC_IPV4_EXT = 0x32, + IBV_FLOW_SPEC_ESP = 0x34, + IBV_FLOW_SPEC_TCP = 0x40, + IBV_FLOW_SPEC_UDP = 0x41, + IBV_FLOW_SPEC_VXLAN_TUNNEL = 0x50, + IBV_FLOW_SPEC_GRE = 0x51, + IBV_FLOW_SPEC_MPLS = 0x60, + IBV_FLOW_SPEC_INNER = 0x100, + IBV_FLOW_SPEC_ACTION_TAG = 0x1000, + IBV_FLOW_SPEC_ACTION_DROP = 0x1001, + IBV_FLOW_SPEC_ACTION_HANDLE = 0x1002, + IBV_FLOW_SPEC_ACTION_COUNT = 0x1003, +}; + +struct ibv_flow_eth_filter { + uint8_t dst_mac[6]; + uint8_t src_mac[6]; + uint16_t ether_type; + /* + * same layout as 802.1q: prio 3, cfi 1, vlan id 12 + */ + uint16_t vlan_tag; +}; + +struct ibv_flow_spec_eth { + enum ibv_flow_spec_type type; + uint16_t size; + struct ibv_flow_eth_filter val; + struct ibv_flow_eth_filter mask; +}; + +struct ibv_flow_ipv4_filter { + uint32_t src_ip; + uint32_t dst_ip; +}; + +struct ibv_flow_spec_ipv4 { + enum ibv_flow_spec_type type; + uint16_t size; + struct ibv_flow_ipv4_filter val; + struct ibv_flow_ipv4_filter mask; +}; + +struct ibv_flow_ipv4_ext_filter { + uint32_t src_ip; + uint32_t dst_ip; + uint8_t proto; + uint8_t tos; + uint8_t ttl; + uint8_t flags; +}; + +struct ibv_flow_spec_ipv4_ext { + enum ibv_flow_spec_type type; + uint16_t size; + struct ibv_flow_ipv4_ext_filter val; + struct ibv_flow_ipv4_ext_filter mask; +}; + +struct ibv_flow_ipv6_filter { + uint8_t src_ip[16]; + uint8_t dst_ip[16]; + uint32_t flow_label; + uint8_t next_hdr; + uint8_t traffic_class; + uint8_t hop_limit; +}; + +struct ibv_flow_spec_ipv6 { + enum ibv_flow_spec_type type; + uint16_t size; + struct ibv_flow_ipv6_filter val; + struct ibv_flow_ipv6_filter mask; +}; + +struct ibv_flow_esp_filter { + uint32_t spi; + uint32_t seq; +}; + +struct ibv_flow_spec_esp { + enum ibv_flow_spec_type type; + uint16_t size; + struct ibv_flow_esp_filter val; + struct ibv_flow_esp_filter mask; +}; + +struct ibv_flow_tcp_udp_filter { + uint16_t dst_port; + uint16_t src_port; +}; + +struct ibv_flow_spec_tcp_udp { + enum ibv_flow_spec_type type; + uint16_t size; + struct ibv_flow_tcp_udp_filter val; + struct ibv_flow_tcp_udp_filter mask; +}; + +struct ibv_flow_gre_filter { + /* c_ks_res0_ver field is bits 0-15 in offset 0 of a standard GRE header: + * bit 0 - checksum present bit. + * bit 1 - reserved. set to 0. + * bit 2 - key present bit. + * bit 3 - sequence number present bit. + * bits 4:12 - reserved. set to 0. + * bits 13:15 - GRE version. + */ + uint16_t c_ks_res0_ver; + uint16_t protocol; + uint32_t key; +}; + +struct ibv_flow_spec_gre { + enum ibv_flow_spec_type type; + uint16_t size; + struct ibv_flow_gre_filter val; + struct ibv_flow_gre_filter mask; +}; + +struct ibv_flow_mpls_filter { + /* The field includes the entire MPLS label: + * bits 0:19 - label value field. + * bits 20:22 - traffic class field. + * bits 23 - bottom of stack bit. + * bits 24:31 - ttl field. 
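+	 * For example, per this layout, label value 100 with the bottom
+	 * of stack bit set and TTL 64 is (64u << 24) | (1u << 23) | 100.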
+ */ + uint32_t label; +}; + +struct ibv_flow_spec_mpls { + enum ibv_flow_spec_type type; + uint16_t size; + struct ibv_flow_mpls_filter val; + struct ibv_flow_mpls_filter mask; +}; + +struct ibv_flow_tunnel_filter { + uint32_t tunnel_id; +}; + +struct ibv_flow_spec_tunnel { + enum ibv_flow_spec_type type; + uint16_t size; + struct ibv_flow_tunnel_filter val; + struct ibv_flow_tunnel_filter mask; +}; + +struct ibv_flow_spec_action_tag { + enum ibv_flow_spec_type type; + uint16_t size; + uint32_t tag_id; +}; + +struct ibv_flow_spec_action_drop { + enum ibv_flow_spec_type type; + uint16_t size; +}; + +struct ibv_flow_spec_action_handle { + enum ibv_flow_spec_type type; + uint16_t size; + const struct ibv_flow_action *action; +}; + +struct ibv_flow_spec_counter_action { + enum ibv_flow_spec_type type; + uint16_t size; + struct ibv_counters *counters; +}; + +struct ibv_flow_spec { + union { + struct { + enum ibv_flow_spec_type type; + uint16_t size; + } hdr; + struct ibv_flow_spec_eth eth; + struct ibv_flow_spec_ipv4 ipv4; + struct ibv_flow_spec_tcp_udp tcp_udp; + struct ibv_flow_spec_ipv4_ext ipv4_ext; + struct ibv_flow_spec_ipv6 ipv6; + struct ibv_flow_spec_esp esp; + struct ibv_flow_spec_tunnel tunnel; + struct ibv_flow_spec_gre gre; + struct ibv_flow_spec_mpls mpls; + struct ibv_flow_spec_action_tag flow_tag; + struct ibv_flow_spec_action_drop drop; + struct ibv_flow_spec_action_handle handle; + struct ibv_flow_spec_counter_action flow_count; + }; +}; + +struct ibv_flow_attr { + uint32_t comp_mask; + enum ibv_flow_attr_type type; + uint16_t size; + uint16_t priority; + uint8_t num_of_specs; + uint8_t port; + uint32_t flags; + /* Following are the optional layers according to user request + * struct ibv_flow_spec_xxx [L2] + * struct ibv_flow_spec_yyy [L3/L4] + */ +}; + +struct ibv_flow { + uint32_t comp_mask; + struct ibv_context *context; + uint32_t handle; +}; + +struct ibv_flow_action { + struct ibv_context *context; +}; + +enum ibv_flow_action_esp_mask { + IBV_FLOW_ACTION_ESP_MASK_ESN = 1UL << 0, +}; + +struct ibv_flow_action_esp_attr { + struct ibv_flow_action_esp *esp_attr; + + enum ibv_flow_action_esp_keymat keymat_proto; + uint16_t keymat_len; + void *keymat_ptr; + + enum ibv_flow_action_esp_replay replay_proto; + uint16_t replay_len; + void *replay_ptr; + + struct ibv_flow_action_esp_encap *esp_encap; + + uint32_t comp_mask; /* Use enum ibv_flow_action_esp_mask */ + uint32_t esn; +}; + +struct ibv_device; +struct ibv_context; + +/* Obsolete, never used, do not touch */ +struct _ibv_device_ops { + struct ibv_context * (*_dummy1)(struct ibv_device *device, int cmd_fd); + void (*_dummy2)(struct ibv_context *context); +}; + +enum { + IBV_SYSFS_NAME_MAX = 64, + IBV_SYSFS_PATH_MAX = 256 +}; + +struct ibv_device { + struct _ibv_device_ops _ops; + enum ibv_node_type node_type; + enum ibv_transport_type transport_type; + /* Name of underlying kernel IB device, eg "mthca0" */ + char name[IBV_SYSFS_NAME_MAX]; + /* Name of uverbs device, eg "uverbs0" */ + char dev_name[IBV_SYSFS_NAME_MAX]; + /* Path to infiniband_verbs class device in sysfs */ + char dev_path[IBV_SYSFS_PATH_MAX]; + /* Path to infiniband class device in sysfs */ + char ibdev_path[IBV_SYSFS_PATH_MAX]; +}; + +struct _compat_ibv_port_attr; +struct ibv_context_ops { + void *(*_compat_query_device)(void); + int (*_compat_query_port)(struct ibv_context *context, + uint8_t port_num, + struct _compat_ibv_port_attr *port_attr); + void *(*_compat_alloc_pd)(void); + void *(*_compat_dealloc_pd)(void); + void *(*_compat_reg_mr)(void); + 
void *(*_compat_rereg_mr)(void); + void *(*_compat_dereg_mr)(void); + struct ibv_mw * (*alloc_mw)(struct ibv_pd *pd, enum ibv_mw_type type); + int (*bind_mw)(struct ibv_qp *qp, struct ibv_mw *mw, + struct ibv_mw_bind *mw_bind); + int (*dealloc_mw)(struct ibv_mw *mw); + void *(*_compat_create_cq)(void); + int (*poll_cq)(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc); + int (*req_notify_cq)(struct ibv_cq *cq, int solicited_only); + void *(*_compat_cq_event)(void); + void *(*_compat_resize_cq)(void); + void *(*_compat_destroy_cq)(void); + void *(*_compat_create_srq)(void); + void *(*_compat_modify_srq)(void); + void *(*_compat_query_srq)(void); + void *(*_compat_destroy_srq)(void); + int (*post_srq_recv)(struct ibv_srq *srq, + struct ibv_recv_wr *recv_wr, + struct ibv_recv_wr **bad_recv_wr); + void *(*_compat_create_qp)(void); + void *(*_compat_query_qp)(void); + void *(*_compat_modify_qp)(void); + void *(*_compat_destroy_qp)(void); + int (*post_send)(struct ibv_qp *qp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr); + int (*post_recv)(struct ibv_qp *qp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); + void *(*_compat_create_ah)(void); + void *(*_compat_destroy_ah)(void); + void *(*_compat_attach_mcast)(void); + void *(*_compat_detach_mcast)(void); + void *(*_compat_async_event)(void); +}; + +struct ibv_context { + struct ibv_device *device; + struct ibv_context_ops ops; + int cmd_fd; + int async_fd; + int num_comp_vectors; + pthread_mutex_t mutex; + void *abi_compat; +}; + +enum ibv_cq_init_attr_mask { + IBV_CQ_INIT_ATTR_MASK_FLAGS = 1 << 0, + IBV_CQ_INIT_ATTR_MASK_PD = 1 << 1, +}; + +enum ibv_create_cq_attr_flags { + IBV_CREATE_CQ_ATTR_SINGLE_THREADED = 1 << 0, + IBV_CREATE_CQ_ATTR_IGNORE_OVERRUN = 1 << 1, +}; + +struct ibv_cq_init_attr_ex { + /* Minimum number of entries required for CQ */ + uint32_t cqe; + /* Consumer-supplied context returned for completion events */ + void *cq_context; + /* Completion channel where completion events will be queued. + * May be NULL if completion events will not be used. + */ + struct ibv_comp_channel *channel; + /* Completion vector used to signal completion events. + * Must be < context->num_comp_vectors. + */ + uint32_t comp_vector; + /* Or'ed bit of enum ibv_create_cq_wc_flags. */ + uint64_t wc_flags; + /* compatibility mask (extended verb). 
Or'd flags of + * enum ibv_cq_init_attr_mask + */ + uint32_t comp_mask; + /* create cq attr flags - one or more flags from + * enum ibv_create_cq_attr_flags + */ + uint32_t flags; + struct ibv_pd *parent_domain; +}; + +enum ibv_parent_domain_init_attr_mask { + IBV_PARENT_DOMAIN_INIT_ATTR_ALLOCATORS = 1 << 0, + IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT = 1 << 1, +}; + +#define IBV_ALLOCATOR_USE_DEFAULT ((void *)-1) + +struct ibv_parent_domain_init_attr { + struct ibv_pd *pd; /* referance to a protection domain object, can't be NULL */ + struct ibv_td *td; /* referance to a thread domain object, or NULL */ + uint32_t comp_mask; + void *(*alloc)(struct ibv_pd *pd, void *pd_context, size_t size, + size_t alignment, uint64_t resource_type); + void (*free)(struct ibv_pd *pd, void *pd_context, void *ptr, + uint64_t resource_type); + void *pd_context; +}; + +struct ibv_counters_init_attr { + uint32_t comp_mask; +}; + +struct ibv_counters { + struct ibv_context *context; +}; + +enum ibv_counter_description { + IBV_COUNTER_PACKETS, + IBV_COUNTER_BYTES, +}; + +struct ibv_counter_attach_attr { + enum ibv_counter_description counter_desc; + uint32_t index; /* Desired location index of the counter at the counters object */ + uint32_t comp_mask; +}; + +enum ibv_read_counters_flags { + IBV_READ_COUNTERS_ATTR_PREFER_CACHED = 1 << 0, +}; + +enum ibv_values_mask { + IBV_VALUES_MASK_RAW_CLOCK = 1 << 0, + IBV_VALUES_MASK_RESERVED = 1 << 1 +}; + +struct ibv_values_ex { + uint32_t comp_mask; + struct timespec raw_clock; +}; + +struct verbs_context { + /* "grows up" - new fields go here */ + int (*query_port)(struct ibv_context *context, uint8_t port_num, + struct ibv_port_attr *port_attr, + size_t port_attr_len); + int (*advise_mr)(struct ibv_pd *pd, + enum ibv_advise_mr_advice advice, + uint32_t flags, + struct ibv_sge *sg_list, + uint32_t num_sges); + struct ibv_mr *(*alloc_null_mr)(struct ibv_pd *pd); + int (*read_counters)(struct ibv_counters *counters, + uint64_t *counters_value, + uint32_t ncounters, + uint32_t flags); + int (*attach_counters_point_flow)(struct ibv_counters *counters, + struct ibv_counter_attach_attr *attr, + struct ibv_flow *flow); + struct ibv_counters *(*create_counters)(struct ibv_context *context, + struct ibv_counters_init_attr *init_attr); + int (*destroy_counters)(struct ibv_counters *counters); + struct ibv_mr *(*reg_dm_mr)(struct ibv_pd *pd, struct ibv_dm *dm, + uint64_t dm_offset, size_t length, + unsigned int access); + struct ibv_dm *(*alloc_dm)(struct ibv_context *context, + struct ibv_alloc_dm_attr *attr); + int (*free_dm)(struct ibv_dm *dm); + int (*modify_flow_action_esp)(struct ibv_flow_action *action, + struct ibv_flow_action_esp_attr *attr); + int (*destroy_flow_action)(struct ibv_flow_action *action); + struct ibv_flow_action *(*create_flow_action_esp)(struct ibv_context *context, + struct ibv_flow_action_esp_attr *attr); + int (*modify_qp_rate_limit)(struct ibv_qp *qp, + struct ibv_qp_rate_limit_attr *attr); + struct ibv_pd *(*alloc_parent_domain)(struct ibv_context *context, + struct ibv_parent_domain_init_attr *attr); + int (*dealloc_td)(struct ibv_td *td); + struct ibv_td *(*alloc_td)(struct ibv_context *context, struct ibv_td_init_attr *init_attr); + int (*modify_cq)(struct ibv_cq *cq, struct ibv_modify_cq_attr *attr); + int (*post_srq_ops)(struct ibv_srq *srq, + struct ibv_ops_wr *op, + struct ibv_ops_wr **bad_op); + int (*destroy_rwq_ind_table)(struct ibv_rwq_ind_table *rwq_ind_table); + struct ibv_rwq_ind_table *(*create_rwq_ind_table)(struct ibv_context *context, 
+ struct ibv_rwq_ind_table_init_attr *init_attr); + int (*destroy_wq)(struct ibv_wq *wq); + int (*modify_wq)(struct ibv_wq *wq, struct ibv_wq_attr *wq_attr); + struct ibv_wq * (*create_wq)(struct ibv_context *context, + struct ibv_wq_init_attr *wq_init_attr); + int (*query_rt_values)(struct ibv_context *context, + struct ibv_values_ex *values); + struct ibv_cq_ex *(*create_cq_ex)(struct ibv_context *context, + struct ibv_cq_init_attr_ex *init_attr); + struct verbs_ex_private *priv; + int (*query_device_ex)(struct ibv_context *context, + const struct ibv_query_device_ex_input *input, + struct ibv_device_attr_ex *attr, + size_t attr_size); + int (*ibv_destroy_flow) (struct ibv_flow *flow); + void (*ABI_placeholder2) (void); /* DO NOT COPY THIS GARBAGE */ + struct ibv_flow * (*ibv_create_flow) (struct ibv_qp *qp, + struct ibv_flow_attr *flow_attr); + void (*ABI_placeholder1) (void); /* DO NOT COPY THIS GARBAGE */ + struct ibv_qp *(*open_qp)(struct ibv_context *context, + struct ibv_qp_open_attr *attr); + struct ibv_qp *(*create_qp_ex)(struct ibv_context *context, + struct ibv_qp_init_attr_ex *qp_init_attr_ex); + int (*get_srq_num)(struct ibv_srq *srq, uint32_t *srq_num); + struct ibv_srq * (*create_srq_ex)(struct ibv_context *context, + struct ibv_srq_init_attr_ex *srq_init_attr_ex); + struct ibv_xrcd * (*open_xrcd)(struct ibv_context *context, + struct ibv_xrcd_init_attr *xrcd_init_attr); + int (*close_xrcd)(struct ibv_xrcd *xrcd); + uint64_t _ABI_placeholder3; + size_t sz; /* Must be immediately before struct ibv_context */ + struct ibv_context context; /* Must be last field in the struct */ +}; + +static inline struct verbs_context *verbs_get_ctx(struct ibv_context *ctx) +{ + if (ctx->abi_compat != __VERBS_ABI_IS_EXTENDED) + return NULL; + + /* open code container_of to not pollute the global namespace */ + return (struct verbs_context *)(((uint8_t *)ctx) - + offsetof(struct verbs_context, + context)); +} + +#define verbs_get_ctx_op(ctx, op) ({ \ + struct verbs_context *__vctx = verbs_get_ctx(ctx); \ + (!__vctx || (__vctx->sz < sizeof(*__vctx) - offsetof(struct verbs_context, op)) || \ + !__vctx->op) ? NULL : __vctx; }) + +/** + * ibv_get_device_list - Get list of IB devices currently available + * @num_devices: optional. if non-NULL, set to the number of devices + * returned in the array. + * + * Return a NULL-terminated array of IB devices. The array can be + * released with ibv_free_device_list(). + */ +struct ibv_device **ibv_get_device_list(int *num_devices); + +/* + * When statically linking the user can set RDMA_STATIC_PROVIDERS to a comma + * separated list of provider names to include in the static link, and this + * machinery will cause those providers to be included statically. + * + * Linking will fail if this is set for dynamic linking. + */ +#ifdef RDMA_STATIC_PROVIDERS +#define _RDMA_STATIC_PREFIX_(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, \ + _12, _13, _14, _15, _16, _17, ...) 
\ + &verbs_provider_##_1, &verbs_provider_##_2, &verbs_provider_##_3, \ + &verbs_provider_##_4, &verbs_provider_##_5, \ + &verbs_provider_##_6, &verbs_provider_##_7, \ + &verbs_provider_##_8, &verbs_provider_##_9, \ + &verbs_provider_##_10, &verbs_provider_##_11, \ + &verbs_provider_##_12, &verbs_provider_##_13, \ + &verbs_provider_##_14, &verbs_provider_##_15, \ + &verbs_provider_##_16, &verbs_provider_##_17 +#define _RDMA_STATIC_PREFIX(arg) \ + _RDMA_STATIC_PREFIX_(arg, none, none, none, none, none, none, none, \ + none, none, none, none, none, none, none, none, \ + none) + +struct verbs_devices_ops; +extern const struct verbs_device_ops verbs_provider_bnxt_re; +extern const struct verbs_device_ops verbs_provider_cxgb4; +extern const struct verbs_device_ops verbs_provider_efa; +extern const struct verbs_device_ops verbs_provider_hfi1verbs; +extern const struct verbs_device_ops verbs_provider_hns; +extern const struct verbs_device_ops verbs_provider_i40iw; +extern const struct verbs_device_ops verbs_provider_ipathverbs; +extern const struct verbs_device_ops verbs_provider_mlx4; +extern const struct verbs_device_ops verbs_provider_mlx5; +extern const struct verbs_device_ops verbs_provider_mthca; +extern const struct verbs_device_ops verbs_provider_ocrdma; +extern const struct verbs_device_ops verbs_provider_qedr; +extern const struct verbs_device_ops verbs_provider_rxe; +extern const struct verbs_device_ops verbs_provider_siw; +extern const struct verbs_device_ops verbs_provider_vmw_pvrdma; +extern const struct verbs_device_ops verbs_provider_all; +extern const struct verbs_device_ops verbs_provider_none; +void ibv_static_providers(void *unused, ...); + +static inline struct ibv_device **__ibv_get_device_list(int *num_devices) +{ + ibv_static_providers(NULL, _RDMA_STATIC_PREFIX(RDMA_STATIC_PROVIDERS), + NULL); + return ibv_get_device_list(num_devices); +} +#define ibv_get_device_list(num_devices) __ibv_get_device_list(num_devices) +#endif + +/** + * ibv_free_device_list - Free list from ibv_get_device_list() + * + * Free an array of devices returned from ibv_get_device_list(). Once + * the array is freed, pointers to devices that were not opened with + * ibv_open_device() are no longer valid. Client code must open all + * devices it intends to use before calling ibv_free_device_list(). + */ +void ibv_free_device_list(struct ibv_device **list); + +/** + * ibv_get_device_name - Return kernel device name + */ +const char *ibv_get_device_name(struct ibv_device *device); + +/** + * ibv_get_device_guid - Return device's node GUID + */ +__be64 ibv_get_device_guid(struct ibv_device *device); + +/** + * ibv_open_device - Initialize device for use + */ +struct ibv_context *ibv_open_device(struct ibv_device *device); + +/** + * ibv_close_device - Release device + */ +int ibv_close_device(struct ibv_context *context); + +/** + * ibv_get_async_event - Get next async event + * @event: Pointer to use to return async event + * + * All async events returned by ibv_get_async_event() must eventually + * be acknowledged with ibv_ack_async_event(). + */ +int ibv_get_async_event(struct ibv_context *context, + struct ibv_async_event *event); + +/** + * ibv_ack_async_event - Acknowledge an async event + * @event: Event to be acknowledged. + * + * All async events which are returned by ibv_get_async_event() must + * be acknowledged. 
To avoid races, destroying an object (CQ, SRQ or + * QP) will wait for all affiliated events to be acknowledged, so + * there should be a one-to-one correspondence between acks and + * successful gets. + */ +void ibv_ack_async_event(struct ibv_async_event *event); + +/** + * ibv_query_device - Get device properties + */ +int ibv_query_device(struct ibv_context *context, + struct ibv_device_attr *device_attr); + +/** + * ibv_query_port - Get port properties + */ +int ibv_query_port(struct ibv_context *context, uint8_t port_num, + struct _compat_ibv_port_attr *port_attr); + +static inline int ___ibv_query_port(struct ibv_context *context, + uint8_t port_num, + struct ibv_port_attr *port_attr) +{ + struct verbs_context *vctx = verbs_get_ctx_op(context, query_port); + + if (!vctx) { + int rc; + + memset(port_attr, 0, sizeof(*port_attr)); + + rc = ibv_query_port(context, port_num, + (struct _compat_ibv_port_attr *)port_attr); + return rc; + } + + return vctx->query_port(context, port_num, port_attr, + sizeof(*port_attr)); +} + +#define ibv_query_port(context, port_num, port_attr) \ + ___ibv_query_port(context, port_num, port_attr) + +/** + * ibv_query_gid - Get a GID table entry + */ +int ibv_query_gid(struct ibv_context *context, uint8_t port_num, + int index, union ibv_gid *gid); + +/** + * ibv_query_pkey - Get a P_Key table entry + */ +int ibv_query_pkey(struct ibv_context *context, uint8_t port_num, + int index, __be16 *pkey); + +/** + * ibv_get_pkey_index - Translate a P_Key into a P_Key index + */ +int ibv_get_pkey_index(struct ibv_context *context, uint8_t port_num, + __be16 pkey); + +/** + * ibv_alloc_pd - Allocate a protection domain + */ +struct ibv_pd *ibv_alloc_pd(struct ibv_context *context); + +/** + * ibv_dealloc_pd - Free a protection domain + */ +int ibv_dealloc_pd(struct ibv_pd *pd); + +static inline struct ibv_flow *ibv_create_flow(struct ibv_qp *qp, + struct ibv_flow_attr *flow) +{ + struct verbs_context *vctx = verbs_get_ctx_op(qp->context, + ibv_create_flow); + if (!vctx) { + errno = EOPNOTSUPP; + return NULL; + } + + return vctx->ibv_create_flow(qp, flow); +} + +static inline int ibv_destroy_flow(struct ibv_flow *flow_id) +{ + struct verbs_context *vctx = verbs_get_ctx_op(flow_id->context, + ibv_destroy_flow); + if (!vctx) + return -EOPNOTSUPP; + return vctx->ibv_destroy_flow(flow_id); +} + +static inline struct ibv_flow_action * +ibv_create_flow_action_esp(struct ibv_context *ctx, + struct ibv_flow_action_esp_attr *esp) +{ + struct verbs_context *vctx = verbs_get_ctx_op(ctx, + create_flow_action_esp); + + if (!vctx) { + errno = EOPNOTSUPP; + return NULL; + } + + return vctx->create_flow_action_esp(ctx, esp); +} + +static inline int +ibv_modify_flow_action_esp(struct ibv_flow_action *action, + struct ibv_flow_action_esp_attr *esp) +{ + struct verbs_context *vctx = verbs_get_ctx_op(action->context, + modify_flow_action_esp); + + if (!vctx) + return EOPNOTSUPP; + + return vctx->modify_flow_action_esp(action, esp); +} + +static inline int ibv_destroy_flow_action(struct ibv_flow_action *action) +{ + struct verbs_context *vctx = verbs_get_ctx_op(action->context, + destroy_flow_action); + + if (!vctx) + return EOPNOTSUPP; + + return vctx->destroy_flow_action(action); +} + +/** + * ibv_open_xrcd - Open an extended connection domain + */ +static inline struct ibv_xrcd * +ibv_open_xrcd(struct ibv_context *context, struct ibv_xrcd_init_attr *xrcd_init_attr) +{ + struct verbs_context *vctx = verbs_get_ctx_op(context, open_xrcd); + if (!vctx) { + errno = EOPNOTSUPP; + return NULL; + 
} + return vctx->open_xrcd(context, xrcd_init_attr); +} + +/** + * ibv_close_xrcd - Close an extended connection domain + */ +static inline int ibv_close_xrcd(struct ibv_xrcd *xrcd) +{ + struct verbs_context *vctx = verbs_get_ctx(xrcd->context); + return vctx->close_xrcd(xrcd); +} + +/** + * ibv_reg_mr_iova2 - Register memory region with a virtual offset address + * + * This version will be called if ibv_reg_mr or ibv_reg_mr_iova were called + * with at least one potential access flag from the IBV_OPTIONAL_ACCESS_RANGE + * flags range The optional access flags will be masked if running over kernel + * that does not support passing them. + */ +struct ibv_mr *ibv_reg_mr_iova2(struct ibv_pd *pd, void *addr, size_t length, + uint64_t iova, unsigned int access); + +/** + * ibv_reg_mr - Register a memory region + */ +struct ibv_mr *ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + int access); +/* use new ibv_reg_mr version only if access flags that require it are used */ +__attribute__((__always_inline__)) static inline struct ibv_mr * +__ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, unsigned int access, + int is_access_const) +{ + if (is_access_const && (access & IBV_ACCESS_OPTIONAL_RANGE) == 0) + return ibv_reg_mr(pd, addr, length, access); + else + return ibv_reg_mr_iova2(pd, addr, length, (uintptr_t)addr, + access); +} + +#define ibv_reg_mr(pd, addr, length, access) \ + __ibv_reg_mr(pd, addr, length, access, \ + __builtin_constant_p( \ + ((access) & IBV_ACCESS_OPTIONAL_RANGE) == 0)) + +/** + * ibv_reg_mr_iova - Register a memory region with a virtual offset + * address + */ +struct ibv_mr *ibv_reg_mr_iova(struct ibv_pd *pd, void *addr, size_t length, + uint64_t iova, int access); +/* use new ibv_reg_mr version only if access flags that require it are used */ +__attribute__((__always_inline__)) static inline struct ibv_mr * +__ibv_reg_mr_iova(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, + unsigned int access, int is_access_const) +{ + if (is_access_const && (access & IBV_ACCESS_OPTIONAL_RANGE) == 0) + return ibv_reg_mr_iova(pd, addr, length, iova, access); + else + return ibv_reg_mr_iova2(pd, addr, length, iova, access); +} + +#define ibv_reg_mr_iova(pd, addr, length, iova, access) \ + __ibv_reg_mr_iova(pd, addr, length, iova, access, \ + __builtin_constant_p( \ + ((access) & IBV_ACCESS_OPTIONAL_RANGE) == 0)) + +enum ibv_rereg_mr_err_code { + /* Old MR is valid, invalid input */ + IBV_REREG_MR_ERR_INPUT = -1, + /* Old MR is valid, failed via don't fork on new address range */ + IBV_REREG_MR_ERR_DONT_FORK_NEW = -2, + /* New MR is valid, failed via do fork on old address range */ + IBV_REREG_MR_ERR_DO_FORK_OLD = -3, + /* MR shouldn't be used, command error */ + IBV_REREG_MR_ERR_CMD = -4, + /* MR shouldn't be used, command error, invalid fork state on new address range */ + IBV_REREG_MR_ERR_CMD_AND_DO_FORK_NEW = -5, +}; + +/** + * ibv_rereg_mr - Re-Register a memory region + */ +int ibv_rereg_mr(struct ibv_mr *mr, int flags, + struct ibv_pd *pd, void *addr, + size_t length, int access); +/** + * ibv_dereg_mr - Deregister a memory region + */ +int ibv_dereg_mr(struct ibv_mr *mr); + +/** + * ibv_alloc_mw - Allocate a memory window + */ +static inline struct ibv_mw *ibv_alloc_mw(struct ibv_pd *pd, + enum ibv_mw_type type) +{ + struct ibv_mw *mw; + + if (!pd->context->ops.alloc_mw) { + errno = EOPNOTSUPP; + return NULL; + } + + mw = pd->context->ops.alloc_mw(pd, type); + return mw; +} + +/** + * ibv_dealloc_mw - Free a memory window + */ +static inline int 
ibv_dealloc_mw(struct ibv_mw *mw) +{ + return mw->context->ops.dealloc_mw(mw); +} + +/** + * ibv_inc_rkey - Increase the 8 lsb in the given rkey + */ +static inline uint32_t ibv_inc_rkey(uint32_t rkey) +{ + const uint32_t mask = 0x000000ff; + uint8_t newtag = (uint8_t)((rkey + 1) & mask); + + return (rkey & ~mask) | newtag; +} + +/** + * ibv_bind_mw - Bind a memory window to a region + */ +static inline int ibv_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw, + struct ibv_mw_bind *mw_bind) +{ + if (mw->type != IBV_MW_TYPE_1) + return EINVAL; + + return mw->context->ops.bind_mw(qp, mw, mw_bind); +} + +/** + * ibv_create_comp_channel - Create a completion event channel + */ +struct ibv_comp_channel *ibv_create_comp_channel(struct ibv_context *context); + +/** + * ibv_destroy_comp_channel - Destroy a completion event channel + */ +int ibv_destroy_comp_channel(struct ibv_comp_channel *channel); + +/** + * ibv_advise_mr - Gives advice about an address range in MRs + * @pd - protection domain of all MRs for which the advice is for + * @advice - type of advice + * @flags - advice modifiers + * @sg_list - an array of memory ranges + * @num_sge - number of elements in the array + */ +static inline int ibv_advise_mr(struct ibv_pd *pd, + enum ibv_advise_mr_advice advice, + uint32_t flags, + struct ibv_sge *sg_list, + uint32_t num_sge) +{ + struct verbs_context *vctx; + + vctx = verbs_get_ctx_op(pd->context, advise_mr); + if (!vctx) + return EOPNOTSUPP; + + return vctx->advise_mr(pd, advice, flags, sg_list, num_sge); +} + +/** + * ibv_alloc_dm - Allocate device memory + * @context - Context DM will be attached to + * @attr - Attributes to allocate the DM with + */ +static inline +struct ibv_dm *ibv_alloc_dm(struct ibv_context *context, + struct ibv_alloc_dm_attr *attr) +{ + struct verbs_context *vctx = verbs_get_ctx_op(context, alloc_dm); + + if (!vctx) { + errno = EOPNOTSUPP; + return NULL; + } + + return vctx->alloc_dm(context, attr); +} + +/** + * ibv_free_dm - Free device allocated memory + * @dm - The DM to free + */ +static inline +int ibv_free_dm(struct ibv_dm *dm) +{ + struct verbs_context *vctx = verbs_get_ctx_op(dm->context, free_dm); + + if (!vctx) + return EOPNOTSUPP; + + return vctx->free_dm(dm); +} + +/** + * ibv_memcpy_to/from_dm - copy to/from device allocated memory + * @dm - The DM to copy to/from + * @dm_offset - Offset in bytes from beginning of DM to start copy to/form + * @host_addr - Host memory address to copy to/from + * @length - Number of bytes to copy + */ +static inline +int ibv_memcpy_to_dm(struct ibv_dm *dm, uint64_t dm_offset, + const void *host_addr, size_t length) +{ + return dm->memcpy_to_dm(dm, dm_offset, host_addr, length); +} + +static inline +int ibv_memcpy_from_dm(void *host_addr, struct ibv_dm *dm, + uint64_t dm_offset, size_t length) +{ + return dm->memcpy_from_dm(host_addr, dm, dm_offset, length); +} + +/* + * ibv_alloc_null_mr - Allocate a null memory region. + * @pd - The protection domain associated with the MR. 
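+ *
+ * A null MR discards data scattered to its lkey, so it can stand in
+ * for a real buffer when part of an incoming message is not needed.
+ * Minimal usage sketch (illustrative):
+ *
+ *   struct ibv_mr *mr = ibv_alloc_null_mr(pd);
+ *
+ *   if (!mr)      /* errno is EOPNOTSUPP when unsupported */
+ *           return -1;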
+ */ +static inline +struct ibv_mr *ibv_alloc_null_mr(struct ibv_pd *pd) +{ + struct verbs_context *vctx; + + vctx = verbs_get_ctx_op(pd->context, alloc_null_mr); + if (!vctx) { + errno = EOPNOTSUPP; + return NULL; + } + + return vctx->alloc_null_mr(pd); +} + +/** + * ibv_reg_dm_mr - Register device memory as a memory region + * @pd - The PD to associated this MR with + * @dm - The DM to register + * @dm_offset - Offset in bytes from beginning of DM to start registration from + * @length - Number of bytes to register + * @access - memory region access flags + */ +static inline +struct ibv_mr *ibv_reg_dm_mr(struct ibv_pd *pd, struct ibv_dm *dm, + uint64_t dm_offset, + size_t length, unsigned int access) +{ + struct verbs_context *vctx = verbs_get_ctx_op(pd->context, reg_dm_mr); + + if (!vctx) { + errno = EOPNOTSUPP; + return NULL; + } + + return vctx->reg_dm_mr(pd, dm, dm_offset, length, access); +} + +/** + * ibv_create_cq - Create a completion queue + * @context - Context CQ will be attached to + * @cqe - Minimum number of entries required for CQ + * @cq_context - Consumer-supplied context returned for completion events + * @channel - Completion channel where completion events will be queued. + * May be NULL if completion events will not be used. + * @comp_vector - Completion vector used to signal completion events. + * Must be >= 0 and < context->num_comp_vectors. + */ +struct ibv_cq *ibv_create_cq(struct ibv_context *context, int cqe, + void *cq_context, + struct ibv_comp_channel *channel, + int comp_vector); + +/** + * ibv_create_cq_ex - Create a completion queue + * @context - Context CQ will be attached to + * @cq_attr - Attributes to create the CQ with + */ +static inline +struct ibv_cq_ex *ibv_create_cq_ex(struct ibv_context *context, + struct ibv_cq_init_attr_ex *cq_attr) +{ + struct verbs_context *vctx = verbs_get_ctx_op(context, create_cq_ex); + + if (!vctx) { + errno = EOPNOTSUPP; + return NULL; + } + + return vctx->create_cq_ex(context, cq_attr); +} + +/** + * ibv_resize_cq - Modifies the capacity of the CQ. + * @cq: The CQ to resize. + * @cqe: The minimum size of the CQ. + * + * Users can examine the cq structure to determine the actual CQ size. + */ +int ibv_resize_cq(struct ibv_cq *cq, int cqe); + +/** + * ibv_destroy_cq - Destroy a completion queue + */ +int ibv_destroy_cq(struct ibv_cq *cq); + +/** + * ibv_get_cq_event - Read next CQ event + * @channel: Channel to get next event from. + * @cq: Used to return pointer to CQ. + * @cq_context: Used to return consumer-supplied CQ context. + * + * All completion events returned by ibv_get_cq_event() must + * eventually be acknowledged with ibv_ack_cq_events(). + */ +int ibv_get_cq_event(struct ibv_comp_channel *channel, + struct ibv_cq **cq, void **cq_context); + +/** + * ibv_ack_cq_events - Acknowledge CQ completion events + * @cq: CQ to acknowledge events for + * @nevents: Number of events to acknowledge. + * + * All completion events which are returned by ibv_get_cq_event() must + * be acknowledged. To avoid races, ibv_destroy_cq() will wait for + * all completion events to be acknowledged, so there should be a + * one-to-one correspondence between acks and successful gets. An + * application may accumulate multiple completion events and + * acknowledge them in a single call to ibv_ack_cq_events() by passing + * the number of events to ack in @nevents. 
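+ *
+ * A minimal notification loop built on these calls (illustrative,
+ * error handling omitted):
+ *
+ *   struct ibv_cq *ev_cq;
+ *   struct ibv_wc wc;
+ *   void *ev_ctx;
+ *
+ *   ibv_req_notify_cq(cq, 0);
+ *   while (!ibv_get_cq_event(channel, &ev_cq, &ev_ctx)) {
+ *           ibv_req_notify_cq(ev_cq, 0);
+ *           while (ibv_poll_cq(ev_cq, 1, &wc) > 0)
+ *                   ; /* consume the completion */
+ *           ibv_ack_cq_events(ev_cq, 1);
+ *   }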
+ */ +void ibv_ack_cq_events(struct ibv_cq *cq, unsigned int nevents); + +/** + * ibv_poll_cq - Poll a CQ for work completions + * @cq:the CQ being polled + * @num_entries:maximum number of completions to return + * @wc:array of at least @num_entries of &struct ibv_wc where completions + * will be returned + * + * Poll a CQ for (possibly multiple) completions. If the return value + * is < 0, an error occurred. If the return value is >= 0, it is the + * number of completions returned. If the return value is + * non-negative and strictly less than num_entries, then the CQ was + * emptied. + */ +static inline int ibv_poll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc) +{ + return cq->context->ops.poll_cq(cq, num_entries, wc); +} + +/** + * ibv_req_notify_cq - Request completion notification on a CQ. An + * event will be added to the completion channel associated with the + * CQ when an entry is added to the CQ. + * @cq: The completion queue to request notification for. + * @solicited_only: If non-zero, an event will be generated only for + * the next solicited CQ entry. If zero, any CQ entry, solicited or + * not, will generate an event. + */ +static inline int ibv_req_notify_cq(struct ibv_cq *cq, int solicited_only) +{ + return cq->context->ops.req_notify_cq(cq, solicited_only); +} + +static inline int ibv_modify_cq(struct ibv_cq *cq, struct ibv_modify_cq_attr *attr) +{ + struct verbs_context *vctx = verbs_get_ctx_op(cq->context, modify_cq); + + if (!vctx) + return EOPNOTSUPP; + + return vctx->modify_cq(cq, attr); +} +/** + * ibv_create_srq - Creates a SRQ associated with the specified protection + * domain. + * @pd: The protection domain associated with the SRQ. + * @srq_init_attr: A list of initial attributes required to create the SRQ. + * + * srq_attr->max_wr and srq_attr->max_sge are read the determine the + * requested size of the SRQ, and set to the actual values allocated + * on return. If ibv_create_srq() succeeds, then max_wr and max_sge + * will always be at least as large as the requested values. + */ +struct ibv_srq *ibv_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *srq_init_attr); + +static inline struct ibv_srq * +ibv_create_srq_ex(struct ibv_context *context, + struct ibv_srq_init_attr_ex *srq_init_attr_ex) +{ + struct verbs_context *vctx; + uint32_t mask = srq_init_attr_ex->comp_mask; + + if (!(mask & ~(IBV_SRQ_INIT_ATTR_PD | IBV_SRQ_INIT_ATTR_TYPE)) && + (mask & IBV_SRQ_INIT_ATTR_PD) && + (!(mask & IBV_SRQ_INIT_ATTR_TYPE) || + (srq_init_attr_ex->srq_type == IBV_SRQT_BASIC))) + return ibv_create_srq(srq_init_attr_ex->pd, + (struct ibv_srq_init_attr *)srq_init_attr_ex); + + vctx = verbs_get_ctx_op(context, create_srq_ex); + if (!vctx) { + errno = EOPNOTSUPP; + return NULL; + } + return vctx->create_srq_ex(context, srq_init_attr_ex); +} + +/** + * ibv_modify_srq - Modifies the attributes for the specified SRQ. + * @srq: The SRQ to modify. + * @srq_attr: On input, specifies the SRQ attributes to modify. On output, + * the current values of selected SRQ attributes are returned. + * @srq_attr_mask: A bit-mask used to specify which attributes of the SRQ + * are being modified. + * + * The mask may contain IBV_SRQ_MAX_WR to resize the SRQ and/or + * IBV_SRQ_LIMIT to set the SRQ's limit and request notification when + * the number of receives queued drops below the limit. 
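+ *
+ * For example, to request a limit event once fewer than 16 receives
+ * remain queued (illustrative):
+ *
+ *   struct ibv_srq_attr attr = { .srq_limit = 16 };
+ *
+ *   ibv_modify_srq(srq, &attr, IBV_SRQ_LIMIT);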
+ */ +int ibv_modify_srq(struct ibv_srq *srq, + struct ibv_srq_attr *srq_attr, + int srq_attr_mask); + +/** + * ibv_query_srq - Returns the attribute list and current values for the + * specified SRQ. + * @srq: The SRQ to query. + * @srq_attr: The attributes of the specified SRQ. + */ +int ibv_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr); + +static inline int ibv_get_srq_num(struct ibv_srq *srq, uint32_t *srq_num) +{ + struct verbs_context *vctx = verbs_get_ctx_op(srq->context, get_srq_num); + + if (!vctx) + return EOPNOTSUPP; + + return vctx->get_srq_num(srq, srq_num); +} + +/** + * ibv_destroy_srq - Destroys the specified SRQ. + * @srq: The SRQ to destroy. + */ +int ibv_destroy_srq(struct ibv_srq *srq); + +/** + * ibv_post_srq_recv - Posts a list of work requests to the specified SRQ. + * @srq: The SRQ to post the work request on. + * @recv_wr: A list of work requests to post on the receive queue. + * @bad_recv_wr: On an immediate failure, this parameter will reference + * the work request that failed to be posted on the QP. + */ +static inline int ibv_post_srq_recv(struct ibv_srq *srq, + struct ibv_recv_wr *recv_wr, + struct ibv_recv_wr **bad_recv_wr) +{ + return srq->context->ops.post_srq_recv(srq, recv_wr, bad_recv_wr); +} + +static inline int ibv_post_srq_ops(struct ibv_srq *srq, + struct ibv_ops_wr *op, + struct ibv_ops_wr **bad_op) +{ + struct verbs_context *vctx; + + vctx = verbs_get_ctx_op(srq->context, post_srq_ops); + if (!vctx) { + *bad_op = op; + return EOPNOTSUPP; + } + return vctx->post_srq_ops(srq, op, bad_op); +} + +/** + * ibv_create_qp - Create a queue pair. + */ +struct ibv_qp *ibv_create_qp(struct ibv_pd *pd, + struct ibv_qp_init_attr *qp_init_attr); + +static inline struct ibv_qp * +ibv_create_qp_ex(struct ibv_context *context, struct ibv_qp_init_attr_ex *qp_init_attr_ex) +{ + struct verbs_context *vctx; + uint32_t mask = qp_init_attr_ex->comp_mask; + + if (mask == IBV_QP_INIT_ATTR_PD) + return ibv_create_qp(qp_init_attr_ex->pd, + (struct ibv_qp_init_attr *)qp_init_attr_ex); + + vctx = verbs_get_ctx_op(context, create_qp_ex); + if (!vctx) { + errno = EOPNOTSUPP; + return NULL; + } + return vctx->create_qp_ex(context, qp_init_attr_ex); +} + +/** + * ibv_alloc_td - Allocate a thread domain + */ +static inline struct ibv_td *ibv_alloc_td(struct ibv_context *context, + struct ibv_td_init_attr *init_attr) +{ + struct verbs_context *vctx; + + vctx = verbs_get_ctx_op(context, alloc_td); + if (!vctx) { + errno = EOPNOTSUPP; + return NULL; + } + + return vctx->alloc_td(context, init_attr); +} + +/** + * ibv_dealloc_td - Free a thread domain + */ +static inline int ibv_dealloc_td(struct ibv_td *td) +{ + struct verbs_context *vctx; + + vctx = verbs_get_ctx_op(td->context, dealloc_td); + if (!vctx) + return EOPNOTSUPP; + + return vctx->dealloc_td(td); +} + +/** + * ibv_alloc_parent_domain - Allocate a parent domain + */ +static inline struct ibv_pd * +ibv_alloc_parent_domain(struct ibv_context *context, + struct ibv_parent_domain_init_attr *attr) +{ + struct verbs_context *vctx; + + vctx = verbs_get_ctx_op(context, alloc_parent_domain); + if (!vctx) { + errno = EOPNOTSUPP; + return NULL; + } + + return vctx->alloc_parent_domain(context, attr); +} + +/** + * ibv_query_rt_values_ex - Get current real time @values of a device. + * @values - in/out - defines the attributes we need to query/queried. 
+ * (Or's bits of enum ibv_values_mask on values->comp_mask field) + */ +static inline int +ibv_query_rt_values_ex(struct ibv_context *context, + struct ibv_values_ex *values) +{ + struct verbs_context *vctx; + + vctx = verbs_get_ctx_op(context, query_rt_values); + if (!vctx) + return EOPNOTSUPP; + + return vctx->query_rt_values(context, values); +} + +/** + * ibv_query_device_ex - Get extended device properties + */ +static inline int +ibv_query_device_ex(struct ibv_context *context, + const struct ibv_query_device_ex_input *input, + struct ibv_device_attr_ex *attr) +{ + struct verbs_context *vctx; + int ret; + + if (input && input->comp_mask) + return EINVAL; + + vctx = verbs_get_ctx_op(context, query_device_ex); + if (!vctx) + goto legacy; + + ret = vctx->query_device_ex(context, input, attr, sizeof(*attr)); + if (ret == EOPNOTSUPP || ret == ENOSYS) + goto legacy; + + return ret; + +legacy: + memset(attr, 0, sizeof(*attr)); + ret = ibv_query_device(context, &attr->orig_attr); + + return ret; +} + +/** + * ibv_open_qp - Open a shareable queue pair. + */ +static inline struct ibv_qp * +ibv_open_qp(struct ibv_context *context, struct ibv_qp_open_attr *qp_open_attr) +{ + struct verbs_context *vctx = verbs_get_ctx_op(context, open_qp); + if (!vctx) { + errno = EOPNOTSUPP; + return NULL; + } + return vctx->open_qp(context, qp_open_attr); +} + +/** + * ibv_modify_qp - Modify a queue pair. + */ +int ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask); + +/** + * ibv_modify_qp_rate_limit - Modify a queue pair rate limit values + * @qp - QP object to modify + * @attr - Attributes to configure the rate limiting values of the QP + */ +static inline int +ibv_modify_qp_rate_limit(struct ibv_qp *qp, + struct ibv_qp_rate_limit_attr *attr) +{ + struct verbs_context *vctx; + + vctx = verbs_get_ctx_op(qp->context, modify_qp_rate_limit); + if (!vctx) + return EOPNOTSUPP; + + return vctx->modify_qp_rate_limit(qp, attr); +} + +/** + * ibv_query_qp - Returns the attribute list and current values for the + * specified QP. + * @qp: The QP to query. + * @attr: The attributes of the specified QP. + * @attr_mask: A bit-mask used to select specific attributes to query. + * @init_attr: Additional attributes of the selected QP. + * + * The qp_attr_mask may be used to limit the query to gathering only the + * selected attributes. + */ +int ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr); + +/** + * ibv_destroy_qp - Destroy a queue pair. + */ +int ibv_destroy_qp(struct ibv_qp *qp); + +/* + * ibv_create_wq - Creates a WQ associated with the specified protection + * domain. + * @context: ibv_context. + * @wq_init_attr: A list of initial attributes required to create the + * WQ. If WQ creation succeeds, then the attributes are updated to + * the actual capabilities of the created WQ. + * + * wq_init_attr->max_wr and wq_init_attr->max_sge determine + * the requested size of the WQ, and set to the actual values allocated + * on return. + * If ibv_create_wq() succeeds, then max_wr and max_sge will always be + * at least as large as the requested values. + * + * Return Value + * ibv_create_wq() returns a pointer to the created WQ, or NULL if the request + * fails. 
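+ *
+ * Illustrative call, assuming pd and cq were created earlier:
+ *
+ *   struct ibv_wq_init_attr attr = {
+ *           .wq_type = IBV_WQT_RQ,
+ *           .max_wr  = 256,
+ *           .max_sge = 1,
+ *           .pd      = pd,
+ *           .cq      = cq,
+ *   };
+ *   struct ibv_wq *wq = ibv_create_wq(context, &attr);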
+ */ +static inline struct ibv_wq *ibv_create_wq(struct ibv_context *context, + struct ibv_wq_init_attr *wq_init_attr) +{ + struct verbs_context *vctx = verbs_get_ctx_op(context, create_wq); + struct ibv_wq *wq; + + if (!vctx) { + errno = EOPNOTSUPP; + return NULL; + } + + wq = vctx->create_wq(context, wq_init_attr); + if (wq) { + wq->events_completed = 0; + pthread_mutex_init(&wq->mutex, NULL); + pthread_cond_init(&wq->cond, NULL); + } + + return wq; +} + +/* + * ibv_modify_wq - Modifies the attributes for the specified WQ. + * @wq: The WQ to modify. + * @wq_attr: On input, specifies the WQ attributes to modify. + * wq_attr->attr_mask: A bit-mask used to specify which attributes of the WQ + * are being modified. + * On output, the current values of selected WQ attributes are returned. + * + * Return Value + * ibv_modify_wq() returns 0 on success, or the value of errno + * on failure (which indicates the failure reason). + * +*/ +static inline int ibv_modify_wq(struct ibv_wq *wq, struct ibv_wq_attr *wq_attr) +{ + struct verbs_context *vctx = verbs_get_ctx_op(wq->context, modify_wq); + + if (!vctx) + return EOPNOTSUPP; + + return vctx->modify_wq(wq, wq_attr); +} + +/* + * ibv_destroy_wq - Destroys the specified WQ. + * @ibv_wq: The WQ to destroy. + * Return Value + * ibv_destroy_wq() returns 0 on success, or the value of errno + * on failure (which indicates the failure reason). +*/ +static inline int ibv_destroy_wq(struct ibv_wq *wq) +{ + struct verbs_context *vctx; + + vctx = verbs_get_ctx_op(wq->context, destroy_wq); + if (!vctx) + return EOPNOTSUPP; + + return vctx->destroy_wq(wq); +} + +/* + * ibv_create_rwq_ind_table - Creates a receive work queue Indirection Table + * @context: ibv_context. + * @init_attr: A list of initial attributes required to create the Indirection Table. + * Return Value + * ibv_create_rwq_ind_table returns a pointer to the created + * Indirection Table, or NULL if the request fails. + */ +static inline struct ibv_rwq_ind_table *ibv_create_rwq_ind_table(struct ibv_context *context, + struct ibv_rwq_ind_table_init_attr *init_attr) +{ + struct verbs_context *vctx; + + vctx = verbs_get_ctx_op(context, create_rwq_ind_table); + if (!vctx) { + errno = EOPNOTSUPP; + return NULL; + } + + return vctx->create_rwq_ind_table(context, init_attr); +} + +/* + * ibv_destroy_rwq_ind_table - Destroys the specified Indirection Table. + * @rwq_ind_table: The Indirection Table to destroy. + * Return Value + * ibv_destroy_rwq_ind_table() returns 0 on success, or the value of errno + * on failure (which indicates the failure reason). +*/ +static inline int ibv_destroy_rwq_ind_table(struct ibv_rwq_ind_table *rwq_ind_table) +{ + struct verbs_context *vctx; + + vctx = verbs_get_ctx_op(rwq_ind_table->context, destroy_rwq_ind_table); + if (!vctx) + return EOPNOTSUPP; + + return vctx->destroy_rwq_ind_table(rwq_ind_table); +} + +/** + * ibv_post_send - Post a list of work requests to a send queue. + * + * If IBV_SEND_INLINE flag is set, the data buffers can be reused + * immediately after the call returns. + */ +static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr) +{ + return qp->context->ops.post_send(qp, wr, bad_wr); +} + +/** + * ibv_post_recv - Post a list of work requests to a receive queue. + */ +static inline int ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + return qp->context->ops.post_recv(qp, wr, bad_wr); +} + +/** + * ibv_create_ah - Create an address handle. 
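+ * @pd: Protection domain the address handle is created on.
+ * @attr: Address vector describing the remote destination.
+ *
+ * A minimal sketch for an IB destination, where dlid is a hypothetical
+ * LID obtained out of band:
+ *
+ *	struct ibv_ah_attr ah_attr = {
+ *		.dlid = dlid,
+ *		.sl = 0,
+ *		.port_num = 1,
+ *	};
+ *	struct ibv_ah *ah = ibv_create_ah(pd, &ah_attr);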
+ */ +struct ibv_ah *ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr); + +/** + * ibv_init_ah_from_wc - Initializes address handle attributes from a + * work completion. + * @context: Device context on which the received message arrived. + * @port_num: Port on which the received message arrived. + * @wc: Work completion associated with the received message. + * @grh: References the received global route header. This parameter is + * ignored unless the work completion indicates that the GRH is valid. + * @ah_attr: Returned attributes that can be used when creating an address + * handle for replying to the message. + */ +int ibv_init_ah_from_wc(struct ibv_context *context, uint8_t port_num, + struct ibv_wc *wc, struct ibv_grh *grh, + struct ibv_ah_attr *ah_attr); + +/** + * ibv_create_ah_from_wc - Creates an address handle associated with the + * sender of the specified work completion. + * @pd: The protection domain associated with the address handle. + * @wc: Work completion information associated with a received message. + * @grh: References the received global route header. This parameter is + * ignored unless the work completion indicates that the GRH is valid. + * @port_num: The outbound port number to associate with the address. + * + * The address handle is used to reference a local or global destination + * in all UD QP post sends. + */ +struct ibv_ah *ibv_create_ah_from_wc(struct ibv_pd *pd, struct ibv_wc *wc, + struct ibv_grh *grh, uint8_t port_num); + +/** + * ibv_destroy_ah - Destroy an address handle. + */ +int ibv_destroy_ah(struct ibv_ah *ah); + +/** + * ibv_attach_mcast - Attaches the specified QP to a multicast group. + * @qp: QP to attach to the multicast group. The QP must be a UD QP. + * @gid: Multicast group GID. + * @lid: Multicast group LID in host byte order. + * + * In order to route multicast packets correctly, subnet + * administration must have created the multicast group and configured + * the fabric appropriately. The port associated with the specified + * QP must also be a member of the multicast group. + */ +int ibv_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); + +/** + * ibv_detach_mcast - Detaches the specified QP from a multicast group. + * @qp: QP to detach from the multicast group. + * @gid: Multicast group GID. + * @lid: Multicast group LID in host byte order. + */ +int ibv_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); + +/** + * ibv_fork_init - Prepare data structures so that fork() may be used + * safely. If this function is not called or returns a non-zero + * status, then libibverbs data structures are not fork()-safe and the + * effect of an application calling fork() is undefined. 
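+ *
+ * A forking application would typically call this once, before opening
+ * any device or registering memory, e.g. (a minimal sketch):
+ *
+ *	if (ibv_fork_init())
+ *		exit(EXIT_FAILURE);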
+ */ +int ibv_fork_init(void); + +/** + * ibv_node_type_str - Return string describing node_type enum value + */ +const char *ibv_node_type_str(enum ibv_node_type node_type); + +/** + * ibv_port_state_str - Return string describing port_state enum value + */ +const char *ibv_port_state_str(enum ibv_port_state port_state); + +/** + * ibv_event_type_str - Return string describing event_type enum value + */ +const char *ibv_event_type_str(enum ibv_event_type event); + +#define ETHERNET_LL_SIZE 6 +int ibv_resolve_eth_l2_from_gid(struct ibv_context *context, + struct ibv_ah_attr *attr, + uint8_t eth_mac[ETHERNET_LL_SIZE], + uint16_t *vid); + +static inline int ibv_is_qpt_supported(uint32_t caps, enum ibv_qp_type qpt) +{ + return !!(caps & (1 << qpt)); +} + +static inline struct ibv_counters *ibv_create_counters(struct ibv_context *context, + struct ibv_counters_init_attr *init_attr) +{ + struct verbs_context *vctx; + + vctx = verbs_get_ctx_op(context, create_counters); + if (!vctx) { + errno = EOPNOTSUPP; + return NULL; + } + + return vctx->create_counters(context, init_attr); +} + +static inline int ibv_destroy_counters(struct ibv_counters *counters) +{ + struct verbs_context *vctx; + + vctx = verbs_get_ctx_op(counters->context, destroy_counters); + if (!vctx) + return EOPNOTSUPP; + + return vctx->destroy_counters(counters); +} + +static inline int ibv_attach_counters_point_flow(struct ibv_counters *counters, + struct ibv_counter_attach_attr *attr, + struct ibv_flow *flow) +{ + struct verbs_context *vctx; + + vctx = verbs_get_ctx_op(counters->context, attach_counters_point_flow); + if (!vctx) + return EOPNOTSUPP; + + return vctx->attach_counters_point_flow(counters, attr, flow); +} + +static inline int ibv_read_counters(struct ibv_counters *counters, + uint64_t *counters_value, + uint32_t ncounters, + uint32_t flags) +{ + struct verbs_context *vctx; + + vctx = verbs_get_ctx_op(counters->context, read_counters); + if (!vctx) + return EOPNOTSUPP; + + return vctx->read_counters(counters, counters_value, ncounters, flags); +} + +#define IB_ROCE_UDP_ENCAP_VALID_PORT_MIN (0xC000) +#define IB_ROCE_UDP_ENCAP_VALID_PORT_MAX (0xFFFF) +#define IB_GRH_FLOWLABEL_MASK (0x000FFFFF) + +static inline uint16_t ibv_flow_label_to_udp_sport(uint32_t fl) +{ + uint32_t fl_low = fl & 0x03FFF, fl_high = fl & 0xFC000; + + fl_low ^= fl_high >> 14; + return (uint16_t)(fl_low | IB_ROCE_UDP_ENCAP_VALID_PORT_MIN); +} + +#ifdef __cplusplus +} +#endif + +# undef __attribute_const + + +#endif /* INFINIBAND_VERBS_H */ diff --git a/libibverbs/verbs_api.h b/libibverbs/verbs_api.h new file mode 100644 index 0000000..ded6fa4 --- /dev/null +++ b/libibverbs/verbs_api.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef VERBS_API_H +#define VERBS_API_H + +#if UINTPTR_MAX == UINT32_MAX +#if __BYTE_ORDER == __LITTLE_ENDIAN +#define RDMA_UAPI_PTR(_type, _name) \ + union { \ + struct { \ + _type _name; \ + __u32 _name##_reserved; \ + }; \ + __aligned_u64 _name##_data_u64; \ + } +#else +#define RDMA_UAPI_PTR(_type, _name) \ + union { \ + struct { \ + __u32 _name##_reserved; \ + _type _name; \ + }; \ + __aligned_u64 _name##_data_u64; \ + } +#endif +#elif UINTPTR_MAX == UINT64_MAX +#define RDMA_UAPI_PTR(_type, _name) \ + union { \ + _type _name; \ + __aligned_u64 _name##_data_u64; \ + } +#else +#error "Pointer size not supported" +#endif + +#include <infiniband/ib_user_ioctl_verbs.h> + +#define ibv_flow_action_esp_keymat ib_uverbs_flow_action_esp_keymat +#define IBV_FLOW_ACTION_ESP_KEYMAT_AES_GCM IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM +#define ibv_flow_action_esp_keymat_aes_gcm_iv_algo ib_uverbs_flow_action_esp_keymat_aes_gcm_iv_algo +#define IBV_FLOW_ACTION_IV_ALGO_SEQ IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ +#define ibv_flow_action_esp_keymat_aes_gcm ib_uverbs_flow_action_esp_keymat_aes_gcm +#define ibv_flow_action_esp_replay ib_uverbs_flow_action_esp_replay +#define IBV_FLOW_ACTION_ESP_REPLAY_NONE IB_UVERBS_FLOW_ACTION_ESP_REPLAY_NONE +#define IBV_FLOW_ACTION_ESP_REPLAY_BMP IB_UVERBS_FLOW_ACTION_ESP_REPLAY_BMP +#define ibv_flow_action_esp_replay_bmp ib_uverbs_flow_action_esp_replay_bmp +#define ibv_flow_action_esp_flags ib_uverbs_flow_action_esp_flags +#define IBV_FLOW_ACTION_ESP_FLAGS_INLINE_CRYPTO IB_UVERBS_FLOW_ACTION_ESP_FLAGS_INLINE_CRYPTO +#define IBV_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD +#define IBV_FLOW_ACTION_ESP_FLAGS_TUNNEL IB_UVERBS_FLOW_ACTION_ESP_FLAGS_TUNNEL +#define IBV_FLOW_ACTION_ESP_FLAGS_TRANSPORT IB_UVERBS_FLOW_ACTION_ESP_FLAGS_TRANSPORT +#define IBV_FLOW_ACTION_ESP_FLAGS_DECRYPT IB_UVERBS_FLOW_ACTION_ESP_FLAGS_DECRYPT +#define IBV_FLOW_ACTION_ESP_FLAGS_ENCRYPT IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT +#define IBV_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW +#define ibv_flow_action_esp_encap ib_uverbs_flow_action_esp_encap +#define ibv_flow_action_esp ib_uverbs_flow_action_esp + +#define ibv_advise_mr_advice ib_uverbs_advise_mr_advice +#define IBV_ADVISE_MR_ADVICE_PREFETCH IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH +#define IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE + +#define IBV_ADVISE_MR_FLAG_FLUSH IB_UVERBS_ADVISE_MR_FLAG_FLUSH + +#define IBV_QPF_GRH_REQUIRED IB_UVERBS_QPF_GRH_REQUIRED + +#define IBV_ACCESS_OPTIONAL_RANGE IB_UVERBS_ACCESS_OPTIONAL_RANGE +#define IBV_ACCESS_OPTIONAL_FIRST IB_UVERBS_ACCESS_OPTIONAL_FIRST +#endif + diff --git a/librdmacm/CMakeLists.txt b/librdmacm/CMakeLists.txt new file mode 100644 index 0000000..f0767cf --- /dev/null +++ 
b/librdmacm/CMakeLists.txt
@@ -0,0 +1,55 @@
+publish_headers(rdma
+ rdma_cma.h
+ rdma_cma_abi.h
+ rdma_verbs.h
+ rsocket.h
+ )
+publish_headers(infiniband
+ acm.h
+ ib.h
+ )
+
+rdma_library(rdmacm librdmacm.map
+ # See Documentation/versioning.md
+ 1 1.2.${PACKAGE_VERSION}
+ acm.c
+ addrinfo.c
+ cma.c
+ indexer.c
+ rsocket.c
+ )
+target_link_libraries(rdmacm LINK_PUBLIC ibverbs)
+target_link_libraries(rdmacm LINK_PRIVATE
+ ${NL_LIBRARIES}
+ ${CMAKE_THREAD_LIBS_INIT}
+ ${RT_LIBRARIES}
+ )
+
+# The preload library is a bit special; it needs to be open coded.
+# Since it is an LD_PRELOAD it has no soname, and is installed in a sub dir.
+add_library(rspreload MODULE
+ preload.c
+ indexer.c
+ )
+# Even though this is a module we still want to use Wl,--no-undefined
+set_target_properties(rspreload PROPERTIES LINK_FLAGS ${CMAKE_SHARED_LINKER_FLAGS})
+set_target_properties(rspreload PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${BUILD_LIB}")
+rdma_set_library_map(rspreload librspreload.map)
+target_link_libraries(rspreload LINK_PRIVATE
+ rdmacm
+ ${CMAKE_THREAD_LIBS_INIT}
+ ${CMAKE_DL_LIBS}
+)
+install(TARGETS rspreload DESTINATION "${CMAKE_INSTALL_LIBDIR}/rsocket/")
+
+# These are for compat with old packaging; these names should not be used.
+# FIXME: Maybe we can get rid of them?
+rdma_install_symlink("librspreload.so" "${CMAKE_INSTALL_LIBDIR}/rsocket/librspreload.so.1")
+rdma_install_symlink("librspreload.so" "${CMAKE_INSTALL_LIBDIR}/rsocket/librspreload.so.1.0.0")
+
+if (ENABLE_STATIC)
+ if (NOT NL_KIND EQUAL 0)
+ set(REQUIRES "libnl-3.0, libnl-route-3.0, ")
+ endif()
+endif()
+rdma_pkg_config("rdmacm" "${REQUIRES}libibverbs" "${CMAKE_THREAD_LIBS_INIT}")
diff --git a/librdmacm/acm.c b/librdmacm/acm.c
new file mode 100644
index 0000000..807ff55
--- /dev/null
+++ b/librdmacm/acm.c
@@ -0,0 +1,399 @@
+/*
+ * Copyright (c) 2010-2012 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ + +#include <config.h> + +#include <stdio.h> +#include <inttypes.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <netdb.h> +#include <unistd.h> + +#include "cma.h" +#include "acm.h" +#include <rdma/rdma_cma.h> +#include <infiniband/ib.h> +#include <infiniband/sa.h> + +static pthread_mutex_t acm_lock = PTHREAD_MUTEX_INITIALIZER; +static int sock = -1; +static uint16_t server_port; + +static int ucma_set_server_port(void) +{ + FILE *f; + + if ((f = fopen(IBACM_PORT_FILE, "r" STREAM_CLOEXEC))) { + if (fscanf(f, "%" SCNu16, &server_port) != 1) + server_port = 0; + fclose(f); + } else + server_port = 0; + + return server_port; +} + +void ucma_ib_init(void) +{ + union { + struct sockaddr any; + struct sockaddr_in inet; + struct sockaddr_un unx; + } addr; + static int init; + int ret; + + if (init) + return; + + pthread_mutex_lock(&acm_lock); + if (init) + goto unlock; + + if (ucma_set_server_port()) { + sock = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); + if (sock < 0) + goto out; + + memset(&addr, 0, sizeof(addr)); + addr.any.sa_family = AF_INET; + addr.inet.sin_addr.s_addr = htobe32(INADDR_LOOPBACK); + addr.inet.sin_port = htobe16(server_port); + ret = connect(sock, &addr.any, sizeof(addr.inet)); + if (ret) { + close(sock); + sock = -1; + } + } else { + sock = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (sock < 0) + goto out; + + memset(&addr, 0, sizeof(addr)); + addr.any.sa_family = AF_UNIX; + BUILD_ASSERT(sizeof(IBACM_SERVER_PATH) <= + sizeof(addr.unx.sun_path)); + strcpy(addr.unx.sun_path, IBACM_SERVER_PATH); + ret = connect(sock, &addr.any, sizeof(addr.unx)); + if (ret) { + close(sock); + sock = -1; + } + } +out: + init = 1; +unlock: + pthread_mutex_unlock(&acm_lock); +} + +void ucma_ib_cleanup(void) +{ + if (sock >= 0) { + shutdown(sock, SHUT_RDWR); + close(sock); + } +} + +static int ucma_ib_set_addr(struct rdma_addrinfo *ib_rai, + struct rdma_addrinfo *rai) +{ + struct sockaddr_ib *src, *dst; + struct ibv_path_record *path; + + src = calloc(1, sizeof(*src)); + if (!src) + return ERR(ENOMEM); + + dst = calloc(1, sizeof(*dst)); + if (!dst) { + free(src); + return ERR(ENOMEM); + } + + path = &((struct ibv_path_data *) ib_rai->ai_route)->path; + + src->sib_family = AF_IB; + src->sib_pkey = path->pkey; + src->sib_flowinfo = htobe32(be32toh(path->flowlabel_hoplimit) >> 8); + memcpy(&src->sib_addr, &path->sgid, 16); + ucma_set_sid(ib_rai->ai_port_space, rai->ai_src_addr, src); + + dst->sib_family = AF_IB; + dst->sib_pkey = path->pkey; + dst->sib_flowinfo = htobe32(be32toh(path->flowlabel_hoplimit) >> 8); + memcpy(&dst->sib_addr, &path->dgid, 16); + ucma_set_sid(ib_rai->ai_port_space, rai->ai_dst_addr, dst); + + ib_rai->ai_src_addr = (struct sockaddr *) src; + ib_rai->ai_src_len = sizeof(*src); + + ib_rai->ai_dst_addr = (struct sockaddr *) dst; + ib_rai->ai_dst_len = sizeof(*dst); + + return 0; +} + +static int ucma_ib_set_connect(struct rdma_addrinfo *ib_rai, + struct rdma_addrinfo *rai) +{ + struct ib_connect_hdr *hdr; + + if (rai->ai_family == AF_IB) + return 0; + + hdr = calloc(1, sizeof(*hdr)); + if (!hdr) + return ERR(ENOMEM); + + if (rai->ai_family == AF_INET) { + hdr->ip_version = 4 << 4; + memcpy(&hdr->cma_src_ip4, + &((struct sockaddr_in *) rai->ai_src_addr)->sin_addr, 4); + memcpy(&hdr->cma_dst_ip4, + &((struct sockaddr_in *) rai->ai_dst_addr)->sin_addr, 4); + } else { + hdr->ip_version = 6 << 4; + memcpy(&hdr->cma_src_ip6, + &((struct sockaddr_in6 *) rai->ai_src_addr)->sin6_addr, 16); + memcpy(&hdr->cma_dst_ip6, + 
&((struct sockaddr_in6 *) rai->ai_dst_addr)->sin6_addr, 16); + } + + ib_rai->ai_connect = hdr; + ib_rai->ai_connect_len = sizeof(*hdr); + return 0; +} + +static void ucma_resolve_af_ib(struct rdma_addrinfo **rai) +{ + struct rdma_addrinfo *ib_rai; + + ib_rai = calloc(1, sizeof(*ib_rai)); + if (!ib_rai) + return; + + ib_rai->ai_flags = (*rai)->ai_flags; + ib_rai->ai_family = AF_IB; + ib_rai->ai_qp_type = (*rai)->ai_qp_type; + ib_rai->ai_port_space = (*rai)->ai_port_space; + + ib_rai->ai_route = calloc(1, (*rai)->ai_route_len); + if (!ib_rai->ai_route) + goto err; + + memcpy(ib_rai->ai_route, (*rai)->ai_route, (*rai)->ai_route_len); + ib_rai->ai_route_len = (*rai)->ai_route_len; + + if ((*rai)->ai_src_canonname) { + ib_rai->ai_src_canonname = strdup((*rai)->ai_src_canonname); + if (!ib_rai->ai_src_canonname) + goto err; + } + + if ((*rai)->ai_dst_canonname) { + ib_rai->ai_dst_canonname = strdup((*rai)->ai_dst_canonname); + if (!ib_rai->ai_dst_canonname) + goto err; + } + + if (ucma_ib_set_connect(ib_rai, *rai)) + goto err; + + if (ucma_ib_set_addr(ib_rai, *rai)) + goto err; + + ib_rai->ai_next = *rai; + *rai = ib_rai; + return; + +err: + rdma_freeaddrinfo(ib_rai); +} + +static void ucma_ib_save_resp(struct rdma_addrinfo *rai, struct acm_msg *msg) +{ + struct acm_ep_addr_data *ep_data; + struct ibv_path_data *path_data = NULL; + struct sockaddr_in *sin; + struct sockaddr_in6 *sin6; + int i, cnt, path_cnt = 0; + + cnt = (msg->hdr.length - ACM_MSG_HDR_LENGTH) / ACM_MSG_EP_LENGTH; + for (i = 0; i < cnt; i++) { + ep_data = &msg->resolve_data[i]; + switch (ep_data->type) { + case ACM_EP_INFO_PATH: + ep_data->type = 0; + if (!path_data) + path_data = (struct ibv_path_data *) ep_data; + path_cnt++; + break; + case ACM_EP_INFO_ADDRESS_IP: + if (!(ep_data->flags & ACM_EP_FLAG_SOURCE) || rai->ai_src_len) + break; + + sin = calloc(1, sizeof(*sin)); + if (!sin) + break; + + sin->sin_family = AF_INET; + memcpy(&sin->sin_addr, &ep_data->info.addr, 4); + rai->ai_src_len = sizeof(*sin); + rai->ai_src_addr = (struct sockaddr *) sin; + break; + case ACM_EP_INFO_ADDRESS_IP6: + if (!(ep_data->flags & ACM_EP_FLAG_SOURCE) || rai->ai_src_len) + break; + + sin6 = calloc(1, sizeof(*sin6)); + if (!sin6) + break; + + sin6->sin6_family = AF_INET6; + memcpy(&sin6->sin6_addr, &ep_data->info.addr, 16); + rai->ai_src_len = sizeof(*sin6); + rai->ai_src_addr = (struct sockaddr *) sin6; + break; + default: + break; + } + } + + rai->ai_route = calloc(path_cnt, sizeof(*path_data)); + if (rai->ai_route) { + memcpy(rai->ai_route, path_data, path_cnt * sizeof(*path_data)); + rai->ai_route_len = path_cnt * sizeof(*path_data); + } +} + +static void ucma_set_ep_addr(struct acm_ep_addr_data *data, struct sockaddr *addr) +{ + if (addr->sa_family == AF_INET) { + data->type = ACM_EP_INFO_ADDRESS_IP; + memcpy(data->info.addr, &((struct sockaddr_in *) addr)->sin_addr, 4); + } else { + data->type = ACM_EP_INFO_ADDRESS_IP6; + memcpy(data->info.addr, &((struct sockaddr_in6 *) addr)->sin6_addr, 16); + } +} + +static int ucma_inet_addr(struct sockaddr *addr, socklen_t len) +{ + return len && addr && (addr->sa_family == AF_INET || + addr->sa_family == AF_INET6); +} + +static int ucma_ib_addr(struct sockaddr *addr, socklen_t len) +{ + return len && addr && (addr->sa_family == AF_IB); +} + +void ucma_ib_resolve(struct rdma_addrinfo **rai, + const struct rdma_addrinfo *hints) +{ + struct acm_msg msg; + struct acm_ep_addr_data *data; + int ret; + + ucma_ib_init(); + if (sock < 0) + return; + + memset(&msg, 0, sizeof msg); + msg.hdr.version = 
ACM_VERSION;
+ msg.hdr.opcode = ACM_OP_RESOLVE;
+ msg.hdr.length = ACM_MSG_HDR_LENGTH;
+
+ data = &msg.resolve_data[0];
+ if (ucma_inet_addr((*rai)->ai_src_addr, (*rai)->ai_src_len)) {
+ data->flags = ACM_EP_FLAG_SOURCE;
+ ucma_set_ep_addr(data, (*rai)->ai_src_addr);
+ data++;
+ msg.hdr.length += ACM_MSG_EP_LENGTH;
+ }
+
+ if (ucma_inet_addr((*rai)->ai_dst_addr, (*rai)->ai_dst_len)) {
+ data->flags = ACM_EP_FLAG_DEST;
+ if (hints->ai_flags & (RAI_NUMERICHOST | RAI_NOROUTE))
+ data->flags |= ACM_FLAGS_NODELAY;
+ ucma_set_ep_addr(data, (*rai)->ai_dst_addr);
+ data++;
+ msg.hdr.length += ACM_MSG_EP_LENGTH;
+ }
+
+ if (hints->ai_route_len ||
+ ucma_ib_addr((*rai)->ai_src_addr, (*rai)->ai_src_len) ||
+ ucma_ib_addr((*rai)->ai_dst_addr, (*rai)->ai_dst_len)) {
+ struct ibv_path_record *path;
+
+ if (hints->ai_route_len == sizeof(struct ibv_path_record))
+ path = (struct ibv_path_record *) hints->ai_route;
+ else if (hints->ai_route_len == sizeof(struct ibv_path_data))
+ path = &((struct ibv_path_data *) hints->ai_route)->path;
+ else
+ path = NULL;
+
+ if (path)
+ memcpy(&data->info.path, path, sizeof(*path));
+
+ if (ucma_ib_addr((*rai)->ai_src_addr, (*rai)->ai_src_len)) {
+ memcpy(&data->info.path.sgid,
+ &((struct sockaddr_ib *) (*rai)->ai_src_addr)->sib_addr, 16);
+ }
+ if (ucma_ib_addr((*rai)->ai_dst_addr, (*rai)->ai_dst_len)) {
+ memcpy(&data->info.path.dgid,
+ &((struct sockaddr_ib *) (*rai)->ai_dst_addr)->sib_addr, 16);
+ }
+ data->type = ACM_EP_INFO_PATH;
+ data++;
+ msg.hdr.length += ACM_MSG_EP_LENGTH;
+ }
+
+ pthread_mutex_lock(&acm_lock);
+ ret = send(sock, (char *) &msg, msg.hdr.length, 0);
+ if (ret != msg.hdr.length) {
+ pthread_mutex_unlock(&acm_lock);
+ return;
+ }
+
+ ret = recv(sock, (char *) &msg, sizeof msg, 0);
+ pthread_mutex_unlock(&acm_lock);
+ if (ret < ACM_MSG_HDR_LENGTH || ret != msg.hdr.length || msg.hdr.status)
+ return;
+
+ ucma_ib_save_resp(*rai, &msg);
+
+ if (af_ib_support && !(hints->ai_flags & RAI_ROUTEONLY) && (*rai)->ai_route_len)
+ ucma_resolve_af_ib(rai);
+}
diff --git a/librdmacm/acm.h b/librdmacm/acm.h
new file mode 100644
index 0000000..7397b35
--- /dev/null
+++ b/librdmacm/acm.h
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2009 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under the OpenFabrics.org BSD license
+ * below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ + +#if !defined(ACM_H) +#define ACM_H + +#include <infiniband/verbs.h> +#include <infiniband/sa.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define ACM_VERSION 1 + +#define ACM_OP_MASK 0x0F +#define ACM_OP_RESOLVE 0x01 +#define ACM_OP_PERF_QUERY 0x02 +#define ACM_OP_EP_QUERY 0x03 +#define ACM_OP_ACK 0x80 + +#define ACM_STATUS_SUCCESS 0 +#define ACM_STATUS_ENOMEM 1 +#define ACM_STATUS_EINVAL 2 +#define ACM_STATUS_ENODATA 3 +#define ACM_STATUS_ENOTCONN 5 +#define ACM_STATUS_ETIMEDOUT 6 +#define ACM_STATUS_ESRCADDR 7 +#define ACM_STATUS_ESRCTYPE 8 +#define ACM_STATUS_EDESTADDR 9 +#define ACM_STATUS_EDESTTYPE 10 + +#define ACM_FLAGS_QUERY_SA (1<<31) +#define ACM_FLAGS_NODELAY (1<<30) + +#define ACM_MSG_HDR_LENGTH 16 +#define ACM_MAX_ADDRESS 64 +#define ACM_MSG_EP_LENGTH 72 +#define ACM_MAX_PROV_NAME 64 +/* + * Support up to 6 path records (primary and alternate CM paths, + * inbound and outbound primary and alternate data paths), plus CM data. + */ +#define ACM_MSG_DATA_LENGTH (ACM_MSG_EP_LENGTH * 8) + +#define src_out data[0] +#define src_index data[1] +#define dst_index data[2] + +struct acm_hdr { + uint8_t version; + uint8_t opcode; + uint8_t status; + uint8_t data[3]; + uint16_t length; + uint64_t tid; +}; + +#define ACM_EP_INFO_NAME 0x0001 +#define ACM_EP_INFO_ADDRESS_IP 0x0002 +#define ACM_EP_INFO_ADDRESS_IP6 0x0003 +#define ACM_EP_INFO_PATH 0x0010 + +union acm_ep_info { + uint8_t addr[ACM_MAX_ADDRESS]; + uint8_t name[ACM_MAX_ADDRESS]; + struct ibv_path_record path; +}; + +#define ACM_EP_FLAG_SOURCE (1<<0) +#define ACM_EP_FLAG_DEST (1<<1) + +struct acm_ep_addr_data { + uint32_t flags; + uint16_t type; + uint16_t reserved; + union acm_ep_info info; +}; + +/* + * Resolve messages with the opcode set to ACM_OP_RESOLVE are only + * used to communicate with the local ib_acm service. Message fields + * in this case are not byte swapped, but note that the acm_ep_info + * data is in network order. + */ +struct acm_resolve_msg { + struct acm_hdr hdr; + struct acm_ep_addr_data data[0]; +}; + +enum { + ACM_CNTR_ERROR, + ACM_CNTR_RESOLVE, + ACM_CNTR_NODATA, + ACM_CNTR_ADDR_QUERY, + ACM_CNTR_ADDR_CACHE, + ACM_CNTR_ROUTE_QUERY, + ACM_CNTR_ROUTE_CACHE, + ACM_MAX_COUNTER +}; + +/* + * Performance messages are sent/received in network byte order. + */ +struct acm_perf_msg { + struct acm_hdr hdr; + uint64_t data[0]; +}; + +/* + * Endpoint query messages are sent/received in network byte order. + */ +struct acm_ep_config_data { + uint64_t dev_guid; + uint8_t port_num; + uint8_t phys_port_cnt; + uint8_t rsvd[2]; + uint16_t pkey; + uint16_t addr_cnt; + uint8_t prov_name[ACM_MAX_PROV_NAME]; + union acm_ep_info addrs[0]; +}; + +struct acm_ep_query_msg { + struct acm_hdr hdr; + struct acm_ep_config_data data[0]; +}; + +struct acm_msg { + struct acm_hdr hdr; + union{ + uint8_t data[ACM_MSG_DATA_LENGTH]; + struct acm_ep_addr_data resolve_data[0]; + uint64_t perf_data[0]; + struct acm_ep_config_data ep_data[0]; + }; +}; + +#ifdef __cplusplus +} +#endif + +#endif /* ACM_H */ diff --git a/librdmacm/addrinfo.c b/librdmacm/addrinfo.c new file mode 100644 index 0000000..7e66065 --- /dev/null +++ b/librdmacm/addrinfo.c @@ -0,0 +1,319 @@ +/* + * Copyright (c) 2010-2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id: cm.c 3453 2005-09-15 21:43:21Z sean.hefty $ + */ + +#include <config.h> + +#include <sys/types.h> +#include <sys/socket.h> +#include <netdb.h> +#include <unistd.h> + +#include "cma.h" +#include <rdma/rdma_cma.h> +#include <infiniband/ib.h> + +static struct rdma_addrinfo nohints; + +static void ucma_convert_to_ai(struct addrinfo *ai, + const struct rdma_addrinfo *rai) +{ + memset(ai, 0, sizeof(*ai)); + if (rai->ai_flags & RAI_PASSIVE) + ai->ai_flags = AI_PASSIVE; + if (rai->ai_flags & RAI_NUMERICHOST) + ai->ai_flags |= AI_NUMERICHOST; + if (rai->ai_family != AF_IB) + ai->ai_family = rai->ai_family; + + switch (rai->ai_qp_type) { + case IBV_QPT_RC: + case IBV_QPT_UC: + case IBV_QPT_XRC_SEND: + case IBV_QPT_XRC_RECV: + ai->ai_socktype = SOCK_STREAM; + break; + case IBV_QPT_UD: + ai->ai_socktype = SOCK_DGRAM; + break; + } + + switch (rai->ai_port_space) { + case RDMA_PS_TCP: + ai->ai_protocol = IPPROTO_TCP; + break; + case RDMA_PS_IPOIB: + case RDMA_PS_UDP: + ai->ai_protocol = IPPROTO_UDP; + break; + case RDMA_PS_IB: + if (ai->ai_socktype == SOCK_STREAM) + ai->ai_protocol = IPPROTO_TCP; + else if (ai->ai_socktype == SOCK_DGRAM) + ai->ai_protocol = IPPROTO_UDP; + break; + } + + if (rai->ai_flags & RAI_PASSIVE) { + ai->ai_addrlen = rai->ai_src_len; + ai->ai_addr = rai->ai_src_addr; + } else { + ai->ai_addrlen = rai->ai_dst_len; + ai->ai_addr = rai->ai_dst_addr; + } + ai->ai_canonname = rai->ai_dst_canonname; + ai->ai_next = NULL; +} + +static int ucma_copy_addr(struct sockaddr **dst, socklen_t *dst_len, + struct sockaddr *src, socklen_t src_len) +{ + *dst = malloc(src_len); + if (!(*dst)) + return ERR(ENOMEM); + + memcpy(*dst, src, src_len); + *dst_len = src_len; + return 0; +} + +void ucma_set_sid(enum rdma_port_space ps, struct sockaddr *addr, + struct sockaddr_ib *sib) +{ + __be16 port; + + port = addr ? 
ucma_get_port(addr) : 0; + sib->sib_sid = htobe64(((uint64_t) ps << 16) + be16toh(port)); + + if (ps) + sib->sib_sid_mask = htobe64(RDMA_IB_IP_PS_MASK); + if (port) + sib->sib_sid_mask |= htobe64(RDMA_IB_IP_PORT_MASK); +} + +static int ucma_convert_in6(int ps, struct sockaddr_ib **dst, socklen_t *dst_len, + struct sockaddr_in6 *src, socklen_t src_len) +{ + *dst = calloc(1, sizeof(struct sockaddr_ib)); + if (!(*dst)) + return ERR(ENOMEM); + + (*dst)->sib_family = AF_IB; + (*dst)->sib_pkey = htobe16(0xFFFF); + (*dst)->sib_flowinfo = src->sin6_flowinfo; + ib_addr_set(&(*dst)->sib_addr, src->sin6_addr.s6_addr32[0], + src->sin6_addr.s6_addr32[1], src->sin6_addr.s6_addr32[2], + src->sin6_addr.s6_addr32[3]); + ucma_set_sid(ps, (struct sockaddr *) src, *dst); + (*dst)->sib_scope_id = src->sin6_scope_id; + + *dst_len = sizeof(struct sockaddr_ib); + return 0; +} + +static int ucma_convert_to_rai(struct rdma_addrinfo *rai, + const struct rdma_addrinfo *hints, + const struct addrinfo *ai) +{ + int ret; + + if (hints->ai_qp_type) { + rai->ai_qp_type = hints->ai_qp_type; + } else { + switch (ai->ai_socktype) { + case SOCK_STREAM: + rai->ai_qp_type = IBV_QPT_RC; + break; + case SOCK_DGRAM: + rai->ai_qp_type = IBV_QPT_UD; + break; + } + } + + if (hints->ai_port_space) { + rai->ai_port_space = hints->ai_port_space; + } else { + switch (ai->ai_protocol) { + case IPPROTO_TCP: + rai->ai_port_space = RDMA_PS_TCP; + break; + case IPPROTO_UDP: + rai->ai_port_space = RDMA_PS_UDP; + break; + } + } + + if (ai->ai_flags & AI_PASSIVE) { + rai->ai_flags = RAI_PASSIVE; + if (ai->ai_canonname) + rai->ai_src_canonname = strdup(ai->ai_canonname); + + if ((hints->ai_flags & RAI_FAMILY) && (hints->ai_family == AF_IB) && + (hints->ai_flags & RAI_NUMERICHOST)) { + rai->ai_family = AF_IB; + ret = ucma_convert_in6(rai->ai_port_space, + (struct sockaddr_ib **) &rai->ai_src_addr, + &rai->ai_src_len, + (struct sockaddr_in6 *) ai->ai_addr, + ai->ai_addrlen); + } else { + rai->ai_family = ai->ai_family; + ret = ucma_copy_addr(&rai->ai_src_addr, &rai->ai_src_len, + ai->ai_addr, ai->ai_addrlen); + } + } else { + if (ai->ai_canonname) + rai->ai_dst_canonname = strdup(ai->ai_canonname); + + if ((hints->ai_flags & RAI_FAMILY) && (hints->ai_family == AF_IB) && + (hints->ai_flags & RAI_NUMERICHOST)) { + rai->ai_family = AF_IB; + ret = ucma_convert_in6(rai->ai_port_space, + (struct sockaddr_ib **) &rai->ai_dst_addr, + &rai->ai_dst_len, + (struct sockaddr_in6 *) ai->ai_addr, + ai->ai_addrlen); + } else { + rai->ai_family = ai->ai_family; + ret = ucma_copy_addr(&rai->ai_dst_addr, &rai->ai_dst_len, + ai->ai_addr, ai->ai_addrlen); + } + } + return ret; +} + +static int ucma_getaddrinfo(const char *node, const char *service, + const struct rdma_addrinfo *hints, + struct rdma_addrinfo *rai) +{ + struct addrinfo ai_hints; + struct addrinfo *ai; + int ret; + + if (hints != &nohints) { + ucma_convert_to_ai(&ai_hints, hints); + ret = getaddrinfo(node, service, &ai_hints, &ai); + } else { + ret = getaddrinfo(node, service, NULL, &ai); + } + if (ret) + return ret; + + ret = ucma_convert_to_rai(rai, hints, ai); + freeaddrinfo(ai); + return ret; +} + +int rdma_getaddrinfo(const char *node, const char *service, + const struct rdma_addrinfo *hints, + struct rdma_addrinfo **res) +{ + struct rdma_addrinfo *rai; + int ret; + + if (!service && !node && !hints) + return ERR(EINVAL); + + ret = ucma_init(); + if (ret) + return ret; + + rai = calloc(1, sizeof(*rai)); + if (!rai) + return ERR(ENOMEM); + + if (!hints) + hints = &nohints; + + if (node || service) { + 
ret = ucma_getaddrinfo(node, service, hints, rai); + } else { + rai->ai_flags = hints->ai_flags; + rai->ai_family = hints->ai_family; + rai->ai_qp_type = hints->ai_qp_type; + rai->ai_port_space = hints->ai_port_space; + if (hints->ai_dst_len) { + ret = ucma_copy_addr(&rai->ai_dst_addr, &rai->ai_dst_len, + hints->ai_dst_addr, hints->ai_dst_len); + } + } + if (ret) + goto err; + + if (!rai->ai_src_len && hints->ai_src_len) { + ret = ucma_copy_addr(&rai->ai_src_addr, &rai->ai_src_len, + hints->ai_src_addr, hints->ai_src_len); + if (ret) + goto err; + } + + if (!(rai->ai_flags & RAI_PASSIVE)) + ucma_ib_resolve(&rai, hints); + + *res = rai; + return 0; + +err: + rdma_freeaddrinfo(rai); + return ret; +} + +void rdma_freeaddrinfo(struct rdma_addrinfo *res) +{ + struct rdma_addrinfo *rai; + + while (res) { + rai = res; + res = res->ai_next; + + if (rai->ai_connect) + free(rai->ai_connect); + + if (rai->ai_route) + free(rai->ai_route); + + if (rai->ai_src_canonname) + free(rai->ai_src_canonname); + + if (rai->ai_dst_canonname) + free(rai->ai_dst_canonname); + + if (rai->ai_src_addr) + free(rai->ai_src_addr); + + if (rai->ai_dst_addr) + free(rai->ai_dst_addr); + + free(rai); + } +} diff --git a/librdmacm/cma.c b/librdmacm/cma.c new file mode 100644 index 0000000..9855d0a --- /dev/null +++ b/librdmacm/cma.c @@ -0,0 +1,2563 @@ +/* + * Copyright (c) 2005-2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <config.h> + +#include <stdlib.h> +#include <string.h> +#include <glob.h> +#include <stdio.h> +#include <fcntl.h> +#include <errno.h> +#include <stdint.h> +#include <poll.h> +#include <unistd.h> +#include <pthread.h> +#include <endian.h> +#include <stddef.h> +#include <netdb.h> +#include <syslog.h> +#include <limits.h> +#include <sys/sysmacros.h> + +#include "cma.h" +#include "indexer.h" +#include <infiniband/driver.h> +#include <infiniband/marshall.h> +#include <rdma/rdma_cma.h> +#include <rdma/rdma_cma_abi.h> +#include <rdma/rdma_verbs.h> +#include <infiniband/ib.h> +#include <util/util.h> +#include <util/rdma_nl.h> + +#define CMA_INIT_CMD(req, req_size, op) \ +do { \ + memset(req, 0, req_size); \ + (req)->cmd = UCMA_CMD_##op; \ + (req)->in = req_size - sizeof(struct ucma_abi_cmd_hdr); \ +} while (0) + +#define CMA_INIT_CMD_RESP(req, req_size, op, resp, resp_size) \ +do { \ + CMA_INIT_CMD(req, req_size, op); \ + (req)->out = resp_size; \ + (req)->response = (uintptr_t) (resp); \ +} while (0) + +struct cma_port { + uint8_t link_layer; +}; + +struct cma_device { + struct ibv_context *verbs; + struct ibv_pd *pd; + struct ibv_xrcd *xrcd; + struct cma_port *port; + __be64 guid; + int port_cnt; + int refcnt; + int max_qpsize; + uint8_t max_initiator_depth; + uint8_t max_responder_resources; +}; + +struct cma_id_private { + struct rdma_cm_id id; + struct cma_device *cma_dev; + void *connect; + size_t connect_len; + int events_completed; + int connect_error; + int sync; + pthread_cond_t cond; + pthread_mutex_t mut; + uint32_t handle; + struct cma_multicast *mc_list; + struct ibv_qp_init_attr *qp_init_attr; + uint8_t initiator_depth; + uint8_t responder_resources; +}; + +struct cma_multicast { + struct cma_multicast *next; + struct cma_id_private *id_priv; + void *context; + int events_completed; + pthread_cond_t cond; + uint32_t handle; + union ibv_gid mgid; + uint16_t mlid; + uint16_t join_flags; + struct sockaddr_storage addr; +}; + +struct cma_event { + struct rdma_cm_event event; + uint8_t private_data[RDMA_MAX_PRIVATE_DATA]; + struct cma_id_private *id_priv; + struct cma_multicast *mc; +}; + +static struct cma_device *cma_dev_array; +static int cma_dev_cnt; +static int cma_init_cnt; +static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER; +static int abi_ver = -1; +static char dev_name[64] = "rdma_cm"; +static dev_t dev_cdev; +int af_ib_support; +static struct index_map ucma_idm; +static fastlock_t idm_lock; + +static int check_abi_version_nl_cb(struct nl_msg *msg, void *data) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + uint64_t cdev64; + int ret; + + ret = nlmsg_parse(nlmsg_hdr(msg), 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + rdmanl_policy); + if (ret < 0) + return ret; + if (!tb[RDMA_NLDEV_ATTR_CHARDEV] || !tb[RDMA_NLDEV_ATTR_CHARDEV_ABI] || + !tb[RDMA_NLDEV_ATTR_CHARDEV_NAME]) + return NLE_PARSE_ERR; + + /* Convert from huge_encode_dev to whatever glibc uses */ + cdev64 = nla_get_u64(tb[RDMA_NLDEV_ATTR_CHARDEV]); + dev_cdev = makedev((cdev64 & 0xfff00) >> 8, + (cdev64 & 0xff) | ((cdev64 >> 12) & 0xfff00)); + + if (!check_snprintf(dev_name, sizeof(dev_name), "%s", + nla_get_string(tb[RDMA_NLDEV_ATTR_CHARDEV_NAME]))) + return NLE_PARSE_ERR; + + /* + * The top 32 bits of CHARDEV_ABI are reserved for a future use, + * current kernels set them to 0 + */ + abi_ver = (uint32_t)nla_get_u64(tb[RDMA_NLDEV_ATTR_CHARDEV_ABI]); + + return 0; +} + +/* Ask the kernel for the uverbs char device information */ +static int check_abi_version_nl(void) +{ + struct nl_sock *nl; + + nl = 
rdmanl_socket_alloc(); + if (!nl) + return -1; + if (rdmanl_get_chardev(nl, -1, "rdma_cm", check_abi_version_nl_cb, + NULL)) + goto err_socket; + if (abi_ver == -1) + goto err_socket; + nl_socket_free(nl); + return 0; + +err_socket: + nl_socket_free(nl); + return -1; +} + +static void check_abi_version_sysfs(void) +{ + char value[8]; + + if ((ibv_read_sysfs_file(ibv_get_sysfs_path(), + "class/misc/rdma_cm/abi_version", + value, sizeof value) < 0) && + (ibv_read_sysfs_file(ibv_get_sysfs_path(), + "class/infiniband_ucma/abi_version", + value, sizeof value) < 0)) { + /* + * Older version of Linux do not have class/misc. To support + * backports, assume the most recent version of the ABI. If + * we're wrong, we'll simply fail later when calling the ABI. + */ + abi_ver = RDMA_USER_CM_MAX_ABI_VERSION; + return; + } + abi_ver = strtol(value, NULL, 10); + dev_cdev = 0; +} + +static int check_abi_version(void) +{ + if (abi_ver == -1) { + if (check_abi_version_nl()) + check_abi_version_sysfs(); + } + + if (abi_ver < RDMA_USER_CM_MIN_ABI_VERSION || + abi_ver > RDMA_USER_CM_MAX_ABI_VERSION) + return -1; + return 0; +} + +/* + * This function is called holding the mutex lock + * cma_dev_cnt must be set before calling this function to + * ensure that the lock is not acquired recursively. + */ +static void ucma_set_af_ib_support(void) +{ + struct rdma_cm_id *id; + struct sockaddr_ib sib; + int ret; + + ret = rdma_create_id(NULL, &id, NULL, RDMA_PS_IB); + if (ret) + return; + + memset(&sib, 0, sizeof sib); + sib.sib_family = AF_IB; + sib.sib_sid = htobe64(RDMA_IB_IP_PS_TCP); + sib.sib_sid_mask = htobe64(RDMA_IB_IP_PS_MASK); + af_ib_support = 1; + ret = rdma_bind_addr(id, (struct sockaddr *) &sib); + af_ib_support = !ret; + + rdma_destroy_id(id); +} + +int ucma_init(void) +{ + struct ibv_device **dev_list = NULL; + int i, ret, dev_cnt; + + /* Quick check without lock to see if we're already initialized */ + if (cma_dev_cnt) + return 0; + + pthread_mutex_lock(&mut); + if (cma_dev_cnt) { + pthread_mutex_unlock(&mut); + return 0; + } + + fastlock_init(&idm_lock); + ret = check_abi_version(); + if (ret) { + ret = ERR(EPERM); + goto err1; + } + + dev_list = ibv_get_device_list(&dev_cnt); + if (!dev_list) { + ret = ERR(ENODEV); + goto err1; + } + + if (!dev_cnt) { + ret = ERR(ENODEV); + goto err2; + } + + cma_dev_array = calloc(dev_cnt, sizeof(*cma_dev_array)); + if (!cma_dev_array) { + ret = ERR(ENOMEM); + goto err2; + } + + for (i = 0; dev_list[i]; i++) + cma_dev_array[i].guid = ibv_get_device_guid(dev_list[i]); + + cma_dev_cnt = dev_cnt; + ucma_set_af_ib_support(); + pthread_mutex_unlock(&mut); + ibv_free_device_list(dev_list); + return 0; + +err2: + ibv_free_device_list(dev_list); +err1: + fastlock_destroy(&idm_lock); + pthread_mutex_unlock(&mut); + return ret; +} + +static struct ibv_context *ucma_open_device(__be64 guid) +{ + struct ibv_device **dev_list; + struct ibv_context *verbs = NULL; + int i; + + dev_list = ibv_get_device_list(NULL); + if (!dev_list) { + return NULL; + } + + for (i = 0; dev_list[i]; i++) { + if (ibv_get_device_guid(dev_list[i]) == guid) { + verbs = ibv_open_device(dev_list[i]); + break; + } + } + + ibv_free_device_list(dev_list); + return verbs; +} + +static int ucma_init_device(struct cma_device *cma_dev) +{ + struct ibv_port_attr port_attr; + struct ibv_device_attr attr; + int i, ret; + + if (cma_dev->verbs) + return 0; + + cma_dev->verbs = ucma_open_device(cma_dev->guid); + if (!cma_dev->verbs) + return ERR(ENODEV); + + ret = ibv_query_device(cma_dev->verbs, &attr); + if (ret) { 
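+ /* Propagate the errno-style failure from ibv_query_device()
+ * and unwind the partially initialized device below. */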
+ ret = ERR(ret); + goto err; + } + + cma_dev->port = malloc(sizeof(*cma_dev->port) * attr.phys_port_cnt); + if (!cma_dev->port) { + ret = ERR(ENOMEM); + goto err; + } + + for (i = 1; i <= attr.phys_port_cnt; i++) { + if (ibv_query_port(cma_dev->verbs, i, &port_attr)) + cma_dev->port[i - 1].link_layer = IBV_LINK_LAYER_UNSPECIFIED; + else + cma_dev->port[i - 1].link_layer = port_attr.link_layer; + } + + cma_dev->port_cnt = attr.phys_port_cnt; + cma_dev->max_qpsize = attr.max_qp_wr; + cma_dev->max_initiator_depth = (uint8_t) attr.max_qp_init_rd_atom; + cma_dev->max_responder_resources = (uint8_t) attr.max_qp_rd_atom; + cma_init_cnt++; + return 0; + +err: + ibv_close_device(cma_dev->verbs); + cma_dev->verbs = NULL; + return ret; +} + +static int ucma_init_all(void) +{ + int i, ret = 0; + + if (!cma_dev_cnt) { + ret = ucma_init(); + if (ret) + return ret; + } + + if (cma_init_cnt == cma_dev_cnt) + return 0; + + pthread_mutex_lock(&mut); + for (i = 0; i < cma_dev_cnt; i++) { + ret = ucma_init_device(&cma_dev_array[i]); + if (ret) + break; + } + pthread_mutex_unlock(&mut); + return ret; +} + +struct ibv_context **rdma_get_devices(int *num_devices) +{ + struct ibv_context **devs = NULL; + int i; + + if (ucma_init_all()) + goto out; + + devs = malloc(sizeof(*devs) * (cma_dev_cnt + 1)); + if (!devs) + goto out; + + for (i = 0; i < cma_dev_cnt; i++) + devs[i] = cma_dev_array[i].verbs; + devs[i] = NULL; +out: + if (num_devices) + *num_devices = devs ? cma_dev_cnt : 0; + return devs; +} + +void rdma_free_devices(struct ibv_context **list) +{ + free(list); +} + +struct rdma_event_channel *rdma_create_event_channel(void) +{ + struct rdma_event_channel *channel; + + if (ucma_init()) + return NULL; + + channel = malloc(sizeof(*channel)); + if (!channel) + return NULL; + + channel->fd = open_cdev(dev_name, dev_cdev); + if (channel->fd < 0) { + goto err; + } + return channel; +err: + free(channel); + return NULL; +} + +void rdma_destroy_event_channel(struct rdma_event_channel *channel) +{ + close(channel->fd); + free(channel); +} + +static int ucma_get_device(struct cma_id_private *id_priv, __be64 guid) +{ + struct cma_device *cma_dev; + int i, ret; + + for (i = 0; i < cma_dev_cnt; i++) { + cma_dev = &cma_dev_array[i]; + if (cma_dev->guid == guid) + goto match; + } + + return ERR(ENODEV); +match: + pthread_mutex_lock(&mut); + if ((ret = ucma_init_device(cma_dev))) + goto out; + + if (!cma_dev->refcnt++) { + cma_dev->pd = ibv_alloc_pd(cma_dev->verbs); + if (!cma_dev->pd) { + cma_dev->refcnt--; + ret = ERR(ENOMEM); + goto out; + } + } + id_priv->cma_dev = cma_dev; + id_priv->id.verbs = cma_dev->verbs; + id_priv->id.pd = cma_dev->pd; +out: + pthread_mutex_unlock(&mut); + return ret; +} + +static void ucma_put_device(struct cma_device *cma_dev) +{ + pthread_mutex_lock(&mut); + if (!--cma_dev->refcnt) { + ibv_dealloc_pd(cma_dev->pd); + if (cma_dev->xrcd) + ibv_close_xrcd(cma_dev->xrcd); + } + pthread_mutex_unlock(&mut); +} + +static struct ibv_xrcd *ucma_get_xrcd(struct cma_device *cma_dev) +{ + struct ibv_xrcd_init_attr attr; + + pthread_mutex_lock(&mut); + if (!cma_dev->xrcd) { + memset(&attr, 0, sizeof attr); + attr.comp_mask = IBV_XRCD_INIT_ATTR_FD | IBV_XRCD_INIT_ATTR_OFLAGS; + attr.fd = -1; + attr.oflags = O_CREAT; + cma_dev->xrcd = ibv_open_xrcd(cma_dev->verbs, &attr); + } + pthread_mutex_unlock(&mut); + return cma_dev->xrcd; +} + +static void ucma_insert_id(struct cma_id_private *id_priv) +{ + fastlock_acquire(&idm_lock); + idm_set(&ucma_idm, id_priv->handle, id_priv); + fastlock_release(&idm_lock); +} + 
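+/*
+ * The kernel identifies every cm_id by a u32 handle; the idm index map
+ * maintained by ucma_insert_id()/ucma_remove_id() lets ucma_lookup_id()
+ * translate such a handle back into its cma_id_private.
+ */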
+static void ucma_remove_id(struct cma_id_private *id_priv) +{ + if (id_priv->handle <= IDX_MAX_INDEX) + idm_clear(&ucma_idm, id_priv->handle); +} + +static struct cma_id_private *ucma_lookup_id(int handle) +{ + return idm_lookup(&ucma_idm, handle); +} + +static void ucma_free_id(struct cma_id_private *id_priv) +{ + ucma_remove_id(id_priv); + if (id_priv->cma_dev) + ucma_put_device(id_priv->cma_dev); + pthread_cond_destroy(&id_priv->cond); + pthread_mutex_destroy(&id_priv->mut); + if (id_priv->id.route.path_rec) + free(id_priv->id.route.path_rec); + + if (id_priv->sync) + rdma_destroy_event_channel(id_priv->id.channel); + if (id_priv->connect_len) + free(id_priv->connect); + free(id_priv); +} + +static struct cma_id_private *ucma_alloc_id(struct rdma_event_channel *channel, + void *context, + enum rdma_port_space ps, + enum ibv_qp_type qp_type) +{ + struct cma_id_private *id_priv; + + id_priv = calloc(1, sizeof(*id_priv)); + if (!id_priv) + return NULL; + + id_priv->id.context = context; + id_priv->id.ps = ps; + id_priv->id.qp_type = qp_type; + id_priv->handle = 0xFFFFFFFF; + + if (!channel) { + id_priv->id.channel = rdma_create_event_channel(); + if (!id_priv->id.channel) + goto err; + id_priv->sync = 1; + } else { + id_priv->id.channel = channel; + } + + pthread_mutex_init(&id_priv->mut, NULL); + if (pthread_cond_init(&id_priv->cond, NULL)) + goto err; + + return id_priv; + +err: ucma_free_id(id_priv); + return NULL; +} + +static int rdma_create_id2(struct rdma_event_channel *channel, + struct rdma_cm_id **id, void *context, + enum rdma_port_space ps, enum ibv_qp_type qp_type) +{ + struct ucma_abi_create_id_resp resp; + struct ucma_abi_create_id cmd; + struct cma_id_private *id_priv; + int ret; + + ret = ucma_init(); + if (ret) + return ret; + + id_priv = ucma_alloc_id(channel, context, ps, qp_type); + if (!id_priv) + return ERR(ENOMEM); + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, CREATE_ID, &resp, sizeof resp); + cmd.uid = (uintptr_t) id_priv; + cmd.ps = ps; + cmd.qp_type = qp_type; + + ret = write(id_priv->id.channel->fd, &cmd, sizeof cmd); + if (ret != sizeof(cmd)) { + ret = (ret >= 0) ? ERR(ENODATA) : -1; + goto err; + } + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + + id_priv->handle = resp.id; + ucma_insert_id(id_priv); + *id = &id_priv->id; + return 0; + +err: ucma_free_id(id_priv); + return ret; +} + +int rdma_create_id(struct rdma_event_channel *channel, + struct rdma_cm_id **id, void *context, + enum rdma_port_space ps) +{ + enum ibv_qp_type qp_type; + + qp_type = (ps == RDMA_PS_IPOIB || ps == RDMA_PS_UDP) ? + IBV_QPT_UD : IBV_QPT_RC; + return rdma_create_id2(channel, id, context, ps, qp_type); +} + +static int ucma_destroy_kern_id(int fd, uint32_t handle) +{ + struct ucma_abi_destroy_id_resp resp; + struct ucma_abi_destroy_id cmd; + int ret; + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, DESTROY_ID, &resp, sizeof resp); + cmd.id = handle; + + ret = write(fd, &cmd, sizeof cmd); + if (ret != sizeof cmd) + return (ret >= 0) ? 
ERR(ENODATA) : -1; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + + return resp.events_reported; +} + +int rdma_destroy_id(struct rdma_cm_id *id) +{ + struct cma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct cma_id_private, id); + ret = ucma_destroy_kern_id(id->channel->fd, id_priv->handle); + if (ret < 0) + return ret; + + if (id_priv->id.event) + rdma_ack_cm_event(id_priv->id.event); + + pthread_mutex_lock(&id_priv->mut); + while (id_priv->events_completed < ret) + pthread_cond_wait(&id_priv->cond, &id_priv->mut); + pthread_mutex_unlock(&id_priv->mut); + + ucma_free_id(id_priv); + return 0; +} + +int ucma_addrlen(struct sockaddr *addr) +{ + if (!addr) + return 0; + + switch (addr->sa_family) { + case PF_INET: + return sizeof(struct sockaddr_in); + case PF_INET6: + return sizeof(struct sockaddr_in6); + case PF_IB: + return af_ib_support ? sizeof(struct sockaddr_ib) : 0; + default: + return 0; + } +} + +static int ucma_query_addr(struct rdma_cm_id *id) +{ + struct ucma_abi_query_addr_resp resp; + struct ucma_abi_query cmd; + struct cma_id_private *id_priv; + int ret; + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, &resp, sizeof resp); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + cmd.option = UCMA_QUERY_ADDR; + + ret = write(id->channel->fd, &cmd, sizeof cmd); + if (ret != sizeof cmd) + return (ret >= 0) ? ERR(ENODATA) : -1; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + + memcpy(&id->route.addr.src_addr, &resp.src_addr, resp.src_size); + memcpy(&id->route.addr.dst_addr, &resp.dst_addr, resp.dst_size); + + if (!id_priv->cma_dev && resp.node_guid) { + ret = ucma_get_device(id_priv, resp.node_guid); + if (ret) + return ret; + id->port_num = resp.port_num; + id->route.addr.addr.ibaddr.pkey = resp.pkey; + } + + return 0; +} + +static int ucma_query_gid(struct rdma_cm_id *id) +{ + struct ucma_abi_query_addr_resp resp; + struct ucma_abi_query cmd; + struct cma_id_private *id_priv; + struct sockaddr_ib *sib; + int ret; + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, &resp, sizeof resp); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + cmd.option = UCMA_QUERY_GID; + + ret = write(id->channel->fd, &cmd, sizeof cmd); + if (ret != sizeof cmd) + return (ret >= 0) ? 
ERR(ENODATA) : -1; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + + sib = (struct sockaddr_ib *) &resp.src_addr; + memcpy(id->route.addr.addr.ibaddr.sgid.raw, sib->sib_addr.sib_raw, + sizeof id->route.addr.addr.ibaddr.sgid); + + sib = (struct sockaddr_ib *) &resp.dst_addr; + memcpy(id->route.addr.addr.ibaddr.dgid.raw, sib->sib_addr.sib_raw, + sizeof id->route.addr.addr.ibaddr.dgid); + + return 0; +} + +static void ucma_convert_path(struct ibv_path_data *path_data, + struct ibv_sa_path_rec *sa_path) +{ + uint32_t fl_hop; + + sa_path->dgid = path_data->path.dgid; + sa_path->sgid = path_data->path.sgid; + sa_path->dlid = path_data->path.dlid; + sa_path->slid = path_data->path.slid; + sa_path->raw_traffic = 0; + + fl_hop = be32toh(path_data->path.flowlabel_hoplimit); + sa_path->flow_label = htobe32(fl_hop >> 8); + sa_path->hop_limit = (uint8_t) fl_hop; + + sa_path->traffic_class = path_data->path.tclass; + sa_path->reversible = path_data->path.reversible_numpath >> 7; + sa_path->numb_path = 1; + sa_path->pkey = path_data->path.pkey; + sa_path->sl = be16toh(path_data->path.qosclass_sl) & 0xF; + sa_path->mtu_selector = 2; /* exactly */ + sa_path->mtu = path_data->path.mtu & 0x1F; + sa_path->rate_selector = 2; + sa_path->rate = path_data->path.rate & 0x1F; + sa_path->packet_life_time_selector = 2; + sa_path->packet_life_time = path_data->path.packetlifetime & 0x1F; + + sa_path->preference = (uint8_t) path_data->flags; +} + +static int ucma_query_path(struct rdma_cm_id *id) +{ + struct ucma_abi_query_path_resp *resp; + struct ucma_abi_query cmd; + struct cma_id_private *id_priv; + int ret, i, size; + + size = sizeof(*resp) + sizeof(struct ibv_path_data) * 6; + resp = alloca(size); + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, resp, size); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + cmd.option = UCMA_QUERY_PATH; + + ret = write(id->channel->fd, &cmd, sizeof cmd); + if (ret != sizeof cmd) + return (ret >= 0) ? ERR(ENODATA) : -1; + + VALGRIND_MAKE_MEM_DEFINED(resp, size); + + if (resp->num_paths) { + id->route.path_rec = malloc(sizeof(*id->route.path_rec) * + resp->num_paths); + if (!id->route.path_rec) + return ERR(ENOMEM); + + id->route.num_paths = resp->num_paths; + for (i = 0; i < resp->num_paths; i++) + ucma_convert_path(&resp->path_data[i], &id->route.path_rec[i]); + } + + return 0; +} + +static int ucma_query_route(struct rdma_cm_id *id) +{ + struct ucma_abi_query_route_resp resp; + struct ucma_abi_query cmd; + struct cma_id_private *id_priv; + int ret, i; + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY_ROUTE, &resp, sizeof resp); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + + ret = write(id->channel->fd, &cmd, sizeof cmd); + if (ret != sizeof cmd) + return (ret >= 0) ? 
ERR(ENODATA) : -1; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + + if (resp.num_paths) { + id->route.path_rec = malloc(sizeof(*id->route.path_rec) * + resp.num_paths); + if (!id->route.path_rec) + return ERR(ENOMEM); + + id->route.num_paths = resp.num_paths; + for (i = 0; i < resp.num_paths; i++) + ibv_copy_path_rec_from_kern(&id->route.path_rec[i], + &resp.ib_route[i]); + } + + memcpy(id->route.addr.addr.ibaddr.sgid.raw, resp.ib_route[0].sgid, + sizeof id->route.addr.addr.ibaddr.sgid); + memcpy(id->route.addr.addr.ibaddr.dgid.raw, resp.ib_route[0].dgid, + sizeof id->route.addr.addr.ibaddr.dgid); + id->route.addr.addr.ibaddr.pkey = resp.ib_route[0].pkey; + memcpy(&id->route.addr.src_addr, &resp.src_addr, + sizeof resp.src_addr); + memcpy(&id->route.addr.dst_addr, &resp.dst_addr, + sizeof resp.dst_addr); + + if (!id_priv->cma_dev && resp.node_guid) { + ret = ucma_get_device(id_priv, resp.node_guid); + if (ret) + return ret; + id_priv->id.port_num = resp.port_num; + } + + return 0; +} + +static int rdma_bind_addr2(struct rdma_cm_id *id, struct sockaddr *addr, + socklen_t addrlen) +{ + struct ucma_abi_bind cmd; + struct cma_id_private *id_priv; + int ret; + + CMA_INIT_CMD(&cmd, sizeof cmd, BIND); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + cmd.addr_size = addrlen; + memcpy(&cmd.addr, addr, addrlen); + + ret = write(id->channel->fd, &cmd, sizeof cmd); + if (ret != sizeof cmd) + return (ret >= 0) ? ERR(ENODATA) : -1; + + ret = ucma_query_addr(id); + if (!ret) + ret = ucma_query_gid(id); + return ret; +} + +int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) +{ + struct ucma_abi_bind_ip cmd; + struct cma_id_private *id_priv; + int ret, addrlen; + + addrlen = ucma_addrlen(addr); + if (!addrlen) + return ERR(EINVAL); + + if (af_ib_support) + return rdma_bind_addr2(id, addr, addrlen); + + CMA_INIT_CMD(&cmd, sizeof cmd, BIND_IP); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + memcpy(&cmd.addr, addr, addrlen); + + ret = write(id->channel->fd, &cmd, sizeof cmd); + if (ret != sizeof cmd) + return (ret >= 0) ? ERR(ENODATA) : -1; + + return ucma_query_route(id); +} + +int ucma_complete(struct rdma_cm_id *id) +{ + struct cma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct cma_id_private, id); + if (!id_priv->sync) + return 0; + + if (id_priv->id.event) { + rdma_ack_cm_event(id_priv->id.event); + id_priv->id.event = NULL; + } + + ret = rdma_get_cm_event(id_priv->id.channel, &id_priv->id.event); + if (ret) + return ret; + + if (id_priv->id.event->status) { + if (id_priv->id.event->event == RDMA_CM_EVENT_REJECTED) + ret = ERR(ECONNREFUSED); + else if (id_priv->id.event->status < 0) + ret = ERR(-id_priv->id.event->status); + else + ret = ERR(id_priv->id.event->status); + } + return ret; +} + +static int rdma_resolve_addr2(struct rdma_cm_id *id, struct sockaddr *src_addr, + socklen_t src_len, struct sockaddr *dst_addr, + socklen_t dst_len, int timeout_ms) +{ + struct ucma_abi_resolve_addr cmd; + struct cma_id_private *id_priv; + int ret; + + CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_ADDR); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + cmd.src_size = src_len; + memcpy(&cmd.src_addr, src_addr, src_len); + memcpy(&cmd.dst_addr, dst_addr, dst_len); + cmd.dst_size = dst_len; + cmd.timeout_ms = timeout_ms; + + ret = write(id->channel->fd, &cmd, sizeof cmd); + if (ret != sizeof cmd) + return (ret >= 0) ? 
ERR(ENODATA) : -1; + + memcpy(&id->route.addr.dst_addr, dst_addr, dst_len); + return ucma_complete(id); +} + +int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + struct sockaddr *dst_addr, int timeout_ms) +{ + struct ucma_abi_resolve_ip cmd; + struct cma_id_private *id_priv; + int ret, dst_len, src_len; + + dst_len = ucma_addrlen(dst_addr); + if (!dst_len) + return ERR(EINVAL); + + src_len = ucma_addrlen(src_addr); + if (src_addr && !src_len) + return ERR(EINVAL); + + if (af_ib_support) + return rdma_resolve_addr2(id, src_addr, src_len, dst_addr, + dst_len, timeout_ms); + + CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_IP); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + if (src_addr) + memcpy(&cmd.src_addr, src_addr, src_len); + memcpy(&cmd.dst_addr, dst_addr, dst_len); + cmd.timeout_ms = timeout_ms; + + ret = write(id->channel->fd, &cmd, sizeof cmd); + if (ret != sizeof cmd) + return (ret >= 0) ? ERR(ENODATA) : -1; + + memcpy(&id->route.addr.dst_storage, dst_addr, dst_len); + return ucma_complete(id); +} + +static int ucma_set_ib_route(struct rdma_cm_id *id) +{ + struct rdma_addrinfo hint, *rai; + int ret; + + memset(&hint, 0, sizeof hint); + hint.ai_flags = RAI_ROUTEONLY; + hint.ai_family = id->route.addr.src_addr.sa_family; + hint.ai_src_len = ucma_addrlen((struct sockaddr *) &id->route.addr.src_addr); + hint.ai_src_addr = &id->route.addr.src_addr; + hint.ai_dst_len = ucma_addrlen((struct sockaddr *) &id->route.addr.dst_addr); + hint.ai_dst_addr = &id->route.addr.dst_addr; + + ret = rdma_getaddrinfo(NULL, NULL, &hint, &rai); + if (ret) + return ret; + + if (rai->ai_route_len) + ret = rdma_set_option(id, RDMA_OPTION_IB, RDMA_OPTION_IB_PATH, + rai->ai_route, rai->ai_route_len); + else + ret = -1; + + rdma_freeaddrinfo(rai); + return ret; +} + +int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) +{ + struct ucma_abi_resolve_route cmd; + struct cma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct cma_id_private, id); + if (id->verbs->device->transport_type == IBV_TRANSPORT_IB) { + ret = ucma_set_ib_route(id); + if (!ret) + goto out; + } + + CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_ROUTE); + cmd.id = id_priv->handle; + cmd.timeout_ms = timeout_ms; + + ret = write(id->channel->fd, &cmd, sizeof cmd); + if (ret != sizeof cmd) + return (ret >= 0) ? ERR(ENODATA) : -1; + +out: + return ucma_complete(id); +} + +static int ucma_is_ud_qp(enum ibv_qp_type qp_type) +{ + return (qp_type == IBV_QPT_UD); +} + +int rdma_init_qp_attr(struct rdma_cm_id *id, struct ibv_qp_attr *qp_attr, + int *qp_attr_mask) +{ + struct ucma_abi_init_qp_attr cmd; + struct ib_uverbs_qp_attr resp; + struct cma_id_private *id_priv; + int ret; + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, INIT_QP_ATTR, &resp, sizeof resp); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + cmd.qp_state = qp_attr->qp_state; + + ret = write(id->channel->fd, &cmd, sizeof cmd); + if (ret != sizeof cmd) + return (ret >= 0) ? ERR(ENODATA) : -1; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + + ibv_copy_qp_attr_from_kern(qp_attr, &resp); + *qp_attr_mask = resp.qp_attr_mask; + return 0; +} + +static int ucma_modify_qp_rtr(struct rdma_cm_id *id, uint8_t resp_res) +{ + struct cma_id_private *id_priv; + struct ibv_qp_attr qp_attr; + int qp_attr_mask, ret; + uint8_t link_layer; + + if (!id->qp) + return 0; + + /* Need to update QP attributes from default values. 
*/ + qp_attr.qp_state = IBV_QPS_INIT; + ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + ret = ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask); + if (ret) + return ERR(ret); + + qp_attr.qp_state = IBV_QPS_RTR; + ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + /* + * Workaround for rdma_ucm kernel bug: + * mask off qp_attr_mask bits 21-24 which are used for RoCE + */ + id_priv = container_of(id, struct cma_id_private, id); + link_layer = id_priv->cma_dev->port[id->port_num - 1].link_layer; + + if (link_layer == IBV_LINK_LAYER_INFINIBAND) + qp_attr_mask &= UINT_MAX ^ 0xe00000; + + if (resp_res != RDMA_MAX_RESP_RES) + qp_attr.max_dest_rd_atomic = resp_res; + return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask)); +} + +static int ucma_modify_qp_rts(struct rdma_cm_id *id, uint8_t init_depth) +{ + struct ibv_qp_attr qp_attr; + int qp_attr_mask, ret; + + if (!id->qp) + return 0; + + qp_attr.qp_state = IBV_QPS_RTS; + ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + if (init_depth != RDMA_MAX_INIT_DEPTH) + qp_attr.max_rd_atomic = init_depth; + return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask)); +} + +static int ucma_modify_qp_sqd(struct rdma_cm_id *id) +{ + struct ibv_qp_attr qp_attr; + + if (!id->qp) + return 0; + + qp_attr.qp_state = IBV_QPS_SQD; + return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, IBV_QP_STATE)); +} + +static int ucma_modify_qp_err(struct rdma_cm_id *id) +{ + struct ibv_qp_attr qp_attr; + + if (!id->qp) + return 0; + + qp_attr.qp_state = IBV_QPS_ERR; + return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, IBV_QP_STATE)); +} + +static int ucma_init_conn_qp3(struct cma_id_private *id_priv, struct ibv_qp *qp) +{ + struct ibv_qp_attr qp_attr; + int ret; + + ret = ibv_get_pkey_index(id_priv->cma_dev->verbs, id_priv->id.port_num, + id_priv->id.route.addr.addr.ibaddr.pkey); + if (ret < 0) + return ERR(EINVAL); + + qp_attr.pkey_index = ret; + qp_attr.port_num = id_priv->id.port_num; + qp_attr.qp_state = IBV_QPS_INIT; + qp_attr.qp_access_flags = 0; + + ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_ACCESS_FLAGS | + IBV_QP_PKEY_INDEX | IBV_QP_PORT); + return rdma_seterrno(ret); +} + +static int ucma_init_conn_qp(struct cma_id_private *id_priv, struct ibv_qp *qp) +{ + struct ibv_qp_attr qp_attr; + int qp_attr_mask, ret; + + if (abi_ver == 3) + return ucma_init_conn_qp3(id_priv, qp); + + qp_attr.qp_state = IBV_QPS_INIT; + ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + return rdma_seterrno(ibv_modify_qp(qp, &qp_attr, qp_attr_mask)); +} + +static int ucma_init_ud_qp3(struct cma_id_private *id_priv, struct ibv_qp *qp) +{ + struct ibv_qp_attr qp_attr; + int ret; + + ret = ibv_get_pkey_index(id_priv->cma_dev->verbs, id_priv->id.port_num, + id_priv->id.route.addr.addr.ibaddr.pkey); + if (ret < 0) + return ERR(EINVAL); + + qp_attr.pkey_index = ret; + qp_attr.port_num = id_priv->id.port_num; + qp_attr.qp_state = IBV_QPS_INIT; + qp_attr.qkey = RDMA_UDP_QKEY; + + ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_QKEY | + IBV_QP_PKEY_INDEX | IBV_QP_PORT); + if (ret) + return ERR(ret); + + qp_attr.qp_state = IBV_QPS_RTR; + ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE); + if (ret) + return ERR(ret); + + qp_attr.qp_state = IBV_QPS_RTS; + qp_attr.sq_psn = 0; + ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_SQ_PSN); + return rdma_seterrno(ret); +} + +static int ucma_init_ud_qp(struct cma_id_private 
*id_priv, struct ibv_qp *qp) +{ + struct ibv_qp_attr qp_attr; + int qp_attr_mask, ret; + + if (abi_ver == 3) + return ucma_init_ud_qp3(id_priv, qp); + + qp_attr.qp_state = IBV_QPS_INIT; + ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + ret = ibv_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) + return ERR(ret); + + qp_attr.qp_state = IBV_QPS_RTR; + ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE); + if (ret) + return ERR(ret); + + qp_attr.qp_state = IBV_QPS_RTS; + qp_attr.sq_psn = 0; + ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_SQ_PSN); + return rdma_seterrno(ret); +} + +static void ucma_destroy_cqs(struct rdma_cm_id *id) +{ + if (id->qp_type == IBV_QPT_XRC_RECV && id->srq) + return; + + if (id->recv_cq) { + ibv_destroy_cq(id->recv_cq); + if (id->send_cq && (id->send_cq != id->recv_cq)) { + ibv_destroy_cq(id->send_cq); + id->send_cq = NULL; + } + id->recv_cq = NULL; + } + + if (id->recv_cq_channel) { + ibv_destroy_comp_channel(id->recv_cq_channel); + if (id->send_cq_channel && (id->send_cq_channel != id->recv_cq_channel)) { + ibv_destroy_comp_channel(id->send_cq_channel); + id->send_cq_channel = NULL; + } + id->recv_cq_channel = NULL; + } +} + +static int ucma_create_cqs(struct rdma_cm_id *id, uint32_t send_size, uint32_t recv_size) +{ + if (recv_size) { + id->recv_cq_channel = ibv_create_comp_channel(id->verbs); + if (!id->recv_cq_channel) + goto err; + + id->recv_cq = ibv_create_cq(id->verbs, recv_size, + id, id->recv_cq_channel, 0); + if (!id->recv_cq) + goto err; + } + + if (send_size) { + id->send_cq_channel = ibv_create_comp_channel(id->verbs); + if (!id->send_cq_channel) + goto err; + + id->send_cq = ibv_create_cq(id->verbs, send_size, + id, id->send_cq_channel, 0); + if (!id->send_cq) + goto err; + } + + return 0; +err: + ucma_destroy_cqs(id); + return ERR(ENOMEM); +} + +int rdma_create_srq_ex(struct rdma_cm_id *id, struct ibv_srq_init_attr_ex *attr) +{ + struct cma_id_private *id_priv; + struct ibv_srq *srq; + int ret; + + id_priv = container_of(id, struct cma_id_private, id); + if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_TYPE)) + return ERR(EINVAL); + + if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_PD) || !attr->pd) { + attr->pd = id->pd; + attr->comp_mask |= IBV_SRQ_INIT_ATTR_PD; + } + + if (attr->srq_type == IBV_SRQT_XRC) { + if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_XRCD) || !attr->xrcd) { + attr->xrcd = ucma_get_xrcd(id_priv->cma_dev); + if (!attr->xrcd) + return -1; + } + if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_CQ) || !attr->cq) { + ret = ucma_create_cqs(id, 0, attr->attr.max_wr); + if (ret) + return ret; + attr->cq = id->recv_cq; + } + attr->comp_mask |= IBV_SRQ_INIT_ATTR_XRCD | IBV_SRQ_INIT_ATTR_CQ; + } + + srq = ibv_create_srq_ex(id->verbs, attr); + if (!srq) { + ret = -1; + goto err; + } + + if (!id->pd) + id->pd = attr->pd; + id->srq = srq; + return 0; +err: + ucma_destroy_cqs(id); + return ret; +} + +int rdma_create_srq(struct rdma_cm_id *id, struct ibv_pd *pd, + struct ibv_srq_init_attr *attr) +{ + struct ibv_srq_init_attr_ex attr_ex; + int ret; + + memcpy(&attr_ex, attr, sizeof(*attr)); + attr_ex.comp_mask = IBV_SRQ_INIT_ATTR_TYPE | IBV_SRQ_INIT_ATTR_PD; + if (id->qp_type == IBV_QPT_XRC_RECV) { + attr_ex.srq_type = IBV_SRQT_XRC; + } else { + attr_ex.srq_type = IBV_SRQT_BASIC; + } + attr_ex.pd = pd; + ret = rdma_create_srq_ex(id, &attr_ex); + memcpy(attr, &attr_ex, sizeof(*attr)); + return ret; +} + +void rdma_destroy_srq(struct rdma_cm_id *id) +{ + ibv_destroy_srq(id->srq); + id->srq = NULL; + 
ucma_destroy_cqs(id); +} + +int rdma_create_qp_ex(struct rdma_cm_id *id, + struct ibv_qp_init_attr_ex *attr) +{ + struct cma_id_private *id_priv; + struct ibv_qp *qp; + int ret; + + if (id->qp) + return ERR(EINVAL); + + id_priv = container_of(id, struct cma_id_private, id); + if (!(attr->comp_mask & IBV_QP_INIT_ATTR_PD) || !attr->pd) { + attr->comp_mask |= IBV_QP_INIT_ATTR_PD; + attr->pd = id->pd; + } else if (id->verbs != attr->pd->context) + return ERR(EINVAL); + + if ((id->recv_cq && attr->recv_cq && id->recv_cq != attr->recv_cq) || + (id->send_cq && attr->send_cq && id->send_cq != attr->send_cq)) + return ERR(EINVAL); + + if (id->qp_type == IBV_QPT_XRC_RECV) { + if (!(attr->comp_mask & IBV_QP_INIT_ATTR_XRCD) || !attr->xrcd) { + attr->xrcd = ucma_get_xrcd(id_priv->cma_dev); + if (!attr->xrcd) + return -1; + attr->comp_mask |= IBV_QP_INIT_ATTR_XRCD; + } + } + + ret = ucma_create_cqs(id, attr->send_cq || id->send_cq ? 0 : attr->cap.max_send_wr, + attr->recv_cq || id->recv_cq ? 0 : attr->cap.max_recv_wr); + if (ret) + return ret; + + if (!attr->send_cq) + attr->send_cq = id->send_cq; + if (!attr->recv_cq) + attr->recv_cq = id->recv_cq; + if (id->srq && !attr->srq) + attr->srq = id->srq; + qp = ibv_create_qp_ex(id->verbs, attr); + if (!qp) { + ret = ERR(ENOMEM); + goto err1; + } + + if (ucma_is_ud_qp(id->qp_type)) + ret = ucma_init_ud_qp(id_priv, qp); + else + ret = ucma_init_conn_qp(id_priv, qp); + if (ret) + goto err2; + + id->pd = qp->pd; + id->qp = qp; + return 0; +err2: + ibv_destroy_qp(qp); +err1: + ucma_destroy_cqs(id); + return ret; +} + +int rdma_create_qp(struct rdma_cm_id *id, struct ibv_pd *pd, + struct ibv_qp_init_attr *qp_init_attr) +{ + struct ibv_qp_init_attr_ex attr_ex; + int ret; + + memcpy(&attr_ex, qp_init_attr, sizeof(*qp_init_attr)); + attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD; + attr_ex.pd = pd ? 
pd : id->pd; + ret = rdma_create_qp_ex(id, &attr_ex); + memcpy(qp_init_attr, &attr_ex, sizeof(*qp_init_attr)); + return ret; +} + +void rdma_destroy_qp(struct rdma_cm_id *id) +{ + ibv_destroy_qp(id->qp); + id->qp = NULL; + ucma_destroy_cqs(id); +} + +static int ucma_valid_param(struct cma_id_private *id_priv, + struct rdma_conn_param *param) +{ + if (id_priv->id.ps != RDMA_PS_TCP) + return 0; + + if (!id_priv->id.qp && !param) + goto err; + + if (!param) + return 0; + + if ((param->responder_resources != RDMA_MAX_RESP_RES) && + (param->responder_resources > id_priv->cma_dev->max_responder_resources)) + goto err; + + if ((param->initiator_depth != RDMA_MAX_INIT_DEPTH) && + (param->initiator_depth > id_priv->cma_dev->max_initiator_depth)) + goto err; + + return 0; +err: + return ERR(EINVAL); +} + +static void ucma_copy_conn_param_to_kern(struct cma_id_private *id_priv, + struct ucma_abi_conn_param *dst, + struct rdma_conn_param *src, + uint32_t qp_num, uint8_t srq) +{ + dst->qp_num = qp_num; + dst->srq = srq; + dst->responder_resources = id_priv->responder_resources; + dst->initiator_depth = id_priv->initiator_depth; + dst->valid = 1; + + if (id_priv->connect_len) { + memcpy(dst->private_data, id_priv->connect, id_priv->connect_len); + dst->private_data_len = id_priv->connect_len; + } + + if (src) { + dst->flow_control = src->flow_control; + dst->retry_count = src->retry_count; + dst->rnr_retry_count = src->rnr_retry_count; + + if (src->private_data && src->private_data_len) { + memcpy(dst->private_data + dst->private_data_len, + src->private_data, src->private_data_len); + dst->private_data_len += src->private_data_len; + } + } else { + dst->retry_count = 7; + dst->rnr_retry_count = 7; + } +} + +int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) +{ + uint32_t qp_num = conn_param ? conn_param->qp_num : 0; + uint8_t srq = conn_param ? conn_param->srq : 0; + struct ucma_abi_connect cmd; + struct cma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct cma_id_private, id); + ret = ucma_valid_param(id_priv, conn_param); + if (ret) + return ret; + + if (conn_param && conn_param->initiator_depth != RDMA_MAX_INIT_DEPTH) + id_priv->initiator_depth = conn_param->initiator_depth; + else + id_priv->initiator_depth = id_priv->cma_dev->max_initiator_depth; + if (conn_param && conn_param->responder_resources != RDMA_MAX_RESP_RES) + id_priv->responder_resources = conn_param->responder_resources; + else + id_priv->responder_resources = id_priv->cma_dev->max_responder_resources; + + CMA_INIT_CMD(&cmd, sizeof cmd, CONNECT); + cmd.id = id_priv->handle; + if (id->qp) { + qp_num = id->qp->qp_num; + srq = !!id->qp->srq; + } + + ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, conn_param, + qp_num, srq); + + ret = write(id->channel->fd, &cmd, sizeof cmd); + if (ret != sizeof cmd) + return (ret >= 0) ? ERR(ENODATA) : -1; + + if (id_priv->connect_len) { + free(id_priv->connect); + id_priv->connect_len = 0; + } + + return ucma_complete(id); +} + +int rdma_listen(struct rdma_cm_id *id, int backlog) +{ + struct ucma_abi_listen cmd; + struct cma_id_private *id_priv; + int ret; + + CMA_INIT_CMD(&cmd, sizeof cmd, LISTEN); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + cmd.backlog = backlog; + + ret = write(id->channel->fd, &cmd, sizeof cmd); + if (ret != sizeof cmd) + return (ret >= 0) ? 
ERR(ENODATA) : -1; + + if (af_ib_support) + return ucma_query_addr(id); + else + return ucma_query_route(id); +} + +int rdma_get_request(struct rdma_cm_id *listen, struct rdma_cm_id **id) +{ + struct cma_id_private *id_priv; + struct rdma_cm_event *event; + int ret; + + id_priv = container_of(listen, struct cma_id_private, id); + if (!id_priv->sync) + return ERR(EINVAL); + + if (listen->event) { + rdma_ack_cm_event(listen->event); + listen->event = NULL; + } + + ret = rdma_get_cm_event(listen->channel, &event); + if (ret) + return ret; + + if (event->status) { + ret = ERR(event->status); + goto err; + } + + if (event->event != RDMA_CM_EVENT_CONNECT_REQUEST) { + ret = ERR(EINVAL); + goto err; + } + + if (id_priv->qp_init_attr) { + struct ibv_qp_init_attr attr; + + attr = *id_priv->qp_init_attr; + ret = rdma_create_qp(event->id, listen->pd, &attr); + if (ret) + goto err; + } + + *id = event->id; + (*id)->event = event; + return 0; + +err: + listen->event = event; + return ret; +} + +int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) +{ + uint32_t qp_num = id->qp ? id->qp->qp_num : conn_param->qp_num; + uint8_t srq = id->qp ? !!id->qp->srq : conn_param->srq; + struct ucma_abi_accept cmd; + struct cma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct cma_id_private, id); + ret = ucma_valid_param(id_priv, conn_param); + if (ret) + return ret; + + if (!conn_param || conn_param->initiator_depth == RDMA_MAX_INIT_DEPTH) { + id_priv->initiator_depth = min(id_priv->initiator_depth, + id_priv->cma_dev->max_initiator_depth); + } else { + id_priv->initiator_depth = conn_param->initiator_depth; + } + if (!conn_param || conn_param->responder_resources == RDMA_MAX_RESP_RES) { + id_priv->responder_resources = min(id_priv->responder_resources, + id_priv->cma_dev->max_responder_resources); + } else { + id_priv->responder_resources = conn_param->responder_resources; + } + + if (!ucma_is_ud_qp(id->qp_type)) { + ret = ucma_modify_qp_rtr(id, id_priv->responder_resources); + if (ret) + return ret; + + ret = ucma_modify_qp_rts(id, id_priv->initiator_depth); + if (ret) + return ret; + } + + CMA_INIT_CMD(&cmd, sizeof cmd, ACCEPT); + cmd.id = id_priv->handle; + cmd.uid = (uintptr_t) id_priv; + ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, conn_param, + qp_num, srq); + + ret = write(id->channel->fd, &cmd, sizeof cmd); + if (ret != sizeof cmd) { + ucma_modify_qp_err(id); + return (ret >= 0) ? ERR(ENODATA) : -1; + } + + if (ucma_is_ud_qp(id->qp_type)) + return 0; + + return ucma_complete(id); +} + +int rdma_reject(struct rdma_cm_id *id, const void *private_data, + uint8_t private_data_len) +{ + struct ucma_abi_reject cmd; + struct cma_id_private *id_priv; + int ret; + + CMA_INIT_CMD(&cmd, sizeof cmd, REJECT); + + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + if (private_data && private_data_len) { + memcpy(cmd.private_data, private_data, private_data_len); + cmd.private_data_len = private_data_len; + } + + ret = write(id->channel->fd, &cmd, sizeof cmd); + if (ret != sizeof cmd) + return (ret >= 0) ? ERR(ENODATA) : -1; + + return 0; +} + +int rdma_notify(struct rdma_cm_id *id, enum ibv_event_type event) +{ + struct ucma_abi_notify cmd; + struct cma_id_private *id_priv; + int ret; + + CMA_INIT_CMD(&cmd, sizeof cmd, NOTIFY); + + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + cmd.event = event; + ret = write(id->channel->fd, &cmd, sizeof cmd); + if (ret != sizeof cmd) + return (ret >= 0) ? 
ERR(ENODATA) : -1; + + return 0; +} + +int ucma_shutdown(struct rdma_cm_id *id) +{ + if (!id->verbs || !id->verbs->device) + return ERR(EINVAL); + + switch (id->verbs->device->transport_type) { + case IBV_TRANSPORT_IB: + return ucma_modify_qp_err(id); + case IBV_TRANSPORT_IWARP: + return ucma_modify_qp_sqd(id); + default: + return ERR(EINVAL); + } +} + +int rdma_disconnect(struct rdma_cm_id *id) +{ + struct ucma_abi_disconnect cmd; + struct cma_id_private *id_priv; + int ret; + + ret = ucma_shutdown(id); + if (ret) + return ret; + + CMA_INIT_CMD(&cmd, sizeof cmd, DISCONNECT); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + + ret = write(id->channel->fd, &cmd, sizeof cmd); + if (ret != sizeof cmd) + return (ret >= 0) ? ERR(ENODATA) : -1; + + return ucma_complete(id); +} + +static int rdma_join_multicast2(struct rdma_cm_id *id, struct sockaddr *addr, + socklen_t addrlen, uint16_t join_flags, + void *context) +{ + struct ucma_abi_create_id_resp resp; + struct cma_id_private *id_priv; + struct cma_multicast *mc, **pos; + int ret; + + id_priv = container_of(id, struct cma_id_private, id); + mc = calloc(1, sizeof(*mc)); + if (!mc) + return ERR(ENOMEM); + + mc->context = context; + mc->id_priv = id_priv; + mc->join_flags = join_flags; + memcpy(&mc->addr, addr, addrlen); + if (pthread_cond_init(&mc->cond, NULL)) { + ret = -1; + goto err1; + } + + pthread_mutex_lock(&id_priv->mut); + mc->next = id_priv->mc_list; + id_priv->mc_list = mc; + pthread_mutex_unlock(&id_priv->mut); + + if (af_ib_support) { + struct ucma_abi_join_mcast cmd; + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, JOIN_MCAST, &resp, sizeof resp); + cmd.id = id_priv->handle; + memcpy(&cmd.addr, addr, addrlen); + cmd.addr_size = addrlen; + cmd.uid = (uintptr_t) mc; + cmd.join_flags = join_flags; + + ret = write(id->channel->fd, &cmd, sizeof cmd); + if (ret != sizeof cmd) { + ret = (ret >= 0) ? ERR(ENODATA) : -1; + goto err2; + } + } else { + struct ucma_abi_join_ip_mcast cmd; + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, JOIN_IP_MCAST, &resp, sizeof resp); + cmd.id = id_priv->handle; + memcpy(&cmd.addr, addr, addrlen); + cmd.uid = (uintptr_t) mc; + + ret = write(id->channel->fd, &cmd, sizeof cmd); + if (ret != sizeof cmd) { + ret = (ret >= 0) ? 
ERR(ENODATA) : -1; + goto err2; + } + } + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + + mc->handle = resp.id; + return ucma_complete(id); + +err2: + pthread_mutex_lock(&id_priv->mut); + for (pos = &id_priv->mc_list; *pos != mc; pos = &(*pos)->next) + ; + *pos = mc->next; + pthread_mutex_unlock(&id_priv->mut); +err1: + free(mc); + return ret; +} + +int rdma_join_multicast_ex(struct rdma_cm_id *id, + struct rdma_cm_join_mc_attr_ex *mc_join_attr, + void *context) +{ + int addrlen; + + if (mc_join_attr->comp_mask >= RDMA_CM_JOIN_MC_ATTR_RESERVED) + return ERR(ENOTSUP); + + if (!(mc_join_attr->comp_mask & RDMA_CM_JOIN_MC_ATTR_ADDRESS)) + return ERR(EINVAL); + + if (!(mc_join_attr->comp_mask & RDMA_CM_JOIN_MC_ATTR_JOIN_FLAGS) || + (mc_join_attr->join_flags >= RDMA_MC_JOIN_FLAG_RESERVED)) + return ERR(EINVAL); + + addrlen = ucma_addrlen(mc_join_attr->addr); + if (!addrlen) + return ERR(EINVAL); + + return rdma_join_multicast2(id, mc_join_attr->addr, addrlen, + mc_join_attr->join_flags, context); +} + +int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, + void *context) +{ + int addrlen; + + addrlen = ucma_addrlen(addr); + if (!addrlen) + return ERR(EINVAL); + + return rdma_join_multicast2(id, addr, addrlen, + RDMA_MC_JOIN_FLAG_FULLMEMBER, context); +} + +int rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) +{ + struct ucma_abi_destroy_id cmd; + struct ucma_abi_destroy_id_resp resp; + struct cma_id_private *id_priv; + struct cma_multicast *mc, **pos; + int ret, addrlen; + + addrlen = ucma_addrlen(addr); + if (!addrlen) + return ERR(EINVAL); + + id_priv = container_of(id, struct cma_id_private, id); + pthread_mutex_lock(&id_priv->mut); + for (pos = &id_priv->mc_list; *pos; pos = &(*pos)->next) + if (!memcmp(&(*pos)->addr, addr, addrlen)) + break; + + mc = *pos; + if (*pos) + *pos = mc->next; + pthread_mutex_unlock(&id_priv->mut); + if (!mc) + return ERR(EADDRNOTAVAIL); + + if (id->qp && (mc->join_flags != RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER)) + ibv_detach_mcast(id->qp, &mc->mgid, mc->mlid); + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, LEAVE_MCAST, &resp, sizeof resp); + cmd.id = mc->handle; + + ret = write(id->channel->fd, &cmd, sizeof cmd); + if (ret != sizeof cmd) { + ret = (ret >= 0) ? 
ERR(ENODATA) : -1; + goto free; + } + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + + pthread_mutex_lock(&id_priv->mut); + while (mc->events_completed < resp.events_reported) + pthread_cond_wait(&mc->cond, &id_priv->mut); + pthread_mutex_unlock(&id_priv->mut); + + ret = 0; +free: + free(mc); + return ret; +} + +static void ucma_complete_event(struct cma_id_private *id_priv) +{ + pthread_mutex_lock(&id_priv->mut); + id_priv->events_completed++; + pthread_cond_signal(&id_priv->cond); + pthread_mutex_unlock(&id_priv->mut); +} + +static void ucma_complete_mc_event(struct cma_multicast *mc) +{ + pthread_mutex_lock(&mc->id_priv->mut); + mc->events_completed++; + pthread_cond_signal(&mc->cond); + mc->id_priv->events_completed++; + pthread_cond_signal(&mc->id_priv->cond); + pthread_mutex_unlock(&mc->id_priv->mut); +} + +int rdma_ack_cm_event(struct rdma_cm_event *event) +{ + struct cma_event *evt; + + if (!event) + return ERR(EINVAL); + + evt = container_of(event, struct cma_event, event); + + if (evt->mc) + ucma_complete_mc_event(evt->mc); + else + ucma_complete_event(evt->id_priv); + free(evt); + return 0; +} + +static void ucma_process_addr_resolved(struct cma_event *evt) +{ + if (af_ib_support) { + evt->event.status = ucma_query_addr(&evt->id_priv->id); + if (!evt->event.status && + evt->id_priv->id.verbs->device->transport_type == IBV_TRANSPORT_IB) + evt->event.status = ucma_query_gid(&evt->id_priv->id); + } else { + evt->event.status = ucma_query_route(&evt->id_priv->id); + } + + if (evt->event.status) + evt->event.event = RDMA_CM_EVENT_ADDR_ERROR; +} + +static void ucma_process_route_resolved(struct cma_event *evt) +{ + if (evt->id_priv->id.verbs->device->transport_type != IBV_TRANSPORT_IB) + return; + + if (af_ib_support) + evt->event.status = ucma_query_path(&evt->id_priv->id); + else + evt->event.status = ucma_query_route(&evt->id_priv->id); + + if (evt->event.status) + evt->event.event = RDMA_CM_EVENT_ROUTE_ERROR; +} + +static int ucma_query_req_info(struct rdma_cm_id *id) +{ + int ret; + + if (!af_ib_support) + return ucma_query_route(id); + + ret = ucma_query_addr(id); + if (ret) + return ret; + + ret = ucma_query_gid(id); + if (ret) + return ret; + + ret = ucma_query_path(id); + if (ret) + return ret; + + return 0; +} + +static int ucma_process_conn_req(struct cma_event *evt, + uint32_t handle) +{ + struct cma_id_private *id_priv; + int ret; + + id_priv = ucma_alloc_id(evt->id_priv->id.channel, + evt->id_priv->id.context, evt->id_priv->id.ps, + evt->id_priv->id.qp_type); + if (!id_priv) { + ucma_destroy_kern_id(evt->id_priv->id.channel->fd, handle); + ret = ERR(ENOMEM); + goto err1; + } + + evt->event.listen_id = &evt->id_priv->id; + evt->event.id = &id_priv->id; + id_priv->handle = handle; + ucma_insert_id(id_priv); + id_priv->initiator_depth = evt->event.param.conn.initiator_depth; + id_priv->responder_resources = evt->event.param.conn.responder_resources; + + if (evt->id_priv->sync) { + ret = rdma_migrate_id(&id_priv->id, NULL); + if (ret) + goto err2; + } + + ret = ucma_query_req_info(&id_priv->id); + if (ret) + goto err2; + + return 0; + +err2: + rdma_destroy_id(&id_priv->id); +err1: + ucma_complete_event(evt->id_priv); + return ret; +} + +static int ucma_process_conn_resp(struct cma_id_private *id_priv) +{ + struct ucma_abi_accept cmd; + int ret; + + ret = ucma_modify_qp_rtr(&id_priv->id, RDMA_MAX_RESP_RES); + if (ret) + goto err; + + ret = ucma_modify_qp_rts(&id_priv->id, RDMA_MAX_INIT_DEPTH); + if (ret) + goto err; + + CMA_INIT_CMD(&cmd, sizeof cmd, ACCEPT); + cmd.id = 
id_priv->handle; + + ret = write(id_priv->id.channel->fd, &cmd, sizeof cmd); + if (ret != sizeof cmd) { + ret = (ret >= 0) ? ERR(ENODATA) : -1; + goto err; + } + + return 0; +err: + ucma_modify_qp_err(&id_priv->id); + return ret; +} + +static int ucma_process_join(struct cma_event *evt) +{ + evt->mc->mgid = evt->event.param.ud.ah_attr.grh.dgid; + evt->mc->mlid = evt->event.param.ud.ah_attr.dlid; + + if (!evt->id_priv->id.qp) + return 0; + + /* Don't attach QP to multicast if joined as send only full member */ + if (evt->mc->join_flags == RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER) + return 0; + + return rdma_seterrno(ibv_attach_mcast(evt->id_priv->id.qp, + &evt->mc->mgid, evt->mc->mlid)); +} + +static void ucma_copy_conn_event(struct cma_event *event, + struct ucma_abi_conn_param *src) +{ + struct rdma_conn_param *dst = &event->event.param.conn; + + dst->private_data_len = src->private_data_len; + if (src->private_data_len) { + dst->private_data = &event->private_data; + memcpy(&event->private_data, src->private_data, + src->private_data_len); + } + + dst->responder_resources = src->responder_resources; + dst->initiator_depth = src->initiator_depth; + dst->flow_control = src->flow_control; + dst->retry_count = src->retry_count; + dst->rnr_retry_count = src->rnr_retry_count; + dst->srq = src->srq; + dst->qp_num = src->qp_num; +} + +static void ucma_copy_ud_event(struct cma_event *event, + struct ucma_abi_ud_param *src) +{ + struct rdma_ud_param *dst = &event->event.param.ud; + + dst->private_data_len = src->private_data_len; + if (src->private_data_len) { + dst->private_data = &event->private_data; + memcpy(&event->private_data, src->private_data, + src->private_data_len); + } + + ibv_copy_ah_attr_from_kern(&dst->ah_attr, &src->ah_attr); + dst->qp_num = src->qp_num; + dst->qkey = src->qkey; +} + +int rdma_establish(struct rdma_cm_id *id) +{ + if (id->qp) + return ERR(EINVAL); + + /* id->qp is NULL, so ucma_process_conn_resp() will only send ACCEPT to + * the passive side, and will not attempt to modify the QP. + */ + return ucma_process_conn_resp(container_of(id, struct cma_id_private, + id)); +} + +int rdma_get_cm_event(struct rdma_event_channel *channel, + struct rdma_cm_event **event) +{ + struct ucma_abi_event_resp resp; + struct ucma_abi_get_event cmd; + struct cma_event *evt; + int ret; + + ret = ucma_init(); + if (ret) + return ret; + + if (!event) + return ERR(EINVAL); + + evt = malloc(sizeof(*evt)); + if (!evt) + return ERR(ENOMEM); + +retry: + memset(evt, 0, sizeof(*evt)); + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, GET_EVENT, &resp, sizeof resp); + ret = write(channel->fd, &cmd, sizeof cmd); + if (ret != sizeof cmd) { + free(evt); + return (ret >= 0) ? ERR(ENODATA) : -1; + } + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + + evt->event.event = resp.event; + /* + * We should have a non-zero uid, except for connection requests. + * But a bug in older kernels can report a uid 0. Work-around this + * issue by looking up the cma_id based on the kernel's id when the + * uid is 0 and we're processing a connection established event. + * In all other cases, if the uid is 0, we discard the event, like + * the kernel should have done. 
+ */ + if (resp.uid) { + evt->id_priv = (void *) (uintptr_t) resp.uid; + } else { + evt->id_priv = ucma_lookup_id(resp.id); + if (!evt->id_priv) { + syslog(LOG_WARNING, PFX "Warning: discarding unmatched " + "event - rdma_destroy_id may hang.\n"); + goto retry; + } + if (resp.event != RDMA_CM_EVENT_ESTABLISHED) { + ucma_complete_event(evt->id_priv); + goto retry; + } + } + evt->event.id = &evt->id_priv->id; + evt->event.status = resp.status; + + switch (resp.event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + ucma_process_addr_resolved(evt); + break; + case RDMA_CM_EVENT_ROUTE_RESOLVED: + ucma_process_route_resolved(evt); + break; + case RDMA_CM_EVENT_CONNECT_REQUEST: + evt->id_priv = (void *) (uintptr_t) resp.uid; + if (ucma_is_ud_qp(evt->id_priv->id.qp_type)) + ucma_copy_ud_event(evt, &resp.param.ud); + else + ucma_copy_conn_event(evt, &resp.param.conn); + + ret = ucma_process_conn_req(evt, resp.id); + if (ret) + goto retry; + break; + case RDMA_CM_EVENT_CONNECT_RESPONSE: + ucma_copy_conn_event(evt, &resp.param.conn); + if (!evt->id_priv->id.qp) { + evt->event.event = RDMA_CM_EVENT_CONNECT_RESPONSE; + } else { + evt->event.status = + ucma_process_conn_resp(evt->id_priv); + if (!evt->event.status) + evt->event.event = RDMA_CM_EVENT_ESTABLISHED; + else { + evt->event.event = RDMA_CM_EVENT_CONNECT_ERROR; + evt->id_priv->connect_error = 1; + } + } + break; + case RDMA_CM_EVENT_ESTABLISHED: + if (ucma_is_ud_qp(evt->id_priv->id.qp_type)) { + ucma_copy_ud_event(evt, &resp.param.ud); + break; + } + + ucma_copy_conn_event(evt, &resp.param.conn); + break; + case RDMA_CM_EVENT_REJECTED: + if (evt->id_priv->connect_error) { + ucma_complete_event(evt->id_priv); + goto retry; + } + ucma_copy_conn_event(evt, &resp.param.conn); + ucma_modify_qp_err(evt->event.id); + break; + case RDMA_CM_EVENT_DISCONNECTED: + if (evt->id_priv->connect_error) { + ucma_complete_event(evt->id_priv); + goto retry; + } + ucma_copy_conn_event(evt, &resp.param.conn); + break; + case RDMA_CM_EVENT_MULTICAST_JOIN: + evt->mc = (void *) (uintptr_t) resp.uid; + evt->id_priv = evt->mc->id_priv; + evt->event.id = &evt->id_priv->id; + ucma_copy_ud_event(evt, &resp.param.ud); + evt->event.param.ud.private_data = evt->mc->context; + evt->event.status = ucma_process_join(evt); + if (evt->event.status) + evt->event.event = RDMA_CM_EVENT_MULTICAST_ERROR; + break; + case RDMA_CM_EVENT_MULTICAST_ERROR: + evt->mc = (void *) (uintptr_t) resp.uid; + evt->id_priv = evt->mc->id_priv; + evt->event.id = &evt->id_priv->id; + evt->event.param.ud.private_data = evt->mc->context; + break; + default: + evt->id_priv = (void *) (uintptr_t) resp.uid; + evt->event.id = &evt->id_priv->id; + evt->event.status = resp.status; + if (ucma_is_ud_qp(evt->id_priv->id.qp_type)) + ucma_copy_ud_event(evt, &resp.param.ud); + else + ucma_copy_conn_event(evt, &resp.param.conn); + break; + } + + *event = &evt->event; + return 0; +} + +const char *rdma_event_str(enum rdma_cm_event_type event) +{ + switch (event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + return "RDMA_CM_EVENT_ADDR_RESOLVED"; + case RDMA_CM_EVENT_ADDR_ERROR: + return "RDMA_CM_EVENT_ADDR_ERROR"; + case RDMA_CM_EVENT_ROUTE_RESOLVED: + return "RDMA_CM_EVENT_ROUTE_RESOLVED"; + case RDMA_CM_EVENT_ROUTE_ERROR: + return "RDMA_CM_EVENT_ROUTE_ERROR"; + case RDMA_CM_EVENT_CONNECT_REQUEST: + return "RDMA_CM_EVENT_CONNECT_REQUEST"; + case RDMA_CM_EVENT_CONNECT_RESPONSE: + return "RDMA_CM_EVENT_CONNECT_RESPONSE"; + case RDMA_CM_EVENT_CONNECT_ERROR: + return "RDMA_CM_EVENT_CONNECT_ERROR"; + case RDMA_CM_EVENT_UNREACHABLE: + 
return "RDMA_CM_EVENT_UNREACHABLE"; + case RDMA_CM_EVENT_REJECTED: + return "RDMA_CM_EVENT_REJECTED"; + case RDMA_CM_EVENT_ESTABLISHED: + return "RDMA_CM_EVENT_ESTABLISHED"; + case RDMA_CM_EVENT_DISCONNECTED: + return "RDMA_CM_EVENT_DISCONNECTED"; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + return "RDMA_CM_EVENT_DEVICE_REMOVAL"; + case RDMA_CM_EVENT_MULTICAST_JOIN: + return "RDMA_CM_EVENT_MULTICAST_JOIN"; + case RDMA_CM_EVENT_MULTICAST_ERROR: + return "RDMA_CM_EVENT_MULTICAST_ERROR"; + case RDMA_CM_EVENT_ADDR_CHANGE: + return "RDMA_CM_EVENT_ADDR_CHANGE"; + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + return "RDMA_CM_EVENT_TIMEWAIT_EXIT"; + default: + return "UNKNOWN EVENT"; + } +} + +int rdma_set_option(struct rdma_cm_id *id, int level, int optname, + void *optval, size_t optlen) +{ + struct ucma_abi_set_option cmd; + struct cma_id_private *id_priv; + int ret; + + CMA_INIT_CMD(&cmd, sizeof cmd, SET_OPTION); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + cmd.optval = (uintptr_t) optval; + cmd.level = level; + cmd.optname = optname; + cmd.optlen = optlen; + + ret = write(id->channel->fd, &cmd, sizeof cmd); + if (ret != sizeof cmd) + return (ret >= 0) ? ERR(ENODATA) : -1; + + return 0; +} + +int rdma_migrate_id(struct rdma_cm_id *id, struct rdma_event_channel *channel) +{ + struct ucma_abi_migrate_resp resp; + struct ucma_abi_migrate_id cmd; + struct cma_id_private *id_priv; + int ret, sync; + + id_priv = container_of(id, struct cma_id_private, id); + if (id_priv->sync && !channel) + return ERR(EINVAL); + + if ((sync = (channel == NULL))) { + channel = rdma_create_event_channel(); + if (!channel) + return -1; + } + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, MIGRATE_ID, &resp, sizeof resp); + cmd.id = id_priv->handle; + cmd.fd = id->channel->fd; + + ret = write(channel->fd, &cmd, sizeof cmd); + if (ret != sizeof cmd) { + if (sync) + rdma_destroy_event_channel(channel); + return (ret >= 0) ? ERR(ENODATA) : -1; + } + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + + if (id_priv->sync) { + if (id->event) { + rdma_ack_cm_event(id->event); + id->event = NULL; + } + rdma_destroy_event_channel(id->channel); + } + + /* + * Eventually if we want to support migrating channels while events are + * being processed on the current channel, we need to block here while + * there are any outstanding events on the current channel for this id + * to prevent the user from processing events for this id on the old + * channel after this call returns. 
+ */ + pthread_mutex_lock(&id_priv->mut); + id_priv->sync = sync; + id->channel = channel; + while (id_priv->events_completed < resp.events_reported) + pthread_cond_wait(&id_priv->cond, &id_priv->mut); + pthread_mutex_unlock(&id_priv->mut); + + return 0; +} + +static int ucma_passive_ep(struct rdma_cm_id *id, struct rdma_addrinfo *res, + struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) +{ + struct cma_id_private *id_priv; + int ret; + + if (af_ib_support) + ret = rdma_bind_addr2(id, res->ai_src_addr, res->ai_src_len); + else + ret = rdma_bind_addr(id, res->ai_src_addr); + if (ret) + return ret; + + id_priv = container_of(id, struct cma_id_private, id); + if (pd) + id->pd = pd; + + if (qp_init_attr) { + id_priv->qp_init_attr = malloc(sizeof(*qp_init_attr)); + if (!id_priv->qp_init_attr) + return ERR(ENOMEM); + + *id_priv->qp_init_attr = *qp_init_attr; + id_priv->qp_init_attr->qp_type = res->ai_qp_type; + } + + return 0; +} + +int rdma_create_ep(struct rdma_cm_id **id, struct rdma_addrinfo *res, + struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) +{ + struct rdma_cm_id *cm_id; + struct cma_id_private *id_priv; + int ret; + + ret = rdma_create_id2(NULL, &cm_id, NULL, res->ai_port_space, res->ai_qp_type); + if (ret) + return ret; + + if (res->ai_flags & RAI_PASSIVE) { + ret = ucma_passive_ep(cm_id, res, pd, qp_init_attr); + if (ret) + goto err; + goto out; + } + + if (af_ib_support) + ret = rdma_resolve_addr2(cm_id, res->ai_src_addr, res->ai_src_len, + res->ai_dst_addr, res->ai_dst_len, 2000); + else + ret = rdma_resolve_addr(cm_id, res->ai_src_addr, res->ai_dst_addr, 2000); + if (ret) + goto err; + + if (res->ai_route_len) { + ret = rdma_set_option(cm_id, RDMA_OPTION_IB, RDMA_OPTION_IB_PATH, + res->ai_route, res->ai_route_len); + if (!ret) + ret = ucma_complete(cm_id); + } else { + ret = rdma_resolve_route(cm_id, 2000); + } + if (ret) + goto err; + + if (qp_init_attr) { + qp_init_attr->qp_type = res->ai_qp_type; + ret = rdma_create_qp(cm_id, pd, qp_init_attr); + if (ret) + goto err; + } + + if (res->ai_connect_len) { + id_priv = container_of(cm_id, struct cma_id_private, id); + id_priv->connect = malloc(res->ai_connect_len); + if (!id_priv->connect) { + ret = ERR(ENOMEM); + goto err; + } + memcpy(id_priv->connect, res->ai_connect, res->ai_connect_len); + id_priv->connect_len = res->ai_connect_len; + } + +out: + *id = cm_id; + return 0; + +err: + rdma_destroy_ep(cm_id); + return ret; +} + +void rdma_destroy_ep(struct rdma_cm_id *id) +{ + struct cma_id_private *id_priv; + + if (id->qp) + rdma_destroy_qp(id); + + if (id->srq) + rdma_destroy_srq(id); + + id_priv = container_of(id, struct cma_id_private, id); + if (id_priv->qp_init_attr) + free(id_priv->qp_init_attr); + + rdma_destroy_id(id); +} + +int ucma_max_qpsize(struct rdma_cm_id *id) +{ + struct cma_id_private *id_priv; + int i, max_size = 0; + + id_priv = container_of(id, struct cma_id_private, id); + if (id && id_priv->cma_dev) { + max_size = id_priv->cma_dev->max_qpsize; + } else { + ucma_init_all(); + for (i = 0; i < cma_dev_cnt; i++) { + if (!max_size || max_size > cma_dev_array[i].max_qpsize) + max_size = cma_dev_array[i].max_qpsize; + } + } + return max_size; +} + +__be16 ucma_get_port(struct sockaddr *addr) +{ + switch (addr->sa_family) { + case AF_INET: + return ((struct sockaddr_in *) addr)->sin_port; + case AF_INET6: + return ((struct sockaddr_in6 *) addr)->sin6_port; + case AF_IB: + return htobe16((uint16_t) be64toh(((struct sockaddr_ib *) addr)->sib_sid)); + default: + return 0; + } +} + +__be16 
rdma_get_src_port(struct rdma_cm_id *id) +{ + return ucma_get_port(&id->route.addr.src_addr); +} + +__be16 rdma_get_dst_port(struct rdma_cm_id *id) +{ + return ucma_get_port(&id->route.addr.dst_addr); +} + diff --git a/librdmacm/cma.h b/librdmacm/cma.h new file mode 100644 index 0000000..6282105 --- /dev/null +++ b/librdmacm/cma.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2005-2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#if !defined(CMA_H) +#define CMA_H + +#include <config.h> + +#include <stdlib.h> +#include <errno.h> +#include <endian.h> +#include <semaphore.h> +#include <stdatomic.h> + +#include <rdma/rdma_cma.h> +#include <infiniband/ib.h> + +#include <ccan/minmax.h> + +#include <valgrind/memcheck.h> + +#define PFX "librdmacm: " + +/* + * Fast synchronization for low contention locking. 
+ * The atomic count tracks the lock holder plus any waiters; the
+ * semaphore is touched only when the lock is contended.
+ */
+typedef struct {
+	sem_t sem;
+	_Atomic(int) cnt;
+} fastlock_t;
+static inline void fastlock_init(fastlock_t *lock)
+{
+	sem_init(&lock->sem, 0, 0);
+	atomic_store(&lock->cnt, 0);
+}
+static inline void fastlock_destroy(fastlock_t *lock)
+{
+	sem_destroy(&lock->sem);
+}
+static inline void fastlock_acquire(fastlock_t *lock)
+{
+	/* Uncontended case: the previous count was 0, so we own the lock
+	 * without any system call. */
+	if (atomic_fetch_add(&lock->cnt, 1) > 0)
+		sem_wait(&lock->sem);
+}
+static inline void fastlock_release(fastlock_t *lock)
+{
+	/* A previous count above 1 means another thread is blocked in
+	 * fastlock_acquire(); wake it. */
+	if (atomic_fetch_sub(&lock->cnt, 1) > 1)
+		sem_post(&lock->sem);
+}
+
+__be16 ucma_get_port(struct sockaddr *addr);
+int ucma_addrlen(struct sockaddr *addr);
+void ucma_set_sid(enum rdma_port_space ps, struct sockaddr *addr,
+		  struct sockaddr_ib *sib);
+int ucma_max_qpsize(struct rdma_cm_id *id);
+int ucma_complete(struct rdma_cm_id *id);
+int ucma_shutdown(struct rdma_cm_id *id);
+
+static inline int ERR(int err)
+{
+	errno = err;
+	return -1;
+}
+
+int ucma_init(void);
+extern int af_ib_support;
+
+#define RAI_ROUTEONLY	0x01000000
+
+void ucma_ib_init(void);
+void ucma_ib_cleanup(void);
+void ucma_ib_resolve(struct rdma_addrinfo **rai,
+		     const struct rdma_addrinfo *hints);
+
+struct ib_connect_hdr {
+	uint8_t cma_version;
+	uint8_t ip_version; /* IP version: 7:4 */
+	uint16_t port;
+	uint32_t src_addr[4];
+	uint32_t dst_addr[4];
+#define cma_src_ip4 src_addr[3]
+#define cma_src_ip6 src_addr[0]
+#define cma_dst_ip4 dst_addr[3]
+#define cma_dst_ip6 dst_addr[0]
+};
+
+#endif /* CMA_H */
diff --git a/librdmacm/docs/rsocket b/librdmacm/docs/rsocket
new file mode 100644
index 0000000..43834e0
--- /dev/null
+++ b/librdmacm/docs/rsocket
@@ -0,0 +1,280 @@
+.. Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+rsocket Protocol and Design Guide 11/11/2012
+
+Data Streaming (TCP) Overview
+-----------------------------
+Rsockets is a protocol over RDMA that supports a socket-level API
+for applications. For details on the current state of the
+implementation, readers should refer to the rsocket man page. This
+document describes the rsocket protocol, general design, and
+some implementation details.
+
+Rsockets exchanges data by performing RDMA write operations into
+exposed data buffers. In addition to RDMA write data, rsockets uses
+small, 32-bit messages for internal communication. RDMA writes
+are used to transfer application data into remote data buffers
+and to notify the peer when new target data buffers are available.
+The following figure highlights the operation.
+
+     host A                          host B
+                                  remote SGL
+  target SGL  <-------------         [ ]
+     [ ]                           ------
+     [ ]         --                ------    receive buffer(s)
+                   --    ----->     +--+
+                     --             |  |
+                       --           |  |
+                         --         |  |
+                           --       +--+
+                             --
+                               ----> +--+
+                                     |  |
+                                     |  |
+                                     +--+
+
+The remote SGL contains the address, size, and rkey of the target SGL. As
+receive buffers become available on host B, rsockets will issue an RDMA
+write against one of the entries in the target SGL on host A. The
+updated entry will reference an available receive buffer. Immediate data
+included with the RDMA write will indicate to host A that a target SGE
+has been updated.
+
+When host A has data to send, it will check its target SGL. The current
+target SGE will contain the address, size, and rkey of the next receive
+buffer on host B. If the data transfer is smaller than the size of the
+remote receive buffer, host A will update its target SGE to reflect the
+remaining size of the receive buffer. That is, once a receive buffer has
+been published to a remote peer, it will be fully consumed before a second
+buffer is used.
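+
+The sender-side bookkeeping described above can be shown with a short
+sketch. This is an illustrative assumption using the rs_sge layout defined
+under Connection Establishment below, not rsocket's actual code.
+
+#include <stdint.h>
+
+struct rs_sge {
+	uint64_t addr;
+	uint32_t key;
+	uint32_t length;
+};
+
+/* Illustrative only: consume part of the current target SGE.  'want' is
+ * the number of bytes host A wants to send into the receive buffer that
+ * the peer published through this entry. */
+static uint32_t rs_consume_target_sge(struct rs_sge *sge, uint32_t want)
+{
+	uint32_t len = want < sge->length ? want : sge->length;
+
+	/* A partial transfer keeps pointing at the same receive buffer,
+	 * so it is fully consumed before the next buffer is used. */
+	sge->addr += len;
+	sge->length -= len;
+	return len;
+}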
+
+Rsockets relies on immediate data to notify the remote peer when data has
+been transferred or when a target SGL has been updated. Because immediate
+data requires that the remote QP have a posted receive, rsockets also uses
+a credit-based flow control mechanism. The number of credits is based on
+the size of the receive queue, with initial credits exchanged during
+connection setup. In order to transfer data, rsockets requires both
+available receive buffers (published via the target SGL) and data credits.
+
+Since immediate data is limited to 32 bits, messages may either indicate
+the arrival of application data or may be an internal message, but not both.
+To avoid credit deadlock, rsockets reserves a small number of available
+credits for control messages only, with the protocol relying on RNR NAKs
+and retries to make forward progress.
+
+
+Connection Establishment
+------------------------
+Rsockets uses the RDMA CM for connection establishment. Struct rs_conn_data
+is exchanged as private data in the connection request and reply messages.
+
+struct rs_sge {
+	uint64_t addr;
+	uint32_t key;
+	uint32_t length;
+};
+
+#define RS_CONN_FLAG_NET 1
+
+struct rs_conn_data {
+	uint8_t version;
+	uint8_t flags;
+	uint16_t credits;
+	uint32_t reserved2;
+	struct rs_sge target_sgl;
+	struct rs_sge data_buf;
+};
+
+Version - current version is 1
+Flags
+RS_CONN_FLAG_NET - Set to 1 if host is big endian.
+	Determines byte ordering for RDMA write messages
+Credits - number of initial receive credits
+Reserved2 - set to 0
+Target SGL - Address, size (# entries), and rkey of target SGL.
+	Remote side will copy this into their remote SGL.
+Data Buffer - Initial receive buffer address, size (in bytes), and rkey.
+	Remote side will copy this into their first target SGE.
+
+
+Message Format
+--------------
+Rsocket uses RDMA writes with immediate data for all message exchanges.
+RDMA writes of 0 length are used if no additional data beyond the message
+needs to be exchanged. Immediate data is limited to 32 bits. Rsockets
+defines the following format for messages.
+
+The upper 3 bits are used to define the type of message being exchanged,
+with the meaning of the lower 29 bits determined by the upper bits.
+
+Bits    Message        Meaning of
+31:29   Type           Bits 28:0
+000     Data Transfer  bytes transferred
+001     reserved
+010     reserved - used internally, available for future use
+011     reserved
+100     Credit Update  received credits granted
+101     reserved
+110     Iomap Updated  index of updated entry
+111     Control        control message type
+
+Data Transfer
+Indicates that application data has been written into the next available
+receive buffer. The size of the transfer, in bytes, is carried in the lower
+bits of the message.
+
+Credit Update
+Used to indicate that additional receive buffers and credits are available.
+The number of available credits is carried in the lower bits of the message.
+A credit update message is also used to indicate that a target SGE has been
+updated, in which case the number of additional credits may be 0. The
+receiver of a credit update message must check for updates to the target SGL
+by inspecting the contents of the SGL. The rsocket implementation must take
+care not to modify a remote target SGL while it may be in use. This is done
+by tracking when a receive buffer referenced by a remote target SGL has been
+filled.
+
+Iomap Updated
+Used to indicate that a remote iomap entry was updated. The updated entry
+contains the offset value associated with an address, length, and rkey. Once
+an iomap has been updated, the local application can issue directed IO
+transfers against the corresponding remote buffer.
+
+Control Message - DISCONNECT
+Indicates that the rsocket connection has been fully disconnected and will no
+longer send or receive data. Data received before the disconnect message was
+processed may still be available for reading.
+
+Control Message - SHUTDOWN
+Indicates that the remote rsocket has shut down the send side of its
+connection. The recipient of a shutdown message will no longer accept
+incoming data, but may still transfer outbound data.
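+
+Tying the message table above to code: the following is a minimal sketch of
+how an implementation might pack and unpack the 32-bit immediate data word.
+The helper and enumerator names are illustrative assumptions, not rsocket's
+internal API.
+
+#include <stdint.h>
+
+enum rs_imm_type {		/* bits 31:29 of the immediate data */
+	RS_IMM_DATA   = 0x0,	/* payload: bytes transferred */
+	RS_IMM_CREDIT = 0x4,	/* payload: received credits granted */
+	RS_IMM_IOMAP  = 0x6,	/* payload: index of updated entry */
+	RS_IMM_CTRL   = 0x7	/* payload: control message type */
+};
+
+static inline uint32_t rs_imm_pack(enum rs_imm_type type, uint32_t payload)
+{
+	return ((uint32_t) type << 29) | (payload & 0x1fffffff);
+}
+
+static inline enum rs_imm_type rs_imm_type_of(uint32_t imm)
+{
+	return (enum rs_imm_type) (imm >> 29);
+}
+
+static inline uint32_t rs_imm_payload(uint32_t imm)
+{
+	return imm & 0x1fffffff;
+}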
+
+
+Iomapped Buffers
+----------------
+Rsockets allows for zero-copy transfers using what it refers to as iomapped
+buffers. Iomapping and direct data placement (zero-copy) transfers are done
+using rsocket-specific extensions. The general operation is similar to
+that used for normal data transfers described above.
+
+     host A                          host B
+                                 remote iomap
+  target iomap  <-----------         [ ]
+     [ ]                           ------
+     [ ]         --                ------    iomapped buffer(s)
+                   --    ----->     +--+
+                     --             |  |
+                       --           |  |
+                         --         |  |
+                           --       +--+
+                             --
+                               ----> +--+
+                                     |  |
+                                     |  |
+                                     +--+
+
+The remote iomap contains the address, size, and rkey of the target iomap. As
+the application on host B maps buffers to a given rsocket, rsockets will issue
+an RDMA write against one of the entries in the target iomap on host A. The
+updated entry will reference an available iomapped buffer. Immediate data
+included with the RDMA write will indicate to host A that a target iomap
+has been updated.
+
+When host A wishes to transfer directly into an iomapped buffer, it will check
+its target iomap for an offset corresponding to a remotely mapped buffer. A
+matching iomap entry will contain the address, size, and rkey of the target
+buffer on host B. Host A will then issue an RDMA operation against the
+registered remote data buffer.
+
+From host A's perspective, the transfer appears as a normal send/write
+operation, with the data stream redirected directly into the receiving
+application's buffer.
+
+
+Datagram Overview
+-----------------
+The rsocket API supports datagram sockets. Datagram support is handled through
+an entirely different protocol and internal implementation. Unlike connected
+rsockets, datagram rsockets are not necessarily bound to a network (IP)
+address. A datagram socket may use any number of network (IP) addresses,
+including those which map to different RDMA devices. As a result, a single
+datagram rsocket must support using multiple RDMA devices and ports, and a
+datagram rsocket references a single UDP socket, plus zero or more UD QPs.
+
+Rsockets uses headers inserted before user data sent over UDP sockets to
+resolve remote UD QP numbers. When a user first attempts to send a datagram
+to a remote address (IP and UDP port), rsockets will take the following steps:
+
+1. Store the destination address into a lookup table.
+2. Resolve which local network address should be used when sending
+   to the specified destination.
+3. Allocate a UD QP on the RDMA device associated with the local address.
+4. Send the user's datagram to the remote UDP socket.
+
+A header is inserted before the user's datagram. The header specifies the
+UD QP number associated with the local network address (IP and UDP port) of
+the send.
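+
+A minimal sketch of that last step: scatter-gather I/O lets an
+implementation attach a header to the user's datagram without an extra
+copy. This is an illustrative assumption about the mechanism, not
+rsocket's actual code; 'hdr' would point at the ds_udp_header defined
+below.
+
+#include <sys/socket.h>
+#include <sys/uio.h>
+
+/* Send 'hdr' immediately followed by 'data' as one UDP datagram. */
+static ssize_t ds_send_with_hdr(int udp_fd, const void *hdr, size_t hdr_len,
+				const void *data, size_t data_len,
+				const struct sockaddr *dest, socklen_t dest_len)
+{
+	struct iovec iov[2] = {
+		{ .iov_base = (void *) hdr,  .iov_len = hdr_len },
+		{ .iov_base = (void *) data, .iov_len = data_len },
+	};
+	struct msghdr msg = {
+		.msg_name = (void *) dest,
+		.msg_namelen = dest_len,
+		.msg_iov = iov,
+		.msg_iovlen = 2,
+	};
+
+	return sendmsg(udp_fd, &msg, 0);
+}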
+
+A service thread is used to process messages received on the UDP socket. This
+thread updates the rsocket lookup tables with the remote QPN and path record
+data. The service thread forwards data received on the UDP socket to an
+rsocket QP. After the remote QPN and path records have been resolved, datagram
+communication between two nodes is done over the UD QP.
+
+UDP Message Format
+------------------
+Rsockets uses messages exchanged over UDP sockets to resolve remote QP numbers.
+If a user sends a datagram to a remote service and the local rsocket is not
+yet configured to send directly to a remote UD QP, the user data is sent over
+a UDP socket with the following header inserted before the user data.
+
+struct ds_udp_header {
+	uint32_t tag;
+	uint8_t version;
+	uint8_t op;
+	uint8_t length;
+	uint8_t reserved;
+	uint32_t qpn;  /* lower 8 bits reserved */
+	union {
+		uint32_t ipv4;
+		uint8_t ipv6[16];
+	} addr;
+};
+
+Tag - Marker used to help identify that the UDP header is present.
+#define DS_UDP_TAG 0x55555555
+
+Version - IP address version, either 4 or 6
+Op - Indicates message type, used to control the receiver's operation.
+	Valid operations are RS_OP_DATA and RS_OP_CTRL. Data messages
+	carry user data, while control messages are used to reply with the
+	local QP number.
+Length - Size of the UDP header.
+QPN - UD QP number associated with sender's IP address and port.
+	The sender's address and port are extracted from the received UDP
+	datagram.
+Addr - Target IP address of the sent datagram.
+
+Once the remote QP information has been resolved, data is sent directly
+between UD QPs. The following header is inserted before any user data that
+is transferred over a UD QP.
+
+struct ds_header {
+	uint8_t version;
+	uint8_t length;
+	uint16_t port;
+	union {
+		uint32_t ipv4;
+		struct {
+			uint32_t flowinfo;
+			uint8_t addr[16];
+		} ipv6;
+	} addr;
+};
+
+Version - IP address version
+Length - Size of the header
+Port - Associated source address UDP port
+Addr - Associated source IP address
\ No newline at end of file
diff --git a/librdmacm/examples/CMakeLists.txt b/librdmacm/examples/CMakeLists.txt
new file mode 100644
index 0000000..46347b6
--- /dev/null
+++ b/librdmacm/examples/CMakeLists.txt
@@ -0,0 +1,43 @@
+# Shared example files
+add_library(rdmacm_tools STATIC
+  common.c
+  )
+
+rdma_executable(cmtime cmtime.c)
+target_link_libraries(cmtime LINK_PRIVATE rdmacm ${CMAKE_THREAD_LIBS_INIT} rdmacm_tools)
+
+rdma_executable(mckey mckey.c)
+target_link_libraries(mckey LINK_PRIVATE rdmacm ${CMAKE_THREAD_LIBS_INIT} rdmacm_tools)
+
+rdma_executable(rcopy rcopy.c)
+target_link_libraries(rcopy LINK_PRIVATE rdmacm rdmacm_tools)
+
+rdma_executable(rdma_client rdma_client.c)
+target_link_libraries(rdma_client LINK_PRIVATE rdmacm)
+
+rdma_executable(rdma_server rdma_server.c)
+target_link_libraries(rdma_server LINK_PRIVATE rdmacm)
+
+rdma_executable(rdma_xclient rdma_xclient.c)
+target_link_libraries(rdma_xclient LINK_PRIVATE rdmacm)
+
+rdma_executable(rdma_xserver rdma_xserver.c)
+target_link_libraries(rdma_xserver LINK_PRIVATE rdmacm)
+
+rdma_executable(riostream riostream.c)
+target_link_libraries(riostream LINK_PRIVATE rdmacm rdmacm_tools)
+
+rdma_executable(rping rping.c)
+target_link_libraries(rping LINK_PRIVATE rdmacm ${CMAKE_THREAD_LIBS_INIT} rdmacm_tools)
+
+rdma_executable(rstream rstream.c)
+target_link_libraries(rstream LINK_PRIVATE rdmacm rdmacm_tools)
+
+rdma_executable(ucmatose cmatose.c)
+target_link_libraries(ucmatose LINK_PRIVATE rdmacm rdmacm_tools)
+
+rdma_executable(udaddy udaddy.c)
+target_link_libraries(udaddy LINK_PRIVATE rdmacm rdmacm_tools)
+
+rdma_executable(udpong udpong.c)
+target_link_libraries(udpong LINK_PRIVATE rdmacm 
diff --git a/librdmacm/examples/cmatose.c b/librdmacm/examples/cmatose.c
new file mode 100644
index 0000000..1f7f315
--- /dev/null
+++ b/librdmacm/examples/cmatose.c
@@ -0,0 +1,745 @@
+/*
+ * Copyright (c) 2005-2006,2011-2012 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <getopt.h>
+
+#include <rdma/rdma_cma.h>
+#include "common.h"
+
+struct cmatest_node {
+	int id;
+	struct rdma_cm_id *cma_id;
+	int connected;
+	struct ibv_pd *pd;
+	struct ibv_cq *cq[2];
+	struct ibv_mr *mr;
+	void *mem;
+};
+
+enum CQ_INDEX {
+	SEND_CQ_INDEX,
+	RECV_CQ_INDEX
+};
+
+struct cmatest {
+	struct rdma_event_channel *channel;
+	struct cmatest_node *nodes;
+	int conn_index;
+	int connects_left;
+	int disconnects_left;
+
+	struct rdma_addrinfo *rai;
+};
+
+static struct cmatest test;
+static int connections = 1;
+static int message_size = 100;
+static int message_count = 10;
+static const char *port = "7471";
+static uint8_t set_tos = 0;
+static uint8_t tos;
+static uint8_t migrate = 0;
+static char *dst_addr;
+static char *src_addr;
+static struct rdma_addrinfo hints;
+static uint8_t set_timeout;
+static uint8_t timeout;
+
+static int create_message(struct cmatest_node *node)
+{
+	if (!message_size)
+		message_count = 0;
+
+	if (!message_count)
+		return 0;
+
+	node->mem = malloc(message_size);
+	if (!node->mem) {
+		printf("failed message allocation\n");
+		return -1;
+	}
+	node->mr = ibv_reg_mr(node->pd, node->mem, message_size,
+			      IBV_ACCESS_LOCAL_WRITE);
+	if (!node->mr) {
+		printf("failed to reg MR\n");
+		goto err;
+	}
+	return 0;
+err:
+	free(node->mem);
+	return -1;
+}
+
+static int init_node(struct cmatest_node *node)
+{
+	struct ibv_qp_init_attr init_qp_attr;
+	int cqe, ret;
+
+	node->pd = ibv_alloc_pd(node->cma_id->verbs);
+	if (!node->pd) {
+		ret = -ENOMEM;
+		printf("cmatose: unable to allocate PD\n");
+		goto out;
+	}
+
+	cqe = message_count ?
message_count : 1; + node->cq[SEND_CQ_INDEX] = ibv_create_cq(node->cma_id->verbs, cqe, node, NULL, 0); + node->cq[RECV_CQ_INDEX] = ibv_create_cq(node->cma_id->verbs, cqe, node, NULL, 0); + if (!node->cq[SEND_CQ_INDEX] || !node->cq[RECV_CQ_INDEX]) { + ret = -ENOMEM; + printf("cmatose: unable to create CQ\n"); + goto out; + } + + memset(&init_qp_attr, 0, sizeof init_qp_attr); + init_qp_attr.cap.max_send_wr = cqe; + init_qp_attr.cap.max_recv_wr = cqe; + init_qp_attr.cap.max_send_sge = 1; + init_qp_attr.cap.max_recv_sge = 1; + init_qp_attr.qp_context = node; + init_qp_attr.sq_sig_all = 1; + init_qp_attr.qp_type = IBV_QPT_RC; + init_qp_attr.send_cq = node->cq[SEND_CQ_INDEX]; + init_qp_attr.recv_cq = node->cq[RECV_CQ_INDEX]; + ret = rdma_create_qp(node->cma_id, node->pd, &init_qp_attr); + if (ret) { + perror("cmatose: unable to create QP"); + goto out; + } + + ret = create_message(node); + if (ret) { + printf("cmatose: failed to create messages: %d\n", ret); + goto out; + } +out: + return ret; +} + +static int post_recvs(struct cmatest_node *node) +{ + struct ibv_recv_wr recv_wr, *recv_failure; + struct ibv_sge sge; + int i, ret = 0; + + if (!message_count) + return 0; + + recv_wr.next = NULL; + recv_wr.sg_list = &sge; + recv_wr.num_sge = 1; + recv_wr.wr_id = (uintptr_t) node; + + sge.length = message_size; + sge.lkey = node->mr->lkey; + sge.addr = (uintptr_t) node->mem; + + for (i = 0; i < message_count && !ret; i++ ) { + ret = ibv_post_recv(node->cma_id->qp, &recv_wr, &recv_failure); + if (ret) { + printf("failed to post receives: %d\n", ret); + break; + } + } + return ret; +} + +static int post_sends(struct cmatest_node *node) +{ + struct ibv_send_wr send_wr, *bad_send_wr; + struct ibv_sge sge; + int i, ret = 0; + + if (!node->connected || !message_count) + return 0; + + send_wr.next = NULL; + send_wr.sg_list = &sge; + send_wr.num_sge = 1; + send_wr.opcode = IBV_WR_SEND; + send_wr.send_flags = 0; + send_wr.wr_id = (unsigned long)node; + + sge.length = message_size; + sge.lkey = node->mr->lkey; + sge.addr = (uintptr_t) node->mem; + + for (i = 0; i < message_count && !ret; i++) { + ret = ibv_post_send(node->cma_id->qp, &send_wr, &bad_send_wr); + if (ret) + printf("failed to post sends: %d\n", ret); + } + return ret; +} + +static void connect_error(void) +{ + test.connects_left--; +} + +static int addr_handler(struct cmatest_node *node) +{ + int ret; + + if (set_tos) { + ret = rdma_set_option(node->cma_id, RDMA_OPTION_ID, + RDMA_OPTION_ID_TOS, &tos, sizeof tos); + if (ret) + perror("cmatose: set TOS option failed"); + } + if (set_timeout) { + ret = rdma_set_option(node->cma_id, RDMA_OPTION_ID, + RDMA_OPTION_ID_ACK_TIMEOUT, + &timeout, sizeof(timeout)); + if (ret) + perror("cmatose: set ack timeout option failed"); + } + ret = rdma_resolve_route(node->cma_id, 2000); + if (ret) { + perror("cmatose: resolve route failed"); + connect_error(); + } + return ret; +} + +static int route_handler(struct cmatest_node *node) +{ + struct rdma_conn_param conn_param; + int ret; + + ret = init_node(node); + if (ret) + goto err; + + ret = post_recvs(node); + if (ret) + goto err; + + memset(&conn_param, 0, sizeof conn_param); + conn_param.responder_resources = 1; + conn_param.initiator_depth = 1; + conn_param.retry_count = 5; + conn_param.private_data = test.rai->ai_connect; + conn_param.private_data_len = test.rai->ai_connect_len; + ret = rdma_connect(node->cma_id, &conn_param); + if (ret) { + perror("cmatose: failure connecting"); + goto err; + } + return 0; +err: + connect_error(); + return ret; +} + +static 
int connect_handler(struct rdma_cm_id *cma_id) +{ + struct cmatest_node *node; + int ret; + + if (test.conn_index == connections) { + ret = -ENOMEM; + goto err1; + } + node = &test.nodes[test.conn_index++]; + + node->cma_id = cma_id; + cma_id->context = node; + + ret = init_node(node); + if (ret) + goto err2; + + if (set_timeout) { + ret = rdma_set_option(node->cma_id, RDMA_OPTION_ID, + RDMA_OPTION_ID_ACK_TIMEOUT, + &timeout, sizeof(timeout)); + if (ret) + perror("cmatose: set ack timeout option failed"); + } + ret = post_recvs(node); + if (ret) + goto err2; + + ret = rdma_accept(node->cma_id, NULL); + if (ret) { + perror("cmatose: failure accepting"); + goto err2; + } + return 0; + +err2: + node->cma_id = NULL; + connect_error(); +err1: + printf("cmatose: failing connection request\n"); + rdma_reject(cma_id, NULL, 0); + return ret; +} + +static int cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) +{ + int ret = 0; + + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + ret = addr_handler(cma_id->context); + break; + case RDMA_CM_EVENT_ROUTE_RESOLVED: + ret = route_handler(cma_id->context); + break; + case RDMA_CM_EVENT_CONNECT_REQUEST: + ret = connect_handler(cma_id); + break; + case RDMA_CM_EVENT_ESTABLISHED: + ((struct cmatest_node *) cma_id->context)->connected = 1; + test.connects_left--; + test.disconnects_left++; + break; + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_REJECTED: + printf("cmatose: event: %s, error: %d\n", + rdma_event_str(event->event), event->status); + connect_error(); + ret = event->status; + break; + case RDMA_CM_EVENT_DISCONNECTED: + rdma_disconnect(cma_id); + test.disconnects_left--; + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + /* Cleanup will occur after test completes. 
*/ + break; + default: + break; + } + return ret; +} + +static void destroy_node(struct cmatest_node *node) +{ + if (!node->cma_id) + return; + + if (node->cma_id->qp) + rdma_destroy_qp(node->cma_id); + + if (node->cq[SEND_CQ_INDEX]) + ibv_destroy_cq(node->cq[SEND_CQ_INDEX]); + + if (node->cq[RECV_CQ_INDEX]) + ibv_destroy_cq(node->cq[RECV_CQ_INDEX]); + + if (node->mem) { + ibv_dereg_mr(node->mr); + free(node->mem); + } + + if (node->pd) + ibv_dealloc_pd(node->pd); + + /* Destroy the RDMA ID after all device resources */ + rdma_destroy_id(node->cma_id); +} + +static int alloc_nodes(void) +{ + int ret, i; + + test.nodes = malloc(sizeof *test.nodes * connections); + if (!test.nodes) { + printf("cmatose: unable to allocate memory for test nodes\n"); + return -ENOMEM; + } + memset(test.nodes, 0, sizeof *test.nodes * connections); + + for (i = 0; i < connections; i++) { + test.nodes[i].id = i; + if (dst_addr) { + ret = rdma_create_id(test.channel, + &test.nodes[i].cma_id, + &test.nodes[i], hints.ai_port_space); + if (ret) + goto err; + } + } + return 0; +err: + while (--i >= 0) + rdma_destroy_id(test.nodes[i].cma_id); + free(test.nodes); + return ret; +} + +static void destroy_nodes(void) +{ + int i; + + for (i = 0; i < connections; i++) + destroy_node(&test.nodes[i]); + free(test.nodes); +} + +static int poll_cqs(enum CQ_INDEX index) +{ + struct ibv_wc wc[8]; + int done, i, ret; + + for (i = 0; i < connections; i++) { + if (!test.nodes[i].connected) + continue; + + for (done = 0; done < message_count; done += ret) { + ret = ibv_poll_cq(test.nodes[i].cq[index], 8, wc); + if (ret < 0) { + printf("cmatose: failed polling CQ: %d\n", ret); + return ret; + } + } + } + return 0; +} + +static int connect_events(void) +{ + struct rdma_cm_event *event; + int ret = 0; + + while (test.connects_left && !ret) { + ret = rdma_get_cm_event(test.channel, &event); + if (!ret) { + ret = cma_handler(event->id, event); + rdma_ack_cm_event(event); + } else { + perror("cmatose: failure in rdma_get_cm_event in connect events"); + ret = errno; + } + } + + return ret; +} + +static int disconnect_events(void) +{ + struct rdma_cm_event *event; + int ret = 0; + + while (test.disconnects_left && !ret) { + ret = rdma_get_cm_event(test.channel, &event); + if (!ret) { + ret = cma_handler(event->id, event); + rdma_ack_cm_event(event); + } else { + perror("cmatose: failure in rdma_get_cm_event in disconnect events"); + ret = errno; + } + } + + return ret; +} + +static int migrate_channel(struct rdma_cm_id *listen_id) +{ + struct rdma_event_channel *channel; + int i, ret; + + printf("migrating to new event channel\n"); + + channel = rdma_create_event_channel(); + if (!channel) { + perror("cmatose: failed to create event channel"); + return -1; + } + + ret = 0; + if (listen_id) + ret = rdma_migrate_id(listen_id, channel); + + for (i = 0; i < connections && !ret; i++) + ret = rdma_migrate_id(test.nodes[i].cma_id, channel); + + if (!ret) { + rdma_destroy_event_channel(test.channel); + test.channel = channel; + } else + perror("cmatose: failure migrating to channel"); + + return ret; +} + +static int run_server(void) +{ + struct rdma_cm_id *listen_id; + int i, ret; + + printf("cmatose: starting server\n"); + ret = rdma_create_id(test.channel, &listen_id, &test, hints.ai_port_space); + if (ret) { + perror("cmatose: listen request failed"); + return ret; + } + + ret = get_rdma_addr(src_addr, dst_addr, port, &hints, &test.rai); + if (ret) { + printf("cmatose: getrdmaaddr error: %s\n", gai_strerror(ret)); + goto out; + } + + ret = 
rdma_bind_addr(listen_id, test.rai->ai_src_addr); + if (ret) { + perror("cmatose: bind address failed"); + goto out; + } + + ret = rdma_listen(listen_id, 0); + if (ret) { + perror("cmatose: failure trying to listen"); + goto out; + } + + ret = connect_events(); + if (ret) + goto out; + + if (message_count) { + printf("initiating data transfers\n"); + for (i = 0; i < connections; i++) { + ret = post_sends(&test.nodes[i]); + if (ret) + goto out; + } + + printf("completing sends\n"); + ret = poll_cqs(SEND_CQ_INDEX); + if (ret) + goto out; + + printf("receiving data transfers\n"); + ret = poll_cqs(RECV_CQ_INDEX); + if (ret) + goto out; + printf("data transfers complete\n"); + + } + + if (migrate) { + ret = migrate_channel(listen_id); + if (ret) + goto out; + } + + printf("cmatose: disconnecting\n"); + for (i = 0; i < connections; i++) { + if (!test.nodes[i].connected) + continue; + + test.nodes[i].connected = 0; + rdma_disconnect(test.nodes[i].cma_id); + } + + ret = disconnect_events(); + + printf("disconnected\n"); + +out: + rdma_destroy_id(listen_id); + return ret; +} + +static int run_client(void) +{ + int i, ret, ret2; + + printf("cmatose: starting client\n"); + + ret = get_rdma_addr(src_addr, dst_addr, port, &hints, &test.rai); + if (ret) { + printf("cmatose: getaddrinfo error: %s\n", gai_strerror(ret)); + return ret; + } + + printf("cmatose: connecting\n"); + for (i = 0; i < connections; i++) { + ret = rdma_resolve_addr(test.nodes[i].cma_id, test.rai->ai_src_addr, + test.rai->ai_dst_addr, 2000); + if (ret) { + perror("cmatose: failure getting addr"); + connect_error(); + return ret; + } + } + + ret = connect_events(); + if (ret) + goto disc; + + if (message_count) { + printf("receiving data transfers\n"); + ret = poll_cqs(RECV_CQ_INDEX); + if (ret) + goto disc; + + printf("sending replies\n"); + for (i = 0; i < connections; i++) { + ret = post_sends(&test.nodes[i]); + if (ret) + goto disc; + } + + printf("data transfers complete\n"); + } + + ret = 0; + + if (migrate) { + ret = migrate_channel(NULL); + if (ret) + goto out; + } +disc: + ret2 = disconnect_events(); + if (ret2) + ret = ret2; +out: + return ret; +} + +int main(int argc, char **argv) +{ + int op, ret; + + hints.ai_port_space = RDMA_PS_TCP; + while ((op = getopt(argc, argv, "s:b:f:P:c:C:S:t:p:a:m")) != -1) { + switch (op) { + case 's': + dst_addr = optarg; + break; + case 'b': + src_addr = optarg; + break; + case 'f': + if (!strncasecmp("ip", optarg, 2)) { + hints.ai_flags = RAI_NUMERICHOST; + } else if (!strncasecmp("gid", optarg, 3)) { + hints.ai_flags = RAI_NUMERICHOST | RAI_FAMILY; + hints.ai_family = AF_IB; + } else if (strncasecmp("name", optarg, 4)) { + fprintf(stderr, "Warning: unknown address format\n"); + } + break; + case 'P': + if (!strncasecmp("ib", optarg, 2)) { + hints.ai_port_space = RDMA_PS_IB; + } else if (strncasecmp("tcp", optarg, 3)) { + fprintf(stderr, "Warning: unknown port space format\n"); + } + break; + case 'c': + connections = atoi(optarg); + break; + case 'C': + message_count = atoi(optarg); + break; + case 'S': + message_size = atoi(optarg); + break; + case 't': + set_tos = 1; + tos = (uint8_t) strtoul(optarg, NULL, 0); + break; + case 'p': + port = optarg; + break; + case 'm': + migrate = 1; + break; + case 'a': + set_timeout = 1; + timeout = (uint8_t) strtoul(optarg, NULL, 0); + break; + default: + printf("usage: %s\n", argv[0]); + printf("\t[-s server_address]\n"); + printf("\t[-b bind_address]\n"); + printf("\t[-f address_format]\n"); + printf("\t name, ip, ipv6, or gid\n"); + printf("\t[-P 
port_space]\n");
+			printf("\t    tcp or ib\n");
+			printf("\t[-c connections]\n");
+			printf("\t[-C message_count]\n");
+			printf("\t[-S message_size]\n");
+			printf("\t[-t type_of_service]\n");
+			printf("\t[-p port_number]\n");
+			printf("\t[-m(igrate)]\n");
+			printf("\t[-a ack_timeout]\n");
+			exit(1);
+		}
+	}
+
+	test.connects_left = connections;
+
+	test.channel = create_first_event_channel();
+	if (!test.channel) {
+		exit(1);
+	}
+
+	if (alloc_nodes())
+		exit(1);
+
+	if (dst_addr) {
+		ret = run_client();
+	} else {
+		hints.ai_flags |= RAI_PASSIVE;
+		ret = run_server();
+	}
+
+	printf("test complete\n");
+	destroy_nodes();
+	rdma_destroy_event_channel(test.channel);
+	if (test.rai)
+		rdma_freeaddrinfo(test.rai);
+
+	printf("return status %d\n", ret);
+	return ret;
+}
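Per the CMake fragment earlier, cmatose.c builds as the ucmatose binary. As a
usage illustration (the address below is a hypothetical example), the passive
side runs with no arguments and listens on the default port 7471, while the
active side names the server and optionally the connection count, message
count, and message size:

	ucmatose
	ucmatose -s 192.0.2.1 -c 4 -C 20 -S 256

Each client connection walks the resolve-address, resolve-route, connect
sequence driven by cma_handler(), after which both sides exchange
message_count messages and disconnect; -m additionally exercises
rdma_migrate_id() by moving every connection to a new event channel.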
diff --git a/librdmacm/examples/cmtime.c b/librdmacm/examples/cmtime.c
new file mode 100644
index 0000000..8d8a933
--- /dev/null
+++ b/librdmacm/examples/cmtime.c
@@ -0,0 +1,680 @@
+/*
+ * Copyright (c) 2013 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under the OpenIB.org BSD license
+ * below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AWV
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <errno.h>
+#include <getopt.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <netdb.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <netinet/tcp.h>
+
+#include <rdma/rdma_cma.h>
+#include "common.h"
+
+static struct rdma_addrinfo hints, *rai;
+static struct rdma_event_channel *channel;
+static const char *port = "7471";
+static char *dst_addr;
+static char *src_addr;
+static int timeout = 2000;
+static int retries = 2;
+
+enum step {
+	STEP_CREATE_ID,
+	STEP_BIND,
+	STEP_RESOLVE_ADDR,
+	STEP_RESOLVE_ROUTE,
+	STEP_CREATE_QP,
+	STEP_CONNECT,
+	STEP_DISCONNECT,
+	STEP_DESTROY,
+	STEP_CNT
+};
+
+static const char *step_str[] = {
+	"create id",
+	"bind addr",
+	"resolve addr",
+	"resolve route",
+	"create qp",
+	"connect",
+	"disconnect",
+	"destroy"
+};
+
+struct node {
+	struct rdma_cm_id *id;
+	struct timeval times[STEP_CNT][2];
+	int error;
+	int retries;
+};
+
+struct list_head {
+	struct list_head *prev;
+	struct list_head *next;
+	struct rdma_cm_id *id;
+};
+
+struct work_list {
+	pthread_mutex_t lock;
+	pthread_cond_t cond;
+	struct list_head list;
+};
+
+#define INIT_LIST(x) ((x)->prev = (x)->next = (x))
+
+static struct work_list req_work;
+static struct work_list disc_work;
+static struct node *nodes;
+static struct timeval times[STEP_CNT][2];
+static int connections = 100;
+static volatile int started[STEP_CNT];
+static volatile int completed[STEP_CNT];
+static struct ibv_qp_init_attr init_qp_attr;
+static struct rdma_conn_param conn_param;
+
+#define start_perf(n, s)	gettimeofday(&((n)->times[s][0]), NULL)
+#define end_perf(n, s)		gettimeofday(&((n)->times[s][1]), NULL)
+#define start_time(s)		gettimeofday(&times[s][0], NULL)
+#define end_time(s)		gettimeofday(&times[s][1], NULL)
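/*
 * How the pieces below fit together: the CM event thread must never
 * block, so incoming connect and disconnect requests are handed off
 * through req_work/disc_work (intrusive lists guarded by a mutex and
 * condvar pair) to dedicated threads that make the blocking
 * rdma_accept() and rdma_disconnect() calls. The volatile started[]
 * and completed[] counters let run_client() spin with sched_yield()
 * until every asynchronous step finishes, while the start_/end_
 * macros timestamp each step per node and in aggregate for
 * show_perf().
 */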
+static inline void __list_delete(struct list_head *list)
+{
+	struct list_head *prev, *next;
+	prev = list->prev;
+	next = list->next;
+	prev->next = next;
+	next->prev = prev;
+	INIT_LIST(list);
+}
+
+static inline int __list_empty(struct work_list *list)
+{
+	return list->list.next == &list->list;
+}
+
+static inline struct list_head *__list_remove_head(struct work_list *work_list)
+{
+	struct list_head *list_item;
+
+	list_item = work_list->list.next;
+	__list_delete(list_item);
+	return list_item;
+}
+
+static inline void list_add_tail(struct work_list *work_list, struct list_head *req)
+{
+	int empty;
+	pthread_mutex_lock(&work_list->lock);
+	empty = __list_empty(work_list);
+	req->prev = work_list->list.prev;
+	req->next = &work_list->list;
+	req->prev->next = work_list->list.prev = req;
+	pthread_mutex_unlock(&work_list->lock);
+	if (empty)
+		pthread_cond_signal(&work_list->cond);
+}
+
+static int zero_time(struct timeval *t)
+{
+	return !(t->tv_sec || t->tv_usec);
+}
+
+static float diff_us(struct timeval *end, struct timeval *start)
+{
+	return (end->tv_sec - start->tv_sec) * 1000000. +
+		(end->tv_usec - start->tv_usec);
+}
+
+static void show_perf(void)
+{
+	int c, i;
+	float us, max[STEP_CNT], min[STEP_CNT];
+
+	for (i = 0; i < STEP_CNT; i++) {
+		max[i] = 0;
+		min[i] = 999999999.;
+		for (c = 0; c < connections; c++) {
+			if (!zero_time(&nodes[c].times[i][0]) &&
+			    !zero_time(&nodes[c].times[i][1])) {
+				us = diff_us(&nodes[c].times[i][1], &nodes[c].times[i][0]);
+				if (us > max[i])
+					max[i] = us;
+				if (us < min[i])
+					min[i] = us;
+			}
+		}
+	}
+
+	printf("step              total ms     max ms     min us  us / conn\n");
+	for (i = 0; i < STEP_CNT; i++) {
+		if (i == STEP_BIND && !src_addr)
+			continue;
+
+		us = diff_us(&times[i][1], &times[i][0]);
+		printf("%-13s: %11.2f%11.2f%11.2f%11.2f\n", step_str[i], us / 1000.,
+			max[i] / 1000., min[i], us / connections);
+	}
+}
+
+static void addr_handler(struct node *n)
+{
+	end_perf(n, STEP_RESOLVE_ADDR);
+	completed[STEP_RESOLVE_ADDR]++;
+}
+
+static void route_handler(struct node *n)
+{
+	end_perf(n, STEP_RESOLVE_ROUTE);
+	completed[STEP_RESOLVE_ROUTE]++;
+}
+
+static void conn_handler(struct node *n)
+{
+	end_perf(n, STEP_CONNECT);
+	completed[STEP_CONNECT]++;
+}
+
+static void disc_handler(struct node *n)
+{
+	end_perf(n, STEP_DISCONNECT);
+	completed[STEP_DISCONNECT]++;
+}
+
+static void __req_handler(struct rdma_cm_id *id)
+{
+	int ret;
+
+	ret = rdma_create_qp(id, NULL, &init_qp_attr);
+	if (ret) {
+		perror("failure creating qp");
+		goto err1;
+	}
+
+	ret = rdma_accept(id, NULL);
+	if (ret) {
+		perror("failure accepting");
+		goto err2;
+	}
+	return;
+err2:
+	rdma_destroy_qp(id);
+err1:
+	printf("failing connection request\n");
+	rdma_reject(id, NULL, 0);
+	rdma_destroy_id(id);
+	return;
+}
+
+static void *req_handler_thread(void *arg)
+{
+	struct list_head *work;
+	do {
+		pthread_mutex_lock(&req_work.lock);
+		if (__list_empty(&req_work))
+			pthread_cond_wait(&req_work.cond, &req_work.lock);
+		work = __list_remove_head(&req_work);
+		pthread_mutex_unlock(&req_work.lock);
+		__req_handler(work->id);
+		free(work);
+	} while (1);
+	return NULL;
+}
+
+static void *disc_handler_thread(void *arg)
+{
+	struct list_head *work;
+	do {
+		pthread_mutex_lock(&disc_work.lock);
+		if (__list_empty(&disc_work))
+			pthread_cond_wait(&disc_work.cond, &disc_work.lock);
+		work = __list_remove_head(&disc_work);
+		pthread_mutex_unlock(&disc_work.lock);
+		rdma_disconnect(work->id);
+		rdma_destroy_qp(work->id);
+		rdma_destroy_id(work->id);
+		free(work);
+	} while (1);
+	return NULL;
+}
+
+static void cma_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
+{
+	struct node *n = id->context;
+	struct list_head *request;
+
+	switch (event->event) {
+	case RDMA_CM_EVENT_ADDR_RESOLVED:
+		addr_handler(n);
+		break;
+	case RDMA_CM_EVENT_ROUTE_RESOLVED:
+		route_handler(n);
+		break;
+	case RDMA_CM_EVENT_CONNECT_REQUEST:
+		request = malloc(sizeof *request);
+		if (!request) {
+			perror("out of memory accepting connect request");
+			rdma_reject(id, NULL, 0);
+			rdma_destroy_id(id);
+		} else {
+			INIT_LIST(request);
+			request->id = id;
+			list_add_tail(&req_work, request);
+		}
+		break;
+	case RDMA_CM_EVENT_ESTABLISHED:
+		if (n)
+			conn_handler(n);
+		break;
+	case RDMA_CM_EVENT_ADDR_ERROR:
+		if (n->retries--) {
+			if (!rdma_resolve_addr(n->id, rai->ai_src_addr,
+					       rai->ai_dst_addr, timeout))
+				break;
+		}
+		printf("RDMA_CM_EVENT_ADDR_ERROR, error: %d\n", event->status);
+		addr_handler(n);
+		n->error = 1;
+		break;
+	case RDMA_CM_EVENT_ROUTE_ERROR:
+		if (n->retries--) {
+			if (!rdma_resolve_route(n->id, timeout))
+				break;
+		}
+		printf("RDMA_CM_EVENT_ROUTE_ERROR, error: %d\n", event->status);
route_handler(n); + n->error = 1; + break; + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_REJECTED: + printf("event: %s, error: %d\n", + rdma_event_str(event->event), event->status); + conn_handler(n); + n->error = 1; + break; + case RDMA_CM_EVENT_DISCONNECTED: + if (!n) { + request = malloc(sizeof *request); + if (!request) { + perror("out of memory queueing disconnect request, handling synchronously"); + rdma_disconnect(id); + rdma_destroy_qp(id); + rdma_destroy_id(id); + } else { + INIT_LIST(request); + request->id = id; + list_add_tail(&disc_work, request); + } + } else + disc_handler(n); + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + /* Cleanup will occur after test completes. */ + break; + default: + break; + } + rdma_ack_cm_event(event); +} + +static int alloc_nodes(void) +{ + int ret, i; + + nodes = calloc(sizeof *nodes, connections); + if (!nodes) + return -ENOMEM; + + printf("creating id\n"); + start_time(STEP_CREATE_ID); + for (i = 0; i < connections; i++) { + start_perf(&nodes[i], STEP_CREATE_ID); + if (dst_addr) { + ret = rdma_create_id(channel, &nodes[i].id, &nodes[i], + hints.ai_port_space); + if (ret) + goto err; + } + end_perf(&nodes[i], STEP_CREATE_ID); + } + end_time(STEP_CREATE_ID); + return 0; + +err: + while (--i >= 0) + rdma_destroy_id(nodes[i].id); + free(nodes); + return ret; +} + +static void cleanup_nodes(void) +{ + int i; + + printf("destroying id\n"); + start_time(STEP_DESTROY); + for (i = 0; i < connections; i++) { + start_perf(&nodes[i], STEP_DESTROY); + if (nodes[i].id) + rdma_destroy_id(nodes[i].id); + end_perf(&nodes[i], STEP_DESTROY); + } + end_time(STEP_DESTROY); +} + +static void *process_events(void *arg) +{ + struct rdma_cm_event *event; + int ret = 0; + + while (!ret) { + ret = rdma_get_cm_event(channel, &event); + if (!ret) { + cma_handler(event->id, event); + } else { + perror("failure in rdma_get_cm_event in process_server_events"); + ret = errno; + } + } + return NULL; +} + +static int run_server(void) +{ + pthread_t req_thread, disc_thread; + struct rdma_cm_id *listen_id; + int ret; + + INIT_LIST(&req_work.list); + INIT_LIST(&disc_work.list); + ret = pthread_mutex_init(&req_work.lock, NULL); + if (ret) { + perror("initializing mutex for req work"); + return ret; + } + + ret = pthread_mutex_init(&disc_work.lock, NULL); + if (ret) { + perror("initializing mutex for disc work"); + return ret; + } + + ret = pthread_cond_init(&req_work.cond, NULL); + if (ret) { + perror("initializing cond for req work"); + return ret; + } + + ret = pthread_cond_init(&disc_work.cond, NULL); + if (ret) { + perror("initializing cond for disc work"); + return ret; + } + + ret = pthread_create(&req_thread, NULL, req_handler_thread, NULL); + if (ret) { + perror("failed to create req handler thread"); + return ret; + } + + ret = pthread_create(&disc_thread, NULL, disc_handler_thread, NULL); + if (ret) { + perror("failed to create disconnect handler thread"); + return ret; + } + + ret = rdma_create_id(channel, &listen_id, NULL, hints.ai_port_space); + if (ret) { + perror("listen request failed"); + return ret; + } + + ret = get_rdma_addr(src_addr, dst_addr, port, &hints, &rai); + if (ret) { + printf("getrdmaaddr error: %s\n", gai_strerror(ret)); + goto out; + } + + ret = rdma_bind_addr(listen_id, rai->ai_src_addr); + if (ret) { + perror("bind address failed"); + goto out; + } + + ret = rdma_listen(listen_id, 0); + if (ret) { + perror("failure trying to listen"); + goto out; + } + + process_events(NULL); + out: + 
rdma_destroy_id(listen_id); + return ret; +} + +static int run_client(void) +{ + pthread_t event_thread; + int i, ret; + + ret = get_rdma_addr(src_addr, dst_addr, port, &hints, &rai); + if (ret) { + printf("getaddrinfo error: %s\n", gai_strerror(ret)); + return ret; + } + + conn_param.responder_resources = 1; + conn_param.initiator_depth = 1; + conn_param.retry_count = retries; + conn_param.private_data = rai->ai_connect; + conn_param.private_data_len = rai->ai_connect_len; + + ret = pthread_create(&event_thread, NULL, process_events, NULL); + if (ret) { + perror("failure creating event thread"); + return ret; + } + + if (src_addr) { + printf("binding source address\n"); + start_time(STEP_BIND); + for (i = 0; i < connections; i++) { + start_perf(&nodes[i], STEP_BIND); + ret = rdma_bind_addr(nodes[i].id, rai->ai_src_addr); + if (ret) { + perror("failure bind addr"); + nodes[i].error = 1; + continue; + } + end_perf(&nodes[i], STEP_BIND); + } + end_time(STEP_BIND); + } + + printf("resolving address\n"); + start_time(STEP_RESOLVE_ADDR); + for (i = 0; i < connections; i++) { + if (nodes[i].error) + continue; + nodes[i].retries = retries; + start_perf(&nodes[i], STEP_RESOLVE_ADDR); + ret = rdma_resolve_addr(nodes[i].id, rai->ai_src_addr, + rai->ai_dst_addr, timeout); + if (ret) { + perror("failure getting addr"); + nodes[i].error = 1; + continue; + } + started[STEP_RESOLVE_ADDR]++; + } + while (started[STEP_RESOLVE_ADDR] != completed[STEP_RESOLVE_ADDR]) sched_yield(); + end_time(STEP_RESOLVE_ADDR); + + printf("resolving route\n"); + start_time(STEP_RESOLVE_ROUTE); + for (i = 0; i < connections; i++) { + if (nodes[i].error) + continue; + nodes[i].retries = retries; + start_perf(&nodes[i], STEP_RESOLVE_ROUTE); + ret = rdma_resolve_route(nodes[i].id, timeout); + if (ret) { + perror("failure resolving route"); + nodes[i].error = 1; + continue; + } + started[STEP_RESOLVE_ROUTE]++; + } + while (started[STEP_RESOLVE_ROUTE] != completed[STEP_RESOLVE_ROUTE]) sched_yield(); + end_time(STEP_RESOLVE_ROUTE); + + printf("creating qp\n"); + start_time(STEP_CREATE_QP); + for (i = 0; i < connections; i++) { + if (nodes[i].error) + continue; + start_perf(&nodes[i], STEP_CREATE_QP); + ret = rdma_create_qp(nodes[i].id, NULL, &init_qp_attr); + if (ret) { + perror("failure creating qp"); + nodes[i].error = 1; + continue; + } + end_perf(&nodes[i], STEP_CREATE_QP); + } + end_time(STEP_CREATE_QP); + + printf("connecting\n"); + start_time(STEP_CONNECT); + for (i = 0; i < connections; i++) { + if (nodes[i].error) + continue; + start_perf(&nodes[i], STEP_CONNECT); + ret = rdma_connect(nodes[i].id, &conn_param); + if (ret) { + perror("failure rconnecting"); + nodes[i].error = 1; + continue; + } + started[STEP_CONNECT]++; + } + while (started[STEP_CONNECT] != completed[STEP_CONNECT]) sched_yield(); + end_time(STEP_CONNECT); + + printf("disconnecting\n"); + start_time(STEP_DISCONNECT); + for (i = 0; i < connections; i++) { + if (nodes[i].error) + continue; + start_perf(&nodes[i], STEP_DISCONNECT); + rdma_disconnect(nodes[i].id); + rdma_destroy_qp(nodes[i].id); + started[STEP_DISCONNECT]++; + } + while (started[STEP_DISCONNECT] != completed[STEP_DISCONNECT]) sched_yield(); + end_time(STEP_DISCONNECT); + + return ret; +} + +int main(int argc, char **argv) +{ + int op, ret; + + hints.ai_port_space = RDMA_PS_TCP; + hints.ai_qp_type = IBV_QPT_RC; + while ((op = getopt(argc, argv, "s:b:c:p:r:t:")) != -1) { + switch (op) { + case 's': + dst_addr = optarg; + break; + case 'b': + src_addr = optarg; + break; + case 'c': + connections = 
atoi(optarg); + break; + case 'p': + port = optarg; + break; + case 'r': + retries = atoi(optarg); + break; + case 't': + timeout = atoi(optarg); + break; + default: + printf("usage: %s\n", argv[0]); + printf("\t[-s server_address]\n"); + printf("\t[-b bind_address]\n"); + printf("\t[-c connections]\n"); + printf("\t[-p port_number]\n"); + printf("\t[-r retries]\n"); + printf("\t[-t timeout_ms]\n"); + exit(1); + } + } + + init_qp_attr.cap.max_send_wr = 1; + init_qp_attr.cap.max_recv_wr = 1; + init_qp_attr.cap.max_send_sge = 1; + init_qp_attr.cap.max_recv_sge = 1; + init_qp_attr.qp_type = IBV_QPT_RC; + + channel = create_first_event_channel(); + if (!channel) { + exit(1); + } + + if (dst_addr) { + alloc_nodes(); + ret = run_client(); + } else { + hints.ai_flags |= RAI_PASSIVE; + ret = run_server(); + } + + cleanup_nodes(); + rdma_destroy_event_channel(channel); + if (rai) + rdma_freeaddrinfo(rai); + + show_perf(); + free(nodes); + return ret; +} diff --git a/librdmacm/examples/common.c b/librdmacm/examples/common.c new file mode 100644 index 0000000..ad05772 --- /dev/null +++ b/librdmacm/examples/common.c @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2005-2006,2012 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * $Id$ + */ + +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <errno.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netdb.h> + +#include <rdma/rdma_cma.h> +#include "common.h" + +int use_rs = 1; + +int get_rdma_addr(const char *src, const char *dst, const char *port, + struct rdma_addrinfo *hints, struct rdma_addrinfo **rai) +{ + struct rdma_addrinfo rai_hints, *res; + int ret; + + if (hints->ai_flags & RAI_PASSIVE) + return rdma_getaddrinfo(src, port, hints, rai); + + rai_hints = *hints; + if (src) { + rai_hints.ai_flags |= RAI_PASSIVE; + ret = rdma_getaddrinfo(src, NULL, &rai_hints, &res); + if (ret) + return ret; + + rai_hints.ai_src_addr = res->ai_src_addr; + rai_hints.ai_src_len = res->ai_src_len; + rai_hints.ai_flags &= ~RAI_PASSIVE; + } + + ret = rdma_getaddrinfo(dst, port, &rai_hints, rai); + if (src) + rdma_freeaddrinfo(res); + + return ret; +} + +void size_str(char *str, size_t ssize, long long size) +{ + long long base, fraction = 0; + char mag; + + if (size >= (1 << 30)) { + base = 1 << 30; + mag = 'g'; + } else if (size >= (1 << 20)) { + base = 1 << 20; + mag = 'm'; + } else if (size >= (1 << 10)) { + base = 1 << 10; + mag = 'k'; + } else { + base = 1; + mag = '\0'; + } + + if (size / base < 10) + fraction = (size % base) * 10 / base; + if (fraction) { + snprintf(str, ssize, "%lld.%lld%c", size / base, fraction, mag); + } else { + snprintf(str, ssize, "%lld%c", size / base, mag); + } +} + +void cnt_str(char *str, size_t ssize, long long cnt) +{ + if (cnt >= 1000000000) + snprintf(str, ssize, "%lldb", cnt / 1000000000); + else if (cnt >= 1000000) + snprintf(str, ssize, "%lldm", cnt / 1000000); + else if (cnt >= 1000) + snprintf(str, ssize, "%lldk", cnt / 1000); + else + snprintf(str, ssize, "%lld", cnt); +} + +int size_to_count(int size) +{ + if (size >= (1 << 20)) + return 100; + else if (size >= (1 << 16)) + return 1000; + else if (size >= (1 << 10)) + return 10000; + else + return 100000; +} + +void format_buf(void *buf, int size) +{ + uint8_t *array = buf; + static uint8_t data; + int i; + + for (i = 0; i < size; i++) + array[i] = data++; +} + +int verify_buf(void *buf, int size) +{ + static long long total_bytes; + uint8_t *array = buf; + static uint8_t data; + int i; + + for (i = 0; i < size; i++, total_bytes++) { + if (array[i] != data++) { + printf("data verification failed byte %lld\n", total_bytes); + return -1; + } + } + return 0; +} + +int do_poll(struct pollfd *fds, int timeout) +{ + int ret; + + do { + ret = rs_poll(fds, 1, timeout); + } while (!ret); + + return ret == 1 ? (fds->revents & (POLLERR | POLLHUP)) : ret; +} + +struct rdma_event_channel *create_first_event_channel(void) +{ + struct rdma_event_channel *channel; + + channel = rdma_create_event_channel(); + if (!channel) { + if (errno == ENODEV) + fprintf(stderr, "No RDMA devices were detected\n"); + else + perror("failed to create RDMA CM event channel"); + } + return channel; +} diff --git a/librdmacm/examples/common.h b/librdmacm/examples/common.h new file mode 100644 index 0000000..cf7a96a --- /dev/null +++ b/librdmacm/examples/common.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2005-2012 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id$ + */ + +#include <stdlib.h> +#include <sys/types.h> +#include <endian.h> +#include <poll.h> + +#include <rdma/rdma_cma.h> +#include <rdma/rsocket.h> +#include <infiniband/ib.h> + +/* Defined in common.c; used in all rsocket demos to determine whether to use + * rsocket calls or standard socket calls. + */ +extern int use_rs; + +static inline int rs_socket(int f, int t, int p) +{ + int fd; + + if (!use_rs) + return socket(f, t, p); + + fd = rsocket(f, t, p); + if (fd < 0) { + if (t == SOCK_STREAM && errno == ENODEV) + fprintf(stderr, "No RDMA devices were detected\n"); + else + perror("rsocket failed"); + } + return fd; +} + +#define rs_bind(s,a,l) use_rs ? rbind(s,a,l) : bind(s,a,l) +#define rs_listen(s,b) use_rs ? rlisten(s,b) : listen(s,b) +#define rs_connect(s,a,l) use_rs ? rconnect(s,a,l) : connect(s,a,l) +#define rs_accept(s,a,l) use_rs ? raccept(s,a,l) : accept(s,a,l) +#define rs_shutdown(s,h) use_rs ? rshutdown(s,h) : shutdown(s,h) +#define rs_close(s) use_rs ? rclose(s) : close(s) +#define rs_recv(s,b,l,f) use_rs ? rrecv(s,b,l,f) : recv(s,b,l,f) +#define rs_send(s,b,l,f) use_rs ? rsend(s,b,l,f) : send(s,b,l,f) +#define rs_recvfrom(s,b,l,f,a,al) \ + use_rs ? rrecvfrom(s,b,l,f,a,al) : recvfrom(s,b,l,f,a,al) +#define rs_sendto(s,b,l,f,a,al) \ + use_rs ? rsendto(s,b,l,f,a,al) : sendto(s,b,l,f,a,al) +#define rs_poll(f,n,t) use_rs ? rpoll(f,n,t) : poll(f,n,t) +#define rs_fcntl(s,c,p) use_rs ? rfcntl(s,c,p) : fcntl(s,c,p) +#define rs_setsockopt(s,l,n,v,ol) \ + use_rs ? rsetsockopt(s,l,n,v,ol) : setsockopt(s,l,n,v,ol) +#define rs_getsockopt(s,l,n,v,ol) \ + use_rs ? 
rgetsockopt(s,l,n,v,ol) : getsockopt(s,l,n,v,ol) + +union socket_addr { + struct sockaddr sa; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; +}; + +enum rs_optimization { + opt_mixed, + opt_latency, + opt_bandwidth +}; + +int get_rdma_addr(const char *src, const char *dst, const char *port, + struct rdma_addrinfo *hints, struct rdma_addrinfo **rai); + +void size_str(char *str, size_t ssize, long long size); +void cnt_str(char *str, size_t ssize, long long cnt); +int size_to_count(int size); +void format_buf(void *buf, int size); +int verify_buf(void *buf, int size); +int do_poll(struct pollfd *fds, int timeout); +struct rdma_event_channel *create_first_event_channel(void); diff --git a/librdmacm/examples/mckey.c b/librdmacm/examples/mckey.c new file mode 100644 index 0000000..668f4c8 --- /dev/null +++ b/librdmacm/examples/mckey.c @@ -0,0 +1,643 @@ +/* + * Copyright (c) 2005-2007 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * $Id$ + */ + +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <errno.h> +#include <sys/types.h> +#include <arpa/inet.h> +#include <sys/socket.h> +#include <netdb.h> +#include <unistd.h> +#include <getopt.h> + +#include <rdma/rdma_cma.h> +#include <infiniband/ib.h> + +#include "common.h" + +struct cmatest_node { + int id; + struct rdma_cm_id *cma_id; + int connected; + struct ibv_pd *pd; + struct ibv_cq *cq; + struct ibv_mr *mr; + struct ibv_ah *ah; + uint32_t remote_qpn; + uint32_t remote_qkey; + void *mem; +}; + +struct cmatest { + struct rdma_event_channel *channel; + pthread_t cmathread; + struct cmatest_node *nodes; + int conn_index; + int connects_left; + + struct sockaddr_storage dst_in; + struct sockaddr *dst_addr; + struct sockaddr_storage src_in; + struct sockaddr *src_addr; +}; + +static struct cmatest test; +static int connections = 1; +static int message_size = 100; +static int message_count = 10; +static int is_sender; +static int send_only; +static int unmapped_addr; +static char *dst_addr; +static char *src_addr; +static enum rdma_port_space port_space = RDMA_PS_UDP; + +static int create_message(struct cmatest_node *node) +{ + if (!message_size) + message_count = 0; + + if (!message_count) + return 0; + + node->mem = malloc(message_size + sizeof(struct ibv_grh)); + if (!node->mem) { + printf("failed message allocation\n"); + return -1; + } + node->mr = ibv_reg_mr(node->pd, node->mem, + message_size + sizeof(struct ibv_grh), + IBV_ACCESS_LOCAL_WRITE); + if (!node->mr) { + printf("failed to reg MR\n"); + goto err; + } + return 0; +err: + free(node->mem); + return -1; +} + +static int verify_test_params(struct cmatest_node *node) +{ + struct ibv_port_attr port_attr; + int ret; + + ret = ibv_query_port(node->cma_id->verbs, node->cma_id->port_num, + &port_attr); + if (ret) + return ret; + + if (message_count && message_size > (1 << (port_attr.active_mtu + 7))) { + printf("mckey: message_size %d is larger than active mtu %d\n", + message_size, 1 << (port_attr.active_mtu + 7)); + return -EINVAL; + } + + return 0; +} + +static int init_node(struct cmatest_node *node) +{ + struct ibv_qp_init_attr init_qp_attr; + int cqe, ret; + + node->pd = ibv_alloc_pd(node->cma_id->verbs); + if (!node->pd) { + ret = -ENOMEM; + printf("mckey: unable to allocate PD\n"); + goto out; + } + + cqe = message_count ? message_count * 2 : 2; + node->cq = ibv_create_cq(node->cma_id->verbs, cqe, node, NULL, 0); + if (!node->cq) { + ret = -ENOMEM; + printf("mckey: unable to create CQ\n"); + goto out; + } + + memset(&init_qp_attr, 0, sizeof init_qp_attr); + init_qp_attr.cap.max_send_wr = message_count ? message_count : 1; + init_qp_attr.cap.max_recv_wr = message_count ? 
message_count : 1; + init_qp_attr.cap.max_send_sge = 1; + init_qp_attr.cap.max_recv_sge = 1; + init_qp_attr.qp_context = node; + init_qp_attr.sq_sig_all = 0; + init_qp_attr.qp_type = IBV_QPT_UD; + init_qp_attr.send_cq = node->cq; + init_qp_attr.recv_cq = node->cq; + ret = rdma_create_qp(node->cma_id, node->pd, &init_qp_attr); + if (ret) { + perror("mckey: unable to create QP"); + goto out; + } + + ret = create_message(node); + if (ret) { + printf("mckey: failed to create messages: %d\n", ret); + goto out; + } +out: + return ret; +} + +static int post_recvs(struct cmatest_node *node) +{ + struct ibv_recv_wr recv_wr, *recv_failure; + struct ibv_sge sge; + int i, ret = 0; + + if (!message_count) + return 0; + + recv_wr.next = NULL; + recv_wr.sg_list = &sge; + recv_wr.num_sge = 1; + recv_wr.wr_id = (uintptr_t) node; + + sge.length = message_size + sizeof(struct ibv_grh); + sge.lkey = node->mr->lkey; + sge.addr = (uintptr_t) node->mem; + + for (i = 0; i < message_count && !ret; i++ ) { + ret = ibv_post_recv(node->cma_id->qp, &recv_wr, &recv_failure); + if (ret) { + printf("failed to post receives: %d\n", ret); + break; + } + } + return ret; +} + +static int post_sends(struct cmatest_node *node, int signal_flag) +{ + struct ibv_send_wr send_wr, *bad_send_wr; + struct ibv_sge sge; + int i, ret = 0; + + if (!node->connected || !message_count) + return 0; + + send_wr.next = NULL; + send_wr.sg_list = &sge; + send_wr.num_sge = 1; + send_wr.opcode = IBV_WR_SEND_WITH_IMM; + send_wr.send_flags = signal_flag; + send_wr.wr_id = (unsigned long)node; + send_wr.imm_data = htobe32(node->cma_id->qp->qp_num); + + send_wr.wr.ud.ah = node->ah; + send_wr.wr.ud.remote_qpn = node->remote_qpn; + send_wr.wr.ud.remote_qkey = node->remote_qkey; + + sge.length = message_size; + sge.lkey = node->mr->lkey; + sge.addr = (uintptr_t) node->mem; + + for (i = 0; i < message_count && !ret; i++) { + ret = ibv_post_send(node->cma_id->qp, &send_wr, &bad_send_wr); + if (ret) + printf("failed to post sends: %d\n", ret); + } + return ret; +} + +static void connect_error(void) +{ + test.connects_left--; +} + +static int addr_handler(struct cmatest_node *node) +{ + int ret; + struct rdma_cm_join_mc_attr_ex mc_attr; + + ret = verify_test_params(node); + if (ret) + goto err; + + ret = init_node(node); + if (ret) + goto err; + + if (!is_sender) { + ret = post_recvs(node); + if (ret) + goto err; + } + + mc_attr.comp_mask = + RDMA_CM_JOIN_MC_ATTR_ADDRESS | RDMA_CM_JOIN_MC_ATTR_JOIN_FLAGS; + mc_attr.addr = test.dst_addr; + mc_attr.join_flags = send_only ? 
RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER
+				       : RDMA_MC_JOIN_FLAG_FULLMEMBER;
+
+	ret = rdma_join_multicast_ex(node->cma_id, &mc_attr, node);
+
+	if (ret) {
+		perror("mckey: failure joining");
+		goto err;
+	}
+	return 0;
+err:
+	connect_error();
+	return ret;
+}
+
+static int join_handler(struct cmatest_node *node,
+			struct rdma_ud_param *param)
+{
+	char buf[40];
+
+	inet_ntop(AF_INET6, param->ah_attr.grh.dgid.raw, buf, 40);
+	printf("mckey: joined dgid: %s mlid 0x%x sl %d\n", buf,
+	       param->ah_attr.dlid, param->ah_attr.sl);
+
+	node->remote_qpn = param->qp_num;
+	node->remote_qkey = param->qkey;
+	node->ah = ibv_create_ah(node->pd, &param->ah_attr);
+	if (!node->ah) {
+		printf("mckey: failure creating address handle\n");
+		goto err;
+	}
+
+	node->connected = 1;
+	test.connects_left--;
+	return 0;
+err:
+	connect_error();
+	return -1;
+}
+
+static int cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
+{
+	int ret = 0;
+
+	switch (event->event) {
+	case RDMA_CM_EVENT_ADDR_RESOLVED:
+		ret = addr_handler(cma_id->context);
+		break;
+	case RDMA_CM_EVENT_MULTICAST_JOIN:
+		ret = join_handler(cma_id->context, &event->param.ud);
+		break;
+	case RDMA_CM_EVENT_ADDR_ERROR:
+	case RDMA_CM_EVENT_ROUTE_ERROR:
+	case RDMA_CM_EVENT_MULTICAST_ERROR:
+		printf("mckey: event: %s, error: %d\n",
+		       rdma_event_str(event->event), event->status);
+		connect_error();
+		ret = event->status;
+		break;
+	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+		/* Cleanup will occur after test completes. */
+		break;
+	default:
+		break;
+	}
+	return ret;
+}
+
+static void *cma_thread(void *arg)
+{
+	struct rdma_cm_event *event;
+	int ret;
+
+	while (1) {
+		ret = rdma_get_cm_event(test.channel, &event);
+		if (ret) {
+			perror("rdma_get_cm_event");
+			break;
+		}
+
+		switch (event->event) {
+		case RDMA_CM_EVENT_MULTICAST_ERROR:
+		case RDMA_CM_EVENT_ADDR_CHANGE:
+			printf("mckey: event: %s, status: %d\n",
+			       rdma_event_str(event->event), event->status);
+			break;
+		default:
+			break;
+		}
+
+		rdma_ack_cm_event(event);
+	}
+	return NULL;
+}
+
+static void destroy_node(struct cmatest_node *node)
+{
+	if (!node->cma_id)
+		return;
+
+	if (node->ah)
+		ibv_destroy_ah(node->ah);
+
+	if (node->cma_id->qp)
+		rdma_destroy_qp(node->cma_id);
+
+	if (node->cq)
+		ibv_destroy_cq(node->cq);
+
+	if (node->mem) {
+		ibv_dereg_mr(node->mr);
+		free(node->mem);
+	}
+
+	if (node->pd)
+		ibv_dealloc_pd(node->pd);
+
+	/* Destroy the RDMA ID after all device resources */
+	rdma_destroy_id(node->cma_id);
+}
+
+static int alloc_nodes(void)
+{
+	int ret, i;
+
+	test.nodes = malloc(sizeof *test.nodes * connections);
+	if (!test.nodes) {
+		printf("mckey: unable to allocate memory for test nodes\n");
+		return -ENOMEM;
+	}
+	memset(test.nodes, 0, sizeof *test.nodes * connections);
+
+	for (i = 0; i < connections; i++) {
+		test.nodes[i].id = i;
+		ret = rdma_create_id(test.channel, &test.nodes[i].cma_id,
+				     &test.nodes[i], port_space);
+		if (ret)
+			goto err;
+	}
+	return 0;
+err:
+	while (--i >= 0)
+		rdma_destroy_id(test.nodes[i].cma_id);
+	free(test.nodes);
+	return ret;
+}
+
+static void destroy_nodes(void)
+{
+	int i;
+
+	for (i = 0; i < connections; i++)
+		destroy_node(&test.nodes[i]);
+	free(test.nodes);
+}
+
+static int poll_cqs(void)
+{
+	struct ibv_wc wc[8];
+	int done, i, ret;
+
+	for (i = 0; i < connections; i++) {
+		if (!test.nodes[i].connected)
+			continue;
+
+		for (done = 0; done < message_count; done += ret) {
+			ret = ibv_poll_cq(test.nodes[i].cq, 8, wc);
+			if (ret < 0) {
+				printf("mckey: failed polling CQ: %d\n", ret);
+				return ret;
+			}
+		}
+	}
+	return 0;
+}
+
+static
int connect_events(void) +{ + struct rdma_cm_event *event; + int ret = 0; + + while (test.connects_left && !ret) { + ret = rdma_get_cm_event(test.channel, &event); + if (!ret) { + ret = cma_handler(event->id, event); + rdma_ack_cm_event(event); + } + } + return ret; +} + +static int get_addr(char *dst, struct sockaddr *addr) +{ + struct addrinfo *res; + int ret; + + ret = getaddrinfo(dst, NULL, NULL, &res); + if (ret) { + printf("getaddrinfo failed (%s) - invalid hostname or IP address\n", gai_strerror(ret)); + return ret; + } + + memcpy(addr, res->ai_addr, res->ai_addrlen); + freeaddrinfo(res); + return ret; +} + +static int get_dst_addr(char *dst, struct sockaddr *addr) +{ + struct sockaddr_ib *sib; + + if (!unmapped_addr) + return get_addr(dst, addr); + + sib = (struct sockaddr_ib *) addr; + memset(sib, 0, sizeof *sib); + sib->sib_family = AF_IB; + inet_pton(AF_INET6, dst, &sib->sib_addr); + return 0; +} + +static int run(void) +{ + int i, ret; + + printf("mckey: starting %s\n", is_sender ? "client" : "server"); + if (src_addr) { + ret = get_addr(src_addr, (struct sockaddr *) &test.src_in); + if (ret) + return ret; + } + + ret = get_dst_addr(dst_addr, (struct sockaddr *) &test.dst_in); + if (ret) + return ret; + + printf("mckey: joining\n"); + for (i = 0; i < connections; i++) { + if (src_addr) { + ret = rdma_bind_addr(test.nodes[i].cma_id, + test.src_addr); + if (ret) { + perror("mckey: addr bind failure"); + connect_error(); + return ret; + } + } + + if (unmapped_addr) + ret = addr_handler(&test.nodes[i]); + else + ret = rdma_resolve_addr(test.nodes[i].cma_id, + test.src_addr, test.dst_addr, + 2000); + if (ret) { + perror("mckey: resolve addr failure"); + connect_error(); + return ret; + } + } + + ret = connect_events(); + if (ret) + goto out; + + pthread_create(&test.cmathread, NULL, cma_thread, NULL); + + /* + * Pause to give SM chance to configure switches. We don't want to + * handle reliability issue in this simple test program. 
+ */ + sleep(3); + + if (message_count) { + if (is_sender) { + printf("initiating data transfers\n"); + for (i = 0; i < connections; i++) { + ret = post_sends(&test.nodes[i], 0); + if (ret) + goto out; + } + } else { + printf("receiving data transfers\n"); + ret = poll_cqs(); + if (ret) + goto out; + } + printf("data transfers complete\n"); + } +out: + for (i = 0; i < connections; i++) { + ret = rdma_leave_multicast(test.nodes[i].cma_id, + test.dst_addr); + if (ret) + perror("mckey: failure leaving"); + } + return ret; +} + +int main(int argc, char **argv) +{ + int op, ret; + + while ((op = getopt(argc, argv, "m:M:sb:c:C:S:p:o")) != -1) { + switch (op) { + case 'm': + dst_addr = optarg; + break; + case 'M': + unmapped_addr = 1; + dst_addr = optarg; + break; + case 's': + is_sender = 1; + break; + case 'b': + src_addr = optarg; + test.src_addr = (struct sockaddr *) &test.src_in; + break; + case 'c': + connections = atoi(optarg); + break; + case 'C': + message_count = atoi(optarg); + break; + case 'S': + message_size = atoi(optarg); + break; + case 'p': + port_space = strtol(optarg, NULL, 0); + break; + case 'o': + send_only = 1; + break; + + default: + printf("usage: %s\n", argv[0]); + printf("\t-m multicast_address\n"); + printf("\t[-M unmapped_multicast_address]\n" + "\t replaces -m and requires -b\n"); + printf("\t[-s(ender)]\n"); + printf("\t[-b bind_address]\n"); + printf("\t[-c connections]\n"); + printf("\t[-C message_count]\n"); + printf("\t[-S message_size]\n"); + printf("\t[-p port_space - %#x for UDP (default), " + "%#x for IPOIB]\n", RDMA_PS_UDP, RDMA_PS_IPOIB); + printf("\t[-o join as a send-only full-member]\n"); + exit(1); + } + } + + if (unmapped_addr && !src_addr) { + printf("unmapped multicast address requires binding " + "to source address\n"); + exit(1); + } + + test.dst_addr = (struct sockaddr *) &test.dst_in; + test.connects_left = connections; + + test.channel = create_first_event_channel(); + if (!test.channel) { + exit(1); + } + + if (alloc_nodes()) + exit(1); + + ret = run(); + + printf("test complete\n"); + destroy_nodes(); + rdma_destroy_event_channel(test.channel); + + printf("return status %d\n", ret); + return ret; +} diff --git a/librdmacm/examples/rcopy.c b/librdmacm/examples/rcopy.c new file mode 100644 index 0000000..c1ef46b --- /dev/null +++ b/librdmacm/examples/rcopy.c @@ -0,0 +1,629 @@ +/* + * Copyright (c) 2011 Intel Corporation. All rights reserved. + * + * This software is available to you under the OpenIB.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AWV + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <errno.h> +#include <getopt.h> +#include <arpa/inet.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <netdb.h> +#include <unistd.h> + +#include <rdma/rsocket.h> + +#include "common.h" + +union rsocket_address { + struct sockaddr sa; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + struct sockaddr_storage storage; +}; + +static const char *port = "7427"; +static char *dst_addr; +static char *dst_file; +static char *src_file; +static struct timeval start, end; +//static void buf[1024 * 1024]; +static uint64_t bytes; +static int fd; +static void *file_addr; + +enum { + CMD_NOOP, + CMD_OPEN, + CMD_CLOSE, + CMD_WRITE, + CMD_RESP = 0x80, +}; + +/* TODO: handle byte swapping */ +struct msg_hdr { + uint8_t version; + uint8_t command; + uint16_t len; + uint32_t data; + uint64_t id; +}; + +struct msg_open { + struct msg_hdr hdr; + char path[0]; +}; + +struct msg_write { + struct msg_hdr hdr; + uint64_t size; +}; + +static void show_perf(void) +{ + float usec; + + usec = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec); + + printf("%lld bytes in %.2f seconds = %.2f Gb/sec\n", + (long long) bytes, usec / 1000000., (bytes * 8) / (1000. * usec)); +} + +static char *_ntop(union rsocket_address *rsa) +{ + static char addr[32]; + + switch (rsa->sa.sa_family) { + case AF_INET: + inet_ntop(AF_INET, &rsa->sin.sin_addr, addr, sizeof addr); + break; + case AF_INET6: + inet_ntop(AF_INET6, &rsa->sin6.sin6_addr, addr, sizeof addr); + break; + default: + addr[0] = '\0'; + break; + } + + return addr; +} + +static size_t _recv(int rs, char *msg, size_t len) +{ + size_t ret, offset; + + for (offset = 0; offset < len; offset += ret) { + ret = rrecv(rs, msg + offset, len - offset, 0); + if (ret <= 0) + return ret; + } + + return len; +} + +static int msg_recv_hdr(int rs, struct msg_hdr *hdr) +{ + int ret; + + ret = _recv(rs, (char *) hdr, sizeof *hdr); + if (ret != sizeof *hdr) + return -1; + + if (hdr->version || hdr->len < sizeof *hdr) { + printf("invalid version %d or length %d\n", + hdr->version, hdr->len); + return -1; + } + + return sizeof *hdr; +} + +static int msg_get_resp(int rs, struct msg_hdr *msg, uint8_t cmd) +{ + int ret; + + ret = msg_recv_hdr(rs, msg); + if (ret != sizeof *msg) + return ret; + + if ((msg->len != sizeof *msg) || (msg->command != (cmd | CMD_RESP))) { + printf("invalid length %d or bad command response %x:%x\n", + msg->len, msg->command, cmd | CMD_RESP); + return -1; + } + + return msg->data; +} + +static void msg_send_resp(int rs, struct msg_hdr *msg, uint32_t status) +{ + struct msg_hdr resp; + + resp.version = 0; + resp.command = msg->command | CMD_RESP; + resp.len = sizeof resp; + resp.data = status; + resp.id = msg->id; + rsend(rs, (char *) &resp, sizeof resp, 0); +} + +static int server_listen(void) +{ + struct addrinfo hints, *res; + int ret, rs; + + memset(&hints, 0, sizeof hints); + hints.ai_flags = RAI_PASSIVE; + ret = getaddrinfo(NULL, port, &hints, &res); + if (ret) { + printf("getaddrinfo failed: %s\n", gai_strerror(ret)); + return ret; + } + + rs = rs_socket(res->ai_family, res->ai_socktype, res->ai_protocol); + if (rs < 0) { + ret = rs; + goto free; + } + + ret = 1; + ret = rsetsockopt(rs, SOL_SOCKET, SO_REUSEADDR, &ret, sizeof ret); + if (ret) { + perror("rsetsockopt failed"); + goto close; + } + + ret = rbind(rs, res->ai_addr, res->ai_addrlen); + 
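/* Like bind(2), rbind() returns 0 on success and sets errno on failure. */ +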
if (ret) { + perror("rbind failed"); + goto close; + } + + ret = rlisten(rs, 1); + if (ret) { + perror("rlisten failed"); + goto close; + } + + ret = rs; + goto free; + +close: + rclose(rs); +free: + freeaddrinfo(res); + return ret; +} + +static int server_open(int rs, struct msg_hdr *msg) +{ + char *path = NULL; + int ret, len; + + printf("opening: "); + fflush(NULL); + if (file_addr || fd > 0) { + printf("cannot open another file\n"); + ret = EBUSY; + goto out; + } + + len = msg->len - sizeof *msg; + path = malloc(len); + if (!path) { + printf("cannot allocate path name\n"); + ret = ENOMEM; + goto out; + } + + ret = _recv(rs, path, len); + if (ret != len) { + printf("error receiving path\n"); + goto out; + } + + printf("%s, ", path); + fflush(NULL); + fd = open(path, O_RDWR | O_CREAT | O_TRUNC, msg->data); + if (fd < 0) { + printf("unable to open destination file\n"); + ret = errno; + goto out; + } + + ret = 0; +out: + if (path) + free(path); + + msg_send_resp(rs, msg, ret); + return ret; +} + +static void server_close(int rs, struct msg_hdr *msg) +{ + printf("closing..."); + fflush(NULL); + msg_send_resp(rs, msg, 0); + + if (file_addr) { + munmap(file_addr, bytes); + file_addr = NULL; + } + + if (fd > 0) { + close(fd); + fd = 0; + } + printf("done\n"); +} + +static int server_write(int rs, struct msg_hdr *msg) +{ + size_t len; + int ret; + + printf("transferring"); + fflush(NULL); + if (fd <= 0) { + printf("...file not opened\n"); + ret = EINVAL; + goto out; + } + + if (msg->len != sizeof(struct msg_write)) { + printf("...invalid message length %d\n", msg->len); + ret = EINVAL; + goto out; + } + + ret = _recv(rs, (char *) &bytes, sizeof bytes); + if (ret != sizeof bytes) + goto out; + + ret = ftruncate(fd, bytes); + if (ret) + goto out; + + file_addr = mmap(NULL, bytes, PROT_WRITE, MAP_SHARED, fd, 0); + if (file_addr == (void *) -1) { + printf("...error mapping file\n"); + ret = errno; + goto out; + } + + printf("...%lld bytes...", (long long) bytes); + fflush(NULL); + len = _recv(rs, file_addr, bytes); + if (len != bytes) { + printf("...error receiving data\n"); + ret = (int) len; + } +out: + msg_send_resp(rs, msg, ret); + return ret; +} + +static void server_process(int rs) +{ + struct msg_hdr msg; + int ret; + + do { + ret = msg_recv_hdr(rs, &msg); + if (ret != sizeof msg) + break; + + switch (msg.command) { + case CMD_OPEN: + ret = server_open(rs, &msg); + break; + case CMD_CLOSE: + server_close(rs, &msg); + ret = 0; + break; + case CMD_WRITE: + ret = server_write(rs, &msg); + break; + default: + msg_send_resp(rs, &msg, EINVAL); + ret = -1; + break; + } + + } while (!ret); +} + +static int server_run(void) +{ + int lrs, rs; + union rsocket_address rsa; + socklen_t len; + + lrs = server_listen(); + if (lrs < 0) + return lrs; + + while (1) { + len = sizeof rsa; + printf("waiting for connection..."); + fflush(NULL); + rs = raccept(lrs, &rsa.sa, &len); + + printf("client: %s\n", _ntop(&rsa)); + server_process(rs); + + rshutdown(rs, SHUT_RDWR); + rclose(rs); + } + return 0; +} + +static int client_connect(void) +{ + struct addrinfo *res; + int ret, rs; + + ret = getaddrinfo(dst_addr, port, NULL, &res); + if (ret) { + printf("getaddrinfo failed: %s\n", gai_strerror(ret)); + return ret; + } + + rs = rs_socket(res->ai_family, res->ai_socktype, res->ai_protocol); + if (rs < 0) { + goto free; + } + + ret = rconnect(rs, res->ai_addr, res->ai_addrlen); + if (ret) { + perror("rconnect failed\n"); + rclose(rs); + rs = ret; + } + +free: + freeaddrinfo(res); + return rs; +} + +static int 
client_open(int rs) +{ + struct msg_open *msg; + struct stat stats; + uint32_t len; + int ret; + + printf("opening..."); + fflush(NULL); + fd = open(src_file, O_RDONLY); + if (fd < 0) + return fd; + + ret = fstat(fd, &stats); + if (ret < 0) + goto err1; + + bytes = (uint64_t) stats.st_size; + file_addr = mmap(NULL, bytes, PROT_READ, MAP_SHARED, fd, 0); + if (file_addr == (void *) -1) { + ret = errno; + goto err1; + } + + len = (((uint32_t) strlen(dst_file)) + 8) & 0xFFFFFFF8; + msg = calloc(1, sizeof(*msg) + len); + if (!msg) { + ret = -1; + goto err2; + } + + msg->hdr.command = CMD_OPEN; + msg->hdr.len = sizeof(*msg) + len; + msg->hdr.data = (uint32_t) stats.st_mode; + strcpy(msg->path, dst_file); + ret = rsend(rs, msg, msg->hdr.len, 0); + if (ret != msg->hdr.len) + goto err3; + + ret = msg_get_resp(rs, &msg->hdr, CMD_OPEN); + if (ret) + goto err3; + + return 0; + +err3: + free(msg); +err2: + munmap(file_addr, bytes); +err1: + close(fd); + return ret; +} + +static int client_start_write(int rs) +{ + struct msg_write msg; + int ret; + + printf("transferring"); + fflush(NULL); + memset(&msg, 0, sizeof msg); + msg.hdr.command = CMD_WRITE; + msg.hdr.len = sizeof(msg); + msg.size = bytes; + + ret = rsend(rs, &msg, sizeof msg, 0); + if (ret != msg.hdr.len) + return ret; + + return 0; +} + +static int client_close(int rs) +{ + struct msg_hdr msg; + int ret; + + printf("closing..."); + fflush(NULL); + memset(&msg, 0, sizeof msg); + msg.command = CMD_CLOSE; + msg.len = sizeof msg; + ret = rsend(rs, (char *) &msg, msg.len, 0); + if (ret != msg.len) + goto out; + + ret = msg_get_resp(rs, &msg, CMD_CLOSE); + if (ret) + goto out; + + printf("done\n"); +out: + munmap(file_addr, bytes); + close(fd); + return ret; +} + +static int client_run(void) +{ + struct msg_hdr ack; + int ret, rs; + size_t len; + + rs = client_connect(); + if (rs < 0) + return rs; + + ret = client_open(rs); + if (ret) + goto shutdown; + + ret = client_start_write(rs); + if (ret) + goto close; + + printf("..."); + fflush(NULL); + gettimeofday(&start, NULL); + len = rsend(rs, file_addr, bytes, 0); + if (len == bytes) + ret = msg_get_resp(rs, &ack, CMD_WRITE); + else + ret = (int) len; + + gettimeofday(&end, NULL); + +close: + client_close(rs); +shutdown: + rshutdown(rs, SHUT_RDWR); + rclose(rs); + if (!ret) + show_perf(); + return ret; +} + +static void show_usage(char *program) +{ + printf("usage 1: %s [options]\n", program); + printf("\t starts the server application\n"); + printf("\t[-p port_number]\n"); + printf("usage 2: %s source server[:destination] [options]\n", program); + printf("\t source - file name and path\n"); + printf("\t server - name or address\n"); + printf("\t destination - file name and path\n"); + printf("\t[-p port_number]\n"); + exit(1); +} + +static void server_opts(int argc, char **argv) +{ + int op; + + while ((op = getopt(argc, argv, "p:")) != -1) { + switch (op) { + case 'p': + port = optarg; + break; + default: + show_usage(argv[0]); + } + } +} + +static void client_opts(int argc, char **argv) +{ + int op; + + if (argc < 3) + show_usage(argv[0]); + + src_file = argv[1]; + dst_addr = argv[2]; + dst_file = strchr(dst_addr, ':'); + if (dst_file) { + *dst_file = '\0'; + dst_file++; + } + if (!dst_file) + dst_file = src_file; + + while ((op = getopt(argc, argv, "p:")) != -1) { + switch (op) { + case 'p': + port = optarg; + break; + default: + show_usage(argv[0]); + } + } + +} + +int main(int argc, char **argv) +{ + int ret; + + if (argc == 1 || argv[1][0] == '-') { + server_opts(argc, argv); + ret = 
server_run(); + } else { + client_opts(argc, argv); + ret = client_run(); + } + + return ret; +} diff --git a/librdmacm/examples/rdma_client.c b/librdmacm/examples/rdma_client.c new file mode 100644 index 0000000..c27047c --- /dev/null +++ b/librdmacm/examples/rdma_client.c @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2010 Intel Corporation. All rights reserved. + * + * This software is available to you under the OpenIB.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <netdb.h> +#include <errno.h> +#include <getopt.h> +#include <rdma/rdma_cma.h> +#include <rdma/rdma_verbs.h> + +static const char *server = "127.0.0.1"; +static const char *port = "7471"; + +static struct rdma_cm_id *id; +static struct ibv_mr *mr, *send_mr; +static int send_flags; +static uint8_t send_msg[16]; +static uint8_t recv_msg[16]; + +static int run(void) +{ + struct rdma_addrinfo hints, *res; + struct ibv_qp_init_attr attr; + struct ibv_wc wc; + int ret; + + memset(&hints, 0, sizeof hints); + hints.ai_port_space = RDMA_PS_TCP; + ret = rdma_getaddrinfo(server, port, &hints, &res); + if (ret) { + printf("rdma_getaddrinfo: %s\n", gai_strerror(ret)); + goto out; + } + + memset(&attr, 0, sizeof attr); + attr.cap.max_send_wr = attr.cap.max_recv_wr = 1; + attr.cap.max_send_sge = attr.cap.max_recv_sge = 1; + attr.cap.max_inline_data = 16; + attr.qp_context = id; + attr.sq_sig_all = 1; + ret = rdma_create_ep(&id, res, NULL, &attr); + // Check to see if we got inline data allowed or not + if (attr.cap.max_inline_data >= 16) + send_flags = IBV_SEND_INLINE; + else + printf("rdma_client: device doesn't support IBV_SEND_INLINE, " + "using sge sends\n"); + + if (ret) { + perror("rdma_create_ep"); + goto out_free_addrinfo; + } + + mr = rdma_reg_msgs(id, recv_msg, 16); + if (!mr) { + perror("rdma_reg_msgs for recv_msg"); + ret = -1; + goto out_destroy_ep; + } + if ((send_flags & IBV_SEND_INLINE) == 0) { + send_mr = rdma_reg_msgs(id, send_msg, 16); + if (!send_mr) { + perror("rdma_reg_msgs for send_msg"); + ret = -1; + goto out_dereg_recv; + } + } + + ret = rdma_post_recv(id, NULL, recv_msg, 16, mr); + if (ret) { + perror("rdma_post_recv"); + goto out_dereg_send; + } + + ret = rdma_connect(id, NULL); + if (ret) { + perror("rdma_connect"); + goto out_dereg_send; + } + + ret = rdma_post_send(id, NULL, send_msg, 16, send_mr, send_flags); + if (ret) { + perror("rdma_post_send"); + goto out_disconnect; + } + + while ((ret = rdma_get_send_comp(id, &wc)) == 0); + if (ret < 0) { +
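/* rdma_get_send_comp() returned an error rather than a completion. */ +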
perror("rdma_get_send_comp"); + goto out_disconnect; + } + + while ((ret = rdma_get_recv_comp(id, &wc)) == 0); + if (ret < 0) + perror("rdma_get_recv_comp"); + else + ret = 0; + +out_disconnect: + rdma_disconnect(id); +out_dereg_send: + if ((send_flags & IBV_SEND_INLINE) == 0) + rdma_dereg_mr(send_mr); +out_dereg_recv: + rdma_dereg_mr(mr); +out_destroy_ep: + rdma_destroy_ep(id); +out_free_addrinfo: + rdma_freeaddrinfo(res); +out: + return ret; +} + +int main(int argc, char **argv) +{ + int op, ret; + + while ((op = getopt(argc, argv, "s:p:")) != -1) { + switch (op) { + case 's': + server = optarg; + break; + case 'p': + port = optarg; + break; + default: + printf("usage: %s\n", argv[0]); + printf("\t[-s server_address]\n"); + printf("\t[-p port_number]\n"); + exit(1); + } + } + + printf("rdma_client: start\n"); + ret = run(); + printf("rdma_client: end %d\n", ret); + return ret; +} diff --git a/librdmacm/examples/rdma_server.c b/librdmacm/examples/rdma_server.c new file mode 100644 index 0000000..f9c766b --- /dev/null +++ b/librdmacm/examples/rdma_server.c @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2005-2009 Intel Corporation. All rights reserved. + * + * This software is available to you under the OpenIB.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <getopt.h> +#include <netdb.h> +#include <rdma/rdma_cma.h> +#include <rdma/rdma_verbs.h> + +static const char *server = "0.0.0.0"; +static const char *port = "7471"; + +static struct rdma_cm_id *listen_id, *id; +static struct ibv_mr *mr, *send_mr; +static int send_flags; +static uint8_t send_msg[16]; +static uint8_t recv_msg[16]; + +static int run(void) +{ + struct rdma_addrinfo hints, *res; + struct ibv_qp_init_attr init_attr; + struct ibv_qp_attr qp_attr; + struct ibv_wc wc; + int ret; + + memset(&hints, 0, sizeof hints); + hints.ai_flags = RAI_PASSIVE; + hints.ai_port_space = RDMA_PS_TCP; + ret = rdma_getaddrinfo(server, port, &hints, &res); + if (ret) { + printf("rdma_getaddrinfo: %s\n", gai_strerror(ret)); + return ret; + } + + memset(&init_attr, 0, sizeof init_attr); + init_attr.cap.max_send_wr = init_attr.cap.max_recv_wr = 1; + init_attr.cap.max_send_sge = init_attr.cap.max_recv_sge = 1; + init_attr.cap.max_inline_data = 16; + init_attr.sq_sig_all = 1; + ret = rdma_create_ep(&listen_id, res, NULL, &init_attr); + if (ret) { + perror("rdma_create_ep"); + goto out_free_addrinfo; + } + + ret = rdma_listen(listen_id, 0); + if (ret) { + perror("rdma_listen"); + goto out_destroy_listen_ep; + } + + ret = rdma_get_request(listen_id, &id); + if (ret) { + perror("rdma_get_request"); + goto out_destroy_listen_ep; + } + + memset(&qp_attr, 0, sizeof qp_attr); + memset(&init_attr, 0, sizeof init_attr); + ret = ibv_query_qp(id->qp, &qp_attr, IBV_QP_CAP, + &init_attr); + if (ret) { + perror("ibv_query_qp"); + goto out_destroy_accept_ep; + } + if (init_attr.cap.max_inline_data >= 16) + send_flags = IBV_SEND_INLINE; + else + printf("rdma_server: device doesn't support IBV_SEND_INLINE, " + "using sge sends\n"); + + mr = rdma_reg_msgs(id, recv_msg, 16); + if (!mr) { + ret = -1; + perror("rdma_reg_msgs for recv_msg"); + goto out_destroy_accept_ep; + } + if ((send_flags & IBV_SEND_INLINE) == 0) { + send_mr = rdma_reg_msgs(id, send_msg, 16); + if (!send_mr) { + ret = -1; + perror("rdma_reg_msgs for send_msg"); + goto out_dereg_recv; + } + } + + ret = rdma_post_recv(id, NULL, recv_msg, 16, mr); + if (ret) { + perror("rdma_post_recv"); + goto out_dereg_send; + } + + ret = rdma_accept(id, NULL); + if (ret) { + perror("rdma_accept"); + goto out_dereg_send; + } + + while ((ret = rdma_get_recv_comp(id, &wc)) == 0); + if (ret < 0) { + perror("rdma_get_recv_comp"); + goto out_disconnect; + } + + ret = rdma_post_send(id, NULL, send_msg, 16, send_mr, send_flags); + if (ret) { + perror("rdma_post_send"); + goto out_disconnect; + } + + while ((ret = rdma_get_send_comp(id, &wc)) == 0); + if (ret < 0) + perror("rdma_get_send_comp"); + else + ret = 0; + +out_disconnect: + rdma_disconnect(id); +out_dereg_send: + if ((send_flags & IBV_SEND_INLINE) == 0) + rdma_dereg_mr(send_mr); +out_dereg_recv: + rdma_dereg_mr(mr); +out_destroy_accept_ep: + rdma_destroy_ep(id); +out_destroy_listen_ep: + rdma_destroy_ep(listen_id); +out_free_addrinfo: + rdma_freeaddrinfo(res); + return ret; +} + +int main(int argc, char **argv) +{ + int op, ret; + + while ((op = getopt(argc, argv, "s:p:")) != -1) { + switch (op) { + case 's': + server = optarg; + break; + case 'p': + port = optarg; + break; + default: + printf("usage: %s\n", argv[0]); + printf("\t[-s server_address]\n"); + printf("\t[-p port_number]\n"); + exit(1); + } + } + + printf("rdma_server: start\n"); + ret = run(); + printf("rdma_server: end %d\n", ret); + return ret; +} diff --git 
a/librdmacm/examples/rdma_xclient.c b/librdmacm/examples/rdma_xclient.c new file mode 100644 index 0000000..63dfb6d --- /dev/null +++ b/librdmacm/examples/rdma_xclient.c @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2010-2014 Intel Corporation. All rights reserved. + * + * This software is available to you under the OpenIB.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <endian.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <netdb.h> +#include <errno.h> +#include <getopt.h> +#include <ctype.h> +#include <rdma/rdma_cma.h> +#include <rdma/rdma_verbs.h> + +static const char *server = "127.0.0.1"; +static char port[6] = "7471"; + +static struct rdma_cm_id *id; +static struct ibv_mr *mr; +static struct rdma_addrinfo hints; + +static uint8_t send_msg[16]; +static uint32_t srqn; + +static int post_send(void) +{ + struct ibv_send_wr wr, *bad; + struct ibv_sge sge; + int ret; + + sge.addr = (uint64_t) (uintptr_t) send_msg; + sge.length = (uint32_t) sizeof send_msg; + sge.lkey = 0; + wr.wr_id = (uintptr_t) NULL; + wr.next = NULL; + wr.sg_list = &sge; + wr.num_sge = 1; + wr.opcode = IBV_WR_SEND; + wr.send_flags = IBV_SEND_INLINE; + if (hints.ai_qp_type == IBV_QPT_XRC_SEND) + wr.qp_type.xrc.remote_srqn = srqn; + + ret = ibv_post_send(id->qp, &wr, &bad); + if (ret) + perror("rdma_post_send"); + + return ret; +} + +static int test(void) +{ + struct rdma_addrinfo *res; + struct ibv_qp_init_attr attr; + struct ibv_wc wc; + int ret; + + ret = rdma_getaddrinfo(server, port, &hints, &res); + if (ret) { + printf("rdma_getaddrinfo: %s\n", gai_strerror(ret)); + return ret; + } + + memset(&attr, 0, sizeof attr); + attr.cap.max_send_wr = attr.cap.max_recv_wr = 1; + attr.cap.max_send_sge = attr.cap.max_recv_sge = 1; + attr.sq_sig_all = 1; + ret = rdma_create_ep(&id, res, NULL, &attr); + rdma_freeaddrinfo(res); + if (ret) { + perror("rdma_create_ep"); + return ret; + } + + mr = rdma_reg_msgs(id, send_msg, sizeof send_msg); + if (!mr) { + perror("rdma_reg_msgs"); + return ret; + } + + ret = rdma_connect(id, NULL); + if (ret) { + perror("rdma_connect"); + return ret; + } + + if (hints.ai_qp_type == IBV_QPT_XRC_SEND) + srqn = be32toh(*(__be32 *) id->event->param.conn.private_data); + + ret = post_send(); + if (ret) { + perror("post_send"); + return ret; + } + + ret = rdma_get_send_comp(id, &wc); + if (ret <= 0) { + perror("rdma_get_send_comp"); + return ret; + } + + rdma_disconnect(id); + rdma_dereg_mr(mr); + rdma_destroy_ep(id); + return 0; +} + +int main(int argc, char **argv) +{ + int op, ret;
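+ /* Default to a reliable-connected (RC) QP over the TCP port space; + * the -c x option below switches to XRC. */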
+ + hints.ai_port_space = RDMA_PS_TCP; + hints.ai_qp_type = IBV_QPT_RC; + + while ((op = getopt(argc, argv, "s:p:c:")) != -1) { + switch (op) { + case 's': + server = optarg; + break; + case 'p': + strncpy(port, optarg, sizeof port - 1); + break; + case 'c': + switch (tolower(optarg[0])) { + case 'r': + break; + case 'x': + hints.ai_port_space = RDMA_PS_IB; + hints.ai_qp_type = IBV_QPT_XRC_SEND; + break; + default: + goto err; + } + break; + default: + goto err; + } + } + + printf("%s: start\n", argv[0]); + ret = test(); + printf("%s: end %d\n", argv[0], ret); + return ret; + +err: + printf("usage: %s\n", argv[0]); + printf("\t[-s server]\n"); + printf("\t[-p port_number]\n"); + printf("\t[-c communication type]\n"); + printf("\t r - RC: reliable-connected (default)\n"); + printf("\t x - XRC: extended-reliable-connected\n"); + exit(1); +} diff --git a/librdmacm/examples/rdma_xserver.c b/librdmacm/examples/rdma_xserver.c new file mode 100644 index 0000000..8d8ac31 --- /dev/null +++ b/librdmacm/examples/rdma_xserver.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2005-2014 Intel Corporation. All rights reserved. + * + * This software is available to you under the OpenIB.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + +#include <endian.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <getopt.h> +#include <netdb.h> +#include <ctype.h> +#include <rdma/rdma_cma.h> +#include <rdma/rdma_verbs.h> + +static const char *port = "7471"; + +static struct rdma_cm_id *listen_id, *id; +static struct ibv_mr *mr; +static struct rdma_addrinfo hints; + +static uint8_t recv_msg[16]; +static __be32 srqn; + +static int create_srq(void) +{ + struct ibv_srq_init_attr attr; + int ret; + uint32_t tmp_srqn; + + attr.attr.max_wr = 1; + attr.attr.max_sge = 1; + attr.attr.srq_limit = 0; + attr.srq_context = id; + + ret = rdma_create_srq(id, NULL, &attr); + if (ret) + perror("rdma_create_srq:"); + + if (id->srq) { + ibv_get_srq_num(id->srq, &tmp_srqn); + srqn = htobe32(tmp_srqn); + } + return ret; +} + +static int test(void) +{ + struct rdma_addrinfo *res; + struct ibv_qp_init_attr attr; + struct rdma_conn_param param; + struct ibv_wc wc; + int ret; + + ret = rdma_getaddrinfo(NULL, port, &hints, &res); + if (ret) { + printf("rdma_getaddrinfo: %s\n", gai_strerror(ret)); + return ret; + } + + memset(&attr, 0, sizeof attr); + attr.cap.max_send_wr = attr.cap.max_recv_wr = 1; + attr.cap.max_send_sge = attr.cap.max_recv_sge = 1; + ret = rdma_create_ep(&listen_id, res, NULL, &attr); + rdma_freeaddrinfo(res); + if (ret) { + perror("rdma_create_ep"); + return ret; + } + + ret = rdma_listen(listen_id, 0); + if (ret) { + perror("rdma_listen"); + return ret; + } + + ret = rdma_get_request(listen_id, &id); + if (ret) { + perror("rdma_get_request"); + return ret; + } + + if (hints.ai_qp_type == IBV_QPT_XRC_RECV) { + ret = create_srq(); + if (ret) + return ret; + } + + mr = rdma_reg_msgs(id, recv_msg, sizeof recv_msg); + if (!mr) { + perror("rdma_reg_msgs"); + return ret; + } + + ret = rdma_post_recv(id, NULL, recv_msg, sizeof recv_msg, mr); + if (ret) { + perror("rdma_post_recv"); + return ret; + } + + memset(&param, 0, sizeof param); + param.private_data = &srqn; + param.private_data_len = sizeof srqn; + ret = rdma_accept(id, &param); + if (ret) { + perror("rdma_accept"); + return ret; + } + + ret = rdma_get_recv_comp(id, &wc); + if (ret <= 0) { + perror("rdma_get_recv_comp"); + return ret; + } + + rdma_disconnect(id); + rdma_dereg_mr(mr); + rdma_destroy_ep(id); + rdma_destroy_ep(listen_id); + return 0; +} + +int main(int argc, char **argv) +{ + int op, ret; + + hints.ai_flags = RAI_PASSIVE; + hints.ai_port_space = RDMA_PS_TCP; + hints.ai_qp_type = IBV_QPT_RC; + + while ((op = getopt(argc, argv, "p:c:")) != -1) { + switch (op) { + case 'p': + port = optarg; + break; + case 'c': + switch (tolower(optarg[0])) { + case 'r': + break; + case 'x': + hints.ai_port_space = RDMA_PS_IB; + hints.ai_qp_type = IBV_QPT_XRC_RECV; + break; + default: + goto err; + } + break; + default: + goto err; + } + } + + printf("%s: start\n", argv[0]); + ret = test(); + printf("%s: end %d\n", argv[0], ret); + return ret; + +err: + printf("usage: %s\n", argv[0]); + printf("\t[-p port_number]\n"); + printf("\t[-c communication type]\n"); + printf("\t r - RC: reliable-connected (default)\n"); + printf("\t x - XRC: extended-reliable-connected\n"); + exit(1); +} diff --git a/librdmacm/examples/riostream.c b/librdmacm/examples/riostream.c new file mode 100644 index 0000000..2da5c0c --- /dev/null +++ b/librdmacm/examples/riostream.c @@ -0,0 +1,672 @@ +/* + * Copyright (c) 2011-2012 Intel Corporation. All rights reserved. + * Copyright (c) 2014 Mellanox Technologies LTD. All rights reserved.
+ * + * This software is available to you under the OpenIB.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <errno.h> +#include <getopt.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/wait.h> +#include <netdb.h> +#include <fcntl.h> +#include <unistd.h> +#include <netinet/tcp.h> + +#include <rdma/rdma_cma.h> +#include <rdma/rsocket.h> +#include <util/compiler.h> +#include "common.h" + +struct test_size_param { + int size; + int option; +}; + +static struct test_size_param test_size[] = { + { 1 << 6, 0 }, + { 1 << 7, 1 }, { (1 << 7) + (1 << 6), 1}, + { 1 << 8, 1 }, { (1 << 8) + (1 << 7), 1}, + { 1 << 9, 1 }, { (1 << 9) + (1 << 8), 1}, + { 1 << 10, 1 }, { (1 << 10) + (1 << 9), 1}, + { 1 << 11, 1 }, { (1 << 11) + (1 << 10), 1}, + { 1 << 12, 0 }, { (1 << 12) + (1 << 11), 1}, + { 1 << 13, 1 }, { (1 << 13) + (1 << 12), 1}, + { 1 << 14, 1 }, { (1 << 14) + (1 << 13), 1}, + { 1 << 15, 1 }, { (1 << 15) + (1 << 14), 1}, + { 1 << 16, 0 }, { (1 << 16) + (1 << 15), 1}, + { 1 << 17, 1 }, { (1 << 17) + (1 << 16), 1}, + { 1 << 18, 1 }, { (1 << 18) + (1 << 17), 1}, + { 1 << 19, 1 }, { (1 << 19) + (1 << 18), 1}, + { 1 << 20, 0 }, { (1 << 20) + (1 << 19), 1}, + { 1 << 21, 1 }, { (1 << 21) + (1 << 20), 1}, + { 1 << 22, 1 }, { (1 << 22) + (1 << 21), 1}, +}; +#define TEST_CNT (sizeof test_size / sizeof test_size[0]) + +static int rs, lrs; +static int use_async; +static int use_rgai; +static int verify; +static int flags = MSG_DONTWAIT; +static int poll_timeout = 0; +static int custom; +static enum rs_optimization optimization; +static int size_option; +static int iterations = 1; +static int transfer_size = 1000; +static int transfer_count = 1000; +static int buffer_size, inline_size = 64; +static char test_name[10] = "custom"; +static const char *port = "7471"; +static char *dst_addr; +static char *src_addr; +static struct timeval start, end; +static void *buf; +static volatile uint8_t *poll_byte; +static struct rdma_addrinfo rai_hints; +static struct addrinfo ai_hints; + +static void show_perf(void) +{ + char str[32]; + float usec; + long long bytes; + + usec = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec); + bytes = (long long) iterations * transfer_count * transfer_size * 2; + + /* name size transfers iterations bytes seconds Gb/sec usec/xfer */ + printf("%-10s", test_name); + size_str(str, sizeof str, transfer_size); + printf("%-8s", str); + cnt_str(str, sizeof
str, transfer_count); + printf("%-8s", str); + cnt_str(str, sizeof str, iterations); + printf("%-8s", str); + size_str(str, sizeof str, bytes); + printf("%-8s", str); + printf("%8.2fs%10.2f%11.2f\n", + usec / 1000000., (bytes * 8) / (1000. * usec), + (usec / iterations) / (transfer_count * 2)); +} + +static void init_latency_test(int size) +{ + char sstr[5]; + + size_str(sstr, sizeof sstr, size); + snprintf(test_name, sizeof test_name, "%s_lat", sstr); + transfer_count = 1; + transfer_size = size; + iterations = size_to_count(transfer_size); +} + +static void init_bandwidth_test(int size) +{ + char sstr[5]; + + size_str(sstr, sizeof sstr, size); + snprintf(test_name, sizeof test_name, "%s_bw", sstr); + iterations = 1; + transfer_size = size; + transfer_count = size_to_count(transfer_size); +} + +static int send_msg(int size) +{ + struct pollfd fds; + int offset, ret; + + if (use_async) { + fds.fd = rs; + fds.events = POLLOUT; + } + + for (offset = 0; offset < size; ) { + if (use_async) { + ret = do_poll(&fds, poll_timeout); + if (ret) + return ret; + } + + ret = rsend(rs, buf + offset, size - offset, flags); + if (ret > 0) { + offset += ret; + } else if (errno != EWOULDBLOCK && errno != EAGAIN) { + perror("rsend"); + return ret; + } + } + + return 0; +} + +static int send_xfer(int size) +{ + struct pollfd fds; + int offset, ret; + + if (use_async) { + fds.fd = rs; + fds.events = POLLOUT; + } + + for (offset = 0; offset < size; ) { + if (use_async) { + ret = do_poll(&fds, poll_timeout); + if (ret) + return ret; + } + + ret = riowrite(rs, buf + offset, size - offset, offset, flags); + if (ret > 0) { + offset += ret; + } else if (errno != EWOULDBLOCK && errno != EAGAIN) { + perror("riowrite"); + return ret; + } + } + + return 0; +} + +static int recv_msg(int size) +{ + struct pollfd fds; + int offset, ret; + + if (use_async) { + fds.fd = rs; + fds.events = POLLIN; + } + + for (offset = 0; offset < size; ) { + if (use_async) { + ret = do_poll(&fds, poll_timeout); + if (ret) + return ret; + } + + ret = rrecv(rs, buf + offset, size - offset, flags); + if (ret > 0) { + offset += ret; + } else if (errno != EWOULDBLOCK && errno != EAGAIN) { + perror("rrecv"); + return ret; + } + } + + return 0; +} + +static int recv_xfer(int size, uint8_t marker) +{ + int ret; + + while (*poll_byte != marker) + ; + + if (verify) { + ret = verify_buf(buf, size - 1); + if (ret) + return ret; + } + + return 0; +} + +static int sync_test(void) +{ + int ret; + + ret = dst_addr ? send_msg(16) : recv_msg(16); + if (ret) + return ret; + + return dst_addr ? 
recv_msg(16) : send_msg(16); +} + +static int run_test(void) +{ + int ret, i, t; + off_t offset; + uint8_t marker = 0; + + poll_byte = buf + transfer_size - 1; + *poll_byte = -1; + offset = riomap(rs, buf, transfer_size, PROT_WRITE, 0, 0); + if (offset == -1) { + perror("riomap"); + ret = -1; + goto out; + } + ret = sync_test(); + if (ret) + goto out; + + gettimeofday(&start, NULL); + for (i = 0; i < iterations; i++) { + if (dst_addr) { + for (t = 0; t < transfer_count - 1; t++) { + ret = send_xfer(transfer_size); + if (ret) + goto out; + } + *poll_byte = (uint8_t) marker++; + if (verify) + format_buf(buf, transfer_size - 1); + ret = send_xfer(transfer_size); + if (ret) + goto out; + + ret = recv_xfer(transfer_size, marker++); + } else { + ret = recv_xfer(transfer_size, marker++); + if (ret) + goto out; + + for (t = 0; t < transfer_count - 1; t++) { + ret = send_xfer(transfer_size); + if (ret) + goto out; + } + *poll_byte = (uint8_t) marker++; + if (verify) + format_buf(buf, transfer_size - 1); + ret = send_xfer(transfer_size); + } + if (ret) + goto out; + } + gettimeofday(&end, NULL); + show_perf(); + ret = riounmap(rs, buf, transfer_size); + +out: + return ret; +} + +static void set_options(int fd) +{ + int val; + + if (buffer_size) { + rsetsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *) &buffer_size, + sizeof buffer_size); + rsetsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *) &buffer_size, + sizeof buffer_size); + } else { + val = 1 << 19; + rsetsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *) &val, sizeof val); + rsetsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *) &val, sizeof val); + } + + val = 1; + rsetsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (void *) &val, sizeof(val)); + rsetsockopt(fd, SOL_RDMA, RDMA_IOMAPSIZE, (void *) &val, sizeof val); + + if (flags & MSG_DONTWAIT) + rfcntl(fd, F_SETFL, O_NONBLOCK); + + /* Inline size based on experimental data */ + if (optimization == opt_latency) { + rsetsockopt(fd, SOL_RDMA, RDMA_INLINE, &inline_size, + sizeof inline_size); + } else if (optimization == opt_bandwidth) { + val = 0; + rsetsockopt(fd, SOL_RDMA, RDMA_INLINE, &val, sizeof val); + } +} + +static int server_listen(void) +{ + struct rdma_addrinfo *rai = NULL; + struct addrinfo *ai; + int val, ret; + + if (use_rgai) { + rai_hints.ai_flags |= RAI_PASSIVE; + ret = rdma_getaddrinfo(src_addr, port, &rai_hints, &rai); + } else { + ai_hints.ai_flags |= AI_PASSIVE; + ret = getaddrinfo(src_addr, port, &ai_hints, &ai); + } + if (ret) { + printf("getaddrinfo: %s\n", gai_strerror(ret)); + return ret; + } + + lrs = rai ? rs_socket(rai->ai_family, SOCK_STREAM, 0) : + rs_socket(ai->ai_family, SOCK_STREAM, 0); + if (lrs < 0) { + ret = lrs; + goto free; + } + + val = 1; + ret = rsetsockopt(lrs, SOL_SOCKET, SO_REUSEADDR, &val, sizeof val); + if (ret) { + perror("rsetsockopt SO_REUSEADDR"); + goto close; + } + + ret = rai ? 
rbind(lrs, rai->ai_src_addr, rai->ai_src_len) : + rbind(lrs, ai->ai_addr, ai->ai_addrlen); + if (ret) { + perror("rbind"); + goto close; + } + + ret = rlisten(lrs, 1); + if (ret) + perror("rlisten"); + +close: + if (ret) + rclose(lrs); +free: + if (rai) + rdma_freeaddrinfo(rai); + else + freeaddrinfo(ai); + return ret; +} + +static int server_connect(void) +{ + struct pollfd fds; + int ret = 0; + + set_options(lrs); + do { + if (use_async) { + fds.fd = lrs; + fds.events = POLLIN; + + ret = do_poll(&fds, poll_timeout); + if (ret) { + perror("rpoll"); + return ret; + } + } + + rs = raccept(lrs, NULL, NULL); + } while (rs < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)); + if (rs < 0) { + perror("raccept"); + return rs; + } + + set_options(rs); + return ret; +} + +static int client_connect(void) +{ + struct rdma_addrinfo *rai = NULL; + struct addrinfo *ai; + struct pollfd fds; + int ret, err; + socklen_t len; + + ret = use_rgai ? rdma_getaddrinfo(dst_addr, port, &rai_hints, &rai) : + getaddrinfo(dst_addr, port, &ai_hints, &ai); + if (ret) { + printf("getaddrinfo: %s\n", gai_strerror(ret)); + return ret; + } + + rs = rai ? rs_socket(rai->ai_family, SOCK_STREAM, 0) : + rs_socket(ai->ai_family, SOCK_STREAM, 0); + if (rs < 0) { + ret = rs; + goto free; + } + + set_options(rs); + /* TODO: bind client to src_addr */ + + ret = rai ? rconnect(rs, rai->ai_dst_addr, rai->ai_dst_len) : + rconnect(rs, ai->ai_addr, ai->ai_addrlen); + if (ret && (errno != EINPROGRESS)) { + perror("rconnect"); + goto close; + } + + if (ret && (errno == EINPROGRESS)) { + fds.fd = rs; + fds.events = POLLOUT; + ret = do_poll(&fds, poll_timeout); + if (ret) { + perror("rpoll"); + goto close; + } + + len = sizeof err; + ret = rgetsockopt(rs, SOL_SOCKET, SO_ERROR, &err, &len); + if (ret) + goto close; + if (err) { + ret = -1; + errno = err; + perror("async rconnect"); + } + } + +close: + if (ret) + rclose(rs); +free: + if (rai) + rdma_freeaddrinfo(rai); + else + freeaddrinfo(ai); + return ret; +} + +static int run(void) +{ + int i, ret = 0; + + buf = malloc(!custom ? test_size[TEST_CNT - 1].size : transfer_size); + if (!buf) { + perror("malloc"); + return -1; + } + + if (!dst_addr) { + ret = server_listen(); + if (ret) + goto free; + } + + printf("%-10s%-8s%-8s%-8s%-8s%8s %10s%13s\n", + "name", "bytes", "xfers", "iters", "total", "time", "Gb/sec", "usec/xfer"); + if (!custom) { + optimization = opt_latency; + ret = dst_addr ? client_connect() : server_connect(); + if (ret) + goto free; + + for (i = 0; i < TEST_CNT; i++) { + if (test_size[i].option > size_option) + continue; + init_latency_test(test_size[i].size); + run_test(); + } + rshutdown(rs, SHUT_RDWR); + rclose(rs); + + optimization = opt_bandwidth; + ret = dst_addr ? client_connect() : server_connect(); + if (ret) + goto free; + for (i = 0; i < TEST_CNT; i++) { + if (test_size[i].option > size_option) + continue; + init_bandwidth_test(test_size[i].size); + run_test(); + } + } else { + ret = dst_addr ? 
client_connect() : server_connect(); + if (ret) + goto free; + + ret = run_test(); + } + + rshutdown(rs, SHUT_RDWR); + rclose(rs); +free: + free(buf); + return ret; +} + +static int set_test_opt(const char *arg) +{ + if (strlen(arg) == 1) { + switch (arg[0]) { + case 'a': + use_async = 1; + break; + case 'b': + flags = (flags & ~MSG_DONTWAIT) | MSG_WAITALL; + break; + case 'n': + flags |= MSG_DONTWAIT; + break; + case 'v': + verify = 1; + break; + default: + return -1; + } + } else { + if (!strncasecmp("async", arg, 5)) { + use_async = 1; + } else if (!strncasecmp("block", arg, 5)) { + flags = (flags & ~MSG_DONTWAIT) | MSG_WAITALL; + } else if (!strncasecmp("nonblock", arg, 8)) { + flags |= MSG_DONTWAIT; + } else if (!strncasecmp("verify", arg, 6)) { + verify = 1; + } else { + return -1; + } + } + return 0; +} + +int main(int argc, char **argv) +{ + int op, ret; + + ai_hints.ai_socktype = SOCK_STREAM; + rai_hints.ai_port_space = RDMA_PS_TCP; + while ((op = getopt(argc, argv, "s:b:f:B:i:I:C:S:p:T:")) != -1) { + switch (op) { + case 's': + dst_addr = optarg; + break; + case 'b': + src_addr = optarg; + break; + case 'f': + if (!strncasecmp("ip", optarg, 2)) { + ai_hints.ai_flags = AI_NUMERICHOST; + } else if (!strncasecmp("gid", optarg, 3)) { + rai_hints.ai_flags = RAI_NUMERICHOST | RAI_FAMILY; + rai_hints.ai_family = AF_IB; + use_rgai = 1; + } else { + fprintf(stderr, "Warning: unknown address format\n"); + } + break; + case 'B': + buffer_size = atoi(optarg); + break; + case 'i': + inline_size = atoi(optarg); + break; + case 'I': + custom = 1; + iterations = atoi(optarg); + break; + case 'C': + custom = 1; + transfer_count = atoi(optarg); + break; + case 'S': + if (!strncasecmp("all", optarg, 3)) { + size_option = 1; + } else { + custom = 1; + transfer_size = atoi(optarg); + } + break; + case 'p': + port = optarg; + break; + case 'T': + if (!set_test_opt(optarg)) + break; + /* invalid option - fall through */ + SWITCH_FALLTHROUGH; + default: + printf("usage: %s\n", argv[0]); + printf("\t[-s server_address]\n"); + printf("\t[-b bind_address]\n"); + printf("\t[-f address_format]\n"); + printf("\t name, ip, ipv6, or gid\n"); + printf("\t[-B buffer_size]\n"); + printf("\t[-i inline_size]\n"); + printf("\t[-I iterations]\n"); + printf("\t[-C transfer_count]\n"); + printf("\t[-S transfer_size or all]\n"); + printf("\t[-p port_number]\n"); + printf("\t[-T test_option]\n"); + printf("\t a|async - asynchronous operation (use poll)\n"); + printf("\t b|blocking - use blocking calls\n"); + printf("\t n|nonblocking - use nonblocking calls\n"); + printf("\t v|verify - verify data\n"); + exit(1); + } + } + + if (!(flags & MSG_DONTWAIT)) + poll_timeout = -1; + + ret = run(); + return ret; +} diff --git a/librdmacm/examples/rping.c b/librdmacm/examples/rping.c new file mode 100644 index 0000000..882f878 --- /dev/null +++ b/librdmacm/examples/rping.c @@ -0,0 +1,1384 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#define _GNU_SOURCE +#include <endian.h> +#include <getopt.h> +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <errno.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netdb.h> +#include <semaphore.h> +#include <pthread.h> +#include <inttypes.h> +#include <rdma/rdma_cma.h> +#include "common.h" + +static int debug = 0; +#define DEBUG_LOG if (debug) printf + +/* + * rping "ping/pong" loop: + * client sends source rkey/addr/len + * server receives source rkey/addr/len + * server rdma reads "ping" data from source + * server sends "go ahead" on rdma read completion + * client sends sink rkey/addr/len + * server receives sink rkey/addr/len + * server rdma writes "pong" data to sink + * server sends "go ahead" on rdma write completion + * <repeat loop> + */ + +/* + * These states are used to signal events between the completion handler + * and the main client or server thread. + * + * Once CONNECTED, they cycle through RDMA_READ_ADV, RDMA_WRITE_ADV, + * and RDMA_WRITE_COMPLETE for each ping. + */ +enum test_state { + IDLE = 1, + CONNECT_REQUEST, + ADDR_RESOLVED, + ROUTE_RESOLVED, + CONNECTED, + RDMA_READ_ADV, + RDMA_READ_COMPLETE, + RDMA_WRITE_ADV, + RDMA_WRITE_COMPLETE, + DISCONNECTED, + ERROR +}; + +struct rping_rdma_info { + __be64 buf; + __be32 rkey; + __be32 size; +}; + +/* + * Default max buffer size for IO... + */ +#define RPING_BUFSIZE 64*1024 +#define RPING_SQ_DEPTH 16 + +/* Default string for print data and + * minimum buffer size + */ +#define _stringify( _x ) # _x +#define stringify( _x ) _stringify( _x ) + +#define RPING_MSG_FMT "rdma-ping-%d: " +#define RPING_MIN_BUFSIZE sizeof(stringify(INT_MAX)) + sizeof(RPING_MSG_FMT) + +/* + * Control block struct.
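+ * One rping_cb holds all per-connection state: the verbs resources, the + * ping buffers and their MRs, the CM ids, and the state/semaphore pair + * used to sequence the ping/pong loop.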
+ */ +struct rping_cb { + int server; /* 0 iff client */ + pthread_t cqthread; + pthread_t persistent_server_thread; + struct ibv_comp_channel *channel; + struct ibv_cq *cq; + struct ibv_pd *pd; + struct ibv_qp *qp; + + struct ibv_recv_wr rq_wr; /* recv work request record */ + struct ibv_sge recv_sgl; /* recv single SGE */ + struct rping_rdma_info recv_buf;/* malloc'd buffer */ + struct ibv_mr *recv_mr; /* MR associated with this buffer */ + + struct ibv_send_wr sq_wr; /* send work request record */ + struct ibv_sge send_sgl; + struct rping_rdma_info send_buf;/* single send buf */ + struct ibv_mr *send_mr; + + struct ibv_send_wr rdma_sq_wr; /* rdma work request record */ + struct ibv_sge rdma_sgl; /* rdma single SGE */ + char *rdma_buf; /* used as rdma sink */ + struct ibv_mr *rdma_mr; + + uint32_t remote_rkey; /* remote guys RKEY */ + uint64_t remote_addr; /* remote guys TO */ + uint32_t remote_len; /* remote guys LEN */ + + char *start_buf; /* rdma read src */ + struct ibv_mr *start_mr; + + enum test_state state; /* used for cond/signalling */ + sem_t sem; + + struct sockaddr_storage sin; + struct sockaddr_storage ssource; + __be16 port; /* dst port in NBO */ + int verbose; /* verbose logging */ + int self_create_qp; /* Create QP not via cma */ + int count; /* ping count */ + int size; /* ping data size */ + int validate; /* validate ping data */ + + /* CM stuff */ + pthread_t cmthread; + struct rdma_event_channel *cm_channel; + struct rdma_cm_id *cm_id; /* connection on client side,*/ + /* listener on service side. */ + struct rdma_cm_id *child_cm_id; /* connection on server side */ +}; + +static int rping_cma_event_handler(struct rdma_cm_id *cma_id, + struct rdma_cm_event *event) +{ + int ret = 0; + struct rping_cb *cb = cma_id->context; + + DEBUG_LOG("cma_event type %s cma_id %p (%s)\n", + rdma_event_str(event->event), cma_id, + (cma_id == cb->cm_id) ? "parent" : "child"); + + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + cb->state = ADDR_RESOLVED; + ret = rdma_resolve_route(cma_id, 2000); + if (ret) { + cb->state = ERROR; + perror("rdma_resolve_route"); + sem_post(&cb->sem); + } + break; + + case RDMA_CM_EVENT_ROUTE_RESOLVED: + cb->state = ROUTE_RESOLVED; + sem_post(&cb->sem); + break; + + case RDMA_CM_EVENT_CONNECT_REQUEST: + cb->state = CONNECT_REQUEST; + cb->child_cm_id = cma_id; + DEBUG_LOG("child cma %p\n", cb->child_cm_id); + sem_post(&cb->sem); + break; + + case RDMA_CM_EVENT_CONNECT_RESPONSE: + DEBUG_LOG("CONNECT_RESPONSE\n"); + cb->state = CONNECTED; + sem_post(&cb->sem); + break; + + case RDMA_CM_EVENT_ESTABLISHED: + DEBUG_LOG("ESTABLISHED\n"); + + /* + * Server will wake up when first RECV completes. + */ + if (!cb->server) { + cb->state = CONNECTED; + } + sem_post(&cb->sem); + break; + + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_REJECTED: + fprintf(stderr, "cma event %s, error %d\n", + rdma_event_str(event->event), event->status); + sem_post(&cb->sem); + ret = -1; + break; + + case RDMA_CM_EVENT_DISCONNECTED: + fprintf(stderr, "%s DISCONNECT EVENT...\n", + cb->server ? 
"server" : "client"); + cb->state = DISCONNECTED; + sem_post(&cb->sem); + break; + + case RDMA_CM_EVENT_DEVICE_REMOVAL: + fprintf(stderr, "cma detected device removal!!!!\n"); + cb->state = ERROR; + sem_post(&cb->sem); + ret = -1; + break; + + default: + fprintf(stderr, "unhandled event: %s, ignoring\n", + rdma_event_str(event->event)); + break; + } + + return ret; +} + +static int server_recv(struct rping_cb *cb, struct ibv_wc *wc) +{ + if (wc->byte_len != sizeof(cb->recv_buf)) { + fprintf(stderr, "Received bogus data, size %d\n", wc->byte_len); + return -1; + } + + cb->remote_rkey = be32toh(cb->recv_buf.rkey); + cb->remote_addr = be64toh(cb->recv_buf.buf); + cb->remote_len = be32toh(cb->recv_buf.size); + DEBUG_LOG("Received rkey %x addr %" PRIx64 " len %d from peer\n", + cb->remote_rkey, cb->remote_addr, cb->remote_len); + + if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE) + cb->state = RDMA_READ_ADV; + else + cb->state = RDMA_WRITE_ADV; + + return 0; +} + +static int client_recv(struct rping_cb *cb, struct ibv_wc *wc) +{ + if (wc->byte_len != sizeof(cb->recv_buf)) { + fprintf(stderr, "Received bogus data, size %d\n", wc->byte_len); + return -1; + } + + if (cb->state == RDMA_READ_ADV) + cb->state = RDMA_WRITE_ADV; + else + cb->state = RDMA_WRITE_COMPLETE; + + return 0; +} + +static int rping_cq_event_handler(struct rping_cb *cb) +{ + struct ibv_wc wc; + struct ibv_recv_wr *bad_wr; + int ret; + int flushed = 0; + + while ((ret = ibv_poll_cq(cb->cq, 1, &wc)) == 1) { + ret = 0; + + if (wc.status) { + if (wc.status == IBV_WC_WR_FLUSH_ERR) { + flushed = 1; + continue; + + } + fprintf(stderr, + "cq completion failed status %d\n", + wc.status); + ret = -1; + goto error; + } + + switch (wc.opcode) { + case IBV_WC_SEND: + DEBUG_LOG("send completion\n"); + break; + + case IBV_WC_RDMA_WRITE: + DEBUG_LOG("rdma write completion\n"); + cb->state = RDMA_WRITE_COMPLETE; + sem_post(&cb->sem); + break; + + case IBV_WC_RDMA_READ: + DEBUG_LOG("rdma read completion\n"); + cb->state = RDMA_READ_COMPLETE; + sem_post(&cb->sem); + break; + + case IBV_WC_RECV: + DEBUG_LOG("recv completion\n"); + ret = cb->server ? server_recv(cb, &wc) : + client_recv(cb, &wc); + if (ret) { + fprintf(stderr, "recv wc error: %d\n", ret); + goto error; + } + + ret = ibv_post_recv(cb->qp, &cb->rq_wr, &bad_wr); + if (ret) { + fprintf(stderr, "post recv error: %d\n", ret); + goto error; + } + sem_post(&cb->sem); + break; + + default: + DEBUG_LOG("unknown!!!!! 
completion\n"); + ret = -1; + goto error; + } + } + if (ret) { + fprintf(stderr, "poll error %d\n", ret); + goto error; + } + return flushed; + +error: + cb->state = ERROR; + sem_post(&cb->sem); + return ret; +} + +static void rping_init_conn_param(struct rping_cb *cb, + struct rdma_conn_param *conn_param) +{ + memset(conn_param, 0, sizeof(*conn_param)); + conn_param->responder_resources = 1; + conn_param->initiator_depth = 1; + conn_param->retry_count = 7; + conn_param->rnr_retry_count = 7; + if (cb->self_create_qp) + conn_param->qp_num = cb->qp->qp_num; +} + + +static int rping_self_modify_qp(struct rping_cb *cb, struct rdma_cm_id *id) +{ + struct ibv_qp_attr qp_attr; + int qp_attr_mask, ret; + + qp_attr.qp_state = IBV_QPS_INIT; + ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + ret = ibv_modify_qp(cb->qp, &qp_attr, qp_attr_mask); + if (ret) + return ret; + + qp_attr.qp_state = IBV_QPS_RTR; + ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + ret = ibv_modify_qp(cb->qp, &qp_attr, qp_attr_mask); + if (ret) + return ret; + + qp_attr.qp_state = IBV_QPS_RTS; + ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + return ibv_modify_qp(cb->qp, &qp_attr, qp_attr_mask); +} + +static int rping_accept(struct rping_cb *cb) +{ + struct rdma_conn_param conn_param; + int ret; + + DEBUG_LOG("accepting client connection request\n"); + + if (cb->self_create_qp) { + ret = rping_self_modify_qp(cb, cb->child_cm_id); + if (ret) + return ret; + + rping_init_conn_param(cb, &conn_param); + ret = rdma_accept(cb->child_cm_id, &conn_param); + } else { + ret = rdma_accept(cb->child_cm_id, NULL); + } + if (ret) { + perror("rdma_accept"); + return ret; + } + + sem_wait(&cb->sem); + if (cb->state == ERROR) { + fprintf(stderr, "wait for CONNECTED state %d\n", cb->state); + return -1; + } + return 0; +} + +static int rping_disconnect(struct rping_cb *cb, struct rdma_cm_id *id) +{ + struct ibv_qp_attr qp_attr = {}; + int err = 0; + + if (cb->self_create_qp) { + qp_attr.qp_state = IBV_QPS_ERR; + err = ibv_modify_qp(cb->qp, &qp_attr, IBV_QP_STATE); + if (err) + return err; + } + + return rdma_disconnect(id); +} + +static void rping_setup_wr(struct rping_cb *cb) +{ + cb->recv_sgl.addr = (uint64_t) (unsigned long) &cb->recv_buf; + cb->recv_sgl.length = sizeof cb->recv_buf; + cb->recv_sgl.lkey = cb->recv_mr->lkey; + cb->rq_wr.sg_list = &cb->recv_sgl; + cb->rq_wr.num_sge = 1; + + cb->send_sgl.addr = (uint64_t) (unsigned long) &cb->send_buf; + cb->send_sgl.length = sizeof cb->send_buf; + cb->send_sgl.lkey = cb->send_mr->lkey; + + cb->sq_wr.opcode = IBV_WR_SEND; + cb->sq_wr.send_flags = IBV_SEND_SIGNALED; + cb->sq_wr.sg_list = &cb->send_sgl; + cb->sq_wr.num_sge = 1; + + cb->rdma_sgl.addr = (uint64_t) (unsigned long) cb->rdma_buf; + cb->rdma_sgl.lkey = cb->rdma_mr->lkey; + cb->rdma_sq_wr.send_flags = IBV_SEND_SIGNALED; + cb->rdma_sq_wr.sg_list = &cb->rdma_sgl; + cb->rdma_sq_wr.num_sge = 1; +} + +static int rping_setup_buffers(struct rping_cb *cb) +{ + int ret; + + DEBUG_LOG("rping_setup_buffers called on cb %p\n", cb); + + cb->recv_mr = ibv_reg_mr(cb->pd, &cb->recv_buf, sizeof cb->recv_buf, + IBV_ACCESS_LOCAL_WRITE); + if (!cb->recv_mr) { + fprintf(stderr, "recv_buf reg_mr failed\n"); + return errno; + } + + cb->send_mr = ibv_reg_mr(cb->pd, &cb->send_buf, sizeof cb->send_buf, 0); + if (!cb->send_mr) { + fprintf(stderr, "send_buf reg_mr failed\n"); + ret = errno; + goto err1; + } + + cb->rdma_buf = malloc(cb->size); + if (!cb->rdma_buf) 
{ + fprintf(stderr, "rdma_buf malloc failed\n"); + ret = -ENOMEM; + goto err2; + } + + cb->rdma_mr = ibv_reg_mr(cb->pd, cb->rdma_buf, cb->size, + IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_WRITE); + if (!cb->rdma_mr) { + fprintf(stderr, "rdma_buf reg_mr failed\n"); + ret = errno; + goto err3; + } + + if (!cb->server) { + cb->start_buf = malloc(cb->size); + if (!cb->start_buf) { + fprintf(stderr, "start_buf malloc failed\n"); + ret = -ENOMEM; + goto err4; + } + + cb->start_mr = ibv_reg_mr(cb->pd, cb->start_buf, cb->size, + IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_WRITE); + if (!cb->start_mr) { + fprintf(stderr, "start_buf reg_mr failed\n"); + ret = errno; + goto err5; + } + } + + rping_setup_wr(cb); + DEBUG_LOG("allocated & registered buffers...\n"); + return 0; + +err5: + free(cb->start_buf); +err4: + ibv_dereg_mr(cb->rdma_mr); +err3: + free(cb->rdma_buf); +err2: + ibv_dereg_mr(cb->send_mr); +err1: + ibv_dereg_mr(cb->recv_mr); + return ret; +} + +static void rping_free_buffers(struct rping_cb *cb) +{ + DEBUG_LOG("rping_free_buffers called on cb %p\n", cb); + ibv_dereg_mr(cb->recv_mr); + ibv_dereg_mr(cb->send_mr); + ibv_dereg_mr(cb->rdma_mr); + free(cb->rdma_buf); + if (!cb->server) { + ibv_dereg_mr(cb->start_mr); + free(cb->start_buf); + } +} + +static int rping_create_qp(struct rping_cb *cb) +{ + struct ibv_qp_init_attr init_attr; + struct rdma_cm_id *id; + int ret; + + memset(&init_attr, 0, sizeof(init_attr)); + init_attr.cap.max_send_wr = RPING_SQ_DEPTH; + init_attr.cap.max_recv_wr = 2; + init_attr.cap.max_recv_sge = 1; + init_attr.cap.max_send_sge = 1; + init_attr.qp_type = IBV_QPT_RC; + init_attr.send_cq = cb->cq; + init_attr.recv_cq = cb->cq; + + if (cb->self_create_qp) { + cb->qp = ibv_create_qp(cb->pd, &init_attr); + if (!cb->qp) { + perror("ibv_create_qp"); + return -1; + } + return 0; + } + + id = cb->server ? 
cb->child_cm_id : cb->cm_id; + ret = rdma_create_qp(id, cb->pd, &init_attr); + if (!ret) + cb->qp = id->qp; + else + perror("rdma_create_qp"); + return ret; +} + +static void rping_free_qp(struct rping_cb *cb) +{ + ibv_destroy_qp(cb->qp); + ibv_destroy_cq(cb->cq); + ibv_destroy_comp_channel(cb->channel); + ibv_dealloc_pd(cb->pd); +} + +static int rping_setup_qp(struct rping_cb *cb, struct rdma_cm_id *cm_id) +{ + int ret; + + cb->pd = ibv_alloc_pd(cm_id->verbs); + if (!cb->pd) { + fprintf(stderr, "ibv_alloc_pd failed\n"); + return errno; + } + DEBUG_LOG("created pd %p\n", cb->pd); + + cb->channel = ibv_create_comp_channel(cm_id->verbs); + if (!cb->channel) { + fprintf(stderr, "ibv_create_comp_channel failed\n"); + ret = errno; + goto err1; + } + DEBUG_LOG("created channel %p\n", cb->channel); + + cb->cq = ibv_create_cq(cm_id->verbs, RPING_SQ_DEPTH * 2, cb, + cb->channel, 0); + if (!cb->cq) { + fprintf(stderr, "ibv_create_cq failed\n"); + ret = errno; + goto err2; + } + DEBUG_LOG("created cq %p\n", cb->cq); + + ret = ibv_req_notify_cq(cb->cq, 0); + if (ret) { + fprintf(stderr, "ibv_req_notify_cq failed\n"); + ret = errno; + goto err3; + } + + ret = rping_create_qp(cb); + if (ret) { + goto err3; + } + DEBUG_LOG("created qp %p\n", cb->qp); + return 0; + +err3: + ibv_destroy_cq(cb->cq); +err2: + ibv_destroy_comp_channel(cb->channel); +err1: + ibv_dealloc_pd(cb->pd); + return ret; +} + +static void *cm_thread(void *arg) +{ + struct rping_cb *cb = arg; + struct rdma_cm_event *event; + int ret; + + while (1) { + ret = rdma_get_cm_event(cb->cm_channel, &event); + if (ret) { + perror("rdma_get_cm_event"); + exit(ret); + } + ret = rping_cma_event_handler(event->id, event); + rdma_ack_cm_event(event); + if (ret) + exit(ret); + } +} + +static void *cq_thread(void *arg) +{ + struct rping_cb *cb = arg; + struct ibv_cq *ev_cq; + void *ev_ctx; + int ret; + + DEBUG_LOG("cq_thread started.\n"); + + while (1) { + pthread_testcancel(); + + ret = ibv_get_cq_event(cb->channel, &ev_cq, &ev_ctx); + if (ret) { + fprintf(stderr, "Failed to get cq event!\n"); + pthread_exit(NULL); + } + if (ev_cq != cb->cq) { + fprintf(stderr, "Unknown CQ!\n"); + pthread_exit(NULL); + } + ret = ibv_req_notify_cq(cb->cq, 0); + if (ret) { + fprintf(stderr, "Failed to set notify!\n"); + pthread_exit(NULL); + } + ret = rping_cq_event_handler(cb); + ibv_ack_cq_events(cb->cq, 1); + if (ret) + pthread_exit(NULL); + } +} + +static void rping_format_send(struct rping_cb *cb, char *buf, struct ibv_mr *mr) +{ + struct rping_rdma_info *info = &cb->send_buf; + + info->buf = htobe64((uint64_t) (unsigned long) buf); + info->rkey = htobe32(mr->rkey); + info->size = htobe32(cb->size); + + DEBUG_LOG("RDMA addr %" PRIx64" rkey %x len %d\n", + be64toh(info->buf), be32toh(info->rkey), be32toh(info->size)); +} + +static int rping_test_server(struct rping_cb *cb) +{ + struct ibv_send_wr *bad_wr; + int ret; + + while (1) { + /* Wait for client's Start STAG/TO/Len */ + sem_wait(&cb->sem); + if (cb->state != RDMA_READ_ADV) { + fprintf(stderr, "wait for RDMA_READ_ADV state %d\n", + cb->state); + ret = -1; + break; + } + + DEBUG_LOG("server received sink adv\n"); + + /* Issue RDMA Read. 
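The client's send advertised its source buffer (addr, rkey, length); pull the ping data into the local rdma_buf before echoing it back. 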
*/ + cb->rdma_sq_wr.opcode = IBV_WR_RDMA_READ; + cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; + cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; + cb->rdma_sq_wr.sg_list->length = cb->remote_len; + + ret = ibv_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); + if (ret) { + fprintf(stderr, "post send error %d\n", ret); + break; + } + DEBUG_LOG("server posted rdma read req \n"); + + /* Wait for read completion */ + sem_wait(&cb->sem); + if (cb->state != RDMA_READ_COMPLETE) { + fprintf(stderr, "wait for RDMA_READ_COMPLETE state %d\n", + cb->state); + ret = -1; + break; + } + DEBUG_LOG("server received read complete\n"); + + /* Display data in recv buf */ + if (cb->verbose) + printf("server ping data: %s\n", cb->rdma_buf); + + /* Tell client to continue */ + ret = ibv_post_send(cb->qp, &cb->sq_wr, &bad_wr); + if (ret) { + fprintf(stderr, "post send error %d\n", ret); + break; + } + DEBUG_LOG("server posted go ahead\n"); + + /* Wait for client's RDMA STAG/TO/Len */ + sem_wait(&cb->sem); + if (cb->state != RDMA_WRITE_ADV) { + fprintf(stderr, "wait for RDMA_WRITE_ADV state %d\n", + cb->state); + ret = -1; + break; + } + DEBUG_LOG("server received sink adv\n"); + + /* RDMA Write echo data */ + cb->rdma_sq_wr.opcode = IBV_WR_RDMA_WRITE; + cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; + cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; + cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1; + DEBUG_LOG("rdma write from lkey %x laddr %" PRIx64 " len %d\n", + cb->rdma_sq_wr.sg_list->lkey, + cb->rdma_sq_wr.sg_list->addr, + cb->rdma_sq_wr.sg_list->length); + + ret = ibv_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); + if (ret) { + fprintf(stderr, "post send error %d\n", ret); + break; + } + + /* Wait for completion */ + ret = sem_wait(&cb->sem); + if (cb->state != RDMA_WRITE_COMPLETE) { + fprintf(stderr, "wait for RDMA_WRITE_COMPLETE state %d\n", + cb->state); + ret = -1; + break; + } + DEBUG_LOG("server rdma write complete \n"); + + /* Tell client to begin again */ + ret = ibv_post_send(cb->qp, &cb->sq_wr, &bad_wr); + if (ret) { + fprintf(stderr, "post send error %d\n", ret); + break; + } + DEBUG_LOG("server posted go ahead\n"); + } + + return (cb->state == DISCONNECTED) ? 
0 : ret; +} + +static int rping_bind_server(struct rping_cb *cb) +{ + int ret; + + if (cb->sin.ss_family == AF_INET) + ((struct sockaddr_in *) &cb->sin)->sin_port = cb->port; + else + ((struct sockaddr_in6 *) &cb->sin)->sin6_port = cb->port; + + ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &cb->sin); + if (ret) { + perror("rdma_bind_addr"); + return ret; + } + DEBUG_LOG("rdma_bind_addr successful\n"); + + DEBUG_LOG("rdma_listen\n"); + ret = rdma_listen(cb->cm_id, 3); + if (ret) { + perror("rdma_listen"); + return ret; + } + + return 0; +} + +static struct rping_cb *clone_cb(struct rping_cb *listening_cb) +{ + struct rping_cb *cb = malloc(sizeof *cb); + if (!cb) + return NULL; + memset(cb, 0, sizeof *cb); + *cb = *listening_cb; + cb->child_cm_id->context = cb; + return cb; +} + +static void free_cb(struct rping_cb *cb) +{ + free(cb); +} + +static void *rping_persistent_server_thread(void *arg) +{ + struct rping_cb *cb = arg; + struct ibv_recv_wr *bad_wr; + int ret; + + ret = rping_setup_qp(cb, cb->child_cm_id); + if (ret) { + fprintf(stderr, "setup_qp failed: %d\n", ret); + goto err0; + } + + ret = rping_setup_buffers(cb); + if (ret) { + fprintf(stderr, "rping_setup_buffers failed: %d\n", ret); + goto err1; + } + + ret = ibv_post_recv(cb->qp, &cb->rq_wr, &bad_wr); + if (ret) { + fprintf(stderr, "ibv_post_recv failed: %d\n", ret); + goto err2; + } + + ret = pthread_create(&cb->cqthread, NULL, cq_thread, cb); + if (ret) { + perror("pthread_create"); + goto err2; + } + + ret = rping_accept(cb); + if (ret) { + fprintf(stderr, "connect error %d\n", ret); + goto err3; + } + + rping_test_server(cb); + rping_disconnect(cb, cb->child_cm_id); + pthread_join(cb->cqthread, NULL); + rping_free_buffers(cb); + rping_free_qp(cb); + rdma_destroy_id(cb->child_cm_id); + free_cb(cb); + return NULL; +err3: + pthread_cancel(cb->cqthread); + pthread_join(cb->cqthread, NULL); +err2: + rping_free_buffers(cb); +err1: + rping_free_qp(cb); +err0: + free_cb(cb); + return NULL; +} + +static int rping_run_persistent_server(struct rping_cb *listening_cb) +{ + int ret; + struct rping_cb *cb; + pthread_attr_t attr; + + ret = rping_bind_server(listening_cb); + if (ret) + return ret; + + /* + * Set persistent server threads to DETACHED state so + * they release all their resources when they exit. 
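+ * A detached thread cannot be pthread_join()ed; the system + * reclaims its stack and thread descriptor as soon as the + * thread function returns.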
+ */ + ret = pthread_attr_init(&attr); + if (ret) { + perror("pthread_attr_init"); + return ret; + } + ret = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + if (ret) { + perror("pthread_attr_setdetachstate"); + return ret; + } + + while (1) { + sem_wait(&listening_cb->sem); + if (listening_cb->state != CONNECT_REQUEST) { + fprintf(stderr, "wait for CONNECT_REQUEST state %d\n", + listening_cb->state); + return -1; + } + + cb = clone_cb(listening_cb); + if (!cb) + return -1; + + ret = pthread_create(&cb->persistent_server_thread, &attr, rping_persistent_server_thread, cb); + if (ret) { + perror("pthread_create"); + return ret; + } + } + return 0; +} + +static int rping_run_server(struct rping_cb *cb) +{ + struct ibv_recv_wr *bad_wr; + int ret; + + ret = rping_bind_server(cb); + if (ret) + return ret; + + sem_wait(&cb->sem); + if (cb->state != CONNECT_REQUEST) { + fprintf(stderr, "wait for CONNECT_REQUEST state %d\n", + cb->state); + return -1; + } + + ret = rping_setup_qp(cb, cb->child_cm_id); + if (ret) { + fprintf(stderr, "setup_qp failed: %d\n", ret); + return ret; + } + + ret = rping_setup_buffers(cb); + if (ret) { + fprintf(stderr, "rping_setup_buffers failed: %d\n", ret); + goto err1; + } + + ret = ibv_post_recv(cb->qp, &cb->rq_wr, &bad_wr); + if (ret) { + fprintf(stderr, "ibv_post_recv failed: %d\n", ret); + goto err2; + } + + ret = pthread_create(&cb->cqthread, NULL, cq_thread, cb); + if (ret) { + perror("pthread_create"); + goto err2; + } + + ret = rping_accept(cb); + if (ret) { + fprintf(stderr, "connect error %d\n", ret); + goto err2; + } + + ret = rping_test_server(cb); + if (ret) { + fprintf(stderr, "rping server failed: %d\n", ret); + goto err3; + } + + ret = 0; +err3: + rping_disconnect(cb, cb->child_cm_id); + pthread_join(cb->cqthread, NULL); + rdma_destroy_id(cb->child_cm_id); +err2: + rping_free_buffers(cb); +err1: + rping_free_qp(cb); + + return ret; +} + +static int rping_test_client(struct rping_cb *cb) +{ + int ping, start, cc, i, ret = 0; + struct ibv_send_wr *bad_wr; + unsigned char c; + + start = 65; + for (ping = 0; !cb->count || ping < cb->count; ping++) { + cb->state = RDMA_READ_ADV; + + /* Put some ascii text in the buffer. */ + cc = snprintf(cb->start_buf, cb->size, RPING_MSG_FMT, ping); + for (i = cc, c = start; i < cb->size; i++) { + cb->start_buf[i] = c; + c++; + if (c > 122) + c = 65; + } + start++; + if (start > 122) + start = 65; + cb->start_buf[cb->size - 1] = 0; + + rping_format_send(cb, cb->start_buf, cb->start_mr); + ret = ibv_post_send(cb->qp, &cb->sq_wr, &bad_wr); + if (ret) { + fprintf(stderr, "post send error %d\n", ret); + break; + } + + /* Wait for server to ACK */ + sem_wait(&cb->sem); + if (cb->state != RDMA_WRITE_ADV) { + fprintf(stderr, "wait for RDMA_WRITE_ADV state %d\n", + cb->state); + ret = -1; + break; + } + + rping_format_send(cb, cb->rdma_buf, cb->rdma_mr); + ret = ibv_post_send(cb->qp, &cb->sq_wr, &bad_wr); + if (ret) { + fprintf(stderr, "post send error %d\n", ret); + break; + } + + /* Wait for the server to say the RDMA Write is complete. */ + sem_wait(&cb->sem); + if (cb->state != RDMA_WRITE_COMPLETE) { + fprintf(stderr, "wait for RDMA_WRITE_COMPLETE state %d\n", + cb->state); + ret = -1; + break; + } + + if (cb->validate) + if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) { + fprintf(stderr, "data mismatch!\n"); + ret = -1; + break; + } + + if (cb->verbose) + printf("ping data: %s\n", cb->rdma_buf); + } + + return (cb->state == DISCONNECTED) ? 
0 : ret; +} + +static int rping_connect_client(struct rping_cb *cb) +{ + struct rdma_conn_param conn_param; + int ret; + + rping_init_conn_param(cb, &conn_param); + ret = rdma_connect(cb->cm_id, &conn_param); + if (ret) { + perror("rdma_connect"); + return ret; + } + + sem_wait(&cb->sem); + if (cb->state != CONNECTED) { + fprintf(stderr, "wait for CONNECTED state %d\n", cb->state); + return -1; + } + + if (cb->self_create_qp) { + ret = rping_self_modify_qp(cb, cb->cm_id); + if (ret) { + perror("rping_self_modify_qp"); + return ret; + } + + ret = rdma_establish(cb->cm_id); + if (ret) { + perror("rdma_establish"); + return ret; + } + } + + DEBUG_LOG("rdma_connect successful\n"); + return 0; +} + +static int rping_bind_client(struct rping_cb *cb) +{ + int ret; + + if (cb->sin.ss_family == AF_INET) + ((struct sockaddr_in *) &cb->sin)->sin_port = cb->port; + else + ((struct sockaddr_in6 *) &cb->sin)->sin6_port = cb->port; + + if (cb->ssource.ss_family) + ret = rdma_resolve_addr(cb->cm_id, (struct sockaddr *) &cb->ssource, + (struct sockaddr *) &cb->sin, 2000); + else + ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &cb->sin, 2000); + + if (ret) { + perror("rdma_resolve_addr"); + return ret; + } + + sem_wait(&cb->sem); + if (cb->state != ROUTE_RESOLVED) { + fprintf(stderr, "waiting for addr/route resolution state %d\n", + cb->state); + return -1; + } + + DEBUG_LOG("rdma_resolve_addr - rdma_resolve_route successful\n"); + return 0; +} + +static int rping_run_client(struct rping_cb *cb) +{ + struct ibv_recv_wr *bad_wr; + int ret; + + ret = rping_bind_client(cb); + if (ret) + return ret; + + ret = rping_setup_qp(cb, cb->cm_id); + if (ret) { + fprintf(stderr, "setup_qp failed: %d\n", ret); + return ret; + } + + ret = rping_setup_buffers(cb); + if (ret) { + fprintf(stderr, "rping_setup_buffers failed: %d\n", ret); + goto err1; + } + + ret = ibv_post_recv(cb->qp, &cb->rq_wr, &bad_wr); + if (ret) { + fprintf(stderr, "ibv_post_recv failed: %d\n", ret); + goto err2; + } + + ret = pthread_create(&cb->cqthread, NULL, cq_thread, cb); + if (ret) { + perror("pthread_create"); + goto err2; + } + + ret = rping_connect_client(cb); + if (ret) { + fprintf(stderr, "connect error %d\n", ret); + goto err3; + } + + ret = rping_test_client(cb); + if (ret) { + fprintf(stderr, "rping client failed: %d\n", ret); + goto err4; + } + + ret = 0; +err4: + rping_disconnect(cb, cb->cm_id); +err3: + pthread_join(cb->cqthread, NULL); +err2: + rping_free_buffers(cb); +err1: + rping_free_qp(cb); + + return ret; +} + +static int get_addr(char *dst, struct sockaddr *addr) +{ + struct addrinfo *res; + int ret; + + ret = getaddrinfo(dst, NULL, NULL, &res); + if (ret) { + printf("getaddrinfo failed (%s) - invalid hostname or IP address\n", gai_strerror(ret)); + return ret; + } + + if (res->ai_family == PF_INET) + memcpy(addr, res->ai_addr, sizeof(struct sockaddr_in)); + else if (res->ai_family == PF_INET6) + memcpy(addr, res->ai_addr, sizeof(struct sockaddr_in6)); + else + ret = -1; + + freeaddrinfo(res); + return ret; +} + +static void usage(const char *name) +{ + printf("%s -s [-vVd] [-S size] [-C count] [-a addr] [-p port]\n", + basename(name)); + printf("%s -c [-vVd] [-S size] [-C count] [-I addr] -a addr [-p port]\n", + basename(name)); + printf("\t-c\t\tclient side\n"); + printf("\t-I\t\tSource address to bind to for client.\n"); + printf("\t-s\t\tserver side. 
To bind to any address with IPv6 use -a ::0\n"); + printf("\t-v\t\tdisplay ping data to stdout\n"); + printf("\t-V\t\tvalidate ping data\n"); + printf("\t-d\t\tdebug printfs\n"); + printf("\t-S size \tping data size\n"); + printf("\t-C count\tping count times\n"); + printf("\t-a addr\t\taddress\n"); + printf("\t-p port\t\tport\n"); + printf("\t-P\t\tpersistent server mode allowing multiple connections\n"); + printf("\t-q\t\tuse self-created, self-modified QP\n"); +} + +int main(int argc, char *argv[]) +{ + struct rping_cb *cb; + int op; + int ret = 0; + int persistent_server = 0; + + cb = malloc(sizeof(*cb)); + if (!cb) + return -ENOMEM; + + memset(cb, 0, sizeof(*cb)); + cb->server = -1; + cb->state = IDLE; + cb->size = 64; + cb->sin.ss_family = PF_INET; + cb->port = htobe16(7174); + sem_init(&cb->sem, 0, 0); + + opterr = 0; + while ((op = getopt(argc, argv, "a:I:Pp:C:S:t:scvVdq")) != -1) { + switch (op) { + case 'a': + ret = get_addr(optarg, (struct sockaddr *) &cb->sin); + break; + case 'I': + ret = get_addr(optarg, (struct sockaddr *) &cb->ssource); + break; + case 'P': + persistent_server = 1; + break; + case 'p': + cb->port = htobe16(atoi(optarg)); + DEBUG_LOG("port %d\n", (int) atoi(optarg)); + break; + case 's': + cb->server = 1; + DEBUG_LOG("server\n"); + break; + case 'c': + cb->server = 0; + DEBUG_LOG("client\n"); + break; + case 'S': + cb->size = atoi(optarg); + if ((cb->size < RPING_MIN_BUFSIZE) || + (cb->size > (RPING_BUFSIZE - 1))) { + fprintf(stderr, "Invalid size %d " + "(valid range is %zd to %d)\n", + cb->size, RPING_MIN_BUFSIZE, RPING_BUFSIZE); + ret = EINVAL; + } else + DEBUG_LOG("size %d\n", (int) atoi(optarg)); + break; + case 'C': + cb->count = atoi(optarg); + if (cb->count < 0) { + fprintf(stderr, "Invalid count %d\n", + cb->count); + ret = EINVAL; + } else + DEBUG_LOG("count %d\n", (int) cb->count); + break; + case 'v': + cb->verbose++; + DEBUG_LOG("verbose\n"); + break; + case 'V': + cb->validate++; + DEBUG_LOG("validate data\n"); + break; + case 'd': + debug++; + break; + case 'q': + cb->self_create_qp = 1; + break; + default: + usage("rping"); + ret = EINVAL; + goto out; + } + } + if (ret) + goto out; + + if (cb->server == -1) { + usage("rping"); + ret = EINVAL; + goto out; + } + + cb->cm_channel = create_first_event_channel(); + if (!cb->cm_channel) { + ret = errno; + goto out; + } + + ret = rdma_create_id(cb->cm_channel, &cb->cm_id, cb, RDMA_PS_TCP); + if (ret) { + perror("rdma_create_id"); + goto out2; + } + DEBUG_LOG("created cm_id %p\n", cb->cm_id); + + ret = pthread_create(&cb->cmthread, NULL, cm_thread, cb); + if (ret) { + perror("pthread_create"); + goto out2; + } + + if (cb->server) { + if (persistent_server) + ret = rping_run_persistent_server(cb); + else + ret = rping_run_server(cb); + } else { + ret = rping_run_client(cb); + } + + DEBUG_LOG("destroy cm_id %p\n", cb->cm_id); + rdma_destroy_id(cb->cm_id); +out2: + rdma_destroy_event_channel(cb->cm_channel); +out: + free(cb); + return ret; +} diff --git a/librdmacm/examples/rstream.c b/librdmacm/examples/rstream.c new file mode 100644 index 0000000..b692031 --- /dev/null +++ b/librdmacm/examples/rstream.c @@ -0,0 +1,709 @@ +/* + * Copyright (c) 2011-2012 Intel Corporation. All rights reserved. + * Copyright (c) 2014-2015 Mellanox Technologies LTD. All rights reserved. 
+ * + * This software is available to you under the OpenIB.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <errno.h> +#include <getopt.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/wait.h> +#include <netdb.h> +#include <fcntl.h> +#include <unistd.h> +#include <netinet/tcp.h> + +#include <rdma/rdma_cma.h> +#include <rdma/rsocket.h> +#include <util/compiler.h> +#include "common.h" + +struct test_size_param { + int size; + int option; +}; + +static struct test_size_param test_size[] = { + { 1 << 6, 0 }, + { 1 << 7, 1 }, { (1 << 7) + (1 << 6), 1}, + { 1 << 8, 1 }, { (1 << 8) + (1 << 7), 1}, + { 1 << 9, 1 }, { (1 << 9) + (1 << 8), 1}, + { 1 << 10, 1 }, { (1 << 10) + (1 << 9), 1}, + { 1 << 11, 1 }, { (1 << 11) + (1 << 10), 1}, + { 1 << 12, 0 }, { (1 << 12) + (1 << 11), 1}, + { 1 << 13, 1 }, { (1 << 13) + (1 << 12), 1}, + { 1 << 14, 1 }, { (1 << 14) + (1 << 13), 1}, + { 1 << 15, 1 }, { (1 << 15) + (1 << 14), 1}, + { 1 << 16, 0 }, { (1 << 16) + (1 << 15), 1}, + { 1 << 17, 1 }, { (1 << 17) + (1 << 16), 1}, + { 1 << 18, 1 }, { (1 << 18) + (1 << 17), 1}, + { 1 << 19, 1 }, { (1 << 19) + (1 << 18), 1}, + { 1 << 20, 0 }, { (1 << 20) + (1 << 19), 1}, + { 1 << 21, 1 }, { (1 << 21) + (1 << 20), 1}, + { 1 << 22, 1 }, { (1 << 22) + (1 << 21), 1}, +}; +#define TEST_CNT (sizeof test_size / sizeof test_size[0]) + +static int rs, lrs; +static int use_async; +static int use_rgai; +static int verify; +static int flags = MSG_DONTWAIT; +static int poll_timeout = 0; +static int custom; +static int use_fork; +static pid_t fork_pid; +static enum rs_optimization optimization; +static int size_option; +static int iterations = 1; +static int transfer_size = 1000; +static int transfer_count = 1000; +static int buffer_size, inline_size = 64; +static char test_name[10] = "custom"; +static const char *port = "7471"; +static int keepalive; +static char *dst_addr; +static char *src_addr; +static struct timeval start, end; +static void *buf; +static struct rdma_addrinfo rai_hints; +static struct addrinfo ai_hints; + +static void show_perf(void) +{ + char str[32]; + float usec; + long long bytes; + + usec = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec); + bytes = (long long) iterations * transfer_count * transfer_size * 2; + + /* name size transfers iterations bytes seconds Gb/sec usec/xfer */ + printf("%-10s", test_name); + size_str(str, sizeof str, transfer_size); + 
printf("%-8s", str); + cnt_str(str, sizeof str, transfer_count); + printf("%-8s", str); + cnt_str(str, sizeof str, iterations); + printf("%-8s", str); + size_str(str, sizeof str, bytes); + printf("%-8s", str); + printf("%8.2fs%10.2f%11.2f\n", + usec / 1000000., (bytes * 8) / (1000. * usec), + (usec / iterations) / (transfer_count * 2)); +} + +static void init_latency_test(int size) +{ + char sstr[5]; + + size_str(sstr, sizeof sstr, size); + snprintf(test_name, sizeof test_name, "%s_lat", sstr); + transfer_count = 1; + transfer_size = size; + iterations = size_to_count(transfer_size); +} + +static void init_bandwidth_test(int size) +{ + char sstr[5]; + + size_str(sstr, sizeof sstr, size); + snprintf(test_name, sizeof test_name, "%s_bw", sstr); + iterations = 1; + transfer_size = size; + transfer_count = size_to_count(transfer_size); +} + +static int send_xfer(int size) +{ + struct pollfd fds; + int offset, ret; + + if (verify) + format_buf(buf, size); + + if (use_async) { + fds.fd = rs; + fds.events = POLLOUT; + } + + for (offset = 0; offset < size; ) { + if (use_async) { + ret = do_poll(&fds, poll_timeout); + if (ret) + return ret; + } + + ret = rs_send(rs, buf + offset, size - offset, flags); + if (ret > 0) { + offset += ret; + } else if (errno != EWOULDBLOCK && errno != EAGAIN) { + perror("rsend"); + return ret; + } + } + + return 0; +} + +static int recv_xfer(int size) +{ + struct pollfd fds; + int offset, ret; + + if (use_async) { + fds.fd = rs; + fds.events = POLLIN; + } + + for (offset = 0; offset < size; ) { + if (use_async) { + ret = do_poll(&fds, poll_timeout); + if (ret) + return ret; + } + + ret = rs_recv(rs, buf + offset, size - offset, flags); + if (ret > 0) { + offset += ret; + } else if (errno != EWOULDBLOCK && errno != EAGAIN) { + perror("rrecv"); + return ret; + } + } + + if (verify) { + ret = verify_buf(buf, size); + if (ret) + return ret; + } + + return 0; +} + +static int sync_test(void) +{ + int ret; + + ret = dst_addr ? send_xfer(16) : recv_xfer(16); + if (ret) + return ret; + + return dst_addr ? recv_xfer(16) : send_xfer(16); +} + +static int run_test(void) +{ + int ret, i, t; + + ret = sync_test(); + if (ret) + goto out; + + gettimeofday(&start, NULL); + for (i = 0; i < iterations; i++) { + for (t = 0; t < transfer_count; t++) { + ret = dst_addr ? send_xfer(transfer_size) : + recv_xfer(transfer_size); + if (ret) + goto out; + } + + for (t = 0; t < transfer_count; t++) { + ret = dst_addr ? recv_xfer(transfer_size) : + send_xfer(transfer_size); + if (ret) + goto out; + } + } + gettimeofday(&end, NULL); + show_perf(); + ret = 0; + +out: + return ret; +} + +static void set_keepalive(int fd) +{ + int optval; + socklen_t optlen = sizeof(optlen); + + optval = 1; + if (rs_setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &optval, optlen)) { + perror("rsetsockopt SO_KEEPALIVE"); + return; + } + + optval = keepalive; + if (rs_setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &optval, optlen)) + perror("rsetsockopt TCP_KEEPIDLE"); + + if (!(rs_getsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &optval, &optlen))) + printf("Keepalive: %s\n", (optval ? 
"ON" : "OFF")); + + if (!(rs_getsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &optval, &optlen))) + printf(" time: %i\n", optval); +} + +static void set_options(int fd) +{ + int val; + + if (buffer_size) { + rs_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *) &buffer_size, + sizeof buffer_size); + rs_setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *) &buffer_size, + sizeof buffer_size); + } else { + val = 1 << 19; + rs_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *) &val, sizeof val); + rs_setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *) &val, sizeof val); + } + + val = 1; + rs_setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (void *) &val, sizeof(val)); + + if (flags & MSG_DONTWAIT) + rs_fcntl(fd, F_SETFL, O_NONBLOCK); + + if (use_rs) { + /* Inline size based on experimental data */ + if (optimization == opt_latency) { + rs_setsockopt(fd, SOL_RDMA, RDMA_INLINE, &inline_size, + sizeof inline_size); + } else if (optimization == opt_bandwidth) { + val = 0; + rs_setsockopt(fd, SOL_RDMA, RDMA_INLINE, &val, sizeof val); + } + } + + if (keepalive) + set_keepalive(fd); +} + +static int server_listen(void) +{ + struct rdma_addrinfo *rai = NULL; + struct addrinfo *ai; + int val, ret; + + if (use_rgai) { + rai_hints.ai_flags |= RAI_PASSIVE; + ret = rdma_getaddrinfo(src_addr, port, &rai_hints, &rai); + } else { + ai_hints.ai_flags |= AI_PASSIVE; + ret = getaddrinfo(src_addr, port, &ai_hints, &ai); + } + if (ret) { + printf("getaddrinfo: %s\n", gai_strerror(ret)); + return ret; + } + + lrs = rai ? rs_socket(rai->ai_family, SOCK_STREAM, 0) : + rs_socket(ai->ai_family, SOCK_STREAM, 0); + if (lrs < 0) { + ret = lrs; + goto free; + } + + val = 1; + ret = rs_setsockopt(lrs, SOL_SOCKET, SO_REUSEADDR, &val, sizeof val); + if (ret) { + perror("rsetsockopt SO_REUSEADDR"); + goto close; + } + + ret = rai ? rs_bind(lrs, rai->ai_src_addr, rai->ai_src_len) : + rs_bind(lrs, ai->ai_addr, ai->ai_addrlen); + if (ret) { + perror("rbind"); + goto close; + } + + ret = rs_listen(lrs, 1); + if (ret) + perror("rlisten"); + +close: + if (ret) + rs_close(lrs); +free: + if (rai) + rdma_freeaddrinfo(rai); + else + freeaddrinfo(ai); + return ret; +} + +static int server_connect(void) +{ + struct pollfd fds; + int ret = 0; + + set_options(lrs); + do { + if (use_async) { + fds.fd = lrs; + fds.events = POLLIN; + + ret = do_poll(&fds, poll_timeout); + if (ret) { + perror("rpoll"); + return ret; + } + } + + rs = rs_accept(lrs, NULL, NULL); + } while (rs < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)); + if (rs < 0) { + perror("raccept"); + return rs; + } + + if (use_fork) + fork_pid = fork(); + if (!fork_pid) + set_options(rs); + return ret; +} + +static int client_connect(void) +{ + struct rdma_addrinfo *rai = NULL, *rai_src = NULL; + struct addrinfo *ai = NULL, *ai_src = NULL; + struct pollfd fds; + int ret, err; + socklen_t len; + + ret = use_rgai ? rdma_getaddrinfo(dst_addr, port, &rai_hints, &rai) : + getaddrinfo(dst_addr, port, &ai_hints, &ai); + + if (ret) { + printf("getaddrinfo: %s\n", gai_strerror(ret)); + return ret; + } + + if (src_addr) { + if (use_rgai) { + rai_hints.ai_flags |= RAI_PASSIVE; + ret = rdma_getaddrinfo(src_addr, port, &rai_hints, &rai_src); + } else { + ai_hints.ai_flags |= AI_PASSIVE; + ret = getaddrinfo(src_addr, port, &ai_hints, &ai_src); + } + if (ret) { + printf("getaddrinfo src_addr: %s\n", gai_strerror(ret)); + goto free; + } + } + + rs = rai ? 
rs_socket(rai->ai_family, SOCK_STREAM, 0) : + rs_socket(ai->ai_family, SOCK_STREAM, 0); + if (rs < 0) { + ret = rs; + goto free; + } + + set_options(rs); + + if (src_addr) { + ret = rai ? rs_bind(rs, rai_src->ai_src_addr, rai_src->ai_src_len) : + rs_bind(rs, ai_src->ai_addr, ai_src->ai_addrlen); + if (ret) { + perror("rbind"); + goto close; + } + } + + if (rai && rai->ai_route) { + ret = rs_setsockopt(rs, SOL_RDMA, RDMA_ROUTE, rai->ai_route, + rai->ai_route_len); + if (ret) { + perror("rsetsockopt RDMA_ROUTE"); + goto close; + } + } + + ret = rai ? rs_connect(rs, rai->ai_dst_addr, rai->ai_dst_len) : + rs_connect(rs, ai->ai_addr, ai->ai_addrlen); + if (ret && (errno != EINPROGRESS)) { + perror("rconnect"); + goto close; + } + + if (ret && (errno == EINPROGRESS)) { + fds.fd = rs; + fds.events = POLLOUT; + ret = do_poll(&fds, poll_timeout); + if (ret) { + perror("rpoll"); + goto close; + } + + len = sizeof err; + ret = rs_getsockopt(rs, SOL_SOCKET, SO_ERROR, &err, &len); + if (ret) + goto close; + if (err) { + ret = -1; + errno = err; + perror("async rconnect"); + } + } + +close: + if (ret) + rs_close(rs); +free: + if (rai) + rdma_freeaddrinfo(rai); + if (ai) + freeaddrinfo(ai); + if (rai_src) + rdma_freeaddrinfo(rai_src); + if (ai_src) + freeaddrinfo(ai_src); + return ret; +} + +static int run(void) +{ + int i, ret = 0; + + buf = malloc(!custom ? test_size[TEST_CNT - 1].size : transfer_size); + if (!buf) { + perror("malloc"); + return -1; + } + + if (!dst_addr) { + ret = server_listen(); + if (ret) + goto free; + } + + printf("%-10s%-8s%-8s%-8s%-8s%8s %10s%13s\n", + "name", "bytes", "xfers", "iters", "total", "time", "Gb/sec", "usec/xfer"); + if (!custom) { + optimization = opt_latency; + ret = dst_addr ? client_connect() : server_connect(); + if (ret) + goto free; + + for (i = 0; i < TEST_CNT && !fork_pid; i++) { + if (test_size[i].option > size_option) + continue; + init_latency_test(test_size[i].size); + run_test(); + } + if (fork_pid) + waitpid(fork_pid, NULL, 0); + else + rs_shutdown(rs, SHUT_RDWR); + rs_close(rs); + + if (!dst_addr && use_fork && !fork_pid) + goto free; + + optimization = opt_bandwidth; + ret = dst_addr ? client_connect() : server_connect(); + if (ret) + goto free; + for (i = 0; i < TEST_CNT && !fork_pid; i++) { + if (test_size[i].option > size_option) + continue; + init_bandwidth_test(test_size[i].size); + run_test(); + } + } else { + ret = dst_addr ? 
client_connect() : server_connect(); + if (ret) + goto free; + + if (!fork_pid) + ret = run_test(); + } + + if (fork_pid) + waitpid(fork_pid, NULL, 0); + else + rs_shutdown(rs, SHUT_RDWR); + rs_close(rs); +free: + free(buf); + return ret; +} + +static int set_test_opt(const char *arg) +{ + if (strlen(arg) == 1) { + switch (arg[0]) { + case 's': + use_rs = 0; + break; + case 'a': + use_async = 1; + break; + case 'b': + flags = (flags & ~MSG_DONTWAIT) | MSG_WAITALL; + break; + case 'f': + use_fork = 1; + use_rs = 0; + break; + case 'n': + flags |= MSG_DONTWAIT; + break; + case 'r': + use_rgai = 1; + break; + case 'v': + verify = 1; + break; + default: + return -1; + } + } else { + if (!strncasecmp("socket", arg, 6)) { + use_rs = 0; + } else if (!strncasecmp("async", arg, 5)) { + use_async = 1; + } else if (!strncasecmp("block", arg, 5)) { + flags = (flags & ~MSG_DONTWAIT) | MSG_WAITALL; + } else if (!strncasecmp("nonblock", arg, 8)) { + flags |= MSG_DONTWAIT; + } else if (!strncasecmp("resolve", arg, 7)) { + use_rgai = 1; + } else if (!strncasecmp("verify", arg, 6)) { + verify = 1; + } else if (!strncasecmp("fork", arg, 4)) { + use_fork = 1; + use_rs = 0; + } else { + return -1; + } + } + return 0; +} + +int main(int argc, char **argv) +{ + int op, ret; + + ai_hints.ai_socktype = SOCK_STREAM; + rai_hints.ai_port_space = RDMA_PS_TCP; + while ((op = getopt(argc, argv, "s:b:f:B:i:I:C:S:p:k:T:")) != -1) { + switch (op) { + case 's': + dst_addr = optarg; + break; + case 'b': + src_addr = optarg; + break; + case 'f': + if (!strncasecmp("ip", optarg, 2)) { + ai_hints.ai_flags = AI_NUMERICHOST; + } else if (!strncasecmp("gid", optarg, 3)) { + rai_hints.ai_flags = RAI_NUMERICHOST | RAI_FAMILY; + rai_hints.ai_family = AF_IB; + use_rgai = 1; + } else { + fprintf(stderr, "Warning: unknown address format\n"); + } + break; + case 'B': + buffer_size = atoi(optarg); + break; + case 'i': + inline_size = atoi(optarg); + break; + case 'I': + custom = 1; + iterations = atoi(optarg); + break; + case 'C': + custom = 1; + transfer_count = atoi(optarg); + break; + case 'S': + if (!strncasecmp("all", optarg, 3)) { + size_option = 1; + } else { + custom = 1; + transfer_size = atoi(optarg); + } + break; + case 'p': + port = optarg; + break; + case 'k': + keepalive = atoi(optarg); + break; + case 'T': + if (!set_test_opt(optarg)) + break; + /* invalid option - fall through */ + SWITCH_FALLTHROUGH; + default: + printf("usage: %s\n", argv[0]); + printf("\t[-s server_address]\n"); + printf("\t[-b bind_address]\n"); + printf("\t[-f address_format]\n"); + printf("\t name, ip, ipv6, or gid\n"); + printf("\t[-B buffer_size]\n"); + printf("\t[-i inline_size]\n"); + printf("\t[-I iterations]\n"); + printf("\t[-C transfer_count]\n"); + printf("\t[-S transfer_size or all]\n"); + printf("\t[-p port_number]\n"); + printf("\t[-k keepalive_time]\n"); + printf("\t[-T test_option]\n"); + printf("\t s|sockets - use standard tcp/ip sockets\n"); + printf("\t a|async - asynchronous operation (use poll)\n"); + printf("\t b|blocking - use blocking calls\n"); + printf("\t f|fork - fork server processing\n"); + printf("\t n|nonblocking - use nonblocking calls\n"); + printf("\t r|resolve - use rdma cm to resolve address\n"); + printf("\t v|verify - verify data\n"); + exit(1); + } + } + + if (!(flags & MSG_DONTWAIT)) + poll_timeout = -1; + + ret = run(); + return ret; +} diff --git a/librdmacm/examples/udaddy.c b/librdmacm/examples/udaddy.c new file mode 100644 index 0000000..9283caa --- /dev/null +++ b/librdmacm/examples/udaddy.c @@ -0,0 +1,692 
@@ +/* + * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id$ + */ + +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <errno.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netdb.h> +#include <getopt.h> + +#include <rdma/rdma_cma.h> +#include "common.h" + +struct cmatest_node { + int id; + struct rdma_cm_id *cma_id; + int connected; + struct ibv_pd *pd; + struct ibv_cq *cq; + struct ibv_mr *mr; + struct ibv_ah *ah; + uint32_t remote_qpn; + uint32_t remote_qkey; + void *mem; +}; + +struct cmatest { + struct rdma_event_channel *channel; + struct cmatest_node *nodes; + int conn_index; + int connects_left; + + struct rdma_addrinfo *rai; +}; + +static struct cmatest test; +static int connections = 1; +static int message_size = 100; +static int message_count = 10; +static const char *port = "7174"; +static uint8_t set_tos = 0; +static uint8_t tos; +static char *dst_addr; +static char *src_addr; +static struct rdma_addrinfo hints; + +static int create_message(struct cmatest_node *node) +{ + if (!message_size) + message_count = 0; + + if (!message_count) + return 0; + + node->mem = malloc(message_size + sizeof(struct ibv_grh)); + if (!node->mem) { + printf("failed message allocation\n"); + return -1; + } + node->mr = ibv_reg_mr(node->pd, node->mem, + message_size + sizeof(struct ibv_grh), + IBV_ACCESS_LOCAL_WRITE); + if (!node->mr) { + printf("failed to reg MR\n"); + goto err; + } + return 0; +err: + free(node->mem); + return -1; +} + +static int verify_test_params(struct cmatest_node *node) +{ + struct ibv_port_attr port_attr; + int ret; + + ret = ibv_query_port(node->cma_id->verbs, node->cma_id->port_num, + &port_attr); + if (ret) + return ret; + + if (message_count && message_size > (1 << (port_attr.active_mtu + 7))) { + printf("udaddy: message_size %d is larger than active mtu %d\n", + message_size, 1 << (port_attr.active_mtu + 7)); + return -EINVAL; + } + + return 0; +} + +static int init_node(struct cmatest_node *node) +{ + struct ibv_qp_init_attr init_qp_attr; + int cqe, ret; + + node->pd = ibv_alloc_pd(node->cma_id->verbs); + if (!node->pd) { + ret = -ENOMEM; + printf("udaddy: unable to allocate PD\n"); + goto out; + } + + 
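/* One send and one receive completion may be outstanding per message; use a minimum depth of 2 when no data messages will be exchanged. */ +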
cqe = message_count ? message_count * 2 : 2; + node->cq = ibv_create_cq(node->cma_id->verbs, cqe, node, NULL, 0); + if (!node->cq) { + ret = -ENOMEM; + printf("udaddy: unable to create CQ\n"); + goto out; + } + + memset(&init_qp_attr, 0, sizeof init_qp_attr); + init_qp_attr.cap.max_send_wr = message_count ? message_count : 1; + init_qp_attr.cap.max_recv_wr = message_count ? message_count : 1; + init_qp_attr.cap.max_send_sge = 1; + init_qp_attr.cap.max_recv_sge = 1; + init_qp_attr.qp_context = node; + init_qp_attr.sq_sig_all = 0; + init_qp_attr.qp_type = IBV_QPT_UD; + init_qp_attr.send_cq = node->cq; + init_qp_attr.recv_cq = node->cq; + ret = rdma_create_qp(node->cma_id, node->pd, &init_qp_attr); + if (ret) { + perror("udaddy: unable to create QP"); + goto out; + } + + ret = create_message(node); + if (ret) { + printf("udaddy: failed to create messages: %d\n", ret); + goto out; + } +out: + return ret; +} + +static int post_recvs(struct cmatest_node *node) +{ + struct ibv_recv_wr recv_wr, *recv_failure; + struct ibv_sge sge; + int i, ret = 0; + + if (!message_count) + return 0; + + recv_wr.next = NULL; + recv_wr.sg_list = &sge; + recv_wr.num_sge = 1; + recv_wr.wr_id = (uintptr_t) node; + + sge.length = message_size + sizeof(struct ibv_grh); + sge.lkey = node->mr->lkey; + sge.addr = (uintptr_t) node->mem; + + for (i = 0; i < message_count && !ret; i++ ) { + ret = ibv_post_recv(node->cma_id->qp, &recv_wr, &recv_failure); + if (ret) { + printf("failed to post receives: %d\n", ret); + break; + } + } + return ret; +} + +static int post_sends(struct cmatest_node *node, int signal_flag) +{ + struct ibv_send_wr send_wr, *bad_send_wr; + struct ibv_sge sge; + int i, ret = 0; + + if (!node->connected || !message_count) + return 0; + + send_wr.next = NULL; + send_wr.sg_list = &sge; + send_wr.num_sge = 1; + send_wr.opcode = IBV_WR_SEND_WITH_IMM; + send_wr.send_flags = signal_flag; + send_wr.wr_id = (unsigned long)node; + send_wr.imm_data = htobe32(node->cma_id->qp->qp_num); + + send_wr.wr.ud.ah = node->ah; + send_wr.wr.ud.remote_qpn = node->remote_qpn; + send_wr.wr.ud.remote_qkey = node->remote_qkey; + + sge.length = message_size; + sge.lkey = node->mr->lkey; + sge.addr = (uintptr_t) node->mem; + + for (i = 0; i < message_count && !ret; i++) { + ret = ibv_post_send(node->cma_id->qp, &send_wr, &bad_send_wr); + if (ret) + printf("failed to post sends: %d\n", ret); + } + return ret; +} + +static void connect_error(void) +{ + test.connects_left--; +} + +static int addr_handler(struct cmatest_node *node) +{ + int ret; + + if (set_tos) { + ret = rdma_set_option(node->cma_id, RDMA_OPTION_ID, + RDMA_OPTION_ID_TOS, &tos, sizeof tos); + if (ret) + perror("udaddy: set TOS option failed"); + } + + ret = rdma_resolve_route(node->cma_id, 2000); + if (ret) { + perror("udaddy: resolve route failed"); + connect_error(); + } + return ret; +} + +static int route_handler(struct cmatest_node *node) +{ + struct rdma_conn_param conn_param; + int ret; + + ret = verify_test_params(node); + if (ret) + goto err; + + ret = init_node(node); + if (ret) + goto err; + + ret = post_recvs(node); + if (ret) + goto err; + + memset(&conn_param, 0, sizeof conn_param); + conn_param.private_data = test.rai->ai_connect; + conn_param.private_data_len = test.rai->ai_connect_len; + ret = rdma_connect(node->cma_id, &conn_param); + if (ret) { + perror("udaddy: failure connecting"); + goto err; + } + return 0; +err: + connect_error(); + return ret; +} + +static int connect_handler(struct rdma_cm_id *cma_id) +{ + struct cmatest_node *node; + struct 
rdma_conn_param conn_param; + int ret; + + if (test.conn_index == connections) { + ret = -ENOMEM; + goto err1; + } + node = &test.nodes[test.conn_index++]; + + node->cma_id = cma_id; + cma_id->context = node; + + ret = verify_test_params(node); + if (ret) + goto err2; + + ret = init_node(node); + if (ret) + goto err2; + + ret = post_recvs(node); + if (ret) + goto err2; + + memset(&conn_param, 0, sizeof conn_param); + conn_param.qp_num = node->cma_id->qp->qp_num; + ret = rdma_accept(node->cma_id, &conn_param); + if (ret) { + perror("udaddy: failure accepting"); + goto err2; + } + node->connected = 1; + test.connects_left--; + return 0; + +err2: + node->cma_id = NULL; + connect_error(); +err1: + printf("udaddy: failing connection request\n"); + rdma_reject(cma_id, NULL, 0); + return ret; +} + +static int resolved_handler(struct cmatest_node *node, + struct rdma_cm_event *event) +{ + node->remote_qpn = event->param.ud.qp_num; + node->remote_qkey = event->param.ud.qkey; + node->ah = ibv_create_ah(node->pd, &event->param.ud.ah_attr); + if (!node->ah) { + printf("udaddy: failure creating address handle\n"); + goto err; + } + + node->connected = 1; + test.connects_left--; + return 0; +err: + connect_error(); + return -1; +} + +static int cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) +{ + int ret = 0; + + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + ret = addr_handler(cma_id->context); + break; + case RDMA_CM_EVENT_ROUTE_RESOLVED: + ret = route_handler(cma_id->context); + break; + case RDMA_CM_EVENT_CONNECT_REQUEST: + ret = connect_handler(cma_id); + break; + case RDMA_CM_EVENT_ESTABLISHED: + ret = resolved_handler(cma_id->context, event); + break; + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_REJECTED: + printf("udaddy: event: %s, error: %d\n", + rdma_event_str(event->event), event->status); + connect_error(); + ret = event->status; + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + /* Cleanup will occur after test completes. 
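Per-node resources are released by destroy_nodes() from main(). 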
*/ + break; + default: + break; + } + return ret; +} + +static void destroy_node(struct cmatest_node *node) +{ + if (!node->cma_id) + return; + + if (node->ah) + ibv_destroy_ah(node->ah); + + if (node->cma_id->qp) + rdma_destroy_qp(node->cma_id); + + if (node->cq) + ibv_destroy_cq(node->cq); + + if (node->mem) { + ibv_dereg_mr(node->mr); + free(node->mem); + } + + if (node->pd) + ibv_dealloc_pd(node->pd); + + /* Destroy the RDMA ID after all device resources */ + rdma_destroy_id(node->cma_id); +} + +static int alloc_nodes(void) +{ + int ret, i; + + test.nodes = malloc(sizeof *test.nodes * connections); + if (!test.nodes) { + printf("udaddy: unable to allocate memory for test nodes\n"); + return -ENOMEM; + } + memset(test.nodes, 0, sizeof *test.nodes * connections); + + for (i = 0; i < connections; i++) { + test.nodes[i].id = i; + if (dst_addr) { + ret = rdma_create_id(test.channel, + &test.nodes[i].cma_id, + &test.nodes[i], hints.ai_port_space); + if (ret) + goto err; + } + } + return 0; +err: + while (--i >= 0) + rdma_destroy_id(test.nodes[i].cma_id); + free(test.nodes); + return ret; +} + +static void destroy_nodes(void) +{ + int i; + + for (i = 0; i < connections; i++) + destroy_node(&test.nodes[i]); + free(test.nodes); +} + +static void create_reply_ah(struct cmatest_node *node, struct ibv_wc *wc) +{ + struct ibv_qp_attr attr; + struct ibv_qp_init_attr init_attr; + + node->ah = ibv_create_ah_from_wc(node->pd, wc, node->mem, + node->cma_id->port_num); + node->remote_qpn = be32toh(wc->imm_data); + + ibv_query_qp(node->cma_id->qp, &attr, IBV_QP_QKEY, &init_attr); + node->remote_qkey = attr.qkey; +} + +static int poll_cqs(void) +{ + struct ibv_wc wc[8]; + int done, i, ret; + + for (i = 0; i < connections; i++) { + if (!test.nodes[i].connected) + continue; + + for (done = 0; done < message_count; done += ret) { + ret = ibv_poll_cq(test.nodes[i].cq, 8, wc); + if (ret < 0) { + printf("udaddy: failed polling CQ: %d\n", ret); + return ret; + } + + if (ret && !test.nodes[i].ah) + create_reply_ah(&test.nodes[i], wc); + } + } + return 0; +} + +static int connect_events(void) +{ + struct rdma_cm_event *event; + int ret = 0; + + while (test.connects_left && !ret) { + ret = rdma_get_cm_event(test.channel, &event); + if (!ret) { + ret = cma_handler(event->id, event); + rdma_ack_cm_event(event); + } + } + return ret; +} + +static int run_server(void) +{ + struct rdma_cm_id *listen_id; + int i, ret; + + printf("udaddy: starting server\n"); + ret = rdma_create_id(test.channel, &listen_id, &test, hints.ai_port_space); + if (ret) { + perror("udaddy: listen request failed"); + return ret; + } + + ret = get_rdma_addr(src_addr, dst_addr, port, &hints, &test.rai); + if (ret) { + printf("udaddy: getrdmaaddr error: %s\n", gai_strerror(ret)); + goto out; + } + + ret = rdma_bind_addr(listen_id, test.rai->ai_src_addr); + if (ret) { + perror("udaddy: bind address failed"); + goto out; + } + + ret = rdma_listen(listen_id, 0); + if (ret) { + perror("udaddy: failure trying to listen"); + goto out; + } + + connect_events(); + + if (message_count) { + printf("receiving data transfers\n"); + ret = poll_cqs(); + if (ret) + goto out; + + printf("sending replies\n"); + for (i = 0; i < connections; i++) { + ret = post_sends(&test.nodes[i], IBV_SEND_SIGNALED); + if (ret) + goto out; + } + + ret = poll_cqs(); + if (ret) + goto out; + printf("data transfers complete\n"); + } +out: + rdma_destroy_id(listen_id); + return ret; +} + +static int run_client(void) +{ + int i, ret; + + printf("udaddy: starting client\n"); + + ret = 
get_rdma_addr(src_addr, dst_addr, port, &hints, &test.rai); + if (ret) { + printf("udaddy: getaddrinfo error: %s\n", gai_strerror(ret)); + return ret; + } + + printf("udaddy: connecting\n"); + for (i = 0; i < connections; i++) { + ret = rdma_resolve_addr(test.nodes[i].cma_id, test.rai->ai_src_addr, + test.rai->ai_dst_addr, 2000); + if (ret) { + perror("udaddy: failure getting addr"); + connect_error(); + return ret; + } + } + + ret = connect_events(); + if (ret) + goto out; + + if (message_count) { + printf("initiating data transfers\n"); + for (i = 0; i < connections; i++) { + ret = post_sends(&test.nodes[i], 0); + if (ret) + goto out; + } + printf("receiving data transfers\n"); + ret = poll_cqs(); + if (ret) + goto out; + + printf("data transfers complete\n"); + } +out: + return ret; +} + +int main(int argc, char **argv) +{ + int op, ret; + + hints.ai_port_space = RDMA_PS_UDP; + while ((op = getopt(argc, argv, "s:b:c:C:S:t:p:P:f:")) != -1) { + switch (op) { + case 's': + dst_addr = optarg; + break; + case 'b': + src_addr = optarg; + break; + case 'c': + connections = atoi(optarg); + break; + case 'C': + message_count = atoi(optarg); + break; + case 'S': + message_size = atoi(optarg); + break; + case 't': + set_tos = 1; + tos = (uint8_t) strtoul(optarg, NULL, 0); + break; + case 'p': /* for backwards compatibility - use -P */ + hints.ai_port_space = strtol(optarg, NULL, 0); + break; + case 'f': + if (!strncasecmp("ip", optarg, 2)) { + hints.ai_flags = RAI_NUMERICHOST; + } else if (!strncasecmp("gid", optarg, 3)) { + hints.ai_flags = RAI_NUMERICHOST | RAI_FAMILY; + hints.ai_family = AF_IB; + } else if (strncasecmp("name", optarg, 4)) { + fprintf(stderr, "Warning: unknown address format\n"); + } + break; + case 'P': + if (!strncasecmp("ipoib", optarg, 5)) { + hints.ai_port_space = RDMA_PS_IPOIB; + } else if (strncasecmp("udp", optarg, 3)) { + fprintf(stderr, "Warning: unknown port space format\n"); + } + break; + default: + printf("usage: %s\n", argv[0]); + printf("\t[-s server_address]\n"); + printf("\t[-b bind_address]\n"); + printf("\t[-f address_format]\n"); + printf("\t name, ip, ipv6, or gid\n"); + printf("\t[-P port_space]\n"); + printf("\t udp or ipoib\n"); + printf("\t[-c connections]\n"); + printf("\t[-C message_count]\n"); + printf("\t[-S message_size]\n"); + printf("\t[-t type_of_service]\n"); + printf("\t[-p port_space - %#x for UDP (default), " + "%#x for IPOIB]\n", RDMA_PS_UDP, RDMA_PS_IPOIB); + exit(1); + } + } + + test.connects_left = connections; + + test.channel = create_first_event_channel(); + if (!test.channel) { + exit(1); + } + + if (alloc_nodes()) + exit(1); + + if (dst_addr) { + ret = run_client(); + } else { + hints.ai_flags |= RAI_PASSIVE; + ret = run_server(); + } + + printf("test complete\n"); + destroy_nodes(); + rdma_destroy_event_channel(test.channel); + if (test.rai) + rdma_freeaddrinfo(test.rai); + + printf("return status %d\n", ret); + return ret; +} diff --git a/librdmacm/examples/udpong.c b/librdmacm/examples/udpong.c new file mode 100644 index 0000000..bee7c54 --- /dev/null +++ b/librdmacm/examples/udpong.c @@ -0,0 +1,571 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. 
+ * + * This software is available to you under the OpenIB.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <errno.h> +#include <getopt.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/wait.h> +#include <netdb.h> +#include <fcntl.h> +#include <unistd.h> +#include <netinet/tcp.h> +#include <arpa/inet.h> + +#include <rdma/rdma_cma.h> +#include <rdma/rsocket.h> +#include <util/compiler.h> +#include "common.h" + +static int test_size[] = { + (1 << 6), + (1 << 7), ((1 << 7) + (1 << 6)), + (1 << 8), ((1 << 8) + (1 << 7)), + (1 << 9), ((1 << 9) + (1 << 8)), + (1 << 10), ((1 << 10) + (1 << 9)), +}; +#define TEST_CNT (sizeof test_size / sizeof test_size[0]) + +enum { + msg_op_login, + msg_op_start, + msg_op_data, + msg_op_echo, + msg_op_end +}; + +struct message { + uint8_t op; + uint8_t id; + uint8_t seqno; + uint8_t reserved; + __be32 data; + uint8_t buf[2048]; +}; + +#define CTRL_MSG_SIZE 16 + +struct client { + uint64_t recvcnt; +}; + +static struct client clients[256]; +static uint8_t id; + +static int rs; +static int use_async; +static int flags = MSG_DONTWAIT; +static int poll_timeout; +static int custom; +static int echo; +static int transfer_size = 1000; +static int transfer_count = 1000; +static int buffer_size; +static char test_name[10] = "custom"; +static const char *port = "7174"; +static char *dst_addr; +static char *src_addr; +static union socket_addr g_addr; +static socklen_t g_addrlen; +static struct timeval start, end; +static struct message g_msg; + +static void show_perf(void) +{ + char str[32]; + float usec; + long long bytes; + int transfers; + + usec = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec); + transfers = echo ? transfer_count * 2 : be32toh(g_msg.data); + bytes = (long long) transfers * transfer_size; + + /* name size transfers bytes seconds Gb/sec usec/xfer */ + printf("%-10s", test_name); + size_str(str, sizeof str, transfer_size); + printf("%-8s", str); + cnt_str(str, sizeof str, transfers); + printf("%-8s", str); + size_str(str, sizeof str, bytes); + printf("%-8s", str); + printf("%8.2fs%10.2f%11.2f\n", + usec / 1000000., (bytes * 8) / (1000. 
* usec), + (usec / transfers)); +} + +static void init_latency_test(int size) +{ + char sstr[5]; + + size_str(sstr, sizeof sstr, size); + snprintf(test_name, sizeof test_name, "%s_lat", sstr); + transfer_size = size; + transfer_count = size_to_count(transfer_size) / 10; + echo = 1; +} + +static void init_bandwidth_test(int size) +{ + char sstr[5]; + + size_str(sstr, sizeof sstr, size); + snprintf(test_name, sizeof test_name, "%s_bw", sstr); + transfer_size = size; + transfer_count = size_to_count(transfer_size); + echo = 0; +} + +static void set_options(int fd) +{ + int val; + + if (buffer_size) { + rs_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *) &buffer_size, + sizeof buffer_size); + rs_setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *) &buffer_size, + sizeof buffer_size); + } else { + val = 1 << 19; + rs_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *) &val, sizeof val); + rs_setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *) &val, sizeof val); + } + + if (flags & MSG_DONTWAIT) + rs_fcntl(fd, F_SETFL, O_NONBLOCK); +} + +static ssize_t svr_send(struct message *msg, size_t size, + union socket_addr *addr, socklen_t addrlen) +{ + struct pollfd fds; + ssize_t ret; + + if (use_async) { + fds.fd = rs; + fds.events = POLLOUT; + } + + do { + if (use_async) { + ret = do_poll(&fds, poll_timeout); + if (ret) + return ret; + } + + ret = rs_sendto(rs, msg, size, flags, &addr->sa, addrlen); + } while (ret < 0 && (errno == EWOULDBLOCK || errno == EAGAIN)); + + if (ret < 0) + perror("rsend"); + + return ret; +} + +static ssize_t svr_recv(struct message *msg, size_t size, + union socket_addr *addr, socklen_t *addrlen) +{ + struct pollfd fds; + ssize_t ret; + + if (use_async) { + fds.fd = rs; + fds.events = POLLIN; + } + + do { + if (use_async) { + ret = do_poll(&fds, poll_timeout); + if (ret) + return ret; + } + + ret = rs_recvfrom(rs, msg, size, flags, &addr->sa, addrlen); + } while (ret < 0 && (errno == EWOULDBLOCK || errno == EAGAIN)); + + if (ret < 0) + perror("rrecv"); + + return ret; +} + +static int svr_process(struct message *msg, size_t size, + union socket_addr *addr, socklen_t addrlen) +{ + char str[64]; + ssize_t ret; + + switch (msg->op) { + case msg_op_login: + if (addr->sa.sa_family == AF_INET) { + printf("client login from %s\n", + inet_ntop(AF_INET, &addr->sin.sin_addr.s_addr, + str, sizeof str)); + } else { + printf("client login from %s\n", + inet_ntop(AF_INET6, &addr->sin6.sin6_addr.s6_addr, + str, sizeof str)); + } + msg->id = id++; + /* fall through */ + case msg_op_start: + memset(&clients[msg->id], 0, sizeof clients[msg->id]); + break; + case msg_op_echo: + clients[msg->id].recvcnt++; + break; + case msg_op_end: + msg->data = htobe32(clients[msg->id].recvcnt); + break; + default: + clients[msg->id].recvcnt++; + return 0; + } + + ret = svr_send(msg, size, addr, addrlen); + return (ret == size) ? 
0 : (int) ret; +} + +static int svr_bind(void) +{ + struct addrinfo hints, *res; + int ret; + + memset(&hints, 0, sizeof hints); + hints.ai_socktype = SOCK_DGRAM; + ret = getaddrinfo(src_addr, port, &hints, &res); + if (ret) { + printf("getaddrinfo: %s\n", gai_strerror(ret)); + return ret; + } + + rs = rs_socket(res->ai_family, res->ai_socktype, res->ai_protocol); + if (rs < 0) { + ret = rs; + goto out; + } + + set_options(rs); + ret = rs_bind(rs, res->ai_addr, res->ai_addrlen); + if (ret) { + perror("rbind"); + rs_close(rs); + } + +out: + freeaddrinfo(res); + return ret; +} + +static int svr_run(void) +{ + ssize_t len; + int ret; + + ret = svr_bind(); + while (!ret) { + g_addrlen = sizeof g_addr; + len = svr_recv(&g_msg, sizeof g_msg, &g_addr, &g_addrlen); + if (len < 0) + return len; + + ret = svr_process(&g_msg, len, &g_addr, g_addrlen); + } + return ret; +} + +static ssize_t client_send(struct message *msg, size_t size) +{ + struct pollfd fds; + int ret; + + if (use_async) { + fds.fd = rs; + fds.events = POLLOUT; + } + + do { + if (use_async) { + ret = do_poll(&fds, poll_timeout); + if (ret) + return ret; + } + + ret = rs_send(rs, msg, size, flags); + } while (ret < 0 && (errno == EWOULDBLOCK || errno == EAGAIN)); + + if (ret < 0) + perror("rsend"); + + return ret; +} + +static ssize_t client_recv(struct message *msg, size_t size, int timeout) +{ + struct pollfd fds; + int ret; + + if (timeout) { + fds.fd = rs; + fds.events = POLLIN; + + ret = rs_poll(&fds, 1, timeout); + if (ret <= 0) + return ret; + } + + ret = rs_recv(rs, msg, size, flags | MSG_DONTWAIT); + if (ret < 0 && errno != EWOULDBLOCK && errno != EAGAIN) + perror("rrecv"); + + return ret; +} + +static int client_send_recv(struct message *msg, size_t size, int timeout) +{ + static uint8_t seqno; + int ret; + + msg->seqno = seqno; + do { + ret = client_send(msg, size); + if (ret != size) + return ret; + + ret = client_recv(msg, size, timeout); + } while (ret <= 0 || msg->seqno != seqno); + + seqno++; + return ret; +} + +static int run_test(void) +{ + int ret, i; + + g_msg.op = msg_op_start; + ret = client_send_recv(&g_msg, CTRL_MSG_SIZE, 1000); + if (ret != CTRL_MSG_SIZE) + goto out; + + g_msg.op = echo ? msg_op_echo : msg_op_data; + gettimeofday(&start, NULL); + for (i = 0; i < transfer_count; i++) { + ret = echo ? 
client_send_recv(&g_msg, transfer_size, 1) : + client_send(&g_msg, transfer_size); + if (ret != transfer_size) + goto out; + } + + g_msg.op = msg_op_end; + ret = client_send_recv(&g_msg, CTRL_MSG_SIZE, 1); + if (ret != CTRL_MSG_SIZE) + goto out; + + gettimeofday(&end, NULL); + show_perf(); + ret = 0; + +out: + return ret; +} + +static int client_connect(void) +{ + struct addrinfo hints, *res; + int ret; + + memset(&hints, 0, sizeof hints); + hints.ai_socktype = SOCK_DGRAM; + ret = getaddrinfo(dst_addr, port, &hints, &res); + if (ret) { + printf("getaddrinfo: %s\n", gai_strerror(ret)); + return ret; + } + + rs = rs_socket(res->ai_family, res->ai_socktype, res->ai_protocol); + if (rs < 0) { + ret = rs; + goto out; + } + + set_options(rs); + ret = rs_connect(rs, res->ai_addr, res->ai_addrlen); + if (ret) { + if (errno == ENODEV) + fprintf(stderr, "No RDMA devices were detected\n"); + else + perror("rconnect"); + rs_close(rs); + goto out; + } + + g_msg.op = msg_op_login; + ret = client_send_recv(&g_msg, CTRL_MSG_SIZE, 1000); + if (ret == CTRL_MSG_SIZE) + ret = 0; + +out: + freeaddrinfo(res); + return ret; +} + +static int client_run(void) +{ + int i, ret; + + printf("%-10s%-8s%-8s%-8s%8s %10s%13s\n", + "name", "bytes", "xfers", "total", "time", "Gb/sec", "usec/xfer"); + + ret = client_connect(); + if (ret) + return ret; + + if (!custom) { + for (i = 0; i < TEST_CNT; i++) { + init_latency_test(test_size[i]); + run_test(); + } + for (i = 0; i < TEST_CNT; i++) { + init_bandwidth_test(test_size[i]); + run_test(); + } + } else { + run_test(); + } + rs_close(rs); + + return ret; +} + +static int set_test_opt(const char *arg) +{ + if (strlen(arg) == 1) { + switch (arg[0]) { + case 's': + use_rs = 0; + break; + case 'a': + use_async = 1; + break; + case 'b': + flags = 0; + break; + case 'n': + flags = MSG_DONTWAIT; + break; + case 'e': + echo = 1; + break; + default: + return -1; + } + } else { + if (!strncasecmp("socket", arg, 6)) { + use_rs = 0; + } else if (!strncasecmp("async", arg, 5)) { + use_async = 1; + } else if (!strncasecmp("block", arg, 5)) { + flags = 0; + } else if (!strncasecmp("nonblock", arg, 8)) { + flags = MSG_DONTWAIT; + } else if (!strncasecmp("echo", arg, 4)) { + echo = 1; + } else { + return -1; + } + } + return 0; +} + +int main(int argc, char **argv) +{ + int op, ret; + + while ((op = getopt(argc, argv, "s:b:B:C:S:p:T:")) != -1) { + switch (op) { + case 's': + dst_addr = optarg; + break; + case 'b': + src_addr = optarg; + break; + case 'B': + buffer_size = atoi(optarg); + break; + case 'C': + custom = 1; + transfer_count = atoi(optarg); + break; + case 'S': + custom = 1; + transfer_size = atoi(optarg); + if (transfer_size < CTRL_MSG_SIZE) { + printf("size must be at least %d bytes\n", + CTRL_MSG_SIZE); + exit(1); + } + break; + case 'p': + port = optarg; + break; + case 'T': + if (!set_test_opt(optarg)) + break; + /* invalid option - fall through */ + SWITCH_FALLTHROUGH; + default: + printf("usage: %s\n", argv[0]); + printf("\t[-s server_address]\n"); + printf("\t[-b bind_address]\n"); + printf("\t[-B buffer_size]\n"); + printf("\t[-C transfer_count]\n"); + printf("\t[-S transfer_size]\n"); + printf("\t[-p port_number]\n"); + printf("\t[-T test_option]\n"); + printf("\t s|sockets - use standard tcp/ip sockets\n"); + printf("\t a|async - asynchronous operation (use poll)\n"); + printf("\t b|blocking - use blocking calls\n"); + printf("\t n|nonblocking - use nonblocking calls\n"); + printf("\t e|echo - server echoes all messages\n"); + exit(1); + } + } + + if (flags) + 
poll_timeout = -1; + + ret = dst_addr ? client_run() : svr_run(); + return ret; +} diff --git a/librdmacm/ib.h b/librdmacm/ib.h new file mode 100644 index 0000000..875cddb --- /dev/null +++ b/librdmacm/ib.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2010 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(_RDMA_IB_H) +#define _RDMA_IB_H + +#include <linux/types.h> +#include <endian.h> +#include <string.h> + +#ifndef AF_IB +#define AF_IB 27 +#endif +#ifndef PF_IB +#define PF_IB AF_IB +#endif + +struct ib_addr { + union { + __u8 uib_addr8[16]; + __be16 uib_addr16[8]; + __be32 uib_addr32[4]; + __be64 uib_addr64[2]; + } ib_u; +#define sib_addr8 ib_u.uib_addr8 +#define sib_addr16 ib_u.uib_addr16 +#define sib_addr32 ib_u.uib_addr32 +#define sib_addr64 ib_u.uib_addr64 +#define sib_raw ib_u.uib_addr8 +#define sib_subnet_prefix ib_u.uib_addr64[0] +#define sib_interface_id ib_u.uib_addr64[1] +}; + +static inline int ib_addr_any(const struct ib_addr *a) +{ + return ((a->sib_addr64[0] | a->sib_addr64[1]) == 0); +} + +static inline int ib_addr_loopback(const struct ib_addr *a) +{ + return ((a->sib_addr32[0] | a->sib_addr32[1] | + a->sib_addr32[2] | (a->sib_addr32[3] ^ htobe32(1))) == 0); +} + +static inline void ib_addr_set(struct ib_addr *addr, + __be32 w1, __be32 w2, __be32 w3, __be32 w4) +{ + addr->sib_addr32[0] = w1; + addr->sib_addr32[1] = w2; + addr->sib_addr32[2] = w3; + addr->sib_addr32[3] = w4; +} + +static inline int ib_addr_cmp(const struct ib_addr *a1, const struct ib_addr *a2) +{ + return memcmp(a1, a2, sizeof(struct ib_addr)); +} + +struct sockaddr_ib { + unsigned short int sib_family; /* AF_IB */ + __be16 sib_pkey; + __be32 sib_flowinfo; + struct ib_addr sib_addr; + __be64 sib_sid; + __be64 sib_sid_mask; + __u64 sib_scope_id; +}; + +#endif /* _RDMA_IB_H */ diff --git a/librdmacm/indexer.c b/librdmacm/indexer.c new file mode 100644 index 0000000..00be7d0 --- /dev/null +++ b/librdmacm/indexer.c @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2011 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <config.h>
+
+#include <errno.h>
+#include <sys/types.h>
+#include <stdlib.h>
+
+#include "indexer.h"
+
+/*
+ * Indexer - to find a structure given an index
+ *
+ * We store pointers using a double lookup and return an index to the
+ * user which is then used to retrieve the pointer. The upper bits of
+ * the index are themselves an index into an array of memory allocations.
+ * The lower bits specify the offset into the allocated memory where
+ * the pointer is stored.
+ *
+ * This allows us to adjust the number of pointers stored by the index
+ * list without taking a lock during data lookups.
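+ *
+ * For example, with IDX_ENTRY_BITS = 10 (IDX_ENTRY_SIZE = 1024), a
+ * hypothetical index 0x2405 selects allocation 0x2405 >> 10 = 9 and
+ * slot 0x2405 & 0x3ff = 5 within that allocation.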
+ */ + +static int idx_grow(struct indexer *idx) +{ + union idx_entry *entry; + int i, start_index; + + if (idx->size >= IDX_ARRAY_SIZE) + goto nomem; + + idx->array[idx->size] = calloc(IDX_ENTRY_SIZE, sizeof(union idx_entry)); + if (!idx->array[idx->size]) + goto nomem; + + entry = idx->array[idx->size]; + start_index = idx->size << IDX_ENTRY_BITS; + entry[IDX_ENTRY_SIZE - 1].next = idx->free_list; + + for (i = IDX_ENTRY_SIZE - 2; i >= 0; i--) + entry[i].next = start_index + i + 1; + + /* Index 0 is reserved */ + if (start_index == 0) + start_index++; + idx->free_list = start_index; + idx->size++; + return start_index; + +nomem: + errno = ENOMEM; + return -1; +} + +int idx_insert(struct indexer *idx, void *item) +{ + union idx_entry *entry; + int index; + + if ((index = idx->free_list) == 0) { + if ((index = idx_grow(idx)) <= 0) + return index; + } + + entry = idx->array[idx_array_index(index)]; + idx->free_list = entry[idx_entry_index(index)].next; + entry[idx_entry_index(index)].item = item; + return index; +} + +void *idx_remove(struct indexer *idx, int index) +{ + union idx_entry *entry; + void *item; + + entry = idx->array[idx_array_index(index)]; + item = entry[idx_entry_index(index)].item; + entry[idx_entry_index(index)].next = idx->free_list; + idx->free_list = index; + return item; +} + +void idx_replace(struct indexer *idx, int index, void *item) +{ + union idx_entry *entry; + + entry = idx->array[idx_array_index(index)]; + entry[idx_entry_index(index)].item = item; +} + + +static int idm_grow(struct index_map *idm, int index) +{ + idm->array[idx_array_index(index)] = calloc(IDX_ENTRY_SIZE, sizeof(void *)); + if (!idm->array[idx_array_index(index)]) + goto nomem; + + return index; + +nomem: + errno = ENOMEM; + return -1; +} + +int idm_set(struct index_map *idm, int index, void *item) +{ + void **entry; + + if (index > IDX_MAX_INDEX) { + errno = ENOMEM; + return -1; + } + + if (!idm->array[idx_array_index(index)]) { + if (idm_grow(idm, index) < 0) + return -1; + } + + entry = idm->array[idx_array_index(index)]; + entry[idx_entry_index(index)] = item; + return index; +} + +void *idm_clear(struct index_map *idm, int index) +{ + void **entry; + void *item; + + entry = idm->array[idx_array_index(index)]; + item = entry[idx_entry_index(index)]; + entry[idx_entry_index(index)] = NULL; + return item; +} diff --git a/librdmacm/indexer.h b/librdmacm/indexer.h new file mode 100644 index 0000000..e6ffc60 --- /dev/null +++ b/librdmacm/indexer.h @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2011 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#if !defined(INDEXER_H) +#define INDEXER_H + +#include <config.h> +#include <stddef.h> +#include <sys/types.h> + +/* + * Indexer - to find a structure given an index. Synchronization + * must be provided by the caller. Caller must initialize the + * indexer by setting free_list and size to 0. + */ + +union idx_entry { + void *item; + int next; +}; + +#define IDX_INDEX_BITS 16 +#define IDX_ENTRY_BITS 10 +#define IDX_ENTRY_SIZE (1 << IDX_ENTRY_BITS) +#define IDX_ARRAY_SIZE (1 << (IDX_INDEX_BITS - IDX_ENTRY_BITS)) +#define IDX_MAX_INDEX ((1 << IDX_INDEX_BITS) - 1) + +struct indexer +{ + union idx_entry *array[IDX_ARRAY_SIZE]; + int free_list; + int size; +}; + +#define idx_array_index(index) (index >> IDX_ENTRY_BITS) +#define idx_entry_index(index) (index & (IDX_ENTRY_SIZE - 1)) + +int idx_insert(struct indexer *idx, void *item); +void *idx_remove(struct indexer *idx, int index); +void idx_replace(struct indexer *idx, int index, void *item); + +static inline void *idx_at(struct indexer *idx, int index) +{ + return (idx->array[idx_array_index(index)] + idx_entry_index(index))->item; +} + +/* + * Index map - associates a structure with an index. Synchronization + * must be provided by the caller. Caller must initialize the + * index map by setting it to 0. + */ + +struct index_map +{ + void **array[IDX_ARRAY_SIZE]; +}; + +int idm_set(struct index_map *idm, int index, void *item); +void *idm_clear(struct index_map *idm, int index); + +static inline void *idm_at(struct index_map *idm, int index) +{ + void **entry; + entry = idm->array[idx_array_index(index)]; + return entry[idx_entry_index(index)]; +} + +static inline void *idm_lookup(struct index_map *idm, int index) +{ + return ((index <= IDX_MAX_INDEX) && idm->array[idx_array_index(index)]) ? 
+ idm_at(idm, index) : NULL; +} + +typedef struct _dlist_entry { + struct _dlist_entry *next; + struct _dlist_entry *prev; +} dlist_entry; + +static inline void dlist_init(dlist_entry *head) +{ + head->next = head; + head->prev = head; +} + +static inline int dlist_empty(dlist_entry *head) +{ + return head->next == head; +} + +static inline void dlist_insert_after(dlist_entry *item, dlist_entry *head) +{ + item->next = head->next; + item->prev = head; + head->next->prev = item; + head->next = item; +} + +static inline void dlist_insert_before(dlist_entry *item, dlist_entry *head) +{ + dlist_insert_after(item, head->prev); +} + +#define dlist_insert_head dlist_insert_after +#define dlist_insert_tail dlist_insert_before + +static inline void dlist_remove(dlist_entry *item) +{ + item->prev->next = item->next; + item->next->prev = item->prev; +} + +#endif /* INDEXER_H */ diff --git a/librdmacm/librdmacm.map b/librdmacm/librdmacm.map new file mode 100644 index 0000000..7f55e84 --- /dev/null +++ b/librdmacm/librdmacm.map @@ -0,0 +1,84 @@ +/* Do not change this file without reading Documentation/versioning.md */ +RDMACM_1.0 { + global: + rdma_create_event_channel; + rdma_destroy_event_channel; + rdma_create_id; + rdma_destroy_id; + rdma_bind_addr; + rdma_resolve_addr; + rdma_resolve_route; + rdma_create_qp; + rdma_destroy_qp; + rdma_connect; + rdma_listen; + rdma_accept; + rdma_reject; + rdma_notify; + rdma_disconnect; + rdma_get_cm_event; + rdma_ack_cm_event; + rdma_get_src_port; + rdma_get_dst_port; + rdma_join_multicast; + rdma_leave_multicast; + rdma_get_devices; + rdma_free_devices; + rdma_event_str; + rdma_set_option; + rdma_get_local_addr; + rdma_get_peer_addr; + rdma_migrate_id; + rdma_getaddrinfo; + rdma_freeaddrinfo; + rdma_get_request; + rdma_create_ep; + rdma_destroy_ep; + rdma_create_srq; + rdma_destroy_srq; + rsocket; + rbind; + rlisten; + raccept; + rconnect; + rshutdown; + rclose; + rrecv; + rrecvfrom; + rrecvmsg; + rsend; + rsendto; + rsendmsg; + rread; + rreadv; + rwrite; + rwritev; + rpoll; + rselect; + rgetpeername; + rgetsockname; + rsetsockopt; + rgetsockopt; + rfcntl; + rpoll; + rselect; + rdma_get_src_port; + rdma_get_dst_port; + riomap; + riounmap; + riowrite; + rdma_create_srq_ex; + rdma_create_qp_ex; + local: *; +}; + +RDMACM_1.1 { + global: + rdma_join_multicast_ex; +} RDMACM_1.0; + +RDMACM_1.2 { + global: + rdma_establish; + rdma_init_qp_attr; +} RDMACM_1.1; diff --git a/librdmacm/librspreload.map b/librdmacm/librspreload.map new file mode 100644 index 0000000..67ecf33 --- /dev/null +++ b/librdmacm/librspreload.map @@ -0,0 +1,33 @@ +{ + /* FIXME: It is probably not a great idea to not tag these with the + proper symbol version from glibc, at least if glibc ever changes + the signature this will go sideways.. 
*/
+	global:
+		accept;
+		bind;
+		close;
+		connect;
+		dup2;
+		fcntl;
+		getpeername;
+		getsockname;
+		getsockopt;
+		listen;
+		poll;
+		read;
+		readv;
+		recv;
+		recvfrom;
+		recvmsg;
+		select;
+		send;
+		sendfile;
+		sendmsg;
+		sendto;
+		setsockopt;
+		shutdown;
+		socket;
+		write;
+		writev;
+	local: *;
+};
diff --git a/librdmacm/man/CMakeLists.txt b/librdmacm/man/CMakeLists.txt
new file mode 100644
index 0000000..2d1efbf
--- /dev/null
+++ b/librdmacm/man/CMakeLists.txt
@@ -0,0 +1,69 @@
+rdma_man_pages(
+  cmtime.1
+  mckey.1
+  rcopy.1
+  rdma_accept.3
+  rdma_ack_cm_event.3
+  rdma_bind_addr.3
+  rdma_client.1
+  rdma_cm.7
+  rdma_connect.3
+  rdma_create_ep.3
+  rdma_create_event_channel.3
+  rdma_create_id.3
+  rdma_create_qp.3
+  rdma_create_srq.3
+  rdma_dereg_mr.3
+  rdma_destroy_ep.3
+  rdma_destroy_event_channel.3
+  rdma_destroy_id.3
+  rdma_destroy_qp.3
+  rdma_destroy_srq.3
+  rdma_disconnect.3
+  rdma_establish.3.md
+  rdma_event_str.3
+  rdma_free_devices.3
+  rdma_get_cm_event.3
+  rdma_get_devices.3
+  rdma_get_dst_port.3
+  rdma_get_local_addr.3
+  rdma_get_peer_addr.3
+  rdma_get_recv_comp.3
+  rdma_get_request.3
+  rdma_get_send_comp.3
+  rdma_get_src_port.3
+  rdma_getaddrinfo.3
+  rdma_init_qp_attr.3.md
+  rdma_join_multicast.3
+  rdma_join_multicast_ex.3
+  rdma_leave_multicast.3
+  rdma_listen.3
+  rdma_migrate_id.3
+  rdma_notify.3
+  rdma_post_read.3
+  rdma_post_readv.3
+  rdma_post_recv.3
+  rdma_post_recvv.3
+  rdma_post_send.3
+  rdma_post_sendv.3
+  rdma_post_ud_send.3
+  rdma_post_write.3
+  rdma_post_writev.3
+  rdma_reg_msgs.3
+  rdma_reg_read.3
+  rdma_reg_write.3
+  rdma_reject.3
+  rdma_resolve_addr.3
+  rdma_resolve_route.3
+  rdma_server.1
+  rdma_set_option.3
+  rdma_xclient.1
+  rdma_xserver.1
+  riostream.1
+  rping.1
+  rsocket.7.in
+  rstream.1
+  ucmatose.1
+  udaddy.1
+  udpong.1
+  )
diff --git a/librdmacm/man/cmtime.1 b/librdmacm/man/cmtime.1
new file mode 100644
index 0000000..434373d
--- /dev/null
+++ b/librdmacm/man/cmtime.1
@@ -0,0 +1,50 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH "CMTIME" 1 "2017-04-28" "librdmacm" "librdmacm" librdmacm
+.SH NAME
+cmtime \- RDMA CM connection steps timing test.
+.SH SYNOPSIS
+.sp
+.nf
+\fIcmtime\fR [-s server_address] [-b bind_address]
+		[-c connections] [-p port_number]
+		[-r retries] [-t timeout_ms]
+.fi
+.SH "DESCRIPTION"
+Determines min and max times for various "steps" in RDMA CM
+connection setup and teardown between a client and server
+application.
+
+"Steps" that are timed are: create id, bind address, resolve address,
+resolve route, create qp, connect, disconnect, and destroy.
+.SH "OPTIONS"
+.TP
+\-s server_address
+The network name or IP address of the server system listening for
+connections. The name or address used must route over an RDMA device.
+This option must be specified by the client.
+.TP
+\-b bind_address
+The local network address to bind to.
+.TP
+\-c connections
+The number of connections to establish between the client and
+server. (default 100)
+.TP
+\-p port_number
+The server's port number.
+.TP
+\-r retries
+Number of retries when resolving address or route. (default 2)
+.TP
+\-t timeout_ms
+Timeout in milliseconds (ms) when resolving address or
+route. (default 2000 - 2 seconds)
+.SH "NOTES"
+Basic usage is to start cmtime on a server system, then run
+cmtime -s server_name on a client system.
+.P
+Because this test maps RDMA resources to userspace, users must ensure
+that they have available system resources and permissions. See the
+libibverbs README file for additional details.
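+.P
+For example, where 192.168.1.10 stands in for the server's address:
+.nf
+server$ cmtime
+client$ cmtime -s 192.168.1.10 -c 1000
+.fi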
+.SH "SEE ALSO" +rdma_cm(7) diff --git a/librdmacm/man/mckey.1 b/librdmacm/man/mckey.1 new file mode 100644 index 0000000..5e47ce5 --- /dev/null +++ b/librdmacm/man/mckey.1 @@ -0,0 +1,64 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "MCKEY" 1 "2007-05-15" "librdmacm" "librdmacm" librdmacm +.SH NAME +mckey \- RDMA CM multicast setup and simple data transfer test. +.SH SYNOPSIS +.sp +.nf +\fImckey\fR -m multicast_address [-s] [-b bind_address] [-c connections] + [-C message_count] [-S message_size] [-p port_space] +\fImckey\fR -m multicast_address -s [-b bind_address] [-c connections] + [-C message_count] [-S message_size] [-p port_space] +\fImckey\fR -M unmapped_multicast_address -b bind_address [-s] [-c connections] + [-C message_count] [-S message_size] [-p port_space] +.fi +.SH "DESCRIPTION" +Establishes a set of RDMA multicast communication paths between nodes +using the librdmacm, optionally transfers datagrams to receiving nodes, +then tears down the communication. +.SH "OPTIONS" +.TP +\-m multicast_address +IP multicast address to join. +.TP +\-M unmapped_multicast_address +RDMA transport specific multicast address to join. +.TP +\-s +Send datagrams to the multicast group. +.TP +\-b bind_address +The local network address to bind to. +.TP +\-c connections +The number of QPs to join the multicast group. (default 1) +.TP +\-C message_count +The number of messages to transfer over each connection. (default 10) +.TP +\-S message_size +The size of each message transferred, in bytes. This value must be smaller +than the MTU of the underlying RDMA transport, or an error will occur. +(default 100) +.TP +\-o +Join the multicast group as a send-only full-member. Otherwise the group is +joined as a full-member. +.TP +\-p port_space +The port space of the datagram communication. May be either the RDMA +UDP (0x0111) or IPoIB (0x0002) port space. (default RDMA_PS_UDP) +.SH "NOTES" +Basic usage is to start mckey -m multicast_address on a server system, +then run mckey -m multicast_address -s on a client system. +.P +Unique Infiniband SA assigned multicast GIDs can be retrieved by +invoking mckey with a zero MGID or IP address. (Example, -M 0 or +-m 0.0.0.0). The assigned address will be displayed to allow +mckey clients to join the created group. +.P +Because this test maps RDMA resources to userspace, users must ensure +that they have available system resources and permissions. See the +libibverbs README file for additional details. +.SH "SEE ALSO" +rdma_cm(7), ucmatose(1), udaddy(1), rping(1) diff --git a/librdmacm/man/rcopy.1 b/librdmacm/man/rcopy.1 new file mode 100644 index 0000000..1dcca49 --- /dev/null +++ b/librdmacm/man/rcopy.1 @@ -0,0 +1,38 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RCOPY" 1 "2011-12-2" "librdmacm" "librdmacm" librdmacm +.SH NAME +rcopy \- simple file copy over RDMA. +.SH SYNOPSIS +.sp +.nf +\fIrcopy\fR source server[:destination] [-p port] +\fIrcopy\fR [-p port] +.fi +.SH "DESCRIPTION" +Uses sockets over RDMA interface to copy a source file to the +specified destination. +.SH "OPTIONS" +.TP +source +The name and path of the source file to copy. +.TP +server +The name or address of the destination server. +.TP +:destination +An optional destination filename and path. If not given, the destination +filename will match that of the source. +.TP +\-p server_port +The server's port number. 
+.SH "NOTES"
+Basic usage is to start rcopy on a server system, then run
+rcopy sourcefile servername. The server application will continue to run after
+copying the file, but is currently single-threaded.
+.P
+Because this test maps RDMA resources to userspace, users must ensure
+that they have available system resources and permissions. See the
+libibverbs README file for additional details.
+.SH "SEE ALSO"
+rdma_cm(7)
diff --git a/librdmacm/man/rdma_accept.3 b/librdmacm/man/rdma_accept.3
new file mode 100644
index 0000000..a146a8d
--- /dev/null
+++ b/librdmacm/man/rdma_accept.3
@@ -0,0 +1,95 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH "RDMA_ACCEPT" 3 "2014-05-27" "librdmacm" "Librdmacm Programmer's Manual" librdmacm
+.SH NAME
+rdma_accept \- Called to accept a connection request.
+.SH SYNOPSIS
+.B "#include <rdma/rdma_cma.h>"
+.P
+.B "int" rdma_accept
+.BI "(struct rdma_cm_id *" id ","
+.BI "struct rdma_conn_param *" conn_param ");"
+.SH ARGUMENTS
+.IP "id" 12
+Connection identifier associated with the request.
+.IP "conn_param" 12
+Information needed to establish the connection. See CONNECTION PROPERTIES
+below for details.
+.SH "DESCRIPTION"
+Called from the listening side to accept a connection or datagram
+service lookup request.
+.SH "RETURN VALUE"
+Returns 0 on success, or -1 on error. If an error occurs, errno will be
+set to indicate the failure reason.
+.SH "NOTES"
+Unlike the socket accept routine, rdma_accept is not called on a
+listening rdma_cm_id. Instead, after calling rdma_listen, the user
+waits for an RDMA_CM_EVENT_CONNECT_REQUEST event to occur. Connection request
+events give the user a newly created rdma_cm_id, similar to a new
+socket, but the rdma_cm_id is bound to a specific RDMA device.
+rdma_accept is called on the new rdma_cm_id.
+.SH "CONNECTION PROPERTIES"
+The following properties are used to configure the communication and specified
+by the conn_param parameter when accepting a connection or datagram
+communication request. Users should use the rdma_conn_param values reported
+in the connection request event to determine appropriate values for these
+fields when accepting. Users may reference the rdma_conn_param structure in
+the connection event directly, or can reference their own structure. If the
+rdma_conn_param structure from an event is referenced, the event must not be
+acked until after this call returns.
+.P
+If the conn_param parameter is NULL, the values reported in the connection
+request event are used, adjusted down based on local hardware restrictions.
+.IP private_data
+References a user-controlled data buffer. The contents of the buffer are
+copied and transparently passed to the remote side as part of the
+communication request. May be NULL if private_data is not required.
+.IP private_data_len
+Specifies the size of the user-controlled data buffer. Note that the actual
+amount of data transferred to the remote side is transport dependent and may
+be larger than that requested.
+.IP responder_resources
+The maximum number of outstanding RDMA read and atomic operations that the
+local side will accept from the remote side. Applies only to RDMA_PS_TCP.
+This value must be less than or equal to the local RDMA device attribute
+max_qp_rd_atom, but preferably greater than or equal to the responder_resources
+value reported in the connect request event.
+.IP initiator_depth
+The maximum number of outstanding RDMA read and atomic operations that the
+local side will have to the remote side. Applies only to RDMA_PS_TCP.
+This value must be less than or equal to the local RDMA device attribute
+max_qp_init_rd_atom and the initiator_depth value reported in the connect
+request event.
+.IP flow_control
+Specifies if hardware flow control is available. This value is exchanged
+with the remote peer and is not used to configure the QP. Applies only to
+RDMA_PS_TCP.
+.IP retry_count
+This value is ignored.
+.IP rnr_retry_count
+The maximum number of times that a send operation from the remote peer
+should be retried on a connection after receiving a receiver not ready (RNR)
+error. RNR errors are generated when a send request arrives before a buffer
+has been posted to receive the incoming data. Applies only to RDMA_PS_TCP.
+.IP srq
+Specifies if the QP associated with the connection is using a shared receive
+queue. This field is ignored by the library if a QP has been created on the
+rdma_cm_id. Applies only to RDMA_PS_TCP.
+.IP qp_num
+Specifies the QP number associated with the connection. This field is ignored
+by the library if a QP has been created on the rdma_cm_id.
+.SH "INFINIBAND SPECIFIC"
+In addition to the connection properties defined above, InfiniBand QPs are
+configured with minimum RNR NAK timer and local ACK timeout values. The
+minimum RNR NAK timer value is set to 0, for a delay of 655 ms.
+The local ACK timeout is calculated based on the packet lifetime and local
+HCA ACK delay. The packet lifetime is determined by the InfiniBand Subnet
+Administrator and is part of the route (path record) information obtained
+by the active side of the connection. The HCA ACK delay is a property of
+the locally used HCA.
+.P
+The RNR retry count is a 3-bit value.
+.P
+The length of the private data provided by the user is limited to 196 bytes
+for RDMA_PS_TCP, or 136 bytes for RDMA_PS_UDP.
+.SH "SEE ALSO"
+rdma_listen(3), rdma_reject(3), rdma_get_cm_event(3)
diff --git a/librdmacm/man/rdma_ack_cm_event.3 b/librdmacm/man/rdma_ack_cm_event.3
new file mode 100644
index 0000000..a9a616a
--- /dev/null
+++ b/librdmacm/man/rdma_ack_cm_event.3
@@ -0,0 +1,22 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH "RDMA_ACK_CM_EVENT" 3 "2007-05-15" "librdmacm" "Librdmacm Programmer's Manual" librdmacm
+.SH NAME
+rdma_ack_cm_event \- Free a communication event.
+.SH SYNOPSIS
+.B "#include <rdma/rdma_cma.h>"
+.P
+.B "int" rdma_ack_cm_event
+.BI "(struct rdma_cm_event *" event ");"
+.SH ARGUMENTS
+.IP "event" 12
+Event to be released.
+.SH "DESCRIPTION"
+All events which are allocated by rdma_get_cm_event must be released;
+there should be a one-to-one correspondence between successful gets
+and acks. This call frees the event structure and any memory that it
+references.
+.SH "RETURN VALUE"
+Returns 0 on success, or -1 on error. If an error occurs, errno will be
+set to indicate the failure reason.
+.SH "SEE ALSO"
+rdma_get_cm_event(3), rdma_destroy_id(3)
diff --git a/librdmacm/man/rdma_bind_addr.3 b/librdmacm/man/rdma_bind_addr.3
new file mode 100644
index 0000000..37db52c
--- /dev/null
+++ b/librdmacm/man/rdma_bind_addr.3
@@ -0,0 +1,33 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH "RDMA_BIND_ADDR" 3 "2007-05-15" "librdmacm" "Librdmacm Programmer's Manual" librdmacm
+.SH NAME
+rdma_bind_addr \- Bind an RDMA identifier to a source address.
+.SH SYNOPSIS
+.B "#include <rdma/rdma_cma.h>"
+.P
+.B "int" rdma_bind_addr
+.BI "(struct rdma_cm_id *" id ","
+.BI "struct sockaddr *" addr ");"
+.SH ARGUMENTS
+.IP "id" 12
+RDMA identifier.
+.IP "addr" 12
+Local address information. Wildcard values are permitted.
+.SH "DESCRIPTION"
+Associates a source address with an rdma_cm_id. The address may be
+wildcarded. If binding to a specific local address, the rdma_cm_id
+will also be bound to a local RDMA device.
+.SH "RETURN VALUE"
+Returns 0 on success, or -1 on error. If an error occurs, errno will be
+set to indicate the failure reason.
+.SH "NOTES"
+Typically, this routine is called before calling rdma_listen to bind
+to a specific port number, but it may also be called on the active side
+of a connection before calling rdma_resolve_addr to bind to a specific
+address.
+.P
+If used to bind to port 0, the rdma_cm will select an available port,
+which can be retrieved with rdma_get_src_port(3).
+.SH "SEE ALSO"
+rdma_create_id(3), rdma_listen(3), rdma_resolve_addr(3), rdma_create_qp(3),
+rdma_get_local_addr(3), rdma_get_src_port(3)
diff --git a/librdmacm/man/rdma_client.1 b/librdmacm/man/rdma_client.1
new file mode 100644
index 0000000..91b2d63
--- /dev/null
+++ b/librdmacm/man/rdma_client.1
@@ -0,0 +1,32 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH "RDMA_CLIENT" 1 "2010-07-19" "librdmacm" "librdmacm" librdmacm
+.SH NAME
+rdma_client \- simple RDMA CM connection and ping-pong test.
+.SH SYNOPSIS
+.sp
+.nf
+\fIrdma_client\fR [-s server_address] [-p server_port]
+.fi
+.SH "DESCRIPTION"
+Uses synchronous librdmacm calls to establish an RDMA connection between
+two nodes. This example is intended to provide a very simple coding
+example of how to use RDMA.
+.SH "OPTIONS"
+.TP
+\-s server_address
+Specifies the address of the system that the rdma_server is running on.
+By default, the client will attempt to connect to the server using
+127.0.0.1.
+.TP
+\-p server_port
+Specifies the port number that the server listens on. By default the server
+listens on port 7471.
+.SH "NOTES"
+Basic usage is to start rdma_server, then connect to the server using the
+rdma_client program.
+.P
+Because this test maps RDMA resources to userspace, users must ensure
+that they have available system resources and permissions. See the
+libibverbs README file for additional details.
+.SH "SEE ALSO"
+rdma_cm(7), udaddy(1), mckey(1), rping(1), rdma_server(1)
diff --git a/librdmacm/man/rdma_cm.7 b/librdmacm/man/rdma_cm.7
new file mode 100644
index 0000000..8e5ad99
--- /dev/null
+++ b/librdmacm/man/rdma_cm.7
@@ -0,0 +1,230 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH "RDMA_CM" 7 "2010-07-19" "librdmacm" "Librdmacm Programmer's Manual" librdmacm
+.SH NAME
+rdma_cm \- RDMA communication manager.
+.SH SYNOPSIS
+.B "#include <rdma/rdma_cma.h>"
+.SH "DESCRIPTION"
+Used to establish communication over RDMA transports.
+.SH "NOTES"
+The RDMA CM is a communication manager used to set up reliable, connected
+and unreliable datagram data transfers. It provides an RDMA transport
+neutral interface for establishing connections. The API concepts are
+based on sockets, but adapted for queue pair (QP) based semantics:
+communication must be over a specific RDMA device, and data transfers
+are message based.
+.P
+The RDMA CM can control both the QP and communication management (connection setup /
+teardown) portions of an RDMA API, or only the communication management
+piece. It works in conjunction with the verbs
+API defined by the libibverbs library. The libibverbs library provides the
+underlying interfaces needed to send and receive data.
+.P
+The RDMA CM can operate asynchronously or synchronously. The mode of
+operation is controlled by the user through the use of the rdma_cm event channel
+parameter in specific calls. If an event channel is provided, an rdma_cm identifier
+will report its event data (results of connecting, for example) on that channel.
+If a channel is not provided, then all rdma_cm operations for the selected
+rdma_cm identifier will block until they complete.
+.SH "RDMA VERBS"
+The rdma_cm supports the full range of verbs available through the libibverbs
+library and interfaces. However, it also provides wrapper functions for some
+of the more commonly used verbs functionality. The full set of abstracted
+verb calls is:
+.P
+rdma_reg_msgs - register an array of buffers for sending and receiving
+.P
+rdma_reg_read - registers a buffer for RDMA read operations
+.P
+rdma_reg_write - registers a buffer for RDMA write operations
+.P
+rdma_dereg_mr - deregisters a memory region
+.P
+rdma_post_recv - post a buffer to receive a message
+.P
+rdma_post_send - post a buffer to send a message
+.P
+rdma_post_read - post an RDMA to read data into a buffer
+.P
+rdma_post_write - post an RDMA to send data from a buffer
+.P
+rdma_post_recvv - post a vector of buffers to receive a message
+.P
+rdma_post_sendv - post a vector of buffers to send a message
+.P
+rdma_post_readv - post a vector of buffers to receive an RDMA read
+.P
+rdma_post_writev - post a vector of buffers to send an RDMA write
+.P
+rdma_post_ud_send - post a buffer to send a message on a UD QP
+.P
+rdma_get_send_comp - get completion status for a send or RDMA operation
+.P
+rdma_get_recv_comp - get information about a completed receive
+.SH "CLIENT OPERATION"
+This section provides a general overview of the basic operation for the active,
+or client, side of communication. This flow assumes asynchronous operation with
+low-level call details shown. For
+synchronous operation, calls to rdma_create_event_channel, rdma_get_cm_event,
+rdma_ack_cm_event, and rdma_destroy_event_channel
+would be eliminated. Abstracted calls, such as rdma_create_ep, encapsulate
+several of these calls under a single API.
+Users may also refer to the example applications for
+code samples. A general connection flow would be:
+.IP rdma_getaddrinfo
+retrieve address information of the destination
+.IP rdma_create_event_channel
+create channel to receive events
+.IP rdma_create_id
+allocate an rdma_cm_id; this is conceptually similar to a socket
+.IP rdma_resolve_addr
+obtain a local RDMA device to reach the remote address
+.IP rdma_get_cm_event
+wait for RDMA_CM_EVENT_ADDR_RESOLVED event
+.IP rdma_ack_cm_event
+ack event
+.IP rdma_create_qp
+allocate a QP for the communication
+.IP rdma_resolve_route
+determine the route to the remote address
+.IP rdma_get_cm_event
+wait for RDMA_CM_EVENT_ROUTE_RESOLVED event
+.IP rdma_ack_cm_event
+ack event
+.IP rdma_connect
+connect to the remote server
+.IP rdma_get_cm_event
+wait for RDMA_CM_EVENT_ESTABLISHED event
+.IP rdma_ack_cm_event
+ack event
+.P
+Perform data transfers over connection
+.IP rdma_disconnect
+tear-down connection
+.IP rdma_get_cm_event
+wait for RDMA_CM_EVENT_DISCONNECTED event
+.IP rdma_ack_cm_event
+ack event
+.IP rdma_destroy_qp
+destroy the QP
+.IP rdma_destroy_id
+release the rdma_cm_id
+.IP rdma_destroy_event_channel
+release the event channel
+.P
+An almost identical process is used to set up unreliable datagram (UD)
+communication between nodes. However, no actual connection is formed between
+QPs, so disconnection is not needed.
+.P
+Although this example shows the client initiating the disconnect, either side
+of a connection may initiate the disconnect.
+.SH "SERVER OPERATION"
+This section provides a general overview of the basic operation for the passive,
+or server, side of communication. A general connection flow would be:
+.IP rdma_create_event_channel
+create channel to receive events
+.IP rdma_create_id
+allocate an rdma_cm_id; this is conceptually similar to a socket
+.IP rdma_bind_addr
+set the local port number to listen on
+.IP rdma_listen
+begin listening for connection requests
+.IP rdma_get_cm_event
+wait for RDMA_CM_EVENT_CONNECT_REQUEST event with a new rdma_cm_id
+.IP rdma_create_qp
+allocate a QP for the communication on the new rdma_cm_id
+.IP rdma_accept
+accept the connection request
+.IP rdma_ack_cm_event
+ack event
+.IP rdma_get_cm_event
+wait for RDMA_CM_EVENT_ESTABLISHED event
+.IP rdma_ack_cm_event
+ack event
+.P
+Perform data transfers over connection
+.IP rdma_get_cm_event
+wait for RDMA_CM_EVENT_DISCONNECTED event
+.IP rdma_ack_cm_event
+ack event
+.IP rdma_disconnect
+tear-down connection
+.IP rdma_destroy_qp
+destroy the QP
+.IP rdma_destroy_id
+release the connected rdma_cm_id
+.IP rdma_destroy_id
+release the listening rdma_cm_id
+.IP rdma_destroy_event_channel
+release the event channel
+.SH "RETURN CODES"
+.IP "= 0"
+success
+.IP "= -1"
+error - see errno for more details
+.P
+Most librdmacm functions return 0 to indicate success, and a -1 return value
+to indicate failure. If a function operates asynchronously, a return value of 0
+means that the operation was successfully started. The operation could still
+complete in error; users should check the status of the related event. If the
+return value is -1, then errno will contain additional information
+regarding the reason for the failure.
+.P
+Prior versions of the library would return -errno and not set errno for some cases
+related to ENOMEM, ENODEV, ENODATA, EINVAL, and EADDRNOTAVAIL codes. Applications
+that want to check these codes and have compatibility with prior library versions
+must manually set errno to the negative of the return code if it is < -1.
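+.SH "EXAMPLE"
+A minimal synchronous client sketch using the abstracted calls, with error
+handling and data transfers omitted; "server" and "7471" are placeholder
+values, and the QP attribute sizes are illustrative only:
+.nf
+struct rdma_addrinfo hints, *res;
+struct ibv_qp_init_attr attr;
+struct rdma_cm_id *id;
+
+memset(&hints, 0, sizeof hints);
+hints.ai_port_space = RDMA_PS_TCP;
+rdma_getaddrinfo("server", "7471", &hints, &res);
+
+memset(&attr, 0, sizeof attr);
+attr.cap.max_send_wr = attr.cap.max_recv_wr = 1;
+attr.cap.max_send_sge = attr.cap.max_recv_sge = 1;
+rdma_create_ep(&id, res, NULL, &attr);	/* also creates the QP */
+rdma_freeaddrinfo(res);
+
+rdma_connect(id, NULL);
+/* transfer data, then: */
+rdma_disconnect(id);
+rdma_destroy_ep(id);
+.fi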
+.SH "SEE ALSO" +rdma_accept(3), +rdma_ack_cm_event(3), +rdma_bind_addr(3), +rdma_connect(3), +rdma_create_ep(3), +rdma_create_event_channel(3), +rdma_create_id(3), +rdma_create_qp(3), +rdma_dereg_mr(3), +rdma_destroy_ep(3), +rdma_destroy_event_channel(3), +rdma_destroy_id(3), +rdma_destroy_qp(3), +rdma_disconnect(3), +rdma_event_str(3), +rdma_free_devices(3), +rdma_getaddrinfo(3), +rdma_get_cm_event(3), +rdma_get_devices(3), +rdma_get_dst_port(3), +rdma_get_local_addr(3), +rdma_get_peer_addr(3), +rdma_get_recv_comp(3), +rdma_get_request(3), +rdma_get_send_comp(3), +rdma_get_src_port(3), +rdma_join_multicast(3), +rdma_leave_multicast(3), +rdma_listen(3), +rdma_migrate_id(3), +rdma_notify(3), +rdma_post_read(3) +rdma_post_readv(3), +rdma_post_recv(3), +rdma_post_recvv(3), +rdma_post_send(3), +rdma_post_sendv(3), +rdma_post_ud_send(3), +rdma_post_write(3), +rdma_post_writev(3), +rdma_reg_msgs(3), +rdma_reg_read(3), +rdma_reg_write(3), +rdma_reject(3), +rdma_resolve_addr(3), +rdma_resolve_route(3), +rdma_set_option(3) +mckey(1), +rdma_client(1), +rdma_server(1), +rping(1), +ucmatose(1), +udaddy(1) diff --git a/librdmacm/man/rdma_connect.3 b/librdmacm/man/rdma_connect.3 new file mode 100644 index 0000000..ef88e38 --- /dev/null +++ b/librdmacm/man/rdma_connect.3 @@ -0,0 +1,91 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_CONNECT" 3 "2010-07-19" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_connect \- Initiate an active connection request. +.SH SYNOPSIS +.B "#include <rdma/rdma_cma.h>" +.P +.B "int" rdma_connect +.BI "(struct rdma_cm_id *" id "," +.BI "struct rdma_conn_param *" conn_param ");" +.SH ARGUMENTS +.IP "id" 12 +RDMA identifier. +.IP "conn_param" 12 +connection parameters. See CONNECTION PROPERTIES below for details. +.SH "DESCRIPTION" +For an rdma_cm_id of type RDMA_PS_TCP, this call initiates a connection request +to a remote destination. For an rdma_cm_id of type RDMA_PS_UDP, it initiates +a lookup of the remote QP providing the datagram service. +.SH "RETURN VALUE" +Returns 0 on success, or -1 on error. If an error occurs, errno will be +set to indicate the failure reason. +.SH "NOTES" +Users must have resolved a route to the destination address +by having called rdma_resolve_route or rdma_create_ep before calling +this routine. +.SH "CONNECTION PROPERTIES" +The following properties are used to configure the communication and specified +by the conn_param parameter when connecting or establishing datagram +communication. +.IP private_data +References a user-controlled data buffer. The contents of the buffer are +copied and transparently passed to the remote side as part of the +communication request. May be NULL if private_data is not required. +.IP private_data_len +Specifies the size of the user-controlled data buffer. Note that the actual +amount of data transferred to the remote side is transport dependent and may +be larger than that requested. +.IP responder_resources +The maximum number of outstanding RDMA read and atomic operations that the +local side will accept from the remote side. Applies only to RDMA_PS_TCP. +This value must be less than or equal to the local RDMA device attribute +max_qp_rd_atom and remote RDMA device attribute max_qp_init_rd_atom. The +remote endpoint can adjust this value when accepting the connection. +.IP initiator_depth +The maximum number of outstanding RDMA read and atomic operations that the +local side will have to the remote side. Applies only to RDMA_PS_TCP. 
+This value must be less than or equal to the local RDMA device attribute +max_qp_init_rd_atom and remote RDMA device attribute max_qp_rd_atom. The +remote endpoint can adjust this value when accepting the connection. +.IP flow_control +Specifies if hardware flow control is available. This value is exchanged +with the remote peer and is not used to configure the QP. Applies only to +RDMA_PS_TCP. +.IP retry_count +The maximum number of times that a data transfer operation should be retried +on the connection when an error occurs. This setting controls the number of +times to retry send, RDMA, and atomic operations when timeouts occur. +Applies only to RDMA_PS_TCP. +.IP rnr_retry_count +The maximum number of times that a send operation from the remote peer +should be retried on a connection after receiving a receiver not ready (RNR) +error. RNR errors are generated when a send request arrives before a buffer +has been posted to receive the incoming data. Applies only to RDMA_PS_TCP. +.IP srq +Specifies if the QP associated with the connection is using a shared receive +queue. This field is ignored by the library if a QP has been created on the +rdma_cm_id. Applies only to RDMA_PS_TCP. +.IP qp_num +Specifies the QP number associated with the connection. This field is ignored +by the library if a QP has been created on the rdma_cm_id. Applies only to +RDMA_PS_TCP. +.SH "INFINIBAND SPECIFIC" +In addition to the connection properties defined above, InfiniBand QPs are +configured with minimum RNR NAK timer and local ACK timeout values. The +minimum RNR NAK timer value is set to 0, for a delay of 655 ms. +The local ACK timeout is calculated based on the packet lifetime and local +HCA ACK delay. The packet lifetime is determined by the InfiniBand Subnet +Administrator and is part of the resolved route (path record) information. +The HCA ACK delay is a property of the locally used HCA. +.P +Retry count and RNR retry count values are 3-bit values. +.P +The length of the private data provided by the user is limited to 56 bytes +for RDMA_PS_TCP, or 180 bytes for RDMA_PS_UDP. +.SH "IWARP SPECIFIC" +Connections established over iWarp RDMA devices currently require that the +active side of the connection send the first message. +.SH "SEE ALSO" +rdma_cm(7), rdma_create_id(3), rdma_resolve_route(3), rdma_disconnect(3), +rdma_listen(3), rdma_get_cm_event(3) diff --git a/librdmacm/man/rdma_create_ep.3 b/librdmacm/man/rdma_create_ep.3 new file mode 100644 index 0000000..b1f3976 --- /dev/null +++ b/librdmacm/man/rdma_create_ep.3 @@ -0,0 +1,61 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_CREATE_EP" 3 "2010-07-19" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_create_ep \- Allocate a communication identifier and optional QP. +.SH SYNOPSIS +.B "#include <rdma/rdma_cma.h>" +.P +.B "int" rdma_create_ep +.BI "(struct rdma_cm_id **" id "," +.BI "struct rdma_addrinfo *" res "," +.BI "struct ibv_pd *" pd "," +.BI "struct ibv_qp_init_attr *" qp_init_attr ");" +.SH ARGUMENTS +.IP "id" 12 +A reference where the allocated communication identifier will be +returned. +.IP "res" 12 +Address information associated with the rdma_cm_id returned from +rdma_getaddrinfo. +.IP "pd" 12 +Optional protection domain if a QP is associated with the rdma_cm_id. +.IP "qp_init_attr" 12 +Optional initial QP attributes. +.SH "DESCRIPTION" +Creates an identifier that is used to track communication information. +.SH "RETURN VALUE" +Returns 0 on success, or -1 on error. 
If an error occurs, errno will be
+set to indicate the failure reason.
+.SH "NOTES"
+After resolving address information using rdma_getaddrinfo, a user
+may use this call to allocate an rdma_cm_id based on the results.
+.P
+If the rdma_cm_id will be used on the active side of a connection,
+meaning that res->ai_flag does not have RAI_PASSIVE set, rdma_create_ep
+will automatically create a QP on the rdma_cm_id if qp_init_attr is
+not NULL. The QP will be associated with the specified protection
+domain, if provided, or a default protection domain if not. Users
+should see rdma_create_qp for details on the use of the pd and
+qp_init_attr parameters. After calling rdma_create_ep, the returned
+rdma_cm_id may be connected by calling rdma_connect. Calls to
+rdma_resolve_addr and rdma_resolve_route are not necessary on the
+active side.
+.P
+If the rdma_cm_id will be used on the passive side of a connection,
+indicated by having res->ai_flag RAI_PASSIVE set, this call will save
+the provided pd and qp_init_attr parameters. When a new connection
+request is retrieved by calling rdma_get_request, the rdma_cm_id
+associated with the new connection will automatically be associated
+with a QP using the pd and qp_init_attr parameters. After calling
+rdma_create_ep, the returned rdma_cm_id may be placed into a listening
+state by immediately calling rdma_listen. A call to rdma_bind_addr
+is not necessary on the passive side. Connection requests may then be
+retrieved by calling rdma_get_request.
+.P
+The newly created rdma_cm_id will be set to use synchronous operation.
+Users that wish asynchronous operation must migrate the rdma_cm_id
+to a user created event channel using rdma_migrate_id.
+.P
+Users must release the created rdma_cm_id by calling rdma_destroy_ep.
+.SH "SEE ALSO"
+rdma_cm(7), rdma_getaddrinfo(3), rdma_create_event_channel(3),
+rdma_connect(3), rdma_listen(3), rdma_destroy_ep(3), rdma_migrate_id(3)
diff --git a/librdmacm/man/rdma_create_event_channel.3 b/librdmacm/man/rdma_create_event_channel.3
new file mode 100644
index 0000000..928c797
--- /dev/null
+++ b/librdmacm/man/rdma_create_event_channel.3
@@ -0,0 +1,32 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH "RDMA_CREATE_EVENT_CHANNEL" 3 "2007-05-15" "librdmacm" "Librdmacm Programmer's Manual" librdmacm
+.SH NAME
+rdma_create_event_channel \- Open a channel used to report communication events.
+.SH SYNOPSIS
+.B "#include <rdma/rdma_cma.h>"
+.P
+.B "struct rdma_event_channel *" rdma_create_event_channel
+.BI "(" void ");"
+.SH ARGUMENTS
+.IP "void" 12
+no arguments
+.SH "DESCRIPTION"
+Asynchronous events are reported to users through event channels.
+.SH "RETURN VALUE"
+Returns a pointer to the created event channel, or NULL if the request
+fails. On failure, errno will be set to indicate the failure reason.
+.SH "NOTES"
+Event channels are used to direct all events on an rdma_cm_id. For many
+clients, a single event channel may be sufficient; however, when managing
+a large number of connections or cm_id's, users may find it useful to direct
+events for different cm_id's to different channels for processing.
+.P
+All created event channels must be destroyed by calling
+rdma_destroy_event_channel. Users should call rdma_get_cm_event to
+retrieve events on an event channel.
+.P
+Each event channel is mapped to a file descriptor. The associated file
+descriptor can be used and manipulated like any other fd to change its
+behavior. Users may make the fd non-blocking, poll or select the fd, etc.
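+.P
+For example, a minimal sketch of polling the channel rather than blocking
+in rdma_get_cm_event (error handling omitted; timeout_ms is a placeholder):
+.nf
+struct rdma_event_channel *channel;
+struct rdma_cm_event *event;
+struct pollfd fds;
+
+channel = rdma_create_event_channel();
+fcntl(channel->fd, F_SETFL, O_NONBLOCK);
+
+fds.fd = channel->fd;
+fds.events = POLLIN;
+if (poll(&fds, 1, timeout_ms) > 0) {
+        rdma_get_cm_event(channel, &event);
+        /* process event */
+        rdma_ack_cm_event(event);
+}
+.fi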
+.SH "SEE ALSO" +rdma_cm(7), rdma_get_cm_event(3), rdma_destroy_event_channel(3) diff --git a/librdmacm/man/rdma_create_id.3 b/librdmacm/man/rdma_create_id.3 new file mode 100644 index 0000000..0a5093a --- /dev/null +++ b/librdmacm/man/rdma_create_id.3 @@ -0,0 +1,56 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_CREATE_ID" 3 "2010-07-19" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_create_id \- Allocate a communication identifier. +.SH SYNOPSIS +.B "#include <rdma/rdma_cma.h>" +.P +.B "int" rdma_create_id +.BI "(struct rdma_event_channel *" channel "," +.BI "struct rdma_cm_id **" id "," +.BI "void *" context "," +.BI "enum rdma_port_space " ps ");" +.SH ARGUMENTS +.IP "channel" 12 +The communication channel that events associated with the +allocated rdma_cm_id will be reported on. This may be NULL. +.IP "id" 12 +A reference where the allocated communication identifier will be +returned. +.IP "context" 12 +User specified context associated with the rdma_cm_id. +.IP "ps" 12 +RDMA port space. +.SH "DESCRIPTION" +Creates an identifier that is used to track communication information. +.SH "RETURN VALUE" +Returns 0 on success, or -1 on error. If an error occurs, errno will be +set to indicate the failure reason. +.SH "NOTES" +Rdma_cm_id's are conceptually equivalent to a socket for RDMA +communication. The difference is that RDMA communication requires +explicitly binding to a specified RDMA device before communication +can occur, and most operations are asynchronous in nature. Asynchronous +communication events on an rdma_cm_id are reported through the associated +event channel. If the channel parameter is NULL, the rdma_cm_id will +be placed into synchronous operation. While operating synchronously, +calls that result in an event will block until the operation completes. +The event will be returned to the user through the rdma_cm_id structure, +and be available for access until another rdma_cm call is made. +.P +Users must release the rdma_cm_id by calling rdma_destroy_id. +.SH "PORT SPACE" +Details of the services provided by the different port spaces are outlined +below. +.IP RDMA_PS_TCP +Provides reliable, connection-oriented QP communication. Unlike TCP, the RDMA +port space provides message, not stream, based communication. +.IP RDMA_PS_UDP +Provides unreliable, connectionless QP communication. Supports both datagram +and multicast communication. +.IP RDMA_PS_IB +Provides for any IB services (UD, UC, RC, XRC, etc.). +.SH "SEE ALSO" +rdma_cm(7), rdma_create_event_channel(3), rdma_destroy_id(3), rdma_get_devices(3), +rdma_bind_addr(3), rdma_resolve_addr(3), rdma_connect(3), rdma_listen(3), +rdma_set_option(3) diff --git a/librdmacm/man/rdma_create_qp.3 b/librdmacm/man/rdma_create_qp.3 new file mode 100644 index 0000000..cd4708f --- /dev/null +++ b/librdmacm/man/rdma_create_qp.3 @@ -0,0 +1,49 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_CREATE_QP" 3 "2007-05-15" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_create_qp \- Allocate a QP. +.SH SYNOPSIS +.B "#include <rdma/rdma_cma.h>" +.P +.B "int" rdma_create_qp +.BI "(struct rdma_cm_id *" id "," +.BI "struct ibv_pd *" pd "," +.BI "struct ibv_qp_init_attr *" qp_init_attr ");" +.SH ARGUMENTS +.IP "id" 12 +RDMA identifier. +.IP "pd" 12 +Optional protection domain for the QP. +.IP "qp_init_attr" 12 +Initial QP attributes. 
+.SH "DESCRIPTION" +Allocate a QP associated with the specified rdma_cm_id and transition it +for sending and receiving. +.SH "RETURN VALUE" +Returns 0 on success, or -1 on error. If an error occurs, errno will be +set to indicate the failure reason. +.SH "NOTES" +The rdma_cm_id must be bound to a local RDMA device before calling this +function, and the protection domain must be for that same device. +QPs allocated to an rdma_cm_id are automatically transitioned by the +librdmacm through their states. After being allocated, the QP will be +ready to handle posting of receives. If the QP is unconnected, it will +be ready to post sends. +.P +If a protection domain is not given - pd parameter is NULL - then +the rdma_cm_id will be created using a default protection domain. One +default protection domain is allocated per RDMA device. +.P +The initial QP attributes are specified by the qp_init_attr parameter. The +send_cq and recv_cq fields in the ibv_qp_init_attr are optional. If +a send or receive completion queue is not specified, then a CQ will be +allocated by the rdma_cm for the QP, along with corresponding completion +channels. Completion channels and CQ data created by the rdma_cm are +exposed to the user through the rdma_cm_id structure. +.P +The actual capabilities and properties of the created QP will be +returned to the user through the qp_init_attr parameter. An rdma_cm_id +may only be associated with a single QP. +.SH "SEE ALSO" +rdma_bind_addr(3), rdma_resolve_addr(3), rdma_destroy_qp(3), ibv_create_qp(3), +ibv_modify_qp(3) diff --git a/librdmacm/man/rdma_create_srq.3 b/librdmacm/man/rdma_create_srq.3 new file mode 100644 index 0000000..6257e86 --- /dev/null +++ b/librdmacm/man/rdma_create_srq.3 @@ -0,0 +1,45 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_CREATE_SRQ" 3 "2011-06-15" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_create_srq \- Allocate a shared receive queue. +.SH SYNOPSIS +.B "#include <rdma/rdma_verbs.h>" +.P +.B "int" rdma_create_srq +.BI "(struct rdma_cm_id *" id "," +.BI "struct ibv_pd *" pd "," +.BI "struct ibv_srq_init_attr *" attr ");" +.SH ARGUMENTS +.IP "id" 12 +RDMA identifier. +.IP "pd" 12 +Optional protection domain for the SRQ. +.IP "attr" 12 +Initial SRQ attributes. +.SH "DESCRIPTION" +Allocate a SRQ associated with the specified rdma_cm_id. +.SH "RETURN VALUE" +Returns 0 on success, or -1 on error. If an error occurs, errno will be +set to indicate the failure reason. +.SH "NOTES" +The rdma_cm_id must be bound to a local RDMA device before calling this +function, and the protection domain, if provided, must be for that same device. +After being allocated, the SRQ will be ready to handle posting of receives. +.P +If a protection domain is not given - pd parameter is NULL - then +the rdma_cm_id will be created using a default protection domain. One +default protection domain is allocated per RDMA device. +.P +The initial SRQ attributes are specified by the attr parameter. The +ext.xrc.cq fields in the ibv_srq_init_attr is optional. If +a completion queue is not specified for an XRC SRQ, then a CQ will be +allocated by the rdma_cm for the SRQ, along with corresponding completion +channels. Completion channels and CQ data created by the rdma_cm are +exposed to the user through the rdma_cm_id structure. +.P +The actual capabilities and properties of the created SRQ will be +returned to the user through the attr parameter. An rdma_cm_id +may only be associated with a single SRQ. 
+.SH "SEE ALSO" +rdma_bind_addr(3), rdma_resolve_addr(3), rdma_create_ep(3), +rdma_destroy_srq(3), ibv_create_srq(3), ibv_create_xsrq(3) diff --git a/librdmacm/man/rdma_dereg_mr.3 b/librdmacm/man/rdma_dereg_mr.3 new file mode 100644 index 0000000..893eb14 --- /dev/null +++ b/librdmacm/man/rdma_dereg_mr.3 @@ -0,0 +1,29 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_DEREG_MR" 3 "2010-07-19" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_dereg_mr \- deregisters a registered memory region. +.SH SYNOPSIS +.B "#include <rdma/rdma_verbs.h>" +.P +.B "int" rdma_dereg_mr +.BI "(struct ibv_mr *" mr ");" +.SH ARGUMENTS +.IP "mr" 12 +A reference to a registered memory buffer. +.SH "DESCRIPTION" +Deregisters a memory buffer that had been registered for RDMA +or message operations. A user should call rdma_dereg_mr for all +registered memory associated with an rdma_cm_id before destroying +the rdma_cm_id. +.SH "RETURN VALUE" +Returns 0 on success, or -1 on error. If an error occurs, errno will be +set to indicate the failure reason. +.SH "NOTES" +All memory registered with an rdma_cm_id is associated with the +protection domain associated with the id. Users must deregister +all registered memory before the protection domain can be destroyed. +.SH "SEE ALSO" +rdma_cm(7), rdma_create_id(3), rdma_create_ep(3), +rdma_destroy_id(3), rdma_destroy_ep(3), +rdma_reg_msgs(3), rdma_reg_read(3), rdma_reg_write(3), +ibv_reg_mr(3), ibv_dereg_mr(3) diff --git a/librdmacm/man/rdma_destroy_ep.3 b/librdmacm/man/rdma_destroy_ep.3 new file mode 100644 index 0000000..1211f4b --- /dev/null +++ b/librdmacm/man/rdma_destroy_ep.3 @@ -0,0 +1,20 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md + +.TH "RDMA_DESTROY_EP" 3 "2011-06-15" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_destroy_ep \- Release a communication identifier. +.SH SYNOPSIS +.B "#include <rdma/rdma_cma.h>" +.P +.B "void" rdma_destroy_ep +.BI "(struct rdma_cm_id *" id ");" +.SH ARGUMENTS +.IP "id" 12 +The communication identifier to destroy. +.SH "DESCRIPTION" +Destroys the specified rdma_cm_id and all associated resources +.SH "NOTES" +rdma_destroy_ep will automatically destroy any QP and SRQ associated with +the rdma_cm_id. +.SH "SEE ALSO" +rdma_create_ep(3) diff --git a/librdmacm/man/rdma_destroy_event_channel.3 b/librdmacm/man/rdma_destroy_event_channel.3 new file mode 100644 index 0000000..5c5d663 --- /dev/null +++ b/librdmacm/man/rdma_destroy_event_channel.3 @@ -0,0 +1,22 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_DESTROY_EVENT_CHANNEL" 3 "2007-05-15" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_destroy_event_channel \- Close an event communication channel. +.SH SYNOPSIS +.B "#include <rdma/rdma_cma.h>" +.P +.B "void" rdma_destroy_event_channel +.BI "(struct rdma_event_channel *" channel ");" +.SH ARGUMENTS +.IP "channel" 12 +The communication channel to destroy. +.SH "DESCRIPTION" +Release all resources associated with an event channel and closes the +associated file descriptor. +.SH "RETURN VALUE" +None +.SH "NOTES" +All rdma_cm_id's associated with the event channel must be destroyed, +and all returned events must be acked before calling this function. 
+.SH "SEE ALSO" +rdma_create_event_channel(3), rdma_get_cm_event(3), rdma_ack_cm_event(3) diff --git a/librdmacm/man/rdma_destroy_id.3 b/librdmacm/man/rdma_destroy_id.3 new file mode 100644 index 0000000..1d95ff2 --- /dev/null +++ b/librdmacm/man/rdma_destroy_id.3 @@ -0,0 +1,23 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_DESTROY_ID" 3 "2010-07-19" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_destroy_id \- Release a communication identifier. +.SH SYNOPSIS +.B "#include <rdma/rdma_cma.h>" +.P +.B "int" rdma_destroy_id +.BI "(struct rdma_cm_id *" id ");" +.SH ARGUMENTS +.IP "id" 12 +The communication identifier to destroy. +.SH "DESCRIPTION" +Destroys the specified rdma_cm_id and cancels any outstanding +asynchronous operation. +.SH "RETURN VALUE" +Returns 0 on success, or -1 on error. If an error occurs, errno will be +set to indicate the failure reason. +.SH "NOTES" +Users must free any associated QP with the rdma_cm_id before +calling this routine and ack all related events. +.SH "SEE ALSO" +rdma_create_id(3), rdma_destroy_qp(3), rdma_ack_cm_event(3) diff --git a/librdmacm/man/rdma_destroy_qp.3 b/librdmacm/man/rdma_destroy_qp.3 new file mode 100644 index 0000000..03fba27 --- /dev/null +++ b/librdmacm/man/rdma_destroy_qp.3 @@ -0,0 +1,19 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_DESTROY_QP" 3 "2007-05-15" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_destroy_qp \- Deallocate a QP. +.SH SYNOPSIS +.B "#include <rdma/rdma_cma.h>" +.P +.B "void" rdma_destroy_qp +.BI "(struct rdma_cm_id *" id ");" +.SH ARGUMENTS +.IP "id" 12 +RDMA identifier. +.SH "DESCRIPTION" +Destroy a QP allocated on the rdma_cm_id. +.SH "NOTES" +Users must destroy any QP associated with an rdma_cm_id before +destroying the ID. +.SH "SEE ALSO" +rdma_create_qp(3), rdma_destroy_id(3), ibv_destroy_qp(3) diff --git a/librdmacm/man/rdma_destroy_srq.3 b/librdmacm/man/rdma_destroy_srq.3 new file mode 100644 index 0000000..75c23c7 --- /dev/null +++ b/librdmacm/man/rdma_destroy_srq.3 @@ -0,0 +1,21 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_DESTROY_SRQ" 3 "2011-06-15" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_destroy_srq \- Deallocate a SRQ. +.SH SYNOPSIS +.B "#include <rdma/rdma_verbs.h>" +.P +.B "void" rdma_destroy_srq +.BI "(struct rdma_cm_id *" id ");" +.SH ARGUMENTS +.IP "id" 12 +RDMA identifier. +.SH "DESCRIPTION" +Destroy an SRQ allocated on the rdma_cm_id. +.SH "RETURN VALUE" +None +.SH "NOTES" +Users should destroy any SRQ associated with an rdma_cm_id before +destroying the ID. +.SH "SEE ALSO" +rdma_create_srq(3), rdma_destroy_id(3), ibv_destroy_srq(3) diff --git a/librdmacm/man/rdma_disconnect.3 b/librdmacm/man/rdma_disconnect.3 new file mode 100644 index 0000000..23aec98 --- /dev/null +++ b/librdmacm/man/rdma_disconnect.3 @@ -0,0 +1,23 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_DISCONNECT" 3 "2008-01-02" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_disconnect \- This function disconnects a connection. +.SH SYNOPSIS +.B "#include <rdma/rdma_cma.h>" +.P +.B "int" rdma_disconnect +.BI "(struct rdma_cm_id *" id ");" +.SH ARGUMENTS +.IP "id" 12 +RDMA identifier. 
+.SH "DESCRIPTION" +Disconnects a connection and transitions any associated QP to the error state, +which will flush any posted work requests to the completion queue. This +routine should be called by both the client and server side of a connection. +After successfully disconnecting, an RDMA_CM_EVENT_DISCONNECTED event will be +generated on both sides of the connection. +.SH "RETURN VALUE" +Returns 0 on success, or -1 on error. If an error occurs, errno will be +set to indicate the failure reason. +.SH "SEE ALSO" +rdma_connect(3), rdma_listen(3), rdma_accept(3), rdma_get_cm_event(3) diff --git a/librdmacm/man/rdma_establish.3.md b/librdmacm/man/rdma_establish.3.md new file mode 100644 index 0000000..91f390d --- /dev/null +++ b/librdmacm/man/rdma_establish.3.md @@ -0,0 +1,59 @@ +--- +date: 2019-01-16 +footer: librdmacm +header: "Librdmacm Programmer's Manual" +layout: page +license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md' +section: 3 +title: RDMA_ESTABLISH +--- + +# NAME + +rdma_establish - Complete an active connection request. + +# SYNOPSIS + +```c +#include <rdma/rdma_cma.h> + +int rdma_establish(struct rdma_cm_id *id); +``` + +# DESCRIPTION + +**rdma_establish()** Acknowledge an incoming connection response event and complete the connection establishment. + +Notes: + +If a QP has not been created on the rdma_cm_id, this function should be called by the active side to complete the connection, + +after getting connect response event. + +This will trigger a connection established event on the passive side. + +This function should not be used on an rdma_cm_id on which a QP has been created. + +# ARGUMENTS + +*id* +: RDMA identifier. + +# RETURN VALUE + +**rdma_establish()** returns 0 on success, or -1 on error. If an error occurs, errno will be set to indicate the failure reason. + +# SEE ALSO + +**rdma_connect**(3), +**rdma_disconnect**(3) +**rdma_get_cm_event**(3) + +# AUTHORS + +Danit Goldberg <danitg@mellanox.com> + +Yossi Itigin <yosefe@mellanox.com> + + + diff --git a/librdmacm/man/rdma_event_str.3 b/librdmacm/man/rdma_event_str.3 new file mode 100644 index 0000000..24c9e16 --- /dev/null +++ b/librdmacm/man/rdma_event_str.3 @@ -0,0 +1,18 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_EVENT_STR" 3 "2007-05-15" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_event_str \- Returns a string representation of an rdma cm event. +.SH SYNOPSIS +.B "#include <rdma/rdma_cma.h>" +.P +.B "char *" rdma_event_str +.BI "("enum rdma_cm_event_type " event ");" +.SH ARGUMENTS +.IP "event" 12 +Asynchronous event. +.SH "DESCRIPTION" +Returns a string representation of an asynchronous event. +.SH "RETURN VALUE" +Returns a pointer to a static character string corresponding to the event. +.SH "SEE ALSO" +rdma_get_cm_event(3) diff --git a/librdmacm/man/rdma_free_devices.3 b/librdmacm/man/rdma_free_devices.3 new file mode 100644 index 0000000..c46bcbf --- /dev/null +++ b/librdmacm/man/rdma_free_devices.3 @@ -0,0 +1,18 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_FREE_DEVICES" 3 "2007-05-15" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_free_devices \- Frees the list of devices returned by rdma_get_devices. +.SH SYNOPSIS +.B "#include <rdma/rdma_cma.h>" +.P +.B "void" rdma_free_devices +.BI "(struct ibv_context **" list ");" +.SH ARGUMENTS +.IP "list" 12 +List of devices returned from rdma_get_devices. 
+.SH "DESCRIPTION" +Frees the device array returned by rdma_get_devices. +.SH "RETURN VALUE" +None +.SH "SEE ALSO" +rdma_get_devices(3) diff --git a/librdmacm/man/rdma_get_cm_event.3 b/librdmacm/man/rdma_get_cm_event.3 new file mode 100644 index 0000000..2623eab --- /dev/null +++ b/librdmacm/man/rdma_get_cm_event.3 @@ -0,0 +1,168 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_GET_CM_EVENT" 3 "2007-10-31" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_get_cm_event \- Retrieves the next pending communication event. +.SH SYNOPSIS +.B "#include <rdma/rdma_cma.h>" +.P +.B "int" rdma_get_cm_event +.BI "(struct rdma_event_channel *" channel "," +.BI "struct rdma_cm_event **" event ");" +.SH ARGUMENTS +.IP "channel" 12 +Event channel to check for events. +.IP "event" 12 +Allocated information about the next communication event. +.SH "DESCRIPTION" +Retrieves a communication event. If no events are pending, by default, +the call will block until an event is received. +.SH "RETURN VALUE" +Returns 0 on success, or -1 on error. If an error occurs, errno will be +set to indicate the failure reason. +.SH "NOTES" +The default synchronous behavior of this routine can be changed by +modifying the file descriptor associated with the given channel. All +events that are reported must be acknowledged by calling rdma_ack_cm_event. +Destruction of an rdma_cm_id will block until related events have been +acknowledged. +.SH "EVENT DATA" +Communication event details are returned in the rdma_cm_event structure. +This structure is allocated by the rdma_cm and released by the +rdma_ack_cm_event routine. Details of the rdma_cm_event structure are +given below. +.IP "id" 12 +The rdma_cm identifier associated with the event. If the event type is +RDMA_CM_EVENT_CONNECT_REQUEST, then this references a new id for that +communication. +.IP "listen_id" 12 +For RDMA_CM_EVENT_CONNECT_REQUEST event types, this references the +corresponding listening request identifier. +.IP "event" 12 +Specifies the type of communication event which occurred. See EVENT TYPES +below. +.IP "status" 12 +Returns any asynchronous error information associated with an event. The +status is zero if the operation was successful, otherwise the status value +is non-zero and is either set to an errno or a transport specific value. +For details on transport specific status values, see the event type information +below. +.IP "param" 12 +Provides additional details based on the type of event. Users should +select the conn or ud subfields based on the rdma_port_space of the +rdma_cm_id associated with the event. See UD EVENT DATA and CONN EVENT +DATA below. +.SH "UD EVENT DATA" +Event parameters related to unreliable datagram (UD) services: RDMA_PS_UDP and +RDMA_PS_IPOIB. The UD event data is valid for RDMA_CM_EVENT_ESTABLISHED and +RDMA_CM_EVENT_MULTICAST_JOIN events, unless stated otherwise. +.IP "private_data" 12 +References any user-specified data associated with RDMA_CM_EVENT_CONNECT_REQUEST +or RDMA_CM_EVENT_ESTABLISHED events. The data referenced by this field matches +that specified by the remote side when calling rdma_connect or rdma_accept. +This field is NULL if the event does not include private data. The buffer +referenced by this pointer is deallocated when calling rdma_ack_cm_event. +.IP "private_data_len" 12 +The size of the private data buffer. Users should note that the size of +the private data buffer may be larger than the amount of private data +sent by the remote side. 
+zeroed out.
+.IP "ah_attr" 12
+Address information needed to send data to the remote endpoint(s).
+Users should use this structure when allocating their address handle.
+.IP "qp_num" 12
+QP number of the remote endpoint or multicast group.
+.IP "qkey" 12
+QKey needed to send data to the remote endpoint(s).
+.SH "CONN EVENT DATA"
+Event parameters related to connected QP services: RDMA_PS_TCP. The
+connection related event data is valid for RDMA_CM_EVENT_CONNECT_REQUEST
+and RDMA_CM_EVENT_ESTABLISHED events, unless stated otherwise.
+.IP "private_data" 12
+References any user-specified data associated with the event. The data
+referenced by this field matches that specified by the remote side when
+calling rdma_connect or rdma_accept. This field is NULL if the event
+does not include private data. The buffer referenced by this pointer is
+deallocated when calling rdma_ack_cm_event.
+.IP "private_data_len" 12
+The size of the private data buffer. Users should note that the size of
+the private data buffer may be larger than the amount of private data
+sent by the remote side. Any additional space in the buffer will be
+zeroed out.
+.IP "responder_resources" 12
+The number of responder resources requested of the recipient.
+This field matches the initiator depth specified by the remote node when
+calling rdma_connect and rdma_accept.
+.IP "initiator_depth" 12
+The maximum number of RDMA read/atomic operations
+that the recipient may have outstanding. This field matches the responder
+resources specified by the remote node when calling rdma_connect and
+rdma_accept.
+.IP "flow_control" 12
+Indicates if hardware-level flow control is provided by the sender.
+.IP "retry_count" 12
+For RDMA_CM_EVENT_CONNECT_REQUEST events only, indicates the number of times
+that the recipient should retry send operations.
+.IP "rnr_retry_count" 12
+The number of times that the recipient should retry receiver not ready (RNR)
+NACK errors.
+.IP "srq" 12
+Specifies if the sender is using a shared-receive queue.
+.IP "qp_num" 12
+Indicates the remote QP number for the connection.
+.SH "EVENT TYPES"
+The following types of communication events may be reported.
+.IP RDMA_CM_EVENT_ADDR_RESOLVED
+Address resolution (rdma_resolve_addr) completed successfully.
+.IP RDMA_CM_EVENT_ADDR_ERROR
+Address resolution (rdma_resolve_addr) failed.
+.IP RDMA_CM_EVENT_ROUTE_RESOLVED
+Route resolution (rdma_resolve_route) completed successfully.
+.IP RDMA_CM_EVENT_ROUTE_ERROR
+Route resolution (rdma_resolve_route) failed.
+.IP RDMA_CM_EVENT_CONNECT_REQUEST
+Generated on the passive side to notify the user of a new connection request.
+.IP RDMA_CM_EVENT_CONNECT_RESPONSE
+Generated on the active side to notify the user of a successful response
+to a connection request. It is only generated on rdma_cm_id's that do not
+have a QP associated with them.
+.IP RDMA_CM_EVENT_CONNECT_ERROR
+Indicates that an error has occurred while trying to establish a connection.
+May be generated on the active or passive side of a connection.
+.IP RDMA_CM_EVENT_UNREACHABLE
+Generated on the active side to notify the user that the remote server is
+not reachable or unable to respond to a connection request. If this event
+is generated in response to a UD QP resolution request over InfiniBand,
+the event status field will contain an errno, if negative, or otherwise the
+status result carried in the IB CM SIDR REP message.
+.IP RDMA_CM_EVENT_REJECTED
+Indicates that a connection request or response was rejected by the remote
+end point. The event status field will contain the transport specific
+reject reason if available. Under InfiniBand, this is the reject reason
+carried in the IB CM REJ message.
+.IP RDMA_CM_EVENT_ESTABLISHED
+Indicates that a connection has been established with the remote end point.
+.IP RDMA_CM_EVENT_DISCONNECTED
+The connection has been disconnected.
+.IP RDMA_CM_EVENT_DEVICE_REMOVAL
+The local RDMA device associated with the rdma_cm_id has been removed.
+Upon receiving this event, the user must destroy the related rdma_cm_id.
+.IP RDMA_CM_EVENT_MULTICAST_JOIN
+The multicast join operation (rdma_join_multicast) completed successfully.
+.IP RDMA_CM_EVENT_MULTICAST_ERROR
+An error occurred either while joining a multicast group or, if the group had
+already been joined, on an existing group. The specified multicast group is
+no longer accessible and should be rejoined, if desired.
+.IP RDMA_CM_EVENT_ADDR_CHANGE
+The network device associated with this ID through address resolution changed
+its HW address, e.g. following a bonding failover. This event can serve as a
+hint for applications that want the links used for their RDMA sessions to
+align with the network stack.
+.IP RDMA_CM_EVENT_TIMEWAIT_EXIT
+The QP associated with a connection has exited its timewait state and is now
+ready to be re-used. After a QP has been disconnected, it is maintained in
+a timewait state to allow any in-flight packets to exit the network. After
+the timewait state has completed, the rdma_cm will report this event.
+.SH "SEE ALSO"
+rdma_ack_cm_event(3), rdma_create_event_channel(3), rdma_resolve_addr(3),
+rdma_resolve_route(3), rdma_connect(3), rdma_listen(3), rdma_join_multicast(3),
+rdma_destroy_id(3), rdma_event_str(3)
diff --git a/librdmacm/man/rdma_get_devices.3 b/librdmacm/man/rdma_get_devices.3
new file mode 100644
index 0000000..be41514
--- /dev/null
+++ b/librdmacm/man/rdma_get_devices.3
@@ -0,0 +1,24 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH "RDMA_GET_DEVICES" 3 "2007-05-15" "librdmacm" "Librdmacm Programmer's Manual" librdmacm
+.SH NAME
+rdma_get_devices \- Get a list of RDMA devices currently available.
+.SH SYNOPSIS
+.B "#include <rdma/rdma_cma.h>"
+.P
+.B "struct ibv_context **" rdma_get_devices
+.BI "(int *" num_devices ");"
+.SH ARGUMENTS
+.IP "num_devices" 12
+If non-NULL, set to the number of devices returned.
+.SH "DESCRIPTION"
+Return a NULL-terminated array of opened RDMA devices. Callers can use
+this routine to allocate resources on specific RDMA devices that will be
+shared across multiple rdma_cm_id's.
+.SH "RETURN VALUE"
+Returns an array of available RDMA devices, or NULL if the request
+fails. On failure, errno will be set to indicate the failure reason.
+.SH "NOTES"
+The returned array must be released by calling rdma_free_devices. Devices
+remain open while the librdmacm is loaded.
+.SH "SEE ALSO"
+rdma_free_devices(3)
diff --git a/librdmacm/man/rdma_get_dst_port.3 b/librdmacm/man/rdma_get_dst_port.3
new file mode 100644
index 0000000..1863783
--- /dev/null
+++ b/librdmacm/man/rdma_get_dst_port.3
@@ -0,0 +1,21 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH "RDMA_GET_DST_PORT" 3 "2007-05-15" "librdmacm" "Librdmacm Programmer's Manual" librdmacm
+.SH NAME
+rdma_get_dst_port \- Returns the remote port number of a bound rdma_cm_id.
+.SH SYNOPSIS
+.B "#include <rdma/rdma_cma.h>"
+.P
+.B "uint16_t" rdma_get_dst_port
+.BI "(struct rdma_cm_id *" id ");"
+.SH ARGUMENTS
+.IP "id" 12
+RDMA identifier.
+.SH "DESCRIPTION"
+Returns the remote port number for an rdma_cm_id that has been bound to
+a remote address.
+.SH "RETURN VALUE"
+Returns the 16-bit port identifier associated with the peer endpoint. If
+the rdma_cm_id is not connected, the returned value is 0.
+.SH "SEE ALSO"
+rdma_connect(3), rdma_accept(3), rdma_get_cm_event(3), rdma_get_src_port(3),
+rdma_get_local_addr(3), rdma_get_peer_addr(3)
diff --git a/librdmacm/man/rdma_get_local_addr.3 b/librdmacm/man/rdma_get_local_addr.3
new file mode 100644
index 0000000..eeaa714
--- /dev/null
+++ b/librdmacm/man/rdma_get_local_addr.3
@@ -0,0 +1,22 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH "RDMA_GET_LOCAL_ADDR" 3 "2007-05-15" "librdmacm" "Librdmacm Programmer's Manual" librdmacm
+.SH NAME
+rdma_get_local_addr \- Returns the local IP address of a bound rdma_cm_id.
+.SH SYNOPSIS
+.B "#include <rdma/rdma_cma.h>"
+.P
+.B "struct sockaddr *" rdma_get_local_addr
+.BI "(struct rdma_cm_id *" id ");"
+.SH ARGUMENTS
+.IP "id" 12
+RDMA identifier.
+.SH "DESCRIPTION"
+Returns the local IP address for an rdma_cm_id that has been bound to
+a local device.
+.SH "RETURN VALUE"
+Returns a pointer to the local sockaddr address of the rdma_cm_id. If
+the rdma_cm_id is not bound to an address, the contents of the sockaddr
+structure will be set to all zeroes.
+.SH "SEE ALSO"
+rdma_bind_addr(3), rdma_resolve_addr(3), rdma_get_src_port(3),
+rdma_get_dst_port(3), rdma_get_peer_addr(3)
diff --git a/librdmacm/man/rdma_get_peer_addr.3 b/librdmacm/man/rdma_get_peer_addr.3
new file mode 100644
index 0000000..ff8ce4e
--- /dev/null
+++ b/librdmacm/man/rdma_get_peer_addr.3
@@ -0,0 +1,21 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH "RDMA_GET_PEER_ADDR" 3 "2007-05-15" "librdmacm" "Librdmacm Programmer's Manual" librdmacm
+.SH NAME
+rdma_get_peer_addr \- Returns the remote IP address of a bound rdma_cm_id.
+.SH SYNOPSIS
+.B "#include <rdma/rdma_cma.h>"
+.P
+.B "struct sockaddr *" rdma_get_peer_addr
+.BI "(struct rdma_cm_id *" id ");"
+.SH ARGUMENTS
+.IP "id" 12
+RDMA identifier.
+.SH "DESCRIPTION"
+Returns the remote IP address associated with an rdma_cm_id.
+.SH "RETURN VALUE"
+Returns a pointer to the sockaddr address of the connected peer. If
+the rdma_cm_id is not connected, the contents of the sockaddr
+structure will be set to all zeroes.
+.SH "SEE ALSO"
+rdma_resolve_addr(3), rdma_get_src_port(3), rdma_get_dst_port(3),
+rdma_get_local_addr(3)
diff --git a/librdmacm/man/rdma_get_recv_comp.3 b/librdmacm/man/rdma_get_recv_comp.3
new file mode 100644
index 0000000..6894e5d
--- /dev/null
+++ b/librdmacm/man/rdma_get_recv_comp.3
@@ -0,0 +1,32 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH "RDMA_GET_RECV_COMP" 3 "2010-07-19" "librdmacm" "Librdmacm Programmer's Manual" librdmacm
+.SH NAME
+rdma_get_recv_comp \- retrieves a completed receive request.
+.SH SYNOPSIS
+.B "#include <rdma/rdma_verbs.h>"
+.P
+.B "int" rdma_get_recv_comp
+.BI "(struct rdma_cm_id *" id ","
+.BI "struct ibv_wc *" wc ");"
+.SH ARGUMENTS
+.IP "id" 12
+A reference to a communication identifier to check for completions.
+.IP "wc" 12
+A reference to a work completion structure to fill in.
+.SH "DESCRIPTION"
+Retrieves a completed work request for a receive
+operation. Information about the completed request is returned through
+the wc parameter, with the wr_id set to the context of the request. For
+details on the work completion structure, see ibv_poll_cq.
+.SH "RETURN VALUE"
+Returns the number of returned completions (0 or 1) on success, or -1 on error.
+If an error occurs, errno will be set to indicate the failure reason.
+.SH "NOTES"
+This call polls the receive completion queue associated with an rdma_cm_id.
+If a completion is not found, the call blocks until a request completes.
+This call should only be used on rdma_cm_id's that do not share CQs
+with other rdma_cm_id's, and that maintain separate CQs for send and receive
+completions.
+.SH "SEE ALSO"
+rdma_cm(7), ibv_poll_cq(3), rdma_get_send_comp(3),
+rdma_post_send(3), rdma_post_read(3), rdma_post_write(3)
diff --git a/librdmacm/man/rdma_get_request.3 b/librdmacm/man/rdma_get_request.3
new file mode 100644
index 0000000..86cb610
--- /dev/null
+++ b/librdmacm/man/rdma_get_request.3
@@ -0,0 +1,35 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH "RDMA_GET_REQUEST" 3 "2007-10-31" "librdmacm" "Librdmacm Programmer's Manual" librdmacm
+.SH NAME
+rdma_get_request \- Retrieves the next pending connection request event.
+.SH SYNOPSIS
+.B "#include <rdma/rdma_cma.h>"
+.P
+.B "int" rdma_get_request
+.BI "(struct rdma_cm_id *" listen ","
+.BI "struct rdma_cm_id **" id ");"
+.SH ARGUMENTS
+.IP "listen" 12
+Listening rdma_cm_id.
+.IP "id" 12
+rdma_cm_id associated with the new connection.
+.SH "DESCRIPTION"
+Retrieves a connection request event. If no requests are pending,
+the call will block until an event is received.
+.SH "RETURN VALUE"
+Returns 0 on success, or -1 on error. If an error occurs, errno will be
+set to indicate the failure reason.
+.SH "NOTES"
+This call may only be used on listening rdma_cm_id's operating
+synchronously. On success, a new rdma_cm_id representing the
+connection request will be returned to the user. The new rdma_cm_id
+will reference event information associated with the request until
+the user calls rdma_reject, rdma_accept, or rdma_destroy_id on the
+newly created identifier. For a description of the event data,
+see rdma_get_cm_event.
+.P
+If QP attributes are associated with the listening endpoint, the
+returned rdma_cm_id will also reference an allocated QP.
+.SH "SEE ALSO"
+rdma_get_cm_event(3), rdma_accept(3), rdma_reject(3),
+rdma_connect(3), rdma_listen(3), rdma_destroy_id(3)
diff --git a/librdmacm/man/rdma_get_send_comp.3 b/librdmacm/man/rdma_get_send_comp.3
new file mode 100644
index 0000000..5085ddb
--- /dev/null
+++ b/librdmacm/man/rdma_get_send_comp.3
@@ -0,0 +1,32 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH "RDMA_GET_SEND_COMP" 3 "2010-07-19" "librdmacm" "Librdmacm Programmer's Manual" librdmacm
+.SH NAME
+rdma_get_send_comp \- retrieves a completed send, read, or write request.
+.SH SYNOPSIS
+.B "#include <rdma/rdma_verbs.h>"
+.P
+.B "int" rdma_get_send_comp
+.BI "(struct rdma_cm_id *" id ","
+.BI "struct ibv_wc *" wc ");"
+.SH ARGUMENTS
+.IP "id" 12
+A reference to a communication identifier to check for completions.
+.IP "wc" 12
+A reference to a work completion structure to fill in.
+.SH "DESCRIPTION"
+Retrieves a completed work request for a send, RDMA read, or RDMA write
+operation. Information about the completed request is returned through
+the wc parameter, with the wr_id set to the context of the request. For
+details on the work completion structure, see ibv_poll_cq.
+.SH "RETURN VALUE"
+Returns the number of returned completions (0 or 1) on success, or -1 on error.
+If an error occurs, errno will be set to indicate the failure reason.
+.SH "NOTES"
+This call polls the send completion queue associated with an rdma_cm_id.
+If a completion is not found, the call blocks until a request completes.
+This call should only be used on rdma_cm_id's that do not share CQs
+with other rdma_cm_id's, and that maintain separate CQs for send and receive
+completions.
+.SH "SEE ALSO"
+rdma_cm(7), ibv_poll_cq(3), rdma_get_recv_comp(3),
+rdma_post_send(3), rdma_post_read(3), rdma_post_write(3)
diff --git a/librdmacm/man/rdma_get_src_port.3 b/librdmacm/man/rdma_get_src_port.3
new file mode 100644
index 0000000..1e36a92
--- /dev/null
+++ b/librdmacm/man/rdma_get_src_port.3
@@ -0,0 +1,21 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH "RDMA_GET_SRC_PORT" 3 "2007-05-15" "librdmacm" "Librdmacm Programmer's Manual" librdmacm
+.SH NAME
+rdma_get_src_port \- Returns the local port number of a bound rdma_cm_id.
+.SH SYNOPSIS
+.B "#include <rdma/rdma_cma.h>"
+.P
+.B "uint16_t" rdma_get_src_port
+.BI "(struct rdma_cm_id *" id ");"
+.SH ARGUMENTS
+.IP "id" 12
+RDMA identifier.
+.SH "DESCRIPTION"
+Returns the local port number for an rdma_cm_id that has been bound to
+a local address.
+.SH "RETURN VALUE"
+Returns the 16-bit port identifier associated with the local endpoint. If
+the rdma_cm_id is not bound to a port, the returned value is 0.
+.SH "SEE ALSO"
+rdma_bind_addr(3), rdma_resolve_addr(3), rdma_get_dst_port(3),
+rdma_get_local_addr(3), rdma_get_peer_addr(3)
diff --git a/librdmacm/man/rdma_getaddrinfo.3 b/librdmacm/man/rdma_getaddrinfo.3
new file mode 100644
index 0000000..3ad6393
--- /dev/null
+++ b/librdmacm/man/rdma_getaddrinfo.3
@@ -0,0 +1,105 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH "RDMA_GETADDRINFO" 3 "2010-07-19" "librdmacm" "Librdmacm Programmer's Manual" librdmacm
+.SH NAME
+rdma_getaddrinfo \- Provides transport independent address translation.
+.SH SYNOPSIS
+.B "#include <rdma/rdma_cma.h>"
+.P
+.B "int" rdma_getaddrinfo
+.BI "(const char *" node ","
+.BI "const char *" service ","
+.BI "const struct rdma_addrinfo *" hints ","
+.BI "struct rdma_addrinfo **" res ");"
+.SH ARGUMENTS
+.IP "node" 12
+Optional name, dotted-decimal IPv4 address, or IPv6 hex address to resolve.
+.IP "service" 12
+Service name or port number of address.
+.IP "hints" 12
+Reference to an rdma_addrinfo structure containing hints about the type
+of service the caller supports.
+.IP "res" 12
+A pointer to a linked list of rdma_addrinfo structures containing response
+information.
+.SH "DESCRIPTION"
+Resolves the destination node and service address and returns
+information needed to establish communication. Provides the
+RDMA functional equivalent to getaddrinfo.
+.SH "RETURN VALUE"
+Returns 0 on success, or -1 on error. If an error occurs, errno will be
+set to indicate the failure reason.
+.SH "NOTES"
+Either node, service, or hints must be provided. If hints are provided, the
+operation will be controlled by hints.ai_flags. If RAI_PASSIVE is
+specified, the call will resolve address information for use on the
+passive side of a connection.
+If node is provided, rdma_getaddrinfo will attempt to resolve the RDMA address,
+route, and connection data to the given node. The hints parameter, if provided,
+may be used to control the resulting output as indicated below.
+If node is not given, rdma_getaddrinfo will attempt to resolve the RDMA addressing
+information based on the hints.ai_src_addr, hints.ai_dst_addr, or hints.ai_route.
+.SH "rdma_addrinfo"
+.IP "ai_flags" 12
+Hint flags that control the operation. Supported flags are:
+.IP "RAI_PASSIVE" 12
+Indicates that the results will be used on the passive/listening
+side of a connection.
+.IP "RAI_NUMERICHOST" 12
+If specified, then the node parameter, if provided, must be a numerical
+network address. This flag suppresses any lengthy address resolution.
+.IP "RAI_NOROUTE" 12
+If set, this flag suppresses any lengthy route resolution.
+.IP "RAI_FAMILY" 12
+If set, the ai_family setting should be used as an input hint for interpreting
+the node parameter.
+.IP "ai_family" 12
+Address family for the source and destination address. Supported families
+are: AF_INET, AF_INET6, and AF_IB.
+.IP "ai_qp_type" 12
+Indicates the type of RDMA QP used for communication. Supported types are:
+IBV_QPT_UD (unreliable datagram) and IBV_QPT_RC (reliable connected).
+.IP "ai_port_space" 12
+RDMA port space in use. Supported values are: RDMA_PS_UDP, RDMA_PS_TCP,
+and RDMA_PS_IB.
+.IP "ai_src_len" 12
+The length of the source address referenced by ai_src_addr. This will be 0
+if an appropriate source address could not be discovered for a given
+destination.
+.IP "ai_dst_len" 12
+The length of the destination address referenced by ai_dst_addr. This
+will be 0 if the RAI_PASSIVE flag was specified as part of the hints.
+.IP "ai_src_addr" 12
+If provided, the address for the local RDMA device.
+.IP "ai_dst_addr" 12
+If provided, the address for the destination RDMA device.
+.IP "ai_src_canonname" 12
+The canonical name for the source.
+.IP "ai_dst_canonname" 12
+The canonical name for the destination.
+.IP "ai_route_len" 12
+Size of the routing information buffer referenced by ai_route. This will
+be 0 if the underlying transport does not require routing data, or if none
+could be resolved.
+.IP "ai_route" 12
+Routing information for RDMA transports that require routing data as part
+of connection establishment. The format of the routing data depends on
+the underlying transport. If InfiniBand transports are
+used, ai_route will reference an array of struct ibv_path_data on output,
+if routing data is available. Routing paths may be restricted by setting
+desired routing data fields on input to rdma_getaddrinfo. For InfiniBand,
+hints.ai_route may reference an array of struct ibv_path_record or
+struct ibv_path_data on input.
+.IP "ai_connect_len" 12
+Size of connection information referenced by ai_connect. This will be
+0 if the underlying transport does not require additional connection
+information.
+.IP "ai_connect" 12
+Data exchanged as part of the connection establishment process. If provided,
+ai_connect data must be transferred as private data, with any user supplied
+private data following it.
+.IP "ai_next" 12
+Pointer to the next rdma_addrinfo structure in the list. Will be NULL
+if no more structures exist.
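+.SH "EXAMPLE"
+A minimal sketch of resolving address information for a listening (passive)
+endpoint. The port number "7471" is an arbitrary example and error handling
+is abbreviated.
+.P
+.nf
+struct rdma_addrinfo hints, *res;
+
+memset(&hints, 0, sizeof hints);
+hints.ai_flags = RAI_PASSIVE;       /* resolve for the passive side */
+hints.ai_port_space = RDMA_PS_TCP;
+hints.ai_qp_type = IBV_QPT_RC;
+
+if (rdma_getaddrinfo(NULL, "7471", &hints, &res))
+        return;                     /* errno describes the failure */
+
+/* res->ai_src_addr may now be passed to rdma_create_ep or
+   rdma_bind_addr; release the list when done. */
+rdma_freeaddrinfo(res);
+.fi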
+.SH "SEE ALSO" +rdma_create_id(3), rdma_resolve_route(3), rdma_connect(3), rdma_create_qp(3), +rdma_bind_addr(3), rdma_create_ep(3) diff --git a/librdmacm/man/rdma_init_qp_attr.3.md b/librdmacm/man/rdma_init_qp_attr.3.md new file mode 100644 index 0000000..99e812a --- /dev/null +++ b/librdmacm/man/rdma_init_qp_attr.3.md @@ -0,0 +1,54 @@ +--- +date: 2018-12-31 +footer: librdmacm +header: "Librdmacm Programmer's Manual" +layout: page +license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md' +section: 3 +title: RDMA_INIT_QP_ATTR +--- + +# NAME + +rdma_init_qp_attr - Returns qp attributes of a rdma_cm_id. + +# SYNOPSIS + +```c +#include <rdma/rdma_cma.h> + +int rdma_init_qp_attr(struct rdma_cm_id *id, + struct ibv_qp_attr *qp_attr, + int *qp_attr_mask); +``` +# DESCRIPTION + +**rdma_init_qp_attr()** returns qp attributes of a rdma_cm_id. + +Information about qp attributes and qp attributes mask is returned through the *qp_attr* and *qp_attr_mask* parameters. + +For details on the qp_attr structure, see ibv_modify_qp. + +# ARGUMENTS + +*id* +: RDMA identifier. + +*qp_attr* +: A reference to a qp attributes struct containing response information. + +*qp_attr_mask* +: A reference to a qp attributes mask containing response information. + +# RETURN VALUE + +**rdma_init_qp_attr()** returns 0 on success, or -1 on error. If an error occurs, errno will be set to indicate the failure reason. + +# SEE ALSO + +**rdma_cm**(7), +**ibv_modify_qp**(3) + +# AUTHOR + +Danit Goldberg <danitg@mellanox.com> diff --git a/librdmacm/man/rdma_join_multicast.3 b/librdmacm/man/rdma_join_multicast.3 new file mode 100644 index 0000000..4e52e85 --- /dev/null +++ b/librdmacm/man/rdma_join_multicast.3 @@ -0,0 +1,39 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_JOIN_MULTICAST" 3 "2008-01-02" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_join_multicast \- Joins a multicast group. +.SH SYNOPSIS +.B "#include <rdma/rdma_cma.h>" +.P +.B "int" rdma_join_multicast +.BI "(struct rdma_cm_id *" id "," +.BI "struct sockaddr *" addr "," +.BI "void *" context ");" +.SH ARGUMENTS +.IP "id" 12 +Communication identifier associated with the request. +.IP "addr" 12 +Multicast address identifying the group to join. +.IP "context" 12 +User-defined context associated with the join request. +.SH "DESCRIPTION" +Joins a multicast group and attaches an associated QP to the group. +.SH "RETURN VALUE" +Returns 0 on success, or -1 on error. If an error occurs, errno will be +set to indicate the failure reason. +.SH "NOTES" +Before joining a multicast group, the rdma_cm_id must be bound to +an RDMA device by calling rdma_bind_addr or rdma_resolve_addr. Use of +rdma_resolve_addr requires the local routing tables to resolve the +multicast address to an RDMA device, unless a specific source address +is provided. The user must call rdma_leave_multicast to leave the +multicast group and release any multicast resources. After the join +operation completes, if a QP is associated with the rdma_cm_id, +it is automatically attached to the multicast group when the multicast +event is retrieved by the user. Otherwise, the user is responsible +for calling ibv_attach_mcast to bind the QP to the multicast group. +The join context is returned to the user through the private_data +field in the rdma_cm_event. 
+.SH "SEE ALSO" +rdma_leave_multicast(3), rdma_bind_addr(3), rdma_resolve_addr(3), rdma_create_qp(3), +rdma_get_cm_event(3) diff --git a/librdmacm/man/rdma_join_multicast_ex.3 b/librdmacm/man/rdma_join_multicast_ex.3 new file mode 100644 index 0000000..9e593f3 --- /dev/null +++ b/librdmacm/man/rdma_join_multicast_ex.3 @@ -0,0 +1,66 @@ +.TH "RDMA_JOIN_MULTICAST_EX" 3 "2017-11-17" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_join_multicast_ex \- Joins a multicast group with extended options. +.SH SYNOPSIS +.B "#include <rdma/rdma_cma.h>" +.P +.B "int" rdma_join_multicast_ex +.BI "(struct rdma_cm_id *" id "," +.BI "struct rdma_cm_join_mc_attr_ex *" mc_join_attr "," +.BI "void *" context ");" +.SH ARGUMENTS +.IP "id" 20 +Communication identifier associated with the request. +.IP "mc_join_attr" 20 +Is an rdma_cm_join_mc_attr_ex struct, as defined in <rdma/rdma_cma.h>. +.IP "context" 20 +User-defined context associated with the join request. +.SH "DESCRIPTION" +Joins a multicast group (MCG) with extended options. +Currently supporting MC join with a specified join flag. +.P +.nf +struct rdma_cm_join_mc_attr_ex { +.in +8 +uint32_t comp_mask; /* Bitwise OR between "rdma_cm_join_mc_attr_mask" enum */ +uint32_t join_flags; /* Use a single flag from "rdma_cm_mc_join_flags" enum */ +struct sockaddr *addr; /* Multicast address identifying the group to join */ +.in -8 +}; +.fi +.P +The supported join flags are: +.P +.B RDMA_MC_JOIN_FLAG_FULLMEMBER +- Create multicast group, Send multicast messages to MCG, Receive multicast messages from MCG. +.P +.B RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER +- Create multicast group, Send multicast messages to MCG, Don't receive multicast messages from MCG (send-only). +.P +Initiating a MC join as "Send Only Full Member" on InfiniBand requires SM support, otherwise joining will fail. +.P +Initiating a MC join as "Send Only Full Member" on RoCEv2/ETH will not send any IGMP messages unlike a Full Member MC join. +When "Send Only Full Member" is used the QP will not be attached to the MCG. +.P +.SH "RETURN VALUE" +Returns 0 on success, or -1 on error. If an error occurs, errno will be +set to indicate the failure reason. +.SH "NOTES" +Before joining a multicast group, the rdma_cm_id must be bound to +an RDMA device by calling rdma_bind_addr or rdma_resolve_addr. Use of +rdma_resolve_addr requires the local routing tables to resolve the +multicast address to an RDMA device, unless a specific source address +is provided. The user must call rdma_leave_multicast to leave the +multicast group and release any multicast resources. After the join +operation completes, if a QP is associated with the rdma_cm_id, +it is automatically attached to the multicast group when the multicast +event is retrieved by the user. Otherwise, the user is responsible +for calling ibv_attach_mcast to bind the QP to the multicast group. +The join context is returned to the user through the private_data +field in the rdma_cm_event. 
+.SH "SEE ALSO" +rdma_join_multicast(3), rdma_leave_multicast(3), rdma_bind_addr(3), rdma_resolve_addr(3), rdma_create_qp(3), +rdma_get_cm_event(3) +.SH "AUTHORS" +.TP +Alex Vesker <valex@mellanox.com> diff --git a/librdmacm/man/rdma_leave_multicast.3 b/librdmacm/man/rdma_leave_multicast.3 new file mode 100644 index 0000000..9e112d3 --- /dev/null +++ b/librdmacm/man/rdma_leave_multicast.3 @@ -0,0 +1,28 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_LEAVE_MULTICAST" 3 "2007-05-15" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_leave_multicast \- Leaves a multicast group. +.SH SYNOPSIS +.B "#include <rdma/rdma_cma.h>" +.P +.B "int" rdma_leave_multicast +.BI "(struct rdma_cm_id *" id "," +.BI "struct sockaddr *" addr ");" +.SH ARGUMENTS +.IP "id" 12 +Communication identifier associated with the request. +.IP "addr" 12 +Multicast address identifying the group to leave. +.SH "DESCRIPTION" +Leaves a multicast group and detaches an associated QP from the group. +.SH "RETURN VALUE" +Returns 0 on success, or -1 on error. If an error occurs, errno will be +set to indicate the failure reason. +.SH "NOTES" +Calling this function before a group has been fully joined results in +canceling the join operation. Users should be aware that messages +received from the multicast group may stilled be queued for +completion processing immediately after leaving a multicast group. +Destroying an rdma_cm_id will automatically leave all multicast groups. +.SH "SEE ALSO" +rdma_join_multicast(3), rdma_destroy_qp(3) diff --git a/librdmacm/man/rdma_listen.3 b/librdmacm/man/rdma_listen.3 new file mode 100644 index 0000000..78fd270 --- /dev/null +++ b/librdmacm/man/rdma_listen.3 @@ -0,0 +1,32 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_LISTEN" 3 "2007-05-15" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_listen \- Listen for incoming connection requests. +.SH SYNOPSIS +.B "#include <rdma/rdma_cma.h>" +.P +.B "int" rdma_listen +.BI "(struct rdma_cm_id *" id "," +.BI "int " backlog ");" +.SH ARGUMENTS +.IP "id" 12 +RDMA identifier. +.IP "backlog" 12 +backlog of incoming connection requests. +.SH "DESCRIPTION" +Initiates a listen for incoming connection requests or datagram service +lookup. The listen will be restricted to the locally bound source +address. +.SH "RETURN VALUE" +Returns 0 on success, or -1 on error. If an error occurs, errno will be +set to indicate the failure reason. +.SH "NOTES" +Users must have bound the rdma_cm_id to a local address by calling +rdma_bind_addr before calling this routine. If the rdma_cm_id is +bound to a specific IP address, the listen will be restricted to that +address and the associated RDMA device. If the rdma_cm_id is bound +to an RDMA port number only, the listen will occur across all RDMA +devices. +.SH "SEE ALSO" +rdma_cm(7), rdma_bind_addr(3), rdma_connect(3), rdma_accept(3), rdma_reject(3), +rdma_get_cm_event(3) diff --git a/librdmacm/man/rdma_migrate_id.3 b/librdmacm/man/rdma_migrate_id.3 new file mode 100644 index 0000000..f0f7bce --- /dev/null +++ b/librdmacm/man/rdma_migrate_id.3 @@ -0,0 +1,35 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_MIGRATE_ID" 3 "2007-11-13" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_migrate_id \- Move a communication identifier to a different event channel. 
+.SH SYNOPSIS
+.B "#include <rdma/rdma_cma.h>"
+.P
+.B "int" rdma_migrate_id
+.BI "(struct rdma_cm_id *" id ","
+.BI "struct rdma_event_channel *" channel ");"
+.SH ARGUMENTS
+.IP "id" 12
+An existing communication identifier to migrate.
+.IP "channel" 12
+The communication channel that events associated with the
+rdma_cm_id will be reported on. May be NULL.
+.SH "DESCRIPTION"
+Migrates a communication identifier to a different event channel.
+.SH "RETURN VALUE"
+Returns 0 on success, or -1 on error. If an error occurs, errno will be
+set to indicate the failure reason.
+.SH "NOTES"
+This routine migrates a communication identifier to the specified event
+channel and moves any pending events associated with the rdma_cm_id
+to the new channel. Users should not poll for events on the
+rdma_cm_id's current event channel or invoke other routines on the
+rdma_cm_id while migrating between channels. This call will block while
+there are any unacknowledged events on the current event channel.
+.P
+If the channel parameter is NULL, the specified rdma_cm_id will be
+placed into synchronous operation mode. All calls on the id
+will block until the operation completes.
+.SH "SEE ALSO"
+rdma_cm(7), rdma_create_event_channel(3), rdma_create_id(3),
+rdma_get_cm_event(3)
diff --git a/librdmacm/man/rdma_notify.3 b/librdmacm/man/rdma_notify.3
new file mode 100644
index 0000000..c80733a
--- /dev/null
+++ b/librdmacm/man/rdma_notify.3
@@ -0,0 +1,39 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH "RDMA_NOTIFY" 3 "2007-05-15" "librdmacm" "Librdmacm Programmer's Manual" librdmacm
+.SH NAME
+rdma_notify \- Notifies the librdmacm of an asynchronous event.
+.SH SYNOPSIS
+.B "#include <rdma/rdma_cma.h>"
+.P
+.B "int" rdma_notify
+.BI "(struct rdma_cm_id *" id ","
+.BI "enum ibv_event_type " event ");"
+.SH ARGUMENTS
+.IP "id" 12
+RDMA identifier.
+.IP "event" 12
+Asynchronous event.
+.SH "DESCRIPTION"
+Used to notify the librdmacm of asynchronous events that have occurred
+on a QP associated with the rdma_cm_id.
+.SH "RETURN VALUE"
+Returns 0 on success, or -1 on error. If an error occurs, errno will be
+set to indicate the failure reason. If errno is set to EISCONN
+(transport endpoint is already connected), this indicates that
+the underlying communication manager established the connection before
+the call to rdma_notify could be processed. In this case, the error may
+safely be ignored.
+.SH "NOTES"
+Asynchronous events that occur on a QP are reported through the user's
+device event handler. This routine is used to notify the librdmacm of
+communication events. In most cases, use of this routine is not
+necessary; however, if connection establishment is done out of band
+(such as through InfiniBand), it is possible to receive data on a
+QP that is not yet considered connected. This routine forces the
+connection into an established state in this case in order to handle
+the rare situation where the connection never forms on its own.
+Calling this routine ensures the delivery of the RDMA_CM_EVENT_ESTABLISHED
+event to the application.
+Events that should be reported to the CM are: IBV_EVENT_COMM_EST.
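+.SH "EXAMPLE"
+A minimal sketch of forwarding a communication established event from the
+device's asynchronous event queue to the librdmacm. It assumes that id
+references a connecting rdma_cm_id with an associated QP; error handling
+is abbreviated, and a returned EISCONN may safely be ignored as described
+above.
+.P
+.nf
+struct ibv_async_event aevent;
+
+if (ibv_get_async_event(id->verbs, &aevent) == 0) {
+        if (aevent.event_type == IBV_EVENT_COMM_EST)
+                rdma_notify(id, IBV_EVENT_COMM_EST);
+        ibv_ack_async_event(&aevent);
+}
+.fi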
+.SH "SEE ALSO" +rdma_connect(3), rdma_accept(3), rdma_listen(3) diff --git a/librdmacm/man/rdma_post_read.3 b/librdmacm/man/rdma_post_read.3 new file mode 100644 index 0000000..11b3e09 --- /dev/null +++ b/librdmacm/man/rdma_post_read.3 @@ -0,0 +1,56 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_POST_READ" 3 "2010-07-19" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_post_read \- post an RDMA read work request. +.SH SYNOPSIS +.B "#include <rdma/rdma_verbs.h>" +.P +.B "int" rdma_post_read +.BI "(struct rdma_cm_id *" id "," +.BI "void *" context "," +.BI "void *" addr "," +.BI "size_t " length "," +.BI "struct ibv_mr *" mr "," +.BI "int " flags "," +.BI "uint64_t " remote_addr "," +.BI "uint32_t " rkey ");" +.SH ARGUMENTS +.IP "id" 12 +A reference to a communication identifier where the request +will be posted. +.IP "context" 12 +User-defined context associated with the request. +.IP "addr" 12 +The address of the local destination of the read request. +.IP "length" 12 +The length of the read operation. +.IP "mr" 12 +Registered memory region associated with the local buffer. +.IP "flags" 12 +Optional flags used to control the read operation. +.IP "remote_addr" 12 +The address of the remote registered memory to read from. +.IP "rkey" 12 +The registered memory key associated with the remote address. +.SH "DESCRIPTION" +Posts a work request to the send queue of the queue pair associated +with the rdma_cm_id. The contents of the remote memory region will be +read into the local data buffer. +.SH "RETURN VALUE" +Returns 0 on success, or -1 on error. If an error occurs, errno will be +set to indicate the failure reason. +.SH "NOTES" +For a list of supported flags, see ibv_post_send. +Both the remote and local data buffers must have been registered +before the read is issued, and the buffers must remain registered +until the read completes. +.P +Read operations may not be posted to an rdma_cm_id or the corresponding +queue pair until it has been connected. +.P +The user-defined context associated with the read request will be +returned to the user through the work completion wr_id, work request +identifier, field. +.SH "SEE ALSO" +rdma_cm(7), rdma_connect(3), rdma_accept(3), +ibv_post_send(3), rdma_post_readv(3), rdma_reg_read(3), rdma_reg_msgs(3) diff --git a/librdmacm/man/rdma_post_readv.3 b/librdmacm/man/rdma_post_readv.3 new file mode 100644 index 0000000..95e4155 --- /dev/null +++ b/librdmacm/man/rdma_post_readv.3 @@ -0,0 +1,53 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_POST_READV" 3 "2010-07-19" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_post_readv \- post an RDMA read work request. +.SH SYNOPSIS +.B "#include <rdma/rdma_verbs.h>" +.P +.B "int" rdma_post_readv +.BI "(struct rdma_cm_id *" id "," +.BI "void *" context "," +.BI "struct ibv_sge *" sgl "," +.BI "int " nsge "," +.BI "int " flags "," +.BI "uint64_t " remote_addr "," +.BI "uint32_t " rkey ");" +.SH ARGUMENTS +.IP "id" 12 +A reference to a communication identifier where the request +will be posted. +.IP "context" 12 +User-defined context associated with the request. +.IP "sgl" 12 +A scatter-gather list of the destination buffers of the read. +.IP "nsge" 12 +The number of scatter-gather array entries. +.IP "flags" 12 +Optional flags used to control the read operation. +.IP "remote_addr" 12 +The address of the remote registered memory to read from. 
+.IP "rkey" 12 +The registered memory key associated with the remote address. +.SH "DESCRIPTION" +Posts a work request to the send queue of the queue pair associated +with the rdma_cm_id. The contents of the remote memory region will be +read into the local data buffers. +.SH "RETURN VALUE" +Returns 0 on success, or -1 on error. If an error occurs, errno will be +set to indicate the failure reason. +.SH "NOTES" +For a list of supported flags, see ibv_post_send. +Both the remote and local data buffers must have been registered +before the read is issued, and the buffers must remain registered +until the read completes. +.P +Read operations may not be posted to an rdma_cm_id or the corresponding +queue pair until it has been connected. +.P +The user-defined context associated with the read request will be +returned to the user through the work completion wr_id, work request +identifier, field. +.SH "SEE ALSO" +rdma_cm(7), rdma_connect(3), rdma_accept(3), +ibv_post_send(3), rdma_post_read(3), rdma_reg_read(3), rdma_reg_msgs(3) diff --git a/librdmacm/man/rdma_post_recv.3 b/librdmacm/man/rdma_post_recv.3 new file mode 100644 index 0000000..5e0530d --- /dev/null +++ b/librdmacm/man/rdma_post_recv.3 @@ -0,0 +1,51 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_POST_RECV" 3 "2010-07-19" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_post_recv \- post a work request to receive an incoming message. +.SH SYNOPSIS +.B "#include <rdma/rdma_verbs.h>" +.P +.B "int" rdma_post_recv +.BI "(struct rdma_cm_id *" id "," +.BI "void *" context "," +.BI "void *" addr "," +.BI "size_t " length "," +.BI "struct ibv_mr *" mr ");" +.SH ARGUMENTS +.IP "id" 12 +A reference to a communication identifier where the message buffer +will be posted. +.IP "context" 12 +User-defined context associated with the request. +.IP "addr" 12 +The address of the memory buffer to post. +.IP "length" 12 +The length of the memory buffer. +.IP "mr" 12 +A registered memory region associated with the posted buffer. +.SH "DESCRIPTION" +Posts a work request to the receive queue of the queue pair associated +with the rdma_cm_id. The posted buffer will be queued to receive an incoming +message sent by the remote peer. +.SH "RETURN VALUE" +Returns 0 on success, or -1 on error. If an error occurs, errno will be +set to indicate the failure reason. +.SH "NOTES" +The user is responsible for ensuring that a receive buffer is posted +and large enough to contain all sent data before the peer posts the +corresponding send message. The message buffer must have been registered +before being posted, with the mr parameter referencing the registration. +The buffer must remain registered until the receive completes. +.P +Messages may be posted to an rdma_cm_id only after a queue pair has +been associated with it. A queue pair is bound to an rdma_cm_id after +calling rdma_create_ep or rdma_create_qp, if the rdma_cm_id is allocated +using rdma_create_id. +.P +The user-defined context associated with the receive request will be +returned to the user through the work completion wr_id, work request +identifier, field. 
+.SH "SEE ALSO" +rdma_cm(7), rdma_create_id(3), rdma_create_ep(3), rdma_create_qp(3), +rdma_reg_read(3), ibv_reg_mr(3), ibv_dereg_mr(3), +rdma_post_recvv(3), rdma_post_send(3) diff --git a/librdmacm/man/rdma_post_recvv.3 b/librdmacm/man/rdma_post_recvv.3 new file mode 100644 index 0000000..bd03930 --- /dev/null +++ b/librdmacm/man/rdma_post_recvv.3 @@ -0,0 +1,48 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_POST_RECVV" 3 "2010-07-19" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_post_recvv \- post a work request to receive incoming messages. +.SH SYNOPSIS +.B "#include <rdma/rdma_verbs.h>" +.P +.B "int" rdma_post_recvv +.BI "(struct rdma_cm_id *" id "," +.BI "void *" context "," +.BI "struct ibv_sge *" sgl "," +.BI "int " nsge ");" +.SH ARGUMENTS +.IP "id" 12 +A reference to a communication identifier where the message buffer(s) +will be posted. +.IP "context" 12 +User-defined context associated with the request. +.IP "sgl" 12 +A scatter-gather list of memory buffers posted as a single request. +.IP "nsge" 12 +The number of scatter-gather entries in the sgl array. +.SH "DESCRIPTION" +Posts a single work request to the receive queue of the queue pair associated +with the rdma_cm_id. The posted buffers will be queued to receive an +incoming message sent by the remote peer. +.SH "RETURN VALUE" +Returns 0 on success, or -1 on error. If an error occurs, errno will be +set to indicate the failure reason. +.SH "NOTES" +The user is responsible for ensuring that the receive is posted, +and the total buffer space is large enough to contain all sent data +before the peer posts the corresponding send message. The message buffers +must have been registered before being posted, and the buffers must +remain registered until the receive completes. +.P +Messages may be posted to an rdma_cm_id only after a queue pair has +been associated with it. A queue pair is bound to an rdma_cm_id after +calling rdma_create_ep or rdma_create_qp, if the rdma_cm_id is allocated +using rdma_create_id. +.P +The user-defined context associated with the receive request will be +returned to the user through the work completion wr_id, work request +identifier, field. +.SH "SEE ALSO" +rdma_cm(7), rdma_create_id(3), rdma_create_ep(3), rdma_create_qp(3), +rdma_reg_read(3), ibv_reg_mr(3), ibv_dereg_mr(3), +rdma_post_recv(3), rdma_post_send(3) diff --git a/librdmacm/man/rdma_post_send.3 b/librdmacm/man/rdma_post_send.3 new file mode 100644 index 0000000..5d0e6d6 --- /dev/null +++ b/librdmacm/man/rdma_post_send.3 @@ -0,0 +1,52 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_POST_SEND" 3 "2010-07-19" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_post_send \- post a work request to send a message. +.SH SYNOPSIS +.B "#include <rdma/rdma_verbs.h>" +.P +.B "int" rdma_post_send +.BI "(struct rdma_cm_id *" id "," +.BI "void *" context "," +.BI "void *" addr "," +.BI "size_t " length "," +.BI "struct ibv_mr *" mr "," +.BI "int " flags ");" +.SH ARGUMENTS +.IP "id" 12 +A reference to a communication identifier where the message buffer +will be posted. +.IP "context" 12 +User-defined context associated with the request. +.IP "addr" 12 +The address of the memory buffer to post. +.IP "length" 12 +The length of the memory buffer. +.IP "mr" 12 +Optional registered memory region associated with the posted buffer. +.IP "flags" 12 +Optional flags used to control the send operation. 
+.SH "DESCRIPTION" +Posts a work request to the send queue of the queue pair associated +with the rdma_cm_id. The contents of the posted buffer will be sent +to the remote peer of a connection. +.SH "RETURN VALUE" +Returns 0 on success, or -1 on error. If an error occurs, errno will be +set to indicate the failure reason. +.SH "NOTES" +The user is responsible for ensuring that the remote peer has queued a +receive request before issuing the send operations. For a list of +supported flags, see ibv_post_send. Unless the send request is using +inline data, the message buffer must have been registered +before being posted, with the mr parameter referencing the registration. +The buffer must remain registered until the send completes. +.P +Send operations may not be posted to an rdma_cm_id or the corresponding +queue pair until it has been connected. +.P +The user-defined context associated with the send request will be +returned to the user through the work completion wr_id, work request +identifier, field. +.SH "SEE ALSO" +rdma_cm(7), rdma_connect(3), rdma_accept(3), +ibv_post_send(3), rdma_post_sendv(3), rdma_post_recv(3) diff --git a/librdmacm/man/rdma_post_sendv.3 b/librdmacm/man/rdma_post_sendv.3 new file mode 100644 index 0000000..9b347b1 --- /dev/null +++ b/librdmacm/man/rdma_post_sendv.3 @@ -0,0 +1,49 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_POST_SENDV" 3 "2010-07-19" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_post_sendv \- post a work request to send a message. +.SH SYNOPSIS +.B "#include <rdma/rdma_verbs.h>" +.P +.B "int" rdma_post_sendv +.BI "(struct rdma_cm_id *" id "," +.BI "void *" context "," +.BI "struct ibv_sge *" slg "," +.BI "int " nsge "," +.BI "int " flags ");" +.SH ARGUMENTS +.IP "id" 12 +A reference to a communication identifier where the message buffer +will be posted. +.IP "context" 12 +User-defined context associated with the request. +.IP "slg" 12 +A scatter-gather list of memory buffers posted as a single request. +.IP "nsge" 12 +The number of scatter-gather entries in the slg array. +.IP "flags" 12 +Optional flags used to control the send operation. +.SH "DESCRIPTION" +Posts a work request to the send queue of the queue pair associated +with the rdma_cm_id. The contents of the posted buffers will be sent +to the remote peer of a connection. +.SH "RETURN VALUE" +Returns 0 on success, or -1 on error. If an error occurs, errno will be +set to indicate the failure reason. +.SH "NOTES" +The user is responsible for ensuring that the remote peer has queued a +receive request before issuing the send operations. For a list of +supported flags, see ibv_post_send. Unless the send request is using +inline data, the message buffers must have been registered +before being posted, and the buffers must remain registered +until the send completes. +.P +Send operations may not be posted to an rdma_cm_id or the corresponding +queue pair until it has been connected. +.P +The user-defined context associated with the send request will be +returned to the user through the work completion wr_id, work request +identifier, field. 
+.SH "SEE ALSO" +rdma_cm(7), rdma_connect(3), rdma_accept(3), +ibv_post_send(3), rdma_post_send(3), rdma_post_recv(3) diff --git a/librdmacm/man/rdma_post_ud_send.3 b/librdmacm/man/rdma_post_ud_send.3 new file mode 100644 index 0000000..0fcce79 --- /dev/null +++ b/librdmacm/man/rdma_post_ud_send.3 @@ -0,0 +1,55 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_POST_UD_SEND" 3 "2010-07-19" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_post_ud_send \- post a work request to send a datagram. +.SH SYNOPSIS +.B "#include <rdma/rdma_verbs.h>" +.P +.B "int" rdma_post_ud_send +.BI "(struct rdma_cm_id *" id "," +.BI "void *" context "," +.BI "void *" addr "," +.BI "size_t " length "," +.BI "struct ibv_mr *" mr "," +.BI "int " flags "," +.BI "struct ibv_ah *" ah "," +.BI "uint32_t " remote_qpn ");" +.SH ARGUMENTS +.IP "id" 12 +A reference to a communication identifier where the message buffer +will be posted. +.IP "context" 12 +User-defined context associated with the request. +.IP "addr" 12 +The address of the memory buffer to post. +.IP "length" 12 +The length of the memory buffer. +.IP "mr" 12 +Optional registered memory region associated with the posted buffer. +.IP "flags" 12 +Optional flags used to control the send operation. +.IP "ah" 12 +An address handle describing the address of the remote node. +.IP "remote_qpn" 12 +The number of the destination queue pair. +.SH "DESCRIPTION" +Posts a work request to the send queue of the queue pair associated +with the rdma_cm_id. The contents of the posted buffer will be sent +to the specified destination queue pair. +.SH "RETURN VALUE" +Returns 0 on success, or -1 on error. If an error occurs, errno will be +set to indicate the failure reason. +.SH "NOTES" +The user is responsible for ensuring that the destination queue pair +has queued a receive request before issuing the send operations. For a list of +supported flags, see ibv_post_send. Unless the send request is using +inline data, the message buffer must have been registered +before being posted, with the mr parameter referencing the registration. +The buffer must remain registered until the send completes. +.P +The user-defined context associated with the send request will be +returned to the user through the work completion wr_id, work request +identifier, field. +.SH "SEE ALSO" +rdma_cm(7), rdma_connect(3), rdma_accept(3), rdma_reg_msgs(3) +ibv_post_send(3), rdma_post_recv(3) diff --git a/librdmacm/man/rdma_post_write.3 b/librdmacm/man/rdma_post_write.3 new file mode 100644 index 0000000..62a1e49 --- /dev/null +++ b/librdmacm/man/rdma_post_write.3 @@ -0,0 +1,56 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_POST_WRITE" 3 "2010-07-19" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_post_write \- post an RDMA write work request. +.SH SYNOPSIS +.B "#include <rdma/rdma_verbs.h>" +.P +.B "int" rdma_post_write +.BI "(struct rdma_cm_id *" id "," +.BI "void *" context "," +.BI "void *" addr "," +.BI "size_t " length "," +.BI "struct ibv_mr *" mr "," +.BI "int " flags "," +.BI "uint64_t " remote_addr "," +.BI "uint32_t " rkey ");" +.SH ARGUMENTS +.IP "id" 12 +A reference to a communication identifier where the request +will be posted. +.IP "context" 12 +User-defined context associated with the request. +.IP "addr" 12 +The local address of the source of the write request. +.IP "length" 12 +The length of the write operation. 
+.IP "mr" 12 +Optional memory region associated with the local buffer. +.IP "flags" 12 +Optional flags used to control the write operation. +.IP "remote_addr" 12 +The address of the remote registered memory to write into. +.IP "rkey" 12 +The registered memory key associated with the remote address. +.SH "DESCRIPTION" +Posts a work request to the send queue of the queue pair associated +with the rdma_cm_id. The contents of the local data buffer will be +written into the remote memory region. +.SH "RETURN VALUE" +Returns 0 on success, or -1 on error. If an error occurs, errno will be +set to indicate the failure reason. +.SH "NOTES" +For a list of supported flags, see ibv_post_send. Unless inline +data is specified, the local data buffer must have been registered +before the write is issued, and the buffer must remain registered +until the write completes. The remote buffer must always be registered. +.P +Write operations may not be posted to an rdma_cm_id or the corresponding +queue pair until it has been connected. +.P +The user-defined context associated with the write request will be +returned to the user through the work completion wr_id, work request +identifier, field. +.SH "SEE ALSO" +rdma_cm(7), rdma_connect(3), rdma_accept(3), +ibv_post_send(3), rdma_post_writev(3), rdma_reg_write(3), rdma_reg_msgs(3) diff --git a/librdmacm/man/rdma_post_writev.3 b/librdmacm/man/rdma_post_writev.3 new file mode 100644 index 0000000..22ee0fe --- /dev/null +++ b/librdmacm/man/rdma_post_writev.3 @@ -0,0 +1,53 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_POST_WRITEV" 3 "2010-07-19" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_post_writev \- post an RDMA write work request. +.SH SYNOPSIS +.B "#include <rdma/rdma_verbs.h>" +.P +.B "int" rdma_post_writev +.BI "(struct rdma_cm_id *" id "," +.BI "void *" context "," +.BI "struct ibv_sge *" sgl "," +.BI "int " nsge "," +.BI "int " flags "," +.BI "uint64_t " remote_addr "," +.BI "uint32_t " rkey ");" +.SH ARGUMENTS +.IP "id" 12 +A reference to a communication identifier where the request +will be posted. +.IP "context" 12 +User-defined context associated with the request. +.IP "sgl" 12 +A scatter-gather list of the source buffers of the write. +.IP "nsge" 12 +The number of scatter-gather array entries. +.IP "flags" 12 +Optional flags used to control the write operation. +.IP "remote_addr" 12 +The address of the remote registered memory to write into. +.IP "rkey" 12 +The registered memory key associated with the remote address. +.SH "DESCRIPTION" +Posts a work request to the send queue of the queue pair associated +with the rdma_cm_id. The contents of the local data buffers will be +written into the remote memory region. +.SH "RETURN VALUE" +Returns 0 on success, or -1 on error. If an error occurs, errno will be +set to indicate the failure reason. +.SH "NOTES" +For a list of supported flags, see ibv_post_send. Unless inline +data is specified, the local data buffers must have been registered +before the write is issued, and the buffers must remain registered +until the write completes. The remote buffers must always be registered. +.P +Write operations may not be posted to an rdma_cm_id or the corresponding +queue pair until it has been connected. +.P +The user-defined context associated with the write request will be +returned to the user through the work completion wr_id, work request +identifier, field. 
+.SH "SEE ALSO" +rdma_cm(7), rdma_connect(3), rdma_accept(3), +ibv_post_send(3), rdma_post_write(3), rdma_reg_write(3), rdma_reg_msgs(3) diff --git a/librdmacm/man/rdma_reg_msgs.3 b/librdmacm/man/rdma_reg_msgs.3 new file mode 100644 index 0000000..48c75b3 --- /dev/null +++ b/librdmacm/man/rdma_reg_msgs.3 @@ -0,0 +1,43 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_REG_MSGS" 3 "2010-07-19" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_reg_msgs \- register data buffer(s) for sending or receiving messages. +.SH SYNOPSIS +.B "#include <rdma/rdma_verbs.h>" +.P +.B "struct ibv_mr *" rdma_reg_msgs +.BI "(struct rdma_cm_id *" id "," +.BI "void *" addr "," +.BI "size_t " length ");" +.SH ARGUMENTS +.IP "id" 12 +A reference to a communication identifier where the message buffer(s) +will be used. +.IP "addr" 12 +The address of the memory buffer(s) to register. +.IP "length" 12 +The total length of the memory to register. +.SH "DESCRIPTION" +Registers an array of memory buffers used for sending and receiving +messages or for RDMA operations. Memory buffers registered using +rdma_reg_msgs may be posted to an rdma_cm_id using +rdma_post_send or rdma_post_recv, or specified as the target of an RDMA +read operation or the source of an RDMA write request. +.SH "RETURN VALUE" +Returns a reference to the registered memory region on success, or NULL on +error. If an error occurs, errno will be set to indicate the failure reason. +.SH "NOTES" +rdma_reg_msgs is used to register an array of data buffers +that will be used send and/or receive messages on a queue pair associated with +an rdma_cm_id. The memory buffer is registered with the proteection +domain associated with the idenfier. The start of the data buffer array +is specified through the addr parameter, and the total size of the array +is given by length. +.P +All data buffers should be registered before being posted as a work request. +Users must deregister all registered memory by calling rdma_dereg_mr. +.SH "SEE ALSO" +rdma_cm(7), rdma_create_id(3), rdma_create_ep(3), +rdma_reg_read(3), rdma_reg_write(3), +ibv_reg_mr(3), ibv_dereg_mr(3), rdma_post_send(3), rdma_post_recv(3), +rdma_post_read(3), rdma_post_readv(3), rdma_post_write(3), rdma_post_writev(3) diff --git a/librdmacm/man/rdma_reg_read.3 b/librdmacm/man/rdma_reg_read.3 new file mode 100644 index 0000000..07813e8 --- /dev/null +++ b/librdmacm/man/rdma_reg_read.3 @@ -0,0 +1,42 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_REG_READ" 3 "2010-07-19" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_reg_read \- register data buffer(s) for remote RDMA read access. +.SH SYNOPSIS +.B "#include <rdma/rdma_verbs.h>" +.P +.B "struct ibv_mr *" rdma_reg_read +.BI "(struct rdma_cm_id *" id "," +.BI "void *" addr "," +.BI "size_t " length ");" +.SH ARGUMENTS +.IP "id" 12 +A reference to a communication identifier where the message buffer(s) +will be used. +.IP "addr" 12 +The address of the memory buffer(s) to register. +.IP "length" 12 +The total length of the memory to register. +.SH "DESCRIPTION" +Registers a memory buffer that will be accessed by a remote RDMA read +operation. Memory buffers registered using rdma_reg_read may be +targeted in an RDMA read request, allowing the buffer to be +specified on the remote side of an RDMA connection as the remote_addr +of rdma_post_read, or similar call. 
+.SH "RETURN VALUE" +Returns a reference to the registered memory region on success, or NULL on +error. If an error occurs, errno will be set to indicate the failure reason. +.SH "NOTES" +rdma_reg_read is used to register a data buffer that will be the +target of an RDMA read operation on a queue pair associated with +an rdma_cm_id. The memory buffer is registered with the proteection +domain associated with the idenfier. The start of the data buffer +is specified through the addr parameter, and the total size of the buffer +is given by length. +.P +All data buffers should be registered before being posted as a work request. +Users must deregister all registered memory by calling rdma_dereg_mr. +.SH "SEE ALSO" +rdma_cm(7), rdma_create_id(3), rdma_create_ep(3), +rdma_reg_msgs(3), rdma_reg_write(3), +ibv_reg_mr(3), ibv_dereg_mr(3), rdma_post_read(3) diff --git a/librdmacm/man/rdma_reg_write.3 b/librdmacm/man/rdma_reg_write.3 new file mode 100644 index 0000000..2be3cc0 --- /dev/null +++ b/librdmacm/man/rdma_reg_write.3 @@ -0,0 +1,42 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_REG_WRITE" 3 "2010-07-19" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_reg_write \- register data buffer(s) for remote RDMA write access. +.SH SYNOPSIS +.B "#include <rdma/rdma_verbs.h>" +.P +.B "struct ibv_mr *" rdma_reg_write +.BI "(struct rdma_cm_id *" id "," +.BI "void *" addr "," +.BI "size_t " length ");" +.SH ARGUMENTS +.IP "id" 12 +A reference to a communication identifier where the message buffer(s) +will be used. +.IP "addr" 12 +The address of the memory buffer(s) to register. +.IP "length" 12 +The total length of the memory to register. +.SH "DESCRIPTION" +Registers a memory buffer that will be accessed by a remote RDMA write +operation. Memory buffers registered using rdma_reg_write may be +targeted in an RDMA write request, allowing the buffer to be +specified on the remote side of an RDMA connection as the remote_addr +of rdma_post_write, or similar call. +.SH "RETURN VALUE" +Returns a reference to the registered memory region on success, or NULL on +error. If an error occurs, errno will be set to indicate the failure reason. +.SH "NOTES" +rdma_reg_write is used to register a data buffer that will be the +target of an RDMA write operation on a queue pair associated with +an rdma_cm_id. The memory buffer is registered with the proteection +domain associated with the idenfier. The start of the data buffer +is specified through the addr parameter, and the total size of the buffer +is given by length. +.P +All data buffers should be registered before being posted as a work request. +Users must deregister all registered memory by calling rdma_dereg_mr. +.SH "SEE ALSO" +rdma_cm(7), rdma_create_id(3), rdma_create_ep(3), +rdma_reg_msgs(3), rdma_reg_read(3), +ibv_reg_mr(3), ibv_dereg_mr(3), rdma_post_write(3) diff --git a/librdmacm/man/rdma_reject.3 b/librdmacm/man/rdma_reject.3 new file mode 100644 index 0000000..53f038f --- /dev/null +++ b/librdmacm/man/rdma_reject.3 @@ -0,0 +1,33 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_REJECT" 3 "2007-05-15" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_reject \- Called to reject a connection request. 
+.SH SYNOPSIS +.B "#include <rdma/rdma_cma.h>" +.P +.B "int" rdma_reject +.BI "(struct rdma_cm_id *" id "," +.BI "const void *" private_data "," +.BI "uint8_t " private_data_len ");" +.SH ARGUMENTS +.IP "id" 12 +Connection identifier associated with the request. +.IP "private_data" 12 +Optional private data to send with the reject message. +.IP "private_data_len" 12 +Specifies the size of the user-controlled data buffer. Note that the actual +amount of data transferred to the remote side is transport dependent and may +be larger than that requested. +.SH "DESCRIPTION" +Called from the listening side to reject a connection or datagram +service lookup request. +.SH "RETURN VALUE" +Returns 0 on success, or -1 on error. If an error occurs, errno will be +set to indicate the failure reason. +.SH "NOTES" +After receiving a connection request event, a user may call rdma_reject +to reject the request. If the underlying RDMA transport supports +private data in the reject message, the specified data will be passed to +the remote side. +.SH "SEE ALSO" +rdma_listen(3), rdma_accept(3), rdma_get_cm_event(3) diff --git a/librdmacm/man/rdma_resolve_addr.3 b/librdmacm/man/rdma_resolve_addr.3 new file mode 100644 index 0000000..ce6dcd6 --- /dev/null +++ b/librdmacm/man/rdma_resolve_addr.3 @@ -0,0 +1,47 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_RESOLVE_ADDR" 3 "2007-10-31" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_resolve_addr \- Resolve destination and optional source addresses. +.SH SYNOPSIS +.B "#include <rdma/rdma_cma.h>" +.P +.B "int" rdma_resolve_addr +.BI "(struct rdma_cm_id *" id "," +.BI "struct sockaddr *" src_addr "," +.BI "struct sockaddr *" dst_addr "," +.BI "int " timeout_ms ");" +.SH ARGUMENTS +.IP "id" 12 +RDMA identifier. +.IP "src_addr" 12 +Source address information. This parameter may be NULL. +.IP "dst_addr" 12 +Destination address information. +.IP "timeout_ms" 12 +Time to wait for resolution to complete. +.SH "DESCRIPTION" +Resolve destination and optional source addresses from IP addresses +to an RDMA address. If successful, the specified rdma_cm_id will +be bound to a local device. +.SH "RETURN VALUE" +Returns 0 on success, or -1 on error. If an error occurs, errno will be +set to indicate the failure reason. +.SH "NOTES" +This call is used to map a given destination IP address to a usable RDMA +address. The IP to RDMA address mapping is done using the local routing +tables, or via ARP. +If a source address is given, the rdma_cm_id is bound to that +address, the same as if rdma_bind_addr were called. If no source +address is given, and the rdma_cm_id has not yet been bound to a device, +then the rdma_cm_id will be bound to a source address based on the +local routing tables. After this call, the rdma_cm_id will be bound to +an RDMA device. This call is typically made from the active side of a +connection before calling rdma_resolve_route and rdma_connect. +.SH "INFINIBAND SPECIFIC" +This call maps the destination and, if given, source IP addresses to GIDs. +In order to perform the mapping, IPoIB must be running on both the local +and remote nodes. 
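+.SH "EXAMPLE"
+A minimal client-side sketch. The host name and the 2000 millisecond
+timeout are placeholders, a NULL event channel makes the rdma_cm_id
+operate synchronously, and error handling is omitted.
+.nf
+struct rdma_cm_id *id;
+struct addrinfo *res;
+
+/* Map the destination IP address to an RDMA address and bind
+ * 'id' to a local RDMA device. */
+if (getaddrinfo("server.example.com", NULL, NULL, &res))
+	exit(1);
+if (rdma_create_id(NULL, &id, NULL, RDMA_PS_TCP))
+	exit(1);
+if (rdma_resolve_addr(id, NULL, res->ai_addr, 2000))
+	exit(1);
+freeaddrinfo(res);
+/* Continue with rdma_resolve_route() and rdma_connect(). */
+.fi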
+.SH "SEE ALSO" +rdma_create_id(3), rdma_resolve_route(3), rdma_connect(3), rdma_create_qp(3), +rdma_get_cm_event(3), rdma_bind_addr(3), rdma_get_src_port(3), +rdma_get_dst_port(3), rdma_get_local_addr(3), rdma_get_peer_addr(3) diff --git a/librdmacm/man/rdma_resolve_route.3 b/librdmacm/man/rdma_resolve_route.3 new file mode 100644 index 0000000..114f666 --- /dev/null +++ b/librdmacm/man/rdma_resolve_route.3 @@ -0,0 +1,29 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_RESOLVE_ROUTE" 3 "2007-10-31" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_resolve_route \- Resolve the route information needed to establish a connection. +.SH SYNOPSIS +.B "#include <rdma/rdma_cma.h>" +.P +.B "int" rdma_resolve_route +.BI "(struct rdma_cm_id *" id "," +.BI "int " timeout_ms ");" +.SH ARGUMENTS +.IP "id" 12 +RDMA identifier. +.IP "timeout_ms" 12 +Time to wait for resolution to complete. +.SH "DESCRIPTION" +Resolves an RDMA route to the destination address in order to establish +a connection. The destination address must have already been resolved +by calling rdma_resolve_addr. +.SH "RETURN VALUE" +Returns 0 on success, or -1 on error. If an error occurs, errno will be +set to indicate the failure reason. +.SH "NOTES" +This is called on the client side of a connection after calling +rdma_resolve_addr, but before calling rdma_connect. +.SH "INFINIBAND SPECIFIC" +This call obtains a path record that is used by the connection. +.SH "SEE ALSO" +rdma_resolve_addr(3), rdma_connect(3), rdma_get_cm_event(3) diff --git a/librdmacm/man/rdma_server.1 b/librdmacm/man/rdma_server.1 new file mode 100644 index 0000000..ada2564 --- /dev/null +++ b/librdmacm/man/rdma_server.1 @@ -0,0 +1,27 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_SERVER" 1 "2010-07-19" "librdmacm" "librdmacm" librdmacm +.SH NAME +rdma_server \- simple RDMA CM connection and ping-pong test. +.SH SYNOPSIS +.sp +.nf +\fIrdma_server\fR [-p port] +.fi +.SH "DESCRIPTION" +Uses synchronous librdmam calls to establish an RDMA connections between +two nodes. This example is intended to provide a very simple coding +example of how to use RDMA. +.SH "OPTIONS" +.TP +\-p port +Changes the port number that the server listens on. By default the server +listens on port 7471. +.SH "NOTES" +Basic usage is to start rdma_server, then connect to the server using the +rdma_client program. +.P +Because this test maps RDMA resources to userspace, users must ensure +that they have available system resources and permissions. See the +libibverbs README file for additional details. +.SH "SEE ALSO" +rdma_cm(7), udaddy(1), mckey(1), rping(1), rdma_client(1) diff --git a/librdmacm/man/rdma_set_option.3 b/librdmacm/man/rdma_set_option.3 new file mode 100644 index 0000000..c6660c6 --- /dev/null +++ b/librdmacm/man/rdma_set_option.3 @@ -0,0 +1,50 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_SET_OPTION" 3 "2007-08-06" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.SH NAME +rdma_set_option \- Set communication options for an rdma_cm_id. +.SH SYNOPSIS +.B "#include <rdma/rdma_cma.h>" +.P +.B "int" rdma_set_option +.BI "(struct rdma_cm_id *" id "," +.BI "int " level "," +.BI "int " optname "," +.BI "void *" optval "," +.BI "size_t " optlen ");" +.SH ARGUMENTS +.IP "id" 12 +RDMA identifier. +.IP "level" 12 +Protocol level of the option to set. +.IP "optname" 12 +Name of the option, relative to the level, to set. 
+.IP "optval" 12 +Reference to the option data. The data is dependent on the level and optname. +.IP "optlen" 12 +The size of the %optval buffer. +.SH "DESCRIPTION" +Sets communication options for an rdma_cm_id. This call is used to override +the default system settings. +.IP "optname can be one of" 12 +.IP "RDMA_OPTION_ID_TOS" 12 +Specify the quality of service provided by a connection. +The expected optlen is size of uint8_t. +.IP "RDMA_OPTION_ID_REUSEADDR" 12 +Bound the rdma_cm_id to a reuseable address. This will allow other users to bind to that same address. +The expected optlen is size of int. +.IP "RDMA_OPTION_ID_AFONLY" 12 +Set IPV6_V6ONLY socket. +The expected optlen is size of int. +.IP "RDMA_OPTION_IB_PATH" 12 +Set IB path record data. +The expected optlen is size of struct ibv_path_data[]. +.IP "RDMA_OPTION_ID_ACK_TIMEOUT" 12 +Set QP ACK timeout. +The value calculated according to the formula 4.096 * 2^(ack_timeout) usec. +.SH "RETURN VALUE" +Returns 0 on success, or -1 on error. If an error occurs, errno will be +set to indicate the failure reason. +.SH "NOTES" +Option details may be found in the relevant header files. +.SH "SEE ALSO" +rdma_create_id(3) diff --git a/librdmacm/man/rdma_xclient.1 b/librdmacm/man/rdma_xclient.1 new file mode 100644 index 0000000..d874c6e --- /dev/null +++ b/librdmacm/man/rdma_xclient.1 @@ -0,0 +1,37 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_XCLIENT" 1 "2011-06-15" "librdmacm" "librdmacm" librdmacm +.SH NAME +rdma_xclient \- RDMA CM communication client test program +.SH SYNOPSIS +.sp +.nf +\fIrdma_xclient\fR [-s server_address] [-p server_port] [-c comm_type] +.fi +.SH "DESCRIPTION" +Uses synchronous librdmam calls to establish an RDMA connection between +two nodes. This example is intended to provide a very simple coding +example of how to use RDMA. +.SH "OPTIONS" +.TP +\-s server_address +Specifies the address of the system that the rdma_server is running on. +By default, the client will attempt to connect to the server using +127.0.0.1. +.TP +\-p server_port +Specifies the port number that the server listens on. By default the server +listens on port 7471. +.TP +\-c communication type +Specifies the type of communication established with the server program. 'r' +results in using a reliable-connected QP (the default). 'x' uses +extended reliable-connected XRC QPs. +.SH "NOTES" +Basic usage is to start rdma_xserver, then connect to the server using the +rdma_client program. +.P +Because this test maps RDMA resources to userspace, users must ensure +that they have available system resources and permissions. See the +libibverbs README file for additional details. +.SH "SEE ALSO" +rdma_cm(7), udaddy(1), mckey(1), rping(1), rdma_xserver(1), rdma_client(1) diff --git a/librdmacm/man/rdma_xserver.1 b/librdmacm/man/rdma_xserver.1 new file mode 100644 index 0000000..4c12e25 --- /dev/null +++ b/librdmacm/man/rdma_xserver.1 @@ -0,0 +1,31 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH "RDMA_XSERVER" 1 "2011-06-15" "librdmacm" "librdmacm" librdmacm +.SH NAME +rdma_xserver \- RDMA CM communication server test program +.SH SYNOPSIS +.sp +.nf +\fIrdma_xserver\fR [-p port] [-c comm_type] +.fi +.SH "DESCRIPTION" +Uses the librdmacm to establish various forms of communication and exchange +data. +.SH "OPTIONS" +.TP +\-p port +Changes the port number that the server listens on. By default the server +listens on port 7471. 
+.TP
+\-c communication type
+Specifies the type of communication established with the client program. 'r'
+results in using a reliable-connected QP (the default). 'x' uses
+extended reliable-connected XRC QPs.
+.SH "NOTES"
+Basic usage is to start rdma_xserver, then connect to the server using the
+rdma_xclient program.
+.P
+Because this test maps RDMA resources to userspace, users must ensure
+that they have available system resources and permissions. See the
+libibverbs README file for additional details.
+.SH "SEE ALSO"
+rdma_cm(7), udaddy(1), mckey(1), rping(1), rdma_server(1), rdma_xclient(1)
diff --git a/librdmacm/man/riostream.1 b/librdmacm/man/riostream.1
new file mode 100644
index 0000000..537302b
--- /dev/null
+++ b/librdmacm/man/riostream.1
@@ -0,0 +1,65 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH "RIOSTREAM" 1 "2012-10-24" "librdmacm" "librdmacm" librdmacm
+.SH NAME
+riostream \- zero-copy streaming over RDMA ping-pong test.
+.SH SYNOPSIS
+.sp
+.nf
+\fIriostream\fR [-s server_address] [-b bind_address] [-B buffer_size]
+	[-I iterations] [-C transfer_count]
+	[-S transfer_size] [-p server_port] [-T test_option]
+.fi
+.SH "DESCRIPTION"
+Uses the streaming over RDMA protocol (rsocket) to connect and exchange
+data between a client and server application.
+.SH "OPTIONS"
+.TP
+\-s server_address
+The network name or IP address of the server system listening for
+connections. The name or address used must route over an RDMA device.
+This option must be specified by the client.
+.TP
+\-b bind_address
+The local network address to bind to.
+.TP
+\-B buffer_size
+Indicates the size of the send and receive network buffers.
+.TP
+\-I iterations
+The number of times that the specified number of messages will be
+exchanged between the client and server. (default 1000)
+.TP
+\-C transfer_count
+The number of messages to transfer from the client to the server and
+back again on each iteration. (default 1)
+.TP
+\-S transfer_size
+The size of each send transfer, in bytes. (default 1000) If 'all'
+is specified, riostream will run a series of tests of various sizes.
+.TP
+\-p server_port
+The server's port number.
+.TP
+\-T test_option
+Specifies test parameters. Available options are:
+.P
+a | async - uses asynchronous operation (e.g. select / poll)
+.P
+b | blocking - uses blocking calls
+.P
+n | nonblocking - uses non-blocking calls
+.P
+v | verify - verifies data transfers
+.SH "NOTES"
+Basic usage is to start riostream on a server system, then run
+riostream -s server_name on a client system. By default, riostream
+will run a series of latency and bandwidth performance tests.
+Specifying a different iterations, transfer_count, or transfer_size
+value will run a user-customized test using default values where none
+have been specified.
+.P
+Because this test maps RDMA resources to userspace, users must ensure
+that they have available system resources and permissions. See the
+libibverbs README file for additional details.
+.SH "SEE ALSO"
+rdma_cm(7), rstream(1)
diff --git a/librdmacm/man/rping.1 b/librdmacm/man/rping.1
new file mode 100644
index 0000000..7ec530e
--- /dev/null
+++ b/librdmacm/man/rping.1
@@ -0,0 +1,63 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH "RPING" 1 "2007-05-15" "librdmacm" "librdmacm" librdmacm
+.SH NAME
+rping \- RDMA CM connection and RDMA ping-pong test.
+.SH SYNOPSIS
+.sp
+.nf
+\fIrping\fR -s [-v] [-V] [-d] [-P] [-a address] [-p port]
+	[-C message_count] [-S message_size]
+\fIrping\fR -c [-v] [-V] [-d] [-I address] -a address [-p port]
+	[-C message_count] [-S message_size]
+.fi
+.SH "DESCRIPTION"
+Establishes a reliable RDMA connection between two nodes using the
+librdmacm, optionally performs RDMA transfers between the nodes,
+then disconnects.
+.SH "OPTIONS"
+.TP
+\-s
+Run as the server.
+.TP
+\-c
+Run as the client.
+.TP
+\-a address
+On the server, specifies the network address to bind the connection to.
+To bind to any address with IPv6 use -a ::0 .
+On the client, specifies the server address to connect to.
+.TP
+\-I address
+The source IP address to bind to. This is useful
+if you have multiple addresses on the same network or complex routing.
+.TP
+\-p port
+Port number for the listening server.
+.TP
+\-v
+Display ping data.
+.TP
+\-V
+Validate ping data.
+.TP
+\-d
+Display debug information.
+.TP
+\-C message_count
+The number of messages to transfer over each connection. (default infinite)
+.TP
+\-S message_size
+The size of each message transferred, in bytes. (default 100)
+.TP
+\-P
+Run the server in persistent mode. This allows multiple rping clients
+to connect to a single server instance. The server will run until killed.
+.TP
+\-q
+Control QP creation and modification directly from the application,
+instead of through the rdma_cm.
+.SH "NOTES"
+Because this test maps RDMA resources to userspace, users must ensure
+that they have available system resources and permissions. See the
+libibverbs README file for additional details.
+.SH "SEE ALSO"
+rdma_cm(7), ucmatose(1), udaddy(1), mckey(1)
diff --git a/librdmacm/man/rsocket.7.in b/librdmacm/man/rsocket.7.in
new file mode 100644
index 0000000..7dc479e
--- /dev/null
+++ b/librdmacm/man/rsocket.7.in
@@ -0,0 +1,163 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH "RSOCKET" 7 "2019-04-16" "librdmacm" "Librdmacm Programmer's Manual" librdmacm
+.SH NAME
+rsocket \- RDMA socket API
+.SH SYNOPSIS
+.B "#include <rdma/rsocket.h>"
+.SH "DESCRIPTION"
+RDMA socket API and protocol
+.SH "NOTES"
+Rsockets is a protocol over RDMA that supports a socket-level API
+for applications. Rsocket APIs are intended to match the behavior
+of corresponding socket calls, except where noted. Rsocket
+functions match the name and function signature of socket calls,
+with the exception that all function calls are prefixed with an 'r'.
+.P
+The following functions are defined:
+.P
+rsocket
+.P
+rbind, rlisten, raccept, rconnect
+.P
+rshutdown, rclose
+.P
+rrecv, rrecvfrom, rrecvmsg, rread, rreadv
+.P
+rsend, rsendto, rsendmsg, rwrite, rwritev
+.P
+rpoll, rselect
+.P
+rgetpeername, rgetsockname
+.P
+rsetsockopt, rgetsockopt, rfcntl
+.P
+Functions take the same parameters as those used for sockets. The
+following capabilities and flags are supported at this time:
+.P
+PF_INET, PF_INET6, SOCK_STREAM, SOCK_DGRAM
+.P
+SOL_SOCKET - SO_ERROR, SO_KEEPALIVE (flag supported, but ignored),
+SO_LINGER, SO_OOBINLINE, SO_RCVBUF, SO_REUSEADDR, SO_SNDBUF
+.P
+IPPROTO_TCP - TCP_NODELAY, TCP_MAXSEG
+.P
+IPPROTO_IPV6 - IPV6_V6ONLY
+.P
+MSG_DONTWAIT, MSG_PEEK, O_NONBLOCK
+.P
+Rsockets provides extensions beyond normal socket routines that
+allow for direct placement of data into an application's buffer.
+This is also known as zero-copy support, since data is sent and
+received directly, bypassing copies into network-controlled buffers.
+The following calls and options support direct data placement:
+riomap, riounmap, and riowrite.
+.TP
+off_t riomap(int socket, void *buf, size_t len, int prot, int flags, off_t offset)
+Riomap registers an application buffer with the RDMA hardware
+associated with an rsocket. The buffer is registered either for
+local only access (PROT_NONE) or for remote write access (PROT_WRITE).
+When registered for remote access, the buffer is mapped to a given
+offset. The offset is either provided by the user, or if the user
+selects -1 for the offset, rsockets selects one. The remote peer may
+access an iomapped buffer directly by specifying the correct offset.
+The mapping is not guaranteed to be available until after the remote
+peer receives a data transfer initiated after riomap has completed.
+.IP
+In order to enable the use of remote IO mapping calls on an rsocket,
+an application must set the number of IO mappings that are available
+to the remote peer. This may be done using the rsetsockopt
+RDMA_IOMAPSIZE option. By default, an rsocket does not support
+remote IO mappings.
+.TP
+int riounmap(int socket, void *buf, size_t len)
+Riounmap removes the mapping between a buffer and an rsocket.
+.TP
+size_t riowrite(int socket, const void *buf, size_t count, off_t offset, int flags)
+Riowrite allows an application to transfer data over an rsocket
+directly into a remotely iomapped buffer. The remote buffer is specified
+through an offset parameter, which corresponds to a remotely iomapped buffer.
+From the sender's perspective, riowrite behaves similarly to rwrite. From
+a receiver's view, riowrite transfers are silently redirected into a
+predetermined data buffer. Data is received automatically, and the receiver
+is not informed of the transfer. However, riowrite data is still considered
+part of the data stream, such that riowrite data will be written before a
+subsequent transfer is received. A message sent immediately after initiating
+a riowrite may be used to notify the receiver of the riowrite.
+.P
+In addition to standard socket options, rsockets supports options
+specific to RDMA devices and protocols. These options are accessible
+through rsetsockopt using the SOL_RDMA option level.
+.TP
+RDMA_SQSIZE - Integer size of the underlying send queue.
+.TP
+RDMA_RQSIZE - Integer size of the underlying receive queue.
+.TP
+RDMA_INLINE - Integer size of inline data.
+.TP
+RDMA_IOMAPSIZE - Integer number of remote IO mappings supported.
+.TP
+RDMA_ROUTE - struct ibv_path_data of the path record for the connection.
+.P
+Note that rsocket fds cannot be passed into non-rsocket calls. For
+applications which must mix rsocket fds with standard socket fds or
+opened files, rpoll and rselect support polling both rsockets and
+normal fds.
+.P
+Existing applications can make use of rsockets through the use of a
+preload library. Because rsockets implements an end-to-end protocol,
+both sides of a connection must use rsockets. The rdma_cm library
+provides such a preload library, librspreload. To reduce the chance
+of the preload library intercepting calls without the user's explicit
+knowledge, the librspreload library is installed into the
+%libdir%/rsocket subdirectory.
+.P
+The preload library can be used by setting LD_PRELOAD when running.
+Note that not all applications will work with rsockets. Support is
+limited based on the socket options used by the application.
+Support for fork() is limited, but available.
+To use rsockets with the preload library for applications that call
+fork, users must set the environment variable RDMAV_FORK_SAFE=1 on
+both the client and server side of the connection. In general, fork
+is supported for server applications that accept a connection and
+then fork off a process to handle the new connection.
+.P
+rsockets uses configuration files that give an administrator control
+over the default settings used by rsockets. Use files under
+@CMAKE_INSTALL_FULL_SYSCONFDIR@/rdma/rsocket as shown:
+.P
+mem_default - default size of receive buffer(s)
+.P
+wmem_default - default size of send buffer(s)
+.P
+sqsize_default - default size of send queue
+.P
+rqsize_default - default size of receive queue
+.P
+inline_default - default size of inline data
+.P
+iomap_size - default size of remote iomapping table
+.P
+polling_time - default number of microseconds to poll for data before waiting
+.P
+wake_up_interval - maximum number of milliseconds to block in poll.
+This value is used to safeguard against potential application hangs
+in rpoll().
+.P
+All configuration files should contain a single integer value. Values may
+be set by issuing a command similar to the following example.
+.P
+echo 1000000 > @CMAKE_INSTALL_FULL_SYSCONFDIR@/rdma/rsocket/mem_default
+.P
+If configuration files are not available, rsockets uses internal defaults.
+Applications can override default values programmatically through the
+rsetsockopt routine.
+.SH "SEE ALSO"
+rdma_cm(7)
diff --git a/librdmacm/man/rstream.1 b/librdmacm/man/rstream.1
new file mode 100644
index 0000000..3fc3b17
--- /dev/null
+++ b/librdmacm/man/rstream.1
@@ -0,0 +1,74 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH "RSTREAM" 1 "2011-11-16" "librdmacm" "librdmacm" librdmacm
+.SH NAME
+rstream \- streaming over RDMA ping-pong test.
+.SH SYNOPSIS
+.sp
+.nf
+\fIrstream\fR [-s server_address] [-b bind_address] [-f address_format]
+	[-B buffer_size] [-I iterations] [-C transfer_count]
+	[-S transfer_size] [-p server_port] [-T test_option]
+.fi
+.SH "DESCRIPTION"
+Uses the streaming over RDMA protocol (rsocket) to connect and exchange
+data between a client and server application.
+.SH "OPTIONS"
+.TP
+\-s server_address
+The network name or IP address of the server system listening for
+connections. The name or address used must route over an RDMA device.
+This option must be specified by the client.
+.TP
+\-b bind_address
+The local network address to bind to.
+.TP
+\-f address_format
+Supported address formats are ip, ipv6, gid, or name.
+.TP
+\-B buffer_size
+Indicates the size of the send and receive network buffers.
+.TP
+\-I iterations
+The number of times that the specified number of messages will be
+exchanged between the client and server. (default 1000)
+.TP
+\-C transfer_count
+The number of messages to transfer from the client to the server and
+back again on each iteration. (default 1000)
+.TP
+\-S transfer_size
+The size of each send transfer, in bytes. (default 1000) If 'all'
+is specified, rstream will run a series of tests of various sizes.
+.TP
+\-p server_port
+The server's port number.
+.TP
+\-T test_option
+Specifies test parameters. Available options are:
+.P
+s | socket - uses standard socket calls to transfer data
+.P
+a | async - uses asynchronous operation (e.g. select / poll)
+.P
+b | blocking - uses blocking calls
+.P
+f | fork - fork server processing (forces -T s option)
+.P
+n | nonblocking - uses non-blocking calls
+.P
+r | resolve - use rdma cm to resolve address
+.P
+v | verify - verifies data transfers
+.SH "NOTES"
+Basic usage is to start rstream on a server system, then run
+rstream -s server_name on a client system. By default, rstream
+will run a series of latency and bandwidth performance tests.
+Specifying a different iterations, transfer_count, or transfer_size
+value will run a user-customized test using default values where none
+have been specified.
+.P
+Because this test maps RDMA resources to userspace, users must ensure
+that they have available system resources and permissions. See the
+libibverbs README file for additional details.
+.SH "SEE ALSO"
+rdma_cm(7)
diff --git a/librdmacm/man/ucmatose.1 b/librdmacm/man/ucmatose.1
new file mode 100644
index 0000000..d672e9d
--- /dev/null
+++ b/librdmacm/man/ucmatose.1
@@ -0,0 +1,70 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH "UCMATOSE" 1 "2007-05-15" "librdmacm" "librdmacm" librdmacm
+.SH NAME
+ucmatose \- RDMA CM connection and simple ping-pong test.
+.SH SYNOPSIS
+.sp
+.nf
+\fIucmatose\fR [-s server_address] [-b bind_address]
+	[-f address_format] [-P port_space]
+	[-c connections] [-C message_count] [-S message_size] [-a ack_timeout]
+\fIucmatose\fR -s server_address [-b bind_address]
+	[-f address_format] [-P port_space]
+	[-c connections] [-C message_count] [-S message_size] [-t tos] [-a ack_timeout]
+.fi
+.SH "DESCRIPTION"
+Establishes a set of reliable RDMA connections between two nodes using the
+librdmacm, optionally transfers data between the nodes, then disconnects.
+.SH "OPTIONS"
+.TP
+\-s server_address
+The network name or IP address of the server system listening for
+connections. The name or address used must route over an RDMA device.
+This option must be specified by the client.
+.TP
+\-b bind_address
+The local network address to bind to.
+To bind to any address with IPv6 use -b ::0 .
+.TP
+\-f address_format
+Specifies the format of the server and bind address. By default, the
+format is determined by getaddrinfo() as either being a hostname, an IPv4
+address, or an IPv6 address. This option may be used to indicate that
+a specific address format has been provided. Supported address_format
+values are: name, ip, ipv6, and gid.
+.TP
+\-P port_space
+Specifies the port space for the connection. By default, the port space
+is the RDMA TCP port space. (Note that the RDMA port space may be separate
+from that used for IP.) Supported port_space values are: tcp and ib.
+.TP
+\-c connections
+The number of connections to establish between the client and server.
+(default 1)
+.TP
+\-C message_count
+The number of messages to transfer over each connection. (default 10)
+.TP
+\-S message_size
+The size of each message transferred, in bytes. (default 100)
+.TP
+\-t tos
+Indicates the type of service used for the communication. Type of service
+is implementation dependent based on subnet configuration.
+.TP
+\-a ack_timeout
+Indicates the QP ACK timeout value that should be used.
+The value is calculated according to the formula 4.096 * 2^(ack_timeout) usec.
+.TP
+\-m
+Tests event channel migration. Migrates all communication identifiers to
+a different event channel for disconnect events.
+.SH "NOTES"
+Basic usage is to start ucmatose on a server system, then run
+ucmatose -s server_name on a client system.
+.P
+Because this test maps RDMA resources to userspace, users must ensure
+that they have available system resources and permissions. See the
+libibverbs README file for additional details.
+.SH "SEE ALSO"
+rdma_cm(7), udaddy(1), mckey(1), rping(1)
diff --git a/librdmacm/man/udaddy.1 b/librdmacm/man/udaddy.1
new file mode 100644
index 0000000..bc84504
--- /dev/null
+++ b/librdmacm/man/udaddy.1
@@ -0,0 +1,56 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH "UDADDY" 1 "2007-05-15" "librdmacm" "librdmacm" librdmacm
+.SH NAME
+udaddy \- RDMA CM datagram setup and simple ping-pong test.
+.SH SYNOPSIS
+.sp
+.nf
+\fIudaddy\fR [-s server_address] [-b bind_address] [-c connections]
+	[-C message_count] [-S message_size] [-p port_space]
+\fIudaddy\fR -s server_address [-b bind_address] [-c connections]
+	[-C message_count] [-S message_size] [-t tos] [-p port_space]
+.fi
+.SH "DESCRIPTION"
+Establishes a set of unreliable RDMA datagram communication paths between two
+nodes using the librdmacm, optionally transfers datagrams between the nodes,
+then tears down the communication.
+.SH "OPTIONS"
+.TP
+\-s server_address
+The network name or IP address of the server system listening for
+communication. The name or address used must route over an RDMA device.
+This option must be specified by the client.
+.TP
+\-b bind_address
+The local network address to bind to.
+To bind to any address with IPv6 use -b ::0 .
+.TP
+\-c connections
+The number of communication paths to establish between the client and server.
+The test uses unreliable datagram communication, so no actual connections are
+formed. (default 1)
+.TP
+\-C message_count
+The number of messages to transfer over each connection. (default 10)
+.TP
+\-S message_size
+The size of each message transferred, in bytes. This value must be smaller
+than the MTU of the underlying RDMA transport, or an error will occur.
+(default 100)
+.TP
+\-t tos
+Indicates the type of service used for the communication. Type of service
+is implementation dependent based on subnet configuration.
+.TP
+\-p port_space
+The port space of the datagram communication. May be either the RDMA
+UDP (0x0111) or IPoIB (0x0002) port space. (default RDMA_PS_UDP)
+.SH "NOTES"
+Basic usage is to start udaddy on a server system, then run
+udaddy -s server_name on a client system.
+.P
+Because this test maps RDMA resources to userspace, users must ensure
+that they have available system resources and permissions. See the
+libibverbs README file for additional details.
+.SH "SEE ALSO"
+rdma_cm(7), ucmatose(1), mckey(1), rping(1)
diff --git a/librdmacm/man/udpong.1 b/librdmacm/man/udpong.1
new file mode 100644
index 0000000..008178b
--- /dev/null
+++ b/librdmacm/man/udpong.1
@@ -0,0 +1,62 @@
+.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+.TH "UDPONG" 1 "2017-04-28" "librdmacm" "librdmacm" librdmacm
+.SH NAME
+udpong \- unreliable datagram streaming over RDMA ping-pong test.
+.SH SYNOPSIS
+.sp
+.nf
+\fIudpong\fR [-s server_address] [-b bind_address]
+	[-B buffer_size] [-C transfer_count]
+	[-S transfer_size] [-p server_port] [-T test_option]
+.fi
+.SH "DESCRIPTION"
+Uses the unreliable datagram streaming over RDMA protocol (rsocket) to
+connect and exchange data between a client and server application.
+.SH "OPTIONS"
+.TP
+\-s server_address
+The network name or IP address of the server system listening for
+connections. The name or address used must route over an RDMA device.
+This option must be specified by the client.
+.TP
+\-b bind_address
+The local network address to bind to.
+.TP
+\-B buffer_size
+Indicates the size of the send and receive network buffers.
+.TP
+\-C transfer_count
+The number of messages to transfer from the client to the server and
+back again on each iteration. (default 1000)
+.TP
+\-S transfer_size
+The size of each send transfer, in bytes. (default 1000)
+.TP
+\-p server_port
+The server's port number.
+.TP
+\-T test_option
+Specifies test parameters. Available options are:
+.P
+s | socket - uses standard socket calls to transfer data
+.P
+a | async - uses asynchronous operation (e.g. select / poll)
+.P
+b | blocking - uses blocking calls
+.P
+n | nonblocking - uses non-blocking calls
+.P
+e | echo - server echoes all messages
+.SH "NOTES"
+Basic usage is to start udpong on a server system, then run
+udpong -s server_name on a client system. udpong
+will run a series of latency and bandwidth performance tests.
+Specifying a different transfer_count or transfer_size
+value will run a user-customized test using default values where none
+have been specified.
+.P
+Because this test maps RDMA resources to userspace, users must ensure
+that they have available system resources and permissions. See the
+libibverbs README file for additional details.
+.SH "SEE ALSO"
+rdma_cm(7)
diff --git a/librdmacm/preload.c b/librdmacm/preload.c
new file mode 100644
index 0000000..d46beb1
--- /dev/null
+++ b/librdmacm/preload.c
@@ -0,0 +1,1188 @@
+/*
+ * Copyright (c) 2011-2012 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  - Redistributions of source code must retain the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#define _GNU_SOURCE
+#include <config.h>
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/sendfile.h>
+#include <stdarg.h>
+#include <dlfcn.h>
+#include <netdb.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <netinet/tcp.h>
+#include <semaphore.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include <sys/uio.h>
+
+#include <rdma/rdma_cma.h>
+#include <rdma/rdma_verbs.h>
+#include <rdma/rsocket.h>
+#include "cma.h"
+#include "indexer.h"
+
+struct socket_calls {
+	int (*socket)(int domain, int type, int protocol);
+	int (*bind)(int socket, const struct sockaddr *addr, socklen_t addrlen);
+	int (*listen)(int socket, int backlog);
+	int (*accept)(int socket, struct sockaddr *addr, socklen_t *addrlen);
+	int (*connect)(int socket, const struct sockaddr *addr, socklen_t addrlen);
+	ssize_t (*recv)(int socket, void *buf, size_t len, int flags);
+	ssize_t (*recvfrom)(int socket, void *buf, size_t len, int flags,
+			    struct sockaddr *src_addr, socklen_t *addrlen);
+	ssize_t (*recvmsg)(int socket, struct msghdr *msg, int flags);
+	ssize_t (*read)(int socket, void *buf, size_t count);
+	ssize_t (*readv)(int socket, const struct iovec *iov, int iovcnt);
+	ssize_t (*send)(int socket, const void *buf, size_t len, int flags);
+	ssize_t (*sendto)(int socket, const void *buf, size_t len, int flags,
+			  const struct sockaddr *dest_addr, socklen_t addrlen);
+	ssize_t (*sendmsg)(int socket, const struct msghdr *msg, int flags);
+	ssize_t (*write)(int socket, const void *buf, size_t count);
+	ssize_t (*writev)(int socket, const struct iovec *iov, int iovcnt);
+	int (*poll)(struct pollfd *fds, nfds_t nfds, int timeout);
+	int (*shutdown)(int socket, int how);
+	int (*close)(int socket);
+	int (*getpeername)(int socket, struct sockaddr *addr, socklen_t *addrlen);
+	int (*getsockname)(int socket, struct sockaddr *addr, socklen_t *addrlen);
+	int (*setsockopt)(int socket, int level, int optname,
+			  const void *optval, socklen_t optlen);
+	int (*getsockopt)(int socket, int level, int optname,
+			  void *optval, socklen_t *optlen);
+	int (*fcntl)(int socket, int cmd, ...
/* arg */); + int (*dup2)(int oldfd, int newfd); + ssize_t (*sendfile)(int out_fd, int in_fd, off_t *offset, size_t count); + int (*fxstat)(int ver, int fd, struct stat *buf); +}; + +static struct socket_calls real; +static struct socket_calls rs; + +static struct index_map idm; +static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER; + +static int sq_size; +static int rq_size; +static int sq_inline; +static int fork_support; + +enum fd_type { + fd_normal, + fd_rsocket +}; + +enum fd_fork_state { + fd_ready, + fd_fork, + fd_fork_listen, + fd_fork_active, + fd_fork_passive +}; + +struct fd_info { + enum fd_type type; + enum fd_fork_state state; + int fd; + int dupfd; + _Atomic(int) refcnt; +}; + +struct config_entry { + char *name; + int domain; + int type; + int protocol; +}; + +static struct config_entry *config; +static int config_cnt; + +static void free_config(void) +{ + while (config_cnt) + free(config[--config_cnt].name); + + free(config); +} + +/* + * Config file format: + * # Starting '#' indicates comment + * # wild card values are supported using '*' + * # domain - *, INET, INET6, IB + * # type - *, STREAM, DGRAM + * # protocol - *, TCP, UDP + * program_name domain type protocol + */ +static void scan_config(void) +{ + struct config_entry *new_config; + FILE *fp; + char line[120], prog[64], dom[16], type[16], proto[16]; + + fp = fopen(RS_CONF_DIR "/preload_config", "r"); + if (!fp) + return; + + while (fgets(line, sizeof(line), fp)) { + if (line[0] == '#') + continue; + + if (sscanf(line, "%63s%15s%15s%15s", prog, dom, type, proto) != 4) + continue; + + new_config = realloc(config, (config_cnt + 1) * + sizeof(struct config_entry)); + if (!new_config) + break; + + config = new_config; + memset(&config[config_cnt], 0, sizeof(struct config_entry)); + + if (!strcasecmp(dom, "INET") || + !strcasecmp(dom, "AF_INET") || + !strcasecmp(dom, "PF_INET")) { + config[config_cnt].domain = AF_INET; + } else if (!strcasecmp(dom, "INET6") || + !strcasecmp(dom, "AF_INET6") || + !strcasecmp(dom, "PF_INET6")) { + config[config_cnt].domain = AF_INET6; + } else if (!strcasecmp(dom, "IB") || + !strcasecmp(dom, "AF_IB") || + !strcasecmp(dom, "PF_IB")) { + config[config_cnt].domain = AF_IB; + } else if (strcmp(dom, "*")) { + continue; + } + + if (!strcasecmp(type, "STREAM") || + !strcasecmp(type, "SOCK_STREAM")) { + config[config_cnt].type = SOCK_STREAM; + } else if (!strcasecmp(type, "DGRAM") || + !strcasecmp(type, "SOCK_DGRAM")) { + config[config_cnt].type = SOCK_DGRAM; + } else if (strcmp(type, "*")) { + continue; + } + + if (!strcasecmp(proto, "TCP") || + !strcasecmp(proto, "IPPROTO_TCP")) { + config[config_cnt].protocol = IPPROTO_TCP; + } else if (!strcasecmp(proto, "UDP") || + !strcasecmp(proto, "IPPROTO_UDP")) { + config[config_cnt].protocol = IPPROTO_UDP; + } else if (strcmp(proto, "*")) { + continue; + } + + if (strcmp(prog, "*")) { + if (!(config[config_cnt].name = strdup(prog))) + continue; + } + + config_cnt++; + } + + fclose(fp); + if (config_cnt) + atexit(free_config); +} + +static int intercept_socket(int domain, int type, int protocol) +{ + int i; + + if (!config_cnt) + return 1; + + if (!protocol) { + if (type == SOCK_STREAM) + protocol = IPPROTO_TCP; + else if (type == SOCK_DGRAM) + protocol = IPPROTO_UDP; + } + + for (i = 0; i < config_cnt; i++) { + if ((!config[i].name || + !strncasecmp(config[i].name, program_invocation_short_name, + strlen(config[i].name))) && + (!config[i].domain || config[i].domain == domain) && + (!config[i].type || config[i].type == type) && + 
(!config[i].protocol || config[i].protocol == protocol)) + return 1; + } + + return 0; +} + +static int fd_open(void) +{ + struct fd_info *fdi; + int ret, index; + + fdi = calloc(1, sizeof(*fdi)); + if (!fdi) + return ERR(ENOMEM); + + index = open("/dev/null", O_RDONLY); + if (index < 0) { + ret = index; + goto err1; + } + + fdi->dupfd = -1; + atomic_store(&fdi->refcnt, 1); + pthread_mutex_lock(&mut); + ret = idm_set(&idm, index, fdi); + pthread_mutex_unlock(&mut); + if (ret < 0) + goto err2; + + return index; + +err2: + real.close(index); +err1: + free(fdi); + return ret; +} + +static void fd_store(int index, int fd, enum fd_type type, enum fd_fork_state state) +{ + struct fd_info *fdi; + + fdi = idm_at(&idm, index); + fdi->fd = fd; + fdi->type = type; + fdi->state = state; +} + +static inline enum fd_type fd_get(int index, int *fd) +{ + struct fd_info *fdi; + + fdi = idm_lookup(&idm, index); + if (fdi) { + *fd = fdi->fd; + return fdi->type; + + } else { + *fd = index; + return fd_normal; + } +} + +static inline int fd_getd(int index) +{ + struct fd_info *fdi; + + fdi = idm_lookup(&idm, index); + return fdi ? fdi->fd : index; +} + +static inline enum fd_fork_state fd_gets(int index) +{ + struct fd_info *fdi; + + fdi = idm_lookup(&idm, index); + return fdi ? fdi->state : fd_ready; +} + +static inline enum fd_type fd_gett(int index) +{ + struct fd_info *fdi; + + fdi = idm_lookup(&idm, index); + return fdi ? fdi->type : fd_normal; +} + +static enum fd_type fd_close(int index, int *fd) +{ + struct fd_info *fdi; + enum fd_type type; + + fdi = idm_lookup(&idm, index); + if (fdi) { + idm_clear(&idm, index); + *fd = fdi->fd; + type = fdi->type; + real.close(index); + free(fdi); + } else { + *fd = index; + type = fd_normal; + } + return type; +} + +static void getenv_options(void) +{ + char *var; + + var = getenv("RS_SQ_SIZE"); + if (var) + sq_size = atoi(var); + + var = getenv("RS_RQ_SIZE"); + if (var) + rq_size = atoi(var); + + var = getenv("RS_INLINE"); + if (var) + sq_inline = atoi(var); + + var = getenv("RDMAV_FORK_SAFE"); + if (var) + fork_support = atoi(var); +} + +static void init_preload(void) +{ + static int init; + + /* Quick check without lock */ + if (init) + return; + + pthread_mutex_lock(&mut); + if (init) + goto out; + + real.socket = dlsym(RTLD_NEXT, "socket"); + real.bind = dlsym(RTLD_NEXT, "bind"); + real.listen = dlsym(RTLD_NEXT, "listen"); + real.accept = dlsym(RTLD_NEXT, "accept"); + real.connect = dlsym(RTLD_NEXT, "connect"); + real.recv = dlsym(RTLD_NEXT, "recv"); + real.recvfrom = dlsym(RTLD_NEXT, "recvfrom"); + real.recvmsg = dlsym(RTLD_NEXT, "recvmsg"); + real.read = dlsym(RTLD_NEXT, "read"); + real.readv = dlsym(RTLD_NEXT, "readv"); + real.send = dlsym(RTLD_NEXT, "send"); + real.sendto = dlsym(RTLD_NEXT, "sendto"); + real.sendmsg = dlsym(RTLD_NEXT, "sendmsg"); + real.write = dlsym(RTLD_NEXT, "write"); + real.writev = dlsym(RTLD_NEXT, "writev"); + real.poll = dlsym(RTLD_NEXT, "poll"); + real.shutdown = dlsym(RTLD_NEXT, "shutdown"); + real.close = dlsym(RTLD_NEXT, "close"); + real.getpeername = dlsym(RTLD_NEXT, "getpeername"); + real.getsockname = dlsym(RTLD_NEXT, "getsockname"); + real.setsockopt = dlsym(RTLD_NEXT, "setsockopt"); + real.getsockopt = dlsym(RTLD_NEXT, "getsockopt"); + real.fcntl = dlsym(RTLD_NEXT, "fcntl"); + real.dup2 = dlsym(RTLD_NEXT, "dup2"); + real.sendfile = dlsym(RTLD_NEXT, "sendfile"); + real.fxstat = dlsym(RTLD_NEXT, "__fxstat"); + + rs.socket = dlsym(RTLD_DEFAULT, "rsocket"); + rs.bind = dlsym(RTLD_DEFAULT, "rbind"); + rs.listen = 
dlsym(RTLD_DEFAULT, "rlisten"); + rs.accept = dlsym(RTLD_DEFAULT, "raccept"); + rs.connect = dlsym(RTLD_DEFAULT, "rconnect"); + rs.recv = dlsym(RTLD_DEFAULT, "rrecv"); + rs.recvfrom = dlsym(RTLD_DEFAULT, "rrecvfrom"); + rs.recvmsg = dlsym(RTLD_DEFAULT, "rrecvmsg"); + rs.read = dlsym(RTLD_DEFAULT, "rread"); + rs.readv = dlsym(RTLD_DEFAULT, "rreadv"); + rs.send = dlsym(RTLD_DEFAULT, "rsend"); + rs.sendto = dlsym(RTLD_DEFAULT, "rsendto"); + rs.sendmsg = dlsym(RTLD_DEFAULT, "rsendmsg"); + rs.write = dlsym(RTLD_DEFAULT, "rwrite"); + rs.writev = dlsym(RTLD_DEFAULT, "rwritev"); + rs.poll = dlsym(RTLD_DEFAULT, "rpoll"); + rs.shutdown = dlsym(RTLD_DEFAULT, "rshutdown"); + rs.close = dlsym(RTLD_DEFAULT, "rclose"); + rs.getpeername = dlsym(RTLD_DEFAULT, "rgetpeername"); + rs.getsockname = dlsym(RTLD_DEFAULT, "rgetsockname"); + rs.setsockopt = dlsym(RTLD_DEFAULT, "rsetsockopt"); + rs.getsockopt = dlsym(RTLD_DEFAULT, "rgetsockopt"); + rs.fcntl = dlsym(RTLD_DEFAULT, "rfcntl"); + + getenv_options(); + scan_config(); + init = 1; +out: + pthread_mutex_unlock(&mut); +} + +/* + * We currently only handle copying a few common values. + */ +static int copysockopts(int dfd, int sfd, struct socket_calls *dapi, + struct socket_calls *sapi) +{ + socklen_t len; + int param, ret; + + ret = sapi->fcntl(sfd, F_GETFL); + if (ret > 0) + ret = dapi->fcntl(dfd, F_SETFL, ret); + if (ret) + return ret; + + len = sizeof param; + ret = sapi->getsockopt(sfd, SOL_SOCKET, SO_REUSEADDR, ¶m, &len); + if (param && !ret) + ret = dapi->setsockopt(dfd, SOL_SOCKET, SO_REUSEADDR, ¶m, len); + if (ret) + return ret; + + len = sizeof param; + ret = sapi->getsockopt(sfd, IPPROTO_TCP, TCP_NODELAY, ¶m, &len); + if (param && !ret) + ret = dapi->setsockopt(dfd, IPPROTO_TCP, TCP_NODELAY, ¶m, len); + if (ret) + return ret; + + return 0; +} + +/* + * Convert between an rsocket and a normal socket. + */ +static int transpose_socket(int socket, enum fd_type new_type) +{ + socklen_t len = 0; + int sfd, dfd, param, ret; + struct socket_calls *sapi, *dapi; + + sfd = fd_getd(socket); + if (new_type == fd_rsocket) { + dapi = &rs; + sapi = ℜ + } else { + dapi = ℜ + sapi = &rs; + } + + ret = sapi->getsockname(sfd, NULL, &len); + if (ret) + return ret; + + param = (len == sizeof(struct sockaddr_in6)) ? PF_INET6 : PF_INET; + dfd = dapi->socket(param, SOCK_STREAM, 0); + if (dfd < 0) + return dfd; + + ret = copysockopts(dfd, sfd, dapi, sapi); + if (ret) + goto err; + + fd_store(socket, dfd, new_type, fd_ready); + return dfd; + +err: + dapi->close(dfd); + return ret; +} + +/* + * Use defaults on failure. 
+ */ +static void set_rsocket_options(int rsocket) +{ + if (sq_size) + rsetsockopt(rsocket, SOL_RDMA, RDMA_SQSIZE, &sq_size, sizeof sq_size); + + if (rq_size) + rsetsockopt(rsocket, SOL_RDMA, RDMA_RQSIZE, &rq_size, sizeof rq_size); + + if (sq_inline) + rsetsockopt(rsocket, SOL_RDMA, RDMA_INLINE, &sq_inline, sizeof sq_inline); +} + +int socket(int domain, int type, int protocol) +{ + static __thread int recursive; + int index, ret; + + init_preload(); + + if (recursive || !intercept_socket(domain, type, protocol)) + goto real; + + index = fd_open(); + if (index < 0) + return index; + + if (fork_support && (domain == PF_INET || domain == PF_INET6) && + (type == SOCK_STREAM) && (!protocol || protocol == IPPROTO_TCP)) { + ret = real.socket(domain, type, protocol); + if (ret < 0) + return ret; + fd_store(index, ret, fd_normal, fd_fork); + return index; + } + + recursive = 1; + ret = rsocket(domain, type, protocol); + recursive = 0; + if (ret >= 0) { + fd_store(index, ret, fd_rsocket, fd_ready); + set_rsocket_options(ret); + return index; + } + fd_close(index, &ret); +real: + return real.socket(domain, type, protocol); +} + +int bind(int socket, const struct sockaddr *addr, socklen_t addrlen) +{ + int fd; + return (fd_get(socket, &fd) == fd_rsocket) ? + rbind(fd, addr, addrlen) : real.bind(fd, addr, addrlen); +} + +int listen(int socket, int backlog) +{ + int fd, ret; + if (fd_get(socket, &fd) == fd_rsocket) { + ret = rlisten(fd, backlog); + } else { + ret = real.listen(fd, backlog); + if (!ret && fd_gets(socket) == fd_fork) + fd_store(socket, fd, fd_normal, fd_fork_listen); + } + return ret; +} + +int accept(int socket, struct sockaddr *addr, socklen_t *addrlen) +{ + int fd, index, ret; + + if (fd_get(socket, &fd) == fd_rsocket) { + index = fd_open(); + if (index < 0) + return index; + + ret = raccept(fd, addr, addrlen); + if (ret < 0) { + fd_close(index, &fd); + return ret; + } + + fd_store(index, ret, fd_rsocket, fd_ready); + return index; + } else if (fd_gets(socket) == fd_fork_listen) { + index = fd_open(); + if (index < 0) + return index; + + ret = real.accept(fd, addr, addrlen); + if (ret < 0) { + fd_close(index, &fd); + return ret; + } + + fd_store(index, ret, fd_normal, fd_fork_passive); + return index; + } else { + return real.accept(fd, addr, addrlen); + } +} + +/* + * We can't fork RDMA connections and pass them from the parent to the child + * process. Instead, we need to establish the RDMA connection after calling + * fork. To do this, we delay establishing the RDMA connection until we try + * to send/receive on the server side. 
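+ *
+ * A sketch of the resulting handshake, as implemented by fork_passive()
+ * and fork_active() below:
+ *
+ *   passive (forked server)           active (client)
+ *   rbind()/rlisten() on bound port   real.recv(MSG_PEEK) waits for msg
+ *   real.write(msg = 0)  -- TCP -->   sees msg 0, real.getpeername()
+ *   raccept()           <-- RDMA --   rsocket() + rconnect()
+ *
+ * Both sides then swap the original TCP fd for the new rsocket with
+ * fd_store(..., fd_rsocket, fd_ready) and shut down the TCP socket.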
+ */
+static void fork_active(int socket)
+{
+	struct sockaddr_storage addr;
+	int sfd, dfd, ret;
+	socklen_t len;
+	uint32_t msg;
+	long flags;
+
+	sfd = fd_getd(socket);
+
+	flags = real.fcntl(sfd, F_GETFL);
+	real.fcntl(sfd, F_SETFL, 0);
+	ret = real.recv(sfd, &msg, sizeof msg, MSG_PEEK);
+	real.fcntl(sfd, F_SETFL, flags);
+	if ((ret != sizeof msg) || msg)
+		goto err1;
+
+	len = sizeof addr;
+	ret = real.getpeername(sfd, (struct sockaddr *) &addr, &len);
+	if (ret)
+		goto err1;
+
+	dfd = rsocket(addr.ss_family, SOCK_STREAM, 0);
+	if (dfd < 0)
+		goto err1;
+
+	ret = rconnect(dfd, (struct sockaddr *) &addr, len);
+	if (ret)
+		goto err2;
+
+	set_rsocket_options(dfd);
+	copysockopts(dfd, sfd, &rs, &real);
+	real.shutdown(sfd, SHUT_RDWR);
+	real.close(sfd);
+	fd_store(socket, dfd, fd_rsocket, fd_ready);
+	return;
+
+err2:
+	rclose(dfd);
+err1:
+	fd_store(socket, sfd, fd_normal, fd_ready);
+}
+
+/*
+ * The server will start listening for the new connection, then send a
+ * message to the active side when the listen is ready.  This does leave
+ * fork unsupported in the following case: the server is nonblocking and
+ * calls select/poll waiting to receive data from the client.
+ */
+static void fork_passive(int socket)
+{
+	struct sockaddr_in6 sin6;
+	sem_t *sem;
+	int lfd, sfd, dfd, ret, param;
+	socklen_t len;
+	uint32_t msg;
+
+	sfd = fd_getd(socket);
+
+	len = sizeof sin6;
+	ret = real.getsockname(sfd, (struct sockaddr *) &sin6, &len);
+	if (ret)
+		goto out;
+	sin6.sin6_flowinfo = 0;
+	sin6.sin6_scope_id = 0;
+	memset(&sin6.sin6_addr, 0, sizeof sin6.sin6_addr);
+
+	sem = sem_open("/rsocket_fork", O_CREAT | O_RDWR,
+		       S_IRWXU | S_IRWXG, 1);
+	if (sem == SEM_FAILED) {
+		ret = -1;
+		goto out;
+	}
+
+	lfd = rsocket(sin6.sin6_family, SOCK_STREAM, 0);
+	if (lfd < 0) {
+		ret = lfd;
+		goto sclose;
+	}
+
+	param = 1;
+	rsetsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &param, sizeof param);
+
+	sem_wait(sem);
+	ret = rbind(lfd, (struct sockaddr *) &sin6, sizeof sin6);
+	if (ret)
+		goto lclose;
+
+	ret = rlisten(lfd, 1);
+	if (ret)
+		goto lclose;
+
+	msg = 0;
+	len = real.write(sfd, &msg, sizeof msg);
+	if (len != sizeof msg)
+		goto lclose;
+
+	dfd = raccept(lfd, NULL, NULL);
+	if (dfd < 0) {
+		ret = dfd;
+		goto lclose;
+	}
+
+	set_rsocket_options(dfd);
+	copysockopts(dfd, sfd, &rs, &real);
+	real.shutdown(sfd, SHUT_RDWR);
+	real.close(sfd);
+	fd_store(socket, dfd, fd_rsocket, fd_ready);
+
+lclose:
+	rclose(lfd);
+	sem_post(sem);
+sclose:
+	sem_close(sem);
+out:
+	if (ret)
+		fd_store(socket, sfd, fd_normal, fd_ready);
+}
+
+static inline enum fd_type fd_fork_get(int index, int *fd)
+{
+	struct fd_info *fdi;
+
+	fdi = idm_lookup(&idm, index);
+	if (fdi) {
+		if (fdi->state == fd_fork_passive)
+			fork_passive(index);
+		else if (fdi->state == fd_fork_active)
+			fork_active(index);
+		*fd = fdi->fd;
+		return fdi->type;
+
+	} else {
+		*fd = index;
+		return fd_normal;
+	}
+}
+
+int connect(int socket, const struct sockaddr *addr, socklen_t addrlen)
+{
+	int fd, ret;
+
+	if (fd_get(socket, &fd) == fd_rsocket) {
+		ret = rconnect(fd, addr, addrlen);
+		if (!ret || errno == EINPROGRESS)
+			return ret;
+
+		ret = transpose_socket(socket, fd_normal);
+		if (ret < 0)
+			return ret;
+
+		rclose(fd);
+		fd = ret;
+	} else if (fd_gets(socket) == fd_fork) {
+		fd_store(socket, fd, fd_normal, fd_fork_active);
+	}
+
+	return real.connect(fd, addr, addrlen);
+}
+
+ssize_t recv(int socket, void *buf, size_t len, int flags)
+{
+	int fd;
+	return (fd_fork_get(socket, &fd) == fd_rsocket) ?
+ rrecv(fd, buf, len, flags) : real.recv(fd, buf, len, flags); +} + +ssize_t recvfrom(int socket, void *buf, size_t len, int flags, + struct sockaddr *src_addr, socklen_t *addrlen) +{ + int fd; + return (fd_fork_get(socket, &fd) == fd_rsocket) ? + rrecvfrom(fd, buf, len, flags, src_addr, addrlen) : + real.recvfrom(fd, buf, len, flags, src_addr, addrlen); +} + +ssize_t recvmsg(int socket, struct msghdr *msg, int flags) +{ + int fd; + return (fd_fork_get(socket, &fd) == fd_rsocket) ? + rrecvmsg(fd, msg, flags) : real.recvmsg(fd, msg, flags); +} + +ssize_t read(int socket, void *buf, size_t count) +{ + int fd; + init_preload(); + return (fd_fork_get(socket, &fd) == fd_rsocket) ? + rread(fd, buf, count) : real.read(fd, buf, count); +} + +ssize_t readv(int socket, const struct iovec *iov, int iovcnt) +{ + int fd; + init_preload(); + return (fd_fork_get(socket, &fd) == fd_rsocket) ? + rreadv(fd, iov, iovcnt) : real.readv(fd, iov, iovcnt); +} + +ssize_t send(int socket, const void *buf, size_t len, int flags) +{ + int fd; + return (fd_fork_get(socket, &fd) == fd_rsocket) ? + rsend(fd, buf, len, flags) : real.send(fd, buf, len, flags); +} + +ssize_t sendto(int socket, const void *buf, size_t len, int flags, + const struct sockaddr *dest_addr, socklen_t addrlen) +{ + int fd; + return (fd_fork_get(socket, &fd) == fd_rsocket) ? + rsendto(fd, buf, len, flags, dest_addr, addrlen) : + real.sendto(fd, buf, len, flags, dest_addr, addrlen); +} + +ssize_t sendmsg(int socket, const struct msghdr *msg, int flags) +{ + int fd; + return (fd_fork_get(socket, &fd) == fd_rsocket) ? + rsendmsg(fd, msg, flags) : real.sendmsg(fd, msg, flags); +} + +ssize_t write(int socket, const void *buf, size_t count) +{ + int fd; + init_preload(); + return (fd_fork_get(socket, &fd) == fd_rsocket) ? + rwrite(fd, buf, count) : real.write(fd, buf, count); +} + +ssize_t writev(int socket, const struct iovec *iov, int iovcnt) +{ + int fd; + init_preload(); + return (fd_fork_get(socket, &fd) == fd_rsocket) ? + rwritev(fd, iov, iovcnt) : real.writev(fd, iov, iovcnt); +} + +static struct pollfd *fds_alloc(nfds_t nfds) +{ + static __thread struct pollfd *rfds; + static __thread nfds_t rnfds; + + if (nfds > rnfds) { + if (rfds) + free(rfds); + + rfds = malloc(sizeof(*rfds) * nfds); + rnfds = rfds ? nfds : 0; + } + + return rfds; +} + +int poll(struct pollfd *fds, nfds_t nfds, int timeout) +{ + struct pollfd *rfds; + int i, ret; + + init_preload(); + for (i = 0; i < nfds; i++) { + if (fd_gett(fds[i].fd) == fd_rsocket) + goto use_rpoll; + } + + return real.poll(fds, nfds, timeout); + +use_rpoll: + rfds = fds_alloc(nfds); + if (!rfds) + return ERR(ENOMEM); + + for (i = 0; i < nfds; i++) { + rfds[i].fd = fd_getd(fds[i].fd); + rfds[i].events = fds[i].events; + rfds[i].revents = 0; + } + + ret = rpoll(rfds, nfds, timeout); + + for (i = 0; i < nfds; i++) + fds[i].revents = rfds[i].revents; + + return ret; +} + +static void select_to_rpoll(struct pollfd *fds, int *nfds, + fd_set *readfds, fd_set *writefds, fd_set *exceptfds) +{ + int fd, events, i = 0; + + for (fd = 0; fd < *nfds; fd++) { + events = (readfds && FD_ISSET(fd, readfds)) ? 
POLLIN : 0; + if (writefds && FD_ISSET(fd, writefds)) + events |= POLLOUT; + + if (events || (exceptfds && FD_ISSET(fd, exceptfds))) { + fds[i].fd = fd_getd(fd); + fds[i++].events = events; + } + } + + *nfds = i; +} + +static int rpoll_to_select(struct pollfd *fds, int nfds, + fd_set *readfds, fd_set *writefds, fd_set *exceptfds) +{ + int fd, rfd, i, cnt = 0; + + for (i = 0, fd = 0; i < nfds; fd++) { + rfd = fd_getd(fd); + if (rfd != fds[i].fd) + continue; + + if (readfds && (fds[i].revents & POLLIN)) { + FD_SET(fd, readfds); + cnt++; + } + + if (writefds && (fds[i].revents & POLLOUT)) { + FD_SET(fd, writefds); + cnt++; + } + + if (exceptfds && (fds[i].revents & ~(POLLIN | POLLOUT))) { + FD_SET(fd, exceptfds); + cnt++; + } + i++; + } + + return cnt; +} + +static int rs_convert_timeout(struct timeval *timeout) +{ + return !timeout ? -1 : timeout->tv_sec * 1000 + timeout->tv_usec / 1000; +} + +int select(int nfds, fd_set *readfds, fd_set *writefds, + fd_set *exceptfds, struct timeval *timeout) +{ + struct pollfd *fds; + int ret; + + fds = fds_alloc(nfds); + if (!fds) + return ERR(ENOMEM); + + select_to_rpoll(fds, &nfds, readfds, writefds, exceptfds); + ret = rpoll(fds, nfds, rs_convert_timeout(timeout)); + + if (readfds) + FD_ZERO(readfds); + if (writefds) + FD_ZERO(writefds); + if (exceptfds) + FD_ZERO(exceptfds); + + if (ret > 0) + ret = rpoll_to_select(fds, nfds, readfds, writefds, exceptfds); + + return ret; +} + +int shutdown(int socket, int how) +{ + int fd; + return (fd_get(socket, &fd) == fd_rsocket) ? + rshutdown(fd, how) : real.shutdown(fd, how); +} + +int close(int socket) +{ + struct fd_info *fdi; + int ret; + + init_preload(); + fdi = idm_lookup(&idm, socket); + if (!fdi) + return real.close(socket); + + if (fdi->dupfd != -1) { + ret = close(fdi->dupfd); + if (ret) + return ret; + } + + if (atomic_fetch_sub(&fdi->refcnt, 1) != 1) + return 0; + + idm_clear(&idm, socket); + real.close(socket); + ret = (fdi->type == fd_rsocket) ? rclose(fdi->fd) : real.close(fdi->fd); + free(fdi); + return ret; +} + +int getpeername(int socket, struct sockaddr *addr, socklen_t *addrlen) +{ + int fd; + return (fd_get(socket, &fd) == fd_rsocket) ? + rgetpeername(fd, addr, addrlen) : + real.getpeername(fd, addr, addrlen); +} + +int getsockname(int socket, struct sockaddr *addr, socklen_t *addrlen) +{ + int fd; + init_preload(); + return (fd_get(socket, &fd) == fd_rsocket) ? + rgetsockname(fd, addr, addrlen) : + real.getsockname(fd, addr, addrlen); +} + +int setsockopt(int socket, int level, int optname, + const void *optval, socklen_t optlen) +{ + int fd; + return (fd_get(socket, &fd) == fd_rsocket) ? + rsetsockopt(fd, level, optname, optval, optlen) : + real.setsockopt(fd, level, optname, optval, optlen); +} + +int getsockopt(int socket, int level, int optname, + void *optval, socklen_t *optlen) +{ + int fd; + return (fd_get(socket, &fd) == fd_rsocket) ? + rgetsockopt(fd, level, optname, optval, optlen) : + real.getsockopt(fd, level, optname, optval, optlen); +} + +int fcntl(int socket, int cmd, ... /* arg */) +{ + va_list args; + long lparam; + void *pparam; + int fd, ret; + + init_preload(); + va_start(args, cmd); + switch (cmd) { + case F_GETFD: + case F_GETFL: + case F_GETOWN: + case F_GETSIG: + case F_GETLEASE: + ret = (fd_get(socket, &fd) == fd_rsocket) ? 
+ rfcntl(fd, cmd) : real.fcntl(fd, cmd); + break; + case F_DUPFD: + /*case F_DUPFD_CLOEXEC:*/ + case F_SETFD: + case F_SETFL: + case F_SETOWN: + case F_SETSIG: + case F_SETLEASE: + case F_NOTIFY: + lparam = va_arg(args, long); + ret = (fd_get(socket, &fd) == fd_rsocket) ? + rfcntl(fd, cmd, lparam) : real.fcntl(fd, cmd, lparam); + break; + default: + pparam = va_arg(args, void *); + ret = (fd_get(socket, &fd) == fd_rsocket) ? + rfcntl(fd, cmd, pparam) : real.fcntl(fd, cmd, pparam); + break; + } + va_end(args); + return ret; +} + +/* + * dup2 is not thread safe + */ +int dup2(int oldfd, int newfd) +{ + struct fd_info *oldfdi, *newfdi; + int ret; + + init_preload(); + oldfdi = idm_lookup(&idm, oldfd); + if (oldfdi) { + if (oldfdi->state == fd_fork_passive) + fork_passive(oldfd); + else if (oldfdi->state == fd_fork_active) + fork_active(oldfd); + } + + newfdi = idm_lookup(&idm, newfd); + if (newfdi) { + /* newfd cannot have been dup'ed directly */ + if (atomic_load(&newfdi->refcnt) > 1) + return ERR(EBUSY); + close(newfd); + } + + ret = real.dup2(oldfd, newfd); + if (!oldfdi || ret != newfd) + return ret; + + newfdi = calloc(1, sizeof(*newfdi)); + if (!newfdi) { + close(newfd); + return ERR(ENOMEM); + } + + pthread_mutex_lock(&mut); + idm_set(&idm, newfd, newfdi); + pthread_mutex_unlock(&mut); + + newfdi->fd = oldfdi->fd; + newfdi->type = oldfdi->type; + if (oldfdi->dupfd != -1) { + newfdi->dupfd = oldfdi->dupfd; + oldfdi = idm_lookup(&idm, oldfdi->dupfd); + } else { + newfdi->dupfd = oldfd; + } + atomic_store(&newfdi->refcnt, 1); + atomic_fetch_add(&oldfdi->refcnt, 1); + return newfd; +} + +ssize_t sendfile(int out_fd, int in_fd, off_t *offset, size_t count) +{ + void *file_addr; + int fd; + size_t ret; + + if (fd_get(out_fd, &fd) != fd_rsocket) + return real.sendfile(fd, in_fd, offset, count); + + file_addr = mmap(NULL, count, PROT_READ, 0, in_fd, offset ? *offset : 0); + if (file_addr == (void *) -1) + return -1; + + ret = rwrite(fd, file_addr, count); + if ((ret > 0) && offset) + lseek(in_fd, ret, SEEK_CUR); + munmap(file_addr, count); + return ret; +} + +int __fxstat(int ver, int socket, struct stat *buf) +{ + int fd, ret; + + init_preload(); + if (fd_get(socket, &fd) == fd_rsocket) { + ret = real.fxstat(ver, socket, buf); + if (!ret) + buf->st_mode = (buf->st_mode & ~S_IFMT) | S_IFSOCK; + } else { + ret = real.fxstat(ver, fd, buf); + } + return ret; +} diff --git a/librdmacm/rdma_cma.h b/librdmacm/rdma_cma.h new file mode 100644 index 0000000..1905033 --- /dev/null +++ b/librdmacm/rdma_cma.h @@ -0,0 +1,760 @@ +/* + * Copyright (c) 2005 Voltaire Inc. All rights reserved. + * Copyright (c) 2005-2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(RDMA_CMA_H) +#define RDMA_CMA_H + +#include <netinet/in.h> +#include <sys/socket.h> +#include <infiniband/verbs.h> +#include <infiniband/sa.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Upon receiving a device removal event, users must destroy the associated + * RDMA identifier and release all resources allocated with the device. + */ +enum rdma_cm_event_type { + RDMA_CM_EVENT_ADDR_RESOLVED, + RDMA_CM_EVENT_ADDR_ERROR, + RDMA_CM_EVENT_ROUTE_RESOLVED, + RDMA_CM_EVENT_ROUTE_ERROR, + RDMA_CM_EVENT_CONNECT_REQUEST, + RDMA_CM_EVENT_CONNECT_RESPONSE, + RDMA_CM_EVENT_CONNECT_ERROR, + RDMA_CM_EVENT_UNREACHABLE, + RDMA_CM_EVENT_REJECTED, + RDMA_CM_EVENT_ESTABLISHED, + RDMA_CM_EVENT_DISCONNECTED, + RDMA_CM_EVENT_DEVICE_REMOVAL, + RDMA_CM_EVENT_MULTICAST_JOIN, + RDMA_CM_EVENT_MULTICAST_ERROR, + RDMA_CM_EVENT_ADDR_CHANGE, + RDMA_CM_EVENT_TIMEWAIT_EXIT +}; + +enum rdma_port_space { + RDMA_PS_IPOIB = 0x0002, + RDMA_PS_TCP = 0x0106, + RDMA_PS_UDP = 0x0111, + RDMA_PS_IB = 0x013F, +}; + +#define RDMA_IB_IP_PS_MASK 0xFFFFFFFFFFFF0000ULL +#define RDMA_IB_IP_PORT_MASK 0x000000000000FFFFULL +#define RDMA_IB_IP_PS_TCP 0x0000000001060000ULL +#define RDMA_IB_IP_PS_UDP 0x0000000001110000ULL +#define RDMA_IB_PS_IB 0x00000000013F0000ULL + +/* + * Global qkey value for UDP QPs and multicast groups created via the + * RDMA CM. + */ +#define RDMA_UDP_QKEY 0x01234567 + +struct rdma_ib_addr { + union ibv_gid sgid; + union ibv_gid dgid; + __be16 pkey; +}; + +struct rdma_addr { + union { + struct sockaddr src_addr; + struct sockaddr_in src_sin; + struct sockaddr_in6 src_sin6; + struct sockaddr_storage src_storage; + }; + union { + struct sockaddr dst_addr; + struct sockaddr_in dst_sin; + struct sockaddr_in6 dst_sin6; + struct sockaddr_storage dst_storage; + }; + union { + struct rdma_ib_addr ibaddr; + } addr; +}; + +struct rdma_route { + struct rdma_addr addr; + struct ibv_sa_path_rec *path_rec; + int num_paths; +}; + +struct rdma_event_channel { + int fd; +}; + +struct rdma_cm_id { + struct ibv_context *verbs; + struct rdma_event_channel *channel; + void *context; + struct ibv_qp *qp; + struct rdma_route route; + enum rdma_port_space ps; + uint8_t port_num; + struct rdma_cm_event *event; + struct ibv_comp_channel *send_cq_channel; + struct ibv_cq *send_cq; + struct ibv_comp_channel *recv_cq_channel; + struct ibv_cq *recv_cq; + struct ibv_srq *srq; + struct ibv_pd *pd; + enum ibv_qp_type qp_type; +}; + +enum { + RDMA_MAX_RESP_RES = 0xFF, + RDMA_MAX_INIT_DEPTH = 0xFF +}; + +struct rdma_conn_param { + const void *private_data; + uint8_t private_data_len; + uint8_t responder_resources; + uint8_t initiator_depth; + uint8_t flow_control; + uint8_t retry_count; /* ignored when accepting */ + uint8_t rnr_retry_count; + /* Fields below ignored if a QP is created on the rdma_cm_id. 
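+	 * If the active side creates and manages its own QP instead of calling
+	 * rdma_create_qp, srq and qp_num are expected to describe that
+	 * externally created QP (sketch of intent; see rdma_connect(3)).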
*/ + uint8_t srq; + uint32_t qp_num; +}; + +struct rdma_ud_param { + const void *private_data; + uint8_t private_data_len; + struct ibv_ah_attr ah_attr; + uint32_t qp_num; + uint32_t qkey; +}; + +struct rdma_cm_event { + struct rdma_cm_id *id; + struct rdma_cm_id *listen_id; + enum rdma_cm_event_type event; + int status; + union { + struct rdma_conn_param conn; + struct rdma_ud_param ud; + } param; +}; + +#define RAI_PASSIVE 0x00000001 +#define RAI_NUMERICHOST 0x00000002 +#define RAI_NOROUTE 0x00000004 +#define RAI_FAMILY 0x00000008 + +struct rdma_addrinfo { + int ai_flags; + int ai_family; + int ai_qp_type; + int ai_port_space; + socklen_t ai_src_len; + socklen_t ai_dst_len; + struct sockaddr *ai_src_addr; + struct sockaddr *ai_dst_addr; + char *ai_src_canonname; + char *ai_dst_canonname; + size_t ai_route_len; + void *ai_route; + size_t ai_connect_len; + void *ai_connect; + struct rdma_addrinfo *ai_next; +}; + +/* Multicast join compatibility mask attributes */ +enum rdma_cm_join_mc_attr_mask { + RDMA_CM_JOIN_MC_ATTR_ADDRESS = 1 << 0, + RDMA_CM_JOIN_MC_ATTR_JOIN_FLAGS = 1 << 1, + RDMA_CM_JOIN_MC_ATTR_RESERVED = 1 << 2, +}; + +/* Multicast join flags */ +enum rdma_cm_mc_join_flags { + RDMA_MC_JOIN_FLAG_FULLMEMBER, + RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER, + RDMA_MC_JOIN_FLAG_RESERVED, +}; + +struct rdma_cm_join_mc_attr_ex { + /* Bitwise OR between "rdma_cm_join_mc_attr_mask" enum */ + uint32_t comp_mask; + /* Use a flag from "rdma_cm_mc_join_flags" enum */ + uint32_t join_flags; + /* Multicast address identifying the group to join */ + struct sockaddr *addr; +}; + +/** + * rdma_create_event_channel - Open a channel used to report communication events. + * Description: + * Asynchronous events are reported to users through event channels. Each + * event channel maps to a file descriptor. + * Notes: + * All created event channels must be destroyed by calling + * rdma_destroy_event_channel. Users should call rdma_get_cm_event to + * retrieve events on an event channel. + * See also: + * rdma_get_cm_event, rdma_destroy_event_channel + */ +struct rdma_event_channel *rdma_create_event_channel(void); + +/** + * rdma_destroy_event_channel - Close an event communication channel. + * @channel: The communication channel to destroy. + * Description: + * Release all resources associated with an event channel and closes the + * associated file descriptor. + * Notes: + * All rdma_cm_id's associated with the event channel must be destroyed, + * and all returned events must be acked before calling this function. + * See also: + * rdma_create_event_channel, rdma_get_cm_event, rdma_ack_cm_event + */ +void rdma_destroy_event_channel(struct rdma_event_channel *channel); + +/** + * rdma_create_id - Allocate a communication identifier. + * @channel: The communication channel that events associated with the + * allocated rdma_cm_id will be reported on. + * @id: A reference where the allocated communication identifier will be + * returned. + * @context: User specified context associated with the rdma_cm_id. + * @ps: RDMA port space. + * Description: + * Creates an identifier that is used to track communication information. + * Notes: + * Rdma_cm_id's are conceptually equivalent to a socket for RDMA + * communication. The difference is that RDMA communication requires + * explicitly binding to a specified RDMA device before communication + * can occur, and most operations are asynchronous in nature. Communication + * events on an rdma_cm_id are reported through the associated event + * channel. 
Users must release the rdma_cm_id by calling rdma_destroy_id.
+ * See also:
+ * rdma_create_event_channel, rdma_destroy_id, rdma_get_devices,
+ * rdma_bind_addr, rdma_resolve_addr, rdma_connect, rdma_listen
+ */
+int rdma_create_id(struct rdma_event_channel *channel,
+		   struct rdma_cm_id **id, void *context,
+		   enum rdma_port_space ps);
+
+/**
+ * rdma_create_ep - Allocate a communication identifier and qp.
+ * @id: A reference where the allocated communication identifier will be
+ * returned.
+ * @res: Result from rdma_getaddrinfo, which specifies the source and
+ * destination addresses, plus optional routing and connection information.
+ * @pd: Optional protection domain. This parameter is ignored if qp_init_attr
+ * is NULL.
+ * @qp_init_attr: Optional attributes for a QP created on the rdma_cm_id.
+ * Description:
+ * Create an identifier and optional QP used for communication.
+ * Notes:
+ * If qp_init_attr is provided, then a queue pair will be allocated and
+ * associated with the rdma_cm_id. If a pd is provided, the QP will be
+ * created on that PD. Otherwise, the QP will be allocated on a default
+ * PD.
+ * The rdma_cm_id will be set to use synchronous operations (connect,
+ * listen, and get_request). To convert to asynchronous operation, the
+ * rdma_cm_id should be migrated to a user allocated event channel.
+ * See also:
+ * rdma_create_id, rdma_create_qp, rdma_migrate_id, rdma_connect,
+ * rdma_listen
+ */
+int rdma_create_ep(struct rdma_cm_id **id, struct rdma_addrinfo *res,
+		   struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr);
+
+/**
+ * rdma_destroy_ep - Deallocates a communication identifier and qp.
+ * @id: The communication identifier to destroy.
+ * Description:
+ * Destroys the specified rdma_cm_id and any associated QP created
+ * on that id.
+ * See also:
+ * rdma_create_ep
+ */
+void rdma_destroy_ep(struct rdma_cm_id *id);
+
+/**
+ * rdma_destroy_id - Release a communication identifier.
+ * @id: The communication identifier to destroy.
+ * Description:
+ * Destroys the specified rdma_cm_id and cancels any outstanding
+ * asynchronous operation.
+ * Notes:
+ * Users must free any associated QP with the rdma_cm_id before
+ * calling this routine and ack all related events.
+ * See also:
+ * rdma_create_id, rdma_destroy_qp, rdma_ack_cm_event
+ */
+int rdma_destroy_id(struct rdma_cm_id *id);
+
+/**
+ * rdma_bind_addr - Bind an RDMA identifier to a source address.
+ * @id: RDMA identifier.
+ * @addr: Local address information. Wildcard values are permitted.
+ * Description:
+ * Associates a source address with an rdma_cm_id. The address may be
+ * wildcarded. If binding to a specific local address, the rdma_cm_id
+ * will also be bound to a local RDMA device.
+ * Notes:
+ * Typically, this routine is called before calling rdma_listen to bind
+ * to a specific port number, but it may also be called on the active side
+ * of a connection before calling rdma_resolve_addr to bind to a specific
+ * address.
+ * See also:
+ * rdma_create_id, rdma_listen, rdma_resolve_addr, rdma_create_qp
+ */
+int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr);
+
+/**
+ * rdma_resolve_addr - Resolve destination and optional source addresses.
+ * @id: RDMA identifier.
+ * @src_addr: Source address information. This parameter may be NULL.
+ * @dst_addr: Destination address information.
+ * @timeout_ms: Time to wait for resolution to complete.
+ * Description:
+ * Resolve destination and optional source addresses from IP addresses
+ * to an RDMA address. 
If successful, the specified rdma_cm_id will + * be bound to a local device. + * Notes: + * This call is used to map a given destination IP address to a usable RDMA + * address. If a source address is given, the rdma_cm_id is bound to that + * address, the same as if rdma_bind_addr were called. If no source + * address is given, and the rdma_cm_id has not yet been bound to a device, + * then the rdma_cm_id will be bound to a source address based on the + * local routing tables. After this call, the rdma_cm_id will be bound to + * an RDMA device. This call is typically made from the active side of a + * connection before calling rdma_resolve_route and rdma_connect. + * See also: + * rdma_create_id, rdma_resolve_route, rdma_connect, rdma_create_qp, + * rdma_get_cm_event, rdma_bind_addr + */ +int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + struct sockaddr *dst_addr, int timeout_ms); + +/** + * rdma_resolve_route - Resolve the route information needed to establish a connection. + * @id: RDMA identifier. + * @timeout_ms: Time to wait for resolution to complete. + * Description: + * Resolves an RDMA route to the destination address in order to establish + * a connection. The destination address must have already been resolved + * by calling rdma_resolve_addr. + * Notes: + * This is called on the client side of a connection after calling + * rdma_resolve_addr, but before calling rdma_connect. + * See also: + * rdma_resolve_addr, rdma_connect, rdma_get_cm_event + */ +int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms); + +/** + * rdma_create_qp - Allocate a QP. + * @id: RDMA identifier. + * @pd: Optional protection domain for the QP. + * @qp_init_attr: initial QP attributes. + * Description: + * Allocate a QP associated with the specified rdma_cm_id and transition it + * for sending and receiving. + * Notes: + * The rdma_cm_id must be bound to a local RDMA device before calling this + * function, and the protection domain must be for that same device. + * QPs allocated to an rdma_cm_id are automatically transitioned by the + * librdmacm through their states. After being allocated, the QP will be + * ready to handle posting of receives. If the QP is unconnected, it will + * be ready to post sends. + * If pd is NULL, then the QP will be allocated using a default protection + * domain associated with the underlying RDMA device. + * See also: + * rdma_bind_addr, rdma_resolve_addr, rdma_destroy_qp, ibv_create_qp, + * ibv_modify_qp + */ +int rdma_create_qp(struct rdma_cm_id *id, struct ibv_pd *pd, + struct ibv_qp_init_attr *qp_init_attr); +int rdma_create_qp_ex(struct rdma_cm_id *id, + struct ibv_qp_init_attr_ex *qp_init_attr); + +/** + * rdma_destroy_qp - Deallocate a QP. + * @id: RDMA identifier. + * Description: + * Destroy a QP allocated on the rdma_cm_id. + * Notes: + * Users must destroy any QP associated with an rdma_cm_id before + * destroying the ID. + * See also: + * rdma_create_qp, rdma_destroy_id, ibv_destroy_qp + */ +void rdma_destroy_qp(struct rdma_cm_id *id); + +/** + * rdma_connect - Initiate an active connection request. + * @id: RDMA identifier. + * @conn_param: optional connection parameters. + * Description: + * For a connected rdma_cm_id, this call initiates a connection request + * to a remote destination. For an unconnected rdma_cm_id, it initiates + * a lookup of the remote QP providing the datagram service. 
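+ *
+ * For illustration, a typical active-side sequence is (error handling
+ * omitted; the 2000 ms timeouts are examples, not recommendations):
+ *
+ *   rdma_resolve_addr(id, NULL, dst_addr, 2000);
+ *     ... wait for RDMA_CM_EVENT_ADDR_RESOLVED ...
+ *   rdma_resolve_route(id, 2000);
+ *     ... wait for RDMA_CM_EVENT_ROUTE_RESOLVED ...
+ *   rdma_create_qp(id, NULL, &qp_init_attr);
+ *   rdma_connect(id, &conn_param);
+ *     ... wait for RDMA_CM_EVENT_ESTABLISHED ...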
+ * Notes:
+ * Users must have resolved a route to the destination address
+ * by having called rdma_resolve_route before calling this routine.
+ * A user may override the default connection parameters and exchange
+ * private data as part of the connection by using the conn_param parameter.
+ * See also:
+ * rdma_resolve_route, rdma_disconnect, rdma_listen, rdma_get_cm_event
+ */
+int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param);
+
+/**
+ * rdma_establish - Complete an active connection request.
+ * @id: RDMA identifier.
+ * Description:
+ * Acknowledge an incoming connection response event and complete the
+ * connection establishment.
+ * Notes:
+ * If a QP has not been created on the rdma_cm_id, this function should be
+ * called by the active side to complete the connection, after getting the
+ * connect response event. This will trigger a connection established event
+ * on the passive side.
+ * This function should not be used on an rdma_cm_id on which a QP has been
+ * created.
+ * See also:
+ * rdma_connect, rdma_disconnect, rdma_get_cm_event
+ */
+int rdma_establish(struct rdma_cm_id *id);
+
+/**
+ * rdma_listen - Listen for incoming connection requests.
+ * @id: RDMA identifier.
+ * @backlog: backlog of incoming connection requests.
+ * Description:
+ * Initiates a listen for incoming connection requests or datagram service
+ * lookup. The listen will be restricted to the locally bound source
+ * address.
+ * Notes:
+ * Users must have bound the rdma_cm_id to a local address by calling
+ * rdma_bind_addr before calling this routine. If the rdma_cm_id is
+ * bound to a specific IP address, the listen will be restricted to that
+ * address and the associated RDMA device. If the rdma_cm_id is bound
+ * to an RDMA port number only, the listen will occur across all RDMA
+ * devices.
+ * See also:
+ * rdma_bind_addr, rdma_connect, rdma_accept, rdma_reject, rdma_get_cm_event
+ */
+int rdma_listen(struct rdma_cm_id *id, int backlog);
+
+/**
+ * rdma_get_request - Retrieve the next pending connection request.
+ * @listen: Listening rdma_cm_id.
+ * @id: rdma_cm_id associated with the new connection.
+ * Description:
+ * Retrieves the next pending connection request from a listening
+ * rdma_cm_id that is operating synchronously (see rdma_create_ep).
+ * See also:
+ * rdma_create_ep, rdma_listen, rdma_accept
+ */
+int rdma_get_request(struct rdma_cm_id *listen, struct rdma_cm_id **id);
+
+/**
+ * rdma_accept - Called to accept a connection request.
+ * @id: Connection identifier associated with the request.
+ * @conn_param: Optional information needed to establish the connection.
+ * Description:
+ * Called from the listening side to accept a connection or datagram
+ * service lookup request.
+ * Notes:
+ * Unlike the socket accept routine, rdma_accept is not called on a
+ * listening rdma_cm_id. Instead, after calling rdma_listen, the user
+ * waits for a connection request event to occur. Connection request
+ * events give the user a newly created rdma_cm_id, similar to a new
+ * socket, but the rdma_cm_id is bound to a specific RDMA device.
+ * rdma_accept is called on the new rdma_cm_id.
+ * A user may override the default connection parameters and exchange
+ * private data as part of the connection by using the conn_param parameter.
+ * See also:
+ * rdma_listen, rdma_reject, rdma_get_cm_event
+ */
+int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param);
+
+/**
+ * rdma_reject - Called to reject a connection request.
+ * @id: Connection identifier associated with the request.
+ * @private_data: Optional private data to send with the reject message.
+ * @private_data_len: Size of the private_data to send, in bytes.
+ * Description:
+ * Called from the listening side to reject a connection or datagram
+ * service lookup request.
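+ *
+ * For example (sketch; the one-byte reason code is application-defined,
+ * not part of this API):
+ *
+ *   uint8_t reason = 42;
+ *   rdma_reject(id, &reason, sizeof(reason));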
+ * Notes:
+ * After receiving a connection request event, a user may call rdma_reject
+ * to reject the request. If the underlying RDMA transport supports
+ * private data in the reject message, the specified data will be passed to
+ * the remote side.
+ * See also:
+ * rdma_listen, rdma_accept, rdma_get_cm_event
+ */
+int rdma_reject(struct rdma_cm_id *id, const void *private_data,
+		uint8_t private_data_len);
+
+/**
+ * rdma_notify - Notifies the librdmacm of an asynchronous event.
+ * @id: RDMA identifier.
+ * @event: Asynchronous event.
+ * Description:
+ * Used to notify the librdmacm of asynchronous events that have occurred
+ * on a QP associated with the rdma_cm_id.
+ * Notes:
+ * Asynchronous events that occur on a QP are reported through the user's
+ * device event handler. This routine is used to notify the librdmacm of
+ * communication events. In most cases, use of this routine is not
+ * necessary; however, if connection establishment is done out of band
+ * (such as done through InfiniBand), it's possible to receive data on a
+ * QP that is not yet considered connected. This routine forces the
+ * connection into an established state in this case in order to handle
+ * the rare situation where the connection never forms on its own.
+ * Events that should be reported to the CM are: IB_EVENT_COMM_EST.
+ * See also:
+ * rdma_connect, rdma_accept, rdma_listen
+ */
+int rdma_notify(struct rdma_cm_id *id, enum ibv_event_type event);
+
+/**
+ * rdma_disconnect - This function disconnects a connection.
+ * @id: RDMA identifier.
+ * Description:
+ * Disconnects a connection and transitions any associated QP to the
+ * error state.
+ * See also:
+ * rdma_connect, rdma_listen, rdma_accept
+ */
+int rdma_disconnect(struct rdma_cm_id *id);
+
+/**
+ * rdma_join_multicast - Joins a multicast group.
+ * @id: Communication identifier associated with the request.
+ * @addr: Multicast address identifying the group to join.
+ * @context: User-defined context associated with the join request.
+ * Description:
+ * Joins a multicast group and attaches an associated QP to the group.
+ * Notes:
+ * Before joining a multicast group, the rdma_cm_id must be bound to
+ * an RDMA device by calling rdma_bind_addr or rdma_resolve_addr. Use of
+ * rdma_resolve_addr requires the local routing tables to resolve the
+ * multicast address to an RDMA device. The user must call
+ * rdma_leave_multicast to leave the multicast group and release any
+ * multicast resources. The context is returned to the user through
+ * the private_data field in the rdma_cm_event.
+ * See also:
+ * rdma_leave_multicast, rdma_bind_addr, rdma_resolve_addr, rdma_create_qp
+ */
+int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
+			void *context);
+
+/**
+ * rdma_leave_multicast - Leaves a multicast group.
+ * @id: Communication identifier associated with the request.
+ * @addr: Multicast address identifying the group to leave.
+ * Description:
+ * Leaves a multicast group and detaches an associated QP from the group.
+ * Notes:
+ * Calling this function before a group has been fully joined results in
+ * canceling the join operation. Users should be aware that messages
+ * received from the multicast group may still be queued for
+ * completion processing immediately after leaving a multicast group.
+ * Destroying an rdma_cm_id will automatically leave all multicast groups.
+ * See also:
+ * rdma_join_multicast, rdma_destroy_qp
+ */
+int rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr);
+
+/**
+ * rdma_join_multicast_ex - Joins a multicast group with options.
+ * @id: Communication identifier associated with the request.
+ * @mc_join_attr: Extended struct containing multicast join parameters.
+ * @context: User-defined context associated with the join request.
+ * Description:
+ * Joins a multicast group with options. Currently supports multicast
+ * join flags: the QP will be attached based on the given join flag,
+ * and the join message will be sent according to that flag.
+ * Notes:
+ * Before joining a multicast group, the rdma_cm_id must be bound to
+ * an RDMA device by calling rdma_bind_addr or rdma_resolve_addr. Use of
+ * rdma_resolve_addr requires the local routing tables to resolve the
+ * multicast address to an RDMA device. The user must call
+ * rdma_leave_multicast to leave the multicast group and release any
+ * multicast resources. The context is returned to the user through
+ * the private_data field in the rdma_cm_event.
+ * See also:
+ * rdma_leave_multicast, rdma_bind_addr, rdma_resolve_addr, rdma_create_qp
+ */
+int rdma_join_multicast_ex(struct rdma_cm_id *id,
+			   struct rdma_cm_join_mc_attr_ex *mc_join_attr,
+			   void *context);
+
+/**
+ * rdma_get_cm_event - Retrieves the next pending communication event.
+ * @channel: Event channel to check for events.
+ * @event: Allocated information about the next communication event.
+ * Description:
+ * Retrieves a communication event. If no events are pending, by default,
+ * the call will block until an event is received.
+ * Notes:
+ * The default synchronous behavior of this routine can be changed by
+ * modifying the file descriptor associated with the given channel. All
+ * events that are reported must be acknowledged by calling rdma_ack_cm_event.
+ * Destruction of an rdma_cm_id will block until related events have been
+ * acknowledged.
+ * See also:
+ * rdma_ack_cm_event, rdma_create_event_channel, rdma_event_str
+ */
+int rdma_get_cm_event(struct rdma_event_channel *channel,
+		      struct rdma_cm_event **event);
+
+/**
+ * rdma_ack_cm_event - Free a communication event.
+ * @event: Event to be released.
+ * Description:
+ * All events which are allocated by rdma_get_cm_event must be released;
+ * there should be a one-to-one correspondence between successful gets
+ * and acks.
+ * See also:
+ * rdma_get_cm_event, rdma_destroy_id
+ */
+int rdma_ack_cm_event(struct rdma_cm_event *event);
+
+__be16 rdma_get_src_port(struct rdma_cm_id *id);
+__be16 rdma_get_dst_port(struct rdma_cm_id *id);
+
+static inline struct sockaddr *rdma_get_local_addr(struct rdma_cm_id *id)
+{
+	return &id->route.addr.src_addr;
+}
+
+static inline struct sockaddr *rdma_get_peer_addr(struct rdma_cm_id *id)
+{
+	return &id->route.addr.dst_addr;
+}
+
+/**
+ * rdma_get_devices - Get list of RDMA devices currently available.
+ * @num_devices: If non-NULL, set to the number of devices returned.
+ * Description:
+ * Return a NULL-terminated array of opened RDMA devices. Callers can use
+ * this routine to allocate resources on specific RDMA devices that will be
+ * shared across multiple rdma_cm_id's.
+ * Notes:
+ * The returned array must be released by calling rdma_free_devices. Devices
+ * remain opened while the librdmacm is loaded.
+ * See also:
+ * rdma_free_devices
+ */
+struct ibv_context **rdma_get_devices(int *num_devices);
+
+/**
+ * rdma_free_devices - Frees the list of devices returned by rdma_get_devices.
+ * @list: List of devices returned from rdma_get_devices. + * Description: + * Frees the device array returned by rdma_get_devices. + * See also: + * rdma_get_devices + */ +void rdma_free_devices(struct ibv_context **list); + +/** + * rdma_event_str - Returns a string representation of an rdma cm event. + * @event: Asynchronous event. + * Description: + * Returns a string representation of an asynchronous event. + * See also: + * rdma_get_cm_event + */ +const char *rdma_event_str(enum rdma_cm_event_type event); + +/* Option levels */ +enum { + RDMA_OPTION_ID = 0, + RDMA_OPTION_IB = 1 +}; + +/* Option details */ +enum { + RDMA_OPTION_ID_TOS = 0, /* uint8_t: RFC 2474 */ + RDMA_OPTION_ID_REUSEADDR = 1, /* int: ~SO_REUSEADDR */ + RDMA_OPTION_ID_AFONLY = 2, /* int: ~IPV6_V6ONLY */ + RDMA_OPTION_ID_ACK_TIMEOUT = 3 /* uint8_t */ +}; + +enum { + RDMA_OPTION_IB_PATH = 1 /* struct ibv_path_data[] */ +}; + +/** + * rdma_set_option - Set options for an rdma_cm_id. + * @id: Communication identifier to set option for. + * @level: Protocol level of the option to set. + * @optname: Name of the option to set. + * @optval: Reference to the option data. + * @optlen: The size of the %optval buffer. + */ +int rdma_set_option(struct rdma_cm_id *id, int level, int optname, + void *optval, size_t optlen); + +/** + * rdma_migrate_id - Move an rdma_cm_id to a new event channel. + * @id: Communication identifier to migrate. + * @channel: New event channel for rdma_cm_id events. + */ +int rdma_migrate_id(struct rdma_cm_id *id, struct rdma_event_channel *channel); + +/** + * rdma_getaddrinfo - RDMA address and route resolution service. + */ +int rdma_getaddrinfo(const char *node, const char *service, + const struct rdma_addrinfo *hints, + struct rdma_addrinfo **res); + +void rdma_freeaddrinfo(struct rdma_addrinfo *res); + +/** + * rdma_init_qp_attr - Returns QP attributes. + * @id: Communication identifier. + * @qp_attr: A reference to a QP attributes struct containing + * response information. + * @qp_attr_mask: A reference to a QP attributes mask containing + * response information. + */ +int rdma_init_qp_attr(struct rdma_cm_id *id, struct ibv_qp_attr *qp_attr, + int *qp_attr_mask); +#ifdef __cplusplus +} +#endif + +#endif /* RDMA_CMA_H */ diff --git a/librdmacm/rdma_cma_abi.h b/librdmacm/rdma_cma_abi.h new file mode 100644 index 0000000..ab4adb0 --- /dev/null +++ b/librdmacm/rdma_cma_abi.h @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2005-2011 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RDMA_CMA_ABI_H +#define RDMA_CMA_ABI_H + +#include <rdma/ib_user_sa.h> +#include <rdma/ib_user_verbs.h> +#include <infiniband/sa.h> +#include <netinet/in.h> + +/* + * This file must be kept in sync with the kernel's version of rdma_user_cm.h + */ + +#define RDMA_USER_CM_MIN_ABI_VERSION 3 +#define RDMA_USER_CM_MAX_ABI_VERSION 4 + +#define RDMA_MAX_PRIVATE_DATA 256 + +enum { + UCMA_CMD_CREATE_ID, + UCMA_CMD_DESTROY_ID, + UCMA_CMD_BIND_IP, + UCMA_CMD_RESOLVE_IP, + UCMA_CMD_RESOLVE_ROUTE, + UCMA_CMD_QUERY_ROUTE, + UCMA_CMD_CONNECT, + UCMA_CMD_LISTEN, + UCMA_CMD_ACCEPT, + UCMA_CMD_REJECT, + UCMA_CMD_DISCONNECT, + UCMA_CMD_INIT_QP_ATTR, + UCMA_CMD_GET_EVENT, + UCMA_CMD_GET_OPTION, + UCMA_CMD_SET_OPTION, + UCMA_CMD_NOTIFY, + UCMA_CMD_JOIN_IP_MCAST, + UCMA_CMD_LEAVE_MCAST, + UCMA_CMD_MIGRATE_ID, + UCMA_CMD_QUERY, + UCMA_CMD_BIND, + UCMA_CMD_RESOLVE_ADDR, + UCMA_CMD_JOIN_MCAST +}; + +struct ucma_abi_cmd_hdr { + __u32 cmd; + __u16 in; + __u16 out; +}; + +struct ucma_abi_create_id { + __u32 cmd; + __u16 in; + __u16 out; + __u64 uid; + __u64 response; + __u16 ps; + __u8 qp_type; + __u8 reserved[5]; +}; + +struct ucma_abi_create_id_resp { + __u32 id; +}; + +struct ucma_abi_destroy_id { + __u32 cmd; + __u16 in; + __u16 out; + __u64 response; + __u32 id; + __u32 reserved; +}; + +struct ucma_abi_destroy_id_resp { + __u32 events_reported; +}; + +struct ucma_abi_bind_ip { + __u32 cmd; + __u16 in; + __u16 out; + __u64 response; + struct sockaddr_in6 addr; + __u32 id; +}; + +struct ucma_abi_bind { + __u32 cmd; + __u16 in; + __u16 out; + __u32 id; + __u16 addr_size; + __u16 reserved; + struct sockaddr_storage addr; +}; + +struct ucma_abi_resolve_ip { + __u32 cmd; + __u16 in; + __u16 out; + struct sockaddr_in6 src_addr; + struct sockaddr_in6 dst_addr; + __u32 id; + __u32 timeout_ms; +}; + +struct ucma_abi_resolve_addr { + __u32 cmd; + __u16 in; + __u16 out; + __u32 id; + __u32 timeout_ms; + __u16 src_size; + __u16 dst_size; + __u32 reserved; + struct sockaddr_storage src_addr; + struct sockaddr_storage dst_addr; +}; + +struct ucma_abi_resolve_route { + __u32 cmd; + __u16 in; + __u16 out; + __u32 id; + __u32 timeout_ms; +}; + +enum { + UCMA_QUERY_ADDR, + UCMA_QUERY_PATH, + UCMA_QUERY_GID +}; + +struct ucma_abi_query { + __u32 cmd; + __u16 in; + __u16 out; + __u64 response; + __u32 id; + __u32 option; +}; + +struct ucma_abi_query_route_resp { + __be64 node_guid; + struct ib_user_path_rec ib_route[2]; + struct sockaddr_in6 src_addr; + struct sockaddr_in6 dst_addr; + __u32 num_paths; + __u8 port_num; + __u8 reserved[3]; +}; + +struct ucma_abi_query_addr_resp { + __be64 node_guid; + __u8 port_num; + __u8 reserved; + __be16 pkey; + __u16 src_size; + __u16 dst_size; + struct sockaddr_storage src_addr; + struct sockaddr_storage dst_addr; +}; + +struct ucma_abi_query_path_resp { + __u32 num_paths; + __u32 reserved; + struct ibv_path_data path_data[0]; +}; + +struct ucma_abi_conn_param { + __u32 qp_num; + __u32 reserved; + __u8 private_data[RDMA_MAX_PRIVATE_DATA]; + __u8 private_data_len; + __u8 srq; + __u8 responder_resources; + __u8 initiator_depth; + __u8 flow_control; + __u8 retry_count; + __u8 rnr_retry_count; + __u8 valid; +}; + +struct ucma_abi_ud_param { + __u32 qp_num; + __u32 qkey; + struct ib_uverbs_ah_attr ah_attr; + __u8 
private_data[RDMA_MAX_PRIVATE_DATA]; + __u8 private_data_len; + __u8 reserved[7]; + __u8 reserved2[4]; /* Round to 8-byte boundary to support 32/64 */ +}; + +struct ucma_abi_connect { + __u32 cmd; + __u16 in; + __u16 out; + struct ucma_abi_conn_param conn_param; + __u32 id; + __u32 reserved; +}; + +struct ucma_abi_listen { + __u32 cmd; + __u16 in; + __u16 out; + __u32 id; + __u32 backlog; +}; + +struct ucma_abi_accept { + __u32 cmd; + __u16 in; + __u16 out; + __u64 uid; + struct ucma_abi_conn_param conn_param; + __u32 id; + __u32 reserved; +}; + +struct ucma_abi_reject { + __u32 cmd; + __u16 in; + __u16 out; + __u32 id; + __u8 private_data_len; + __u8 reserved[3]; + __u8 private_data[RDMA_MAX_PRIVATE_DATA]; +}; + +struct ucma_abi_disconnect { + __u32 cmd; + __u16 in; + __u16 out; + __u32 id; +}; + +struct ucma_abi_init_qp_attr { + __u32 cmd; + __u16 in; + __u16 out; + __u64 response; + __u32 id; + __u32 qp_state; +}; + +struct ucma_abi_notify { + __u32 cmd; + __u16 in; + __u16 out; + __u32 id; + __u32 event; +}; + +struct ucma_abi_join_ip_mcast { + __u32 cmd; + __u16 in; + __u16 out; + __u64 response; /* ucma_abi_create_id_resp */ + __u64 uid; + struct sockaddr_in6 addr; + __u32 id; +}; + +struct ucma_abi_join_mcast { + __u32 cmd; + __u16 in; + __u16 out; + __u64 response; /* rdma_ucma_create_id_resp */ + __u64 uid; + __u32 id; + __u16 addr_size; + __u16 join_flags; + struct sockaddr_storage addr; +}; + +struct ucma_abi_get_event { + __u32 cmd; + __u16 in; + __u16 out; + __u64 response; +}; + +struct ucma_abi_event_resp { + __u64 uid; + __u32 id; + __u32 event; + __u32 status; + union { + struct ucma_abi_conn_param conn; + struct ucma_abi_ud_param ud; + } param; +}; + +struct ucma_abi_set_option { + __u32 cmd; + __u16 in; + __u16 out; + __u64 optval; + __u32 id; + __u32 level; + __u32 optname; + __u32 optlen; +}; + +struct ucma_abi_migrate_id { + __u32 cmd; + __u16 in; + __u16 out; + __u64 response; + __u32 id; + __u32 fd; +}; + +struct ucma_abi_migrate_resp { + __u32 events_reported; +}; + +#endif /* RDMA_CMA_ABI_H */ diff --git a/librdmacm/rdma_verbs.h b/librdmacm/rdma_verbs.h new file mode 100644 index 0000000..10049c3 --- /dev/null +++ b/librdmacm/rdma_verbs.h @@ -0,0 +1,317 @@ +/* + * Copyright (c) 2010-2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(RDMA_VERBS_H) +#define RDMA_VERBS_H + +#include <assert.h> +#include <infiniband/verbs.h> +#include <rdma/rdma_cma.h> +#include <errno.h> + +#ifdef __cplusplus +extern "C" { +#endif + +static inline int rdma_seterrno(int ret) +{ + if (ret) { + errno = ret; + ret = -1; + } + return ret; +} + +/* + * Shared receive queues. + */ +int rdma_create_srq(struct rdma_cm_id *id, struct ibv_pd *pd, + struct ibv_srq_init_attr *attr); +int rdma_create_srq_ex(struct rdma_cm_id *id, struct ibv_srq_init_attr_ex *attr); + +void rdma_destroy_srq(struct rdma_cm_id *id); + + +/* + * Memory registration helpers. + */ +static inline struct ibv_mr * +rdma_reg_msgs(struct rdma_cm_id *id, void *addr, size_t length) +{ + return ibv_reg_mr(id->pd, addr, length, IBV_ACCESS_LOCAL_WRITE); +} + +static inline struct ibv_mr * +rdma_reg_read(struct rdma_cm_id *id, void *addr, size_t length) +{ + return ibv_reg_mr(id->pd, addr, length, IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_READ); +} + +static inline struct ibv_mr * +rdma_reg_write(struct rdma_cm_id *id, void *addr, size_t length) +{ + return ibv_reg_mr(id->pd, addr, length, IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_WRITE); +} + +static inline int +rdma_dereg_mr(struct ibv_mr *mr) +{ + return rdma_seterrno(ibv_dereg_mr(mr)); +} + + +/* + * Vectored send, receive, and RDMA operations. + * Support multiple scatter-gather entries. + */ +static inline int +rdma_post_recvv(struct rdma_cm_id *id, void *context, struct ibv_sge *sgl, + int nsge) +{ + struct ibv_recv_wr wr, *bad; + + wr.wr_id = (uintptr_t) context; + wr.next = NULL; + wr.sg_list = sgl; + wr.num_sge = nsge; + + if (id->srq) + return rdma_seterrno(ibv_post_srq_recv(id->srq, &wr, &bad)); + else + return rdma_seterrno(ibv_post_recv(id->qp, &wr, &bad)); +} + +static inline int +rdma_post_sendv(struct rdma_cm_id *id, void *context, struct ibv_sge *sgl, + int nsge, int flags) +{ + struct ibv_send_wr wr, *bad; + + wr.wr_id = (uintptr_t) context; + wr.next = NULL; + wr.sg_list = sgl; + wr.num_sge = nsge; + wr.opcode = IBV_WR_SEND; + wr.send_flags = flags; + + return rdma_seterrno(ibv_post_send(id->qp, &wr, &bad)); +} + +static inline int +rdma_post_readv(struct rdma_cm_id *id, void *context, struct ibv_sge *sgl, + int nsge, int flags, uint64_t remote_addr, uint32_t rkey) +{ + struct ibv_send_wr wr, *bad; + + wr.wr_id = (uintptr_t) context; + wr.next = NULL; + wr.sg_list = sgl; + wr.num_sge = nsge; + wr.opcode = IBV_WR_RDMA_READ; + wr.send_flags = flags; + wr.wr.rdma.remote_addr = remote_addr; + wr.wr.rdma.rkey = rkey; + + return rdma_seterrno(ibv_post_send(id->qp, &wr, &bad)); +} + +static inline int +rdma_post_writev(struct rdma_cm_id *id, void *context, struct ibv_sge *sgl, + int nsge, int flags, uint64_t remote_addr, uint32_t rkey) +{ + struct ibv_send_wr wr, *bad; + + wr.wr_id = (uintptr_t) context; + wr.next = NULL; + wr.sg_list = sgl; + wr.num_sge = nsge; + wr.opcode = IBV_WR_RDMA_WRITE; + wr.send_flags = flags; + wr.wr.rdma.remote_addr = remote_addr; + wr.wr.rdma.rkey = rkey; + + return rdma_seterrno(ibv_post_send(id->qp, &wr, &bad)); +} + +/* + * Simple send, receive, and RDMA calls. 
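+ *
+ * A minimal illustrative pairing (sketch: buf and wc are caller-provided,
+ * the MR comes from rdma_reg_msgs() above, and error handling is omitted):
+ *
+ *   mr = rdma_reg_msgs(id, buf, sizeof(buf));
+ *   rdma_post_recv(id, NULL, buf, sizeof(buf), mr);
+ *   ... connection established, peer sends ...
+ *   rdma_get_recv_comp(id, &wc);
+ *   rdma_post_send(id, NULL, buf, sizeof(buf), mr, IBV_SEND_SIGNALED);
+ *   rdma_get_send_comp(id, &wc);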
+ */ +static inline int +rdma_post_recv(struct rdma_cm_id *id, void *context, void *addr, + size_t length, struct ibv_mr *mr) +{ + struct ibv_sge sge; + + assert((addr >= mr->addr) && + (((uint8_t *) addr + length) <= ((uint8_t *) mr->addr + mr->length))); + sge.addr = (uint64_t) (uintptr_t) addr; + sge.length = (uint32_t) length; + sge.lkey = mr->lkey; + + return rdma_post_recvv(id, context, &sge, 1); +} + +static inline int +rdma_post_send(struct rdma_cm_id *id, void *context, void *addr, + size_t length, struct ibv_mr *mr, int flags) +{ + struct ibv_sge sge; + + sge.addr = (uint64_t) (uintptr_t) addr; + sge.length = (uint32_t) length; + sge.lkey = mr ? mr->lkey : 0; + + return rdma_post_sendv(id, context, &sge, 1, flags); +} + +static inline int +rdma_post_read(struct rdma_cm_id *id, void *context, void *addr, + size_t length, struct ibv_mr *mr, int flags, + uint64_t remote_addr, uint32_t rkey) +{ + struct ibv_sge sge; + + sge.addr = (uint64_t) (uintptr_t) addr; + sge.length = (uint32_t) length; + sge.lkey = mr->lkey; + + return rdma_post_readv(id, context, &sge, 1, flags, remote_addr, rkey); +} + +static inline int +rdma_post_write(struct rdma_cm_id *id, void *context, void *addr, + size_t length, struct ibv_mr *mr, int flags, + uint64_t remote_addr, uint32_t rkey) +{ + struct ibv_sge sge; + + sge.addr = (uint64_t) (uintptr_t) addr; + sge.length = (uint32_t) length; + sge.lkey = mr ? mr->lkey : 0; + + return rdma_post_writev(id, context, &sge, 1, flags, remote_addr, rkey); +} + +static inline int +rdma_post_ud_send(struct rdma_cm_id *id, void *context, void *addr, + size_t length, struct ibv_mr *mr, int flags, + struct ibv_ah *ah, uint32_t remote_qpn) +{ + struct ibv_send_wr wr, *bad; + struct ibv_sge sge; + + sge.addr = (uint64_t) (uintptr_t) addr; + sge.length = (uint32_t) length; + sge.lkey = mr ? mr->lkey : 0; + + wr.wr_id = (uintptr_t) context; + wr.next = NULL; + wr.sg_list = &sge; + wr.num_sge = 1; + wr.opcode = IBV_WR_SEND; + wr.send_flags = flags; + wr.wr.ud.ah = ah; + wr.wr.ud.remote_qpn = remote_qpn; + wr.wr.ud.remote_qkey = RDMA_UDP_QKEY; + + return rdma_seterrno(ibv_post_send(id->qp, &wr, &bad)); +} + +static inline int +rdma_get_send_comp(struct rdma_cm_id *id, struct ibv_wc *wc) +{ + struct ibv_cq *cq; + void *context; + int ret; + + do { + ret = ibv_poll_cq(id->send_cq, 1, wc); + if (ret) + break; + + ret = ibv_req_notify_cq(id->send_cq, 0); + if (ret) + return rdma_seterrno(ret); + + ret = ibv_poll_cq(id->send_cq, 1, wc); + if (ret) + break; + + ret = ibv_get_cq_event(id->send_cq_channel, &cq, &context); + if (ret) + return ret; + + assert(cq == id->send_cq && context == id); + ibv_ack_cq_events(id->send_cq, 1); + } while (1); + + return (ret < 0) ? rdma_seterrno(ret) : ret; +} + +static inline int +rdma_get_recv_comp(struct rdma_cm_id *id, struct ibv_wc *wc) +{ + struct ibv_cq *cq; + void *context; + int ret; + + do { + ret = ibv_poll_cq(id->recv_cq, 1, wc); + if (ret) + break; + + ret = ibv_req_notify_cq(id->recv_cq, 0); + if (ret) + return rdma_seterrno(ret); + + ret = ibv_poll_cq(id->recv_cq, 1, wc); + if (ret) + break; + + ret = ibv_get_cq_event(id->recv_cq_channel, &cq, &context); + if (ret) + return ret; + + assert(cq == id->recv_cq && context == id); + ibv_ack_cq_events(id->recv_cq, 1); + } while (1); + + return (ret < 0) ? 
rdma_seterrno(ret) : ret; +} + +#ifdef __cplusplus +} +#endif + +#endif /* RDMA_CMA_H */ diff --git a/librdmacm/rsocket.c b/librdmacm/rsocket.c new file mode 100644 index 0000000..ae26efd --- /dev/null +++ b/librdmacm/rsocket.c @@ -0,0 +1,4670 @@ +/* + * Copyright (c) 2008-2019 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#define _GNU_SOURCE +#include <config.h> + +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <endian.h> +#include <stdarg.h> +#include <netdb.h> +#include <unistd.h> +#include <fcntl.h> +#include <stdio.h> +#include <stddef.h> +#include <string.h> +#include <netinet/tcp.h> +#include <sys/epoll.h> +#include <sys/eventfd.h> +#include <search.h> +#include <time.h> +#include <byteswap.h> +#include <util/compiler.h> +#include <util/util.h> +#include <ccan/container_of.h> + +#include <rdma/rdma_cma.h> +#include <rdma/rdma_verbs.h> +#include <rdma/rsocket.h> +#include "cma.h" +#include "indexer.h" + +#define RS_OLAP_START_SIZE 2048 +#define RS_MAX_TRANSFER 65536 +#define RS_SNDLOWAT 2048 +#define RS_QP_MIN_SIZE 16 +#define RS_QP_MAX_SIZE 0xFFFE +#define RS_QP_CTRL_SIZE 4 /* must be power of 2 */ +#define RS_CONN_RETRIES 6 +#define RS_SGL_SIZE 2 +static struct index_map idm; +static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER; +static pthread_mutex_t svc_mut = PTHREAD_MUTEX_INITIALIZER; + +struct rsocket; + +enum { + RS_SVC_NOOP, + RS_SVC_ADD_DGRAM, + RS_SVC_REM_DGRAM, + RS_SVC_ADD_KEEPALIVE, + RS_SVC_REM_KEEPALIVE, + RS_SVC_MOD_KEEPALIVE, + RS_SVC_ADD_CM, + RS_SVC_REM_CM, +}; + +struct rs_svc_msg { + uint32_t cmd; + uint32_t status; + struct rsocket *rs; +}; + +struct rs_svc { + pthread_t id; + int sock[2]; + int cnt; + int size; + int context_size; + void *(*run)(void *svc); + struct rsocket **rss; + void *contexts; +}; + +static struct pollfd *udp_svc_fds; +static void *udp_svc_run(void *arg); +static struct rs_svc udp_svc = { + .context_size = sizeof(*udp_svc_fds), + .run = udp_svc_run +}; +static uint64_t *tcp_svc_timeouts; +static void *tcp_svc_run(void *arg); +static struct rs_svc tcp_svc = { + .context_size = sizeof(*tcp_svc_timeouts), + .run = tcp_svc_run +}; +static void *cm_svc_run(void *arg); +static struct rs_svc 
listen_svc = { + .context_size = sizeof(struct pollfd), + .run = cm_svc_run +}; +static struct rs_svc connect_svc = { + .context_size = sizeof(struct pollfd), + .run = cm_svc_run +}; + +static uint32_t pollcnt; +static bool suspendpoll; +static int pollsignal = -1; + +static uint16_t def_iomap_size = 0; +static uint16_t def_inline = 64; +static uint16_t def_sqsize = 384; +static uint16_t def_rqsize = 384; +static uint32_t def_mem = (1 << 17); +static uint32_t def_wmem = (1 << 17); +static uint32_t polling_time = 10; +static int wake_up_interval = 5000; + +/* + * Immediate data format is determined by the upper bits + * bit 31: message type, 0 - data, 1 - control + * bit 30: buffers updated, 0 - target, 1 - direct-receive + * bit 29: more data, 0 - end of transfer, 1 - more data available + * + * for data transfers: + * bits [28:0]: bytes transferred + * for control messages: + * SGL, CTRL + * bits [28-0]: receive credits granted + * IOMAP_SGL + * bits [28-16]: reserved, bits [15-0]: index + */ + +enum { + RS_OP_DATA, + RS_OP_RSVD_DATA_MORE, + RS_OP_WRITE, /* opcode is not transmitted over the network */ + RS_OP_RSVD_DRA_MORE, + RS_OP_SGL, + RS_OP_RSVD, + RS_OP_IOMAP_SGL, + RS_OP_CTRL +}; +#define rs_msg_set(op, data) ((op << 29) | (uint32_t) (data)) +#define rs_msg_op(imm_data) (imm_data >> 29) +#define rs_msg_data(imm_data) (imm_data & 0x1FFFFFFF) +#define RS_MSG_SIZE sizeof(uint32_t) + +#define RS_WR_ID_FLAG_RECV (((uint64_t) 1) << 63) +#define RS_WR_ID_FLAG_MSG_SEND (((uint64_t) 1) << 62) /* See RS_OPT_MSG_SEND */ +#define rs_send_wr_id(data) ((uint64_t) data) +#define rs_recv_wr_id(data) (RS_WR_ID_FLAG_RECV | (uint64_t) data) +#define rs_wr_is_recv(wr_id) (wr_id & RS_WR_ID_FLAG_RECV) +#define rs_wr_is_msg_send(wr_id) (wr_id & RS_WR_ID_FLAG_MSG_SEND) +#define rs_wr_data(wr_id) ((uint32_t) wr_id) + +enum { + RS_CTRL_DISCONNECT, + RS_CTRL_KEEPALIVE, + RS_CTRL_SHUTDOWN +}; + +struct rs_msg { + uint32_t op; + uint32_t data; +}; + +struct ds_qp; + +struct ds_rmsg { + struct ds_qp *qp; + uint32_t offset; + uint32_t length; +}; + +struct ds_smsg { + struct ds_smsg *next; +}; + +struct rs_sge { + uint64_t addr; + uint32_t key; + uint32_t length; +}; + +struct rs_iomap { + uint64_t offset; + struct rs_sge sge; +}; + +struct rs_iomap_mr { + uint64_t offset; + struct ibv_mr *mr; + dlist_entry entry; + _Atomic(int) refcnt; + int index; /* -1 if mapping is local and not in iomap_list */ +}; + +#define RS_MAX_CTRL_MSG (sizeof(struct rs_sge)) +#define rs_host_is_net() (__BYTE_ORDER == __BIG_ENDIAN) +#define RS_CONN_FLAG_NET (1 << 0) +#define RS_CONN_FLAG_IOMAP (1 << 1) + +struct rs_conn_data { + uint8_t version; + uint8_t flags; + __be16 credits; + uint8_t reserved[3]; + uint8_t target_iomap_size; + struct rs_sge target_sgl; + struct rs_sge data_buf; +}; + +struct rs_conn_private_data { + union { + struct rs_conn_data conn_data; + struct { + struct ib_connect_hdr ib_hdr; + struct rs_conn_data conn_data; + } af_ib; + }; +}; + +/* + * rsocket states are ordered as passive, connecting, connected, disconnected. 
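+ * The values below preserve that ordering, so code can test an entire
+ * class of states at once: (rs->state & rs_opening) covers every
+ * resolving/connecting stage, and rs->state < rs_connected means the
+ * connection is not yet established.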
+ */ +enum rs_state { + rs_init, + rs_bound = 0x0001, + rs_listening = 0x0002, + rs_opening = 0x0004, + rs_resolving_addr = rs_opening | 0x0010, + rs_resolving_route = rs_opening | 0x0020, + rs_connecting = rs_opening | 0x0040, + rs_accepting = rs_opening | 0x0080, + rs_connected = 0x0100, + rs_writable = 0x0200, + rs_readable = 0x0400, + rs_connect_rdwr = rs_connected | rs_readable | rs_writable, + rs_connect_error = 0x0800, + rs_disconnected = 0x1000, + rs_error = 0x2000, +}; + +#define RS_OPT_SWAP_SGL (1 << 0) +/* + * iWarp does not support RDMA write with immediate data. For iWarp, we + * transfer rsocket messages as inline sends. + */ +#define RS_OPT_MSG_SEND (1 << 1) +#define RS_OPT_UDP_SVC (1 << 2) +#define RS_OPT_KEEPALIVE (1 << 3) +#define RS_OPT_CM_SVC (1 << 4) + +union socket_addr { + struct sockaddr sa; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; +}; + +struct ds_header { + uint8_t version; + uint8_t length; + __be16 port; + union { + __be32 ipv4; + struct { + __be32 flowinfo; + uint8_t addr[16]; + } ipv6; + } addr; +}; + +#define DS_IPV4_HDR_LEN 8 +#define DS_IPV6_HDR_LEN 24 + +struct ds_dest { + union socket_addr addr; /* must be first */ + struct ds_qp *qp; + struct ibv_ah *ah; + uint32_t qpn; +}; + +struct ds_qp { + dlist_entry list; + struct rsocket *rs; + struct rdma_cm_id *cm_id; + struct ds_header hdr; + struct ds_dest dest; + + struct ibv_mr *smr; + struct ibv_mr *rmr; + uint8_t *rbuf; + + int cq_armed; +}; + +struct rsocket { + int type; + int index; + fastlock_t slock; + fastlock_t rlock; + fastlock_t cq_lock; + fastlock_t cq_wait_lock; + fastlock_t map_lock; /* acquire slock first if needed */ + + union { + /* data stream */ + struct { + struct rdma_cm_id *cm_id; + uint64_t tcp_opts; + unsigned int keepalive_time; + int accept_queue[2]; + + unsigned int ctrl_seqno; + unsigned int ctrl_max_seqno; + uint16_t sseq_no; + uint16_t sseq_comp; + uint16_t rseq_no; + uint16_t rseq_comp; + + int remote_sge; + struct rs_sge remote_sgl; + struct rs_sge remote_iomap; + + struct ibv_mr *target_mr; + int target_sge; + int target_iomap_size; + void *target_buffer_list; + volatile struct rs_sge *target_sgl; + struct rs_iomap *target_iomap; + + int rbuf_msg_index; + int rbuf_bytes_avail; + int rbuf_free_offset; + int rbuf_offset; + struct ibv_mr *rmr; + uint8_t *rbuf; + + int sbuf_bytes_avail; + struct ibv_mr *smr; + struct ibv_sge ssgl[2]; + }; + /* datagram */ + struct { + struct ds_qp *qp_list; + void *dest_map; + struct ds_dest *conn_dest; + + int udp_sock; + int epfd; + int rqe_avail; + struct ds_smsg *smsg_free; + }; + }; + + int opts; + int fd_flags; + uint64_t so_opts; + uint64_t ipv6_opts; + void *optval; + size_t optlen; + int state; + int cq_armed; + int retries; + int err; + + int sqe_avail; + uint32_t sbuf_size; + uint16_t sq_size; + uint16_t sq_inline; + + uint32_t rbuf_size; + uint16_t rq_size; + int rmsg_head; + int rmsg_tail; + union { + struct rs_msg *rmsg; + struct ds_rmsg *dmsg; + }; + + uint8_t *sbuf; + struct rs_iomap_mr *remote_iomappings; + dlist_entry iomap_list; + dlist_entry iomap_queue; + int iomap_pending; + int unack_cqe; +}; + +#define DS_UDP_TAG 0x55555555 + +struct ds_udp_header { + __be32 tag; + uint8_t version; + uint8_t op; + uint8_t length; + uint8_t reserved; + __be32 qpn; /* lower 8-bits reserved */ + union { + __be32 ipv4; + uint8_t ipv6[16]; + } addr; +}; + +#define DS_UDP_IPV4_HDR_LEN 16 +#define DS_UDP_IPV6_HDR_LEN 28 + +#define ds_next_qp(qp) container_of((qp)->list.next, struct ds_qp, list) + +static void write_all(int fd, const 
void *msg, size_t len) +{ + // FIXME: if fd is a socket this really needs to handle EINTR and other conditions. + ssize_t __attribute__((unused)) rc = write(fd, msg, len); + assert(rc == len); +} + +static void read_all(int fd, void *msg, size_t len) +{ + // FIXME: if fd is a socket this really needs to handle EINTR and other conditions. + ssize_t __attribute__((unused)) rc = read(fd, msg, len); + assert(rc == len); +} + +static uint64_t rs_time_us(void) +{ + struct timespec now; + + clock_gettime(CLOCK_MONOTONIC, &now); + return now.tv_sec * 1000000 + now.tv_nsec / 1000; +} + +static void ds_insert_qp(struct rsocket *rs, struct ds_qp *qp) +{ + if (!rs->qp_list) + dlist_init(&qp->list); + else + dlist_insert_head(&qp->list, &rs->qp_list->list); + rs->qp_list = qp; +} + +static void ds_remove_qp(struct rsocket *rs, struct ds_qp *qp) +{ + if (qp->list.next != &qp->list) { + rs->qp_list = ds_next_qp(qp); + dlist_remove(&qp->list); + } else { + rs->qp_list = NULL; + } +} + +static int rs_notify_svc(struct rs_svc *svc, struct rsocket *rs, int cmd) +{ + struct rs_svc_msg msg; + int ret; + + pthread_mutex_lock(&svc_mut); + if (!svc->cnt) { + ret = socketpair(AF_UNIX, SOCK_STREAM, 0, svc->sock); + if (ret) + goto unlock; + + ret = pthread_create(&svc->id, NULL, svc->run, svc); + if (ret) { + ret = ERR(ret); + goto closepair; + } + } + + msg.cmd = cmd; + msg.status = EINVAL; + msg.rs = rs; + write_all(svc->sock[0], &msg, sizeof(msg)); + read_all(svc->sock[0], &msg, sizeof(msg)); + ret = rdma_seterrno(msg.status); + if (svc->cnt) + goto unlock; + + pthread_join(svc->id, NULL); +closepair: + close(svc->sock[0]); + close(svc->sock[1]); +unlock: + pthread_mutex_unlock(&svc_mut); + return ret; +} + +static int ds_compare_addr(const void *dst1, const void *dst2) +{ + const struct sockaddr *sa1, *sa2; + size_t len; + + sa1 = (const struct sockaddr *) dst1; + sa2 = (const struct sockaddr *) dst2; + + len = (sa1->sa_family == AF_INET6 && sa2->sa_family == AF_INET6) ? + sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in); + return memcmp(dst1, dst2, len); +} + +static int rs_value_to_scale(int value, int bits) +{ + return value <= (1 << (bits - 1)) ? + value : (1 << (bits - 1)) | (value >> bits); +} + +static int rs_scale_to_value(int value, int bits) +{ + return value <= (1 << (bits - 1)) ? + value : (value & ~(1 << (bits - 1))) << bits; +} + +/* gcc > ~5 will not allow (void)fscanf to suppress -Wunused-result, but this + will do it. In this case ignoring the result is OK (but horribly + unfriendly to user) since the library has a sane default. */ +#define failable_fscanf(f, fmt, ...) 
\ + { \ + int rc = fscanf(f, fmt, __VA_ARGS__); \ + (void) rc; \ + } + +static void rs_configure(void) +{ + FILE *f; + static int init; + + if (init) + return; + + pthread_mutex_lock(&mut); + if (init) + goto out; + + if (ucma_init()) + goto out; + ucma_ib_init(); + + if ((f = fopen(RS_CONF_DIR "/polling_time", "r"))) { + failable_fscanf(f, "%u", &polling_time); + fclose(f); + } + + f = fopen(RS_CONF_DIR "/wake_up_interval", "r"); + if (f) { + failable_fscanf(f, "%d", &wake_up_interval); + fclose(f); + } + if ((f = fopen(RS_CONF_DIR "/inline_default", "r"))) { + failable_fscanf(f, "%hu", &def_inline); + fclose(f); + } + + if ((f = fopen(RS_CONF_DIR "/sqsize_default", "r"))) { + failable_fscanf(f, "%hu", &def_sqsize); + fclose(f); + } + + if ((f = fopen(RS_CONF_DIR "/rqsize_default", "r"))) { + failable_fscanf(f, "%hu", &def_rqsize); + fclose(f); + } + + if ((f = fopen(RS_CONF_DIR "/mem_default", "r"))) { + failable_fscanf(f, "%u", &def_mem); + fclose(f); + + if (def_mem < 1) + def_mem = 1; + } + + if ((f = fopen(RS_CONF_DIR "/wmem_default", "r"))) { + failable_fscanf(f, "%u", &def_wmem); + fclose(f); + if (def_wmem < RS_SNDLOWAT) + def_wmem = RS_SNDLOWAT << 1; + } + + if ((f = fopen(RS_CONF_DIR "/iomap_size", "r"))) { + failable_fscanf(f, "%hu", &def_iomap_size); + fclose(f); + + /* round to supported values */ + def_iomap_size = (uint8_t) rs_value_to_scale( + (uint16_t) rs_scale_to_value(def_iomap_size, 8), 8); + } + init = 1; +out: + pthread_mutex_unlock(&mut); +} + +static int rs_insert(struct rsocket *rs, int index) +{ + pthread_mutex_lock(&mut); + rs->index = idm_set(&idm, index, rs); + pthread_mutex_unlock(&mut); + return rs->index; +} + +static void rs_remove(struct rsocket *rs) +{ + pthread_mutex_lock(&mut); + idm_clear(&idm, rs->index); + pthread_mutex_unlock(&mut); +} + +/* We only inherit from listening sockets */ +static struct rsocket *rs_alloc(struct rsocket *inherited_rs, int type) +{ + struct rsocket *rs; + + rs = calloc(1, sizeof(*rs)); + if (!rs) + return NULL; + + rs->type = type; + rs->index = -1; + if (type == SOCK_DGRAM) { + rs->udp_sock = -1; + rs->epfd = -1; + } + + if (inherited_rs) { + rs->sbuf_size = inherited_rs->sbuf_size; + rs->rbuf_size = inherited_rs->rbuf_size; + rs->sq_inline = inherited_rs->sq_inline; + rs->sq_size = inherited_rs->sq_size; + rs->rq_size = inherited_rs->rq_size; + if (type == SOCK_STREAM) { + rs->ctrl_max_seqno = inherited_rs->ctrl_max_seqno; + rs->target_iomap_size = inherited_rs->target_iomap_size; + } + } else { + rs->sbuf_size = def_wmem; + rs->rbuf_size = def_mem; + rs->sq_inline = def_inline; + rs->sq_size = def_sqsize; + rs->rq_size = def_rqsize; + if (type == SOCK_STREAM) { + rs->ctrl_max_seqno = RS_QP_CTRL_SIZE; + rs->target_iomap_size = def_iomap_size; + } + } + fastlock_init(&rs->slock); + fastlock_init(&rs->rlock); + fastlock_init(&rs->cq_lock); + fastlock_init(&rs->cq_wait_lock); + fastlock_init(&rs->map_lock); + dlist_init(&rs->iomap_list); + dlist_init(&rs->iomap_queue); + return rs; +} + +static int rs_set_nonblocking(struct rsocket *rs, int arg) +{ + struct ds_qp *qp; + int ret = 0; + + if (rs->type == SOCK_STREAM) { + if (rs->cm_id->recv_cq_channel) + ret = fcntl(rs->cm_id->recv_cq_channel->fd, F_SETFL, arg); + + if (rs->state == rs_listening) + ret = fcntl(rs->accept_queue[0], F_SETFL, arg); + else if (!ret && rs->state < rs_connected) + ret = fcntl(rs->cm_id->channel->fd, F_SETFL, arg); + } else { + ret = fcntl(rs->epfd, F_SETFL, arg); + if (!ret && rs->qp_list) { + qp = rs->qp_list; + do { + ret = 
fcntl(qp->cm_id->recv_cq_channel->fd, + F_SETFL, arg); + qp = ds_next_qp(qp); + } while (qp != rs->qp_list && !ret); + } + } + + return ret; +} + +static void rs_set_qp_size(struct rsocket *rs) +{ + uint16_t max_size; + + max_size = min(ucma_max_qpsize(rs->cm_id), RS_QP_MAX_SIZE); + + if (rs->sq_size > max_size) + rs->sq_size = max_size; + else if (rs->sq_size < RS_QP_MIN_SIZE) + rs->sq_size = RS_QP_MIN_SIZE; + + if (rs->rq_size > max_size) + rs->rq_size = max_size; + else if (rs->rq_size < RS_QP_MIN_SIZE) + rs->rq_size = RS_QP_MIN_SIZE; +} + +static void ds_set_qp_size(struct rsocket *rs) +{ + uint16_t max_size; + + max_size = min(ucma_max_qpsize(NULL), RS_QP_MAX_SIZE); + + if (rs->sq_size > max_size) + rs->sq_size = max_size; + if (rs->rq_size > max_size) + rs->rq_size = max_size; + + if (rs->rq_size > (rs->rbuf_size / RS_SNDLOWAT)) + rs->rq_size = rs->rbuf_size / RS_SNDLOWAT; + else + rs->rbuf_size = rs->rq_size * RS_SNDLOWAT; + + if (rs->sq_size > (rs->sbuf_size / RS_SNDLOWAT)) + rs->sq_size = rs->sbuf_size / RS_SNDLOWAT; + else + rs->sbuf_size = rs->sq_size * RS_SNDLOWAT; +} + +static int rs_init_bufs(struct rsocket *rs) +{ + uint32_t total_rbuf_size, total_sbuf_size; + size_t len; + + rs->rmsg = calloc(rs->rq_size + 1, sizeof(*rs->rmsg)); + if (!rs->rmsg) + return ERR(ENOMEM); + + total_sbuf_size = rs->sbuf_size; + if (rs->sq_inline < RS_MAX_CTRL_MSG) + total_sbuf_size += RS_MAX_CTRL_MSG * RS_QP_CTRL_SIZE; + rs->sbuf = calloc(total_sbuf_size, 1); + if (!rs->sbuf) + return ERR(ENOMEM); + + rs->smr = rdma_reg_msgs(rs->cm_id, rs->sbuf, total_sbuf_size); + if (!rs->smr) + return -1; + + len = sizeof(*rs->target_sgl) * RS_SGL_SIZE + + sizeof(*rs->target_iomap) * rs->target_iomap_size; + rs->target_buffer_list = malloc(len); + if (!rs->target_buffer_list) + return ERR(ENOMEM); + + rs->target_mr = rdma_reg_write(rs->cm_id, rs->target_buffer_list, len); + if (!rs->target_mr) + return -1; + + memset(rs->target_buffer_list, 0, len); + rs->target_sgl = rs->target_buffer_list; + if (rs->target_iomap_size) + rs->target_iomap = (struct rs_iomap *) (rs->target_sgl + RS_SGL_SIZE); + + total_rbuf_size = rs->rbuf_size; + if (rs->opts & RS_OPT_MSG_SEND) + total_rbuf_size += rs->rq_size * RS_MSG_SIZE; + rs->rbuf = calloc(total_rbuf_size, 1); + if (!rs->rbuf) + return ERR(ENOMEM); + + rs->rmr = rdma_reg_write(rs->cm_id, rs->rbuf, total_rbuf_size); + if (!rs->rmr) + return -1; + + rs->ssgl[0].addr = rs->ssgl[1].addr = (uintptr_t) rs->sbuf; + rs->sbuf_bytes_avail = rs->sbuf_size; + rs->ssgl[0].lkey = rs->ssgl[1].lkey = rs->smr->lkey; + + rs->rbuf_free_offset = rs->rbuf_size >> 1; + rs->rbuf_bytes_avail = rs->rbuf_size >> 1; + rs->sqe_avail = rs->sq_size - rs->ctrl_max_seqno; + rs->rseq_comp = rs->rq_size >> 1; + return 0; +} + +static int ds_init_bufs(struct ds_qp *qp) +{ + qp->rbuf = calloc(qp->rs->rbuf_size + sizeof(struct ibv_grh), 1); + if (!qp->rbuf) + return ERR(ENOMEM); + + qp->smr = rdma_reg_msgs(qp->cm_id, qp->rs->sbuf, qp->rs->sbuf_size); + if (!qp->smr) + return -1; + + qp->rmr = rdma_reg_msgs(qp->cm_id, qp->rbuf, qp->rs->rbuf_size + + sizeof(struct ibv_grh)); + if (!qp->rmr) + return -1; + + return 0; +} + +/* + * If a user is waiting on a datagram rsocket through poll or select, then + * we need the first completion to generate an event on the related epoll fd + * in order to signal the user. 
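+ * Without an armed CQ, no event is ever written to the completion
+ * channel, and a poll() on its fd would block indefinitely.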
We arm the CQ on creation for this purpose + */ +static int rs_create_cq(struct rsocket *rs, struct rdma_cm_id *cm_id) +{ + cm_id->recv_cq_channel = ibv_create_comp_channel(cm_id->verbs); + if (!cm_id->recv_cq_channel) + return -1; + + cm_id->recv_cq = ibv_create_cq(cm_id->verbs, rs->sq_size + rs->rq_size, + cm_id, cm_id->recv_cq_channel, 0); + if (!cm_id->recv_cq) + goto err1; + + if (rs->fd_flags & O_NONBLOCK) { + if (set_fd_nonblock(cm_id->recv_cq_channel->fd, true)) + goto err2; + } + + ibv_req_notify_cq(cm_id->recv_cq, 0); + cm_id->send_cq_channel = cm_id->recv_cq_channel; + cm_id->send_cq = cm_id->recv_cq; + return 0; + +err2: + ibv_destroy_cq(cm_id->recv_cq); + cm_id->recv_cq = NULL; +err1: + ibv_destroy_comp_channel(cm_id->recv_cq_channel); + cm_id->recv_cq_channel = NULL; + return -1; +} + +static inline int rs_post_recv(struct rsocket *rs) +{ + struct ibv_recv_wr wr, *bad; + struct ibv_sge sge; + + wr.next = NULL; + if (!(rs->opts & RS_OPT_MSG_SEND)) { + wr.wr_id = rs_recv_wr_id(0); + wr.sg_list = NULL; + wr.num_sge = 0; + } else { + wr.wr_id = rs_recv_wr_id(rs->rbuf_msg_index); + sge.addr = (uintptr_t) rs->rbuf + rs->rbuf_size + + (rs->rbuf_msg_index * RS_MSG_SIZE); + sge.length = RS_MSG_SIZE; + sge.lkey = rs->rmr->lkey; + + wr.sg_list = &sge; + wr.num_sge = 1; + if(++rs->rbuf_msg_index == rs->rq_size) + rs->rbuf_msg_index = 0; + } + + return rdma_seterrno(ibv_post_recv(rs->cm_id->qp, &wr, &bad)); +} + +static inline int ds_post_recv(struct rsocket *rs, struct ds_qp *qp, uint32_t offset) +{ + struct ibv_recv_wr wr, *bad; + struct ibv_sge sge[2]; + + sge[0].addr = (uintptr_t) qp->rbuf + rs->rbuf_size; + sge[0].length = sizeof(struct ibv_grh); + sge[0].lkey = qp->rmr->lkey; + sge[1].addr = (uintptr_t) qp->rbuf + offset; + sge[1].length = RS_SNDLOWAT; + sge[1].lkey = qp->rmr->lkey; + + wr.wr_id = rs_recv_wr_id(offset); + wr.next = NULL; + wr.sg_list = sge; + wr.num_sge = 2; + + return rdma_seterrno(ibv_post_recv(qp->cm_id->qp, &wr, &bad)); +} + +static int rs_create_ep(struct rsocket *rs) +{ + struct ibv_qp_init_attr qp_attr; + int i, ret; + + rs_set_qp_size(rs); + if (rs->cm_id->verbs->device->transport_type == IBV_TRANSPORT_IWARP) + rs->opts |= RS_OPT_MSG_SEND; + ret = rs_create_cq(rs, rs->cm_id); + if (ret) + return ret; + + memset(&qp_attr, 0, sizeof qp_attr); + qp_attr.qp_context = rs; + qp_attr.send_cq = rs->cm_id->send_cq; + qp_attr.recv_cq = rs->cm_id->recv_cq; + qp_attr.qp_type = IBV_QPT_RC; + qp_attr.sq_sig_all = 1; + qp_attr.cap.max_send_wr = rs->sq_size; + qp_attr.cap.max_recv_wr = rs->rq_size; + qp_attr.cap.max_send_sge = 2; + qp_attr.cap.max_recv_sge = 1; + qp_attr.cap.max_inline_data = rs->sq_inline; + + ret = rdma_create_qp(rs->cm_id, NULL, &qp_attr); + if (ret) + return ret; + + rs->sq_inline = qp_attr.cap.max_inline_data; + if ((rs->opts & RS_OPT_MSG_SEND) && (rs->sq_inline < RS_MSG_SIZE)) + return ERR(ENOTSUP); + + ret = rs_init_bufs(rs); + if (ret) + return ret; + + for (i = 0; i < rs->rq_size; i++) { + ret = rs_post_recv(rs); + if (ret) + return ret; + } + return 0; +} + +static void rs_release_iomap_mr(struct rs_iomap_mr *iomr) +{ + if (atomic_fetch_sub(&iomr->refcnt, 1) != 1) + return; + + dlist_remove(&iomr->entry); + ibv_dereg_mr(iomr->mr); + if (iomr->index >= 0) + iomr->mr = NULL; + else + free(iomr); +} + +static void rs_free_iomappings(struct rsocket *rs) +{ + struct rs_iomap_mr *iomr; + + while (!dlist_empty(&rs->iomap_list)) { + iomr = container_of(rs->iomap_list.next, + struct rs_iomap_mr, entry); + riounmap(rs->index, iomr->mr->addr, 
iomr->mr->length); + } + while (!dlist_empty(&rs->iomap_queue)) { + iomr = container_of(rs->iomap_queue.next, + struct rs_iomap_mr, entry); + riounmap(rs->index, iomr->mr->addr, iomr->mr->length); + } +} + +static void ds_free_qp(struct ds_qp *qp) +{ + if (qp->smr) + rdma_dereg_mr(qp->smr); + + if (qp->rbuf) { + if (qp->rmr) + rdma_dereg_mr(qp->rmr); + free(qp->rbuf); + } + + if (qp->cm_id) { + if (qp->cm_id->qp) { + tdelete(&qp->dest.addr, &qp->rs->dest_map, ds_compare_addr); + epoll_ctl(qp->rs->epfd, EPOLL_CTL_DEL, + qp->cm_id->recv_cq_channel->fd, NULL); + rdma_destroy_qp(qp->cm_id); + } + rdma_destroy_id(qp->cm_id); + } + + free(qp); +} + +static void ds_free(struct rsocket *rs) +{ + struct ds_qp *qp; + + if (rs->udp_sock >= 0) + close(rs->udp_sock); + + if (rs->index >= 0) + rs_remove(rs); + + if (rs->dmsg) + free(rs->dmsg); + + while ((qp = rs->qp_list)) { + ds_remove_qp(rs, qp); + ds_free_qp(qp); + } + + if (rs->epfd >= 0) + close(rs->epfd); + + if (rs->sbuf) + free(rs->sbuf); + + tdestroy(rs->dest_map, free); + fastlock_destroy(&rs->map_lock); + fastlock_destroy(&rs->cq_wait_lock); + fastlock_destroy(&rs->cq_lock); + fastlock_destroy(&rs->rlock); + fastlock_destroy(&rs->slock); + free(rs); +} + +static void rs_free(struct rsocket *rs) +{ + if (rs->type == SOCK_DGRAM) { + ds_free(rs); + return; + } + + if (rs->rmsg) + free(rs->rmsg); + + if (rs->sbuf) { + if (rs->smr) + rdma_dereg_mr(rs->smr); + free(rs->sbuf); + } + + if (rs->rbuf) { + if (rs->rmr) + rdma_dereg_mr(rs->rmr); + free(rs->rbuf); + } + + if (rs->target_buffer_list) { + if (rs->target_mr) + rdma_dereg_mr(rs->target_mr); + free(rs->target_buffer_list); + } + + if (rs->index >= 0) + rs_remove(rs); + + if (rs->cm_id) { + rs_free_iomappings(rs); + if (rs->cm_id->qp) { + ibv_ack_cq_events(rs->cm_id->recv_cq, rs->unack_cqe); + rdma_destroy_qp(rs->cm_id); + } + rdma_destroy_id(rs->cm_id); + } + + if (rs->accept_queue[0] > 0 || rs->accept_queue[1] > 0) { + close(rs->accept_queue[0]); + close(rs->accept_queue[1]); + } + + fastlock_destroy(&rs->map_lock); + fastlock_destroy(&rs->cq_wait_lock); + fastlock_destroy(&rs->cq_lock); + fastlock_destroy(&rs->rlock); + fastlock_destroy(&rs->slock); + free(rs); +} + +static size_t rs_conn_data_offset(struct rsocket *rs) +{ + return (rs->cm_id->route.addr.src_addr.sa_family == AF_IB) ? + sizeof(struct ib_connect_hdr) : 0; +} + +static void rs_format_conn_data(struct rsocket *rs, struct rs_conn_data *conn) +{ + conn->version = 1; + conn->flags = RS_CONN_FLAG_IOMAP | + (rs_host_is_net() ? 
RS_CONN_FLAG_NET : 0); + conn->credits = htobe16(rs->rq_size); + memset(conn->reserved, 0, sizeof conn->reserved); + conn->target_iomap_size = (uint8_t) rs_value_to_scale(rs->target_iomap_size, 8); + + conn->target_sgl.addr = (__force uint64_t)htobe64((uintptr_t) rs->target_sgl); + conn->target_sgl.length = (__force uint32_t)htobe32(RS_SGL_SIZE); + conn->target_sgl.key = (__force uint32_t)htobe32(rs->target_mr->rkey); + + conn->data_buf.addr = (__force uint64_t)htobe64((uintptr_t) rs->rbuf); + conn->data_buf.length = (__force uint32_t)htobe32(rs->rbuf_size >> 1); + conn->data_buf.key = (__force uint32_t)htobe32(rs->rmr->rkey); +} + +static void rs_save_conn_data(struct rsocket *rs, struct rs_conn_data *conn) +{ + rs->remote_sgl.addr = be64toh((__force __be64)conn->target_sgl.addr); + rs->remote_sgl.length = be32toh((__force __be32)conn->target_sgl.length); + rs->remote_sgl.key = be32toh((__force __be32)conn->target_sgl.key); + rs->remote_sge = 1; + if ((rs_host_is_net() && !(conn->flags & RS_CONN_FLAG_NET)) || + (!rs_host_is_net() && (conn->flags & RS_CONN_FLAG_NET))) + rs->opts = RS_OPT_SWAP_SGL; + + if (conn->flags & RS_CONN_FLAG_IOMAP) { + rs->remote_iomap.addr = rs->remote_sgl.addr + + sizeof(rs->remote_sgl) * rs->remote_sgl.length; + rs->remote_iomap.length = rs_scale_to_value(conn->target_iomap_size, 8); + rs->remote_iomap.key = rs->remote_sgl.key; + } + + rs->target_sgl[0].addr = be64toh((__force __be64)conn->data_buf.addr); + rs->target_sgl[0].length = be32toh((__force __be32)conn->data_buf.length); + rs->target_sgl[0].key = be32toh((__force __be32)conn->data_buf.key); + + rs->sseq_comp = be16toh(conn->credits); +} + +static int ds_init(struct rsocket *rs, int domain) +{ + rs->udp_sock = socket(domain, SOCK_DGRAM, 0); + if (rs->udp_sock < 0) + return rs->udp_sock; + + rs->epfd = epoll_create(2); + if (rs->epfd < 0) + return rs->epfd; + + return 0; +} + +static int ds_init_ep(struct rsocket *rs) +{ + struct ds_smsg *msg; + int i, ret; + + ds_set_qp_size(rs); + + rs->sbuf = calloc(rs->sq_size, RS_SNDLOWAT); + if (!rs->sbuf) + return ERR(ENOMEM); + + rs->dmsg = calloc(rs->rq_size + 1, sizeof(*rs->dmsg)); + if (!rs->dmsg) + return ERR(ENOMEM); + + rs->sqe_avail = rs->sq_size; + rs->rqe_avail = rs->rq_size; + + rs->smsg_free = (struct ds_smsg *) rs->sbuf; + msg = rs->smsg_free; + for (i = 0; i < rs->sq_size - 1; i++) { + msg->next = (void *) msg + RS_SNDLOWAT; + msg = msg->next; + } + msg->next = NULL; + + ret = rs_notify_svc(&udp_svc, rs, RS_SVC_ADD_DGRAM); + if (ret) + return ret; + + rs->state = rs_readable | rs_writable; + return 0; +} + +int rsocket(int domain, int type, int protocol) +{ + struct rsocket *rs; + int index, ret; + + if ((domain != AF_INET && domain != AF_INET6 && domain != AF_IB) || + ((type != SOCK_STREAM) && (type != SOCK_DGRAM)) || + (type == SOCK_STREAM && protocol && protocol != IPPROTO_TCP) || + (type == SOCK_DGRAM && protocol && protocol != IPPROTO_UDP)) + return ERR(ENOTSUP); + + rs_configure(); + rs = rs_alloc(NULL, type); + if (!rs) + return ERR(ENOMEM); + + if (type == SOCK_STREAM) { + ret = rdma_create_id(NULL, &rs->cm_id, rs, RDMA_PS_TCP); + if (ret) + goto err; + + rs->cm_id->route.addr.src_addr.sa_family = domain; + index = rs->cm_id->channel->fd; + } else { + ret = ds_init(rs, domain); + if (ret) + goto err; + + index = rs->udp_sock; + } + + ret = rs_insert(rs, index); + if (ret < 0) + goto err; + + return rs->index; + +err: + rs_free(rs); + return ret; +} + +int rbind(int socket, const struct sockaddr *addr, socklen_t addrlen) +{ + struct rsocket 
*rs;
+	int ret;
+
+	rs = idm_lookup(&idm, socket);
+	if (!rs)
+		return ERR(EBADF);
+	if (rs->type == SOCK_STREAM) {
+		ret = rdma_bind_addr(rs->cm_id, (struct sockaddr *) addr);
+		if (!ret)
+			rs->state = rs_bound;
+	} else {
+		if (rs->state == rs_init) {
+			ret = ds_init_ep(rs);
+			if (ret)
+				return ret;
+		}
+		ret = bind(rs->udp_sock, addr, addrlen);
+	}
+	return ret;
+}
+
+int rlisten(int socket, int backlog)
+{
+	struct rsocket *rs;
+	int ret;
+
+	rs = idm_lookup(&idm, socket);
+	if (!rs)
+		return ERR(EBADF);
+
+	if (rs->state == rs_listening)
+		return 0;
+
+	ret = rdma_listen(rs->cm_id, backlog);
+	if (ret)
+		return ret;
+
+	ret = socketpair(AF_UNIX, SOCK_STREAM, 0, rs->accept_queue);
+	if (ret)
+		return ret;
+
+	if (rs->fd_flags & O_NONBLOCK) {
+		ret = set_fd_nonblock(rs->accept_queue[0], true);
+		if (ret)
+			return ret;
+	}
+
+	ret = set_fd_nonblock(rs->cm_id->channel->fd, true);
+	if (ret)
+		return ret;
+
+	ret = rs_notify_svc(&listen_svc, rs, RS_SVC_ADD_CM);
+	if (ret)
+		return ret;
+
+	rs->state = rs_listening;
+	return 0;
+}
+
+/* Accepting new connection requests is currently a blocking operation */
+static void rs_accept(struct rsocket *rs)
+{
+	struct rsocket *new_rs;
+	struct rdma_conn_param param;
+	struct rs_conn_data *creq, cresp;
+	struct rdma_cm_id *cm_id;
+	int ret;
+
+	ret = rdma_get_request(rs->cm_id, &cm_id);
+	if (ret)
+		return;
+
+	new_rs = rs_alloc(rs, rs->type);
+	if (!new_rs)
+		goto err;
+	new_rs->cm_id = cm_id;
+
+	ret = rs_insert(new_rs, new_rs->cm_id->channel->fd);
+	if (ret < 0)
+		goto err;
+
+	creq = (struct rs_conn_data *)
+	       (new_rs->cm_id->event->param.conn.private_data + rs_conn_data_offset(rs));
+	if (creq->version != 1)
+		goto err;
+
+	ret = rs_create_ep(new_rs);
+	if (ret)
+		goto err;
+
+	rs_save_conn_data(new_rs, creq);
+	param = new_rs->cm_id->event->param.conn;
+	rs_format_conn_data(new_rs, &cresp);
+	param.private_data = &cresp;
+	param.private_data_len = sizeof cresp;
+	ret = rdma_accept(new_rs->cm_id, &param);
+	if (!ret)
+		new_rs->state = rs_connect_rdwr;
+	else if (errno == EAGAIN || errno == EWOULDBLOCK)
+		new_rs->state = rs_accepting;
+	else
+		goto err;
+
+	write_all(rs->accept_queue[1], &new_rs, sizeof(new_rs));
+	return;
+
+err:
+	rdma_reject(cm_id, NULL, 0);
+	if (new_rs)
+		rs_free(new_rs);
+}
+
+int raccept(int socket, struct sockaddr *addr, socklen_t *addrlen)
+{
+	struct rsocket *rs, *new_rs;
+	int ret;
+
+	rs = idm_lookup(&idm, socket);
+	if (!rs)
+		return ERR(EBADF);
+
+	if (rs->state != rs_listening)
+		return ERR(EBADF);
+
+	ret = read(rs->accept_queue[0], &new_rs, sizeof(new_rs));
+	if (ret != sizeof(new_rs))
+		return ret;
+
+	if (addr && addrlen)
+		rgetpeername(new_rs->index, addr, addrlen);
+	return new_rs->index;
+}
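+
+/*
+ * rs_do_connect() below drives the connect state machine: resolve
+ * address -> resolve route -> create endpoint -> rdma_connect, with
+ * struct rs_conn_data carried in the CM private data.  While a stage
+ * is pending on a nonblocking rsocket, the call fails with errno set
+ * to EINPROGRESS.  An illustrative caller sketch (error handling
+ * elided):
+ *
+ *	if (rconnect(fd, addr, addrlen) < 0 && errno == EINPROGRESS) {
+ *		struct pollfd p = { .fd = fd, .events = POLLOUT };
+ *		rpoll(&p, 1, -1);
+ *	}
+ */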
+static int rs_do_connect(struct rsocket *rs)
+{
+	struct rdma_conn_param param;
+	struct rs_conn_private_data cdata;
+	struct rs_conn_data *creq, *cresp;
+	int to, ret;
+
+	fastlock_acquire(&rs->slock);
+	switch (rs->state) {
+	case rs_init:
+	case rs_bound:
+resolve_addr:
+		to = 1000 << rs->retries++;
+		ret = rdma_resolve_addr(rs->cm_id, NULL,
+					&rs->cm_id->route.addr.dst_addr, to);
+		if (!ret)
+			goto resolve_route;
+		if (errno == EAGAIN || errno == EWOULDBLOCK)
+			rs->state = rs_resolving_addr;
+		break;
+	case rs_resolving_addr:
+		ret = ucma_complete(rs->cm_id);
+		if (ret) {
+			if (errno == ETIMEDOUT && rs->retries <= RS_CONN_RETRIES)
+				goto resolve_addr;
+			break;
+		}
+
+		rs->retries = 0;
+resolve_route:
+		to = 1000 << rs->retries++;
+		if (rs->optval) {
+			ret = rdma_set_option(rs->cm_id, RDMA_OPTION_IB,
+					      RDMA_OPTION_IB_PATH, rs->optval,
+					      rs->optlen);
+			free(rs->optval);
+			rs->optval = NULL;
+			if (!ret) {
+				rs->state = rs_resolving_route;
+				goto resolving_route;
+			}
+		} else {
+			ret = rdma_resolve_route(rs->cm_id, to);
+			if (!ret)
+				goto do_connect;
+		}
+		if (errno == EAGAIN || errno == EWOULDBLOCK)
+			rs->state = rs_resolving_route;
+		break;
+	case rs_resolving_route:
+resolving_route:
+		ret = ucma_complete(rs->cm_id);
+		if (ret) {
+			if (errno == ETIMEDOUT && rs->retries <= RS_CONN_RETRIES)
+				goto resolve_route;
+			break;
+		}
+do_connect:
+		ret = rs_create_ep(rs);
+		if (ret)
+			break;
+
+		memset(&param, 0, sizeof param);
+		creq = (void *) &cdata + rs_conn_data_offset(rs);
+		rs_format_conn_data(rs, creq);
+		param.private_data = (void *) creq - rs_conn_data_offset(rs);
+		param.private_data_len = sizeof(*creq) + rs_conn_data_offset(rs);
+		param.flow_control = 1;
+		param.retry_count = 7;
+		param.rnr_retry_count = 7;
+		/* work-around: iWarp issues RDMA read during connection */
+		if (rs->opts & RS_OPT_MSG_SEND)
+			param.initiator_depth = 1;
+		rs->retries = 0;
+
+		ret = rdma_connect(rs->cm_id, &param);
+		if (!ret)
+			goto connected;
+		if (errno == EAGAIN || errno == EWOULDBLOCK)
+			rs->state = rs_connecting;
+		break;
+	case rs_connecting:
+		ret = ucma_complete(rs->cm_id);
+		if (ret)
+			break;
+connected:
+		cresp = (struct rs_conn_data *) rs->cm_id->event->param.conn.private_data;
+		if (cresp->version != 1) {
+			ret = ERR(ENOTSUP);
+			break;
+		}
+
+		rs_save_conn_data(rs, cresp);
+		rs->state = rs_connect_rdwr;
+		break;
+	case rs_accepting:
+		if (!(rs->fd_flags & O_NONBLOCK))
+			set_fd_nonblock(rs->cm_id->channel->fd, true);
+
+		ret = ucma_complete(rs->cm_id);
+		if (ret)
+			break;
+
+		rs->state = rs_connect_rdwr;
+		break;
+	case rs_connect_error:
+	case rs_disconnected:
+	case rs_error:
+		ret = ERR(ENOTCONN);
+		goto unlock;
+	default:
+		ret = (rs->state & rs_connected) ?
0 : ERR(EINVAL); + goto unlock; + } + + if (ret) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + errno = EINPROGRESS; + } else { + rs->state = rs_connect_error; + rs->err = errno; + } + } +unlock: + fastlock_release(&rs->slock); + return ret; +} + +static int rs_any_addr(const union socket_addr *addr) +{ + if (addr->sa.sa_family == AF_INET) { + return (addr->sin.sin_addr.s_addr == htobe32(INADDR_ANY) || + addr->sin.sin_addr.s_addr == htobe32(INADDR_LOOPBACK)); + } else { + return (!memcmp(&addr->sin6.sin6_addr, &in6addr_any, 16) || + !memcmp(&addr->sin6.sin6_addr, &in6addr_loopback, 16)); + } +} + +static int ds_get_src_addr(struct rsocket *rs, + const struct sockaddr *dest_addr, socklen_t dest_len, + union socket_addr *src_addr, socklen_t *src_len) +{ + int sock, ret; + __be16 port; + + *src_len = sizeof(*src_addr); + ret = getsockname(rs->udp_sock, &src_addr->sa, src_len); + if (ret || !rs_any_addr(src_addr)) + return ret; + + port = src_addr->sin.sin_port; + sock = socket(dest_addr->sa_family, SOCK_DGRAM, 0); + if (sock < 0) + return sock; + + ret = connect(sock, dest_addr, dest_len); + if (ret) + goto out; + + *src_len = sizeof(*src_addr); + ret = getsockname(sock, &src_addr->sa, src_len); + src_addr->sin.sin_port = port; +out: + close(sock); + return ret; +} + +static void ds_format_hdr(struct ds_header *hdr, union socket_addr *addr) +{ + if (addr->sa.sa_family == AF_INET) { + hdr->version = 4; + hdr->length = DS_IPV4_HDR_LEN; + hdr->port = addr->sin.sin_port; + hdr->addr.ipv4 = addr->sin.sin_addr.s_addr; + } else { + hdr->version = 6; + hdr->length = DS_IPV6_HDR_LEN; + hdr->port = addr->sin6.sin6_port; + hdr->addr.ipv6.flowinfo= addr->sin6.sin6_flowinfo; + memcpy(&hdr->addr.ipv6.addr, &addr->sin6.sin6_addr, 16); + } +} + +static int ds_add_qp_dest(struct ds_qp *qp, union socket_addr *addr, + socklen_t addrlen) +{ + struct ibv_port_attr port_attr; + struct ibv_ah_attr attr; + int ret; + + memcpy(&qp->dest.addr, addr, addrlen); + qp->dest.qp = qp; + qp->dest.qpn = qp->cm_id->qp->qp_num; + + ret = ibv_query_port(qp->cm_id->verbs, qp->cm_id->port_num, &port_attr); + if (ret) + return ret; + + memset(&attr, 0, sizeof attr); + attr.dlid = port_attr.lid; + attr.port_num = qp->cm_id->port_num; + qp->dest.ah = ibv_create_ah(qp->cm_id->pd, &attr); + if (!qp->dest.ah) + return ERR(ENOMEM); + + tsearch(&qp->dest.addr, &qp->rs->dest_map, ds_compare_addr); + return 0; +} + +static int ds_create_qp(struct rsocket *rs, union socket_addr *src_addr, + socklen_t addrlen, struct ds_qp **new_qp) +{ + struct ds_qp *qp; + struct ibv_qp_init_attr qp_attr; + struct epoll_event event; + int i, ret; + + qp = calloc(1, sizeof(*qp)); + if (!qp) + return ERR(ENOMEM); + + qp->rs = rs; + ret = rdma_create_id(NULL, &qp->cm_id, qp, RDMA_PS_UDP); + if (ret) + goto err; + + ds_format_hdr(&qp->hdr, src_addr); + ret = rdma_bind_addr(qp->cm_id, &src_addr->sa); + if (ret) + goto err; + + ret = ds_init_bufs(qp); + if (ret) + goto err; + + ret = rs_create_cq(rs, qp->cm_id); + if (ret) + goto err; + + memset(&qp_attr, 0, sizeof qp_attr); + qp_attr.qp_context = qp; + qp_attr.send_cq = qp->cm_id->send_cq; + qp_attr.recv_cq = qp->cm_id->recv_cq; + qp_attr.qp_type = IBV_QPT_UD; + qp_attr.sq_sig_all = 1; + qp_attr.cap.max_send_wr = rs->sq_size; + qp_attr.cap.max_recv_wr = rs->rq_size; + qp_attr.cap.max_send_sge = 1; + qp_attr.cap.max_recv_sge = 2; + qp_attr.cap.max_inline_data = rs->sq_inline; + ret = rdma_create_qp(qp->cm_id, NULL, &qp_attr); + if (ret) + goto err; + + rs->sq_inline = qp_attr.cap.max_inline_data; + ret = 
ds_add_qp_dest(qp, src_addr, addrlen); + if (ret) + goto err; + + event.events = EPOLLIN; + event.data.ptr = qp; + ret = epoll_ctl(rs->epfd, EPOLL_CTL_ADD, + qp->cm_id->recv_cq_channel->fd, &event); + if (ret) + goto err; + + for (i = 0; i < rs->rq_size; i++) { + ret = ds_post_recv(rs, qp, i * RS_SNDLOWAT); + if (ret) + goto err; + } + + ds_insert_qp(rs, qp); + *new_qp = qp; + return 0; +err: + ds_free_qp(qp); + return ret; +} + +static int ds_get_qp(struct rsocket *rs, union socket_addr *src_addr, + socklen_t addrlen, struct ds_qp **qp) +{ + if (rs->qp_list) { + *qp = rs->qp_list; + do { + if (!ds_compare_addr(rdma_get_local_addr((*qp)->cm_id), + src_addr)) + return 0; + + *qp = ds_next_qp(*qp); + } while (*qp != rs->qp_list); + } + + return ds_create_qp(rs, src_addr, addrlen, qp); +} + +static int ds_get_dest(struct rsocket *rs, const struct sockaddr *addr, + socklen_t addrlen, struct ds_dest **dest) +{ + union socket_addr src_addr; + socklen_t src_len; + struct ds_qp *qp; + struct ds_dest **tdest, *new_dest; + int ret = 0; + + fastlock_acquire(&rs->map_lock); + tdest = tfind(addr, &rs->dest_map, ds_compare_addr); + if (tdest) + goto found; + + ret = ds_get_src_addr(rs, addr, addrlen, &src_addr, &src_len); + if (ret) + goto out; + + ret = ds_get_qp(rs, &src_addr, src_len, &qp); + if (ret) + goto out; + + tdest = tfind(addr, &rs->dest_map, ds_compare_addr); + if (!tdest) { + new_dest = calloc(1, sizeof(*new_dest)); + if (!new_dest) { + ret = ERR(ENOMEM); + goto out; + } + + memcpy(&new_dest->addr, addr, addrlen); + new_dest->qp = qp; + tdest = tsearch(&new_dest->addr, &rs->dest_map, ds_compare_addr); + } + +found: + *dest = *tdest; +out: + fastlock_release(&rs->map_lock); + return ret; +} + +int rconnect(int socket, const struct sockaddr *addr, socklen_t addrlen) +{ + struct rsocket *rs; + int ret, save_errno; + + rs = idm_lookup(&idm, socket); + if (!rs) + return ERR(EBADF); + if (rs->type == SOCK_STREAM) { + memcpy(&rs->cm_id->route.addr.dst_addr, addr, addrlen); + ret = rs_do_connect(rs); + if (ret == -1 && errno == EINPROGRESS) { + save_errno = errno; + /* The app can still drive the CM state on failure */ + rs_notify_svc(&connect_svc, rs, RS_SVC_ADD_CM); + errno = save_errno; + } + } else { + if (rs->state == rs_init) { + ret = ds_init_ep(rs); + if (ret) + return ret; + } + + fastlock_acquire(&rs->slock); + ret = connect(rs->udp_sock, addr, addrlen); + if (!ret) + ret = ds_get_dest(rs, addr, addrlen, &rs->conn_dest); + fastlock_release(&rs->slock); + } + return ret; +} + +static void *rs_get_ctrl_buf(struct rsocket *rs) +{ + return rs->sbuf + rs->sbuf_size + + RS_MAX_CTRL_MSG * (rs->ctrl_seqno & (RS_QP_CTRL_SIZE - 1)); +} + +static int rs_post_msg(struct rsocket *rs, uint32_t msg) +{ + struct ibv_send_wr wr, *bad; + struct ibv_sge sge; + + wr.wr_id = rs_send_wr_id(msg); + wr.next = NULL; + if (!(rs->opts & RS_OPT_MSG_SEND)) { + wr.sg_list = NULL; + wr.num_sge = 0; + wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM; + wr.send_flags = 0; + wr.imm_data = htobe32(msg); + } else { + sge.addr = (uintptr_t) &msg; + sge.lkey = 0; + sge.length = sizeof msg; + wr.sg_list = &sge; + wr.num_sge = 1; + wr.opcode = IBV_WR_SEND; + wr.send_flags = IBV_SEND_INLINE; + } + + return rdma_seterrno(ibv_post_send(rs->cm_id->qp, &wr, &bad)); +} + +static int rs_post_write(struct rsocket *rs, + struct ibv_sge *sgl, int nsge, + uint32_t wr_data, int flags, + uint64_t addr, uint32_t rkey) +{ + struct ibv_send_wr wr, *bad; + + wr.wr_id = rs_send_wr_id(wr_data); + wr.next = NULL; + wr.sg_list = sgl; + wr.num_sge = nsge; + 
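+	/* A plain RDMA write carries no immediate data and consumes no
+	 * receive WQE at the peer, so it generates no remote completion;
+	 * rs_post_write_msg() below adds the notification when needed. */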
wr.opcode = IBV_WR_RDMA_WRITE; + wr.send_flags = flags; + wr.wr.rdma.remote_addr = addr; + wr.wr.rdma.rkey = rkey; + + return rdma_seterrno(ibv_post_send(rs->cm_id->qp, &wr, &bad)); +} + +static int rs_post_write_msg(struct rsocket *rs, + struct ibv_sge *sgl, int nsge, + uint32_t msg, int flags, + uint64_t addr, uint32_t rkey) +{ + struct ibv_send_wr wr, *bad; + struct ibv_sge sge; + int ret; + + wr.next = NULL; + if (!(rs->opts & RS_OPT_MSG_SEND)) { + wr.wr_id = rs_send_wr_id(msg); + wr.sg_list = sgl; + wr.num_sge = nsge; + wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM; + wr.send_flags = flags; + wr.imm_data = htobe32(msg); + wr.wr.rdma.remote_addr = addr; + wr.wr.rdma.rkey = rkey; + + return rdma_seterrno(ibv_post_send(rs->cm_id->qp, &wr, &bad)); + } else { + ret = rs_post_write(rs, sgl, nsge, msg, flags, addr, rkey); + if (!ret) { + wr.wr_id = rs_send_wr_id(rs_msg_set(rs_msg_op(msg), 0)) | + RS_WR_ID_FLAG_MSG_SEND; + sge.addr = (uintptr_t) &msg; + sge.lkey = 0; + sge.length = sizeof msg; + wr.sg_list = &sge; + wr.num_sge = 1; + wr.opcode = IBV_WR_SEND; + wr.send_flags = IBV_SEND_INLINE; + + ret = rdma_seterrno(ibv_post_send(rs->cm_id->qp, &wr, &bad)); + } + return ret; + } +} + +static int ds_post_send(struct rsocket *rs, struct ibv_sge *sge, + uint32_t wr_data) +{ + struct ibv_send_wr wr, *bad; + + wr.wr_id = rs_send_wr_id(wr_data); + wr.next = NULL; + wr.sg_list = sge; + wr.num_sge = 1; + wr.opcode = IBV_WR_SEND; + wr.send_flags = (sge->length <= rs->sq_inline) ? IBV_SEND_INLINE : 0; + wr.wr.ud.ah = rs->conn_dest->ah; + wr.wr.ud.remote_qpn = rs->conn_dest->qpn; + wr.wr.ud.remote_qkey = RDMA_UDP_QKEY; + + return rdma_seterrno(ibv_post_send(rs->conn_dest->qp->cm_id->qp, &wr, &bad)); +} + +/* + * Update target SGE before sending data. Otherwise the remote side may + * update the entry before we do. 
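+ * (The entry is rewritten in place by the peer's next credit update,
+ * which arrives as an RDMA write, so snapshot addr/key and adjust the
+ * entry before posting.)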
+ */ +static int rs_write_data(struct rsocket *rs, + struct ibv_sge *sgl, int nsge, + uint32_t length, int flags) +{ + uint64_t addr; + uint32_t rkey; + + rs->sseq_no++; + rs->sqe_avail--; + if (rs->opts & RS_OPT_MSG_SEND) + rs->sqe_avail--; + rs->sbuf_bytes_avail -= length; + + addr = rs->target_sgl[rs->target_sge].addr; + rkey = rs->target_sgl[rs->target_sge].key; + + rs->target_sgl[rs->target_sge].addr += length; + rs->target_sgl[rs->target_sge].length -= length; + + if (!rs->target_sgl[rs->target_sge].length) { + if (++rs->target_sge == RS_SGL_SIZE) + rs->target_sge = 0; + } + + return rs_post_write_msg(rs, sgl, nsge, rs_msg_set(RS_OP_DATA, length), + flags, addr, rkey); +} + +static int rs_write_direct(struct rsocket *rs, struct rs_iomap *iom, uint64_t offset, + struct ibv_sge *sgl, int nsge, uint32_t length, int flags) +{ + uint64_t addr; + + rs->sqe_avail--; + rs->sbuf_bytes_avail -= length; + + addr = iom->sge.addr + offset - iom->offset; + return rs_post_write(rs, sgl, nsge, rs_msg_set(RS_OP_WRITE, length), + flags, addr, iom->sge.key); +} + +static int rs_write_iomap(struct rsocket *rs, struct rs_iomap_mr *iomr, + struct ibv_sge *sgl, int nsge, int flags) +{ + uint64_t addr; + + rs->sseq_no++; + rs->sqe_avail--; + if (rs->opts & RS_OPT_MSG_SEND) + rs->sqe_avail--; + rs->sbuf_bytes_avail -= sizeof(struct rs_iomap); + + addr = rs->remote_iomap.addr + iomr->index * sizeof(struct rs_iomap); + return rs_post_write_msg(rs, sgl, nsge, rs_msg_set(RS_OP_IOMAP_SGL, iomr->index), + flags, addr, rs->remote_iomap.key); +} + +static uint32_t rs_sbuf_left(struct rsocket *rs) +{ + return (uint32_t) (((uint64_t) (uintptr_t) &rs->sbuf[rs->sbuf_size]) - + rs->ssgl[0].addr); +} + +static void rs_send_credits(struct rsocket *rs) +{ + struct ibv_sge ibsge; + struct rs_sge sge, *sge_buf; + int flags; + + rs->ctrl_seqno++; + rs->rseq_comp = rs->rseq_no + (rs->rq_size >> 1); + if (rs->rbuf_bytes_avail >= (rs->rbuf_size >> 1)) { + if (rs->opts & RS_OPT_MSG_SEND) + rs->ctrl_seqno++; + + if (!(rs->opts & RS_OPT_SWAP_SGL)) { + sge.addr = (uintptr_t) &rs->rbuf[rs->rbuf_free_offset]; + sge.key = rs->rmr->rkey; + sge.length = rs->rbuf_size >> 1; + } else { + sge.addr = bswap_64((uintptr_t) &rs->rbuf[rs->rbuf_free_offset]); + sge.key = bswap_32(rs->rmr->rkey); + sge.length = bswap_32(rs->rbuf_size >> 1); + } + + if (rs->sq_inline < sizeof sge) { + sge_buf = rs_get_ctrl_buf(rs); + memcpy(sge_buf, &sge, sizeof sge); + ibsge.addr = (uintptr_t) sge_buf; + ibsge.lkey = rs->smr->lkey; + flags = 0; + } else { + ibsge.addr = (uintptr_t) &sge; + ibsge.lkey = 0; + flags = IBV_SEND_INLINE; + } + ibsge.length = sizeof(sge); + + rs_post_write_msg(rs, &ibsge, 1, + rs_msg_set(RS_OP_SGL, rs->rseq_no + rs->rq_size), flags, + rs->remote_sgl.addr + rs->remote_sge * sizeof(struct rs_sge), + rs->remote_sgl.key); + + rs->rbuf_bytes_avail -= rs->rbuf_size >> 1; + rs->rbuf_free_offset += rs->rbuf_size >> 1; + if (rs->rbuf_free_offset >= rs->rbuf_size) + rs->rbuf_free_offset = 0; + if (++rs->remote_sge == rs->remote_sgl.length) + rs->remote_sge = 0; + } else { + rs_post_msg(rs, rs_msg_set(RS_OP_SGL, rs->rseq_no + rs->rq_size)); + } +} + +static inline int rs_ctrl_avail(struct rsocket *rs) +{ + return rs->ctrl_seqno != rs->ctrl_max_seqno; +} + +/* Protocols that do not support RDMA write with immediate may require 2 msgs */ +static inline int rs_2ctrl_avail(struct rsocket *rs) +{ + return (int)((rs->ctrl_seqno + 1) - rs->ctrl_max_seqno) < 0; +} + +static int rs_give_credits(struct rsocket *rs) +{ + if (!(rs->opts & RS_OPT_MSG_SEND)) { + 
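+		/* Grant credits once half the receive buffer is free, or
+		 * half the receive queue has completed since the last
+		 * update, provided a control slot is available. */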
return ((rs->rbuf_bytes_avail >= (rs->rbuf_size >> 1)) || + ((short) ((short) rs->rseq_no - (short) rs->rseq_comp) >= 0)) && + rs_ctrl_avail(rs) && (rs->state & rs_connected); + } else { + return ((rs->rbuf_bytes_avail >= (rs->rbuf_size >> 1)) || + ((short) ((short) rs->rseq_no - (short) rs->rseq_comp) >= 0)) && + rs_2ctrl_avail(rs) && (rs->state & rs_connected); + } +} + +static void rs_update_credits(struct rsocket *rs) +{ + if (rs_give_credits(rs)) + rs_send_credits(rs); +} + +static int rs_poll_cq(struct rsocket *rs) +{ + struct ibv_wc wc; + uint32_t msg; + int ret, rcnt = 0; + + while ((ret = ibv_poll_cq(rs->cm_id->recv_cq, 1, &wc)) > 0) { + if (rs_wr_is_recv(wc.wr_id)) { + if (wc.status != IBV_WC_SUCCESS) + continue; + rcnt++; + + if (wc.wc_flags & IBV_WC_WITH_IMM) { + msg = be32toh(wc.imm_data); + } else { + msg = ((uint32_t *) (rs->rbuf + rs->rbuf_size)) + [rs_wr_data(wc.wr_id)]; + + } + switch (rs_msg_op(msg)) { + case RS_OP_SGL: + rs->sseq_comp = (uint16_t) rs_msg_data(msg); + break; + case RS_OP_IOMAP_SGL: + /* The iomap was updated, that's nice to know. */ + break; + case RS_OP_CTRL: + if (rs_msg_data(msg) == RS_CTRL_DISCONNECT) { + rs->state = rs_disconnected; + return 0; + } else if (rs_msg_data(msg) == RS_CTRL_SHUTDOWN) { + if (rs->state & rs_writable) { + rs->state &= ~rs_readable; + } else { + rs->state = rs_disconnected; + return 0; + } + } + break; + case RS_OP_WRITE: + /* We really shouldn't be here. */ + break; + default: + rs->rmsg[rs->rmsg_tail].op = rs_msg_op(msg); + rs->rmsg[rs->rmsg_tail].data = rs_msg_data(msg); + if (++rs->rmsg_tail == rs->rq_size + 1) + rs->rmsg_tail = 0; + break; + } + } else { + switch (rs_msg_op(rs_wr_data(wc.wr_id))) { + case RS_OP_SGL: + rs->ctrl_max_seqno++; + break; + case RS_OP_CTRL: + rs->ctrl_max_seqno++; + if (rs_msg_data(rs_wr_data(wc.wr_id)) == RS_CTRL_DISCONNECT) + rs->state = rs_disconnected; + break; + case RS_OP_IOMAP_SGL: + rs->sqe_avail++; + if (!rs_wr_is_msg_send(wc.wr_id)) + rs->sbuf_bytes_avail += sizeof(struct rs_iomap); + break; + default: + rs->sqe_avail++; + rs->sbuf_bytes_avail += rs_msg_data(rs_wr_data(wc.wr_id)); + break; + } + if (wc.status != IBV_WC_SUCCESS && (rs->state & rs_connected)) { + rs->state = rs_error; + rs->err = EIO; + } + } + } + + if (rs->state & rs_connected) { + while (!ret && rcnt--) + ret = rs_post_recv(rs); + + if (ret) { + rs->state = rs_error; + rs->err = errno; + } + } + return ret; +} + +static int rs_get_cq_event(struct rsocket *rs) +{ + struct ibv_cq *cq; + void *context; + int ret; + + if (!rs->cq_armed) + return 0; + + ret = ibv_get_cq_event(rs->cm_id->recv_cq_channel, &cq, &context); + if (!ret) { + if (++rs->unack_cqe >= rs->sq_size + rs->rq_size) { + ibv_ack_cq_events(rs->cm_id->recv_cq, rs->unack_cqe); + rs->unack_cqe = 0; + } + rs->cq_armed = 0; + } else if (!(errno == EAGAIN || errno == EINTR)) { + rs->state = rs_error; + } + + return ret; +} + +/* + * Although we serialize rsend and rrecv calls with respect to themselves, + * both calls may run simultaneously and need to poll the CQ for completions. + * We need to serialize access to the CQ, but rsend and rrecv need to + * allow each other to make forward progress. + * + * For example, rsend may need to wait for credits from the remote side, + * which could be stalled until the remote process calls rrecv. This should + * not block rrecv from receiving data from the remote side however. + * + * We handle this by using two locks. The cq_lock protects against polling + * the CQ and processing completions. 
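+ * A caller that must block drops cq_lock before sleeping, so the other
+ * path can continue polling and processing completions.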
The cq_wait_lock serializes access to + * waiting on the CQ. + */ +static int rs_process_cq(struct rsocket *rs, int nonblock, int (*test)(struct rsocket *rs)) +{ + int ret; + + fastlock_acquire(&rs->cq_lock); + do { + rs_update_credits(rs); + ret = rs_poll_cq(rs); + if (test(rs)) { + ret = 0; + break; + } else if (ret) { + break; + } else if (nonblock) { + ret = ERR(EWOULDBLOCK); + } else if (!rs->cq_armed) { + ibv_req_notify_cq(rs->cm_id->recv_cq, 0); + rs->cq_armed = 1; + } else { + rs_update_credits(rs); + fastlock_acquire(&rs->cq_wait_lock); + fastlock_release(&rs->cq_lock); + + ret = rs_get_cq_event(rs); + fastlock_release(&rs->cq_wait_lock); + fastlock_acquire(&rs->cq_lock); + } + } while (!ret); + + rs_update_credits(rs); + fastlock_release(&rs->cq_lock); + return ret; +} + +static int rs_get_comp(struct rsocket *rs, int nonblock, int (*test)(struct rsocket *rs)) +{ + uint64_t start_time = 0; + uint32_t poll_time; + int ret; + + do { + ret = rs_process_cq(rs, 1, test); + if (!ret || nonblock || errno != EWOULDBLOCK) + return ret; + + if (!start_time) + start_time = rs_time_us(); + + poll_time = (uint32_t) (rs_time_us() - start_time); + } while (poll_time <= polling_time); + + ret = rs_process_cq(rs, 0, test); + return ret; +} + +static int ds_valid_recv(struct ds_qp *qp, struct ibv_wc *wc) +{ + struct ds_header *hdr; + + hdr = (struct ds_header *) (qp->rbuf + rs_wr_data(wc->wr_id)); + return ((wc->byte_len >= sizeof(struct ibv_grh) + DS_IPV4_HDR_LEN) && + ((hdr->version == 4 && hdr->length == DS_IPV4_HDR_LEN) || + (hdr->version == 6 && hdr->length == DS_IPV6_HDR_LEN))); +} + +/* + * Poll all CQs associated with a datagram rsocket. We need to drop any + * received messages that we do not have room to store. To limit drops, + * we only poll if we have room to store the receive or we need a send + * buffer. To ensure fairness, we poll the CQs round robin, remembering + * where we left off. 
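+ * Any datagram we cannot store is dropped and its receive buffer is
+ * immediately reposted.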
+ */ +static void ds_poll_cqs(struct rsocket *rs) +{ + struct ds_qp *qp; + struct ds_smsg *smsg; + struct ds_rmsg *rmsg; + struct ibv_wc wc; + int ret, cnt; + + if (!(qp = rs->qp_list)) + return; + + do { + cnt = 0; + do { + ret = ibv_poll_cq(qp->cm_id->recv_cq, 1, &wc); + if (ret <= 0) { + qp = ds_next_qp(qp); + continue; + } + + if (rs_wr_is_recv(wc.wr_id)) { + if (rs->rqe_avail && wc.status == IBV_WC_SUCCESS && + ds_valid_recv(qp, &wc)) { + rs->rqe_avail--; + rmsg = &rs->dmsg[rs->rmsg_tail]; + rmsg->qp = qp; + rmsg->offset = rs_wr_data(wc.wr_id); + rmsg->length = wc.byte_len - sizeof(struct ibv_grh); + if (++rs->rmsg_tail == rs->rq_size + 1) + rs->rmsg_tail = 0; + } else { + ds_post_recv(rs, qp, rs_wr_data(wc.wr_id)); + } + } else { + smsg = (struct ds_smsg *) (rs->sbuf + rs_wr_data(wc.wr_id)); + smsg->next = rs->smsg_free; + rs->smsg_free = smsg; + rs->sqe_avail++; + } + + qp = ds_next_qp(qp); + if (!rs->rqe_avail && rs->sqe_avail) { + rs->qp_list = qp; + return; + } + cnt++; + } while (qp != rs->qp_list); + } while (cnt); +} + +static void ds_req_notify_cqs(struct rsocket *rs) +{ + struct ds_qp *qp; + + if (!(qp = rs->qp_list)) + return; + + do { + if (!qp->cq_armed) { + ibv_req_notify_cq(qp->cm_id->recv_cq, 0); + qp->cq_armed = 1; + } + qp = ds_next_qp(qp); + } while (qp != rs->qp_list); +} + +static int ds_get_cq_event(struct rsocket *rs) +{ + struct epoll_event event; + struct ds_qp *qp; + struct ibv_cq *cq; + void *context; + int ret; + + if (!rs->cq_armed) + return 0; + + ret = epoll_wait(rs->epfd, &event, 1, -1); + if (ret <= 0) + return ret; + + qp = event.data.ptr; + ret = ibv_get_cq_event(qp->cm_id->recv_cq_channel, &cq, &context); + if (!ret) { + ibv_ack_cq_events(qp->cm_id->recv_cq, 1); + qp->cq_armed = 0; + rs->cq_armed = 0; + } + + return ret; +} + +static int ds_process_cqs(struct rsocket *rs, int nonblock, int (*test)(struct rsocket *rs)) +{ + int ret = 0; + + fastlock_acquire(&rs->cq_lock); + do { + ds_poll_cqs(rs); + if (test(rs)) { + ret = 0; + break; + } else if (nonblock) { + ret = ERR(EWOULDBLOCK); + } else if (!rs->cq_armed) { + ds_req_notify_cqs(rs); + rs->cq_armed = 1; + } else { + fastlock_acquire(&rs->cq_wait_lock); + fastlock_release(&rs->cq_lock); + + ret = ds_get_cq_event(rs); + fastlock_release(&rs->cq_wait_lock); + fastlock_acquire(&rs->cq_lock); + } + } while (!ret); + + fastlock_release(&rs->cq_lock); + return ret; +} + +static int ds_get_comp(struct rsocket *rs, int nonblock, int (*test)(struct rsocket *rs)) +{ + uint64_t start_time = 0; + uint32_t poll_time; + int ret; + + do { + ret = ds_process_cqs(rs, 1, test); + if (!ret || nonblock || errno != EWOULDBLOCK) + return ret; + + if (!start_time) + start_time = rs_time_us(); + + poll_time = (uint32_t) (rs_time_us() - start_time); + } while (poll_time <= polling_time); + + ret = ds_process_cqs(rs, 0, test); + return ret; +} + +static int rs_nonblocking(struct rsocket *rs, int flags) +{ + return (rs->fd_flags & O_NONBLOCK) || (flags & MSG_DONTWAIT); +} + +static int rs_is_cq_armed(struct rsocket *rs) +{ + return rs->cq_armed; +} + +static int rs_poll_all(struct rsocket *rs) +{ + return 1; +} + +/* + * We use hardware flow control to prevent over running the remote + * receive queue. However, data transfers still require space in + * the remote rmsg queue, or we risk losing notification that data + * has been transfered. + * + * Be careful with race conditions in the check below. The target SGL + * may be updated by a remote RDMA write. 
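+ * (This is why target_sgl is declared volatile: the length tested here
+ * can be rewritten by the peer between the check and the send.)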
+ */ +static int rs_can_send(struct rsocket *rs) +{ + if (!(rs->opts & RS_OPT_MSG_SEND)) { + return rs->sqe_avail && (rs->sbuf_bytes_avail >= RS_SNDLOWAT) && + (rs->sseq_no != rs->sseq_comp) && + (rs->target_sgl[rs->target_sge].length != 0); + } else { + return (rs->sqe_avail >= 2) && (rs->sbuf_bytes_avail >= RS_SNDLOWAT) && + (rs->sseq_no != rs->sseq_comp) && + (rs->target_sgl[rs->target_sge].length != 0); + } +} + +static int ds_can_send(struct rsocket *rs) +{ + return rs->sqe_avail; +} + +static int ds_all_sends_done(struct rsocket *rs) +{ + return rs->sqe_avail == rs->sq_size; +} + +static int rs_conn_can_send(struct rsocket *rs) +{ + return rs_can_send(rs) || !(rs->state & rs_writable); +} + +static int rs_conn_can_send_ctrl(struct rsocket *rs) +{ + return rs_ctrl_avail(rs) || !(rs->state & rs_connected); +} + +static int rs_have_rdata(struct rsocket *rs) +{ + return (rs->rmsg_head != rs->rmsg_tail); +} + +static int rs_conn_have_rdata(struct rsocket *rs) +{ + return rs_have_rdata(rs) || !(rs->state & rs_readable); +} + +static int rs_conn_all_sends_done(struct rsocket *rs) +{ + return ((((int) rs->ctrl_max_seqno) - ((int) rs->ctrl_seqno)) + + rs->sqe_avail == rs->sq_size) || + !(rs->state & rs_connected); +} + +static void ds_set_src(struct sockaddr *addr, socklen_t *addrlen, + struct ds_header *hdr) +{ + union socket_addr sa; + + memset(&sa, 0, sizeof sa); + if (hdr->version == 4) { + if (*addrlen > sizeof(sa.sin)) + *addrlen = sizeof(sa.sin); + + sa.sin.sin_family = AF_INET; + sa.sin.sin_port = hdr->port; + sa.sin.sin_addr.s_addr = hdr->addr.ipv4; + } else { + if (*addrlen > sizeof(sa.sin6)) + *addrlen = sizeof(sa.sin6); + + sa.sin6.sin6_family = AF_INET6; + sa.sin6.sin6_port = hdr->port; + sa.sin6.sin6_flowinfo = hdr->addr.ipv6.flowinfo; + memcpy(&sa.sin6.sin6_addr, &hdr->addr.ipv6.addr, 16); + } + memcpy(addr, &sa, *addrlen); +} + +static ssize_t ds_recvfrom(struct rsocket *rs, void *buf, size_t len, int flags, + struct sockaddr *src_addr, socklen_t *addrlen) +{ + struct ds_rmsg *rmsg; + struct ds_header *hdr; + int ret; + + if (!(rs->state & rs_readable)) + return ERR(EINVAL); + + if (!rs_have_rdata(rs)) { + ret = ds_get_comp(rs, rs_nonblocking(rs, flags), + rs_have_rdata); + if (ret) + return ret; + } + + rmsg = &rs->dmsg[rs->rmsg_head]; + hdr = (struct ds_header *) (rmsg->qp->rbuf + rmsg->offset); + if (len > rmsg->length - hdr->length) + len = rmsg->length - hdr->length; + + memcpy(buf, (void *) hdr + hdr->length, len); + if (addrlen) + ds_set_src(src_addr, addrlen, hdr); + + if (!(flags & MSG_PEEK)) { + ds_post_recv(rs, rmsg->qp, rmsg->offset); + if (++rs->rmsg_head == rs->rq_size + 1) + rs->rmsg_head = 0; + rs->rqe_avail++; + } + + return len; +} + +static ssize_t rs_peek(struct rsocket *rs, void *buf, size_t len) +{ + size_t left = len; + uint32_t end_size, rsize; + int rmsg_head, rbuf_offset; + + rmsg_head = rs->rmsg_head; + rbuf_offset = rs->rbuf_offset; + + for (; left && (rmsg_head != rs->rmsg_tail); left -= rsize) { + if (left < rs->rmsg[rmsg_head].data) { + rsize = left; + } else { + rsize = rs->rmsg[rmsg_head].data; + if (++rmsg_head == rs->rq_size + 1) + rmsg_head = 0; + } + + end_size = rs->rbuf_size - rbuf_offset; + if (rsize > end_size) { + memcpy(buf, &rs->rbuf[rbuf_offset], end_size); + rbuf_offset = 0; + buf += end_size; + rsize -= end_size; + left -= end_size; + } + memcpy(buf, &rs->rbuf[rbuf_offset], rsize); + rbuf_offset += rsize; + buf += rsize; + } + + return len - left; +} + +/* + * Continue to receive any queued data even if the remote side has 
disconnected. + */ +ssize_t rrecv(int socket, void *buf, size_t len, int flags) +{ + struct rsocket *rs; + size_t left = len; + uint32_t end_size, rsize; + int ret = 0; + + rs = idm_at(&idm, socket); + if (!rs) + return ERR(EBADF); + if (rs->type == SOCK_DGRAM) { + fastlock_acquire(&rs->rlock); + ret = ds_recvfrom(rs, buf, len, flags, NULL, NULL); + fastlock_release(&rs->rlock); + return ret; + } + + if (rs->state & rs_opening) { + ret = rs_do_connect(rs); + if (ret) { + if (errno == EINPROGRESS) + errno = EAGAIN; + return ret; + } + } + fastlock_acquire(&rs->rlock); + do { + if (!rs_have_rdata(rs)) { + ret = rs_get_comp(rs, rs_nonblocking(rs, flags), + rs_conn_have_rdata); + if (ret) + break; + } + + if (flags & MSG_PEEK) { + left = len - rs_peek(rs, buf, left); + break; + } + + for (; left && rs_have_rdata(rs); left -= rsize) { + if (left < rs->rmsg[rs->rmsg_head].data) { + rsize = left; + rs->rmsg[rs->rmsg_head].data -= left; + } else { + rs->rseq_no++; + rsize = rs->rmsg[rs->rmsg_head].data; + if (++rs->rmsg_head == rs->rq_size + 1) + rs->rmsg_head = 0; + } + + end_size = rs->rbuf_size - rs->rbuf_offset; + if (rsize > end_size) { + memcpy(buf, &rs->rbuf[rs->rbuf_offset], end_size); + rs->rbuf_offset = 0; + buf += end_size; + rsize -= end_size; + left -= end_size; + rs->rbuf_bytes_avail += end_size; + } + memcpy(buf, &rs->rbuf[rs->rbuf_offset], rsize); + rs->rbuf_offset += rsize; + buf += rsize; + rs->rbuf_bytes_avail += rsize; + } + + } while (left && (flags & MSG_WAITALL) && (rs->state & rs_readable)); + + fastlock_release(&rs->rlock); + return (ret && left == len) ? ret : len - left; +} + +ssize_t rrecvfrom(int socket, void *buf, size_t len, int flags, + struct sockaddr *src_addr, socklen_t *addrlen) +{ + struct rsocket *rs; + int ret; + + rs = idm_at(&idm, socket); + if (!rs) + return ERR(EBADF); + if (rs->type == SOCK_DGRAM) { + fastlock_acquire(&rs->rlock); + ret = ds_recvfrom(rs, buf, len, flags, src_addr, addrlen); + fastlock_release(&rs->rlock); + return ret; + } + + ret = rrecv(socket, buf, len, flags); + if (ret > 0 && src_addr) + rgetpeername(socket, src_addr, addrlen); + + return ret; +} + +/* + * Simple, straightforward implementation for now that only tries to fill + * in the first vector. 
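+ * A complete version would walk all iovcnt entries, e.g. by calling
+ * rrecv() once per vector and stopping as soon as a call returns fewer
+ * bytes than iov[i].iov_len or fails.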
+ */ +static ssize_t rrecvv(int socket, const struct iovec *iov, int iovcnt, int flags) +{ + return rrecv(socket, iov[0].iov_base, iov[0].iov_len, flags); +} + +ssize_t rrecvmsg(int socket, struct msghdr *msg, int flags) +{ + if (msg->msg_control && msg->msg_controllen) + return ERR(ENOTSUP); + + return rrecvv(socket, msg->msg_iov, (int) msg->msg_iovlen, msg->msg_flags); +} + +ssize_t rread(int socket, void *buf, size_t count) +{ + return rrecv(socket, buf, count, 0); +} + +ssize_t rreadv(int socket, const struct iovec *iov, int iovcnt) +{ + return rrecvv(socket, iov, iovcnt, 0); +} + +static int rs_send_iomaps(struct rsocket *rs, int flags) +{ + struct rs_iomap_mr *iomr; + struct ibv_sge sge; + struct rs_iomap iom; + int ret; + + fastlock_acquire(&rs->map_lock); + while (!dlist_empty(&rs->iomap_queue)) { + if (!rs_can_send(rs)) { + ret = rs_get_comp(rs, rs_nonblocking(rs, flags), + rs_conn_can_send); + if (ret) + break; + if (!(rs->state & rs_writable)) { + ret = ERR(ECONNRESET); + break; + } + } + + iomr = container_of(rs->iomap_queue.next, struct rs_iomap_mr, entry); + if (!(rs->opts & RS_OPT_SWAP_SGL)) { + iom.offset = iomr->offset; + iom.sge.addr = (uintptr_t) iomr->mr->addr; + iom.sge.length = iomr->mr->length; + iom.sge.key = iomr->mr->rkey; + } else { + iom.offset = bswap_64(iomr->offset); + iom.sge.addr = bswap_64((uintptr_t) iomr->mr->addr); + iom.sge.length = bswap_32(iomr->mr->length); + iom.sge.key = bswap_32(iomr->mr->rkey); + } + + if (rs->sq_inline >= sizeof iom) { + sge.addr = (uintptr_t) &iom; + sge.length = sizeof iom; + sge.lkey = 0; + ret = rs_write_iomap(rs, iomr, &sge, 1, IBV_SEND_INLINE); + } else if (rs_sbuf_left(rs) >= sizeof iom) { + memcpy((void *) (uintptr_t) rs->ssgl[0].addr, &iom, sizeof iom); + rs->ssgl[0].length = sizeof iom; + ret = rs_write_iomap(rs, iomr, rs->ssgl, 1, 0); + if (rs_sbuf_left(rs) > sizeof iom) + rs->ssgl[0].addr += sizeof iom; + else + rs->ssgl[0].addr = (uintptr_t) rs->sbuf; + } else { + rs->ssgl[0].length = rs_sbuf_left(rs); + memcpy((void *) (uintptr_t) rs->ssgl[0].addr, &iom, + rs->ssgl[0].length); + rs->ssgl[1].length = sizeof iom - rs->ssgl[0].length; + memcpy(rs->sbuf, ((void *) &iom) + rs->ssgl[0].length, + rs->ssgl[1].length); + ret = rs_write_iomap(rs, iomr, rs->ssgl, 2, 0); + rs->ssgl[0].addr = (uintptr_t) rs->sbuf + rs->ssgl[1].length; + } + dlist_remove(&iomr->entry); + dlist_insert_tail(&iomr->entry, &rs->iomap_list); + if (ret) + break; + } + + rs->iomap_pending = !dlist_empty(&rs->iomap_queue); + fastlock_release(&rs->map_lock); + return ret; +} + +static ssize_t ds_sendv_udp(struct rsocket *rs, const struct iovec *iov, + int iovcnt, int flags, uint8_t op) +{ + struct ds_udp_header hdr; + struct msghdr msg; + struct iovec miov[8]; + ssize_t ret; + + if (iovcnt > 8) + return ERR(ENOTSUP); + + hdr.tag = htobe32(DS_UDP_TAG); + hdr.version = rs->conn_dest->qp->hdr.version; + hdr.op = op; + hdr.reserved = 0; + hdr.qpn = htobe32(rs->conn_dest->qp->cm_id->qp->qp_num & 0xFFFFFF); + if (rs->conn_dest->qp->hdr.version == 4) { + hdr.length = DS_UDP_IPV4_HDR_LEN; + hdr.addr.ipv4 = rs->conn_dest->qp->hdr.addr.ipv4; + } else { + hdr.length = DS_UDP_IPV6_HDR_LEN; + memcpy(hdr.addr.ipv6, &rs->conn_dest->qp->hdr.addr.ipv6, 16); + } + + miov[0].iov_base = &hdr; + miov[0].iov_len = hdr.length; + if (iov && iovcnt) + memcpy(&miov[1], iov, sizeof(*iov) * iovcnt); + + memset(&msg, 0, sizeof msg); + msg.msg_name = &rs->conn_dest->addr; + msg.msg_namelen = ucma_addrlen(&rs->conn_dest->addr.sa); + msg.msg_iov = miov; + msg.msg_iovlen = iovcnt + 1; 
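+
+	/* The rsocket UDP header travels as miov[0] over the plain UDP
+	 * socket; its length is subtracted from the sendmsg() result below
+	 * so callers see only the payload byte count.
+	 */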
+ ret = sendmsg(rs->udp_sock, &msg, flags); + return ret > 0 ? ret - hdr.length : ret; +} + +static ssize_t ds_send_udp(struct rsocket *rs, const void *buf, size_t len, + int flags, uint8_t op) +{ + struct iovec iov; + if (buf && len) { + iov.iov_base = (void *) buf; + iov.iov_len = len; + return ds_sendv_udp(rs, &iov, 1, flags, op); + } else { + return ds_sendv_udp(rs, NULL, 0, flags, op); + } +} + +static ssize_t dsend(struct rsocket *rs, const void *buf, size_t len, int flags) +{ + struct ds_smsg *msg; + struct ibv_sge sge; + uint64_t offset; + int ret = 0; + + if (!rs->conn_dest->ah) + return ds_send_udp(rs, buf, len, flags, RS_OP_DATA); + + if (!ds_can_send(rs)) { + ret = ds_get_comp(rs, rs_nonblocking(rs, flags), ds_can_send); + if (ret) + return ret; + } + + msg = rs->smsg_free; + rs->smsg_free = msg->next; + rs->sqe_avail--; + + memcpy((void *) msg, &rs->conn_dest->qp->hdr, rs->conn_dest->qp->hdr.length); + memcpy((void *) msg + rs->conn_dest->qp->hdr.length, buf, len); + sge.addr = (uintptr_t) msg; + sge.length = rs->conn_dest->qp->hdr.length + len; + sge.lkey = rs->conn_dest->qp->smr->lkey; + offset = (uint8_t *) msg - rs->sbuf; + + ret = ds_post_send(rs, &sge, offset); + return ret ? ret : len; +} + +/* + * We overlap sending the data, by posting a small work request immediately, + * then increasing the size of the send on each iteration. + */ +ssize_t rsend(int socket, const void *buf, size_t len, int flags) +{ + struct rsocket *rs; + struct ibv_sge sge; + size_t left = len; + uint32_t xfer_size, olen = RS_OLAP_START_SIZE; + int ret = 0; + + rs = idm_at(&idm, socket); + if (!rs) + return ERR(EBADF); + if (rs->type == SOCK_DGRAM) { + fastlock_acquire(&rs->slock); + ret = dsend(rs, buf, len, flags); + fastlock_release(&rs->slock); + return ret; + } + + if (rs->state & rs_opening) { + ret = rs_do_connect(rs); + if (ret) { + if (errno == EINPROGRESS) + errno = EAGAIN; + return ret; + } + } + + fastlock_acquire(&rs->slock); + if (rs->iomap_pending) { + ret = rs_send_iomaps(rs, flags); + if (ret) + goto out; + } + for (; left; left -= xfer_size, buf += xfer_size) { + if (!rs_can_send(rs)) { + ret = rs_get_comp(rs, rs_nonblocking(rs, flags), + rs_conn_can_send); + if (ret) + break; + if (!(rs->state & rs_writable)) { + ret = ERR(ECONNRESET); + break; + } + } + + if (olen < left) { + xfer_size = olen; + if (olen < RS_MAX_TRANSFER) + olen <<= 1; + } else { + xfer_size = left; + } + + if (xfer_size > rs->sbuf_bytes_avail) + xfer_size = rs->sbuf_bytes_avail; + if (xfer_size > rs->target_sgl[rs->target_sge].length) + xfer_size = rs->target_sgl[rs->target_sge].length; + + if (xfer_size <= rs->sq_inline) { + sge.addr = (uintptr_t) buf; + sge.length = xfer_size; + sge.lkey = 0; + ret = rs_write_data(rs, &sge, 1, xfer_size, IBV_SEND_INLINE); + } else if (xfer_size <= rs_sbuf_left(rs)) { + memcpy((void *) (uintptr_t) rs->ssgl[0].addr, buf, xfer_size); + rs->ssgl[0].length = xfer_size; + ret = rs_write_data(rs, rs->ssgl, 1, xfer_size, 0); + if (xfer_size < rs_sbuf_left(rs)) + rs->ssgl[0].addr += xfer_size; + else + rs->ssgl[0].addr = (uintptr_t) rs->sbuf; + } else { + rs->ssgl[0].length = rs_sbuf_left(rs); + memcpy((void *) (uintptr_t) rs->ssgl[0].addr, buf, + rs->ssgl[0].length); + rs->ssgl[1].length = xfer_size - rs->ssgl[0].length; + memcpy(rs->sbuf, buf + rs->ssgl[0].length, rs->ssgl[1].length); + ret = rs_write_data(rs, rs->ssgl, 2, xfer_size, 0); + rs->ssgl[0].addr = (uintptr_t) rs->sbuf + rs->ssgl[1].length; + } + if (ret) + break; + } +out: + fastlock_release(&rs->slock); + + return (ret 
&& left == len) ? ret : len - left; +} + +ssize_t rsendto(int socket, const void *buf, size_t len, int flags, + const struct sockaddr *dest_addr, socklen_t addrlen) +{ + struct rsocket *rs; + int ret; + + rs = idm_at(&idm, socket); + if (!rs) + return ERR(EBADF); + if (rs->type == SOCK_STREAM) { + if (dest_addr || addrlen) + return ERR(EISCONN); + + return rsend(socket, buf, len, flags); + } + + if (rs->state == rs_init) { + ret = ds_init_ep(rs); + if (ret) + return ret; + } + + fastlock_acquire(&rs->slock); + if (!rs->conn_dest || ds_compare_addr(dest_addr, &rs->conn_dest->addr)) { + ret = ds_get_dest(rs, dest_addr, addrlen, &rs->conn_dest); + if (ret) + goto out; + } + + ret = dsend(rs, buf, len, flags); +out: + fastlock_release(&rs->slock); + return ret; +} + +static void rs_copy_iov(void *dst, const struct iovec **iov, size_t *offset, size_t len) +{ + size_t size; + + while (len) { + size = (*iov)->iov_len - *offset; + if (size > len) { + memcpy (dst, (*iov)->iov_base + *offset, len); + *offset += len; + break; + } + + memcpy(dst, (*iov)->iov_base + *offset, size); + len -= size; + dst += size; + (*iov)++; + *offset = 0; + } +} + +static ssize_t rsendv(int socket, const struct iovec *iov, int iovcnt, int flags) +{ + struct rsocket *rs; + const struct iovec *cur_iov; + size_t left, len, offset = 0; + uint32_t xfer_size, olen = RS_OLAP_START_SIZE; + int i, ret = 0; + + rs = idm_at(&idm, socket); + if (!rs) + return ERR(EBADF); + if (rs->state & rs_opening) { + ret = rs_do_connect(rs); + if (ret) { + if (errno == EINPROGRESS) + errno = EAGAIN; + return ret; + } + } + + cur_iov = iov; + len = iov[0].iov_len; + for (i = 1; i < iovcnt; i++) + len += iov[i].iov_len; + left = len; + + fastlock_acquire(&rs->slock); + if (rs->iomap_pending) { + ret = rs_send_iomaps(rs, flags); + if (ret) + goto out; + } + for (; left; left -= xfer_size) { + if (!rs_can_send(rs)) { + ret = rs_get_comp(rs, rs_nonblocking(rs, flags), + rs_conn_can_send); + if (ret) + break; + if (!(rs->state & rs_writable)) { + ret = ERR(ECONNRESET); + break; + } + } + + if (olen < left) { + xfer_size = olen; + if (olen < RS_MAX_TRANSFER) + olen <<= 1; + } else { + xfer_size = left; + } + + if (xfer_size > rs->sbuf_bytes_avail) + xfer_size = rs->sbuf_bytes_avail; + if (xfer_size > rs->target_sgl[rs->target_sge].length) + xfer_size = rs->target_sgl[rs->target_sge].length; + + if (xfer_size <= rs_sbuf_left(rs)) { + rs_copy_iov((void *) (uintptr_t) rs->ssgl[0].addr, + &cur_iov, &offset, xfer_size); + rs->ssgl[0].length = xfer_size; + ret = rs_write_data(rs, rs->ssgl, 1, xfer_size, + xfer_size <= rs->sq_inline ? IBV_SEND_INLINE : 0); + if (xfer_size < rs_sbuf_left(rs)) + rs->ssgl[0].addr += xfer_size; + else + rs->ssgl[0].addr = (uintptr_t) rs->sbuf; + } else { + rs->ssgl[0].length = rs_sbuf_left(rs); + rs_copy_iov((void *) (uintptr_t) rs->ssgl[0].addr, &cur_iov, + &offset, rs->ssgl[0].length); + rs->ssgl[1].length = xfer_size - rs->ssgl[0].length; + rs_copy_iov(rs->sbuf, &cur_iov, &offset, rs->ssgl[1].length); + ret = rs_write_data(rs, rs->ssgl, 2, xfer_size, + xfer_size <= rs->sq_inline ? IBV_SEND_INLINE : 0); + rs->ssgl[0].addr = (uintptr_t) rs->sbuf + rs->ssgl[1].length; + } + if (ret) + break; + } +out: + fastlock_release(&rs->slock); + + return (ret && left == len) ? 
ret : len - left;
+}
+
+ssize_t rsendmsg(int socket, const struct msghdr *msg, int flags)
+{
+	if (msg->msg_control && msg->msg_controllen)
+		return ERR(ENOTSUP);
+
+	return rsendv(socket, msg->msg_iov, (int) msg->msg_iovlen, flags);
+}
+
+ssize_t rwrite(int socket, const void *buf, size_t count)
+{
+	return rsend(socket, buf, count, 0);
+}
+
+ssize_t rwritev(int socket, const struct iovec *iov, int iovcnt)
+{
+	return rsendv(socket, iov, iovcnt, 0);
+}
+
+/* When mapping rpoll to poll, the events reported on the RDMA
+ * fd are independent of the events rpoll may be looking for.
+ * To avoid threads hanging in poll, whenever any event occurs,
+ * we need to wake up all threads in poll, so that they can check
+ * if there has been a change on the rsockets they are monitoring.
+ * To support this, we 'gate' threads entering and leaving rpoll.
+ */
+static int rs_pollinit(void)
+{
+	int ret = 0;
+
+	pthread_mutex_lock(&mut);
+	if (pollsignal >= 0)
+		goto unlock;
+
+	pollsignal = eventfd(0, EFD_NONBLOCK | EFD_SEMAPHORE);
+	if (pollsignal < 0)
+		ret = -errno;
+
+unlock:
+	pthread_mutex_unlock(&mut);
+	return ret;
+}
+
+/* When an event occurs, we must wait until the state of all rsockets
+ * has settled. Then we need to re-check the rsocket state prior to
+ * blocking on poll().
+ */
+static int rs_poll_enter(void)
+{
+	pthread_mutex_lock(&mut);
+	if (suspendpoll) {
+		pthread_mutex_unlock(&mut);
+		sched_yield();
+		return -EBUSY;
+	}
+
+	pollcnt++;
+	pthread_mutex_unlock(&mut);
+	return 0;
+}
+
+static void rs_poll_exit(void)
+{
+	uint64_t c;
+	int save_errno;
+	ssize_t ret;
+
+	pthread_mutex_lock(&mut);
+	if (!--pollcnt) {
+		/* Keep errno value from poll() call. We try to clear
+		 * a single signal. But there's no guarantee that we'll
+		 * find one. Additional signals indicate that a change
+		 * occurred on an rsocket, which requires all threads to
+		 * re-check before blocking on poll.
+		 */
+		save_errno = errno;
+		ret = read(pollsignal, &c, sizeof(c));
+		if (ret != sizeof(c))
+			errno = save_errno;
+		suspendpoll = 0;
+	}
+	pthread_mutex_unlock(&mut);
+}
+
+/* When an event occurs, it's possible for a single thread blocked in
+ * poll to return from the kernel, read the event, and update the state
+ * of an rsocket. However, that can leave threads blocked in the kernel
+ * on poll (trying to read the CQ fd), which have had their rsocket
+ * state set. To avoid those threads remaining blocked in the kernel,
+ * we must wake them up and ensure that they all return to user space,
+ * in order to re-check the state of their rsockets.
+ *
+ * Because poll is racy with respect to updating the rsocket states,
+ * we need to signal state checks whenever a thread updates the state
+ * of a monitored rsocket, independent of whether that thread actually
+ * reads an event from an fd. In other words, we must wake up all
+ * polling threads whenever poll() indicates that there is a new
+ * completion to process, and when rpoll() will return a successful
+ * value after having blocked.
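+ *
+ * The pollsignal eventfd created in rs_pollinit() is that wakeup
+ * mechanism: rs_poll_signal() writes a single token to it, the extra
+ * pollfd entry appended by rs_fds_alloc() becomes readable, and every
+ * thread drops out of poll() to re-run its state checks.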
+ */
+static void rs_poll_stop(void)
+{
+	uint64_t c;
+	int save_errno;
+	ssize_t ret;
+
+	/* See comment in rs_poll_exit */
+	save_errno = errno;
+
+	pthread_mutex_lock(&mut);
+	if (!--pollcnt) {
+		ret = read(pollsignal, &c, sizeof(c));
+		suspendpoll = 0;
+	} else if (!suspendpoll) {
+		suspendpoll = 1;
+		c = 1;
+		ret = write(pollsignal, &c, sizeof(c));
+	} else {
+		ret = sizeof(c);
+	}
+	pthread_mutex_unlock(&mut);
+
+	if (ret != sizeof(c))
+		errno = save_errno;
+}
+
+static int rs_poll_signal(void)
+{
+	uint64_t c;
+	ssize_t ret;
+
+	pthread_mutex_lock(&mut);
+	if (pollcnt && !suspendpoll) {
+		suspendpoll = 1;
+		c = 1;
+		ret = write(pollsignal, &c, sizeof(c));
+		if (ret == sizeof(c))
+			ret = 0;
+	} else {
+		ret = 0;
+	}
+	pthread_mutex_unlock(&mut);
+	return ret;
+}
+
+/* We always add the pollsignal read fd to the poll fd set, so
+ * that we can signal any blocked threads.
+ */
+static struct pollfd *rs_fds_alloc(nfds_t nfds)
+{
+	static __thread struct pollfd *rfds;
+	static __thread nfds_t rnfds;
+
+	if (nfds + 1 > rnfds) {
+		if (rfds)
+			free(rfds);
+		else if (rs_pollinit())
+			return NULL;
+
+		/* One extra slot is reserved for the pollsignal fd. */
+		rfds = malloc(sizeof(*rfds) * (nfds + 1));
+		rnfds = rfds ? nfds + 1 : 0;
+	}
+
+	if (rfds) {
+		rfds[nfds].fd = pollsignal;
+		rfds[nfds].events = POLLIN;
+	}
+	return rfds;
+}
+
+static int rs_poll_rs(struct rsocket *rs, int events,
+		      int nonblock, int (*test)(struct rsocket *rs))
+{
+	struct pollfd fds;
+	short revents;
+	int ret;
+
+check_cq:
+	if ((rs->type == SOCK_STREAM) && ((rs->state & rs_connected) ||
+	    (rs->state == rs_disconnected) || (rs->state & rs_error))) {
+		rs_process_cq(rs, nonblock, test);
+
+		revents = 0;
+		if ((events & POLLIN) && rs_conn_have_rdata(rs))
+			revents |= POLLIN;
+		if ((events & POLLOUT) && rs_can_send(rs))
+			revents |= POLLOUT;
+		if (!(rs->state & rs_connected)) {
+			if (rs->state == rs_disconnected)
+				revents |= POLLHUP;
+			else
+				revents |= POLLERR;
+		}
+
+		return revents;
+	} else if (rs->type == SOCK_DGRAM) {
+		ds_process_cqs(rs, nonblock, test);
+
+		revents = 0;
+		if ((events & POLLIN) && rs_have_rdata(rs))
+			revents |= POLLIN;
+		if ((events & POLLOUT) && ds_can_send(rs))
+			revents |= POLLOUT;
+
+		return revents;
+	}
+
+	if (rs->state == rs_listening) {
+		fds.fd = rs->accept_queue[0];
+		fds.events = events;
+		fds.revents = 0;
+		poll(&fds, 1, 0);
+		return fds.revents;
+	}
+
+	if (rs->state & rs_opening) {
+		ret = rs_do_connect(rs);
+		if (ret && (errno == EINPROGRESS)) {
+			errno = 0;
+		} else {
+			goto check_cq;
+		}
+	}
+
+	if (rs->state == rs_connect_error) {
+		revents = 0;
+		if (events & POLLOUT)
+			revents |= POLLOUT;
+		if (events & POLLIN)
+			revents |= POLLIN;
+		revents |= POLLERR;
+		return revents;
+	}
+
+	return 0;
+}
+
+static int rs_poll_check(struct pollfd *fds, nfds_t nfds)
+{
+	struct rsocket *rs;
+	int i, cnt = 0;
+
+	for (i = 0; i < nfds; i++) {
+		rs = idm_lookup(&idm, fds[i].fd);
+		if (rs)
+			fds[i].revents = rs_poll_rs(rs, fds[i].events, 1, rs_poll_all);
+		else
+			poll(&fds[i], 1, 0);
+
+		if (fds[i].revents)
+			cnt++;
+	}
+	return cnt;
+}
+
+static int rs_poll_arm(struct pollfd *rfds, struct pollfd *fds, nfds_t nfds)
+{
+	struct rsocket *rs;
+	int i;
+
+	for (i = 0; i < nfds; i++) {
+		rs = idm_lookup(&idm, fds[i].fd);
+		if (rs) {
+			fds[i].revents = rs_poll_rs(rs, fds[i].events, 0, rs_is_cq_armed);
+			if (fds[i].revents)
+				return 1;
+
+			if (rs->type == SOCK_STREAM) {
+				if (rs->state >= rs_connected)
+					rfds[i].fd = rs->cm_id->recv_cq_channel->fd;
+				else
+					rfds[i].fd = rs->cm_id->channel->fd;
+			} else {
+				rfds[i].fd = rs->epfd;
+			}
+			rfds[i].events = POLLIN;
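+			/* Only POLLIN matters on the underlying channel fd;
+			 * the events the caller requested are re-evaluated
+			 * by rs_poll_rs() after the CQ event is read in
+			 * rs_poll_events().
+			 */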
+		} else {
+			rfds[i].fd = fds[i].fd;
+			rfds[i].events = fds[i].events;
+		}
+		rfds[i].revents = 0;
+	}
+	return 0;
+}
+
+static int rs_poll_events(struct pollfd *rfds, struct pollfd *fds, nfds_t nfds)
+{
+	struct rsocket *rs;
+	int i, cnt = 0;
+
+	for (i = 0; i < nfds; i++) {
+		rs = idm_lookup(&idm, fds[i].fd);
+		if (rs) {
+			if (rfds[i].revents) {
+				fastlock_acquire(&rs->cq_wait_lock);
+				if (rs->type == SOCK_STREAM)
+					rs_get_cq_event(rs);
+				else
+					ds_get_cq_event(rs);
+				fastlock_release(&rs->cq_wait_lock);
+			}
+			fds[i].revents = rs_poll_rs(rs, fds[i].events, 1, rs_poll_all);
+		} else {
+			fds[i].revents = rfds[i].revents;
+		}
+		if (fds[i].revents)
+			cnt++;
+	}
+	return cnt;
+}
+
+/*
+ * We need to poll *all* fds that the user specifies at least once.
+ * Note that we may receive events on an rsocket that may not be reported
+ * to the user (e.g. connection events or credit updates). Process those
+ * events, then return to polling until we find ones of interest.
+ */
+int rpoll(struct pollfd *fds, nfds_t nfds, int timeout)
+{
+	struct pollfd *rfds;
+	uint64_t start_time = 0;
+	uint32_t poll_time;
+	int pollsleep, ret;
+
+	do {
+		ret = rs_poll_check(fds, nfds);
+		if (ret || !timeout)
+			return ret;
+
+		if (!start_time)
+			start_time = rs_time_us();
+
+		poll_time = (uint32_t) (rs_time_us() - start_time);
+	} while (poll_time <= polling_time);
+
+	rfds = rs_fds_alloc(nfds);
+	if (!rfds)
+		return ERR(ENOMEM);
+
+	do {
+		ret = rs_poll_arm(rfds, fds, nfds);
+		if (ret)
+			break;
+
+		if (rs_poll_enter())
+			continue;
+
+		if (timeout >= 0) {
+			timeout -= (int) ((rs_time_us() - start_time) / 1000);
+			if (timeout <= 0) {
+				/* Balance the pollcnt increment taken in
+				 * rs_poll_enter() before returning.
+				 */
+				rs_poll_exit();
+				return 0;
+			}
+			pollsleep = min(timeout, wake_up_interval);
+		} else {
+			pollsleep = wake_up_interval;
+		}
+
+		ret = poll(rfds, nfds + 1, pollsleep);
+		if (ret < 0) {
+			rs_poll_exit();
+			break;
+		}
+
+		ret = rs_poll_events(rfds, fds, nfds);
+		rs_poll_stop();
+	} while (!ret);
+
+	return ret;
+}
+
+static struct pollfd *
+rs_select_to_poll(int *nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds)
+{
+	struct pollfd *fds;
+	int fd, i = 0;
+
+	fds = calloc(*nfds, sizeof(*fds));
+	if (!fds)
+		return NULL;
+
+	for (fd = 0; fd < *nfds; fd++) {
+		if (readfds && FD_ISSET(fd, readfds)) {
+			fds[i].fd = fd;
+			fds[i].events = POLLIN;
+		}
+
+		if (writefds && FD_ISSET(fd, writefds)) {
+			fds[i].fd = fd;
+			fds[i].events |= POLLOUT;
+		}
+
+		if (exceptfds && FD_ISSET(fd, exceptfds))
+			fds[i].fd = fd;
+
+		if (fds[i].fd)
+			i++;
+	}
+
+	*nfds = i;
+	return fds;
+}
+
+static int
+rs_poll_to_select(int nfds, struct pollfd *fds, fd_set *readfds,
+		  fd_set *writefds, fd_set *exceptfds)
+{
+	int i, cnt = 0;
+
+	for (i = 0; i < nfds; i++) {
+		if (readfds && (fds[i].revents & (POLLIN | POLLHUP))) {
+			FD_SET(fds[i].fd, readfds);
+			cnt++;
+		}
+
+		if (writefds && (fds[i].revents & POLLOUT)) {
+			FD_SET(fds[i].fd, writefds);
+			cnt++;
+		}
+
+		if (exceptfds && (fds[i].revents & ~(POLLIN | POLLOUT))) {
+			FD_SET(fds[i].fd, exceptfds);
+			cnt++;
+		}
+	}
+	return cnt;
+}
+
+static int rs_convert_timeout(struct timeval *timeout)
+{
+	return !timeout ?
-1 : + timeout->tv_sec * 1000 + timeout->tv_usec / 1000; +} + +int rselect(int nfds, fd_set *readfds, fd_set *writefds, + fd_set *exceptfds, struct timeval *timeout) +{ + struct pollfd *fds; + int ret; + + fds = rs_select_to_poll(&nfds, readfds, writefds, exceptfds); + if (!fds) + return ERR(ENOMEM); + + ret = rpoll(fds, nfds, rs_convert_timeout(timeout)); + + if (readfds) + FD_ZERO(readfds); + if (writefds) + FD_ZERO(writefds); + if (exceptfds) + FD_ZERO(exceptfds); + + if (ret > 0) + ret = rs_poll_to_select(nfds, fds, readfds, writefds, exceptfds); + + free(fds); + return ret; +} + +/* + * For graceful disconnect, notify the remote side that we're + * disconnecting and wait until all outstanding sends complete, provided + * that the remote side has not sent a disconnect message. + */ +int rshutdown(int socket, int how) +{ + struct rsocket *rs; + int ctrl, ret = 0; + + rs = idm_lookup(&idm, socket); + if (!rs) + return ERR(EBADF); + if (rs->opts & RS_OPT_KEEPALIVE) + rs_notify_svc(&tcp_svc, rs, RS_SVC_REM_KEEPALIVE); + + if (rs->fd_flags & O_NONBLOCK) + rs_set_nonblocking(rs, 0); + + if (rs->state & rs_connected) { + if (how == SHUT_RDWR) { + ctrl = RS_CTRL_DISCONNECT; + rs->state &= ~(rs_readable | rs_writable); + } else if (how == SHUT_WR) { + rs->state &= ~rs_writable; + ctrl = (rs->state & rs_readable) ? + RS_CTRL_SHUTDOWN : RS_CTRL_DISCONNECT; + } else { + rs->state &= ~rs_readable; + if (rs->state & rs_writable) + goto out; + ctrl = RS_CTRL_DISCONNECT; + } + if (!rs_ctrl_avail(rs)) { + ret = rs_process_cq(rs, 0, rs_conn_can_send_ctrl); + if (ret) + goto out; + } + + if ((rs->state & rs_connected) && rs_ctrl_avail(rs)) { + rs->ctrl_seqno++; + ret = rs_post_msg(rs, rs_msg_set(RS_OP_CTRL, ctrl)); + } + } + + if (rs->state & rs_connected) + rs_process_cq(rs, 0, rs_conn_all_sends_done); + +out: + if ((rs->fd_flags & O_NONBLOCK) && (rs->state & rs_connected)) + rs_set_nonblocking(rs, rs->fd_flags); + + if (rs->state & rs_disconnected) { + /* Generate event by flushing receives to unblock rpoll */ + ibv_req_notify_cq(rs->cm_id->recv_cq, 0); + ucma_shutdown(rs->cm_id); + } + + return ret; +} + +static void ds_shutdown(struct rsocket *rs) +{ + if (rs->opts & RS_OPT_UDP_SVC) + rs_notify_svc(&udp_svc, rs, RS_SVC_REM_DGRAM); + + if (rs->fd_flags & O_NONBLOCK) + rs_set_nonblocking(rs, 0); + + rs->state &= ~(rs_readable | rs_writable); + ds_process_cqs(rs, 0, ds_all_sends_done); + + if (rs->fd_flags & O_NONBLOCK) + rs_set_nonblocking(rs, rs->fd_flags); +} + +int rclose(int socket) +{ + struct rsocket *rs; + + rs = idm_lookup(&idm, socket); + if (!rs) + return EBADF; + if (rs->type == SOCK_STREAM) { + if (rs->state & rs_connected) + rshutdown(socket, SHUT_RDWR); + if (rs->opts & RS_OPT_KEEPALIVE) + rs_notify_svc(&tcp_svc, rs, RS_SVC_REM_KEEPALIVE); + if (rs->opts & RS_OPT_CM_SVC && rs->state == rs_listening) + rs_notify_svc(&listen_svc, rs, RS_SVC_REM_CM); + if (rs->opts & RS_OPT_CM_SVC) + rs_notify_svc(&connect_svc, rs, RS_SVC_REM_CM); + } else { + ds_shutdown(rs); + } + + rs_free(rs); + return 0; +} + +static void rs_copy_addr(struct sockaddr *dst, struct sockaddr *src, socklen_t *len) +{ + socklen_t size; + + if (src->sa_family == AF_INET) { + size = min_t(socklen_t, *len, sizeof(struct sockaddr_in)); + *len = sizeof(struct sockaddr_in); + } else { + size = min_t(socklen_t, *len, sizeof(struct sockaddr_in6)); + *len = sizeof(struct sockaddr_in6); + } + memcpy(dst, src, size); +} + +int rgetpeername(int socket, struct sockaddr *addr, socklen_t *addrlen) +{ + struct rsocket *rs; + + rs = 
idm_lookup(&idm, socket); + if (!rs) + return ERR(EBADF); + if (rs->type == SOCK_STREAM) { + rs_copy_addr(addr, rdma_get_peer_addr(rs->cm_id), addrlen); + return 0; + } else { + return getpeername(rs->udp_sock, addr, addrlen); + } +} + +int rgetsockname(int socket, struct sockaddr *addr, socklen_t *addrlen) +{ + struct rsocket *rs; + + rs = idm_lookup(&idm, socket); + if (!rs) + return ERR(EBADF); + if (rs->type == SOCK_STREAM) { + rs_copy_addr(addr, rdma_get_local_addr(rs->cm_id), addrlen); + return 0; + } else { + return getsockname(rs->udp_sock, addr, addrlen); + } +} + +static int rs_set_keepalive(struct rsocket *rs, int on) +{ + FILE *f; + int ret; + + if ((on && (rs->opts & RS_OPT_KEEPALIVE)) || + (!on && !(rs->opts & RS_OPT_KEEPALIVE))) + return 0; + + if (on) { + if (!rs->keepalive_time) { + if ((f = fopen("/proc/sys/net/ipv4/tcp_keepalive_time", "r"))) { + if (fscanf(f, "%u", &rs->keepalive_time) != 1) + rs->keepalive_time = 7200; + fclose(f); + } else { + rs->keepalive_time = 7200; + } + } + ret = rs_notify_svc(&tcp_svc, rs, RS_SVC_ADD_KEEPALIVE); + } else { + ret = rs_notify_svc(&tcp_svc, rs, RS_SVC_REM_KEEPALIVE); + } + + return ret; +} + +int rsetsockopt(int socket, int level, int optname, + const void *optval, socklen_t optlen) +{ + struct rsocket *rs; + int ret, opt_on = 0; + uint64_t *opts = NULL; + + ret = ERR(ENOTSUP); + rs = idm_lookup(&idm, socket); + if (!rs) + return ERR(EBADF); + if (rs->type == SOCK_DGRAM && level != SOL_RDMA) { + ret = setsockopt(rs->udp_sock, level, optname, optval, optlen); + if (ret) + return ret; + } + + switch (level) { + case SOL_SOCKET: + opts = &rs->so_opts; + switch (optname) { + case SO_REUSEADDR: + if (rs->type == SOCK_STREAM) { + ret = rdma_set_option(rs->cm_id, RDMA_OPTION_ID, + RDMA_OPTION_ID_REUSEADDR, + (void *) optval, optlen); + if (ret && ((errno == ENOSYS) || ((rs->state != rs_init) && + rs->cm_id->context && + (rs->cm_id->verbs->device->transport_type == IBV_TRANSPORT_IB)))) + ret = 0; + } + opt_on = *(int *) optval; + break; + case SO_RCVBUF: + if ((rs->type == SOCK_STREAM && !rs->rbuf) || + (rs->type == SOCK_DGRAM && !rs->qp_list)) + rs->rbuf_size = (*(uint32_t *) optval) << 1; + ret = 0; + break; + case SO_SNDBUF: + if (!rs->sbuf) + rs->sbuf_size = (*(uint32_t *) optval) << 1; + if (rs->sbuf_size < RS_SNDLOWAT) + rs->sbuf_size = RS_SNDLOWAT << 1; + ret = 0; + break; + case SO_LINGER: + /* Invert value so default so_opt = 0 is on */ + opt_on = !((struct linger *) optval)->l_onoff; + ret = 0; + break; + case SO_KEEPALIVE: + ret = rs_set_keepalive(rs, *(int *) optval); + opt_on = rs->opts & RS_OPT_KEEPALIVE; + break; + case SO_OOBINLINE: + opt_on = *(int *) optval; + ret = 0; + break; + default: + break; + } + break; + case IPPROTO_TCP: + opts = &rs->tcp_opts; + switch (optname) { + case TCP_KEEPCNT: + case TCP_KEEPINTVL: + ret = 0; /* N/A - we're using a reliable connection */ + break; + case TCP_KEEPIDLE: + if (*(int *) optval <= 0) { + ret = ERR(EINVAL); + break; + } + rs->keepalive_time = *(int *) optval; + ret = (rs->opts & RS_OPT_KEEPALIVE) ? 
+ rs_notify_svc(&tcp_svc, rs, RS_SVC_MOD_KEEPALIVE) : 0; + break; + case TCP_NODELAY: + opt_on = *(int *) optval; + ret = 0; + break; + case TCP_MAXSEG: + ret = 0; + break; + default: + break; + } + break; + case IPPROTO_IPV6: + opts = &rs->ipv6_opts; + switch (optname) { + case IPV6_V6ONLY: + if (rs->type == SOCK_STREAM) { + ret = rdma_set_option(rs->cm_id, RDMA_OPTION_ID, + RDMA_OPTION_ID_AFONLY, + (void *) optval, optlen); + } + opt_on = *(int *) optval; + break; + default: + break; + } + break; + case SOL_RDMA: + if (rs->state >= rs_opening) { + ret = ERR(EINVAL); + break; + } + + switch (optname) { + case RDMA_SQSIZE: + rs->sq_size = min_t(uint32_t, (*(uint32_t *)optval), + RS_QP_MAX_SIZE); + ret = 0; + break; + case RDMA_RQSIZE: + rs->rq_size = min_t(uint32_t, (*(uint32_t *)optval), + RS_QP_MAX_SIZE); + ret = 0; + break; + case RDMA_INLINE: + rs->sq_inline = min_t(uint32_t, *(uint32_t *)optval, + RS_QP_MAX_SIZE); + ret = 0; + break; + case RDMA_IOMAPSIZE: + rs->target_iomap_size = (uint16_t) rs_scale_to_value( + (uint8_t) rs_value_to_scale(*(int *) optval, 8), 8); + ret = 0; + break; + case RDMA_ROUTE: + if ((rs->optval = malloc(optlen))) { + memcpy(rs->optval, optval, optlen); + rs->optlen = optlen; + ret = 0; + } else { + ret = ERR(ENOMEM); + } + break; + default: + break; + } + break; + default: + break; + } + + if (!ret && opts) { + if (opt_on) + *opts |= (1 << optname); + else + *opts &= ~(1 << optname); + } + + return ret; +} + +static void rs_convert_sa_path(struct ibv_sa_path_rec *sa_path, + struct ibv_path_data *path_data) +{ + uint32_t fl_hop; + + memset(path_data, 0, sizeof(*path_data)); + path_data->path.dgid = sa_path->dgid; + path_data->path.sgid = sa_path->sgid; + path_data->path.dlid = sa_path->dlid; + path_data->path.slid = sa_path->slid; + fl_hop = be32toh(sa_path->flow_label) << 8; + path_data->path.flowlabel_hoplimit = htobe32(fl_hop | sa_path->hop_limit); + path_data->path.tclass = sa_path->traffic_class; + path_data->path.reversible_numpath = sa_path->reversible << 7 | 1; + path_data->path.pkey = sa_path->pkey; + path_data->path.qosclass_sl = htobe16(sa_path->sl); + path_data->path.mtu = sa_path->mtu | 2 << 6; /* exactly */ + path_data->path.rate = sa_path->rate | 2 << 6; + path_data->path.packetlifetime = sa_path->packet_life_time | 2 << 6; + path_data->flags= sa_path->preference; +} + +int rgetsockopt(int socket, int level, int optname, + void *optval, socklen_t *optlen) +{ + struct rsocket *rs; + void *opt; + struct ibv_sa_path_rec *path_rec; + struct ibv_path_data path_data; + socklen_t len; + int ret = 0; + int num_paths; + + rs = idm_lookup(&idm, socket); + if (!rs) + return ERR(EBADF); + switch (level) { + case SOL_SOCKET: + switch (optname) { + case SO_REUSEADDR: + case SO_KEEPALIVE: + case SO_OOBINLINE: + *((int *) optval) = !!(rs->so_opts & (1 << optname)); + *optlen = sizeof(int); + break; + case SO_RCVBUF: + *((int *) optval) = rs->rbuf_size; + *optlen = sizeof(int); + break; + case SO_SNDBUF: + *((int *) optval) = rs->sbuf_size; + *optlen = sizeof(int); + break; + case SO_LINGER: + /* Value is inverted so default so_opt = 0 is on */ + ((struct linger *) optval)->l_onoff = + !(rs->so_opts & (1 << optname)); + ((struct linger *) optval)->l_linger = 0; + *optlen = sizeof(struct linger); + break; + case SO_ERROR: + *((int *) optval) = rs->err; + *optlen = sizeof(int); + rs->err = 0; + break; + default: + ret = ENOTSUP; + break; + } + break; + case IPPROTO_TCP: + switch (optname) { + case TCP_KEEPCNT: + case TCP_KEEPINTVL: + *((int *) optval) = 1; /* N/A 
*/ + break; + case TCP_KEEPIDLE: + *((int *) optval) = (int) rs->keepalive_time; + *optlen = sizeof(int); + break; + case TCP_NODELAY: + *((int *) optval) = !!(rs->tcp_opts & (1 << optname)); + *optlen = sizeof(int); + break; + case TCP_MAXSEG: + *((int *) optval) = (rs->cm_id && rs->cm_id->route.num_paths) ? + 1 << (7 + rs->cm_id->route.path_rec->mtu) : + 2048; + *optlen = sizeof(int); + break; + default: + ret = ENOTSUP; + break; + } + break; + case IPPROTO_IPV6: + switch (optname) { + case IPV6_V6ONLY: + *((int *) optval) = !!(rs->ipv6_opts & (1 << optname)); + *optlen = sizeof(int); + break; + default: + ret = ENOTSUP; + break; + } + break; + case SOL_RDMA: + switch (optname) { + case RDMA_SQSIZE: + *((int *) optval) = rs->sq_size; + *optlen = sizeof(int); + break; + case RDMA_RQSIZE: + *((int *) optval) = rs->rq_size; + *optlen = sizeof(int); + break; + case RDMA_INLINE: + *((int *) optval) = rs->sq_inline; + *optlen = sizeof(int); + break; + case RDMA_IOMAPSIZE: + *((int *) optval) = rs->target_iomap_size; + *optlen = sizeof(int); + break; + case RDMA_ROUTE: + if (rs->optval) { + if (*optlen < rs->optlen) { + ret = EINVAL; + } else { + memcpy(rs->optval, optval, rs->optlen); + *optlen = rs->optlen; + } + } else { + if (*optlen < sizeof(path_data)) { + ret = EINVAL; + } else { + len = 0; + opt = optval; + path_rec = rs->cm_id->route.path_rec; + num_paths = 0; + while (len + sizeof(path_data) <= *optlen && + num_paths < rs->cm_id->route.num_paths) { + rs_convert_sa_path(path_rec, &path_data); + memcpy(opt, &path_data, sizeof(path_data)); + len += sizeof(path_data); + opt += sizeof(path_data); + path_rec++; + num_paths++; + } + *optlen = len; + ret = 0; + } + } + break; + default: + ret = ENOTSUP; + break; + } + break; + default: + ret = ENOTSUP; + break; + } + + return rdma_seterrno(ret); +} + +int rfcntl(int socket, int cmd, ... /* arg */ ) +{ + struct rsocket *rs; + va_list args; + int param; + int ret = 0; + + rs = idm_lookup(&idm, socket); + if (!rs) + return ERR(EBADF); + va_start(args, cmd); + switch (cmd) { + case F_GETFL: + ret = rs->fd_flags; + break; + case F_SETFL: + param = va_arg(args, int); + if ((rs->fd_flags & O_NONBLOCK) != (param & O_NONBLOCK)) + ret = rs_set_nonblocking(rs, param & O_NONBLOCK); + + if (!ret) + rs->fd_flags = param; + break; + default: + ret = ERR(ENOTSUP); + break; + } + va_end(args); + return ret; +} + +static struct rs_iomap_mr *rs_get_iomap_mr(struct rsocket *rs) +{ + int i; + + if (!rs->remote_iomappings) { + rs->remote_iomappings = calloc(rs->remote_iomap.length, + sizeof(*rs->remote_iomappings)); + if (!rs->remote_iomappings) + return NULL; + + for (i = 0; i < rs->remote_iomap.length; i++) + rs->remote_iomappings[i].index = i; + } + + for (i = 0; i < rs->remote_iomap.length; i++) { + if (!rs->remote_iomappings[i].mr) + return &rs->remote_iomappings[i]; + } + return NULL; +} + +/* + * If an offset is given, we map to it. If offset is -1, then we map the + * offset to the address of buf. We do not check for conflicts, which must + * be fixed at some point. 
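+ *
+ * Typical use, as a sketch: the owner of 'buf' calls
+ *	off = riomap(fd, buf, len, PROT_WRITE, 0, -1);
+ * and hands 'off' to its peer, which then calls
+ *	riowrite(fd, src, len, off, 0);
+ * to place data directly into 'buf' via RDMA writes.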
+ */ +off_t riomap(int socket, void *buf, size_t len, int prot, int flags, off_t offset) +{ + struct rsocket *rs; + struct rs_iomap_mr *iomr; + int access = IBV_ACCESS_LOCAL_WRITE; + + rs = idm_at(&idm, socket); + if (!rs) + return ERR(EBADF); + if (!rs->cm_id->pd || (prot & ~(PROT_WRITE | PROT_NONE))) + return ERR(EINVAL); + + fastlock_acquire(&rs->map_lock); + if (prot & PROT_WRITE) { + iomr = rs_get_iomap_mr(rs); + access |= IBV_ACCESS_REMOTE_WRITE; + } else { + iomr = calloc(1, sizeof(*iomr)); + iomr->index = -1; + } + if (!iomr) { + offset = ERR(ENOMEM); + goto out; + } + + iomr->mr = ibv_reg_mr(rs->cm_id->pd, buf, len, access); + if (!iomr->mr) { + if (iomr->index < 0) + free(iomr); + offset = -1; + goto out; + } + + if (offset == -1) + offset = (uintptr_t) buf; + iomr->offset = offset; + atomic_store(&iomr->refcnt, 1); + + if (iomr->index >= 0) { + dlist_insert_tail(&iomr->entry, &rs->iomap_queue); + rs->iomap_pending = 1; + } else { + dlist_insert_tail(&iomr->entry, &rs->iomap_list); + } +out: + fastlock_release(&rs->map_lock); + return offset; +} + +int riounmap(int socket, void *buf, size_t len) +{ + struct rsocket *rs; + struct rs_iomap_mr *iomr; + dlist_entry *entry; + int ret = 0; + + rs = idm_at(&idm, socket); + if (!rs) + return ERR(EBADF); + fastlock_acquire(&rs->map_lock); + + for (entry = rs->iomap_list.next; entry != &rs->iomap_list; + entry = entry->next) { + iomr = container_of(entry, struct rs_iomap_mr, entry); + if (iomr->mr->addr == buf && iomr->mr->length == len) { + rs_release_iomap_mr(iomr); + goto out; + } + } + + for (entry = rs->iomap_queue.next; entry != &rs->iomap_queue; + entry = entry->next) { + iomr = container_of(entry, struct rs_iomap_mr, entry); + if (iomr->mr->addr == buf && iomr->mr->length == len) { + rs_release_iomap_mr(iomr); + goto out; + } + } + ret = ERR(EINVAL); +out: + fastlock_release(&rs->map_lock); + return ret; +} + +static struct rs_iomap *rs_find_iomap(struct rsocket *rs, off_t offset) +{ + int i; + + for (i = 0; i < rs->target_iomap_size; i++) { + if (offset >= rs->target_iomap[i].offset && + offset < rs->target_iomap[i].offset + rs->target_iomap[i].sge.length) + return &rs->target_iomap[i]; + } + return NULL; +} + +size_t riowrite(int socket, const void *buf, size_t count, off_t offset, int flags) +{ + struct rsocket *rs; + struct rs_iomap *iom = NULL; + struct ibv_sge sge; + size_t left = count; + uint32_t xfer_size, olen = RS_OLAP_START_SIZE; + int ret = 0; + + rs = idm_at(&idm, socket); + if (!rs) + return ERR(EBADF); + fastlock_acquire(&rs->slock); + if (rs->iomap_pending) { + ret = rs_send_iomaps(rs, flags); + if (ret) + goto out; + } + for (; left; left -= xfer_size, buf += xfer_size, offset += xfer_size) { + if (!iom || offset > iom->offset + iom->sge.length) { + iom = rs_find_iomap(rs, offset); + if (!iom) + break; + } + + if (!rs_can_send(rs)) { + ret = rs_get_comp(rs, rs_nonblocking(rs, flags), + rs_conn_can_send); + if (ret) + break; + if (!(rs->state & rs_writable)) { + ret = ERR(ECONNRESET); + break; + } + } + + if (olen < left) { + xfer_size = olen; + if (olen < RS_MAX_TRANSFER) + olen <<= 1; + } else { + xfer_size = left; + } + + if (xfer_size > rs->sbuf_bytes_avail) + xfer_size = rs->sbuf_bytes_avail; + if (xfer_size > iom->offset + iom->sge.length - offset) + xfer_size = iom->offset + iom->sge.length - offset; + + if (xfer_size <= rs->sq_inline) { + sge.addr = (uintptr_t) buf; + sge.length = xfer_size; + sge.lkey = 0; + ret = rs_write_direct(rs, iom, offset, &sge, 1, + xfer_size, IBV_SEND_INLINE); + } else if 
(xfer_size <= rs_sbuf_left(rs)) { + memcpy((void *) (uintptr_t) rs->ssgl[0].addr, buf, xfer_size); + rs->ssgl[0].length = xfer_size; + ret = rs_write_direct(rs, iom, offset, rs->ssgl, 1, xfer_size, 0); + if (xfer_size < rs_sbuf_left(rs)) + rs->ssgl[0].addr += xfer_size; + else + rs->ssgl[0].addr = (uintptr_t) rs->sbuf; + } else { + rs->ssgl[0].length = rs_sbuf_left(rs); + memcpy((void *) (uintptr_t) rs->ssgl[0].addr, buf, + rs->ssgl[0].length); + rs->ssgl[1].length = xfer_size - rs->ssgl[0].length; + memcpy(rs->sbuf, buf + rs->ssgl[0].length, rs->ssgl[1].length); + ret = rs_write_direct(rs, iom, offset, rs->ssgl, 2, xfer_size, 0); + rs->ssgl[0].addr = (uintptr_t) rs->sbuf + rs->ssgl[1].length; + } + if (ret) + break; + } +out: + fastlock_release(&rs->slock); + + return (ret && left == count) ? ret : count - left; +} + +/**************************************************************************** + * Service Processing Threads + ****************************************************************************/ + +static int rs_svc_grow_sets(struct rs_svc *svc, int grow_size) +{ + struct rsocket **rss; + void *set, *contexts; + + set = calloc(svc->size + grow_size, sizeof(*rss) + svc->context_size); + if (!set) + return ENOMEM; + + svc->size += grow_size; + rss = set; + contexts = set + sizeof(*rss) * svc->size; + if (svc->cnt) { + memcpy(rss, svc->rss, sizeof(*rss) * (svc->cnt + 1)); + memcpy(contexts, svc->contexts, svc->context_size * (svc->cnt + 1)); + } + + free(svc->rss); + svc->rss = rss; + svc->contexts = contexts; + return 0; +} + +/* + * Index 0 is reserved for the service's communication socket. + */ +static int rs_svc_add_rs(struct rs_svc *svc, struct rsocket *rs) +{ + int ret; + + if (svc->cnt >= svc->size - 1) { + ret = rs_svc_grow_sets(svc, 4); + if (ret) + return ret; + } + + svc->rss[++svc->cnt] = rs; + return 0; +} + +static int rs_svc_index(struct rs_svc *svc, struct rsocket *rs) +{ + int i; + + for (i = 1; i <= svc->cnt; i++) { + if (svc->rss[i] == rs) + return i; + } + return -1; +} + +static int rs_svc_rm_rs(struct rs_svc *svc, struct rsocket *rs) +{ + int i; + + if ((i = rs_svc_index(svc, rs)) >= 0) { + svc->rss[i] = svc->rss[svc->cnt]; + memcpy(svc->contexts + i * svc->context_size, + svc->contexts + svc->cnt * svc->context_size, + svc->context_size); + svc->cnt--; + return 0; + } + return EBADF; +} + +static void udp_svc_process_sock(struct rs_svc *svc) +{ + struct rs_svc_msg msg; + + read_all(svc->sock[1], &msg, sizeof msg); + switch (msg.cmd) { + case RS_SVC_ADD_DGRAM: + msg.status = rs_svc_add_rs(svc, msg.rs); + if (!msg.status) { + msg.rs->opts |= RS_OPT_UDP_SVC; + udp_svc_fds = svc->contexts; + udp_svc_fds[svc->cnt].fd = msg.rs->udp_sock; + udp_svc_fds[svc->cnt].events = POLLIN; + udp_svc_fds[svc->cnt].revents = 0; + } + break; + case RS_SVC_REM_DGRAM: + msg.status = rs_svc_rm_rs(svc, msg.rs); + if (!msg.status) + msg.rs->opts &= ~RS_OPT_UDP_SVC; + break; + case RS_SVC_NOOP: + msg.status = 0; + break; + default: + break; + } + + write_all(svc->sock[1], &msg, sizeof msg); +} + +static uint8_t udp_svc_sgid_index(struct ds_dest *dest, union ibv_gid *sgid) +{ + union ibv_gid gid; + int i; + + for (i = 0; i < 16; i++) { + ibv_query_gid(dest->qp->cm_id->verbs, dest->qp->cm_id->port_num, + i, &gid); + if (!memcmp(sgid, &gid, sizeof gid)) + return i; + } + return 0; +} + +static uint8_t udp_svc_path_bits(struct ds_dest *dest) +{ + struct ibv_port_attr attr; + + if (!ibv_query_port(dest->qp->cm_id->verbs, dest->qp->cm_id->port_num, &attr)) + return (uint8_t) ((1 << attr.lmc) 
- 1); + return 0x7f; +} + +static void udp_svc_create_ah(struct rsocket *rs, struct ds_dest *dest, uint32_t qpn) +{ + union socket_addr saddr; + struct rdma_cm_id *id; + struct ibv_ah_attr attr; + int ret; + + if (dest->ah) { + fastlock_acquire(&rs->slock); + ibv_destroy_ah(dest->ah); + dest->ah = NULL; + fastlock_release(&rs->slock); + } + + ret = rdma_create_id(NULL, &id, NULL, dest->qp->cm_id->ps); + if (ret) + return; + + memcpy(&saddr, rdma_get_local_addr(dest->qp->cm_id), + ucma_addrlen(rdma_get_local_addr(dest->qp->cm_id))); + if (saddr.sa.sa_family == AF_INET) + saddr.sin.sin_port = 0; + else + saddr.sin6.sin6_port = 0; + ret = rdma_resolve_addr(id, &saddr.sa, &dest->addr.sa, 2000); + if (ret) + goto out; + + ret = rdma_resolve_route(id, 2000); + if (ret) + goto out; + + memset(&attr, 0, sizeof attr); + if (id->route.path_rec->hop_limit > 1) { + attr.is_global = 1; + attr.grh.dgid = id->route.path_rec->dgid; + attr.grh.flow_label = be32toh(id->route.path_rec->flow_label); + attr.grh.sgid_index = udp_svc_sgid_index(dest, &id->route.path_rec->sgid); + attr.grh.hop_limit = id->route.path_rec->hop_limit; + attr.grh.traffic_class = id->route.path_rec->traffic_class; + } + attr.dlid = be16toh(id->route.path_rec->dlid); + attr.sl = id->route.path_rec->sl; + attr.src_path_bits = be16toh(id->route.path_rec->slid) & udp_svc_path_bits(dest); + attr.static_rate = id->route.path_rec->rate; + attr.port_num = id->port_num; + + fastlock_acquire(&rs->slock); + dest->qpn = qpn; + dest->ah = ibv_create_ah(dest->qp->cm_id->pd, &attr); + fastlock_release(&rs->slock); +out: + rdma_destroy_id(id); +} + +static int udp_svc_valid_udp_hdr(struct ds_udp_header *udp_hdr, + union socket_addr *addr) +{ + return (udp_hdr->tag == htobe32(DS_UDP_TAG)) && + ((udp_hdr->version == 4 && addr->sa.sa_family == AF_INET && + udp_hdr->length == DS_UDP_IPV4_HDR_LEN) || + (udp_hdr->version == 6 && addr->sa.sa_family == AF_INET6 && + udp_hdr->length == DS_UDP_IPV6_HDR_LEN)); +} + +static void udp_svc_forward(struct rsocket *rs, void *buf, size_t len, + union socket_addr *src) +{ + struct ds_header hdr; + struct ds_smsg *msg; + struct ibv_sge sge; + uint64_t offset; + + if (!ds_can_send(rs)) { + if (ds_get_comp(rs, 0, ds_can_send)) + return; + } + + msg = rs->smsg_free; + rs->smsg_free = msg->next; + rs->sqe_avail--; + + ds_format_hdr(&hdr, src); + memcpy((void *) msg, &hdr, hdr.length); + memcpy((void *) msg + hdr.length, buf, len); + sge.addr = (uintptr_t) msg; + sge.length = hdr.length + len; + sge.lkey = rs->conn_dest->qp->smr->lkey; + offset = (uint8_t *) msg - rs->sbuf; + + ds_post_send(rs, &sge, offset); +} + +static void udp_svc_process_rs(struct rsocket *rs) +{ + static uint8_t buf[RS_SNDLOWAT]; + struct ds_dest *dest, *cur_dest; + struct ds_udp_header *udp_hdr; + union socket_addr addr; + socklen_t addrlen = sizeof addr; + int len, ret; + uint32_t qpn; + + ret = recvfrom(rs->udp_sock, buf, sizeof buf, 0, &addr.sa, &addrlen); + if (ret < DS_UDP_IPV4_HDR_LEN) + return; + + udp_hdr = (struct ds_udp_header *) buf; + if (!udp_svc_valid_udp_hdr(udp_hdr, &addr)) + return; + + len = ret - udp_hdr->length; + qpn = be32toh(udp_hdr->qpn) & 0xFFFFFF; + + udp_hdr->tag = (__force __be32)be32toh(udp_hdr->tag); + udp_hdr->qpn = (__force __be32)qpn; + + ret = ds_get_dest(rs, &addr.sa, addrlen, &dest); + if (ret) + return; + + if (udp_hdr->op == RS_OP_DATA) { + fastlock_acquire(&rs->slock); + cur_dest = rs->conn_dest; + rs->conn_dest = dest; + ds_send_udp(rs, NULL, 0, 0, RS_OP_CTRL); + rs->conn_dest = cur_dest; + 
fastlock_release(&rs->slock); + } + + if (!dest->ah || (dest->qpn != qpn)) + udp_svc_create_ah(rs, dest, qpn); + + /* to do: handle when dest local ip address doesn't match udp ip */ + if (udp_hdr->op == RS_OP_DATA) { + fastlock_acquire(&rs->slock); + cur_dest = rs->conn_dest; + rs->conn_dest = &dest->qp->dest; + udp_svc_forward(rs, buf + udp_hdr->length, len, &addr); + rs->conn_dest = cur_dest; + fastlock_release(&rs->slock); + } +} + +static void *udp_svc_run(void *arg) +{ + struct rs_svc *svc = arg; + struct rs_svc_msg msg; + int i, ret; + + ret = rs_svc_grow_sets(svc, 4); + if (ret) { + msg.status = ret; + write_all(svc->sock[1], &msg, sizeof msg); + return (void *) (uintptr_t) ret; + } + + udp_svc_fds = svc->contexts; + udp_svc_fds[0].fd = svc->sock[1]; + udp_svc_fds[0].events = POLLIN; + do { + for (i = 0; i <= svc->cnt; i++) + udp_svc_fds[i].revents = 0; + + poll(udp_svc_fds, svc->cnt + 1, -1); + if (udp_svc_fds[0].revents) + udp_svc_process_sock(svc); + + for (i = 1; i <= svc->cnt; i++) { + if (udp_svc_fds[i].revents) + udp_svc_process_rs(svc->rss[i]); + } + } while (svc->cnt >= 1); + + return NULL; +} + +static uint64_t rs_get_time(void) +{ + return rs_time_us() / 1000000; +} + +static void tcp_svc_process_sock(struct rs_svc *svc) +{ + struct rs_svc_msg msg; + int i; + + read_all(svc->sock[1], &msg, sizeof msg); + switch (msg.cmd) { + case RS_SVC_ADD_KEEPALIVE: + msg.status = rs_svc_add_rs(svc, msg.rs); + if (!msg.status) { + msg.rs->opts |= RS_OPT_KEEPALIVE; + tcp_svc_timeouts = svc->contexts; + tcp_svc_timeouts[svc->cnt] = rs_get_time() + + msg.rs->keepalive_time; + } + break; + case RS_SVC_REM_KEEPALIVE: + msg.status = rs_svc_rm_rs(svc, msg.rs); + if (!msg.status) + msg.rs->opts &= ~RS_OPT_KEEPALIVE; + break; + case RS_SVC_MOD_KEEPALIVE: + i = rs_svc_index(svc, msg.rs); + if (i >= 0) { + tcp_svc_timeouts[i] = rs_get_time() + msg.rs->keepalive_time; + msg.status = 0; + } else { + msg.status = EBADF; + } + break; + case RS_SVC_NOOP: + msg.status = 0; + break; + default: + break; + } + write_all(svc->sock[1], &msg, sizeof msg); +} + +/* + * Send a 0 byte RDMA write with immediate as keep-alive message. + * This avoids the need for the receive side to do any acknowledgment. 
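+ * The immediate data carries rs_msg_set(RS_OP_CTRL, RS_CTRL_KEEPALIVE),
+ * so the peer can recognize the message from the completion alone; no
+ * payload bytes are transferred.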
+ */ +static void tcp_svc_send_keepalive(struct rsocket *rs) +{ + fastlock_acquire(&rs->cq_lock); + if (rs_ctrl_avail(rs) && (rs->state & rs_connected)) { + rs->ctrl_seqno++; + rs_post_write(rs, NULL, 0, rs_msg_set(RS_OP_CTRL, RS_CTRL_KEEPALIVE), + 0, (uintptr_t) NULL, (uintptr_t) NULL); + } + fastlock_release(&rs->cq_lock); +} + +static void *tcp_svc_run(void *arg) +{ + struct rs_svc *svc = arg; + struct rs_svc_msg msg; + struct pollfd fds; + uint64_t now, next_timeout; + int i, ret, timeout; + + ret = rs_svc_grow_sets(svc, 16); + if (ret) { + msg.status = ret; + write_all(svc->sock[1], &msg, sizeof msg); + return (void *) (uintptr_t) ret; + } + + tcp_svc_timeouts = svc->contexts; + fds.fd = svc->sock[1]; + fds.events = POLLIN; + timeout = -1; + do { + poll(&fds, 1, timeout * 1000); + if (fds.revents) + tcp_svc_process_sock(svc); + + now = rs_get_time(); + next_timeout = ~0; + for (i = 1; i <= svc->cnt; i++) { + if (tcp_svc_timeouts[i] <= now) { + tcp_svc_send_keepalive(svc->rss[i]); + tcp_svc_timeouts[i] = + now + svc->rss[i]->keepalive_time; + } + if (tcp_svc_timeouts[i] < next_timeout) + next_timeout = tcp_svc_timeouts[i]; + } + timeout = (int) (next_timeout - now); + } while (svc->cnt >= 1); + + return NULL; +} + +static void rs_handle_cm_event(struct rsocket *rs) +{ + int ret; + + if (rs->state & rs_opening) { + rs_do_connect(rs); + } else { + ret = ucma_complete(rs->cm_id); + if (!ret && rs->cm_id->event && (rs->state & rs_connected) && + (rs->cm_id->event->event == RDMA_CM_EVENT_DISCONNECTED)) + rs->state = rs_disconnected; + } + + if (!(rs->state & rs_opening)) + rs_poll_signal(); +} + +static void cm_svc_process_sock(struct rs_svc *svc) +{ + struct rs_svc_msg msg; + struct pollfd *fds; + + read_all(svc->sock[1], &msg, sizeof(msg)); + switch (msg.cmd) { + case RS_SVC_ADD_CM: + msg.status = rs_svc_add_rs(svc, msg.rs); + if (!msg.status) { + msg.rs->opts |= RS_OPT_CM_SVC; + fds = svc->contexts; + fds[svc->cnt].fd = msg.rs->cm_id->channel->fd; + fds[svc->cnt].events = POLLIN; + fds[svc->cnt].revents = 0; + } + break; + case RS_SVC_REM_CM: + msg.status = rs_svc_rm_rs(svc, msg.rs); + if (!msg.status) + msg.rs->opts &= ~RS_OPT_CM_SVC; + break; + case RS_SVC_NOOP: + msg.status = 0; + break; + default: + break; + } + write_all(svc->sock[1], &msg, sizeof(msg)); +} + +static void *cm_svc_run(void *arg) +{ + struct rs_svc *svc = arg; + struct pollfd *fds; + struct rs_svc_msg msg; + int i, ret; + + ret = rs_svc_grow_sets(svc, 4); + if (ret) { + msg.status = ret; + write_all(svc->sock[1], &msg, sizeof(msg)); + return (void *) (uintptr_t) ret; + } + + fds = svc->contexts; + fds[0].fd = svc->sock[1]; + fds[0].events = POLLIN; + do { + for (i = 0; i <= svc->cnt; i++) + fds[i].revents = 0; + + poll(fds, svc->cnt + 1, -1); + if (fds[0].revents) + cm_svc_process_sock(svc); + + for (i = 1; i <= svc->cnt; i++) { + if (!fds[i].revents) + continue; + + if (svc == &listen_svc) + rs_accept(svc->rss[i]); + else + rs_handle_cm_event(svc->rss[i]); + } + } while (svc->cnt >= 1); + + return NULL; +} diff --git a/librdmacm/rsocket.h b/librdmacm/rsocket.h new file mode 100644 index 0000000..efd0db5 --- /dev/null +++ b/librdmacm/rsocket.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2011-2012 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(RSOCKET_H) +#define RSOCKET_H + +#include <infiniband/verbs.h> +#include <rdma/rdma_cma.h> +#include <sys/socket.h> +#include <errno.h> +#include <poll.h> +#include <sys/select.h> +#include <sys/mman.h> + +#ifdef __cplusplus +extern "C" { +#endif + +int rsocket(int domain, int type, int protocol); +int rbind(int socket, const struct sockaddr *addr, socklen_t addrlen); +int rlisten(int socket, int backlog); +int raccept(int socket, struct sockaddr *addr, socklen_t *addrlen); +int rconnect(int socket, const struct sockaddr *addr, socklen_t addrlen); +int rshutdown(int socket, int how); +int rclose(int socket); + +ssize_t rrecv(int socket, void *buf, size_t len, int flags); +ssize_t rrecvfrom(int socket, void *buf, size_t len, int flags, + struct sockaddr *src_addr, socklen_t *addrlen); +ssize_t rrecvmsg(int socket, struct msghdr *msg, int flags); +ssize_t rsend(int socket, const void *buf, size_t len, int flags); +ssize_t rsendto(int socket, const void *buf, size_t len, int flags, + const struct sockaddr *dest_addr, socklen_t addrlen); +ssize_t rsendmsg(int socket, const struct msghdr *msg, int flags); +ssize_t rread(int socket, void *buf, size_t count); +ssize_t rreadv(int socket, const struct iovec *iov, int iovcnt); +ssize_t rwrite(int socket, const void *buf, size_t count); +ssize_t rwritev(int socket, const struct iovec *iov, int iovcnt); + +int rpoll(struct pollfd *fds, nfds_t nfds, int timeout); +int rselect(int nfds, fd_set *readfds, fd_set *writefds, + fd_set *exceptfds, struct timeval *timeout); + +int rgetpeername(int socket, struct sockaddr *addr, socklen_t *addrlen); +int rgetsockname(int socket, struct sockaddr *addr, socklen_t *addrlen); + +#define SOL_RDMA 0x10000 +enum { + RDMA_SQSIZE, + RDMA_RQSIZE, + RDMA_INLINE, + RDMA_IOMAPSIZE, + RDMA_ROUTE +}; + +int rsetsockopt(int socket, int level, int optname, + const void *optval, socklen_t optlen); +int rgetsockopt(int socket, int level, int optname, + void *optval, socklen_t *optlen); +int rfcntl(int socket, int cmd, ... 
/* arg */ ); + +off_t riomap(int socket, void *buf, size_t len, int prot, int flags, off_t offset); +int riounmap(int socket, void *buf, size_t len); +size_t riowrite(int socket, const void *buf, size_t count, off_t offset, int flags); + +#ifdef __cplusplus +} +#endif + +#endif /* RSOCKET_H */ diff --git a/providers/bnxt_re/CMakeLists.txt b/providers/bnxt_re/CMakeLists.txt new file mode 100644 index 0000000..13ad287 --- /dev/null +++ b/providers/bnxt_re/CMakeLists.txt @@ -0,0 +1,6 @@ +rdma_provider(bnxt_re + db.c + main.c + memory.c + verbs.c +) diff --git a/providers/bnxt_re/bnxt_re-abi.h b/providers/bnxt_re/bnxt_re-abi.h new file mode 100644 index 0000000..c6998e8 --- /dev/null +++ b/providers/bnxt_re/bnxt_re-abi.h @@ -0,0 +1,321 @@ +/* + * Broadcom NetXtreme-E User Space RoCE driver + * + * Copyright (c) 2015-2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * Description: ABI data structure definition + */ + +#ifndef __BNXT_RE_ABI_H__ +#define __BNXT_RE_ABI_H__ + +#include <infiniband/kern-abi.h> +#include <rdma/bnxt_re-abi.h> +#include <kernel-abi/bnxt_re-abi.h> + +#define BNXT_RE_FULL_FLAG_DELTA 0x80 + +DECLARE_DRV_CMD(ubnxt_re_pd, IB_USER_VERBS_CMD_ALLOC_PD, + empty, bnxt_re_pd_resp); +DECLARE_DRV_CMD(ubnxt_re_cq, IB_USER_VERBS_CMD_CREATE_CQ, + bnxt_re_cq_req, bnxt_re_cq_resp); +DECLARE_DRV_CMD(ubnxt_re_qp, IB_USER_VERBS_CMD_CREATE_QP, + bnxt_re_qp_req, bnxt_re_qp_resp); +DECLARE_DRV_CMD(ubnxt_re_cntx, IB_USER_VERBS_CMD_GET_CONTEXT, + empty, bnxt_re_uctx_resp); +DECLARE_DRV_CMD(ubnxt_re_mr, IB_USER_VERBS_CMD_REG_MR, + empty, empty); +DECLARE_DRV_CMD(ubnxt_re_srq, IB_USER_VERBS_CMD_CREATE_SRQ, + bnxt_re_srq_req, bnxt_re_srq_resp); + +enum bnxt_re_wr_opcode { + BNXT_RE_WR_OPCD_SEND = 0x00, + BNXT_RE_WR_OPCD_SEND_IMM = 0x01, + BNXT_RE_WR_OPCD_SEND_INVAL = 0x02, + BNXT_RE_WR_OPCD_RDMA_WRITE = 0x04, + BNXT_RE_WR_OPCD_RDMA_WRITE_IMM = 0x05, + BNXT_RE_WR_OPCD_RDMA_READ = 0x06, + BNXT_RE_WR_OPCD_ATOMIC_CS = 0x08, + BNXT_RE_WR_OPCD_ATOMIC_FA = 0x0B, + BNXT_RE_WR_OPCD_LOC_INVAL = 0x0C, + BNXT_RE_WR_OPCD_BIND = 0x0E, + BNXT_RE_WR_OPCD_RECV = 0x80, + BNXT_RE_WR_OPCD_INVAL = 0xFF +}; + +enum bnxt_re_wr_flags { + BNXT_RE_WR_FLAGS_INLINE = 0x10, + BNXT_RE_WR_FLAGS_SE = 0x08, + BNXT_RE_WR_FLAGS_UC_FENCE = 0x04, + BNXT_RE_WR_FLAGS_RD_FENCE = 0x02, + BNXT_RE_WR_FLAGS_SIGNALED = 0x01 +}; + +enum bnxt_re_wc_type { + BNXT_RE_WC_TYPE_SEND = 0x00, + BNXT_RE_WC_TYPE_RECV_RC = 0x01, + BNXT_RE_WC_TYPE_RECV_UD = 0x02, + BNXT_RE_WC_TYPE_RECV_RAW = 0x03, + BNXT_RE_WC_TYPE_TERM = 0x0E, + BNXT_RE_WC_TYPE_COFF = 0x0F +}; + +enum bnxt_re_req_wc_status { + BNXT_RE_REQ_ST_OK = 0x00, + BNXT_RE_REQ_ST_BAD_RESP = 0x01, + BNXT_RE_REQ_ST_LOC_LEN = 0x02, + BNXT_RE_REQ_ST_LOC_QP_OP = 0x03, + BNXT_RE_REQ_ST_PROT = 0x04, + BNXT_RE_REQ_ST_MEM_OP = 0x05, + BNXT_RE_REQ_ST_REM_INVAL = 0x06, + BNXT_RE_REQ_ST_REM_ACC = 0x07, + BNXT_RE_REQ_ST_REM_OP = 0x08, + BNXT_RE_REQ_ST_RNR_NAK_XCED = 0x09, + BNXT_RE_REQ_ST_TRNSP_XCED = 0x0A, + BNXT_RE_REQ_ST_WR_FLUSH = 0x0B +}; + +enum bnxt_re_rsp_wc_status { + BNXT_RE_RSP_ST_OK = 0x00, + BNXT_RE_RSP_ST_LOC_ACC = 0x01, + BNXT_RE_RSP_ST_LOC_LEN = 0x02, + BNXT_RE_RSP_ST_LOC_PROT = 0x03, + BNXT_RE_RSP_ST_LOC_QP_OP = 0x04, + BNXT_RE_RSP_ST_MEM_OP = 0x05, + BNXT_RE_RSP_ST_REM_INVAL = 0x06, + BNXT_RE_RSP_ST_WR_FLUSH = 0x07, + BNXT_RE_RSP_ST_HW_FLUSH = 0x08 +}; + +enum bnxt_re_hdr_offset { + BNXT_RE_HDR_WT_MASK = 0xFF, + BNXT_RE_HDR_FLAGS_MASK = 0xFF, + BNXT_RE_HDR_FLAGS_SHIFT = 0x08, + BNXT_RE_HDR_WS_MASK = 0xFF, + BNXT_RE_HDR_WS_SHIFT = 0x10 +}; + +enum bnxt_re_db_que_type { + BNXT_RE_QUE_TYPE_SQ = 0x00, + BNXT_RE_QUE_TYPE_RQ = 0x01, + BNXT_RE_QUE_TYPE_SRQ = 0x02, + BNXT_RE_QUE_TYPE_SRQ_ARM = 0x03, + BNXT_RE_QUE_TYPE_CQ = 0x04, + BNXT_RE_QUE_TYPE_CQ_ARMSE = 0x05, + BNXT_RE_QUE_TYPE_CQ_ARMALL = 0x06, + BNXT_RE_QUE_TYPE_CQ_ARMENA = 0x07, + BNXT_RE_QUE_TYPE_SRQ_ARMENA = 0x08, + BNXT_RE_QUE_TYPE_CQ_CUT_ACK = 0x09, + BNXT_RE_QUE_TYPE_NULL = 0x0F +}; + +enum bnxt_re_db_mask { + BNXT_RE_DB_INDX_MASK = 0xFFFFFUL, + BNXT_RE_DB_QID_MASK = 0xFFFFFUL, + BNXT_RE_DB_TYP_MASK = 0x0FUL, + BNXT_RE_DB_TYP_SHIFT = 0x1C +}; + +enum bnxt_re_psns_mask { + BNXT_RE_PSNS_SPSN_MASK = 0xFFFFFF, + BNXT_RE_PSNS_OPCD_MASK = 0xFF, + BNXT_RE_PSNS_OPCD_SHIFT = 0x18, + BNXT_RE_PSNS_NPSN_MASK = 0xFFFFFF, + BNXT_RE_PSNS_FLAGS_MASK = 0xFF, + BNXT_RE_PSNS_FLAGS_SHIFT = 0x18 +}; + +enum bnxt_re_bcqe_mask { + BNXT_RE_BCQE_PH_MASK = 0x01, + BNXT_RE_BCQE_TYPE_MASK = 0x0F, + 
BNXT_RE_BCQE_TYPE_SHIFT = 0x01, + BNXT_RE_BCQE_STATUS_MASK = 0xFF, + BNXT_RE_BCQE_STATUS_SHIFT = 0x08, + BNXT_RE_BCQE_FLAGS_MASK = 0xFFFFU, + BNXT_RE_BCQE_FLAGS_SHIFT = 0x10, + BNXT_RE_BCQE_RWRID_MASK = 0xFFFFFU, + BNXT_RE_BCQE_SRCQP_MASK = 0xFF, + BNXT_RE_BCQE_SRCQP_SHIFT = 0x18 +}; + +enum bnxt_re_rc_flags_mask { + BNXT_RE_RC_FLAGS_SRQ_RQ_MASK = 0x01, + BNXT_RE_RC_FLAGS_IMM_MASK = 0x02, + BNXT_RE_RC_FLAGS_IMM_SHIFT = 0x01, + BNXT_RE_RC_FLAGS_INV_MASK = 0x04, + BNXT_RE_RC_FLAGS_INV_SHIFT = 0x02, + BNXT_RE_RC_FLAGS_RDMA_MASK = 0x08, + BNXT_RE_RC_FLAGS_RDMA_SHIFT = 0x03 +}; + +enum bnxt_re_ud_flags_mask { + BNXT_RE_UD_FLAGS_SRQ_RQ_MASK = 0x01, + BNXT_RE_UD_FLAGS_IMM_MASK = 0x02, + BNXT_RE_UD_FLAGS_HDR_TYP_MASK = 0x0C, + + BNXT_RE_UD_FLAGS_SRQ = 0x01, + BNXT_RE_UD_FLAGS_RQ = 0x00, + BNXT_RE_UD_FLAGS_ROCE = 0x00, + BNXT_RE_UD_FLAGS_ROCE_IPV4 = 0x02, + BNXT_RE_UD_FLAGS_ROCE_IPV6 = 0x03 +}; + +enum bnxt_re_ud_cqe_mask { + BNXT_RE_UD_CQE_MAC_MASK = 0xFFFFFFFFFFFFULL, + BNXT_RE_UD_CQE_SRCQPLO_MASK = 0xFFFF, + BNXT_RE_UD_CQE_SRCQPLO_SHIFT = 0x30 +}; + +struct bnxt_re_db_hdr { + __le32 indx; + __le32 typ_qid; /* typ: 4, qid:20*/ +}; + +struct bnxt_re_bcqe { + __le32 flg_st_typ_ph; + __le32 qphi_rwrid; +}; + +struct bnxt_re_req_cqe { + __le64 qp_handle; + __le32 con_indx; /* 16 bits valid. */ + __le32 rsvd1; + __le64 rsvd2; +}; + +struct bnxt_re_rc_cqe { + __le32 length; + __le32 imm_key; + __le64 qp_handle; + __le64 mr_handle; +}; + +struct bnxt_re_ud_cqe { + __le32 length; /* 14 bits */ + __le32 immd; + __le64 qp_handle; + __le64 qplo_mac; /* 16:48*/ +}; + +struct bnxt_re_term_cqe { + __le64 qp_handle; + __le32 rq_sq_cidx; + __le32 rsvd; + __le64 rsvd1; +}; + +struct bnxt_re_bsqe { + __le32 rsv_ws_fl_wt; + __le32 key_immd; +}; + +struct bnxt_re_psns { + __le32 opc_spsn; + __le32 flg_npsn; +}; + +struct bnxt_re_psns_ext { + __u32 opc_spsn; + __u32 flg_npsn; + __u16 st_slot_idx; + __u16 rsvd0; + __u32 rsvd1; +}; + +struct bnxt_re_sge { + __le64 pa; + __le32 lkey; + __le32 length; +}; + +/* Cu+ max inline data */ +#define BNXT_RE_MAX_INLINE_SIZE 0x60 + +struct bnxt_re_send { + __le32 length; + __le32 qkey; + __le32 dst_qp; + __le32 avid; + __le64 rsvd; +}; + +struct bnxt_re_raw { + __le32 length; + __le32 rsvd1; + __le32 cfa_meta; + __le32 rsvd2; + __le64 rsvd3; +}; + +struct bnxt_re_rdma { + __le32 length; + __le32 rsvd1; + __le64 rva; + __le32 rkey; + __le32 rsvd2; +}; + +struct bnxt_re_atomic { + __le64 rva; + __le64 swp_dt; + __le64 cmp_dt; +}; + +struct bnxt_re_inval { + __le64 rsvd[3]; +}; + +struct bnxt_re_bind { + __le32 plkey; + __le32 lkey; + __le64 va; + __le64 len; /* only 40 bits are valid */ +}; + +struct bnxt_re_brqe { + __le32 rsv_ws_fl_wt; + __le32 rsvd; +}; + +struct bnxt_re_rqe { + __le32 wrid; + __le32 rsvd1; + __le64 rsvd[2]; +}; + +struct bnxt_re_srqe { + __le32 srq_tag; /* 20 bits are valid */ + __le32 rsvd1; + __le64 rsvd[2]; +}; +#endif diff --git a/providers/bnxt_re/db.c b/providers/bnxt_re/db.c new file mode 100644 index 0000000..85da182 --- /dev/null +++ b/providers/bnxt_re/db.c @@ -0,0 +1,110 @@ +/* + * Broadcom NetXtreme-E User Space RoCE driver + * + * Copyright (c) 2015-2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. 
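The *_MASK/*_SHIFT enums above describe how the hardware packs several fields into single little-endian words. A sketch of the decode path (bnxt_re_decode_bcqe is a hypothetical helper, but it extracts the phase, type and status fields from bnxt_re_bcqe.flg_st_typ_ph the same way the driver itself does):

	#include <endian.h>
	#include <stdint.h>

	static void bnxt_re_decode_bcqe(const struct bnxt_re_bcqe *hdr,
					uint8_t *phase, uint8_t *type,
					uint8_t *status)
	{
		uint32_t word = le32toh(hdr->flg_st_typ_ph);

		*phase = word & BNXT_RE_BCQE_PH_MASK;		/* bit 0 */
		*type = (word >> BNXT_RE_BCQE_TYPE_SHIFT) &
			 BNXT_RE_BCQE_TYPE_MASK;		/* bits 4:1 */
		*status = (word >> BNXT_RE_BCQE_STATUS_SHIFT) &
			   BNXT_RE_BCQE_STATUS_MASK;		/* bits 15:8 */
	}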
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Description: Doorbell handling functions. + */ + +#include <util/mmio.h> +#include "main.h" + +static void bnxt_re_ring_db(struct bnxt_re_dpi *dpi, + struct bnxt_re_db_hdr *hdr) +{ + __le64 *dbval; + + dbval = (__le64 *)&hdr->indx; + mmio_wc_start(); + mmio_write64_le(dpi->dbpage, *dbval); + mmio_flush_writes(); +} + +static void bnxt_re_init_db_hdr(struct bnxt_re_db_hdr *hdr, uint32_t indx, + uint32_t qid, uint32_t typ) +{ + hdr->indx = htole32(indx & BNXT_RE_DB_INDX_MASK); + hdr->typ_qid = htole32(qid & BNXT_RE_DB_QID_MASK); + hdr->typ_qid |= htole32(((typ & BNXT_RE_DB_TYP_MASK) << + BNXT_RE_DB_TYP_SHIFT)); +} + +void bnxt_re_ring_rq_db(struct bnxt_re_qp *qp) +{ + struct bnxt_re_db_hdr hdr; + + bnxt_re_init_db_hdr(&hdr, qp->rqq->tail, qp->qpid, BNXT_RE_QUE_TYPE_RQ); + bnxt_re_ring_db(qp->udpi, &hdr); +} + +void bnxt_re_ring_sq_db(struct bnxt_re_qp *qp) +{ + struct bnxt_re_db_hdr hdr; + + bnxt_re_init_db_hdr(&hdr, qp->sqq->tail, qp->qpid, BNXT_RE_QUE_TYPE_SQ); + bnxt_re_ring_db(qp->udpi, &hdr); +} + +void bnxt_re_ring_srq_db(struct bnxt_re_srq *srq) +{ + struct bnxt_re_db_hdr hdr; + + bnxt_re_init_db_hdr(&hdr, srq->srqq->tail, srq->srqid, + BNXT_RE_QUE_TYPE_SRQ); + bnxt_re_ring_db(srq->udpi, &hdr); +} + +void bnxt_re_ring_srq_arm(struct bnxt_re_srq *srq) +{ + struct bnxt_re_db_hdr hdr; + + bnxt_re_init_db_hdr(&hdr, srq->cap.srq_limit, srq->srqid, + BNXT_RE_QUE_TYPE_SRQ_ARM); + bnxt_re_ring_db(srq->udpi, &hdr); +} + +void bnxt_re_ring_cq_db(struct bnxt_re_cq *cq) +{ + struct bnxt_re_db_hdr hdr; + + bnxt_re_init_db_hdr(&hdr, cq->cqq.head, cq->cqid, BNXT_RE_QUE_TYPE_CQ); + bnxt_re_ring_db(cq->udpi, &hdr); +} + +void bnxt_re_ring_cq_arm_db(struct bnxt_re_cq *cq, uint8_t aflag) +{ + struct bnxt_re_db_hdr hdr; + + bnxt_re_init_db_hdr(&hdr, cq->cqq.head, cq->cqid, aflag); + bnxt_re_ring_db(cq->udpi, &hdr); +} diff --git a/providers/bnxt_re/flush.h b/providers/bnxt_re/flush.h new file mode 100644 index 0000000..a39ea71 --- /dev/null +++ b/providers/bnxt_re/flush.h @@ -0,0 +1,85 @@ +/* + * Broadcom 
NetXtreme-E User Space RoCE driver + * + * Copyright (c) 2015-2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Description: A few wrappers for flush queue management + */ + +#ifndef __FLUSH_H__ +#define __FLUSH_H__ + +#include <ccan/list.h> + +struct bnxt_re_fque_node { + uint8_t valid; + struct list_node list; +}; + +static inline void fque_init_node(struct bnxt_re_fque_node *node) +{ + list_node_init(&node->list); + node->valid = false; +} + +static inline void fque_add_node_tail(struct list_head *head, + struct bnxt_re_fque_node *new) +{ + list_add_tail(head, &new->list); + new->valid = true; +} + +static inline void fque_del_node(struct bnxt_re_fque_node *entry) +{ + entry->valid = false; + list_del(&entry->list); +} + +static inline uint8_t _fque_node_valid(struct bnxt_re_fque_node *node) +{ + return node->valid; +} + +static inline void bnxt_re_fque_add_node(struct list_head *head, + struct bnxt_re_fque_node *node) +{ + if (!_fque_node_valid(node)) + fque_add_node_tail(head, node); +} + +static inline void bnxt_re_fque_del_node(struct bnxt_re_fque_node *node) +{ + if (_fque_node_valid(node)) + fque_del_node(node); +} +#endif /* __FLUSH_H__ */ diff --git a/providers/bnxt_re/main.c b/providers/bnxt_re/main.c new file mode 100644 index 0000000..8893673 --- /dev/null +++ b/providers/bnxt_re/main.c @@ -0,0 +1,225 @@ +/* + * Broadcom NetXtreme-E User Space RoCE driver + * + * Copyright (c) 2015-2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. 
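The flush-queue wrappers in flush.h above use the valid flag to make add and delete idempotent, so a QP that produces several error CQEs lands on a CQ's flush list exactly once. A usage sketch (fque_usage_sketch is hypothetical; in verbs.c the context's fqlock is held around these calls):

	#include "flush.h"

	static void fque_usage_sketch(void)
	{
		struct list_head flushq;
		struct bnxt_re_fque_node node;

		list_head_init(&flushq);
		fque_init_node(&node);

		bnxt_re_fque_add_node(&flushq, &node); /* queued, valid = true */
		bnxt_re_fque_add_node(&flushq, &node); /* no-op: already queued */
		bnxt_re_fque_del_node(&node);	/* removed, valid = false */
		bnxt_re_fque_del_node(&node);	/* no-op: already removed */
	}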
You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS''
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Description: Device detection and initialization
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/mman.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include "main.h"
+#include "verbs.h"
+
+static void bnxt_re_free_context(struct ibv_context *ibvctx);
+
+#define PCI_VENDOR_ID_BROADCOM 0x14E4
+
+#define CNA(v, d) VERBS_PCI_MATCH(PCI_VENDOR_ID_##v, d, NULL)
+static const struct verbs_match_ent cna_table[] = {
+	VERBS_DRIVER_ID(RDMA_DRIVER_BNXT_RE),
+	CNA(BROADCOM, 0x1605),	/* BCM57454 NPAR */
+	CNA(BROADCOM, 0x1606),	/* BCM57454 VF */
+	CNA(BROADCOM, 0x1614),	/* BCM57454 */
+	CNA(BROADCOM, 0x16C0),	/* BCM57417 NPAR */
+	CNA(BROADCOM, 0x16C1),	/* BCM57414 VF */
+	CNA(BROADCOM, 0x16CE),	/* BCM57311 */
+	CNA(BROADCOM, 0x16CF),	/* BCM57312 */
+	CNA(BROADCOM, 0x16D6),	/* BCM57412 */
+	CNA(BROADCOM, 0x16D7),	/* BCM57414 */
+	CNA(BROADCOM, 0x16D8),	/* BCM57416 Cu */
+	CNA(BROADCOM, 0x16D9),	/* BCM57417 Cu */
+	CNA(BROADCOM, 0x16DF),	/* BCM57314 */
+	CNA(BROADCOM, 0x16E2),	/* BCM57417 */
+	CNA(BROADCOM, 0x16E3),	/* BCM57416 */
+	CNA(BROADCOM, 0x16E5),	/* BCM57314 VF */
+	CNA(BROADCOM, 0x16ED),	/* BCM57414 NPAR */
+	CNA(BROADCOM, 0x16EB),	/* BCM57412 NPAR */
+	CNA(BROADCOM, 0x16EF),	/* BCM57416 NPAR */
+	CNA(BROADCOM, 0x16F0),	/* BCM58730 */
+	CNA(BROADCOM, 0x16F1),	/* BCM57452 */
+	CNA(BROADCOM, 0x1750),	/* BCM57508 */
+	CNA(BROADCOM, 0x1751),	/* BCM57504 */
+	CNA(BROADCOM, 0x1752),	/* BCM57502 */
+	CNA(BROADCOM, 0x1803),	/* BCM57508 NPAR */
+	CNA(BROADCOM, 0x1804),	/* BCM57504 NPAR */
+	CNA(BROADCOM, 0x1805),	/* BCM57502 NPAR */
+	CNA(BROADCOM, 0x1807),	/* BCM5750x VF */
+	CNA(BROADCOM, 0xD800),	/* BCM880xx VF */
+	CNA(BROADCOM, 0xD802),	/* BCM58802 */
+	CNA(BROADCOM, 0xD804),	/* BCM8804 SR */
+	{}
+};
+
+static const struct verbs_context_ops bnxt_re_cntx_ops = {
+	.query_device = bnxt_re_query_device,
+	.query_port = bnxt_re_query_port,
+	.alloc_pd = bnxt_re_alloc_pd,
+	.dealloc_pd
= bnxt_re_free_pd, + .reg_mr = bnxt_re_reg_mr, + .dereg_mr = bnxt_re_dereg_mr, + .create_cq = bnxt_re_create_cq, + .poll_cq = bnxt_re_poll_cq, + .req_notify_cq = bnxt_re_arm_cq, + .cq_event = bnxt_re_cq_event, + .resize_cq = bnxt_re_resize_cq, + .destroy_cq = bnxt_re_destroy_cq, + .create_srq = bnxt_re_create_srq, + .modify_srq = bnxt_re_modify_srq, + .query_srq = bnxt_re_query_srq, + .destroy_srq = bnxt_re_destroy_srq, + .post_srq_recv = bnxt_re_post_srq_recv, + .create_qp = bnxt_re_create_qp, + .query_qp = bnxt_re_query_qp, + .modify_qp = bnxt_re_modify_qp, + .destroy_qp = bnxt_re_destroy_qp, + .post_send = bnxt_re_post_send, + .post_recv = bnxt_re_post_recv, + .create_ah = bnxt_re_create_ah, + .destroy_ah = bnxt_re_destroy_ah, + .free_context = bnxt_re_free_context, +}; + +bool bnxt_re_is_chip_gen_p5(struct bnxt_re_chip_ctx *cctx) +{ + return (cctx->chip_num == CHIP_NUM_57508 || + cctx->chip_num == CHIP_NUM_57504 || + cctx->chip_num == CHIP_NUM_57502); +} + +/* Context Init functions */ +static struct verbs_context *bnxt_re_alloc_context(struct ibv_device *vdev, + int cmd_fd, + void *private_data) +{ + struct ibv_get_context cmd; + struct ubnxt_re_cntx_resp resp; + struct bnxt_re_dev *dev = to_bnxt_re_dev(vdev); + struct bnxt_re_context *cntx; + + cntx = verbs_init_and_alloc_context(vdev, cmd_fd, cntx, ibvctx, + RDMA_DRIVER_BNXT_RE); + if (!cntx) + return NULL; + + memset(&resp, 0, sizeof(resp)); + if (ibv_cmd_get_context(&cntx->ibvctx, &cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp))) + goto failed; + + cntx->dev_id = resp.dev_id; + cntx->max_qp = resp.max_qp; + dev->pg_size = resp.pg_size; + dev->cqe_size = resp.cqe_sz; + dev->max_cq_depth = resp.max_cqd; + if (resp.comp_mask & BNXT_RE_UCNTX_CMASK_HAVE_CCTX) { + cntx->cctx.chip_num = resp.chip_id0 & 0xFFFF; + cntx->cctx.chip_rev = (resp.chip_id0 >> + BNXT_RE_CHIP_ID0_CHIP_REV_SFT) & 0xFF; + cntx->cctx.chip_metal = (resp.chip_id0 >> + BNXT_RE_CHIP_ID0_CHIP_MET_SFT) & + 0xFF; + } + pthread_spin_init(&cntx->fqlock, PTHREAD_PROCESS_PRIVATE); + /* mmap shared page. */ + cntx->shpg = mmap(NULL, dev->pg_size, PROT_READ | PROT_WRITE, + MAP_SHARED, cmd_fd, 0); + if (cntx->shpg == MAP_FAILED) { + cntx->shpg = NULL; + goto failed; + } + pthread_mutex_init(&cntx->shlock, NULL); + + verbs_set_ops(&cntx->ibvctx, &bnxt_re_cntx_ops); + + return &cntx->ibvctx; + +failed: + verbs_uninit_context(&cntx->ibvctx); + free(cntx); + return NULL; +} + +static void bnxt_re_free_context(struct ibv_context *ibvctx) +{ + struct bnxt_re_context *cntx = to_bnxt_re_context(ibvctx); + struct bnxt_re_dev *dev = to_bnxt_re_dev(ibvctx->device); + + /* Unmap if anything device specific was mapped in init_context. */ + pthread_mutex_destroy(&cntx->shlock); + if (cntx->shpg) + munmap(cntx->shpg, dev->pg_size); + pthread_spin_destroy(&cntx->fqlock); + + /* Un-map DPI only for the first PD that was + * allocated in this context. 
+ */ + if (cntx->udpi.dbpage && cntx->udpi.dbpage != MAP_FAILED) { + munmap(cntx->udpi.dbpage, dev->pg_size); + cntx->udpi.dbpage = NULL; + } + + verbs_uninit_context(&cntx->ibvctx); + free(cntx); +} + +static struct verbs_device * +bnxt_re_device_alloc(struct verbs_sysfs_dev *sysfs_dev) +{ + struct bnxt_re_dev *dev; + + dev = calloc(1, sizeof(*dev)); + if (!dev) + return NULL; + + return &dev->vdev; +} + +static const struct verbs_device_ops bnxt_re_dev_ops = { + .name = "bnxt_re", + .match_min_abi_version = BNXT_RE_ABI_VERSION, + .match_max_abi_version = BNXT_RE_ABI_VERSION, + .match_table = cna_table, + .alloc_device = bnxt_re_device_alloc, + .alloc_context = bnxt_re_alloc_context, +}; +PROVIDER_DRIVER(bnxt_re, bnxt_re_dev_ops); diff --git a/providers/bnxt_re/main.h b/providers/bnxt_re/main.h new file mode 100644 index 0000000..368297e --- /dev/null +++ b/providers/bnxt_re/main.h @@ -0,0 +1,428 @@ +/* + * Broadcom NetXtreme-E User Space RoCE driver + * + * Copyright (c) 2015-2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
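PROVIDER_DRIVER() above registers bnxt_re with libibverbs at load time, so an ordinary verbs application is all it takes to exercise bnxt_re_alloc_context() and bnxt_re_free_context(). A minimal sketch using only the public verbs API:

	#include <stdio.h>
	#include <infiniband/verbs.h>

	int main(void)
	{
		int num;
		struct ibv_device **list = ibv_get_device_list(&num);
		struct ibv_context *ctx;

		if (!list || num <= 0)
			return 1;
		ctx = ibv_open_device(list[0]);	/* reaches the provider's alloc_context */
		if (ctx) {
			printf("opened %s\n", ibv_get_device_name(list[0]));
			ibv_close_device(ctx);	/* reaches free_context */
		}
		ibv_free_device_list(list);
		return 0;
	}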
+ * + * Description: Basic device data structures needed for book-keeping + */ + +#ifndef __MAIN_H__ +#define __MAIN_H__ + +#include <inttypes.h> +#include <stdbool.h> +#include <stddef.h> +#include <endian.h> +#include <pthread.h> + +#include <infiniband/driver.h> +#include <util/udma_barrier.h> + +#include "bnxt_re-abi.h" +#include "memory.h" +#include "flush.h" + +#define DEV "bnxt_re : " + +#define BNXT_RE_UD_QP_HW_STALL 0x400000 + +#define CHIP_NUM_57508 0x1750 +#define CHIP_NUM_57504 0x1751 +#define CHIP_NUM_57502 0x1752 + +struct bnxt_re_chip_ctx { + __u16 chip_num; + __u8 chip_rev; + __u8 chip_metal; +}; + +struct bnxt_re_dpi { + __u32 dpindx; + __u64 *dbpage; +}; + +struct bnxt_re_pd { + struct ibv_pd ibvpd; + uint32_t pdid; +}; + +struct bnxt_re_cq { + struct ibv_cq ibvcq; + uint32_t cqid; + struct bnxt_re_queue cqq; + struct bnxt_re_dpi *udpi; + struct list_head sfhead; + struct list_head rfhead; + uint32_t cqe_size; + uint8_t phase; + int deferred_arm_flags; + bool first_arm; + bool deferred_arm; +}; + +struct bnxt_re_wrid { + struct bnxt_re_psns_ext *psns_ext; + struct bnxt_re_psns *psns; + uint64_t wrid; + uint32_t bytes; + int next_idx; + uint8_t sig; +}; + +struct bnxt_re_qpcap { + uint32_t max_swr; + uint32_t max_rwr; + uint32_t max_ssge; + uint32_t max_rsge; + uint32_t max_inline; + uint8_t sqsig; +}; + +struct bnxt_re_srq { + struct ibv_srq ibvsrq; + struct ibv_srq_attr cap; + struct bnxt_re_queue *srqq; + struct bnxt_re_wrid *srwrid; + struct bnxt_re_dpi *udpi; + uint32_t srqid; + int start_idx; + int last_idx; + bool arm_req; +}; + +struct bnxt_re_qp { + struct ibv_qp ibvqp; + struct bnxt_re_chip_ctx *cctx; + struct bnxt_re_queue *sqq; + struct bnxt_re_wrid *swrid; + struct bnxt_re_queue *rqq; + struct bnxt_re_wrid *rwrid; + struct bnxt_re_srq *srq; + struct bnxt_re_cq *scq; + struct bnxt_re_cq *rcq; + struct bnxt_re_dpi *udpi; + struct bnxt_re_qpcap cap; + struct bnxt_re_fque_node snode; + struct bnxt_re_fque_node rnode; + uint32_t qpid; + uint32_t tbl_indx; + uint32_t sq_psn; + uint32_t pending_db; + uint64_t wqe_cnt; + uint16_t mtu; + uint16_t qpst; + uint8_t qptyp; + /* irdord? 
*/ +}; + +struct bnxt_re_mr { + struct verbs_mr vmr; +}; + +struct bnxt_re_ah { + struct ibv_ah ibvah; + uint32_t avid; +}; + +struct bnxt_re_dev { + struct verbs_device vdev; + uint8_t abi_version; + uint32_t pg_size; + + uint32_t cqe_size; + uint32_t max_cq_depth; +}; + +struct bnxt_re_context { + struct verbs_context ibvctx; + uint32_t dev_id; + uint32_t max_qp; + struct bnxt_re_chip_ctx cctx; + uint32_t max_srq; + struct bnxt_re_dpi udpi; + void *shpg; + pthread_mutex_t shlock; + pthread_spinlock_t fqlock; +}; + +/* Chip context related functions */ +bool bnxt_re_is_chip_gen_p5(struct bnxt_re_chip_ctx *cctx); + +/* DB ring functions used internally*/ +void bnxt_re_ring_rq_db(struct bnxt_re_qp *qp); +void bnxt_re_ring_sq_db(struct bnxt_re_qp *qp); +void bnxt_re_ring_srq_arm(struct bnxt_re_srq *srq); +void bnxt_re_ring_srq_db(struct bnxt_re_srq *srq); +void bnxt_re_ring_cq_db(struct bnxt_re_cq *cq); +void bnxt_re_ring_cq_arm_db(struct bnxt_re_cq *cq, uint8_t aflag); + +/* pointer conversion functions*/ +static inline struct bnxt_re_dev *to_bnxt_re_dev(struct ibv_device *ibvdev) +{ + return container_of(ibvdev, struct bnxt_re_dev, vdev.device); +} + +static inline struct bnxt_re_context *to_bnxt_re_context( + struct ibv_context *ibvctx) +{ + return container_of(ibvctx, struct bnxt_re_context, ibvctx.context); +} + +static inline struct bnxt_re_pd *to_bnxt_re_pd(struct ibv_pd *ibvpd) +{ + return container_of(ibvpd, struct bnxt_re_pd, ibvpd); +} + +static inline struct bnxt_re_cq *to_bnxt_re_cq(struct ibv_cq *ibvcq) +{ + return container_of(ibvcq, struct bnxt_re_cq, ibvcq); +} + +static inline struct bnxt_re_qp *to_bnxt_re_qp(struct ibv_qp *ibvqp) +{ + return container_of(ibvqp, struct bnxt_re_qp, ibvqp); +} + +static inline struct bnxt_re_srq *to_bnxt_re_srq(struct ibv_srq *ibvsrq) +{ + return container_of(ibvsrq, struct bnxt_re_srq, ibvsrq); +} + +static inline struct bnxt_re_ah *to_bnxt_re_ah(struct ibv_ah *ibvah) +{ + return container_of(ibvah, struct bnxt_re_ah, ibvah); +} + +static inline uint32_t bnxt_re_get_sqe_sz(void) +{ + return sizeof(struct bnxt_re_bsqe) + + sizeof(struct bnxt_re_send) + + BNXT_RE_MAX_INLINE_SIZE; +} + +static inline uint32_t bnxt_re_get_sqe_hdr_sz(void) +{ + return sizeof(struct bnxt_re_bsqe) + sizeof(struct bnxt_re_send); +} + +static inline uint32_t bnxt_re_get_rqe_sz(void) +{ + return sizeof(struct bnxt_re_brqe) + + sizeof(struct bnxt_re_rqe) + + BNXT_RE_MAX_INLINE_SIZE; +} + +static inline uint32_t bnxt_re_get_rqe_hdr_sz(void) +{ + return sizeof(struct bnxt_re_brqe) + sizeof(struct bnxt_re_rqe); +} + +static inline uint32_t bnxt_re_get_srqe_sz(void) +{ + return sizeof(struct bnxt_re_brqe) + + sizeof(struct bnxt_re_srqe) + + BNXT_RE_MAX_INLINE_SIZE; +} + +static inline uint32_t bnxt_re_get_srqe_hdr_sz(void) +{ + return sizeof(struct bnxt_re_brqe) + sizeof(struct bnxt_re_srqe); +} + +static inline uint32_t bnxt_re_get_cqe_sz(void) +{ + return sizeof(struct bnxt_re_req_cqe) + sizeof(struct bnxt_re_bcqe); +} + +static inline uint8_t bnxt_re_ibv_to_bnxt_wr_opcd(uint8_t ibv_opcd) +{ + uint8_t bnxt_opcd; + + switch (ibv_opcd) { + case IBV_WR_SEND: + bnxt_opcd = BNXT_RE_WR_OPCD_SEND; + break; + case IBV_WR_SEND_WITH_IMM: + bnxt_opcd = BNXT_RE_WR_OPCD_SEND_IMM; + break; + case IBV_WR_RDMA_WRITE: + bnxt_opcd = BNXT_RE_WR_OPCD_RDMA_WRITE; + break; + case IBV_WR_RDMA_WRITE_WITH_IMM: + bnxt_opcd = BNXT_RE_WR_OPCD_RDMA_WRITE_IMM; + break; + case IBV_WR_RDMA_READ: + bnxt_opcd = BNXT_RE_WR_OPCD_RDMA_READ; + break; + case IBV_WR_ATOMIC_CMP_AND_SWP: + bnxt_opcd = 
BNXT_RE_WR_OPCD_ATOMIC_CS; + break; + case IBV_WR_ATOMIC_FETCH_AND_ADD: + bnxt_opcd = BNXT_RE_WR_OPCD_ATOMIC_FA; + break; + /* TODO: Add other opcodes */ + default: + bnxt_opcd = BNXT_RE_WR_OPCD_INVAL; + break; + }; + + return bnxt_opcd; +} + +static inline uint8_t bnxt_re_ibv_wr_to_wc_opcd(uint8_t wr_opcd) +{ + uint8_t wc_opcd; + + switch (wr_opcd) { + case IBV_WR_SEND_WITH_IMM: + case IBV_WR_SEND: + wc_opcd = IBV_WC_SEND; + break; + case IBV_WR_RDMA_WRITE_WITH_IMM: + case IBV_WR_RDMA_WRITE: + wc_opcd = IBV_WC_RDMA_WRITE; + break; + case IBV_WR_RDMA_READ: + wc_opcd = IBV_WC_RDMA_READ; + break; + case IBV_WR_ATOMIC_CMP_AND_SWP: + wc_opcd = IBV_WC_COMP_SWAP; + break; + case IBV_WR_ATOMIC_FETCH_AND_ADD: + wc_opcd = IBV_WC_FETCH_ADD; + break; + default: + wc_opcd = 0xFF; + break; + } + + return wc_opcd; +} + +static inline uint8_t bnxt_re_to_ibv_wc_status(uint8_t bnxt_wcst, + uint8_t is_req) +{ + uint8_t ibv_wcst; + + if (is_req) { + switch (bnxt_wcst) { + case BNXT_RE_REQ_ST_BAD_RESP: + ibv_wcst = IBV_WC_BAD_RESP_ERR; + break; + case BNXT_RE_REQ_ST_LOC_LEN: + ibv_wcst = IBV_WC_LOC_LEN_ERR; + break; + case BNXT_RE_REQ_ST_LOC_QP_OP: + ibv_wcst = IBV_WC_LOC_QP_OP_ERR; + break; + case BNXT_RE_REQ_ST_PROT: + ibv_wcst = IBV_WC_LOC_PROT_ERR; + break; + case BNXT_RE_REQ_ST_MEM_OP: + ibv_wcst = IBV_WC_MW_BIND_ERR; + break; + case BNXT_RE_REQ_ST_REM_INVAL: + ibv_wcst = IBV_WC_REM_INV_REQ_ERR; + break; + case BNXT_RE_REQ_ST_REM_ACC: + ibv_wcst = IBV_WC_REM_ACCESS_ERR; + break; + case BNXT_RE_REQ_ST_REM_OP: + ibv_wcst = IBV_WC_REM_OP_ERR; + break; + case BNXT_RE_REQ_ST_RNR_NAK_XCED: + ibv_wcst = IBV_WC_RNR_RETRY_EXC_ERR; + break; + case BNXT_RE_REQ_ST_TRNSP_XCED: + ibv_wcst = IBV_WC_RETRY_EXC_ERR; + break; + case BNXT_RE_REQ_ST_WR_FLUSH: + ibv_wcst = IBV_WC_WR_FLUSH_ERR; + break; + default: + ibv_wcst = IBV_WC_GENERAL_ERR; + break; + } + } else { + switch (bnxt_wcst) { + case BNXT_RE_RSP_ST_LOC_ACC: + ibv_wcst = IBV_WC_LOC_ACCESS_ERR; + break; + case BNXT_RE_RSP_ST_LOC_LEN: + ibv_wcst = IBV_WC_LOC_LEN_ERR; + break; + case BNXT_RE_RSP_ST_LOC_PROT: + ibv_wcst = IBV_WC_LOC_PROT_ERR; + break; + case BNXT_RE_RSP_ST_LOC_QP_OP: + ibv_wcst = IBV_WC_LOC_QP_OP_ERR; + break; + case BNXT_RE_RSP_ST_MEM_OP: + ibv_wcst = IBV_WC_MW_BIND_ERR; + break; + case BNXT_RE_RSP_ST_REM_INVAL: + ibv_wcst = IBV_WC_REM_INV_REQ_ERR; + break; + case BNXT_RE_RSP_ST_WR_FLUSH: + ibv_wcst = IBV_WC_WR_FLUSH_ERR; + break; + case BNXT_RE_RSP_ST_HW_FLUSH: + ibv_wcst = IBV_WC_FATAL_ERR; + break; + default: + ibv_wcst = IBV_WC_GENERAL_ERR; + break; + } + } + + return ibv_wcst; +} + +static inline uint8_t bnxt_re_is_cqe_valid(struct bnxt_re_cq *cq, + struct bnxt_re_bcqe *hdr) +{ + uint8_t valid = 0; + + valid = ((le32toh(hdr->flg_st_typ_ph) & + BNXT_RE_BCQE_PH_MASK) == cq->phase); + udma_from_device_barrier(); + + return valid; +} + +static inline void bnxt_re_change_cq_phase(struct bnxt_re_cq *cq) +{ + if (!cq->cqq.head) + cq->phase = (~cq->phase & BNXT_RE_BCQE_PH_MASK); +} +#endif diff --git a/providers/bnxt_re/memory.c b/providers/bnxt_re/memory.c new file mode 100644 index 0000000..67125e9 --- /dev/null +++ b/providers/bnxt_re/memory.c @@ -0,0 +1,76 @@ +/* + * Broadcom NetXtreme-E User Space RoCE driver + * + * Copyright (c) 2015-2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. 
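bnxt_re_is_cqe_valid() and bnxt_re_change_cq_phase() above implement a phase-bit handshake: hardware writes a phase bit into each CQE, and a CQE belongs to software only while that bit matches the expected phase, which flips every time the head index wraps to zero. A condensed restatement of the consumer loop (bnxt_re_drain_valid_cqes is hypothetical and assumes the driver's own helpers from main.h):

	#include "main.h"

	static void bnxt_re_drain_valid_cqes(struct bnxt_re_cq *cq)
	{
		for (;;) {
			void *cqe = cq->cqq.va +
				    cq->cqq.head * bnxt_re_get_cqe_sz();
			struct bnxt_re_bcqe *hdr =
				cqe + sizeof(struct bnxt_re_req_cqe);

			if (!bnxt_re_is_cqe_valid(cq, hdr))
				break;	/* phase mismatch: not written yet */
			/* ... dispatch on the CQE type here ... */
			bnxt_re_incr_head(&cq->cqq);
			bnxt_re_change_cq_phase(cq); /* flips on wrap to 0 */
		}
	}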
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Description: Implements method to allocate page-aligned memory + * buffers. + */ + +#include <string.h> +#include <sys/mman.h> + +#include "main.h" + +int bnxt_re_alloc_aligned(struct bnxt_re_queue *que, uint32_t pg_size) +{ + int ret, bytes; + + bytes = (que->depth * que->stride); + que->bytes = get_aligned(bytes, pg_size); + que->va = mmap(NULL, que->bytes, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (que->va == MAP_FAILED) { + que->bytes = 0; + return errno; + } + /* Touch pages before proceeding. */ + memset(que->va, 0, que->bytes); + + ret = ibv_dontfork_range(que->va, que->bytes); + if (ret) { + munmap(que->va, que->bytes); + que->bytes = 0; + } + + return ret; +} + +void bnxt_re_free_aligned(struct bnxt_re_queue *que) +{ + if (que->bytes) { + ibv_dofork_range(que->va, que->bytes); + munmap(que->va, que->bytes); + que->bytes = 0; + } +} diff --git a/providers/bnxt_re/memory.h b/providers/bnxt_re/memory.h new file mode 100644 index 0000000..75564c4 --- /dev/null +++ b/providers/bnxt_re/memory.h @@ -0,0 +1,110 @@ +/* + * Broadcom NetXtreme-E User Space RoCE driver + * + * Copyright (c) 2015-2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS''
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Description: Implements a data structure to allocate page-aligned
+ * memory buffers.
+ */
+
+#ifndef __MEMORY_H__
+#define __MEMORY_H__
+
+#include <pthread.h>
+
+struct bnxt_re_queue {
+	void *va;
+	uint32_t bytes;	/* for munmap */
+	uint32_t depth;	/* no. of entries */
+	uint32_t head;
+	uint32_t tail;
+	uint32_t stride;
+	/* Represents the difference between the real queue depth allocated in
+	 * HW and the user requested queue depth and is used to correctly flag
+	 * queue full condition based on user supplied queue depth.
+	 * This value can vary depending on the type of queue and any HW
+	 * requirements that mandate keeping a fixed gap between the producer
+	 * and the consumer indices in the queue.
+	 */
+	uint32_t diff;
+	pthread_spinlock_t qlock;
+};
+
+static inline unsigned long get_aligned(uint32_t size, uint32_t al_size)
+{
+	return (unsigned long)(size + al_size - 1) & ~(al_size - 1);
+}
+
+static inline unsigned long roundup_pow_of_two(unsigned long val)
+{
+	unsigned long roundup = 1;
+
+	if (val == 1)
+		return (roundup << 1);
+
+	while (roundup < val)
+		roundup <<= 1;
+
+	return roundup;
+}
+
+int bnxt_re_alloc_aligned(struct bnxt_re_queue *que, uint32_t pg_size);
+void bnxt_re_free_aligned(struct bnxt_re_queue *que);
+
+/* Basic queue operations */
+static inline uint32_t bnxt_re_is_que_full(struct bnxt_re_queue *que)
+{
+	return (((que->diff + que->tail) & (que->depth - 1)) == que->head);
+}
+
+static inline uint32_t bnxt_re_is_que_empty(struct bnxt_re_queue *que)
+{
+	return que->tail == que->head;
+}
+
+static inline uint32_t bnxt_re_incr(uint32_t val, uint32_t max)
+{
+	return (++val & (max - 1));
+}
+
+static inline void bnxt_re_incr_tail(struct bnxt_re_queue *que)
+{
+	que->tail = bnxt_re_incr(que->tail, que->depth);
+}
+
+static inline void bnxt_re_incr_head(struct bnxt_re_queue *que)
+{
+	que->head = bnxt_re_incr(que->head, que->depth);
+}
+
+#endif
diff --git a/providers/bnxt_re/verbs.c b/providers/bnxt_re/verbs.c
new file mode 100644
index 0000000..2218e3a
--- /dev/null
+++ b/providers/bnxt_re/verbs.c
@@ -0,0 +1,1701 @@
+/*
+ * Broadcom NetXtreme-E User Space RoCE driver
+ *
+ * Copyright (c) 2015-2017, Broadcom. All rights reserved. The term
+ * Broadcom refers to Broadcom Limited and/or its subsidiaries.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.
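The helpers above keep queue depths at powers of two so that wrap-around is a bit-mask rather than a division, and bnxt_re_is_que_full() deliberately reports full diff slots early. A worked sketch with hypothetical numbers:

	#include <assert.h>
	#include "memory.h"

	static void queue_math_sketch(void)
	{
		struct bnxt_re_queue q = { .depth = 8, .diff = 1,
					   .head = 0, .tail = 7 };

		assert(roundup_pow_of_two(100) == 128);
		assert(roundup_pow_of_two(1) == 2); /* 1 rounds up to 2 */
		assert(get_aligned(100, 4096) == 4096); /* one full page */
		assert(bnxt_re_incr(7, 8) == 0);	/* wrap by masking */
		/* (diff + tail) & (depth - 1) == head, so the queue reports
		 * full while one slot is still physically unused. */
		assert(bnxt_re_is_que_full(&q));
	}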
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Description: User IB-Verbs implementation + */ + +#include <assert.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <signal.h> +#include <errno.h> +#include <pthread.h> +#include <sys/mman.h> +#include <netinet/in.h> +#include <unistd.h> + +#include <util/compiler.h> + +#include "main.h" +#include "verbs.h" + +int bnxt_re_query_device(struct ibv_context *ibvctx, + struct ibv_device_attr *dev_attr) +{ + struct ibv_query_device cmd; + uint8_t fw_ver[8]; + int status; + + memset(dev_attr, 0, sizeof(struct ibv_device_attr)); + status = ibv_cmd_query_device(ibvctx, dev_attr, (uint64_t *)&fw_ver, + &cmd, sizeof(cmd)); + snprintf(dev_attr->fw_ver, 64, "%d.%d.%d.%d", + fw_ver[0], fw_ver[1], fw_ver[2], fw_ver[3]); + return status; +} + +int bnxt_re_query_port(struct ibv_context *ibvctx, uint8_t port, + struct ibv_port_attr *port_attr) +{ + struct ibv_query_port cmd; + + return ibv_cmd_query_port(ibvctx, port, port_attr, &cmd, sizeof(cmd)); +} + +struct ibv_pd *bnxt_re_alloc_pd(struct ibv_context *ibvctx) +{ + struct ibv_alloc_pd cmd; + struct ubnxt_re_pd_resp resp; + struct bnxt_re_context *cntx = to_bnxt_re_context(ibvctx); + struct bnxt_re_dev *dev = to_bnxt_re_dev(ibvctx->device); + struct bnxt_re_pd *pd; + uint64_t dbr; + + pd = calloc(1, sizeof(*pd)); + if (!pd) + return NULL; + + memset(&resp, 0, sizeof(resp)); + if (ibv_cmd_alloc_pd(ibvctx, &pd->ibvpd, &cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp))) + goto out; + + pd->pdid = resp.pdid; + dbr = resp.dbr; + static_assert(offsetof(struct ubnxt_re_pd_resp, dbr) == 4 * 3, + "Bad dbr placement"); + + /* Map DB page now. 
*/ + if (!cntx->udpi.dbpage) { + cntx->udpi.dpindx = resp.dpi; + cntx->udpi.dbpage = mmap(NULL, dev->pg_size, PROT_WRITE, + MAP_SHARED, ibvctx->cmd_fd, dbr); + if (cntx->udpi.dbpage == MAP_FAILED) { + (void)ibv_cmd_dealloc_pd(&pd->ibvpd); + goto out; + } + } + + return &pd->ibvpd; +out: + free(pd); + return NULL; +} + +int bnxt_re_free_pd(struct ibv_pd *ibvpd) +{ + struct bnxt_re_pd *pd = to_bnxt_re_pd(ibvpd); + int status; + + status = ibv_cmd_dealloc_pd(ibvpd); + if (status) + return status; + /* DPI un-mapping will be during uninit_ucontext */ + free(pd); + + return 0; +} + +struct ibv_mr *bnxt_re_reg_mr(struct ibv_pd *ibvpd, void *sva, size_t len, + uint64_t hca_va, int access) +{ + struct bnxt_re_mr *mr; + struct ibv_reg_mr cmd; + struct ubnxt_re_mr_resp resp; + + mr = calloc(1, sizeof(*mr)); + if (!mr) + return NULL; + + if (ibv_cmd_reg_mr(ibvpd, sva, len, hca_va, access, &mr->vmr, &cmd, + sizeof(cmd), &resp.ibv_resp, sizeof(resp))) { + free(mr); + return NULL; + } + + return &mr->vmr.ibv_mr; +} + +int bnxt_re_dereg_mr(struct verbs_mr *vmr) +{ + struct bnxt_re_mr *mr = (struct bnxt_re_mr *)vmr; + int status; + + status = ibv_cmd_dereg_mr(vmr); + if (status) + return status; + free(mr); + + return 0; +} + +struct ibv_cq *bnxt_re_create_cq(struct ibv_context *ibvctx, int ncqe, + struct ibv_comp_channel *channel, int vec) +{ + struct bnxt_re_cq *cq; + struct ubnxt_re_cq cmd; + struct ubnxt_re_cq_resp resp; + + struct bnxt_re_context *cntx = to_bnxt_re_context(ibvctx); + struct bnxt_re_dev *dev = to_bnxt_re_dev(ibvctx->device); + + if (ncqe > dev->max_cq_depth) + return NULL; + + cq = calloc(1, sizeof(*cq)); + if (!cq) + return NULL; + + cq->cqq.depth = roundup_pow_of_two(ncqe + 1); + if (cq->cqq.depth > dev->max_cq_depth + 1) + cq->cqq.depth = dev->max_cq_depth + 1; + cq->cqq.stride = dev->cqe_size; + if (bnxt_re_alloc_aligned(&cq->cqq, dev->pg_size)) + goto fail; + + pthread_spin_init(&cq->cqq.qlock, PTHREAD_PROCESS_PRIVATE); + + cmd.cq_va = (uintptr_t)cq->cqq.va; + cmd.cq_handle = (uintptr_t)cq; + + memset(&resp, 0, sizeof(resp)); + if (ibv_cmd_create_cq(ibvctx, ncqe, channel, vec, + &cq->ibvcq, &cmd.ibv_cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp))) + goto cmdfail; + + cq->cqid = resp.cqid; + cq->phase = resp.phase; + cq->cqq.tail = resp.tail; + cq->udpi = &cntx->udpi; + cq->first_arm = true; + + list_head_init(&cq->sfhead); + list_head_init(&cq->rfhead); + + return &cq->ibvcq; +cmdfail: + bnxt_re_free_aligned(&cq->cqq); +fail: + free(cq); + return NULL; +} + +int bnxt_re_resize_cq(struct ibv_cq *ibvcq, int ncqe) +{ + return -ENOSYS; +} + +int bnxt_re_destroy_cq(struct ibv_cq *ibvcq) +{ + int status; + struct bnxt_re_cq *cq = to_bnxt_re_cq(ibvcq); + + status = ibv_cmd_destroy_cq(ibvcq); + if (status) + return status; + + bnxt_re_free_aligned(&cq->cqq); + free(cq); + + return 0; +} + +static uint8_t bnxt_re_poll_err_scqe(struct bnxt_re_qp *qp, + struct ibv_wc *ibvwc, + struct bnxt_re_bcqe *hdr, + struct bnxt_re_req_cqe *scqe, int *cnt) +{ + struct bnxt_re_queue *sq = qp->sqq; + struct bnxt_re_context *cntx; + struct bnxt_re_wrid *swrid; + struct bnxt_re_psns *spsn; + struct bnxt_re_cq *scq; + uint32_t head = sq->head; + uint8_t status; + + scq = to_bnxt_re_cq(qp->ibvqp.send_cq); + cntx = to_bnxt_re_context(scq->ibvcq.context); + swrid = &qp->swrid[head]; + spsn = swrid->psns; + + *cnt = 1; + status = (le32toh(hdr->flg_st_typ_ph) >> BNXT_RE_BCQE_STATUS_SHIFT) & + BNXT_RE_BCQE_STATUS_MASK; + ibvwc->status = bnxt_re_to_ibv_wc_status(status, true); + ibvwc->wc_flags = 0; + 
ibvwc->wr_id = swrid->wrid; + ibvwc->qp_num = qp->qpid; + ibvwc->opcode = (le32toh(spsn->opc_spsn) >> + BNXT_RE_PSNS_OPCD_SHIFT) & + BNXT_RE_PSNS_OPCD_MASK; + ibvwc->byte_len = 0; + + bnxt_re_incr_head(qp->sqq); + + if (qp->qpst != IBV_QPS_ERR) + qp->qpst = IBV_QPS_ERR; + pthread_spin_lock(&cntx->fqlock); + bnxt_re_fque_add_node(&scq->sfhead, &qp->snode); + pthread_spin_unlock(&cntx->fqlock); + + return false; +} + +static uint8_t bnxt_re_poll_success_scqe(struct bnxt_re_qp *qp, + struct ibv_wc *ibvwc, + struct bnxt_re_bcqe *hdr, + struct bnxt_re_req_cqe *scqe, + int *cnt) +{ + struct bnxt_re_queue *sq = qp->sqq; + struct bnxt_re_wrid *swrid; + struct bnxt_re_psns *spsn; + uint8_t pcqe = false; + uint32_t head = sq->head; + uint32_t cindx; + + swrid = &qp->swrid[head]; + spsn = swrid->psns; + cindx = le32toh(scqe->con_indx); + + if (!(swrid->sig & IBV_SEND_SIGNALED)) { + *cnt = 0; + } else { + ibvwc->status = IBV_WC_SUCCESS; + ibvwc->wc_flags = 0; + ibvwc->qp_num = qp->qpid; + ibvwc->wr_id = swrid->wrid; + ibvwc->opcode = (le32toh(spsn->opc_spsn) >> + BNXT_RE_PSNS_OPCD_SHIFT) & + BNXT_RE_PSNS_OPCD_MASK; + if (ibvwc->opcode == IBV_WC_RDMA_READ || + ibvwc->opcode == IBV_WC_COMP_SWAP || + ibvwc->opcode == IBV_WC_FETCH_ADD) + ibvwc->byte_len = swrid->bytes; + + *cnt = 1; + } + + bnxt_re_incr_head(sq); + if (sq->head != cindx) + pcqe = true; + + return pcqe; +} + +static uint8_t bnxt_re_poll_scqe(struct bnxt_re_qp *qp, struct ibv_wc *ibvwc, + void *cqe, int *cnt) +{ + struct bnxt_re_bcqe *hdr; + struct bnxt_re_req_cqe *scqe; + uint8_t status, pcqe = false; + + scqe = cqe; + hdr = cqe + sizeof(struct bnxt_re_req_cqe); + + status = (le32toh(hdr->flg_st_typ_ph) >> BNXT_RE_BCQE_STATUS_SHIFT) & + BNXT_RE_BCQE_STATUS_MASK; + if (status == BNXT_RE_REQ_ST_OK) + pcqe = bnxt_re_poll_success_scqe(qp, ibvwc, hdr, scqe, cnt); + else + pcqe = bnxt_re_poll_err_scqe(qp, ibvwc, hdr, scqe, cnt); + + return pcqe; +} + +static void bnxt_re_release_srqe(struct bnxt_re_srq *srq, int tag) +{ + pthread_spin_lock(&srq->srqq->qlock); + srq->srwrid[srq->last_idx].next_idx = tag; + srq->last_idx = tag; + srq->srwrid[srq->last_idx].next_idx = -1; + pthread_spin_unlock(&srq->srqq->qlock); +} + +static int bnxt_re_poll_err_rcqe(struct bnxt_re_qp *qp, struct ibv_wc *ibvwc, + struct bnxt_re_bcqe *hdr, void *cqe) +{ + struct bnxt_re_queue *rq; + struct bnxt_re_cq *rcq; + struct bnxt_re_context *cntx; + uint8_t status; + + rcq = to_bnxt_re_cq(qp->ibvqp.recv_cq); + cntx = to_bnxt_re_context(rcq->ibvcq.context); + + if (!qp->srq) { + rq = qp->rqq; + ibvwc->wr_id = qp->rwrid[rq->head].wrid; + } else { + struct bnxt_re_srq *srq; + int tag; + + srq = qp->srq; + rq = srq->srqq; + tag = le32toh(hdr->qphi_rwrid) & BNXT_RE_BCQE_RWRID_MASK; + ibvwc->wr_id = srq->srwrid[tag].wrid; + bnxt_re_release_srqe(srq, tag); + } + + status = (le32toh(hdr->flg_st_typ_ph) >> BNXT_RE_BCQE_STATUS_SHIFT) & + BNXT_RE_BCQE_STATUS_MASK; + /* skip h/w flush errors */ + if (status == BNXT_RE_RSP_ST_HW_FLUSH) + return 0; + + ibvwc->status = bnxt_re_to_ibv_wc_status(status, false); + ibvwc->qp_num = qp->qpid; + ibvwc->opcode = IBV_WC_RECV; + ibvwc->byte_len = 0; + ibvwc->wc_flags = 0; + if (qp->qptyp == IBV_QPT_UD) + ibvwc->src_qp = 0; + bnxt_re_incr_head(rq); + + if (!qp->srq) { + pthread_spin_lock(&cntx->fqlock); + bnxt_re_fque_add_node(&rcq->rfhead, &qp->rnode); + pthread_spin_unlock(&cntx->fqlock); + } + + return 1; +} + +static void bnxt_re_fill_ud_cqe(struct ibv_wc *ibvwc, + struct bnxt_re_bcqe *hdr, void *cqe) +{ + struct bnxt_re_ud_cqe *ucqe = cqe; + 
uint32_t qpid;
+
+	qpid = ((le32toh(hdr->qphi_rwrid) >> BNXT_RE_BCQE_SRCQP_SHIFT) &
+		BNXT_RE_BCQE_SRCQP_MASK) << 0x10; /* higher 8 bits of 24 */
+	qpid |= (le64toh(ucqe->qplo_mac) >> BNXT_RE_UD_CQE_SRCQPLO_SHIFT) &
+		BNXT_RE_UD_CQE_SRCQPLO_MASK; /* lower 16 of 24 */
+	ibvwc->src_qp = qpid;
+	ibvwc->wc_flags |= IBV_WC_GRH;
+	/* The IB-stack ABI in user space does not ask for the MAC to be
+	 * reported. */
+}
+
+static void bnxt_re_poll_success_rcqe(struct bnxt_re_qp *qp,
+				      struct ibv_wc *ibvwc,
+				      struct bnxt_re_bcqe *hdr, void *cqe)
+{
+	struct bnxt_re_queue *rq;
+	struct bnxt_re_rc_cqe *rcqe;
+	uint8_t flags, is_imm, is_rdma;
+
+	rcqe = cqe;
+	if (!qp->srq) {
+		rq = qp->rqq;
+		ibvwc->wr_id = qp->rwrid[rq->head].wrid;
+	} else {
+		struct bnxt_re_srq *srq;
+		int tag;
+
+		srq = qp->srq;
+		rq = srq->srqq;
+		tag = le32toh(hdr->qphi_rwrid) & BNXT_RE_BCQE_RWRID_MASK;
+		ibvwc->wr_id = srq->srwrid[tag].wrid;
+		bnxt_re_release_srqe(srq, tag);
+	}
+
+	ibvwc->status = IBV_WC_SUCCESS;
+	ibvwc->qp_num = qp->qpid;
+	ibvwc->byte_len = le32toh(rcqe->length);
+	ibvwc->opcode = IBV_WC_RECV;
+
+	flags = (le32toh(hdr->flg_st_typ_ph) >> BNXT_RE_BCQE_FLAGS_SHIFT) &
+		BNXT_RE_BCQE_FLAGS_MASK;
+	is_imm = (flags & BNXT_RE_RC_FLAGS_IMM_MASK) >>
+		  BNXT_RE_RC_FLAGS_IMM_SHIFT;
+	is_rdma = (flags & BNXT_RE_RC_FLAGS_RDMA_MASK) >>
+		   BNXT_RE_RC_FLAGS_RDMA_SHIFT;
+	ibvwc->wc_flags = 0;
+	if (is_imm) {
+		ibvwc->wc_flags |= IBV_WC_WITH_IMM;
+		/* Completion reports the raw data in LE format, while the
+		 * user expects it in BE format. Thus, swapping on outgoing
+		 * data is needed. On a BE platform le32toh will do the swap,
+		 * while on an LE platform htobe32 will do the job.
+		 */
+		ibvwc->imm_data = htobe32(le32toh(rcqe->imm_key));
+		if (is_rdma)
+			ibvwc->opcode = IBV_WC_RECV_RDMA_WITH_IMM;
+	}
+
+	if (qp->qptyp == IBV_QPT_UD)
+		bnxt_re_fill_ud_cqe(ibvwc, hdr, cqe);
+
+	bnxt_re_incr_head(rq);
+}
+
+static uint8_t bnxt_re_poll_rcqe(struct bnxt_re_qp *qp, struct ibv_wc *ibvwc,
+				 void *cqe, int *cnt)
+{
+	struct bnxt_re_bcqe *hdr;
+	uint8_t status, pcqe = false;
+
+	hdr = cqe + sizeof(struct bnxt_re_rc_cqe);
+
+	status = (le32toh(hdr->flg_st_typ_ph) >> BNXT_RE_BCQE_STATUS_SHIFT) &
+		  BNXT_RE_BCQE_STATUS_MASK;
+	*cnt = 1;
+	if (status == BNXT_RE_RSP_ST_OK)
+		bnxt_re_poll_success_rcqe(qp, ibvwc, hdr, cqe);
+	else
+		*cnt = bnxt_re_poll_err_rcqe(qp, ibvwc, hdr, cqe);
+
+	return pcqe;
+}
+
+static uint8_t bnxt_re_poll_term_cqe(struct bnxt_re_qp *qp,
+				     struct ibv_wc *ibvwc, void *cqe, int *cnt)
+{
+	struct bnxt_re_context *cntx;
+	struct bnxt_re_cq *scq, *rcq;
+	uint8_t pcqe = false;
+
+	scq = to_bnxt_re_cq(qp->ibvqp.send_cq);
+	rcq = to_bnxt_re_cq(qp->ibvqp.recv_cq);
+	cntx = to_bnxt_re_context(scq->ibvcq.context);
+	/* For now just add the QP to flush list without
+	 * considering the index reported in the CQE.
+	 * Continue reporting flush completions until the
+	 * SQ and RQ are empty.
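+	 *
+	 * From the application's point of view this is invisible: once
+	 * the QP is in error, its pending work requests surface through
+	 * plain ibv_poll_cq() with wc.status == IBV_WC_WR_FLUSH_ERR and
+	 * a still-valid wr_id. A sketch, using only the standard verbs
+	 * API (recycle_buffer() is a hypothetical application helper):
+	 *
+	 *	struct ibv_wc wc;
+	 *
+	 *	while (ibv_poll_cq(cq, 1, &wc) > 0)
+	 *		if (wc.status == IBV_WC_WR_FLUSH_ERR)
+	 *			recycle_buffer(wc.wr_id);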
+ */ + *cnt = 0; + if (qp->qpst != IBV_QPS_ERR) + qp->qpst = IBV_QPS_ERR; + pthread_spin_lock(&cntx->fqlock); + bnxt_re_fque_add_node(&rcq->rfhead, &qp->rnode); + bnxt_re_fque_add_node(&scq->sfhead, &qp->snode); + pthread_spin_unlock(&cntx->fqlock); + + return pcqe; +} + +static int bnxt_re_poll_one(struct bnxt_re_cq *cq, int nwc, struct ibv_wc *wc) +{ + struct bnxt_re_queue *cqq = &cq->cqq; + struct bnxt_re_qp *qp; + struct bnxt_re_bcqe *hdr; + struct bnxt_re_req_cqe *scqe; + struct bnxt_re_ud_cqe *rcqe; + void *cqe; + uint64_t *qp_handle = NULL; + int type, cnt = 0, dqed = 0, hw_polled = 0; + uint8_t pcqe = false; + + while (nwc) { + cqe = cqq->va + cqq->head * bnxt_re_get_cqe_sz(); + hdr = cqe + sizeof(struct bnxt_re_req_cqe); + if (!bnxt_re_is_cqe_valid(cq, hdr)) + break; + type = (le32toh(hdr->flg_st_typ_ph) >> + BNXT_RE_BCQE_TYPE_SHIFT) & BNXT_RE_BCQE_TYPE_MASK; + switch (type) { + case BNXT_RE_WC_TYPE_SEND: + scqe = cqe; + qp_handle = (uint64_t *)&scqe->qp_handle; + qp = (struct bnxt_re_qp *) + (uintptr_t)le64toh(scqe->qp_handle); + if (!qp) + break; /*stale cqe. should be rung.*/ + pcqe = bnxt_re_poll_scqe(qp, wc, cqe, &cnt); + break; + case BNXT_RE_WC_TYPE_RECV_RC: + case BNXT_RE_WC_TYPE_RECV_UD: + rcqe = cqe; + qp_handle = (uint64_t *)&rcqe->qp_handle; + qp = (struct bnxt_re_qp *) + (uintptr_t)le64toh(rcqe->qp_handle); + if (!qp) + break; /*stale cqe. should be rung.*/ + pcqe = bnxt_re_poll_rcqe(qp, wc, cqe, &cnt); + break; + case BNXT_RE_WC_TYPE_RECV_RAW: + break; + case BNXT_RE_WC_TYPE_TERM: + scqe = cqe; + qp_handle = (uint64_t *)&scqe->qp_handle; + qp = (struct bnxt_re_qp *) + (uintptr_t)le64toh(scqe->qp_handle); + if (!qp) + break; + pcqe = bnxt_re_poll_term_cqe(qp, wc, cqe, &cnt); + break; + case BNXT_RE_WC_TYPE_COFF: + break; + default: + break; + }; + + if (pcqe) + goto skipp_real; + + hw_polled++; + if (qp_handle) { + *qp_handle = 0x0ULL; /* mark cqe as read */ + qp_handle = NULL; + } + bnxt_re_incr_head(&cq->cqq); + bnxt_re_change_cq_phase(cq); +skipp_real: + if (cnt) { + cnt = 0; + dqed++; + nwc--; + wc++; + } + } + + if (hw_polled) + bnxt_re_ring_cq_db(cq); + + return dqed; +} + +static int bnxt_re_poll_flush_wcs(struct bnxt_re_queue *que, + struct bnxt_re_wrid *wridp, + struct ibv_wc *ibvwc, uint32_t qpid, + int nwc) +{ + struct bnxt_re_wrid *wrid; + struct bnxt_re_psns *psns; + uint32_t cnt = 0, head; + uint8_t opcode = IBV_WC_RECV; + + while (nwc) { + if (bnxt_re_is_que_empty(que)) + break; + head = que->head; + wrid = &wridp[head]; + if (wrid->psns) { + psns = wrid->psns; + opcode = (le32toh(psns->opc_spsn) >> + BNXT_RE_PSNS_OPCD_SHIFT) & + BNXT_RE_PSNS_OPCD_MASK; + } + + ibvwc->status = IBV_WC_WR_FLUSH_ERR; + ibvwc->opcode = opcode; + ibvwc->wr_id = wrid->wrid; + ibvwc->qp_num = qpid; + ibvwc->byte_len = 0; + ibvwc->wc_flags = 0; + + bnxt_re_incr_head(que); + nwc--; + cnt++; + ibvwc++; + } + + return cnt; +} + +static int bnxt_re_poll_flush_wqes(struct bnxt_re_cq *cq, + struct list_head *lhead, + struct ibv_wc *ibvwc, + int32_t nwc) +{ + struct bnxt_re_fque_node *cur, *tmp; + struct bnxt_re_wrid *wridp; + struct bnxt_re_queue *que; + struct bnxt_re_qp *qp; + bool sq_list = false; + uint32_t polled = 0; + + sq_list = (lhead == &cq->sfhead) ? 
true : false; + if (!list_empty(lhead)) { + list_for_each_safe(lhead, cur, tmp, list) { + if (sq_list) { + qp = container_of(cur, struct bnxt_re_qp, + snode); + que = qp->sqq; + wridp = qp->swrid; + } else { + qp = container_of(cur, struct bnxt_re_qp, + rnode); + que = qp->rqq; + wridp = qp->rwrid; + } + if (bnxt_re_is_que_empty(que)) + continue; + polled += bnxt_re_poll_flush_wcs(que, wridp, + ibvwc + polled, + qp->qpid, + nwc - polled); + if (!(nwc - polled)) + break; + } + } + + return polled; +} + +static int bnxt_re_poll_flush_lists(struct bnxt_re_cq *cq, uint32_t nwc, + struct ibv_wc *ibvwc) +{ + int left, polled = 0; + + /* Check if flush Qs are empty */ + if (list_empty(&cq->sfhead) && list_empty(&cq->rfhead)) + return 0; + + polled = bnxt_re_poll_flush_wqes(cq, &cq->sfhead, ibvwc, nwc); + left = nwc - polled; + + if (!left) + return polled; + + polled += bnxt_re_poll_flush_wqes(cq, &cq->rfhead, + ibvwc + polled, left); + return polled; +} + +int bnxt_re_poll_cq(struct ibv_cq *ibvcq, int nwc, struct ibv_wc *wc) +{ + struct bnxt_re_cq *cq = to_bnxt_re_cq(ibvcq); + struct bnxt_re_context *cntx = to_bnxt_re_context(ibvcq->context); + int dqed, left = 0; + + pthread_spin_lock(&cq->cqq.qlock); + dqed = bnxt_re_poll_one(cq, nwc, wc); + if (cq->deferred_arm) { + bnxt_re_ring_cq_arm_db(cq, cq->deferred_arm_flags); + cq->deferred_arm = false; + cq->deferred_arm_flags = 0; + } + pthread_spin_unlock(&cq->cqq.qlock); + left = nwc - dqed; + if (left) { + /* Check if anything is there to flush. */ + pthread_spin_lock(&cntx->fqlock); + dqed += bnxt_re_poll_flush_lists(cq, left, (wc + dqed)); + pthread_spin_unlock(&cntx->fqlock); + } + + return dqed; +} + +static void bnxt_re_cleanup_cq(struct bnxt_re_qp *qp, struct bnxt_re_cq *cq) +{ + struct bnxt_re_context *cntx; + struct bnxt_re_queue *que = &cq->cqq; + struct bnxt_re_bcqe *hdr; + struct bnxt_re_req_cqe *scqe; + struct bnxt_re_rc_cqe *rcqe; + void *cqe; + int indx, type; + + cntx = to_bnxt_re_context(cq->ibvcq.context); + + pthread_spin_lock(&que->qlock); + for (indx = 0; indx < que->depth; indx++) { + cqe = que->va + indx * bnxt_re_get_cqe_sz(); + hdr = cqe + sizeof(struct bnxt_re_req_cqe); + type = (le32toh(hdr->flg_st_typ_ph) >> + BNXT_RE_BCQE_TYPE_SHIFT) & BNXT_RE_BCQE_TYPE_MASK; + + if (type == BNXT_RE_WC_TYPE_COFF) + continue; + if (type == BNXT_RE_WC_TYPE_SEND || + type == BNXT_RE_WC_TYPE_TERM) { + scqe = cqe; + if (le64toh(scqe->qp_handle) == (uintptr_t)qp) + scqe->qp_handle = 0ULL; + } else { + rcqe = cqe; + if (le64toh(rcqe->qp_handle) == (uintptr_t)qp) + rcqe->qp_handle = 0ULL; + } + + } + pthread_spin_unlock(&que->qlock); + + pthread_spin_lock(&cntx->fqlock); + bnxt_re_fque_del_node(&qp->snode); + bnxt_re_fque_del_node(&qp->rnode); + pthread_spin_unlock(&cntx->fqlock); +} + +void bnxt_re_cq_event(struct ibv_cq *ibvcq) +{ + +} + +int bnxt_re_arm_cq(struct ibv_cq *ibvcq, int flags) +{ + struct bnxt_re_cq *cq = to_bnxt_re_cq(ibvcq); + + pthread_spin_lock(&cq->cqq.qlock); + flags = !flags ? 
BNXT_RE_QUE_TYPE_CQ_ARMALL : + BNXT_RE_QUE_TYPE_CQ_ARMSE; + if (cq->first_arm) { + bnxt_re_ring_cq_arm_db(cq, flags); + cq->first_arm = false; + } + cq->deferred_arm = true; + cq->deferred_arm_flags = flags; + pthread_spin_unlock(&cq->cqq.qlock); + + return 0; +} + +static int bnxt_re_check_qp_limits(struct bnxt_re_context *cntx, + struct ibv_qp_init_attr *attr) +{ + struct ibv_device_attr devattr; + int ret; + + ret = bnxt_re_query_device(&cntx->ibvctx.context, &devattr); + if (ret) + return ret; + if (attr->cap.max_send_sge > devattr.max_sge) + return EINVAL; + if (attr->cap.max_recv_sge > devattr.max_sge) + return EINVAL; + if (attr->cap.max_inline_data > BNXT_RE_MAX_INLINE_SIZE) + return EINVAL; + if (attr->cap.max_send_wr > devattr.max_qp_wr) + attr->cap.max_send_wr = devattr.max_qp_wr; + if (attr->cap.max_recv_wr > devattr.max_qp_wr) + attr->cap.max_recv_wr = devattr.max_qp_wr; + + return 0; +} + +static void bnxt_re_free_queue_ptr(struct bnxt_re_qp *qp) +{ + if (qp->rqq) + free(qp->rqq); + if (qp->sqq) + free(qp->sqq); +} + +static int bnxt_re_alloc_queue_ptr(struct bnxt_re_qp *qp, + struct ibv_qp_init_attr *attr) +{ + qp->sqq = calloc(1, sizeof(struct bnxt_re_queue)); + if (!qp->sqq) + return -ENOMEM; + if (!attr->srq) { + qp->rqq = calloc(1, sizeof(struct bnxt_re_queue)); + if (!qp->rqq) { + free(qp->sqq); + return -ENOMEM; + } + } + + return 0; +} + +static void bnxt_re_free_queues(struct bnxt_re_qp *qp) +{ + if (qp->rqq) { + if (qp->rwrid) + free(qp->rwrid); + pthread_spin_destroy(&qp->rqq->qlock); + bnxt_re_free_aligned(qp->rqq); + } + + if (qp->swrid) + free(qp->swrid); + pthread_spin_destroy(&qp->sqq->qlock); + bnxt_re_free_aligned(qp->sqq); +} + +static int bnxt_re_alloc_queues(struct bnxt_re_qp *qp, + struct ibv_qp_init_attr *attr, + uint32_t pg_size) { + struct bnxt_re_psns_ext *psns_ext; + struct bnxt_re_queue *que; + struct bnxt_re_psns *psns; + uint32_t psn_depth; + uint32_t psn_size; + int ret, indx; + + que = qp->sqq; + que->stride = bnxt_re_get_sqe_sz(); + /* 8916 adjustment */ + que->depth = roundup_pow_of_two(attr->cap.max_send_wr + 1 + + BNXT_RE_FULL_FLAG_DELTA); + que->diff = que->depth - attr->cap.max_send_wr; + + /* psn_depth extra entries of size que->stride */ + psn_size = bnxt_re_is_chip_gen_p5(qp->cctx) ? + sizeof(struct bnxt_re_psns_ext) : + sizeof(struct bnxt_re_psns); + psn_depth = (que->depth * psn_size) / que->stride; + if ((que->depth * psn_size) % que->stride) + psn_depth++; + que->depth += psn_depth; + /* PSN-search memory is allocated without checking for + * QP-Type. Kenrel driver do not map this memory if it + * is UD-qp. UD-qp use this memory to maintain WC-opcode. + * See definition of bnxt_re_fill_psns() for the use case. + */ + ret = bnxt_re_alloc_aligned(qp->sqq, pg_size); + if (ret) + return ret; + /* exclude psns depth*/ + que->depth -= psn_depth; + /* start of spsn space sizeof(struct bnxt_re_psns) each. 
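+	 * The PSN area is carved from the same aligned allocation,
+	 * immediately after the last SQE slot (que->va + stride * depth);
+	 * the psn_depth slots reserved for it above are subtracted from
+	 * que->depth again once the buffer is allocated.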
*/ + psns = (que->va + que->stride * que->depth); + psns_ext = (struct bnxt_re_psns_ext *)psns; + pthread_spin_init(&que->qlock, PTHREAD_PROCESS_PRIVATE); + qp->swrid = calloc(que->depth, sizeof(struct bnxt_re_wrid)); + if (!qp->swrid) { + ret = -ENOMEM; + goto fail; + } + + for (indx = 0 ; indx < que->depth; indx++, psns++) + qp->swrid[indx].psns = psns; + if (bnxt_re_is_chip_gen_p5(qp->cctx)) { + for (indx = 0 ; indx < que->depth; indx++, psns_ext++) { + qp->swrid[indx].psns_ext = psns_ext; + qp->swrid[indx].psns = (struct bnxt_re_psns *)psns_ext; + } + } + + qp->cap.max_swr = que->depth; + + if (qp->rqq) { + que = qp->rqq; + que->stride = bnxt_re_get_rqe_sz(); + que->depth = roundup_pow_of_two(attr->cap.max_recv_wr + 1); + que->diff = que->depth - attr->cap.max_recv_wr; + ret = bnxt_re_alloc_aligned(qp->rqq, pg_size); + if (ret) + goto fail; + pthread_spin_init(&que->qlock, PTHREAD_PROCESS_PRIVATE); + /* For RQ only bnxt_re_wri.wrid is used. */ + qp->rwrid = calloc(que->depth, sizeof(struct bnxt_re_wrid)); + if (!qp->rwrid) { + ret = -ENOMEM; + goto fail; + } + qp->cap.max_rwr = que->depth; + } + + return 0; +fail: + bnxt_re_free_queues(qp); + return ret; +} + +struct ibv_qp *bnxt_re_create_qp(struct ibv_pd *ibvpd, + struct ibv_qp_init_attr *attr) +{ + struct bnxt_re_qp *qp; + struct ubnxt_re_qp req; + struct ubnxt_re_qp_resp resp; + struct bnxt_re_qpcap *cap; + + struct bnxt_re_context *cntx = to_bnxt_re_context(ibvpd->context); + struct bnxt_re_dev *dev = to_bnxt_re_dev(cntx->ibvctx.context.device); + + if (bnxt_re_check_qp_limits(cntx, attr)) + return NULL; + + qp = calloc(1, sizeof(*qp)); + if (!qp) + return NULL; + /* alloc queue pointers */ + if (bnxt_re_alloc_queue_ptr(qp, attr)) + goto fail; + /* alloc queues */ + qp->cctx = &cntx->cctx; + if (bnxt_re_alloc_queues(qp, attr, dev->pg_size)) + goto failq; + /* Fill ibv_cmd */ + cap = &qp->cap; + req.qpsva = (uintptr_t)qp->sqq->va; + req.qprva = qp->rqq ? (uintptr_t)qp->rqq->va : 0; + req.qp_handle = (uintptr_t)qp; + + if (ibv_cmd_create_qp(ibvpd, &qp->ibvqp, attr, &req.ibv_cmd, sizeof(req), + &resp.ibv_resp, sizeof(resp))) { + goto failcmd; + } + + qp->qpid = resp.qpid; + qp->qptyp = attr->qp_type; + qp->qpst = IBV_QPS_RESET; + qp->scq = to_bnxt_re_cq(attr->send_cq); + qp->rcq = to_bnxt_re_cq(attr->recv_cq); + if (attr->srq) + qp->srq = to_bnxt_re_srq(attr->srq); + qp->udpi = &cntx->udpi; + /* Save/return the altered Caps. 
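+	 * bnxt_re_check_qp_limits() may have clamped max_send_wr and
+	 * max_recv_wr to the device limits, so hand the values that were
+	 * actually used back to the caller through attr.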
*/ + cap->max_ssge = attr->cap.max_send_sge; + cap->max_rsge = attr->cap.max_recv_sge; + cap->max_inline = attr->cap.max_inline_data; + cap->sqsig = attr->sq_sig_all; + fque_init_node(&qp->snode); + fque_init_node(&qp->rnode); + + return &qp->ibvqp; +failcmd: + bnxt_re_free_queues(qp); +failq: + bnxt_re_free_queue_ptr(qp); +fail: + free(qp); + + return NULL; +} + +int bnxt_re_modify_qp(struct ibv_qp *ibvqp, struct ibv_qp_attr *attr, + int attr_mask) +{ + struct ibv_modify_qp cmd = {}; + struct bnxt_re_qp *qp = to_bnxt_re_qp(ibvqp); + int rc; + + rc = ibv_cmd_modify_qp(ibvqp, attr, attr_mask, &cmd, sizeof(cmd)); + if (!rc) { + if (attr_mask & IBV_QP_STATE) { + qp->qpst = attr->qp_state; + /* transition to reset */ + if (qp->qpst == IBV_QPS_RESET) { + qp->sqq->head = 0; + qp->sqq->tail = 0; + if (qp->rqq) { + qp->rqq->head = 0; + qp->rqq->tail = 0; + } + } + } + + if (attr_mask & IBV_QP_SQ_PSN) + qp->sq_psn = attr->sq_psn; + if (attr_mask & IBV_QP_PATH_MTU) + qp->mtu = (0x80 << attr->path_mtu); + } + + return rc; +} + +int bnxt_re_query_qp(struct ibv_qp *ibvqp, struct ibv_qp_attr *attr, + int attr_mask, struct ibv_qp_init_attr *init_attr) +{ + struct ibv_query_qp cmd; + struct bnxt_re_qp *qp = to_bnxt_re_qp(ibvqp); + int rc; + + rc = ibv_cmd_query_qp(ibvqp, attr, attr_mask, init_attr, + &cmd, sizeof(cmd)); + if (!rc) + qp->qpst = ibvqp->state; + + return rc; +} + +int bnxt_re_destroy_qp(struct ibv_qp *ibvqp) +{ + struct bnxt_re_qp *qp = to_bnxt_re_qp(ibvqp); + int status; + + status = ibv_cmd_destroy_qp(ibvqp); + if (status) + return status; + + bnxt_re_cleanup_cq(qp, qp->rcq); + bnxt_re_cleanup_cq(qp, qp->scq); + bnxt_re_free_queues(qp); + bnxt_re_free_queue_ptr(qp); + free(qp); + + return 0; +} + +static inline uint8_t bnxt_re_set_hdr_flags(struct bnxt_re_bsqe *hdr, + uint32_t send_flags, uint8_t sqsig) +{ + uint8_t is_inline = false; + uint32_t hdrval = 0; + + if (send_flags & IBV_SEND_SIGNALED || sqsig) + hdrval |= ((BNXT_RE_WR_FLAGS_SIGNALED & BNXT_RE_HDR_FLAGS_MASK) + << BNXT_RE_HDR_FLAGS_SHIFT); + if (send_flags & IBV_SEND_FENCE) + /*TODO: See when RD fence can be used. 
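+		 * Until then IBV_SEND_FENCE is always encoded with
+		 * BNXT_RE_WR_FLAGS_UC_FENCE below.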
*/ + hdrval |= ((BNXT_RE_WR_FLAGS_UC_FENCE & BNXT_RE_HDR_FLAGS_MASK) + << BNXT_RE_HDR_FLAGS_SHIFT); + if (send_flags & IBV_SEND_SOLICITED) + hdrval |= ((BNXT_RE_WR_FLAGS_SE & BNXT_RE_HDR_FLAGS_MASK) + << BNXT_RE_HDR_FLAGS_SHIFT); + + if (send_flags & IBV_SEND_INLINE) { + hdrval |= ((BNXT_RE_WR_FLAGS_INLINE & BNXT_RE_HDR_FLAGS_MASK) + << BNXT_RE_HDR_FLAGS_SHIFT); + is_inline = true; + } + hdr->rsv_ws_fl_wt = htole32(hdrval); + + return is_inline; +} + +static int bnxt_re_build_sge(struct bnxt_re_sge *sge, struct ibv_sge *sg_list, + uint32_t num_sge, uint8_t is_inline) { + int indx, length = 0; + void *dst; + + if (!num_sge) { + memset(sge, 0, sizeof(*sge)); + return 0; + } + + if (is_inline) { + dst = sge; + for (indx = 0; indx < num_sge; indx++) { + length += sg_list[indx].length; + if (length > BNXT_RE_MAX_INLINE_SIZE) + return -ENOMEM; + memcpy(dst, (void *)(uintptr_t)sg_list[indx].addr, + sg_list[indx].length); + dst = dst + sg_list[indx].length; + } + } else { + for (indx = 0; indx < num_sge; indx++) { + sge[indx].pa = htole64(sg_list[indx].addr); + sge[indx].lkey = htole32(sg_list[indx].lkey); + sge[indx].length = htole32(sg_list[indx].length); + length += sg_list[indx].length; + } + } + + return length; +} + +static void bnxt_re_fill_psns(struct bnxt_re_qp *qp, struct bnxt_re_wrid *wrid, + uint8_t opcode, uint32_t len) +{ + uint32_t opc_spsn = 0, flg_npsn = 0; + struct bnxt_re_psns_ext *psns_ext; + uint32_t pkt_cnt = 0, nxt_psn = 0; + struct bnxt_re_psns *psns; + + psns = wrid->psns; + psns_ext = wrid->psns_ext; + + if (qp->qptyp == IBV_QPT_RC) { + opc_spsn = qp->sq_psn & BNXT_RE_PSNS_SPSN_MASK; + pkt_cnt = (len / qp->mtu); + if (len % qp->mtu) + pkt_cnt++; + if (len == 0) + pkt_cnt = 1; + nxt_psn = ((qp->sq_psn + pkt_cnt) & BNXT_RE_PSNS_NPSN_MASK); + flg_npsn = nxt_psn; + qp->sq_psn = nxt_psn; + } + opcode = bnxt_re_ibv_wr_to_wc_opcd(opcode); + opc_spsn |= (((uint32_t)opcode & BNXT_RE_PSNS_OPCD_MASK) << + BNXT_RE_PSNS_OPCD_SHIFT); + memset(psns, 0, sizeof(*psns)); + psns->opc_spsn = htole32(opc_spsn); + psns->flg_npsn = htole32(flg_npsn); + if (bnxt_re_is_chip_gen_p5(qp->cctx)) + psns_ext->st_slot_idx = 0; +} + +static void bnxt_re_fill_wrid(struct bnxt_re_wrid *wrid, struct ibv_send_wr *wr, + uint32_t len, uint8_t sqsig) +{ + wrid->wrid = wr->wr_id; + wrid->bytes = len; + wrid->sig = 0; + if (wr->send_flags & IBV_SEND_SIGNALED || sqsig) + wrid->sig = IBV_SEND_SIGNALED; +} + +static int bnxt_re_build_send_sqe(struct bnxt_re_qp *qp, void *wqe, + struct ibv_send_wr *wr, uint8_t is_inline) +{ + struct bnxt_re_bsqe *hdr = wqe; + struct bnxt_re_send *sqe = ((void *)wqe + sizeof(struct bnxt_re_bsqe)); + struct bnxt_re_sge *sge = ((void *)wqe + bnxt_re_get_sqe_hdr_sz()); + uint32_t wrlen, hdrval = 0; + int len; + uint8_t opcode, qesize; + + len = bnxt_re_build_sge(sge, wr->sg_list, wr->num_sge, is_inline); + if (len < 0) + return len; + sqe->length = htole32(len); + + /* Fill Header */ + opcode = bnxt_re_ibv_to_bnxt_wr_opcd(wr->opcode); + if (opcode == BNXT_RE_WR_OPCD_INVAL) + return -EINVAL; + hdrval = (opcode & BNXT_RE_HDR_WT_MASK); + + if (is_inline) { + wrlen = get_aligned(len, 16); + qesize = wrlen >> 4; + } else { + qesize = wr->num_sge; + } + /* HW requires wqe size has room for atleast one sge even if none was + * supplied by application + */ + if (!wr->num_sge) + qesize++; + qesize += (bnxt_re_get_sqe_hdr_sz() >> 4); + hdrval |= (qesize & BNXT_RE_HDR_WS_MASK) << BNXT_RE_HDR_WS_SHIFT; + hdr->rsv_ws_fl_wt |= htole32(hdrval); + return len; +} + +static int bnxt_re_build_ud_sqe(struct 
bnxt_re_qp *qp, void *wqe, + struct ibv_send_wr *wr, uint8_t is_inline) +{ + struct bnxt_re_send *sqe = ((void *)wqe + sizeof(struct bnxt_re_bsqe)); + struct bnxt_re_ah *ah; + int len; + + len = bnxt_re_build_send_sqe(qp, wqe, wr, is_inline); + sqe->qkey = htole32(wr->wr.ud.remote_qkey); + sqe->dst_qp = htole32(wr->wr.ud.remote_qpn); + if (!wr->wr.ud.ah) { + len = -EINVAL; + goto bail; + } + ah = to_bnxt_re_ah(wr->wr.ud.ah); + sqe->avid = htole32(ah->avid & 0xFFFFF); +bail: + return len; +} + +static int bnxt_re_build_rdma_sqe(struct bnxt_re_qp *qp, void *wqe, + struct ibv_send_wr *wr, uint8_t is_inline) +{ + struct bnxt_re_rdma *sqe = ((void *)wqe + sizeof(struct bnxt_re_bsqe)); + int len; + + len = bnxt_re_build_send_sqe(qp, wqe, wr, is_inline); + sqe->rva = htole64(wr->wr.rdma.remote_addr); + sqe->rkey = htole32(wr->wr.rdma.rkey); + + return len; +} + +static int bnxt_re_build_cns_sqe(struct bnxt_re_qp *qp, void *wqe, + struct ibv_send_wr *wr) +{ + struct bnxt_re_bsqe *hdr = wqe; + struct bnxt_re_atomic *sqe = ((void *)wqe + + sizeof(struct bnxt_re_bsqe)); + int len; + + len = bnxt_re_build_send_sqe(qp, wqe, wr, false); + hdr->key_immd = htole32(wr->wr.atomic.rkey); + sqe->rva = htole64(wr->wr.atomic.remote_addr); + sqe->cmp_dt = htole64(wr->wr.atomic.compare_add); + sqe->swp_dt = htole64(wr->wr.atomic.swap); + + return len; +} + +static int bnxt_re_build_fna_sqe(struct bnxt_re_qp *qp, void *wqe, + struct ibv_send_wr *wr) +{ + struct bnxt_re_bsqe *hdr = wqe; + struct bnxt_re_atomic *sqe = ((void *)wqe + + sizeof(struct bnxt_re_bsqe)); + int len; + + len = bnxt_re_build_send_sqe(qp, wqe, wr, false); + hdr->key_immd = htole32(wr->wr.atomic.rkey); + sqe->rva = htole64(wr->wr.atomic.remote_addr); + sqe->cmp_dt = htole64(wr->wr.atomic.compare_add); + + return len; +} + +int bnxt_re_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad) +{ + struct bnxt_re_qp *qp = to_bnxt_re_qp(ibvqp); + struct bnxt_re_queue *sq = qp->sqq; + struct bnxt_re_wrid *wrid; + uint8_t is_inline = false; + struct bnxt_re_bsqe *hdr; + int ret = 0, bytes = 0; + bool ring_db = false; + void *sqe; + + pthread_spin_lock(&sq->qlock); + while (wr) { + if ((qp->qpst != IBV_QPS_RTS) && (qp->qpst != IBV_QPS_SQD)) { + *bad = wr; + ret = EINVAL; + goto bad_wr; + } + + if ((qp->qptyp == IBV_QPT_UD) && + (wr->opcode != IBV_WR_SEND && + wr->opcode != IBV_WR_SEND_WITH_IMM)) { + *bad = wr; + ret = EINVAL; + goto bad_wr; + } + + if (bnxt_re_is_que_full(sq) || + wr->num_sge > qp->cap.max_ssge) { + *bad = wr; + ret = ENOMEM; + goto bad_wr; + } + + sqe = (void *)(sq->va + (sq->tail * sq->stride)); + wrid = &qp->swrid[sq->tail]; + + memset(sqe, 0, bnxt_re_get_sqe_sz()); + hdr = sqe; + is_inline = bnxt_re_set_hdr_flags(hdr, wr->send_flags, + qp->cap.sqsig); + switch (wr->opcode) { + case IBV_WR_SEND_WITH_IMM: + /* Since our h/w is LE and user supplies raw-data in + * BE format. Swapping on incoming data is needed. + * On a BE platform htole32 will do the swap while on + * LE platform be32toh will do the job. 
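+			 * be32toh() first brings imm_data to host order and
+			 * htole32() then emits it little-endian, so the pair
+			 * is correct on both BE and LE hosts.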
+ */ + hdr->key_immd = htole32(be32toh(wr->imm_data)); + SWITCH_FALLTHROUGH; + case IBV_WR_SEND: + if (qp->qptyp == IBV_QPT_UD) + bytes = bnxt_re_build_ud_sqe(qp, sqe, wr, + is_inline); + else + bytes = bnxt_re_build_send_sqe(qp, sqe, wr, + is_inline); + break; + case IBV_WR_RDMA_WRITE_WITH_IMM: + hdr->key_immd = htole32(be32toh(wr->imm_data)); + SWITCH_FALLTHROUGH; + case IBV_WR_RDMA_WRITE: + bytes = bnxt_re_build_rdma_sqe(qp, sqe, wr, is_inline); + break; + case IBV_WR_RDMA_READ: + bytes = bnxt_re_build_rdma_sqe(qp, sqe, wr, false); + break; + case IBV_WR_ATOMIC_CMP_AND_SWP: + bytes = bnxt_re_build_cns_sqe(qp, sqe, wr); + break; + case IBV_WR_ATOMIC_FETCH_AND_ADD: + bytes = bnxt_re_build_fna_sqe(qp, sqe, wr); + break; + default: + bytes = -EINVAL; + break; + } + + if (bytes < 0) { + ret = (bytes == -EINVAL) ? EINVAL : ENOMEM; + *bad = wr; + break; + } + + bnxt_re_fill_wrid(wrid, wr, bytes, qp->cap.sqsig); + bnxt_re_fill_psns(qp, wrid, wr->opcode, bytes); + bnxt_re_incr_tail(sq); + qp->wqe_cnt++; + wr = wr->next; + ring_db = true; + + if (qp->wqe_cnt == BNXT_RE_UD_QP_HW_STALL && + qp->qptyp == IBV_QPT_UD) { + /* Move RTS to RTS since it is time. */ + struct ibv_qp_attr attr; + int attr_mask; + + attr_mask = IBV_QP_STATE; + attr.qp_state = IBV_QPS_RTS; + bnxt_re_modify_qp(&qp->ibvqp, &attr, attr_mask); + qp->wqe_cnt = 0; + } + } + +bad_wr: + if (ring_db) + bnxt_re_ring_sq_db(qp); + + pthread_spin_unlock(&sq->qlock); + return ret; +} + +static int bnxt_re_build_rqe(struct bnxt_re_qp *qp, struct ibv_recv_wr *wr, + void *rqe) +{ + struct bnxt_re_brqe *hdr = rqe; + struct bnxt_re_rqe *rwr; + struct bnxt_re_sge *sge; + struct bnxt_re_wrid *wrid; + int wqe_sz, len; + uint32_t hdrval; + + rwr = (rqe + sizeof(struct bnxt_re_brqe)); + sge = (rqe + bnxt_re_get_rqe_hdr_sz()); + wrid = &qp->rwrid[qp->rqq->tail]; + + len = bnxt_re_build_sge(sge, wr->sg_list, wr->num_sge, false); + wqe_sz = wr->num_sge + (bnxt_re_get_rqe_hdr_sz() >> 4); /* 16B align */ + /* HW requires wqe size has room for atleast one sge even if none was + * supplied by application + */ + if (!wr->num_sge) + wqe_sz++; + hdrval = BNXT_RE_WR_OPCD_RECV; + hdrval |= ((wqe_sz & BNXT_RE_HDR_WS_MASK) << BNXT_RE_HDR_WS_SHIFT); + hdr->rsv_ws_fl_wt = htole32(hdrval); + rwr->wrid = htole32(qp->rqq->tail); + + /* Fill wrid */ + wrid->wrid = wr->wr_id; + wrid->bytes = len; /* N.A. for RQE */ + wrid->sig = 0; /* N.A. 
for RQE */ + + return len; +} + +int bnxt_re_post_recv(struct ibv_qp *ibvqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad) +{ + struct bnxt_re_qp *qp = to_bnxt_re_qp(ibvqp); + struct bnxt_re_queue *rq = qp->rqq; + void *rqe; + int ret; + + pthread_spin_lock(&rq->qlock); + while (wr) { + /* check QP state, abort if it is ERR or RST */ + if (qp->qpst == IBV_QPS_RESET || qp->qpst == IBV_QPS_ERR) { + *bad = wr; + pthread_spin_unlock(&rq->qlock); + return EINVAL; + } + + if (bnxt_re_is_que_full(rq) || + wr->num_sge > qp->cap.max_rsge) { + pthread_spin_unlock(&rq->qlock); + *bad = wr; + return ENOMEM; + } + + rqe = (void *)(rq->va + (rq->tail * rq->stride)); + memset(rqe, 0, bnxt_re_get_rqe_sz()); + ret = bnxt_re_build_rqe(qp, wr, rqe); + if (ret < 0) { + pthread_spin_unlock(&rq->qlock); + *bad = wr; + return ENOMEM; + } + + bnxt_re_incr_tail(rq); + wr = wr->next; + bnxt_re_ring_rq_db(qp); + } + pthread_spin_unlock(&rq->qlock); + + return 0; +} + +static void bnxt_re_srq_free_queue_ptr(struct bnxt_re_srq *srq) +{ + free(srq->srqq); + free(srq); +} + +static struct bnxt_re_srq *bnxt_re_srq_alloc_queue_ptr(void) +{ + struct bnxt_re_srq *srq; + + srq = calloc(1, sizeof(struct bnxt_re_srq)); + if (!srq) + return NULL; + + srq->srqq = calloc(1, sizeof(struct bnxt_re_queue)); + if (!srq->srqq) { + free(srq); + return NULL; + } + + return srq; +} + +static void bnxt_re_srq_free_queue(struct bnxt_re_srq *srq) +{ + free(srq->srwrid); + pthread_spin_destroy(&srq->srqq->qlock); + bnxt_re_free_aligned(srq->srqq); +} + +static int bnxt_re_srq_alloc_queue(struct bnxt_re_srq *srq, + struct ibv_srq_init_attr *attr, + uint32_t pg_size) +{ + struct bnxt_re_queue *que; + int ret, idx; + + que = srq->srqq; + que->depth = roundup_pow_of_two(attr->attr.max_wr + 1); + que->diff = que->depth - attr->attr.max_wr; + que->stride = bnxt_re_get_srqe_sz(); + ret = bnxt_re_alloc_aligned(que, pg_size); + if (ret) + goto bail; + pthread_spin_init(&que->qlock, PTHREAD_PROCESS_PRIVATE); + /* For SRQ only bnxt_re_wrid.wrid is used. */ + srq->srwrid = calloc(que->depth, sizeof(struct bnxt_re_wrid)); + if (!srq->srwrid) { + ret = -ENOMEM; + goto bail; + } + + srq->start_idx = 0; + srq->last_idx = que->depth - 1; + for (idx = 0; idx < que->depth; idx++) + srq->srwrid[idx].next_idx = idx + 1; + srq->srwrid[srq->last_idx].next_idx = -1; + + /*TODO: update actual max depth. 
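+	 * que->depth was rounded up to a power of two above, but the
+	 * rounded value is not reported back through attr here.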
*/ + return 0; +bail: + bnxt_re_srq_free_queue(srq); + return ret; +} + +struct ibv_srq *bnxt_re_create_srq(struct ibv_pd *ibvpd, + struct ibv_srq_init_attr *attr) +{ + struct bnxt_re_srq *srq; + struct ubnxt_re_srq req; + struct ubnxt_re_srq_resp resp; + struct bnxt_re_context *cntx = to_bnxt_re_context(ibvpd->context); + struct bnxt_re_dev *dev = to_bnxt_re_dev(cntx->ibvctx.context.device); + int ret; + + /*TODO: Check max limit on queue depth and sge.*/ + srq = bnxt_re_srq_alloc_queue_ptr(); + if (!srq) + goto fail; + + if (bnxt_re_srq_alloc_queue(srq, attr, dev->pg_size)) + goto fail; + + req.srqva = (uintptr_t)srq->srqq->va; + req.srq_handle = (uintptr_t)srq; + ret = ibv_cmd_create_srq(ibvpd, &srq->ibvsrq, attr, + &req.ibv_cmd, sizeof(req), + &resp.ibv_resp, sizeof(resp)); + if (ret) + goto fail; + + srq->srqid = resp.srqid; + srq->udpi = &cntx->udpi; + srq->cap.max_wr = srq->srqq->depth; + srq->cap.max_sge = attr->attr.max_sge; + srq->cap.srq_limit = attr->attr.srq_limit; + srq->arm_req = false; + + return &srq->ibvsrq; +fail: + bnxt_re_srq_free_queue_ptr(srq); + return NULL; +} + +int bnxt_re_modify_srq(struct ibv_srq *ibvsrq, struct ibv_srq_attr *attr, + int attr_mask) +{ + struct bnxt_re_srq *srq = to_bnxt_re_srq(ibvsrq); + struct ibv_modify_srq cmd; + int status = 0; + + status = ibv_cmd_modify_srq(ibvsrq, attr, attr_mask, + &cmd, sizeof(cmd)); + if (!status && ((attr_mask & IBV_SRQ_LIMIT) && + (srq->cap.srq_limit != attr->srq_limit))) { + srq->cap.srq_limit = attr->srq_limit; + } + srq->arm_req = true; + return status; +} + +int bnxt_re_destroy_srq(struct ibv_srq *ibvsrq) +{ + struct bnxt_re_srq *srq = to_bnxt_re_srq(ibvsrq); + int ret; + + ret = ibv_cmd_destroy_srq(ibvsrq); + if (ret) + return ret; + bnxt_re_srq_free_queue(srq); + bnxt_re_srq_free_queue_ptr(srq); + + return 0; +} + +int bnxt_re_query_srq(struct ibv_srq *ibvsrq, struct ibv_srq_attr *attr) +{ + struct ibv_query_srq cmd; + + return ibv_cmd_query_srq(ibvsrq, attr, &cmd, sizeof(cmd)); +} + +static int bnxt_re_build_srqe(struct bnxt_re_srq *srq, + struct ibv_recv_wr *wr, void *srqe) +{ + struct bnxt_re_brqe *hdr = srqe; + struct bnxt_re_rqe *rwr; + struct bnxt_re_sge *sge; + struct bnxt_re_wrid *wrid; + int wqe_sz, len, next; + uint32_t hdrval = 0; + + rwr = (srqe + sizeof(struct bnxt_re_brqe)); + sge = (srqe + bnxt_re_get_srqe_hdr_sz()); + next = srq->start_idx; + wrid = &srq->srwrid[next]; + + len = bnxt_re_build_sge(sge, wr->sg_list, wr->num_sge, false); + hdrval = BNXT_RE_WR_OPCD_RECV; + wqe_sz = wr->num_sge + (bnxt_re_get_srqe_hdr_sz() >> 4); /* 16B align */ + hdrval |= ((wqe_sz & BNXT_RE_HDR_WS_MASK) << BNXT_RE_HDR_WS_SHIFT); + hdr->rsv_ws_fl_wt = htole32(hdrval); + rwr->wrid = htole32((uint32_t)next); + + /* Fill wrid */ + wrid->wrid = wr->wr_id; + wrid->bytes = len; /* N.A. for RQE */ + wrid->sig = 0; /* N.A. for RQE */ + + return len; +} + +int bnxt_re_post_srq_recv(struct ibv_srq *ibvsrq, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad) +{ + struct bnxt_re_srq *srq = to_bnxt_re_srq(ibvsrq); + struct bnxt_re_queue *rq = srq->srqq; + void *srqe; + int ret, count = 0; + + pthread_spin_lock(&rq->qlock); + count = rq->tail > rq->head ? 
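+	/*
+	 * Ring occupancy: plain difference while tail has not wrapped
+	 * past head, otherwise add the wrap-around.
+	 */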
rq->tail - rq->head : + rq->depth - rq->head + rq->tail; + while (wr) { + if (srq->start_idx == srq->last_idx || + wr->num_sge > srq->cap.max_sge) { + *bad = wr; + pthread_spin_unlock(&rq->qlock); + return ENOMEM; + } + + srqe = (void *) (rq->va + (rq->tail * rq->stride)); + memset(srqe, 0, bnxt_re_get_srqe_sz()); + ret = bnxt_re_build_srqe(srq, wr, srqe); + if (ret < 0) { + pthread_spin_unlock(&rq->qlock); + *bad = wr; + return ENOMEM; + } + + srq->start_idx = srq->srwrid[srq->start_idx].next_idx; + bnxt_re_incr_tail(rq); + wr = wr->next; + bnxt_re_ring_srq_db(srq); + count++; + if (srq->arm_req == true && count > srq->cap.srq_limit) { + srq->arm_req = false; + bnxt_re_ring_srq_arm(srq); + } + } + pthread_spin_unlock(&rq->qlock); + + return 0; +} + +struct ibv_ah *bnxt_re_create_ah(struct ibv_pd *ibvpd, struct ibv_ah_attr *attr) +{ + struct bnxt_re_context *uctx; + struct bnxt_re_ah *ah; + struct ib_uverbs_create_ah_resp resp; + int status; + + uctx = to_bnxt_re_context(ibvpd->context); + + ah = calloc(1, sizeof(*ah)); + if (!ah) + goto failed; + + pthread_mutex_lock(&uctx->shlock); + memset(&resp, 0, sizeof(resp)); + status = ibv_cmd_create_ah(ibvpd, &ah->ibvah, attr, + &resp, sizeof(resp)); + if (status) { + pthread_mutex_unlock(&uctx->shlock); + free(ah); + goto failed; + } + /* read AV ID now. */ + ah->avid = *(uint32_t *)(uctx->shpg + BNXT_RE_AVID_OFFT); + pthread_mutex_unlock(&uctx->shlock); + + return &ah->ibvah; +failed: + return NULL; +} + +int bnxt_re_destroy_ah(struct ibv_ah *ibvah) +{ + struct bnxt_re_ah *ah; + int status; + + ah = to_bnxt_re_ah(ibvah); + status = ibv_cmd_destroy_ah(ibvah); + if (status) + return status; + free(ah); + + return 0; +} diff --git a/providers/bnxt_re/verbs.h b/providers/bnxt_re/verbs.h new file mode 100644 index 0000000..2e99488 --- /dev/null +++ b/providers/bnxt_re/verbs.h @@ -0,0 +1,100 @@ +/* + * Broadcom NetXtreme-E User Space RoCE driver + * + * Copyright (c) 2015-2017, Broadcom. All rights reserved. The term + * Broadcom refers to Broadcom Limited and/or its subsidiaries. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Description: Internal IB-verbs function declaration + */ + +#ifndef __VERBS_H__ +#define __VERBS_H__ + +#include <assert.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <signal.h> +#include <errno.h> +#include <pthread.h> +#include <sys/mman.h> +#include <netinet/in.h> +#include <unistd.h> + +#include <infiniband/driver.h> +#include <infiniband/verbs.h> + +int bnxt_re_query_device(struct ibv_context *uctx, + struct ibv_device_attr *attr); +int bnxt_re_query_port(struct ibv_context *uctx, uint8_t port, + struct ibv_port_attr *attr); +struct ibv_pd *bnxt_re_alloc_pd(struct ibv_context *uctx); +int bnxt_re_free_pd(struct ibv_pd *ibvpd); +struct ibv_mr *bnxt_re_reg_mr(struct ibv_pd *ibvpd, void *buf, size_t len, + uint64_t hca_va, int ibv_access_flags); +int bnxt_re_dereg_mr(struct verbs_mr *vmr); + +struct ibv_cq *bnxt_re_create_cq(struct ibv_context *uctx, int ncqe, + struct ibv_comp_channel *ch, int vec); +int bnxt_re_resize_cq(struct ibv_cq *ibvcq, int ncqe); +int bnxt_re_destroy_cq(struct ibv_cq *ibvcq); +int bnxt_re_poll_cq(struct ibv_cq *ibvcq, int nwc, struct ibv_wc *wc); +void bnxt_re_cq_event(struct ibv_cq *ibvcq); +int bnxt_re_arm_cq(struct ibv_cq *ibvcq, int flags); + +struct ibv_qp *bnxt_re_create_qp(struct ibv_pd *ibvpd, + struct ibv_qp_init_attr *attr); +int bnxt_re_modify_qp(struct ibv_qp *ibvqp, struct ibv_qp_attr *attr, + int ibv_qp_attr_mask); +int bnxt_re_query_qp(struct ibv_qp *ibvqp, struct ibv_qp_attr *attr, + int attr_mask, struct ibv_qp_init_attr *init_attr); +int bnxt_re_destroy_qp(struct ibv_qp *ibvqp); +int bnxt_re_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad); +int bnxt_re_post_recv(struct ibv_qp *ibvqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad); + +struct ibv_srq *bnxt_re_create_srq(struct ibv_pd *ibvpd, + struct ibv_srq_init_attr *attr); +int bnxt_re_modify_srq(struct ibv_srq *ibvsrq, + struct ibv_srq_attr *attr, int mask); +int bnxt_re_destroy_srq(struct ibv_srq *ibvsrq); +int bnxt_re_query_srq(struct ibv_srq *ibvsrq, struct ibv_srq_attr *attr); +int bnxt_re_post_srq_recv(struct ibv_srq *ibvsrq, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad); + +struct ibv_ah *bnxt_re_create_ah(struct ibv_pd *ibvpd, + struct ibv_ah_attr *attr); +int bnxt_re_destroy_ah(struct ibv_ah *ibvah); + +#endif /* __BNXT_RE_VERBS_H__ */ diff --git a/providers/cxgb4/CMakeLists.txt b/providers/cxgb4/CMakeLists.txt new file mode 100644 index 0000000..a9b6546 --- /dev/null +++ b/providers/cxgb4/CMakeLists.txt @@ -0,0 +1,8 @@ +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${NO_STRICT_ALIASING_FLAGS}") + +rdma_provider(cxgb4 + cq.c + dev.c + qp.c + verbs.c +) diff --git a/providers/cxgb4/cq.c b/providers/cxgb4/cq.c new file mode 100644 index 0000000..81baddf --- /dev/null +++ b/providers/cxgb4/cq.c @@ -0,0 +1,934 @@ +/* + * Copyright (c) 2006-2016 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include <config.h> + +#include <stdio.h> +#include <syslog.h> +#include <pthread.h> +#include <sys/errno.h> +#include <infiniband/opcode.h> +#include <util/compiler.h> +#include "libcxgb4.h" +#include "cxgb4-abi.h" + +static void insert_recv_cqe(struct t4_wq *wq, struct t4_cq *cq, u32 srqidx) +{ + union t4_cqe cqe = {}; + __be64 *gen = GEN_ADDR(&cqe); + + PDBG("%s wq %p cq %p sw_cidx %u sw_pidx %u\n", __func__, + wq, cq, cq->sw_cidx, cq->sw_pidx); + cqe.com.header = htobe32(V_CQE_STATUS(T4_ERR_SWFLUSH) | + V_CQE_OPCODE(FW_RI_SEND) | + V_CQE_TYPE(0) | + V_CQE_SWCQE(1) | + V_CQE_QPID(wq->sq.qid)); + *gen = htobe64(V_CQE_GENBIT((u64)cq->gen)); + if (srqidx) + cqe.b64.u.srcqe.abs_rqe_idx = htobe32(srqidx); + + memcpy(Q_ENTRY(cq->sw_queue, cq->sw_pidx), &cqe, CQE_SIZE(&cqe)); + t4_swcq_produce(cq); +} + +int c4iw_flush_rq(struct t4_wq *wq, struct t4_cq *cq, int count) +{ + int flushed = 0; + int in_use = wq->rq.in_use - count; + + BUG_ON(in_use < 0); + PDBG("%s wq %p cq %p rq.in_use %u skip count %u\n", __func__, + wq, cq, wq->rq.in_use, count); + while (in_use--) { + insert_recv_cqe(wq, cq, 0); + flushed++; + } + return flushed; +} + +static void insert_sq_cqe(struct t4_wq *wq, struct t4_cq *cq, + struct t4_swsqe *swcqe) +{ + union t4_cqe cqe = {}; + __be64 *gen = GEN_ADDR(&cqe); + + PDBG("%s wq %p cq %p sw_cidx %u sw_pidx %u\n", __func__, + wq, cq, cq->sw_cidx, cq->sw_pidx); + cqe.com.header = htobe32(V_CQE_STATUS(T4_ERR_SWFLUSH) | + V_CQE_OPCODE(swcqe->opcode) | + V_CQE_TYPE(1) | + V_CQE_SWCQE(1) | + V_CQE_QPID(wq->sq.qid)); + CQE_WRID_SQ_IDX(&cqe.com) = swcqe->idx; + *gen = htobe64(V_CQE_GENBIT((u64)cq->gen)); + memcpy(Q_ENTRY(cq->sw_queue, cq->sw_pidx), &cqe, CQE_SIZE(&cqe)); + t4_swcq_produce(cq); +} + +static void advance_oldest_read(struct t4_wq *wq); + +void c4iw_flush_sq(struct c4iw_qp *qhp) +{ + unsigned short flushed = 0; + struct t4_wq *wq = &qhp->wq; + struct c4iw_cq *chp = to_c4iw_cq(qhp->ibv_qp.send_cq); + struct t4_cq *cq = &chp->cq; + int idx; + struct t4_swsqe *swsqe; + + if (wq->sq.flush_cidx == -1) + wq->sq.flush_cidx = wq->sq.cidx; + idx = wq->sq.flush_cidx; + BUG_ON(idx >= wq->sq.size); + while (idx != wq->sq.pidx) { + swsqe = &wq->sq.sw_sq[idx]; + BUG_ON(swsqe->flushed); + swsqe->flushed = 1; + insert_sq_cqe(wq, cq, swsqe); + if 
(wq->sq.oldest_read == swsqe) { + BUG_ON(swsqe->opcode != FW_RI_READ_REQ); + advance_oldest_read(wq); + } + flushed++; + if (++idx == wq->sq.size) + idx = 0; + } + wq->sq.flush_cidx += flushed; + if (wq->sq.flush_cidx >= wq->sq.size) + wq->sq.flush_cidx -= wq->sq.size; +} + +static void flush_completed_wrs(struct t4_wq *wq, struct t4_cq *cq) +{ + struct t4_swsqe *swsqe; + unsigned short cidx; + + if (wq->sq.flush_cidx == -1) + wq->sq.flush_cidx = wq->sq.cidx; + cidx = wq->sq.flush_cidx; + BUG_ON(cidx >= wq->sq.size); + + while (cidx != wq->sq.pidx) { + swsqe = &wq->sq.sw_sq[cidx]; + if (!swsqe->signaled) { + if (++cidx == wq->sq.size) + cidx = 0; + } else if (swsqe->complete) { + + BUG_ON(swsqe->flushed); + + /* + * Insert this completed cqe into the swcq. + */ + PDBG("%s moving cqe into swcq sq idx %u cq idx %u\n", + __func__, cidx, cq->sw_pidx); + + swsqe->cqe.com.header |= htobe32(V_CQE_SWCQE(1)); + memcpy(Q_ENTRY(cq->sw_queue, cq->sw_pidx), + &swsqe->cqe, CQE_SIZE(&swsqe->cqe)); + t4_swcq_produce(cq); + swsqe->flushed = 1; + if (++cidx == wq->sq.size) + cidx = 0; + wq->sq.flush_cidx = cidx; + } else + break; + } +} + +static void create_read_req_cqe(struct t4_wq *wq, union t4_cqe *hw_cqe, + union t4_cqe *read_cqe) +{ + __be64 *gen = GEN_ADDR(read_cqe); + + memset(read_cqe, 0, sizeof(*read_cqe)); + read_cqe->com.u.scqe.cidx = wq->sq.oldest_read->idx; + read_cqe->com.len = be32toh(wq->sq.oldest_read->read_len); + read_cqe->com.header = htobe32(V_CQE_QPID(CQE_QPID(&hw_cqe->com)) | + V_CQE_SWCQE(SW_CQE(&hw_cqe->com)) | + V_CQE_OPCODE(FW_RI_READ_REQ) | + V_CQE_TYPE(1)); + *gen = GEN_BIT(hw_cqe); +} + +static void advance_oldest_read(struct t4_wq *wq) +{ + + u32 rptr = wq->sq.oldest_read - wq->sq.sw_sq + 1; + + if (rptr == wq->sq.size) + rptr = 0; + while (rptr != wq->sq.pidx) { + wq->sq.oldest_read = &wq->sq.sw_sq[rptr]; + + if (wq->sq.oldest_read->opcode == FW_RI_READ_REQ) + return; + if (++rptr == wq->sq.size) + rptr = 0; + } + wq->sq.oldest_read = NULL; +} + +/* + * Move all CQEs from the HWCQ into the SWCQ. + * Deal with out-of-order and/or completions that complete + * prior unsignalled WRs. + */ +void c4iw_flush_hw_cq(struct c4iw_cq *chp, struct c4iw_qp *flush_qhp) +{ + union t4_cqe *hw_cqe, *swcqe, read_cqe; + struct t4_cqe_common *com; + struct c4iw_qp *qhp; + struct t4_swsqe *swsqe; + int ret; + + PDBG("%s cqid 0x%x\n", __func__, chp->cq.cqid); + ret = t4_next_hw_cqe(&chp->cq, &hw_cqe); + com = &hw_cqe->com; + + /* + * This logic is similar to poll_cq(), but not quite the same + * unfortunately. Need to move pertinent HW CQEs to the SW CQ but + * also do any translation magic that poll_cq() normally does. + */ + while (!ret) { + qhp = get_qhp(chp->rhp, CQE_QPID(com)); + + /* + * drop CQEs with no associated QP + */ + if (qhp == NULL) + goto next_cqe; + + if (flush_qhp != qhp) { + pthread_spin_lock(&qhp->lock); + + if (qhp->wq.flushed == 1) { + goto next_cqe; + } + } + + if (CQE_OPCODE(com) == FW_RI_TERMINATE) + goto next_cqe; + + if (CQE_OPCODE(com) == FW_RI_READ_RESP) { + + /* + * If we have reached here because of async + * event or other error, and have egress error + * then drop + */ + if (CQE_TYPE(com) == 1) { + syslog(LOG_CRIT, "%s: got egress error in \ + read-response, dropping!\n", __func__); + goto next_cqe; + } + + /* + * drop peer2peer RTR reads. + */ + if (CQE_WRID_STAG(com) == 1) + goto next_cqe; + + /* + * Eat completions for unsignaled read WRs. 
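+			 * An unsignaled read never produces a user-visible
+			 * work completion; just advance the oldest-read
+			 * tracker and drop the HW CQE.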
+ */ + if (!qhp->wq.sq.oldest_read->signaled) { + advance_oldest_read(&qhp->wq); + goto next_cqe; + } + + /* + * Don't write to the HWCQ, create a new read req CQE + * in local memory and move it into the swcq. + */ + create_read_req_cqe(&qhp->wq, hw_cqe, &read_cqe); + hw_cqe = &read_cqe; + com = &hw_cqe->com; + advance_oldest_read(&qhp->wq); + } + + /* if its a SQ completion, then do the magic to move all the + * unsignaled and now in-order completions into the swcq. + */ + if (SQ_TYPE(com)) { + int idx = CQE_WRID_SQ_IDX(com); + + BUG_ON(idx >= qhp->wq.sq.size); + swsqe = &qhp->wq.sq.sw_sq[idx]; + swsqe->cqe = *hw_cqe; + swsqe->complete = 1; + flush_completed_wrs(&qhp->wq, &chp->cq); + } else { + swcqe = Q_ENTRY(chp->cq.sw_queue, chp->cq.sw_pidx); + memcpy(swcqe, hw_cqe, CQE_SIZE(hw_cqe)); + swcqe->com.header |= htobe32(V_CQE_SWCQE(1)); + t4_swcq_produce(&chp->cq); + } +next_cqe: + t4_hwcq_consume(&chp->cq); + ret = t4_next_hw_cqe(&chp->cq, &hw_cqe); + if (qhp && flush_qhp != qhp) + pthread_spin_unlock(&qhp->lock); + } +} + +static int cqe_completes_wr(union t4_cqe *cqe, struct t4_wq *wq) +{ + struct t4_cqe_common *com = &cqe->com; + + if (CQE_OPCODE(com) == FW_RI_TERMINATE) + return 0; + + if ((CQE_OPCODE(com) == FW_RI_RDMA_WRITE) && RQ_TYPE(com)) + return 0; + + if ((CQE_OPCODE(com) == FW_RI_READ_RESP) && SQ_TYPE(com)) + return 0; + + if (CQE_SEND_OPCODE(com) && RQ_TYPE(com) && t4_rq_empty(wq)) + return 0; + return 1; +} + +void c4iw_count_rcqes(struct t4_cq *cq, struct t4_wq *wq, int *count) +{ + struct t4_cqe_common *com; + union t4_cqe *cqe; + u32 ptr; + + *count = 0; + ptr = cq->sw_cidx; + BUG_ON(ptr >= cq->size); + while (ptr != cq->sw_pidx) { + cqe = Q_ENTRY(cq->sw_queue, ptr); + com = &cqe->com; + if (RQ_TYPE(com) && (CQE_OPCODE(com) != FW_RI_READ_RESP) && + (CQE_QPID(com) == wq->sq.qid) && cqe_completes_wr(cqe, wq)) + (*count)++; + if (++ptr == cq->size) + ptr = 0; + } + PDBG("%s cq %p count %d\n", __func__, cq, *count); +} + +static void dump_cqe(void *arg) +{ + u64 *p = arg; + syslog(LOG_NOTICE, "cxgb4 err cqe %016llx %016llx %016llx %016llx\n", + (long long)be64toh(p[0]), + (long long)be64toh(p[1]), + (long long)be64toh(p[2]), + (long long)be64toh(p[3])); + if (is_64b_cqe) + syslog(LOG_NOTICE, + "cxgb4 err cqe %016llx %016llx %016llx %016llx\n", + (long long)be64toh(p[4]), + (long long)be64toh(p[5]), + (long long)be64toh(p[6]), + (long long)be64toh(p[7])); + +} + +static void post_pending_srq_wrs(struct t4_srq *srq) +{ + struct t4_srq_pending_wr *pwr; + u16 idx = 0; + + while (srq->pending_in_use) { + + assert(!srq->sw_rq[srq->pidx].valid); + + pwr = &srq->pending_wrs[srq->pending_cidx]; + srq->sw_rq[srq->pidx].wr_id = pwr->wr_id; + srq->sw_rq[srq->pidx].valid = 1; + + PDBG("%s posting pending cidx %u pidx %u wq_pidx %u in_use %u rq_size %u wr_id %llx\n", + __func__, srq->cidx, srq->pidx, srq->wq_pidx, + srq->in_use, srq->size, (unsigned long long)pwr->wr_id); + + c4iw_copy_wr_to_srq(srq, &pwr->wqe, pwr->len16); + t4_srq_consume_pending_wr(srq); + t4_srq_produce(srq, pwr->len16); + idx += DIV_ROUND_UP(pwr->len16*16, T4_EQ_ENTRY_SIZE); + } + + if (idx) { + t4_ring_srq_db(srq, idx, pwr->len16, &pwr->wqe); + srq->queue[srq->size].status.host_wq_pidx = + srq->wq_pidx; + } +} + +static u64 reap_srq_cqe(union t4_cqe *hw_cqe, struct t4_srq *srq) +{ + int rel_idx = CQE_ABS_RQE_IDX(&hw_cqe->b64) - srq->rqt_abs_idx; + u64 wr_id; + + BUG_ON(rel_idx >= srq->size); + + assert(srq->sw_rq[rel_idx].valid); + srq->sw_rq[rel_idx].valid = 0; + wr_id = srq->sw_rq[rel_idx].wr_id; + + if (rel_idx 
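+	    /*
+	     * In-order CQE: consume it and retire any earlier out-of-order
+	     * entries that are now contiguous; otherwise only mark the
+	     * slot reaped and count it as out-of-order.
+	     */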
== srq->cidx) { + PDBG("%s in order cqe rel_idx %u cidx %u pidx %u wq_pidx %u in_use %u rq_size %u wr_id %llx\n", + __func__, rel_idx, srq->cidx, srq->pidx, + srq->wq_pidx, srq->in_use, srq->size, + (unsigned long long)srq->sw_rq[rel_idx].wr_id); + t4_srq_consume(srq); + while (srq->ooo_count && !srq->sw_rq[srq->cidx].valid) { + PDBG("%s eat ooo cidx %u pidx %u wq_pidx %u in_use %u rq_size %u ooo_count %u wr_id %llx\n", + __func__, srq->cidx, srq->pidx, srq->wq_pidx, + srq->in_use, srq->size, srq->ooo_count, + (unsigned long long)srq->sw_rq[srq->cidx].wr_id); + t4_srq_consume_ooo(srq); + } + if (srq->ooo_count == 0 && srq->pending_in_use) + post_pending_srq_wrs(srq); + } else { + BUG_ON(srq->in_use == 0); + PDBG("%s ooo cqe rel_idx %u cidx %u pidx %u wq_pidx %u in_use %u rq_size %u ooo_count %u wr_id %llx\n", + __func__, rel_idx, srq->cidx, srq->pidx, + srq->wq_pidx, srq->in_use, srq->size, srq->ooo_count, + (unsigned long long)srq->sw_rq[rel_idx].wr_id); + t4_srq_produce_ooo(srq); + } + return wr_id; +} + +/* + * poll_cq + * + * Caller must: + * check the validity of the first CQE, + * supply the wq assicated with the qpid. + * + * credit: cq credit to return to sge. + * cqe_flushed: 1 iff the CQE is flushed. + * cqe: copy of the polled CQE. + * + * return value: + * 0 CQE returned ok. + * -EAGAIN CQE skipped, try again. + * -EOVERFLOW CQ overflow detected. + */ +static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, + union t4_cqe *cqe, u8 *cqe_flushed, + u64 *cookie, u32 *credit, struct t4_srq *srq) +{ + int ret = 0; + union t4_cqe *hw_cqe, read_cqe; + struct t4_cqe_common *com; + + *cqe_flushed = 0; + *credit = 0; + + ret = t4_next_cqe(cq, &hw_cqe); + if (ret) + return ret; + + com = &hw_cqe->com; + + PDBG("%s CQE OVF %u qpid 0x%0x genbit %u type %u status 0x%0x" + " opcode 0x%0x len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x\n", + __func__, + is_64b_cqe ? CQE_OVFBIT(&hw_cqe->b64) : CQE_OVFBIT(&hw_cqe->b32), + CQE_QPID(com), + is_64b_cqe ? CQE_GENBIT(&hw_cqe->b64) : CQE_GENBIT(&hw_cqe->b32), + CQE_TYPE(com), CQE_STATUS(com), CQE_OPCODE(com), CQE_LEN(com), + CQE_WRID_HI(com), CQE_WRID_LOW(com)); + + /* + * skip cqe's not affiliated with a QP. + */ + if (wq == NULL) { + ret = -EAGAIN; + goto skip_cqe; + } + + /* + * skip HW cqe's if wq is already flushed. + */ + if (wq->flushed && !SW_CQE(com)) { + ret = -EAGAIN; + goto skip_cqe; + } + + /* + * Gotta tweak READ completions: + * 1) the cqe doesn't contain the sq_wptr from the wr. + * 2) opcode not reflected from the wr. + * 3) read_len not reflected from the wr. + * 4) T4 HW (for now) inserts target read response failures which + * need to be skipped. + */ + if (CQE_OPCODE(com) == FW_RI_READ_RESP) { + + /* + * If we have reached here because of async + * event or other error, and have egress error + * then drop + */ + if (CQE_TYPE(com) == 1) { + syslog(LOG_CRIT, "%s: got egress error in \ + read-response, dropping!\n", __func__); + if (CQE_STATUS(com)) + t4_set_wq_in_error(wq); + ret = -EAGAIN; + goto skip_cqe; + } + + /* + * If this is an unsolicited read response, then the read + * was generated by the kernel driver as part of peer-2-peer + * connection setup, or a target read response failure. + * So skip the completion. + */ + if (CQE_WRID_STAG(com) == 1) { + if (CQE_STATUS(com)) + t4_set_wq_in_error(wq); + ret = -EAGAIN; + goto skip_cqe; + } + + /* + * Eat completions for unsignaled read WRs. 
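+		 * As in the flush path: advance oldest_read and skip this
+		 * CQE with -EAGAIN instead of reporting it.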
+ */ + if (!wq->sq.oldest_read->signaled) { + advance_oldest_read(wq); + ret = -EAGAIN; + goto skip_cqe; + } + + /* + * Don't write to the HWCQ, so create a new read req CQE + * in local memory. + */ + create_read_req_cqe(wq, hw_cqe, &read_cqe); + hw_cqe = &read_cqe; + com = &hw_cqe->com; + advance_oldest_read(wq); + } + + if (CQE_OPCODE(com) == FW_RI_TERMINATE) { + ret = -EAGAIN; + goto skip_cqe; + } + + if (CQE_STATUS(com) || t4_wq_in_error(wq)) { + *cqe_flushed = (CQE_STATUS(com) == T4_ERR_SWFLUSH); + wq->error = 1; + + if (!*cqe_flushed && CQE_STATUS(com)) + dump_cqe(hw_cqe); + + assert(!((*cqe_flushed == 0) && !SW_CQE(com))); + goto proc_cqe; + } + + /* + * RECV completion. + */ + if (RQ_TYPE(com)) { + + /* + * HW only validates 4 bits of MSN. So we must validate that + * the MSN in the SEND is the next expected MSN. If its not, + * then we complete this with T4_ERR_MSN and mark the wq in + * error. + */ + + if (srq ? t4_srq_empty(srq) : t4_rq_empty(wq)) { + t4_set_wq_in_error(wq); + ret = -EAGAIN; + goto skip_cqe; + } + if (unlikely((CQE_WRID_MSN(com) != (wq->rq.msn)))) { + t4_set_wq_in_error(wq); + hw_cqe->com.header |= htobe32(V_CQE_STATUS(T4_ERR_MSN)); + goto proc_cqe; + } + goto proc_cqe; + } + + /* + * If we get here its a send completion. + * + * Handle out of order completion. These get stuffed + * in the SW SQ. Then the SW SQ is walked to move any + * now in-order completions into the SW CQ. This handles + * 2 cases: + * 1) reaping unsignaled WRs when the first subsequent + * signaled WR is completed. + * 2) out of order read completions. + */ + if (!SW_CQE(com) && (CQE_WRID_SQ_IDX(com) != wq->sq.cidx)) { + struct t4_swsqe *swsqe; + int idx = CQE_WRID_SQ_IDX(com); + + PDBG("%s out of order completion going in sw_sq at idx %u\n", + __func__, idx); + BUG_ON(idx >= wq->sq.size); + swsqe = &wq->sq.sw_sq[idx]; + swsqe->cqe = *hw_cqe; + swsqe->complete = 1; + ret = -EAGAIN; + goto flush_wq; + } + +proc_cqe: + *cqe = *hw_cqe; + + /* + * Reap the associated WR(s) that are freed up with this + * completion. + */ + if (SQ_TYPE(com)) { + int idx = CQE_WRID_SQ_IDX(com); + BUG_ON(idx >= wq->sq.size); + + /* + * Account for any unsignaled completions completed by + * this signaled completion. In this case, cidx points + * to the first unsignaled one, and idx points to the + * signaled one. So adjust in_use based on this delta. + * if this is not completing any unsigned wrs, then the + * delta will be 0. Handle wrapping also! + */ + if (idx < wq->sq.cidx) + wq->sq.in_use -= wq->sq.size + idx - wq->sq.cidx; + else + wq->sq.in_use -= idx - wq->sq.cidx; + BUG_ON(wq->sq.in_use <= 0 || wq->sq.in_use >= wq->sq.size); + + wq->sq.cidx = (u16)idx; + PDBG("%s completing sq idx %u\n", __func__, wq->sq.cidx); + *cookie = wq->sq.sw_sq[wq->sq.cidx].wr_id; + t4_sq_consume(wq); + } else { + if (!srq) { + PDBG("%s completing rq idx %u\n", + __func__, wq->rq.cidx); + BUG_ON(wq->rq.cidx >= wq->rq.size); + *cookie = wq->rq.sw_rq[wq->rq.cidx].wr_id; + BUG_ON(t4_rq_empty(wq)); + t4_rq_consume(wq); + } else + *cookie = reap_srq_cqe(hw_cqe, srq); + wq->rq.msn++; + goto skip_cqe; + } + +flush_wq: + /* + * Flush any completed cqes that are now in-order. 
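+	 * We only get here after stashing an out-of-order SQ completion
+	 * in the SW SQ; sweep the entries that are now in sequence into
+	 * the SW CQ.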
+ */ + flush_completed_wrs(wq, cq); + +skip_cqe: + if (SW_CQE(com)) { + PDBG("%s cq %p cqid 0x%x skip sw cqe cidx %u\n", + __func__, cq, cq->cqid, cq->sw_cidx); + t4_swcq_consume(cq); + } else { + PDBG("%s cq %p cqid 0x%x skip hw cqe cidx %u\n", + __func__, cq, cq->cqid, cq->cidx); + t4_hwcq_consume(cq); + } + return ret; +} + +static void generate_srq_limit_event(struct c4iw_srq *srq) +{ + struct ibv_modify_srq cmd; + struct ibv_srq_attr attr = {}; + int ret; + + srq->armed = 0; + ret = ibv_cmd_modify_srq(&srq->ibv_srq, &attr, 0, &cmd, sizeof(cmd)); + if (ret) + fprintf(stderr, + "Failure to send srq_limit event - ret %d errno %d\n", + ret, errno); +} + +/* + * Get one cq entry from c4iw and map it to openib. + * + * Returns: + * 0 cqe returned + * -ENODATA EMPTY; + * -EAGAIN caller must try again + * any other -errno fatal error + */ +static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ibv_wc *wc) +{ + struct c4iw_qp *qhp = NULL; + struct c4iw_srq *srq = NULL; + struct t4_cqe_common *com; + union t4_cqe uninitialized_var(cqe), *rd_cqe; + struct t4_wq *wq; + u32 credit = 0; + u8 cqe_flushed; + u64 cookie = 0; + int ret; + + ret = t4_next_cqe(&chp->cq, &rd_cqe); + + if (ret) { +#ifdef STALL_DETECTION + if (ret == -ENODATA && stall_to && !chp->dumped) { + struct timeval t; + + gettimeofday(&t, NULL); + if ((t.tv_sec - chp->time.tv_sec) > stall_to) { + dump_state(); + chp->dumped = 1; + } + } +#endif + return ret; + } + +#ifdef STALL_DETECTION + gettimeofday(&chp->time, NULL); +#endif + + qhp = get_qhp(chp->rhp, CQE_QPID(&rd_cqe->com)); + if (!qhp) + wq = NULL; + else { + pthread_spin_lock(&qhp->lock); + wq = &(qhp->wq); + srq = qhp->srq; + if (srq) + pthread_spin_lock(&srq->lock); + } + ret = poll_cq(wq, &(chp->cq), &cqe, &cqe_flushed, &cookie, &credit, + srq ? &srq->wq : NULL); + if (ret) + goto out; + + com = &cqe.com; + INC_STAT(cqe); + wc->wr_id = cookie; + wc->qp_num = qhp->wq.sq.qid; + wc->vendor_err = CQE_STATUS(com); + wc->wc_flags = 0; + + /* + * Simulate a SRQ_LIMIT_REACHED HW notification if required. 
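+	 * Hardware without T4_SRQ_LIMIT_SUPPORT cannot raise this async
+	 * event itself, so emulate it from the poll path once the armed
+	 * SRQ drains below its limit.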
+ */ + if (srq && !(srq->flags & T4_SRQ_LIMIT_SUPPORT) && srq->armed && + srq->wq.in_use < srq->srq_limit) + generate_srq_limit_event(srq); + + PDBG("%s qpid 0x%x type %d opcode %d status 0x%x wrid hi 0x%x " + "lo 0x%x cookie 0x%llx\n", __func__, + CQE_QPID(com), CQE_TYPE(com), + CQE_OPCODE(com), CQE_STATUS(com), CQE_WRID_HI(com), + CQE_WRID_LOW(com), (unsigned long long)cookie); + + if (CQE_TYPE(com) == 0) { + if (!CQE_STATUS(com)) + wc->byte_len = CQE_LEN(com); + else + wc->byte_len = 0; + + switch (CQE_OPCODE(com)) { + case FW_RI_SEND: + wc->opcode = IBV_WC_RECV; + break; + case FW_RI_SEND_WITH_INV: + case FW_RI_SEND_WITH_SE_INV: + wc->opcode = IBV_WC_RECV; + wc->wc_flags |= IBV_WC_WITH_INV; + wc->invalidated_rkey = CQE_WRID_STAG(com); + break; + case FW_RI_WRITE_IMMEDIATE: + wc->opcode = IBV_WC_RECV_RDMA_WITH_IMM; + wc->imm_data = CQE_IMM_DATA(&cqe.b64); + wc->wc_flags |= IBV_WC_WITH_IMM; + break; + default: + PDBG("Unexpected opcode %d in the CQE received for QPID=0x%0x\n", + CQE_OPCODE(com), CQE_QPID(com)); + ret = -EINVAL; + goto out; + } + } else { + switch (CQE_OPCODE(com)) { + case FW_RI_RDMA_WRITE: + case FW_RI_WRITE_IMMEDIATE: + wc->opcode = IBV_WC_RDMA_WRITE; + break; + case FW_RI_READ_REQ: + wc->opcode = IBV_WC_RDMA_READ; + wc->byte_len = CQE_LEN(com); + break; + case FW_RI_SEND: + case FW_RI_SEND_WITH_SE: + wc->opcode = IBV_WC_SEND; + break; + case FW_RI_SEND_WITH_INV: + case FW_RI_SEND_WITH_SE_INV: + wc->wc_flags |= IBV_WC_WITH_INV; + wc->opcode = IBV_WC_SEND; + break; + case FW_RI_BIND_MW: + wc->opcode = IBV_WC_BIND_MW; + break; + default: + PDBG("Unexpected opcode %d " + "in the CQE received for QPID=0x%0x\n", + CQE_OPCODE(com), CQE_QPID(com)); + ret = -EINVAL; + goto out; + } + } + + if (cqe_flushed) + wc->status = IBV_WC_WR_FLUSH_ERR; + else { + + switch (CQE_STATUS(com)) { + case T4_ERR_SUCCESS: + wc->status = IBV_WC_SUCCESS; + break; + case T4_ERR_STAG: + wc->status = IBV_WC_LOC_ACCESS_ERR; + break; + case T4_ERR_PDID: + wc->status = IBV_WC_LOC_PROT_ERR; + break; + case T4_ERR_QPID: + case T4_ERR_ACCESS: + wc->status = IBV_WC_LOC_ACCESS_ERR; + break; + case T4_ERR_WRAP: + wc->status = IBV_WC_GENERAL_ERR; + break; + case T4_ERR_BOUND: + wc->status = IBV_WC_LOC_LEN_ERR; + break; + case T4_ERR_INVALIDATE_SHARED_MR: + case T4_ERR_INVALIDATE_MR_WITH_MW_BOUND: + wc->status = IBV_WC_MW_BIND_ERR; + break; + case T4_ERR_CRC: + case T4_ERR_MARKER: + case T4_ERR_PDU_LEN_ERR: + case T4_ERR_OUT_OF_RQE: + case T4_ERR_DDP_VERSION: + case T4_ERR_RDMA_VERSION: + case T4_ERR_DDP_QUEUE_NUM: + case T4_ERR_MSN: + case T4_ERR_TBIT: + case T4_ERR_MO: + case T4_ERR_MSN_RANGE: + case T4_ERR_IRD_OVERFLOW: + case T4_ERR_OPCODE: + case T4_ERR_INTERNAL_ERR: + wc->status = IBV_WC_FATAL_ERR; + break; + case T4_ERR_SWFLUSH: + wc->status = IBV_WC_WR_FLUSH_ERR; + break; + default: + PDBG("Unexpected cqe_status 0x%x for QPID=0x%0x\n", + CQE_STATUS(com), CQE_QPID(com)); + wc->status = IBV_WC_FATAL_ERR; + } + } + if (wc->status && wc->status != IBV_WC_WR_FLUSH_ERR) + syslog(LOG_NOTICE, "cxgb4 app err cqid %u qpid %u " + "type %u opcode %u status 0x%x\n", + chp->cq.cqid, CQE_QPID(com), CQE_TYPE(com), + CQE_OPCODE(com), CQE_STATUS(com)); +out: + if (wq) { + pthread_spin_unlock(&qhp->lock); + if (srq) + pthread_spin_unlock(&srq->lock); + } + return ret; +} + +int c4iw_poll_cq(struct ibv_cq *ibcq, int num_entries, struct ibv_wc *wc) +{ + struct c4iw_cq *chp; + int npolled; + int err = 0; + + chp = to_c4iw_cq(ibcq); + + if (t4_cq_in_error(&chp->cq)) { + t4_reset_cq_in_error(&chp->cq); + 
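+		/*
+		 * The in-error flag was just cleared; now flush the QPs so
+		 * their outstanding work requests complete with flush
+		 * status.
+		 */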
c4iw_flush_qps(chp->rhp); + } + + if (!num_entries) + return t4_cq_notempty(&chp->cq); + + pthread_spin_lock(&chp->lock); + for (npolled = 0; npolled < num_entries; ++npolled) { + do { + err = c4iw_poll_cq_one(chp, wc + npolled); + } while (err == -EAGAIN); + if (err) + break; + } + pthread_spin_unlock(&chp->lock); + return !err || err == -ENODATA ? npolled : err; +} + +int c4iw_arm_cq(struct ibv_cq *ibcq, int solicited) +{ + struct c4iw_cq *chp; + int ret; + + INC_STAT(arm); + chp = to_c4iw_cq(ibcq); + pthread_spin_lock(&chp->lock); + ret = t4_arm_cq(&chp->cq, solicited); + pthread_spin_unlock(&chp->lock); + return ret; +} + +void c4iw_flush_srqidx(struct c4iw_qp *qhp, u32 srqidx) +{ + struct c4iw_cq *rchp = to_c4iw_cq(qhp->ibv_qp.recv_cq); + + /* create a SRQ RECV CQE for srqidx */ + insert_recv_cqe(&qhp->wq, &rchp->cq, srqidx); +} diff --git a/providers/cxgb4/cxgb4-abi.h b/providers/cxgb4/cxgb4-abi.h new file mode 100644 index 0000000..d514066 --- /dev/null +++ b/providers/cxgb4/cxgb4-abi.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2006-2016 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef IWCH_ABI_H +#define IWCH_ABI_H + +#include <stdint.h> +#include <infiniband/kern-abi.h> +#include <rdma/cxgb4-abi.h> +#include <kernel-abi/cxgb4-abi.h> + +/* compat for ABI version 0 */ +#define _c4iw_create_qp_resp_v0 \ + { \ + __u64 sq_key; \ + __u64 rq_key; \ + __u64 sq_db_gts_key; \ + __u64 rq_db_gts_key; \ + __u64 sq_memsize; \ + __u64 rq_memsize; \ + __u32 sqid; \ + __u32 rqid; \ + __u32 sq_size; \ + __u32 rq_size; \ + __u32 qid_mask; \ + }; +struct c4iw_create_qp_resp_v0 _c4iw_create_qp_resp_v0; +#define _STRUCT_c4iw_create_qp_resp_v0 struct _c4iw_create_qp_resp_v0 + +DECLARE_DRV_CMD(uc4iw_alloc_pd, IB_USER_VERBS_CMD_ALLOC_PD, + empty, c4iw_alloc_pd_resp); +DECLARE_DRV_CMD(uc4iw_create_cq, IB_USER_VERBS_CMD_CREATE_CQ, + c4iw_create_cq, c4iw_create_cq_resp); +DECLARE_DRV_CMD(uc4iw_create_srq, IB_USER_VERBS_CMD_CREATE_SRQ, + empty, c4iw_create_srq_resp); +DECLARE_DRV_CMD(uc4iw_create_qp, IB_USER_VERBS_CMD_CREATE_QP, + empty, c4iw_create_qp_resp); +DECLARE_DRV_CMD(uc4iw_create_qp_v0, IB_USER_VERBS_CMD_CREATE_QP, + empty, c4iw_create_qp_resp_v0); +DECLARE_DRV_CMD(uc4iw_alloc_ucontext, IB_USER_VERBS_CMD_GET_CONTEXT, + empty, c4iw_alloc_ucontext_resp); + +#endif /* IWCH_ABI_H */ diff --git a/providers/cxgb4/dev.c b/providers/cxgb4/dev.c new file mode 100644 index 0000000..06948ef --- /dev/null +++ b/providers/cxgb4/dev.c @@ -0,0 +1,472 @@ +/* + * Copyright (c) 2006-2016 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include <config.h> + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> +#include <sys/mman.h> +#include <pthread.h> +#include <string.h> +#include <signal.h> + +#include "libcxgb4.h" +#include "cxgb4-abi.h" + +static void c4iw_free_context(struct ibv_context *ibctx); + +#define PCI_VENDOR_ID_CHELSIO 0x1425 + +/* + * Macros needed to support the PCI Device ID Table ... 
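+ * t4_pci_id_tbl.h expands them into the verbs_match_ent hca_table[]
+ * that binds this provider to the Chelsio PCI IDs.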
+ */ +#define CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN \ + static const struct verbs_match_ent hca_table[] = { \ + VERBS_DRIVER_ID(RDMA_DRIVER_CXGB4), + +#define CH_PCI_DEVICE_ID_FUNCTION \ + 0x4 + +#define CH_PCI_ID_TABLE_ENTRY(__DeviceID) \ + VERBS_PCI_MATCH(PCI_VENDOR_ID_CHELSIO, __DeviceID, NULL) + +#define CH_PCI_DEVICE_ID_TABLE_DEFINE_END \ + {} } + +#include "t4_chip_type.h" +#include "t4_pci_id_tbl.h" + +unsigned long c4iw_page_size; +unsigned long c4iw_page_shift; +unsigned long c4iw_page_mask; +int ma_wr; +int t5_en_wc = 1; + +static LIST_HEAD(devices); + +static const struct verbs_context_ops c4iw_ctx_common_ops = { + .query_device = c4iw_query_device, + .query_port = c4iw_query_port, + .alloc_pd = c4iw_alloc_pd, + .dealloc_pd = c4iw_free_pd, + .reg_mr = c4iw_reg_mr, + .dereg_mr = c4iw_dereg_mr, + .create_cq = c4iw_create_cq, + .destroy_cq = c4iw_destroy_cq, + .create_srq = c4iw_create_srq, + .modify_srq = c4iw_modify_srq, + .destroy_srq = c4iw_destroy_srq, + .query_srq = c4iw_query_srq, + .create_qp = c4iw_create_qp, + .modify_qp = c4iw_modify_qp, + .destroy_qp = c4iw_destroy_qp, + .query_qp = c4iw_query_qp, + .attach_mcast = c4iw_attach_mcast, + .detach_mcast = c4iw_detach_mcast, + .post_srq_recv = c4iw_post_srq_recv, + .req_notify_cq = c4iw_arm_cq, + .free_context = c4iw_free_context, +}; + +static const struct verbs_context_ops c4iw_ctx_t4_ops = { + .async_event = c4iw_async_event, + .poll_cq = c4iw_poll_cq, + .post_recv = c4iw_post_receive, + .post_send = c4iw_post_send, + .req_notify_cq = c4iw_arm_cq, +}; + +static struct verbs_context *c4iw_alloc_context(struct ibv_device *ibdev, + int cmd_fd, + void *private_data) +{ + struct c4iw_context *context; + struct ibv_get_context cmd; + struct uc4iw_alloc_ucontext_resp resp; + struct c4iw_dev *rhp = to_c4iw_dev(ibdev); + struct ibv_query_device qcmd; + uint64_t raw_fw_ver; + struct ibv_device_attr attr; + + context = verbs_init_and_alloc_context(ibdev, cmd_fd, context, ibv_ctx, + RDMA_DRIVER_CXGB4); + if (!context) + return NULL; + + resp.status_page_size = 0; + resp.reserved = 0; + if (ibv_cmd_get_context(&context->ibv_ctx, &cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp)) + goto err_free; + + if (resp.reserved) + PDBG("%s c4iw_alloc_ucontext_resp reserved field modified by kernel\n", + __FUNCTION__); + + context->status_page_size = resp.status_page_size; + if (resp.status_page_size) { + context->status_page = mmap(NULL, resp.status_page_size, + PROT_READ, MAP_SHARED, cmd_fd, + resp.status_page_key); + if (context->status_page == MAP_FAILED) + goto err_free; + } + + verbs_set_ops(&context->ibv_ctx, &c4iw_ctx_common_ops); + if (ibv_cmd_query_device(&context->ibv_ctx.context, &attr, + &raw_fw_ver, &qcmd, sizeof(qcmd))) + goto err_unmap; + + if (!rhp->mmid2ptr) { + rhp->max_mr = attr.max_mr; + rhp->mmid2ptr = calloc(attr.max_mr, sizeof(void *)); + if (!rhp->mmid2ptr) { + goto err_unmap; + } + if (rhp->abi_version < 3) { + fprintf(stderr, "Warning: iw_cxgb4 driver is of older version" + " than libcxgb4:: %d\n", rhp->abi_version); + rhp->max_qp = T4_QID_BASE + attr.max_qp; + } else { + rhp->max_qp = context->status_page->qp_start + + context->status_page->qp_size; + } + rhp->qpid2ptr = calloc(rhp->max_qp, sizeof(void *)); + if (!rhp->qpid2ptr) { + goto err_unmap; + } + if (rhp->abi_version < 3) + rhp->max_cq = T4_QID_BASE + attr.max_cq; + else + rhp->max_cq = context->status_page->cq_start + + context->status_page->cq_size; + rhp->cqid2ptr = calloc(rhp->max_cq, sizeof(void *)); + if (!rhp->cqid2ptr) + goto err_unmap; + 
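+		/*
+		 * mmid2ptr, qpid2ptr and cqid2ptr are direct-mapped lookup
+		 * tables indexed by hardware MID, QID and CQID, so the
+		 * completion path can turn CQE_QPID(cqe) into a struct
+		 * c4iw_qp * with a single array load (see get_qhp() and
+		 * get_chp() in libcxgb4.h). Before ABI v3 the status page
+		 * does not advertise the qid ranges, so the tables are
+		 * conservatively sized as T4_QID_BASE plus the device
+		 * maximums reported by ibv_cmd_query_device().
+		 */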
rhp->write_cmpl_supported = + context->status_page->write_cmpl_supported; + } + + rhp->chip_version = CHELSIO_CHIP_VERSION(attr.vendor_part_id >> 8); + switch (rhp->chip_version) { + case CHELSIO_T6: + PDBG("%s T6/T5/T4 device\n", __func__); + case CHELSIO_T5: + PDBG("%s T5/T4 device\n", __func__); + case CHELSIO_T4: + PDBG("%s T4 device\n", __func__); + verbs_set_ops(&context->ibv_ctx, &c4iw_ctx_t4_ops); + break; + default: + PDBG("%s unknown hca type %d\n", __func__, rhp->chip_version); + goto err_unmap; + } + + return &context->ibv_ctx; + +err_unmap: + munmap(context->status_page, context->status_page_size); +err_free: + if (rhp->cqid2ptr) + free(rhp->cqid2ptr); + if (rhp->qpid2ptr) + free(rhp->qpid2ptr); + if (rhp->mmid2ptr) + free(rhp->mmid2ptr); + verbs_uninit_context(&context->ibv_ctx); + free(context); + return NULL; +} + +static void c4iw_free_context(struct ibv_context *ibctx) +{ + struct c4iw_context *context = to_c4iw_context(ibctx); + + if (context->status_page_size) + munmap(context->status_page, context->status_page_size); + + verbs_uninit_context(&context->ibv_ctx); + free(context); +} + +static void c4iw_uninit_device(struct verbs_device *verbs_device) +{ + struct c4iw_dev *dev = to_c4iw_dev(&verbs_device->device); + + free(dev); +} + +#ifdef STALL_DETECTION + +int stall_to; + +static void dump_cq(struct c4iw_cq *chp) +{ + int i; + + fprintf(stderr, + "CQ: %p id %u queue %p cidx 0x%08x sw_queue %p sw_cidx %d sw_pidx %d sw_in_use %d depth %u error %u gen %d " + "cidx_inc %d bits_type_ts %016" PRIx64 " notempty %d\n", chp, + chp->cq.cqid, chp->cq.queue, chp->cq.cidx, + chp->cq.sw_queue, chp->cq.sw_cidx, chp->cq.sw_pidx, chp->cq.sw_in_use, + chp->cq.size, chp->cq.error, chp->cq.gen, chp->cq.cidx_inc, be64toh(chp->cq.bits_type_ts), + t4_cq_notempty(&chp->cq)); + + for (i=0; i < chp->cq.size; i++) { + u64 *p = (u64 *)(chp->cq.queue + i); + + fprintf(stderr, "%02x: %016" PRIx64 " %016" PRIx64, i, be64toh(p[0]), be64toh(p[1])); + if (i == chp->cq.cidx) + fprintf(stderr, " <-- cidx\n"); + else + fprintf(stderr, "\n"); + p+= 2; + fprintf(stderr, "%02x: %016" PRIx64 " %016" PRIx64 "\n", i, be64toh(p[0]), be64toh(p[1])); + p+= 2; + fprintf(stderr, "%02x: %016" PRIx64 " %016" PRIx64 "\n", i, be64toh(p[0]), be64toh(p[1])); + p+= 2; + fprintf(stderr, "%02x: %016" PRIx64 " %016" PRIx64 "\n", i, be64toh(p[0]), be64toh(p[1])); + p+= 2; + } +} + +static void dump_qp(struct c4iw_qp *qhp) +{ + int i; + int j; + struct t4_swsqe *swsqe; + struct t4_swrqe *swrqe; + u16 cidx, pidx; + u64 *p; + + fprintf(stderr, + "QP: %p id %u error %d flushed %d qid_mask 0x%x\n" + " SQ: id %u queue %p sw_queue %p cidx %u pidx %u in_use %u wq_pidx %u depth %u flags 0x%x flush_cidx %d\n" + " RQ: id %u queue %p sw_queue %p cidx %u pidx %u in_use %u depth %u\n", + qhp, + qhp->wq.sq.qid, + qhp->wq.error, + qhp->wq.flushed, + qhp->wq.qid_mask, + qhp->wq.sq.qid, + qhp->wq.sq.queue, + qhp->wq.sq.sw_sq, + qhp->wq.sq.cidx, + qhp->wq.sq.pidx, + qhp->wq.sq.in_use, + qhp->wq.sq.wq_pidx, + qhp->wq.sq.size, + qhp->wq.sq.flags, + qhp->wq.sq.flush_cidx, + qhp->wq.rq.qid, + qhp->wq.rq.queue, + qhp->wq.rq.sw_rq, + qhp->wq.rq.cidx, + qhp->wq.rq.pidx, + qhp->wq.rq.in_use, + qhp->wq.rq.size); + cidx = qhp->wq.sq.cidx; + pidx = qhp->wq.sq.pidx; + if (cidx != pidx) + fprintf(stderr, "SQ: \n"); + while (cidx != pidx) { + swsqe = &qhp->wq.sq.sw_sq[cidx]; + fprintf(stderr, "%04u: wr_id %016" PRIx64 + " sq_wptr %08x read_len %u opcode 0x%x " + "complete %u signaled %u cqe %016" PRIx64 " %016" PRIx64 " %016" PRIx64 " %016" PRIx64 "\n", + 
cidx, + swsqe->wr_id, + swsqe->idx, + swsqe->read_len, + swsqe->opcode, + swsqe->complete, + swsqe->signaled, + htobe64(((uint64_t *)&swsqe->cqe)[0]), + htobe64(((uint64_t *)&swsqe->cqe)[1]), + htobe64(((uint64_t *)&swsqe->cqe)[2]), + htobe64(((uint64_t *)&swsqe->cqe)[3])); + if (++cidx == qhp->wq.sq.size) + cidx = 0; + } + + fprintf(stderr, "SQ WQ: \n"); + p = (u64 *)qhp->wq.sq.queue; + for (i=0; i < qhp->wq.sq.size * T4_SQ_NUM_SLOTS; i++) { + for (j=0; j < T4_EQ_ENTRY_SIZE / 16; j++) { + fprintf(stderr, "%04u %016" PRIx64 " %016" PRIx64 " ", + i, be64toh(p[0]), be64toh(p[1])); + if (j == 0 && i == qhp->wq.sq.wq_pidx) + fprintf(stderr, " <-- pidx"); + fprintf(stderr, "\n"); + p += 2; + } + } + cidx = qhp->wq.rq.cidx; + pidx = qhp->wq.rq.pidx; + if (cidx != pidx) + fprintf(stderr, "RQ: \n"); + while (cidx != pidx) { + swrqe = &qhp->wq.rq.sw_rq[cidx]; + fprintf(stderr, "%04u: wr_id %016" PRIx64 "\n", + cidx, + swrqe->wr_id ); + if (++cidx == qhp->wq.rq.size) + cidx = 0; + } + + fprintf(stderr, "RQ WQ: \n"); + p = (u64 *)qhp->wq.rq.queue; + for (i=0; i < qhp->wq.rq.size * T4_RQ_NUM_SLOTS; i++) { + for (j=0; j < T4_EQ_ENTRY_SIZE / 16; j++) { + fprintf(stderr, "%04u %016" PRIx64 " %016" PRIx64 " ", + i, be64toh(p[0]), be64toh(p[1])); + if (j == 0 && i == qhp->wq.rq.pidx) + fprintf(stderr, " <-- pidx"); + if (j == 0 && i == qhp->wq.rq.cidx) + fprintf(stderr, " <-- cidx"); + fprintf(stderr, "\n"); + p+=2; + } + } +} + +void dump_state(void) +{ + struct c4iw_dev *dev; + int i; + + fprintf(stderr, "STALL DETECTED:\n"); + list_for_each(&devices, dev, list) { + //pthread_spin_lock(&dev->lock); + fprintf(stderr, "Device %s\n", dev->ibv_dev.name); + for (i=0; i < dev->max_cq; i++) { + if (dev->cqid2ptr[i]) { + struct c4iw_cq *chp = dev->cqid2ptr[i]; + //pthread_spin_lock(&chp->lock); + dump_cq(chp); + //pthread_spin_unlock(&chp->lock); + } + } + for (i=0; i < dev->max_qp; i++) { + if (dev->qpid2ptr[i]) { + struct c4iw_qp *qhp = dev->qpid2ptr[i]; + //pthread_spin_lock(&qhp->lock); + dump_qp(qhp); + //pthread_spin_unlock(&qhp->lock); + } + } + //pthread_spin_unlock(&dev->lock); + } + fprintf(stderr, "DUMP COMPLETE:\n"); + fflush(stderr); +} +#endif /* end of STALL_DETECTION */ + +/* + * c4iw_abi_version is used to store ABI for iw_cxgb4 so the user mode library + * can know if the driver supports the kernel mode db ringing. 
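+ *
+ * For example (see t4_wq_db_enabled() in t4.h and the post paths in
+ * qp.c): with abi_version >= 2 the kernel can set the db_off flag in
+ * the status page while it recovers from dropped doorbells, and the
+ * library then rings the doorbell via a MODIFY_QP command instead of
+ * writing the BAR2 doorbell register directly:
+ *
+ *	if (t4_wq_db_enabled(&qhp->wq))
+ *		t4_ring_sq_db(&qhp->wq, idx, dev_is_t4(qhp->rhp), len16, wqe);
+ *	else
+ *		ring_kernel_db(qhp, qhp->wq.sq.qid, idx);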
+ */ +int c4iw_abi_version = 1; + +static struct verbs_device *c4iw_device_alloc(struct verbs_sysfs_dev *sysfs_dev) +{ + struct c4iw_dev *dev; + + c4iw_page_size = sysconf(_SC_PAGESIZE); + c4iw_page_shift = long_log2(c4iw_page_size); + c4iw_page_mask = ~(c4iw_page_size - 1); + + dev = calloc(1, sizeof *dev); + if (!dev) + return NULL; + + pthread_spin_init(&dev->lock, PTHREAD_PROCESS_PRIVATE); + c4iw_abi_version = sysfs_dev->abi_ver; + dev->abi_version = sysfs_dev->abi_ver; + list_node_init(&dev->list); + + list_head_init(&dev->srq_list); + PDBG("%s device claimed\n", __FUNCTION__); + list_add_tail(&devices, &dev->list); +#ifdef STALL_DETECTION +{ + char *c = getenv("CXGB4_STALL_TIMEOUT"); + if (c) { + stall_to = strtol(c, NULL, 0); + if (errno || stall_to < 0) + stall_to = 0; + } +} +#endif +{ + char *c = getenv("CXGB4_MA_WR"); + if (c) { + ma_wr = strtol(c, NULL, 0); + if (ma_wr != 1) + ma_wr = 0; + } +} +{ + char *c = getenv("T5_ENABLE_WC"); + if (c) { + t5_en_wc = strtol(c, NULL, 0); + if (t5_en_wc != 1) + t5_en_wc = 0; + } +} + + return &dev->ibv_dev; +} + +static const struct verbs_device_ops c4iw_dev_ops = { + .name = "cxgb4", + .match_min_abi_version = 0, + .match_max_abi_version = INT_MAX, + .match_table = hca_table, + .alloc_device = c4iw_device_alloc, + .uninit_device = c4iw_uninit_device, + .alloc_context = c4iw_alloc_context, +}; +PROVIDER_DRIVER(cxgb4, c4iw_dev_ops); + +#ifdef STATS +void __attribute__ ((destructor)) cs_fini(void); +void __attribute__ ((destructor)) cs_fini(void) +{ + syslog(LOG_NOTICE, "cxgb4 stats - sends %lu recv %lu read %lu " + "write %lu arm %lu cqe %lu mr %lu qp %lu cq %lu\n", + c4iw_stats.send, c4iw_stats.recv, c4iw_stats.read, + c4iw_stats.write, c4iw_stats.arm, c4iw_stats.cqe, + c4iw_stats.mr, c4iw_stats.qp, c4iw_stats.cq); +} +#endif diff --git a/providers/cxgb4/libcxgb4.h b/providers/cxgb4/libcxgb4.h new file mode 100644 index 0000000..c5036d0 --- /dev/null +++ b/providers/cxgb4/libcxgb4.h @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2006-2016 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef IWCH_H +#define IWCH_H + +#include <pthread.h> +#include <inttypes.h> +#include <stddef.h> +#include <string.h> +#include <syslog.h> +#include <sys/errno.h> +#include <sys/time.h> +#include <infiniband/driver.h> +#include <util/udma_barrier.h> +#include <ccan/list.h> +#include "t4.h" + +extern unsigned long c4iw_page_size; +extern unsigned long c4iw_page_shift; +extern unsigned long c4iw_page_mask; + +struct c4iw_mr; + +struct c4iw_dev { + struct verbs_device ibv_dev; + unsigned chip_version; + int max_mr; + struct c4iw_mr **mmid2ptr; + int max_qp; + struct c4iw_qp **qpid2ptr; + int max_cq; + struct c4iw_cq **cqid2ptr; + struct list_head srq_list; + pthread_spinlock_t lock; + struct list_node list; + int abi_version; + bool write_cmpl_supported; +}; + +static inline int dev_is_t6(struct c4iw_dev *dev) +{ + return dev->chip_version == CHELSIO_T6; +} + +static inline int dev_is_t5(struct c4iw_dev *dev) +{ + return dev->chip_version == CHELSIO_T5; +} + +static inline int dev_is_t4(struct c4iw_dev *dev) +{ + return dev->chip_version == CHELSIO_T4; +} + +struct c4iw_context { + struct verbs_context ibv_ctx; + struct t4_dev_status_page *status_page; + int status_page_size; +}; + +struct c4iw_pd { + struct ibv_pd ibv_pd; +}; + +struct c4iw_mr { + struct verbs_mr vmr; + uint64_t va_fbo; + uint32_t len; +}; + +static inline u32 c4iw_mmid(u32 stag) +{ + return (stag >> 8); +} + +struct c4iw_cq { + struct ibv_cq ibv_cq; + struct c4iw_dev *rhp; + struct t4_cq cq; + pthread_spinlock_t lock; +#ifdef STALL_DETECTION + struct timeval time; + int dumped; +#endif +}; + +struct c4iw_qp { + struct ibv_qp ibv_qp; + struct c4iw_dev *rhp; + struct t4_wq wq; + pthread_spinlock_t lock; + int sq_sig_all; + struct c4iw_srq *srq; +}; + +#define to_c4iw_xxx(xxx, type) \ + container_of(ib##xxx, struct c4iw_##type, ibv_##xxx) + +struct c4iw_srq { + struct ibv_srq ibv_srq; + int type; /* must be 2nd in this struct */ + struct c4iw_dev *rhp; + struct t4_srq wq; + struct list_node list; + pthread_spinlock_t lock; + uint32_t srq_limit; + int armed; + __u32 flags; +}; + +static inline struct c4iw_srq *to_c4iw_srq(struct ibv_srq *ibsrq) +{ + return to_c4iw_xxx(srq, srq); +} + +static inline struct c4iw_dev *to_c4iw_dev(struct ibv_device *ibdev) +{ + return container_of(ibdev, struct c4iw_dev, ibv_dev.device); +} + +static inline struct c4iw_context *to_c4iw_context(struct ibv_context *ibctx) +{ + return container_of(ibctx, struct c4iw_context, ibv_ctx.context); +} + +static inline struct c4iw_pd *to_c4iw_pd(struct ibv_pd *ibpd) +{ + return to_c4iw_xxx(pd, pd); +} + +static inline struct c4iw_cq *to_c4iw_cq(struct ibv_cq *ibcq) +{ + return to_c4iw_xxx(cq, cq); +} + +static inline struct c4iw_qp *to_c4iw_qp(struct ibv_qp *ibqp) +{ + return to_c4iw_xxx(qp, qp); +} + +static inline struct c4iw_mr *to_c4iw_mr(struct verbs_mr *vmr) +{ + return container_of(vmr, struct c4iw_mr, vmr); +} + +static inline struct c4iw_qp *get_qhp(struct c4iw_dev *rhp, u32 qid) +{ + return rhp->qpid2ptr[qid]; +} + +static inline struct c4iw_cq *get_chp(struct c4iw_dev *rhp, u32 qid) +{ + return rhp->cqid2ptr[qid]; +} + +static inline unsigned long_log2(unsigned long x) +{ + unsigned r = 0; + for (x >>= 1; x > 0; x >>= 1) + r++; + return r; +} + +int c4iw_query_device(struct ibv_context *context, + struct ibv_device_attr *attr); +int c4iw_query_port(struct ibv_context *context, uint8_t port, + struct ibv_port_attr *attr); + +struct ibv_pd *c4iw_alloc_pd(struct ibv_context *context); +int c4iw_free_pd(struct ibv_pd *pd); + +struct ibv_mr 
*c4iw_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + uint64_t hca_va, int access); +int c4iw_dereg_mr(struct verbs_mr *vmr); + +struct ibv_cq *c4iw_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector); +int c4iw_destroy_cq(struct ibv_cq *cq); +int c4iw_poll_cq(struct ibv_cq *cq, int ne, struct ibv_wc *wc); +int c4iw_arm_cq(struct ibv_cq *cq, int solicited); +void c4iw_cq_event(struct ibv_cq *cq); +void c4iw_init_cq_buf(struct c4iw_cq *cq, int nent); + +struct ibv_srq *c4iw_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *attr); +int c4iw_modify_srq(struct ibv_srq *srq, + struct ibv_srq_attr *attr, + int mask); +int c4iw_destroy_srq(struct ibv_srq *srq); +int c4iw_post_srq_recv(struct ibv_srq *ibsrq, + struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); +int c4iw_query_srq(struct ibv_srq *ibsrq, struct ibv_srq_attr *attr); + +struct ibv_qp *c4iw_create_qp(struct ibv_pd *pd, + struct ibv_qp_init_attr *attr); +int c4iw_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask); +int c4iw_destroy_qp(struct ibv_qp *qp); +int c4iw_query_qp(struct ibv_qp *qp, + struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr); +void c4iw_flush_qp(struct c4iw_qp *qhp); +void c4iw_flush_qps(struct c4iw_dev *dev); +int c4iw_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr); +int c4iw_post_receive(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); +int c4iw_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, + uint16_t lid); +int c4iw_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, + uint16_t lid); +void c4iw_async_event(struct ibv_context *context, + struct ibv_async_event *event); +void c4iw_flush_hw_cq(struct c4iw_cq *chp, struct c4iw_qp *flush_qhp); +int c4iw_flush_rq(struct t4_wq *wq, struct t4_cq *cq, int count); +void c4iw_flush_sq(struct c4iw_qp *qhp); +void c4iw_count_rcqes(struct t4_cq *cq, struct t4_wq *wq, int *count); +void c4iw_copy_wr_to_srq(struct t4_srq *srq, union t4_recv_wr *wqe, u8 len16); +void c4iw_flush_srqidx(struct c4iw_qp *qhp, u32 srqidx); + +#define FW_MAJ 0 +#define FW_MIN 0 + +#ifdef STATS + +#define INC_STAT(a) { c4iw_stats.a++; } + +struct c4iw_stats { + unsigned long send; + unsigned long recv; + unsigned long read; + unsigned long write; + unsigned long arm; + unsigned long cqe; + unsigned long mr; + unsigned long qp; + unsigned long cq; +}; +extern struct c4iw_stats c4iw_stats; +#else +#define INC_STAT(a) +#endif + +#ifdef STALL_DETECTION +void dump_state(void); +extern int stall_to; +#endif + +#endif /* IWCH_H */ diff --git a/providers/cxgb4/qp.c b/providers/cxgb4/qp.c new file mode 100644 index 0000000..fd028d5 --- /dev/null +++ b/providers/cxgb4/qp.c @@ -0,0 +1,840 @@ +/* + * Copyright (c) 2006-2016 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include <config.h> + +#include <assert.h> +#include <stdlib.h> +#include <pthread.h> +#include <string.h> +#include <stdio.h> +#include <util/compiler.h> +#include "libcxgb4.h" + +#ifdef STATS +struct c4iw_stats c4iw_stats; +#endif + +static void copy_wr_to_sq(struct t4_wq *wq, union t4_wr *wqe, u8 len16) +{ + u64 *src, *dst; + + src = (u64 *)wqe; + dst = (u64 *)((u8 *)wq->sq.queue + wq->sq.wq_pidx * T4_EQ_ENTRY_SIZE); + if (t4_sq_onchip(wq)) { + len16 = align(len16, 4); + + /* In onchip mode the copy below will be made to WC memory and + * could trigger DMA. In offchip mode the copy below only + * queues the WQE, DMA cannot start until t4_ring_sq_db + * happens */ + mmio_wc_start(); + } + while (len16) { + *dst++ = *src++; + if (dst == (u64 *)&wq->sq.queue[wq->sq.size]) + dst = (u64 *)wq->sq.queue; + *dst++ = *src++; + if (dst == (u64 *)&wq->sq.queue[wq->sq.size]) + dst = (u64 *)wq->sq.queue; + len16--; + + /* NOTE len16 cannot be large enough to write to the + same sq.queue memory twice in this loop */ + } + + if (t4_sq_onchip(wq)) + mmio_flush_writes(); +} + +static void copy_wr_to_rq(struct t4_wq *wq, union t4_recv_wr *wqe, u8 len16) +{ + u64 *src, *dst; + + src = (u64 *)wqe; + dst = (u64 *)((u8 *)wq->rq.queue + wq->rq.wq_pidx * T4_EQ_ENTRY_SIZE); + while (len16) { + *dst++ = *src++; + if (dst >= (u64 *)&wq->rq.queue[wq->rq.size]) + dst = (u64 *)wq->rq.queue; + *dst++ = *src++; + if (dst >= (u64 *)&wq->rq.queue[wq->rq.size]) + dst = (u64 *)wq->rq.queue; + len16--; + } +} + +void c4iw_copy_wr_to_srq(struct t4_srq *srq, union t4_recv_wr *wqe, u8 len16) +{ + u64 *src, *dst; + + src = (u64 *)wqe; + dst = (u64 *)((u8 *)srq->queue + srq->wq_pidx * T4_EQ_ENTRY_SIZE); + while (len16) { + *dst++ = *src++; + if (dst >= (u64 *)&srq->queue[srq->size]) + dst = (u64 *)srq->queue; + *dst++ = *src++; + if (dst >= (u64 *)&srq->queue[srq->size]) + dst = (u64 *)srq->queue; + len16--; + } +} + +static int build_immd(struct t4_sq *sq, struct fw_ri_immd *immdp, + struct ibv_send_wr *wr, int max, u32 *plenp) +{ + u8 *dstp, *srcp; + u32 plen = 0; + int i; + int len; + + dstp = (u8 *)immdp->data; + for (i = 0; i < wr->num_sge; i++) { + if ((plen + wr->sg_list[i].length) > max) + return -EMSGSIZE; + srcp = (u8 *)(unsigned long)wr->sg_list[i].addr; + plen += wr->sg_list[i].length; + len = wr->sg_list[i].length; + memcpy(dstp, srcp, len); + dstp += len; + srcp += len; + } + len = ROUND_UP(plen + 8, 16) - (plen + 8); + if (len) + memset(dstp, 0, len); + immdp->op = FW_RI_DATA_IMMD; + immdp->r1 = 0; + immdp->r2 = 0; + immdp->immdlen = htobe32(plen); + *plenp = plen; + return 0; +} + +static int build_isgl(__be64 *queue_start, __be64 *queue_end, + struct fw_ri_isgl *isglp, struct ibv_sge *sg_list, + int num_sge, u32 *plenp) +{ + int i; + u32 plen = 0; + __be64 *flitp; + + if ((__be64 *)isglp == 
queue_end) + isglp = (struct fw_ri_isgl *)queue_start; + + flitp = (__be64 *)isglp->sge; + for (i = 0; i < num_sge; i++) { + if ((plen + sg_list[i].length) < plen) + return -EMSGSIZE; + plen += sg_list[i].length; + *flitp = htobe64(((u64)sg_list[i].lkey << 32) | + sg_list[i].length); + if (++flitp == queue_end) + flitp = queue_start; + *flitp = htobe64(sg_list[i].addr); + if (++flitp == queue_end) + flitp = queue_start; + } + *flitp = 0; + isglp->op = FW_RI_DATA_ISGL; + isglp->r1 = 0; + isglp->nsge = htobe16(num_sge); + isglp->r2 = 0; + if (plenp) + *plenp = plen; + return 0; +} + +static int build_rdma_send(struct t4_sq *sq, union t4_wr *wqe, + struct ibv_send_wr *wr, u8 *len16) +{ + u32 plen; + int size; + int ret; + + if (wr->num_sge > T4_MAX_SEND_SGE) + return -EINVAL; + switch (wr->opcode) { + case IBV_WR_SEND: + if (wr->send_flags & IBV_SEND_SOLICITED) + wqe->send.sendop_pkd = htobe32(FW_RI_SEND_WR_SENDOP_V(FW_RI_SEND_WITH_SE)); + else + wqe->send.sendop_pkd = htobe32(FW_RI_SEND_WR_SENDOP_V(FW_RI_SEND)); + wqe->send.stag_inv = 0; + break; + case IBV_WR_SEND_WITH_INV: + if (wr->send_flags & IBV_SEND_SOLICITED) + wqe->send.sendop_pkd = htobe32(FW_RI_SEND_WR_SENDOP_V(FW_RI_SEND_WITH_SE_INV)); + else + wqe->send.sendop_pkd = htobe32(FW_RI_SEND_WR_SENDOP_V(FW_RI_SEND_WITH_INV)); + wqe->send.stag_inv = htobe32(wr->invalidate_rkey); + break; + default: + return -EINVAL; + } + wqe->send.r3 = 0; + wqe->send.r4 = 0; + + plen = 0; + if (wr->num_sge) { + if (wr->send_flags & IBV_SEND_INLINE) { + ret = build_immd(sq, wqe->send.u.immd_src, wr, + T4_MAX_SEND_INLINE, &plen); + if (ret) + return ret; + size = sizeof wqe->send + sizeof(struct fw_ri_immd) + + plen; + } else { + ret = build_isgl((__be64 *)sq->queue, + (__be64 *)&sq->queue[sq->size], + wqe->send.u.isgl_src, + wr->sg_list, wr->num_sge, &plen); + if (ret) + return ret; + size = sizeof wqe->send + sizeof(struct fw_ri_isgl) + + wr->num_sge * sizeof (struct fw_ri_sge); + } + } else { + wqe->send.u.immd_src[0].op = FW_RI_DATA_IMMD; + wqe->send.u.immd_src[0].r1 = 0; + wqe->send.u.immd_src[0].r2 = 0; + wqe->send.u.immd_src[0].immdlen = 0; + size = sizeof wqe->send + sizeof(struct fw_ri_immd); + plen = 0; + } + *len16 = DIV_ROUND_UP(size, 16); + wqe->send.plen = htobe32(plen); + return 0; +} + +static int build_rdma_write(struct t4_sq *sq, union t4_wr *wqe, + struct ibv_send_wr *wr, u8 *len16) +{ + u32 plen; + int size; + int ret; + + if (wr->num_sge > T4_MAX_SEND_SGE) + return -EINVAL; + if (wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM) + wqe->write.iw_imm_data.ib_imm_data.imm_data32 = wr->imm_data; + else + wqe->write.iw_imm_data.ib_imm_data.imm_data32 = 0; + wqe->write.stag_sink = htobe32(wr->wr.rdma.rkey); + wqe->write.to_sink = htobe64(wr->wr.rdma.remote_addr); + if (wr->num_sge) { + if (wr->send_flags & IBV_SEND_INLINE) { + ret = build_immd(sq, wqe->write.u.immd_src, wr, + T4_MAX_WRITE_INLINE, &plen); + if (ret) + return ret; + size = sizeof wqe->write + sizeof(struct fw_ri_immd) + + plen; + } else { + ret = build_isgl((__be64 *)sq->queue, + (__be64 *)&sq->queue[sq->size], + wqe->write.u.isgl_src, + wr->sg_list, wr->num_sge, &plen); + if (ret) + return ret; + size = sizeof wqe->write + sizeof(struct fw_ri_isgl) + + wr->num_sge * sizeof (struct fw_ri_sge); + } + } else { + wqe->write.u.immd_src[0].op = FW_RI_DATA_IMMD; + wqe->write.u.immd_src[0].r1 = 0; + wqe->write.u.immd_src[0].r2 = 0; + wqe->write.u.immd_src[0].immdlen = 0; + size = sizeof wqe->write + sizeof(struct fw_ri_immd); + plen = 0; + } + *len16 = DIV_ROUND_UP(size, 16); + 
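+	/*
+	 * len16 counts 16-byte flits: a WRITE with one non-inline SGE is
+	 * sizeof(wqe->write) + sizeof(struct fw_ri_isgl) +
+	 * sizeof(struct fw_ri_sge) bytes, rounded up to whole flits. The
+	 * post path later converts flits to 64-byte queue slots with
+	 * DIV_ROUND_UP(len16 * 16, T4_EQ_ENTRY_SIZE), so e.g. len16 == 6
+	 * (96 bytes) occupies two slots.
+	 */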
wqe->write.plen = htobe32(plen); + return 0; +} + +static void build_immd_cmpl(struct t4_sq *sq, struct fw_ri_immd_cmpl *immdp, + struct ibv_send_wr *wr) +{ + memcpy((u8 *)immdp->data, (u8 *)(uintptr_t)wr->sg_list->addr, 16); + memset(immdp->r1, 0, 6); + immdp->op = FW_RI_DATA_IMMD; + immdp->immdlen = 16; +} + +static void build_rdma_write_cmpl(struct t4_sq *sq, + struct fw_ri_rdma_write_cmpl_wr *wcwr, + struct ibv_send_wr *wr, u8 *len16) +{ + u32 plen; + int size; + + /* + * This code assumes the struct fields preceding the write isgl fit + * in one 64B WR slot. This is because the WQE is built directly in + * the dma queue, and wrapping is only handled by the code buildling + * sgls. IE the "fixed part" of the wr structs must all fit in 64B. + * The WQE build code should probably be redesigned to avoid this + * restriction, but for now just add a static_assert() to catch if + * this WQE struct gets too big. + */ + static_assert(offsetof(struct fw_ri_rdma_write_cmpl_wr, u) <= 64, + "WQE structure too BIG!"); + + wcwr->stag_sink = htobe32(wr->wr.rdma.rkey); + wcwr->to_sink = htobe64(wr->wr.rdma.remote_addr); + if (wr->next->opcode == IBV_WR_SEND) + wcwr->stag_inv = 0; + else + wcwr->stag_inv = htobe32(wr->next->invalidate_rkey); + wcwr->r2 = 0; + wcwr->r3 = 0; + + /* SEND_INV SGL */ + if (wr->next->send_flags & IBV_SEND_INLINE) + build_immd_cmpl(sq, &wcwr->u_cmpl.immd_src, wr->next); + else + build_isgl((__be64 *)sq->queue, (__be64 *)&sq->queue[sq->size], + &wcwr->u_cmpl.isgl_src, wr->next->sg_list, 1, NULL); + + /* WRITE SGL */ + build_isgl((__be64 *)sq->queue, (__be64 *)&sq->queue[sq->size], + wcwr->u.isgl_src, wr->sg_list, wr->num_sge, &plen); + + size = sizeof(*wcwr) + sizeof(struct fw_ri_isgl) + + wr->num_sge * sizeof(struct fw_ri_sge); + wcwr->plen = htobe32(plen); + *len16 = DIV_ROUND_UP(size, 16); +} + +static int build_rdma_read(union t4_wr *wqe, struct ibv_send_wr *wr, u8 *len16) +{ + if (wr->num_sge > 1) + return -EINVAL; + if (wr->num_sge) { + wqe->read.stag_src = htobe32(wr->wr.rdma.rkey); + wqe->read.to_src_hi = htobe32((u32)(wr->wr.rdma.remote_addr >>32)); + wqe->read.to_src_lo = htobe32((u32)wr->wr.rdma.remote_addr); + wqe->read.stag_sink = htobe32(wr->sg_list[0].lkey); + wqe->read.plen = htobe32(wr->sg_list[0].length); + wqe->read.to_sink_hi = htobe32((u32)(wr->sg_list[0].addr >> 32)); + wqe->read.to_sink_lo = htobe32((u32)(wr->sg_list[0].addr)); + } else { + wqe->read.stag_src = htobe32(2); + wqe->read.to_src_hi = 0; + wqe->read.to_src_lo = 0; + wqe->read.stag_sink = htobe32(2); + wqe->read.plen = 0; + wqe->read.to_sink_hi = 0; + wqe->read.to_sink_lo = 0; + } + wqe->read.r2 = 0; + wqe->read.r5 = 0; + *len16 = DIV_ROUND_UP(sizeof wqe->read, 16); + return 0; +} + +static int build_rdma_recv(struct t4_rq *rq, union t4_recv_wr *wqe, + struct ibv_recv_wr *wr, u8 *len16) +{ + int ret; + + ret = build_isgl((__be64 *)rq->queue, (__be64 *)&rq->queue[rq->size], + &wqe->recv.isgl, wr->sg_list, wr->num_sge, NULL); + if (ret) + return ret; + *len16 = DIV_ROUND_UP(sizeof wqe->recv + + wr->num_sge * sizeof(struct fw_ri_sge), 16); + return 0; +} + +static int build_srq_recv(union t4_recv_wr *wqe, struct ibv_recv_wr *wr, + u8 *len16) +{ + int ret; + + ret = build_isgl((__be64 *)wqe, (__be64 *)(wqe + 1), + &wqe->recv.isgl, wr->sg_list, wr->num_sge, NULL); + if (ret) + return ret; + *len16 = DIV_ROUND_UP(sizeof(wqe->recv) + + wr->num_sge * sizeof(struct fw_ri_sge), 16); + return 0; +} + +static void ring_kernel_db(struct c4iw_qp *qhp, u32 qid, u16 idx) +{ + struct ibv_modify_qp cmd = {}; + 
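+	/*
+	 * Kernel-mediated doorbell ring: the producer index is carried to
+	 * iw_cxgb4 in the otherwise unused sq_psn/rq_psn attribute of a
+	 * MODIFY_QP command (iWARP has no packet sequence numbers to
+	 * modify), and the kernel writes the doorbell on the library's
+	 * behalf.
+	 */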
struct ibv_qp_attr attr;
+	int mask;
+	int __attribute__((unused)) ret;
+
+	/* FIXME: Why do we need this barrier if the kernel is going to
+	   trigger the DMA? */
+	udma_to_device_barrier();
+	if (qid == qhp->wq.sq.qid) {
+		attr.sq_psn = idx;
+		mask = IBV_QP_SQ_PSN;
+	} else {
+		attr.rq_psn = idx;
+		mask = IBV_QP_RQ_PSN;
+	}
+	ret = ibv_cmd_modify_qp(&qhp->ibv_qp, &attr, mask, &cmd, sizeof cmd);
+	assert(!ret);
+}
+
+static void post_write_cmpl(struct c4iw_qp *qhp, struct ibv_send_wr *wr)
+{
+	bool send_signaled = (wr->next->send_flags & IBV_SEND_SIGNALED) ||
+			     qhp->sq_sig_all;
+	bool write_signaled = (wr->send_flags & IBV_SEND_SIGNALED) ||
+			      qhp->sq_sig_all;
+	struct t4_swsqe *swsqe;
+	union t4_wr *wqe;
+	u16 write_wrid;
+	u8 len16;
+	u16 idx;
+
+	/*
+	 * The sw_sq entries still look like a WRITE and a SEND and consume
+	 * 2 slots. The FW WR, however, will be a single uber-WR.
+	 */
+	wqe = (union t4_wr *)((u8 *)qhp->wq.sq.queue +
+	      qhp->wq.sq.wq_pidx * T4_EQ_ENTRY_SIZE);
+	build_rdma_write_cmpl(&qhp->wq.sq, &wqe->write_cmpl, wr, &len16);
+
+	/* WRITE swsqe */
+	swsqe = &qhp->wq.sq.sw_sq[qhp->wq.sq.pidx];
+	swsqe->opcode = FW_RI_RDMA_WRITE;
+	swsqe->idx = qhp->wq.sq.pidx;
+	swsqe->complete = 0;
+	swsqe->signaled = write_signaled;
+	swsqe->flushed = 0;
+	swsqe->wr_id = wr->wr_id;
+
+	write_wrid = qhp->wq.sq.pidx;
+
+	/* just bump the sw_sq */
+	qhp->wq.sq.in_use++;
+	if (++qhp->wq.sq.pidx == qhp->wq.sq.size)
+		qhp->wq.sq.pidx = 0;
+
+	/* SEND swsqe */
+	swsqe = &qhp->wq.sq.sw_sq[qhp->wq.sq.pidx];
+	if (wr->next->opcode == IBV_WR_SEND)
+		swsqe->opcode = FW_RI_SEND;
+	else
+		swsqe->opcode = FW_RI_SEND_WITH_INV;
+	swsqe->idx = qhp->wq.sq.pidx;
+	swsqe->complete = 0;
+	swsqe->signaled = send_signaled;
+	swsqe->flushed = 0;
+	swsqe->wr_id = wr->next->wr_id;
+
+	wqe->write_cmpl.flags_send = send_signaled ? FW_RI_COMPLETION_FLAG : 0;
+	wqe->write_cmpl.wrid_send = qhp->wq.sq.pidx;
+
+	init_wr_hdr(wqe, write_wrid, FW_RI_RDMA_WRITE_CMPL_WR,
+		    write_signaled ? FW_RI_COMPLETION_FLAG : 0, len16);
+	t4_sq_produce(&qhp->wq, len16);
+	idx = DIV_ROUND_UP(len16 * 16, T4_EQ_ENTRY_SIZE);
+
+	t4_ring_sq_db(&qhp->wq, idx, dev_is_t4(qhp->rhp),
+		      len16, wqe);
+}
+
+int c4iw_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
+		   struct ibv_send_wr **bad_wr)
+{
+	int err = 0;
+	u8 uninitialized_var(len16);
+	enum fw_wr_opcodes fw_opcode;
+	enum fw_ri_wr_flags fw_flags;
+	struct c4iw_qp *qhp;
+	union t4_wr *wqe, lwqe;
+	u32 num_wrs;
+	struct t4_swsqe *swsqe;
+	u16 idx = 0;
+
+	qhp = to_c4iw_qp(ibqp);
+	pthread_spin_lock(&qhp->lock);
+	if (t4_wq_in_error(&qhp->wq)) {
+		pthread_spin_unlock(&qhp->lock);
+		*bad_wr = wr;
+		return -EINVAL;
+	}
+	num_wrs = t4_sq_avail(&qhp->wq);
+	if (num_wrs == 0) {
+		pthread_spin_unlock(&qhp->lock);
+		*bad_wr = wr;
+		return -ENOMEM;
+	}
+
+	/*
+	 * Fastpath for NVMe-oF target WRITE + SEND_WITH_INV wr chain which is
+	 * the response for small NVMe-oF READ requests. If the chain is
+	 * exactly a WRITE->SEND_WITH_INV or a WRITE->SEND and the sgl depths
+	 * and lengths meet the requirements of the fw_ri_write_cmpl_wr work
+	 * request, then build and post the write_cmpl WR. If any of the tests
+	 * below are not true, then we continue on with the traditional WRITE
+	 * and SEND WRs.
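+	 *
+	 * Concretely, the tests below require a chain of exactly two WRs
+	 * on a >= T5 chip with write_cmpl support: the WRITE with at most
+	 * T4_WRITE_CMPL_MAX_SGL (4) SGEs and a non-zero first length, and
+	 * the SEND with a single SGE of exactly T4_WRITE_CMPL_MAX_CQE (16)
+	 * bytes, the size of an NVMe completion queue entry, so that
+	 * post_write_cmpl() above can fold both operations into one
+	 * fw_ri_rdma_write_cmpl_wr.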
+ */ + if (qhp->rhp->write_cmpl_supported && + qhp->rhp->chip_version >= CHELSIO_T5 && + wr && wr->next && !wr->next->next && + wr->opcode == IBV_WR_RDMA_WRITE && wr->sg_list[0].length && + wr->num_sge <= T4_WRITE_CMPL_MAX_SGL && + (wr->next->opcode == IBV_WR_SEND_WITH_INV || + wr->next->opcode == IBV_WR_SEND) && + wr->next->sg_list[0].length == T4_WRITE_CMPL_MAX_CQE && + wr->next->num_sge == 1 && num_wrs >= 2) { + post_write_cmpl(qhp, wr); + pthread_spin_unlock(&qhp->lock); + return 0; + } + + while (wr) { + if (num_wrs == 0) { + err = -ENOMEM; + *bad_wr = wr; + break; + } + + wqe = &lwqe; + fw_flags = 0; + if (wr->send_flags & IBV_SEND_SOLICITED) + fw_flags |= FW_RI_SOLICITED_EVENT_FLAG; + if (wr->send_flags & IBV_SEND_SIGNALED || qhp->sq_sig_all) + fw_flags |= FW_RI_COMPLETION_FLAG; + swsqe = &qhp->wq.sq.sw_sq[qhp->wq.sq.pidx]; + switch (wr->opcode) { + case IBV_WR_SEND_WITH_INV: + case IBV_WR_SEND: + INC_STAT(send); + if (wr->send_flags & IBV_SEND_FENCE) + fw_flags |= FW_RI_READ_FENCE_FLAG; + fw_opcode = FW_RI_SEND_WR; + if (wr->opcode == IBV_WR_SEND) + swsqe->opcode = FW_RI_SEND; + else + swsqe->opcode = FW_RI_SEND_WITH_INV; + err = build_rdma_send(&qhp->wq.sq, wqe, wr, &len16); + break; + case IBV_WR_RDMA_WRITE_WITH_IMM: + if (unlikely(!(qhp->wq.sq.flags & T4_SQ_WRITE_W_IMM))) { + err = -EINVAL; + break; + } + fw_flags |= FW_RI_RDMA_WRITE_WITH_IMMEDIATE; + /*FALLTHROUGH*/ + case IBV_WR_RDMA_WRITE: + INC_STAT(write); + fw_opcode = FW_RI_RDMA_WRITE_WR; + swsqe->opcode = FW_RI_RDMA_WRITE; + err = build_rdma_write(&qhp->wq.sq, wqe, wr, &len16); + break; + case IBV_WR_RDMA_READ: + INC_STAT(read); + fw_opcode = FW_RI_RDMA_READ_WR; + swsqe->opcode = FW_RI_READ_REQ; + fw_flags = 0; + err = build_rdma_read(wqe, wr, &len16); + if (err) + break; + swsqe->read_len = wr->sg_list ? 
wr->sg_list[0].length : + 0; + if (!qhp->wq.sq.oldest_read) + qhp->wq.sq.oldest_read = swsqe; + break; + default: + PDBG("%s post of type=%d TBD!\n", __func__, + wr->opcode); + err = -EINVAL; + } + if (err) { + *bad_wr = wr; + break; + } + swsqe->idx = qhp->wq.sq.pidx; + swsqe->complete = 0; + swsqe->signaled = (wr->send_flags & IBV_SEND_SIGNALED) || + qhp->sq_sig_all; + swsqe->flushed = 0; + swsqe->wr_id = wr->wr_id; + + init_wr_hdr(wqe, qhp->wq.sq.pidx, fw_opcode, fw_flags, len16); + PDBG("%s cookie 0x%llx pidx 0x%x opcode 0x%x\n", + __func__, (unsigned long long)wr->wr_id, qhp->wq.sq.pidx, + swsqe->opcode); + wr = wr->next; + num_wrs--; + copy_wr_to_sq(&qhp->wq, wqe, len16); + t4_sq_produce(&qhp->wq, len16); + idx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); + } + if (t4_wq_db_enabled(&qhp->wq)) { + t4_ring_sq_db(&qhp->wq, idx, dev_is_t4(qhp->rhp), + len16, wqe); + } else + ring_kernel_db(qhp, qhp->wq.sq.qid, idx); + /* This write is only for debugging, the value does not matter for DMA + */ + qhp->wq.sq.queue[qhp->wq.sq.size].status.host_wq_pidx = \ + (qhp->wq.sq.wq_pidx); + + pthread_spin_unlock(&qhp->lock); + return err; +} + +static void defer_srq_wr(struct t4_srq *srq, union t4_recv_wr *wqe, + uint64_t wr_id, u8 len16) +{ + struct t4_srq_pending_wr *pwr = &srq->pending_wrs[srq->pending_pidx]; + + PDBG("%s cidx %u pidx %u wq_pidx %u in_use %u ooo_count %u wr_id 0x%llx pending_cidx %u pending_pidx %u pending_in_use %u\n", + __func__, srq->cidx, srq->pidx, srq->wq_pidx, + srq->in_use, srq->ooo_count, (unsigned long long)wr_id, + srq->pending_cidx, srq->pending_pidx, srq->pending_in_use); + pwr->wr_id = wr_id; + pwr->len16 = len16; + memcpy(&pwr->wqe, wqe, len16*16); + t4_srq_produce_pending_wr(srq); +} + +int c4iw_post_srq_recv(struct ibv_srq *ibsrq, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + int err = 0; + struct c4iw_srq *srq; + union t4_recv_wr *wqe, lwqe; + u32 num_wrs; + u8 len16 = 0; + u16 idx = 0; + + srq = to_c4iw_srq(ibsrq); + pthread_spin_lock(&srq->lock); + INC_STAT(srq_recv); + num_wrs = t4_srq_avail(&srq->wq); + if (num_wrs == 0) { + pthread_spin_unlock(&srq->lock); + return -ENOMEM; + } + while (wr) { + if (wr->num_sge > T4_MAX_RECV_SGE) { + err = -EINVAL; + *bad_wr = wr; + break; + } + wqe = &lwqe; + if (num_wrs) + err = build_srq_recv(wqe, wr, &len16); + else + err = -ENOMEM; + if (err) { + *bad_wr = wr; + break; + } + + wqe->recv.opcode = FW_RI_RECV_WR; + wqe->recv.r1 = 0; + wqe->recv.wrid = srq->wq.pidx; + wqe->recv.r2[0] = 0; + wqe->recv.r2[1] = 0; + wqe->recv.r2[2] = 0; + wqe->recv.len16 = len16; + + if (srq->wq.ooo_count || srq->wq.pending_in_use || + srq->wq.sw_rq[srq->wq.pidx].valid) + defer_srq_wr(&srq->wq, wqe, wr->wr_id, len16); + else { + srq->wq.sw_rq[srq->wq.pidx].wr_id = wr->wr_id; + srq->wq.sw_rq[srq->wq.pidx].valid = 1; + c4iw_copy_wr_to_srq(&srq->wq, wqe, len16); + PDBG("%s cidx %u pidx %u wq_pidx %u in_use %u wr_id 0x%llx\n", + __func__, srq->wq.cidx, srq->wq.pidx, + srq->wq.wq_pidx, srq->wq.in_use, + (unsigned long long)wr->wr_id); + t4_srq_produce(&srq->wq, len16); + idx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); + } + wr = wr->next; + num_wrs--; + } + + if (idx) { + t4_ring_srq_db(&srq->wq, idx, len16, wqe); + srq->wq.queue[srq->wq.size].status.host_wq_pidx = + srq->wq.wq_pidx; + } + pthread_spin_unlock(&srq->lock); + return err; +} + +int c4iw_post_receive(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + int err = 0; + struct c4iw_qp *qhp; + union t4_recv_wr *wqe, lwqe; + u32 num_wrs; + u8 len16 = 
0; + u16 idx = 0; + + qhp = to_c4iw_qp(ibqp); + pthread_spin_lock(&qhp->lock); + if (t4_wq_in_error(&qhp->wq)) { + pthread_spin_unlock(&qhp->lock); + *bad_wr = wr; + return -EINVAL; + } + INC_STAT(recv); + num_wrs = t4_rq_avail(&qhp->wq); + if (num_wrs == 0) { + pthread_spin_unlock(&qhp->lock); + *bad_wr = wr; + return -ENOMEM; + } + while (wr) { + if (wr->num_sge > T4_MAX_RECV_SGE) { + err = -EINVAL; + *bad_wr = wr; + break; + } + wqe = &lwqe; + if (num_wrs) + err = build_rdma_recv(&qhp->wq.rq, wqe, wr, &len16); + else + err = -ENOMEM; + if (err) { + *bad_wr = wr; + break; + } + + qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].wr_id = wr->wr_id; + + wqe->recv.opcode = FW_RI_RECV_WR; + wqe->recv.r1 = 0; + wqe->recv.wrid = qhp->wq.rq.pidx; + wqe->recv.r2[0] = 0; + wqe->recv.r2[1] = 0; + wqe->recv.r2[2] = 0; + wqe->recv.len16 = len16; + PDBG("%s cookie 0x%llx pidx %u\n", __func__, + (unsigned long long) wr->wr_id, qhp->wq.rq.pidx); + copy_wr_to_rq(&qhp->wq, wqe, len16); + t4_rq_produce(&qhp->wq, len16); + idx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); + wr = wr->next; + num_wrs--; + } + if (t4_wq_db_enabled(&qhp->wq)) + t4_ring_rq_db(&qhp->wq, idx, dev_is_t4(qhp->rhp), + len16, wqe); + else + ring_kernel_db(qhp, qhp->wq.rq.qid, idx); + qhp->wq.rq.queue[qhp->wq.rq.size].status.host_wq_pidx = \ + (qhp->wq.rq.wq_pidx); + pthread_spin_unlock(&qhp->lock); + return err; +} + +void c4iw_flush_qp(struct c4iw_qp *qhp) +{ + struct c4iw_cq *rchp, *schp; + u32 srqidx; + int count; + + srqidx = t4_wq_srqidx(&qhp->wq); + rchp = to_c4iw_cq(qhp->ibv_qp.recv_cq); + schp = to_c4iw_cq(qhp->ibv_qp.send_cq); + + PDBG("%s qhp %p rchp %p schp %p\n", __func__, qhp, rchp, schp); + + /* locking heirarchy: cq lock first, then qp lock. */ + pthread_spin_lock(&rchp->lock); + if (schp != rchp) + pthread_spin_lock(&schp->lock); + pthread_spin_lock(&qhp->lock); + + if (qhp->wq.flushed) { + pthread_spin_unlock(&qhp->lock); + if (rchp != schp) + pthread_spin_unlock(&schp->lock); + pthread_spin_unlock(&rchp->lock); + return; + } + + qhp->wq.flushed = 1; + t4_set_wq_in_error(&qhp->wq); + + if (qhp->srq) + pthread_spin_lock(&qhp->srq->lock); + + if (srqidx) + c4iw_flush_srqidx(qhp, srqidx); + + qhp->ibv_qp.state = IBV_QPS_ERR; + + c4iw_flush_hw_cq(rchp, qhp); + if (!qhp->srq) { + c4iw_count_rcqes(&rchp->cq, &qhp->wq, &count); + c4iw_flush_rq(&qhp->wq, &rchp->cq, count); + } + + if (schp != rchp) + c4iw_flush_hw_cq(schp, qhp); + + c4iw_flush_sq(qhp); + if (qhp->srq) + pthread_spin_unlock(&qhp->srq->lock); + + pthread_spin_unlock(&qhp->lock); + if (schp != rchp) + pthread_spin_unlock(&schp->lock); + pthread_spin_unlock(&rchp->lock); + +} + +void c4iw_flush_qps(struct c4iw_dev *dev) +{ + int i; + + pthread_spin_lock(&dev->lock); + for (i=0; i < dev->max_qp; i++) { + struct c4iw_qp *qhp = dev->qpid2ptr[i]; + if (qhp) { + if (!qhp->wq.flushed && t4_wq_in_error(&qhp->wq)) { + c4iw_flush_qp(qhp); + } + } + } + pthread_spin_unlock(&dev->lock); +} diff --git a/providers/cxgb4/t4.h b/providers/cxgb4/t4.h new file mode 100644 index 0000000..5af2e74 --- /dev/null +++ b/providers/cxgb4/t4.h @@ -0,0 +1,944 @@ +/* + * Copyright (c) 2006-2016 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __T4_H__ +#define __T4_H__ + +#include <assert.h> +#include <errno.h> +#include <stddef.h> +#include <stdint.h> +#include <syslog.h> +#include <linux/types.h> +#include <util/compiler.h> +#include <util/udma_barrier.h> +#include <util/util.h> +#include <endian.h> + +/* + * Try and minimize the changes from the kernel code that is pull in + * here for kernel bypass ops. + */ +#define u8 uint8_t +#define u16 uint16_t +#define u32 uint32_t +#define u64 uint64_t +#define DECLARE_PCI_UNMAP_ADDR(a) +#define __iomem +#define BUG_ON(c) assert(!(c)) +#define ROUND_UP(x, n) (((x) + (n) - 1u) & ~((n) - 1u)) + +/* FIXME: Move me to a generic PCI mmio accessor */ +#define cpu_to_pci32(val) htole32(val) + +#define writel(v, a) do { *((volatile u32 *)(a)) = cpu_to_pci32(v); } while (0) + +#include "t4_regs.h" +#include "t4_chip_type.h" +#include "t4fw_api.h" +#include "t4fw_ri_api.h" + +extern bool is_64b_cqe; + +#ifdef DEBUG +#define DBGLOG(s) +#define PDBG(fmt, args...) do {syslog(LOG_DEBUG, fmt, ##args); } while (0) +#else +#define DBGLOG(s) +#define PDBG(fmt, args...) do {} while (0) +#endif + +#define A_PCIE_MA_SYNC 0x30b4 + +#define T4_MAX_READ_DEPTH 16 +#define T4_QID_BASE 1024 +#define T4_MAX_QIDS 256 +#define T4_MAX_NUM_PD 65536 +#define T4_EQ_STATUS_ENTRIES (L1_CACHE_BYTES > 64 ? 
2 : 1) +#define T4_MAX_EQ_SIZE (65520 - T4_EQ_STATUS_ENTRIES) +#define T4_MAX_IQ_SIZE (65520 - 1) +#define T4_MAX_RQ_SIZE (8192 - T4_EQ_STATUS_ENTRIES) +#define T4_MAX_SQ_SIZE (T4_MAX_EQ_SIZE - 1) +#define T4_MAX_QP_DEPTH (T4_MAX_RQ_SIZE - 1) +#define T4_MAX_CQ_DEPTH (T4_MAX_IQ_SIZE - 1) +#define T4_MAX_NUM_STAG (1<<15) +#define T4_MAX_MR_SIZE (~0ULL - 1) +#define T4_PAGESIZE_MASK 0xffff000 /* 4KB-128MB */ +#define T4_STAG_UNSET 0xffffffff +#define T4_FW_MAJ 0 + +struct t4_status_page { + __be32 rsvd1; /* flit 0 - hw owns */ + __be16 rsvd2; + __be16 qid; + __be16 cidx; + __be16 pidx; + u8 qp_err; /* flit 1 - sw owns */ + u8 db_off; + u8 pad[2]; + u16 host_wq_pidx; + u16 host_cidx; + u16 host_pidx; + u16 pad2; + u32 srqidx; +}; + +#define T4_EQ_ENTRY_SIZE 64 + +#define T4_SQ_NUM_SLOTS 5 +#define T4_SQ_NUM_BYTES (T4_EQ_ENTRY_SIZE * T4_SQ_NUM_SLOTS) +#define T4_MAX_SEND_SGE ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_send_wr) - sizeof(struct fw_ri_isgl)) / sizeof (struct fw_ri_sge)) +#define T4_MAX_SEND_INLINE ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_send_wr) - sizeof(struct fw_ri_immd))) +#define T4_MAX_WRITE_INLINE ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_rdma_write_wr) - sizeof(struct fw_ri_immd))) +#define T4_MAX_WRITE_SGE ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_rdma_write_wr) - sizeof(struct fw_ri_isgl)) / sizeof (struct fw_ri_sge)) +#define T4_MAX_FR_IMMD ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_fr_nsmr_wr) - sizeof(struct fw_ri_immd))) +#define T4_MAX_FR_DEPTH 255 + +#define T4_RQ_NUM_SLOTS 2 +#define T4_RQ_NUM_BYTES (T4_EQ_ENTRY_SIZE * T4_RQ_NUM_SLOTS) +#define T4_MAX_RECV_SGE 4 + +#define T4_WRITE_CMPL_MAX_SGL 4 +#define T4_WRITE_CMPL_MAX_CQE 16 + +union t4_wr { + struct fw_ri_res_wr res; + struct fw_ri_wr init; + struct fw_ri_rdma_write_wr write; + struct fw_ri_send_wr send; + struct fw_ri_rdma_read_wr read; + struct fw_ri_bind_mw_wr bind; + struct fw_ri_fr_nsmr_wr fr; + struct fw_ri_inv_lstag_wr inv; + struct fw_ri_rdma_write_cmpl_wr write_cmpl; + struct t4_status_page status; + __be64 flits[T4_EQ_ENTRY_SIZE / sizeof(__be64) * T4_SQ_NUM_SLOTS]; +}; + +union t4_recv_wr { + struct fw_ri_recv_wr recv; + struct t4_status_page status; + __be64 flits[T4_EQ_ENTRY_SIZE / sizeof(__be64) * T4_RQ_NUM_SLOTS]; +}; + +static inline void init_wr_hdr(union t4_wr *wqe, u16 wrid, + enum fw_wr_opcodes opcode, u8 flags, u8 len16) +{ + wqe->send.opcode = (u8)opcode; + wqe->send.flags = flags; + wqe->send.wrid = wrid; + wqe->send.r1[0] = 0; + wqe->send.r1[1] = 0; + wqe->send.r1[2] = 0; + wqe->send.len16 = len16; +} + +/* CQE/AE status codes */ +#define T4_ERR_SUCCESS 0x0 +#define T4_ERR_STAG 0x1 /* STAG invalid: either the */ + /* STAG is offlimt, being 0, */ + /* or STAG_key mismatch */ +#define T4_ERR_PDID 0x2 /* PDID mismatch */ +#define T4_ERR_QPID 0x3 /* QPID mismatch */ +#define T4_ERR_ACCESS 0x4 /* Invalid access right */ +#define T4_ERR_WRAP 0x5 /* Wrap error */ +#define T4_ERR_BOUND 0x6 /* base and bounds voilation */ +#define T4_ERR_INVALIDATE_SHARED_MR 0x7 /* attempt to invalidate a */ + /* shared memory region */ +#define T4_ERR_INVALIDATE_MR_WITH_MW_BOUND 0x8 /* attempt to invalidate a */ + /* shared memory region */ +#define T4_ERR_ECC 0x9 /* ECC error detected */ +#define T4_ERR_ECC_PSTAG 0xA /* ECC error detected when */ + /* reading PSTAG for a MW */ + /* Invalidate */ +#define T4_ERR_PBL_ADDR_BOUND 0xB /* pbl addr out of bounds: */ + /* software error */ +#define T4_ERR_SWFLUSH 0xC /* SW FLUSHED */ +#define T4_ERR_CRC 0x10 /* CRC error */ +#define T4_ERR_MARKER 0x11 /* Marker error */ 
+#define T4_ERR_PDU_LEN_ERR 0x12 /* invalid PDU length */ +#define T4_ERR_OUT_OF_RQE 0x13 /* out of RQE */ +#define T4_ERR_DDP_VERSION 0x14 /* wrong DDP version */ +#define T4_ERR_RDMA_VERSION 0x15 /* wrong RDMA version */ +#define T4_ERR_OPCODE 0x16 /* invalid rdma opcode */ +#define T4_ERR_DDP_QUEUE_NUM 0x17 /* invalid ddp queue number */ +#define T4_ERR_MSN 0x18 /* MSN error */ +#define T4_ERR_TBIT 0x19 /* tag bit not set correctly */ +#define T4_ERR_MO 0x1A /* MO not 0 for TERMINATE */ + /* or READ_REQ */ +#define T4_ERR_MSN_GAP 0x1B +#define T4_ERR_MSN_RANGE 0x1C +#define T4_ERR_IRD_OVERFLOW 0x1D +#define T4_ERR_RQE_ADDR_BOUND 0x1E /* RQE addr out of bounds: */ + /* software error */ +#define T4_ERR_INTERNAL_ERR 0x1F /* internal error (opcode */ + /* mismatch) */ +/* + * CQE defs + */ +struct t4_cqe_common { + __be32 header; + __be32 len; + union { + struct { + __be32 stag; + __be32 msn; + } rcqe; + struct { + __be32 stag; + u16 nada2; + u16 cidx; + } scqe; + struct { + __be32 wrid_hi; + __be32 wrid_low; + } gen; + struct { + __be32 stag; + __be32 msn; + } srcqe; + struct { + __be32 mo; + __be32 msn; + } imm_data_rcqe; + u64 drain_cookie; + } u; +}; + +struct t4_cqe_b32 { + struct t4_cqe_common com; + __be64 reserved; + __be64 bits_type_ts; +}; + +struct t4_cqe_b64 { + struct t4_cqe_common com; + union { + struct { + __be32 reserved; + __be32 abs_rqe_idx; + } srcqe; + union { + struct { + __be32 imm_data32; + u32 reserved; + } ib_imm_data; + __be64 imm_data64; + } imm_data_rcqe; + __be64 flits[3]; + } u; + __be64 reserved[2]; + __be64 bits_type_ts; + +}; + +union t4_cqe { + struct t4_cqe_common com; + struct t4_cqe_b32 b32; + struct t4_cqe_b64 b64; +}; + +/* macros for flit 0 of the cqe */ + +#define S_CQE_QPID 12 +#define M_CQE_QPID 0xFFFFF +#define G_CQE_QPID(x) ((((x) >> S_CQE_QPID)) & M_CQE_QPID) +#define V_CQE_QPID(x) ((x)<<S_CQE_QPID) + +#define S_CQE_SWCQE 11 +#define M_CQE_SWCQE 0x1 +#define G_CQE_SWCQE(x) ((((x) >> S_CQE_SWCQE)) & M_CQE_SWCQE) +#define V_CQE_SWCQE(x) ((x)<<S_CQE_SWCQE) + +#define S_CQE_STATUS 5 +#define M_CQE_STATUS 0x1F +#define G_CQE_STATUS(x) ((((x) >> S_CQE_STATUS)) & M_CQE_STATUS) +#define V_CQE_STATUS(x) ((x)<<S_CQE_STATUS) + +#define S_CQE_TYPE 4 +#define M_CQE_TYPE 0x1 +#define G_CQE_TYPE(x) ((((x) >> S_CQE_TYPE)) & M_CQE_TYPE) +#define V_CQE_TYPE(x) ((x)<<S_CQE_TYPE) + +#define S_CQE_OPCODE 0 +#define M_CQE_OPCODE 0xF +#define G_CQE_OPCODE(x) ((((x) >> S_CQE_OPCODE)) & M_CQE_OPCODE) +#define V_CQE_OPCODE(x) ((x)<<S_CQE_OPCODE) + +#define SW_CQE(x) (G_CQE_SWCQE(be32toh((x)->header))) +#define CQE_QPID(x) (G_CQE_QPID(be32toh((x)->header))) +#define CQE_TYPE(x) (G_CQE_TYPE(be32toh((x)->header))) +#define SQ_TYPE(x) (CQE_TYPE((x))) +#define RQ_TYPE(x) (!CQE_TYPE((x))) +#define CQE_STATUS(x) (G_CQE_STATUS(be32toh((x)->header))) +#define CQE_OPCODE(x) (G_CQE_OPCODE(be32toh((x)->header))) + +#define CQE_SEND_OPCODE(x)( \ + (G_CQE_OPCODE(be32toh((x)->header)) == FW_RI_SEND) || \ + (G_CQE_OPCODE(be32toh((x)->header)) == FW_RI_SEND_WITH_SE) || \ + (G_CQE_OPCODE(be32toh((x)->header)) == FW_RI_SEND_WITH_INV) || \ + (G_CQE_OPCODE(be32toh((x)->header)) == FW_RI_SEND_WITH_SE_INV)) + +#define CQE_LEN(x) (be32toh((x)->len)) + +/* used for RQ completion processing */ +#define CQE_WRID_STAG(x) (be32toh((x)->u.rcqe.stag)) +#define CQE_WRID_MSN(x) (be32toh((x)->u.rcqe.msn)) +#define CQE_ABS_RQE_IDX(x) (be32toh((x)->u.srcqe.abs_rqe_idx)) +#define CQE_IMM_DATA(x) ((x)->u.imm_data_rcqe.ib_imm_data.imm_data32) + +/* used for SQ completion processing */ +#define 
CQE_WRID_SQ_IDX(x) (x)->u.scqe.cidx + +/* generic accessor macros */ +#define CQE_WRID_HI(x) ((x)->u.gen.wrid_hi) +#define CQE_WRID_LOW(x) ((x)->u.gen.wrid_low) + +/* macros for flit 3 of the cqe */ +#define S_CQE_GENBIT 63 +#define M_CQE_GENBIT 0x1 +#define G_CQE_GENBIT(x) (((x) >> S_CQE_GENBIT) & M_CQE_GENBIT) +#define V_CQE_GENBIT(x) ((x)<<S_CQE_GENBIT) + +#define S_CQE_OVFBIT 62 +#define M_CQE_OVFBIT 0x1 +#define G_CQE_OVFBIT(x) ((((x) >> S_CQE_OVFBIT)) & M_CQE_OVFBIT) + +#define S_CQE_IQTYPE 60 +#define M_CQE_IQTYPE 0x3 +#define G_CQE_IQTYPE(x) ((((x) >> S_CQE_IQTYPE)) & M_CQE_IQTYPE) + +#define M_CQE_TS 0x0fffffffffffffffULL +#define G_CQE_TS(x) ((x) & M_CQE_TS) + +#define CQE_OVFBIT(x) ((unsigned)G_CQE_OVFBIT(be64toh((x)->bits_type_ts))) +#define CQE_GENBIT(x) ((unsigned)G_CQE_GENBIT(be64toh((x)->bits_type_ts))) +#define CQE_TS(x) (G_CQE_TS(be64toh((x)->bits_type_ts))) + +#define CQE_SIZE(x) (is_64b_cqe ? sizeof(*(x)) : sizeof(*(x))/2) +#define Q_ENTRY(x, y) ((union t4_cqe *)(((u8 *)x) + ((CQE_SIZE(x))*y))) +#define GEN_BIT(x) (is_64b_cqe ? \ + ((x)->b64.bits_type_ts) : ((x)->b32.bits_type_ts)) +#define GEN_ADDR(x) (is_64b_cqe ? \ + (&((x)->b64.bits_type_ts)) : (&((x)->b32.bits_type_ts))) + +struct t4_swsqe { + u64 wr_id; + union t4_cqe cqe; + __be32 read_len; + int opcode; + int complete; + int signaled; + u16 idx; + int flushed; +}; + +enum { + T4_SQ_ONCHIP = (1 << 0), + T4_SQ_WRITE_W_IMM = (1 << 1) +}; + +struct t4_sq { + /* queue is either host memory or WC MMIO memory if + * t4_sq_onchip(). */ + union t4_wr *queue; + struct t4_swsqe *sw_sq; + struct t4_swsqe *oldest_read; + /* udb is either UC or WC MMIO memory depending on device version. */ + volatile u32 *udb; + size_t memsize; + u32 qid; + u32 bar2_qid; + void *ma_sync; + u16 in_use; + u16 size; + u16 cidx; + u16 pidx; + u16 wq_pidx; + u16 flags; + short flush_cidx; + int wc_reg_available; +}; + +struct t4_swrqe { + u64 wr_id; + int valid; +}; + +struct t4_rq { + union t4_recv_wr *queue; + struct t4_swrqe *sw_rq; + volatile u32 *udb; + size_t memsize; + u32 qid; + u32 bar2_qid; + u32 msn; + u32 rqt_hwaddr; + u16 rqt_size; + u16 in_use; + u16 size; + u16 cidx; + u16 pidx; + u16 wq_pidx; + int wc_reg_available; +}; + +struct t4_wq { + struct t4_sq sq; + struct t4_rq rq; + struct c4iw_rdev *rdev; + u32 qid_mask; + int error; + int flushed; + u8 *db_offp; + u8 *qp_errp; + u32 *srqidxp; +}; + +static inline int t4_rqes_posted(struct t4_wq *wq) +{ + return wq->rq.in_use; +} + +static inline int t4_rq_empty(struct t4_wq *wq) +{ + return wq->rq.in_use == 0; +} + +static inline int t4_rq_full(struct t4_wq *wq) +{ + return wq->rq.in_use == (wq->rq.size - 1); +} + +static inline u32 t4_rq_avail(struct t4_wq *wq) +{ + return wq->rq.size - 1 - wq->rq.in_use; +} + +static inline void t4_rq_produce(struct t4_wq *wq, u8 len16) +{ + wq->rq.in_use++; + if (++wq->rq.pidx == wq->rq.size) + wq->rq.pidx = 0; + wq->rq.wq_pidx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); + if (wq->rq.wq_pidx >= wq->rq.size * T4_RQ_NUM_SLOTS) + wq->rq.wq_pidx %= wq->rq.size * T4_RQ_NUM_SLOTS; + if (!wq->error) + wq->rq.queue[wq->rq.size].status.host_pidx = wq->rq.pidx; +} + +static inline void t4_rq_consume(struct t4_wq *wq) +{ + wq->rq.in_use--; + if (++wq->rq.cidx == wq->rq.size) + wq->rq.cidx = 0; + assert((wq->rq.cidx != wq->rq.pidx) || wq->rq.in_use == 0); + if (!wq->error) + wq->rq.queue[wq->rq.size].status.host_cidx = wq->rq.cidx; +} + +struct t4_srq_pending_wr { + u64 wr_id; + union t4_recv_wr wqe; + u8 len16; +}; + +struct t4_srq { + union t4_recv_wr 
*queue; + struct t4_swrqe *sw_rq; + u32 *udb; + size_t memsize; + u32 qid; + u32 bar2_qid; + u32 msn; + u32 rqt_hwaddr; + u32 rqt_abs_idx; + u16 in_use; + u16 size; + u16 cidx; + u16 pidx; + u16 wq_pidx; + int wc_reg_available; + struct t4_srq_pending_wr *pending_wrs; + u16 pending_cidx; + u16 pending_pidx; + u16 pending_in_use; + u16 ooo_count; +}; + +static inline u32 t4_srq_avail(struct t4_srq *srq) +{ + return srq->size - 1 - srq->in_use; +} + +static inline int t4_srq_empty(struct t4_srq *srq) +{ + return srq->in_use == 0; +} + +static inline int t4_srq_cidx_at_end(struct t4_srq *srq) +{ + assert(srq->cidx != srq->pidx); + if (srq->cidx < srq->pidx) + return srq->cidx == (srq->pidx - 1); + else + return srq->cidx == (srq->size - 1) && srq->pidx == 0; +} + +static inline int t4_srq_wrs_pending(struct t4_srq *srq) +{ + return srq->pending_cidx != srq->pending_pidx; +} + +static inline void t4_srq_produce(struct t4_srq *srq, u8 len16) +{ + srq->in_use++; + assert(srq->in_use < srq->size); + if (++srq->pidx == srq->size) + srq->pidx = 0; + assert(srq->cidx != srq->pidx); /* overflow */ + srq->wq_pidx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); + if (srq->wq_pidx >= srq->size * T4_RQ_NUM_SLOTS) + srq->wq_pidx %= srq->size * T4_RQ_NUM_SLOTS; + srq->queue[srq->size].status.host_pidx = srq->pidx; +} + +static inline void t4_srq_produce_pending_wr(struct t4_srq *srq) +{ + srq->pending_in_use++; + srq->in_use++; + assert(srq->pending_in_use < srq->size); + assert(srq->in_use < srq->size); + assert(srq->pending_pidx < srq->size); + if (++srq->pending_pidx == srq->size) + srq->pending_pidx = 0; +} + +static inline void t4_srq_consume_pending_wr(struct t4_srq *srq) +{ + assert(srq->pending_in_use > 0); + srq->pending_in_use--; + assert(srq->in_use > 0); + srq->in_use--; + if (++srq->pending_cidx == srq->size) + srq->pending_cidx = 0; + assert((srq->pending_cidx != srq->pending_pidx) || + srq->pending_in_use == 0); +} + +static inline void t4_srq_produce_ooo(struct t4_srq *srq) +{ + assert(srq->in_use > 0); + srq->in_use--; + srq->ooo_count++; + assert(srq->ooo_count < srq->size); +} + +static inline void t4_srq_consume_ooo(struct t4_srq *srq) +{ + srq->cidx++; + if (srq->cidx == srq->size) + srq->cidx = 0; + srq->queue[srq->size].status.host_cidx = srq->cidx; + assert(srq->ooo_count > 0); + srq->ooo_count--; +} + +static inline void t4_srq_consume(struct t4_srq *srq) +{ + assert(srq->in_use > 0); + srq->in_use--; + if (++srq->cidx == srq->size) + srq->cidx = 0; + assert((srq->cidx != srq->pidx) || srq->in_use == 0); + srq->queue[srq->size].status.host_cidx = srq->cidx; +} + +static inline int t4_wq_in_error(struct t4_wq *wq) +{ + return wq->error || *wq->qp_errp; +} + +static inline u32 t4_wq_srqidx(struct t4_wq *wq) +{ + u32 srqidx; + + if (!wq->srqidxp) + return 0; + srqidx = *wq->srqidxp; + wq->srqidxp = 0; + return srqidx; +} + +static inline int t4_sq_empty(struct t4_wq *wq) +{ + return wq->sq.in_use == 0; +} + +static inline int t4_sq_full(struct t4_wq *wq) +{ + return wq->sq.in_use == (wq->sq.size - 1); +} + +static inline u32 t4_sq_avail(struct t4_wq *wq) +{ + return wq->sq.size - 1 - wq->sq.in_use; +} + +static inline int t4_sq_onchip(struct t4_wq *wq) +{ + return wq->sq.flags & T4_SQ_ONCHIP; +} + +static inline void t4_sq_produce(struct t4_wq *wq, u8 len16) +{ + wq->sq.in_use++; + if (++wq->sq.pidx == wq->sq.size) + wq->sq.pidx = 0; + wq->sq.wq_pidx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); + if (wq->sq.wq_pidx >= wq->sq.size * T4_SQ_NUM_SLOTS) + wq->sq.wq_pidx %= wq->sq.size * 
T4_SQ_NUM_SLOTS;
+	if (!wq->error) {
+		/* This write is only for debugging, the value does not matter
+		 * for DMA */
+		wq->sq.queue[wq->sq.size].status.host_pidx = (wq->sq.pidx);
+	}
+}
+
+static inline void t4_sq_consume(struct t4_wq *wq)
+{
+	assert(wq->sq.in_use >= 1);
+	if (wq->sq.cidx == wq->sq.flush_cidx)
+		wq->sq.flush_cidx = -1;
+	wq->sq.in_use--;
+	if (++wq->sq.cidx == wq->sq.size)
+		wq->sq.cidx = 0;
+	assert((wq->sq.cidx != wq->sq.pidx) || wq->sq.in_use == 0);
+	if (!wq->error) {
+		/* This write is only for debugging, the value does not matter
+		 * for DMA */
+		wq->sq.queue[wq->sq.size].status.host_cidx = wq->sq.cidx;
+	}
+}
+
+/* Copies to WC MMIO memory */
+static void copy_wqe_to_udb(volatile u32 *udb_offset, void *wqe)
+{
+	u64 *src, *dst;
+	int len16 = 4;
+
+	src = (u64 *)wqe;
+	dst = (u64 *)udb_offset;
+
+	while (len16) {
+		*dst++ = *src++;
+		*dst++ = *src++;
+		len16--;
+	}
+}
+
+extern int ma_wr;
+extern int t5_en_wc;
+
+static inline void t4_ring_sq_db(struct t4_wq *wq, u16 inc, u8 t4, u8 len16,
+				 union t4_wr *wqe)
+{
+	if (!t4) {
+		mmio_wc_start();
+		if (t5_en_wc && inc == 1 && wq->sq.wc_reg_available) {
+			PDBG("%s: WC wq->sq.pidx = %d; len16=%d\n",
+			     __func__, wq->sq.pidx, len16);
+			copy_wqe_to_udb(wq->sq.udb + 14, wqe);
+		} else {
+			PDBG("%s: DB wq->sq.pidx = %d; len16=%d\n",
+			     __func__, wq->sq.pidx, len16);
+			writel(QID_V(wq->sq.bar2_qid) | PIDX_T5_V(inc),
+			       wq->sq.udb);
+		}
+		/* udb is WC for > t4 devices */
+		mmio_flush_writes();
+		return;
+	}
+
+	udma_to_device_barrier();
+	if (ma_wr) {
+		if (t4_sq_onchip(wq)) {
+			int i;
+
+			mmio_wc_start();
+			for (i = 0; i < 16; i++)
+				*(volatile u32 *)&wq->sq.queue[wq->sq.size].flits[2+i] = i;
+			mmio_flush_writes();
+		}
+	} else {
+		if (t4_sq_onchip(wq)) {
+			int i;
+
+			mmio_wc_start();
+			for (i = 0; i < 16; i++)
+				/* FIXME: What is this supposed to be doing?
+				 * Writing to the same address multiple times
+				 * with WC memory is not guaranteed to
+				 * generate any more than one TLP. Why isn't
+				 * writing to WC memory marked volatile?
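+				 * (Presumably the intent mirrors the ma_wr
+				 * branch above, whose loop stores through a
+				 * volatile pointer to 16 distinct flit
+				 * addresses and so forces the compiler to
+				 * emit every store; this loop rewrites one
+				 * non-volatile address, which the compiler
+				 * may legally collapse into a single store.)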
*/ + *(u32 *)&wq->sq.queue[wq->sq.size].flits[2] = i; + mmio_flush_writes(); + } + } + /* udb is UC for t4 devices */ + writel(QID_V(wq->sq.qid & wq->qid_mask) | PIDX_V(inc), wq->sq.udb); +} + +static inline void t4_ring_rq_db(struct t4_wq *wq, u16 inc, u8 t4, u8 len16, + union t4_recv_wr *wqe) +{ + if (!t4) { + mmio_wc_start(); + if (t5_en_wc && inc == 1 && wq->sq.wc_reg_available) { + PDBG("%s: WC wq->rq.pidx = %d; len16=%d\n", + __func__, wq->rq.pidx, len16); + copy_wqe_to_udb(wq->rq.udb + 14, wqe); + } else { + PDBG("%s: DB wq->rq.pidx = %d; len16=%d\n", + __func__, wq->rq.pidx, len16); + writel(QID_V(wq->rq.bar2_qid) | PIDX_T5_V(inc), + wq->rq.udb); + } + /* udb is WC for > t4 devices */ + mmio_flush_writes(); + return; + } + /* udb is UC for t4 devices */ + udma_to_device_barrier(); + writel(QID_V(wq->rq.qid & wq->qid_mask) | PIDX_V(inc), wq->rq.udb); +} + +static inline void t4_ring_srq_db(struct t4_srq *srq, u16 inc, u8 len16, + union t4_recv_wr *wqe) +{ + mmio_wc_start(); + if (t5_en_wc && inc == 1 && srq->wc_reg_available) { + PDBG("%s: WC srq->pidx = %d; len16=%d\n", + __func__, srq->pidx, len16); + copy_wqe_to_udb(srq->udb + 14, wqe); + } else { + PDBG("%s: DB srq->pidx = %d; len16=%d\n", + __func__, srq->pidx, len16); + writel(QID_V(srq->bar2_qid) | PIDX_T5_V(inc), srq->udb); + } + mmio_flush_writes(); + return; +} + +static inline void t4_set_wq_in_error(struct t4_wq *wq) +{ + *wq->qp_errp = 1; +} + +extern int c4iw_abi_version; + +static inline int t4_wq_db_enabled(struct t4_wq *wq) +{ + /* + * If iw_cxgb4 driver supports door bell drop recovery then its + * c4iw_abi_version would be greater than or equal to 2. In such + * case return the status of db_off flag to ring the kernel mode + * DB from user mode library. + */ + if ( c4iw_abi_version >= 2 ) + return ! 
*wq->db_offp; + else + return 1; +} + +struct t4_cq { + union t4_cqe *queue; + union t4_cqe *sw_queue; + struct c4iw_rdev *rdev; + volatile u32 *ugts; + size_t memsize; + u64 bits_type_ts; + u32 cqid; + u32 qid_mask; + u16 size; /* including status page */ + u16 cidx; + u16 sw_pidx; + u16 sw_cidx; + u16 sw_in_use; + u16 cidx_inc; + u8 gen; + u8 error; + u8 *qp_errp; +}; + +static inline int t4_arm_cq(struct t4_cq *cq, int se) +{ + u32 val; + + while (cq->cidx_inc > CIDXINC_M) { + val = SEINTARM_V(0) | CIDXINC_V(CIDXINC_M) | TIMERREG_V(7) | + INGRESSQID_V(cq->cqid & cq->qid_mask); + writel(val, cq->ugts); + cq->cidx_inc -= CIDXINC_M; + } + val = SEINTARM_V(se) | CIDXINC_V(cq->cidx_inc) | TIMERREG_V(6) | + INGRESSQID_V(cq->cqid & cq->qid_mask); + writel(val, cq->ugts); + cq->cidx_inc = 0; + return 0; +} + +static inline void t4_swcq_produce(struct t4_cq *cq) +{ + cq->sw_in_use++; + if (cq->sw_in_use == cq->size) { + syslog(LOG_NOTICE, "cxgb4 sw cq overflow cqid %u\n", cq->cqid); + cq->error = 1; + assert(0); + } + if (++cq->sw_pidx == cq->size) + cq->sw_pidx = 0; +} + +static inline void t4_swcq_consume(struct t4_cq *cq) +{ + assert(cq->sw_in_use >= 1); + cq->sw_in_use--; + if (++cq->sw_cidx == cq->size) + cq->sw_cidx = 0; +} + +static inline void t4_hwcq_consume(struct t4_cq *cq) +{ + cq->bits_type_ts = GEN_BIT(Q_ENTRY(cq->queue, cq->cidx)); + if (++cq->cidx_inc == (cq->size >> 4) || cq->cidx_inc == CIDXINC_M) { + uint32_t val; + + val = SEINTARM_V(0) | CIDXINC_V(cq->cidx_inc) | TIMERREG_V(7) | + INGRESSQID_V(cq->cqid & cq->qid_mask); + writel(val, cq->ugts); + cq->cidx_inc = 0; + } + if (++cq->cidx == cq->size) { + cq->cidx = 0; + cq->gen ^= 1; + } + ((struct t4_status_page *)Q_ENTRY(cq->queue, cq->size))->host_cidx = + cq->cidx; +} + +static inline int t4_valid_cqe(struct t4_cq *cq, union t4_cqe *cqe) +{ + return (is_64b_cqe ? 
CQE_GENBIT(&cqe->b64) : (CQE_GENBIT(&cqe->b32))) + == cq->gen; +} + +static inline int t4_next_hw_cqe(struct t4_cq *cq, union t4_cqe **cqe) +{ + int ret; + u16 prev_cidx; + + if (cq->cidx == 0) + prev_cidx = cq->size - 1; + else + prev_cidx = cq->cidx - 1; + + if (GEN_BIT(Q_ENTRY(cq->queue, prev_cidx)) != cq->bits_type_ts) { + ret = -EOVERFLOW; + syslog(LOG_NOTICE, "cxgb4 cq overflow cqid %u\n", cq->cqid); + cq->error = 1; + assert(0); + } else if (t4_valid_cqe(cq, Q_ENTRY(cq->queue, cq->cidx))) { + udma_from_device_barrier(); + *cqe = Q_ENTRY(cq->queue, cq->cidx); + ret = 0; + } else + ret = -ENODATA; + return ret; +} + +static inline union t4_cqe *t4_next_sw_cqe(struct t4_cq *cq) +{ + if (cq->sw_in_use == cq->size) { + syslog(LOG_NOTICE, "cxgb4 sw cq overflow cqid %u\n", cq->cqid); + cq->error = 1; + assert(0); + return NULL; + } + if (cq->sw_in_use) + return Q_ENTRY(cq->sw_queue, cq->sw_cidx); + return NULL; +} + +static inline int t4_cq_notempty(struct t4_cq *cq) +{ + return cq->sw_in_use || t4_valid_cqe(cq, Q_ENTRY(cq->queue, cq->cidx)); +} + +static inline int t4_next_cqe(struct t4_cq *cq, union t4_cqe **cqe) +{ + int ret = 0; + + if (cq->error) + ret = -ENODATA; + else if (cq->sw_in_use) + *cqe = Q_ENTRY(cq->sw_queue, cq->sw_cidx); + else ret = t4_next_hw_cqe(cq, cqe); + return ret; +} + +static inline int t4_cq_in_error(struct t4_cq *cq) +{ + return *cq->qp_errp; +} + +static inline void t4_set_cq_in_error(struct t4_cq *cq) +{ + *cq->qp_errp = 1; +} + +static inline void t4_reset_cq_in_error(struct t4_cq *cq) +{ + *cq->qp_errp = 0; +} + +struct t4_dev_status_page +{ + u8 db_off; + u8 write_cmpl_supported; + u16 pad2; + u32 pad3; + u64 qp_start; + u64 qp_size; + u64 cq_start; + u64 cq_size; +}; + +#endif diff --git a/providers/cxgb4/t4_chip_type.h b/providers/cxgb4/t4_chip_type.h new file mode 100644 index 0000000..54b7181 --- /dev/null +++ b/providers/cxgb4/t4_chip_type.h @@ -0,0 +1,85 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2015 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef __T4_CHIP_TYPE_H__
+#define __T4_CHIP_TYPE_H__
+
+#define CHELSIO_T4 0x4
+#define CHELSIO_T5 0x5
+#define CHELSIO_T6 0x6
+
+/* We code the Chelsio T4 Family "Chip Code" as a tuple:
+ *
+ * (Chip Version, Chip Revision)
+ *
+ * where:
+ *
+ * Chip Version: is T4, T5, etc.
+ * Chip Revision: is the FAB "spin" of the Chip Version.
+ */
+#define CHELSIO_CHIP_CODE(version, revision) (((version) << 4) | (revision))
+#define CHELSIO_CHIP_VERSION(code) (((code) >> 4) & 0xf)
+#define CHELSIO_CHIP_RELEASE(code) ((code) & 0xf)
+
+enum chip_type {
+	T4_A1 = CHELSIO_CHIP_CODE(CHELSIO_T4, 1),
+	T4_A2 = CHELSIO_CHIP_CODE(CHELSIO_T4, 2),
+	T4_FIRST_REV = T4_A1,
+	T4_LAST_REV = T4_A2,
+
+	T5_A0 = CHELSIO_CHIP_CODE(CHELSIO_T5, 0),
+	T5_A1 = CHELSIO_CHIP_CODE(CHELSIO_T5, 1),
+	T5_FIRST_REV = T5_A0,
+	T5_LAST_REV = T5_A1,
+
+	T6_A0 = CHELSIO_CHIP_CODE(CHELSIO_T6, 0),
+	T6_FIRST_REV = T6_A0,
+	T6_LAST_REV = T6_A0,
+};
+
+static inline int is_t4(enum chip_type chip)
+{
+	return (CHELSIO_CHIP_VERSION(chip) == CHELSIO_T4);
+}
+
+static inline int is_t5(enum chip_type chip)
+{
+	return (CHELSIO_CHIP_VERSION(chip) == CHELSIO_T5);
+}
+
+static inline int is_t6(enum chip_type chip)
+{
+	return (CHELSIO_CHIP_VERSION(chip) == CHELSIO_T6);
+}
+
+#endif /* __T4_CHIP_TYPE_H__ */
diff --git a/providers/cxgb4/t4_pci_id_tbl.h b/providers/cxgb4/t4_pci_id_tbl.h
new file mode 100644
index 0000000..ba6f13e
--- /dev/null
+++ b/providers/cxgb4/t4_pci_id_tbl.h
@@ -0,0 +1,214 @@
+/*
+ * This file is part of the Chelsio T4/T5 Ethernet driver for Linux.
+ *
+ * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __T4_PCI_ID_TBL_H__
+#define __T4_PCI_ID_TBL_H__
+
+/* Code that includes this file can define cpp macros for creating a PCI
+ * Device ID Table. This is useful because it allows the PCI ID Table to
+ * be maintained in a single place.
+ *
+ * The macros are:
+ *
+ * CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN
+ * -- Used to start the definition of the PCI ID Table.
+ *
+ * CH_PCI_DEVICE_ID_FUNCTION
+ * -- The PCI Function Number to use in the PCI Device ID Table. 
"0" + * -- for drivers attaching to PF0-3, "4" for drivers attaching to PF4, + * -- "8" for drivers attaching to SR-IOV Virtual Functions, etc. + * + * CH_PCI_DEVICE_ID_FUNCTION2 [optional] + * -- If defined, create a PCI Device ID Table with both + * -- CH_PCI_DEVICE_ID_FUNCTION and CH_PCI_DEVICE_ID_FUNCTION2 populated. + * + * CH_PCI_ID_TABLE_ENTRY(DeviceID) + * -- Used for the individual PCI Device ID entries. Note that we will + * -- be adding a trailing comma (",") after all of the entries (and + * -- between the pairs of entries if CH_PCI_DEVICE_ID_FUNCTION2 is defined). + * + * CH_PCI_DEVICE_ID_TABLE_DEFINE_END + * -- Used to finish the definition of the PCI ID Table. Note that we + * -- will be adding a trailing semi-colon (";") here. + */ +#ifndef CH_PCI_DEVICE_ID_FUNCTION +#error CH_PCI_DEVICE_ID_FUNCTION not defined! +#endif +#ifndef CH_PCI_ID_TABLE_ENTRY +#error CH_PCI_ID_TABLE_ENTRY not defined! +#endif +#ifndef CH_PCI_DEVICE_ID_TABLE_DEFINE_END +#error CH_PCI_DEVICE_ID_TABLE_DEFINE_END not defined! +#endif + +/* T4 and later ASICs use a PCI Device ID scheme of 0xVFPP where: + * + * V = "4" for T4; "5" for T5, etc. + * F = "0" for PF 0..3; "4".."7" for PF4..7; and "8" for VFs + * PP = adapter product designation + * + * We use this consistency in order to create the proper PCI Device IDs + * for the specified CH_PCI_DEVICE_ID_FUNCTION. + */ +#ifndef CH_PCI_DEVICE_ID_FUNCTION2 +#define CH_PCI_ID_TABLE_FENTRY(devid) \ + CH_PCI_ID_TABLE_ENTRY((devid) | \ + ((CH_PCI_DEVICE_ID_FUNCTION) << 8)) +#else +#define CH_PCI_ID_TABLE_FENTRY(devid) \ + CH_PCI_ID_TABLE_ENTRY((devid) | \ + ((CH_PCI_DEVICE_ID_FUNCTION) << 8)), \ + CH_PCI_ID_TABLE_ENTRY((devid) | \ + ((CH_PCI_DEVICE_ID_FUNCTION2) << 8)) +#endif + +CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN + /* T4 adapters: + */ + CH_PCI_ID_TABLE_FENTRY(0x4000), /* T440-dbg */ + CH_PCI_ID_TABLE_FENTRY(0x4001), /* T420-cr */ + CH_PCI_ID_TABLE_FENTRY(0x4002), /* T422-cr */ + CH_PCI_ID_TABLE_FENTRY(0x4003), /* T440-cr */ + CH_PCI_ID_TABLE_FENTRY(0x4004), /* T420-bch */ + CH_PCI_ID_TABLE_FENTRY(0x4005), /* T440-bch */ + CH_PCI_ID_TABLE_FENTRY(0x4006), /* T440-ch */ + CH_PCI_ID_TABLE_FENTRY(0x4007), /* T420-so */ + CH_PCI_ID_TABLE_FENTRY(0x4008), /* T420-cx */ + CH_PCI_ID_TABLE_FENTRY(0x4009), /* T420-bt */ + CH_PCI_ID_TABLE_FENTRY(0x400a), /* T404-bt */ + CH_PCI_ID_TABLE_FENTRY(0x400b), /* B420-sr */ + CH_PCI_ID_TABLE_FENTRY(0x400c), /* B404-bt */ + CH_PCI_ID_TABLE_FENTRY(0x400d), /* T480-cr */ + CH_PCI_ID_TABLE_FENTRY(0x400e), /* T440-LP-cr */ + CH_PCI_ID_TABLE_FENTRY(0x4080), /* Custom T480-cr */ + CH_PCI_ID_TABLE_FENTRY(0x4081), /* Custom T440-cr */ + CH_PCI_ID_TABLE_FENTRY(0x4082), /* Custom T420-cr */ + CH_PCI_ID_TABLE_FENTRY(0x4083), /* Custom T420-xaui */ + CH_PCI_ID_TABLE_FENTRY(0x4084), /* Custom T440-cr */ + CH_PCI_ID_TABLE_FENTRY(0x4085), /* Custom T420-cr */ + CH_PCI_ID_TABLE_FENTRY(0x4086), /* Custom T440-bt */ + CH_PCI_ID_TABLE_FENTRY(0x4087), /* Custom T440-cr */ + CH_PCI_ID_TABLE_FENTRY(0x4088), /* Custom T440 2-xaui, 2-xfi */ + + /* T5 adapters: + */ + CH_PCI_ID_TABLE_FENTRY(0x5000), /* T580-dbg */ + CH_PCI_ID_TABLE_FENTRY(0x5001), /* T520-cr */ + CH_PCI_ID_TABLE_FENTRY(0x5002), /* T522-cr */ + CH_PCI_ID_TABLE_FENTRY(0x5003), /* T540-cr */ + CH_PCI_ID_TABLE_FENTRY(0x5004), /* T520-bch */ + CH_PCI_ID_TABLE_FENTRY(0x5005), /* T540-bch */ + CH_PCI_ID_TABLE_FENTRY(0x5006), /* T540-ch */ + CH_PCI_ID_TABLE_FENTRY(0x5007), /* T520-so */ + CH_PCI_ID_TABLE_FENTRY(0x5008), /* T520-cx */ + CH_PCI_ID_TABLE_FENTRY(0x5009), /* T520-bt */ + 
CH_PCI_ID_TABLE_FENTRY(0x500a), /* T504-bt */ + CH_PCI_ID_TABLE_FENTRY(0x500b), /* B520-sr */ + CH_PCI_ID_TABLE_FENTRY(0x500c), /* B504-bt */ + CH_PCI_ID_TABLE_FENTRY(0x500d), /* T580-cr */ + CH_PCI_ID_TABLE_FENTRY(0x500e), /* T540-LP-cr */ + CH_PCI_ID_TABLE_FENTRY(0x5010), /* T580-LP-cr */ + CH_PCI_ID_TABLE_FENTRY(0x5011), /* T520-LL-cr */ + CH_PCI_ID_TABLE_FENTRY(0x5012), /* T560-cr */ + CH_PCI_ID_TABLE_FENTRY(0x5013), /* T580-chr */ + CH_PCI_ID_TABLE_FENTRY(0x5014), /* T580-so */ + CH_PCI_ID_TABLE_FENTRY(0x5015), /* T502-bt */ + CH_PCI_ID_TABLE_FENTRY(0x5016), /* T580-OCP-SO */ + CH_PCI_ID_TABLE_FENTRY(0x5017), /* T520-OCP-SO */ + CH_PCI_ID_TABLE_FENTRY(0x5018), /* T540-BT */ + CH_PCI_ID_TABLE_FENTRY(0x5080), /* Custom T540-cr */ + CH_PCI_ID_TABLE_FENTRY(0x5081), /* Custom T540-LL-cr */ + CH_PCI_ID_TABLE_FENTRY(0x5082), /* Custom T504-cr */ + CH_PCI_ID_TABLE_FENTRY(0x5083), /* Custom T540-LP-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5084), /* Custom T580-cr */ + CH_PCI_ID_TABLE_FENTRY(0x5085), /* Custom 3x T580-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5086), /* Custom 2x T580-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5087), /* Custom T580-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5088), /* Custom T570-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5089), /* Custom T520-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5090), /* Custom T540-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5091), /* Custom T522-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5092), /* Custom T520-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5093), /* Custom T580-LP-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5094), /* Custom T540-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5095), /* Custom T540-CR-SO */ + CH_PCI_ID_TABLE_FENTRY(0x5096), /* Custom T580-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5097), /* Custom T520-KR */ + CH_PCI_ID_TABLE_FENTRY(0x5098), /* Custom 2x40G QSFP */ + CH_PCI_ID_TABLE_FENTRY(0x5099), /* Custom 2x40G QSFP */ + CH_PCI_ID_TABLE_FENTRY(0x509a), /* Custom T520-CR */ + CH_PCI_ID_TABLE_FENTRY(0x509b), /* Custom T540-CR LOM */ + CH_PCI_ID_TABLE_FENTRY(0x509c), /* Custom T520-CR*/ + CH_PCI_ID_TABLE_FENTRY(0x509d), /* Custom T540-CR*/ + CH_PCI_ID_TABLE_FENTRY(0x509e), /* Custom T520-CR */ + CH_PCI_ID_TABLE_FENTRY(0x509f), /* Custom T540-CR */ + CH_PCI_ID_TABLE_FENTRY(0x50a0), /* Custom T540-CR */ + CH_PCI_ID_TABLE_FENTRY(0x50a1), /* Custom T540-CR */ + CH_PCI_ID_TABLE_FENTRY(0x50a2), /* Custom T540-KR4 */ + CH_PCI_ID_TABLE_FENTRY(0x50a3), /* Custom T580-KR4 */ + CH_PCI_ID_TABLE_FENTRY(0x50a4), /* Custom 2x T540-CR */ + CH_PCI_ID_TABLE_FENTRY(0x50a5), /* Custom T522-BT */ + CH_PCI_ID_TABLE_FENTRY(0x50a6), /* Custom T522-BT-SO */ + CH_PCI_ID_TABLE_FENTRY(0x50a7), /* Custom T580-CR */ + CH_PCI_ID_TABLE_FENTRY(0x50a8), /* Custom T580-KR */ + CH_PCI_ID_TABLE_FENTRY(0x50a9), /* Custom T580-KR */ + CH_PCI_ID_TABLE_FENTRY(0x50aa), /* Custom T580-CR */ + CH_PCI_ID_TABLE_FENTRY(0x50ab), /* Custom T520-CR */ + CH_PCI_ID_TABLE_FENTRY(0x50ac), /* Custom T540-BT */ + + /* T6 adapters: + */ + CH_PCI_ID_TABLE_FENTRY(0x6001), + CH_PCI_ID_TABLE_FENTRY(0x6002), + CH_PCI_ID_TABLE_FENTRY(0x6003), + CH_PCI_ID_TABLE_FENTRY(0x6004), + CH_PCI_ID_TABLE_FENTRY(0x6005), + CH_PCI_ID_TABLE_FENTRY(0x6006), + CH_PCI_ID_TABLE_FENTRY(0x6007), + CH_PCI_ID_TABLE_FENTRY(0x6008), + CH_PCI_ID_TABLE_FENTRY(0x6009), + CH_PCI_ID_TABLE_FENTRY(0x600d), + CH_PCI_ID_TABLE_FENTRY(0x6010), + CH_PCI_ID_TABLE_FENTRY(0x6011), + CH_PCI_ID_TABLE_FENTRY(0x6014), + CH_PCI_ID_TABLE_FENTRY(0x6015), + CH_PCI_ID_TABLE_FENTRY(0x6080), + CH_PCI_ID_TABLE_FENTRY(0x6081), + CH_PCI_ID_TABLE_FENTRY(0x6082), /* Custom T6225-CR SFP28 */ + CH_PCI_ID_TABLE_FENTRY(0x6083), /* Custom T62100-CR 
QSFP28 */ + CH_PCI_ID_TABLE_FENTRY(0x6084), /* Custom T64100-CR QSFP28 */ + CH_PCI_ID_TABLE_FENTRY(0x6085), /* Custom T6240-SO */ + CH_PCI_ID_TABLE_FENTRY(0x6086), /* Custom T6225-SO-CR */ + CH_PCI_ID_TABLE_FENTRY(0x6087), /* Custom T6225-CR */ +CH_PCI_DEVICE_ID_TABLE_DEFINE_END; + +#endif /* __T4_PCI_ID_TBL_H__ */ diff --git a/providers/cxgb4/t4_regs.h b/providers/cxgb4/t4_regs.h new file mode 100644 index 0000000..7351622 --- /dev/null +++ b/providers/cxgb4/t4_regs.h @@ -0,0 +1,3129 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __T4_REGS_H +#define __T4_REGS_H + +#define MYPF_BASE 0x1b000 +#define MYPF_REG(reg_addr) (MYPF_BASE + (reg_addr)) + +#define PF0_BASE 0x1e000 +#define PF0_REG(reg_addr) (PF0_BASE + (reg_addr)) + +#define PF_STRIDE 0x400 +#define PF_BASE(idx) (PF0_BASE + (idx) * PF_STRIDE) +#define PF_REG(idx, reg) (PF_BASE(idx) + (reg)) + +#define MYPORT_BASE 0x1c000 +#define MYPORT_REG(reg_addr) (MYPORT_BASE + (reg_addr)) + +#define PORT0_BASE 0x20000 +#define PORT0_REG(reg_addr) (PORT0_BASE + (reg_addr)) + +#define PORT_STRIDE 0x2000 +#define PORT_BASE(idx) (PORT0_BASE + (idx) * PORT_STRIDE) +#define PORT_REG(idx, reg) (PORT_BASE(idx) + (reg)) + +#define EDC_STRIDE (EDC_1_BASE_ADDR - EDC_0_BASE_ADDR) +#define EDC_REG(reg, idx) (reg + EDC_STRIDE * idx) + +#define PCIE_MEM_ACCESS_REG(reg_addr, idx) ((reg_addr) + (idx) * 8) +#define PCIE_MAILBOX_REG(reg_addr, idx) ((reg_addr) + (idx) * 8) +#define MC_BIST_STATUS_REG(reg_addr, idx) ((reg_addr) + (idx) * 4) +#define EDC_BIST_STATUS_REG(reg_addr, idx) ((reg_addr) + (idx) * 4) + +#define PCIE_FW_REG(reg_addr, idx) ((reg_addr) + (idx) * 4) + +#define SGE_PF_KDOORBELL_A 0x0 + +#define QID_S 15 +#define QID_V(x) ((x) << QID_S) + +#define DBPRIO_S 14 +#define DBPRIO_V(x) ((x) << DBPRIO_S) +#define DBPRIO_F DBPRIO_V(1U) + +#define PIDX_S 0 +#define PIDX_V(x) ((x) << PIDX_S) + +#define SGE_VF_KDOORBELL_A 0x0 + +#define DBTYPE_S 13 +#define DBTYPE_V(x) ((x) << DBTYPE_S) +#define DBTYPE_F DBTYPE_V(1U) + +#define PIDX_T5_S 0 +#define PIDX_T5_M 0x1fffU +#define PIDX_T5_V(x) ((x) << PIDX_T5_S) +#define PIDX_T5_G(x) (((x) >> PIDX_T5_S) & PIDX_T5_M) + +#define SGE_PF_GTS_A 0x4 + +#define INGRESSQID_S 16 +#define INGRESSQID_V(x) ((x) << INGRESSQID_S) + +#define TIMERREG_S 13 +#define TIMERREG_V(x) ((x) << TIMERREG_S) + +#define SEINTARM_S 12 +#define SEINTARM_V(x) ((x) << SEINTARM_S) + +#define CIDXINC_S 0 +#define CIDXINC_M 0xfffU +#define CIDXINC_V(x) ((x) << CIDXINC_S) + +#define SGE_CONTROL_A 0x1008 +#define SGE_CONTROL2_A 0x1124 + +#define RXPKTCPLMODE_S 18 +#define RXPKTCPLMODE_V(x) ((x) << RXPKTCPLMODE_S) +#define RXPKTCPLMODE_F RXPKTCPLMODE_V(1U) + +#define EGRSTATUSPAGESIZE_S 17 +#define EGRSTATUSPAGESIZE_V(x) ((x) << EGRSTATUSPAGESIZE_S) +#define EGRSTATUSPAGESIZE_F EGRSTATUSPAGESIZE_V(1U) + +#define PKTSHIFT_S 10 +#define PKTSHIFT_M 0x7U +#define PKTSHIFT_V(x) ((x) << PKTSHIFT_S) +#define PKTSHIFT_G(x) (((x) >> PKTSHIFT_S) & PKTSHIFT_M) + +#define INGPCIEBOUNDARY_S 7 +#define INGPCIEBOUNDARY_V(x) ((x) << INGPCIEBOUNDARY_S) + +#define INGPADBOUNDARY_S 4 +#define INGPADBOUNDARY_M 0x7U +#define INGPADBOUNDARY_V(x) ((x) << INGPADBOUNDARY_S) +#define INGPADBOUNDARY_G(x) (((x) >> INGPADBOUNDARY_S) & INGPADBOUNDARY_M) + +#define EGRPCIEBOUNDARY_S 1 +#define EGRPCIEBOUNDARY_V(x) ((x) << EGRPCIEBOUNDARY_S) + +#define INGPACKBOUNDARY_S 16 +#define INGPACKBOUNDARY_M 0x7U +#define INGPACKBOUNDARY_V(x) ((x) << INGPACKBOUNDARY_S) +#define INGPACKBOUNDARY_G(x) (((x) >> INGPACKBOUNDARY_S) \ + & INGPACKBOUNDARY_M) + +#define VFIFO_ENABLE_S 10 +#define VFIFO_ENABLE_V(x) ((x) << VFIFO_ENABLE_S) +#define VFIFO_ENABLE_F VFIFO_ENABLE_V(1U) + +#define SGE_DBVFIFO_BADDR_A 0x1138 + +#define DBVFIFO_SIZE_S 6 +#define DBVFIFO_SIZE_M 0xfffU +#define DBVFIFO_SIZE_G(x) (((x) >> DBVFIFO_SIZE_S) & DBVFIFO_SIZE_M) + +#define T6_DBVFIFO_SIZE_S 0 +#define T6_DBVFIFO_SIZE_M 0x1fffU +#define T6_DBVFIFO_SIZE_G(x) (((x) >> T6_DBVFIFO_SIZE_S) & T6_DBVFIFO_SIZE_M) + +#define GLOBALENABLE_S 0 +#define GLOBALENABLE_V(x) ((x) << GLOBALENABLE_S) +#define GLOBALENABLE_F GLOBALENABLE_V(1U) + 
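The field macros in this register header follow one naming scheme throughout: for a field FOO, FOO_S is its bit shift, FOO_M its unshifted mask, FOO_V(x) shifts a value into place, FOO_G(x) extracts it from a register word, and FOO_F is the ready-made flag for single-bit fields. Below is a minimal sketch of how the pieces compose, using the doorbell fields defined above; the uint32_t types and the sample values are illustrative assumptions, not part of the header.

#include <assert.h>
#include <stdint.h>

/* Pack a T5 doorbell word the way t4_ring_sq_db() does, then decode
 * the producer-index increment back out of it. Relies only on the
 * QID_V, PIDX_T5_V and PIDX_T5_G macros defined above. */
static uint32_t pack_t5_doorbell(uint32_t bar2_qid, uint32_t inc)
{
	return QID_V(bar2_qid) | PIDX_T5_V(inc);
}

int main(void)
{
	uint32_t db = pack_t5_doorbell(3, 1);

	/* _G(x) inverts _V(x) for any value that fits in the field. */
	assert(PIDX_T5_G(db) == 1);
	return 0;
}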
+#define SGE_HOST_PAGE_SIZE_A 0x100c + +#define HOSTPAGESIZEPF7_S 28 +#define HOSTPAGESIZEPF7_M 0xfU +#define HOSTPAGESIZEPF7_V(x) ((x) << HOSTPAGESIZEPF7_S) +#define HOSTPAGESIZEPF7_G(x) (((x) >> HOSTPAGESIZEPF7_S) & HOSTPAGESIZEPF7_M) + +#define HOSTPAGESIZEPF6_S 24 +#define HOSTPAGESIZEPF6_M 0xfU +#define HOSTPAGESIZEPF6_V(x) ((x) << HOSTPAGESIZEPF6_S) +#define HOSTPAGESIZEPF6_G(x) (((x) >> HOSTPAGESIZEPF6_S) & HOSTPAGESIZEPF6_M) + +#define HOSTPAGESIZEPF5_S 20 +#define HOSTPAGESIZEPF5_M 0xfU +#define HOSTPAGESIZEPF5_V(x) ((x) << HOSTPAGESIZEPF5_S) +#define HOSTPAGESIZEPF5_G(x) (((x) >> HOSTPAGESIZEPF5_S) & HOSTPAGESIZEPF5_M) + +#define HOSTPAGESIZEPF4_S 16 +#define HOSTPAGESIZEPF4_M 0xfU +#define HOSTPAGESIZEPF4_V(x) ((x) << HOSTPAGESIZEPF4_S) +#define HOSTPAGESIZEPF4_G(x) (((x) >> HOSTPAGESIZEPF4_S) & HOSTPAGESIZEPF4_M) + +#define HOSTPAGESIZEPF3_S 12 +#define HOSTPAGESIZEPF3_M 0xfU +#define HOSTPAGESIZEPF3_V(x) ((x) << HOSTPAGESIZEPF3_S) +#define HOSTPAGESIZEPF3_G(x) (((x) >> HOSTPAGESIZEPF3_S) & HOSTPAGESIZEPF3_M) + +#define HOSTPAGESIZEPF2_S 8 +#define HOSTPAGESIZEPF2_M 0xfU +#define HOSTPAGESIZEPF2_V(x) ((x) << HOSTPAGESIZEPF2_S) +#define HOSTPAGESIZEPF2_G(x) (((x) >> HOSTPAGESIZEPF2_S) & HOSTPAGESIZEPF2_M) + +#define HOSTPAGESIZEPF1_S 4 +#define HOSTPAGESIZEPF1_M 0xfU +#define HOSTPAGESIZEPF1_V(x) ((x) << HOSTPAGESIZEPF1_S) +#define HOSTPAGESIZEPF1_G(x) (((x) >> HOSTPAGESIZEPF1_S) & HOSTPAGESIZEPF1_M) + +#define HOSTPAGESIZEPF0_S 0 +#define HOSTPAGESIZEPF0_M 0xfU +#define HOSTPAGESIZEPF0_V(x) ((x) << HOSTPAGESIZEPF0_S) +#define HOSTPAGESIZEPF0_G(x) (((x) >> HOSTPAGESIZEPF0_S) & HOSTPAGESIZEPF0_M) + +#define SGE_EGRESS_QUEUES_PER_PAGE_PF_A 0x1010 +#define SGE_EGRESS_QUEUES_PER_PAGE_VF_A 0x1014 + +#define QUEUESPERPAGEPF1_S 4 + +#define QUEUESPERPAGEPF0_S 0 +#define QUEUESPERPAGEPF0_M 0xfU +#define QUEUESPERPAGEPF0_V(x) ((x) << QUEUESPERPAGEPF0_S) +#define QUEUESPERPAGEPF0_G(x) (((x) >> QUEUESPERPAGEPF0_S) & QUEUESPERPAGEPF0_M) + +#define SGE_INT_CAUSE1_A 0x1024 +#define SGE_INT_CAUSE2_A 0x1030 +#define SGE_INT_CAUSE3_A 0x103c + +#define ERR_FLM_DBP_S 31 +#define ERR_FLM_DBP_V(x) ((x) << ERR_FLM_DBP_S) +#define ERR_FLM_DBP_F ERR_FLM_DBP_V(1U) + +#define ERR_FLM_IDMA1_S 30 +#define ERR_FLM_IDMA1_V(x) ((x) << ERR_FLM_IDMA1_S) +#define ERR_FLM_IDMA1_F ERR_FLM_IDMA1_V(1U) + +#define ERR_FLM_IDMA0_S 29 +#define ERR_FLM_IDMA0_V(x) ((x) << ERR_FLM_IDMA0_S) +#define ERR_FLM_IDMA0_F ERR_FLM_IDMA0_V(1U) + +#define ERR_FLM_HINT_S 28 +#define ERR_FLM_HINT_V(x) ((x) << ERR_FLM_HINT_S) +#define ERR_FLM_HINT_F ERR_FLM_HINT_V(1U) + +#define ERR_PCIE_ERROR3_S 27 +#define ERR_PCIE_ERROR3_V(x) ((x) << ERR_PCIE_ERROR3_S) +#define ERR_PCIE_ERROR3_F ERR_PCIE_ERROR3_V(1U) + +#define ERR_PCIE_ERROR2_S 26 +#define ERR_PCIE_ERROR2_V(x) ((x) << ERR_PCIE_ERROR2_S) +#define ERR_PCIE_ERROR2_F ERR_PCIE_ERROR2_V(1U) + +#define ERR_PCIE_ERROR1_S 25 +#define ERR_PCIE_ERROR1_V(x) ((x) << ERR_PCIE_ERROR1_S) +#define ERR_PCIE_ERROR1_F ERR_PCIE_ERROR1_V(1U) + +#define ERR_PCIE_ERROR0_S 24 +#define ERR_PCIE_ERROR0_V(x) ((x) << ERR_PCIE_ERROR0_S) +#define ERR_PCIE_ERROR0_F ERR_PCIE_ERROR0_V(1U) + +#define ERR_CPL_EXCEED_IQE_SIZE_S 22 +#define ERR_CPL_EXCEED_IQE_SIZE_V(x) ((x) << ERR_CPL_EXCEED_IQE_SIZE_S) +#define ERR_CPL_EXCEED_IQE_SIZE_F ERR_CPL_EXCEED_IQE_SIZE_V(1U) + +#define ERR_INVALID_CIDX_INC_S 21 +#define ERR_INVALID_CIDX_INC_V(x) ((x) << ERR_INVALID_CIDX_INC_S) +#define ERR_INVALID_CIDX_INC_F ERR_INVALID_CIDX_INC_V(1U) + +#define ERR_CPL_OPCODE_0_S 19 +#define ERR_CPL_OPCODE_0_V(x) ((x) << ERR_CPL_OPCODE_0_S) 
+#define ERR_CPL_OPCODE_0_F ERR_CPL_OPCODE_0_V(1U) + +#define ERR_DROPPED_DB_S 18 +#define ERR_DROPPED_DB_V(x) ((x) << ERR_DROPPED_DB_S) +#define ERR_DROPPED_DB_F ERR_DROPPED_DB_V(1U) + +#define ERR_DATA_CPL_ON_HIGH_QID1_S 17 +#define ERR_DATA_CPL_ON_HIGH_QID1_V(x) ((x) << ERR_DATA_CPL_ON_HIGH_QID1_S) +#define ERR_DATA_CPL_ON_HIGH_QID1_F ERR_DATA_CPL_ON_HIGH_QID1_V(1U) + +#define ERR_DATA_CPL_ON_HIGH_QID0_S 16 +#define ERR_DATA_CPL_ON_HIGH_QID0_V(x) ((x) << ERR_DATA_CPL_ON_HIGH_QID0_S) +#define ERR_DATA_CPL_ON_HIGH_QID0_F ERR_DATA_CPL_ON_HIGH_QID0_V(1U) + +#define ERR_BAD_DB_PIDX3_S 15 +#define ERR_BAD_DB_PIDX3_V(x) ((x) << ERR_BAD_DB_PIDX3_S) +#define ERR_BAD_DB_PIDX3_F ERR_BAD_DB_PIDX3_V(1U) + +#define ERR_BAD_DB_PIDX2_S 14 +#define ERR_BAD_DB_PIDX2_V(x) ((x) << ERR_BAD_DB_PIDX2_S) +#define ERR_BAD_DB_PIDX2_F ERR_BAD_DB_PIDX2_V(1U) + +#define ERR_BAD_DB_PIDX1_S 13 +#define ERR_BAD_DB_PIDX1_V(x) ((x) << ERR_BAD_DB_PIDX1_S) +#define ERR_BAD_DB_PIDX1_F ERR_BAD_DB_PIDX1_V(1U) + +#define ERR_BAD_DB_PIDX0_S 12 +#define ERR_BAD_DB_PIDX0_V(x) ((x) << ERR_BAD_DB_PIDX0_S) +#define ERR_BAD_DB_PIDX0_F ERR_BAD_DB_PIDX0_V(1U) + +#define ERR_ING_CTXT_PRIO_S 10 +#define ERR_ING_CTXT_PRIO_V(x) ((x) << ERR_ING_CTXT_PRIO_S) +#define ERR_ING_CTXT_PRIO_F ERR_ING_CTXT_PRIO_V(1U) + +#define ERR_EGR_CTXT_PRIO_S 9 +#define ERR_EGR_CTXT_PRIO_V(x) ((x) << ERR_EGR_CTXT_PRIO_S) +#define ERR_EGR_CTXT_PRIO_F ERR_EGR_CTXT_PRIO_V(1U) + +#define DBFIFO_HP_INT_S 8 +#define DBFIFO_HP_INT_V(x) ((x) << DBFIFO_HP_INT_S) +#define DBFIFO_HP_INT_F DBFIFO_HP_INT_V(1U) + +#define DBFIFO_LP_INT_S 7 +#define DBFIFO_LP_INT_V(x) ((x) << DBFIFO_LP_INT_S) +#define DBFIFO_LP_INT_F DBFIFO_LP_INT_V(1U) + +#define INGRESS_SIZE_ERR_S 5 +#define INGRESS_SIZE_ERR_V(x) ((x) << INGRESS_SIZE_ERR_S) +#define INGRESS_SIZE_ERR_F INGRESS_SIZE_ERR_V(1U) + +#define EGRESS_SIZE_ERR_S 4 +#define EGRESS_SIZE_ERR_V(x) ((x) << EGRESS_SIZE_ERR_S) +#define EGRESS_SIZE_ERR_F EGRESS_SIZE_ERR_V(1U) + +#define SGE_INT_ENABLE3_A 0x1040 +#define SGE_FL_BUFFER_SIZE0_A 0x1044 +#define SGE_FL_BUFFER_SIZE1_A 0x1048 +#define SGE_FL_BUFFER_SIZE2_A 0x104c +#define SGE_FL_BUFFER_SIZE3_A 0x1050 +#define SGE_FL_BUFFER_SIZE4_A 0x1054 +#define SGE_FL_BUFFER_SIZE5_A 0x1058 +#define SGE_FL_BUFFER_SIZE6_A 0x105c +#define SGE_FL_BUFFER_SIZE7_A 0x1060 +#define SGE_FL_BUFFER_SIZE8_A 0x1064 + +#define SGE_IMSG_CTXT_BADDR_A 0x1088 +#define SGE_FLM_CACHE_BADDR_A 0x108c +#define SGE_INGRESS_RX_THRESHOLD_A 0x10a0 + +#define THRESHOLD_0_S 24 +#define THRESHOLD_0_M 0x3fU +#define THRESHOLD_0_V(x) ((x) << THRESHOLD_0_S) +#define THRESHOLD_0_G(x) (((x) >> THRESHOLD_0_S) & THRESHOLD_0_M) + +#define THRESHOLD_1_S 16 +#define THRESHOLD_1_M 0x3fU +#define THRESHOLD_1_V(x) ((x) << THRESHOLD_1_S) +#define THRESHOLD_1_G(x) (((x) >> THRESHOLD_1_S) & THRESHOLD_1_M) + +#define THRESHOLD_2_S 8 +#define THRESHOLD_2_M 0x3fU +#define THRESHOLD_2_V(x) ((x) << THRESHOLD_2_S) +#define THRESHOLD_2_G(x) (((x) >> THRESHOLD_2_S) & THRESHOLD_2_M) + +#define THRESHOLD_3_S 0 +#define THRESHOLD_3_M 0x3fU +#define THRESHOLD_3_V(x) ((x) << THRESHOLD_3_S) +#define THRESHOLD_3_G(x) (((x) >> THRESHOLD_3_S) & THRESHOLD_3_M) + +#define SGE_CONM_CTRL_A 0x1094 + +#define EGRTHRESHOLD_S 8 +#define EGRTHRESHOLD_M 0x3fU +#define EGRTHRESHOLD_V(x) ((x) << EGRTHRESHOLD_S) +#define EGRTHRESHOLD_G(x) (((x) >> EGRTHRESHOLD_S) & EGRTHRESHOLD_M) + +#define EGRTHRESHOLDPACKING_S 14 +#define EGRTHRESHOLDPACKING_M 0x3fU +#define EGRTHRESHOLDPACKING_V(x) ((x) << EGRTHRESHOLDPACKING_S) +#define EGRTHRESHOLDPACKING_G(x) \ + (((x) >> 
EGRTHRESHOLDPACKING_S) & EGRTHRESHOLDPACKING_M) + +#define T6_EGRTHRESHOLDPACKING_S 16 +#define T6_EGRTHRESHOLDPACKING_M 0xffU +#define T6_EGRTHRESHOLDPACKING_G(x) \ + (((x) >> T6_EGRTHRESHOLDPACKING_S) & T6_EGRTHRESHOLDPACKING_M) + +#define SGE_TIMESTAMP_LO_A 0x1098 +#define SGE_TIMESTAMP_HI_A 0x109c + +#define TSOP_S 28 +#define TSOP_M 0x3U +#define TSOP_V(x) ((x) << TSOP_S) +#define TSOP_G(x) (((x) >> TSOP_S) & TSOP_M) + +#define TSVAL_S 0 +#define TSVAL_M 0xfffffffU +#define TSVAL_V(x) ((x) << TSVAL_S) +#define TSVAL_G(x) (((x) >> TSVAL_S) & TSVAL_M) + +#define SGE_DBFIFO_STATUS_A 0x10a4 +#define SGE_DBVFIFO_SIZE_A 0x113c + +#define HP_INT_THRESH_S 28 +#define HP_INT_THRESH_M 0xfU +#define HP_INT_THRESH_V(x) ((x) << HP_INT_THRESH_S) + +#define LP_INT_THRESH_S 12 +#define LP_INT_THRESH_M 0xfU +#define LP_INT_THRESH_V(x) ((x) << LP_INT_THRESH_S) + +#define SGE_DOORBELL_CONTROL_A 0x10a8 + +#define NOCOALESCE_S 26 +#define NOCOALESCE_V(x) ((x) << NOCOALESCE_S) +#define NOCOALESCE_F NOCOALESCE_V(1U) + +#define ENABLE_DROP_S 13 +#define ENABLE_DROP_V(x) ((x) << ENABLE_DROP_S) +#define ENABLE_DROP_F ENABLE_DROP_V(1U) + +#define SGE_TIMER_VALUE_0_AND_1_A 0x10b8 + +#define TIMERVALUE0_S 16 +#define TIMERVALUE0_M 0xffffU +#define TIMERVALUE0_V(x) ((x) << TIMERVALUE0_S) +#define TIMERVALUE0_G(x) (((x) >> TIMERVALUE0_S) & TIMERVALUE0_M) + +#define TIMERVALUE1_S 0 +#define TIMERVALUE1_M 0xffffU +#define TIMERVALUE1_V(x) ((x) << TIMERVALUE1_S) +#define TIMERVALUE1_G(x) (((x) >> TIMERVALUE1_S) & TIMERVALUE1_M) + +#define SGE_TIMER_VALUE_2_AND_3_A 0x10bc + +#define TIMERVALUE2_S 16 +#define TIMERVALUE2_M 0xffffU +#define TIMERVALUE2_V(x) ((x) << TIMERVALUE2_S) +#define TIMERVALUE2_G(x) (((x) >> TIMERVALUE2_S) & TIMERVALUE2_M) + +#define TIMERVALUE3_S 0 +#define TIMERVALUE3_M 0xffffU +#define TIMERVALUE3_V(x) ((x) << TIMERVALUE3_S) +#define TIMERVALUE3_G(x) (((x) >> TIMERVALUE3_S) & TIMERVALUE3_M) + +#define SGE_TIMER_VALUE_4_AND_5_A 0x10c0 + +#define TIMERVALUE4_S 16 +#define TIMERVALUE4_M 0xffffU +#define TIMERVALUE4_V(x) ((x) << TIMERVALUE4_S) +#define TIMERVALUE4_G(x) (((x) >> TIMERVALUE4_S) & TIMERVALUE4_M) + +#define TIMERVALUE5_S 0 +#define TIMERVALUE5_M 0xffffU +#define TIMERVALUE5_V(x) ((x) << TIMERVALUE5_S) +#define TIMERVALUE5_G(x) (((x) >> TIMERVALUE5_S) & TIMERVALUE5_M) + +#define SGE_DEBUG_INDEX_A 0x10cc +#define SGE_DEBUG_DATA_HIGH_A 0x10d0 +#define SGE_DEBUG_DATA_LOW_A 0x10d4 + +#define SGE_DEBUG_DATA_LOW_INDEX_2_A 0x12c8 +#define SGE_DEBUG_DATA_LOW_INDEX_3_A 0x12cc +#define SGE_DEBUG_DATA_HIGH_INDEX_10_A 0x12a8 + +#define SGE_INGRESS_QUEUES_PER_PAGE_PF_A 0x10f4 +#define SGE_INGRESS_QUEUES_PER_PAGE_VF_A 0x10f8 + +#define SGE_ERROR_STATS_A 0x1100 + +#define UNCAPTURED_ERROR_S 18 +#define UNCAPTURED_ERROR_V(x) ((x) << UNCAPTURED_ERROR_S) +#define UNCAPTURED_ERROR_F UNCAPTURED_ERROR_V(1U) + +#define ERROR_QID_VALID_S 17 +#define ERROR_QID_VALID_V(x) ((x) << ERROR_QID_VALID_S) +#define ERROR_QID_VALID_F ERROR_QID_VALID_V(1U) + +#define ERROR_QID_S 0 +#define ERROR_QID_M 0x1ffffU +#define ERROR_QID_G(x) (((x) >> ERROR_QID_S) & ERROR_QID_M) + +#define HP_INT_THRESH_S 28 +#define HP_INT_THRESH_M 0xfU +#define HP_INT_THRESH_V(x) ((x) << HP_INT_THRESH_S) + +#define HP_COUNT_S 16 +#define HP_COUNT_M 0x7ffU +#define HP_COUNT_G(x) (((x) >> HP_COUNT_S) & HP_COUNT_M) + +#define LP_INT_THRESH_S 12 +#define LP_INT_THRESH_M 0xfU +#define LP_INT_THRESH_V(x) ((x) << LP_INT_THRESH_S) + +#define LP_COUNT_S 0 +#define LP_COUNT_M 0x7ffU +#define LP_COUNT_G(x) (((x) >> LP_COUNT_S) & LP_COUNT_M) + +#define 
LP_INT_THRESH_T5_S 18 +#define LP_INT_THRESH_T5_M 0xfffU +#define LP_INT_THRESH_T5_V(x) ((x) << LP_INT_THRESH_T5_S) + +#define LP_COUNT_T5_S 0 +#define LP_COUNT_T5_M 0x3ffffU +#define LP_COUNT_T5_G(x) (((x) >> LP_COUNT_T5_S) & LP_COUNT_T5_M) + +#define SGE_DOORBELL_CONTROL_A 0x10a8 + +#define SGE_STAT_TOTAL_A 0x10e4 +#define SGE_STAT_MATCH_A 0x10e8 +#define SGE_STAT_CFG_A 0x10ec + +#define STATMODE_S 2 +#define STATMODE_V(x) ((x) << STATMODE_S) + +#define STATSOURCE_T5_S 9 +#define STATSOURCE_T5_M 0xfU +#define STATSOURCE_T5_V(x) ((x) << STATSOURCE_T5_S) +#define STATSOURCE_T5_G(x) (((x) >> STATSOURCE_T5_S) & STATSOURCE_T5_M) + +#define T6_STATMODE_S 0 +#define T6_STATMODE_V(x) ((x) << T6_STATMODE_S) + +#define SGE_DBFIFO_STATUS2_A 0x1118 + +#define HP_INT_THRESH_T5_S 10 +#define HP_INT_THRESH_T5_M 0xfU +#define HP_INT_THRESH_T5_V(x) ((x) << HP_INT_THRESH_T5_S) + +#define HP_COUNT_T5_S 0 +#define HP_COUNT_T5_M 0x3ffU +#define HP_COUNT_T5_G(x) (((x) >> HP_COUNT_T5_S) & HP_COUNT_T5_M) + +#define ENABLE_DROP_S 13 +#define ENABLE_DROP_V(x) ((x) << ENABLE_DROP_S) +#define ENABLE_DROP_F ENABLE_DROP_V(1U) + +#define DROPPED_DB_S 0 +#define DROPPED_DB_V(x) ((x) << DROPPED_DB_S) +#define DROPPED_DB_F DROPPED_DB_V(1U) + +#define SGE_CTXT_CMD_A 0x11fc +#define SGE_DBQ_CTXT_BADDR_A 0x1084 + +/* registers for module PCIE */ +#define PCIE_PF_CFG_A 0x40 + +#define AIVEC_S 4 +#define AIVEC_M 0x3ffU +#define AIVEC_V(x) ((x) << AIVEC_S) + +#define PCIE_PF_CLI_A 0x44 +#define PCIE_INT_CAUSE_A 0x3004 + +#define UNXSPLCPLERR_S 29 +#define UNXSPLCPLERR_V(x) ((x) << UNXSPLCPLERR_S) +#define UNXSPLCPLERR_F UNXSPLCPLERR_V(1U) + +#define PCIEPINT_S 28 +#define PCIEPINT_V(x) ((x) << PCIEPINT_S) +#define PCIEPINT_F PCIEPINT_V(1U) + +#define PCIESINT_S 27 +#define PCIESINT_V(x) ((x) << PCIESINT_S) +#define PCIESINT_F PCIESINT_V(1U) + +#define RPLPERR_S 26 +#define RPLPERR_V(x) ((x) << RPLPERR_S) +#define RPLPERR_F RPLPERR_V(1U) + +#define RXWRPERR_S 25 +#define RXWRPERR_V(x) ((x) << RXWRPERR_S) +#define RXWRPERR_F RXWRPERR_V(1U) + +#define RXCPLPERR_S 24 +#define RXCPLPERR_V(x) ((x) << RXCPLPERR_S) +#define RXCPLPERR_F RXCPLPERR_V(1U) + +#define PIOTAGPERR_S 23 +#define PIOTAGPERR_V(x) ((x) << PIOTAGPERR_S) +#define PIOTAGPERR_F PIOTAGPERR_V(1U) + +#define MATAGPERR_S 22 +#define MATAGPERR_V(x) ((x) << MATAGPERR_S) +#define MATAGPERR_F MATAGPERR_V(1U) + +#define INTXCLRPERR_S 21 +#define INTXCLRPERR_V(x) ((x) << INTXCLRPERR_S) +#define INTXCLRPERR_F INTXCLRPERR_V(1U) + +#define FIDPERR_S 20 +#define FIDPERR_V(x) ((x) << FIDPERR_S) +#define FIDPERR_F FIDPERR_V(1U) + +#define CFGSNPPERR_S 19 +#define CFGSNPPERR_V(x) ((x) << CFGSNPPERR_S) +#define CFGSNPPERR_F CFGSNPPERR_V(1U) + +#define HRSPPERR_S 18 +#define HRSPPERR_V(x) ((x) << HRSPPERR_S) +#define HRSPPERR_F HRSPPERR_V(1U) + +#define HREQPERR_S 17 +#define HREQPERR_V(x) ((x) << HREQPERR_S) +#define HREQPERR_F HREQPERR_V(1U) + +#define HCNTPERR_S 16 +#define HCNTPERR_V(x) ((x) << HCNTPERR_S) +#define HCNTPERR_F HCNTPERR_V(1U) + +#define DRSPPERR_S 15 +#define DRSPPERR_V(x) ((x) << DRSPPERR_S) +#define DRSPPERR_F DRSPPERR_V(1U) + +#define DREQPERR_S 14 +#define DREQPERR_V(x) ((x) << DREQPERR_S) +#define DREQPERR_F DREQPERR_V(1U) + +#define DCNTPERR_S 13 +#define DCNTPERR_V(x) ((x) << DCNTPERR_S) +#define DCNTPERR_F DCNTPERR_V(1U) + +#define CRSPPERR_S 12 +#define CRSPPERR_V(x) ((x) << CRSPPERR_S) +#define CRSPPERR_F CRSPPERR_V(1U) + +#define CREQPERR_S 11 +#define CREQPERR_V(x) ((x) << CREQPERR_S) +#define CREQPERR_F CREQPERR_V(1U) + +#define CCNTPERR_S 10 +#define 
CCNTPERR_V(x) ((x) << CCNTPERR_S) +#define CCNTPERR_F CCNTPERR_V(1U) + +#define TARTAGPERR_S 9 +#define TARTAGPERR_V(x) ((x) << TARTAGPERR_S) +#define TARTAGPERR_F TARTAGPERR_V(1U) + +#define PIOREQPERR_S 8 +#define PIOREQPERR_V(x) ((x) << PIOREQPERR_S) +#define PIOREQPERR_F PIOREQPERR_V(1U) + +#define PIOCPLPERR_S 7 +#define PIOCPLPERR_V(x) ((x) << PIOCPLPERR_S) +#define PIOCPLPERR_F PIOCPLPERR_V(1U) + +#define MSIXDIPERR_S 6 +#define MSIXDIPERR_V(x) ((x) << MSIXDIPERR_S) +#define MSIXDIPERR_F MSIXDIPERR_V(1U) + +#define MSIXDATAPERR_S 5 +#define MSIXDATAPERR_V(x) ((x) << MSIXDATAPERR_S) +#define MSIXDATAPERR_F MSIXDATAPERR_V(1U) + +#define MSIXADDRHPERR_S 4 +#define MSIXADDRHPERR_V(x) ((x) << MSIXADDRHPERR_S) +#define MSIXADDRHPERR_F MSIXADDRHPERR_V(1U) + +#define MSIXADDRLPERR_S 3 +#define MSIXADDRLPERR_V(x) ((x) << MSIXADDRLPERR_S) +#define MSIXADDRLPERR_F MSIXADDRLPERR_V(1U) + +#define MSIDATAPERR_S 2 +#define MSIDATAPERR_V(x) ((x) << MSIDATAPERR_S) +#define MSIDATAPERR_F MSIDATAPERR_V(1U) + +#define MSIADDRHPERR_S 1 +#define MSIADDRHPERR_V(x) ((x) << MSIADDRHPERR_S) +#define MSIADDRHPERR_F MSIADDRHPERR_V(1U) + +#define MSIADDRLPERR_S 0 +#define MSIADDRLPERR_V(x) ((x) << MSIADDRLPERR_S) +#define MSIADDRLPERR_F MSIADDRLPERR_V(1U) + +#define READRSPERR_S 29 +#define READRSPERR_V(x) ((x) << READRSPERR_S) +#define READRSPERR_F READRSPERR_V(1U) + +#define TRGT1GRPPERR_S 28 +#define TRGT1GRPPERR_V(x) ((x) << TRGT1GRPPERR_S) +#define TRGT1GRPPERR_F TRGT1GRPPERR_V(1U) + +#define IPSOTPERR_S 27 +#define IPSOTPERR_V(x) ((x) << IPSOTPERR_S) +#define IPSOTPERR_F IPSOTPERR_V(1U) + +#define IPRETRYPERR_S 26 +#define IPRETRYPERR_V(x) ((x) << IPRETRYPERR_S) +#define IPRETRYPERR_F IPRETRYPERR_V(1U) + +#define IPRXDATAGRPPERR_S 25 +#define IPRXDATAGRPPERR_V(x) ((x) << IPRXDATAGRPPERR_S) +#define IPRXDATAGRPPERR_F IPRXDATAGRPPERR_V(1U) + +#define IPRXHDRGRPPERR_S 24 +#define IPRXHDRGRPPERR_V(x) ((x) << IPRXHDRGRPPERR_S) +#define IPRXHDRGRPPERR_F IPRXHDRGRPPERR_V(1U) + +#define MAGRPPERR_S 22 +#define MAGRPPERR_V(x) ((x) << MAGRPPERR_S) +#define MAGRPPERR_F MAGRPPERR_V(1U) + +#define VFIDPERR_S 21 +#define VFIDPERR_V(x) ((x) << VFIDPERR_S) +#define VFIDPERR_F VFIDPERR_V(1U) + +#define HREQWRPERR_S 16 +#define HREQWRPERR_V(x) ((x) << HREQWRPERR_S) +#define HREQWRPERR_F HREQWRPERR_V(1U) + +#define DREQWRPERR_S 13 +#define DREQWRPERR_V(x) ((x) << DREQWRPERR_S) +#define DREQWRPERR_F DREQWRPERR_V(1U) + +#define CREQRDPERR_S 11 +#define CREQRDPERR_V(x) ((x) << CREQRDPERR_S) +#define CREQRDPERR_F CREQRDPERR_V(1U) + +#define MSTTAGQPERR_S 10 +#define MSTTAGQPERR_V(x) ((x) << MSTTAGQPERR_S) +#define MSTTAGQPERR_F MSTTAGQPERR_V(1U) + +#define PIOREQGRPPERR_S 8 +#define PIOREQGRPPERR_V(x) ((x) << PIOREQGRPPERR_S) +#define PIOREQGRPPERR_F PIOREQGRPPERR_V(1U) + +#define PIOCPLGRPPERR_S 7 +#define PIOCPLGRPPERR_V(x) ((x) << PIOCPLGRPPERR_S) +#define PIOCPLGRPPERR_F PIOCPLGRPPERR_V(1U) + +#define MSIXSTIPERR_S 2 +#define MSIXSTIPERR_V(x) ((x) << MSIXSTIPERR_S) +#define MSIXSTIPERR_F MSIXSTIPERR_V(1U) + +#define MSTTIMEOUTPERR_S 1 +#define MSTTIMEOUTPERR_V(x) ((x) << MSTTIMEOUTPERR_S) +#define MSTTIMEOUTPERR_F MSTTIMEOUTPERR_V(1U) + +#define MSTGRPPERR_S 0 +#define MSTGRPPERR_V(x) ((x) << MSTGRPPERR_S) +#define MSTGRPPERR_F MSTGRPPERR_V(1U) + +#define PCIE_NONFAT_ERR_A 0x3010 +#define PCIE_CFG_SPACE_REQ_A 0x3060 +#define PCIE_CFG_SPACE_DATA_A 0x3064 +#define PCIE_MEM_ACCESS_BASE_WIN_A 0x3068 + +#define PCIEOFST_S 10 +#define PCIEOFST_M 0x3fffffU +#define PCIEOFST_G(x) (((x) >> PCIEOFST_S) & PCIEOFST_M) + +#define BIR_S 
8 +#define BIR_M 0x3U +#define BIR_V(x) ((x) << BIR_S) +#define BIR_G(x) (((x) >> BIR_S) & BIR_M) + +#define WINDOW_S 0 +#define WINDOW_M 0xffU +#define WINDOW_V(x) ((x) << WINDOW_S) +#define WINDOW_G(x) (((x) >> WINDOW_S) & WINDOW_M) + +#define PCIE_MEM_ACCESS_OFFSET_A 0x306c + +#define ENABLE_S 30 +#define ENABLE_V(x) ((x) << ENABLE_S) +#define ENABLE_F ENABLE_V(1U) + +#define LOCALCFG_S 28 +#define LOCALCFG_V(x) ((x) << LOCALCFG_S) +#define LOCALCFG_F LOCALCFG_V(1U) + +#define FUNCTION_S 12 +#define FUNCTION_V(x) ((x) << FUNCTION_S) + +#define REGISTER_S 0 +#define REGISTER_V(x) ((x) << REGISTER_S) + +#define T6_ENABLE_S 31 +#define T6_ENABLE_V(x) ((x) << T6_ENABLE_S) +#define T6_ENABLE_F T6_ENABLE_V(1U) + +#define PFNUM_S 0 +#define PFNUM_V(x) ((x) << PFNUM_S) + +#define PCIE_FW_A 0x30b8 +#define PCIE_FW_PF_A 0x30bc + +#define PCIE_CORE_UTL_SYSTEM_BUS_AGENT_STATUS_A 0x5908 + +#define RNPP_S 31 +#define RNPP_V(x) ((x) << RNPP_S) +#define RNPP_F RNPP_V(1U) + +#define RPCP_S 29 +#define RPCP_V(x) ((x) << RPCP_S) +#define RPCP_F RPCP_V(1U) + +#define RCIP_S 27 +#define RCIP_V(x) ((x) << RCIP_S) +#define RCIP_F RCIP_V(1U) + +#define RCCP_S 26 +#define RCCP_V(x) ((x) << RCCP_S) +#define RCCP_F RCCP_V(1U) + +#define RFTP_S 23 +#define RFTP_V(x) ((x) << RFTP_S) +#define RFTP_F RFTP_V(1U) + +#define PTRP_S 20 +#define PTRP_V(x) ((x) << PTRP_S) +#define PTRP_F PTRP_V(1U) + +#define PCIE_CORE_UTL_PCI_EXPRESS_PORT_STATUS_A 0x59a4 + +#define TPCP_S 30 +#define TPCP_V(x) ((x) << TPCP_S) +#define TPCP_F TPCP_V(1U) + +#define TNPP_S 29 +#define TNPP_V(x) ((x) << TNPP_S) +#define TNPP_F TNPP_V(1U) + +#define TFTP_S 28 +#define TFTP_V(x) ((x) << TFTP_S) +#define TFTP_F TFTP_V(1U) + +#define TCAP_S 27 +#define TCAP_V(x) ((x) << TCAP_S) +#define TCAP_F TCAP_V(1U) + +#define TCIP_S 26 +#define TCIP_V(x) ((x) << TCIP_S) +#define TCIP_F TCIP_V(1U) + +#define RCAP_S 25 +#define RCAP_V(x) ((x) << RCAP_S) +#define RCAP_F RCAP_V(1U) + +#define PLUP_S 23 +#define PLUP_V(x) ((x) << PLUP_S) +#define PLUP_F PLUP_V(1U) + +#define PLDN_S 22 +#define PLDN_V(x) ((x) << PLDN_S) +#define PLDN_F PLDN_V(1U) + +#define OTDD_S 21 +#define OTDD_V(x) ((x) << OTDD_S) +#define OTDD_F OTDD_V(1U) + +#define GTRP_S 20 +#define GTRP_V(x) ((x) << GTRP_S) +#define GTRP_F GTRP_V(1U) + +#define RDPE_S 18 +#define RDPE_V(x) ((x) << RDPE_S) +#define RDPE_F RDPE_V(1U) + +#define TDCE_S 17 +#define TDCE_V(x) ((x) << TDCE_S) +#define TDCE_F TDCE_V(1U) + +#define TDUE_S 16 +#define TDUE_V(x) ((x) << TDUE_S) +#define TDUE_F TDUE_V(1U) + +/* registers for module MC */ +#define MC_INT_CAUSE_A 0x7518 +#define MC_P_INT_CAUSE_A 0x41318 + +#define ECC_UE_INT_CAUSE_S 2 +#define ECC_UE_INT_CAUSE_V(x) ((x) << ECC_UE_INT_CAUSE_S) +#define ECC_UE_INT_CAUSE_F ECC_UE_INT_CAUSE_V(1U) + +#define ECC_CE_INT_CAUSE_S 1 +#define ECC_CE_INT_CAUSE_V(x) ((x) << ECC_CE_INT_CAUSE_S) +#define ECC_CE_INT_CAUSE_F ECC_CE_INT_CAUSE_V(1U) + +#define PERR_INT_CAUSE_S 0 +#define PERR_INT_CAUSE_V(x) ((x) << PERR_INT_CAUSE_S) +#define PERR_INT_CAUSE_F PERR_INT_CAUSE_V(1U) + +#define MC_ECC_STATUS_A 0x751c +#define MC_P_ECC_STATUS_A 0x4131c + +#define ECC_CECNT_S 16 +#define ECC_CECNT_M 0xffffU +#define ECC_CECNT_V(x) ((x) << ECC_CECNT_S) +#define ECC_CECNT_G(x) (((x) >> ECC_CECNT_S) & ECC_CECNT_M) + +#define ECC_UECNT_S 0 +#define ECC_UECNT_M 0xffffU +#define ECC_UECNT_V(x) ((x) << ECC_UECNT_S) +#define ECC_UECNT_G(x) (((x) >> ECC_UECNT_S) & ECC_UECNT_M) + +#define MC_BIST_CMD_A 0x7600 + +#define START_BIST_S 31 +#define START_BIST_V(x) ((x) << START_BIST_S) +#define 
START_BIST_F START_BIST_V(1U) + +#define BIST_CMD_GAP_S 8 +#define BIST_CMD_GAP_V(x) ((x) << BIST_CMD_GAP_S) + +#define BIST_OPCODE_S 0 +#define BIST_OPCODE_V(x) ((x) << BIST_OPCODE_S) + +#define MC_BIST_CMD_ADDR_A 0x7604 +#define MC_BIST_CMD_LEN_A 0x7608 +#define MC_BIST_DATA_PATTERN_A 0x760c + +#define MC_BIST_STATUS_RDATA_A 0x7688 + +/* registers for module MA */ +#define MA_EDRAM0_BAR_A 0x77c0 + +#define EDRAM0_BASE_S 16 +#define EDRAM0_BASE_M 0xfffU +#define EDRAM0_BASE_G(x) (((x) >> EDRAM0_BASE_S) & EDRAM0_BASE_M) + +#define EDRAM0_SIZE_S 0 +#define EDRAM0_SIZE_M 0xfffU +#define EDRAM0_SIZE_V(x) ((x) << EDRAM0_SIZE_S) +#define EDRAM0_SIZE_G(x) (((x) >> EDRAM0_SIZE_S) & EDRAM0_SIZE_M) + +#define MA_EDRAM1_BAR_A 0x77c4 + +#define EDRAM1_BASE_S 16 +#define EDRAM1_BASE_M 0xfffU +#define EDRAM1_BASE_G(x) (((x) >> EDRAM1_BASE_S) & EDRAM1_BASE_M) + +#define EDRAM1_SIZE_S 0 +#define EDRAM1_SIZE_M 0xfffU +#define EDRAM1_SIZE_V(x) ((x) << EDRAM1_SIZE_S) +#define EDRAM1_SIZE_G(x) (((x) >> EDRAM1_SIZE_S) & EDRAM1_SIZE_M) + +#define MA_EXT_MEMORY_BAR_A 0x77c8 + +#define EXT_MEM_BASE_S 16 +#define EXT_MEM_BASE_M 0xfffU +#define EXT_MEM_BASE_V(x) ((x) << EXT_MEM_BASE_S) +#define EXT_MEM_BASE_G(x) (((x) >> EXT_MEM_BASE_S) & EXT_MEM_BASE_M) + +#define EXT_MEM_SIZE_S 0 +#define EXT_MEM_SIZE_M 0xfffU +#define EXT_MEM_SIZE_V(x) ((x) << EXT_MEM_SIZE_S) +#define EXT_MEM_SIZE_G(x) (((x) >> EXT_MEM_SIZE_S) & EXT_MEM_SIZE_M) + +#define MA_EXT_MEMORY1_BAR_A 0x7808 + +#define EXT_MEM1_BASE_S 16 +#define EXT_MEM1_BASE_M 0xfffU +#define EXT_MEM1_BASE_G(x) (((x) >> EXT_MEM1_BASE_S) & EXT_MEM1_BASE_M) + +#define EXT_MEM1_SIZE_S 0 +#define EXT_MEM1_SIZE_M 0xfffU +#define EXT_MEM1_SIZE_V(x) ((x) << EXT_MEM1_SIZE_S) +#define EXT_MEM1_SIZE_G(x) (((x) >> EXT_MEM1_SIZE_S) & EXT_MEM1_SIZE_M) + +#define MA_EXT_MEMORY0_BAR_A 0x77c8 + +#define EXT_MEM0_BASE_S 16 +#define EXT_MEM0_BASE_M 0xfffU +#define EXT_MEM0_BASE_G(x) (((x) >> EXT_MEM0_BASE_S) & EXT_MEM0_BASE_M) + +#define EXT_MEM0_SIZE_S 0 +#define EXT_MEM0_SIZE_M 0xfffU +#define EXT_MEM0_SIZE_V(x) ((x) << EXT_MEM0_SIZE_S) +#define EXT_MEM0_SIZE_G(x) (((x) >> EXT_MEM0_SIZE_S) & EXT_MEM0_SIZE_M) + +#define MA_TARGET_MEM_ENABLE_A 0x77d8 + +#define EXT_MEM_ENABLE_S 2 +#define EXT_MEM_ENABLE_V(x) ((x) << EXT_MEM_ENABLE_S) +#define EXT_MEM_ENABLE_F EXT_MEM_ENABLE_V(1U) + +#define EDRAM1_ENABLE_S 1 +#define EDRAM1_ENABLE_V(x) ((x) << EDRAM1_ENABLE_S) +#define EDRAM1_ENABLE_F EDRAM1_ENABLE_V(1U) + +#define EDRAM0_ENABLE_S 0 +#define EDRAM0_ENABLE_V(x) ((x) << EDRAM0_ENABLE_S) +#define EDRAM0_ENABLE_F EDRAM0_ENABLE_V(1U) + +#define EXT_MEM1_ENABLE_S 4 +#define EXT_MEM1_ENABLE_V(x) ((x) << EXT_MEM1_ENABLE_S) +#define EXT_MEM1_ENABLE_F EXT_MEM1_ENABLE_V(1U) + +#define EXT_MEM0_ENABLE_S 2 +#define EXT_MEM0_ENABLE_V(x) ((x) << EXT_MEM0_ENABLE_S) +#define EXT_MEM0_ENABLE_F EXT_MEM0_ENABLE_V(1U) + +#define MA_INT_CAUSE_A 0x77e0 + +#define MEM_PERR_INT_CAUSE_S 1 +#define MEM_PERR_INT_CAUSE_V(x) ((x) << MEM_PERR_INT_CAUSE_S) +#define MEM_PERR_INT_CAUSE_F MEM_PERR_INT_CAUSE_V(1U) + +#define MEM_WRAP_INT_CAUSE_S 0 +#define MEM_WRAP_INT_CAUSE_V(x) ((x) << MEM_WRAP_INT_CAUSE_S) +#define MEM_WRAP_INT_CAUSE_F MEM_WRAP_INT_CAUSE_V(1U) + +#define MA_INT_WRAP_STATUS_A 0x77e4 + +#define MEM_WRAP_ADDRESS_S 4 +#define MEM_WRAP_ADDRESS_M 0xfffffffU +#define MEM_WRAP_ADDRESS_G(x) (((x) >> MEM_WRAP_ADDRESS_S) & MEM_WRAP_ADDRESS_M) + +#define MEM_WRAP_CLIENT_NUM_S 0 +#define MEM_WRAP_CLIENT_NUM_M 0xfU +#define MEM_WRAP_CLIENT_NUM_G(x) \ + (((x) >> MEM_WRAP_CLIENT_NUM_S) & MEM_WRAP_CLIENT_NUM_M) + 
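The single-bit *_INT_CAUSE fields above are where the _F flag form earns its keep: an interrupt handler reads the cause register, tests each _F bit, and applies the _G getters to the companion status register. A sketch of that pattern for the MA module follows, assuming a hypothetical read_reg() accessor that this header does not provide.

#include <stdint.h>
#include <stdio.h>

/* read_reg() stands in for whatever MMIO accessor the surrounding
 * driver supplies; it is an assumption of this sketch, not part of
 * the header. */
extern uint32_t read_reg(uint32_t addr);

static void ma_report_int_cause(void)
{
	uint32_t cause = read_reg(MA_INT_CAUSE_A);

	if (cause & MEM_PERR_INT_CAUSE_F)
		printf("MA: memory parity error\n");
	if (cause & MEM_WRAP_INT_CAUSE_F) {
		uint32_t st = read_reg(MA_INT_WRAP_STATUS_A);

		printf("MA: address wrap by client %u at 0x%x\n",
		       (unsigned)MEM_WRAP_CLIENT_NUM_G(st),
		       (unsigned)MEM_WRAP_ADDRESS_G(st));
	}
}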
+#define MA_PARITY_ERROR_STATUS_A 0x77f4 +#define MA_PARITY_ERROR_STATUS1_A 0x77f4 +#define MA_PARITY_ERROR_STATUS2_A 0x7804 + +/* registers for module EDC_0 */ +#define EDC_0_BASE_ADDR 0x7900 + +#define EDC_BIST_CMD_A 0x7904 +#define EDC_BIST_CMD_ADDR_A 0x7908 +#define EDC_BIST_CMD_LEN_A 0x790c +#define EDC_BIST_DATA_PATTERN_A 0x7910 +#define EDC_BIST_STATUS_RDATA_A 0x7928 +#define EDC_INT_CAUSE_A 0x7978 + +#define ECC_UE_PAR_S 5 +#define ECC_UE_PAR_V(x) ((x) << ECC_UE_PAR_S) +#define ECC_UE_PAR_F ECC_UE_PAR_V(1U) + +#define ECC_CE_PAR_S 4 +#define ECC_CE_PAR_V(x) ((x) << ECC_CE_PAR_S) +#define ECC_CE_PAR_F ECC_CE_PAR_V(1U) + +#define PERR_PAR_CAUSE_S 3 +#define PERR_PAR_CAUSE_V(x) ((x) << PERR_PAR_CAUSE_S) +#define PERR_PAR_CAUSE_F PERR_PAR_CAUSE_V(1U) + +#define EDC_ECC_STATUS_A 0x797c + +/* registers for module EDC_1 */ +#define EDC_1_BASE_ADDR 0x7980 + +/* registers for module CIM */ +#define CIM_BOOT_CFG_A 0x7b00 +#define CIM_SDRAM_BASE_ADDR_A 0x7b14 +#define CIM_SDRAM_ADDR_SIZE_A 0x7b18 +#define CIM_EXTMEM2_BASE_ADDR_A 0x7b1c +#define CIM_EXTMEM2_ADDR_SIZE_A 0x7b20 +#define CIM_PF_MAILBOX_CTRL_SHADOW_COPY_A 0x290 + +#define BOOTADDR_M 0xffffff00U + +#define UPCRST_S 0 +#define UPCRST_V(x) ((x) << UPCRST_S) +#define UPCRST_F UPCRST_V(1U) + +#define CIM_PF_MAILBOX_DATA_A 0x240 +#define CIM_PF_MAILBOX_CTRL_A 0x280 + +#define MBMSGVALID_S 3 +#define MBMSGVALID_V(x) ((x) << MBMSGVALID_S) +#define MBMSGVALID_F MBMSGVALID_V(1U) + +#define MBINTREQ_S 2 +#define MBINTREQ_V(x) ((x) << MBINTREQ_S) +#define MBINTREQ_F MBINTREQ_V(1U) + +#define MBOWNER_S 0 +#define MBOWNER_M 0x3U +#define MBOWNER_V(x) ((x) << MBOWNER_S) +#define MBOWNER_G(x) (((x) >> MBOWNER_S) & MBOWNER_M) + +#define CIM_PF_HOST_INT_ENABLE_A 0x288 + +#define MBMSGRDYINTEN_S 19 +#define MBMSGRDYINTEN_V(x) ((x) << MBMSGRDYINTEN_S) +#define MBMSGRDYINTEN_F MBMSGRDYINTEN_V(1U) + +#define CIM_PF_HOST_INT_CAUSE_A 0x28c + +#define MBMSGRDYINT_S 19 +#define MBMSGRDYINT_V(x) ((x) << MBMSGRDYINT_S) +#define MBMSGRDYINT_F MBMSGRDYINT_V(1U) + +#define CIM_HOST_INT_CAUSE_A 0x7b2c + +#define TIEQOUTPARERRINT_S 20 +#define TIEQOUTPARERRINT_V(x) ((x) << TIEQOUTPARERRINT_S) +#define TIEQOUTPARERRINT_F TIEQOUTPARERRINT_V(1U) + +#define TIEQINPARERRINT_S 19 +#define TIEQINPARERRINT_V(x) ((x) << TIEQINPARERRINT_S) +#define TIEQINPARERRINT_F TIEQINPARERRINT_V(1U) + +#define PREFDROPINT_S 1 +#define PREFDROPINT_V(x) ((x) << PREFDROPINT_S) +#define PREFDROPINT_F PREFDROPINT_V(1U) + +#define UPACCNONZERO_S 0 +#define UPACCNONZERO_V(x) ((x) << UPACCNONZERO_S) +#define UPACCNONZERO_F UPACCNONZERO_V(1U) + +#define MBHOSTPARERR_S 18 +#define MBHOSTPARERR_V(x) ((x) << MBHOSTPARERR_S) +#define MBHOSTPARERR_F MBHOSTPARERR_V(1U) + +#define MBUPPARERR_S 17 +#define MBUPPARERR_V(x) ((x) << MBUPPARERR_S) +#define MBUPPARERR_F MBUPPARERR_V(1U) + +#define IBQTP0PARERR_S 16 +#define IBQTP0PARERR_V(x) ((x) << IBQTP0PARERR_S) +#define IBQTP0PARERR_F IBQTP0PARERR_V(1U) + +#define IBQTP1PARERR_S 15 +#define IBQTP1PARERR_V(x) ((x) << IBQTP1PARERR_S) +#define IBQTP1PARERR_F IBQTP1PARERR_V(1U) + +#define IBQULPPARERR_S 14 +#define IBQULPPARERR_V(x) ((x) << IBQULPPARERR_S) +#define IBQULPPARERR_F IBQULPPARERR_V(1U) + +#define IBQSGELOPARERR_S 13 +#define IBQSGELOPARERR_V(x) ((x) << IBQSGELOPARERR_S) +#define IBQSGELOPARERR_F IBQSGELOPARERR_V(1U) + +#define IBQSGEHIPARERR_S 12 +#define IBQSGEHIPARERR_V(x) ((x) << IBQSGEHIPARERR_S) +#define IBQSGEHIPARERR_F IBQSGEHIPARERR_V(1U) + +#define IBQNCSIPARERR_S 11 +#define IBQNCSIPARERR_V(x) ((x) << IBQNCSIPARERR_S) +#define 
IBQNCSIPARERR_F IBQNCSIPARERR_V(1U) + +#define OBQULP0PARERR_S 10 +#define OBQULP0PARERR_V(x) ((x) << OBQULP0PARERR_S) +#define OBQULP0PARERR_F OBQULP0PARERR_V(1U) + +#define OBQULP1PARERR_S 9 +#define OBQULP1PARERR_V(x) ((x) << OBQULP1PARERR_S) +#define OBQULP1PARERR_F OBQULP1PARERR_V(1U) + +#define OBQULP2PARERR_S 8 +#define OBQULP2PARERR_V(x) ((x) << OBQULP2PARERR_S) +#define OBQULP2PARERR_F OBQULP2PARERR_V(1U) + +#define OBQULP3PARERR_S 7 +#define OBQULP3PARERR_V(x) ((x) << OBQULP3PARERR_S) +#define OBQULP3PARERR_F OBQULP3PARERR_V(1U) + +#define OBQSGEPARERR_S 6 +#define OBQSGEPARERR_V(x) ((x) << OBQSGEPARERR_S) +#define OBQSGEPARERR_F OBQSGEPARERR_V(1U) + +#define OBQNCSIPARERR_S 5 +#define OBQNCSIPARERR_V(x) ((x) << OBQNCSIPARERR_S) +#define OBQNCSIPARERR_F OBQNCSIPARERR_V(1U) + +#define CIM_HOST_UPACC_INT_CAUSE_A 0x7b34 + +#define EEPROMWRINT_S 30 +#define EEPROMWRINT_V(x) ((x) << EEPROMWRINT_S) +#define EEPROMWRINT_F EEPROMWRINT_V(1U) + +#define TIMEOUTMAINT_S 29 +#define TIMEOUTMAINT_V(x) ((x) << TIMEOUTMAINT_S) +#define TIMEOUTMAINT_F TIMEOUTMAINT_V(1U) + +#define TIMEOUTINT_S 28 +#define TIMEOUTINT_V(x) ((x) << TIMEOUTINT_S) +#define TIMEOUTINT_F TIMEOUTINT_V(1U) + +#define RSPOVRLOOKUPINT_S 27 +#define RSPOVRLOOKUPINT_V(x) ((x) << RSPOVRLOOKUPINT_S) +#define RSPOVRLOOKUPINT_F RSPOVRLOOKUPINT_V(1U) + +#define REQOVRLOOKUPINT_S 26 +#define REQOVRLOOKUPINT_V(x) ((x) << REQOVRLOOKUPINT_S) +#define REQOVRLOOKUPINT_F REQOVRLOOKUPINT_V(1U) + +#define BLKWRPLINT_S 25 +#define BLKWRPLINT_V(x) ((x) << BLKWRPLINT_S) +#define BLKWRPLINT_F BLKWRPLINT_V(1U) + +#define BLKRDPLINT_S 24 +#define BLKRDPLINT_V(x) ((x) << BLKRDPLINT_S) +#define BLKRDPLINT_F BLKRDPLINT_V(1U) + +#define SGLWRPLINT_S 23 +#define SGLWRPLINT_V(x) ((x) << SGLWRPLINT_S) +#define SGLWRPLINT_F SGLWRPLINT_V(1U) + +#define SGLRDPLINT_S 22 +#define SGLRDPLINT_V(x) ((x) << SGLRDPLINT_S) +#define SGLRDPLINT_F SGLRDPLINT_V(1U) + +#define BLKWRCTLINT_S 21 +#define BLKWRCTLINT_V(x) ((x) << BLKWRCTLINT_S) +#define BLKWRCTLINT_F BLKWRCTLINT_V(1U) + +#define BLKRDCTLINT_S 20 +#define BLKRDCTLINT_V(x) ((x) << BLKRDCTLINT_S) +#define BLKRDCTLINT_F BLKRDCTLINT_V(1U) + +#define SGLWRCTLINT_S 19 +#define SGLWRCTLINT_V(x) ((x) << SGLWRCTLINT_S) +#define SGLWRCTLINT_F SGLWRCTLINT_V(1U) + +#define SGLRDCTLINT_S 18 +#define SGLRDCTLINT_V(x) ((x) << SGLRDCTLINT_S) +#define SGLRDCTLINT_F SGLRDCTLINT_V(1U) + +#define BLKWREEPROMINT_S 17 +#define BLKWREEPROMINT_V(x) ((x) << BLKWREEPROMINT_S) +#define BLKWREEPROMINT_F BLKWREEPROMINT_V(1U) + +#define BLKRDEEPROMINT_S 16 +#define BLKRDEEPROMINT_V(x) ((x) << BLKRDEEPROMINT_S) +#define BLKRDEEPROMINT_F BLKRDEEPROMINT_V(1U) + +#define SGLWREEPROMINT_S 15 +#define SGLWREEPROMINT_V(x) ((x) << SGLWREEPROMINT_S) +#define SGLWREEPROMINT_F SGLWREEPROMINT_V(1U) + +#define SGLRDEEPROMINT_S 14 +#define SGLRDEEPROMINT_V(x) ((x) << SGLRDEEPROMINT_S) +#define SGLRDEEPROMINT_F SGLRDEEPROMINT_V(1U) + +#define BLKWRFLASHINT_S 13 +#define BLKWRFLASHINT_V(x) ((x) << BLKWRFLASHINT_S) +#define BLKWRFLASHINT_F BLKWRFLASHINT_V(1U) + +#define BLKRDFLASHINT_S 12 +#define BLKRDFLASHINT_V(x) ((x) << BLKRDFLASHINT_S) +#define BLKRDFLASHINT_F BLKRDFLASHINT_V(1U) + +#define SGLWRFLASHINT_S 11 +#define SGLWRFLASHINT_V(x) ((x) << SGLWRFLASHINT_S) +#define SGLWRFLASHINT_F SGLWRFLASHINT_V(1U) + +#define SGLRDFLASHINT_S 10 +#define SGLRDFLASHINT_V(x) ((x) << SGLRDFLASHINT_S) +#define SGLRDFLASHINT_F SGLRDFLASHINT_V(1U) + +#define BLKWRBOOTINT_S 9 +#define BLKWRBOOTINT_V(x) ((x) << BLKWRBOOTINT_S) +#define BLKWRBOOTINT_F 
BLKWRBOOTINT_V(1U) + +#define BLKRDBOOTINT_S 8 +#define BLKRDBOOTINT_V(x) ((x) << BLKRDBOOTINT_S) +#define BLKRDBOOTINT_F BLKRDBOOTINT_V(1U) + +#define SGLWRBOOTINT_S 7 +#define SGLWRBOOTINT_V(x) ((x) << SGLWRBOOTINT_S) +#define SGLWRBOOTINT_F SGLWRBOOTINT_V(1U) + +#define SGLRDBOOTINT_S 6 +#define SGLRDBOOTINT_V(x) ((x) << SGLRDBOOTINT_S) +#define SGLRDBOOTINT_F SGLRDBOOTINT_V(1U) + +#define ILLWRBEINT_S 5 +#define ILLWRBEINT_V(x) ((x) << ILLWRBEINT_S) +#define ILLWRBEINT_F ILLWRBEINT_V(1U) + +#define ILLRDBEINT_S 4 +#define ILLRDBEINT_V(x) ((x) << ILLRDBEINT_S) +#define ILLRDBEINT_F ILLRDBEINT_V(1U) + +#define ILLRDINT_S 3 +#define ILLRDINT_V(x) ((x) << ILLRDINT_S) +#define ILLRDINT_F ILLRDINT_V(1U) + +#define ILLWRINT_S 2 +#define ILLWRINT_V(x) ((x) << ILLWRINT_S) +#define ILLWRINT_F ILLWRINT_V(1U) + +#define ILLTRANSINT_S 1 +#define ILLTRANSINT_V(x) ((x) << ILLTRANSINT_S) +#define ILLTRANSINT_F ILLTRANSINT_V(1U) + +#define RSVDSPACEINT_S 0 +#define RSVDSPACEINT_V(x) ((x) << RSVDSPACEINT_S) +#define RSVDSPACEINT_F RSVDSPACEINT_V(1U) + +/* registers for module TP */ +#define DBGLAWHLF_S 23 +#define DBGLAWHLF_V(x) ((x) << DBGLAWHLF_S) +#define DBGLAWHLF_F DBGLAWHLF_V(1U) + +#define DBGLAWPTR_S 16 +#define DBGLAWPTR_M 0x7fU +#define DBGLAWPTR_G(x) (((x) >> DBGLAWPTR_S) & DBGLAWPTR_M) + +#define DBGLAENABLE_S 12 +#define DBGLAENABLE_V(x) ((x) << DBGLAENABLE_S) +#define DBGLAENABLE_F DBGLAENABLE_V(1U) + +#define DBGLARPTR_S 0 +#define DBGLARPTR_M 0x7fU +#define DBGLARPTR_V(x) ((x) << DBGLARPTR_S) + +#define TP_DBG_LA_DATAL_A 0x7ed8 +#define TP_DBG_LA_CONFIG_A 0x7ed4 +#define TP_OUT_CONFIG_A 0x7d04 +#define TP_GLOBAL_CONFIG_A 0x7d08 + +#define TP_CMM_TCB_BASE_A 0x7d10 +#define TP_CMM_MM_BASE_A 0x7d14 +#define TP_CMM_TIMER_BASE_A 0x7d18 +#define TP_PMM_TX_BASE_A 0x7d20 +#define TP_PMM_RX_BASE_A 0x7d28 +#define TP_PMM_RX_PAGE_SIZE_A 0x7d2c +#define TP_PMM_RX_MAX_PAGE_A 0x7d30 +#define TP_PMM_TX_PAGE_SIZE_A 0x7d34 +#define TP_PMM_TX_MAX_PAGE_A 0x7d38 +#define TP_CMM_MM_MAX_PSTRUCT_A 0x7e6c + +#define PMRXNUMCHN_S 31 +#define PMRXNUMCHN_V(x) ((x) << PMRXNUMCHN_S) +#define PMRXNUMCHN_F PMRXNUMCHN_V(1U) + +#define PMTXNUMCHN_S 30 +#define PMTXNUMCHN_M 0x3U +#define PMTXNUMCHN_G(x) (((x) >> PMTXNUMCHN_S) & PMTXNUMCHN_M) + +#define PMTXMAXPAGE_S 0 +#define PMTXMAXPAGE_M 0x1fffffU +#define PMTXMAXPAGE_G(x) (((x) >> PMTXMAXPAGE_S) & PMTXMAXPAGE_M) + +#define PMRXMAXPAGE_S 0 +#define PMRXMAXPAGE_M 0x1fffffU +#define PMRXMAXPAGE_G(x) (((x) >> PMRXMAXPAGE_S) & PMRXMAXPAGE_M) + +#define DBGLAMODE_S 14 +#define DBGLAMODE_M 0x3U +#define DBGLAMODE_G(x) (((x) >> DBGLAMODE_S) & DBGLAMODE_M) + +#define FIVETUPLELOOKUP_S 17 +#define FIVETUPLELOOKUP_M 0x3U +#define FIVETUPLELOOKUP_V(x) ((x) << FIVETUPLELOOKUP_S) +#define FIVETUPLELOOKUP_G(x) (((x) >> FIVETUPLELOOKUP_S) & FIVETUPLELOOKUP_M) + +#define TP_PARA_REG2_A 0x7d68 + +#define MAXRXDATA_S 16 +#define MAXRXDATA_M 0xffffU +#define MAXRXDATA_G(x) (((x) >> MAXRXDATA_S) & MAXRXDATA_M) + +#define TP_TIMER_RESOLUTION_A 0x7d90 + +#define TIMERRESOLUTION_S 16 +#define TIMERRESOLUTION_M 0xffU +#define TIMERRESOLUTION_G(x) (((x) >> TIMERRESOLUTION_S) & TIMERRESOLUTION_M) + +#define TIMESTAMPRESOLUTION_S 8 +#define TIMESTAMPRESOLUTION_M 0xffU +#define TIMESTAMPRESOLUTION_G(x) \ + (((x) >> TIMESTAMPRESOLUTION_S) & TIMESTAMPRESOLUTION_M) + +#define DELAYEDACKRESOLUTION_S 0 +#define DELAYEDACKRESOLUTION_M 0xffU +#define DELAYEDACKRESOLUTION_G(x) \ + (((x) >> DELAYEDACKRESOLUTION_S) & DELAYEDACKRESOLUTION_M) + +#define TP_SHIFT_CNT_A 0x7dc0 +#define TP_RXT_MIN_A 0x7d98 
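+
+/*
+ * Field-macro usage sketch (illustrative only, not part of the register
+ * definition).  Throughout this file a register field FOO is described by
+ * FOO_S (bit offset), FOO_M (unshifted mask), FOO_V(x) (shift a value
+ * into position), FOO_F (the single-bit form FOO_V(1U)) and FOO_G(x)
+ * (extract the field from a register word).  Assuming a hypothetical
+ * 32-bit MMIO read helper read_reg32(), decoding TP_TIMER_RESOLUTION
+ * would look like:
+ *
+ *	u32 v = read_reg32(regs + TP_TIMER_RESOLUTION_A);
+ *	unsigned int tmr_res = TIMERRESOLUTION_G(v);
+ *	unsigned int ts_res = TIMESTAMPRESOLUTION_G(v);
+ *	unsigned int dack_res = DELAYEDACKRESOLUTION_G(v);
+ */
+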
+#define TP_RXT_MAX_A 0x7d9c +#define TP_PERS_MIN_A 0x7da0 +#define TP_PERS_MAX_A 0x7da4 +#define TP_KEEP_IDLE_A 0x7da8 +#define TP_KEEP_INTVL_A 0x7dac +#define TP_INIT_SRTT_A 0x7db0 +#define TP_DACK_TIMER_A 0x7db4 +#define TP_FINWAIT2_TIMER_A 0x7db8 + +#define INITSRTT_S 0 +#define INITSRTT_M 0xffffU +#define INITSRTT_G(x) (((x) >> INITSRTT_S) & INITSRTT_M) + +#define PERSMAX_S 0 +#define PERSMAX_M 0x3fffffffU +#define PERSMAX_V(x) ((x) << PERSMAX_S) +#define PERSMAX_G(x) (((x) >> PERSMAX_S) & PERSMAX_M) + +#define SYNSHIFTMAX_S 24 +#define SYNSHIFTMAX_M 0xffU +#define SYNSHIFTMAX_V(x) ((x) << SYNSHIFTMAX_S) +#define SYNSHIFTMAX_G(x) (((x) >> SYNSHIFTMAX_S) & SYNSHIFTMAX_M) + +#define RXTSHIFTMAXR1_S 20 +#define RXTSHIFTMAXR1_M 0xfU +#define RXTSHIFTMAXR1_V(x) ((x) << RXTSHIFTMAXR1_S) +#define RXTSHIFTMAXR1_G(x) (((x) >> RXTSHIFTMAXR1_S) & RXTSHIFTMAXR1_M) + +#define RXTSHIFTMAXR2_S 16 +#define RXTSHIFTMAXR2_M 0xfU +#define RXTSHIFTMAXR2_V(x) ((x) << RXTSHIFTMAXR2_S) +#define RXTSHIFTMAXR2_G(x) (((x) >> RXTSHIFTMAXR2_S) & RXTSHIFTMAXR2_M) + +#define PERSHIFTBACKOFFMAX_S 12 +#define PERSHIFTBACKOFFMAX_M 0xfU +#define PERSHIFTBACKOFFMAX_V(x) ((x) << PERSHIFTBACKOFFMAX_S) +#define PERSHIFTBACKOFFMAX_G(x) \ + (((x) >> PERSHIFTBACKOFFMAX_S) & PERSHIFTBACKOFFMAX_M) + +#define PERSHIFTMAX_S 8 +#define PERSHIFTMAX_M 0xfU +#define PERSHIFTMAX_V(x) ((x) << PERSHIFTMAX_S) +#define PERSHIFTMAX_G(x) (((x) >> PERSHIFTMAX_S) & PERSHIFTMAX_M) + +#define KEEPALIVEMAXR1_S 4 +#define KEEPALIVEMAXR1_M 0xfU +#define KEEPALIVEMAXR1_V(x) ((x) << KEEPALIVEMAXR1_S) +#define KEEPALIVEMAXR1_G(x) (((x) >> KEEPALIVEMAXR1_S) & KEEPALIVEMAXR1_M) + +#define KEEPALIVEMAXR2_S 0 +#define KEEPALIVEMAXR2_M 0xfU +#define KEEPALIVEMAXR2_V(x) ((x) << KEEPALIVEMAXR2_S) +#define KEEPALIVEMAXR2_G(x) (((x) >> KEEPALIVEMAXR2_S) & KEEPALIVEMAXR2_M) + +#define ROWINDEX_S 16 +#define ROWINDEX_V(x) ((x) << ROWINDEX_S) + +#define TP_CCTRL_TABLE_A 0x7ddc +#define TP_MTU_TABLE_A 0x7de4 + +#define MTUINDEX_S 24 +#define MTUINDEX_V(x) ((x) << MTUINDEX_S) + +#define MTUWIDTH_S 16 +#define MTUWIDTH_M 0xfU +#define MTUWIDTH_V(x) ((x) << MTUWIDTH_S) +#define MTUWIDTH_G(x) (((x) >> MTUWIDTH_S) & MTUWIDTH_M) + +#define MTUVALUE_S 0 +#define MTUVALUE_M 0x3fffU +#define MTUVALUE_V(x) ((x) << MTUVALUE_S) +#define MTUVALUE_G(x) (((x) >> MTUVALUE_S) & MTUVALUE_M) + +#define TP_RSS_LKP_TABLE_A 0x7dec +#define TP_CMM_MM_RX_FLST_BASE_A 0x7e60 +#define TP_CMM_MM_TX_FLST_BASE_A 0x7e64 +#define TP_CMM_MM_PS_FLST_BASE_A 0x7e68 + +#define LKPTBLROWVLD_S 31 +#define LKPTBLROWVLD_V(x) ((x) << LKPTBLROWVLD_S) +#define LKPTBLROWVLD_F LKPTBLROWVLD_V(1U) + +#define LKPTBLQUEUE1_S 10 +#define LKPTBLQUEUE1_M 0x3ffU +#define LKPTBLQUEUE1_G(x) (((x) >> LKPTBLQUEUE1_S) & LKPTBLQUEUE1_M) + +#define LKPTBLQUEUE0_S 0 +#define LKPTBLQUEUE0_M 0x3ffU +#define LKPTBLQUEUE0_G(x) (((x) >> LKPTBLQUEUE0_S) & LKPTBLQUEUE0_M) + +#define TP_PIO_ADDR_A 0x7e40 +#define TP_PIO_DATA_A 0x7e44 +#define TP_MIB_INDEX_A 0x7e50 +#define TP_MIB_DATA_A 0x7e54 +#define TP_INT_CAUSE_A 0x7e74 +#define SRQTABLEPERR_S 1 +#define SRQTABLEPERR_V(x) ((x) << SRQTABLEPERR_S) +#define SRQTABLEPERR_F SRQTABLEPERR_V(1U) + + +#define FLMTXFLSTEMPTY_S 30 +#define FLMTXFLSTEMPTY_V(x) ((x) << FLMTXFLSTEMPTY_S) +#define FLMTXFLSTEMPTY_F FLMTXFLSTEMPTY_V(1U) + +#define TP_TX_ORATE_A 0x7ebc + +#define OFDRATE3_S 24 +#define OFDRATE3_M 0xffU +#define OFDRATE3_G(x) (((x) >> OFDRATE3_S) & OFDRATE3_M) + +#define OFDRATE2_S 16 +#define OFDRATE2_M 0xffU +#define OFDRATE2_G(x) (((x) >> OFDRATE2_S) & OFDRATE2_M) + +#define 
OFDRATE1_S 8 +#define OFDRATE1_M 0xffU +#define OFDRATE1_G(x) (((x) >> OFDRATE1_S) & OFDRATE1_M) + +#define OFDRATE0_S 0 +#define OFDRATE0_M 0xffU +#define OFDRATE0_G(x) (((x) >> OFDRATE0_S) & OFDRATE0_M) + +#define TP_TX_TRATE_A 0x7ed0 + +#define TNLRATE3_S 24 +#define TNLRATE3_M 0xffU +#define TNLRATE3_G(x) (((x) >> TNLRATE3_S) & TNLRATE3_M) + +#define TNLRATE2_S 16 +#define TNLRATE2_M 0xffU +#define TNLRATE2_G(x) (((x) >> TNLRATE2_S) & TNLRATE2_M) + +#define TNLRATE1_S 8 +#define TNLRATE1_M 0xffU +#define TNLRATE1_G(x) (((x) >> TNLRATE1_S) & TNLRATE1_M) + +#define TNLRATE0_S 0 +#define TNLRATE0_M 0xffU +#define TNLRATE0_G(x) (((x) >> TNLRATE0_S) & TNLRATE0_M) + +#define TP_VLAN_PRI_MAP_A 0x140 + +#define FRAGMENTATION_S 9 +#define FRAGMENTATION_V(x) ((x) << FRAGMENTATION_S) +#define FRAGMENTATION_F FRAGMENTATION_V(1U) + +#define MPSHITTYPE_S 8 +#define MPSHITTYPE_V(x) ((x) << MPSHITTYPE_S) +#define MPSHITTYPE_F MPSHITTYPE_V(1U) + +#define MACMATCH_S 7 +#define MACMATCH_V(x) ((x) << MACMATCH_S) +#define MACMATCH_F MACMATCH_V(1U) + +#define ETHERTYPE_S 6 +#define ETHERTYPE_V(x) ((x) << ETHERTYPE_S) +#define ETHERTYPE_F ETHERTYPE_V(1U) + +#define PROTOCOL_S 5 +#define PROTOCOL_V(x) ((x) << PROTOCOL_S) +#define PROTOCOL_F PROTOCOL_V(1U) + +#define TOS_S 4 +#define TOS_V(x) ((x) << TOS_S) +#define TOS_F TOS_V(1U) + +#define VLAN_S 3 +#define VLAN_V(x) ((x) << VLAN_S) +#define VLAN_F VLAN_V(1U) + +#define VNIC_ID_S 2 +#define VNIC_ID_V(x) ((x) << VNIC_ID_S) +#define VNIC_ID_F VNIC_ID_V(1U) + +#define PORT_S 1 +#define PORT_V(x) ((x) << PORT_S) +#define PORT_F PORT_V(1U) + +#define FCOE_S 0 +#define FCOE_V(x) ((x) << FCOE_S) +#define FCOE_F FCOE_V(1U) + +#define FILTERMODE_S 15 +#define FILTERMODE_V(x) ((x) << FILTERMODE_S) +#define FILTERMODE_F FILTERMODE_V(1U) + +#define FCOEMASK_S 14 +#define FCOEMASK_V(x) ((x) << FCOEMASK_S) +#define FCOEMASK_F FCOEMASK_V(1U) + +#define TP_INGRESS_CONFIG_A 0x141 + +#define VNIC_S 11 +#define VNIC_V(x) ((x) << VNIC_S) +#define VNIC_F VNIC_V(1U) + +#define CSUM_HAS_PSEUDO_HDR_S 10 +#define CSUM_HAS_PSEUDO_HDR_V(x) ((x) << CSUM_HAS_PSEUDO_HDR_S) +#define CSUM_HAS_PSEUDO_HDR_F CSUM_HAS_PSEUDO_HDR_V(1U) + +#define TP_MIB_MAC_IN_ERR_0_A 0x0 +#define TP_MIB_HDR_IN_ERR_0_A 0x4 +#define TP_MIB_TCP_IN_ERR_0_A 0x8 +#define TP_MIB_TCP_OUT_RST_A 0xc +#define TP_MIB_TCP_IN_SEG_HI_A 0x10 +#define TP_MIB_TCP_IN_SEG_LO_A 0x11 +#define TP_MIB_TCP_OUT_SEG_HI_A 0x12 +#define TP_MIB_TCP_OUT_SEG_LO_A 0x13 +#define TP_MIB_TCP_RXT_SEG_HI_A 0x14 +#define TP_MIB_TCP_RXT_SEG_LO_A 0x15 +#define TP_MIB_TNL_CNG_DROP_0_A 0x18 +#define TP_MIB_OFD_CHN_DROP_0_A 0x1c +#define TP_MIB_TCP_V6IN_ERR_0_A 0x28 +#define TP_MIB_TCP_V6OUT_RST_A 0x2c +#define TP_MIB_OFD_ARP_DROP_A 0x36 +#define TP_MIB_CPL_IN_REQ_0_A 0x38 +#define TP_MIB_CPL_OUT_RSP_0_A 0x3c +#define TP_MIB_TNL_DROP_0_A 0x44 +#define TP_MIB_FCOE_DDP_0_A 0x48 +#define TP_MIB_FCOE_DROP_0_A 0x4c +#define TP_MIB_FCOE_BYTE_0_HI_A 0x50 +#define TP_MIB_OFD_VLN_DROP_0_A 0x58 +#define TP_MIB_USM_PKTS_A 0x5c +#define TP_MIB_RQE_DFR_PKT_A 0x64 + +#define ULP_TX_INT_CAUSE_A 0x8dcc +#define ULP_TX_TPT_LLIMIT_A 0x8dd4 +#define ULP_TX_TPT_ULIMIT_A 0x8dd8 +#define ULP_TX_PBL_LLIMIT_A 0x8ddc +#define ULP_TX_PBL_ULIMIT_A 0x8de0 +#define ULP_TX_ERR_TABLE_BASE_A 0x8e04 + +#define PBL_BOUND_ERR_CH3_S 31 +#define PBL_BOUND_ERR_CH3_V(x) ((x) << PBL_BOUND_ERR_CH3_S) +#define PBL_BOUND_ERR_CH3_F PBL_BOUND_ERR_CH3_V(1U) + +#define PBL_BOUND_ERR_CH2_S 30 +#define PBL_BOUND_ERR_CH2_V(x) ((x) << PBL_BOUND_ERR_CH2_S) +#define PBL_BOUND_ERR_CH2_F 
PBL_BOUND_ERR_CH2_V(1U) + +#define PBL_BOUND_ERR_CH1_S 29 +#define PBL_BOUND_ERR_CH1_V(x) ((x) << PBL_BOUND_ERR_CH1_S) +#define PBL_BOUND_ERR_CH1_F PBL_BOUND_ERR_CH1_V(1U) + +#define PBL_BOUND_ERR_CH0_S 28 +#define PBL_BOUND_ERR_CH0_V(x) ((x) << PBL_BOUND_ERR_CH0_S) +#define PBL_BOUND_ERR_CH0_F PBL_BOUND_ERR_CH0_V(1U) + +#define PM_RX_INT_CAUSE_A 0x8fdc +#define PM_RX_STAT_CONFIG_A 0x8fc8 +#define PM_RX_STAT_COUNT_A 0x8fcc +#define PM_RX_STAT_LSB_A 0x8fd0 +#define PM_RX_DBG_CTRL_A 0x8fd0 +#define PM_RX_DBG_DATA_A 0x8fd4 +#define PM_RX_DBG_STAT_MSB_A 0x10013 + +#define PMRX_FRAMING_ERROR_F 0x003ffff0U + +#define ZERO_E_CMD_ERROR_S 22 +#define ZERO_E_CMD_ERROR_V(x) ((x) << ZERO_E_CMD_ERROR_S) +#define ZERO_E_CMD_ERROR_F ZERO_E_CMD_ERROR_V(1U) + +#define OCSPI_PAR_ERROR_S 3 +#define OCSPI_PAR_ERROR_V(x) ((x) << OCSPI_PAR_ERROR_S) +#define OCSPI_PAR_ERROR_F OCSPI_PAR_ERROR_V(1U) + +#define DB_OPTIONS_PAR_ERROR_S 2 +#define DB_OPTIONS_PAR_ERROR_V(x) ((x) << DB_OPTIONS_PAR_ERROR_S) +#define DB_OPTIONS_PAR_ERROR_F DB_OPTIONS_PAR_ERROR_V(1U) + +#define IESPI_PAR_ERROR_S 1 +#define IESPI_PAR_ERROR_V(x) ((x) << IESPI_PAR_ERROR_S) +#define IESPI_PAR_ERROR_F IESPI_PAR_ERROR_V(1U) + +#define PMRX_E_PCMD_PAR_ERROR_S 0 +#define PMRX_E_PCMD_PAR_ERROR_V(x) ((x) << PMRX_E_PCMD_PAR_ERROR_S) +#define PMRX_E_PCMD_PAR_ERROR_F PMRX_E_PCMD_PAR_ERROR_V(1U) + +#define PM_TX_INT_CAUSE_A 0x8ffc +#define PM_TX_STAT_CONFIG_A 0x8fe8 +#define PM_TX_STAT_COUNT_A 0x8fec +#define PM_TX_STAT_LSB_A 0x8ff0 +#define PM_TX_DBG_CTRL_A 0x8ff0 +#define PM_TX_DBG_DATA_A 0x8ff4 +#define PM_TX_DBG_STAT_MSB_A 0x1001a + +#define PCMD_LEN_OVFL0_S 31 +#define PCMD_LEN_OVFL0_V(x) ((x) << PCMD_LEN_OVFL0_S) +#define PCMD_LEN_OVFL0_F PCMD_LEN_OVFL0_V(1U) + +#define PCMD_LEN_OVFL1_S 30 +#define PCMD_LEN_OVFL1_V(x) ((x) << PCMD_LEN_OVFL1_S) +#define PCMD_LEN_OVFL1_F PCMD_LEN_OVFL1_V(1U) + +#define PCMD_LEN_OVFL2_S 29 +#define PCMD_LEN_OVFL2_V(x) ((x) << PCMD_LEN_OVFL2_S) +#define PCMD_LEN_OVFL2_F PCMD_LEN_OVFL2_V(1U) + +#define ZERO_C_CMD_ERROR_S 28 +#define ZERO_C_CMD_ERROR_V(x) ((x) << ZERO_C_CMD_ERROR_S) +#define ZERO_C_CMD_ERROR_F ZERO_C_CMD_ERROR_V(1U) + +#define PMTX_FRAMING_ERROR_F 0x0ffffff0U + +#define OESPI_PAR_ERROR_S 3 +#define OESPI_PAR_ERROR_V(x) ((x) << OESPI_PAR_ERROR_S) +#define OESPI_PAR_ERROR_F OESPI_PAR_ERROR_V(1U) + +#define ICSPI_PAR_ERROR_S 1 +#define ICSPI_PAR_ERROR_V(x) ((x) << ICSPI_PAR_ERROR_S) +#define ICSPI_PAR_ERROR_F ICSPI_PAR_ERROR_V(1U) + +#define PMTX_C_PCMD_PAR_ERROR_S 0 +#define PMTX_C_PCMD_PAR_ERROR_V(x) ((x) << PMTX_C_PCMD_PAR_ERROR_S) +#define PMTX_C_PCMD_PAR_ERROR_F PMTX_C_PCMD_PAR_ERROR_V(1U) + +#define MPS_PORT_STAT_TX_PORT_BYTES_L 0x400 +#define MPS_PORT_STAT_TX_PORT_BYTES_H 0x404 +#define MPS_PORT_STAT_TX_PORT_FRAMES_L 0x408 +#define MPS_PORT_STAT_TX_PORT_FRAMES_H 0x40c +#define MPS_PORT_STAT_TX_PORT_BCAST_L 0x410 +#define MPS_PORT_STAT_TX_PORT_BCAST_H 0x414 +#define MPS_PORT_STAT_TX_PORT_MCAST_L 0x418 +#define MPS_PORT_STAT_TX_PORT_MCAST_H 0x41c +#define MPS_PORT_STAT_TX_PORT_UCAST_L 0x420 +#define MPS_PORT_STAT_TX_PORT_UCAST_H 0x424 +#define MPS_PORT_STAT_TX_PORT_ERROR_L 0x428 +#define MPS_PORT_STAT_TX_PORT_ERROR_H 0x42c +#define MPS_PORT_STAT_TX_PORT_64B_L 0x430 +#define MPS_PORT_STAT_TX_PORT_64B_H 0x434 +#define MPS_PORT_STAT_TX_PORT_65B_127B_L 0x438 +#define MPS_PORT_STAT_TX_PORT_65B_127B_H 0x43c +#define MPS_PORT_STAT_TX_PORT_128B_255B_L 0x440 +#define MPS_PORT_STAT_TX_PORT_128B_255B_H 0x444 +#define MPS_PORT_STAT_TX_PORT_256B_511B_L 0x448 +#define MPS_PORT_STAT_TX_PORT_256B_511B_H 0x44c 
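+
+/*
+ * The MPS_PORT_STAT_* counters in this block are 64-bit values split
+ * across consecutive 32-bit registers: ..._L holds bits 31:0 and ..._H,
+ * at the next word, holds bits 63:32.  A reassembly sketch, assuming a
+ * hypothetical read_reg32() helper:
+ *
+ *	u64 tx_frames;
+ *
+ *	tx_frames = read_reg32(regs + MPS_PORT_STAT_TX_PORT_FRAMES_L);
+ *	tx_frames |= (u64)read_reg32(regs +
+ *				     MPS_PORT_STAT_TX_PORT_FRAMES_H) << 32;
+ */
+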
+#define MPS_PORT_STAT_TX_PORT_512B_1023B_L 0x450 +#define MPS_PORT_STAT_TX_PORT_512B_1023B_H 0x454 +#define MPS_PORT_STAT_TX_PORT_1024B_1518B_L 0x458 +#define MPS_PORT_STAT_TX_PORT_1024B_1518B_H 0x45c +#define MPS_PORT_STAT_TX_PORT_1519B_MAX_L 0x460 +#define MPS_PORT_STAT_TX_PORT_1519B_MAX_H 0x464 +#define MPS_PORT_STAT_TX_PORT_DROP_L 0x468 +#define MPS_PORT_STAT_TX_PORT_DROP_H 0x46c +#define MPS_PORT_STAT_TX_PORT_PAUSE_L 0x470 +#define MPS_PORT_STAT_TX_PORT_PAUSE_H 0x474 +#define MPS_PORT_STAT_TX_PORT_PPP0_L 0x478 +#define MPS_PORT_STAT_TX_PORT_PPP0_H 0x47c +#define MPS_PORT_STAT_TX_PORT_PPP1_L 0x480 +#define MPS_PORT_STAT_TX_PORT_PPP1_H 0x484 +#define MPS_PORT_STAT_TX_PORT_PPP2_L 0x488 +#define MPS_PORT_STAT_TX_PORT_PPP2_H 0x48c +#define MPS_PORT_STAT_TX_PORT_PPP3_L 0x490 +#define MPS_PORT_STAT_TX_PORT_PPP3_H 0x494 +#define MPS_PORT_STAT_TX_PORT_PPP4_L 0x498 +#define MPS_PORT_STAT_TX_PORT_PPP4_H 0x49c +#define MPS_PORT_STAT_TX_PORT_PPP5_L 0x4a0 +#define MPS_PORT_STAT_TX_PORT_PPP5_H 0x4a4 +#define MPS_PORT_STAT_TX_PORT_PPP6_L 0x4a8 +#define MPS_PORT_STAT_TX_PORT_PPP6_H 0x4ac +#define MPS_PORT_STAT_TX_PORT_PPP7_L 0x4b0 +#define MPS_PORT_STAT_TX_PORT_PPP7_H 0x4b4 +#define MPS_PORT_STAT_LB_PORT_BYTES_L 0x4c0 +#define MPS_PORT_STAT_LB_PORT_BYTES_H 0x4c4 +#define MPS_PORT_STAT_LB_PORT_FRAMES_L 0x4c8 +#define MPS_PORT_STAT_LB_PORT_FRAMES_H 0x4cc +#define MPS_PORT_STAT_LB_PORT_BCAST_L 0x4d0 +#define MPS_PORT_STAT_LB_PORT_BCAST_H 0x4d4 +#define MPS_PORT_STAT_LB_PORT_MCAST_L 0x4d8 +#define MPS_PORT_STAT_LB_PORT_MCAST_H 0x4dc +#define MPS_PORT_STAT_LB_PORT_UCAST_L 0x4e0 +#define MPS_PORT_STAT_LB_PORT_UCAST_H 0x4e4 +#define MPS_PORT_STAT_LB_PORT_ERROR_L 0x4e8 +#define MPS_PORT_STAT_LB_PORT_ERROR_H 0x4ec +#define MPS_PORT_STAT_LB_PORT_64B_L 0x4f0 +#define MPS_PORT_STAT_LB_PORT_64B_H 0x4f4 +#define MPS_PORT_STAT_LB_PORT_65B_127B_L 0x4f8 +#define MPS_PORT_STAT_LB_PORT_65B_127B_H 0x4fc +#define MPS_PORT_STAT_LB_PORT_128B_255B_L 0x500 +#define MPS_PORT_STAT_LB_PORT_128B_255B_H 0x504 +#define MPS_PORT_STAT_LB_PORT_256B_511B_L 0x508 +#define MPS_PORT_STAT_LB_PORT_256B_511B_H 0x50c +#define MPS_PORT_STAT_LB_PORT_512B_1023B_L 0x510 +#define MPS_PORT_STAT_LB_PORT_512B_1023B_H 0x514 +#define MPS_PORT_STAT_LB_PORT_1024B_1518B_L 0x518 +#define MPS_PORT_STAT_LB_PORT_1024B_1518B_H 0x51c +#define MPS_PORT_STAT_LB_PORT_1519B_MAX_L 0x520 +#define MPS_PORT_STAT_LB_PORT_1519B_MAX_H 0x524 +#define MPS_PORT_STAT_LB_PORT_DROP_FRAMES 0x528 +#define MPS_PORT_STAT_LB_PORT_DROP_FRAMES_L 0x528 +#define MPS_PORT_STAT_RX_PORT_BYTES_L 0x540 +#define MPS_PORT_STAT_RX_PORT_BYTES_H 0x544 +#define MPS_PORT_STAT_RX_PORT_FRAMES_L 0x548 +#define MPS_PORT_STAT_RX_PORT_FRAMES_H 0x54c +#define MPS_PORT_STAT_RX_PORT_BCAST_L 0x550 +#define MPS_PORT_STAT_RX_PORT_BCAST_H 0x554 +#define MPS_PORT_STAT_RX_PORT_MCAST_L 0x558 +#define MPS_PORT_STAT_RX_PORT_MCAST_H 0x55c +#define MPS_PORT_STAT_RX_PORT_UCAST_L 0x560 +#define MPS_PORT_STAT_RX_PORT_UCAST_H 0x564 +#define MPS_PORT_STAT_RX_PORT_MTU_ERROR_L 0x568 +#define MPS_PORT_STAT_RX_PORT_MTU_ERROR_H 0x56c +#define MPS_PORT_STAT_RX_PORT_MTU_CRC_ERROR_L 0x570 +#define MPS_PORT_STAT_RX_PORT_MTU_CRC_ERROR_H 0x574 +#define MPS_PORT_STAT_RX_PORT_CRC_ERROR_L 0x578 +#define MPS_PORT_STAT_RX_PORT_CRC_ERROR_H 0x57c +#define MPS_PORT_STAT_RX_PORT_LEN_ERROR_L 0x580 +#define MPS_PORT_STAT_RX_PORT_LEN_ERROR_H 0x584 +#define MPS_PORT_STAT_RX_PORT_SYM_ERROR_L 0x588 +#define MPS_PORT_STAT_RX_PORT_SYM_ERROR_H 0x58c +#define MPS_PORT_STAT_RX_PORT_64B_L 0x590 +#define MPS_PORT_STAT_RX_PORT_64B_H 0x594 +#define 
MPS_PORT_STAT_RX_PORT_65B_127B_L 0x598 +#define MPS_PORT_STAT_RX_PORT_65B_127B_H 0x59c +#define MPS_PORT_STAT_RX_PORT_128B_255B_L 0x5a0 +#define MPS_PORT_STAT_RX_PORT_128B_255B_H 0x5a4 +#define MPS_PORT_STAT_RX_PORT_256B_511B_L 0x5a8 +#define MPS_PORT_STAT_RX_PORT_256B_511B_H 0x5ac +#define MPS_PORT_STAT_RX_PORT_512B_1023B_L 0x5b0 +#define MPS_PORT_STAT_RX_PORT_512B_1023B_H 0x5b4 +#define MPS_PORT_STAT_RX_PORT_1024B_1518B_L 0x5b8 +#define MPS_PORT_STAT_RX_PORT_1024B_1518B_H 0x5bc +#define MPS_PORT_STAT_RX_PORT_1519B_MAX_L 0x5c0 +#define MPS_PORT_STAT_RX_PORT_1519B_MAX_H 0x5c4 +#define MPS_PORT_STAT_RX_PORT_PAUSE_L 0x5c8 +#define MPS_PORT_STAT_RX_PORT_PAUSE_H 0x5cc +#define MPS_PORT_STAT_RX_PORT_PPP0_L 0x5d0 +#define MPS_PORT_STAT_RX_PORT_PPP0_H 0x5d4 +#define MPS_PORT_STAT_RX_PORT_PPP1_L 0x5d8 +#define MPS_PORT_STAT_RX_PORT_PPP1_H 0x5dc +#define MPS_PORT_STAT_RX_PORT_PPP2_L 0x5e0 +#define MPS_PORT_STAT_RX_PORT_PPP2_H 0x5e4 +#define MPS_PORT_STAT_RX_PORT_PPP3_L 0x5e8 +#define MPS_PORT_STAT_RX_PORT_PPP3_H 0x5ec +#define MPS_PORT_STAT_RX_PORT_PPP4_L 0x5f0 +#define MPS_PORT_STAT_RX_PORT_PPP4_H 0x5f4 +#define MPS_PORT_STAT_RX_PORT_PPP5_L 0x5f8 +#define MPS_PORT_STAT_RX_PORT_PPP5_H 0x5fc +#define MPS_PORT_STAT_RX_PORT_PPP6_L 0x600 +#define MPS_PORT_STAT_RX_PORT_PPP6_H 0x604 +#define MPS_PORT_STAT_RX_PORT_PPP7_L 0x608 +#define MPS_PORT_STAT_RX_PORT_PPP7_H 0x60c +#define MPS_PORT_STAT_RX_PORT_LESS_64B_L 0x610 +#define MPS_PORT_STAT_RX_PORT_LESS_64B_H 0x614 +#define MAC_PORT_MAGIC_MACID_LO 0x824 +#define MAC_PORT_MAGIC_MACID_HI 0x828 + +#define MAC_PORT_EPIO_DATA0_A 0x8c0 +#define MAC_PORT_EPIO_DATA1_A 0x8c4 +#define MAC_PORT_EPIO_DATA2_A 0x8c8 +#define MAC_PORT_EPIO_DATA3_A 0x8cc +#define MAC_PORT_EPIO_OP_A 0x8d0 + +#define MAC_PORT_CFG2_A 0x818 + +#define MPS_CMN_CTL_A 0x9000 + +#define NUMPORTS_S 0 +#define NUMPORTS_M 0x3U +#define NUMPORTS_G(x) (((x) >> NUMPORTS_S) & NUMPORTS_M) + +#define MPS_INT_CAUSE_A 0x9008 +#define MPS_TX_INT_CAUSE_A 0x9408 + +#define FRMERR_S 15 +#define FRMERR_V(x) ((x) << FRMERR_S) +#define FRMERR_F FRMERR_V(1U) + +#define SECNTERR_S 14 +#define SECNTERR_V(x) ((x) << SECNTERR_S) +#define SECNTERR_F SECNTERR_V(1U) + +#define BUBBLE_S 13 +#define BUBBLE_V(x) ((x) << BUBBLE_S) +#define BUBBLE_F BUBBLE_V(1U) + +#define TXDESCFIFO_S 9 +#define TXDESCFIFO_M 0xfU +#define TXDESCFIFO_V(x) ((x) << TXDESCFIFO_S) + +#define TXDATAFIFO_S 5 +#define TXDATAFIFO_M 0xfU +#define TXDATAFIFO_V(x) ((x) << TXDATAFIFO_S) + +#define NCSIFIFO_S 4 +#define NCSIFIFO_V(x) ((x) << NCSIFIFO_S) +#define NCSIFIFO_F NCSIFIFO_V(1U) + +#define TPFIFO_S 0 +#define TPFIFO_M 0xfU +#define TPFIFO_V(x) ((x) << TPFIFO_S) + +#define MPS_STAT_PERR_INT_CAUSE_SRAM_A 0x9614 +#define MPS_STAT_PERR_INT_CAUSE_TX_FIFO_A 0x9620 +#define MPS_STAT_PERR_INT_CAUSE_RX_FIFO_A 0x962c + +#define MPS_STAT_RX_BG_0_MAC_DROP_FRAME_L 0x9640 +#define MPS_STAT_RX_BG_0_MAC_DROP_FRAME_H 0x9644 +#define MPS_STAT_RX_BG_1_MAC_DROP_FRAME_L 0x9648 +#define MPS_STAT_RX_BG_1_MAC_DROP_FRAME_H 0x964c +#define MPS_STAT_RX_BG_2_MAC_DROP_FRAME_L 0x9650 +#define MPS_STAT_RX_BG_2_MAC_DROP_FRAME_H 0x9654 +#define MPS_STAT_RX_BG_3_MAC_DROP_FRAME_L 0x9658 +#define MPS_STAT_RX_BG_3_MAC_DROP_FRAME_H 0x965c +#define MPS_STAT_RX_BG_0_LB_DROP_FRAME_L 0x9660 +#define MPS_STAT_RX_BG_0_LB_DROP_FRAME_H 0x9664 +#define MPS_STAT_RX_BG_1_LB_DROP_FRAME_L 0x9668 +#define MPS_STAT_RX_BG_1_LB_DROP_FRAME_H 0x966c +#define MPS_STAT_RX_BG_2_LB_DROP_FRAME_L 0x9670 +#define MPS_STAT_RX_BG_2_LB_DROP_FRAME_H 0x9674 +#define MPS_STAT_RX_BG_3_LB_DROP_FRAME_L 0x9678 +#define 
MPS_STAT_RX_BG_3_LB_DROP_FRAME_H 0x967c +#define MPS_STAT_RX_BG_0_MAC_TRUNC_FRAME_L 0x9680 +#define MPS_STAT_RX_BG_0_MAC_TRUNC_FRAME_H 0x9684 +#define MPS_STAT_RX_BG_1_MAC_TRUNC_FRAME_L 0x9688 +#define MPS_STAT_RX_BG_1_MAC_TRUNC_FRAME_H 0x968c +#define MPS_STAT_RX_BG_2_MAC_TRUNC_FRAME_L 0x9690 +#define MPS_STAT_RX_BG_2_MAC_TRUNC_FRAME_H 0x9694 +#define MPS_STAT_RX_BG_3_MAC_TRUNC_FRAME_L 0x9698 +#define MPS_STAT_RX_BG_3_MAC_TRUNC_FRAME_H 0x969c +#define MPS_STAT_RX_BG_0_LB_TRUNC_FRAME_L 0x96a0 +#define MPS_STAT_RX_BG_0_LB_TRUNC_FRAME_H 0x96a4 +#define MPS_STAT_RX_BG_1_LB_TRUNC_FRAME_L 0x96a8 +#define MPS_STAT_RX_BG_1_LB_TRUNC_FRAME_H 0x96ac +#define MPS_STAT_RX_BG_2_LB_TRUNC_FRAME_L 0x96b0 +#define MPS_STAT_RX_BG_2_LB_TRUNC_FRAME_H 0x96b4 +#define MPS_STAT_RX_BG_3_LB_TRUNC_FRAME_L 0x96b8 +#define MPS_STAT_RX_BG_3_LB_TRUNC_FRAME_H 0x96bc + +#define MPS_TRC_CFG_A 0x9800 + +#define TRCFIFOEMPTY_S 4 +#define TRCFIFOEMPTY_V(x) ((x) << TRCFIFOEMPTY_S) +#define TRCFIFOEMPTY_F TRCFIFOEMPTY_V(1U) + +#define TRCIGNOREDROPINPUT_S 3 +#define TRCIGNOREDROPINPUT_V(x) ((x) << TRCIGNOREDROPINPUT_S) +#define TRCIGNOREDROPINPUT_F TRCIGNOREDROPINPUT_V(1U) + +#define TRCKEEPDUPLICATES_S 2 +#define TRCKEEPDUPLICATES_V(x) ((x) << TRCKEEPDUPLICATES_S) +#define TRCKEEPDUPLICATES_F TRCKEEPDUPLICATES_V(1U) + +#define TRCEN_S 1 +#define TRCEN_V(x) ((x) << TRCEN_S) +#define TRCEN_F TRCEN_V(1U) + +#define TRCMULTIFILTER_S 0 +#define TRCMULTIFILTER_V(x) ((x) << TRCMULTIFILTER_S) +#define TRCMULTIFILTER_F TRCMULTIFILTER_V(1U) + +#define MPS_TRC_RSS_CONTROL_A 0x9808 +#define MPS_TRC_FILTER1_RSS_CONTROL_A 0x9ff4 +#define MPS_TRC_FILTER2_RSS_CONTROL_A 0x9ffc +#define MPS_TRC_FILTER3_RSS_CONTROL_A 0xa004 +#define MPS_T5_TRC_RSS_CONTROL_A 0xa00c + +#define RSSCONTROL_S 16 +#define RSSCONTROL_V(x) ((x) << RSSCONTROL_S) + +#define QUEUENUMBER_S 0 +#define QUEUENUMBER_V(x) ((x) << QUEUENUMBER_S) + +#define TFINVERTMATCH_S 24 +#define TFINVERTMATCH_V(x) ((x) << TFINVERTMATCH_S) +#define TFINVERTMATCH_F TFINVERTMATCH_V(1U) + +#define TFEN_S 22 +#define TFEN_V(x) ((x) << TFEN_S) +#define TFEN_F TFEN_V(1U) + +#define TFPORT_S 18 +#define TFPORT_M 0xfU +#define TFPORT_V(x) ((x) << TFPORT_S) +#define TFPORT_G(x) (((x) >> TFPORT_S) & TFPORT_M) + +#define TFLENGTH_S 8 +#define TFLENGTH_M 0x1fU +#define TFLENGTH_V(x) ((x) << TFLENGTH_S) +#define TFLENGTH_G(x) (((x) >> TFLENGTH_S) & TFLENGTH_M) + +#define TFOFFSET_S 0 +#define TFOFFSET_M 0x1fU +#define TFOFFSET_V(x) ((x) << TFOFFSET_S) +#define TFOFFSET_G(x) (((x) >> TFOFFSET_S) & TFOFFSET_M) + +#define T5_TFINVERTMATCH_S 25 +#define T5_TFINVERTMATCH_V(x) ((x) << T5_TFINVERTMATCH_S) +#define T5_TFINVERTMATCH_F T5_TFINVERTMATCH_V(1U) + +#define T5_TFEN_S 23 +#define T5_TFEN_V(x) ((x) << T5_TFEN_S) +#define T5_TFEN_F T5_TFEN_V(1U) + +#define T5_TFPORT_S 18 +#define T5_TFPORT_M 0x1fU +#define T5_TFPORT_V(x) ((x) << T5_TFPORT_S) +#define T5_TFPORT_G(x) (((x) >> T5_TFPORT_S) & T5_TFPORT_M) + +#define MPS_TRC_FILTER_MATCH_CTL_A_A 0x9810 +#define MPS_TRC_FILTER_MATCH_CTL_B_A 0x9820 + +#define TFMINPKTSIZE_S 16 +#define TFMINPKTSIZE_M 0x1ffU +#define TFMINPKTSIZE_V(x) ((x) << TFMINPKTSIZE_S) +#define TFMINPKTSIZE_G(x) (((x) >> TFMINPKTSIZE_S) & TFMINPKTSIZE_M) + +#define TFCAPTUREMAX_S 0 +#define TFCAPTUREMAX_M 0x3fffU +#define TFCAPTUREMAX_V(x) ((x) << TFCAPTUREMAX_S) +#define TFCAPTUREMAX_G(x) (((x) >> TFCAPTUREMAX_S) & TFCAPTUREMAX_M) + +#define MPS_TRC_FILTER0_MATCH_A 0x9c00 +#define MPS_TRC_FILTER0_DONT_CARE_A 0x9c80 +#define MPS_TRC_FILTER1_MATCH_A 0x9d00 + +#define TP_RSS_CONFIG_A 0x7df0 + 
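+/*
+ * Composition sketch for the trace-filter macros above (illustrative;
+ * write_reg32() is a hypothetical 32-bit MMIO write helper).  Multi-bit
+ * fields are placed with _V() and single-bit flags with _F, then OR-ed
+ * into one register word:
+ *
+ *	write_reg32(regs + MPS_TRC_FILTER_MATCH_CTL_B_A,
+ *		    TFMINPKTSIZE_V(64) | TFCAPTUREMAX_V(128));
+ *	write_reg32(regs + MPS_TRC_FILTER_MATCH_CTL_A_A,
+ *		    TFEN_F | TFPORT_V(1) | TFLENGTH_V(4));
+ */
+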
+#define TNL4TUPENIPV6_S 31 +#define TNL4TUPENIPV6_V(x) ((x) << TNL4TUPENIPV6_S) +#define TNL4TUPENIPV6_F TNL4TUPENIPV6_V(1U) + +#define TNL2TUPENIPV6_S 30 +#define TNL2TUPENIPV6_V(x) ((x) << TNL2TUPENIPV6_S) +#define TNL2TUPENIPV6_F TNL2TUPENIPV6_V(1U) + +#define TNL4TUPENIPV4_S 29 +#define TNL4TUPENIPV4_V(x) ((x) << TNL4TUPENIPV4_S) +#define TNL4TUPENIPV4_F TNL4TUPENIPV4_V(1U) + +#define TNL2TUPENIPV4_S 28 +#define TNL2TUPENIPV4_V(x) ((x) << TNL2TUPENIPV4_S) +#define TNL2TUPENIPV4_F TNL2TUPENIPV4_V(1U) + +#define TNLTCPSEL_S 27 +#define TNLTCPSEL_V(x) ((x) << TNLTCPSEL_S) +#define TNLTCPSEL_F TNLTCPSEL_V(1U) + +#define TNLIP6SEL_S 26 +#define TNLIP6SEL_V(x) ((x) << TNLIP6SEL_S) +#define TNLIP6SEL_F TNLIP6SEL_V(1U) + +#define TNLVRTSEL_S 25 +#define TNLVRTSEL_V(x) ((x) << TNLVRTSEL_S) +#define TNLVRTSEL_F TNLVRTSEL_V(1U) + +#define TNLMAPEN_S 24 +#define TNLMAPEN_V(x) ((x) << TNLMAPEN_S) +#define TNLMAPEN_F TNLMAPEN_V(1U) + +#define OFDHASHSAVE_S 19 +#define OFDHASHSAVE_V(x) ((x) << OFDHASHSAVE_S) +#define OFDHASHSAVE_F OFDHASHSAVE_V(1U) + +#define OFDVRTSEL_S 18 +#define OFDVRTSEL_V(x) ((x) << OFDVRTSEL_S) +#define OFDVRTSEL_F OFDVRTSEL_V(1U) + +#define OFDMAPEN_S 17 +#define OFDMAPEN_V(x) ((x) << OFDMAPEN_S) +#define OFDMAPEN_F OFDMAPEN_V(1U) + +#define OFDLKPEN_S 16 +#define OFDLKPEN_V(x) ((x) << OFDLKPEN_S) +#define OFDLKPEN_F OFDLKPEN_V(1U) + +#define SYN4TUPENIPV6_S 15 +#define SYN4TUPENIPV6_V(x) ((x) << SYN4TUPENIPV6_S) +#define SYN4TUPENIPV6_F SYN4TUPENIPV6_V(1U) + +#define SYN2TUPENIPV6_S 14 +#define SYN2TUPENIPV6_V(x) ((x) << SYN2TUPENIPV6_S) +#define SYN2TUPENIPV6_F SYN2TUPENIPV6_V(1U) + +#define SYN4TUPENIPV4_S 13 +#define SYN4TUPENIPV4_V(x) ((x) << SYN4TUPENIPV4_S) +#define SYN4TUPENIPV4_F SYN4TUPENIPV4_V(1U) + +#define SYN2TUPENIPV4_S 12 +#define SYN2TUPENIPV4_V(x) ((x) << SYN2TUPENIPV4_S) +#define SYN2TUPENIPV4_F SYN2TUPENIPV4_V(1U) + +#define SYNIP6SEL_S 11 +#define SYNIP6SEL_V(x) ((x) << SYNIP6SEL_S) +#define SYNIP6SEL_F SYNIP6SEL_V(1U) + +#define SYNVRTSEL_S 10 +#define SYNVRTSEL_V(x) ((x) << SYNVRTSEL_S) +#define SYNVRTSEL_F SYNVRTSEL_V(1U) + +#define SYNMAPEN_S 9 +#define SYNMAPEN_V(x) ((x) << SYNMAPEN_S) +#define SYNMAPEN_F SYNMAPEN_V(1U) + +#define SYNLKPEN_S 8 +#define SYNLKPEN_V(x) ((x) << SYNLKPEN_S) +#define SYNLKPEN_F SYNLKPEN_V(1U) + +#define CHANNELENABLE_S 7 +#define CHANNELENABLE_V(x) ((x) << CHANNELENABLE_S) +#define CHANNELENABLE_F CHANNELENABLE_V(1U) + +#define PORTENABLE_S 6 +#define PORTENABLE_V(x) ((x) << PORTENABLE_S) +#define PORTENABLE_F PORTENABLE_V(1U) + +#define TNLALLLOOKUP_S 5 +#define TNLALLLOOKUP_V(x) ((x) << TNLALLLOOKUP_S) +#define TNLALLLOOKUP_F TNLALLLOOKUP_V(1U) + +#define VIRTENABLE_S 4 +#define VIRTENABLE_V(x) ((x) << VIRTENABLE_S) +#define VIRTENABLE_F VIRTENABLE_V(1U) + +#define CONGESTIONENABLE_S 3 +#define CONGESTIONENABLE_V(x) ((x) << CONGESTIONENABLE_S) +#define CONGESTIONENABLE_F CONGESTIONENABLE_V(1U) + +#define HASHTOEPLITZ_S 2 +#define HASHTOEPLITZ_V(x) ((x) << HASHTOEPLITZ_S) +#define HASHTOEPLITZ_F HASHTOEPLITZ_V(1U) + +#define UDPENABLE_S 1 +#define UDPENABLE_V(x) ((x) << UDPENABLE_S) +#define UDPENABLE_F UDPENABLE_V(1U) + +#define DISABLE_S 0 +#define DISABLE_V(x) ((x) << DISABLE_S) +#define DISABLE_F DISABLE_V(1U) + +#define TP_RSS_CONFIG_TNL_A 0x7df4 + +#define MASKSIZE_S 28 +#define MASKSIZE_M 0xfU +#define MASKSIZE_V(x) ((x) << MASKSIZE_S) +#define MASKSIZE_G(x) (((x) >> MASKSIZE_S) & MASKSIZE_M) + +#define MASKFILTER_S 16 +#define MASKFILTER_M 0x7ffU +#define MASKFILTER_V(x) ((x) << MASKFILTER_S) +#define 
MASKFILTER_G(x) (((x) >> MASKFILTER_S) & MASKFILTER_M) + +#define USEWIRECH_S 0 +#define USEWIRECH_V(x) ((x) << USEWIRECH_S) +#define USEWIRECH_F USEWIRECH_V(1U) + +#define HASHALL_S 2 +#define HASHALL_V(x) ((x) << HASHALL_S) +#define HASHALL_F HASHALL_V(1U) + +#define HASHETH_S 1 +#define HASHETH_V(x) ((x) << HASHETH_S) +#define HASHETH_F HASHETH_V(1U) + +#define TP_RSS_CONFIG_OFD_A 0x7df8 + +#define RRCPLMAPEN_S 20 +#define RRCPLMAPEN_V(x) ((x) << RRCPLMAPEN_S) +#define RRCPLMAPEN_F RRCPLMAPEN_V(1U) + +#define RRCPLQUEWIDTH_S 16 +#define RRCPLQUEWIDTH_M 0xfU +#define RRCPLQUEWIDTH_V(x) ((x) << RRCPLQUEWIDTH_S) +#define RRCPLQUEWIDTH_G(x) (((x) >> RRCPLQUEWIDTH_S) & RRCPLQUEWIDTH_M) + +#define TP_RSS_CONFIG_SYN_A 0x7dfc +#define TP_RSS_CONFIG_VRT_A 0x7e00 + +#define VFRDRG_S 25 +#define VFRDRG_V(x) ((x) << VFRDRG_S) +#define VFRDRG_F VFRDRG_V(1U) + +#define VFRDEN_S 24 +#define VFRDEN_V(x) ((x) << VFRDEN_S) +#define VFRDEN_F VFRDEN_V(1U) + +#define VFPERREN_S 23 +#define VFPERREN_V(x) ((x) << VFPERREN_S) +#define VFPERREN_F VFPERREN_V(1U) + +#define KEYPERREN_S 22 +#define KEYPERREN_V(x) ((x) << KEYPERREN_S) +#define KEYPERREN_F KEYPERREN_V(1U) + +#define DISABLEVLAN_S 21 +#define DISABLEVLAN_V(x) ((x) << DISABLEVLAN_S) +#define DISABLEVLAN_F DISABLEVLAN_V(1U) + +#define ENABLEUP0_S 20 +#define ENABLEUP0_V(x) ((x) << ENABLEUP0_S) +#define ENABLEUP0_F ENABLEUP0_V(1U) + +#define HASHDELAY_S 16 +#define HASHDELAY_M 0xfU +#define HASHDELAY_V(x) ((x) << HASHDELAY_S) +#define HASHDELAY_G(x) (((x) >> HASHDELAY_S) & HASHDELAY_M) + +#define VFWRADDR_S 8 +#define VFWRADDR_M 0x7fU +#define VFWRADDR_V(x) ((x) << VFWRADDR_S) +#define VFWRADDR_G(x) (((x) >> VFWRADDR_S) & VFWRADDR_M) + +#define KEYMODE_S 6 +#define KEYMODE_M 0x3U +#define KEYMODE_V(x) ((x) << KEYMODE_S) +#define KEYMODE_G(x) (((x) >> KEYMODE_S) & KEYMODE_M) + +#define VFWREN_S 5 +#define VFWREN_V(x) ((x) << VFWREN_S) +#define VFWREN_F VFWREN_V(1U) + +#define KEYWREN_S 4 +#define KEYWREN_V(x) ((x) << KEYWREN_S) +#define KEYWREN_F KEYWREN_V(1U) + +#define KEYWRADDR_S 0 +#define KEYWRADDR_M 0xfU +#define KEYWRADDR_V(x) ((x) << KEYWRADDR_S) +#define KEYWRADDR_G(x) (((x) >> KEYWRADDR_S) & KEYWRADDR_M) + +#define KEYWRADDRX_S 30 +#define KEYWRADDRX_M 0x3U +#define KEYWRADDRX_V(x) ((x) << KEYWRADDRX_S) +#define KEYWRADDRX_G(x) (((x) >> KEYWRADDRX_S) & KEYWRADDRX_M) + +#define KEYEXTEND_S 26 +#define KEYEXTEND_V(x) ((x) << KEYEXTEND_S) +#define KEYEXTEND_F KEYEXTEND_V(1U) + +#define LKPIDXSIZE_S 24 +#define LKPIDXSIZE_M 0x3U +#define LKPIDXSIZE_V(x) ((x) << LKPIDXSIZE_S) +#define LKPIDXSIZE_G(x) (((x) >> LKPIDXSIZE_S) & LKPIDXSIZE_M) + +#define TP_RSS_VFL_CONFIG_A 0x3a +#define TP_RSS_VFH_CONFIG_A 0x3b + +#define ENABLEUDPHASH_S 31 +#define ENABLEUDPHASH_V(x) ((x) << ENABLEUDPHASH_S) +#define ENABLEUDPHASH_F ENABLEUDPHASH_V(1U) + +#define VFUPEN_S 30 +#define VFUPEN_V(x) ((x) << VFUPEN_S) +#define VFUPEN_F VFUPEN_V(1U) + +#define VFVLNEX_S 28 +#define VFVLNEX_V(x) ((x) << VFVLNEX_S) +#define VFVLNEX_F VFVLNEX_V(1U) + +#define VFPRTEN_S 27 +#define VFPRTEN_V(x) ((x) << VFPRTEN_S) +#define VFPRTEN_F VFPRTEN_V(1U) + +#define VFCHNEN_S 26 +#define VFCHNEN_V(x) ((x) << VFCHNEN_S) +#define VFCHNEN_F VFCHNEN_V(1U) + +#define DEFAULTQUEUE_S 16 +#define DEFAULTQUEUE_M 0x3ffU +#define DEFAULTQUEUE_G(x) (((x) >> DEFAULTQUEUE_S) & DEFAULTQUEUE_M) + +#define VFIP6TWOTUPEN_S 6 +#define VFIP6TWOTUPEN_V(x) ((x) << VFIP6TWOTUPEN_S) +#define VFIP6TWOTUPEN_F VFIP6TWOTUPEN_V(1U) + +#define VFIP4FOURTUPEN_S 5 +#define VFIP4FOURTUPEN_V(x) ((x) << VFIP4FOURTUPEN_S) 
+#define VFIP4FOURTUPEN_F VFIP4FOURTUPEN_V(1U) + +#define VFIP4TWOTUPEN_S 4 +#define VFIP4TWOTUPEN_V(x) ((x) << VFIP4TWOTUPEN_S) +#define VFIP4TWOTUPEN_F VFIP4TWOTUPEN_V(1U) + +#define KEYINDEX_S 0 +#define KEYINDEX_M 0xfU +#define KEYINDEX_G(x) (((x) >> KEYINDEX_S) & KEYINDEX_M) + +#define MAPENABLE_S 31 +#define MAPENABLE_V(x) ((x) << MAPENABLE_S) +#define MAPENABLE_F MAPENABLE_V(1U) + +#define CHNENABLE_S 30 +#define CHNENABLE_V(x) ((x) << CHNENABLE_S) +#define CHNENABLE_F CHNENABLE_V(1U) + +#define PRTENABLE_S 29 +#define PRTENABLE_V(x) ((x) << PRTENABLE_S) +#define PRTENABLE_F PRTENABLE_V(1U) + +#define UDPFOURTUPEN_S 28 +#define UDPFOURTUPEN_V(x) ((x) << UDPFOURTUPEN_S) +#define UDPFOURTUPEN_F UDPFOURTUPEN_V(1U) + +#define IP6FOURTUPEN_S 27 +#define IP6FOURTUPEN_V(x) ((x) << IP6FOURTUPEN_S) +#define IP6FOURTUPEN_F IP6FOURTUPEN_V(1U) + +#define IP6TWOTUPEN_S 26 +#define IP6TWOTUPEN_V(x) ((x) << IP6TWOTUPEN_S) +#define IP6TWOTUPEN_F IP6TWOTUPEN_V(1U) + +#define IP4FOURTUPEN_S 25 +#define IP4FOURTUPEN_V(x) ((x) << IP4FOURTUPEN_S) +#define IP4FOURTUPEN_F IP4FOURTUPEN_V(1U) + +#define IP4TWOTUPEN_S 24 +#define IP4TWOTUPEN_V(x) ((x) << IP4TWOTUPEN_S) +#define IP4TWOTUPEN_F IP4TWOTUPEN_V(1U) + +#define IVFWIDTH_S 20 +#define IVFWIDTH_M 0xfU +#define IVFWIDTH_V(x) ((x) << IVFWIDTH_S) +#define IVFWIDTH_G(x) (((x) >> IVFWIDTH_S) & IVFWIDTH_M) + +#define CH1DEFAULTQUEUE_S 10 +#define CH1DEFAULTQUEUE_M 0x3ffU +#define CH1DEFAULTQUEUE_V(x) ((x) << CH1DEFAULTQUEUE_S) +#define CH1DEFAULTQUEUE_G(x) (((x) >> CH1DEFAULTQUEUE_S) & CH1DEFAULTQUEUE_M) + +#define CH0DEFAULTQUEUE_S 0 +#define CH0DEFAULTQUEUE_M 0x3ffU +#define CH0DEFAULTQUEUE_V(x) ((x) << CH0DEFAULTQUEUE_S) +#define CH0DEFAULTQUEUE_G(x) (((x) >> CH0DEFAULTQUEUE_S) & CH0DEFAULTQUEUE_M) + +#define VFLKPIDX_S 8 +#define VFLKPIDX_M 0xffU +#define VFLKPIDX_G(x) (((x) >> VFLKPIDX_S) & VFLKPIDX_M) + +#define T6_VFWRADDR_S 8 +#define T6_VFWRADDR_M 0xffU +#define T6_VFWRADDR_V(x) ((x) << T6_VFWRADDR_S) +#define T6_VFWRADDR_G(x) (((x) >> T6_VFWRADDR_S) & T6_VFWRADDR_M) + +#define TP_RSS_CONFIG_CNG_A 0x7e04 +#define TP_RSS_SECRET_KEY0_A 0x40 +#define TP_RSS_PF0_CONFIG_A 0x30 +#define TP_RSS_PF_MAP_A 0x38 +#define TP_RSS_PF_MSK_A 0x39 + +#define PF1LKPIDX_S 3 + +#define PF0LKPIDX_M 0x7U + +#define PF1MSKSIZE_S 4 +#define PF1MSKSIZE_M 0xfU + +#define CHNCOUNT3_S 31 +#define CHNCOUNT3_V(x) ((x) << CHNCOUNT3_S) +#define CHNCOUNT3_F CHNCOUNT3_V(1U) + +#define CHNCOUNT2_S 30 +#define CHNCOUNT2_V(x) ((x) << CHNCOUNT2_S) +#define CHNCOUNT2_F CHNCOUNT2_V(1U) + +#define CHNCOUNT1_S 29 +#define CHNCOUNT1_V(x) ((x) << CHNCOUNT1_S) +#define CHNCOUNT1_F CHNCOUNT1_V(1U) + +#define CHNCOUNT0_S 28 +#define CHNCOUNT0_V(x) ((x) << CHNCOUNT0_S) +#define CHNCOUNT0_F CHNCOUNT0_V(1U) + +#define CHNUNDFLOW3_S 27 +#define CHNUNDFLOW3_V(x) ((x) << CHNUNDFLOW3_S) +#define CHNUNDFLOW3_F CHNUNDFLOW3_V(1U) + +#define CHNUNDFLOW2_S 26 +#define CHNUNDFLOW2_V(x) ((x) << CHNUNDFLOW2_S) +#define CHNUNDFLOW2_F CHNUNDFLOW2_V(1U) + +#define CHNUNDFLOW1_S 25 +#define CHNUNDFLOW1_V(x) ((x) << CHNUNDFLOW1_S) +#define CHNUNDFLOW1_F CHNUNDFLOW1_V(1U) + +#define CHNUNDFLOW0_S 24 +#define CHNUNDFLOW0_V(x) ((x) << CHNUNDFLOW0_S) +#define CHNUNDFLOW0_F CHNUNDFLOW0_V(1U) + +#define RSTCHN3_S 19 +#define RSTCHN3_V(x) ((x) << RSTCHN3_S) +#define RSTCHN3_F RSTCHN3_V(1U) + +#define RSTCHN2_S 18 +#define RSTCHN2_V(x) ((x) << RSTCHN2_S) +#define RSTCHN2_F RSTCHN2_V(1U) + +#define RSTCHN1_S 17 +#define RSTCHN1_V(x) ((x) << RSTCHN1_S) +#define RSTCHN1_F RSTCHN1_V(1U) + +#define RSTCHN0_S 16 +#define 
RSTCHN0_V(x) ((x) << RSTCHN0_S) +#define RSTCHN0_F RSTCHN0_V(1U) + +#define UPDVLD_S 15 +#define UPDVLD_V(x) ((x) << UPDVLD_S) +#define UPDVLD_F UPDVLD_V(1U) + +#define XOFF_S 14 +#define XOFF_V(x) ((x) << XOFF_S) +#define XOFF_F XOFF_V(1U) + +#define UPDCHN3_S 13 +#define UPDCHN3_V(x) ((x) << UPDCHN3_S) +#define UPDCHN3_F UPDCHN3_V(1U) + +#define UPDCHN2_S 12 +#define UPDCHN2_V(x) ((x) << UPDCHN2_S) +#define UPDCHN2_F UPDCHN2_V(1U) + +#define UPDCHN1_S 11 +#define UPDCHN1_V(x) ((x) << UPDCHN1_S) +#define UPDCHN1_F UPDCHN1_V(1U) + +#define UPDCHN0_S 10 +#define UPDCHN0_V(x) ((x) << UPDCHN0_S) +#define UPDCHN0_F UPDCHN0_V(1U) + +#define QUEUE_S 0 +#define QUEUE_M 0x3ffU +#define QUEUE_V(x) ((x) << QUEUE_S) +#define QUEUE_G(x) (((x) >> QUEUE_S) & QUEUE_M) + +#define MPS_TRC_INT_CAUSE_A 0x985c + +#define MISCPERR_S 8 +#define MISCPERR_V(x) ((x) << MISCPERR_S) +#define MISCPERR_F MISCPERR_V(1U) + +#define PKTFIFO_S 4 +#define PKTFIFO_M 0xfU +#define PKTFIFO_V(x) ((x) << PKTFIFO_S) + +#define FILTMEM_S 0 +#define FILTMEM_M 0xfU +#define FILTMEM_V(x) ((x) << FILTMEM_S) + +#define MPS_CLS_INT_CAUSE_A 0xd028 + +#define HASHSRAM_S 2 +#define HASHSRAM_V(x) ((x) << HASHSRAM_S) +#define HASHSRAM_F HASHSRAM_V(1U) + +#define MATCHTCAM_S 1 +#define MATCHTCAM_V(x) ((x) << MATCHTCAM_S) +#define MATCHTCAM_F MATCHTCAM_V(1U) + +#define MATCHSRAM_S 0 +#define MATCHSRAM_V(x) ((x) << MATCHSRAM_S) +#define MATCHSRAM_F MATCHSRAM_V(1U) + +#define MPS_RX_PG_RSV0_A 0x11010 +#define MPS_RX_PG_RSV4_A 0x11020 +#define MPS_RX_PERR_INT_CAUSE_A 0x11074 +#define MPS_RX_MAC_BG_PG_CNT0_A 0x11208 +#define MPS_RX_LPBK_BG_PG_CNT0_A 0x11218 + +#define MPS_CLS_TCAM_Y_L_A 0xf000 +#define MPS_CLS_TCAM_DATA0_A 0xf000 +#define MPS_CLS_TCAM_DATA1_A 0xf004 + +#define VIDL_S 16 +#define VIDL_M 0xffffU +#define VIDL_G(x) (((x) >> VIDL_S) & VIDL_M) + +#define DATALKPTYPE_S 10 +#define DATALKPTYPE_M 0x3U +#define DATALKPTYPE_G(x) (((x) >> DATALKPTYPE_S) & DATALKPTYPE_M) + +#define DATAPORTNUM_S 12 +#define DATAPORTNUM_M 0xfU +#define DATAPORTNUM_G(x) (((x) >> DATAPORTNUM_S) & DATAPORTNUM_M) + +#define DATADIPHIT_S 8 +#define DATADIPHIT_V(x) ((x) << DATADIPHIT_S) +#define DATADIPHIT_F DATADIPHIT_V(1U) + +#define DATAVIDH2_S 7 +#define DATAVIDH2_V(x) ((x) << DATAVIDH2_S) +#define DATAVIDH2_F DATAVIDH2_V(1U) + +#define DATAVIDH1_S 0 +#define DATAVIDH1_M 0x7fU +#define DATAVIDH1_G(x) (((x) >> DATAVIDH1_S) & DATAVIDH1_M) + +#define USED_S 16 +#define USED_M 0x7ffU +#define USED_G(x) (((x) >> USED_S) & USED_M) + +#define ALLOC_S 0 +#define ALLOC_M 0x7ffU +#define ALLOC_G(x) (((x) >> ALLOC_S) & ALLOC_M) + +#define T5_USED_S 16 +#define T5_USED_M 0xfffU +#define T5_USED_G(x) (((x) >> T5_USED_S) & T5_USED_M) + +#define T5_ALLOC_S 0 +#define T5_ALLOC_M 0xfffU +#define T5_ALLOC_G(x) (((x) >> T5_ALLOC_S) & T5_ALLOC_M) + +#define DMACH_S 0 +#define DMACH_M 0xffffU +#define DMACH_G(x) (((x) >> DMACH_S) & DMACH_M) + +#define MPS_CLS_TCAM_X_L_A 0xf008 +#define MPS_CLS_TCAM_DATA2_CTL_A 0xf008 + +#define CTLCMDTYPE_S 31 +#define CTLCMDTYPE_V(x) ((x) << CTLCMDTYPE_S) +#define CTLCMDTYPE_F CTLCMDTYPE_V(1U) + +#define CTLTCAMSEL_S 25 +#define CTLTCAMSEL_V(x) ((x) << CTLTCAMSEL_S) + +#define CTLTCAMINDEX_S 17 +#define CTLTCAMINDEX_V(x) ((x) << CTLTCAMINDEX_S) + +#define CTLXYBITSEL_S 16 +#define CTLXYBITSEL_V(x) ((x) << CTLXYBITSEL_S) + +#define MPS_CLS_TCAM_Y_L(idx) (MPS_CLS_TCAM_Y_L_A + (idx) * 16) +#define NUM_MPS_CLS_TCAM_Y_L_INSTANCES 512 + +#define MPS_CLS_TCAM_X_L(idx) (MPS_CLS_TCAM_X_L_A + (idx) * 16) +#define NUM_MPS_CLS_TCAM_X_L_INSTANCES 512 + 
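+/*
+ * The TCAM Y/X words form per-entry register arrays with a 16-byte
+ * stride, as encoded by the (idx) macros above.  A bounds-checked read
+ * sketch (read_reg32() is a hypothetical helper):
+ *
+ *	if (idx < NUM_MPS_CLS_TCAM_Y_L_INSTANCES) {
+ *		u32 y = read_reg32(regs + MPS_CLS_TCAM_Y_L(idx));
+ *		u32 x = read_reg32(regs + MPS_CLS_TCAM_X_L(idx));
+ *	}
+ */
+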
+#define MPS_CLS_SRAM_L_A 0xe000 + +#define T6_MULTILISTEN0_S 26 + +#define T6_SRAM_PRIO3_S 23 +#define T6_SRAM_PRIO3_M 0x7U +#define T6_SRAM_PRIO3_G(x) (((x) >> T6_SRAM_PRIO3_S) & T6_SRAM_PRIO3_M) + +#define T6_SRAM_PRIO2_S 20 +#define T6_SRAM_PRIO2_M 0x7U +#define T6_SRAM_PRIO2_G(x) (((x) >> T6_SRAM_PRIO2_S) & T6_SRAM_PRIO2_M) + +#define T6_SRAM_PRIO1_S 17 +#define T6_SRAM_PRIO1_M 0x7U +#define T6_SRAM_PRIO1_G(x) (((x) >> T6_SRAM_PRIO1_S) & T6_SRAM_PRIO1_M) + +#define T6_SRAM_PRIO0_S 14 +#define T6_SRAM_PRIO0_M 0x7U +#define T6_SRAM_PRIO0_G(x) (((x) >> T6_SRAM_PRIO0_S) & T6_SRAM_PRIO0_M) + +#define T6_SRAM_VLD_S 13 +#define T6_SRAM_VLD_V(x) ((x) << T6_SRAM_VLD_S) +#define T6_SRAM_VLD_F T6_SRAM_VLD_V(1U) + +#define T6_REPLICATE_S 12 +#define T6_REPLICATE_V(x) ((x) << T6_REPLICATE_S) +#define T6_REPLICATE_F T6_REPLICATE_V(1U) + +#define T6_PF_S 9 +#define T6_PF_M 0x7U +#define T6_PF_G(x) (((x) >> T6_PF_S) & T6_PF_M) + +#define T6_VF_VALID_S 8 +#define T6_VF_VALID_V(x) ((x) << T6_VF_VALID_S) +#define T6_VF_VALID_F T6_VF_VALID_V(1U) + +#define T6_VF_S 0 +#define T6_VF_M 0xffU +#define T6_VF_G(x) (((x) >> T6_VF_S) & T6_VF_M) + +#define MPS_CLS_SRAM_H_A 0xe004 + +#define MPS_CLS_SRAM_L(idx) (MPS_CLS_SRAM_L_A + (idx) * 8) +#define NUM_MPS_CLS_SRAM_L_INSTANCES 336 + +#define MPS_CLS_SRAM_H(idx) (MPS_CLS_SRAM_H_A + (idx) * 8) +#define NUM_MPS_CLS_SRAM_H_INSTANCES 336 + +#define MULTILISTEN0_S 25 + +#define REPLICATE_S 11 +#define REPLICATE_V(x) ((x) << REPLICATE_S) +#define REPLICATE_F REPLICATE_V(1U) + +#define PF_S 8 +#define PF_M 0x7U +#define PF_G(x) (((x) >> PF_S) & PF_M) + +#define VF_VALID_S 7 +#define VF_VALID_V(x) ((x) << VF_VALID_S) +#define VF_VALID_F VF_VALID_V(1U) + +#define VF_S 0 +#define VF_M 0x7fU +#define VF_G(x) (((x) >> VF_S) & VF_M) + +#define SRAM_PRIO3_S 22 +#define SRAM_PRIO3_M 0x7U +#define SRAM_PRIO3_G(x) (((x) >> SRAM_PRIO3_S) & SRAM_PRIO3_M) + +#define SRAM_PRIO2_S 19 +#define SRAM_PRIO2_M 0x7U +#define SRAM_PRIO2_G(x) (((x) >> SRAM_PRIO2_S) & SRAM_PRIO2_M) + +#define SRAM_PRIO1_S 16 +#define SRAM_PRIO1_M 0x7U +#define SRAM_PRIO1_G(x) (((x) >> SRAM_PRIO1_S) & SRAM_PRIO1_M) + +#define SRAM_PRIO0_S 13 +#define SRAM_PRIO0_M 0x7U +#define SRAM_PRIO0_G(x) (((x) >> SRAM_PRIO0_S) & SRAM_PRIO0_M) + +#define SRAM_VLD_S 12 +#define SRAM_VLD_V(x) ((x) << SRAM_VLD_S) +#define SRAM_VLD_F SRAM_VLD_V(1U) + +#define PORTMAP_S 0 +#define PORTMAP_M 0xfU +#define PORTMAP_G(x) (((x) >> PORTMAP_S) & PORTMAP_M) + +#define CPL_INTR_CAUSE_A 0x19054 + +#define CIM_OP_MAP_PERR_S 5 +#define CIM_OP_MAP_PERR_V(x) ((x) << CIM_OP_MAP_PERR_S) +#define CIM_OP_MAP_PERR_F CIM_OP_MAP_PERR_V(1U) + +#define CIM_OVFL_ERROR_S 4 +#define CIM_OVFL_ERROR_V(x) ((x) << CIM_OVFL_ERROR_S) +#define CIM_OVFL_ERROR_F CIM_OVFL_ERROR_V(1U) + +#define TP_FRAMING_ERROR_S 3 +#define TP_FRAMING_ERROR_V(x) ((x) << TP_FRAMING_ERROR_S) +#define TP_FRAMING_ERROR_F TP_FRAMING_ERROR_V(1U) + +#define SGE_FRAMING_ERROR_S 2 +#define SGE_FRAMING_ERROR_V(x) ((x) << SGE_FRAMING_ERROR_S) +#define SGE_FRAMING_ERROR_F SGE_FRAMING_ERROR_V(1U) + +#define CIM_FRAMING_ERROR_S 1 +#define CIM_FRAMING_ERROR_V(x) ((x) << CIM_FRAMING_ERROR_S) +#define CIM_FRAMING_ERROR_F CIM_FRAMING_ERROR_V(1U) + +#define ZERO_SWITCH_ERROR_S 0 +#define ZERO_SWITCH_ERROR_V(x) ((x) << ZERO_SWITCH_ERROR_S) +#define ZERO_SWITCH_ERROR_F ZERO_SWITCH_ERROR_V(1U) + +#define SMB_INT_CAUSE_A 0x19090 + +#define MSTTXFIFOPARINT_S 21 +#define MSTTXFIFOPARINT_V(x) ((x) << MSTTXFIFOPARINT_S) +#define MSTTXFIFOPARINT_F MSTTXFIFOPARINT_V(1U) + +#define MSTRXFIFOPARINT_S 20 
+#define MSTRXFIFOPARINT_V(x) ((x) << MSTRXFIFOPARINT_S) +#define MSTRXFIFOPARINT_F MSTRXFIFOPARINT_V(1U) + +#define SLVFIFOPARINT_S 19 +#define SLVFIFOPARINT_V(x) ((x) << SLVFIFOPARINT_S) +#define SLVFIFOPARINT_F SLVFIFOPARINT_V(1U) + +#define ULP_RX_INT_CAUSE_A 0x19158 +#define ULP_RX_ISCSI_LLIMIT_A 0x1915c +#define ULP_RX_ISCSI_ULIMIT_A 0x19160 +#define ULP_RX_ISCSI_TAGMASK_A 0x19164 +#define ULP_RX_ISCSI_PSZ_A 0x19168 +#define ULP_RX_TDDP_LLIMIT_A 0x1916c +#define ULP_RX_TDDP_ULIMIT_A 0x19170 +#define ULP_RX_STAG_LLIMIT_A 0x1917c +#define ULP_RX_STAG_ULIMIT_A 0x19180 +#define ULP_RX_RQ_LLIMIT_A 0x19184 +#define ULP_RX_RQ_ULIMIT_A 0x19188 +#define ULP_RX_PBL_LLIMIT_A 0x1918c +#define ULP_RX_PBL_ULIMIT_A 0x19190 +#define ULP_RX_CTX_BASE_A 0x19194 +#define ULP_RX_RQUDP_LLIMIT_A 0x191a4 +#define ULP_RX_RQUDP_ULIMIT_A 0x191a8 +#define ULP_RX_LA_CTL_A 0x1923c +#define ULP_RX_LA_RDPTR_A 0x19240 +#define ULP_RX_LA_RDDATA_A 0x19244 +#define ULP_RX_LA_WRPTR_A 0x19248 + +#define HPZ3_S 24 +#define HPZ3_V(x) ((x) << HPZ3_S) + +#define HPZ2_S 16 +#define HPZ2_V(x) ((x) << HPZ2_S) + +#define HPZ1_S 8 +#define HPZ1_V(x) ((x) << HPZ1_S) + +#define HPZ0_S 0 +#define HPZ0_V(x) ((x) << HPZ0_S) + +#define ULP_RX_TDDP_PSZ_A 0x19178 + +/* registers for module SF */ +#define SF_DATA_A 0x193f8 +#define SF_OP_A 0x193fc + +#define SF_BUSY_S 31 +#define SF_BUSY_V(x) ((x) << SF_BUSY_S) +#define SF_BUSY_F SF_BUSY_V(1U) + +#define SF_LOCK_S 4 +#define SF_LOCK_V(x) ((x) << SF_LOCK_S) +#define SF_LOCK_F SF_LOCK_V(1U) + +#define SF_CONT_S 3 +#define SF_CONT_V(x) ((x) << SF_CONT_S) +#define SF_CONT_F SF_CONT_V(1U) + +#define BYTECNT_S 1 +#define BYTECNT_V(x) ((x) << BYTECNT_S) + +#define OP_S 0 +#define OP_V(x) ((x) << OP_S) +#define OP_F OP_V(1U) + +#define PL_PF_INT_CAUSE_A 0x3c0 + +#define PFSW_S 3 +#define PFSW_V(x) ((x) << PFSW_S) +#define PFSW_F PFSW_V(1U) + +#define PFCIM_S 1 +#define PFCIM_V(x) ((x) << PFCIM_S) +#define PFCIM_F PFCIM_V(1U) + +#define PL_PF_INT_ENABLE_A 0x3c4 +#define PL_PF_CTL_A 0x3c8 + +#define PL_WHOAMI_A 0x19400 + +#define SOURCEPF_S 8 +#define SOURCEPF_M 0x7U +#define SOURCEPF_G(x) (((x) >> SOURCEPF_S) & SOURCEPF_M) + +#define T6_SOURCEPF_S 9 +#define T6_SOURCEPF_M 0x7U +#define T6_SOURCEPF_G(x) (((x) >> T6_SOURCEPF_S) & T6_SOURCEPF_M) + +#define PL_INT_CAUSE_A 0x1940c + +#define ULP_TX_S 27 +#define ULP_TX_V(x) ((x) << ULP_TX_S) +#define ULP_TX_F ULP_TX_V(1U) + +#define SGE_S 26 +#define SGE_V(x) ((x) << SGE_S) +#define SGE_F SGE_V(1U) + +#define CPL_SWITCH_S 24 +#define CPL_SWITCH_V(x) ((x) << CPL_SWITCH_S) +#define CPL_SWITCH_F CPL_SWITCH_V(1U) + +#define ULP_RX_S 23 +#define ULP_RX_V(x) ((x) << ULP_RX_S) +#define ULP_RX_F ULP_RX_V(1U) + +#define PM_RX_S 22 +#define PM_RX_V(x) ((x) << PM_RX_S) +#define PM_RX_F PM_RX_V(1U) + +#define PM_TX_S 21 +#define PM_TX_V(x) ((x) << PM_TX_S) +#define PM_TX_F PM_TX_V(1U) + +#define MA_S 20 +#define MA_V(x) ((x) << MA_S) +#define MA_F MA_V(1U) + +#define TP_S 19 +#define TP_V(x) ((x) << TP_S) +#define TP_F TP_V(1U) + +#define LE_S 18 +#define LE_V(x) ((x) << LE_S) +#define LE_F LE_V(1U) + +#define EDC1_S 17 +#define EDC1_V(x) ((x) << EDC1_S) +#define EDC1_F EDC1_V(1U) + +#define EDC0_S 16 +#define EDC0_V(x) ((x) << EDC0_S) +#define EDC0_F EDC0_V(1U) + +#define MC_S 15 +#define MC_V(x) ((x) << MC_S) +#define MC_F MC_V(1U) + +#define PCIE_S 14 +#define PCIE_V(x) ((x) << PCIE_S) +#define PCIE_F PCIE_V(1U) + +#define XGMAC_KR1_S 12 +#define XGMAC_KR1_V(x) ((x) << XGMAC_KR1_S) +#define XGMAC_KR1_F XGMAC_KR1_V(1U) + +#define XGMAC_KR0_S 11 +#define 
XGMAC_KR0_V(x) ((x) << XGMAC_KR0_S) +#define XGMAC_KR0_F XGMAC_KR0_V(1U) + +#define XGMAC1_S 10 +#define XGMAC1_V(x) ((x) << XGMAC1_S) +#define XGMAC1_F XGMAC1_V(1U) + +#define XGMAC0_S 9 +#define XGMAC0_V(x) ((x) << XGMAC0_S) +#define XGMAC0_F XGMAC0_V(1U) + +#define SMB_S 8 +#define SMB_V(x) ((x) << SMB_S) +#define SMB_F SMB_V(1U) + +#define SF_S 7 +#define SF_V(x) ((x) << SF_S) +#define SF_F SF_V(1U) + +#define PL_S 6 +#define PL_V(x) ((x) << PL_S) +#define PL_F PL_V(1U) + +#define NCSI_S 5 +#define NCSI_V(x) ((x) << NCSI_S) +#define NCSI_F NCSI_V(1U) + +#define MPS_S 4 +#define MPS_V(x) ((x) << MPS_S) +#define MPS_F MPS_V(1U) + +#define CIM_S 0 +#define CIM_V(x) ((x) << CIM_S) +#define CIM_F CIM_V(1U) + +#define MC1_S 31 +#define MC1_V(x) ((x) << MC1_S) +#define MC1_F MC1_V(1U) + +#define PL_INT_ENABLE_A 0x19410 +#define PL_INT_MAP0_A 0x19414 +#define PL_RST_A 0x19428 + +#define PIORST_S 1 +#define PIORST_V(x) ((x) << PIORST_S) +#define PIORST_F PIORST_V(1U) + +#define PIORSTMODE_S 0 +#define PIORSTMODE_V(x) ((x) << PIORSTMODE_S) +#define PIORSTMODE_F PIORSTMODE_V(1U) + +#define PL_PL_INT_CAUSE_A 0x19430 + +#define FATALPERR_S 4 +#define FATALPERR_V(x) ((x) << FATALPERR_S) +#define FATALPERR_F FATALPERR_V(1U) + +#define PERRVFID_S 0 +#define PERRVFID_V(x) ((x) << PERRVFID_S) +#define PERRVFID_F PERRVFID_V(1U) + +#define PL_REV_A 0x1943c + +#define REV_S 0 +#define REV_M 0xfU +#define REV_V(x) ((x) << REV_S) +#define REV_G(x) (((x) >> REV_S) & REV_M) + +#define T6_UNKNOWNCMD_S 3 +#define T6_UNKNOWNCMD_V(x) ((x) << T6_UNKNOWNCMD_S) +#define T6_UNKNOWNCMD_F T6_UNKNOWNCMD_V(1U) + +#define T6_LIP0_S 2 +#define T6_LIP0_V(x) ((x) << T6_LIP0_S) +#define T6_LIP0_F T6_LIP0_V(1U) + +#define T6_LIPMISS_S 1 +#define T6_LIPMISS_V(x) ((x) << T6_LIPMISS_S) +#define T6_LIPMISS_F T6_LIPMISS_V(1U) + +#define LE_DB_CONFIG_A 0x19c04 +#define LE_DB_SERVER_INDEX_A 0x19c18 +#define LE_DB_SRVR_START_INDEX_A 0x19c18 +#define LE_DB_ACT_CNT_IPV4_A 0x19c20 +#define LE_DB_ACT_CNT_IPV6_A 0x19c24 +#define LE_DB_HASH_TID_BASE_A 0x19c30 +#define LE_DB_HASH_TBL_BASE_ADDR_A 0x19c30 +#define LE_DB_INT_CAUSE_A 0x19c3c +#define LE_DB_TID_HASHBASE_A 0x19df8 +#define T6_LE_DB_HASH_TID_BASE_A 0x19df8 + +#define HASHEN_S 20 +#define HASHEN_V(x) ((x) << HASHEN_S) +#define HASHEN_F HASHEN_V(1U) + +#define ASLIPCOMPEN_S 17 +#define ASLIPCOMPEN_V(x) ((x) << ASLIPCOMPEN_S) +#define ASLIPCOMPEN_F ASLIPCOMPEN_V(1U) + +#define REQQPARERR_S 16 +#define REQQPARERR_V(x) ((x) << REQQPARERR_S) +#define REQQPARERR_F REQQPARERR_V(1U) + +#define UNKNOWNCMD_S 15 +#define UNKNOWNCMD_V(x) ((x) << UNKNOWNCMD_S) +#define UNKNOWNCMD_F UNKNOWNCMD_V(1U) + +#define PARITYERR_S 6 +#define PARITYERR_V(x) ((x) << PARITYERR_S) +#define PARITYERR_F PARITYERR_V(1U) + +#define LIPMISS_S 5 +#define LIPMISS_V(x) ((x) << LIPMISS_S) +#define LIPMISS_F LIPMISS_V(1U) + +#define LIP0_S 4 +#define LIP0_V(x) ((x) << LIP0_S) +#define LIP0_F LIP0_V(1U) + +#define BASEADDR_S 3 +#define BASEADDR_M 0x1fffffffU +#define BASEADDR_G(x) (((x) >> BASEADDR_S) & BASEADDR_M) + +#define TCAMINTPERR_S 13 +#define TCAMINTPERR_V(x) ((x) << TCAMINTPERR_S) +#define TCAMINTPERR_F TCAMINTPERR_V(1U) + +#define SSRAMINTPERR_S 10 +#define SSRAMINTPERR_V(x) ((x) << SSRAMINTPERR_S) +#define SSRAMINTPERR_F SSRAMINTPERR_V(1U) + +#define NCSI_INT_CAUSE_A 0x1a0d8 + +#define CIM_DM_PRTY_ERR_S 8 +#define CIM_DM_PRTY_ERR_V(x) ((x) << CIM_DM_PRTY_ERR_S) +#define CIM_DM_PRTY_ERR_F CIM_DM_PRTY_ERR_V(1U) + +#define MPS_DM_PRTY_ERR_S 7 +#define MPS_DM_PRTY_ERR_V(x) ((x) << MPS_DM_PRTY_ERR_S) +#define 
MPS_DM_PRTY_ERR_F MPS_DM_PRTY_ERR_V(1U) + +#define TXFIFO_PRTY_ERR_S 1 +#define TXFIFO_PRTY_ERR_V(x) ((x) << TXFIFO_PRTY_ERR_S) +#define TXFIFO_PRTY_ERR_F TXFIFO_PRTY_ERR_V(1U) + +#define RXFIFO_PRTY_ERR_S 0 +#define RXFIFO_PRTY_ERR_V(x) ((x) << RXFIFO_PRTY_ERR_S) +#define RXFIFO_PRTY_ERR_F RXFIFO_PRTY_ERR_V(1U) + +#define XGMAC_PORT_CFG2_A 0x1018 + +#define PATEN_S 18 +#define PATEN_V(x) ((x) << PATEN_S) +#define PATEN_F PATEN_V(1U) + +#define MAGICEN_S 17 +#define MAGICEN_V(x) ((x) << MAGICEN_S) +#define MAGICEN_F MAGICEN_V(1U) + +#define XGMAC_PORT_MAGIC_MACID_LO 0x1024 +#define XGMAC_PORT_MAGIC_MACID_HI 0x1028 + +#define XGMAC_PORT_EPIO_DATA0_A 0x10c0 +#define XGMAC_PORT_EPIO_DATA1_A 0x10c4 +#define XGMAC_PORT_EPIO_DATA2_A 0x10c8 +#define XGMAC_PORT_EPIO_DATA3_A 0x10cc +#define XGMAC_PORT_EPIO_OP_A 0x10d0 + +#define EPIOWR_S 8 +#define EPIOWR_V(x) ((x) << EPIOWR_S) +#define EPIOWR_F EPIOWR_V(1U) + +#define ADDRESS_S 0 +#define ADDRESS_V(x) ((x) << ADDRESS_S) + +#define MAC_PORT_INT_CAUSE_A 0x8dc +#define XGMAC_PORT_INT_CAUSE_A 0x10dc + +#define TP_TX_MOD_QUEUE_REQ_MAP_A 0x7e28 + +#define TP_TX_MOD_QUEUE_WEIGHT0_A 0x7e30 +#define TP_TX_MOD_CHANNEL_WEIGHT_A 0x7e34 + +#define TX_MOD_QUEUE_REQ_MAP_S 0 +#define TX_MOD_QUEUE_REQ_MAP_V(x) ((x) << TX_MOD_QUEUE_REQ_MAP_S) + +#define TX_MODQ_WEIGHT3_S 24 +#define TX_MODQ_WEIGHT3_V(x) ((x) << TX_MODQ_WEIGHT3_S) + +#define TX_MODQ_WEIGHT2_S 16 +#define TX_MODQ_WEIGHT2_V(x) ((x) << TX_MODQ_WEIGHT2_S) + +#define TX_MODQ_WEIGHT1_S 8 +#define TX_MODQ_WEIGHT1_V(x) ((x) << TX_MODQ_WEIGHT1_S) + +#define TX_MODQ_WEIGHT0_S 0 +#define TX_MODQ_WEIGHT0_V(x) ((x) << TX_MODQ_WEIGHT0_S) + +#define TP_TX_SCHED_HDR_A 0x23 +#define TP_TX_SCHED_FIFO_A 0x24 +#define TP_TX_SCHED_PCMD_A 0x25 + +#define NUM_MPS_CLS_SRAM_L_INSTANCES 336 +#define NUM_MPS_T5_CLS_SRAM_L_INSTANCES 512 + +#define T5_PORT0_BASE 0x30000 +#define T5_PORT_STRIDE 0x4000 +#define T5_PORT_BASE(idx) (T5_PORT0_BASE + (idx) * T5_PORT_STRIDE) +#define T5_PORT_REG(idx, reg) (T5_PORT_BASE(idx) + (reg)) + +#define MC_0_BASE_ADDR 0x40000 +#define MC_1_BASE_ADDR 0x48000 +#define MC_STRIDE (MC_1_BASE_ADDR - MC_0_BASE_ADDR) +#define MC_REG(reg, idx) (reg + MC_STRIDE * idx) + +#define MC_P_BIST_CMD_A 0x41400 +#define MC_P_BIST_CMD_ADDR_A 0x41404 +#define MC_P_BIST_CMD_LEN_A 0x41408 +#define MC_P_BIST_DATA_PATTERN_A 0x4140c +#define MC_P_BIST_STATUS_RDATA_A 0x41488 + +#define EDC_T50_BASE_ADDR 0x50000 + +#define EDC_H_BIST_CMD_A 0x50004 +#define EDC_H_BIST_CMD_ADDR_A 0x50008 +#define EDC_H_BIST_CMD_LEN_A 0x5000c +#define EDC_H_BIST_DATA_PATTERN_A 0x50010 +#define EDC_H_BIST_STATUS_RDATA_A 0x50028 + +#define EDC_H_ECC_ERR_ADDR_A 0x50084 +#define EDC_T51_BASE_ADDR 0x50800 + +#define EDC_T5_STRIDE (EDC_T51_BASE_ADDR - EDC_T50_BASE_ADDR) +#define EDC_T5_REG(reg, idx) (reg + EDC_T5_STRIDE * idx) + +#define PL_VF_REV_A 0x4 +#define PL_VF_WHOAMI_A 0x0 +#define PL_VF_REVISION_A 0x8 + +/* registers for module CIM */ +#define CIM_HOST_ACC_CTRL_A 0x7b50 +#define CIM_HOST_ACC_DATA_A 0x7b54 +#define UP_UP_DBG_LA_CFG_A 0x140 +#define UP_UP_DBG_LA_DATA_A 0x144 + +#define HOSTBUSY_S 17 +#define HOSTBUSY_V(x) ((x) << HOSTBUSY_S) +#define HOSTBUSY_F HOSTBUSY_V(1U) + +#define HOSTWRITE_S 16 +#define HOSTWRITE_V(x) ((x) << HOSTWRITE_S) +#define HOSTWRITE_F HOSTWRITE_V(1U) + +#define CIM_IBQ_DBG_CFG_A 0x7b60 + +#define IBQDBGADDR_S 16 +#define IBQDBGADDR_M 0xfffU +#define IBQDBGADDR_V(x) ((x) << IBQDBGADDR_S) +#define IBQDBGADDR_G(x) (((x) >> IBQDBGADDR_S) & IBQDBGADDR_M) + +#define IBQDBGBUSY_S 1 +#define IBQDBGBUSY_V(x) ((x) << 
IBQDBGBUSY_S) +#define IBQDBGBUSY_F IBQDBGBUSY_V(1U) + +#define IBQDBGEN_S 0 +#define IBQDBGEN_V(x) ((x) << IBQDBGEN_S) +#define IBQDBGEN_F IBQDBGEN_V(1U) + +#define CIM_OBQ_DBG_CFG_A 0x7b64 + +#define OBQDBGADDR_S 16 +#define OBQDBGADDR_M 0xfffU +#define OBQDBGADDR_V(x) ((x) << OBQDBGADDR_S) +#define OBQDBGADDR_G(x) (((x) >> OBQDBGADDR_S) & OBQDBGADDR_M) + +#define OBQDBGBUSY_S 1 +#define OBQDBGBUSY_V(x) ((x) << OBQDBGBUSY_S) +#define OBQDBGBUSY_F OBQDBGBUSY_V(1U) + +#define OBQDBGEN_S 0 +#define OBQDBGEN_V(x) ((x) << OBQDBGEN_S) +#define OBQDBGEN_F OBQDBGEN_V(1U) + +#define CIM_IBQ_DBG_DATA_A 0x7b68 +#define CIM_OBQ_DBG_DATA_A 0x7b6c +#define CIM_DEBUGCFG_A 0x7b70 +#define CIM_DEBUGSTS_A 0x7b74 + +#define POLADBGRDPTR_S 23 +#define POLADBGRDPTR_M 0x1ffU +#define POLADBGRDPTR_V(x) ((x) << POLADBGRDPTR_S) + +#define POLADBGWRPTR_S 16 +#define POLADBGWRPTR_M 0x1ffU +#define POLADBGWRPTR_G(x) (((x) >> POLADBGWRPTR_S) & POLADBGWRPTR_M) + +#define PILADBGRDPTR_S 14 +#define PILADBGRDPTR_M 0x1ffU +#define PILADBGRDPTR_V(x) ((x) << PILADBGRDPTR_S) + +#define PILADBGWRPTR_S 0 +#define PILADBGWRPTR_M 0x1ffU +#define PILADBGWRPTR_G(x) (((x) >> PILADBGWRPTR_S) & PILADBGWRPTR_M) + +#define LADBGEN_S 12 +#define LADBGEN_V(x) ((x) << LADBGEN_S) +#define LADBGEN_F LADBGEN_V(1U) + +#define CIM_PO_LA_DEBUGDATA_A 0x7b78 +#define CIM_PI_LA_DEBUGDATA_A 0x7b7c +#define CIM_PO_LA_MADEBUGDATA_A 0x7b80 +#define CIM_PI_LA_MADEBUGDATA_A 0x7b84 + +#define UPDBGLARDEN_S 1 +#define UPDBGLARDEN_V(x) ((x) << UPDBGLARDEN_S) +#define UPDBGLARDEN_F UPDBGLARDEN_V(1U) + +#define UPDBGLAEN_S 0 +#define UPDBGLAEN_V(x) ((x) << UPDBGLAEN_S) +#define UPDBGLAEN_F UPDBGLAEN_V(1U) + +#define UPDBGLARDPTR_S 2 +#define UPDBGLARDPTR_M 0xfffU +#define UPDBGLARDPTR_V(x) ((x) << UPDBGLARDPTR_S) + +#define UPDBGLAWRPTR_S 16 +#define UPDBGLAWRPTR_M 0xfffU +#define UPDBGLAWRPTR_G(x) (((x) >> UPDBGLAWRPTR_S) & UPDBGLAWRPTR_M) + +#define UPDBGLACAPTPCONLY_S 30 +#define UPDBGLACAPTPCONLY_V(x) ((x) << UPDBGLACAPTPCONLY_S) +#define UPDBGLACAPTPCONLY_F UPDBGLACAPTPCONLY_V(1U) + +#define CIM_QUEUE_CONFIG_REF_A 0x7b48 +#define CIM_QUEUE_CONFIG_CTRL_A 0x7b4c + +#define CIMQSIZE_S 24 +#define CIMQSIZE_M 0x3fU +#define CIMQSIZE_G(x) (((x) >> CIMQSIZE_S) & CIMQSIZE_M) + +#define CIMQBASE_S 16 +#define CIMQBASE_M 0x3fU +#define CIMQBASE_G(x) (((x) >> CIMQBASE_S) & CIMQBASE_M) + +#define QUEFULLTHRSH_S 0 +#define QUEFULLTHRSH_M 0x1ffU +#define QUEFULLTHRSH_G(x) (((x) >> QUEFULLTHRSH_S) & QUEFULLTHRSH_M) + +#define UP_IBQ_0_RDADDR_A 0x10 +#define UP_IBQ_0_SHADOW_RDADDR_A 0x280 +#define UP_OBQ_0_REALADDR_A 0x104 +#define UP_OBQ_0_SHADOW_REALADDR_A 0x394 + +#define IBQRDADDR_S 0 +#define IBQRDADDR_M 0x1fffU +#define IBQRDADDR_G(x) (((x) >> IBQRDADDR_S) & IBQRDADDR_M) + +#define IBQWRADDR_S 0 +#define IBQWRADDR_M 0x1fffU +#define IBQWRADDR_G(x) (((x) >> IBQWRADDR_S) & IBQWRADDR_M) + +#define QUERDADDR_S 0 +#define QUERDADDR_M 0x7fffU +#define QUERDADDR_G(x) (((x) >> QUERDADDR_S) & QUERDADDR_M) + +#define QUEREMFLITS_S 0 +#define QUEREMFLITS_M 0x7ffU +#define QUEREMFLITS_G(x) (((x) >> QUEREMFLITS_S) & QUEREMFLITS_M) + +#define QUEEOPCNT_S 16 +#define QUEEOPCNT_M 0xfffU +#define QUEEOPCNT_G(x) (((x) >> QUEEOPCNT_S) & QUEEOPCNT_M) + +#define QUESOPCNT_S 0 +#define QUESOPCNT_M 0xfffU +#define QUESOPCNT_G(x) (((x) >> QUESOPCNT_S) & QUESOPCNT_M) + +#define OBQSELECT_S 4 +#define OBQSELECT_V(x) ((x) << OBQSELECT_S) +#define OBQSELECT_F OBQSELECT_V(1U) + +#define IBQSELECT_S 3 +#define IBQSELECT_V(x) ((x) << IBQSELECT_S) +#define IBQSELECT_F IBQSELECT_V(1U) + 
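+/*
+ * Busy-poll sketch for the CIM IBQ debug interface above (illustrative;
+ * read_reg32()/write_reg32() are hypothetical MMIO helpers): select a
+ * queue word with IBQDBGADDR, enable the engine with IBQDBGEN, then
+ * wait for IBQDBGBUSY to clear before reading the captured word:
+ *
+ *	write_reg32(regs + CIM_IBQ_DBG_CFG_A,
+ *		    IBQDBGADDR_V(addr) | IBQDBGEN_F);
+ *	while (read_reg32(regs + CIM_IBQ_DBG_CFG_A) & IBQDBGBUSY_F)
+ *		;
+ *	u32 data = read_reg32(regs + CIM_IBQ_DBG_DATA_A);
+ */
+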
+#define QUENUMSELECT_S 0
+#define QUENUMSELECT_V(x) ((x) << QUENUMSELECT_S)
+
+#endif /* __T4_REGS_H */
diff --git a/providers/cxgb4/t4fw_api.h b/providers/cxgb4/t4fw_api.h
new file mode 100644
index 0000000..0ed0534
--- /dev/null
+++ b/providers/cxgb4/t4fw_api.h
@@ -0,0 +1,3263 @@
+/*
+ * This file is part of the Chelsio T4 Ethernet driver for Linux.
+ *
+ * Copyright (c) 2009-2014 Chelsio Communications, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials
+ *   provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _T4FW_INTERFACE_H_
+#define _T4FW_INTERFACE_H_
+
+#include <linux/types.h>
+
+enum fw_retval {
+	FW_SUCCESS = 0, /* completed successfully */
+	FW_EPERM = 1, /* operation not permitted */
+	FW_ENOENT = 2, /* no such file or directory */
+	FW_EIO = 5, /* input/output error; hw bad */
+	FW_ENOEXEC = 8, /* exec format error; inv microcode */
+	FW_EAGAIN = 11, /* try again */
+	FW_ENOMEM = 12, /* out of memory */
+	FW_EFAULT = 14, /* bad address; fw bad */
+	FW_EBUSY = 16, /* resource busy */
+	FW_EEXIST = 17, /* file exists */
+	FW_ENODEV = 19, /* no such device */
+	FW_EINVAL = 22, /* invalid argument */
+	FW_ENOSPC = 28, /* no space left on device */
+	FW_ENOSYS = 38, /* functionality not implemented */
+	FW_ENODATA = 61, /* no data available */
+	FW_EPROTO = 71, /* protocol error */
+	FW_EADDRINUSE = 98, /* address already in use */
+	FW_EADDRNOTAVAIL = 99, /* cannot assign requested address */
+	FW_ENETDOWN = 100, /* network is down */
+	FW_ENETUNREACH = 101, /* network is unreachable */
+	FW_ENOBUFS = 105, /* no buffer space available */
+	FW_ETIMEDOUT = 110, /* timeout */
+	FW_EINPROGRESS = 115, /* fw internal */
+	FW_SCSI_ABORT_REQUESTED = 128, /* */
+	FW_SCSI_ABORT_TIMEDOUT = 129, /* */
+	FW_SCSI_ABORTED = 130, /* */
+	FW_SCSI_CLOSE_REQUESTED = 131, /* */
+	FW_ERR_LINK_DOWN = 132, /* */
+	FW_RDEV_NOT_READY = 133, /* */
+	FW_ERR_RDEV_LOST = 134, /* */
+	FW_ERR_RDEV_LOGO = 135, /* */
+	FW_FCOE_NO_XCHG = 136, /* */
+	FW_SCSI_RSP_ERR = 137, /* */
+	FW_ERR_RDEV_IMPL_LOGO = 138, /* */
+	FW_SCSI_UNDER_FLOW_ERR = 139, /* */
+	FW_SCSI_OVER_FLOW_ERR = 140, /* */
+	FW_SCSI_DDP_ERR = 141, /* DDP error */
+	FW_SCSI_TASK_ERR = 142, /* No SCSI tasks available */
+};
+
+#define FW_T4VF_SGE_BASE_ADDR 0x0000
+#define FW_T4VF_MPS_BASE_ADDR 0x0100
+#define FW_T4VF_PL_BASE_ADDR 0x0200 +#define FW_T4VF_MBDATA_BASE_ADDR 0x0240 +#define FW_T4VF_CIM_BASE_ADDR 0x0300 + +enum fw_wr_opcodes { + FW_FILTER_WR = 0x02, + FW_ULPTX_WR = 0x04, + FW_TP_WR = 0x05, + FW_ETH_TX_PKT_WR = 0x08, + FW_OFLD_CONNECTION_WR = 0x2f, + FW_FLOWC_WR = 0x0a, + FW_OFLD_TX_DATA_WR = 0x0b, + FW_CMD_WR = 0x10, + FW_ETH_TX_PKT_VM_WR = 0x11, + FW_RI_RES_WR = 0x0c, + FW_RI_INIT_WR = 0x0d, + FW_RI_RDMA_WRITE_WR = 0x14, + FW_RI_SEND_WR = 0x15, + FW_RI_RDMA_READ_WR = 0x16, + FW_RI_RECV_WR = 0x17, + FW_RI_BIND_MW_WR = 0x18, + FW_RI_FR_NSMR_WR = 0x19, + FW_RI_RDMA_WRITE_CMPL_WR = 0x21, + FW_RI_INV_LSTAG_WR = 0x1a, + FW_ISCSI_TX_DATA_WR = 0x45, + FW_LASTC2E_WR = 0x70 +}; + +struct fw_wr_hdr { + __be32 hi; + __be32 lo; +}; + +/* work request opcode (hi) */ +#define FW_WR_OP_S 24 +#define FW_WR_OP_M 0xff +#define FW_WR_OP_V(x) ((x) << FW_WR_OP_S) +#define FW_WR_OP_G(x) (((x) >> FW_WR_OP_S) & FW_WR_OP_M) + +/* atomic flag (hi) - firmware encapsulates CPLs in CPL_BARRIER */ +#define FW_WR_ATOMIC_S 23 +#define FW_WR_ATOMIC_V(x) ((x) << FW_WR_ATOMIC_S) + +/* flush flag (hi) - firmware flushes flushable work request buffered + * in the flow context. + */ +#define FW_WR_FLUSH_S 22 +#define FW_WR_FLUSH_V(x) ((x) << FW_WR_FLUSH_S) + +/* completion flag (hi) - firmware generates a cpl_fw6_ack */ +#define FW_WR_COMPL_S 21 +#define FW_WR_COMPL_V(x) ((x) << FW_WR_COMPL_S) +#define FW_WR_COMPL_F FW_WR_COMPL_V(1U) + +/* work request immediate data length (hi) */ +#define FW_WR_IMMDLEN_S 0 +#define FW_WR_IMMDLEN_M 0xff +#define FW_WR_IMMDLEN_V(x) ((x) << FW_WR_IMMDLEN_S) + +/* egress queue status update to associated ingress queue entry (lo) */ +#define FW_WR_EQUIQ_S 31 +#define FW_WR_EQUIQ_V(x) ((x) << FW_WR_EQUIQ_S) +#define FW_WR_EQUIQ_F FW_WR_EQUIQ_V(1U) + +/* egress queue status update to egress queue status entry (lo) */ +#define FW_WR_EQUEQ_S 30 +#define FW_WR_EQUEQ_V(x) ((x) << FW_WR_EQUEQ_S) +#define FW_WR_EQUEQ_F FW_WR_EQUEQ_V(1U) + +/* flow context identifier (lo) */ +#define FW_WR_FLOWID_S 8 +#define FW_WR_FLOWID_V(x) ((x) << FW_WR_FLOWID_S) + +/* length in units of 16-bytes (lo) */ +#define FW_WR_LEN16_S 0 +#define FW_WR_LEN16_V(x) ((x) << FW_WR_LEN16_S) + +#define HW_TPL_FR_MT_PR_IV_P_FC 0X32B +#define HW_TPL_FR_MT_PR_OV_P_FC 0X327 + +/* filter wr reply code in cookie in CPL_SET_TCB_RPL */ +enum fw_filter_wr_cookie { + FW_FILTER_WR_SUCCESS, + FW_FILTER_WR_FLT_ADDED, + FW_FILTER_WR_FLT_DELETED, + FW_FILTER_WR_SMT_TBL_FULL, + FW_FILTER_WR_EINVAL, +}; + +struct fw_filter_wr { + __be32 op_pkd; + __be32 len16_pkd; + __be64 r3; + __be32 tid_to_iq; + __be32 del_filter_to_l2tix; + __be16 ethtype; + __be16 ethtypem; + __u8 frag_to_ovlan_vldm; + __u8 smac_sel; + __be16 rx_chan_rx_rpl_iq; + __be32 maci_to_matchtypem; + __u8 ptcl; + __u8 ptclm; + __u8 ttyp; + __u8 ttypm; + __be16 ivlan; + __be16 ivlanm; + __be16 ovlan; + __be16 ovlanm; + __u8 lip[16]; + __u8 lipm[16]; + __u8 fip[16]; + __u8 fipm[16]; + __be16 lp; + __be16 lpm; + __be16 fp; + __be16 fpm; + __be16 r7; + __u8 sma[6]; +}; + +#define FW_FILTER_WR_TID_S 12 +#define FW_FILTER_WR_TID_M 0xfffff +#define FW_FILTER_WR_TID_V(x) ((x) << FW_FILTER_WR_TID_S) +#define FW_FILTER_WR_TID_G(x) \ + (((x) >> FW_FILTER_WR_TID_S) & FW_FILTER_WR_TID_M) + +#define FW_FILTER_WR_RQTYPE_S 11 +#define FW_FILTER_WR_RQTYPE_M 0x1 +#define FW_FILTER_WR_RQTYPE_V(x) ((x) << FW_FILTER_WR_RQTYPE_S) +#define FW_FILTER_WR_RQTYPE_G(x) \ + (((x) >> FW_FILTER_WR_RQTYPE_S) & FW_FILTER_WR_RQTYPE_M) +#define FW_FILTER_WR_RQTYPE_F FW_FILTER_WR_RQTYPE_V(1U) + +#define 
FW_FILTER_WR_NOREPLY_S 10 +#define FW_FILTER_WR_NOREPLY_M 0x1 +#define FW_FILTER_WR_NOREPLY_V(x) ((x) << FW_FILTER_WR_NOREPLY_S) +#define FW_FILTER_WR_NOREPLY_G(x) \ + (((x) >> FW_FILTER_WR_NOREPLY_S) & FW_FILTER_WR_NOREPLY_M) +#define FW_FILTER_WR_NOREPLY_F FW_FILTER_WR_NOREPLY_V(1U) + +#define FW_FILTER_WR_IQ_S 0 +#define FW_FILTER_WR_IQ_M 0x3ff +#define FW_FILTER_WR_IQ_V(x) ((x) << FW_FILTER_WR_IQ_S) +#define FW_FILTER_WR_IQ_G(x) \ + (((x) >> FW_FILTER_WR_IQ_S) & FW_FILTER_WR_IQ_M) + +#define FW_FILTER_WR_DEL_FILTER_S 31 +#define FW_FILTER_WR_DEL_FILTER_M 0x1 +#define FW_FILTER_WR_DEL_FILTER_V(x) ((x) << FW_FILTER_WR_DEL_FILTER_S) +#define FW_FILTER_WR_DEL_FILTER_G(x) \ + (((x) >> FW_FILTER_WR_DEL_FILTER_S) & FW_FILTER_WR_DEL_FILTER_M) +#define FW_FILTER_WR_DEL_FILTER_F FW_FILTER_WR_DEL_FILTER_V(1U) + +#define FW_FILTER_WR_RPTTID_S 25 +#define FW_FILTER_WR_RPTTID_M 0x1 +#define FW_FILTER_WR_RPTTID_V(x) ((x) << FW_FILTER_WR_RPTTID_S) +#define FW_FILTER_WR_RPTTID_G(x) \ + (((x) >> FW_FILTER_WR_RPTTID_S) & FW_FILTER_WR_RPTTID_M) +#define FW_FILTER_WR_RPTTID_F FW_FILTER_WR_RPTTID_V(1U) + +#define FW_FILTER_WR_DROP_S 24 +#define FW_FILTER_WR_DROP_M 0x1 +#define FW_FILTER_WR_DROP_V(x) ((x) << FW_FILTER_WR_DROP_S) +#define FW_FILTER_WR_DROP_G(x) \ + (((x) >> FW_FILTER_WR_DROP_S) & FW_FILTER_WR_DROP_M) +#define FW_FILTER_WR_DROP_F FW_FILTER_WR_DROP_V(1U) + +#define FW_FILTER_WR_DIRSTEER_S 23 +#define FW_FILTER_WR_DIRSTEER_M 0x1 +#define FW_FILTER_WR_DIRSTEER_V(x) ((x) << FW_FILTER_WR_DIRSTEER_S) +#define FW_FILTER_WR_DIRSTEER_G(x) \ + (((x) >> FW_FILTER_WR_DIRSTEER_S) & FW_FILTER_WR_DIRSTEER_M) +#define FW_FILTER_WR_DIRSTEER_F FW_FILTER_WR_DIRSTEER_V(1U) + +#define FW_FILTER_WR_MASKHASH_S 22 +#define FW_FILTER_WR_MASKHASH_M 0x1 +#define FW_FILTER_WR_MASKHASH_V(x) ((x) << FW_FILTER_WR_MASKHASH_S) +#define FW_FILTER_WR_MASKHASH_G(x) \ + (((x) >> FW_FILTER_WR_MASKHASH_S) & FW_FILTER_WR_MASKHASH_M) +#define FW_FILTER_WR_MASKHASH_F FW_FILTER_WR_MASKHASH_V(1U) + +#define FW_FILTER_WR_DIRSTEERHASH_S 21 +#define FW_FILTER_WR_DIRSTEERHASH_M 0x1 +#define FW_FILTER_WR_DIRSTEERHASH_V(x) ((x) << FW_FILTER_WR_DIRSTEERHASH_S) +#define FW_FILTER_WR_DIRSTEERHASH_G(x) \ + (((x) >> FW_FILTER_WR_DIRSTEERHASH_S) & FW_FILTER_WR_DIRSTEERHASH_M) +#define FW_FILTER_WR_DIRSTEERHASH_F FW_FILTER_WR_DIRSTEERHASH_V(1U) + +#define FW_FILTER_WR_LPBK_S 20 +#define FW_FILTER_WR_LPBK_M 0x1 +#define FW_FILTER_WR_LPBK_V(x) ((x) << FW_FILTER_WR_LPBK_S) +#define FW_FILTER_WR_LPBK_G(x) \ + (((x) >> FW_FILTER_WR_LPBK_S) & FW_FILTER_WR_LPBK_M) +#define FW_FILTER_WR_LPBK_F FW_FILTER_WR_LPBK_V(1U) + +#define FW_FILTER_WR_DMAC_S 19 +#define FW_FILTER_WR_DMAC_M 0x1 +#define FW_FILTER_WR_DMAC_V(x) ((x) << FW_FILTER_WR_DMAC_S) +#define FW_FILTER_WR_DMAC_G(x) \ + (((x) >> FW_FILTER_WR_DMAC_S) & FW_FILTER_WR_DMAC_M) +#define FW_FILTER_WR_DMAC_F FW_FILTER_WR_DMAC_V(1U) + +#define FW_FILTER_WR_SMAC_S 18 +#define FW_FILTER_WR_SMAC_M 0x1 +#define FW_FILTER_WR_SMAC_V(x) ((x) << FW_FILTER_WR_SMAC_S) +#define FW_FILTER_WR_SMAC_G(x) \ + (((x) >> FW_FILTER_WR_SMAC_S) & FW_FILTER_WR_SMAC_M) +#define FW_FILTER_WR_SMAC_F FW_FILTER_WR_SMAC_V(1U) + +#define FW_FILTER_WR_INSVLAN_S 17 +#define FW_FILTER_WR_INSVLAN_M 0x1 +#define FW_FILTER_WR_INSVLAN_V(x) ((x) << FW_FILTER_WR_INSVLAN_S) +#define FW_FILTER_WR_INSVLAN_G(x) \ + (((x) >> FW_FILTER_WR_INSVLAN_S) & FW_FILTER_WR_INSVLAN_M) +#define FW_FILTER_WR_INSVLAN_F FW_FILTER_WR_INSVLAN_V(1U) + +#define FW_FILTER_WR_RMVLAN_S 16 +#define FW_FILTER_WR_RMVLAN_M 0x1 +#define FW_FILTER_WR_RMVLAN_V(x) ((x) << 
FW_FILTER_WR_RMVLAN_S) +#define FW_FILTER_WR_RMVLAN_G(x) \ + (((x) >> FW_FILTER_WR_RMVLAN_S) & FW_FILTER_WR_RMVLAN_M) +#define FW_FILTER_WR_RMVLAN_F FW_FILTER_WR_RMVLAN_V(1U) + +#define FW_FILTER_WR_HITCNTS_S 15 +#define FW_FILTER_WR_HITCNTS_M 0x1 +#define FW_FILTER_WR_HITCNTS_V(x) ((x) << FW_FILTER_WR_HITCNTS_S) +#define FW_FILTER_WR_HITCNTS_G(x) \ + (((x) >> FW_FILTER_WR_HITCNTS_S) & FW_FILTER_WR_HITCNTS_M) +#define FW_FILTER_WR_HITCNTS_F FW_FILTER_WR_HITCNTS_V(1U) + +#define FW_FILTER_WR_TXCHAN_S 13 +#define FW_FILTER_WR_TXCHAN_M 0x3 +#define FW_FILTER_WR_TXCHAN_V(x) ((x) << FW_FILTER_WR_TXCHAN_S) +#define FW_FILTER_WR_TXCHAN_G(x) \ + (((x) >> FW_FILTER_WR_TXCHAN_S) & FW_FILTER_WR_TXCHAN_M) + +#define FW_FILTER_WR_PRIO_S 12 +#define FW_FILTER_WR_PRIO_M 0x1 +#define FW_FILTER_WR_PRIO_V(x) ((x) << FW_FILTER_WR_PRIO_S) +#define FW_FILTER_WR_PRIO_G(x) \ + (((x) >> FW_FILTER_WR_PRIO_S) & FW_FILTER_WR_PRIO_M) +#define FW_FILTER_WR_PRIO_F FW_FILTER_WR_PRIO_V(1U) + +#define FW_FILTER_WR_L2TIX_S 0 +#define FW_FILTER_WR_L2TIX_M 0xfff +#define FW_FILTER_WR_L2TIX_V(x) ((x) << FW_FILTER_WR_L2TIX_S) +#define FW_FILTER_WR_L2TIX_G(x) \ + (((x) >> FW_FILTER_WR_L2TIX_S) & FW_FILTER_WR_L2TIX_M) + +#define FW_FILTER_WR_FRAG_S 7 +#define FW_FILTER_WR_FRAG_M 0x1 +#define FW_FILTER_WR_FRAG_V(x) ((x) << FW_FILTER_WR_FRAG_S) +#define FW_FILTER_WR_FRAG_G(x) \ + (((x) >> FW_FILTER_WR_FRAG_S) & FW_FILTER_WR_FRAG_M) +#define FW_FILTER_WR_FRAG_F FW_FILTER_WR_FRAG_V(1U) + +#define FW_FILTER_WR_FRAGM_S 6 +#define FW_FILTER_WR_FRAGM_M 0x1 +#define FW_FILTER_WR_FRAGM_V(x) ((x) << FW_FILTER_WR_FRAGM_S) +#define FW_FILTER_WR_FRAGM_G(x) \ + (((x) >> FW_FILTER_WR_FRAGM_S) & FW_FILTER_WR_FRAGM_M) +#define FW_FILTER_WR_FRAGM_F FW_FILTER_WR_FRAGM_V(1U) + +#define FW_FILTER_WR_IVLAN_VLD_S 5 +#define FW_FILTER_WR_IVLAN_VLD_M 0x1 +#define FW_FILTER_WR_IVLAN_VLD_V(x) ((x) << FW_FILTER_WR_IVLAN_VLD_S) +#define FW_FILTER_WR_IVLAN_VLD_G(x) \ + (((x) >> FW_FILTER_WR_IVLAN_VLD_S) & FW_FILTER_WR_IVLAN_VLD_M) +#define FW_FILTER_WR_IVLAN_VLD_F FW_FILTER_WR_IVLAN_VLD_V(1U) + +#define FW_FILTER_WR_OVLAN_VLD_S 4 +#define FW_FILTER_WR_OVLAN_VLD_M 0x1 +#define FW_FILTER_WR_OVLAN_VLD_V(x) ((x) << FW_FILTER_WR_OVLAN_VLD_S) +#define FW_FILTER_WR_OVLAN_VLD_G(x) \ + (((x) >> FW_FILTER_WR_OVLAN_VLD_S) & FW_FILTER_WR_OVLAN_VLD_M) +#define FW_FILTER_WR_OVLAN_VLD_F FW_FILTER_WR_OVLAN_VLD_V(1U) + +#define FW_FILTER_WR_IVLAN_VLDM_S 3 +#define FW_FILTER_WR_IVLAN_VLDM_M 0x1 +#define FW_FILTER_WR_IVLAN_VLDM_V(x) ((x) << FW_FILTER_WR_IVLAN_VLDM_S) +#define FW_FILTER_WR_IVLAN_VLDM_G(x) \ + (((x) >> FW_FILTER_WR_IVLAN_VLDM_S) & FW_FILTER_WR_IVLAN_VLDM_M) +#define FW_FILTER_WR_IVLAN_VLDM_F FW_FILTER_WR_IVLAN_VLDM_V(1U) + +#define FW_FILTER_WR_OVLAN_VLDM_S 2 +#define FW_FILTER_WR_OVLAN_VLDM_M 0x1 +#define FW_FILTER_WR_OVLAN_VLDM_V(x) ((x) << FW_FILTER_WR_OVLAN_VLDM_S) +#define FW_FILTER_WR_OVLAN_VLDM_G(x) \ + (((x) >> FW_FILTER_WR_OVLAN_VLDM_S) & FW_FILTER_WR_OVLAN_VLDM_M) +#define FW_FILTER_WR_OVLAN_VLDM_F FW_FILTER_WR_OVLAN_VLDM_V(1U) + +#define FW_FILTER_WR_RX_CHAN_S 15 +#define FW_FILTER_WR_RX_CHAN_M 0x1 +#define FW_FILTER_WR_RX_CHAN_V(x) ((x) << FW_FILTER_WR_RX_CHAN_S) +#define FW_FILTER_WR_RX_CHAN_G(x) \ + (((x) >> FW_FILTER_WR_RX_CHAN_S) & FW_FILTER_WR_RX_CHAN_M) +#define FW_FILTER_WR_RX_CHAN_F FW_FILTER_WR_RX_CHAN_V(1U) + +#define FW_FILTER_WR_RX_RPL_IQ_S 0 +#define FW_FILTER_WR_RX_RPL_IQ_M 0x3ff +#define FW_FILTER_WR_RX_RPL_IQ_V(x) ((x) << FW_FILTER_WR_RX_RPL_IQ_S) +#define FW_FILTER_WR_RX_RPL_IQ_G(x) \ + (((x) >> FW_FILTER_WR_RX_RPL_IQ_S) & 
FW_FILTER_WR_RX_RPL_IQ_M) + +#define FW_FILTER_WR_MACI_S 23 +#define FW_FILTER_WR_MACI_M 0x1ff +#define FW_FILTER_WR_MACI_V(x) ((x) << FW_FILTER_WR_MACI_S) +#define FW_FILTER_WR_MACI_G(x) \ + (((x) >> FW_FILTER_WR_MACI_S) & FW_FILTER_WR_MACI_M) + +#define FW_FILTER_WR_MACIM_S 14 +#define FW_FILTER_WR_MACIM_M 0x1ff +#define FW_FILTER_WR_MACIM_V(x) ((x) << FW_FILTER_WR_MACIM_S) +#define FW_FILTER_WR_MACIM_G(x) \ + (((x) >> FW_FILTER_WR_MACIM_S) & FW_FILTER_WR_MACIM_M) + +#define FW_FILTER_WR_FCOE_S 13 +#define FW_FILTER_WR_FCOE_M 0x1 +#define FW_FILTER_WR_FCOE_V(x) ((x) << FW_FILTER_WR_FCOE_S) +#define FW_FILTER_WR_FCOE_G(x) \ + (((x) >> FW_FILTER_WR_FCOE_S) & FW_FILTER_WR_FCOE_M) +#define FW_FILTER_WR_FCOE_F FW_FILTER_WR_FCOE_V(1U) + +#define FW_FILTER_WR_FCOEM_S 12 +#define FW_FILTER_WR_FCOEM_M 0x1 +#define FW_FILTER_WR_FCOEM_V(x) ((x) << FW_FILTER_WR_FCOEM_S) +#define FW_FILTER_WR_FCOEM_G(x) \ + (((x) >> FW_FILTER_WR_FCOEM_S) & FW_FILTER_WR_FCOEM_M) +#define FW_FILTER_WR_FCOEM_F FW_FILTER_WR_FCOEM_V(1U) + +#define FW_FILTER_WR_PORT_S 9 +#define FW_FILTER_WR_PORT_M 0x7 +#define FW_FILTER_WR_PORT_V(x) ((x) << FW_FILTER_WR_PORT_S) +#define FW_FILTER_WR_PORT_G(x) \ + (((x) >> FW_FILTER_WR_PORT_S) & FW_FILTER_WR_PORT_M) + +#define FW_FILTER_WR_PORTM_S 6 +#define FW_FILTER_WR_PORTM_M 0x7 +#define FW_FILTER_WR_PORTM_V(x) ((x) << FW_FILTER_WR_PORTM_S) +#define FW_FILTER_WR_PORTM_G(x) \ + (((x) >> FW_FILTER_WR_PORTM_S) & FW_FILTER_WR_PORTM_M) + +#define FW_FILTER_WR_MATCHTYPE_S 3 +#define FW_FILTER_WR_MATCHTYPE_M 0x7 +#define FW_FILTER_WR_MATCHTYPE_V(x) ((x) << FW_FILTER_WR_MATCHTYPE_S) +#define FW_FILTER_WR_MATCHTYPE_G(x) \ + (((x) >> FW_FILTER_WR_MATCHTYPE_S) & FW_FILTER_WR_MATCHTYPE_M) + +#define FW_FILTER_WR_MATCHTYPEM_S 0 +#define FW_FILTER_WR_MATCHTYPEM_M 0x7 +#define FW_FILTER_WR_MATCHTYPEM_V(x) ((x) << FW_FILTER_WR_MATCHTYPEM_S) +#define FW_FILTER_WR_MATCHTYPEM_G(x) \ + (((x) >> FW_FILTER_WR_MATCHTYPEM_S) & FW_FILTER_WR_MATCHTYPEM_M) + +struct fw_ulptx_wr { + __be32 op_to_compl; + __be32 flowid_len16; + u64 cookie; +}; + +struct fw_tp_wr { + __be32 op_to_immdlen; + __be32 flowid_len16; + u64 cookie; +}; + +struct fw_eth_tx_pkt_wr { + __be32 op_immdlen; + __be32 equiq_to_len16; + __be64 r3; +}; + +struct fw_ofld_connection_wr { + __be32 op_compl; + __be32 len16_pkd; + __u64 cookie; + __be64 r2; + __be64 r3; + struct fw_ofld_connection_le { + __be32 version_cpl; + __be32 filter; + __be32 r1; + __be16 lport; + __be16 pport; + union fw_ofld_connection_leip { + struct fw_ofld_connection_le_ipv4 { + __be32 pip; + __be32 lip; + __be64 r0; + __be64 r1; + __be64 r2; + } ipv4; + struct fw_ofld_connection_le_ipv6 { + __be64 pip_hi; + __be64 pip_lo; + __be64 lip_hi; + __be64 lip_lo; + } ipv6; + } u; + } le; + struct fw_ofld_connection_tcb { + __be32 t_state_to_astid; + __be16 cplrxdataack_cplpassacceptrpl; + __be16 rcv_adv; + __be32 rcv_nxt; + __be32 tx_max; + __be64 opt0; + __be32 opt2; + __be32 r1; + __be64 r2; + __be64 r3; + } tcb; +}; + +#define FW_OFLD_CONNECTION_WR_VERSION_S 31 +#define FW_OFLD_CONNECTION_WR_VERSION_M 0x1 +#define FW_OFLD_CONNECTION_WR_VERSION_V(x) \ + ((x) << FW_OFLD_CONNECTION_WR_VERSION_S) +#define FW_OFLD_CONNECTION_WR_VERSION_G(x) \ + (((x) >> FW_OFLD_CONNECTION_WR_VERSION_S) & \ + FW_OFLD_CONNECTION_WR_VERSION_M) +#define FW_OFLD_CONNECTION_WR_VERSION_F \ + FW_OFLD_CONNECTION_WR_VERSION_V(1U) + +#define FW_OFLD_CONNECTION_WR_CPL_S 30 +#define FW_OFLD_CONNECTION_WR_CPL_M 0x1 +#define FW_OFLD_CONNECTION_WR_CPL_V(x) ((x) << FW_OFLD_CONNECTION_WR_CPL_S) +#define 
FW_OFLD_CONNECTION_WR_CPL_G(x) \ + (((x) >> FW_OFLD_CONNECTION_WR_CPL_S) & FW_OFLD_CONNECTION_WR_CPL_M) +#define FW_OFLD_CONNECTION_WR_CPL_F FW_OFLD_CONNECTION_WR_CPL_V(1U) + +#define FW_OFLD_CONNECTION_WR_T_STATE_S 28 +#define FW_OFLD_CONNECTION_WR_T_STATE_M 0xf +#define FW_OFLD_CONNECTION_WR_T_STATE_V(x) \ + ((x) << FW_OFLD_CONNECTION_WR_T_STATE_S) +#define FW_OFLD_CONNECTION_WR_T_STATE_G(x) \ + (((x) >> FW_OFLD_CONNECTION_WR_T_STATE_S) & \ + FW_OFLD_CONNECTION_WR_T_STATE_M) + +#define FW_OFLD_CONNECTION_WR_RCV_SCALE_S 24 +#define FW_OFLD_CONNECTION_WR_RCV_SCALE_M 0xf +#define FW_OFLD_CONNECTION_WR_RCV_SCALE_V(x) \ + ((x) << FW_OFLD_CONNECTION_WR_RCV_SCALE_S) +#define FW_OFLD_CONNECTION_WR_RCV_SCALE_G(x) \ + (((x) >> FW_OFLD_CONNECTION_WR_RCV_SCALE_S) & \ + FW_OFLD_CONNECTION_WR_RCV_SCALE_M) + +#define FW_OFLD_CONNECTION_WR_ASTID_S 0 +#define FW_OFLD_CONNECTION_WR_ASTID_M 0xffffff +#define FW_OFLD_CONNECTION_WR_ASTID_V(x) \ + ((x) << FW_OFLD_CONNECTION_WR_ASTID_S) +#define FW_OFLD_CONNECTION_WR_ASTID_G(x) \ + (((x) >> FW_OFLD_CONNECTION_WR_ASTID_S) & FW_OFLD_CONNECTION_WR_ASTID_M) + +#define FW_OFLD_CONNECTION_WR_CPLRXDATAACK_S 15 +#define FW_OFLD_CONNECTION_WR_CPLRXDATAACK_M 0x1 +#define FW_OFLD_CONNECTION_WR_CPLRXDATAACK_V(x) \ + ((x) << FW_OFLD_CONNECTION_WR_CPLRXDATAACK_S) +#define FW_OFLD_CONNECTION_WR_CPLRXDATAACK_G(x) \ + (((x) >> FW_OFLD_CONNECTION_WR_CPLRXDATAACK_S) & \ + FW_OFLD_CONNECTION_WR_CPLRXDATAACK_M) +#define FW_OFLD_CONNECTION_WR_CPLRXDATAACK_F \ + FW_OFLD_CONNECTION_WR_CPLRXDATAACK_V(1U) + +#define FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL_S 14 +#define FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL_M 0x1 +#define FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL_V(x) \ + ((x) << FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL_S) +#define FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL_G(x) \ + (((x) >> FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL_S) & \ + FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL_M) +#define FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL_F \ + FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL_V(1U) + +enum fw_flowc_mnem { + FW_FLOWC_MNEM_PFNVFN, /* PFN [15:8] VFN [7:0] */ + FW_FLOWC_MNEM_CH, + FW_FLOWC_MNEM_PORT, + FW_FLOWC_MNEM_IQID, + FW_FLOWC_MNEM_SNDNXT, + FW_FLOWC_MNEM_RCVNXT, + FW_FLOWC_MNEM_SNDBUF, + FW_FLOWC_MNEM_MSS, + FW_FLOWC_MNEM_TXDATAPLEN_MAX, + FW_FLOWC_MNEM_TCPSTATE, + FW_FLOWC_MNEM_EOSTATE, + FW_FLOWC_MNEM_SCHEDCLASS, + FW_FLOWC_MNEM_DCBPRIO, + FW_FLOWC_MNEM_SND_SCALE, + FW_FLOWC_MNEM_RCV_SCALE, +}; + +struct fw_flowc_mnemval { + u8 mnemonic; + u8 r4[3]; + __be32 val; +}; + +struct fw_flowc_wr { + __be32 op_to_nparams; + __be32 flowid_len16; + struct fw_flowc_mnemval mnemval[0]; +}; + +#define FW_FLOWC_WR_NPARAMS_S 0 +#define FW_FLOWC_WR_NPARAMS_V(x) ((x) << FW_FLOWC_WR_NPARAMS_S) + +struct fw_ofld_tx_data_wr { + __be32 op_to_immdlen; + __be32 flowid_len16; + __be32 plen; + __be32 tunnel_to_proxy; +}; + +#define FW_OFLD_TX_DATA_WR_TUNNEL_S 19 +#define FW_OFLD_TX_DATA_WR_TUNNEL_V(x) ((x) << FW_OFLD_TX_DATA_WR_TUNNEL_S) + +#define FW_OFLD_TX_DATA_WR_SAVE_S 18 +#define FW_OFLD_TX_DATA_WR_SAVE_V(x) ((x) << FW_OFLD_TX_DATA_WR_SAVE_S) + +#define FW_OFLD_TX_DATA_WR_FLUSH_S 17 +#define FW_OFLD_TX_DATA_WR_FLUSH_V(x) ((x) << FW_OFLD_TX_DATA_WR_FLUSH_S) +#define FW_OFLD_TX_DATA_WR_FLUSH_F FW_OFLD_TX_DATA_WR_FLUSH_V(1U) + +#define FW_OFLD_TX_DATA_WR_URGENT_S 16 +#define FW_OFLD_TX_DATA_WR_URGENT_V(x) ((x) << FW_OFLD_TX_DATA_WR_URGENT_S) + +#define FW_OFLD_TX_DATA_WR_MORE_S 15 +#define FW_OFLD_TX_DATA_WR_MORE_V(x) ((x) << FW_OFLD_TX_DATA_WR_MORE_S) + +#define FW_OFLD_TX_DATA_WR_SHOVE_S 14 +#define 
FW_OFLD_TX_DATA_WR_SHOVE_V(x) ((x) << FW_OFLD_TX_DATA_WR_SHOVE_S) +#define FW_OFLD_TX_DATA_WR_SHOVE_F FW_OFLD_TX_DATA_WR_SHOVE_V(1U) + +#define FW_OFLD_TX_DATA_WR_ULPMODE_S 10 +#define FW_OFLD_TX_DATA_WR_ULPMODE_V(x) ((x) << FW_OFLD_TX_DATA_WR_ULPMODE_S) + +#define FW_OFLD_TX_DATA_WR_ULPSUBMODE_S 6 +#define FW_OFLD_TX_DATA_WR_ULPSUBMODE_V(x) \ + ((x) << FW_OFLD_TX_DATA_WR_ULPSUBMODE_S) + +struct fw_cmd_wr { + __be32 op_dma; + __be32 len16_pkd; + __be64 cookie_daddr; +}; + +#define FW_CMD_WR_DMA_S 17 +#define FW_CMD_WR_DMA_V(x) ((x) << FW_CMD_WR_DMA_S) + +struct fw_eth_tx_pkt_vm_wr { + __be32 op_immdlen; + __be32 equiq_to_len16; + __be32 r3[2]; + u8 ethmacdst[6]; + u8 ethmacsrc[6]; + __be16 ethtype; + __be16 vlantci; +}; + +#define FW_CMD_MAX_TIMEOUT 10000 + +/* + * If a host driver does a HELLO and discovers that there's already a MASTER + * selected, we may have to wait for that MASTER to finish issuing RESET, + * configuration and INITIALIZE commands. Also, there's a possibility that + * our own HELLO may get lost if it happens right as the MASTER is issuing a + * RESET command, so we need to be willing to make a few retries of our HELLO. + */ +#define FW_CMD_HELLO_TIMEOUT (3 * FW_CMD_MAX_TIMEOUT) +#define FW_CMD_HELLO_RETRIES 3 + + +enum fw_cmd_opcodes { + FW_LDST_CMD = 0x01, + FW_RESET_CMD = 0x03, + FW_HELLO_CMD = 0x04, + FW_BYE_CMD = 0x05, + FW_INITIALIZE_CMD = 0x06, + FW_CAPS_CONFIG_CMD = 0x07, + FW_PARAMS_CMD = 0x08, + FW_PFVF_CMD = 0x09, + FW_IQ_CMD = 0x10, + FW_EQ_MNGT_CMD = 0x11, + FW_EQ_ETH_CMD = 0x12, + FW_EQ_CTRL_CMD = 0x13, + FW_EQ_OFLD_CMD = 0x21, + FW_VI_CMD = 0x14, + FW_VI_MAC_CMD = 0x15, + FW_VI_RXMODE_CMD = 0x16, + FW_VI_ENABLE_CMD = 0x17, + FW_ACL_MAC_CMD = 0x18, + FW_ACL_VLAN_CMD = 0x19, + FW_VI_STATS_CMD = 0x1a, + FW_PORT_CMD = 0x1b, + FW_PORT_STATS_CMD = 0x1c, + FW_PORT_LB_STATS_CMD = 0x1d, + FW_PORT_TRACE_CMD = 0x1e, + FW_PORT_TRACE_MMAP_CMD = 0x1f, + FW_RSS_IND_TBL_CMD = 0x20, + FW_RSS_GLB_CONFIG_CMD = 0x22, + FW_RSS_VI_CONFIG_CMD = 0x23, + FW_DEVLOG_CMD = 0x25, + FW_CLIP_CMD = 0x28, + FW_LASTC2E_CMD = 0x40, + FW_ERROR_CMD = 0x80, + FW_DEBUG_CMD = 0x81, +}; + +enum fw_cmd_cap { + FW_CMD_CAP_PF = 0x01, + FW_CMD_CAP_DMAQ = 0x02, + FW_CMD_CAP_PORT = 0x04, + FW_CMD_CAP_PORTPROMISC = 0x08, + FW_CMD_CAP_PORTSTATS = 0x10, + FW_CMD_CAP_VF = 0x80, +}; + +/* + * Generic command header flit0 + */ +struct fw_cmd_hdr { + __be32 hi; + __be32 lo; +}; + +#define FW_CMD_OP_S 24 +#define FW_CMD_OP_M 0xff +#define FW_CMD_OP_V(x) ((x) << FW_CMD_OP_S) +#define FW_CMD_OP_G(x) (((x) >> FW_CMD_OP_S) & FW_CMD_OP_M) + +#define FW_CMD_REQUEST_S 23 +#define FW_CMD_REQUEST_V(x) ((x) << FW_CMD_REQUEST_S) +#define FW_CMD_REQUEST_F FW_CMD_REQUEST_V(1U) + +#define FW_CMD_READ_S 22 +#define FW_CMD_READ_V(x) ((x) << FW_CMD_READ_S) +#define FW_CMD_READ_F FW_CMD_READ_V(1U) + +#define FW_CMD_WRITE_S 21 +#define FW_CMD_WRITE_V(x) ((x) << FW_CMD_WRITE_S) +#define FW_CMD_WRITE_F FW_CMD_WRITE_V(1U) + +#define FW_CMD_EXEC_S 20 +#define FW_CMD_EXEC_V(x) ((x) << FW_CMD_EXEC_S) +#define FW_CMD_EXEC_F FW_CMD_EXEC_V(1U) + +#define FW_CMD_RAMASK_S 20 +#define FW_CMD_RAMASK_V(x) ((x) << FW_CMD_RAMASK_S) + +#define FW_CMD_RETVAL_S 8 +#define FW_CMD_RETVAL_M 0xff +#define FW_CMD_RETVAL_V(x) ((x) << FW_CMD_RETVAL_S) +#define FW_CMD_RETVAL_G(x) (((x) >> FW_CMD_RETVAL_S) & FW_CMD_RETVAL_M) + +#define FW_CMD_LEN16_S 0 +#define FW_CMD_LEN16_V(x) ((x) << FW_CMD_LEN16_S) + +#define FW_LEN16(fw_struct) FW_CMD_LEN16_V(sizeof(fw_struct) / 16) + +enum fw_ldst_addrspc { + FW_LDST_ADDRSPC_FIRMWARE = 0x0001, +
FW_LDST_ADDRSPC_SGE_EGRC = 0x0008, + FW_LDST_ADDRSPC_SGE_INGC = 0x0009, + FW_LDST_ADDRSPC_SGE_FLMC = 0x000a, + FW_LDST_ADDRSPC_SGE_CONMC = 0x000b, + FW_LDST_ADDRSPC_TP_PIO = 0x0010, + FW_LDST_ADDRSPC_TP_TM_PIO = 0x0011, + FW_LDST_ADDRSPC_TP_MIB = 0x0012, + FW_LDST_ADDRSPC_MDIO = 0x0018, + FW_LDST_ADDRSPC_MPS = 0x0020, + FW_LDST_ADDRSPC_FUNC = 0x0028, + FW_LDST_ADDRSPC_FUNC_PCIE = 0x0029, +}; + +enum fw_ldst_mps_fid { + FW_LDST_MPS_ATRB, + FW_LDST_MPS_RPLC +}; + +enum fw_ldst_func_access_ctl { + FW_LDST_FUNC_ACC_CTL_VIID, + FW_LDST_FUNC_ACC_CTL_FID +}; + +enum fw_ldst_func_mod_index { + FW_LDST_FUNC_MPS +}; + +struct fw_ldst_cmd { + __be32 op_to_addrspace; + __be32 cycles_to_len16; + union fw_ldst { + struct fw_ldst_addrval { + __be32 addr; + __be32 val; + } addrval; + struct fw_ldst_idctxt { + __be32 physid; + __be32 msg_ctxtflush; + __be32 ctxt_data7; + __be32 ctxt_data6; + __be32 ctxt_data5; + __be32 ctxt_data4; + __be32 ctxt_data3; + __be32 ctxt_data2; + __be32 ctxt_data1; + __be32 ctxt_data0; + } idctxt; + struct fw_ldst_mdio { + __be16 paddr_mmd; + __be16 raddr; + __be16 vctl; + __be16 rval; + } mdio; + struct fw_ldst_cim_rq { + u8 req_first64[8]; + u8 req_second64[8]; + u8 resp_first64[8]; + u8 resp_second64[8]; + __be32 r3[2]; + } cim_rq; + union fw_ldst_mps { + struct fw_ldst_mps_rplc { + __be16 fid_idx; + __be16 rplcpf_pkd; + __be32 rplc255_224; + __be32 rplc223_192; + __be32 rplc191_160; + __be32 rplc159_128; + __be32 rplc127_96; + __be32 rplc95_64; + __be32 rplc63_32; + __be32 rplc31_0; + } rplc; + struct fw_ldst_mps_atrb { + __be16 fid_mpsid; + __be16 r2[3]; + __be32 r3[2]; + __be32 r4; + __be32 atrb; + __be16 vlan[16]; + } atrb; + } mps; + struct fw_ldst_func { + u8 access_ctl; + u8 mod_index; + __be16 ctl_id; + __be32 offset; + __be64 data0; + __be64 data1; + } func; + struct fw_ldst_pcie { + u8 ctrl_to_fn; + u8 bnum; + u8 r; + u8 ext_r; + u8 select_naccess; + u8 pcie_fn; + __be16 nset_pkd; + __be32 data[12]; + } pcie; + struct fw_ldst_i2c_deprecated { + u8 pid_pkd; + u8 base; + u8 boffset; + u8 data; + __be32 r9; + } i2c_deprecated; + struct fw_ldst_i2c { + u8 pid; + u8 did; + u8 boffset; + u8 blen; + __be32 r9; + __u8 data[48]; + } i2c; + struct fw_ldst_le { + __be32 index; + __be32 r9; + u8 val[33]; + u8 r11[7]; + } le; + } u; +}; + +#define FW_LDST_CMD_ADDRSPACE_S 0 +#define FW_LDST_CMD_ADDRSPACE_V(x) ((x) << FW_LDST_CMD_ADDRSPACE_S) + +#define FW_LDST_CMD_MSG_S 31 +#define FW_LDST_CMD_MSG_V(x) ((x) << FW_LDST_CMD_MSG_S) + +#define FW_LDST_CMD_CTXTFLUSH_S 30 +#define FW_LDST_CMD_CTXTFLUSH_V(x) ((x) << FW_LDST_CMD_CTXTFLUSH_S) +#define FW_LDST_CMD_CTXTFLUSH_F FW_LDST_CMD_CTXTFLUSH_V(1U) + +#define FW_LDST_CMD_PADDR_S 8 +#define FW_LDST_CMD_PADDR_V(x) ((x) << FW_LDST_CMD_PADDR_S) + +#define FW_LDST_CMD_MMD_S 0 +#define FW_LDST_CMD_MMD_V(x) ((x) << FW_LDST_CMD_MMD_S) + +#define FW_LDST_CMD_FID_S 15 +#define FW_LDST_CMD_FID_V(x) ((x) << FW_LDST_CMD_FID_S) + +#define FW_LDST_CMD_IDX_S 0 +#define FW_LDST_CMD_IDX_V(x) ((x) << FW_LDST_CMD_IDX_S) + +#define FW_LDST_CMD_RPLCPF_S 0 +#define FW_LDST_CMD_RPLCPF_V(x) ((x) << FW_LDST_CMD_RPLCPF_S) + +#define FW_LDST_CMD_LC_S 4 +#define FW_LDST_CMD_LC_V(x) ((x) << FW_LDST_CMD_LC_S) +#define FW_LDST_CMD_LC_F FW_LDST_CMD_LC_V(1U) + +#define FW_LDST_CMD_FN_S 0 +#define FW_LDST_CMD_FN_V(x) ((x) << FW_LDST_CMD_FN_S) + +#define FW_LDST_CMD_NACCESS_S 0 +#define FW_LDST_CMD_NACCESS_V(x) ((x) << FW_LDST_CMD_NACCESS_S) + +struct fw_reset_cmd { + __be32 op_to_write; + __be32 retval_len16; + __be32 val; + __be32 halt_pkd; +}; + +#define 
FW_RESET_CMD_HALT_S 31 +#define FW_RESET_CMD_HALT_M 0x1 +#define FW_RESET_CMD_HALT_V(x) ((x) << FW_RESET_CMD_HALT_S) +#define FW_RESET_CMD_HALT_G(x) \ + (((x) >> FW_RESET_CMD_HALT_S) & FW_RESET_CMD_HALT_M) +#define FW_RESET_CMD_HALT_F FW_RESET_CMD_HALT_V(1U) + +enum fw_hellow_cmd { + fw_hello_cmd_stage_os = 0x0 +}; + +struct fw_hello_cmd { + __be32 op_to_write; + __be32 retval_len16; + __be32 err_to_clearinit; + __be32 fwrev; +}; + +#define FW_HELLO_CMD_ERR_S 31 +#define FW_HELLO_CMD_ERR_V(x) ((x) << FW_HELLO_CMD_ERR_S) +#define FW_HELLO_CMD_ERR_F FW_HELLO_CMD_ERR_V(1U) + +#define FW_HELLO_CMD_INIT_S 30 +#define FW_HELLO_CMD_INIT_V(x) ((x) << FW_HELLO_CMD_INIT_S) +#define FW_HELLO_CMD_INIT_F FW_HELLO_CMD_INIT_V(1U) + +#define FW_HELLO_CMD_MASTERDIS_S 29 +#define FW_HELLO_CMD_MASTERDIS_V(x) ((x) << FW_HELLO_CMD_MASTERDIS_S) + +#define FW_HELLO_CMD_MASTERFORCE_S 28 +#define FW_HELLO_CMD_MASTERFORCE_V(x) ((x) << FW_HELLO_CMD_MASTERFORCE_S) + +#define FW_HELLO_CMD_MBMASTER_S 24 +#define FW_HELLO_CMD_MBMASTER_M 0xfU +#define FW_HELLO_CMD_MBMASTER_V(x) ((x) << FW_HELLO_CMD_MBMASTER_S) +#define FW_HELLO_CMD_MBMASTER_G(x) \ + (((x) >> FW_HELLO_CMD_MBMASTER_S) & FW_HELLO_CMD_MBMASTER_M) + +#define FW_HELLO_CMD_MBASYNCNOTINT_S 23 +#define FW_HELLO_CMD_MBASYNCNOTINT_V(x) ((x) << FW_HELLO_CMD_MBASYNCNOTINT_S) + +#define FW_HELLO_CMD_MBASYNCNOT_S 20 +#define FW_HELLO_CMD_MBASYNCNOT_V(x) ((x) << FW_HELLO_CMD_MBASYNCNOT_S) + +#define FW_HELLO_CMD_STAGE_S 17 +#define FW_HELLO_CMD_STAGE_V(x) ((x) << FW_HELLO_CMD_STAGE_S) + +#define FW_HELLO_CMD_CLEARINIT_S 16 +#define FW_HELLO_CMD_CLEARINIT_V(x) ((x) << FW_HELLO_CMD_CLEARINIT_S) +#define FW_HELLO_CMD_CLEARINIT_F FW_HELLO_CMD_CLEARINIT_V(1U) + +struct fw_bye_cmd { + __be32 op_to_write; + __be32 retval_len16; + __be64 r3; +}; + +struct fw_initialize_cmd { + __be32 op_to_write; + __be32 retval_len16; + __be64 r3; +}; + +enum fw_caps_config_hm { + FW_CAPS_CONFIG_HM_PCIE = 0x00000001, + FW_CAPS_CONFIG_HM_PL = 0x00000002, + FW_CAPS_CONFIG_HM_SGE = 0x00000004, + FW_CAPS_CONFIG_HM_CIM = 0x00000008, + FW_CAPS_CONFIG_HM_ULPTX = 0x00000010, + FW_CAPS_CONFIG_HM_TP = 0x00000020, + FW_CAPS_CONFIG_HM_ULPRX = 0x00000040, + FW_CAPS_CONFIG_HM_PMRX = 0x00000080, + FW_CAPS_CONFIG_HM_PMTX = 0x00000100, + FW_CAPS_CONFIG_HM_MC = 0x00000200, + FW_CAPS_CONFIG_HM_LE = 0x00000400, + FW_CAPS_CONFIG_HM_MPS = 0x00000800, + FW_CAPS_CONFIG_HM_XGMAC = 0x00001000, + FW_CAPS_CONFIG_HM_CPLSWITCH = 0x00002000, + FW_CAPS_CONFIG_HM_T4DBG = 0x00004000, + FW_CAPS_CONFIG_HM_MI = 0x00008000, + FW_CAPS_CONFIG_HM_I2CM = 0x00010000, + FW_CAPS_CONFIG_HM_NCSI = 0x00020000, + FW_CAPS_CONFIG_HM_SMB = 0x00040000, + FW_CAPS_CONFIG_HM_MA = 0x00080000, + FW_CAPS_CONFIG_HM_EDRAM = 0x00100000, + FW_CAPS_CONFIG_HM_PMU = 0x00200000, + FW_CAPS_CONFIG_HM_UART = 0x00400000, + FW_CAPS_CONFIG_HM_SF = 0x00800000, +}; + +enum fw_caps_config_nbm { + FW_CAPS_CONFIG_NBM_IPMI = 0x00000001, + FW_CAPS_CONFIG_NBM_NCSI = 0x00000002, +}; + +enum fw_caps_config_link { + FW_CAPS_CONFIG_LINK_PPP = 0x00000001, + FW_CAPS_CONFIG_LINK_QFC = 0x00000002, + FW_CAPS_CONFIG_LINK_DCBX = 0x00000004, +}; + +enum fw_caps_config_switch { + FW_CAPS_CONFIG_SWITCH_INGRESS = 0x00000001, + FW_CAPS_CONFIG_SWITCH_EGRESS = 0x00000002, +}; + +enum fw_caps_config_nic { + FW_CAPS_CONFIG_NIC = 0x00000001, + FW_CAPS_CONFIG_NIC_VM = 0x00000002, +}; + +enum fw_caps_config_ofld { + FW_CAPS_CONFIG_OFLD = 0x00000001, +}; + +enum fw_caps_config_rdma { + FW_CAPS_CONFIG_RDMA_RDDP = 0x00000001, + FW_CAPS_CONFIG_RDMA_RDMAC = 0x00000002, +}; + +enum 
fw_caps_config_iscsi { + FW_CAPS_CONFIG_ISCSI_INITIATOR_PDU = 0x00000001, + FW_CAPS_CONFIG_ISCSI_TARGET_PDU = 0x00000002, + FW_CAPS_CONFIG_ISCSI_INITIATOR_CNXOFLD = 0x00000004, + FW_CAPS_CONFIG_ISCSI_TARGET_CNXOFLD = 0x00000008, +}; + +enum fw_caps_config_fcoe { + FW_CAPS_CONFIG_FCOE_INITIATOR = 0x00000001, + FW_CAPS_CONFIG_FCOE_TARGET = 0x00000002, + FW_CAPS_CONFIG_FCOE_CTRL_OFLD = 0x00000004, +}; + +enum fw_memtype_cf { + FW_MEMTYPE_CF_EDC0 = 0x0, + FW_MEMTYPE_CF_EDC1 = 0x1, + FW_MEMTYPE_CF_EXTMEM = 0x2, + FW_MEMTYPE_CF_FLASH = 0x4, + FW_MEMTYPE_CF_INTERNAL = 0x5, + FW_MEMTYPE_CF_EXTMEM1 = 0x6, +}; + +struct fw_caps_config_cmd { + __be32 op_to_write; + __be32 cfvalid_to_len16; + __be32 r2; + __be32 hwmbitmap; + __be16 nbmcaps; + __be16 linkcaps; + __be16 switchcaps; + __be16 r3; + __be16 niccaps; + __be16 ofldcaps; + __be16 rdmacaps; + __be16 r4; + __be16 iscsicaps; + __be16 fcoecaps; + __be32 cfcsum; + __be32 finiver; + __be32 finicsum; +}; + +#define FW_CAPS_CONFIG_CMD_CFVALID_S 27 +#define FW_CAPS_CONFIG_CMD_CFVALID_V(x) ((x) << FW_CAPS_CONFIG_CMD_CFVALID_S) +#define FW_CAPS_CONFIG_CMD_CFVALID_F FW_CAPS_CONFIG_CMD_CFVALID_V(1U) + +#define FW_CAPS_CONFIG_CMD_MEMTYPE_CF_S 24 +#define FW_CAPS_CONFIG_CMD_MEMTYPE_CF_V(x) \ + ((x) << FW_CAPS_CONFIG_CMD_MEMTYPE_CF_S) + +#define FW_CAPS_CONFIG_CMD_MEMADDR64K_CF_S 16 +#define FW_CAPS_CONFIG_CMD_MEMADDR64K_CF_V(x) \ + ((x) << FW_CAPS_CONFIG_CMD_MEMADDR64K_CF_S) + +/* + * params command mnemonics + */ +enum fw_params_mnem { + FW_PARAMS_MNEM_DEV = 1, /* device params */ + FW_PARAMS_MNEM_PFVF = 2, /* function params */ + FW_PARAMS_MNEM_REG = 3, /* limited register access */ + FW_PARAMS_MNEM_DMAQ = 4, /* dma queue params */ + FW_PARAMS_MNEM_CHNET = 5, /* chnet params */ + FW_PARAMS_MNEM_LAST +}; + +/* + * device parameters + */ +enum fw_params_param_dev { + FW_PARAMS_PARAM_DEV_CCLK = 0x00, /* chip core clock in khz */ + FW_PARAMS_PARAM_DEV_PORTVEC = 0x01, /* the port vector */ + FW_PARAMS_PARAM_DEV_NTID = 0x02, /* reads the number of TIDs + * allocated by the device's + * Lookup Engine + */ + FW_PARAMS_PARAM_DEV_FLOWC_BUFFIFO_SZ = 0x03, + FW_PARAMS_PARAM_DEV_INTVER_NIC = 0x04, + FW_PARAMS_PARAM_DEV_INTVER_VNIC = 0x05, + FW_PARAMS_PARAM_DEV_INTVER_OFLD = 0x06, + FW_PARAMS_PARAM_DEV_INTVER_RI = 0x07, + FW_PARAMS_PARAM_DEV_INTVER_ISCSIPDU = 0x08, + FW_PARAMS_PARAM_DEV_INTVER_ISCSI = 0x09, + FW_PARAMS_PARAM_DEV_INTVER_FCOE = 0x0A, + FW_PARAMS_PARAM_DEV_FWREV = 0x0B, + FW_PARAMS_PARAM_DEV_TPREV = 0x0C, + FW_PARAMS_PARAM_DEV_CF = 0x0D, + FW_PARAMS_PARAM_DEV_PHYFW = 0x0F, + FW_PARAMS_PARAM_DEV_DIAG = 0x11, + FW_PARAMS_PARAM_DEV_MAXORDIRD_QP = 0x13, /* max supported QP IRD/ORD */ + FW_PARAMS_PARAM_DEV_MAXIRD_ADAPTER = 0x14, /* max supported adap IRD */ + FW_PARAMS_PARAM_DEV_ULPTX_MEMWRITE_DSGL = 0x17, + FW_PARAMS_PARAM_DEV_FWCACHE = 0x18, +}; + +/* + * physical and virtual function parameters + */ +enum fw_params_param_pfvf { + FW_PARAMS_PARAM_PFVF_RWXCAPS = 0x00, + FW_PARAMS_PARAM_PFVF_ROUTE_START = 0x01, + FW_PARAMS_PARAM_PFVF_ROUTE_END = 0x02, + FW_PARAMS_PARAM_PFVF_CLIP_START = 0x03, + FW_PARAMS_PARAM_PFVF_CLIP_END = 0x04, + FW_PARAMS_PARAM_PFVF_FILTER_START = 0x05, + FW_PARAMS_PARAM_PFVF_FILTER_END = 0x06, + FW_PARAMS_PARAM_PFVF_SERVER_START = 0x07, + FW_PARAMS_PARAM_PFVF_SERVER_END = 0x08, + FW_PARAMS_PARAM_PFVF_TDDP_START = 0x09, + FW_PARAMS_PARAM_PFVF_TDDP_END = 0x0A, + FW_PARAMS_PARAM_PFVF_ISCSI_START = 0x0B, + FW_PARAMS_PARAM_PFVF_ISCSI_END = 0x0C, + FW_PARAMS_PARAM_PFVF_STAG_START = 0x0D, + FW_PARAMS_PARAM_PFVF_STAG_END = 0x0E, + 
FW_PARAMS_PARAM_PFVF_RQ_START = 0x1F, + FW_PARAMS_PARAM_PFVF_RQ_END = 0x10, + FW_PARAMS_PARAM_PFVF_PBL_START = 0x11, + FW_PARAMS_PARAM_PFVF_PBL_END = 0x12, + FW_PARAMS_PARAM_PFVF_L2T_START = 0x13, + FW_PARAMS_PARAM_PFVF_L2T_END = 0x14, + FW_PARAMS_PARAM_PFVF_SQRQ_START = 0x15, + FW_PARAMS_PARAM_PFVF_SQRQ_END = 0x16, + FW_PARAMS_PARAM_PFVF_CQ_START = 0x17, + FW_PARAMS_PARAM_PFVF_CQ_END = 0x18, + FW_PARAMS_PARAM_PFVF_SRQ_START = 0x19, + FW_PARAMS_PARAM_PFVF_SRQ_END = 0x1A, + FW_PARAMS_PARAM_PFVF_SCHEDCLASS_ETH = 0x20, + FW_PARAMS_PARAM_PFVF_VIID = 0x24, + FW_PARAMS_PARAM_PFVF_CPMASK = 0x25, + FW_PARAMS_PARAM_PFVF_OCQ_START = 0x26, + FW_PARAMS_PARAM_PFVF_OCQ_END = 0x27, + FW_PARAMS_PARAM_PFVF_CONM_MAP = 0x28, + FW_PARAMS_PARAM_PFVF_IQFLINT_START = 0x29, + FW_PARAMS_PARAM_PFVF_IQFLINT_END = 0x2A, + FW_PARAMS_PARAM_PFVF_EQ_START = 0x2B, + FW_PARAMS_PARAM_PFVF_EQ_END = 0x2C, + FW_PARAMS_PARAM_PFVF_ACTIVE_FILTER_START = 0x2D, + FW_PARAMS_PARAM_PFVF_ACTIVE_FILTER_END = 0x2E, + FW_PARAMS_PARAM_PFVF_ETHOFLD_END = 0x30, + FW_PARAMS_PARAM_PFVF_CPLFW4MSG_ENCAP = 0x31 +}; + +/* + * dma queue parameters + */ +enum fw_params_param_dmaq { + FW_PARAMS_PARAM_DMAQ_IQ_DCAEN_DCACPU = 0x00, + FW_PARAMS_PARAM_DMAQ_IQ_INTCNTTHRESH = 0x01, + FW_PARAMS_PARAM_DMAQ_EQ_CMPLIQID_MNGT = 0x10, + FW_PARAMS_PARAM_DMAQ_EQ_CMPLIQID_CTRL = 0x11, + FW_PARAMS_PARAM_DMAQ_EQ_SCHEDCLASS_ETH = 0x12, + FW_PARAMS_PARAM_DMAQ_EQ_DCBPRIO_ETH = 0x13, + FW_PARAMS_PARAM_DMAQ_CONM_CTXT = 0x20, +}; + +enum fw_params_param_dev_phyfw { + FW_PARAMS_PARAM_DEV_PHYFW_DOWNLOAD = 0x00, + FW_PARAMS_PARAM_DEV_PHYFW_VERSION = 0x01, +}; + +enum fw_params_param_dev_diag { + FW_PARAM_DEV_DIAG_TMP = 0x00, + FW_PARAM_DEV_DIAG_VDD = 0x01, +}; + +enum fw_params_param_dev_fwcache { + FW_PARAM_DEV_FWCACHE_FLUSH = 0x00, + FW_PARAM_DEV_FWCACHE_FLUSHINV = 0x01, +}; + +#define FW_PARAMS_MNEM_S 24 +#define FW_PARAMS_MNEM_V(x) ((x) << FW_PARAMS_MNEM_S) + +#define FW_PARAMS_PARAM_X_S 16 +#define FW_PARAMS_PARAM_X_V(x) ((x) << FW_PARAMS_PARAM_X_S) + +#define FW_PARAMS_PARAM_Y_S 8 +#define FW_PARAMS_PARAM_Y_M 0xffU +#define FW_PARAMS_PARAM_Y_V(x) ((x) << FW_PARAMS_PARAM_Y_S) +#define FW_PARAMS_PARAM_Y_G(x) (((x) >> FW_PARAMS_PARAM_Y_S) &\ + FW_PARAMS_PARAM_Y_M) + +#define FW_PARAMS_PARAM_Z_S 0 +#define FW_PARAMS_PARAM_Z_M 0xffu +#define FW_PARAMS_PARAM_Z_V(x) ((x) << FW_PARAMS_PARAM_Z_S) +#define FW_PARAMS_PARAM_Z_G(x) (((x) >> FW_PARAMS_PARAM_Z_S) &\ + FW_PARAMS_PARAM_Z_M) + +#define FW_PARAMS_PARAM_XYZ_S 0 +#define FW_PARAMS_PARAM_XYZ_V(x) ((x) << FW_PARAMS_PARAM_XYZ_S) + +#define FW_PARAMS_PARAM_YZ_S 0 +#define FW_PARAMS_PARAM_YZ_V(x) ((x) << FW_PARAMS_PARAM_YZ_S) + +struct fw_params_cmd { + __be32 op_to_vfn; + __be32 retval_len16; + struct fw_params_param { + __be32 mnem; + __be32 val; + } param[7]; +}; + +#define FW_PARAMS_CMD_PFN_S 8 +#define FW_PARAMS_CMD_PFN_V(x) ((x) << FW_PARAMS_CMD_PFN_S) + +#define FW_PARAMS_CMD_VFN_S 0 +#define FW_PARAMS_CMD_VFN_V(x) ((x) << FW_PARAMS_CMD_VFN_S) + +struct fw_pfvf_cmd { + __be32 op_to_vfn; + __be32 retval_len16; + __be32 niqflint_niq; + __be32 type_to_neq; + __be32 tc_to_nexactf; + __be32 r_caps_to_nethctrl; + __be16 nricq; + __be16 nriqp; + __be32 r4; +}; + +#define FW_PFVF_CMD_PFN_S 8 +#define FW_PFVF_CMD_PFN_V(x) ((x) << FW_PFVF_CMD_PFN_S) + +#define FW_PFVF_CMD_VFN_S 0 +#define FW_PFVF_CMD_VFN_V(x) ((x) << FW_PFVF_CMD_VFN_S) + +#define FW_PFVF_CMD_NIQFLINT_S 20 +#define FW_PFVF_CMD_NIQFLINT_M 0xfff +#define FW_PFVF_CMD_NIQFLINT_V(x) ((x) << FW_PFVF_CMD_NIQFLINT_S) +#define FW_PFVF_CMD_NIQFLINT_G(x) \ + (((x) >> 
FW_PFVF_CMD_NIQFLINT_S) & FW_PFVF_CMD_NIQFLINT_M) + +#define FW_PFVF_CMD_NIQ_S 0 +#define FW_PFVF_CMD_NIQ_M 0xfffff +#define FW_PFVF_CMD_NIQ_V(x) ((x) << FW_PFVF_CMD_NIQ_S) +#define FW_PFVF_CMD_NIQ_G(x) \ + (((x) >> FW_PFVF_CMD_NIQ_S) & FW_PFVF_CMD_NIQ_M) + +#define FW_PFVF_CMD_TYPE_S 31 +#define FW_PFVF_CMD_TYPE_M 0x1 +#define FW_PFVF_CMD_TYPE_V(x) ((x) << FW_PFVF_CMD_TYPE_S) +#define FW_PFVF_CMD_TYPE_G(x) \ + (((x) >> FW_PFVF_CMD_TYPE_S) & FW_PFVF_CMD_TYPE_M) +#define FW_PFVF_CMD_TYPE_F FW_PFVF_CMD_TYPE_V(1U) + +#define FW_PFVF_CMD_CMASK_S 24 +#define FW_PFVF_CMD_CMASK_M 0xf +#define FW_PFVF_CMD_CMASK_V(x) ((x) << FW_PFVF_CMD_CMASK_S) +#define FW_PFVF_CMD_CMASK_G(x) \ + (((x) >> FW_PFVF_CMD_CMASK_S) & FW_PFVF_CMD_CMASK_M) + +#define FW_PFVF_CMD_PMASK_S 20 +#define FW_PFVF_CMD_PMASK_M 0xf +#define FW_PFVF_CMD_PMASK_V(x) ((x) << FW_PFVF_CMD_PMASK_S) +#define FW_PFVF_CMD_PMASK_G(x) \ + (((x) >> FW_PFVF_CMD_PMASK_S) & FW_PFVF_CMD_PMASK_M) + +#define FW_PFVF_CMD_NEQ_S 0 +#define FW_PFVF_CMD_NEQ_M 0xfffff +#define FW_PFVF_CMD_NEQ_V(x) ((x) << FW_PFVF_CMD_NEQ_S) +#define FW_PFVF_CMD_NEQ_G(x) \ + (((x) >> FW_PFVF_CMD_NEQ_S) & FW_PFVF_CMD_NEQ_M) + +#define FW_PFVF_CMD_TC_S 24 +#define FW_PFVF_CMD_TC_M 0xff +#define FW_PFVF_CMD_TC_V(x) ((x) << FW_PFVF_CMD_TC_S) +#define FW_PFVF_CMD_TC_G(x) (((x) >> FW_PFVF_CMD_TC_S) & FW_PFVF_CMD_TC_M) + +#define FW_PFVF_CMD_NVI_S 16 +#define FW_PFVF_CMD_NVI_M 0xff +#define FW_PFVF_CMD_NVI_V(x) ((x) << FW_PFVF_CMD_NVI_S) +#define FW_PFVF_CMD_NVI_G(x) (((x) >> FW_PFVF_CMD_NVI_S) & FW_PFVF_CMD_NVI_M) + +#define FW_PFVF_CMD_NEXACTF_S 0 +#define FW_PFVF_CMD_NEXACTF_M 0xffff +#define FW_PFVF_CMD_NEXACTF_V(x) ((x) << FW_PFVF_CMD_NEXACTF_S) +#define FW_PFVF_CMD_NEXACTF_G(x) \ + (((x) >> FW_PFVF_CMD_NEXACTF_S) & FW_PFVF_CMD_NEXACTF_M) + +#define FW_PFVF_CMD_R_CAPS_S 24 +#define FW_PFVF_CMD_R_CAPS_M 0xff +#define FW_PFVF_CMD_R_CAPS_V(x) ((x) << FW_PFVF_CMD_R_CAPS_S) +#define FW_PFVF_CMD_R_CAPS_G(x) \ + (((x) >> FW_PFVF_CMD_R_CAPS_S) & FW_PFVF_CMD_R_CAPS_M) + +#define FW_PFVF_CMD_WX_CAPS_S 16 +#define FW_PFVF_CMD_WX_CAPS_M 0xff +#define FW_PFVF_CMD_WX_CAPS_V(x) ((x) << FW_PFVF_CMD_WX_CAPS_S) +#define FW_PFVF_CMD_WX_CAPS_G(x) \ + (((x) >> FW_PFVF_CMD_WX_CAPS_S) & FW_PFVF_CMD_WX_CAPS_M) + +#define FW_PFVF_CMD_NETHCTRL_S 0 +#define FW_PFVF_CMD_NETHCTRL_M 0xffff +#define FW_PFVF_CMD_NETHCTRL_V(x) ((x) << FW_PFVF_CMD_NETHCTRL_S) +#define FW_PFVF_CMD_NETHCTRL_G(x) \ + (((x) >> FW_PFVF_CMD_NETHCTRL_S) & FW_PFVF_CMD_NETHCTRL_M) + +enum fw_iq_type { + FW_IQ_TYPE_FL_INT_CAP, + FW_IQ_TYPE_NO_FL_INT_CAP +}; + +struct fw_iq_cmd { + __be32 op_to_vfn; + __be32 alloc_to_len16; + __be16 physiqid; + __be16 iqid; + __be16 fl0id; + __be16 fl1id; + __be32 type_to_iqandstindex; + __be16 iqdroprss_to_iqesize; + __be16 iqsize; + __be64 iqaddr; + __be32 iqns_to_fl0congen; + __be16 fl0dcaen_to_fl0cidxfthresh; + __be16 fl0size; + __be64 fl0addr; + __be32 fl1cngchmap_to_fl1congen; + __be16 fl1dcaen_to_fl1cidxfthresh; + __be16 fl1size; + __be64 fl1addr; +}; + +#define FW_IQ_CMD_PFN_S 8 +#define FW_IQ_CMD_PFN_V(x) ((x) << FW_IQ_CMD_PFN_S) + +#define FW_IQ_CMD_VFN_S 0 +#define FW_IQ_CMD_VFN_V(x) ((x) << FW_IQ_CMD_VFN_S) + +#define FW_IQ_CMD_ALLOC_S 31 +#define FW_IQ_CMD_ALLOC_V(x) ((x) << FW_IQ_CMD_ALLOC_S) +#define FW_IQ_CMD_ALLOC_F FW_IQ_CMD_ALLOC_V(1U) + +#define FW_IQ_CMD_FREE_S 30 +#define FW_IQ_CMD_FREE_V(x) ((x) << FW_IQ_CMD_FREE_S) +#define FW_IQ_CMD_FREE_F FW_IQ_CMD_FREE_V(1U) + +#define FW_IQ_CMD_MODIFY_S 29 +#define FW_IQ_CMD_MODIFY_V(x) ((x) << FW_IQ_CMD_MODIFY_S) +#define FW_IQ_CMD_MODIFY_F 
FW_IQ_CMD_MODIFY_V(1U) + +#define FW_IQ_CMD_IQSTART_S 28 +#define FW_IQ_CMD_IQSTART_V(x) ((x) << FW_IQ_CMD_IQSTART_S) +#define FW_IQ_CMD_IQSTART_F FW_IQ_CMD_IQSTART_V(1U) + +#define FW_IQ_CMD_IQSTOP_S 27 +#define FW_IQ_CMD_IQSTOP_V(x) ((x) << FW_IQ_CMD_IQSTOP_S) +#define FW_IQ_CMD_IQSTOP_F FW_IQ_CMD_IQSTOP_V(1U) + +#define FW_IQ_CMD_TYPE_S 29 +#define FW_IQ_CMD_TYPE_V(x) ((x) << FW_IQ_CMD_TYPE_S) + +#define FW_IQ_CMD_IQASYNCH_S 28 +#define FW_IQ_CMD_IQASYNCH_V(x) ((x) << FW_IQ_CMD_IQASYNCH_S) + +#define FW_IQ_CMD_VIID_S 16 +#define FW_IQ_CMD_VIID_V(x) ((x) << FW_IQ_CMD_VIID_S) + +#define FW_IQ_CMD_IQANDST_S 15 +#define FW_IQ_CMD_IQANDST_V(x) ((x) << FW_IQ_CMD_IQANDST_S) + +#define FW_IQ_CMD_IQANUS_S 14 +#define FW_IQ_CMD_IQANUS_V(x) ((x) << FW_IQ_CMD_IQANUS_S) + +#define FW_IQ_CMD_IQANUD_S 12 +#define FW_IQ_CMD_IQANUD_V(x) ((x) << FW_IQ_CMD_IQANUD_S) + +#define FW_IQ_CMD_IQANDSTINDEX_S 0 +#define FW_IQ_CMD_IQANDSTINDEX_V(x) ((x) << FW_IQ_CMD_IQANDSTINDEX_S) + +#define FW_IQ_CMD_IQDROPRSS_S 15 +#define FW_IQ_CMD_IQDROPRSS_V(x) ((x) << FW_IQ_CMD_IQDROPRSS_S) +#define FW_IQ_CMD_IQDROPRSS_F FW_IQ_CMD_IQDROPRSS_V(1U) + +#define FW_IQ_CMD_IQGTSMODE_S 14 +#define FW_IQ_CMD_IQGTSMODE_V(x) ((x) << FW_IQ_CMD_IQGTSMODE_S) +#define FW_IQ_CMD_IQGTSMODE_F FW_IQ_CMD_IQGTSMODE_V(1U) + +#define FW_IQ_CMD_IQPCIECH_S 12 +#define FW_IQ_CMD_IQPCIECH_V(x) ((x) << FW_IQ_CMD_IQPCIECH_S) + +#define FW_IQ_CMD_IQDCAEN_S 11 +#define FW_IQ_CMD_IQDCAEN_V(x) ((x) << FW_IQ_CMD_IQDCAEN_S) + +#define FW_IQ_CMD_IQDCACPU_S 6 +#define FW_IQ_CMD_IQDCACPU_V(x) ((x) << FW_IQ_CMD_IQDCACPU_S) + +#define FW_IQ_CMD_IQINTCNTTHRESH_S 4 +#define FW_IQ_CMD_IQINTCNTTHRESH_V(x) ((x) << FW_IQ_CMD_IQINTCNTTHRESH_S) + +#define FW_IQ_CMD_IQO_S 3 +#define FW_IQ_CMD_IQO_V(x) ((x) << FW_IQ_CMD_IQO_S) +#define FW_IQ_CMD_IQO_F FW_IQ_CMD_IQO_V(1U) + +#define FW_IQ_CMD_IQCPRIO_S 2 +#define FW_IQ_CMD_IQCPRIO_V(x) ((x) << FW_IQ_CMD_IQCPRIO_S) + +#define FW_IQ_CMD_IQESIZE_S 0 +#define FW_IQ_CMD_IQESIZE_V(x) ((x) << FW_IQ_CMD_IQESIZE_S) + +#define FW_IQ_CMD_IQNS_S 31 +#define FW_IQ_CMD_IQNS_V(x) ((x) << FW_IQ_CMD_IQNS_S) + +#define FW_IQ_CMD_IQRO_S 30 +#define FW_IQ_CMD_IQRO_V(x) ((x) << FW_IQ_CMD_IQRO_S) + +#define FW_IQ_CMD_IQFLINTIQHSEN_S 28 +#define FW_IQ_CMD_IQFLINTIQHSEN_V(x) ((x) << FW_IQ_CMD_IQFLINTIQHSEN_S) + +#define FW_IQ_CMD_IQFLINTCONGEN_S 27 +#define FW_IQ_CMD_IQFLINTCONGEN_V(x) ((x) << FW_IQ_CMD_IQFLINTCONGEN_S) +#define FW_IQ_CMD_IQFLINTCONGEN_F FW_IQ_CMD_IQFLINTCONGEN_V(1U) + +#define FW_IQ_CMD_IQFLINTISCSIC_S 26 +#define FW_IQ_CMD_IQFLINTISCSIC_V(x) ((x) << FW_IQ_CMD_IQFLINTISCSIC_S) + +#define FW_IQ_CMD_FL0CNGCHMAP_S 20 +#define FW_IQ_CMD_FL0CNGCHMAP_V(x) ((x) << FW_IQ_CMD_FL0CNGCHMAP_S) + +#define FW_IQ_CMD_FL0CACHELOCK_S 15 +#define FW_IQ_CMD_FL0CACHELOCK_V(x) ((x) << FW_IQ_CMD_FL0CACHELOCK_S) + +#define FW_IQ_CMD_FL0DBP_S 14 +#define FW_IQ_CMD_FL0DBP_V(x) ((x) << FW_IQ_CMD_FL0DBP_S) + +#define FW_IQ_CMD_FL0DATANS_S 13 +#define FW_IQ_CMD_FL0DATANS_V(x) ((x) << FW_IQ_CMD_FL0DATANS_S) + +#define FW_IQ_CMD_FL0DATARO_S 12 +#define FW_IQ_CMD_FL0DATARO_V(x) ((x) << FW_IQ_CMD_FL0DATARO_S) +#define FW_IQ_CMD_FL0DATARO_F FW_IQ_CMD_FL0DATARO_V(1U) + +#define FW_IQ_CMD_FL0CONGCIF_S 11 +#define FW_IQ_CMD_FL0CONGCIF_V(x) ((x) << FW_IQ_CMD_FL0CONGCIF_S) +#define FW_IQ_CMD_FL0CONGCIF_F FW_IQ_CMD_FL0CONGCIF_V(1U) + +#define FW_IQ_CMD_FL0ONCHIP_S 10 +#define FW_IQ_CMD_FL0ONCHIP_V(x) ((x) << FW_IQ_CMD_FL0ONCHIP_S) + +#define FW_IQ_CMD_FL0STATUSPGNS_S 9 +#define FW_IQ_CMD_FL0STATUSPGNS_V(x) ((x) << FW_IQ_CMD_FL0STATUSPGNS_S) + +#define 
FW_IQ_CMD_FL0STATUSPGRO_S 8 +#define FW_IQ_CMD_FL0STATUSPGRO_V(x) ((x) << FW_IQ_CMD_FL0STATUSPGRO_S) + +#define FW_IQ_CMD_FL0FETCHNS_S 7 +#define FW_IQ_CMD_FL0FETCHNS_V(x) ((x) << FW_IQ_CMD_FL0FETCHNS_S) + +#define FW_IQ_CMD_FL0FETCHRO_S 6 +#define FW_IQ_CMD_FL0FETCHRO_V(x) ((x) << FW_IQ_CMD_FL0FETCHRO_S) +#define FW_IQ_CMD_FL0FETCHRO_F FW_IQ_CMD_FL0FETCHRO_V(1U) + +#define FW_IQ_CMD_FL0HOSTFCMODE_S 4 +#define FW_IQ_CMD_FL0HOSTFCMODE_V(x) ((x) << FW_IQ_CMD_FL0HOSTFCMODE_S) + +#define FW_IQ_CMD_FL0CPRIO_S 3 +#define FW_IQ_CMD_FL0CPRIO_V(x) ((x) << FW_IQ_CMD_FL0CPRIO_S) + +#define FW_IQ_CMD_FL0PADEN_S 2 +#define FW_IQ_CMD_FL0PADEN_V(x) ((x) << FW_IQ_CMD_FL0PADEN_S) +#define FW_IQ_CMD_FL0PADEN_F FW_IQ_CMD_FL0PADEN_V(1U) + +#define FW_IQ_CMD_FL0PACKEN_S 1 +#define FW_IQ_CMD_FL0PACKEN_V(x) ((x) << FW_IQ_CMD_FL0PACKEN_S) +#define FW_IQ_CMD_FL0PACKEN_F FW_IQ_CMD_FL0PACKEN_V(1U) + +#define FW_IQ_CMD_FL0CONGEN_S 0 +#define FW_IQ_CMD_FL0CONGEN_V(x) ((x) << FW_IQ_CMD_FL0CONGEN_S) +#define FW_IQ_CMD_FL0CONGEN_F FW_IQ_CMD_FL0CONGEN_V(1U) + +#define FW_IQ_CMD_FL0DCAEN_S 15 +#define FW_IQ_CMD_FL0DCAEN_V(x) ((x) << FW_IQ_CMD_FL0DCAEN_S) + +#define FW_IQ_CMD_FL0DCACPU_S 10 +#define FW_IQ_CMD_FL0DCACPU_V(x) ((x) << FW_IQ_CMD_FL0DCACPU_S) + +#define FW_IQ_CMD_FL0FBMIN_S 7 +#define FW_IQ_CMD_FL0FBMIN_V(x) ((x) << FW_IQ_CMD_FL0FBMIN_S) + +#define FW_IQ_CMD_FL0FBMAX_S 4 +#define FW_IQ_CMD_FL0FBMAX_V(x) ((x) << FW_IQ_CMD_FL0FBMAX_S) + +#define FW_IQ_CMD_FL0CIDXFTHRESHO_S 3 +#define FW_IQ_CMD_FL0CIDXFTHRESHO_V(x) ((x) << FW_IQ_CMD_FL0CIDXFTHRESHO_S) +#define FW_IQ_CMD_FL0CIDXFTHRESHO_F FW_IQ_CMD_FL0CIDXFTHRESHO_V(1U) + +#define FW_IQ_CMD_FL0CIDXFTHRESH_S 0 +#define FW_IQ_CMD_FL0CIDXFTHRESH_V(x) ((x) << FW_IQ_CMD_FL0CIDXFTHRESH_S) + +#define FW_IQ_CMD_FL1CNGCHMAP_S 20 +#define FW_IQ_CMD_FL1CNGCHMAP_V(x) ((x) << FW_IQ_CMD_FL1CNGCHMAP_S) + +#define FW_IQ_CMD_FL1CACHELOCK_S 15 +#define FW_IQ_CMD_FL1CACHELOCK_V(x) ((x) << FW_IQ_CMD_FL1CACHELOCK_S) + +#define FW_IQ_CMD_FL1DBP_S 14 +#define FW_IQ_CMD_FL1DBP_V(x) ((x) << FW_IQ_CMD_FL1DBP_S) + +#define FW_IQ_CMD_FL1DATANS_S 13 +#define FW_IQ_CMD_FL1DATANS_V(x) ((x) << FW_IQ_CMD_FL1DATANS_S) + +#define FW_IQ_CMD_FL1DATARO_S 12 +#define FW_IQ_CMD_FL1DATARO_V(x) ((x) << FW_IQ_CMD_FL1DATARO_S) + +#define FW_IQ_CMD_FL1CONGCIF_S 11 +#define FW_IQ_CMD_FL1CONGCIF_V(x) ((x) << FW_IQ_CMD_FL1CONGCIF_S) + +#define FW_IQ_CMD_FL1ONCHIP_S 10 +#define FW_IQ_CMD_FL1ONCHIP_V(x) ((x) << FW_IQ_CMD_FL1ONCHIP_S) + +#define FW_IQ_CMD_FL1STATUSPGNS_S 9 +#define FW_IQ_CMD_FL1STATUSPGNS_V(x) ((x) << FW_IQ_CMD_FL1STATUSPGNS_S) + +#define FW_IQ_CMD_FL1STATUSPGRO_S 8 +#define FW_IQ_CMD_FL1STATUSPGRO_V(x) ((x) << FW_IQ_CMD_FL1STATUSPGRO_S) + +#define FW_IQ_CMD_FL1FETCHNS_S 7 +#define FW_IQ_CMD_FL1FETCHNS_V(x) ((x) << FW_IQ_CMD_FL1FETCHNS_S) + +#define FW_IQ_CMD_FL1FETCHRO_S 6 +#define FW_IQ_CMD_FL1FETCHRO_V(x) ((x) << FW_IQ_CMD_FL1FETCHRO_S) + +#define FW_IQ_CMD_FL1HOSTFCMODE_S 4 +#define FW_IQ_CMD_FL1HOSTFCMODE_V(x) ((x) << FW_IQ_CMD_FL1HOSTFCMODE_S) + +#define FW_IQ_CMD_FL1CPRIO_S 3 +#define FW_IQ_CMD_FL1CPRIO_V(x) ((x) << FW_IQ_CMD_FL1CPRIO_S) + +#define FW_IQ_CMD_FL1PADEN_S 2 +#define FW_IQ_CMD_FL1PADEN_V(x) ((x) << FW_IQ_CMD_FL1PADEN_S) +#define FW_IQ_CMD_FL1PADEN_F FW_IQ_CMD_FL1PADEN_V(1U) + +#define FW_IQ_CMD_FL1PACKEN_S 1 +#define FW_IQ_CMD_FL1PACKEN_V(x) ((x) << FW_IQ_CMD_FL1PACKEN_S) +#define FW_IQ_CMD_FL1PACKEN_F FW_IQ_CMD_FL1PACKEN_V(1U) + +#define FW_IQ_CMD_FL1CONGEN_S 0 +#define FW_IQ_CMD_FL1CONGEN_V(x) ((x) << FW_IQ_CMD_FL1CONGEN_S) +#define FW_IQ_CMD_FL1CONGEN_F 
FW_IQ_CMD_FL1CONGEN_V(1U) + +#define FW_IQ_CMD_FL1DCAEN_S 15 +#define FW_IQ_CMD_FL1DCAEN_V(x) ((x) << FW_IQ_CMD_FL1DCAEN_S) + +#define FW_IQ_CMD_FL1DCACPU_S 10 +#define FW_IQ_CMD_FL1DCACPU_V(x) ((x) << FW_IQ_CMD_FL1DCACPU_S) + +#define FW_IQ_CMD_FL1FBMIN_S 7 +#define FW_IQ_CMD_FL1FBMIN_V(x) ((x) << FW_IQ_CMD_FL1FBMIN_S) + +#define FW_IQ_CMD_FL1FBMAX_S 4 +#define FW_IQ_CMD_FL1FBMAX_V(x) ((x) << FW_IQ_CMD_FL1FBMAX_S) + +#define FW_IQ_CMD_FL1CIDXFTHRESHO_S 3 +#define FW_IQ_CMD_FL1CIDXFTHRESHO_V(x) ((x) << FW_IQ_CMD_FL1CIDXFTHRESHO_S) +#define FW_IQ_CMD_FL1CIDXFTHRESHO_F FW_IQ_CMD_FL1CIDXFTHRESHO_V(1U) + +#define FW_IQ_CMD_FL1CIDXFTHRESH_S 0 +#define FW_IQ_CMD_FL1CIDXFTHRESH_V(x) ((x) << FW_IQ_CMD_FL1CIDXFTHRESH_S) + +struct fw_eq_eth_cmd { + __be32 op_to_vfn; + __be32 alloc_to_len16; + __be32 eqid_pkd; + __be32 physeqid_pkd; + __be32 fetchszm_to_iqid; + __be32 dcaen_to_eqsize; + __be64 eqaddr; + __be32 viid_pkd; + __be32 r8_lo; + __be64 r9; +}; + +#define FW_EQ_ETH_CMD_PFN_S 8 +#define FW_EQ_ETH_CMD_PFN_V(x) ((x) << FW_EQ_ETH_CMD_PFN_S) + +#define FW_EQ_ETH_CMD_VFN_S 0 +#define FW_EQ_ETH_CMD_VFN_V(x) ((x) << FW_EQ_ETH_CMD_VFN_S) + +#define FW_EQ_ETH_CMD_ALLOC_S 31 +#define FW_EQ_ETH_CMD_ALLOC_V(x) ((x) << FW_EQ_ETH_CMD_ALLOC_S) +#define FW_EQ_ETH_CMD_ALLOC_F FW_EQ_ETH_CMD_ALLOC_V(1U) + +#define FW_EQ_ETH_CMD_FREE_S 30 +#define FW_EQ_ETH_CMD_FREE_V(x) ((x) << FW_EQ_ETH_CMD_FREE_S) +#define FW_EQ_ETH_CMD_FREE_F FW_EQ_ETH_CMD_FREE_V(1U) + +#define FW_EQ_ETH_CMD_MODIFY_S 29 +#define FW_EQ_ETH_CMD_MODIFY_V(x) ((x) << FW_EQ_ETH_CMD_MODIFY_S) +#define FW_EQ_ETH_CMD_MODIFY_F FW_EQ_ETH_CMD_MODIFY_V(1U) + +#define FW_EQ_ETH_CMD_EQSTART_S 28 +#define FW_EQ_ETH_CMD_EQSTART_V(x) ((x) << FW_EQ_ETH_CMD_EQSTART_S) +#define FW_EQ_ETH_CMD_EQSTART_F FW_EQ_ETH_CMD_EQSTART_V(1U) + +#define FW_EQ_ETH_CMD_EQSTOP_S 27 +#define FW_EQ_ETH_CMD_EQSTOP_V(x) ((x) << FW_EQ_ETH_CMD_EQSTOP_S) +#define FW_EQ_ETH_CMD_EQSTOP_F FW_EQ_ETH_CMD_EQSTOP_V(1U) + +#define FW_EQ_ETH_CMD_EQID_S 0 +#define FW_EQ_ETH_CMD_EQID_M 0xfffff +#define FW_EQ_ETH_CMD_EQID_V(x) ((x) << FW_EQ_ETH_CMD_EQID_S) +#define FW_EQ_ETH_CMD_EQID_G(x) \ + (((x) >> FW_EQ_ETH_CMD_EQID_S) & FW_EQ_ETH_CMD_EQID_M) + +#define FW_EQ_ETH_CMD_PHYSEQID_S 0 +#define FW_EQ_ETH_CMD_PHYSEQID_M 0xfffff +#define FW_EQ_ETH_CMD_PHYSEQID_V(x) ((x) << FW_EQ_ETH_CMD_PHYSEQID_S) +#define FW_EQ_ETH_CMD_PHYSEQID_G(x) \ + (((x) >> FW_EQ_ETH_CMD_PHYSEQID_S) & FW_EQ_ETH_CMD_PHYSEQID_M) + +#define FW_EQ_ETH_CMD_FETCHSZM_S 26 +#define FW_EQ_ETH_CMD_FETCHSZM_V(x) ((x) << FW_EQ_ETH_CMD_FETCHSZM_S) +#define FW_EQ_ETH_CMD_FETCHSZM_F FW_EQ_ETH_CMD_FETCHSZM_V(1U) + +#define FW_EQ_ETH_CMD_STATUSPGNS_S 25 +#define FW_EQ_ETH_CMD_STATUSPGNS_V(x) ((x) << FW_EQ_ETH_CMD_STATUSPGNS_S) + +#define FW_EQ_ETH_CMD_STATUSPGRO_S 24 +#define FW_EQ_ETH_CMD_STATUSPGRO_V(x) ((x) << FW_EQ_ETH_CMD_STATUSPGRO_S) + +#define FW_EQ_ETH_CMD_FETCHNS_S 23 +#define FW_EQ_ETH_CMD_FETCHNS_V(x) ((x) << FW_EQ_ETH_CMD_FETCHNS_S) + +#define FW_EQ_ETH_CMD_FETCHRO_S 22 +#define FW_EQ_ETH_CMD_FETCHRO_V(x) ((x) << FW_EQ_ETH_CMD_FETCHRO_S) +#define FW_EQ_ETH_CMD_FETCHRO_F FW_EQ_ETH_CMD_FETCHRO_V(1U) + +#define FW_EQ_ETH_CMD_HOSTFCMODE_S 20 +#define FW_EQ_ETH_CMD_HOSTFCMODE_V(x) ((x) << FW_EQ_ETH_CMD_HOSTFCMODE_S) + +#define FW_EQ_ETH_CMD_CPRIO_S 19 +#define FW_EQ_ETH_CMD_CPRIO_V(x) ((x) << FW_EQ_ETH_CMD_CPRIO_S) + +#define FW_EQ_ETH_CMD_ONCHIP_S 18 +#define FW_EQ_ETH_CMD_ONCHIP_V(x) ((x) << FW_EQ_ETH_CMD_ONCHIP_S) + +#define FW_EQ_ETH_CMD_PCIECHN_S 16 +#define FW_EQ_ETH_CMD_PCIECHN_V(x) ((x) << FW_EQ_ETH_CMD_PCIECHN_S) + +#define 
FW_EQ_ETH_CMD_IQID_S 0 +#define FW_EQ_ETH_CMD_IQID_V(x) ((x) << FW_EQ_ETH_CMD_IQID_S) + +#define FW_EQ_ETH_CMD_DCAEN_S 31 +#define FW_EQ_ETH_CMD_DCAEN_V(x) ((x) << FW_EQ_ETH_CMD_DCAEN_S) + +#define FW_EQ_ETH_CMD_DCACPU_S 26 +#define FW_EQ_ETH_CMD_DCACPU_V(x) ((x) << FW_EQ_ETH_CMD_DCACPU_S) + +#define FW_EQ_ETH_CMD_FBMIN_S 23 +#define FW_EQ_ETH_CMD_FBMIN_V(x) ((x) << FW_EQ_ETH_CMD_FBMIN_S) + +#define FW_EQ_ETH_CMD_FBMAX_S 20 +#define FW_EQ_ETH_CMD_FBMAX_V(x) ((x) << FW_EQ_ETH_CMD_FBMAX_S) + +#define FW_EQ_ETH_CMD_CIDXFTHRESHO_S 19 +#define FW_EQ_ETH_CMD_CIDXFTHRESHO_V(x) ((x) << FW_EQ_ETH_CMD_CIDXFTHRESHO_S) + +#define FW_EQ_ETH_CMD_CIDXFTHRESH_S 16 +#define FW_EQ_ETH_CMD_CIDXFTHRESH_V(x) ((x) << FW_EQ_ETH_CMD_CIDXFTHRESH_S) + +#define FW_EQ_ETH_CMD_EQSIZE_S 0 +#define FW_EQ_ETH_CMD_EQSIZE_V(x) ((x) << FW_EQ_ETH_CMD_EQSIZE_S) + +#define FW_EQ_ETH_CMD_AUTOEQUEQE_S 30 +#define FW_EQ_ETH_CMD_AUTOEQUEQE_V(x) ((x) << FW_EQ_ETH_CMD_AUTOEQUEQE_S) +#define FW_EQ_ETH_CMD_AUTOEQUEQE_F FW_EQ_ETH_CMD_AUTOEQUEQE_V(1U) + +#define FW_EQ_ETH_CMD_VIID_S 16 +#define FW_EQ_ETH_CMD_VIID_V(x) ((x) << FW_EQ_ETH_CMD_VIID_S) + +struct fw_eq_ctrl_cmd { + __be32 op_to_vfn; + __be32 alloc_to_len16; + __be32 cmpliqid_eqid; + __be32 physeqid_pkd; + __be32 fetchszm_to_iqid; + __be32 dcaen_to_eqsize; + __be64 eqaddr; +}; + +#define FW_EQ_CTRL_CMD_PFN_S 8 +#define FW_EQ_CTRL_CMD_PFN_V(x) ((x) << FW_EQ_CTRL_CMD_PFN_S) + +#define FW_EQ_CTRL_CMD_VFN_S 0 +#define FW_EQ_CTRL_CMD_VFN_V(x) ((x) << FW_EQ_CTRL_CMD_VFN_S) + +#define FW_EQ_CTRL_CMD_ALLOC_S 31 +#define FW_EQ_CTRL_CMD_ALLOC_V(x) ((x) << FW_EQ_CTRL_CMD_ALLOC_S) +#define FW_EQ_CTRL_CMD_ALLOC_F FW_EQ_CTRL_CMD_ALLOC_V(1U) + +#define FW_EQ_CTRL_CMD_FREE_S 30 +#define FW_EQ_CTRL_CMD_FREE_V(x) ((x) << FW_EQ_CTRL_CMD_FREE_S) +#define FW_EQ_CTRL_CMD_FREE_F FW_EQ_CTRL_CMD_FREE_V(1U) + +#define FW_EQ_CTRL_CMD_MODIFY_S 29 +#define FW_EQ_CTRL_CMD_MODIFY_V(x) ((x) << FW_EQ_CTRL_CMD_MODIFY_S) +#define FW_EQ_CTRL_CMD_MODIFY_F FW_EQ_CTRL_CMD_MODIFY_V(1U) + +#define FW_EQ_CTRL_CMD_EQSTART_S 28 +#define FW_EQ_CTRL_CMD_EQSTART_V(x) ((x) << FW_EQ_CTRL_CMD_EQSTART_S) +#define FW_EQ_CTRL_CMD_EQSTART_F FW_EQ_CTRL_CMD_EQSTART_V(1U) + +#define FW_EQ_CTRL_CMD_EQSTOP_S 27 +#define FW_EQ_CTRL_CMD_EQSTOP_V(x) ((x) << FW_EQ_CTRL_CMD_EQSTOP_S) +#define FW_EQ_CTRL_CMD_EQSTOP_F FW_EQ_CTRL_CMD_EQSTOP_V(1U) + +#define FW_EQ_CTRL_CMD_CMPLIQID_S 20 +#define FW_EQ_CTRL_CMD_CMPLIQID_V(x) ((x) << FW_EQ_CTRL_CMD_CMPLIQID_S) + +#define FW_EQ_CTRL_CMD_EQID_S 0 +#define FW_EQ_CTRL_CMD_EQID_M 0xfffff +#define FW_EQ_CTRL_CMD_EQID_V(x) ((x) << FW_EQ_CTRL_CMD_EQID_S) +#define FW_EQ_CTRL_CMD_EQID_G(x) \ + (((x) >> FW_EQ_CTRL_CMD_EQID_S) & FW_EQ_CTRL_CMD_EQID_M) + +#define FW_EQ_CTRL_CMD_PHYSEQID_S 0 +#define FW_EQ_CTRL_CMD_PHYSEQID_M 0xfffff +#define FW_EQ_CTRL_CMD_PHYSEQID_G(x) \ + (((x) >> FW_EQ_CTRL_CMD_PHYSEQID_S) & FW_EQ_CTRL_CMD_PHYSEQID_M) + +#define FW_EQ_CTRL_CMD_FETCHSZM_S 26 +#define FW_EQ_CTRL_CMD_FETCHSZM_V(x) ((x) << FW_EQ_CTRL_CMD_FETCHSZM_S) +#define FW_EQ_CTRL_CMD_FETCHSZM_F FW_EQ_CTRL_CMD_FETCHSZM_V(1U) + +#define FW_EQ_CTRL_CMD_STATUSPGNS_S 25 +#define FW_EQ_CTRL_CMD_STATUSPGNS_V(x) ((x) << FW_EQ_CTRL_CMD_STATUSPGNS_S) +#define FW_EQ_CTRL_CMD_STATUSPGNS_F FW_EQ_CTRL_CMD_STATUSPGNS_V(1U) + +#define FW_EQ_CTRL_CMD_STATUSPGRO_S 24 +#define FW_EQ_CTRL_CMD_STATUSPGRO_V(x) ((x) << FW_EQ_CTRL_CMD_STATUSPGRO_S) +#define FW_EQ_CTRL_CMD_STATUSPGRO_F FW_EQ_CTRL_CMD_STATUSPGRO_V(1U) + +#define FW_EQ_CTRL_CMD_FETCHNS_S 23 +#define FW_EQ_CTRL_CMD_FETCHNS_V(x) ((x) << FW_EQ_CTRL_CMD_FETCHNS_S) +#define 
FW_EQ_CTRL_CMD_FETCHNS_F FW_EQ_CTRL_CMD_FETCHNS_V(1U) + +#define FW_EQ_CTRL_CMD_FETCHRO_S 22 +#define FW_EQ_CTRL_CMD_FETCHRO_V(x) ((x) << FW_EQ_CTRL_CMD_FETCHRO_S) +#define FW_EQ_CTRL_CMD_FETCHRO_F FW_EQ_CTRL_CMD_FETCHRO_V(1U) + +#define FW_EQ_CTRL_CMD_HOSTFCMODE_S 20 +#define FW_EQ_CTRL_CMD_HOSTFCMODE_V(x) ((x) << FW_EQ_CTRL_CMD_HOSTFCMODE_S) + +#define FW_EQ_CTRL_CMD_CPRIO_S 19 +#define FW_EQ_CTRL_CMD_CPRIO_V(x) ((x) << FW_EQ_CTRL_CMD_CPRIO_S) + +#define FW_EQ_CTRL_CMD_ONCHIP_S 18 +#define FW_EQ_CTRL_CMD_ONCHIP_V(x) ((x) << FW_EQ_CTRL_CMD_ONCHIP_S) + +#define FW_EQ_CTRL_CMD_PCIECHN_S 16 +#define FW_EQ_CTRL_CMD_PCIECHN_V(x) ((x) << FW_EQ_CTRL_CMD_PCIECHN_S) + +#define FW_EQ_CTRL_CMD_IQID_S 0 +#define FW_EQ_CTRL_CMD_IQID_V(x) ((x) << FW_EQ_CTRL_CMD_IQID_S) + +#define FW_EQ_CTRL_CMD_DCAEN_S 31 +#define FW_EQ_CTRL_CMD_DCAEN_V(x) ((x) << FW_EQ_CTRL_CMD_DCAEN_S) + +#define FW_EQ_CTRL_CMD_DCACPU_S 26 +#define FW_EQ_CTRL_CMD_DCACPU_V(x) ((x) << FW_EQ_CTRL_CMD_DCACPU_S) + +#define FW_EQ_CTRL_CMD_FBMIN_S 23 +#define FW_EQ_CTRL_CMD_FBMIN_V(x) ((x) << FW_EQ_CTRL_CMD_FBMIN_S) + +#define FW_EQ_CTRL_CMD_FBMAX_S 20 +#define FW_EQ_CTRL_CMD_FBMAX_V(x) ((x) << FW_EQ_CTRL_CMD_FBMAX_S) + +#define FW_EQ_CTRL_CMD_CIDXFTHRESHO_S 19 +#define FW_EQ_CTRL_CMD_CIDXFTHRESHO_V(x) \ + ((x) << FW_EQ_CTRL_CMD_CIDXFTHRESHO_S) + +#define FW_EQ_CTRL_CMD_CIDXFTHRESH_S 16 +#define FW_EQ_CTRL_CMD_CIDXFTHRESH_V(x) ((x) << FW_EQ_CTRL_CMD_CIDXFTHRESH_S) + +#define FW_EQ_CTRL_CMD_EQSIZE_S 0 +#define FW_EQ_CTRL_CMD_EQSIZE_V(x) ((x) << FW_EQ_CTRL_CMD_EQSIZE_S) + +struct fw_eq_ofld_cmd { + __be32 op_to_vfn; + __be32 alloc_to_len16; + __be32 eqid_pkd; + __be32 physeqid_pkd; + __be32 fetchszm_to_iqid; + __be32 dcaen_to_eqsize; + __be64 eqaddr; +}; + +#define FW_EQ_OFLD_CMD_PFN_S 8 +#define FW_EQ_OFLD_CMD_PFN_V(x) ((x) << FW_EQ_OFLD_CMD_PFN_S) + +#define FW_EQ_OFLD_CMD_VFN_S 0 +#define FW_EQ_OFLD_CMD_VFN_V(x) ((x) << FW_EQ_OFLD_CMD_VFN_S) + +#define FW_EQ_OFLD_CMD_ALLOC_S 31 +#define FW_EQ_OFLD_CMD_ALLOC_V(x) ((x) << FW_EQ_OFLD_CMD_ALLOC_S) +#define FW_EQ_OFLD_CMD_ALLOC_F FW_EQ_OFLD_CMD_ALLOC_V(1U) + +#define FW_EQ_OFLD_CMD_FREE_S 30 +#define FW_EQ_OFLD_CMD_FREE_V(x) ((x) << FW_EQ_OFLD_CMD_FREE_S) +#define FW_EQ_OFLD_CMD_FREE_F FW_EQ_OFLD_CMD_FREE_V(1U) + +#define FW_EQ_OFLD_CMD_MODIFY_S 29 +#define FW_EQ_OFLD_CMD_MODIFY_V(x) ((x) << FW_EQ_OFLD_CMD_MODIFY_S) +#define FW_EQ_OFLD_CMD_MODIFY_F FW_EQ_OFLD_CMD_MODIFY_V(1U) + +#define FW_EQ_OFLD_CMD_EQSTART_S 28 +#define FW_EQ_OFLD_CMD_EQSTART_V(x) ((x) << FW_EQ_OFLD_CMD_EQSTART_S) +#define FW_EQ_OFLD_CMD_EQSTART_F FW_EQ_OFLD_CMD_EQSTART_V(1U) + +#define FW_EQ_OFLD_CMD_EQSTOP_S 27 +#define FW_EQ_OFLD_CMD_EQSTOP_V(x) ((x) << FW_EQ_OFLD_CMD_EQSTOP_S) +#define FW_EQ_OFLD_CMD_EQSTOP_F FW_EQ_OFLD_CMD_EQSTOP_V(1U) + +#define FW_EQ_OFLD_CMD_EQID_S 0 +#define FW_EQ_OFLD_CMD_EQID_M 0xfffff +#define FW_EQ_OFLD_CMD_EQID_V(x) ((x) << FW_EQ_OFLD_CMD_EQID_S) +#define FW_EQ_OFLD_CMD_EQID_G(x) \ + (((x) >> FW_EQ_OFLD_CMD_EQID_S) & FW_EQ_OFLD_CMD_EQID_M) + +#define FW_EQ_OFLD_CMD_PHYSEQID_S 0 +#define FW_EQ_OFLD_CMD_PHYSEQID_M 0xfffff +#define FW_EQ_OFLD_CMD_PHYSEQID_G(x) \ + (((x) >> FW_EQ_OFLD_CMD_PHYSEQID_S) & FW_EQ_OFLD_CMD_PHYSEQID_M) + +#define FW_EQ_OFLD_CMD_FETCHSZM_S 26 +#define FW_EQ_OFLD_CMD_FETCHSZM_V(x) ((x) << FW_EQ_OFLD_CMD_FETCHSZM_S) + +#define FW_EQ_OFLD_CMD_STATUSPGNS_S 25 +#define FW_EQ_OFLD_CMD_STATUSPGNS_V(x) ((x) << FW_EQ_OFLD_CMD_STATUSPGNS_S) + +#define FW_EQ_OFLD_CMD_STATUSPGRO_S 24 +#define FW_EQ_OFLD_CMD_STATUSPGRO_V(x) ((x) << FW_EQ_OFLD_CMD_STATUSPGRO_S) + +#define 
FW_EQ_OFLD_CMD_FETCHNS_S 23 +#define FW_EQ_OFLD_CMD_FETCHNS_V(x) ((x) << FW_EQ_OFLD_CMD_FETCHNS_S) + +#define FW_EQ_OFLD_CMD_FETCHRO_S 22 +#define FW_EQ_OFLD_CMD_FETCHRO_V(x) ((x) << FW_EQ_OFLD_CMD_FETCHRO_S) +#define FW_EQ_OFLD_CMD_FETCHRO_F FW_EQ_OFLD_CMD_FETCHRO_V(1U) + +#define FW_EQ_OFLD_CMD_HOSTFCMODE_S 20 +#define FW_EQ_OFLD_CMD_HOSTFCMODE_V(x) ((x) << FW_EQ_OFLD_CMD_HOSTFCMODE_S) + +#define FW_EQ_OFLD_CMD_CPRIO_S 19 +#define FW_EQ_OFLD_CMD_CPRIO_V(x) ((x) << FW_EQ_OFLD_CMD_CPRIO_S) + +#define FW_EQ_OFLD_CMD_ONCHIP_S 18 +#define FW_EQ_OFLD_CMD_ONCHIP_V(x) ((x) << FW_EQ_OFLD_CMD_ONCHIP_S) + +#define FW_EQ_OFLD_CMD_PCIECHN_S 16 +#define FW_EQ_OFLD_CMD_PCIECHN_V(x) ((x) << FW_EQ_OFLD_CMD_PCIECHN_S) + +#define FW_EQ_OFLD_CMD_IQID_S 0 +#define FW_EQ_OFLD_CMD_IQID_V(x) ((x) << FW_EQ_OFLD_CMD_IQID_S) + +#define FW_EQ_OFLD_CMD_DCAEN_S 31 +#define FW_EQ_OFLD_CMD_DCAEN_V(x) ((x) << FW_EQ_OFLD_CMD_DCAEN_S) + +#define FW_EQ_OFLD_CMD_DCACPU_S 26 +#define FW_EQ_OFLD_CMD_DCACPU_V(x) ((x) << FW_EQ_OFLD_CMD_DCACPU_S) + +#define FW_EQ_OFLD_CMD_FBMIN_S 23 +#define FW_EQ_OFLD_CMD_FBMIN_V(x) ((x) << FW_EQ_OFLD_CMD_FBMIN_S) + +#define FW_EQ_OFLD_CMD_FBMAX_S 20 +#define FW_EQ_OFLD_CMD_FBMAX_V(x) ((x) << FW_EQ_OFLD_CMD_FBMAX_S) + +#define FW_EQ_OFLD_CMD_CIDXFTHRESHO_S 19 +#define FW_EQ_OFLD_CMD_CIDXFTHRESHO_V(x) \ + ((x) << FW_EQ_OFLD_CMD_CIDXFTHRESHO_S) + +#define FW_EQ_OFLD_CMD_CIDXFTHRESH_S 16 +#define FW_EQ_OFLD_CMD_CIDXFTHRESH_V(x) ((x) << FW_EQ_OFLD_CMD_CIDXFTHRESH_S) + +#define FW_EQ_OFLD_CMD_EQSIZE_S 0 +#define FW_EQ_OFLD_CMD_EQSIZE_V(x) ((x) << FW_EQ_OFLD_CMD_EQSIZE_S) + +/* + * Macros for VIID parsing: + * VIID - [10:8] PFN, [7] VI Valid, [6:0] VI number + */ + +#define FW_VIID_PFN_S 8 +#define FW_VIID_PFN_M 0x7 +#define FW_VIID_PFN_G(x) (((x) >> FW_VIID_PFN_S) & FW_VIID_PFN_M) + +#define FW_VIID_VIVLD_S 7 +#define FW_VIID_VIVLD_M 0x1 +#define FW_VIID_VIVLD_G(x) (((x) >> FW_VIID_VIVLD_S) & FW_VIID_VIVLD_M) + +#define FW_VIID_VIN_S 0 +#define FW_VIID_VIN_M 0x7F +#define FW_VIID_VIN_G(x) (((x) >> FW_VIID_VIN_S) & FW_VIID_VIN_M) + +struct fw_vi_cmd { + __be32 op_to_vfn; + __be32 alloc_to_len16; + __be16 type_viid; + u8 mac[6]; + u8 portid_pkd; + u8 nmac; + u8 nmac0[6]; + __be16 rsssize_pkd; + u8 nmac1[6]; + __be16 idsiiq_pkd; + u8 nmac2[6]; + __be16 idseiq_pkd; + u8 nmac3[6]; + __be64 r9; + __be64 r10; +}; + +#define FW_VI_CMD_PFN_S 8 +#define FW_VI_CMD_PFN_V(x) ((x) << FW_VI_CMD_PFN_S) + +#define FW_VI_CMD_VFN_S 0 +#define FW_VI_CMD_VFN_V(x) ((x) << FW_VI_CMD_VFN_S) + +#define FW_VI_CMD_ALLOC_S 31 +#define FW_VI_CMD_ALLOC_V(x) ((x) << FW_VI_CMD_ALLOC_S) +#define FW_VI_CMD_ALLOC_F FW_VI_CMD_ALLOC_V(1U) + +#define FW_VI_CMD_FREE_S 30 +#define FW_VI_CMD_FREE_V(x) ((x) << FW_VI_CMD_FREE_S) +#define FW_VI_CMD_FREE_F FW_VI_CMD_FREE_V(1U) + +#define FW_VI_CMD_VIID_S 0 +#define FW_VI_CMD_VIID_M 0xfff +#define FW_VI_CMD_VIID_V(x) ((x) << FW_VI_CMD_VIID_S) +#define FW_VI_CMD_VIID_G(x) (((x) >> FW_VI_CMD_VIID_S) & FW_VI_CMD_VIID_M) + +#define FW_VI_CMD_PORTID_S 4 +#define FW_VI_CMD_PORTID_M 0xf +#define FW_VI_CMD_PORTID_V(x) ((x) << FW_VI_CMD_PORTID_S) +#define FW_VI_CMD_PORTID_G(x) \ + (((x) >> FW_VI_CMD_PORTID_S) & FW_VI_CMD_PORTID_M) + +#define FW_VI_CMD_RSSSIZE_S 0 +#define FW_VI_CMD_RSSSIZE_M 0x7ff +#define FW_VI_CMD_RSSSIZE_G(x) \ + (((x) >> FW_VI_CMD_RSSSIZE_S) & FW_VI_CMD_RSSSIZE_M) + +/* Special VI_MAC command index ids */ +#define FW_VI_MAC_ADD_MAC 0x3FF +#define FW_VI_MAC_ADD_PERSIST_MAC 0x3FE +#define FW_VI_MAC_MAC_BASED_FREE 0x3FD +#define FW_CLS_TCAM_NUM_ENTRIES 336 + +enum fw_vi_mac_smac { + 
FW_VI_MAC_MPS_TCAM_ENTRY, + FW_VI_MAC_MPS_TCAM_ONLY, + FW_VI_MAC_SMT_ONLY, + FW_VI_MAC_SMT_AND_MPSTCAM +}; + +enum fw_vi_mac_result { + FW_VI_MAC_R_SUCCESS, + FW_VI_MAC_R_F_NONEXISTENT_NOMEM, + FW_VI_MAC_R_SMAC_FAIL, + FW_VI_MAC_R_F_ACL_CHECK +}; + +struct fw_vi_mac_cmd { + __be32 op_to_viid; + __be32 freemacs_to_len16; + union fw_vi_mac { + struct fw_vi_mac_exact { + __be16 valid_to_idx; + u8 macaddr[6]; + } exact[7]; + struct fw_vi_mac_hash { + __be64 hashvec; + } hash; + } u; +}; + +#define FW_VI_MAC_CMD_VIID_S 0 +#define FW_VI_MAC_CMD_VIID_V(x) ((x) << FW_VI_MAC_CMD_VIID_S) + +#define FW_VI_MAC_CMD_FREEMACS_S 31 +#define FW_VI_MAC_CMD_FREEMACS_V(x) ((x) << FW_VI_MAC_CMD_FREEMACS_S) + +#define FW_VI_MAC_CMD_HASHVECEN_S 23 +#define FW_VI_MAC_CMD_HASHVECEN_V(x) ((x) << FW_VI_MAC_CMD_HASHVECEN_S) +#define FW_VI_MAC_CMD_HASHVECEN_F FW_VI_MAC_CMD_HASHVECEN_V(1U) + +#define FW_VI_MAC_CMD_HASHUNIEN_S 22 +#define FW_VI_MAC_CMD_HASHUNIEN_V(x) ((x) << FW_VI_MAC_CMD_HASHUNIEN_S) + +#define FW_VI_MAC_CMD_VALID_S 15 +#define FW_VI_MAC_CMD_VALID_V(x) ((x) << FW_VI_MAC_CMD_VALID_S) +#define FW_VI_MAC_CMD_VALID_F FW_VI_MAC_CMD_VALID_V(1U) + +#define FW_VI_MAC_CMD_PRIO_S 12 +#define FW_VI_MAC_CMD_PRIO_V(x) ((x) << FW_VI_MAC_CMD_PRIO_S) + +#define FW_VI_MAC_CMD_SMAC_RESULT_S 10 +#define FW_VI_MAC_CMD_SMAC_RESULT_M 0x3 +#define FW_VI_MAC_CMD_SMAC_RESULT_V(x) ((x) << FW_VI_MAC_CMD_SMAC_RESULT_S) +#define FW_VI_MAC_CMD_SMAC_RESULT_G(x) \ + (((x) >> FW_VI_MAC_CMD_SMAC_RESULT_S) & FW_VI_MAC_CMD_SMAC_RESULT_M) + +#define FW_VI_MAC_CMD_IDX_S 0 +#define FW_VI_MAC_CMD_IDX_M 0x3ff +#define FW_VI_MAC_CMD_IDX_V(x) ((x) << FW_VI_MAC_CMD_IDX_S) +#define FW_VI_MAC_CMD_IDX_G(x) \ + (((x) >> FW_VI_MAC_CMD_IDX_S) & FW_VI_MAC_CMD_IDX_M) + +#define FW_RXMODE_MTU_NO_CHG 65535 + +struct fw_vi_rxmode_cmd { + __be32 op_to_viid; + __be32 retval_len16; + __be32 mtu_to_vlanexen; + __be32 r4_lo; +}; + +#define FW_VI_RXMODE_CMD_VIID_S 0 +#define FW_VI_RXMODE_CMD_VIID_V(x) ((x) << FW_VI_RXMODE_CMD_VIID_S) + +#define FW_VI_RXMODE_CMD_MTU_S 16 +#define FW_VI_RXMODE_CMD_MTU_M 0xffff +#define FW_VI_RXMODE_CMD_MTU_V(x) ((x) << FW_VI_RXMODE_CMD_MTU_S) + +#define FW_VI_RXMODE_CMD_PROMISCEN_S 14 +#define FW_VI_RXMODE_CMD_PROMISCEN_M 0x3 +#define FW_VI_RXMODE_CMD_PROMISCEN_V(x) ((x) << FW_VI_RXMODE_CMD_PROMISCEN_S) + +#define FW_VI_RXMODE_CMD_ALLMULTIEN_S 12 +#define FW_VI_RXMODE_CMD_ALLMULTIEN_M 0x3 +#define FW_VI_RXMODE_CMD_ALLMULTIEN_V(x) \ + ((x) << FW_VI_RXMODE_CMD_ALLMULTIEN_S) + +#define FW_VI_RXMODE_CMD_BROADCASTEN_S 10 +#define FW_VI_RXMODE_CMD_BROADCASTEN_M 0x3 +#define FW_VI_RXMODE_CMD_BROADCASTEN_V(x) \ + ((x) << FW_VI_RXMODE_CMD_BROADCASTEN_S) + +#define FW_VI_RXMODE_CMD_VLANEXEN_S 8 +#define FW_VI_RXMODE_CMD_VLANEXEN_M 0x3 +#define FW_VI_RXMODE_CMD_VLANEXEN_V(x) ((x) << FW_VI_RXMODE_CMD_VLANEXEN_S) + +struct fw_vi_enable_cmd { + __be32 op_to_viid; + __be32 ien_to_len16; + __be16 blinkdur; + __be16 r3; + __be32 r4; +}; + +#define FW_VI_ENABLE_CMD_VIID_S 0 +#define FW_VI_ENABLE_CMD_VIID_V(x) ((x) << FW_VI_ENABLE_CMD_VIID_S) + +#define FW_VI_ENABLE_CMD_IEN_S 31 +#define FW_VI_ENABLE_CMD_IEN_V(x) ((x) << FW_VI_ENABLE_CMD_IEN_S) + +#define FW_VI_ENABLE_CMD_EEN_S 30 +#define FW_VI_ENABLE_CMD_EEN_V(x) ((x) << FW_VI_ENABLE_CMD_EEN_S) + +#define FW_VI_ENABLE_CMD_LED_S 29 +#define FW_VI_ENABLE_CMD_LED_V(x) ((x) << FW_VI_ENABLE_CMD_LED_S) +#define FW_VI_ENABLE_CMD_LED_F FW_VI_ENABLE_CMD_LED_V(1U) + +#define FW_VI_ENABLE_CMD_DCB_INFO_S 28 +#define FW_VI_ENABLE_CMD_DCB_INFO_V(x) ((x) << FW_VI_ENABLE_CMD_DCB_INFO_S) + +/* VI VF stats offset 
definitions */ +#define VI_VF_NUM_STATS 16 +enum fw_vi_stats_vf_index { + FW_VI_VF_STAT_TX_BCAST_BYTES_IX, + FW_VI_VF_STAT_TX_BCAST_FRAMES_IX, + FW_VI_VF_STAT_TX_MCAST_BYTES_IX, + FW_VI_VF_STAT_TX_MCAST_FRAMES_IX, + FW_VI_VF_STAT_TX_UCAST_BYTES_IX, + FW_VI_VF_STAT_TX_UCAST_FRAMES_IX, + FW_VI_VF_STAT_TX_DROP_FRAMES_IX, + FW_VI_VF_STAT_TX_OFLD_BYTES_IX, + FW_VI_VF_STAT_TX_OFLD_FRAMES_IX, + FW_VI_VF_STAT_RX_BCAST_BYTES_IX, + FW_VI_VF_STAT_RX_BCAST_FRAMES_IX, + FW_VI_VF_STAT_RX_MCAST_BYTES_IX, + FW_VI_VF_STAT_RX_MCAST_FRAMES_IX, + FW_VI_VF_STAT_RX_UCAST_BYTES_IX, + FW_VI_VF_STAT_RX_UCAST_FRAMES_IX, + FW_VI_VF_STAT_RX_ERR_FRAMES_IX +}; + +/* VI PF stats offset definitions */ +#define VI_PF_NUM_STATS 17 +enum fw_vi_stats_pf_index { + FW_VI_PF_STAT_TX_BCAST_BYTES_IX, + FW_VI_PF_STAT_TX_BCAST_FRAMES_IX, + FW_VI_PF_STAT_TX_MCAST_BYTES_IX, + FW_VI_PF_STAT_TX_MCAST_FRAMES_IX, + FW_VI_PF_STAT_TX_UCAST_BYTES_IX, + FW_VI_PF_STAT_TX_UCAST_FRAMES_IX, + FW_VI_PF_STAT_TX_OFLD_BYTES_IX, + FW_VI_PF_STAT_TX_OFLD_FRAMES_IX, + FW_VI_PF_STAT_RX_BYTES_IX, + FW_VI_PF_STAT_RX_FRAMES_IX, + FW_VI_PF_STAT_RX_BCAST_BYTES_IX, + FW_VI_PF_STAT_RX_BCAST_FRAMES_IX, + FW_VI_PF_STAT_RX_MCAST_BYTES_IX, + FW_VI_PF_STAT_RX_MCAST_FRAMES_IX, + FW_VI_PF_STAT_RX_UCAST_BYTES_IX, + FW_VI_PF_STAT_RX_UCAST_FRAMES_IX, + FW_VI_PF_STAT_RX_ERR_FRAMES_IX +}; + +struct fw_vi_stats_cmd { + __be32 op_to_viid; + __be32 retval_len16; + union fw_vi_stats { + struct fw_vi_stats_ctl { + __be16 nstats_ix; + __be16 r6; + __be32 r7; + __be64 stat0; + __be64 stat1; + __be64 stat2; + __be64 stat3; + __be64 stat4; + __be64 stat5; + } ctl; + struct fw_vi_stats_pf { + __be64 tx_bcast_bytes; + __be64 tx_bcast_frames; + __be64 tx_mcast_bytes; + __be64 tx_mcast_frames; + __be64 tx_ucast_bytes; + __be64 tx_ucast_frames; + __be64 tx_offload_bytes; + __be64 tx_offload_frames; + __be64 rx_pf_bytes; + __be64 rx_pf_frames; + __be64 rx_bcast_bytes; + __be64 rx_bcast_frames; + __be64 rx_mcast_bytes; + __be64 rx_mcast_frames; + __be64 rx_ucast_bytes; + __be64 rx_ucast_frames; + __be64 rx_err_frames; + } pf; + struct fw_vi_stats_vf { + __be64 tx_bcast_bytes; + __be64 tx_bcast_frames; + __be64 tx_mcast_bytes; + __be64 tx_mcast_frames; + __be64 tx_ucast_bytes; + __be64 tx_ucast_frames; + __be64 tx_drop_frames; + __be64 tx_offload_bytes; + __be64 tx_offload_frames; + __be64 rx_bcast_bytes; + __be64 rx_bcast_frames; + __be64 rx_mcast_bytes; + __be64 rx_mcast_frames; + __be64 rx_ucast_bytes; + __be64 rx_ucast_frames; + __be64 rx_err_frames; + } vf; + } u; +}; + +#define FW_VI_STATS_CMD_VIID_S 0 +#define FW_VI_STATS_CMD_VIID_V(x) ((x) << FW_VI_STATS_CMD_VIID_S) + +#define FW_VI_STATS_CMD_NSTATS_S 12 +#define FW_VI_STATS_CMD_NSTATS_V(x) ((x) << FW_VI_STATS_CMD_NSTATS_S) + +#define FW_VI_STATS_CMD_IX_S 0 +#define FW_VI_STATS_CMD_IX_V(x) ((x) << FW_VI_STATS_CMD_IX_S) + +struct fw_acl_mac_cmd { + __be32 op_to_vfn; + __be32 en_to_len16; + u8 nmac; + u8 r3[7]; + __be16 r4; + u8 macaddr0[6]; + __be16 r5; + u8 macaddr1[6]; + __be16 r6; + u8 macaddr2[6]; + __be16 r7; + u8 macaddr3[6]; +}; + +#define FW_ACL_MAC_CMD_PFN_S 8 +#define FW_ACL_MAC_CMD_PFN_V(x) ((x) << FW_ACL_MAC_CMD_PFN_S) + +#define FW_ACL_MAC_CMD_VFN_S 0 +#define FW_ACL_MAC_CMD_VFN_V(x) ((x) << FW_ACL_MAC_CMD_VFN_S) + +#define FW_ACL_MAC_CMD_EN_S 31 +#define FW_ACL_MAC_CMD_EN_V(x) ((x) << FW_ACL_MAC_CMD_EN_S) + +struct fw_acl_vlan_cmd { + __be32 op_to_vfn; + __be32 en_to_len16; + u8 nvlan; + u8 dropnovlan_fm; + u8 r3_lo[6]; + __be16 vlanid[16]; +}; + +#define FW_ACL_VLAN_CMD_PFN_S 8 +#define FW_ACL_VLAN_CMD_PFN_V(x) ((x) 
<< FW_ACL_VLAN_CMD_PFN_S) + +#define FW_ACL_VLAN_CMD_VFN_S 0 +#define FW_ACL_VLAN_CMD_VFN_V(x) ((x) << FW_ACL_VLAN_CMD_VFN_S) + +#define FW_ACL_VLAN_CMD_EN_S 31 +#define FW_ACL_VLAN_CMD_EN_V(x) ((x) << FW_ACL_VLAN_CMD_EN_S) + +#define FW_ACL_VLAN_CMD_DROPNOVLAN_S 7 +#define FW_ACL_VLAN_CMD_DROPNOVLAN_V(x) ((x) << FW_ACL_VLAN_CMD_DROPNOVLAN_S) + +#define FW_ACL_VLAN_CMD_FM_S 6 +#define FW_ACL_VLAN_CMD_FM_V(x) ((x) << FW_ACL_VLAN_CMD_FM_S) + +enum fw_port_cap { + FW_PORT_CAP_SPEED_100M = 0x0001, + FW_PORT_CAP_SPEED_1G = 0x0002, + FW_PORT_CAP_SPEED_25G = 0x0004, + FW_PORT_CAP_SPEED_10G = 0x0008, + FW_PORT_CAP_SPEED_40G = 0x0010, + FW_PORT_CAP_SPEED_100G = 0x0020, + FW_PORT_CAP_FC_RX = 0x0040, + FW_PORT_CAP_FC_TX = 0x0080, + FW_PORT_CAP_ANEG = 0x0100, + FW_PORT_CAP_MDIX = 0x0200, + FW_PORT_CAP_MDIAUTO = 0x0400, + FW_PORT_CAP_FEC = 0x0800, + FW_PORT_CAP_TECHKR = 0x1000, + FW_PORT_CAP_TECHKX4 = 0x2000, + FW_PORT_CAP_802_3_PAUSE = 0x4000, + FW_PORT_CAP_802_3_ASM_DIR = 0x8000, +}; + +#define FW_PORT_CAP_SPEED_S 0 +#define FW_PORT_CAP_SPEED_M 0x3f +#define FW_PORT_CAP_SPEED_V(x) ((x) << FW_PORT_CAP_SPEED_S) +#define FW_PORT_CAP_SPEED_G(x) \ + (((x) >> FW_PORT_CAP_SPEED_S) & FW_PORT_CAP_SPEED_M) + +enum fw_port_mdi { + FW_PORT_CAP_MDI_UNCHANGED, + FW_PORT_CAP_MDI_AUTO, + FW_PORT_CAP_MDI_F_STRAIGHT, + FW_PORT_CAP_MDI_F_CROSSOVER +}; + +#define FW_PORT_CAP_MDI_S 9 +#define FW_PORT_CAP_MDI_V(x) ((x) << FW_PORT_CAP_MDI_S) + +enum fw_port_action { + FW_PORT_ACTION_L1_CFG = 0x0001, + FW_PORT_ACTION_L2_CFG = 0x0002, + FW_PORT_ACTION_GET_PORT_INFO = 0x0003, + FW_PORT_ACTION_L2_PPP_CFG = 0x0004, + FW_PORT_ACTION_L2_DCB_CFG = 0x0005, + FW_PORT_ACTION_DCB_READ_TRANS = 0x0006, + FW_PORT_ACTION_DCB_READ_RECV = 0x0007, + FW_PORT_ACTION_DCB_READ_DET = 0x0008, + FW_PORT_ACTION_LOW_PWR_TO_NORMAL = 0x0010, + FW_PORT_ACTION_L1_LOW_PWR_EN = 0x0011, + FW_PORT_ACTION_L2_WOL_MODE_EN = 0x0012, + FW_PORT_ACTION_LPBK_TO_NORMAL = 0x0020, + FW_PORT_ACTION_L1_LPBK = 0x0021, + FW_PORT_ACTION_L1_PMA_LPBK = 0x0022, + FW_PORT_ACTION_L1_PCS_LPBK = 0x0023, + FW_PORT_ACTION_L1_PHYXS_CSIDE_LPBK = 0x0024, + FW_PORT_ACTION_L1_PHYXS_ESIDE_LPBK = 0x0025, + FW_PORT_ACTION_PHY_RESET = 0x0040, + FW_PORT_ACTION_PMA_RESET = 0x0041, + FW_PORT_ACTION_PCS_RESET = 0x0042, + FW_PORT_ACTION_PHYXS_RESET = 0x0043, + FW_PORT_ACTION_DTEXS_REEST = 0x0044, + FW_PORT_ACTION_AN_RESET = 0x0045 +}; + +enum fw_port_l2cfg_ctlbf { + FW_PORT_L2_CTLBF_OVLAN0 = 0x01, + FW_PORT_L2_CTLBF_OVLAN1 = 0x02, + FW_PORT_L2_CTLBF_OVLAN2 = 0x04, + FW_PORT_L2_CTLBF_OVLAN3 = 0x08, + FW_PORT_L2_CTLBF_IVLAN = 0x10, + FW_PORT_L2_CTLBF_TXIPG = 0x20 +}; + +enum fw_port_dcb_versions { + FW_PORT_DCB_VER_UNKNOWN, + FW_PORT_DCB_VER_CEE1D0, + FW_PORT_DCB_VER_CEE1D01, + FW_PORT_DCB_VER_IEEE, + FW_PORT_DCB_VER_AUTO = 7 +}; + +enum fw_port_dcb_cfg { + FW_PORT_DCB_CFG_PG = 0x01, + FW_PORT_DCB_CFG_PFC = 0x02, + FW_PORT_DCB_CFG_APPL = 0x04 +}; + +enum fw_port_dcb_cfg_rc { + FW_PORT_DCB_CFG_SUCCESS = 0x0, + FW_PORT_DCB_CFG_ERROR = 0x1 +}; + +enum fw_port_dcb_type { + FW_PORT_DCB_TYPE_PGID = 0x00, + FW_PORT_DCB_TYPE_PGRATE = 0x01, + FW_PORT_DCB_TYPE_PRIORATE = 0x02, + FW_PORT_DCB_TYPE_PFC = 0x03, + FW_PORT_DCB_TYPE_APP_ID = 0x04, + FW_PORT_DCB_TYPE_CONTROL = 0x05, +}; + +enum fw_port_dcb_feature_state { + FW_PORT_DCB_FEATURE_STATE_PENDING = 0x0, + FW_PORT_DCB_FEATURE_STATE_SUCCESS = 0x1, + FW_PORT_DCB_FEATURE_STATE_ERROR = 0x2, + FW_PORT_DCB_FEATURE_STATE_TIMEOUT = 0x3, +}; + +struct fw_port_cmd { + __be32 op_to_portid; + __be32 action_to_len16; + union fw_port { + struct fw_port_l1cfg { + __be32 
rcap; + __be32 r; + } l1cfg; + struct fw_port_l2cfg { + __u8 ctlbf; + __u8 ovlan3_to_ivlan0; + __be16 ivlantype; + __be16 txipg_force_pinfo; + __be16 mtu; + __be16 ovlan0mask; + __be16 ovlan0type; + __be16 ovlan1mask; + __be16 ovlan1type; + __be16 ovlan2mask; + __be16 ovlan2type; + __be16 ovlan3mask; + __be16 ovlan3type; + } l2cfg; + struct fw_port_info { + __be32 lstatus_to_modtype; + __be16 pcap; + __be16 acap; + __be16 mtu; + __u8 cbllen; + __u8 auxlinfo; + __u8 dcbxdis_pkd; + __u8 r8_lo; + __be16 lpacap; + __be64 r9; + } info; + struct fw_port_diags { + __u8 diagop; + __u8 r[3]; + __be32 diagval; + } diags; + union fw_port_dcb { + struct fw_port_dcb_pgid { + __u8 type; + __u8 apply_pkd; + __u8 r10_lo[2]; + __be32 pgid; + __be64 r11; + } pgid; + struct fw_port_dcb_pgrate { + __u8 type; + __u8 apply_pkd; + __u8 r10_lo[5]; + __u8 num_tcs_supported; + __u8 pgrate[8]; + __u8 tsa[8]; + } pgrate; + struct fw_port_dcb_priorate { + __u8 type; + __u8 apply_pkd; + __u8 r10_lo[6]; + __u8 strict_priorate[8]; + } priorate; + struct fw_port_dcb_pfc { + __u8 type; + __u8 pfcen; + __u8 r10[5]; + __u8 max_pfc_tcs; + __be64 r11; + } pfc; + struct fw_port_app_priority { + __u8 type; + __u8 r10[2]; + __u8 idx; + __u8 user_prio_map; + __u8 sel_field; + __be16 protocolid; + __be64 r12; + } app_priority; + struct fw_port_dcb_control { + __u8 type; + __u8 all_syncd_pkd; + __be16 dcb_version_to_app_state; + __be32 r11; + __be64 r12; + } control; + } dcb; + } u; +}; + +#define FW_PORT_CMD_READ_S 22 +#define FW_PORT_CMD_READ_V(x) ((x) << FW_PORT_CMD_READ_S) +#define FW_PORT_CMD_READ_F FW_PORT_CMD_READ_V(1U) + +#define FW_PORT_CMD_PORTID_S 0 +#define FW_PORT_CMD_PORTID_M 0xf +#define FW_PORT_CMD_PORTID_V(x) ((x) << FW_PORT_CMD_PORTID_S) +#define FW_PORT_CMD_PORTID_G(x) \ + (((x) >> FW_PORT_CMD_PORTID_S) & FW_PORT_CMD_PORTID_M) + +#define FW_PORT_CMD_ACTION_S 16 +#define FW_PORT_CMD_ACTION_M 0xffff +#define FW_PORT_CMD_ACTION_V(x) ((x) << FW_PORT_CMD_ACTION_S) +#define FW_PORT_CMD_ACTION_G(x) \ + (((x) >> FW_PORT_CMD_ACTION_S) & FW_PORT_CMD_ACTION_M) + +#define FW_PORT_CMD_OVLAN3_S 7 +#define FW_PORT_CMD_OVLAN3_V(x) ((x) << FW_PORT_CMD_OVLAN3_S) + +#define FW_PORT_CMD_OVLAN2_S 6 +#define FW_PORT_CMD_OVLAN2_V(x) ((x) << FW_PORT_CMD_OVLAN2_S) + +#define FW_PORT_CMD_OVLAN1_S 5 +#define FW_PORT_CMD_OVLAN1_V(x) ((x) << FW_PORT_CMD_OVLAN1_S) + +#define FW_PORT_CMD_OVLAN0_S 4 +#define FW_PORT_CMD_OVLAN0_V(x) ((x) << FW_PORT_CMD_OVLAN0_S) + +#define FW_PORT_CMD_IVLAN0_S 3 +#define FW_PORT_CMD_IVLAN0_V(x) ((x) << FW_PORT_CMD_IVLAN0_S) + +#define FW_PORT_CMD_TXIPG_S 3 +#define FW_PORT_CMD_TXIPG_V(x) ((x) << FW_PORT_CMD_TXIPG_S) + +#define FW_PORT_CMD_LSTATUS_S 31 +#define FW_PORT_CMD_LSTATUS_M 0x1 +#define FW_PORT_CMD_LSTATUS_V(x) ((x) << FW_PORT_CMD_LSTATUS_S) +#define FW_PORT_CMD_LSTATUS_G(x) \ + (((x) >> FW_PORT_CMD_LSTATUS_S) & FW_PORT_CMD_LSTATUS_M) +#define FW_PORT_CMD_LSTATUS_F FW_PORT_CMD_LSTATUS_V(1U) + +#define FW_PORT_CMD_LSPEED_S 24 +#define FW_PORT_CMD_LSPEED_M 0x3f +#define FW_PORT_CMD_LSPEED_V(x) ((x) << FW_PORT_CMD_LSPEED_S) +#define FW_PORT_CMD_LSPEED_G(x) \ + (((x) >> FW_PORT_CMD_LSPEED_S) & FW_PORT_CMD_LSPEED_M) + +#define FW_PORT_CMD_TXPAUSE_S 23 +#define FW_PORT_CMD_TXPAUSE_V(x) ((x) << FW_PORT_CMD_TXPAUSE_S) +#define FW_PORT_CMD_TXPAUSE_F FW_PORT_CMD_TXPAUSE_V(1U) + +#define FW_PORT_CMD_RXPAUSE_S 22 +#define FW_PORT_CMD_RXPAUSE_V(x) ((x) << FW_PORT_CMD_RXPAUSE_S) +#define FW_PORT_CMD_RXPAUSE_F FW_PORT_CMD_RXPAUSE_V(1U) + +#define FW_PORT_CMD_MDIOCAP_S 21 +#define FW_PORT_CMD_MDIOCAP_V(x) ((x) << 
FW_PORT_CMD_MDIOCAP_S)
+#define FW_PORT_CMD_MDIOCAP_F FW_PORT_CMD_MDIOCAP_V(1U)
+
+#define FW_PORT_CMD_MDIOADDR_S 16
+#define FW_PORT_CMD_MDIOADDR_M 0x1f
+#define FW_PORT_CMD_MDIOADDR_G(x) \
+ (((x) >> FW_PORT_CMD_MDIOADDR_S) & FW_PORT_CMD_MDIOADDR_M)
+
+#define FW_PORT_CMD_LPTXPAUSE_S 15
+#define FW_PORT_CMD_LPTXPAUSE_V(x) ((x) << FW_PORT_CMD_LPTXPAUSE_S)
+#define FW_PORT_CMD_LPTXPAUSE_F FW_PORT_CMD_LPTXPAUSE_V(1U)
+
+#define FW_PORT_CMD_LPRXPAUSE_S 14
+#define FW_PORT_CMD_LPRXPAUSE_V(x) ((x) << FW_PORT_CMD_LPRXPAUSE_S)
+#define FW_PORT_CMD_LPRXPAUSE_F FW_PORT_CMD_LPRXPAUSE_V(1U)
+
+#define FW_PORT_CMD_PTYPE_S 8
+#define FW_PORT_CMD_PTYPE_M 0x1f
+#define FW_PORT_CMD_PTYPE_G(x) \
+ (((x) >> FW_PORT_CMD_PTYPE_S) & FW_PORT_CMD_PTYPE_M)
+
+#define FW_PORT_CMD_LINKDNRC_S 5
+#define FW_PORT_CMD_LINKDNRC_M 0x7
+#define FW_PORT_CMD_LINKDNRC_G(x) \
+ (((x) >> FW_PORT_CMD_LINKDNRC_S) & FW_PORT_CMD_LINKDNRC_M)
+
+#define FW_PORT_CMD_MODTYPE_S 0
+#define FW_PORT_CMD_MODTYPE_M 0x1f
+#define FW_PORT_CMD_MODTYPE_V(x) ((x) << FW_PORT_CMD_MODTYPE_S)
+#define FW_PORT_CMD_MODTYPE_G(x) \
+ (((x) >> FW_PORT_CMD_MODTYPE_S) & FW_PORT_CMD_MODTYPE_M)
+
+#define FW_PORT_CMD_DCBXDIS_S 7
+#define FW_PORT_CMD_DCBXDIS_V(x) ((x) << FW_PORT_CMD_DCBXDIS_S)
+#define FW_PORT_CMD_DCBXDIS_F FW_PORT_CMD_DCBXDIS_V(1U)
+
+#define FW_PORT_CMD_APPLY_S 7
+#define FW_PORT_CMD_APPLY_V(x) ((x) << FW_PORT_CMD_APPLY_S)
+#define FW_PORT_CMD_APPLY_F FW_PORT_CMD_APPLY_V(1U)
+
+#define FW_PORT_CMD_ALL_SYNCD_S 7
+#define FW_PORT_CMD_ALL_SYNCD_V(x) ((x) << FW_PORT_CMD_ALL_SYNCD_S)
+#define FW_PORT_CMD_ALL_SYNCD_F FW_PORT_CMD_ALL_SYNCD_V(1U)
+
+#define FW_PORT_CMD_DCB_VERSION_S 12
+#define FW_PORT_CMD_DCB_VERSION_M 0x7
+#define FW_PORT_CMD_DCB_VERSION_G(x) \
+ (((x) >> FW_PORT_CMD_DCB_VERSION_S) & FW_PORT_CMD_DCB_VERSION_M)
+
+enum fw_port_type {
+ FW_PORT_TYPE_FIBER_XFI,
+ FW_PORT_TYPE_FIBER_XAUI,
+ FW_PORT_TYPE_BT_SGMII,
+ FW_PORT_TYPE_BT_XFI,
+ FW_PORT_TYPE_BT_XAUI,
+ FW_PORT_TYPE_KX4,
+ FW_PORT_TYPE_CX4,
+ FW_PORT_TYPE_KX,
+ FW_PORT_TYPE_KR,
+ FW_PORT_TYPE_SFP,
+ FW_PORT_TYPE_BP_AP,
+ FW_PORT_TYPE_BP4_AP,
+ FW_PORT_TYPE_QSFP_10G,
+ FW_PORT_TYPE_QSA,
+ FW_PORT_TYPE_QSFP,
+ FW_PORT_TYPE_BP40_BA,
+ FW_PORT_TYPE_KR4_100G,
+ FW_PORT_TYPE_CR4_QSFP,
+ FW_PORT_TYPE_CR_QSFP,
+ FW_PORT_TYPE_CR2_QSFP,
+ FW_PORT_TYPE_SFP28,
+
+ FW_PORT_TYPE_NONE = FW_PORT_CMD_PTYPE_M
+};
+
+enum fw_port_module_type {
+ FW_PORT_MOD_TYPE_NA,
+ FW_PORT_MOD_TYPE_LR,
+ FW_PORT_MOD_TYPE_SR,
+ FW_PORT_MOD_TYPE_ER,
+ FW_PORT_MOD_TYPE_TWINAX_PASSIVE,
+ FW_PORT_MOD_TYPE_TWINAX_ACTIVE,
+ FW_PORT_MOD_TYPE_LRM,
+ FW_PORT_MOD_TYPE_ERROR = FW_PORT_CMD_MODTYPE_M - 3,
+ FW_PORT_MOD_TYPE_UNKNOWN = FW_PORT_CMD_MODTYPE_M - 2,
+ FW_PORT_MOD_TYPE_NOTSUPPORTED = FW_PORT_CMD_MODTYPE_M - 1,
+
+ FW_PORT_MOD_TYPE_NONE = FW_PORT_CMD_MODTYPE_M
+};
+
+enum fw_port_mod_sub_type {
+ FW_PORT_MOD_SUB_TYPE_NA,
+ FW_PORT_MOD_SUB_TYPE_MV88E114X = 0x1,
+ FW_PORT_MOD_SUB_TYPE_TN8022 = 0x2,
+ FW_PORT_MOD_SUB_TYPE_AQ1202 = 0x3,
+ FW_PORT_MOD_SUB_TYPE_88x3120 = 0x4,
+ FW_PORT_MOD_SUB_TYPE_BCM84834 = 0x5,
+ FW_PORT_MOD_SUB_TYPE_BT_VSC8634 = 0x8,
+
+ /* The following will never be in the VPD. They are TWINAX cable
+ * lengths decoded from SFP+ module i2c PROMs. These should
+ * almost certainly go somewhere else ...
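+ * (The _<n> suffix appears to give the cable length in meters.)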
+ */ + FW_PORT_MOD_SUB_TYPE_TWINAX_1 = 0x9, + FW_PORT_MOD_SUB_TYPE_TWINAX_3 = 0xA, + FW_PORT_MOD_SUB_TYPE_TWINAX_5 = 0xB, + FW_PORT_MOD_SUB_TYPE_TWINAX_7 = 0xC, +}; + +enum fw_port_stats_tx_index { + FW_STAT_TX_PORT_BYTES_IX = 0, + FW_STAT_TX_PORT_FRAMES_IX, + FW_STAT_TX_PORT_BCAST_IX, + FW_STAT_TX_PORT_MCAST_IX, + FW_STAT_TX_PORT_UCAST_IX, + FW_STAT_TX_PORT_ERROR_IX, + FW_STAT_TX_PORT_64B_IX, + FW_STAT_TX_PORT_65B_127B_IX, + FW_STAT_TX_PORT_128B_255B_IX, + FW_STAT_TX_PORT_256B_511B_IX, + FW_STAT_TX_PORT_512B_1023B_IX, + FW_STAT_TX_PORT_1024B_1518B_IX, + FW_STAT_TX_PORT_1519B_MAX_IX, + FW_STAT_TX_PORT_DROP_IX, + FW_STAT_TX_PORT_PAUSE_IX, + FW_STAT_TX_PORT_PPP0_IX, + FW_STAT_TX_PORT_PPP1_IX, + FW_STAT_TX_PORT_PPP2_IX, + FW_STAT_TX_PORT_PPP3_IX, + FW_STAT_TX_PORT_PPP4_IX, + FW_STAT_TX_PORT_PPP5_IX, + FW_STAT_TX_PORT_PPP6_IX, + FW_STAT_TX_PORT_PPP7_IX, + FW_NUM_PORT_TX_STATS +}; + +enum fw_port_stat_rx_index { + FW_STAT_RX_PORT_BYTES_IX = 0, + FW_STAT_RX_PORT_FRAMES_IX, + FW_STAT_RX_PORT_BCAST_IX, + FW_STAT_RX_PORT_MCAST_IX, + FW_STAT_RX_PORT_UCAST_IX, + FW_STAT_RX_PORT_MTU_ERROR_IX, + FW_STAT_RX_PORT_MTU_CRC_ERROR_IX, + FW_STAT_RX_PORT_CRC_ERROR_IX, + FW_STAT_RX_PORT_LEN_ERROR_IX, + FW_STAT_RX_PORT_SYM_ERROR_IX, + FW_STAT_RX_PORT_64B_IX, + FW_STAT_RX_PORT_65B_127B_IX, + FW_STAT_RX_PORT_128B_255B_IX, + FW_STAT_RX_PORT_256B_511B_IX, + FW_STAT_RX_PORT_512B_1023B_IX, + FW_STAT_RX_PORT_1024B_1518B_IX, + FW_STAT_RX_PORT_1519B_MAX_IX, + FW_STAT_RX_PORT_PAUSE_IX, + FW_STAT_RX_PORT_PPP0_IX, + FW_STAT_RX_PORT_PPP1_IX, + FW_STAT_RX_PORT_PPP2_IX, + FW_STAT_RX_PORT_PPP3_IX, + FW_STAT_RX_PORT_PPP4_IX, + FW_STAT_RX_PORT_PPP5_IX, + FW_STAT_RX_PORT_PPP6_IX, + FW_STAT_RX_PORT_PPP7_IX, + FW_STAT_RX_PORT_LESS_64B_IX, + FW_STAT_RX_PORT_MAC_ERROR_IX, + FW_NUM_PORT_RX_STATS +}; + +/* port stats */ +#define FW_NUM_PORT_STATS (FW_NUM_PORT_TX_STATS + FW_NUM_PORT_RX_STATS) + +struct fw_port_stats_cmd { + __be32 op_to_portid; + __be32 retval_len16; + union fw_port_stats { + struct fw_port_stats_ctl { + u8 nstats_bg_bm; + u8 tx_ix; + __be16 r6; + __be32 r7; + __be64 stat0; + __be64 stat1; + __be64 stat2; + __be64 stat3; + __be64 stat4; + __be64 stat5; + } ctl; + struct fw_port_stats_all { + __be64 tx_bytes; + __be64 tx_frames; + __be64 tx_bcast; + __be64 tx_mcast; + __be64 tx_ucast; + __be64 tx_error; + __be64 tx_64b; + __be64 tx_65b_127b; + __be64 tx_128b_255b; + __be64 tx_256b_511b; + __be64 tx_512b_1023b; + __be64 tx_1024b_1518b; + __be64 tx_1519b_max; + __be64 tx_drop; + __be64 tx_pause; + __be64 tx_ppp0; + __be64 tx_ppp1; + __be64 tx_ppp2; + __be64 tx_ppp3; + __be64 tx_ppp4; + __be64 tx_ppp5; + __be64 tx_ppp6; + __be64 tx_ppp7; + __be64 rx_bytes; + __be64 rx_frames; + __be64 rx_bcast; + __be64 rx_mcast; + __be64 rx_ucast; + __be64 rx_mtu_error; + __be64 rx_mtu_crc_error; + __be64 rx_crc_error; + __be64 rx_len_error; + __be64 rx_sym_error; + __be64 rx_64b; + __be64 rx_65b_127b; + __be64 rx_128b_255b; + __be64 rx_256b_511b; + __be64 rx_512b_1023b; + __be64 rx_1024b_1518b; + __be64 rx_1519b_max; + __be64 rx_pause; + __be64 rx_ppp0; + __be64 rx_ppp1; + __be64 rx_ppp2; + __be64 rx_ppp3; + __be64 rx_ppp4; + __be64 rx_ppp5; + __be64 rx_ppp6; + __be64 rx_ppp7; + __be64 rx_less_64b; + __be64 rx_bg_drop; + __be64 rx_bg_trunc; + } all; + } u; +}; + +/* port loopback stats */ +#define FW_NUM_LB_STATS 16 +enum fw_port_lb_stats_index { + FW_STAT_LB_PORT_BYTES_IX, + FW_STAT_LB_PORT_FRAMES_IX, + FW_STAT_LB_PORT_BCAST_IX, + FW_STAT_LB_PORT_MCAST_IX, + FW_STAT_LB_PORT_UCAST_IX, + FW_STAT_LB_PORT_ERROR_IX, + FW_STAT_LB_PORT_64B_IX, + 
FW_STAT_LB_PORT_65B_127B_IX, + FW_STAT_LB_PORT_128B_255B_IX, + FW_STAT_LB_PORT_256B_511B_IX, + FW_STAT_LB_PORT_512B_1023B_IX, + FW_STAT_LB_PORT_1024B_1518B_IX, + FW_STAT_LB_PORT_1519B_MAX_IX, + FW_STAT_LB_PORT_DROP_FRAMES_IX +}; + +struct fw_port_lb_stats_cmd { + __be32 op_to_lbport; + __be32 retval_len16; + union fw_port_lb_stats { + struct fw_port_lb_stats_ctl { + u8 nstats_bg_bm; + u8 ix_pkd; + __be16 r6; + __be32 r7; + __be64 stat0; + __be64 stat1; + __be64 stat2; + __be64 stat3; + __be64 stat4; + __be64 stat5; + } ctl; + struct fw_port_lb_stats_all { + __be64 tx_bytes; + __be64 tx_frames; + __be64 tx_bcast; + __be64 tx_mcast; + __be64 tx_ucast; + __be64 tx_error; + __be64 tx_64b; + __be64 tx_65b_127b; + __be64 tx_128b_255b; + __be64 tx_256b_511b; + __be64 tx_512b_1023b; + __be64 tx_1024b_1518b; + __be64 tx_1519b_max; + __be64 rx_lb_drop; + __be64 rx_lb_trunc; + } all; + } u; +}; + +struct fw_rss_ind_tbl_cmd { + __be32 op_to_viid; + __be32 retval_len16; + __be16 niqid; + __be16 startidx; + __be32 r3; + __be32 iq0_to_iq2; + __be32 iq3_to_iq5; + __be32 iq6_to_iq8; + __be32 iq9_to_iq11; + __be32 iq12_to_iq14; + __be32 iq15_to_iq17; + __be32 iq18_to_iq20; + __be32 iq21_to_iq23; + __be32 iq24_to_iq26; + __be32 iq27_to_iq29; + __be32 iq30_iq31; + __be32 r15_lo; +}; + +#define FW_RSS_IND_TBL_CMD_VIID_S 0 +#define FW_RSS_IND_TBL_CMD_VIID_V(x) ((x) << FW_RSS_IND_TBL_CMD_VIID_S) + +#define FW_RSS_IND_TBL_CMD_IQ0_S 20 +#define FW_RSS_IND_TBL_CMD_IQ0_V(x) ((x) << FW_RSS_IND_TBL_CMD_IQ0_S) + +#define FW_RSS_IND_TBL_CMD_IQ1_S 10 +#define FW_RSS_IND_TBL_CMD_IQ1_V(x) ((x) << FW_RSS_IND_TBL_CMD_IQ1_S) + +#define FW_RSS_IND_TBL_CMD_IQ2_S 0 +#define FW_RSS_IND_TBL_CMD_IQ2_V(x) ((x) << FW_RSS_IND_TBL_CMD_IQ2_S) + +struct fw_rss_glb_config_cmd { + __be32 op_to_write; + __be32 retval_len16; + union fw_rss_glb_config { + struct fw_rss_glb_config_manual { + __be32 mode_pkd; + __be32 r3; + __be64 r4; + __be64 r5; + } manual; + struct fw_rss_glb_config_basicvirtual { + __be32 mode_pkd; + __be32 synmapen_to_hashtoeplitz; + __be64 r8; + __be64 r9; + } basicvirtual; + } u; +}; + +#define FW_RSS_GLB_CONFIG_CMD_MODE_S 28 +#define FW_RSS_GLB_CONFIG_CMD_MODE_M 0xf +#define FW_RSS_GLB_CONFIG_CMD_MODE_V(x) ((x) << FW_RSS_GLB_CONFIG_CMD_MODE_S) +#define FW_RSS_GLB_CONFIG_CMD_MODE_G(x) \ + (((x) >> FW_RSS_GLB_CONFIG_CMD_MODE_S) & FW_RSS_GLB_CONFIG_CMD_MODE_M) + +#define FW_RSS_GLB_CONFIG_CMD_MODE_MANUAL 0 +#define FW_RSS_GLB_CONFIG_CMD_MODE_BASICVIRTUAL 1 + +#define FW_RSS_GLB_CONFIG_CMD_SYNMAPEN_S 8 +#define FW_RSS_GLB_CONFIG_CMD_SYNMAPEN_V(x) \ + ((x) << FW_RSS_GLB_CONFIG_CMD_SYNMAPEN_S) +#define FW_RSS_GLB_CONFIG_CMD_SYNMAPEN_F \ + FW_RSS_GLB_CONFIG_CMD_SYNMAPEN_V(1U) + +#define FW_RSS_GLB_CONFIG_CMD_SYN4TUPENIPV6_S 7 +#define FW_RSS_GLB_CONFIG_CMD_SYN4TUPENIPV6_V(x) \ + ((x) << FW_RSS_GLB_CONFIG_CMD_SYN4TUPENIPV6_S) +#define FW_RSS_GLB_CONFIG_CMD_SYN4TUPENIPV6_F \ + FW_RSS_GLB_CONFIG_CMD_SYN4TUPENIPV6_V(1U) + +#define FW_RSS_GLB_CONFIG_CMD_SYN2TUPENIPV6_S 6 +#define FW_RSS_GLB_CONFIG_CMD_SYN2TUPENIPV6_V(x) \ + ((x) << FW_RSS_GLB_CONFIG_CMD_SYN2TUPENIPV6_S) +#define FW_RSS_GLB_CONFIG_CMD_SYN2TUPENIPV6_F \ + FW_RSS_GLB_CONFIG_CMD_SYN2TUPENIPV6_V(1U) + +#define FW_RSS_GLB_CONFIG_CMD_SYN4TUPENIPV4_S 5 +#define FW_RSS_GLB_CONFIG_CMD_SYN4TUPENIPV4_V(x) \ + ((x) << FW_RSS_GLB_CONFIG_CMD_SYN4TUPENIPV4_S) +#define FW_RSS_GLB_CONFIG_CMD_SYN4TUPENIPV4_F \ + FW_RSS_GLB_CONFIG_CMD_SYN4TUPENIPV4_V(1U) + +#define FW_RSS_GLB_CONFIG_CMD_SYN2TUPENIPV4_S 4 +#define FW_RSS_GLB_CONFIG_CMD_SYN2TUPENIPV4_V(x) \ + ((x) << 
FW_RSS_GLB_CONFIG_CMD_SYN2TUPENIPV4_S) +#define FW_RSS_GLB_CONFIG_CMD_SYN2TUPENIPV4_F \ + FW_RSS_GLB_CONFIG_CMD_SYN2TUPENIPV4_V(1U) + +#define FW_RSS_GLB_CONFIG_CMD_OFDMAPEN_S 3 +#define FW_RSS_GLB_CONFIG_CMD_OFDMAPEN_V(x) \ + ((x) << FW_RSS_GLB_CONFIG_CMD_OFDMAPEN_S) +#define FW_RSS_GLB_CONFIG_CMD_OFDMAPEN_F \ + FW_RSS_GLB_CONFIG_CMD_OFDMAPEN_V(1U) + +#define FW_RSS_GLB_CONFIG_CMD_TNLMAPEN_S 2 +#define FW_RSS_GLB_CONFIG_CMD_TNLMAPEN_V(x) \ + ((x) << FW_RSS_GLB_CONFIG_CMD_TNLMAPEN_S) +#define FW_RSS_GLB_CONFIG_CMD_TNLMAPEN_F \ + FW_RSS_GLB_CONFIG_CMD_TNLMAPEN_V(1U) + +#define FW_RSS_GLB_CONFIG_CMD_TNLALLLKP_S 1 +#define FW_RSS_GLB_CONFIG_CMD_TNLALLLKP_V(x) \ + ((x) << FW_RSS_GLB_CONFIG_CMD_TNLALLLKP_S) +#define FW_RSS_GLB_CONFIG_CMD_TNLALLLKP_F \ + FW_RSS_GLB_CONFIG_CMD_TNLALLLKP_V(1U) + +#define FW_RSS_GLB_CONFIG_CMD_HASHTOEPLITZ_S 0 +#define FW_RSS_GLB_CONFIG_CMD_HASHTOEPLITZ_V(x) \ + ((x) << FW_RSS_GLB_CONFIG_CMD_HASHTOEPLITZ_S) +#define FW_RSS_GLB_CONFIG_CMD_HASHTOEPLITZ_F \ + FW_RSS_GLB_CONFIG_CMD_HASHTOEPLITZ_V(1U) + +struct fw_rss_vi_config_cmd { + __be32 op_to_viid; +#define FW_RSS_VI_CONFIG_CMD_VIID(x) ((x) << 0) + __be32 retval_len16; + union fw_rss_vi_config { + struct fw_rss_vi_config_manual { + __be64 r3; + __be64 r4; + __be64 r5; + } manual; + struct fw_rss_vi_config_basicvirtual { + __be32 r6; + __be32 defaultq_to_udpen; + __be64 r9; + __be64 r10; + } basicvirtual; + } u; +}; + +#define FW_RSS_VI_CONFIG_CMD_VIID_S 0 +#define FW_RSS_VI_CONFIG_CMD_VIID_V(x) ((x) << FW_RSS_VI_CONFIG_CMD_VIID_S) + +#define FW_RSS_VI_CONFIG_CMD_DEFAULTQ_S 16 +#define FW_RSS_VI_CONFIG_CMD_DEFAULTQ_M 0x3ff +#define FW_RSS_VI_CONFIG_CMD_DEFAULTQ_V(x) \ + ((x) << FW_RSS_VI_CONFIG_CMD_DEFAULTQ_S) +#define FW_RSS_VI_CONFIG_CMD_DEFAULTQ_G(x) \ + (((x) >> FW_RSS_VI_CONFIG_CMD_DEFAULTQ_S) & \ + FW_RSS_VI_CONFIG_CMD_DEFAULTQ_M) + +#define FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN_S 4 +#define FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN_V(x) \ + ((x) << FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN_S) +#define FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN_F \ + FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN_V(1U) + +#define FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN_S 3 +#define FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN_V(x) \ + ((x) << FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN_S) +#define FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN_F \ + FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN_V(1U) + +#define FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN_S 2 +#define FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN_V(x) \ + ((x) << FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN_S) +#define FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN_F \ + FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN_V(1U) + +#define FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN_S 1 +#define FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN_V(x) \ + ((x) << FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN_S) +#define FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN_F \ + FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN_V(1U) + +#define FW_RSS_VI_CONFIG_CMD_UDPEN_S 0 +#define FW_RSS_VI_CONFIG_CMD_UDPEN_V(x) ((x) << FW_RSS_VI_CONFIG_CMD_UDPEN_S) +#define FW_RSS_VI_CONFIG_CMD_UDPEN_F FW_RSS_VI_CONFIG_CMD_UDPEN_V(1U) + +struct fw_clip_cmd { + __be32 op_to_write; + __be32 alloc_to_len16; + __be64 ip_hi; + __be64 ip_lo; + __be32 r4[2]; +}; + +#define FW_CLIP_CMD_ALLOC_S 31 +#define FW_CLIP_CMD_ALLOC_V(x) ((x) << FW_CLIP_CMD_ALLOC_S) +#define FW_CLIP_CMD_ALLOC_F FW_CLIP_CMD_ALLOC_V(1U) + +#define FW_CLIP_CMD_FREE_S 30 +#define FW_CLIP_CMD_FREE_V(x) ((x) << FW_CLIP_CMD_FREE_S) +#define FW_CLIP_CMD_FREE_F FW_CLIP_CMD_FREE_V(1U) + +enum fw_error_type { + FW_ERROR_TYPE_EXCEPTION = 0x0, + FW_ERROR_TYPE_HWMODULE = 0x1, + FW_ERROR_TYPE_WR = 0x2, + FW_ERROR_TYPE_ACL = 0x3, +}; + +struct fw_error_cmd { + 
__be32 op_to_type; + __be32 len16_pkd; + union fw_error { + struct fw_error_exception { + __be32 info[6]; + } exception; + struct fw_error_hwmodule { + __be32 regaddr; + __be32 regval; + } hwmodule; + struct fw_error_wr { + __be16 cidx; + __be16 pfn_vfn; + __be32 eqid; + u8 wrhdr[16]; + } wr; + struct fw_error_acl { + __be16 cidx; + __be16 pfn_vfn; + __be32 eqid; + __be16 mv_pkd; + u8 val[6]; + __be64 r4; + } acl; + } u; +}; + +struct fw_debug_cmd { + __be32 op_type; + __be32 len16_pkd; + union fw_debug { + struct fw_debug_assert { + __be32 fcid; + __be32 line; + __be32 x; + __be32 y; + u8 filename_0_7[8]; + u8 filename_8_15[8]; + __be64 r3; + } assert; + struct fw_debug_prt { + __be16 dprtstridx; + __be16 r3[3]; + __be32 dprtstrparam0; + __be32 dprtstrparam1; + __be32 dprtstrparam2; + __be32 dprtstrparam3; + } prt; + } u; +}; + +#define FW_DEBUG_CMD_TYPE_S 0 +#define FW_DEBUG_CMD_TYPE_M 0xff +#define FW_DEBUG_CMD_TYPE_G(x) \ + (((x) >> FW_DEBUG_CMD_TYPE_S) & FW_DEBUG_CMD_TYPE_M) + +#define PCIE_FW_ERR_S 31 +#define PCIE_FW_ERR_V(x) ((x) << PCIE_FW_ERR_S) +#define PCIE_FW_ERR_F PCIE_FW_ERR_V(1U) + +#define PCIE_FW_INIT_S 30 +#define PCIE_FW_INIT_V(x) ((x) << PCIE_FW_INIT_S) +#define PCIE_FW_INIT_F PCIE_FW_INIT_V(1U) + +#define PCIE_FW_HALT_S 29 +#define PCIE_FW_HALT_V(x) ((x) << PCIE_FW_HALT_S) +#define PCIE_FW_HALT_F PCIE_FW_HALT_V(1U) + +#define PCIE_FW_EVAL_S 24 +#define PCIE_FW_EVAL_M 0x7 +#define PCIE_FW_EVAL_G(x) (((x) >> PCIE_FW_EVAL_S) & PCIE_FW_EVAL_M) + +#define PCIE_FW_MASTER_VLD_S 15 +#define PCIE_FW_MASTER_VLD_V(x) ((x) << PCIE_FW_MASTER_VLD_S) +#define PCIE_FW_MASTER_VLD_F PCIE_FW_MASTER_VLD_V(1U) + +#define PCIE_FW_MASTER_S 12 +#define PCIE_FW_MASTER_M 0x7 +#define PCIE_FW_MASTER_V(x) ((x) << PCIE_FW_MASTER_S) +#define PCIE_FW_MASTER_G(x) (((x) >> PCIE_FW_MASTER_S) & PCIE_FW_MASTER_M) + +struct fw_hdr { + u8 ver; + u8 chip; /* terminator chip type */ + __be16 len512; /* bin length in units of 512-bytes */ + __be32 fw_ver; /* firmware version */ + __be32 tp_microcode_ver; + u8 intfver_nic; + u8 intfver_vnic; + u8 intfver_ofld; + u8 intfver_ri; + u8 intfver_iscsipdu; + u8 intfver_iscsi; + u8 intfver_fcoepdu; + u8 intfver_fcoe; + __u32 reserved2; + __u32 reserved3; + __u32 reserved4; + __be32 flags; + __be32 reserved6[23]; +}; + +enum fw_hdr_chip { + FW_HDR_CHIP_T4, + FW_HDR_CHIP_T5, + FW_HDR_CHIP_T6 +}; + +#define FW_HDR_FW_VER_MAJOR_S 24 +#define FW_HDR_FW_VER_MAJOR_M 0xff +#define FW_HDR_FW_VER_MAJOR_V(x) \ + ((x) << FW_HDR_FW_VER_MAJOR_S) +#define FW_HDR_FW_VER_MAJOR_G(x) \ + (((x) >> FW_HDR_FW_VER_MAJOR_S) & FW_HDR_FW_VER_MAJOR_M) + +#define FW_HDR_FW_VER_MINOR_S 16 +#define FW_HDR_FW_VER_MINOR_M 0xff +#define FW_HDR_FW_VER_MINOR_V(x) \ + ((x) << FW_HDR_FW_VER_MINOR_S) +#define FW_HDR_FW_VER_MINOR_G(x) \ + (((x) >> FW_HDR_FW_VER_MINOR_S) & FW_HDR_FW_VER_MINOR_M) + +#define FW_HDR_FW_VER_MICRO_S 8 +#define FW_HDR_FW_VER_MICRO_M 0xff +#define FW_HDR_FW_VER_MICRO_V(x) \ + ((x) << FW_HDR_FW_VER_MICRO_S) +#define FW_HDR_FW_VER_MICRO_G(x) \ + (((x) >> FW_HDR_FW_VER_MICRO_S) & FW_HDR_FW_VER_MICRO_M) + +#define FW_HDR_FW_VER_BUILD_S 0 +#define FW_HDR_FW_VER_BUILD_M 0xff +#define FW_HDR_FW_VER_BUILD_V(x) \ + ((x) << FW_HDR_FW_VER_BUILD_S) +#define FW_HDR_FW_VER_BUILD_G(x) \ + (((x) >> FW_HDR_FW_VER_BUILD_S) & FW_HDR_FW_VER_BUILD_M) + +enum fw_hdr_intfver { + FW_HDR_INTFVER_NIC = 0x00, + FW_HDR_INTFVER_VNIC = 0x00, + FW_HDR_INTFVER_OFLD = 0x00, + FW_HDR_INTFVER_RI = 0x00, + FW_HDR_INTFVER_ISCSIPDU = 0x00, + FW_HDR_INTFVER_ISCSI = 0x00, + FW_HDR_INTFVER_FCOEPDU = 0x00, + 
FW_HDR_INTFVER_FCOE = 0x00,
+};
+
+enum fw_hdr_flags {
+ FW_HDR_FLAGS_RESET_HALT = 0x00000001,
+};
+
+/* length of the formatting string */
+#define FW_DEVLOG_FMT_LEN 192
+
+/* maximum number of formatting string parameters */
+#define FW_DEVLOG_FMT_PARAMS_NUM 8
+
+/* priority levels */
+enum fw_devlog_level {
+ FW_DEVLOG_LEVEL_EMERG = 0x0,
+ FW_DEVLOG_LEVEL_CRIT = 0x1,
+ FW_DEVLOG_LEVEL_ERR = 0x2,
+ FW_DEVLOG_LEVEL_NOTICE = 0x3,
+ FW_DEVLOG_LEVEL_INFO = 0x4,
+ FW_DEVLOG_LEVEL_DEBUG = 0x5,
+ FW_DEVLOG_LEVEL_MAX = 0x5,
+};
+
+/* facilities that may send a log message */
+enum fw_devlog_facility {
+ FW_DEVLOG_FACILITY_CORE = 0x00,
+ FW_DEVLOG_FACILITY_CF = 0x01,
+ FW_DEVLOG_FACILITY_SCHED = 0x02,
+ FW_DEVLOG_FACILITY_TIMER = 0x04,
+ FW_DEVLOG_FACILITY_RES = 0x06,
+ FW_DEVLOG_FACILITY_HW = 0x08,
+ FW_DEVLOG_FACILITY_FLR = 0x10,
+ FW_DEVLOG_FACILITY_DMAQ = 0x12,
+ FW_DEVLOG_FACILITY_PHY = 0x14,
+ FW_DEVLOG_FACILITY_MAC = 0x16,
+ FW_DEVLOG_FACILITY_PORT = 0x18,
+ FW_DEVLOG_FACILITY_VI = 0x1A,
+ FW_DEVLOG_FACILITY_FILTER = 0x1C,
+ FW_DEVLOG_FACILITY_ACL = 0x1E,
+ FW_DEVLOG_FACILITY_TM = 0x20,
+ FW_DEVLOG_FACILITY_QFC = 0x22,
+ FW_DEVLOG_FACILITY_DCB = 0x24,
+ FW_DEVLOG_FACILITY_ETH = 0x26,
+ FW_DEVLOG_FACILITY_OFLD = 0x28,
+ FW_DEVLOG_FACILITY_RI = 0x2A,
+ FW_DEVLOG_FACILITY_ISCSI = 0x2C,
+ FW_DEVLOG_FACILITY_FCOE = 0x2E,
+ FW_DEVLOG_FACILITY_FOISCSI = 0x30,
+ FW_DEVLOG_FACILITY_FOFCOE = 0x32,
+ FW_DEVLOG_FACILITY_CHNET = 0x34,
+ FW_DEVLOG_FACILITY_MAX = 0x34,
+};
+
+/* log message format */
+struct fw_devlog_e {
+ __be64 timestamp;
+ __be32 seqno;
+ __be16 reserved1;
+ __u8 level;
+ __u8 facility;
+ __u8 fmt[FW_DEVLOG_FMT_LEN];
+ __be32 params[FW_DEVLOG_FMT_PARAMS_NUM];
+ __be32 reserved3[4];
+};
+
+struct fw_devlog_cmd {
+ __be32 op_to_write;
+ __be32 retval_len16;
+ __u8 level;
+ __u8 r2[7];
+ __be32 memtype_devlog_memaddr16_devlog;
+ __be32 memsize_devlog;
+ __be32 r3[2];
+};
+
+#define FW_DEVLOG_CMD_MEMTYPE_DEVLOG_S 28
+#define FW_DEVLOG_CMD_MEMTYPE_DEVLOG_M 0xf
+#define FW_DEVLOG_CMD_MEMTYPE_DEVLOG_G(x) \
+ (((x) >> FW_DEVLOG_CMD_MEMTYPE_DEVLOG_S) & \
+ FW_DEVLOG_CMD_MEMTYPE_DEVLOG_M)
+
+#define FW_DEVLOG_CMD_MEMADDR16_DEVLOG_S 0
+#define FW_DEVLOG_CMD_MEMADDR16_DEVLOG_M 0xfffffff
+#define FW_DEVLOG_CMD_MEMADDR16_DEVLOG_G(x) \
+ (((x) >> FW_DEVLOG_CMD_MEMADDR16_DEVLOG_S) & \
+ FW_DEVLOG_CMD_MEMADDR16_DEVLOG_M)
+
+/* P C I E F W P F 7 R E G I S T E R */
+
+/* PF7 stores the Firmware Device Log parameters which allow Host Drivers to
+ * access the "devlog" without needing to contact firmware. The encoding is
+ * mostly the same as that returned by the DEVLOG command except for the size
+ * which is encoded as the number of entries in multiples-1 of 128 here rather
+ * than the memory size as is done in the DEVLOG command. Thus, 0 means 128
+ * and 15 means 2048. This of course in turn constrains the allowed values
+ * for the devlog size ...
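+ * (Equivalently: nentries = 128 * (NENTRIES128 + 1).)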
+ */ +#define PCIE_FW_PF_DEVLOG 7 + +#define PCIE_FW_PF_DEVLOG_NENTRIES128_S 28 +#define PCIE_FW_PF_DEVLOG_NENTRIES128_M 0xf +#define PCIE_FW_PF_DEVLOG_NENTRIES128_V(x) \ + ((x) << PCIE_FW_PF_DEVLOG_NENTRIES128_S) +#define PCIE_FW_PF_DEVLOG_NENTRIES128_G(x) \ + (((x) >> PCIE_FW_PF_DEVLOG_NENTRIES128_S) & \ + PCIE_FW_PF_DEVLOG_NENTRIES128_M) + +#define PCIE_FW_PF_DEVLOG_ADDR16_S 4 +#define PCIE_FW_PF_DEVLOG_ADDR16_M 0xffffff +#define PCIE_FW_PF_DEVLOG_ADDR16_V(x) ((x) << PCIE_FW_PF_DEVLOG_ADDR16_S) +#define PCIE_FW_PF_DEVLOG_ADDR16_G(x) \ + (((x) >> PCIE_FW_PF_DEVLOG_ADDR16_S) & PCIE_FW_PF_DEVLOG_ADDR16_M) + +#define PCIE_FW_PF_DEVLOG_MEMTYPE_S 0 +#define PCIE_FW_PF_DEVLOG_MEMTYPE_M 0xf +#define PCIE_FW_PF_DEVLOG_MEMTYPE_V(x) ((x) << PCIE_FW_PF_DEVLOG_MEMTYPE_S) +#define PCIE_FW_PF_DEVLOG_MEMTYPE_G(x) \ + (((x) >> PCIE_FW_PF_DEVLOG_MEMTYPE_S) & PCIE_FW_PF_DEVLOG_MEMTYPE_M) + +#endif /* _T4FW_INTERFACE_H_ */ diff --git a/providers/cxgb4/t4fw_ri_api.h b/providers/cxgb4/t4fw_ri_api.h new file mode 100644 index 0000000..9b18462 --- /dev/null +++ b/providers/cxgb4/t4fw_ri_api.h @@ -0,0 +1,814 @@ +/* + * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _T4FW_RI_API_H_ +#define _T4FW_RI_API_H_ + +#include "t4fw_api.h" + +enum fw_ri_wr_opcode { + FW_RI_RDMA_WRITE = 0x0, /* IETF RDMAP v1.0 ... */ + FW_RI_READ_REQ = 0x1, + FW_RI_READ_RESP = 0x2, + FW_RI_SEND = 0x3, + FW_RI_SEND_WITH_INV = 0x4, + FW_RI_SEND_WITH_SE = 0x5, + FW_RI_SEND_WITH_SE_INV = 0x6, + FW_RI_TERMINATE = 0x7, + FW_RI_RDMA_INIT = 0x8, /* CHELSIO RI specific ... 
*/ + FW_RI_BIND_MW = 0x9, + FW_RI_FAST_REGISTER = 0xa, + FW_RI_LOCAL_INV = 0xb, + FW_RI_QP_MODIFY = 0xc, + FW_RI_BYPASS = 0xd, + FW_RI_RECEIVE = 0xe, + + FW_RI_SGE_EC_CR_RETURN = 0xf, + FW_RI_WRITE_IMMEDIATE = FW_RI_RDMA_INIT, +}; + +enum fw_ri_wr_flags { + FW_RI_COMPLETION_FLAG = 0x01, + FW_RI_NOTIFICATION_FLAG = 0x02, + FW_RI_SOLICITED_EVENT_FLAG = 0x04, + FW_RI_READ_FENCE_FLAG = 0x08, + FW_RI_LOCAL_FENCE_FLAG = 0x10, + FW_RI_RDMA_READ_INVALIDATE = 0x20, + FW_RI_RDMA_WRITE_WITH_IMMEDIATE = 0x40, +}; + +enum fw_ri_mpa_attrs { + FW_RI_MPA_RX_MARKER_ENABLE = 0x01, + FW_RI_MPA_TX_MARKER_ENABLE = 0x02, + FW_RI_MPA_CRC_ENABLE = 0x04, + FW_RI_MPA_IETF_ENABLE = 0x08 +}; + +enum fw_ri_qp_caps { + FW_RI_QP_RDMA_READ_ENABLE = 0x01, + FW_RI_QP_RDMA_WRITE_ENABLE = 0x02, + FW_RI_QP_BIND_ENABLE = 0x04, + FW_RI_QP_FAST_REGISTER_ENABLE = 0x08, + FW_RI_QP_STAG0_ENABLE = 0x10 +}; + +enum fw_ri_addr_type { + FW_RI_ZERO_BASED_TO = 0x00, + FW_RI_VA_BASED_TO = 0x01 +}; + +enum fw_ri_mem_perms { + FW_RI_MEM_ACCESS_REM_WRITE = 0x01, + FW_RI_MEM_ACCESS_REM_READ = 0x02, + FW_RI_MEM_ACCESS_REM = 0x03, + FW_RI_MEM_ACCESS_LOCAL_WRITE = 0x04, + FW_RI_MEM_ACCESS_LOCAL_READ = 0x08, + FW_RI_MEM_ACCESS_LOCAL = 0x0C +}; + +enum fw_ri_stag_type { + FW_RI_STAG_NSMR = 0x00, + FW_RI_STAG_SMR = 0x01, + FW_RI_STAG_MW = 0x02, + FW_RI_STAG_MW_RELAXED = 0x03 +}; + +enum fw_ri_data_op { + FW_RI_DATA_IMMD = 0x81, + FW_RI_DATA_DSGL = 0x82, + FW_RI_DATA_ISGL = 0x83 +}; + +enum fw_ri_sgl_depth { + FW_RI_SGL_DEPTH_MAX_SQ = 16, + FW_RI_SGL_DEPTH_MAX_RQ = 4 +}; + +struct fw_ri_dsge_pair { + __be32 len[2]; + __be64 addr[2]; +}; + +struct fw_ri_dsgl { + __u8 op; + __u8 r1; + __be16 nsge; + __be32 len0; + __be64 addr0; +#ifndef C99_NOT_SUPPORTED + struct fw_ri_dsge_pair sge[0]; +#endif +}; + +struct fw_ri_sge { + __be32 stag; + __be32 len; + __be64 to; +}; + +struct fw_ri_isgl { + __u8 op; + __u8 r1; + __be16 nsge; + __be32 r2; +#ifndef C99_NOT_SUPPORTED + struct fw_ri_sge sge[0]; +#endif +}; + +struct fw_ri_immd { + __u8 op; + __u8 r1; + __be16 r2; + __be32 immdlen; +#ifndef C99_NOT_SUPPORTED + __u8 data[0]; +#endif +}; + +struct fw_ri_tpte { + __be32 valid_to_pdid; + __be32 locread_to_qpid; + __be32 nosnoop_pbladdr; + __be32 len_lo; + __be32 va_hi; + __be32 va_lo_fbo; + __be32 dca_mwbcnt_pstag; + __be32 len_hi; +}; + +#define FW_RI_TPTE_VALID_S 31 +#define FW_RI_TPTE_VALID_M 0x1 +#define FW_RI_TPTE_VALID_V(x) ((x) << FW_RI_TPTE_VALID_S) +#define FW_RI_TPTE_VALID_G(x) \ + (((x) >> FW_RI_TPTE_VALID_S) & FW_RI_TPTE_VALID_M) +#define FW_RI_TPTE_VALID_F FW_RI_TPTE_VALID_V(1U) + +#define FW_RI_TPTE_STAGKEY_S 23 +#define FW_RI_TPTE_STAGKEY_M 0xff +#define FW_RI_TPTE_STAGKEY_V(x) ((x) << FW_RI_TPTE_STAGKEY_S) +#define FW_RI_TPTE_STAGKEY_G(x) \ + (((x) >> FW_RI_TPTE_STAGKEY_S) & FW_RI_TPTE_STAGKEY_M) + +#define FW_RI_TPTE_STAGSTATE_S 22 +#define FW_RI_TPTE_STAGSTATE_M 0x1 +#define FW_RI_TPTE_STAGSTATE_V(x) ((x) << FW_RI_TPTE_STAGSTATE_S) +#define FW_RI_TPTE_STAGSTATE_G(x) \ + (((x) >> FW_RI_TPTE_STAGSTATE_S) & FW_RI_TPTE_STAGSTATE_M) +#define FW_RI_TPTE_STAGSTATE_F FW_RI_TPTE_STAGSTATE_V(1U) + +#define FW_RI_TPTE_STAGTYPE_S 20 +#define FW_RI_TPTE_STAGTYPE_M 0x3 +#define FW_RI_TPTE_STAGTYPE_V(x) ((x) << FW_RI_TPTE_STAGTYPE_S) +#define FW_RI_TPTE_STAGTYPE_G(x) \ + (((x) >> FW_RI_TPTE_STAGTYPE_S) & FW_RI_TPTE_STAGTYPE_M) + +#define FW_RI_TPTE_PDID_S 0 +#define FW_RI_TPTE_PDID_M 0xfffff +#define FW_RI_TPTE_PDID_V(x) ((x) << FW_RI_TPTE_PDID_S) +#define FW_RI_TPTE_PDID_G(x) \ + (((x) >> FW_RI_TPTE_PDID_S) & FW_RI_TPTE_PDID_M) + +#define FW_RI_TPTE_PERM_S 
28 +#define FW_RI_TPTE_PERM_M 0xf +#define FW_RI_TPTE_PERM_V(x) ((x) << FW_RI_TPTE_PERM_S) +#define FW_RI_TPTE_PERM_G(x) \ + (((x) >> FW_RI_TPTE_PERM_S) & FW_RI_TPTE_PERM_M) + +#define FW_RI_TPTE_REMINVDIS_S 27 +#define FW_RI_TPTE_REMINVDIS_M 0x1 +#define FW_RI_TPTE_REMINVDIS_V(x) ((x) << FW_RI_TPTE_REMINVDIS_S) +#define FW_RI_TPTE_REMINVDIS_G(x) \ + (((x) >> FW_RI_TPTE_REMINVDIS_S) & FW_RI_TPTE_REMINVDIS_M) +#define FW_RI_TPTE_REMINVDIS_F FW_RI_TPTE_REMINVDIS_V(1U) + +#define FW_RI_TPTE_ADDRTYPE_S 26 +#define FW_RI_TPTE_ADDRTYPE_M 1 +#define FW_RI_TPTE_ADDRTYPE_V(x) ((x) << FW_RI_TPTE_ADDRTYPE_S) +#define FW_RI_TPTE_ADDRTYPE_G(x) \ + (((x) >> FW_RI_TPTE_ADDRTYPE_S) & FW_RI_TPTE_ADDRTYPE_M) +#define FW_RI_TPTE_ADDRTYPE_F FW_RI_TPTE_ADDRTYPE_V(1U) + +#define FW_RI_TPTE_MWBINDEN_S 25 +#define FW_RI_TPTE_MWBINDEN_M 0x1 +#define FW_RI_TPTE_MWBINDEN_V(x) ((x) << FW_RI_TPTE_MWBINDEN_S) +#define FW_RI_TPTE_MWBINDEN_G(x) \ + (((x) >> FW_RI_TPTE_MWBINDEN_S) & FW_RI_TPTE_MWBINDEN_M) +#define FW_RI_TPTE_MWBINDEN_F FW_RI_TPTE_MWBINDEN_V(1U) + +#define FW_RI_TPTE_PS_S 20 +#define FW_RI_TPTE_PS_M 0x1f +#define FW_RI_TPTE_PS_V(x) ((x) << FW_RI_TPTE_PS_S) +#define FW_RI_TPTE_PS_G(x) \ + (((x) >> FW_RI_TPTE_PS_S) & FW_RI_TPTE_PS_M) + +#define FW_RI_TPTE_QPID_S 0 +#define FW_RI_TPTE_QPID_M 0xfffff +#define FW_RI_TPTE_QPID_V(x) ((x) << FW_RI_TPTE_QPID_S) +#define FW_RI_TPTE_QPID_G(x) \ + (((x) >> FW_RI_TPTE_QPID_S) & FW_RI_TPTE_QPID_M) + +#define FW_RI_TPTE_NOSNOOP_S 30 +#define FW_RI_TPTE_NOSNOOP_M 0x1 +#define FW_RI_TPTE_NOSNOOP_V(x) ((x) << FW_RI_TPTE_NOSNOOP_S) +#define FW_RI_TPTE_NOSNOOP_G(x) \ + (((x) >> FW_RI_TPTE_NOSNOOP_S) & FW_RI_TPTE_NOSNOOP_M) +#define FW_RI_TPTE_NOSNOOP_F FW_RI_TPTE_NOSNOOP_V(1U) + +#define FW_RI_TPTE_PBLADDR_S 0 +#define FW_RI_TPTE_PBLADDR_M 0x1fffffff +#define FW_RI_TPTE_PBLADDR_V(x) ((x) << FW_RI_TPTE_PBLADDR_S) +#define FW_RI_TPTE_PBLADDR_G(x) \ + (((x) >> FW_RI_TPTE_PBLADDR_S) & FW_RI_TPTE_PBLADDR_M) + +#define FW_RI_TPTE_DCA_S 24 +#define FW_RI_TPTE_DCA_M 0x1f +#define FW_RI_TPTE_DCA_V(x) ((x) << FW_RI_TPTE_DCA_S) +#define FW_RI_TPTE_DCA_G(x) \ + (((x) >> FW_RI_TPTE_DCA_S) & FW_RI_TPTE_DCA_M) + +#define FW_RI_TPTE_MWBCNT_PSTAG_S 0 +#define FW_RI_TPTE_MWBCNT_PSTAG_M 0xffffff +#define FW_RI_TPTE_MWBCNT_PSTAT_V(x) \ + ((x) << FW_RI_TPTE_MWBCNT_PSTAG_S) +#define FW_RI_TPTE_MWBCNT_PSTAG_G(x) \ + (((x) >> FW_RI_TPTE_MWBCNT_PSTAG_S) & FW_RI_TPTE_MWBCNT_PSTAG_M) + +enum fw_ri_res_type { + FW_RI_RES_TYPE_SQ, + FW_RI_RES_TYPE_RQ, + FW_RI_RES_TYPE_CQ, + FW_RI_RES_TYPE_SRQ, +}; + +enum fw_ri_res_op { + FW_RI_RES_OP_WRITE, + FW_RI_RES_OP_RESET, +}; + +struct fw_ri_res { + union fw_ri_restype { + struct fw_ri_res_sqrq { + __u8 restype; + __u8 op; + __be16 r3; + __be32 eqid; + __be32 r4[2]; + __be32 fetchszm_to_iqid; + __be32 dcaen_to_eqsize; + __be64 eqaddr; + } sqrq; + struct fw_ri_res_cq { + __u8 restype; + __u8 op; + __be16 r3; + __be32 iqid; + __be32 r4[2]; + __be32 iqandst_to_iqandstindex; + __be16 iqdroprss_to_iqesize; + __be16 iqsize; + __be64 iqaddr; + __be32 iqns_iqro; + __be32 r6_lo; + __be64 r7; + } cq; + struct fw_ri_res_srq { + __u8 restype; + __u8 op; + __be16 r3; + __be32 eqid; + __be32 r4[2]; + __be32 fetchszm_to_iqid; + __be32 dcaen_to_eqsize; + __be64 eqaddr; + __be32 srqid; + __be32 pdid; + __be32 hwsrqsize; + __be32 hwsrqaddr; + } srq; + } u; +}; + +struct fw_ri_res_wr { + __be32 op_nres; + __be32 len16_pkd; + __u64 cookie; +#ifndef C99_NOT_SUPPORTED + struct fw_ri_res res[0]; +#endif +}; + +#define FW_RI_RES_WR_NRES_S 0 +#define FW_RI_RES_WR_NRES_M 0xff +#define 
FW_RI_RES_WR_NRES_V(x) ((x) << FW_RI_RES_WR_NRES_S) +#define FW_RI_RES_WR_NRES_G(x) \ + (((x) >> FW_RI_RES_WR_NRES_S) & FW_RI_RES_WR_NRES_M) + +#define FW_RI_RES_WR_FETCHSZM_S 26 +#define FW_RI_RES_WR_FETCHSZM_M 0x1 +#define FW_RI_RES_WR_FETCHSZM_V(x) ((x) << FW_RI_RES_WR_FETCHSZM_S) +#define FW_RI_RES_WR_FETCHSZM_G(x) \ + (((x) >> FW_RI_RES_WR_FETCHSZM_S) & FW_RI_RES_WR_FETCHSZM_M) +#define FW_RI_RES_WR_FETCHSZM_F FW_RI_RES_WR_FETCHSZM_V(1U) + +#define FW_RI_RES_WR_STATUSPGNS_S 25 +#define FW_RI_RES_WR_STATUSPGNS_M 0x1 +#define FW_RI_RES_WR_STATUSPGNS_V(x) ((x) << FW_RI_RES_WR_STATUSPGNS_S) +#define FW_RI_RES_WR_STATUSPGNS_G(x) \ + (((x) >> FW_RI_RES_WR_STATUSPGNS_S) & FW_RI_RES_WR_STATUSPGNS_M) +#define FW_RI_RES_WR_STATUSPGNS_F FW_RI_RES_WR_STATUSPGNS_V(1U) + +#define FW_RI_RES_WR_STATUSPGRO_S 24 +#define FW_RI_RES_WR_STATUSPGRO_M 0x1 +#define FW_RI_RES_WR_STATUSPGRO_V(x) ((x) << FW_RI_RES_WR_STATUSPGRO_S) +#define FW_RI_RES_WR_STATUSPGRO_G(x) \ + (((x) >> FW_RI_RES_WR_STATUSPGRO_S) & FW_RI_RES_WR_STATUSPGRO_M) +#define FW_RI_RES_WR_STATUSPGRO_F FW_RI_RES_WR_STATUSPGRO_V(1U) + +#define FW_RI_RES_WR_FETCHNS_S 23 +#define FW_RI_RES_WR_FETCHNS_M 0x1 +#define FW_RI_RES_WR_FETCHNS_V(x) ((x) << FW_RI_RES_WR_FETCHNS_S) +#define FW_RI_RES_WR_FETCHNS_G(x) \ + (((x) >> FW_RI_RES_WR_FETCHNS_S) & FW_RI_RES_WR_FETCHNS_M) +#define FW_RI_RES_WR_FETCHNS_F FW_RI_RES_WR_FETCHNS_V(1U) + +#define FW_RI_RES_WR_FETCHRO_S 22 +#define FW_RI_RES_WR_FETCHRO_M 0x1 +#define FW_RI_RES_WR_FETCHRO_V(x) ((x) << FW_RI_RES_WR_FETCHRO_S) +#define FW_RI_RES_WR_FETCHRO_G(x) \ + (((x) >> FW_RI_RES_WR_FETCHRO_S) & FW_RI_RES_WR_FETCHRO_M) +#define FW_RI_RES_WR_FETCHRO_F FW_RI_RES_WR_FETCHRO_V(1U) + +#define FW_RI_RES_WR_HOSTFCMODE_S 20 +#define FW_RI_RES_WR_HOSTFCMODE_M 0x3 +#define FW_RI_RES_WR_HOSTFCMODE_V(x) ((x) << FW_RI_RES_WR_HOSTFCMODE_S) +#define FW_RI_RES_WR_HOSTFCMODE_G(x) \ + (((x) >> FW_RI_RES_WR_HOSTFCMODE_S) & FW_RI_RES_WR_HOSTFCMODE_M) + +#define FW_RI_RES_WR_CPRIO_S 19 +#define FW_RI_RES_WR_CPRIO_M 0x1 +#define FW_RI_RES_WR_CPRIO_V(x) ((x) << FW_RI_RES_WR_CPRIO_S) +#define FW_RI_RES_WR_CPRIO_G(x) \ + (((x) >> FW_RI_RES_WR_CPRIO_S) & FW_RI_RES_WR_CPRIO_M) +#define FW_RI_RES_WR_CPRIO_F FW_RI_RES_WR_CPRIO_V(1U) + +#define FW_RI_RES_WR_ONCHIP_S 18 +#define FW_RI_RES_WR_ONCHIP_M 0x1 +#define FW_RI_RES_WR_ONCHIP_V(x) ((x) << FW_RI_RES_WR_ONCHIP_S) +#define FW_RI_RES_WR_ONCHIP_G(x) \ + (((x) >> FW_RI_RES_WR_ONCHIP_S) & FW_RI_RES_WR_ONCHIP_M) +#define FW_RI_RES_WR_ONCHIP_F FW_RI_RES_WR_ONCHIP_V(1U) + +#define FW_RI_RES_WR_PCIECHN_S 16 +#define FW_RI_RES_WR_PCIECHN_M 0x3 +#define FW_RI_RES_WR_PCIECHN_V(x) ((x) << FW_RI_RES_WR_PCIECHN_S) +#define FW_RI_RES_WR_PCIECHN_G(x) \ + (((x) >> FW_RI_RES_WR_PCIECHN_S) & FW_RI_RES_WR_PCIECHN_M) + +#define FW_RI_RES_WR_IQID_S 0 +#define FW_RI_RES_WR_IQID_M 0xffff +#define FW_RI_RES_WR_IQID_V(x) ((x) << FW_RI_RES_WR_IQID_S) +#define FW_RI_RES_WR_IQID_G(x) \ + (((x) >> FW_RI_RES_WR_IQID_S) & FW_RI_RES_WR_IQID_M) + +#define FW_RI_RES_WR_DCAEN_S 31 +#define FW_RI_RES_WR_DCAEN_M 0x1 +#define FW_RI_RES_WR_DCAEN_V(x) ((x) << FW_RI_RES_WR_DCAEN_S) +#define FW_RI_RES_WR_DCAEN_G(x) \ + (((x) >> FW_RI_RES_WR_DCAEN_S) & FW_RI_RES_WR_DCAEN_M) +#define FW_RI_RES_WR_DCAEN_F FW_RI_RES_WR_DCAEN_V(1U) + +#define FW_RI_RES_WR_DCACPU_S 26 +#define FW_RI_RES_WR_DCACPU_M 0x1f +#define FW_RI_RES_WR_DCACPU_V(x) ((x) << FW_RI_RES_WR_DCACPU_S) +#define FW_RI_RES_WR_DCACPU_G(x) \ + (((x) >> FW_RI_RES_WR_DCACPU_S) & FW_RI_RES_WR_DCACPU_M) + +#define FW_RI_RES_WR_FBMIN_S 23 +#define FW_RI_RES_WR_FBMIN_M 0x7 
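+
+/* Illustration only, not part of the firmware interface: every field in
+ * these headers follows the same _S (shift), _M (mask), _V (insert) and
+ * _G (extract) convention, so a host-endian word round-trips as, e.g.:
+ *
+ * u32 w = FW_RI_RES_WR_HOSTFCMODE_V(2) | FW_RI_RES_WR_FETCHRO_F;
+ * FW_RI_RES_WR_HOSTFCMODE_G(w) == 2 and FW_RI_RES_WR_FETCHRO_G(w) == 1
+ */
+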
+#define FW_RI_RES_WR_FBMIN_V(x) ((x) << FW_RI_RES_WR_FBMIN_S) +#define FW_RI_RES_WR_FBMIN_G(x) \ + (((x) >> FW_RI_RES_WR_FBMIN_S) & FW_RI_RES_WR_FBMIN_M) + +#define FW_RI_RES_WR_FBMAX_S 20 +#define FW_RI_RES_WR_FBMAX_M 0x7 +#define FW_RI_RES_WR_FBMAX_V(x) ((x) << FW_RI_RES_WR_FBMAX_S) +#define FW_RI_RES_WR_FBMAX_G(x) \ + (((x) >> FW_RI_RES_WR_FBMAX_S) & FW_RI_RES_WR_FBMAX_M) + +#define FW_RI_RES_WR_CIDXFTHRESHO_S 19 +#define FW_RI_RES_WR_CIDXFTHRESHO_M 0x1 +#define FW_RI_RES_WR_CIDXFTHRESHO_V(x) ((x) << FW_RI_RES_WR_CIDXFTHRESHO_S) +#define FW_RI_RES_WR_CIDXFTHRESHO_G(x) \ + (((x) >> FW_RI_RES_WR_CIDXFTHRESHO_S) & FW_RI_RES_WR_CIDXFTHRESHO_M) +#define FW_RI_RES_WR_CIDXFTHRESHO_F FW_RI_RES_WR_CIDXFTHRESHO_V(1U) + +#define FW_RI_RES_WR_CIDXFTHRESH_S 16 +#define FW_RI_RES_WR_CIDXFTHRESH_M 0x7 +#define FW_RI_RES_WR_CIDXFTHRESH_V(x) ((x) << FW_RI_RES_WR_CIDXFTHRESH_S) +#define FW_RI_RES_WR_CIDXFTHRESH_G(x) \ + (((x) >> FW_RI_RES_WR_CIDXFTHRESH_S) & FW_RI_RES_WR_CIDXFTHRESH_M) + +#define FW_RI_RES_WR_EQSIZE_S 0 +#define FW_RI_RES_WR_EQSIZE_M 0xffff +#define FW_RI_RES_WR_EQSIZE_V(x) ((x) << FW_RI_RES_WR_EQSIZE_S) +#define FW_RI_RES_WR_EQSIZE_G(x) \ + (((x) >> FW_RI_RES_WR_EQSIZE_S) & FW_RI_RES_WR_EQSIZE_M) + +#define FW_RI_RES_WR_IQANDST_S 15 +#define FW_RI_RES_WR_IQANDST_M 0x1 +#define FW_RI_RES_WR_IQANDST_V(x) ((x) << FW_RI_RES_WR_IQANDST_S) +#define FW_RI_RES_WR_IQANDST_G(x) \ + (((x) >> FW_RI_RES_WR_IQANDST_S) & FW_RI_RES_WR_IQANDST_M) +#define FW_RI_RES_WR_IQANDST_F FW_RI_RES_WR_IQANDST_V(1U) + +#define FW_RI_RES_WR_IQANUS_S 14 +#define FW_RI_RES_WR_IQANUS_M 0x1 +#define FW_RI_RES_WR_IQANUS_V(x) ((x) << FW_RI_RES_WR_IQANUS_S) +#define FW_RI_RES_WR_IQANUS_G(x) \ + (((x) >> FW_RI_RES_WR_IQANUS_S) & FW_RI_RES_WR_IQANUS_M) +#define FW_RI_RES_WR_IQANUS_F FW_RI_RES_WR_IQANUS_V(1U) + +#define FW_RI_RES_WR_IQANUD_S 12 +#define FW_RI_RES_WR_IQANUD_M 0x3 +#define FW_RI_RES_WR_IQANUD_V(x) ((x) << FW_RI_RES_WR_IQANUD_S) +#define FW_RI_RES_WR_IQANUD_G(x) \ + (((x) >> FW_RI_RES_WR_IQANUD_S) & FW_RI_RES_WR_IQANUD_M) + +#define FW_RI_RES_WR_IQANDSTINDEX_S 0 +#define FW_RI_RES_WR_IQANDSTINDEX_M 0xfff +#define FW_RI_RES_WR_IQANDSTINDEX_V(x) ((x) << FW_RI_RES_WR_IQANDSTINDEX_S) +#define FW_RI_RES_WR_IQANDSTINDEX_G(x) \ + (((x) >> FW_RI_RES_WR_IQANDSTINDEX_S) & FW_RI_RES_WR_IQANDSTINDEX_M) + +#define FW_RI_RES_WR_IQDROPRSS_S 15 +#define FW_RI_RES_WR_IQDROPRSS_M 0x1 +#define FW_RI_RES_WR_IQDROPRSS_V(x) ((x) << FW_RI_RES_WR_IQDROPRSS_S) +#define FW_RI_RES_WR_IQDROPRSS_G(x) \ + (((x) >> FW_RI_RES_WR_IQDROPRSS_S) & FW_RI_RES_WR_IQDROPRSS_M) +#define FW_RI_RES_WR_IQDROPRSS_F FW_RI_RES_WR_IQDROPRSS_V(1U) + +#define FW_RI_RES_WR_IQGTSMODE_S 14 +#define FW_RI_RES_WR_IQGTSMODE_M 0x1 +#define FW_RI_RES_WR_IQGTSMODE_V(x) ((x) << FW_RI_RES_WR_IQGTSMODE_S) +#define FW_RI_RES_WR_IQGTSMODE_G(x) \ + (((x) >> FW_RI_RES_WR_IQGTSMODE_S) & FW_RI_RES_WR_IQGTSMODE_M) +#define FW_RI_RES_WR_IQGTSMODE_F FW_RI_RES_WR_IQGTSMODE_V(1U) + +#define FW_RI_RES_WR_IQPCIECH_S 12 +#define FW_RI_RES_WR_IQPCIECH_M 0x3 +#define FW_RI_RES_WR_IQPCIECH_V(x) ((x) << FW_RI_RES_WR_IQPCIECH_S) +#define FW_RI_RES_WR_IQPCIECH_G(x) \ + (((x) >> FW_RI_RES_WR_IQPCIECH_S) & FW_RI_RES_WR_IQPCIECH_M) + +#define FW_RI_RES_WR_IQDCAEN_S 11 +#define FW_RI_RES_WR_IQDCAEN_M 0x1 +#define FW_RI_RES_WR_IQDCAEN_V(x) ((x) << FW_RI_RES_WR_IQDCAEN_S) +#define FW_RI_RES_WR_IQDCAEN_G(x) \ + (((x) >> FW_RI_RES_WR_IQDCAEN_S) & FW_RI_RES_WR_IQDCAEN_M) +#define FW_RI_RES_WR_IQDCAEN_F FW_RI_RES_WR_IQDCAEN_V(1U) + +#define FW_RI_RES_WR_IQDCACPU_S 6 +#define FW_RI_RES_WR_IQDCACPU_M 
0x1f +#define FW_RI_RES_WR_IQDCACPU_V(x) ((x) << FW_RI_RES_WR_IQDCACPU_S) +#define FW_RI_RES_WR_IQDCACPU_G(x) \ + (((x) >> FW_RI_RES_WR_IQDCACPU_S) & FW_RI_RES_WR_IQDCACPU_M) + +#define FW_RI_RES_WR_IQINTCNTTHRESH_S 4 +#define FW_RI_RES_WR_IQINTCNTTHRESH_M 0x3 +#define FW_RI_RES_WR_IQINTCNTTHRESH_V(x) \ + ((x) << FW_RI_RES_WR_IQINTCNTTHRESH_S) +#define FW_RI_RES_WR_IQINTCNTTHRESH_G(x) \ + (((x) >> FW_RI_RES_WR_IQINTCNTTHRESH_S) & FW_RI_RES_WR_IQINTCNTTHRESH_M) + +#define FW_RI_RES_WR_IQO_S 3 +#define FW_RI_RES_WR_IQO_M 0x1 +#define FW_RI_RES_WR_IQO_V(x) ((x) << FW_RI_RES_WR_IQO_S) +#define FW_RI_RES_WR_IQO_G(x) \ + (((x) >> FW_RI_RES_WR_IQO_S) & FW_RI_RES_WR_IQO_M) +#define FW_RI_RES_WR_IQO_F FW_RI_RES_WR_IQO_V(1U) + +#define FW_RI_RES_WR_IQCPRIO_S 2 +#define FW_RI_RES_WR_IQCPRIO_M 0x1 +#define FW_RI_RES_WR_IQCPRIO_V(x) ((x) << FW_RI_RES_WR_IQCPRIO_S) +#define FW_RI_RES_WR_IQCPRIO_G(x) \ + (((x) >> FW_RI_RES_WR_IQCPRIO_S) & FW_RI_RES_WR_IQCPRIO_M) +#define FW_RI_RES_WR_IQCPRIO_F FW_RI_RES_WR_IQCPRIO_V(1U) + +#define FW_RI_RES_WR_IQESIZE_S 0 +#define FW_RI_RES_WR_IQESIZE_M 0x3 +#define FW_RI_RES_WR_IQESIZE_V(x) ((x) << FW_RI_RES_WR_IQESIZE_S) +#define FW_RI_RES_WR_IQESIZE_G(x) \ + (((x) >> FW_RI_RES_WR_IQESIZE_S) & FW_RI_RES_WR_IQESIZE_M) + +#define FW_RI_RES_WR_IQNS_S 31 +#define FW_RI_RES_WR_IQNS_M 0x1 +#define FW_RI_RES_WR_IQNS_V(x) ((x) << FW_RI_RES_WR_IQNS_S) +#define FW_RI_RES_WR_IQNS_G(x) \ + (((x) >> FW_RI_RES_WR_IQNS_S) & FW_RI_RES_WR_IQNS_M) +#define FW_RI_RES_WR_IQNS_F FW_RI_RES_WR_IQNS_V(1U) + +#define FW_RI_RES_WR_IQRO_S 30 +#define FW_RI_RES_WR_IQRO_M 0x1 +#define FW_RI_RES_WR_IQRO_V(x) ((x) << FW_RI_RES_WR_IQRO_S) +#define FW_RI_RES_WR_IQRO_G(x) \ + (((x) >> FW_RI_RES_WR_IQRO_S) & FW_RI_RES_WR_IQRO_M) +#define FW_RI_RES_WR_IQRO_F FW_RI_RES_WR_IQRO_V(1U) + +struct fw_ri_rdma_write_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + union { + struct { + __be32 imm_data32; + u32 reserved; + } ib_imm_data; + __be64 imm_data64; + } iw_imm_data; + __be32 plen; + __be32 stag_sink; + __be64 to_sink; +#ifndef C99_NOT_SUPPORTED + union { + struct fw_ri_immd immd_src[0]; + struct fw_ri_isgl isgl_src[0]; + } u; +#endif +}; + +struct fw_ri_send_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + __be32 sendop_pkd; + __be32 stag_inv; + __be32 plen; + __be32 r3; + __be64 r4; +#ifndef C99_NOT_SUPPORTED + union { + struct fw_ri_immd immd_src[0]; + struct fw_ri_isgl isgl_src[0]; + } u; +#endif +}; + +#define FW_RI_SEND_WR_SENDOP_S 0 +#define FW_RI_SEND_WR_SENDOP_M 0xf +#define FW_RI_SEND_WR_SENDOP_V(x) ((x) << FW_RI_SEND_WR_SENDOP_S) +#define FW_RI_SEND_WR_SENDOP_G(x) \ + (((x) >> FW_RI_SEND_WR_SENDOP_S) & FW_RI_SEND_WR_SENDOP_M) + +struct fw_ri_rdma_write_cmpl_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + __u8 r2; + __u8 flags_send; + __u16 wrid_send; + __be32 stag_inv; + __be32 plen; + __be32 stag_sink; + __be64 to_sink; + union fw_ri_cmpl { + struct fw_ri_immd_cmpl { + __u8 op; + __u8 r1[6]; + __u8 immdlen; + __u8 data[16]; + } immd_src; + struct fw_ri_isgl isgl_src; + } u_cmpl; + __be64 r3; +#ifndef C99_NOT_SUPPORTED + union fw_ri_write { + struct fw_ri_immd immd_src[0]; + struct fw_ri_isgl isgl_src[0]; + } u; +#endif +}; + +struct fw_ri_rdma_read_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + __be64 r2; + __be32 stag_sink; + __be32 to_sink_hi; + __be32 to_sink_lo; + __be32 plen; + __be32 stag_src; + __be32 to_src_hi; + __be32 to_src_lo; + __be32 r5; +}; + +struct 
fw_ri_recv_wr { + __u8 opcode; + __u8 r1; + __u16 wrid; + __u8 r2[3]; + __u8 len16; + struct fw_ri_isgl isgl; +}; + +struct fw_ri_bind_mw_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + __u8 qpbinde_to_dcacpu; + __u8 pgsz_shift; + __u8 addr_type; + __u8 mem_perms; + __be32 stag_mr; + __be32 stag_mw; + __be32 r3; + __be64 len_mw; + __be64 va_fbo; + __be64 r4; +}; + +#define FW_RI_BIND_MW_WR_QPBINDE_S 6 +#define FW_RI_BIND_MW_WR_QPBINDE_M 0x1 +#define FW_RI_BIND_MW_WR_QPBINDE_V(x) ((x) << FW_RI_BIND_MW_WR_QPBINDE_S) +#define FW_RI_BIND_MW_WR_QPBINDE_G(x) \ + (((x) >> FW_RI_BIND_MW_WR_QPBINDE_S) & FW_RI_BIND_MW_WR_QPBINDE_M) +#define FW_RI_BIND_MW_WR_QPBINDE_F FW_RI_BIND_MW_WR_QPBINDE_V(1U) + +#define FW_RI_BIND_MW_WR_NS_S 5 +#define FW_RI_BIND_MW_WR_NS_M 0x1 +#define FW_RI_BIND_MW_WR_NS_V(x) ((x) << FW_RI_BIND_MW_WR_NS_S) +#define FW_RI_BIND_MW_WR_NS_G(x) \ + (((x) >> FW_RI_BIND_MW_WR_NS_S) & FW_RI_BIND_MW_WR_NS_M) +#define FW_RI_BIND_MW_WR_NS_F FW_RI_BIND_MW_WR_NS_V(1U) + +#define FW_RI_BIND_MW_WR_DCACPU_S 0 +#define FW_RI_BIND_MW_WR_DCACPU_M 0x1f +#define FW_RI_BIND_MW_WR_DCACPU_V(x) ((x) << FW_RI_BIND_MW_WR_DCACPU_S) +#define FW_RI_BIND_MW_WR_DCACPU_G(x) \ + (((x) >> FW_RI_BIND_MW_WR_DCACPU_S) & FW_RI_BIND_MW_WR_DCACPU_M) + +struct fw_ri_fr_nsmr_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + __u8 qpbinde_to_dcacpu; + __u8 pgsz_shift; + __u8 addr_type; + __u8 mem_perms; + __be32 stag; + __be32 len_hi; + __be32 len_lo; + __be32 va_hi; + __be32 va_lo_fbo; +}; + +#define FW_RI_FR_NSMR_WR_QPBINDE_S 6 +#define FW_RI_FR_NSMR_WR_QPBINDE_M 0x1 +#define FW_RI_FR_NSMR_WR_QPBINDE_V(x) ((x) << FW_RI_FR_NSMR_WR_QPBINDE_S) +#define FW_RI_FR_NSMR_WR_QPBINDE_G(x) \ + (((x) >> FW_RI_FR_NSMR_WR_QPBINDE_S) & FW_RI_FR_NSMR_WR_QPBINDE_M) +#define FW_RI_FR_NSMR_WR_QPBINDE_F FW_RI_FR_NSMR_WR_QPBINDE_V(1U) + +#define FW_RI_FR_NSMR_WR_NS_S 5 +#define FW_RI_FR_NSMR_WR_NS_M 0x1 +#define FW_RI_FR_NSMR_WR_NS_V(x) ((x) << FW_RI_FR_NSMR_WR_NS_S) +#define FW_RI_FR_NSMR_WR_NS_G(x) \ + (((x) >> FW_RI_FR_NSMR_WR_NS_S) & FW_RI_FR_NSMR_WR_NS_M) +#define FW_RI_FR_NSMR_WR_NS_F FW_RI_FR_NSMR_WR_NS_V(1U) + +#define FW_RI_FR_NSMR_WR_DCACPU_S 0 +#define FW_RI_FR_NSMR_WR_DCACPU_M 0x1f +#define FW_RI_FR_NSMR_WR_DCACPU_V(x) ((x) << FW_RI_FR_NSMR_WR_DCACPU_S) +#define FW_RI_FR_NSMR_WR_DCACPU_G(x) \ + (((x) >> FW_RI_FR_NSMR_WR_DCACPU_S) & FW_RI_FR_NSMR_WR_DCACPU_M) + +struct fw_ri_inv_lstag_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + __be32 r2; + __be32 stag_inv; +}; + +enum fw_ri_type { + FW_RI_TYPE_INIT, + FW_RI_TYPE_FINI, + FW_RI_TYPE_TERMINATE +}; + +enum fw_ri_init_p2ptype { + FW_RI_INIT_P2PTYPE_RDMA_WRITE = FW_RI_RDMA_WRITE, + FW_RI_INIT_P2PTYPE_READ_REQ = FW_RI_READ_REQ, + FW_RI_INIT_P2PTYPE_SEND = FW_RI_SEND, + FW_RI_INIT_P2PTYPE_SEND_WITH_INV = FW_RI_SEND_WITH_INV, + FW_RI_INIT_P2PTYPE_SEND_WITH_SE = FW_RI_SEND_WITH_SE, + FW_RI_INIT_P2PTYPE_SEND_WITH_SE_INV = FW_RI_SEND_WITH_SE_INV, + FW_RI_INIT_P2PTYPE_DISABLED = 0xf, +}; + +enum fw_ri_init_rqeqid_srq { + FW_RI_INIT_RQEQID_SRQ = 1 << 31, +}; + +struct fw_ri_wr { + __be32 op_compl; + __be32 flowid_len16; + __u64 cookie; + union fw_ri { + struct fw_ri_init { + __u8 type; + __u8 mpareqbit_p2ptype; + __u8 r4[2]; + __u8 mpa_attrs; + __u8 qp_caps; + __be16 nrqe; + __be32 pdid; + __be32 qpid; + __be32 sq_eqid; + __be32 rq_eqid; + __be32 scqid; + __be32 rcqid; + __be32 ord_max; + __be32 ird_max; + __be32 iss; + __be32 irs; + __be32 hwrqsize; + __be32 hwrqaddr; + __be64 r5; + union 
fw_ri_init_p2p { + struct fw_ri_rdma_write_wr write; + struct fw_ri_rdma_read_wr read; + struct fw_ri_send_wr send; + } u; + } init; + struct fw_ri_fini { + __u8 type; + __u8 r3[7]; + __be64 r4; + } fini; + struct fw_ri_terminate { + __u8 type; + __u8 r3[3]; + __be32 immdlen; + __u8 termmsg[40]; + } terminate; + } u; +}; + +#define FW_RI_WR_MPAREQBIT_S 7 +#define FW_RI_WR_MPAREQBIT_M 0x1 +#define FW_RI_WR_MPAREQBIT_V(x) ((x) << FW_RI_WR_MPAREQBIT_S) +#define FW_RI_WR_MPAREQBIT_G(x) \ + (((x) >> FW_RI_WR_MPAREQBIT_S) & FW_RI_WR_MPAREQBIT_M) +#define FW_RI_WR_MPAREQBIT_F FW_RI_WR_MPAREQBIT_V(1U) + +#define FW_RI_WR_P2PTYPE_S 0 +#define FW_RI_WR_P2PTYPE_M 0xf +#define FW_RI_WR_P2PTYPE_V(x) ((x) << FW_RI_WR_P2PTYPE_S) +#define FW_RI_WR_P2PTYPE_G(x) \ + (((x) >> FW_RI_WR_P2PTYPE_S) & FW_RI_WR_P2PTYPE_M) + +#endif /* _T4FW_RI_API_H_ */ diff --git a/providers/cxgb4/verbs.c b/providers/cxgb4/verbs.c new file mode 100644 index 0000000..32bae69 --- /dev/null +++ b/providers/cxgb4/verbs.c @@ -0,0 +1,828 @@ +/* + * Copyright (c) 2006-2016 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include <config.h> + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <pthread.h> +#include <sys/mman.h> +#include <inttypes.h> +#include <assert.h> + +#include "libcxgb4.h" +#include "cxgb4-abi.h" + +bool is_64b_cqe; + +#define MASKED(x) (void *)((unsigned long)(x) & c4iw_page_mask) + +int c4iw_query_device(struct ibv_context *context, struct ibv_device_attr *attr) +{ + struct ibv_query_device cmd; + uint64_t raw_fw_ver; + u8 major, minor, sub_minor, build; + int ret; + + ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, &cmd, + sizeof cmd); + if (ret) + return ret; + + major = (raw_fw_ver >> 24) & 0xff; + minor = (raw_fw_ver >> 16) & 0xff; + sub_minor = (raw_fw_ver >> 8) & 0xff; + build = raw_fw_ver & 0xff; + + snprintf(attr->fw_ver, sizeof attr->fw_ver, + "%d.%d.%d.%d", major, minor, sub_minor, build); + + return 0; +} + +int c4iw_query_port(struct ibv_context *context, uint8_t port, + struct ibv_port_attr *attr) +{ + struct ibv_query_port cmd; + + return ibv_cmd_query_port(context, port, attr, &cmd, sizeof cmd); +} + +struct ibv_pd *c4iw_alloc_pd(struct ibv_context *context) +{ + struct ibv_alloc_pd cmd; + struct uc4iw_alloc_pd_resp resp; + struct c4iw_pd *pd; + + pd = malloc(sizeof *pd); + if (!pd) + return NULL; + + if (ibv_cmd_alloc_pd(context, &pd->ibv_pd, &cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp)) { + free(pd); + return NULL; + } + + return &pd->ibv_pd; +} + +int c4iw_free_pd(struct ibv_pd *pd) +{ + int ret; + + ret = ibv_cmd_dealloc_pd(pd); + if (ret) + return ret; + + free(pd); + return 0; +} + +struct ibv_mr *c4iw_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + uint64_t hca_va, int access) +{ + struct c4iw_mr *mhp; + struct ibv_reg_mr cmd; + struct ib_uverbs_reg_mr_resp resp; + struct c4iw_dev *dev = to_c4iw_dev(pd->context->device); + + PDBG("%s addr %p length %ld hca_va %p\n", __func__, addr, length, + hca_va); + + mhp = malloc(sizeof *mhp); + if (!mhp) + return NULL; + + if (ibv_cmd_reg_mr(pd, addr, length, hca_va, + access, &mhp->vmr, &cmd, sizeof(cmd), + &resp, sizeof resp)) { + free(mhp); + return NULL; + } + + mhp->va_fbo = hca_va; + mhp->len = length; + + PDBG("%s stag 0x%x va_fbo 0x%" PRIx64 " len %d\n", + __func__, mhp->vmr.ibv_mr.rkey, mhp->va_fbo, mhp->len); + + pthread_spin_lock(&dev->lock); + dev->mmid2ptr[c4iw_mmid(mhp->vmr.ibv_mr.lkey)] = mhp; + pthread_spin_unlock(&dev->lock); + INC_STAT(mr); + return &mhp->vmr.ibv_mr; +} + +int c4iw_dereg_mr(struct verbs_mr *vmr) +{ + int ret; + struct c4iw_dev *dev = to_c4iw_dev(vmr->ibv_mr.pd->context->device); + + ret = ibv_cmd_dereg_mr(vmr); + if (ret) + return ret; + + pthread_spin_lock(&dev->lock); + dev->mmid2ptr[c4iw_mmid(vmr->ibv_mr.lkey)] = NULL; + pthread_spin_unlock(&dev->lock); + + free(to_c4iw_mr(vmr)); + + return 0; +} + +struct ibv_cq *c4iw_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, int comp_vector) +{ + struct uc4iw_create_cq cmd = {}; + struct uc4iw_create_cq_resp resp; + struct c4iw_cq *chp; + struct c4iw_dev *dev = to_c4iw_dev(context->device); + int ret; + + chp = calloc(1, sizeof *chp); + if (!chp) { + return NULL; + } + + resp.flags = 0; + cmd.flags = C4IW_64B_CQE; + + ret = ibv_cmd_create_cq(context, cqe, channel, comp_vector, + &chp->ibv_cq, &cmd.ibv_cmd, sizeof(cmd), + &resp.ibv_resp, sizeof resp); + if (ret) + goto err1; + + if (resp.flags & C4IW_64B_CQE) + is_64b_cqe = true; + + pthread_spin_init(&chp->lock, PTHREAD_PROCESS_PRIVATE); +#ifdef STALL_DETECTION + gettimeofday(&chp->time, NULL); 
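+	/* Baseline timestamp, compared against later by the
+	 * stall-detection debug code (comment added for clarity;
+	 * the exact consumer is in the STALL_DETECTION paths). */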
+#endif + chp->rhp = dev; + chp->cq.qid_mask = resp.qid_mask; + chp->cq.cqid = resp.cqid; + chp->cq.size = resp.size; + chp->cq.memsize = resp.memsize; + chp->cq.gen = 1; + chp->cq.queue = mmap(NULL, chp->cq.memsize, PROT_READ|PROT_WRITE, + MAP_SHARED, context->cmd_fd, resp.key); + if (chp->cq.queue == MAP_FAILED) + goto err2; + + chp->cq.qp_errp = + &((struct t4_status_page *) + Q_ENTRY(chp->cq.queue, chp->cq.size))->qp_err; + + chp->cq.ugts = mmap(NULL, c4iw_page_size, PROT_WRITE, MAP_SHARED, + context->cmd_fd, resp.gts_key); + if (chp->cq.ugts == MAP_FAILED) + goto err3; + + if (dev_is_t4(chp->rhp)) + chp->cq.ugts += 1; + else + chp->cq.ugts += 5; + chp->cq.sw_queue = calloc(chp->cq.size, CQE_SIZE(chp->cq.queue)); + if (!chp->cq.sw_queue) + goto err4; + + PDBG("%s cqid 0x%x key %" PRIx64 " va %p memsize %lu gts_key %" + PRIx64 " va %p qid_mask 0x%x\n", + __func__, chp->cq.cqid, resp.key, chp->cq.queue, + chp->cq.memsize, resp.gts_key, chp->cq.ugts, chp->cq.qid_mask); + + pthread_spin_lock(&dev->lock); + dev->cqid2ptr[chp->cq.cqid] = chp; + pthread_spin_unlock(&dev->lock); + INC_STAT(cq); + return &chp->ibv_cq; +err4: + munmap(MASKED(chp->cq.ugts), c4iw_page_size); +err3: + munmap(chp->cq.queue, chp->cq.memsize); +err2: + (void)ibv_cmd_destroy_cq(&chp->ibv_cq); +err1: + free(chp); + return NULL; +} + +int c4iw_destroy_cq(struct ibv_cq *ibcq) +{ + int ret; + struct c4iw_cq *chp = to_c4iw_cq(ibcq); + struct c4iw_dev *dev = to_c4iw_dev(ibcq->context->device); + + chp->cq.error = 1; + ret = ibv_cmd_destroy_cq(ibcq); + if (ret) { + return ret; + } + munmap(MASKED(chp->cq.ugts), c4iw_page_size); + munmap(chp->cq.queue, chp->cq.memsize); + + pthread_spin_lock(&dev->lock); + dev->cqid2ptr[chp->cq.cqid] = NULL; + pthread_spin_unlock(&dev->lock); + + free(chp->cq.sw_queue); + free(chp); + return 0; +} + +struct ibv_srq *c4iw_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *attr) +{ + struct c4iw_dev *dev = to_c4iw_dev(pd->context->device); + struct uc4iw_create_srq_resp resp; + unsigned long segment_offset; + struct ibv_create_srq cmd; + struct c4iw_srq *srq; + void *dbva; + int ret; + + PDBG("%s enter\n", __func__); + srq = calloc(1, sizeof(*srq)); + if (!srq) + goto err; + + ret = ibv_cmd_create_srq(pd, &srq->ibv_srq, attr, &cmd, + sizeof(cmd), &resp.ibv_resp, sizeof(resp)); + if (ret) + goto err_free_srq_mem; + + PDBG("%s srq id 0x%x srq key %" PRIx64 " srq db/gts key %" PRIx64 + " qid_mask 0x%x\n", __func__, + resp.srqid, resp.srq_key, resp.srq_db_gts_key, + resp.qid_mask); + + srq->rhp = dev; + srq->wq.qid = resp.srqid; + srq->wq.size = resp.srq_size; + srq->wq.memsize = resp.srq_memsize; + srq->wq.rqt_abs_idx = resp.rqt_abs_idx; + srq->flags = resp.flags; + pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE); + + dbva = mmap(NULL, c4iw_page_size, PROT_WRITE, MAP_SHARED, + pd->context->cmd_fd, resp.srq_db_gts_key); + if (dbva == MAP_FAILED) + goto err_destroy_srq; + srq->wq.udb = dbva; + + segment_offset = 128 * (srq->wq.qid & resp.qid_mask); + if (segment_offset < c4iw_page_size) { + srq->wq.udb += segment_offset / 4; + srq->wq.wc_reg_available = 1; + } else + srq->wq.bar2_qid = srq->wq.qid & resp.qid_mask; + srq->wq.udb += 2; + + srq->wq.queue = mmap(NULL, srq->wq.memsize, + PROT_WRITE, MAP_SHARED, + pd->context->cmd_fd, resp.srq_key); + if (srq->wq.queue == MAP_FAILED) + goto err_unmap_udb; + + srq->wq.sw_rq = calloc(srq->wq.size, sizeof(struct t4_swrqe)); + if (!srq->wq.sw_rq) + goto err_unmap_queue; + srq->wq.pending_wrs = + calloc(srq->wq.size, sizeof(*srq->wq.pending_wrs)); 
+ if (!srq->wq.pending_wrs) + goto err_free_sw_rq; + + pthread_spin_lock(&dev->lock); + list_add_tail(&dev->srq_list, &srq->list); + pthread_spin_unlock(&dev->lock); + + PDBG("%s srq dbva %p srq qva %p srq depth %u srq memsize %lu\n", + __func__, srq->wq.udb, srq->wq.queue, + srq->wq.size, srq->wq.memsize); + + INC_STAT(srq); + return &srq->ibv_srq; +err_free_sw_rq: + free(srq->wq.sw_rq); +err_unmap_queue: + munmap((void *)srq->wq.queue, srq->wq.memsize); +err_unmap_udb: + munmap(MASKED(srq->wq.udb), c4iw_page_size); +err_destroy_srq: + (void)ibv_cmd_destroy_srq(&srq->ibv_srq); +err_free_srq_mem: + free(srq); +err: + + return NULL; +} + +int c4iw_modify_srq(struct ibv_srq *ibsrq, struct ibv_srq_attr *attr, + int attr_mask) +{ + struct c4iw_srq *srq = to_c4iw_srq(ibsrq); + struct ibv_modify_srq cmd; + int ret; + + /* XXX no support for this yet */ + if (attr_mask & IBV_SRQ_MAX_WR) + return EINVAL; + + ret = ibv_cmd_modify_srq(ibsrq, attr, attr_mask, &cmd, sizeof(cmd)); + if (!ret) { + if (attr_mask & IBV_SRQ_LIMIT) { + srq->armed = 1; + srq->srq_limit = attr->srq_limit; + } + } + return ret; +} + +int c4iw_destroy_srq(struct ibv_srq *ibsrq) +{ + int ret; + struct c4iw_srq *srq = to_c4iw_srq(ibsrq); + + PDBG("%s enter qp %p\n", __func__, ibsrq); + + ret = ibv_cmd_destroy_srq(ibsrq); + if (ret) + return ret; + + pthread_spin_lock(&srq->rhp->lock); + list_del(&srq->list); + pthread_spin_unlock(&srq->rhp->lock); + + munmap(MASKED(srq->wq.udb), c4iw_page_size); + munmap(srq->wq.queue, srq->wq.memsize); + + free(srq->wq.pending_wrs); + free(srq->wq.sw_rq); + free(srq); + return 0; + +} + +int c4iw_query_srq(struct ibv_srq *ibsrq, struct ibv_srq_attr *attr) +{ + struct ibv_query_srq cmd; + + return ibv_cmd_query_srq(ibsrq, attr, &cmd, sizeof(cmd)); +} + +static struct ibv_qp *create_qp_v0(struct ibv_pd *pd, + struct ibv_qp_init_attr *attr) +{ + struct ibv_create_qp cmd; + struct uc4iw_create_qp_v0_resp resp; + struct c4iw_qp *qhp; + struct c4iw_dev *dev = to_c4iw_dev(pd->context->device); + int ret; + void *dbva; + + PDBG("%s enter qp\n", __func__); + qhp = calloc(1, sizeof *qhp); + if (!qhp) + goto err1; + + ret = ibv_cmd_create_qp(pd, &qhp->ibv_qp, attr, &cmd, + sizeof cmd, &resp.ibv_resp, sizeof resp); + if (ret) + goto err2; + + PDBG("%s sqid 0x%x sq key %" PRIx64 " sq db/gts key %" PRIx64 + " rqid 0x%x rq key %" PRIx64 " rq db/gts key %" PRIx64 + " qid_mask 0x%x\n", + __func__, + resp.sqid, resp.sq_key, resp.sq_db_gts_key, + resp.rqid, resp.rq_key, resp.rq_db_gts_key, resp.qid_mask); + + qhp->wq.qid_mask = resp.qid_mask; + qhp->rhp = dev; + qhp->wq.sq.qid = resp.sqid; + qhp->wq.sq.size = resp.sq_size; + qhp->wq.sq.memsize = resp.sq_memsize; + qhp->wq.sq.flags = 0; + qhp->wq.rq.msn = 1; + qhp->wq.rq.qid = resp.rqid; + qhp->wq.rq.size = resp.rq_size; + qhp->wq.rq.memsize = resp.rq_memsize; + pthread_spin_init(&qhp->lock, PTHREAD_PROCESS_PRIVATE); + + dbva = mmap(NULL, c4iw_page_size, PROT_WRITE, MAP_SHARED, + pd->context->cmd_fd, resp.sq_db_gts_key); + if (dbva == MAP_FAILED) + goto err3; + + qhp->wq.sq.udb = dbva; + qhp->wq.sq.queue = mmap(NULL, qhp->wq.sq.memsize, + PROT_WRITE, MAP_SHARED, + pd->context->cmd_fd, resp.sq_key); + if (qhp->wq.sq.queue == MAP_FAILED) + goto err4; + + dbva = mmap(NULL, c4iw_page_size, PROT_WRITE, MAP_SHARED, + pd->context->cmd_fd, resp.rq_db_gts_key); + if (dbva == MAP_FAILED) + goto err5; + qhp->wq.rq.udb = dbva; + qhp->wq.rq.queue = mmap(NULL, qhp->wq.rq.memsize, + PROT_WRITE, MAP_SHARED, + pd->context->cmd_fd, resp.rq_key); + if (qhp->wq.rq.queue == MAP_FAILED) 
+ goto err6; + + qhp->wq.sq.sw_sq = calloc(qhp->wq.sq.size, sizeof (struct t4_swsqe)); + if (!qhp->wq.sq.sw_sq) + goto err7; + + qhp->wq.rq.sw_rq = calloc(qhp->wq.rq.size, sizeof(struct t4_swrqe)); + if (!qhp->wq.rq.sw_rq) + goto err8; + + PDBG("%s sq dbva %p sq qva %p sq depth %u sq memsize %lu " + " rq dbva %p rq qva %p rq depth %u rq memsize %lu\n", + __func__, + qhp->wq.sq.udb, qhp->wq.sq.queue, + qhp->wq.sq.size, qhp->wq.sq.memsize, + qhp->wq.rq.udb, qhp->wq.rq.queue, + qhp->wq.rq.size, qhp->wq.rq.memsize); + + qhp->sq_sig_all = attr->sq_sig_all; + + pthread_spin_lock(&dev->lock); + dev->qpid2ptr[qhp->wq.sq.qid] = qhp; + pthread_spin_unlock(&dev->lock); + INC_STAT(qp); + return &qhp->ibv_qp; +err8: + free(qhp->wq.sq.sw_sq); +err7: + munmap((void *)qhp->wq.rq.queue, qhp->wq.rq.memsize); +err6: + munmap(MASKED(qhp->wq.rq.udb), c4iw_page_size); +err5: + munmap((void *)qhp->wq.sq.queue, qhp->wq.sq.memsize); +err4: + munmap(MASKED(qhp->wq.sq.udb), c4iw_page_size); +err3: + (void)ibv_cmd_destroy_qp(&qhp->ibv_qp); +err2: + free(qhp); +err1: + return NULL; +} + +static struct ibv_qp *create_qp(struct ibv_pd *pd, + struct ibv_qp_init_attr *attr) +{ + struct ibv_create_qp cmd; + struct uc4iw_create_qp_resp resp; + struct c4iw_qp *qhp; + struct c4iw_dev *dev = to_c4iw_dev(pd->context->device); + struct c4iw_context *ctx = to_c4iw_context(pd->context); + int ret; + void *dbva; + + PDBG("%s enter qp\n", __func__); + qhp = calloc(1, sizeof *qhp); + if (!qhp) + goto err1; + + ret = ibv_cmd_create_qp(pd, &qhp->ibv_qp, attr, &cmd, + sizeof cmd, &resp.ibv_resp, sizeof resp); + if (ret) + goto err2; + + PDBG("%s sqid 0x%x sq key %" PRIx64 " sq db/gts key %" PRIx64 + " rqid 0x%x rq key %" PRIx64 " rq db/gts key %" PRIx64 + " qid_mask 0x%x\n", + __func__, + resp.sqid, resp.sq_key, resp.sq_db_gts_key, + resp.rqid, resp.rq_key, resp.rq_db_gts_key, resp.qid_mask); + + qhp->wq.qid_mask = resp.qid_mask; + qhp->rhp = dev; + qhp->wq.sq.qid = resp.sqid; + qhp->wq.sq.size = resp.sq_size; + qhp->wq.sq.memsize = resp.sq_memsize; + qhp->wq.sq.flags = resp.flags & C4IW_QPF_ONCHIP ? T4_SQ_ONCHIP : 0; + if (resp.flags & C4IW_QPF_WRITE_W_IMM) + qhp->wq.sq.flags |= T4_SQ_WRITE_W_IMM; + qhp->wq.sq.flush_cidx = -1; + qhp->wq.rq.msn = 1; + qhp->srq = to_c4iw_srq(attr->srq); + if (!attr->srq) { + qhp->wq.rq.qid = resp.rqid; + qhp->wq.rq.size = resp.rq_size; + qhp->wq.rq.memsize = resp.rq_memsize; + } + if (ma_wr && resp.sq_memsize < (resp.sq_size + 1) * + sizeof *qhp->wq.sq.queue + 16*sizeof(__be64) ) { + ma_wr = 0; + fprintf(stderr, "libcxgb4 warning - downlevel iw_cxgb4 driver. 
" + "MA workaround disabled.\n"); + } + pthread_spin_init(&qhp->lock, PTHREAD_PROCESS_PRIVATE); + + dbva = mmap(NULL, c4iw_page_size, PROT_WRITE, MAP_SHARED, + pd->context->cmd_fd, resp.sq_db_gts_key); + if (dbva == MAP_FAILED) + goto err3; + qhp->wq.sq.udb = dbva; + if (!dev_is_t4(qhp->rhp)) { + unsigned long segment_offset = 128 * (qhp->wq.sq.qid & + qhp->wq.qid_mask); + + if (segment_offset < c4iw_page_size) { + qhp->wq.sq.udb += segment_offset / 4; + qhp->wq.sq.wc_reg_available = 1; + } else + qhp->wq.sq.bar2_qid = qhp->wq.sq.qid & qhp->wq.qid_mask; + qhp->wq.sq.udb += 2; + } + + qhp->wq.sq.queue = mmap(NULL, qhp->wq.sq.memsize, + PROT_WRITE, MAP_SHARED, + pd->context->cmd_fd, resp.sq_key); + if (qhp->wq.sq.queue == MAP_FAILED) + goto err4; + + if (!attr->srq) { + dbva = mmap(NULL, c4iw_page_size, PROT_WRITE, MAP_SHARED, + pd->context->cmd_fd, resp.rq_db_gts_key); + if (dbva == MAP_FAILED) + goto err5; + qhp->wq.rq.udb = dbva; + if (!dev_is_t4(qhp->rhp)) { + unsigned long segment_offset = 128 * (qhp->wq.rq.qid & + qhp->wq.qid_mask); + + if (segment_offset < c4iw_page_size) { + qhp->wq.rq.udb += segment_offset / 4; + qhp->wq.rq.wc_reg_available = 1; + } else + qhp->wq.rq.bar2_qid = + qhp->wq.rq.qid & qhp->wq.qid_mask; + qhp->wq.rq.udb += 2; + } + qhp->wq.rq.queue = mmap(NULL, qhp->wq.rq.memsize, + PROT_WRITE, MAP_SHARED, + pd->context->cmd_fd, resp.rq_key); + if (qhp->wq.rq.queue == MAP_FAILED) + goto err6; + } + + qhp->wq.sq.sw_sq = calloc(qhp->wq.sq.size, sizeof (struct t4_swsqe)); + if (!qhp->wq.sq.sw_sq) + goto err7; + + if (!attr->srq) { + qhp->wq.rq.sw_rq = + calloc(qhp->wq.rq.size, sizeof(struct t4_swrqe)); + if (!qhp->wq.rq.sw_rq) + goto err8; + } + + if (t4_sq_onchip(&qhp->wq)) { + qhp->wq.sq.ma_sync = mmap(NULL, c4iw_page_size, PROT_WRITE, + MAP_SHARED, pd->context->cmd_fd, + resp.ma_sync_key); + if (qhp->wq.sq.ma_sync == MAP_FAILED) + goto err9; + qhp->wq.sq.ma_sync += (A_PCIE_MA_SYNC & (c4iw_page_size - 1)); + } + + if (ctx->status_page_size) { + qhp->wq.db_offp = &ctx->status_page->db_off; + } else if (!attr->srq) { + qhp->wq.db_offp = + &qhp->wq.rq.queue[qhp->wq.rq.size].status.db_off; + } + + if (!attr->srq) + qhp->wq.qp_errp = + &qhp->wq.rq.queue[qhp->wq.rq.size].status.qp_err; + else { + qhp->wq.qp_errp = + &qhp->wq.sq.queue[qhp->wq.sq.size].status.qp_err; + qhp->wq.srqidxp = + &qhp->wq.sq.queue[qhp->wq.sq.size].status.srqidx; + } + + PDBG("%s sq dbva %p sq qva %p sq depth %u sq memsize %lu " + " rq dbva %p rq qva %p rq depth %u rq memsize %lu\n", + __func__, + qhp->wq.sq.udb, qhp->wq.sq.queue, + qhp->wq.sq.size, qhp->wq.sq.memsize, + qhp->wq.rq.udb, qhp->wq.rq.queue, + qhp->wq.rq.size, qhp->wq.rq.memsize); + + qhp->sq_sig_all = attr->sq_sig_all; + + pthread_spin_lock(&dev->lock); + dev->qpid2ptr[qhp->wq.sq.qid] = qhp; + pthread_spin_unlock(&dev->lock); + INC_STAT(qp); + return &qhp->ibv_qp; +err9: + if (!attr->srq) + free(qhp->wq.rq.sw_rq); +err8: + free(qhp->wq.sq.sw_sq); +err7: + if (!attr->srq) + munmap((void *)qhp->wq.rq.queue, qhp->wq.rq.memsize); +err6: + if (!attr->srq) + munmap(MASKED(qhp->wq.rq.udb), c4iw_page_size); +err5: + munmap((void *)qhp->wq.sq.queue, qhp->wq.sq.memsize); +err4: + munmap(MASKED(qhp->wq.sq.udb), c4iw_page_size); +err3: + (void)ibv_cmd_destroy_qp(&qhp->ibv_qp); +err2: + free(qhp); +err1: + return NULL; +} + +struct ibv_qp *c4iw_create_qp(struct ibv_pd *pd, + struct ibv_qp_init_attr *attr) +{ + struct c4iw_dev *dev = to_c4iw_dev(pd->context->device); + + if (dev->abi_version == 0) + return create_qp_v0(pd, attr); + return create_qp(pd, 
attr); +} + +static void reset_qp(struct c4iw_qp *qhp) +{ + PDBG("%s enter qp %p\n", __func__, qhp); + qhp->wq.sq.cidx = 0; + qhp->wq.sq.wq_pidx = qhp->wq.sq.pidx = qhp->wq.sq.in_use = 0; + qhp->wq.rq.cidx = qhp->wq.rq.pidx = qhp->wq.rq.in_use = 0; + qhp->wq.sq.oldest_read = NULL; + memset(qhp->wq.sq.queue, 0, qhp->wq.sq.memsize); + if (t4_sq_onchip(&qhp->wq)) + mmio_flush_writes(); + memset(qhp->wq.rq.queue, 0, qhp->wq.rq.memsize); +} + +int c4iw_modify_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, + int attr_mask) +{ + struct ibv_modify_qp cmd = {}; + struct c4iw_qp *qhp = to_c4iw_qp(ibqp); + int ret; + + PDBG("%s enter qp %p new state %d\n", __func__, ibqp, attr_mask & IBV_QP_STATE ? attr->qp_state : -1); + if (t4_wq_in_error(&qhp->wq)) + c4iw_flush_qp(qhp); + pthread_spin_lock(&qhp->lock); + ret = ibv_cmd_modify_qp(ibqp, attr, attr_mask, &cmd, sizeof cmd); + if (!ret && (attr_mask & IBV_QP_STATE) && attr->qp_state == IBV_QPS_RESET) + reset_qp(qhp); + pthread_spin_unlock(&qhp->lock); + return ret; +} + +int c4iw_destroy_qp(struct ibv_qp *ibqp) +{ + int ret; + struct c4iw_qp *qhp = to_c4iw_qp(ibqp); + struct c4iw_dev *dev = to_c4iw_dev(ibqp->context->device); + + PDBG("%s enter qp %p\n", __func__, ibqp); + c4iw_flush_qp(qhp); + + ret = ibv_cmd_destroy_qp(ibqp); + if (ret) { + return ret; + } + if (t4_sq_onchip(&qhp->wq)) { + qhp->wq.sq.ma_sync -= (A_PCIE_MA_SYNC & (c4iw_page_size - 1)); + munmap((void *)qhp->wq.sq.ma_sync, c4iw_page_size); + } + munmap(MASKED(qhp->wq.sq.udb), c4iw_page_size); + munmap(qhp->wq.sq.queue, qhp->wq.sq.memsize); + if (!qhp->srq) { + munmap(MASKED(qhp->wq.rq.udb), c4iw_page_size); + munmap(qhp->wq.rq.queue, qhp->wq.rq.memsize); + } + + pthread_spin_lock(&dev->lock); + dev->qpid2ptr[qhp->wq.sq.qid] = NULL; + pthread_spin_unlock(&dev->lock); + + if (!qhp->srq) + free(qhp->wq.rq.sw_rq); + free(qhp->wq.sq.sw_sq); + free(qhp); + return 0; +} + +int c4iw_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, + int attr_mask, struct ibv_qp_init_attr *init_attr) +{ + struct ibv_query_qp cmd; + struct c4iw_qp *qhp = to_c4iw_qp(ibqp); + int ret; + + if (t4_wq_in_error(&qhp->wq)) + c4iw_flush_qp(qhp); + pthread_spin_lock(&qhp->lock); + ret = ibv_cmd_query_qp(ibqp, attr, attr_mask, init_attr, &cmd, sizeof cmd); + pthread_spin_unlock(&qhp->lock); + return ret; +} + +int c4iw_attach_mcast(struct ibv_qp *ibqp, const union ibv_gid *gid, + uint16_t lid) +{ + struct c4iw_qp *qhp = to_c4iw_qp(ibqp); + int ret; + + if (t4_wq_in_error(&qhp->wq)) + c4iw_flush_qp(qhp); + pthread_spin_lock(&qhp->lock); + ret = ibv_cmd_attach_mcast(ibqp, gid, lid); + pthread_spin_unlock(&qhp->lock); + return ret; +} + +int c4iw_detach_mcast(struct ibv_qp *ibqp, const union ibv_gid *gid, + uint16_t lid) +{ + struct c4iw_qp *qhp = to_c4iw_qp(ibqp); + int ret; + + if (t4_wq_in_error(&qhp->wq)) + c4iw_flush_qp(qhp); + pthread_spin_lock(&qhp->lock); + ret = ibv_cmd_detach_mcast(ibqp, gid, lid); + pthread_spin_unlock(&qhp->lock); + return ret; +} + +void c4iw_async_event(struct ibv_context *context, + struct ibv_async_event *event) +{ + PDBG("%s type %d obj %p\n", __func__, event->event_type, + event->element.cq); + + switch (event->event_type) { + case IBV_EVENT_CQ_ERR: + break; + case IBV_EVENT_QP_FATAL: + case IBV_EVENT_QP_REQ_ERR: + case IBV_EVENT_QP_ACCESS_ERR: + case IBV_EVENT_PATH_MIG_ERR: { + struct c4iw_qp *qhp = to_c4iw_qp(event->element.qp); + c4iw_flush_qp(qhp); + break; + } + case IBV_EVENT_SQ_DRAINED: + case IBV_EVENT_PATH_MIG: + case IBV_EVENT_COMM_EST: + case 
IBV_EVENT_QP_LAST_WQE_REACHED: + default: + break; + } +} diff --git a/providers/efa/CMakeLists.txt b/providers/efa/CMakeLists.txt new file mode 100644 index 0000000..ba0032a --- /dev/null +++ b/providers/efa/CMakeLists.txt @@ -0,0 +1,11 @@ +rdma_shared_provider(efa libefa.map + 1 1.1.${PACKAGE_VERSION} + efa.c + verbs.c +) + +publish_headers(infiniband + efadv.h +) + +rdma_pkg_config("efa" "libibverbs" "${CMAKE_THREAD_LIBS_INIT}") diff --git a/providers/efa/efa-abi.h b/providers/efa/efa-abi.h new file mode 100644 index 0000000..7e1a878 --- /dev/null +++ b/providers/efa/efa-abi.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef __EFA_ABI_H__ +#define __EFA_ABI_H__ + +#include <infiniband/kern-abi.h> +#include <kernel-abi/efa-abi.h> +#include <rdma/efa-abi.h> + +#define EFA_ABI_VERSION 1 + +DECLARE_DRV_CMD(efa_alloc_ucontext, IB_USER_VERBS_CMD_GET_CONTEXT, empty, + efa_ibv_alloc_ucontext_resp); +DECLARE_DRV_CMD(efa_alloc_pd, IB_USER_VERBS_CMD_ALLOC_PD, empty, + efa_ibv_alloc_pd_resp); +DECLARE_DRV_CMD(efa_create_cq, IB_USER_VERBS_CMD_CREATE_CQ, efa_ibv_create_cq, + efa_ibv_create_cq_resp); +DECLARE_DRV_CMD(efa_create_qp, IB_USER_VERBS_CMD_CREATE_QP, efa_ibv_create_qp, + efa_ibv_create_qp_resp); +DECLARE_DRV_CMD(efa_create_ah, IB_USER_VERBS_CMD_CREATE_AH, empty, + efa_ibv_create_ah_resp); +DECLARE_DRV_CMD(efa_query_device_ex, IB_USER_VERBS_EX_CMD_QUERY_DEVICE, empty, + efa_ibv_ex_query_device_resp); + +#endif /* __EFA_ABI_H__ */ diff --git a/providers/efa/efa.c b/providers/efa/efa.c new file mode 100644 index 0000000..41955e5 --- /dev/null +++ b/providers/efa/efa.c @@ -0,0 +1,149 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/types.h> +#include <unistd.h> +#include <util/util.h> + +#include "efa.h" +#include "verbs.h" + +static void efa_free_context(struct ibv_context *ibvctx); + +#define PCI_VENDOR_ID_AMAZON 0x1d0f + +static const struct verbs_match_ent efa_table[] = { + VERBS_DRIVER_ID(RDMA_DRIVER_EFA), + VERBS_PCI_MATCH(PCI_VENDOR_ID_AMAZON, 0xefa0, NULL), + {} +}; + +static const struct verbs_context_ops efa_ctx_ops = { + .alloc_pd = efa_alloc_pd, + .create_ah = efa_create_ah, + .create_cq = efa_create_cq, + .create_qp = efa_create_qp, + .create_qp_ex = efa_create_qp_ex, + .dealloc_pd = efa_dealloc_pd, + .dereg_mr = efa_dereg_mr, + .destroy_ah = efa_destroy_ah, + .destroy_cq = efa_destroy_cq, + .destroy_qp = efa_destroy_qp, + .modify_qp = efa_modify_qp, + .poll_cq = efa_poll_cq, + .post_recv = efa_post_recv, + .post_send = efa_post_send, + .query_device = efa_query_device, + .query_device_ex = efa_query_device_ex, + .query_port = efa_query_port, + .query_qp = efa_query_qp, + .reg_mr = efa_reg_mr, + .free_context = efa_free_context, +}; + +static struct verbs_context *efa_alloc_context(struct ibv_device *vdev, + int cmd_fd, + void *private_data) +{ + struct efa_alloc_ucontext_resp resp = {}; + struct ibv_device_attr_ex attr; + struct ibv_get_context cmd; + unsigned int qp_table_sz; + struct efa_context *ctx; + int err; + + ctx = verbs_init_and_alloc_context(vdev, cmd_fd, ctx, ibvctx, + RDMA_DRIVER_EFA); + if (!ctx) + return NULL; + + if (ibv_cmd_get_context(&ctx->ibvctx, &cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp))) + goto err_free_ctx; + + ctx->sub_cqs_per_cq = resp.sub_cqs_per_cq; + ctx->cmds_supp_udata_mask = resp.cmds_supp_udata_mask; + ctx->cqe_size = sizeof(struct efa_io_rx_cdesc); + ctx->inline_buf_size = resp.inline_buf_size; + ctx->max_llq_size = resp.max_llq_size; + pthread_spin_init(&ctx->qp_table_lock, PTHREAD_PROCESS_PRIVATE); + + /* ah udata is mandatory for ah number retrieval */ + if (!(ctx->cmds_supp_udata_mask & EFA_USER_CMDS_SUPP_UDATA_CREATE_AH)) + goto err_free_spinlock; + + verbs_set_ops(&ctx->ibvctx, &efa_ctx_ops); + + err = efa_query_device_ex(&ctx->ibvctx.context, NULL, &attr, + sizeof(attr)); + if (err) + goto err_free_spinlock; + + qp_table_sz = roundup_pow_of_two(attr.orig_attr.max_qp); + ctx->qp_table_sz_m1 = qp_table_sz - 1; + ctx->qp_table = calloc(qp_table_sz, sizeof(*ctx->qp_table)); + if (!ctx->qp_table) + goto err_free_spinlock; + + return &ctx->ibvctx; + +err_free_spinlock: + pthread_spin_destroy(&ctx->qp_table_lock); +err_free_ctx: + verbs_uninit_context(&ctx->ibvctx); + free(ctx); + return NULL; +} + +static void efa_free_context(struct ibv_context *ibvctx) +{ + struct efa_context *ctx = to_efa_context(ibvctx); + + free(ctx->qp_table); + pthread_spin_destroy(&ctx->qp_table_lock); + verbs_uninit_context(&ctx->ibvctx); + free(ctx); +} + +static struct verbs_device *efa_device_alloc(struct verbs_sysfs_dev *sysfs_dev) +{ + struct efa_dev *dev; + + dev = calloc(1, sizeof(*dev)); + if (!dev) + return NULL; + + dev->pg_sz = sysconf(_SC_PAGESIZE); + + return &dev->vdev; +} + +static void efa_uninit_device(struct verbs_device *verbs_device) +{ + struct efa_dev *dev = to_efa_dev(&verbs_device->device); + + free(dev); +} + +static const struct verbs_device_ops efa_dev_ops = { + .name = "efa", + .match_min_abi_version = EFA_ABI_VERSION, + .match_max_abi_version = EFA_ABI_VERSION, + .match_table = efa_table, + .alloc_device = efa_device_alloc, + .uninit_device = efa_uninit_device, + 
.alloc_context = efa_alloc_context, +}; + +bool is_efa_dev(struct ibv_device *device) +{ + struct verbs_device *verbs_device = verbs_get_device(device); + + return verbs_device->ops == &efa_dev_ops; +} +PROVIDER_DRIVER(efa, efa_dev_ops); diff --git a/providers/efa/efa.h b/providers/efa/efa.h new file mode 100644 index 0000000..5be7d71 --- /dev/null +++ b/providers/efa/efa.h @@ -0,0 +1,180 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#ifndef __EFA_H__ +#define __EFA_H__ + +#include <inttypes.h> +#include <pthread.h> +#include <stddef.h> + +#include <infiniband/driver.h> +#include <util/udma_barrier.h> + +#include "efa-abi.h" +#include "efa_io_defs.h" + +struct efa_context { + struct verbs_context ibvctx; + uint32_t cmds_supp_udata_mask; + uint16_t sub_cqs_per_cq; + uint16_t inline_buf_size; + uint32_t max_llq_size; + size_t cqe_size; + struct efa_qp **qp_table; + unsigned int qp_table_sz_m1; + pthread_spinlock_t qp_table_lock; +}; + +struct efa_pd { + struct ibv_pd ibvpd; + uint16_t pdn; +}; + +struct efa_sub_cq { + uint16_t consumed_cnt; + int phase; + uint8_t *buf; + int qmask; + int cqe_size; + uint32_t ref_cnt; +}; + +struct efa_cq { + struct ibv_cq ibvcq; + uint32_t cqn; + size_t cqe_size; + uint8_t *buf; + size_t buf_size; + uint16_t num_sub_cqs; + /* Index of next sub cq idx to poll. This is used to guarantee fairness for sub cqs */ + uint16_t next_poll_idx; + pthread_spinlock_t lock; + struct efa_sub_cq sub_cq_arr[]; +}; + +struct efa_wq { + uint64_t *wrid; + /* wrid_idx_pool: Pool of free indexes in the wrid array, used to select the + * wrid entry to be used to hold the next tx packet's context. + * At init time, entry N will hold value N, as OOO tx-completions arrive, + * the value stored in a given entry might not equal the entry's index. + */ + uint32_t *wrid_idx_pool; + uint32_t wqe_cnt; + uint32_t wqe_posted; + uint32_t wqe_completed; + uint16_t desc_idx; + uint16_t desc_mask; + /* wrid_idx_pool_next: Index of the next entry to use in wrid_idx_pool. 
*/ + uint16_t wrid_idx_pool_next; + int max_sge; + int phase; + pthread_spinlock_t wqlock; +}; + +struct efa_rq { + struct efa_wq wq; + uint32_t *db; + uint8_t *buf; + size_t buf_size; + uint16_t sub_cq_idx; +}; + +struct efa_sq { + struct efa_wq wq; + uint32_t *db; + uint8_t *desc; + uint32_t desc_offset; + size_t desc_ring_mmap_size; + size_t max_inline_data; + size_t max_wr_rdma_sge; + uint16_t sub_cq_idx; + + /* Buffer for pending WR entries in the current session */ + uint8_t *local_queue; + /* Number of WR entries posted in the current session */ + uint32_t num_wqe_pending; + /* Phase before current session */ + int phase_rb; + /* Current wqe being built */ + struct efa_io_tx_wqe *curr_tx_wqe; +}; + +struct efa_qp { + struct verbs_qp verbs_qp; + struct efa_sq sq; + struct efa_rq rq; + int page_size; + struct efa_cq *rcq; + struct efa_cq *scq; + int sq_sig_all; + int wr_session_err; +}; + +struct efa_mr { + struct verbs_mr vmr; +}; + +struct efa_ah { + struct ibv_ah ibvah; + uint16_t efa_ah; +}; + +struct efa_dev { + struct verbs_device vdev; + uint32_t pg_sz; + uint32_t device_caps; + uint32_t max_sq_wr; + uint32_t max_rq_wr; + uint16_t max_sq_sge; + uint16_t max_rq_sge; + uint32_t max_rdma_size; + uint16_t max_wr_rdma_sge; +}; + +static inline bool is_rdma_read_cap(struct efa_dev *dev) +{ + return dev->device_caps & EFA_QUERY_DEVICE_CAPS_RDMA_READ; +} + +static inline struct efa_dev *to_efa_dev(struct ibv_device *ibvdev) +{ + return container_of(ibvdev, struct efa_dev, vdev.device); +} + +static inline struct efa_context *to_efa_context(struct ibv_context *ibvctx) +{ + return container_of(ibvctx, struct efa_context, ibvctx.context); +} + +static inline struct efa_pd *to_efa_pd(struct ibv_pd *ibvpd) +{ + return container_of(ibvpd, struct efa_pd, ibvpd); +} + +static inline struct efa_cq *to_efa_cq(struct ibv_cq *ibvcq) +{ + return container_of(ibvcq, struct efa_cq, ibvcq); +} + +static inline struct efa_qp *to_efa_qp(struct ibv_qp *ibvqp) +{ + return container_of(ibvqp, struct efa_qp, verbs_qp.qp); +} + +static inline struct efa_qp *to_efa_qp_ex(struct ibv_qp_ex *ibvqpx) +{ + return container_of(ibvqpx, struct efa_qp, verbs_qp.qp_ex); +} + +static inline struct efa_ah *to_efa_ah(struct ibv_ah *ibvah) +{ + return container_of(ibvah, struct efa_ah, ibvah); +} + +bool is_efa_dev(struct ibv_device *device); + +#endif /* __EFA_H__ */ diff --git a/providers/efa/efa_io_defs.h b/providers/efa/efa_io_defs.h new file mode 100644 index 0000000..6fc813c --- /dev/null +++ b/providers/efa/efa_io_defs.h @@ -0,0 +1,334 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */ + +#ifndef _EFA_IO_H_ +#define _EFA_IO_H_ + +#define EFA_GET(ptr, type) \ + ((*(ptr) & type##_MASK) >> type##_SHIFT) + +#define EFA_SET(ptr, type, value) \ + ({ *(ptr) |= ((value) << type##_SHIFT) & type##_MASK; }) + +#define BIT(nr) (1UL << (nr)) +#define GENMASK(h, l) (((1U << ((h) - (l) + 1)) - 1) << (l)) + +#define EFA_IO_TX_DESC_NUM_BUFS 2 +#define EFA_IO_TX_DESC_NUM_RDMA_BUFS 1 +#define EFA_IO_TX_DESC_INLINE_MAX_SIZE 32 +#define EFA_IO_TX_DESC_IMM_DATA_SIZE 4 + +enum efa_io_queue_type { + /* send queue (of a QP) */ + EFA_IO_SEND_QUEUE = 1, + /* recv queue (of a QP) */ + EFA_IO_RECV_QUEUE = 2, +}; + +enum efa_io_send_op_type { + /* send message */ + EFA_IO_SEND = 0, + /* RDMA read */ + EFA_IO_RDMA_READ = 1, +}; + +enum efa_io_comp_status { + /* Successful completion */ + EFA_IO_COMP_STATUS_OK = 0, + /* Flushed during QP destroy */ + EFA_IO_COMP_STATUS_FLUSHED = 1, + /* Internal QP error */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_QP_INTERNAL_ERROR = 2, + /* Bad operation type */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_OP_TYPE = 3, + /* Bad AH */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_AH = 4, + /* LKEY not registered or does not match IOVA */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_LKEY = 5, + /* Message too long */ + EFA_IO_COMP_STATUS_LOCAL_ERROR_BAD_LENGTH = 6, + /* Destination ENI is down or does not run EFA */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_ADDRESS = 7, + /* Connection was reset by remote side */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_ABORT = 8, + /* Bad dest QP number (QP does not exist or is in error state) */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_DEST_QPN = 9, + /* Destination resource not ready (no WQEs posted on RQ) */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_RNR = 10, + /* Receiver SGL too short */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_LENGTH = 11, + /* Unexpected status returned by responder */ + EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_STATUS = 12, +}; + +struct efa_io_tx_meta_desc { + /* Verbs-generated Request ID */ + uint16_t req_id; + + /* + * control flags + * 3:0 : op_type - operation type: send/rdma/fast mem + * ops/etc + * 4 : has_imm - immediate_data field carries valid + * data. + * 5 : inline_msg - inline mode - inline message data + * follows this descriptor (no buffer descriptors). + * Note that it is different from immediate data + * 6 : meta_extension - Extended metadata. MBZ + * 7 : meta_desc - Indicates metadata descriptor. + * Must be set. + */ + uint8_t ctrl1; + + /* + * control flags + * 0 : phase + * 1 : reserved25 - MBZ + * 2 : first - Indicates first descriptor in + * transaction. Must be set. + * 3 : last - Indicates last descriptor in + * transaction. Must be set. + * 4 : comp_req - Indicates whether completion should + * be posted, after packet is transmitted. Valid only + * for the first descriptor + * 7:5 : reserved29 - MBZ + */ + uint8_t ctrl2; + + uint16_t dest_qp_num; + + /* + * If inline_msg bit is set, length of inline message in bytes, + * otherwise length of SGL (number of buffers). + */ + uint16_t length; + + /* + * immediate data: if has_imm is set, then this field is included + * within Tx message and reported in remote Rx completion. + */ + uint32_t immediate_data; + + uint16_t ah; + + uint16_t reserved; + + /* Queue key */ + uint32_t qkey; + + uint8_t reserved2[12]; +}; + +/* + * Tx queue buffer descriptor, for any transport type. Preceded by metadata + * descriptor. 
+ */ +struct efa_io_tx_buf_desc { + /* length in bytes */ + uint32_t length; + + /* + * 23:0 : lkey - local memory translation key + * 31:24 : reserved - MBZ + */ + uint32_t lkey; + + /* Buffer address bits[31:0] */ + uint32_t buf_addr_lo; + + /* Buffer address bits[63:32] */ + uint32_t buf_addr_hi; +}; + +struct efa_io_remote_mem_addr { + /* length in bytes */ + uint32_t length; + + /* remote memory translation key */ + uint32_t rkey; + + /* Buffer address bits[31:0] */ + uint32_t buf_addr_lo; + + /* Buffer address bits[63:32] */ + uint32_t buf_addr_hi; +}; + +struct efa_io_rdma_req { + /* Remote memory address */ + struct efa_io_remote_mem_addr remote_mem; + + /* Local memory address */ + struct efa_io_tx_buf_desc local_mem[1]; +}; + +/* + * Tx WQE, composed of tx meta descriptors followed by either tx buffer + * descriptors or inline data + */ +struct efa_io_tx_wqe { + /* TX meta */ + struct efa_io_tx_meta_desc meta; + + union { + /* Send buffer descriptors */ + struct efa_io_tx_buf_desc sgl[2]; + + uint8_t inline_data[32]; + + /* RDMA local and remote memory addresses */ + struct efa_io_rdma_req rdma_req; + } data; +}; + +/* + * Rx buffer descriptor; RX WQE is composed of one or more RX buffer + * descriptors. + */ +struct efa_io_rx_desc { + /* Buffer address bits[31:0] */ + uint32_t buf_addr_lo; + + /* Buffer Pointer[63:32] */ + uint32_t buf_addr_hi; + + /* Verbs-generated request id. */ + uint16_t req_id; + + /* Length in bytes. */ + uint16_t length; + + /* + * LKey and control flags + * 23:0 : lkey + * 29:24 : reserved - MBZ + * 30 : first - Indicates first descriptor in WQE + * 31 : last - Indicates last descriptor in WQE + */ + uint32_t lkey_ctrl; +}; + +/* Common IO completion descriptor */ +struct efa_io_cdesc_common { + /* + * verbs-generated request ID, as provided in the completed tx or rx + * descriptor. 
+ */ + uint16_t req_id; + + uint8_t status; + + /* + * flags + * 0 : phase - Phase bit + * 2:1 : q_type - enum efa_io_queue_type: send/recv + * 3 : has_imm - indicates that immediate data is + * present - for RX completions only + * 4 : wide_completion - indicates that wide + * completion format is used + * 7:5 : reserved29 + */ + uint8_t flags; + + /* local QP number */ + uint16_t qp_num; + + /* Transferred length */ + uint16_t length; +}; + +/* Tx completion descriptor */ +struct efa_io_tx_cdesc { + /* Common completion info */ + struct efa_io_cdesc_common common; +}; + +/* Rx Completion Descriptor */ +struct efa_io_rx_cdesc { + /* Common completion info */ + struct efa_io_cdesc_common common; + + /* Remote Address Handle FW index, 0xFFFF indicates invalid ah */ + uint16_t ah; + + uint16_t src_qp_num; + + /* Immediate data */ + uint32_t imm; +}; + +/* Extended Rx Completion Descriptor */ +struct efa_io_rx_cdesc_wide { + /* Base RX completion info */ + struct efa_io_rx_cdesc rx_cdesc_base; + + /* + * Word 0 of remote (source) address, needed only for in-band + * ad-hoc AH support + */ + uint32_t src_addr_0; + + /* + * Word 1 of remote (source) address, needed only for in-band + * ad-hoc AH support + */ + uint32_t src_addr_1; + + /* + * Word 2 of remote (source) address, needed only for in-band + * ad-hoc AH support + */ + uint32_t src_addr_2; + + /* + * Word 3 of remote (source) address, needed only for in-band + * ad-hoc AH support + */ + uint32_t src_addr_3; +}; + +/* tx_meta_desc */ +#define EFA_IO_TX_META_DESC_OP_TYPE_SHIFT 0 +#define EFA_IO_TX_META_DESC_OP_TYPE_MASK GENMASK(3, 0) +#define EFA_IO_TX_META_DESC_HAS_IMM_SHIFT 4 +#define EFA_IO_TX_META_DESC_HAS_IMM_MASK BIT(4) +#define EFA_IO_TX_META_DESC_INLINE_MSG_SHIFT 5 +#define EFA_IO_TX_META_DESC_INLINE_MSG_MASK BIT(5) +#define EFA_IO_TX_META_DESC_META_EXTENSION_SHIFT 6 +#define EFA_IO_TX_META_DESC_META_EXTENSION_MASK BIT(6) +#define EFA_IO_TX_META_DESC_META_DESC_SHIFT 7 +#define EFA_IO_TX_META_DESC_META_DESC_MASK BIT(7) +#define EFA_IO_TX_META_DESC_PHASE_SHIFT 0 +#define EFA_IO_TX_META_DESC_PHASE_MASK BIT(0) +#define EFA_IO_TX_META_DESC_FIRST_SHIFT 2 +#define EFA_IO_TX_META_DESC_FIRST_MASK BIT(2) +#define EFA_IO_TX_META_DESC_LAST_SHIFT 3 +#define EFA_IO_TX_META_DESC_LAST_MASK BIT(3) +#define EFA_IO_TX_META_DESC_COMP_REQ_SHIFT 4 +#define EFA_IO_TX_META_DESC_COMP_REQ_MASK BIT(4) + +/* tx_buf_desc */ +#define EFA_IO_TX_BUF_DESC_LKEY_SHIFT 0 +#define EFA_IO_TX_BUF_DESC_LKEY_MASK GENMASK(23, 0) + +/* rx_desc */ +#define EFA_IO_RX_DESC_LKEY_SHIFT 0 +#define EFA_IO_RX_DESC_LKEY_MASK GENMASK(23, 0) +#define EFA_IO_RX_DESC_FIRST_SHIFT 30 +#define EFA_IO_RX_DESC_FIRST_MASK BIT(30) +#define EFA_IO_RX_DESC_LAST_SHIFT 31 +#define EFA_IO_RX_DESC_LAST_MASK BIT(31) + +/* cdesc_common */ +#define EFA_IO_CDESC_COMMON_PHASE_SHIFT 0 +#define EFA_IO_CDESC_COMMON_PHASE_MASK BIT(0) +#define EFA_IO_CDESC_COMMON_Q_TYPE_SHIFT 1 +#define EFA_IO_CDESC_COMMON_Q_TYPE_MASK GENMASK(2, 1) +#define EFA_IO_CDESC_COMMON_HAS_IMM_SHIFT 3 +#define EFA_IO_CDESC_COMMON_HAS_IMM_MASK BIT(3) +#define EFA_IO_CDESC_COMMON_WIDE_COMPLETION_SHIFT 4 +#define EFA_IO_CDESC_COMMON_WIDE_COMPLETION_MASK BIT(4) + +#endif /* _EFA_IO_H_ */ diff --git a/providers/efa/efadv.h b/providers/efa/efadv.h new file mode 100644 index 0000000..91458f3 --- /dev/null +++ b/providers/efa/efadv.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* + * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ */
+
+#ifndef __EFADV_H__
+#define __EFADV_H__
+
+#include <stdio.h>
+#include <sys/types.h>
+
+#include <infiniband/verbs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum {
+	/* Values must match the values in efa-abi.h */
+	EFADV_QP_DRIVER_TYPE_SRD = 0,
+};
+
+struct ibv_qp *efadv_create_driver_qp(struct ibv_pd *ibvpd,
+				      struct ibv_qp_init_attr *attr,
+				      uint32_t driver_qp_type);
+
+struct efadv_qp_init_attr {
+	uint64_t comp_mask;
+	uint32_t driver_qp_type;
+	uint8_t reserved[4];
+};
+
+struct ibv_qp *efadv_create_qp_ex(struct ibv_context *ibvctx,
+				  struct ibv_qp_init_attr_ex *attr_ex,
+				  struct efadv_qp_init_attr *efa_attr,
+				  uint32_t inlen);
+
+enum {
+	EFADV_DEVICE_ATTR_CAPS_RDMA_READ = 1 << 0,
+};
+
+struct efadv_device_attr {
+	uint64_t comp_mask;
+	uint32_t max_sq_wr;
+	uint32_t max_rq_wr;
+	uint16_t max_sq_sge;
+	uint16_t max_rq_sge;
+	uint16_t inline_buf_size;
+	uint8_t reserved[2];
+	uint32_t device_caps;
+	uint32_t max_rdma_size;
+};
+
+int efadv_query_device(struct ibv_context *ibvctx,
+		       struct efadv_device_attr *attr,
+		       uint32_t inlen);
+
+struct efadv_ah_attr {
+	uint64_t comp_mask;
+	uint16_t ahn;
+	uint8_t reserved[6];
+};
+
+int efadv_query_ah(struct ibv_ah *ibvah, struct efadv_ah_attr *attr,
+		   uint32_t inlen);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __EFADV_H__ */
diff --git a/providers/efa/libefa.map b/providers/efa/libefa.map
new file mode 100644
index 0000000..e9df1da
--- /dev/null
+++ b/providers/efa/libefa.map
@@ -0,0 +1,14 @@
+/* Export symbols should be added below according to
+   Documentation/versioning.md document. */
+EFA_1.0 {
+	global:
+		efadv_create_driver_qp;
+	local: *;
+};
+
+EFA_1.1 {
+	global:
+		efadv_create_qp_ex;
+		efadv_query_ah;
+		efadv_query_device;
+} EFA_1.0;
diff --git a/providers/efa/man/CMakeLists.txt b/providers/efa/man/CMakeLists.txt
new file mode 100644
index 0000000..d6a4f57
--- /dev/null
+++ b/providers/efa/man/CMakeLists.txt
@@ -0,0 +1,4 @@
+rdma_man_pages(
+  efadv_create_driver_qp.3.md
+  efadv.7.md
+)
diff --git a/providers/efa/man/efadv.7.md b/providers/efa/man/efadv.7.md
new file mode 100644
index 0000000..aef6470
--- /dev/null
+++ b/providers/efa/man/efadv.7.md
@@ -0,0 +1,36 @@
+---
+layout: page
+title: EFADV
+section: 7
+tagline: Verbs
+date: 2019-01-19
+header: "EFA Direct Verbs Manual"
+footer: efa
+---
+
+# NAME
+
+efadv - Direct verbs for efa devices
+
+This interface provides low-level access to efa devices for direct
+operations, without the generic branching performed by libibverbs.
+
+# DESCRIPTION
+
+The libibverbs API is an abstract one. It is agnostic to any underlying
+provider-specific implementation. While this abstraction has the advantage
+of making user applications portable, it comes with a performance penalty.
+For some applications, optimizing performance is more important than
+portability.
+
+The efa direct verbs API is intended for such applications.
+It exposes efa-specific low-level operations, allowing the application
+to bypass the libibverbs API.
+
+Including efadv.h directly, together with linking against the efa library,
+enables use of this interface.
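+
+# EXAMPLES
+
+The following minimal sketch shows the general usage pattern. It is not a
+complete program: it assumes "ctx" is a verbs context already opened on an
+EFA device, and error handling is omitted.
+
+```c
+#include <stdio.h>
+#include <infiniband/efadv.h>
+
+static void print_efa_limits(struct ibv_context *ctx)
+{
+	struct efadv_device_attr attr = {};
+
+	/* Query attributes that are not visible through plain libibverbs */
+	if (!efadv_query_device(ctx, &attr, sizeof(attr)))
+		printf("max SQ WRs %u, max RQ WRs %u, inline buf %u bytes\n",
+		       attr.max_sq_wr, attr.max_rq_wr,
+		       attr.inline_buf_size);
+}
+```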
+
+# SEE ALSO
+**verbs**(7)
+
+# AUTHORS
+
+Gal Pressman <galpress@amazon.com>
diff --git a/providers/efa/man/efadv_create_driver_qp.3.md b/providers/efa/man/efadv_create_driver_qp.3.md
new file mode 100644
index 0000000..8e4e73a
--- /dev/null
+++ b/providers/efa/man/efadv_create_driver_qp.3.md
@@ -0,0 +1,44 @@
+---
+layout: page
+title: EFADV_CREATE_DRIVER_QP
+section: 3
+tagline: Verbs
+date: 2019-01-23
+header: "EFA Direct Verbs Manual"
+footer: efa
+---
+
+# NAME
+
+efadv_create_driver_qp - Create EFA-specific Queue Pair
+
+# SYNOPSIS
+
+```c
+#include <infiniband/efadv.h>
+
+struct ibv_qp *efadv_create_driver_qp(struct ibv_pd *ibvpd,
+				      struct ibv_qp_init_attr *attr,
+				      uint32_t driver_qp_type);
+```
+
+# DESCRIPTION
+
+**efadv_create_driver_qp()** creates a device-specific Queue Pair.
+
+Scalable Reliable Datagram (SRD) transport provides reliable out-of-order
+delivery, transparently utilizing multiple network paths to reduce network tail
+latency. Its interface is similar to UD; in particular, it supports message
+sizes up to the MTU, with error handling extended to support reliable
+communication.
+
+*driver_qp_type*
+:	The type of QP to be created:
+
+	EFADV_QP_DRIVER_TYPE_SRD:
+		Create an SRD QP.
+
+# RETURN VALUE
+
+**efadv_create_driver_qp()** returns a pointer to the created QP, or NULL if the request fails.
+
+# SEE ALSO
+
+**efadv**(7)
+
+# AUTHORS
+
+Gal Pressman <galpress@amazon.com>
diff --git a/providers/efa/man/efadv_create_qp_ex.3.md b/providers/efa/man/efadv_create_qp_ex.3.md
new file mode 100644
index 0000000..8d927ff
--- /dev/null
+++ b/providers/efa/man/efadv_create_qp_ex.3.md
@@ -0,0 +1,74 @@
+---
+layout: page
+title: EFADV_CREATE_QP_EX
+section: 3
+tagline: Verbs
+date: 2019-08-06
+header: "EFA Direct Verbs Manual"
+footer: efa
+---
+
+# NAME
+
+efadv_create_qp_ex - Create EFA-specific extended Queue Pair
+
+# SYNOPSIS
+
+```c
+#include <infiniband/efadv.h>
+
+struct ibv_qp *efadv_create_qp_ex(struct ibv_context *ibvctx,
+				  struct ibv_qp_init_attr_ex *attr_ex,
+				  struct efadv_qp_init_attr *efa_attr,
+				  uint32_t inlen);
+```
+
+# DESCRIPTION
+
+**efadv_create_qp_ex()** creates a device-specific extended Queue Pair.
+
+The argument attr_ex is an ibv_qp_init_attr_ex struct,
+as defined in <infiniband/verbs.h>.
+
+When the QP is created with the IBV_QP_INIT_ATTR_SEND_OPS_FLAGS attribute,
+use ibv_qp_to_qp_ex() to obtain the ibv_qp_ex handle for accessing the
+send ops iterator interface.
+
+Scalable Reliable Datagram (SRD) transport provides reliable out-of-order
+delivery, transparently utilizing multiple network paths to reduce network tail
+latency. Its interface is similar to UD; in particular, it supports message
+sizes up to the MTU, with error handling extended to support reliable
+communication.
+
+Compatibility is handled using the comp_mask and inlen fields.
+
+```c
+struct efadv_qp_init_attr {
+	uint64_t comp_mask;
+	uint32_t driver_qp_type;
+	uint8_t reserved[4];
+};
+```
+
+*inlen*
+:	In: Size of struct efadv_qp_init_attr.
+
+*comp_mask*
+:	Compatibility mask.
+
+*driver_qp_type*
+:	The type of QP to be created:
+
+	EFADV_QP_DRIVER_TYPE_SRD:
+		Create an SRD QP.
+
+# RETURN VALUE
+
+**efadv_create_qp_ex()** returns a pointer to the created QP, or NULL if the request fails.
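+
+# EXAMPLES
+
+A minimal sketch of creating an SRD QP. This is illustrative only: "ctx",
+"pd" and "cq" are assumed to have been created beforehand, and the capacity
+values are placeholders rather than device limits.
+
+```c
+#include <infiniband/efadv.h>
+
+static struct ibv_qp *create_srd_qp(struct ibv_context *ctx,
+				    struct ibv_pd *pd, struct ibv_cq *cq)
+{
+	struct efadv_qp_init_attr efa_attr = {
+		.driver_qp_type = EFADV_QP_DRIVER_TYPE_SRD,
+	};
+	struct ibv_qp_init_attr_ex attr_ex = {
+		.qp_type = IBV_QPT_DRIVER,
+		.comp_mask = IBV_QP_INIT_ATTR_PD,
+		.pd = pd,
+		.send_cq = cq,
+		.recv_cq = cq,
+		.cap = {
+			.max_send_wr = 16,
+			.max_recv_wr = 16,
+			.max_send_sge = 1,
+			.max_recv_sge = 1,
+		},
+	};
+
+	/* inlen carries the caller's view of struct efadv_qp_init_attr */
+	return efadv_create_qp_ex(ctx, &attr_ex, &efa_attr,
+				  sizeof(efa_attr));
+}
+```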
+
+# SEE ALSO
+
+**efadv**(7), **ibv_create_qp_ex**(3)
+
+# AUTHORS
+
+Gal Pressman <galpress@amazon.com>
+Daniel Kranzdorf <dkkranzd@amazon.com>
diff --git a/providers/efa/man/efadv_query_ah.3.md b/providers/efa/man/efadv_query_ah.3.md
new file mode 100644
index 0000000..c81f15c
--- /dev/null
+++ b/providers/efa/man/efadv_query_ah.3.md
@@ -0,0 +1,62 @@
+---
+layout: page
+title: EFADV_QUERY_AH
+section: 3
+tagline: Verbs
+date: 2019-05-19
+header: "EFA Direct Verbs Manual"
+footer: efa
+---
+
+# NAME
+
+efadv_query_ah - Query EFA-specific Address Handle attributes
+
+# SYNOPSIS
+
+```c
+#include <infiniband/efadv.h>
+
+int efadv_query_ah(struct ibv_ah *ibvah, struct efadv_ah_attr *attr,
+		   uint32_t inlen);
+```
+
+# DESCRIPTION
+
+**efadv_query_ah()** queries device-specific Address Handle attributes.
+
+Compatibility is handled using the comp_mask and inlen fields.
+
+```c
+struct efadv_ah_attr {
+	uint64_t comp_mask;
+	uint16_t ahn;
+	uint8_t reserved[6];
+};
+```
+
+*inlen*
+:	In: Size of struct efadv_ah_attr.
+
+*comp_mask*
+:	Compatibility mask.
+
+*ahn*
+:	Device's Address Handle number.
+
+# RETURN VALUE
+
+**efadv_query_ah()** returns 0 on success, or the value of errno on failure
+(which indicates the failure reason).
+
+# SEE ALSO
+
+**efadv**(7)
+
+# NOTES
+
+* Compatibility mask (comp_mask) is an out field and currently has no values.
+
+# AUTHORS
+
+Gal Pressman <galpress@amazon.com>
diff --git a/providers/efa/man/efadv_query_device.3.md b/providers/efa/man/efadv_query_device.3.md
new file mode 100644
index 0000000..863090e
--- /dev/null
+++ b/providers/efa/man/efadv_query_device.3.md
@@ -0,0 +1,90 @@
+---
+layout: page
+title: EFADV_QUERY_DEVICE
+section: 3
+tagline: Verbs
+date: 2019-04-22
+header: "EFA Direct Verbs Manual"
+footer: efa
+---
+
+# NAME
+
+efadv_query_device - Query device capabilities
+
+# SYNOPSIS
+
+```c
+#include <infiniband/efadv.h>
+
+int efadv_query_device(struct ibv_context *ibvctx,
+		       struct efadv_device_attr *attr,
+		       uint32_t inlen);
+```
+
+# DESCRIPTION
+
+**efadv_query_device()** queries EFA device-specific attributes.
+
+Compatibility is handled using the comp_mask and inlen fields.
+
+```c
+struct efadv_device_attr {
+	uint64_t comp_mask;
+	uint32_t max_sq_wr;
+	uint32_t max_rq_wr;
+	uint16_t max_sq_sge;
+	uint16_t max_rq_sge;
+	uint16_t inline_buf_size;
+	uint8_t reserved[2];
+	uint32_t device_caps;
+	uint32_t max_rdma_size;
+};
+```
+
+*inlen*
+:	In: Size of struct efadv_device_attr.
+
+*comp_mask*
+:	Compatibility mask.
+
+*max_sq_wr*
+:	Maximum Send Queue (SQ) Work Requests (WRs).
+
+*max_rq_wr*
+:	Maximum Receive Queue (RQ) Work Requests (WRs).
+
+*max_sq_sge*
+:	Maximum Send Queue (SQ) Scatter Gather Elements (SGEs).
+
+*max_rq_sge*
+:	Maximum Receive Queue (RQ) Scatter Gather Elements (SGEs).
+
+*inline_buf_size*
+:	Maximum inline buffer size.
+
+*device_caps*
+:	Bitmask of device capabilities:
+
+	EFADV_DEVICE_ATTR_CAPS_RDMA_READ:
+		RDMA read is supported.
+
+*max_rdma_size*
+:	Maximum RDMA transfer size in bytes.
+
+# RETURN VALUE
+
+**efadv_query_device()** returns 0 on success, or the value of errno on failure
+(which indicates the failure reason).
+
+# SEE ALSO
+
+**efadv**(7)
+
+# NOTES
+
+* Compatibility mask (comp_mask) is an out field and currently has no values.
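+
+# EXAMPLES
+
+A minimal sketch of probing the RDMA read capability before relying on it.
+This is illustrative only: "ctx" is assumed to be a verbs context already
+opened on an EFA device.
+
+```c
+#include <stdbool.h>
+#include <infiniband/efadv.h>
+
+static bool efa_supports_rdma_read(struct ibv_context *ctx,
+				   uint32_t *max_size)
+{
+	struct efadv_device_attr attr = {};
+
+	if (efadv_query_device(ctx, &attr, sizeof(attr)))
+		return false;
+
+	if (!(attr.device_caps & EFADV_DEVICE_ATTR_CAPS_RDMA_READ))
+		return false;
+
+	*max_size = attr.max_rdma_size; /* largest RDMA transfer, in bytes */
+	return true;
+}
+```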
+ +# AUTHORS + +Gal Pressman <galpress@amazon.com> diff --git a/providers/efa/verbs.c b/providers/efa/verbs.c new file mode 100644 index 0000000..03b8cf9 --- /dev/null +++ b/providers/efa/verbs.c @@ -0,0 +1,1686 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +/* + * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#include <assert.h> +#include <errno.h> +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <unistd.h> + +#include <ccan/minmax.h> + +#include <util/compiler.h> +#include <util/mmio.h> +#include <util/util.h> + +#include "efa.h" +#include "efadv.h" +#include "verbs.h" + +static bool is_buf_cleared(void *buf, size_t len) +{ + int i; + + for (i = 0; i < len; i++) { + if (((uint8_t *)buf)[i]) + return false; + } + + return true; +} + +#define is_ext_cleared(ptr, inlen) \ + is_buf_cleared(ptr + sizeof(*ptr), inlen - sizeof(*ptr)) + +#define is_reserved_cleared(reserved) is_buf_cleared(reserved, sizeof(reserved)) + +int efa_query_device(struct ibv_context *ibvctx, + struct ibv_device_attr *dev_attr) +{ + struct efa_context *ctx = to_efa_context(ibvctx); + struct ibv_query_device cmd; + uint8_t fw_ver[8]; + int err; + + err = ibv_cmd_query_device(ibvctx, dev_attr, (uint64_t *)&fw_ver, + &cmd, sizeof(cmd)); + if (err) + return err; + + dev_attr->max_qp_wr = min_t(int, dev_attr->max_qp_wr, + ctx->max_llq_size / sizeof(struct efa_io_tx_wqe)); + snprintf(dev_attr->fw_ver, sizeof(dev_attr->fw_ver), "%u.%u.%u.%u", + fw_ver[0], fw_ver[1], fw_ver[2], fw_ver[3]); + + return 0; +} + +int efa_query_port(struct ibv_context *ibvctx, uint8_t port, + struct ibv_port_attr *port_attr) +{ + struct ibv_query_port cmd; + + return ibv_cmd_query_port(ibvctx, port, port_attr, &cmd, sizeof(cmd)); +} + +int efa_query_device_ex(struct ibv_context *context, + const struct ibv_query_device_ex_input *input, + struct ibv_device_attr_ex *attr, + size_t attr_size) +{ + struct efa_context *ctx = to_efa_context(context); + struct efa_dev *dev = to_efa_dev(context->device); + int cmd_supp_uhw = ctx->cmds_supp_udata_mask & + EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE; + struct ibv_device_attr *a = &attr->orig_attr; + struct efa_query_device_ex_resp resp = {}; + struct ibv_query_device_ex cmd = {}; + uint8_t fw_ver[8]; + int err; + + err = ibv_cmd_query_device_ex( + context, input, attr, attr_size, (uint64_t *)&fw_ver, &cmd, + sizeof(cmd), &resp.ibv_resp, + cmd_supp_uhw ? 
sizeof(resp) : sizeof(resp.ibv_resp)); + if (err) + return err; + + dev->device_caps = resp.device_caps; + dev->max_sq_wr = resp.max_sq_wr; + dev->max_rq_wr = resp.max_rq_wr; + dev->max_sq_sge = resp.max_sq_sge; + dev->max_rq_sge = resp.max_rq_sge; + dev->max_rdma_size = resp.max_rdma_size; + dev->max_wr_rdma_sge = a->max_sge_rd; + + a->max_qp_wr = min_t(int, a->max_qp_wr, + ctx->max_llq_size / sizeof(struct efa_io_tx_wqe)); + snprintf(a->fw_ver, sizeof(a->fw_ver), "%u.%u.%u.%u", + fw_ver[0], fw_ver[1], fw_ver[2], fw_ver[3]); + + return 0; +} + +int efadv_query_device(struct ibv_context *ibvctx, + struct efadv_device_attr *attr, + uint32_t inlen) +{ + struct efa_context *ctx = to_efa_context(ibvctx); + struct efa_dev *dev = to_efa_dev(ibvctx->device); + uint64_t comp_mask_out = 0; + + if (!is_efa_dev(ibvctx->device)) + return EOPNOTSUPP; + + if (!vext_field_avail(typeof(*attr), inline_buf_size, inlen)) + return EINVAL; + + memset(attr, 0, inlen); + attr->max_sq_wr = dev->max_sq_wr; + attr->max_rq_wr = dev->max_rq_wr; + attr->max_sq_sge = dev->max_sq_sge; + attr->max_rq_sge = dev->max_rq_sge; + attr->inline_buf_size = ctx->inline_buf_size; + + if (vext_field_avail(typeof(*attr), max_rdma_size, inlen)) { + attr->max_rdma_size = dev->max_rdma_size; + + if (is_rdma_read_cap(dev)) + attr->device_caps |= EFADV_DEVICE_ATTR_CAPS_RDMA_READ; + } + + attr->comp_mask = comp_mask_out; + + return 0; +} + +struct ibv_pd *efa_alloc_pd(struct ibv_context *ibvctx) +{ + struct efa_alloc_pd_resp resp = {}; + struct ibv_alloc_pd cmd; + struct efa_pd *pd; + int err; + + pd = calloc(1, sizeof(*pd)); + if (!pd) + return NULL; + + err = ibv_cmd_alloc_pd(ibvctx, &pd->ibvpd, &cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp)); + if (err) + goto out; + + pd->pdn = resp.pdn; + + return &pd->ibvpd; + +out: + free(pd); + errno = err; + return NULL; +} + +int efa_dealloc_pd(struct ibv_pd *ibvpd) +{ + struct efa_pd *pd = to_efa_pd(ibvpd); + int err; + + err = ibv_cmd_dealloc_pd(ibvpd); + if (err) + return err; + free(pd); + + return 0; +} + +struct ibv_mr *efa_reg_mr(struct ibv_pd *ibvpd, void *sva, size_t len, + uint64_t hca_va, int access) +{ + struct ib_uverbs_reg_mr_resp resp; + struct ibv_reg_mr cmd; + struct efa_mr *mr; + int err; + + mr = calloc(1, sizeof(*mr)); + if (!mr) + return NULL; + + err = ibv_cmd_reg_mr(ibvpd, sva, len, hca_va, access, &mr->vmr, + &cmd, sizeof(cmd), &resp, sizeof(resp)); + if (err) { + free(mr); + errno = err; + return NULL; + } + + return &mr->vmr.ibv_mr; +} + +int efa_dereg_mr(struct verbs_mr *vmr) +{ + struct efa_mr *mr = container_of(vmr, struct efa_mr, vmr); + int err; + + err = ibv_cmd_dereg_mr(vmr); + if (err) + return err; + free(mr); + + return 0; +} + +static uint32_t efa_sub_cq_get_current_index(struct efa_sub_cq *sub_cq) +{ + return sub_cq->consumed_cnt & sub_cq->qmask; +} + +static int efa_cqe_is_pending(struct efa_io_cdesc_common *cqe_common, + int phase) +{ + return EFA_GET(&cqe_common->flags, EFA_IO_CDESC_COMMON_PHASE) == phase; +} + +static struct efa_io_cdesc_common * +efa_sub_cq_get_cqe(struct efa_sub_cq *sub_cq, int entry) +{ + return (struct efa_io_cdesc_common *)(sub_cq->buf + + (entry * sub_cq->cqe_size)); +} + +static void efa_sub_cq_initialize(struct efa_sub_cq *sub_cq, uint8_t *buf, + int sub_cq_size, int cqe_size) +{ + sub_cq->consumed_cnt = 0; + sub_cq->phase = 1; + sub_cq->buf = buf; + sub_cq->qmask = sub_cq_size - 1; + sub_cq->cqe_size = cqe_size; + sub_cq->ref_cnt = 0; +} + +struct ibv_cq *efa_create_cq(struct ibv_context *ibvctx, int ncqe, + struct 
ibv_comp_channel *channel, int vec) +{ + struct efa_context *ctx = to_efa_context(ibvctx); + struct efa_create_cq_resp resp = {}; + struct efa_create_cq cmd = {}; + uint16_t num_sub_cqs; + struct efa_cq *cq; + int sub_buf_size; + int sub_cq_size; + uint8_t *buf; + int err; + int i; + + cq = calloc(1, sizeof(*cq) + + sizeof(*cq->sub_cq_arr) * ctx->sub_cqs_per_cq); + if (!cq) + return NULL; + + num_sub_cqs = ctx->sub_cqs_per_cq; + cmd.num_sub_cqs = num_sub_cqs; + cmd.cq_entry_size = ctx->cqe_size; + + ncqe = roundup_pow_of_two(ncqe); + err = ibv_cmd_create_cq(ibvctx, ncqe, channel, vec, + &cq->ibvcq, &cmd.ibv_cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp)); + if (err) { + errno = err; + goto err_free_cq; + } + + sub_cq_size = cq->ibvcq.cqe; + cq->cqn = resp.cq_idx; + cq->buf_size = resp.q_mmap_size; + cq->num_sub_cqs = num_sub_cqs; + cq->cqe_size = ctx->cqe_size; + + cq->buf = mmap(NULL, cq->buf_size, PROT_READ, MAP_SHARED, + ibvctx->cmd_fd, resp.q_mmap_key); + if (cq->buf == MAP_FAILED) + goto err_destroy_cq; + + buf = cq->buf; + sub_buf_size = cq->cqe_size * sub_cq_size; + for (i = 0; i < num_sub_cqs; i++) { + efa_sub_cq_initialize(&cq->sub_cq_arr[i], buf, sub_cq_size, + cq->cqe_size); + buf += sub_buf_size; + } + + pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE); + + return &cq->ibvcq; + +err_destroy_cq: + ibv_cmd_destroy_cq(&cq->ibvcq); +err_free_cq: + free(cq); + return NULL; +} + +int efa_destroy_cq(struct ibv_cq *ibvcq) +{ + struct efa_cq *cq = to_efa_cq(ibvcq); + int err; + + munmap(cq->buf, cq->buf_size); + + pthread_spin_destroy(&cq->lock); + + err = ibv_cmd_destroy_cq(ibvcq); + if (err) + return err; + + free(cq); + + return 0; +} + +static struct efa_io_cdesc_common * +cq_next_sub_cqe_get(struct efa_sub_cq *sub_cq) +{ + struct efa_io_cdesc_common *cqe; + uint32_t current_index; + + current_index = efa_sub_cq_get_current_index(sub_cq); + cqe = efa_sub_cq_get_cqe(sub_cq, current_index); + if (efa_cqe_is_pending(cqe, sub_cq->phase)) { + /* Do not read the rest of the completion entry before the + * phase bit has been validated. 
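+	 * The phase bit flips each time the sub-CQ wraps around, so an
+	 * entry left over from the previous pass fails this comparison
+	 * and is never mistaken for a fresh completion.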
+ */ + udma_from_device_barrier(); + sub_cq->consumed_cnt++; + if (!efa_sub_cq_get_current_index(sub_cq)) + sub_cq->phase = 1 - sub_cq->phase; + return cqe; + } + + return NULL; +} + +static enum ibv_wc_status to_ibv_status(enum efa_io_comp_status status) +{ + switch (status) { + case EFA_IO_COMP_STATUS_OK: + return IBV_WC_SUCCESS; + case EFA_IO_COMP_STATUS_FLUSHED: + return IBV_WC_WR_FLUSH_ERR; + case EFA_IO_COMP_STATUS_LOCAL_ERROR_QP_INTERNAL_ERROR: + case EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_OP_TYPE: + case EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_AH: + return IBV_WC_LOC_QP_OP_ERR; + case EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_LKEY: + return IBV_WC_LOC_PROT_ERR; + case EFA_IO_COMP_STATUS_LOCAL_ERROR_BAD_LENGTH: + return IBV_WC_LOC_LEN_ERR; + case EFA_IO_COMP_STATUS_REMOTE_ERROR_ABORT: + return IBV_WC_REM_ABORT_ERR; + case EFA_IO_COMP_STATUS_REMOTE_ERROR_RNR: + return IBV_WC_RNR_RETRY_EXC_ERR; + case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_DEST_QPN: + return IBV_WC_REM_INV_RD_REQ_ERR; + case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_STATUS: + return IBV_WC_BAD_RESP_ERR; + case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_LENGTH: + return IBV_WC_REM_INV_REQ_ERR; + case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_ADDRESS: + default: + return IBV_WC_GENERAL_ERR; + } +} + +static int efa_poll_sub_cq(struct efa_cq *cq, struct efa_sub_cq *sub_cq, + struct efa_qp **cur_qp, struct ibv_wc *wc) +{ + struct efa_context *ctx = to_efa_context(cq->ibvcq.context); + struct efa_io_cdesc_common *cqe; + uint32_t qpn, wrid_idx; + struct efa_wq *wq; + + cqe = cq_next_sub_cqe_get(sub_cq); + if (!cqe) + return ENOMEM; + + qpn = cqe->qp_num; + if (!*cur_qp || qpn != (*cur_qp)->verbs_qp.qp.qp_num) { + /* We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. 
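+	 * efa_destroy_qp() takes both CQ locks before clearing its
+	 * qp_table entry, so the lookup below cannot observe a QP that
+	 * is mid-teardown.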
+ */ + *cur_qp = ctx->qp_table[qpn & ctx->qp_table_sz_m1]; + if (!*cur_qp) + return EINVAL; + } + + wrid_idx = cqe->req_id; + wc->status = to_ibv_status(cqe->status); + wc->vendor_err = cqe->status; + wc->wc_flags = 0; + wc->qp_num = qpn; + + if (EFA_GET(&cqe->flags, EFA_IO_CDESC_COMMON_Q_TYPE) == + EFA_IO_SEND_QUEUE) { + wq = &(*cur_qp)->sq.wq; + wc->opcode = IBV_WC_SEND; + } else { + struct efa_io_rx_cdesc *rcqe = + container_of(cqe, struct efa_io_rx_cdesc, common); + + wq = &(*cur_qp)->rq.wq; + + wc->byte_len = cqe->length; + wc->opcode = IBV_WC_RECV; + wc->src_qp = rcqe->src_qp_num; + wc->sl = 0; + wc->slid = rcqe->ah; + + if (EFA_GET(&cqe->flags, EFA_IO_CDESC_COMMON_HAS_IMM)) { + wc->imm_data = htobe32(rcqe->imm); + wc->wc_flags |= IBV_WC_WITH_IMM; + } + } + + pthread_spin_lock(&wq->wqlock); + wq->wrid_idx_pool_next--; + wq->wrid_idx_pool[wq->wrid_idx_pool_next] = wrid_idx; + wc->wr_id = wq->wrid[wrid_idx]; + wq->wqe_completed++; + pthread_spin_unlock(&wq->wqlock); + + return 0; +} + +static int efa_poll_sub_cqs(struct efa_cq *cq, struct ibv_wc *wc) +{ + uint16_t num_sub_cqs = cq->num_sub_cqs; + struct efa_sub_cq *sub_cq; + struct efa_qp *qp = NULL; + uint16_t sub_cq_idx; + int err = ENOMEM; + + for (sub_cq_idx = 0; sub_cq_idx < num_sub_cqs; sub_cq_idx++) { + sub_cq = &cq->sub_cq_arr[cq->next_poll_idx++]; + cq->next_poll_idx %= num_sub_cqs; + + if (!sub_cq->ref_cnt) + continue; + + err = efa_poll_sub_cq(cq, sub_cq, &qp, wc); + if (err != ENOMEM) + break; + } + + return err; +} + +int efa_poll_cq(struct ibv_cq *ibvcq, int nwc, struct ibv_wc *wc) +{ + struct efa_cq *cq = to_efa_cq(ibvcq); + int ret = 0; + int i; + + pthread_spin_lock(&cq->lock); + for (i = 0; i < nwc; i++) { + ret = efa_poll_sub_cqs(cq, &wc[i]); + if (ret) { + if (ret == ENOMEM) + ret = 0; + break; + } + } + pthread_spin_unlock(&cq->lock); + + return i ?: -ret; +} + +static void efa_cq_inc_ref_cnt(struct efa_cq *cq, uint8_t sub_cq_idx) +{ + cq->sub_cq_arr[sub_cq_idx].ref_cnt++; +} + +static void efa_cq_dec_ref_cnt(struct efa_cq *cq, uint8_t sub_cq_idx) +{ + cq->sub_cq_arr[sub_cq_idx].ref_cnt--; +} + +static void efa_wq_terminate(struct efa_wq *wq) +{ + pthread_spin_destroy(&wq->wqlock); + free(wq->wrid_idx_pool); + free(wq->wrid); +} + +static int efa_wq_initialize(struct efa_wq *wq) +{ + int err; + int i; + + wq->wrid = malloc(wq->wqe_cnt * sizeof(*wq->wrid)); + if (!wq->wrid) + return ENOMEM; + + wq->wrid_idx_pool = malloc(wq->wqe_cnt * sizeof(uint32_t)); + if (!wq->wrid_idx_pool) { + err = ENOMEM; + goto err_free_wrid; + } + + /* Initialize the wrid free indexes pool. 
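+	 * Slot i initially holds index i. Posting a WR pops the next free
+	 * index off this stack and polling a completion pushes it back, so
+	 * wrid slots are recycled in completion order rather than post order.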
*/ + for (i = 0; i < wq->wqe_cnt; i++) + wq->wrid_idx_pool[i] = i; + + pthread_spin_init(&wq->wqlock, PTHREAD_PROCESS_PRIVATE); + + return 0; + +err_free_wrid: + free(wq->wrid); + + return err; +} + +static void efa_sq_terminate(struct efa_qp *qp) +{ + void *db_aligned; + + if (!qp->sq.wq.wrid) + return; + + db_aligned = (void *)((uintptr_t)qp->sq.db & ~(qp->page_size - 1)); + munmap(db_aligned, qp->page_size); + munmap(qp->sq.desc - qp->sq.desc_offset, qp->sq.desc_ring_mmap_size); + free(qp->sq.local_queue); + + efa_wq_terminate(&qp->sq.wq); +} + +static int efa_sq_initialize(struct efa_qp *qp, struct efa_create_qp_resp *resp) +{ + struct efa_dev *dev = to_efa_dev(qp->verbs_qp.qp.context->device); + size_t desc_ring_size; + uint8_t *db_base; + int err; + + if (!qp->sq.wq.wqe_cnt) + return 0; + + err = efa_wq_initialize(&qp->sq.wq); + if (err) + return err; + + qp->sq.desc_offset = resp->llq_desc_offset; + desc_ring_size = qp->sq.wq.wqe_cnt * sizeof(struct efa_io_tx_wqe); + qp->sq.desc_ring_mmap_size = align(desc_ring_size + qp->sq.desc_offset, + qp->page_size); + qp->sq.max_inline_data = resp->ibv_resp.max_inline_data; + + qp->sq.local_queue = malloc(desc_ring_size); + if (!qp->sq.local_queue) { + err = ENOMEM; + goto err_terminate_wq; + } + + qp->sq.desc = mmap(NULL, qp->sq.desc_ring_mmap_size, PROT_WRITE, + MAP_SHARED, qp->verbs_qp.qp.context->cmd_fd, + resp->llq_desc_mmap_key); + if (qp->sq.desc == MAP_FAILED) { + err = errno; + goto err_free_local_queue; + } + + qp->sq.desc += qp->sq.desc_offset; + + db_base = mmap(NULL, qp->page_size, PROT_WRITE, MAP_SHARED, + qp->verbs_qp.qp.context->cmd_fd, resp->sq_db_mmap_key); + if (db_base == MAP_FAILED) { + err = errno; + goto err_unmap_desc_ring; + } + + qp->sq.db = (uint32_t *)(db_base + resp->sq_db_offset); + qp->sq.sub_cq_idx = resp->send_sub_cq_idx; + qp->sq.max_wr_rdma_sge = min_t(uint16_t, dev->max_wr_rdma_sge, + EFA_IO_TX_DESC_NUM_RDMA_BUFS); + + return 0; + +err_unmap_desc_ring: + munmap(qp->sq.desc - qp->sq.desc_offset, qp->sq.desc_ring_mmap_size); +err_free_local_queue: + free(qp->sq.local_queue); +err_terminate_wq: + efa_wq_terminate(&qp->sq.wq); + return err; +} + +static void efa_rq_terminate(struct efa_qp *qp) +{ + void *db_aligned; + + if (!qp->rq.wq.wrid) + return; + + db_aligned = (void *)((uintptr_t)qp->rq.db & ~(qp->page_size - 1)); + munmap(db_aligned, qp->page_size); + munmap(qp->rq.buf, qp->rq.buf_size); + + efa_wq_terminate(&qp->rq.wq); +} + +static int efa_rq_initialize(struct efa_qp *qp, struct efa_create_qp_resp *resp) +{ + uint8_t *db_base; + int err; + + if (!qp->rq.wq.wqe_cnt) + return 0; + + err = efa_wq_initialize(&qp->rq.wq); + if (err) + return err; + + qp->rq.buf_size = resp->rq_mmap_size; + qp->rq.buf = mmap(NULL, qp->rq.buf_size, PROT_WRITE, MAP_SHARED, + qp->verbs_qp.qp.context->cmd_fd, resp->rq_mmap_key); + if (qp->rq.buf == MAP_FAILED) { + err = errno; + goto err_terminate_wq; + } + + db_base = mmap(NULL, qp->page_size, PROT_WRITE, MAP_SHARED, + qp->verbs_qp.qp.context->cmd_fd, resp->rq_db_mmap_key); + if (db_base == MAP_FAILED) { + err = errno; + goto err_unmap_rq_buf; + } + + qp->rq.db = (uint32_t *)(db_base + resp->rq_db_offset); + qp->rq.sub_cq_idx = resp->recv_sub_cq_idx; + + return 0; + +err_unmap_rq_buf: + munmap(qp->rq.buf, qp->rq.buf_size); +err_terminate_wq: + efa_wq_terminate(&qp->rq.wq); + return err; +} + +static void efa_qp_init_indices(struct efa_qp *qp) +{ + qp->sq.wq.wqe_posted = 0; + qp->sq.wq.wqe_completed = 0; + qp->sq.wq.desc_idx = 0; + qp->sq.wq.wrid_idx_pool_next = 0; + + 
qp->rq.wq.wqe_posted = 0; + qp->rq.wq.wqe_completed = 0; + qp->rq.wq.desc_idx = 0; + qp->rq.wq.wrid_idx_pool_next = 0; +} + +static void efa_setup_qp(struct efa_qp *qp, + struct ibv_qp_cap *cap, + size_t page_size) +{ + uint16_t rq_desc_cnt; + + efa_qp_init_indices(qp); + + qp->sq.wq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr); + qp->sq.wq.max_sge = cap->max_send_sge; + qp->sq.wq.desc_mask = qp->sq.wq.wqe_cnt - 1; + + qp->rq.wq.max_sge = cap->max_recv_sge; + rq_desc_cnt = roundup_pow_of_two(cap->max_recv_sge * cap->max_recv_wr); + qp->rq.wq.desc_mask = rq_desc_cnt - 1; + qp->rq.wq.wqe_cnt = rq_desc_cnt / qp->rq.wq.max_sge; + + qp->page_size = page_size; +} + +static void efa_lock_cqs(struct ibv_qp *ibvqp) +{ + struct efa_cq *send_cq = to_efa_cq(ibvqp->send_cq); + struct efa_cq *recv_cq = to_efa_cq(ibvqp->recv_cq); + + if (recv_cq == send_cq && recv_cq) { + pthread_spin_lock(&recv_cq->lock); + } else { + if (recv_cq) + pthread_spin_lock(&recv_cq->lock); + if (send_cq) + pthread_spin_lock(&send_cq->lock); + } +} + +static void efa_unlock_cqs(struct ibv_qp *ibvqp) +{ + struct efa_cq *send_cq = to_efa_cq(ibvqp->send_cq); + struct efa_cq *recv_cq = to_efa_cq(ibvqp->recv_cq); + + if (recv_cq == send_cq && recv_cq) { + pthread_spin_unlock(&recv_cq->lock); + } else { + if (recv_cq) + pthread_spin_unlock(&recv_cq->lock); + if (send_cq) + pthread_spin_unlock(&send_cq->lock); + } +} + +static void efa_qp_fill_wr_pfns(struct ibv_qp_ex *ibvqpx, + struct ibv_qp_init_attr_ex *attr_ex); + +static int efa_check_qp_attr(struct efa_dev *dev, + struct ibv_qp_init_attr_ex *attr, + struct efadv_qp_init_attr *efa_attr) +{ + uint64_t supp_send_ops_mask; + uint64_t supp_ud_send_ops_mask = IBV_QP_EX_WITH_SEND | + IBV_QP_EX_WITH_SEND_WITH_IMM; + uint64_t supp_srd_send_ops_mask = + IBV_QP_EX_WITH_SEND | IBV_QP_EX_WITH_SEND_WITH_IMM | + (is_rdma_read_cap(dev) ? 
IBV_QP_EX_WITH_RDMA_READ : 0); + +#define EFA_CREATE_QP_SUPP_ATTR_MASK \ + (IBV_QP_INIT_ATTR_PD | IBV_QP_INIT_ATTR_SEND_OPS_FLAGS) + + if (attr->qp_type == IBV_QPT_DRIVER && + efa_attr->driver_qp_type != EFADV_QP_DRIVER_TYPE_SRD) + return EOPNOTSUPP; + + if (!check_comp_mask(attr->comp_mask, EFA_CREATE_QP_SUPP_ATTR_MASK)) + return EOPNOTSUPP; + + if (!(attr->comp_mask & IBV_QP_INIT_ATTR_PD)) + return EINVAL; + + if (attr->comp_mask & IBV_QP_INIT_ATTR_SEND_OPS_FLAGS) { + switch (attr->qp_type) { + case IBV_QPT_UD: + supp_send_ops_mask = supp_ud_send_ops_mask; + break; + case IBV_QPT_DRIVER: + supp_send_ops_mask = supp_srd_send_ops_mask; + break; + default: + return EOPNOTSUPP; + } + + if (!check_comp_mask(attr->send_ops_flags, supp_send_ops_mask)) + return EOPNOTSUPP; + } + + if (!attr->recv_cq || !attr->send_cq) + return EINVAL; + + if (attr->srq) + return EINVAL; + + return 0; +} + +static int efa_check_qp_limits(struct efa_dev *dev, + struct ibv_qp_init_attr_ex *attr) +{ + if (attr->cap.max_send_sge > dev->max_sq_sge) + return EINVAL; + + if (attr->cap.max_recv_sge > dev->max_rq_sge) + return EINVAL; + + if (attr->cap.max_send_wr > dev->max_sq_wr) + return EINVAL; + + if (attr->cap.max_recv_wr > dev->max_rq_wr) + return EINVAL; + + return 0; +} + +static struct ibv_qp *create_qp(struct ibv_context *ibvctx, + struct ibv_qp_init_attr_ex *attr, + struct efadv_qp_init_attr *efa_attr) +{ + struct efa_context *ctx = to_efa_context(ibvctx); + struct efa_dev *dev = to_efa_dev(ibvctx->device); + struct efa_create_qp_resp resp = {}; + struct efa_create_qp req = {}; + struct efa_cq *send_cq; + struct efa_cq *recv_cq; + struct ibv_qp *ibvqp; + struct efa_qp *qp; + int err; + + err = efa_check_qp_attr(dev, attr, efa_attr); + if (err) + goto err_out; + + err = efa_check_qp_limits(dev, attr); + if (err) + goto err_out; + + qp = calloc(1, sizeof(*qp)); + if (!qp) { + err = ENOMEM; + goto err_out; + } + + efa_setup_qp(qp, &attr->cap, dev->pg_sz); + + attr->cap.max_send_wr = qp->sq.wq.wqe_cnt; + attr->cap.max_recv_wr = qp->rq.wq.wqe_cnt; + + req.rq_ring_size = (qp->rq.wq.desc_mask + 1) * + sizeof(struct efa_io_rx_desc); + req.sq_ring_size = (attr->cap.max_send_wr) * + sizeof(struct efa_io_tx_wqe); + if (attr->qp_type == IBV_QPT_DRIVER) + req.driver_qp_type = efa_attr->driver_qp_type; + + err = ibv_cmd_create_qp_ex(ibvctx, &qp->verbs_qp, sizeof(qp->verbs_qp), + attr, &req.ibv_cmd, sizeof(req), + &resp.ibv_resp, sizeof(resp)); + if (err) + goto err_free_qp; + + ibvqp = &qp->verbs_qp.qp; + ibvqp->state = IBV_QPS_RESET; + qp->sq_sig_all = attr->sq_sig_all; + + err = efa_rq_initialize(qp, &resp); + if (err) + goto err_destroy_qp; + + err = efa_sq_initialize(qp, &resp); + if (err) + goto err_terminate_rq; + + pthread_spin_lock(&ctx->qp_table_lock); + ctx->qp_table[ibvqp->qp_num & ctx->qp_table_sz_m1] = qp; + pthread_spin_unlock(&ctx->qp_table_lock); + + if (attr->send_cq) { + send_cq = to_efa_cq(attr->send_cq); + qp->scq = send_cq; + pthread_spin_lock(&send_cq->lock); + efa_cq_inc_ref_cnt(send_cq, resp.send_sub_cq_idx); + pthread_spin_unlock(&send_cq->lock); + } + + if (attr->recv_cq) { + recv_cq = to_efa_cq(attr->recv_cq); + qp->rcq = recv_cq; + pthread_spin_lock(&recv_cq->lock); + efa_cq_inc_ref_cnt(recv_cq, resp.recv_sub_cq_idx); + pthread_spin_unlock(&recv_cq->lock); + } + + if (attr->comp_mask & IBV_QP_INIT_ATTR_SEND_OPS_FLAGS) { + efa_qp_fill_wr_pfns(&qp->verbs_qp.qp_ex, attr); + qp->verbs_qp.comp_mask |= VERBS_QP_EX; + } + + return ibvqp; + +err_terminate_rq: + efa_rq_terminate(qp); +err_destroy_qp: + 
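+	/* The error labels fall through: each one undoes a single step of
+	 * the setup above, in reverse order of creation.
+	 */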
ibv_cmd_destroy_qp(ibvqp); +err_free_qp: + free(qp); +err_out: + errno = err; + return NULL; +} + +struct ibv_qp *efa_create_qp(struct ibv_pd *ibvpd, + struct ibv_qp_init_attr *attr) +{ + struct ibv_qp_init_attr_ex attr_ex = {}; + struct ibv_qp *ibvqp; + + if (attr->qp_type != IBV_QPT_UD) { + errno = EOPNOTSUPP; + return NULL; + } + + memcpy(&attr_ex, attr, sizeof(*attr)); + attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD; + attr_ex.pd = ibvpd; + + ibvqp = create_qp(ibvpd->context, &attr_ex, NULL); + if (ibvqp) + memcpy(attr, &attr_ex, sizeof(*attr)); + + return ibvqp; +} + +struct ibv_qp *efa_create_qp_ex(struct ibv_context *ibvctx, + struct ibv_qp_init_attr_ex *attr_ex) +{ + if (attr_ex->qp_type != IBV_QPT_UD) { + errno = EINVAL; + return NULL; + } + + return create_qp(ibvctx, attr_ex, NULL); +} + +struct ibv_qp *efadv_create_driver_qp(struct ibv_pd *ibvpd, + struct ibv_qp_init_attr *attr, + uint32_t driver_qp_type) +{ + struct ibv_qp_init_attr_ex attr_ex = {}; + struct efadv_qp_init_attr efa_attr = {}; + struct ibv_qp *ibvqp; + + if (!is_efa_dev(ibvpd->context->device)) { + errno = EOPNOTSUPP; + return NULL; + } + + if (attr->qp_type != IBV_QPT_DRIVER) { + errno = EINVAL; + return NULL; + } + + memcpy(&attr_ex, attr, sizeof(*attr)); + attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD; + attr_ex.pd = ibvpd; + efa_attr.driver_qp_type = driver_qp_type; + + ibvqp = create_qp(ibvpd->context, &attr_ex, &efa_attr); + if (ibvqp) + memcpy(attr, &attr_ex, sizeof(*attr)); + + return ibvqp; +} + +struct ibv_qp *efadv_create_qp_ex(struct ibv_context *ibvctx, + struct ibv_qp_init_attr_ex *attr_ex, + struct efadv_qp_init_attr *efa_attr, + uint32_t inlen) +{ + if (!is_efa_dev(ibvctx->device)) { + errno = EOPNOTSUPP; + return NULL; + } + + if (attr_ex->qp_type != IBV_QPT_DRIVER || + !vext_field_avail(struct efadv_qp_init_attr, + driver_qp_type, inlen) || + efa_attr->comp_mask || + !is_reserved_cleared(efa_attr->reserved) || + (inlen > sizeof(efa_attr) && !is_ext_cleared(efa_attr, inlen))) { + errno = EINVAL; + return NULL; + } + + return create_qp(ibvctx, attr_ex, efa_attr); +} + +int efa_modify_qp(struct ibv_qp *ibvqp, struct ibv_qp_attr *attr, + int attr_mask) +{ + struct efa_qp *qp = to_efa_qp(ibvqp); + struct ibv_modify_qp cmd = {}; + int err; + + err = ibv_cmd_modify_qp(ibvqp, attr, attr_mask, &cmd, sizeof(cmd)); + if (err) + return err; + + if (attr_mask & IBV_QP_STATE) { + qp->verbs_qp.qp.state = attr->qp_state; + /* transition to reset */ + if (qp->verbs_qp.qp.state == IBV_QPS_RESET) + efa_qp_init_indices(qp); + } + + return 0; +} + +int efa_query_qp(struct ibv_qp *ibvqp, struct ibv_qp_attr *attr, + int attr_mask, struct ibv_qp_init_attr *init_attr) +{ + struct ibv_query_qp cmd; + + return ibv_cmd_query_qp(ibvqp, attr, attr_mask, init_attr, + &cmd, sizeof(cmd)); +} + +int efa_destroy_qp(struct ibv_qp *ibvqp) +{ + struct efa_context *ctx = to_efa_context(ibvqp->context); + struct efa_qp *qp = to_efa_qp(ibvqp); + int err; + + pthread_spin_lock(&ctx->qp_table_lock); + efa_lock_cqs(ibvqp); + + if (ibvqp->send_cq) + efa_cq_dec_ref_cnt(to_efa_cq(ibvqp->send_cq), + qp->sq.sub_cq_idx); + + if (ibvqp->recv_cq) + efa_cq_dec_ref_cnt(to_efa_cq(ibvqp->recv_cq), + qp->rq.sub_cq_idx); + + ctx->qp_table[ibvqp->qp_num & ctx->qp_table_sz_m1] = NULL; + + efa_unlock_cqs(ibvqp); + pthread_spin_unlock(&ctx->qp_table_lock); + + efa_sq_terminate(qp); + efa_rq_terminate(qp); + + err = ibv_cmd_destroy_qp(ibvqp); + if (err) + return err; + + free(qp); + return 0; +} + +static void efa_set_tx_buf(struct efa_io_tx_buf_desc *tx_buf, + 
uint64_t addr, uint32_t lkey, + uint32_t length) +{ + tx_buf->length = length; + EFA_SET(&tx_buf->lkey, EFA_IO_TX_BUF_DESC_LKEY, lkey); + tx_buf->buf_addr_lo = addr & 0xffffffff; + tx_buf->buf_addr_hi = addr >> 32; +} + +static void efa_post_send_sgl(struct efa_io_tx_buf_desc *tx_bufs, + const struct ibv_sge *sg_list, + int num_sge) +{ + const struct ibv_sge *sge; + size_t i; + + for (i = 0; i < num_sge; i++) { + sge = &sg_list[i]; + efa_set_tx_buf(&tx_bufs[i], sge->addr, sge->lkey, sge->length); + } +} + +static void efa_post_send_inline_data(const struct ibv_send_wr *wr, + struct efa_io_tx_wqe *tx_wqe) +{ + const struct ibv_sge *sgl = wr->sg_list; + uint32_t total_length = 0; + uint32_t length; + size_t i; + + for (i = 0; i < wr->num_sge; i++) { + length = sgl[i].length; + + memcpy(tx_wqe->data.inline_data + total_length, + (void *)(uintptr_t)sgl[i].addr, length); + total_length += length; + } + + EFA_SET(&tx_wqe->meta.ctrl1, EFA_IO_TX_META_DESC_INLINE_MSG, 1); + tx_wqe->meta.length = total_length; +} + +static size_t efa_sge_total_bytes(const struct ibv_sge *sg_list, int num_sge) +{ + size_t bytes = 0; + size_t i; + + for (i = 0; i < num_sge; i++) + bytes += sg_list[i].length; + + return bytes; +} + +static size_t efa_buf_list_total_bytes(const struct ibv_data_buf *buf_list, + size_t num_buf) +{ + size_t bytes = 0; + size_t i; + + for (i = 0; i < num_buf; i++) + bytes += buf_list[i].length; + + return bytes; +} + +static void efa_sq_advance_post_idx(struct efa_qp *qp) +{ + qp->sq.wq.wqe_posted++; + qp->sq.wq.desc_idx++; + + if (!(qp->sq.wq.desc_idx & qp->sq.wq.desc_mask)) + qp->sq.wq.phase++; +} + +static uint32_t efa_wq_get_next_wrid_idx(struct efa_wq *wq, uint64_t wr_id) +{ + uint32_t wrid_idx; + + /* Get the next wrid to be used from the index pool */ + wrid_idx = wq->wrid_idx_pool[wq->wrid_idx_pool_next]; + wq->wrid[wrid_idx] = wr_id; + + /* Will never overlap, as validate function succeeded */ + wq->wrid_idx_pool_next++; + assert(wq->wrid_idx_pool_next <= wq->wqe_cnt); + + return wrid_idx; +} + +static void efa_set_common_ctrl_flags(struct efa_io_tx_meta_desc *desc, + struct efa_qp *qp, + enum efa_io_send_op_type op_type) +{ + EFA_SET(&desc->ctrl1, EFA_IO_TX_META_DESC_META_DESC, 1); + EFA_SET(&desc->ctrl1, EFA_IO_TX_META_DESC_OP_TYPE, op_type); + EFA_SET(&desc->ctrl2, EFA_IO_TX_META_DESC_PHASE, qp->sq.wq.phase); + EFA_SET(&desc->ctrl2, EFA_IO_TX_META_DESC_FIRST, 1); + EFA_SET(&desc->ctrl2, EFA_IO_TX_META_DESC_LAST, 1); + EFA_SET(&desc->ctrl2, EFA_IO_TX_META_DESC_COMP_REQ, 1); +} + +static int efa_post_send_validate(struct efa_qp *qp, + unsigned int wr_flags) +{ + if (unlikely(qp->verbs_qp.qp.state != IBV_QPS_RTS && + qp->verbs_qp.qp.state != IBV_QPS_SQD)) + return EINVAL; + + if (unlikely(!qp->scq)) + return EINVAL; + + if (unlikely(!(wr_flags & IBV_SEND_SIGNALED) && !qp->sq_sig_all)) + return EINVAL; + + if (unlikely(wr_flags & ~(IBV_SEND_SIGNALED | IBV_SEND_INLINE))) + return EINVAL; + + if (unlikely(qp->sq.wq.wqe_posted - qp->sq.wq.wqe_completed == + qp->sq.wq.wqe_cnt)) + return ENOMEM; + + return 0; +} + +static int efa_post_send_validate_wr(struct efa_qp *qp, + const struct ibv_send_wr *wr) +{ + int err; + + err = efa_post_send_validate(qp, wr->send_flags); + if (unlikely(err)) + return err; + + if (unlikely(wr->opcode != IBV_WR_SEND && + wr->opcode != IBV_WR_SEND_WITH_IMM)) + return EINVAL; + + if (wr->send_flags & IBV_SEND_INLINE) { + if (unlikely(efa_sge_total_bytes(wr->sg_list, wr->num_sge) > + qp->sq.max_inline_data)) + return EINVAL; + } else { + if (unlikely(wr->num_sge 
> qp->sq.wq.max_sge)) + return EINVAL; + } + + return 0; +} + +int efa_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad) +{ + struct efa_io_tx_meta_desc *meta_desc; + struct efa_qp *qp = to_efa_qp(ibvqp); + struct efa_io_tx_wqe tx_wqe; + uint32_t sq_desc_offset; + struct efa_ah *ah; + int err = 0; + + mmio_wc_spinlock(&qp->sq.wq.wqlock); + while (wr) { + err = efa_post_send_validate_wr(qp, wr); + if (err) { + *bad = wr; + goto ring_db; + } + + memset(&tx_wqe, 0, sizeof(tx_wqe)); + meta_desc = &tx_wqe.meta; + ah = to_efa_ah(wr->wr.ud.ah); + + if (wr->send_flags & IBV_SEND_INLINE) { + efa_post_send_inline_data(wr, &tx_wqe); + } else { + meta_desc->length = wr->num_sge; + efa_post_send_sgl(tx_wqe.data.sgl, wr->sg_list, + wr->num_sge); + } + + if (wr->opcode == IBV_WR_SEND_WITH_IMM) { + meta_desc->immediate_data = be32toh(wr->imm_data); + EFA_SET(&meta_desc->ctrl1, EFA_IO_TX_META_DESC_HAS_IMM, + 1); + } + + /* Set rest of the descriptor fields */ + efa_set_common_ctrl_flags(meta_desc, qp, EFA_IO_SEND); + meta_desc->req_id = efa_wq_get_next_wrid_idx(&qp->sq.wq, wr->wr_id); + meta_desc->dest_qp_num = wr->wr.ud.remote_qpn; + meta_desc->ah = ah->efa_ah; + meta_desc->qkey = wr->wr.ud.remote_qkey; + + /* Copy descriptor */ + sq_desc_offset = (qp->sq.wq.desc_idx & qp->sq.wq.desc_mask) * + sizeof(tx_wqe); + memcpy(qp->sq.desc + sq_desc_offset, &tx_wqe, sizeof(tx_wqe)); + + /* advance index and change phase */ + efa_sq_advance_post_idx(qp); + + wr = wr->next; + } + +ring_db: + mmio_flush_writes(); + mmio_write32(qp->sq.db, qp->sq.wq.desc_idx); + + /* + * Not using mmio_wc_spinunlock as the doorbell write should be done + * inside the lock. + */ + pthread_spin_unlock(&qp->sq.wq.wqlock); + return err; +} + +static int efa_send_wr_common(struct ibv_qp_ex *ibvqpx, + enum efa_io_send_op_type op_type) +{ + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); + struct efa_io_tx_meta_desc *meta_desc; + int err; + + if (unlikely(qp->wr_session_err)) + return qp->wr_session_err; + + err = efa_post_send_validate(qp, ibvqpx->wr_flags); + if (unlikely(err)) { + qp->wr_session_err = err; + return err; + } + + qp->sq.curr_tx_wqe = (struct efa_io_tx_wqe *)qp->sq.local_queue + + qp->sq.num_wqe_pending; + memset(qp->sq.curr_tx_wqe, 0, sizeof(*qp->sq.curr_tx_wqe)); + + meta_desc = &qp->sq.curr_tx_wqe->meta; + efa_set_common_ctrl_flags(meta_desc, qp, op_type); + meta_desc->req_id = efa_wq_get_next_wrid_idx(&qp->sq.wq, ibvqpx->wr_id); + + /* advance index and change phase */ + efa_sq_advance_post_idx(qp); + qp->sq.num_wqe_pending++; + + return 0; +} + +static void efa_send_wr_send(struct ibv_qp_ex *ibvqpx) +{ + efa_send_wr_common(ibvqpx, EFA_IO_SEND); +} + +static void efa_send_wr_send_imm(struct ibv_qp_ex *ibvqpx, __be32 imm_data) +{ + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); + struct efa_io_tx_meta_desc *meta_desc; + int err; + + err = efa_send_wr_common(ibvqpx, EFA_IO_SEND); + if (unlikely(err)) + return; + + meta_desc = &qp->sq.curr_tx_wqe->meta; + meta_desc->immediate_data = be32toh(imm_data); + EFA_SET(&meta_desc->ctrl1, EFA_IO_TX_META_DESC_HAS_IMM, 1); +} + +static void efa_send_wr_rdma_read(struct ibv_qp_ex *ibvqpx, uint32_t rkey, + uint64_t remote_addr) +{ + struct efa_io_remote_mem_addr *remote_mem; + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); + struct efa_io_tx_wqe *tx_wqe; + int err; + + err = efa_send_wr_common(ibvqpx, EFA_IO_RDMA_READ); + if (unlikely(err)) + return; + + tx_wqe = qp->sq.curr_tx_wqe; + remote_mem = &tx_wqe->data.rdma_req.remote_mem; + remote_mem->rkey = rkey; + 
remote_mem->buf_addr_lo = remote_addr & 0xFFFFFFFF; + remote_mem->buf_addr_hi = remote_addr >> 32; +} + +static void efa_send_wr_set_sge(struct ibv_qp_ex *ibvqpx, uint32_t lkey, + uint64_t addr, uint32_t length) +{ + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); + struct efa_io_tx_buf_desc *buf; + struct efa_io_tx_wqe *tx_wqe; + uint8_t op_type; + + if (unlikely(qp->wr_session_err)) + return; + + tx_wqe = qp->sq.curr_tx_wqe; + tx_wqe->meta.length = 1; + + op_type = EFA_GET(&tx_wqe->meta.ctrl1, EFA_IO_TX_META_DESC_OP_TYPE); + switch (op_type) { + case EFA_IO_SEND: + buf = &tx_wqe->data.sgl[0]; + break; + case EFA_IO_RDMA_READ: + tx_wqe->data.rdma_req.remote_mem.length = length; + buf = &tx_wqe->data.rdma_req.local_mem[0]; + break; + default: + return; + } + + efa_set_tx_buf(buf, addr, lkey, length); +} + +static void efa_send_wr_set_sge_list(struct ibv_qp_ex *ibvqpx, size_t num_sge, + const struct ibv_sge *sg_list) +{ + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); + struct efa_io_rdma_req *rdma_req; + struct efa_io_tx_wqe *tx_wqe; + uint8_t op_type; + + if (unlikely(qp->wr_session_err)) + return; + + tx_wqe = qp->sq.curr_tx_wqe; + op_type = EFA_GET(&tx_wqe->meta.ctrl1, EFA_IO_TX_META_DESC_OP_TYPE); + switch (op_type) { + case EFA_IO_SEND: + if (unlikely(num_sge > qp->sq.wq.max_sge)) { + qp->wr_session_err = EINVAL; + return; + } + efa_post_send_sgl(tx_wqe->data.sgl, sg_list, num_sge); + break; + case EFA_IO_RDMA_READ: + if (unlikely(num_sge > qp->sq.max_wr_rdma_sge)) { + qp->wr_session_err = EINVAL; + return; + } + rdma_req = &tx_wqe->data.rdma_req; + rdma_req->remote_mem.length = efa_sge_total_bytes(sg_list, + num_sge); + efa_post_send_sgl(rdma_req->local_mem, sg_list, num_sge); + break; + default: + return; + } + + tx_wqe->meta.length = num_sge; +} + +static void efa_send_wr_set_inline_data(struct ibv_qp_ex *ibvqpx, void *addr, + size_t length) +{ + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); + struct efa_io_tx_wqe *tx_wqe = qp->sq.curr_tx_wqe; + + if (unlikely(qp->wr_session_err)) + return; + + if (unlikely(length > qp->sq.max_inline_data)) { + qp->wr_session_err = EINVAL; + return; + } + + EFA_SET(&tx_wqe->meta.ctrl1, EFA_IO_TX_META_DESC_INLINE_MSG, 1); + memcpy(tx_wqe->data.inline_data, addr, length); + tx_wqe->meta.length = length; +} + +static void +efa_send_wr_set_inline_data_list(struct ibv_qp_ex *ibvqpx, + size_t num_buf, + const struct ibv_data_buf *buf_list) +{ + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); + struct efa_io_tx_wqe *tx_wqe = qp->sq.curr_tx_wqe; + uint32_t total_length = 0; + uint32_t length; + size_t i; + + if (unlikely(qp->wr_session_err)) + return; + + if (unlikely(efa_buf_list_total_bytes(buf_list, num_buf) > + qp->sq.max_inline_data)) { + qp->wr_session_err = EINVAL; + return; + } + + for (i = 0; i < num_buf; i++) { + length = buf_list[i].length; + + memcpy(tx_wqe->data.inline_data + total_length, + buf_list[i].addr, length); + total_length += length; + } + + EFA_SET(&tx_wqe->meta.ctrl1, EFA_IO_TX_META_DESC_INLINE_MSG, 1); + tx_wqe->meta.length = total_length; +} + +static void efa_send_wr_set_addr(struct ibv_qp_ex *ibvqpx, + struct ibv_ah *ibvah, + uint32_t remote_qpn, uint32_t remote_qkey) +{ + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); + struct efa_ah *ah = to_efa_ah(ibvah); + struct efa_io_tx_wqe *tx_wqe = qp->sq.curr_tx_wqe; + + if (unlikely(qp->wr_session_err)) + return; + + tx_wqe->meta.dest_qp_num = remote_qpn; + tx_wqe->meta.ah = ah->efa_ah; + tx_wqe->meta.qkey = remote_qkey; +} + +static void efa_send_wr_start(struct ibv_qp_ex *ibvqpx) +{ + struct efa_qp *qp = 
to_efa_qp_ex(ibvqpx); + + mmio_wc_spinlock(&qp->sq.wq.wqlock); + qp->wr_session_err = 0; + qp->sq.num_wqe_pending = 0; + qp->sq.phase_rb = qp->sq.wq.phase; +} + +static inline void efa_sq_roll_back(struct efa_qp *qp) +{ + qp->sq.wq.wqe_posted -= qp->sq.num_wqe_pending; + qp->sq.wq.desc_idx -= qp->sq.num_wqe_pending; + qp->sq.wq.wrid_idx_pool_next -= qp->sq.num_wqe_pending; + qp->sq.wq.phase = qp->sq.phase_rb; +} + +static int efa_send_wr_complete(struct ibv_qp_ex *ibvqpx) +{ + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); + uint32_t num_wqe_to_copy; + uint16_t local_idx = 0; + uint16_t sq_desc_idx; + + if (unlikely(qp->wr_session_err)) { + efa_sq_roll_back(qp); + goto out; + } + + /* + * Copy local queue to device in chunks, as the descriptor index + * might have wrapped around the submission queue. + */ + sq_desc_idx = (qp->sq.wq.desc_idx - qp->sq.num_wqe_pending) & + qp->sq.wq.desc_mask; + + while (qp->sq.num_wqe_pending) { + num_wqe_to_copy = min(qp->sq.num_wqe_pending, + qp->sq.wq.wqe_cnt - sq_desc_idx); + memcpy((struct efa_io_tx_wqe *)qp->sq.desc + sq_desc_idx, + (struct efa_io_tx_wqe *)qp->sq.local_queue + local_idx, + num_wqe_to_copy * sizeof(struct efa_io_tx_wqe)); + + qp->sq.num_wqe_pending -= num_wqe_to_copy; + local_idx += num_wqe_to_copy; + sq_desc_idx = (sq_desc_idx + num_wqe_to_copy) & + qp->sq.wq.desc_mask; + } + + mmio_flush_writes(); + mmio_write32(qp->sq.db, qp->sq.wq.desc_idx); +out: + /* + * Not using mmio_wc_spinunlock as the doorbell write should be done + * inside the lock. + */ + pthread_spin_unlock(&qp->sq.wq.wqlock); + + return qp->wr_session_err; +} + +static void efa_send_wr_abort(struct ibv_qp_ex *ibvqpx) +{ + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); + + efa_sq_roll_back(qp); + pthread_spin_unlock(&qp->sq.wq.wqlock); +} + +static void efa_qp_fill_wr_pfns(struct ibv_qp_ex *ibvqpx, + struct ibv_qp_init_attr_ex *attr_ex) +{ + ibvqpx->wr_start = efa_send_wr_start; + ibvqpx->wr_complete = efa_send_wr_complete; + ibvqpx->wr_abort = efa_send_wr_abort; + + if (attr_ex->send_ops_flags & IBV_QP_EX_WITH_SEND) + ibvqpx->wr_send = efa_send_wr_send; + + if (attr_ex->send_ops_flags & IBV_QP_EX_WITH_SEND_WITH_IMM) + ibvqpx->wr_send_imm = efa_send_wr_send_imm; + + if (attr_ex->send_ops_flags & IBV_QP_EX_WITH_RDMA_READ) + ibvqpx->wr_rdma_read = efa_send_wr_rdma_read; + + ibvqpx->wr_set_inline_data = efa_send_wr_set_inline_data; + ibvqpx->wr_set_inline_data_list = efa_send_wr_set_inline_data_list; + ibvqpx->wr_set_sge = efa_send_wr_set_sge; + ibvqpx->wr_set_sge_list = efa_send_wr_set_sge_list; + ibvqpx->wr_set_ud_addr = efa_send_wr_set_addr; +} + +static int efa_post_recv_validate(struct efa_qp *qp, struct ibv_recv_wr *wr) +{ + if (unlikely(qp->verbs_qp.qp.state == IBV_QPS_RESET || + qp->verbs_qp.qp.state == IBV_QPS_ERR)) + return EINVAL; + + if (unlikely(!qp->rcq)) + return EINVAL; + + if (unlikely(wr->num_sge > qp->rq.wq.max_sge)) + return EINVAL; + + if (unlikely(qp->rq.wq.wqe_posted - qp->rq.wq.wqe_completed == + qp->rq.wq.wqe_cnt)) + return ENOMEM; + + return 0; +} + +int efa_post_recv(struct ibv_qp *ibvqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad) +{ + struct efa_qp *qp = to_efa_qp(ibvqp); + struct efa_io_rx_desc rx_buf; + uint32_t rq_desc_offset; + uintptr_t addr; + int err = 0; + size_t i; + + pthread_spin_lock(&qp->rq.wq.wqlock); + while (wr) { + err = efa_post_recv_validate(qp, wr); + if (err) { + *bad = wr; + goto ring_db; + } + + memset(&rx_buf, 0, sizeof(rx_buf)); + + rx_buf.req_id = efa_wq_get_next_wrid_idx(&qp->rq.wq, wr->wr_id); + qp->rq.wq.wqe_posted++; + 
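+	/*
+	 * Each SGE consumes its own RX descriptor below: FIRST is set only
+	 * on the first descriptor of a WR and LAST on the final one, so
+	 * the device can stitch the scatter list back together.
+	 */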
+
+	/* Default init of the rx buffer */
+	EFA_SET(&rx_buf.lkey_ctrl, EFA_IO_RX_DESC_FIRST, 1);
+	EFA_SET(&rx_buf.lkey_ctrl, EFA_IO_RX_DESC_LAST, 0);
+
+	for (i = 0; i < wr->num_sge; i++) {
+	/* Set the last-descriptor indication if needed */
+	if (i == wr->num_sge - 1)
+	EFA_SET(&rx_buf.lkey_ctrl, EFA_IO_RX_DESC_LAST,
+	1);
+
+	addr = wr->sg_list[i].addr;
+
+	/* Set RX buffer desc from SGE */
+	rx_buf.length = wr->sg_list[i].length;
+	EFA_SET(&rx_buf.lkey_ctrl, EFA_IO_RX_DESC_LKEY,
+	wr->sg_list[i].lkey);
+	rx_buf.buf_addr_lo = addr;
+	rx_buf.buf_addr_hi = (uint64_t)addr >> 32;
+
+	/* Copy descriptor to RX ring */
+	rq_desc_offset = (qp->rq.wq.desc_idx & qp->rq.wq.desc_mask) * sizeof(rx_buf);
+	memcpy(qp->rq.buf + rq_desc_offset, &rx_buf, sizeof(rx_buf));
+
+	/* Wrap rx descriptor index */
+	qp->rq.wq.desc_idx++;
+	if (!(qp->rq.wq.desc_idx & qp->rq.wq.desc_mask))
+	qp->rq.wq.phase++;
+
+	/* reset descriptor for next iov */
+	memset(&rx_buf, 0, sizeof(rx_buf));
+	}
+	wr = wr->next;
+	}
+
+ring_db:
+	udma_to_device_barrier();
+	mmio_write32(qp->rq.db, qp->rq.wq.desc_idx);
+
+	pthread_spin_unlock(&qp->rq.wq.wqlock);
+	return err;
+}
+
+int efadv_query_ah(struct ibv_ah *ibvah, struct efadv_ah_attr *attr,
+	uint32_t inlen)
+{
+	uint64_t comp_mask_out = 0;
+
+	if (!is_efa_dev(ibvah->context->device))
+	return EOPNOTSUPP;
+
+	if (!vext_field_avail(typeof(*attr), ahn, inlen))
+	return EINVAL;
+
+	memset(attr, 0, inlen);
+	attr->ahn = to_efa_ah(ibvah)->efa_ah;
+
+	attr->comp_mask = comp_mask_out;
+
+	return 0;
+}
+
+struct ibv_ah *efa_create_ah(struct ibv_pd *ibvpd, struct ibv_ah_attr *attr)
+{
+	struct efa_create_ah_resp resp = {};
+	struct efa_ah *ah;
+	int err;
+
+	ah = calloc(1, sizeof(*ah));
+	if (!ah)
+	return NULL;
+
+	err = ibv_cmd_create_ah(ibvpd, &ah->ibvah, attr,
+	&resp.ibv_resp, sizeof(resp));
+	if (err) {
+	free(ah);
+	errno = err;
+	return NULL;
+	}
+
+	ah->efa_ah = resp.efa_address_handle;
+
+	return &ah->ibvah;
+}
+
+int efa_destroy_ah(struct ibv_ah *ibvah)
+{
+	struct efa_ah *ah;
+	int err;
+
+	ah = to_efa_ah(ibvah);
+	err = ibv_cmd_destroy_ah(ibvah);
+	if (err)
+	return err;
+	free(ah);
+
+	return 0;
+}
diff --git a/providers/efa/verbs.h b/providers/efa/verbs.h
new file mode 100644
index 0000000..8bf468d
--- /dev/null
+++ b/providers/efa/verbs.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
+/*
+ * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */ + +#ifndef __EFA_VERBS_H__ +#define __EFA_VERBS_H__ + +#include <infiniband/driver.h> +#include <infiniband/verbs.h> + +int efa_query_device(struct ibv_context *uctx, struct ibv_device_attr *attr); +int efa_query_port(struct ibv_context *uctx, uint8_t port, + struct ibv_port_attr *attr); +int efa_query_device_ex(struct ibv_context *context, + const struct ibv_query_device_ex_input *input, + struct ibv_device_attr_ex *attr, size_t attr_size); +struct ibv_pd *efa_alloc_pd(struct ibv_context *uctx); +int efa_dealloc_pd(struct ibv_pd *ibvpd); +struct ibv_mr *efa_reg_mr(struct ibv_pd *ibvpd, void *buf, size_t len, + uint64_t hca_va, int ibv_access_flags); +int efa_dereg_mr(struct verbs_mr *vmr); + +struct ibv_cq *efa_create_cq(struct ibv_context *uctx, int ncqe, + struct ibv_comp_channel *ch, int vec); +int efa_destroy_cq(struct ibv_cq *ibvcq); +int efa_poll_cq(struct ibv_cq *ibvcq, int nwc, struct ibv_wc *wc); + +struct ibv_qp *efa_create_qp(struct ibv_pd *ibvpd, + struct ibv_qp_init_attr *attr); +struct ibv_qp *efa_create_qp_ex(struct ibv_context *ibvctx, + struct ibv_qp_init_attr_ex *attr_ex); +int efa_modify_qp(struct ibv_qp *ibvqp, struct ibv_qp_attr *attr, + int ibv_qp_attr_mask); +int efa_query_qp(struct ibv_qp *ibvqp, struct ibv_qp_attr *attr, int attr_mask, + struct ibv_qp_init_attr *init_attr); +int efa_destroy_qp(struct ibv_qp *ibvqp); +int efa_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad); +int efa_post_recv(struct ibv_qp *ibvqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad); + +struct ibv_ah *efa_create_ah(struct ibv_pd *ibvpd, struct ibv_ah_attr *attr); +int efa_destroy_ah(struct ibv_ah *ibvah); + +#endif /* __EFA_VERBS_H__ */ diff --git a/providers/hfi1verbs/CMakeLists.txt b/providers/hfi1verbs/CMakeLists.txt new file mode 100644 index 0000000..702bb5e --- /dev/null +++ b/providers/hfi1verbs/CMakeLists.txt @@ -0,0 +1,4 @@ +rdma_provider(hfi1verbs + hfiverbs.c + verbs.c + ) diff --git a/providers/hfi1verbs/hfi-abi.h b/providers/hfi1verbs/hfi-abi.h new file mode 100644 index 0000000..edf05e8 --- /dev/null +++ b/providers/hfi1verbs/hfi-abi.h @@ -0,0 +1,92 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation + www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + Copyright (C) 2006-2007 QLogic Corporation, All rights reserved. + +*/ + +#ifndef HFI1_ABI_H +#define HFI1_ABI_H + +#include <infiniband/kern-abi.h> + +struct hfi1_get_context_resp { + struct ib_uverbs_get_context_resp ibv_resp; + __u32 version; +}; + +struct hfi1_create_cq_resp { + struct ib_uverbs_create_cq_resp ibv_resp; + __u64 offset; +}; + +struct hfi1_resize_cq_resp { + struct ib_uverbs_resize_cq_resp ibv_resp; + __u64 offset; +}; + +struct hfi1_create_qp_resp { + struct ib_uverbs_create_qp_resp ibv_resp; + __u64 offset; +}; + +struct hfi1_create_srq_resp { + struct ib_uverbs_create_srq_resp ibv_resp; + __u64 offset; +}; + +struct hfi1_modify_srq_cmd { + struct ibv_modify_srq ibv_cmd; + __u64 offset_addr; +}; + +#endif /* HFI1_ABI_H */ diff --git a/providers/hfi1verbs/hfiverbs.c b/providers/hfi1verbs/hfiverbs.c new file mode 100644 index 0000000..9bfb967 --- /dev/null +++ b/providers/hfi1verbs/hfiverbs.c @@ -0,0 +1,212 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation + www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + Copyright (C) 2006-2007 QLogic Corporation, All rights reserved. + Copyright (c) 2005. PathScale, Inc. All rights reserved. + +*/ + +#include <config.h> + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> + +#include "hfiverbs.h" +#include "hfi-abi.h" + +static void hfi1_free_context(struct ibv_context *ibctx); + +#ifndef PCI_VENDOR_ID_INTEL +#define PCI_VENDOR_ID_INTEL 0x8086 +#endif + +#ifndef PCI_DEVICE_ID_INTEL0 +#define PCI_DEVICE_ID_HFI_INTEL0 0x24f0 +#endif + +#ifndef PCI_DEVICE_ID_INTEL1 +#define PCI_DEVICE_ID_HFI_INTEL1 0x24f1 +#endif + +#define HFI(v, d) \ + VERBS_PCI_MATCH(PCI_VENDOR_ID_##v, PCI_DEVICE_ID_HFI_##d, NULL) +static const struct verbs_match_ent hca_table[] = { + VERBS_DRIVER_ID(RDMA_DRIVER_HFI1), + HFI(INTEL, INTEL0), + HFI(INTEL, INTEL1), + {} +}; + +static const struct verbs_context_ops hfi1_ctx_common_ops = { + .free_context = hfi1_free_context, + .query_device = hfi1_query_device, + .query_port = hfi1_query_port, + + .alloc_pd = hfi1_alloc_pd, + .dealloc_pd = hfi1_free_pd, + + .reg_mr = hfi1_reg_mr, + .dereg_mr = hfi1_dereg_mr, + + .create_cq = hfi1_create_cq, + .poll_cq = hfi1_poll_cq, + .req_notify_cq = ibv_cmd_req_notify_cq, + .resize_cq = hfi1_resize_cq, + .destroy_cq = hfi1_destroy_cq, + + .create_srq = hfi1_create_srq, + .modify_srq = hfi1_modify_srq, + .query_srq = hfi1_query_srq, + .destroy_srq = hfi1_destroy_srq, + .post_srq_recv = hfi1_post_srq_recv, + + .create_qp = hfi1_create_qp, + .query_qp = hfi1_query_qp, + .modify_qp = hfi1_modify_qp, + .destroy_qp = hfi1_destroy_qp, + + .post_send = hfi1_post_send, + .post_recv = hfi1_post_recv, + + .create_ah = hfi1_create_ah, + .destroy_ah = hfi1_destroy_ah, + + .attach_mcast = ibv_cmd_attach_mcast, + .detach_mcast = ibv_cmd_detach_mcast +}; + +static const struct verbs_context_ops hfi1_ctx_v1_ops = { + .create_cq = hfi1_create_cq_v1, + .create_qp = hfi1_create_qp_v1, + .create_srq = hfi1_create_srq_v1, + .destroy_cq = hfi1_destroy_cq_v1, + .destroy_qp = hfi1_destroy_qp_v1, + .destroy_srq = hfi1_destroy_srq_v1, + .modify_srq = hfi1_modify_srq_v1, + .poll_cq = ibv_cmd_poll_cq, + .post_recv = ibv_cmd_post_recv, + .post_srq_recv = ibv_cmd_post_srq_recv, + .resize_cq = hfi1_resize_cq_v1, +}; + +static struct verbs_context *hfi1_alloc_context(struct ibv_device *ibdev, + int cmd_fd, + void *private_data) +{ + struct hfi1_context *context; + struct ibv_get_context cmd; + struct ib_uverbs_get_context_resp resp; + struct hfi1_device *dev; + + context = verbs_init_and_alloc_context(ibdev, cmd_fd, context, ibv_ctx, + RDMA_DRIVER_HFI1); + if (!context) + return NULL; + + if (ibv_cmd_get_context(&context->ibv_ctx, &cmd, + sizeof cmd, &resp, sizeof resp)) + goto err_free; + + verbs_set_ops(&context->ibv_ctx, 
&hfi1_ctx_common_ops); + + dev = to_idev(ibdev); + if (dev->abi_version == 1) + verbs_set_ops(&context->ibv_ctx, &hfi1_ctx_v1_ops); + + return &context->ibv_ctx; + +err_free: + verbs_uninit_context(&context->ibv_ctx); + free(context); + return NULL; +} + +static void hfi1_free_context(struct ibv_context *ibctx) +{ + struct hfi1_context *context = to_ictx(ibctx); + + verbs_uninit_context(&context->ibv_ctx); + free(context); +} + +static void hf11_uninit_device(struct verbs_device *verbs_device) +{ + struct hfi1_device *dev = to_idev(&verbs_device->device); + + free(dev); +} + +static struct verbs_device *hfi1_device_alloc(struct verbs_sysfs_dev *sysfs_dev) +{ + struct hfi1_device *dev; + + dev = calloc(1, sizeof(*dev)); + if (!dev) + return NULL; + + dev->abi_version = sysfs_dev->abi_ver; + + return &dev->ibv_dev; +} + +static const struct verbs_device_ops hfi1_dev_ops = { + .name = "hfi1verbs", + .match_min_abi_version = 0, + .match_max_abi_version = INT_MAX, + .match_table = hca_table, + .alloc_device = hfi1_device_alloc, + .uninit_device = hf11_uninit_device, + .alloc_context = hfi1_alloc_context, +}; +PROVIDER_DRIVER(hfi1verbs, hfi1_dev_ops); diff --git a/providers/hfi1verbs/hfiverbs.h b/providers/hfi1verbs/hfiverbs.h new file mode 100644 index 0000000..b9e91d8 --- /dev/null +++ b/providers/hfi1verbs/hfiverbs.h @@ -0,0 +1,280 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation + www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + Copyright (C) 2006-2009 QLogic Corporation, All rights reserved. + Copyright (c) 2005. PathScale, Inc. All rights reserved. + +*/ + +#ifndef HFI1_H +#define HFI1_H + +#include <endian.h> +#include <pthread.h> +#include <stddef.h> +#include <stdatomic.h> + +#include <infiniband/driver.h> +#include <infiniband/verbs.h> + +#define PFX "hfi1: " + +struct hfi1_device { + struct verbs_device ibv_dev; + int abi_version; +}; + +struct hfi1_context { + struct verbs_context ibv_ctx; +}; + +/* + * This structure needs to have the same size and offsets as + * the kernel's ib_wc structure since it is memory mapped. + */ +struct hfi1_wc { + uint64_t wr_id; + enum ibv_wc_status status; + enum ibv_wc_opcode opcode; + uint32_t vendor_err; + uint32_t byte_len; + uint32_t imm_data; /* in network byte order */ + uint32_t qp_num; + uint32_t src_qp; + enum ibv_wc_flags wc_flags; + uint16_t pkey_index; + uint16_t slid; + uint8_t sl; + uint8_t dlid_path_bits; + uint8_t port_num; +}; + +struct hfi1_cq_wc { + _Atomic(uint32_t) head; + _Atomic(uint32_t) tail; + struct hfi1_wc queue[1]; +}; + +struct hfi1_cq { + struct ibv_cq ibv_cq; + struct hfi1_cq_wc *queue; + pthread_spinlock_t lock; +}; + +/* + * Receive work request queue entry. + * The size of the sg_list is determined when the QP is created and stored + * in qp->r_max_sge. + */ +struct hfi1_rwqe { + uint64_t wr_id; + uint8_t num_sge; + uint8_t padding[7]; + struct ibv_sge sg_list[0]; +}; + +/* + * This struture is used to contain the head pointer, tail pointer, + * and receive work queue entries as a single memory allocation so + * it can be mmap'ed into user space. + * Note that the wq array elements are variable size so you can't + * just index into the array to get the N'th element; + * use get_rwqe_ptr() instead. + */ +struct hfi1_rwq { + _Atomic(uint32_t) head; /* new requests posted to the head. */ + _Atomic(uint32_t) tail; /* receives pull requests from here. */ + struct hfi1_rwqe wq[0]; +}; + +struct hfi1_rq { + struct hfi1_rwq *rwq; + pthread_spinlock_t lock; + uint32_t size; + uint32_t max_sge; +}; + +struct hfi1_qp { + struct ibv_qp ibv_qp; + struct hfi1_rq rq; +}; + +struct hfi1_srq { + struct ibv_srq ibv_srq; + struct hfi1_rq rq; +}; + +#define to_ixxx(xxx, type) \ + container_of(ib##xxx, struct hfi1_##type, ibv_##xxx) + +static inline struct hfi1_context *to_ictx(struct ibv_context *ibctx) +{ + return container_of(ibctx, struct hfi1_context, ibv_ctx.context); +} + +static inline struct hfi1_device *to_idev(struct ibv_device *ibdev) +{ + return container_of(ibdev, struct hfi1_device, ibv_dev.device); +} + +static inline struct hfi1_cq *to_icq(struct ibv_cq *ibcq) +{ + return to_ixxx(cq, cq); +} + +static inline struct hfi1_qp *to_iqp(struct ibv_qp *ibqp) +{ + return to_ixxx(qp, qp); +} + +static inline struct hfi1_srq *to_isrq(struct ibv_srq *ibsrq) +{ + return to_ixxx(srq, srq); +} + +/* + * Since struct hfi1_rwqe is not a fixed size, we can't simply index into + * struct hfi1_rq.wq. 
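+ * Each element takes sizeof(struct hfi1_rwqe) plus room for
+ * max_sge struct ibv_sge entries, so the stride is only known at
+ * run time.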
This function does the array index computation. + */ +static inline struct hfi1_rwqe *get_rwqe_ptr(struct hfi1_rq *rq, + unsigned n) +{ + return (struct hfi1_rwqe *) + ((char *) rq->rwq->wq + + (sizeof(struct hfi1_rwqe) + + rq->max_sge * sizeof(struct ibv_sge)) * n); +} + +extern int hfi1_query_device(struct ibv_context *context, + struct ibv_device_attr *attr); + +extern int hfi1_query_port(struct ibv_context *context, uint8_t port, + struct ibv_port_attr *attr); + +struct ibv_pd *hfi1_alloc_pd(struct ibv_context *pd); + +int hfi1_free_pd(struct ibv_pd *pd); + +struct ibv_mr *hfi1_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + uint64_t hca_va, int access); + +int hfi1_dereg_mr(struct verbs_mr *vmr); + +struct ibv_cq *hfi1_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector); + +struct ibv_cq *hfi1_create_cq_v1(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector); + +int hfi1_resize_cq(struct ibv_cq *cq, int cqe); + +int hfi1_resize_cq_v1(struct ibv_cq *cq, int cqe); + +int hfi1_destroy_cq(struct ibv_cq *cq); + +int hfi1_destroy_cq_v1(struct ibv_cq *cq); + +int hfi1_poll_cq(struct ibv_cq *cq, int ne, struct ibv_wc *wc); + +struct ibv_qp *hfi1_create_qp(struct ibv_pd *pd, + struct ibv_qp_init_attr *attr); + +struct ibv_qp *hfi1_create_qp_v1(struct ibv_pd *pd, + struct ibv_qp_init_attr *attr); + +int hfi1_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr); + +int hfi1_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask); + +int hfi1_destroy_qp(struct ibv_qp *qp); + +int hfi1_destroy_qp_v1(struct ibv_qp *qp); + +int hfi1_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr); + +int hfi1_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); + +struct ibv_srq *hfi1_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *attr); + +struct ibv_srq *hfi1_create_srq_v1(struct ibv_pd *pd, + struct ibv_srq_init_attr *attr); + +int hfi1_modify_srq(struct ibv_srq *srq, + struct ibv_srq_attr *attr, + int attr_mask); + +int hfi1_modify_srq_v1(struct ibv_srq *srq, + struct ibv_srq_attr *attr, + int attr_mask); + +int hfi1_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr); + +int hfi1_destroy_srq(struct ibv_srq *srq); + +int hfi1_destroy_srq_v1(struct ibv_srq *srq); + +int hfi1_post_srq_recv(struct ibv_srq *srq, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); + +struct ibv_ah *hfi1_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr); + +int hfi1_destroy_ah(struct ibv_ah *ah); + +#endif /* HFI1_H */ diff --git a/providers/hfi1verbs/verbs.c b/providers/hfi1verbs/verbs.c new file mode 100644 index 0000000..275f8d5 --- /dev/null +++ b/providers/hfi1verbs/verbs.c @@ -0,0 +1,703 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + Contact Information: + Intel Corporation + www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + Copyright (C) 2006-2009 QLogic Corporation, All rights reserved. + Copyright (c) 2005. PathScale, Inc. All rights reserved. + +*/ + +#include <config.h> + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <pthread.h> +#include <sys/mman.h> +#include <errno.h> + +#include "hfiverbs.h" +#include "hfi-abi.h" + +int hfi1_query_device(struct ibv_context *context, + struct ibv_device_attr *attr) +{ + struct ibv_query_device cmd; + uint64_t raw_fw_ver; + unsigned major, minor, sub_minor; + int ret; + + ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, + &cmd, sizeof cmd); + if (ret) + return ret; + + major = (raw_fw_ver >> 32) & 0xffff; + minor = (raw_fw_ver >> 16) & 0xffff; + sub_minor = raw_fw_ver & 0xffff; + + snprintf(attr->fw_ver, sizeof attr->fw_ver, + "%d.%d.%d", major, minor, sub_minor); + + return 0; +} + +int hfi1_query_port(struct ibv_context *context, uint8_t port, + struct ibv_port_attr *attr) +{ + struct ibv_query_port cmd; + + return ibv_cmd_query_port(context, port, attr, &cmd, sizeof cmd); +} + +struct ibv_pd *hfi1_alloc_pd(struct ibv_context *context) +{ + struct ibv_alloc_pd cmd; + struct ib_uverbs_alloc_pd_resp resp; + struct ibv_pd *pd; + + pd = malloc(sizeof *pd); + if (!pd) + return NULL; + + if (ibv_cmd_alloc_pd(context, pd, &cmd, sizeof cmd, + &resp, sizeof resp)) { + free(pd); + return NULL; + } + + return pd; +} + +int hfi1_free_pd(struct ibv_pd *pd) +{ + int ret; + + ret = ibv_cmd_dealloc_pd(pd); + if (ret) + return ret; + + free(pd); + return 0; +} + +struct ibv_mr *hfi1_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + uint64_t hca_va, int access) +{ + struct verbs_mr *vmr; + struct ibv_reg_mr cmd; + struct ib_uverbs_reg_mr_resp resp; + int ret; + + vmr = malloc(sizeof(*vmr)); + if (!vmr) + return NULL; + + ret = ibv_cmd_reg_mr(pd, addr, length, hca_va, access, vmr, &cmd, + sizeof(cmd), &resp, sizeof(resp)); + + if (ret) { + free(vmr); + return NULL; + } + + return 
&vmr->ibv_mr; +} + +int hfi1_dereg_mr(struct verbs_mr *vmr) +{ + int ret; + + ret = ibv_cmd_dereg_mr(vmr); + if (ret) + return ret; + + free(vmr); + return 0; +} + +struct ibv_cq *hfi1_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector) +{ + struct hfi1_cq *cq; + struct hfi1_create_cq_resp resp; + int ret; + size_t size; + + memset(&resp, 0, sizeof(resp)); + cq = malloc(sizeof *cq); + if (!cq) + return NULL; + + ret = ibv_cmd_create_cq(context, cqe, channel, comp_vector, + &cq->ibv_cq, NULL, 0, + &resp.ibv_resp, sizeof resp); + if (ret) { + free(cq); + return NULL; + } + + size = sizeof(struct hfi1_cq_wc) + sizeof(struct hfi1_wc) * cqe; + cq->queue = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, + context->cmd_fd, resp.offset); + if ((void *) cq->queue == MAP_FAILED) { + ibv_cmd_destroy_cq(&cq->ibv_cq); + free(cq); + return NULL; + } + + pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE); + return &cq->ibv_cq; +} + +struct ibv_cq *hfi1_create_cq_v1(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector) +{ + struct ibv_cq *cq; + int ret; + + cq = malloc(sizeof *cq); + if (!cq) + return NULL; + + ret = ibv_cmd_create_cq(context, cqe, channel, comp_vector, + cq, NULL, 0, NULL, 0); + if (ret) { + free(cq); + return NULL; + } + + return cq; +} + +int hfi1_resize_cq(struct ibv_cq *ibcq, int cqe) +{ + struct hfi1_cq *cq = to_icq(ibcq); + struct ibv_resize_cq cmd; + struct hfi1_resize_cq_resp resp; + size_t size; + int ret; + + memset(&resp, 0, sizeof(resp)); + pthread_spin_lock(&cq->lock); + /* Save the old size so we can unmmap the queue. */ + size = sizeof(struct hfi1_cq_wc) + + (sizeof(struct hfi1_wc) * cq->ibv_cq.cqe); + ret = ibv_cmd_resize_cq(ibcq, cqe, &cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp); + if (ret) { + pthread_spin_unlock(&cq->lock); + return ret; + } + (void) munmap(cq->queue, size); + size = sizeof(struct hfi1_cq_wc) + + (sizeof(struct hfi1_wc) * cq->ibv_cq.cqe); + cq->queue = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, + ibcq->context->cmd_fd, resp.offset); + ret = errno; + pthread_spin_unlock(&cq->lock); + if ((void *) cq->queue == MAP_FAILED) + return ret; + return 0; +} + +int hfi1_resize_cq_v1(struct ibv_cq *ibcq, int cqe) +{ + struct ibv_resize_cq cmd; + struct ib_uverbs_resize_cq_resp resp; + + return ibv_cmd_resize_cq(ibcq, cqe, &cmd, sizeof cmd, + &resp, sizeof resp); +} + +int hfi1_destroy_cq(struct ibv_cq *ibcq) +{ + struct hfi1_cq *cq = to_icq(ibcq); + int ret; + + ret = ibv_cmd_destroy_cq(ibcq); + if (ret) + return ret; + + (void) munmap(cq->queue, sizeof(struct hfi1_cq_wc) + + (sizeof(struct hfi1_wc) * cq->ibv_cq.cqe)); + free(cq); + return 0; +} + +int hfi1_destroy_cq_v1(struct ibv_cq *ibcq) +{ + int ret; + + ret = ibv_cmd_destroy_cq(ibcq); + if (!ret) + free(ibcq); + return ret; +} + +int hfi1_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc) +{ + struct hfi1_cq *cq = to_icq(ibcq); + struct hfi1_cq_wc *q; + int npolled; + uint32_t tail; + + pthread_spin_lock(&cq->lock); + q = cq->queue; + tail = atomic_load_explicit(&q->tail, memory_order_relaxed); + for (npolled = 0; npolled < ne; ++npolled, ++wc) { + if (tail == atomic_load(&q->head)) + break; + /* Make sure entry is read after head index is read. 
*/ + atomic_thread_fence(memory_order_acquire); + memcpy(wc, &q->queue[tail], sizeof(*wc)); + if (tail == cq->ibv_cq.cqe) + tail = 0; + else + tail++; + } + atomic_store(&q->tail, tail); + pthread_spin_unlock(&cq->lock); + + return npolled; +} + +struct ibv_qp *hfi1_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr) +{ + struct ibv_create_qp cmd; + struct hfi1_create_qp_resp resp; + struct hfi1_qp *qp; + int ret; + size_t size; + + memset(&resp, 0, sizeof(resp)); + qp = malloc(sizeof *qp); + if (!qp) + return NULL; + + ret = ibv_cmd_create_qp(pd, &qp->ibv_qp, attr, &cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp); + if (ret) { + free(qp); + return NULL; + } + + if (attr->srq) { + qp->rq.size = 0; + qp->rq.max_sge = 0; + qp->rq.rwq = NULL; + } else { + qp->rq.size = attr->cap.max_recv_wr + 1; + qp->rq.max_sge = attr->cap.max_recv_sge; + size = sizeof(struct hfi1_rwq) + + (sizeof(struct hfi1_rwqe) + + (sizeof(struct ibv_sge) * qp->rq.max_sge)) * + qp->rq.size; + qp->rq.rwq = mmap(NULL, size, + PROT_READ | PROT_WRITE, MAP_SHARED, + pd->context->cmd_fd, resp.offset); + if ((void *) qp->rq.rwq == MAP_FAILED) { + ibv_cmd_destroy_qp(&qp->ibv_qp); + free(qp); + return NULL; + } + } + + pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE); + return &qp->ibv_qp; +} + +struct ibv_qp *hfi1_create_qp_v1(struct ibv_pd *pd, + struct ibv_qp_init_attr *attr) +{ + struct ibv_create_qp cmd; + struct ib_uverbs_create_qp_resp resp; + struct ibv_qp *qp; + int ret; + + qp = malloc(sizeof *qp); + if (!qp) + return NULL; + + ret = ibv_cmd_create_qp(pd, qp, attr, &cmd, sizeof cmd, + &resp, sizeof resp); + if (ret) { + free(qp); + return NULL; + } + + return qp; +} + +int hfi1_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr) +{ + struct ibv_query_qp cmd; + + return ibv_cmd_query_qp(qp, attr, attr_mask, init_attr, + &cmd, sizeof cmd); +} + +int hfi1_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask) +{ + struct ibv_modify_qp cmd = {}; + + return ibv_cmd_modify_qp(qp, attr, attr_mask, &cmd, sizeof cmd); +} + +int hfi1_destroy_qp(struct ibv_qp *ibqp) +{ + struct hfi1_qp *qp = to_iqp(ibqp); + int ret; + + ret = ibv_cmd_destroy_qp(ibqp); + if (ret) + return ret; + + if (qp->rq.rwq) { + size_t size; + + size = sizeof(struct hfi1_rwq) + + (sizeof(struct hfi1_rwqe) + + (sizeof(struct ibv_sge) * qp->rq.max_sge)) * + qp->rq.size; + (void) munmap(qp->rq.rwq, size); + } + free(qp); + return 0; +} + +int hfi1_destroy_qp_v1(struct ibv_qp *ibqp) +{ + int ret; + + ret = ibv_cmd_destroy_qp(ibqp); + if (!ret) + free(ibqp); + return ret; +} + +int hfi1_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr) +{ + unsigned wr_count; + struct ibv_send_wr *i; + + /* Sanity check the number of WRs being posted */ + for (i = wr, wr_count = 0; i; i = i->next) + if (++wr_count > 10) + goto iter; + + return ibv_cmd_post_send(qp, wr, bad_wr); + +iter: + do { + struct ibv_send_wr *next; + int ret; + + next = i->next; + i->next = NULL; + ret = ibv_cmd_post_send(qp, wr, bad_wr); + i->next = next; + if (ret) + return ret; + if (next == NULL) + break; + wr = next; + for (i = wr, wr_count = 0; i->next; i = i->next) + if (++wr_count > 2) + break; + } while (1); + return 0; +} + +static int post_recv(struct hfi1_rq *rq, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct ibv_recv_wr *i; + struct hfi1_rwq *rwq; + struct hfi1_rwqe *wqe; + uint32_t head; + int n, ret; + + pthread_spin_lock(&rq->lock); + rwq = rq->rwq; + 
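Taken together, hfi1_poll_cq() above and post_recv() below implement the two ends of a single-producer/single-consumer ring shared with the kernel: the producer writes the entry, issues a release fence, then publishes the new index; the consumer reads the index, issues an acquire fence, then reads the entry. A stripped-down sketch of that fence pairing using C11 atomics (an illustrative ring type, not the shared hfi1 queue layout):

#include <stdatomic.h>
#include <stdint.h>

#define RING_SLOTS 64

struct ring {
        _Atomic uint32_t head;          /* next slot the producer fills */
        _Atomic uint32_t tail;          /* next slot the consumer reads */
        uint64_t slot[RING_SLOTS];
};

static int produce(struct ring *r, uint64_t v)
{
        uint32_t head = atomic_load_explicit(&r->head, memory_order_relaxed);
        uint32_t next = (head + 1) % RING_SLOTS;

        if (next == atomic_load(&r->tail))
                return -1;              /* full: one slot is kept empty */
        r->slot[head] = v;
        /* Make the entry visible before the new head is. */
        atomic_thread_fence(memory_order_release);
        atomic_store(&r->head, next);
        return 0;
}

static int consume(struct ring *r, uint64_t *v)
{
        uint32_t tail = atomic_load_explicit(&r->tail, memory_order_relaxed);

        if (tail == atomic_load(&r->head))
                return -1;              /* empty */
        /* Read the entry only after the head that published it. */
        atomic_thread_fence(memory_order_acquire);
        *v = r->slot[tail];
        atomic_store(&r->tail, (tail + 1) % RING_SLOTS);
        return 0;
}

The real queues differ in detail (the index wraps at the queue size rather than a power of two, and the peer lives in the kernel), but the acquire/release pairing is the same.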
head = atomic_load_explicit(&rwq->head, memory_order_relaxed); + for (i = wr; i; i = i->next) { + if ((unsigned) i->num_sge > rq->max_sge) { + ret = EINVAL; + goto bad; + } + wqe = get_rwqe_ptr(rq, head); + if (++head >= rq->size) + head = 0; + if (head == atomic_load(&rwq->tail)) { + ret = ENOMEM; + goto bad; + } + wqe->wr_id = i->wr_id; + wqe->num_sge = i->num_sge; + for (n = 0; n < wqe->num_sge; n++) + wqe->sg_list[n] = i->sg_list[n]; + + /* Make sure queue entry is written before the head index. */ + atomic_thread_fence(memory_order_release); + atomic_store(&rwq->head, head); + } + ret = 0; + goto done; + +bad: + if (bad_wr) + *bad_wr = i; +done: + pthread_spin_unlock(&rq->lock); + return ret; +} + +int hfi1_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct hfi1_qp *qp = to_iqp(ibqp); + + return post_recv(&qp->rq, wr, bad_wr); +} + +struct ibv_srq *hfi1_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *attr) +{ + struct hfi1_srq *srq; + struct ibv_create_srq cmd; + struct hfi1_create_srq_resp resp; + int ret; + size_t size; + + memset(&resp, 0, sizeof(resp)); + srq = malloc(sizeof *srq); + if (srq == NULL) + return NULL; + + ret = ibv_cmd_create_srq(pd, &srq->ibv_srq, attr, &cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp); + if (ret) { + free(srq); + return NULL; + } + + srq->rq.size = attr->attr.max_wr + 1; + srq->rq.max_sge = attr->attr.max_sge; + size = sizeof(struct hfi1_rwq) + + (sizeof(struct hfi1_rwqe) + + (sizeof(struct ibv_sge) * srq->rq.max_sge)) * srq->rq.size; + srq->rq.rwq = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, + pd->context->cmd_fd, resp.offset); + if ((void *) srq->rq.rwq == MAP_FAILED) { + ibv_cmd_destroy_srq(&srq->ibv_srq); + free(srq); + return NULL; + } + + pthread_spin_init(&srq->rq.lock, PTHREAD_PROCESS_PRIVATE); + return &srq->ibv_srq; +} + +struct ibv_srq *hfi1_create_srq_v1(struct ibv_pd *pd, + struct ibv_srq_init_attr *attr) +{ + struct ibv_srq *srq; + struct ibv_create_srq cmd; + struct ib_uverbs_create_srq_resp resp; + int ret; + + srq = malloc(sizeof *srq); + if (srq == NULL) + return NULL; + + ret = ibv_cmd_create_srq(pd, srq, attr, &cmd, sizeof cmd, + &resp, sizeof resp); + if (ret) { + free(srq); + return NULL; + } + + return srq; +} + +int hfi1_modify_srq(struct ibv_srq *ibsrq, + struct ibv_srq_attr *attr, + int attr_mask) +{ + struct hfi1_srq *srq = to_isrq(ibsrq); + struct hfi1_modify_srq_cmd cmd; + __u64 offset; + size_t size = 0; /* Shut up gcc */ + int ret; + + if (attr_mask & IBV_SRQ_MAX_WR) { + pthread_spin_lock(&srq->rq.lock); + /* Save the old size so we can unmmap the queue. */ + size = sizeof(struct hfi1_rwq) + + (sizeof(struct hfi1_rwqe) + + (sizeof(struct ibv_sge) * srq->rq.max_sge)) * + srq->rq.size; + } + cmd.offset_addr = (uintptr_t) &offset; + ret = ibv_cmd_modify_srq(ibsrq, attr, attr_mask, + &cmd.ibv_cmd, sizeof cmd); + if (ret) { + if (attr_mask & IBV_SRQ_MAX_WR) + pthread_spin_unlock(&srq->rq.lock); + return ret; + } + if (attr_mask & IBV_SRQ_MAX_WR) { + (void) munmap(srq->rq.rwq, size); + srq->rq.size = attr->max_wr + 1; + size = sizeof(struct hfi1_rwq) + + (sizeof(struct hfi1_rwqe) + + (sizeof(struct ibv_sge) * srq->rq.max_sge)) * + srq->rq.size; + srq->rq.rwq = mmap(NULL, size, + PROT_READ | PROT_WRITE, MAP_SHARED, + ibsrq->context->cmd_fd, offset); + pthread_spin_unlock(&srq->rq.lock); + /* XXX Now we have no receive queue. 
*/ + if ((void *) srq->rq.rwq == MAP_FAILED) + return errno; + } + return 0; +} + +int hfi1_modify_srq_v1(struct ibv_srq *ibsrq, + struct ibv_srq_attr *attr, + int attr_mask) +{ + struct ibv_modify_srq cmd; + + return ibv_cmd_modify_srq(ibsrq, attr, attr_mask, + &cmd, sizeof cmd); +} + +int hfi1_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr) +{ + struct ibv_query_srq cmd; + + return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd); +} + +int hfi1_destroy_srq(struct ibv_srq *ibsrq) +{ + struct hfi1_srq *srq = to_isrq(ibsrq); + size_t size; + int ret; + + ret = ibv_cmd_destroy_srq(ibsrq); + if (ret) + return ret; + + size = sizeof(struct hfi1_rwq) + + (sizeof(struct hfi1_rwqe) + + (sizeof(struct ibv_sge) * srq->rq.max_sge)) * srq->rq.size; + (void) munmap(srq->rq.rwq, size); + free(srq); + return 0; +} + +int hfi1_destroy_srq_v1(struct ibv_srq *ibsrq) +{ + int ret; + + ret = ibv_cmd_destroy_srq(ibsrq); + if (!ret) + free(ibsrq); + return ret; +} + +int hfi1_post_srq_recv(struct ibv_srq *ibsrq, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct hfi1_srq *srq = to_isrq(ibsrq); + + return post_recv(&srq->rq, wr, bad_wr); +} + +struct ibv_ah *hfi1_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) +{ + struct ibv_ah *ah; + struct ib_uverbs_create_ah_resp resp; + + ah = malloc(sizeof *ah); + if (ah == NULL) + return NULL; + + memset(&resp, 0, sizeof(resp)); + if (ibv_cmd_create_ah(pd, ah, attr, &resp, sizeof(resp))) { + free(ah); + return NULL; + } + + return ah; +} + +int hfi1_destroy_ah(struct ibv_ah *ah) +{ + int ret; + + ret = ibv_cmd_destroy_ah(ah); + if (ret) + return ret; + + free(ah); + return 0; +} diff --git a/providers/hns/CMakeLists.txt b/providers/hns/CMakeLists.txt new file mode 100644 index 0000000..697dbd7 --- /dev/null +++ b/providers/hns/CMakeLists.txt @@ -0,0 +1,8 @@ +rdma_provider(hns + hns_roce_u.c + hns_roce_u_buf.c + hns_roce_u_db.c + hns_roce_u_hw_v1.c + hns_roce_u_hw_v2.c + hns_roce_u_verbs.c +) diff --git a/providers/hns/hns_roce_u.c b/providers/hns/hns_roce_u.c new file mode 100644 index 0000000..e5b9488 --- /dev/null +++ b/providers/hns/hns_roce_u.c @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2016-2017 Hisilicon Limited. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "hns_roce_u.h"
+#include "hns_roce_u_abi.h"
+
+static void hns_roce_free_context(struct ibv_context *ibctx);
+
+#define HID_LEN 15
+#define DEV_MATCH_LEN 128
+
+#ifndef PCI_VENDOR_ID_HUAWEI
+#define PCI_VENDOR_ID_HUAWEI 0x19E5
+#endif
+
+static const struct verbs_match_ent hca_table[] = {
+	VERBS_MODALIAS_MATCH("acpi*:HISI00D1:*", &hns_roce_u_hw_v1),
+	VERBS_MODALIAS_MATCH("of:N*T*Chisilicon,hns-roce-v1C*", &hns_roce_u_hw_v1),
+	VERBS_MODALIAS_MATCH("of:N*T*Chisilicon,hns-roce-v1", &hns_roce_u_hw_v1),
+	VERBS_PCI_MATCH(PCI_VENDOR_ID_HUAWEI, 0xA222, &hns_roce_u_hw_v2),
+	VERBS_PCI_MATCH(PCI_VENDOR_ID_HUAWEI, 0xA223, &hns_roce_u_hw_v2),
+	VERBS_PCI_MATCH(PCI_VENDOR_ID_HUAWEI, 0xA224, &hns_roce_u_hw_v2),
+	VERBS_PCI_MATCH(PCI_VENDOR_ID_HUAWEI, 0xA225, &hns_roce_u_hw_v2),
+	VERBS_PCI_MATCH(PCI_VENDOR_ID_HUAWEI, 0xA226, &hns_roce_u_hw_v2),
+	VERBS_PCI_MATCH(PCI_VENDOR_ID_HUAWEI, 0xA227, &hns_roce_u_hw_v2),
+	{}
+};
+
+static const struct verbs_context_ops hns_common_ops = {
+	.alloc_mw = hns_roce_u_alloc_mw,
+	.alloc_pd = hns_roce_u_alloc_pd,
+	.bind_mw = hns_roce_u_bind_mw,
+	.cq_event = hns_roce_u_cq_event,
+	.create_cq = hns_roce_u_create_cq,
+	.create_qp = hns_roce_u_create_qp,
+	.dealloc_mw = hns_roce_u_dealloc_mw,
+	.dealloc_pd = hns_roce_u_free_pd,
+	.dereg_mr = hns_roce_u_dereg_mr,
+	.destroy_cq = hns_roce_u_destroy_cq,
+	.modify_cq = hns_roce_u_modify_cq,
+	.query_device = hns_roce_u_query_device,
+	.query_port = hns_roce_u_query_port,
+	.query_qp = hns_roce_u_query_qp,
+	.reg_mr = hns_roce_u_reg_mr,
+	.rereg_mr = hns_roce_u_rereg_mr,
+	.create_srq = hns_roce_u_create_srq,
+	.modify_srq = hns_roce_u_modify_srq,
+	.query_srq = hns_roce_u_query_srq,
+	.destroy_srq = hns_roce_u_destroy_srq,
+	.free_context = hns_roce_free_context,
+};
+
+static struct verbs_context *hns_roce_alloc_context(struct ibv_device *ibdev,
+						    int cmd_fd,
+						    void *private_data)
+{
+	int i;
+	struct ibv_get_context cmd;
+	struct ibv_device_attr dev_attrs;
+	struct hns_roce_context *context;
+	struct hns_roce_alloc_ucontext_resp resp = {};
+	struct hns_roce_device *hr_dev = to_hr_dev(ibdev);
+
+	context = verbs_init_and_alloc_context(ibdev, cmd_fd, context, ibv_ctx,
+					       RDMA_DRIVER_HNS);
+	if (!context)
+		return NULL;
+
+	if (ibv_cmd_get_context(&context->ibv_ctx, &cmd, sizeof(cmd),
+				&resp.ibv_resp, sizeof(resp)))
+		goto err_free;
+
+	context->num_qps = resp.qp_tab_size;
+	context->qp_table_shift = ffs(context->num_qps) - 1 -
+				  HNS_ROCE_QP_TABLE_BITS;
+	context->qp_table_mask = (1 << context->qp_table_shift) - 1;
+
+	pthread_mutex_init(&context->qp_table_mutex, NULL);
+	for (i = 0; i < HNS_ROCE_QP_TABLE_SIZE; ++i)
+		context->qp_table[i].refcnt = 0;
+
+	context->uar = mmap(NULL, hr_dev->page_size,
+			    PROT_READ | PROT_WRITE, MAP_SHARED, cmd_fd, 0);
+	if (context->uar == MAP_FAILED) {
+		fprintf(stderr, PFX "Warning: failed to mmap() uar page.\n");
+		goto err_free;
+	}
+
+	if (hr_dev->hw_version == HNS_ROCE_HW_VER1) {
+		/*
+		 * When vma->vm_pgoff is 1, the mapped region holds the tail
+		 * pointers for up to 64K CQs; each CQ's pointer needs 2 bytes.
+		 */
+		context->cq_tptr_base = mmap(NULL, HNS_ROCE_CQ_DB_BUF_SIZE,
+					     PROT_READ | PROT_WRITE, MAP_SHARED,
+					     cmd_fd, HNS_ROCE_TPTR_OFFSET);
+		if (context->cq_tptr_base == MAP_FAILED) {
+			fprintf(stderr,
+				PFX "Warning: Failed to mmap cq_tptr page.\n");
+			goto db_free;
+		}
+	}
+
+	pthread_spin_init(&context->uar_lock, PTHREAD_PROCESS_PRIVATE);
+
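The qp_table sizing above sets up a two-level lookup: the QPN space (num_qps, a power of two reported by the kernel) is divided into HNS_ROCE_QP_TABLE_SIZE buckets, each lazily allocated and covering 2^qp_table_shift consecutive QPNs; hns_roce_find_qp() in hns_roce_u_hw_v1.c later in this patch indexes it with the same arithmetic. A small illustrative sketch of the lookup, assuming fields initialized as in hns_roce_alloc_context() (stand-in type, <stdint.h> assumed):

struct qp_table {
        struct hns_roce_qp **bucket[HNS_ROCE_QP_TABLE_SIZE];
        unsigned int num_qps;   /* power of two: resp.qp_tab_size */
        unsigned int shift;     /* ffs(num_qps) - 1 - HNS_ROCE_QP_TABLE_BITS */
        unsigned int mask;      /* (1 << shift) - 1 */
};

static struct hns_roce_qp *qp_lookup(struct qp_table *t, uint32_t qpn)
{
        unsigned int tind = (qpn & (t->num_qps - 1)) >> t->shift;

        /* a NULL bucket means no QP in that QPN range is alive */
        return t->bucket[tind] ? t->bucket[tind][qpn & t->mask] : NULL;
}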
verbs_set_ops(&context->ibv_ctx, &hns_common_ops); + verbs_set_ops(&context->ibv_ctx, &hr_dev->u_hw->hw_ops); + + if (hns_roce_u_query_device(&context->ibv_ctx.context, &dev_attrs)) + goto tptr_free; + + context->max_qp_wr = dev_attrs.max_qp_wr; + context->max_sge = dev_attrs.max_sge; + context->max_cqe = dev_attrs.max_cqe; + + return &context->ibv_ctx; + +tptr_free: + if (hr_dev->hw_version == HNS_ROCE_HW_VER1) { + if (munmap(context->cq_tptr_base, HNS_ROCE_CQ_DB_BUF_SIZE)) + fprintf(stderr, PFX "Warning: Munmap tptr failed.\n"); + context->cq_tptr_base = NULL; + } + +db_free: + munmap(context->uar, hr_dev->page_size); + context->uar = NULL; + +err_free: + verbs_uninit_context(&context->ibv_ctx); + free(context); + return NULL; +} + +static void hns_roce_free_context(struct ibv_context *ibctx) +{ + struct hns_roce_device *hr_dev = to_hr_dev(ibctx->device); + struct hns_roce_context *context = to_hr_ctx(ibctx); + + munmap(context->uar, hr_dev->page_size); + if (hr_dev->hw_version == HNS_ROCE_HW_VER1) + munmap(context->cq_tptr_base, HNS_ROCE_CQ_DB_BUF_SIZE); + + verbs_uninit_context(&context->ibv_ctx); + free(context); +} + +static void hns_uninit_device(struct verbs_device *verbs_device) +{ + struct hns_roce_device *dev = to_hr_dev(&verbs_device->device); + + free(dev); +} + +static struct verbs_device *hns_device_alloc(struct verbs_sysfs_dev *sysfs_dev) +{ + struct hns_roce_device *dev; + + dev = calloc(1, sizeof(*dev)); + if (!dev) + return NULL; + + dev->u_hw = sysfs_dev->match->driver_data; + dev->hw_version = dev->u_hw->hw_version; + dev->page_size = sysconf(_SC_PAGESIZE); + return &dev->ibv_dev; +} + +static const struct verbs_device_ops hns_roce_dev_ops = { + .name = "hns", + .match_min_abi_version = 0, + .match_max_abi_version = INT_MAX, + .match_table = hca_table, + .alloc_device = hns_device_alloc, + .uninit_device = hns_uninit_device, + .alloc_context = hns_roce_alloc_context, +}; +PROVIDER_DRIVER(hns, hns_roce_dev_ops); diff --git a/providers/hns/hns_roce_u.h b/providers/hns/hns_roce_u.h new file mode 100644 index 0000000..3579070 --- /dev/null +++ b/providers/hns/hns_roce_u.h @@ -0,0 +1,345 @@ +/* + * Copyright (c) 2016-2017 Hisilicon Limited. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
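A few lines below, this header defines the roce_get_field()/roce_set_field() helpers that read and write sub-fields of little-endian hardware words via a (mask, shift) pair. A worked example of what an invocation expands to, using a hypothetical 32-bit word whose bits 8..15 carry a command code (uint32_t stands in for the driver's little-endian words; htole32/le32toh from <endian.h> as used throughout this patch):

#define CMD_M 0x0000ff00        /* hypothetical mask: bits 8..15 */
#define CMD_S 8                 /* hypothetical shift */

/* roce_set_field(word, CMD_M, CMD_S, 0x3a) does roughly: */
static inline void set_cmd(uint32_t *word, uint32_t val)
{
        *word &= ~htole32(CMD_M);                       /* clear the field */
        *word |= htole32((val << CMD_S) & CMD_M);       /* insert the value */
}

/* roce_get_field(word, CMD_M, CMD_S) does: */
static inline uint32_t get_cmd(uint32_t word)
{
        return (le32toh(word) & CMD_M) >> CMD_S;
}

After set_cmd(&w, 0x3a) on a zeroed w, get_cmd(w) returns 0x3a regardless of host endianness, and the device always sees the field at the same byte offsets.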
+ */ + +#ifndef _HNS_ROCE_U_H +#define _HNS_ROCE_U_H + +#include <stddef.h> +#include <endian.h> +#include <util/compiler.h> + +#include <infiniband/driver.h> +#include <util/udma_barrier.h> +#include <infiniband/verbs.h> +#include <ccan/bitmap.h> +#include <ccan/container_of.h> + +#define HNS_ROCE_HW_VER1 ('h' << 24 | 'i' << 16 | '0' << 8 | '6') + +#define HNS_ROCE_HW_VER2 ('h' << 24 | 'i' << 16 | '0' << 8 | '8') + +#define PFX "hns: " + +#define HNS_ROCE_MAX_INLINE_DATA_LEN 32 +#define HNS_ROCE_MAX_CQ_NUM 0x10000 +#define HNS_ROCE_MAX_SRQWQE_NUM 0x8000 +#define HNS_ROCE_MAX_SRQSGE_NUM 0x100 +#define HNS_ROCE_MIN_CQE_NUM 0x40 +#define HNS_ROCE_V1_MIN_WQE_NUM 0x20 +#define HNS_ROCE_V2_MIN_WQE_NUM 0x40 + +#define HNS_ROCE_CQE_ENTRY_SIZE 0x20 +#define HNS_ROCE_SQWQE_SHIFT 6 +#define HNS_ROCE_SGE_IN_WQE 2 +#define HNS_ROCE_SGE_SIZE 16 +#define HNS_ROCE_SGE_SHIFT 4 + +#define HNS_ROCE_GID_SIZE 16 + +#define HNS_ROCE_CQ_DB_BUF_SIZE ((HNS_ROCE_MAX_CQ_NUM >> 11) << 12) +#define HNS_ROCE_TPTR_OFFSET 0x1000 +#define HNS_ROCE_STATIC_RATE 3 /* Gbps */ + +#define HNS_ROCE_ADDRESS_MASK 0xFFFFFFFF +#define HNS_ROCE_ADDRESS_SHIFT 32 + +#define roce_get_field(origin, mask, shift) \ + (((le32toh(origin)) & (mask)) >> (shift)) + +#define roce_get_bit(origin, shift) \ + roce_get_field((origin), (1ul << (shift)), (shift)) + +#define roce_set_field(origin, mask, shift, val) \ + do { \ + (origin) &= ~htole32(mask); \ + (origin) |= htole32(((unsigned int)(val) << (shift)) & (mask)); \ + } while (0) + +#define roce_set_bit(origin, shift, val) \ + roce_set_field((origin), (1ul << (shift)), (shift), (val)) + +#define hr_ilog32(n) ilog32((n) - 1) + +enum { + HNS_ROCE_QP_TABLE_BITS = 8, + HNS_ROCE_QP_TABLE_SIZE = 1 << HNS_ROCE_QP_TABLE_BITS, +}; + +/* operation type list */ +enum { + /* rq&srq operation */ + HNS_ROCE_OPCODE_SEND_DATA_RECEIVE = 0x06, + HNS_ROCE_OPCODE_RDMA_WITH_IMM_RECEIVE = 0x07, +}; + +struct hns_roce_device { + struct verbs_device ibv_dev; + int page_size; + const struct hns_roce_u_hw *u_hw; + int hw_version; +}; + +struct hns_roce_buf { + void *buf; + unsigned int length; +}; + +#define BIT_CNT_PER_BYTE 8 +#define BIT_CNT_PER_U64 64 + +/* the sw doorbell type; */ +enum hns_roce_db_type { + HNS_ROCE_QP_TYPE_DB, + HNS_ROCE_CQ_TYPE_DB, + HNS_ROCE_DB_TYPE_NUM +}; + +struct hns_roce_db_page { + struct hns_roce_db_page *prev, *next; + struct hns_roce_buf buf; + unsigned int num_db; + unsigned int use_cnt; + bitmap *bitmap; +}; + +struct hns_roce_context { + struct verbs_context ibv_ctx; + void *uar; + pthread_spinlock_t uar_lock; + + void *cq_tptr_base; + + struct { + struct hns_roce_qp **table; + int refcnt; + } qp_table[HNS_ROCE_QP_TABLE_SIZE]; + + pthread_mutex_t qp_table_mutex; + + int num_qps; + int qp_table_shift; + int qp_table_mask; + + struct hns_roce_db_page *db_list[HNS_ROCE_DB_TYPE_NUM]; + pthread_mutex_t db_list_mutex; + + unsigned int max_qp_wr; + unsigned int max_sge; + int max_cqe; +}; + +struct hns_roce_pd { + struct ibv_pd ibv_pd; + unsigned int pdn; +}; + +struct hns_roce_cq { + struct ibv_cq ibv_cq; + struct hns_roce_buf buf; + pthread_spinlock_t lock; + unsigned int cqn; + unsigned int cq_depth; + unsigned int cons_index; + unsigned int *set_ci_db; + unsigned int *arm_db; + int arm_sn; + unsigned long flags; +}; + +struct hns_roce_idx_que { + struct hns_roce_buf buf; + int buf_size; + int entry_sz; + unsigned long *bitmap; +}; + +struct hns_roce_srq { + struct verbs_srq verbs_srq; + struct hns_roce_buf buf; + pthread_spinlock_t lock; + unsigned long *wrid; + unsigned int srqn; + 
unsigned int max_wqe; + unsigned int max_gs; + unsigned int wqe_shift; + int head; + int tail; + unsigned int *db; + unsigned short counter; + struct hns_roce_idx_que idx_que; +}; + +struct hns_roce_wq { + unsigned long *wrid; + pthread_spinlock_t lock; + unsigned int wqe_cnt; + int max_post; + unsigned int head; + unsigned int tail; + unsigned int max_gs; + unsigned int wqe_shift; + unsigned int shift; /* wq size is 2^shift */ + int offset; +}; + +/* record the result of sge process */ +struct hns_roce_sge_info { + unsigned int valid_num; /* sge length is not 0 */ + unsigned int start_idx; /* start position of extend sge */ + unsigned int total_len; /* total length of valid sges */ +}; + +struct hns_roce_sge_ex { + int offset; + unsigned int sge_cnt; + int sge_shift; +}; + +struct hns_roce_rinl_sge { + void *addr; + unsigned int len; +}; + +struct hns_roce_rinl_wqe { + struct hns_roce_rinl_sge *sg_list; + unsigned int sge_cnt; +}; + +struct hns_roce_rinl_buf { + struct hns_roce_rinl_wqe *wqe_list; + unsigned int wqe_cnt; +}; + +struct hns_roce_qp { + struct ibv_qp ibv_qp; + struct hns_roce_buf buf; + int max_inline_data; + int buf_size; + unsigned int sq_signal_bits; + struct hns_roce_wq sq; + struct hns_roce_wq rq; + unsigned int *rdb; + unsigned int *sdb; + struct hns_roce_sge_ex ex_sge; + unsigned int next_sge; + int port_num; + int sl; + + struct hns_roce_rinl_buf rq_rinl_buf; + unsigned long flags; +}; + +struct hns_roce_u_hw { + uint32_t hw_version; + struct verbs_context_ops hw_ops; +}; + +static inline struct hns_roce_device *to_hr_dev(struct ibv_device *ibv_dev) +{ + return container_of(ibv_dev, struct hns_roce_device, ibv_dev.device); +} + +static inline struct hns_roce_context *to_hr_ctx(struct ibv_context *ibv_ctx) +{ + return container_of(ibv_ctx, struct hns_roce_context, ibv_ctx.context); +} + +static inline struct hns_roce_pd *to_hr_pd(struct ibv_pd *ibv_pd) +{ + return container_of(ibv_pd, struct hns_roce_pd, ibv_pd); +} + +static inline struct hns_roce_cq *to_hr_cq(struct ibv_cq *ibv_cq) +{ + return container_of(ibv_cq, struct hns_roce_cq, ibv_cq); +} + +static inline struct hns_roce_srq *to_hr_srq(struct ibv_srq *ibv_srq) +{ + return container_of(container_of(ibv_srq, struct verbs_srq, srq), + struct hns_roce_srq, verbs_srq); +} + +static inline struct hns_roce_qp *to_hr_qp(struct ibv_qp *ibv_qp) +{ + return container_of(ibv_qp, struct hns_roce_qp, ibv_qp); +} + +int hns_roce_u_query_device(struct ibv_context *context, + struct ibv_device_attr *attr); +int hns_roce_u_query_port(struct ibv_context *context, uint8_t port, + struct ibv_port_attr *attr); + +struct ibv_pd *hns_roce_u_alloc_pd(struct ibv_context *context); +int hns_roce_u_free_pd(struct ibv_pd *pd); + +struct ibv_mr *hns_roce_u_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + uint64_t hca_va, int access); +int hns_roce_u_rereg_mr(struct verbs_mr *mr, int flags, struct ibv_pd *pd, + void *addr, size_t length, int access); +int hns_roce_u_dereg_mr(struct verbs_mr *mr); + +struct ibv_mw *hns_roce_u_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type); +int hns_roce_u_dealloc_mw(struct ibv_mw *mw); +int hns_roce_u_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw, + struct ibv_mw_bind *mw_bind); + +struct ibv_cq *hns_roce_u_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector); + +int hns_roce_u_modify_cq(struct ibv_cq *cq, struct ibv_modify_cq_attr *attr); +int hns_roce_u_destroy_cq(struct ibv_cq *cq); +void hns_roce_u_cq_event(struct ibv_cq *cq); + +struct ibv_srq 
*hns_roce_u_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *srq_init_attr); +int hns_roce_u_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr, + int srq_attr_mask); +int hns_roce_u_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr); +int hns_roce_u_destroy_srq(struct ibv_srq *srq); +struct ibv_qp *hns_roce_u_create_qp(struct ibv_pd *pd, + struct ibv_qp_init_attr *attr); + +int hns_roce_u_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, + int attr_mask, struct ibv_qp_init_attr *init_attr); + +int hns_roce_alloc_buf(struct hns_roce_buf *buf, unsigned int size, + int page_size); +void hns_roce_free_buf(struct hns_roce_buf *buf); + +void hns_roce_init_qp_indices(struct hns_roce_qp *qp); + +extern const struct hns_roce_u_hw hns_roce_u_hw_v1; +extern const struct hns_roce_u_hw hns_roce_u_hw_v2; + +#endif /* _HNS_ROCE_U_H */ diff --git a/providers/hns/hns_roce_u_abi.h b/providers/hns/hns_roce_u_abi.h new file mode 100644 index 0000000..79fd7dd --- /dev/null +++ b/providers/hns/hns_roce_u_abi.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2016 Hisilicon Limited. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _HNS_ROCE_U_ABI_H +#define _HNS_ROCE_U_ABI_H + +#include <infiniband/kern-abi.h> +#include <rdma/hns-abi.h> +#include <kernel-abi/hns-abi.h> + +DECLARE_DRV_CMD(hns_roce_alloc_pd, IB_USER_VERBS_CMD_ALLOC_PD, + empty, hns_roce_ib_alloc_pd_resp); +DECLARE_DRV_CMD(hns_roce_create_cq, IB_USER_VERBS_CMD_CREATE_CQ, + hns_roce_ib_create_cq, hns_roce_ib_create_cq_resp); +DECLARE_DRV_CMD(hns_roce_create_qp, IB_USER_VERBS_CMD_CREATE_QP, + hns_roce_ib_create_qp, hns_roce_ib_create_qp_resp); +DECLARE_DRV_CMD(hns_roce_alloc_ucontext, IB_USER_VERBS_CMD_GET_CONTEXT, + empty, hns_roce_ib_alloc_ucontext_resp); + +DECLARE_DRV_CMD(hns_roce_create_srq, IB_USER_VERBS_CMD_CREATE_SRQ, + hns_roce_ib_create_srq, hns_roce_ib_create_srq_resp); + +#endif /* _HNS_ROCE_U_ABI_H */ diff --git a/providers/hns/hns_roce_u_buf.c b/providers/hns/hns_roce_u_buf.c new file mode 100644 index 0000000..471dd9c --- /dev/null +++ b/providers/hns/hns_roce_u_buf.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2016 Hisilicon Limited. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <errno.h> +#include <sys/mman.h> +#include <util/util.h> + +#include "hns_roce_u.h" + +int hns_roce_alloc_buf(struct hns_roce_buf *buf, unsigned int size, + int page_size) +{ + int ret; + + buf->length = align(size, page_size); + buf->buf = mmap(NULL, buf->length, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (buf->buf == MAP_FAILED) + return errno; + + ret = ibv_dontfork_range(buf->buf, buf->length); + if (ret) + munmap(buf->buf, buf->length); + + return ret; +} + +void hns_roce_free_buf(struct hns_roce_buf *buf) +{ + ibv_dofork_range(buf->buf, buf->length); + + munmap(buf->buf, buf->length); +} diff --git a/providers/hns/hns_roce_u_db.c b/providers/hns/hns_roce_u_db.c new file mode 100644 index 0000000..6b1dd97 --- /dev/null +++ b/providers/hns/hns_roce_u_db.c @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2017 Hisilicon Limited. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
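hns_roce_alloc_buf() above rounds the request up to whole pages, maps anonymous memory, and then calls ibv_dontfork_range(), which marks the range MADV_DONTFORK so a forked child can never end up sharing pages the device may be DMA-ing into. A hedged usage sketch (error handling trimmed, sizes illustrative):

struct hns_roce_buf buf;
int page_size = sysconf(_SC_PAGESIZE);

/* Asking for 1000 bytes maps align(1000, page_size) bytes, i.e. one
 * full page on a 4K-page system.  On failure the call returns errno
 * from mmap(), or ibv_dontfork_range()'s error after unmapping. */
if (hns_roce_alloc_buf(&buf, 1000, page_size))
        return -1;

/* ... hand buf.buf / buf.length to the device, e.g. as a queue ring ... */

hns_roce_free_buf(&buf);        /* undoes the dontfork mark, then munmap()s */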
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <ccan/bitmap.h>
+#include "hns_roce_u.h"
+#include "hns_roce_u_db.h"
+
+/* bytes per software doorbell record, indexed by doorbell type */
+static const unsigned int db_size[] = {
+	[HNS_ROCE_QP_TYPE_DB] = 4,
+	[HNS_ROCE_CQ_TYPE_DB] = 4,
+};
+
+static struct hns_roce_db_page *hns_roce_add_db_page(
+						struct hns_roce_context *ctx,
+						enum hns_roce_db_type type)
+{
+	struct hns_roce_db_page *page;
+	int page_size;
+
+	page_size = to_hr_dev(ctx->ibv_ctx.context.device)->page_size;
+	page = calloc(1, sizeof(*page));
+	if (!page)
+		goto err_page;
+
+	/* allocate the free-slot bitmap for the page; all slots start free */
+	page->num_db = page_size / db_size[type];
+	page->use_cnt = 0;
+	page->bitmap = bitmap_alloc1(page->num_db);
+	if (!page->bitmap)
+		goto err_map;
+
+	if (hns_roce_alloc_buf(&(page->buf), page_size, page_size))
+		goto err;
+
+	/* link the new page at the head of ctx->db_list */
+	page->prev = NULL;
+	page->next = ctx->db_list[type];
+	ctx->db_list[type] = page;
+	if (page->next)
+		page->next->prev = page;
+
+	return page;
+err:
+	free(page->bitmap);
+
+err_map:
+	free(page);
+
+err_page:
+	return NULL;
+}
+
+static void hns_roce_clear_db_page(struct hns_roce_db_page *page)
+{
+	assert(page);
+
+	free(page->bitmap);
+	hns_roce_free_buf(&(page->buf));
+}
+
+void *hns_roce_alloc_db(struct hns_roce_context *ctx,
+			enum hns_roce_db_type type)
+{
+	struct hns_roce_db_page *page;
+	void *db = NULL;
+	uint32_t npos;
+
+	pthread_mutex_lock((pthread_mutex_t *)&ctx->db_list_mutex);
+
+	for (page = ctx->db_list[type]; page != NULL; page = page->next)
+		if (page->use_cnt < page->num_db)
+			goto found;
+
+	page = hns_roce_add_db_page(ctx, type);
+	if (!page)
+		goto out;
+
+found:
+	++page->use_cnt;
+
+	npos = bitmap_ffs(page->bitmap, 0, page->num_db);
+	bitmap_clear_bit(page->bitmap, npos);
+	db = page->buf.buf + npos * db_size[type];
+
+out:
+	pthread_mutex_unlock((pthread_mutex_t *)&ctx->db_list_mutex);
+
+	return db;
+}
+
+void hns_roce_free_db(struct hns_roce_context *ctx, unsigned int *db,
+		      enum hns_roce_db_type type)
+{
+	struct hns_roce_db_page *page;
+	uint32_t npos;
+	uint32_t page_size;
+
+	pthread_mutex_lock((pthread_mutex_t *)&ctx->db_list_mutex);
+
+	page_size = to_hr_dev(ctx->ibv_ctx.context.device)->page_size;
+	for (page = ctx->db_list[type]; page != NULL; page = page->next)
+		if (((uintptr_t)db & (~((uintptr_t)page_size - 1))) ==
+		    (uintptr_t)(page->buf.buf))
+			goto found;
+
+	goto out;
+
+found:
+	--page->use_cnt;
+	if (!page->use_cnt) {
+		if (page->prev)
+			page->prev->next = page->next;
+		else
+			ctx->db_list[type] = page->next;
+
+		if (page->next)
+			page->next->prev = page->prev;
+
+		hns_roce_clear_db_page(page);
+		free(page);
+
+		goto out;
+	}
+
+	npos = ((uintptr_t)db - (uintptr_t)page->buf.buf) / db_size[type];
+	bitmap_set_bit(page->bitmap, npos);
+
+out:
+	pthread_mutex_unlock((pthread_mutex_t *)&ctx->db_list_mutex);
+}
diff --git a/providers/hns/hns_roce_u_db.h b/providers/hns/hns_roce_u_db.h
new file mode 100644
index 0000000..b44e64d
--- /dev/null
+++ b/providers/hns/hns_roce_u_db.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016 Hisilicon Limited.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/types.h> + +#include "hns_roce_u.h" + +#ifndef _HNS_ROCE_U_DB_H +#define _HNS_ROCE_U_DB_H + +#if __BYTE_ORDER == __LITTLE_ENDIAN +#define HNS_ROCE_PAIR_TO_64(val) ((uint64_t) val[1] << 32 | val[0]) +#elif __BYTE_ORDER == __BIG_ENDIAN +#define HNS_ROCE_PAIR_TO_64(val) ((uint64_t) val[0] << 32 | val[1]) +#else +#error __BYTE_ORDER not defined +#endif + +static inline void hns_roce_write64(uint32_t val[2], + struct hns_roce_context *ctx, int offset) +{ + *(volatile uint64_t *) (ctx->uar + offset) = HNS_ROCE_PAIR_TO_64(val); +} + +void *hns_roce_alloc_db(struct hns_roce_context *ctx, + enum hns_roce_db_type type); +void hns_roce_free_db(struct hns_roce_context *ctx, unsigned int *db, + enum hns_roce_db_type type); + +#endif /* _HNS_ROCE_U_DB_H */ diff --git a/providers/hns/hns_roce_u_hw_v1.c b/providers/hns/hns_roce_u_hw_v1.c new file mode 100644 index 0000000..247e797 --- /dev/null +++ b/providers/hns/hns_roce_u_hw_v1.c @@ -0,0 +1,825 @@ +/* + * Copyright (c) 2016 Hisilicon Limited. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
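hns_roce_write64() in hns_roce_u_db.h above folds a two-word doorbell into one 64-bit MMIO store, with HNS_ROCE_PAIR_TO_64 ordering the halves by host endianness so the device always observes val[0] in the lower-addressed bytes, presumably so the whole doorbell reaches the device in a single transfer rather than two 32-bit writes that could be seen half-updated. A quick standalone check of the little-endian pairing (printf in place of the MMIO store):

#include <stdint.h>
#include <stdio.h>

/* the little-endian variant of HNS_ROCE_PAIR_TO_64 */
#define PAIR_TO_64_LE(val) ((uint64_t) (val)[1] << 32 | (val)[0])

int main(void)
{
        uint32_t db[2] = { 0x11223344, 0xaabbccdd };

        /* Prints 0xaabbccdd11223344: stored little-endian, the byte
         * stream is 44 33 22 11 dd cc bb aa, i.e. db[0] then db[1],
         * exactly what two consecutive 32-bit stores would produce. */
        printf("0x%016llx\n", (unsigned long long)PAIR_TO_64_LE(db));
        return 0;
}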
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdio.h> +#include <string.h> +#include "hns_roce_u_db.h" +#include "hns_roce_u_hw_v1.h" +#include "hns_roce_u.h" + +static inline void set_raddr_seg(struct hns_roce_wqe_raddr_seg *rseg, + uint64_t remote_addr, uint32_t rkey) +{ + rseg->raddr = htole64(remote_addr); + rseg->rkey = htole32(rkey); + rseg->len = 0; +} + +static void set_data_seg(struct hns_roce_wqe_data_seg *dseg, struct ibv_sge *sg) +{ + + dseg->lkey = htole32(sg->lkey); + dseg->addr = htole64(sg->addr); + dseg->len = htole32(sg->length); +} + +static void hns_roce_update_rq_head(struct hns_roce_context *ctx, + unsigned int qpn, unsigned int rq_head) +{ + struct hns_roce_rq_db rq_db = {}; + + roce_set_field(rq_db.u32_4, RQ_DB_U32_4_RQ_HEAD_M, + RQ_DB_U32_4_RQ_HEAD_S, rq_head); + roce_set_field(rq_db.u32_8, RQ_DB_U32_8_QPN_M, RQ_DB_U32_8_QPN_S, qpn); + roce_set_field(rq_db.u32_8, RQ_DB_U32_8_CMD_M, RQ_DB_U32_8_CMD_S, 1); + roce_set_bit(rq_db.u32_8, RQ_DB_U32_8_HW_SYNC_S, 1); + + udma_to_device_barrier(); + + hns_roce_write64((uint32_t *)&rq_db, ctx, ROCEE_DB_OTHERS_L_0_REG); +} + +static void hns_roce_update_sq_head(struct hns_roce_context *ctx, + unsigned int qpn, unsigned int port, + unsigned int sl, unsigned int sq_head) +{ + struct hns_roce_sq_db sq_db = {}; + + roce_set_field(sq_db.u32_4, SQ_DB_U32_4_SQ_HEAD_M, + SQ_DB_U32_4_SQ_HEAD_S, sq_head); + roce_set_field(sq_db.u32_4, SQ_DB_U32_4_PORT_M, SQ_DB_U32_4_PORT_S, + port); + roce_set_field(sq_db.u32_4, SQ_DB_U32_4_SL_M, SQ_DB_U32_4_SL_S, sl); + roce_set_field(sq_db.u32_8, SQ_DB_U32_8_QPN_M, SQ_DB_U32_8_QPN_S, qpn); + roce_set_bit(sq_db.u32_8, SQ_DB_U32_8_HW_SYNC, 1); + + udma_to_device_barrier(); + + hns_roce_write64((uint32_t *)&sq_db, ctx, ROCEE_DB_SQ_L_0_REG); +} + +static void hns_roce_update_cq_cons_index(struct hns_roce_context *ctx, + struct hns_roce_cq *cq) +{ + struct hns_roce_cq_db cq_db = {}; + + roce_set_bit(cq_db.u32_8, CQ_DB_U32_8_HW_SYNC_S, 1); + roce_set_field(cq_db.u32_8, CQ_DB_U32_8_CMD_M, CQ_DB_U32_8_CMD_S, 3); + roce_set_field(cq_db.u32_8, CQ_DB_U32_8_CMD_MDF_M, + CQ_DB_U32_8_CMD_MDF_S, 0); + roce_set_field(cq_db.u32_8, CQ_DB_U32_8_CQN_M, CQ_DB_U32_8_CQN_S, + cq->cqn); + roce_set_field(cq_db.u32_4, CQ_DB_U32_4_CONS_IDX_M, + CQ_DB_U32_4_CONS_IDX_S, + cq->cons_index & ((cq->cq_depth << 1) - 1)); + + hns_roce_write64((uint32_t *)&cq_db, ctx, ROCEE_DB_OTHERS_L_0_REG); +} + +static void hns_roce_handle_error_cqe(struct hns_roce_cqe *cqe, + struct ibv_wc *wc) +{ + fprintf(stderr, PFX "error cqe!\n"); + switch (roce_get_field(cqe->cqe_byte_4, + CQE_BYTE_4_STATUS_OF_THE_OPERATION_M, + CQE_BYTE_4_STATUS_OF_THE_OPERATION_S) & + HNS_ROCE_CQE_STATUS_MASK) { + case HNS_ROCE_CQE_SYNDROME_LOCAL_LENGTH_ERR: + wc->status = IBV_WC_LOC_LEN_ERR; + break; + case HNS_ROCE_CQE_SYNDROME_LOCAL_QP_OP_ERR: + wc->status = IBV_WC_LOC_QP_OP_ERR; + break; + case HNS_ROCE_CQE_SYNDROME_LOCAL_PROT_ERR: + wc->status = IBV_WC_LOC_PROT_ERR; + break; + case HNS_ROCE_CQE_SYNDROME_WR_FLUSH_ERR: + wc->status = IBV_WC_WR_FLUSH_ERR; + break; + case HNS_ROCE_CQE_SYNDROME_MEM_MANAGE_OPERATE_ERR: + wc->status = IBV_WC_MW_BIND_ERR; + break; + case HNS_ROCE_CQE_SYNDROME_BAD_RESP_ERR: + wc->status = IBV_WC_BAD_RESP_ERR; + break; + case HNS_ROCE_CQE_SYNDROME_LOCAL_ACCESS_ERR: + wc->status = IBV_WC_LOC_ACCESS_ERR; 
+ break; + case HNS_ROCE_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR: + wc->status = IBV_WC_REM_INV_REQ_ERR; + break; + case HNS_ROCE_CQE_SYNDROME_REMOTE_ACCESS_ERR: + wc->status = IBV_WC_REM_ACCESS_ERR; + break; + case HNS_ROCE_CQE_SYNDROME_REMOTE_OP_ERR: + wc->status = IBV_WC_REM_OP_ERR; + break; + case HNS_ROCE_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR: + wc->status = IBV_WC_RETRY_EXC_ERR; + break; + case HNS_ROCE_CQE_SYNDROME_RNR_RETRY_EXC_ERR: + wc->status = IBV_WC_RNR_RETRY_EXC_ERR; + break; + default: + wc->status = IBV_WC_GENERAL_ERR; + break; + } +} + +static struct hns_roce_cqe *get_cqe(struct hns_roce_cq *cq, int entry) +{ + return cq->buf.buf + entry * HNS_ROCE_CQE_ENTRY_SIZE; +} + +static void *get_sw_cqe(struct hns_roce_cq *cq, int n) +{ + struct hns_roce_cqe *cqe = get_cqe(cq, n & cq->ibv_cq.cqe); + + return (!!(roce_get_bit(cqe->cqe_byte_4, CQE_BYTE_4_OWNER_S)) ^ + !!(n & (cq->ibv_cq.cqe + 1))) ? cqe : NULL; +} + +static struct hns_roce_cqe *next_cqe_sw(struct hns_roce_cq *cq) +{ + return get_sw_cqe(cq, cq->cons_index); +} + +static void *get_recv_wqe(struct hns_roce_qp *qp, int n) +{ + if ((n < 0) || (n > qp->rq.wqe_cnt)) { + printf("rq wqe index:%d,rq wqe cnt:%d\r\n", n, qp->rq.wqe_cnt); + return NULL; + } + + return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift); +} + +static void *get_send_wqe(struct hns_roce_qp *qp, int n) +{ + if ((n < 0) || (n > qp->sq.wqe_cnt)) { + printf("sq wqe index:%d,sq wqe cnt:%d\r\n", n, qp->sq.wqe_cnt); + return NULL; + } + + return (void *)(qp->buf.buf + qp->sq.offset + (n << qp->sq.wqe_shift)); +} + +static int hns_roce_wq_overflow(struct hns_roce_wq *wq, int nreq, + struct hns_roce_cq *cq) +{ + unsigned int cur; + + cur = wq->head - wq->tail; + if (cur + nreq < wq->max_post) + return 0; + + /* While the num of wqe exceeds cap of the device, cq will be locked */ + pthread_spin_lock(&cq->lock); + cur = wq->head - wq->tail; + pthread_spin_unlock(&cq->lock); + + printf("wq:(head = %d, tail = %d, max_post = %d), nreq = 0x%x\n", + wq->head, wq->tail, wq->max_post, nreq); + + return cur + nreq >= wq->max_post; +} + +static struct hns_roce_qp *hns_roce_find_qp(struct hns_roce_context *ctx, + uint32_t qpn) +{ + int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift; + + if (ctx->qp_table[tind].refcnt) { + return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask]; + } else { + printf("hns_roce_find_qp fail!\n"); + return NULL; + } +} + +static void hns_roce_clear_qp(struct hns_roce_context *ctx, uint32_t qpn) +{ + int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift; + + if (!--ctx->qp_table[tind].refcnt) + free(ctx->qp_table[tind].table); + else + ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL; +} + +static int hns_roce_v1_poll_one(struct hns_roce_cq *cq, + struct hns_roce_qp **cur_qp, struct ibv_wc *wc) +{ + uint32_t qpn; + int is_send; + uint16_t wqe_ctr; + uint32_t local_qpn; + struct hns_roce_wq *wq = NULL; + struct hns_roce_cqe *cqe = NULL; + struct hns_roce_wqe_ctrl_seg *sq_wqe = NULL; + + /* According to CI, find the relative cqe */ + cqe = next_cqe_sw(cq); + if (!cqe) + return CQ_EMPTY; + + /* Get the next cqe, CI will be added gradually */ + ++cq->cons_index; + + udma_from_device_barrier(); + + qpn = roce_get_field(cqe->cqe_byte_16, CQE_BYTE_16_LOCAL_QPN_M, + CQE_BYTE_16_LOCAL_QPN_S); + + is_send = (roce_get_bit(cqe->cqe_byte_4, CQE_BYTE_4_SQ_RQ_FLAG_S) == + HNS_ROCE_CQE_IS_SQ); + + local_qpn = roce_get_field(cqe->cqe_byte_16, CQE_BYTE_16_LOCAL_QPN_M, + CQE_BYTE_16_LOCAL_QPN_S); + + /* if qp is zero, it will not get the 
correct qpn */
+	if (!*cur_qp ||
+	    (local_qpn & HNS_ROCE_CQE_QPN_MASK) != (*cur_qp)->ibv_qp.qp_num) {
+
+		*cur_qp = hns_roce_find_qp(to_hr_ctx(cq->ibv_cq.context),
+					   qpn & 0xffffff);
+		if (!*cur_qp) {
+			fprintf(stderr, PFX "can't find qp!\n");
+			return CQ_POLL_ERR;
+		}
+	}
+	wc->qp_num = qpn & 0xffffff;
+
+	if (is_send) {
+		wq = &(*cur_qp)->sq;
+		/*
+		 * If sq_signal_bits is set, first advance the tail pointer
+		 * to the wqe that this cqe refers to.
+		 */
+		if ((*cur_qp)->sq_signal_bits) {
+			wqe_ctr = (uint16_t)(roce_get_field(cqe->cqe_byte_4,
+						CQE_BYTE_4_WQE_INDEX_M,
+						CQE_BYTE_4_WQE_INDEX_S));
+			/*
+			 * wq->tail only ever increases; the 16-bit
+			 * difference wraps naturally on overflow, so
+			 * masking by (wqe_cnt - 1) gives the right advance.
+			 */
+			wq->tail += (wqe_ctr - (uint16_t) wq->tail) &
+				    (wq->wqe_cnt - 1);
+		}
+		/* write the wr_id of this wq entry into the wc */
+		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+		++wq->tail;
+	} else {
+		wq = &(*cur_qp)->rq;
+		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+		++wq->tail;
+	}
+
+	/*
+	 * The hardware maintains the completion status; if it generated an
+	 * error CQE, record the error type and return immediately.
+	 */
+	if (roce_get_field(cqe->cqe_byte_4,
+			   CQE_BYTE_4_STATUS_OF_THE_OPERATION_M,
+			   CQE_BYTE_4_STATUS_OF_THE_OPERATION_S) !=
+	    HNS_ROCE_CQE_SUCCESS) {
+		hns_roce_handle_error_cqe(cqe, wc);
+		return CQ_OK;
+	}
+	wc->status = IBV_WC_SUCCESS;
+
+	/*
+	 * Fill in wc->opcode and the related fields according to the
+	 * opcode carried in the cqe.
+	 */
+	if (is_send) {
+		/* Fetch opcode and flags from the send wqe the cqe indexes */
+		sq_wqe = (struct hns_roce_wqe_ctrl_seg *)
+			 get_send_wqe(*cur_qp, roce_get_field(cqe->cqe_byte_4,
+						CQE_BYTE_4_WQE_INDEX_M,
+						CQE_BYTE_4_WQE_INDEX_S));
+		switch (le32toh(sq_wqe->flag) & HNS_ROCE_WQE_OPCODE_MASK) {
+		case HNS_ROCE_WQE_OPCODE_SEND:
+			wc->opcode = IBV_WC_SEND;
+			break;
+		case HNS_ROCE_WQE_OPCODE_RDMA_READ:
+			wc->opcode = IBV_WC_RDMA_READ;
+			wc->byte_len = le32toh(cqe->byte_cnt);
+			break;
+		case HNS_ROCE_WQE_OPCODE_RDMA_WRITE:
+			wc->opcode = IBV_WC_RDMA_WRITE;
+			break;
+		case HNS_ROCE_WQE_OPCODE_BIND_MW2:
+			wc->opcode = IBV_WC_BIND_MW;
+			break;
+		default:
+			wc->status = IBV_WC_GENERAL_ERR;
+			break;
+		}
+		wc->wc_flags = (le32toh(sq_wqe->flag) & HNS_ROCE_WQE_IMM ?
+				IBV_WC_WITH_IMM : 0);
+	} else {
+		/* Receive completion (RQ or SRQ): opcode and flags come
+		 * from the cqe itself */
+		wc->byte_len = le32toh(cqe->byte_cnt);
+
+		switch (roce_get_field(cqe->cqe_byte_4,
+				       CQE_BYTE_4_OPERATION_TYPE_M,
+				       CQE_BYTE_4_OPERATION_TYPE_S) &
+			HNS_ROCE_CQE_OPCODE_MASK) {
+		case HNS_ROCE_OPCODE_RDMA_WITH_IMM_RECEIVE:
+			wc->opcode = IBV_WC_RECV_RDMA_WITH_IMM;
+			wc->wc_flags = IBV_WC_WITH_IMM;
+			wc->imm_data = htobe32(le32toh(cqe->immediate_data));
+			break;
+		case HNS_ROCE_OPCODE_SEND_DATA_RECEIVE:
+			if (roce_get_bit(cqe->cqe_byte_4,
+					 CQE_BYTE_4_IMMEDIATE_DATA_FLAG_S)) {
+				wc->opcode = IBV_WC_RECV;
+				wc->wc_flags = IBV_WC_WITH_IMM;
+				wc->imm_data =
+					htobe32(le32toh(cqe->immediate_data));
+			} else {
+				wc->opcode = IBV_WC_RECV;
+				wc->wc_flags = 0;
+			}
+			break;
+		default:
+			wc->status = IBV_WC_GENERAL_ERR;
+			break;
+		}
+	}
+
+	return CQ_OK;
+}
+
+static int hns_roce_u_v1_poll_cq(struct ibv_cq *ibvcq, int ne,
+				 struct ibv_wc *wc)
+{
+	int npolled;
+	int err = CQ_OK;
+	struct hns_roce_qp *qp = NULL;
+	struct hns_roce_cq *cq = to_hr_cq(ibvcq);
+	struct hns_roce_context *ctx = to_hr_ctx(ibvcq->context);
+	struct hns_roce_device *dev = to_hr_dev(ibvcq->context->device);
+
+	pthread_spin_lock(&cq->lock);
+
+	for (npolled = 0; npolled < ne; ++npolled) {
+		err = hns_roce_v1_poll_one(cq, &qp, wc + npolled);
+		if (err != CQ_OK)
+			break;
+	}
+
+	if (npolled) {
+		if (dev->hw_version == HNS_ROCE_HW_VER1) {
+			*cq->set_ci_db = (cq->cons_index &
+					  ((cq->cq_depth << 1) - 1));
+			mmio_ordered_writes_hack();
+		}
+
+		hns_roce_update_cq_cons_index(ctx, cq);
+	}
+
+	pthread_spin_unlock(&cq->lock);
+
+	return err == CQ_POLL_ERR ? err : npolled;
+}
+
+/**
+ * hns_roce_u_v1_arm_cq - request completion notification on a CQ
+ * @ibvcq: The completion queue to request notification for.
+ * @solicited: If non-zero, an event will be generated only for
+ *	       the next solicited CQ entry. If zero, any CQ entry,
+ *	       solicited or not, will generate an event.
+ */
+static int hns_roce_u_v1_arm_cq(struct ibv_cq *ibvcq, int solicited)
+{
+	uint32_t ci;
+	uint32_t solicited_flag;
+	struct hns_roce_cq_db cq_db = {};
+	struct hns_roce_cq *cq = to_hr_cq(ibvcq);
+
+	ci = cq->cons_index & ((cq->cq_depth << 1) - 1);
+	solicited_flag = solicited ? HNS_ROCE_CQ_DB_REQ_SOL :
+				     HNS_ROCE_CQ_DB_REQ_NEXT;
+	roce_set_bit(cq_db.u32_8, CQ_DB_U32_8_HW_SYNC_S, 1);
+	roce_set_field(cq_db.u32_8, CQ_DB_U32_8_CMD_M, CQ_DB_U32_8_CMD_S, 3);
+	roce_set_field(cq_db.u32_8, CQ_DB_U32_8_CMD_MDF_M,
+		       CQ_DB_U32_8_CMD_MDF_S, 1);
+	roce_set_bit(cq_db.u32_8, CQ_DB_U32_8_NOTIFY_TYPE_S, solicited_flag);
+	roce_set_field(cq_db.u32_8, CQ_DB_U32_8_CQN_M, CQ_DB_U32_8_CQN_S,
+		       cq->cqn);
+	roce_set_field(cq_db.u32_4, CQ_DB_U32_4_CONS_IDX_M,
+		       CQ_DB_U32_4_CONS_IDX_S, ci);
+
+	hns_roce_write64((uint32_t *)&cq_db, to_hr_ctx(ibvcq->context),
+			 ROCEE_DB_OTHERS_L_0_REG);
+	return 0;
+}
+
+static int hns_roce_u_v1_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr,
+				   struct ibv_send_wr **bad_wr)
+{
+	void *wqe;
+	int nreq;
+	int ps_opcode, i;
+	int ret = 0;
+	struct hns_roce_wqe_ctrl_seg *ctrl = NULL;
+	struct hns_roce_wqe_data_seg *dseg = NULL;
+	struct hns_roce_qp *qp = to_hr_qp(ibvqp);
+	struct hns_roce_context *ctx = to_hr_ctx(ibvqp->context);
+	unsigned int wqe_idx;
+
+	pthread_spin_lock(&qp->sq.lock);
+
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		if (hns_roce_wq_overflow(&qp->sq, nreq,
+					 to_hr_cq(qp->ibv_qp.send_cq))) {
+			ret = -1;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		/* the SQ index must be masked by the SQ depth, not the RQ's */
+		wqe_idx = (qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1);
+
+		if (wr->num_sge > qp->sq.max_gs) {
+			ret = -1;
+			*bad_wr = wr;
+			printf("wr->num_sge(<=%d) = %d, check failed!\r\n",
+			       qp->sq.max_gs, wr->num_sge);
+			goto out;
+		}
+
+		ctrl = wqe = get_send_wqe(qp, wqe_idx);
+		memset(ctrl, 0, sizeof(struct hns_roce_wqe_ctrl_seg));
+
+		qp->sq.wrid[wqe_idx] = wr->wr_id;
+		for (i = 0; i < wr->num_sge; i++)
+			ctrl->msg_length = htole32(le32toh(ctrl->msg_length) +
+						   wr->sg_list[i].length);
+
+		ctrl->flag |= htole32(((wr->send_flags & IBV_SEND_SIGNALED) ?
+				      HNS_ROCE_WQE_CQ_NOTIFY : 0) |
+				      (wr->send_flags & IBV_SEND_SOLICITED ?
+				      HNS_ROCE_WQE_SE : 0) |
+				      ((wr->opcode == IBV_WR_SEND_WITH_IMM ||
+				      wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM) ?
+				      HNS_ROCE_WQE_IMM : 0) |
+				      (wr->send_flags & IBV_SEND_FENCE ?
+				      HNS_ROCE_WQE_FENCE : 0));
+
+		if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
+		    wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
+			ctrl->imm_data = htole32(be32toh(wr->imm_data));
+
+		wqe += sizeof(struct hns_roce_wqe_ctrl_seg);
+
+		/* set remote addr segment */
+		switch (ibvqp->qp_type) {
+		case IBV_QPT_RC:
+			switch (wr->opcode) {
+			case IBV_WR_RDMA_READ:
+				ps_opcode = HNS_ROCE_WQE_OPCODE_RDMA_READ;
+				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+					      wr->wr.rdma.rkey);
+				break;
+			case IBV_WR_RDMA_WRITE:
+			case IBV_WR_RDMA_WRITE_WITH_IMM:
+				ps_opcode = HNS_ROCE_WQE_OPCODE_RDMA_WRITE;
+				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+					      wr->wr.rdma.rkey);
+				break;
+			case IBV_WR_SEND:
+			case IBV_WR_SEND_WITH_IMM:
+				ps_opcode = HNS_ROCE_WQE_OPCODE_SEND;
+				break;
+			case IBV_WR_ATOMIC_CMP_AND_SWP:
+			case IBV_WR_ATOMIC_FETCH_AND_ADD:
+			default:
+				ps_opcode = HNS_ROCE_WQE_OPCODE_MASK;
+				break;
+			}
+			ctrl->flag |= htole32(ps_opcode);
+			wqe += sizeof(struct hns_roce_wqe_raddr_seg);
+			break;
+		case IBV_QPT_UC:
+		case IBV_QPT_UD:
+		default:
+			break;
+		}
+
+		dseg = wqe;
+
+		/* Inline */
+		if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) {
+			if (le32toh(ctrl->msg_length) > qp->max_inline_data) {
+				ret = -1;
+				*bad_wr = wr;
+				printf("inline data len(1-32)=%d, send_flags = 0x%x, check failed!\r\n",
+				       le32toh(ctrl->msg_length),
+				       wr->send_flags);
+				/* bail out through the unlock path */
+				goto out;
+			}
+
+			for (i = 0; i < wr->num_sge; i++) {
+				memcpy(wqe,
+				       ((void *) (uintptr_t) wr->sg_list[i].addr),
+				       wr->sg_list[i].length);
+				wqe = wqe + wr->sg_list[i].length;
+			}
+
+			ctrl->flag |= htole32(HNS_ROCE_WQE_INLINE);
+		} else {
+			/* set sge */
+			for (i = 0; i < wr->num_sge; i++)
+				set_data_seg(dseg+i, wr->sg_list + i);
+
+			ctrl->flag |=
+			       htole32(wr->num_sge << HNS_ROCE_WQE_SGE_NUM_BIT);
+		}
+	}
+
+out:
+	/* ring the doorbell if any WQEs were queued */
+	if (likely(nreq)) {
+		qp->sq.head += nreq;
+
+		hns_roce_update_sq_head(ctx, qp->ibv_qp.qp_num,
+				qp->port_num - 1, qp->sl,
+				qp->sq.head & ((qp->sq.wqe_cnt << 1) - 1));
+	}
+
+	pthread_spin_unlock(&qp->sq.lock);
+
+	return ret;
+}
+
+static void __hns_roce_v1_cq_clean(struct hns_roce_cq *cq, uint32_t qpn,
+				   struct hns_roce_srq *srq)
+{
+	int nfreed = 0;
+	uint32_t prod_index;
+	uint8_t owner_bit = 0;
+	struct hns_roce_cqe *cqe, *dest;
+	struct hns_roce_context *ctx = to_hr_ctx(cq->ibv_cq.context);
+
+	for (prod_index = cq->cons_index; get_sw_cqe(cq, prod_index);
+	     ++prod_index)
+		if (prod_index == cq->cons_index + cq->ibv_cq.cqe)
+			break;
+
+	while ((int) --prod_index - (int) cq->cons_index >= 0) {
+		cqe = get_cqe(cq, prod_index & cq->ibv_cq.cqe);
+		if ((roce_get_field(cqe->cqe_byte_16, CQE_BYTE_16_LOCAL_QPN_M,
+				    CQE_BYTE_16_LOCAL_QPN_S) & 0xffffff) ==
+		    qpn) {
+			++nfreed;
+		} else if (nfreed) {
+			dest = get_cqe(cq,
+				       (prod_index + nfreed) & cq->ibv_cq.cqe);
+			owner_bit = roce_get_bit(dest->cqe_byte_4,
+						 CQE_BYTE_4_OWNER_S);
+			memcpy(dest, cqe, sizeof(*cqe));
+			roce_set_bit(dest->cqe_byte_4, CQE_BYTE_4_OWNER_S,
+				     owner_bit);
+		}
+	}
+
+	if (nfreed) {
+		cq->cons_index += nfreed;
+		udma_to_device_barrier();
+		hns_roce_update_cq_cons_index(ctx, cq);
+	}
+}
+
+static void hns_roce_v1_cq_clean(struct hns_roce_cq *cq, unsigned int qpn,
+				 struct hns_roce_srq *srq)
+{
+	pthread_spin_lock(&cq->lock);
+	__hns_roce_v1_cq_clean(cq, qpn, srq);
+	pthread_spin_unlock(&cq->lock);
+}
+
+static int hns_roce_u_v1_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
+				   int attr_mask)
+{
+	int ret;
+	struct ibv_modify_qp cmd = {};
+	struct hns_roce_qp *hr_qp = to_hr_qp(qp);
+
+	ret = ibv_cmd_modify_qp(qp, attr, attr_mask, &cmd, sizeof(cmd));
+
+	if (!ret && (attr_mask &
IBV_QP_STATE) &&
+	    attr->qp_state == IBV_QPS_RESET) {
+		hns_roce_v1_cq_clean(to_hr_cq(qp->recv_cq), qp->qp_num,
+				     qp->srq ? to_hr_srq(qp->srq) : NULL);
+		if (qp->send_cq != qp->recv_cq)
+			hns_roce_v1_cq_clean(to_hr_cq(qp->send_cq), qp->qp_num,
+					     NULL);
+
+		hns_roce_init_qp_indices(to_hr_qp(qp));
+	}
+
+	if (!ret && (attr_mask & IBV_QP_PORT))
+		hr_qp->port_num = attr->port_num;
+
+	if (!ret && (attr_mask & IBV_QP_AV))
+		hr_qp->sl = attr->ah_attr.sl;
+
+	return ret;
+}
+
+static void hns_roce_lock_cqs(struct ibv_qp *qp)
+{
+	struct hns_roce_cq *send_cq = to_hr_cq(qp->send_cq);
+	struct hns_roce_cq *recv_cq = to_hr_cq(qp->recv_cq);
+
+	if (send_cq == recv_cq) {
+		pthread_spin_lock(&send_cq->lock);
+	} else if (send_cq->cqn < recv_cq->cqn) {
+		pthread_spin_lock(&send_cq->lock);
+		pthread_spin_lock(&recv_cq->lock);
+	} else {
+		pthread_spin_lock(&recv_cq->lock);
+		pthread_spin_lock(&send_cq->lock);
+	}
+}
+
+static void hns_roce_unlock_cqs(struct ibv_qp *qp)
+{
+	struct hns_roce_cq *send_cq = to_hr_cq(qp->send_cq);
+	struct hns_roce_cq *recv_cq = to_hr_cq(qp->recv_cq);
+
+	if (send_cq == recv_cq) {
+		pthread_spin_unlock(&send_cq->lock);
+	} else if (send_cq->cqn < recv_cq->cqn) {
+		pthread_spin_unlock(&recv_cq->lock);
+		pthread_spin_unlock(&send_cq->lock);
+	} else {
+		pthread_spin_unlock(&send_cq->lock);
+		pthread_spin_unlock(&recv_cq->lock);
+	}
+}
+
+static int hns_roce_u_v1_destroy_qp(struct ibv_qp *ibqp)
+{
+	int ret;
+	struct hns_roce_qp *qp = to_hr_qp(ibqp);
+
+	pthread_mutex_lock(&to_hr_ctx(ibqp->context)->qp_table_mutex);
+	ret = ibv_cmd_destroy_qp(ibqp);
+	if (ret) {
+		pthread_mutex_unlock(&to_hr_ctx(ibqp->context)->qp_table_mutex);
+		return ret;
+	}
+
+	hns_roce_lock_cqs(ibqp);
+
+	__hns_roce_v1_cq_clean(to_hr_cq(ibqp->recv_cq), ibqp->qp_num,
+			       ibqp->srq ?
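+	/*
+	 * hns_roce_lock_cqs()/hns_roce_unlock_cqs() above always take the
+	 * two completion-queue spinlocks in ascending CQN order and release
+	 * them in the reverse order, so two threads tearing down QPs that
+	 * share the same pair of CQs cannot deadlock on them.
+	 */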
to_hr_srq(ibqp->srq) : NULL); + + if (ibqp->send_cq != ibqp->recv_cq) + __hns_roce_v1_cq_clean(to_hr_cq(ibqp->send_cq), ibqp->qp_num, + NULL); + + hns_roce_clear_qp(to_hr_ctx(ibqp->context), ibqp->qp_num); + + hns_roce_unlock_cqs(ibqp); + pthread_mutex_unlock(&to_hr_ctx(ibqp->context)->qp_table_mutex); + + free(qp->sq.wrid); + if (qp->rq.wqe_cnt) + free(qp->rq.wrid); + + hns_roce_free_buf(&qp->buf); + free(qp); + + return ret; +} + +static int hns_roce_u_v1_post_recv(struct ibv_qp *ibvqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + int ret = 0; + int nreq; + struct ibv_sge *sg; + struct hns_roce_rc_rq_wqe *rq_wqe; + struct hns_roce_qp *qp = to_hr_qp(ibvqp); + struct hns_roce_context *ctx = to_hr_ctx(ibvqp->context); + unsigned int wqe_idx; + + pthread_spin_lock(&qp->rq.lock); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (hns_roce_wq_overflow(&qp->rq, nreq, + to_hr_cq(qp->ibv_qp.recv_cq))) { + ret = -1; + *bad_wr = wr; + goto out; + } + + wqe_idx = (qp->rq.head + nreq) & (qp->rq.wqe_cnt - 1); + + if (wr->num_sge > qp->rq.max_gs) { + ret = -1; + *bad_wr = wr; + goto out; + } + + rq_wqe = get_recv_wqe(qp, wqe_idx); + if (wr->num_sge > HNS_ROCE_RC_RQ_WQE_MAX_SGE_NUM) { + ret = -1; + *bad_wr = wr; + goto out; + } + + if (wr->num_sge == HNS_ROCE_RC_RQ_WQE_MAX_SGE_NUM) { + roce_set_field(rq_wqe->u32_2, + RC_RQ_WQE_NUMBER_OF_DATA_SEG_M, + RC_RQ_WQE_NUMBER_OF_DATA_SEG_S, + HNS_ROCE_RC_RQ_WQE_MAX_SGE_NUM); + sg = wr->sg_list; + + rq_wqe->va0 = htole64(sg->addr); + rq_wqe->l_key0 = htole32(sg->lkey); + rq_wqe->length0 = htole32(sg->length); + + sg = wr->sg_list + 1; + + rq_wqe->va1 = htole64(sg->addr); + rq_wqe->l_key1 = htole32(sg->lkey); + rq_wqe->length1 = htole32(sg->length); + } else if (wr->num_sge == HNS_ROCE_RC_RQ_WQE_MAX_SGE_NUM - 1) { + roce_set_field(rq_wqe->u32_2, + RC_RQ_WQE_NUMBER_OF_DATA_SEG_M, + RC_RQ_WQE_NUMBER_OF_DATA_SEG_S, + HNS_ROCE_RC_RQ_WQE_MAX_SGE_NUM - 1); + sg = wr->sg_list; + + rq_wqe->va0 = htole64(sg->addr); + rq_wqe->l_key0 = htole32(sg->lkey); + rq_wqe->length0 = htole32(sg->length); + + } else if (wr->num_sge == HNS_ROCE_RC_RQ_WQE_MAX_SGE_NUM - 2) { + roce_set_field(rq_wqe->u32_2, + RC_RQ_WQE_NUMBER_OF_DATA_SEG_M, + RC_RQ_WQE_NUMBER_OF_DATA_SEG_S, + HNS_ROCE_RC_RQ_WQE_MAX_SGE_NUM - 2); + } + + qp->rq.wrid[wqe_idx] = wr->wr_id; + } + +out: + if (nreq) { + qp->rq.head += nreq; + + hns_roce_update_rq_head(ctx, qp->ibv_qp.qp_num, + qp->rq.head & ((qp->rq.wqe_cnt << 1) - 1)); + } + + pthread_spin_unlock(&qp->rq.lock); + + return ret; +} + +const struct hns_roce_u_hw hns_roce_u_hw_v1 = { + .hw_version = HNS_ROCE_HW_VER1, + .hw_ops = { + .poll_cq = hns_roce_u_v1_poll_cq, + .req_notify_cq = hns_roce_u_v1_arm_cq, + .post_send = hns_roce_u_v1_post_send, + .post_recv = hns_roce_u_v1_post_recv, + .modify_qp = hns_roce_u_v1_modify_qp, + .destroy_qp = hns_roce_u_v1_destroy_qp, + }, +}; diff --git a/providers/hns/hns_roce_u_hw_v1.h b/providers/hns/hns_roce_u_hw_v1.h new file mode 100644 index 0000000..bb7aec6 --- /dev/null +++ b/providers/hns/hns_roce_u_hw_v1.h @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2016 Hisilicon Limited. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _HNS_ROCE_U_HW_V1_H +#define _HNS_ROCE_U_HW_V1_H + +#include <stdint.h> +#include <linux/types.h> + +#define HNS_ROCE_CQ_DB_REQ_SOL 1 +#define HNS_ROCE_CQ_DB_REQ_NEXT 0 + +#define HNS_ROCE_CQE_IS_SQ 0 + +#define HNS_ROCE_RC_RQ_WQE_MAX_SGE_NUM 2 + +enum { + HNS_ROCE_WQE_INLINE = 1 << 31, + HNS_ROCE_WQE_SE = 1 << 30, + HNS_ROCE_WQE_SGE_NUM_BIT = 24, + HNS_ROCE_WQE_IMM = 1 << 23, + HNS_ROCE_WQE_FENCE = 1 << 21, + HNS_ROCE_WQE_CQ_NOTIFY = 1 << 20, + HNS_ROCE_WQE_OPCODE_SEND = 0 << 16, + HNS_ROCE_WQE_OPCODE_RDMA_READ = 1 << 16, + HNS_ROCE_WQE_OPCODE_RDMA_WRITE = 2 << 16, + HNS_ROCE_WQE_OPCODE_BIND_MW2 = 6 << 16, + HNS_ROCE_WQE_OPCODE_MASK = 15 << 16, +}; + +struct hns_roce_wqe_ctrl_seg { + __le32 sgl_pa_h; + __le32 flag; + __le32 imm_data; + __le32 msg_length; +}; + +struct hns_roce_wqe_data_seg { + __le64 addr; + __le32 lkey; + __le32 len; +}; + +struct hns_roce_wqe_raddr_seg { + __le32 rkey; + __le32 len; + __le64 raddr; +}; + +enum { + CQ_OK = 0, + CQ_EMPTY = -1, + CQ_POLL_ERR = -2, +}; + +enum { + HNS_ROCE_CQE_QPN_MASK = 0x3ffff, + HNS_ROCE_CQE_STATUS_MASK = 0x1f, + HNS_ROCE_CQE_OPCODE_MASK = 0xf, +}; + +enum { + HNS_ROCE_CQE_SUCCESS, + HNS_ROCE_CQE_SYNDROME_LOCAL_LENGTH_ERR, + HNS_ROCE_CQE_SYNDROME_LOCAL_QP_OP_ERR, + HNS_ROCE_CQE_SYNDROME_LOCAL_PROT_ERR, + HNS_ROCE_CQE_SYNDROME_WR_FLUSH_ERR, + HNS_ROCE_CQE_SYNDROME_MEM_MANAGE_OPERATE_ERR, + HNS_ROCE_CQE_SYNDROME_BAD_RESP_ERR, + HNS_ROCE_CQE_SYNDROME_LOCAL_ACCESS_ERR, + HNS_ROCE_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR, + HNS_ROCE_CQE_SYNDROME_REMOTE_ACCESS_ERR, + HNS_ROCE_CQE_SYNDROME_REMOTE_OP_ERR, + HNS_ROCE_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR, + HNS_ROCE_CQE_SYNDROME_RNR_RETRY_EXC_ERR, +}; + +struct hns_roce_cq_db { + __le32 u32_4; + __le32 u32_8; +}; +#define CQ_DB_U32_4_CONS_IDX_S 0 +#define CQ_DB_U32_4_CONS_IDX_M (((1UL << 16) - 1) << CQ_DB_U32_4_CONS_IDX_S) + +#define CQ_DB_U32_8_CQN_S 0 +#define CQ_DB_U32_8_CQN_M (((1UL << 16) - 1) << CQ_DB_U32_8_CQN_S) + +#define CQ_DB_U32_8_NOTIFY_TYPE_S 16 + +#define CQ_DB_U32_8_CMD_MDF_S 24 +#define CQ_DB_U32_8_CMD_MDF_M (((1UL << 4) - 1) << CQ_DB_U32_8_CMD_MDF_S) + +#define CQ_DB_U32_8_CMD_S 28 +#define CQ_DB_U32_8_CMD_M (((1UL << 3) - 1) << CQ_DB_U32_8_CMD_S) + +#define CQ_DB_U32_8_HW_SYNC_S 31 + +struct hns_roce_rq_db { + __le32 u32_4; + __le32 u32_8; +}; + 
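+/*
+ * Convention for the register/field macros below: each field is described
+ * by a shift (_S) and a mask (_M) derived from it.  roce_set_field() and
+ * roce_get_field() (defined in this library's common header) reduce to
+ * plain shift-and-mask operations; a sketch of what a field store amounts
+ * to:
+ *
+ *	val &= ~FIELD_M;
+ *	val |= (new << FIELD_S) & FIELD_M;
+ *
+ * so, for example, a 15-bit RQ head value lands in bits 0..14 of u32_4.
+ */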
+#define RQ_DB_U32_4_RQ_HEAD_S 0 +#define RQ_DB_U32_4_RQ_HEAD_M (((1UL << 15) - 1) << RQ_DB_U32_4_RQ_HEAD_S) + +#define RQ_DB_U32_8_QPN_S 0 +#define RQ_DB_U32_8_QPN_M (((1UL << 24) - 1) << RQ_DB_U32_8_QPN_S) + +#define RQ_DB_U32_8_CMD_S 28 +#define RQ_DB_U32_8_CMD_M (((1UL << 3) - 1) << RQ_DB_U32_8_CMD_S) + +#define RQ_DB_U32_8_HW_SYNC_S 31 + +struct hns_roce_sq_db { + __le32 u32_4; + __le32 u32_8; +}; + +#define SQ_DB_U32_4_SQ_HEAD_S 0 +#define SQ_DB_U32_4_SQ_HEAD_M (((1UL << 15) - 1) << SQ_DB_U32_4_SQ_HEAD_S) + +#define SQ_DB_U32_4_SL_S 16 +#define SQ_DB_U32_4_SL_M (((1UL << 2) - 1) << SQ_DB_U32_4_SL_S) + +#define SQ_DB_U32_4_PORT_S 18 +#define SQ_DB_U32_4_PORT_M (((1UL << 3) - 1) << SQ_DB_U32_4_PORT_S) + +#define SQ_DB_U32_4_DIRECT_WQE_S 31 + +#define SQ_DB_U32_8_QPN_S 0 +#define SQ_DB_U32_8_QPN_M (((1UL << 24) - 1) << SQ_DB_U32_8_QPN_S) + +#define SQ_DB_U32_8_HW_SYNC 31 + +struct hns_roce_cqe { + __le32 cqe_byte_4; + union { + __le32 r_key; + __le32 immediate_data; + }; + __le32 byte_cnt; + __le32 cqe_byte_16; + __le32 cqe_byte_20; + __le32 s_mac_l; + __le32 cqe_byte_28; + __le32 reserved; +}; +#define CQE_BYTE_4_OPERATION_TYPE_S 0 +#define CQE_BYTE_4_OPERATION_TYPE_M \ + (((1UL << 4) - 1) << CQE_BYTE_4_OPERATION_TYPE_S) + +#define CQE_BYTE_4_OWNER_S 7 + +#define CQE_BYTE_4_STATUS_OF_THE_OPERATION_S 8 +#define CQE_BYTE_4_STATUS_OF_THE_OPERATION_M \ + (((1UL << 5) - 1) << CQE_BYTE_4_STATUS_OF_THE_OPERATION_S) + +#define CQE_BYTE_4_SQ_RQ_FLAG_S 14 + +#define CQE_BYTE_4_IMMEDIATE_DATA_FLAG_S 15 + +#define CQE_BYTE_4_WQE_INDEX_S 16 +#define CQE_BYTE_4_WQE_INDEX_M (((1UL << 14) - 1) << CQE_BYTE_4_WQE_INDEX_S) + +#define CQE_BYTE_16_LOCAL_QPN_S 0 +#define CQE_BYTE_16_LOCAL_QPN_M (((1UL << 24) - 1) << CQE_BYTE_16_LOCAL_QPN_S) + +#define ROCEE_DB_SQ_L_0_REG 0x230 + +#define ROCEE_DB_OTHERS_L_0_REG 0x238 + +struct hns_roce_rc_send_wqe { + __le32 sgl_ba_31_0; + __le32 u32_1; + union { + __le32 r_key; + __le32 immediate_data; + }; + __le32 msg_length; + __le32 rvd_3; + __le32 rvd_4; + __le32 rvd_5; + __le32 rvd_6; + __le64 va0; + __le32 l_key0; + __le32 length0; + + __le64 va1; + __le32 l_key1; + __le32 length1; +}; + +struct hns_roce_rc_rq_wqe { + __le32 u32_0; + __le32 sgl_ba_31_0; + __le32 u32_2; + __le32 rvd_5; + __le32 rvd_6; + __le32 rvd_7; + __le32 rvd_8; + __le32 rvd_9; + + __le64 va0; + __le32 l_key0; + __le32 length0; + + __le64 va1; + __le32 l_key1; + __le32 length1; +}; +#define RC_RQ_WQE_NUMBER_OF_DATA_SEG_S 16 +#define RC_RQ_WQE_NUMBER_OF_DATA_SEG_M \ + (((1UL << 6) - 1) << RC_RQ_WQE_NUMBER_OF_DATA_SEG_S) + +#endif /* _HNS_ROCE_U_HW_V1_H */ diff --git a/providers/hns/hns_roce_u_hw_v2.c b/providers/hns/hns_roce_u_hw_v2.c new file mode 100644 index 0000000..be3490f --- /dev/null +++ b/providers/hns/hns_roce_u_hw_v2.c @@ -0,0 +1,1311 @@ +/* + * Copyright (c) 2016-2017 Hisilicon Limited. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <string.h> +#include "hns_roce_u.h" +#include "hns_roce_u_db.h" +#include "hns_roce_u_hw_v2.h" + +static void *get_send_sge_ex(struct hns_roce_qp *qp, int n); + +static void set_data_seg_v2(struct hns_roce_v2_wqe_data_seg *dseg, + struct ibv_sge *sg) +{ + dseg->lkey = htole32(sg->lkey); + dseg->addr = htole64(sg->addr); + dseg->len = htole32(sg->length); +} + +static void set_extend_atomic_seg(struct hns_roce_qp *qp, + unsigned int atomic_buf, + struct hns_roce_sge_info *sge_info, + void *buf) +{ + unsigned int sge_mask = qp->ex_sge.sge_cnt - 1; + int i; + + for (i = 0; i < atomic_buf; i++, sge_info->start_idx++) + memcpy(get_send_sge_ex(qp, sge_info->start_idx & sge_mask), + buf + i * HNS_ROCE_SGE_SIZE, HNS_ROCE_SGE_SIZE); +} + +static int set_atomic_seg(struct hns_roce_qp *qp, struct ibv_send_wr *wr, + unsigned int msg_len, void *dseg, + struct hns_roce_sge_info *sge_info) +{ + struct hns_roce_wqe_atomic_seg *aseg; + unsigned int ext_sg_num; + + aseg = dseg; + + if (msg_len == STANDARD_ATOMIC_U_BYTE_8) { + if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) { + aseg->fetchadd_swap_data = htole64(wr->wr.atomic.swap); + aseg->cmp_data = htole64(wr->wr.atomic.compare_add); + } else { + aseg->fetchadd_swap_data = + htole64(wr->wr.atomic.compare_add); + aseg->cmp_data = 0; + } + } else if (msg_len == EXTEND_ATOMIC_U_BYTE_16 || + msg_len == EXTEND_ATOMIC_U_BYTE_32 || + msg_len == EXTEND_ATOMIC_U_BYTE_64) { + ext_sg_num = msg_len * DATA_TYPE_NUM >> HNS_ROCE_SGE_SHIFT; + aseg->fetchadd_swap_data = 0; + aseg->cmp_data = 0; + if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) { + if (!wr->wr.atomic.swap || !wr->wr.atomic.compare_add) + return EINVAL; + + set_extend_atomic_seg(qp, ext_sg_num / DATA_TYPE_NUM, + sge_info, + (void *) (uintptr_t) wr->wr.atomic.swap); + set_extend_atomic_seg(qp, ext_sg_num / DATA_TYPE_NUM, + sge_info, + (void *) (uintptr_t) wr->wr.atomic.compare_add); + } else { + uint8_t buf[EXTEND_ATOMIC_U_BYTE_64] = {}; + + if (!wr->wr.atomic.compare_add) + return EINVAL; + + set_extend_atomic_seg(qp, ext_sg_num / DATA_TYPE_NUM, + sge_info, + (void *) (uintptr_t) wr->wr.atomic.compare_add); + set_extend_atomic_seg(qp, ext_sg_num / DATA_TYPE_NUM, + sge_info, buf); + } + } else + return EINVAL; + + return 0; +} + +static void hns_roce_v2_handle_error_cqe(struct hns_roce_v2_cqe *cqe, + struct ibv_wc *wc) +{ + unsigned int status = roce_get_field(cqe->byte_4, CQE_BYTE_4_STATUS_M, + CQE_BYTE_4_STATUS_S); + unsigned int cqe_status = status & HNS_ROCE_V2_CQE_STATUS_MASK; + + switch (cqe_status) { + case HNS_ROCE_V2_CQE_LOCAL_LENGTH_ERR: + wc->status = IBV_WC_LOC_LEN_ERR; + break; + case HNS_ROCE_V2_CQE_LOCAL_QP_OP_ERR: + wc->status = IBV_WC_LOC_QP_OP_ERR; + break; + case HNS_ROCE_V2_CQE_LOCAL_PROT_ERR: + wc->status = IBV_WC_LOC_PROT_ERR; 
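+		/*
+		 * The cases in this switch map the hardware status codes
+		 * defined as HNS_ROCE_V2_CQE_* in hns_roce_u_hw_v2.h onto
+		 * the corresponding ibv_wc_status values; anything
+		 * unrecognised falls through to IBV_WC_GENERAL_ERR.
+		 */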
+ break; + case HNS_ROCE_V2_CQE_WR_FLUSH_ERR: + wc->status = IBV_WC_WR_FLUSH_ERR; + break; + case HNS_ROCE_V2_CQE_MEM_MANAGERENT_OP_ERR: + wc->status = IBV_WC_MW_BIND_ERR; + break; + case HNS_ROCE_V2_CQE_BAD_RESP_ERR: + wc->status = IBV_WC_BAD_RESP_ERR; + break; + case HNS_ROCE_V2_CQE_LOCAL_ACCESS_ERR: + wc->status = IBV_WC_LOC_ACCESS_ERR; + break; + case HNS_ROCE_V2_CQE_REMOTE_INVAL_REQ_ERR: + wc->status = IBV_WC_REM_INV_REQ_ERR; + break; + case HNS_ROCE_V2_CQE_REMOTE_ACCESS_ERR: + wc->status = IBV_WC_REM_ACCESS_ERR; + break; + case HNS_ROCE_V2_CQE_REMOTE_OP_ERR: + wc->status = IBV_WC_REM_OP_ERR; + break; + case HNS_ROCE_V2_CQE_TRANSPORT_RETRY_EXC_ERR: + wc->status = IBV_WC_RETRY_EXC_ERR; + break; + case HNS_ROCE_V2_CQE_RNR_RETRY_EXC_ERR: + wc->status = IBV_WC_RNR_RETRY_EXC_ERR; + break; + case HNS_ROCE_V2_CQE_REMOTE_ABORTED_ERR: + wc->status = IBV_WC_REM_ABORT_ERR; + break; + default: + wc->status = IBV_WC_GENERAL_ERR; + break; + } +} + +static struct hns_roce_v2_cqe *get_cqe_v2(struct hns_roce_cq *cq, int entry) +{ + return cq->buf.buf + entry * HNS_ROCE_CQE_ENTRY_SIZE; +} + +static void *get_sw_cqe_v2(struct hns_roce_cq *cq, int n) +{ + struct hns_roce_v2_cqe *cqe = get_cqe_v2(cq, n & cq->ibv_cq.cqe); + + return (!!(roce_get_bit(cqe->byte_4, CQE_BYTE_4_OWNER_S)) ^ + !!(n & (cq->ibv_cq.cqe + 1))) ? cqe : NULL; +} + +static struct hns_roce_v2_cqe *next_cqe_sw_v2(struct hns_roce_cq *cq) +{ + return get_sw_cqe_v2(cq, cq->cons_index); +} + +static void *get_recv_wqe_v2(struct hns_roce_qp *qp, int n) +{ + if ((n < 0) || (n > qp->rq.wqe_cnt)) { + printf("rq wqe index:%d,rq wqe cnt:%d\r\n", n, qp->rq.wqe_cnt); + return NULL; + } + + return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift); +} + +static void *get_send_wqe(struct hns_roce_qp *qp, int n) +{ + return qp->buf.buf + qp->sq.offset + (n << qp->sq.wqe_shift); +} + +static void *get_send_sge_ex(struct hns_roce_qp *qp, int n) +{ + return qp->buf.buf + qp->ex_sge.offset + (n << qp->ex_sge.sge_shift); +} + +static void *get_srq_wqe(struct hns_roce_srq *srq, int n) +{ + return srq->buf.buf + (n << srq->wqe_shift); +} + +static void hns_roce_free_srq_wqe(struct hns_roce_srq *srq, uint16_t ind) +{ + uint32_t bitmap_num; + int bit_num; + + pthread_spin_lock(&srq->lock); + + bitmap_num = ind / BIT_CNT_PER_U64; + bit_num = ind % BIT_CNT_PER_U64; + srq->idx_que.bitmap[bitmap_num] |= (1ULL << bit_num); + srq->tail++; + + pthread_spin_unlock(&srq->lock); +} + +static int hns_roce_v2_wq_overflow(struct hns_roce_wq *wq, int nreq, + struct hns_roce_cq *cq) +{ + unsigned int cur; + + cur = wq->head - wq->tail; + if (cur + nreq < wq->max_post) + return 0; + + pthread_spin_lock(&cq->lock); + cur = wq->head - wq->tail; + pthread_spin_unlock(&cq->lock); + + return cur + nreq >= wq->max_post; +} + +static void hns_roce_update_rq_db(struct hns_roce_context *ctx, + unsigned int qpn, unsigned int rq_head) +{ + struct hns_roce_db rq_db = {}; + + roce_set_field(rq_db.byte_4, DB_BYTE_4_TAG_M, DB_BYTE_4_TAG_S, qpn); + roce_set_field(rq_db.byte_4, DB_BYTE_4_CMD_M, DB_BYTE_4_CMD_S, + HNS_ROCE_V2_RQ_DB); + roce_set_field(rq_db.parameter, DB_PARAM_RQ_PRODUCER_IDX_M, + DB_PARAM_RQ_PRODUCER_IDX_S, rq_head); + + udma_to_device_barrier(); + + hns_roce_write64((uint32_t *)&rq_db, ctx, ROCEE_VF_DB_CFG0_OFFSET); +} + +static void hns_roce_update_sq_db(struct hns_roce_context *ctx, + unsigned int qpn, unsigned int sl, + unsigned int sq_head) +{ + struct hns_roce_db sq_db = {}; + + /* cmd: 0 sq db; 1 rq db; 2; 2 srq db; 3 cq db ptr; 4 cq db ntr */ + 
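+	/*
+	 * In other words: byte_4 of a doorbell holds the command (0 SQ,
+	 * 1 RQ, 2 SRQ, 3 CQ consumer index, 4 CQ notify, matching the
+	 * HNS_ROCE_V2_*_DB enum) together with the QPN/CQN tag, while
+	 * parameter carries the producer or consumer index and, for the SQ,
+	 * the service level.
+	 */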
roce_set_field(sq_db.byte_4, DB_BYTE_4_CMD_M, DB_BYTE_4_CMD_S, + HNS_ROCE_V2_SQ_DB); + roce_set_field(sq_db.byte_4, DB_BYTE_4_TAG_M, DB_BYTE_4_TAG_S, qpn); + + roce_set_field(sq_db.parameter, DB_PARAM_SQ_PRODUCER_IDX_M, + DB_PARAM_SQ_PRODUCER_IDX_S, sq_head); + roce_set_field(sq_db.parameter, DB_PARAM_SL_M, DB_PARAM_SL_S, sl); + + udma_to_device_barrier(); + + hns_roce_write64((uint32_t *)&sq_db, ctx, ROCEE_VF_DB_CFG0_OFFSET); +} + +static void hns_roce_v2_update_cq_cons_index(struct hns_roce_context *ctx, + struct hns_roce_cq *cq) +{ + struct hns_roce_db cq_db = {}; + + roce_set_field(cq_db.byte_4, DB_BYTE_4_TAG_M, DB_BYTE_4_TAG_S, cq->cqn); + roce_set_field(cq_db.byte_4, DB_BYTE_4_CMD_M, DB_BYTE_4_CMD_S, + HNS_ROCE_V2_CQ_DB_PTR); + + roce_set_field(cq_db.parameter, DB_PARAM_CQ_CONSUMER_IDX_M, + DB_PARAM_CQ_CONSUMER_IDX_S, + cq->cons_index & ((cq->cq_depth << 1) - 1)); + roce_set_field(cq_db.parameter, DB_PARAM_CQ_CMD_SN_M, + DB_PARAM_CQ_CMD_SN_S, 1); + roce_set_bit(cq_db.parameter, DB_PARAM_CQ_NOTIFY_S, 0); + + hns_roce_write64((uint32_t *)&cq_db, ctx, ROCEE_VF_DB_CFG0_OFFSET); +} + +static struct hns_roce_qp *hns_roce_v2_find_qp(struct hns_roce_context *ctx, + uint32_t qpn) +{ + int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift; + + if (ctx->qp_table[tind].refcnt) + return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask]; + else + return NULL; +} + +static void hns_roce_v2_clear_qp(struct hns_roce_context *ctx, uint32_t qpn) +{ + int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift; + + if (!--ctx->qp_table[tind].refcnt) + free(ctx->qp_table[tind].table); + else + ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL; +} + +static int hns_roce_u_v2_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask); + +static int hns_roce_flush_cqe(struct hns_roce_qp **cur_qp, struct ibv_wc *wc) +{ + struct ibv_qp_attr attr; + int attr_mask; + int ret; + + if ((wc->status != IBV_WC_SUCCESS) && + (wc->status != IBV_WC_WR_FLUSH_ERR)) { + attr_mask = IBV_QP_STATE; + attr.qp_state = IBV_QPS_ERR; + ret = hns_roce_u_v2_modify_qp(&(*cur_qp)->ibv_qp, + &attr, attr_mask); + if (ret) + fprintf(stderr, PFX "failed to modify qp!\n"); + + (*cur_qp)->ibv_qp.state = IBV_QPS_ERR; + } + + return V2_CQ_OK; +} + +static void hns_roce_v2_get_opcode_from_sender(struct hns_roce_v2_cqe *cqe, + struct ibv_wc *wc) +{ + /* Get opcode and flag before update the tail point for send */ + switch (roce_get_field(cqe->byte_4, CQE_BYTE_4_OPCODE_M, + CQE_BYTE_4_OPCODE_S) & HNS_ROCE_V2_CQE_OPCODE_MASK) { + case HNS_ROCE_SQ_OP_SEND: + wc->opcode = IBV_WC_SEND; + wc->wc_flags = 0; + break; + case HNS_ROCE_SQ_OP_SEND_WITH_IMM: + wc->opcode = IBV_WC_SEND; + wc->wc_flags = IBV_WC_WITH_IMM; + break; + case HNS_ROCE_SQ_OP_SEND_WITH_INV: + wc->opcode = IBV_WC_SEND; + break; + case HNS_ROCE_SQ_OP_RDMA_READ: + wc->opcode = IBV_WC_RDMA_READ; + wc->byte_len = le32toh(cqe->byte_cnt); + wc->wc_flags = 0; + break; + case HNS_ROCE_SQ_OP_RDMA_WRITE: + wc->opcode = IBV_WC_RDMA_WRITE; + wc->wc_flags = 0; + break; + + case HNS_ROCE_SQ_OP_RDMA_WRITE_WITH_IMM: + wc->opcode = IBV_WC_RDMA_WRITE; + wc->wc_flags = IBV_WC_WITH_IMM; + break; + case HNS_ROCE_SQ_OP_LOCAL_INV: + wc->opcode = IBV_WC_LOCAL_INV; + wc->wc_flags = IBV_WC_WITH_INV; + break; + case HNS_ROCE_SQ_OP_ATOMIC_COMP_AND_SWAP: + wc->opcode = IBV_WC_COMP_SWAP; + wc->byte_len = le32toh(cqe->byte_cnt); + wc->wc_flags = 0; + break; + case HNS_ROCE_SQ_OP_ATOMIC_FETCH_AND_ADD: + wc->opcode = IBV_WC_FETCH_ADD; + wc->byte_len = le32toh(cqe->byte_cnt); + 
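+		/*
+		 * As with RDMA read and compare-and-swap above, only
+		 * operations that return data report byte_len on the
+		 * requester side; the plain send opcodes leave it untouched.
+		 */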
wc->wc_flags = 0; + break; + case HNS_ROCE_SQ_OP_BIND_MW: + wc->opcode = IBV_WC_BIND_MW; + wc->wc_flags = 0; + break; + default: + wc->status = IBV_WC_GENERAL_ERR; + wc->wc_flags = 0; + break; + } +} + +static void hns_roce_v2_get_opcode_from_receiver(struct hns_roce_v2_cqe *cqe, + struct ibv_wc *wc, + uint32_t opcode) +{ + switch (opcode) { + case HNS_ROCE_RECV_OP_RDMA_WRITE_IMM: + wc->opcode = IBV_WC_RECV_RDMA_WITH_IMM; + wc->wc_flags = IBV_WC_WITH_IMM; + wc->imm_data = htobe32(le32toh(cqe->immtdata)); + break; + case HNS_ROCE_RECV_OP_SEND: + wc->opcode = IBV_WC_RECV; + wc->wc_flags = 0; + break; + case HNS_ROCE_RECV_OP_SEND_WITH_IMM: + wc->opcode = IBV_WC_RECV; + wc->wc_flags = IBV_WC_WITH_IMM; + wc->imm_data = htobe32(le32toh(cqe->immtdata)); + break; + case HNS_ROCE_RECV_OP_SEND_WITH_INV: + wc->opcode = IBV_WC_RECV; + wc->wc_flags = IBV_WC_WITH_INV; + wc->invalidated_rkey = le32toh(cqe->rkey); + break; + default: + wc->status = IBV_WC_GENERAL_ERR; + break; + } +} + +static int hns_roce_handle_recv_inl_wqe(struct hns_roce_v2_cqe *cqe, + struct hns_roce_qp **cur_qp, + struct ibv_wc *wc, uint32_t opcode) +{ + if (((*cur_qp)->ibv_qp.qp_type == IBV_QPT_RC || + (*cur_qp)->ibv_qp.qp_type == IBV_QPT_UC) && + (opcode == HNS_ROCE_RECV_OP_SEND || + opcode == HNS_ROCE_RECV_OP_SEND_WITH_IMM || + opcode == HNS_ROCE_RECV_OP_SEND_WITH_INV) && + (roce_get_bit(cqe->byte_4, CQE_BYTE_4_RQ_INLINE_S))) { + struct hns_roce_rinl_sge *sge_list; + uint32_t wr_num, wr_cnt, sge_num, data_len; + uint8_t *wqe_buf; + uint32_t sge_cnt, size; + + wr_num = (uint16_t)roce_get_field(cqe->byte_4, + CQE_BYTE_4_WQE_IDX_M, + CQE_BYTE_4_WQE_IDX_S) & 0xffff; + wr_cnt = wr_num & ((*cur_qp)->rq.wqe_cnt - 1); + + sge_list = (*cur_qp)->rq_rinl_buf.wqe_list[wr_cnt].sg_list; + sge_num = (*cur_qp)->rq_rinl_buf.wqe_list[wr_cnt].sge_cnt; + wqe_buf = (uint8_t *)get_recv_wqe_v2(*cur_qp, wr_cnt); + if (!wqe_buf) + return V2_CQ_POLL_ERR; + + data_len = wc->byte_len; + + for (sge_cnt = 0; (sge_cnt < sge_num) && (data_len); + sge_cnt++) { + size = sge_list[sge_cnt].len < data_len ? + sge_list[sge_cnt].len : data_len; + + memcpy((void *)sge_list[sge_cnt].addr, + (void *)wqe_buf, size); + data_len -= size; + wqe_buf += size; + } + + if (data_len) { + wc->status = IBV_WC_LOC_LEN_ERR; + return V2_CQ_POLL_ERR; + } + } + + return V2_CQ_OK; +} + +static int hns_roce_v2_poll_one(struct hns_roce_cq *cq, + struct hns_roce_qp **cur_qp, struct ibv_wc *wc) +{ + uint32_t qpn; + int is_send; + uint16_t wqe_ctr; + struct hns_roce_wq *wq = NULL; + struct hns_roce_v2_cqe *cqe; + struct hns_roce_srq *srq; + uint32_t opcode; + int ret; + + /* According to CI, find the relative cqe */ + cqe = next_cqe_sw_v2(cq); + if (!cqe) + return V2_CQ_EMPTY; + + /* Get the next cqe, CI will be added gradually */ + ++cq->cons_index; + + udma_from_device_barrier(); + + qpn = roce_get_field(cqe->byte_16, CQE_BYTE_16_LCL_QPN_M, + CQE_BYTE_16_LCL_QPN_S); + + is_send = (roce_get_bit(cqe->byte_4, CQE_BYTE_4_S_R_S) == + HNS_ROCE_V2_CQE_IS_SQ); + + /* if qp is zero, it will not get the correct qpn */ + if (!*cur_qp || qpn != (*cur_qp)->ibv_qp.qp_num) { + *cur_qp = hns_roce_v2_find_qp(to_hr_ctx(cq->ibv_cq.context), + qpn); + if (!*cur_qp) { + fprintf(stderr, PFX "can't find qp!\n"); + return V2_CQ_POLL_ERR; + } + } + wc->qp_num = qpn; + + srq = (*cur_qp)->ibv_qp.srq ? 
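+	/*
+	 * If the RQ-inline bit is set in the CQE,
+	 * hns_roce_handle_recv_inl_wqe() (defined above, called once the
+	 * opcode is known) copies the payload, which hardware deposited
+	 * directly in the receive WQE buffer, back out to the user buffers
+	 * recorded in rq_rinl_buf at post_recv time.
+	 */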
to_hr_srq((*cur_qp)->ibv_qp.srq) : NULL; + if (is_send) { + wq = &(*cur_qp)->sq; + /* + * if sq_signal_bits is 1, the tail pointer first update to + * the wqe corresponding the current cqe + */ + if ((*cur_qp)->sq_signal_bits) { + wqe_ctr = (uint16_t)(roce_get_field(cqe->byte_4, + CQE_BYTE_4_WQE_IDX_M, + CQE_BYTE_4_WQE_IDX_S)); + /* + * wq->tail will plus a positive number every time, + * when wq->tail exceeds 32b, it is 0 and acc + */ + wq->tail += (wqe_ctr - (uint16_t) wq->tail) & + (wq->wqe_cnt - 1); + } + /* write the wr_id of wq into the wc */ + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } else if (srq) { + wqe_ctr = (uint16_t)(roce_get_field(cqe->byte_4, + CQE_BYTE_4_WQE_IDX_M, + CQE_BYTE_4_WQE_IDX_S)); + wc->wr_id = srq->wrid[wqe_ctr & (srq->max_wqe - 1)]; + hns_roce_free_srq_wqe(srq, wqe_ctr); + } else { + wq = &(*cur_qp)->rq; + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } + + /* + * HW maintains wc status, set the err type and directly return, after + * generated the incorrect CQE + */ + if (roce_get_field(cqe->byte_4, CQE_BYTE_4_STATUS_M, + CQE_BYTE_4_STATUS_S) != HNS_ROCE_V2_CQE_SUCCESS) { + hns_roce_v2_handle_error_cqe(cqe, wc); + return hns_roce_flush_cqe(cur_qp, wc); + } + + wc->status = IBV_WC_SUCCESS; + + /* + * According to the opcode type of cqe, mark the opcode and other + * information of wc + */ + if (is_send) { + hns_roce_v2_get_opcode_from_sender(cqe, wc); + } else { + /* Get opcode and flag in rq&srq */ + wc->byte_len = le32toh(cqe->byte_cnt); + opcode = roce_get_field(cqe->byte_4, CQE_BYTE_4_OPCODE_M, + CQE_BYTE_4_OPCODE_S) & HNS_ROCE_V2_CQE_OPCODE_MASK; + hns_roce_v2_get_opcode_from_receiver(cqe, wc, opcode); + + ret = hns_roce_handle_recv_inl_wqe(cqe, cur_qp, wc, opcode); + if (ret) { + fprintf(stderr, + PFX "failed to handle recv inline wqe!\n"); + return ret; + } + + wc->sl = (uint8_t)roce_get_field(cqe->byte_32, CQE_BYTE_32_SL_M, + CQE_BYTE_32_SL_S); + wc->src_qp = roce_get_field(cqe->byte_32, CQE_BYTE_32_RMT_QPN_M, + CQE_BYTE_32_RMT_QPN_S); + wc->slid = 0; + wc->wc_flags |= roce_get_bit(cqe->byte_32, CQE_BYTE_32_GRH_S) ? + IBV_WC_GRH : 0; + wc->pkey_index = 0; + } + + return V2_CQ_OK; +} + +static int hns_roce_u_v2_poll_cq(struct ibv_cq *ibvcq, int ne, + struct ibv_wc *wc) +{ + int npolled; + int err = V2_CQ_OK; + struct hns_roce_qp *qp = NULL; + struct hns_roce_cq *cq = to_hr_cq(ibvcq); + struct hns_roce_context *ctx = to_hr_ctx(ibvcq->context); + + pthread_spin_lock(&cq->lock); + + for (npolled = 0; npolled < ne; ++npolled) { + err = hns_roce_v2_poll_one(cq, &qp, wc + npolled); + if (err != V2_CQ_OK) + break; + } + + if (npolled || err == V2_CQ_POLL_ERR) { + mmio_ordered_writes_hack(); + + if (cq->flags & HNS_ROCE_SUPPORT_CQ_RECORD_DB) + *cq->set_ci_db = (unsigned int)(cq->cons_index & + ((cq->cq_depth << 1) - 1)); + else + hns_roce_v2_update_cq_cons_index(ctx, cq); + } + + pthread_spin_unlock(&cq->lock); + + return err == V2_CQ_POLL_ERR ? err : npolled; +} + +static int hns_roce_u_v2_arm_cq(struct ibv_cq *ibvcq, int solicited) +{ + uint32_t ci; + uint32_t cmd_sn; + uint32_t solicited_flag; + struct hns_roce_db cq_db = {}; + struct hns_roce_cq *cq = to_hr_cq(ibvcq); + + ci = cq->cons_index & ((cq->cq_depth << 1) - 1); + cmd_sn = cq->arm_sn & HNS_ROCE_CMDSN_MASK; + solicited_flag = solicited ? 
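+	/*
+	 * cmd_sn is taken from cq->arm_sn, which hns_roce_u_cq_event()
+	 * increments on every completion event, so a fresh arm request is
+	 * never mistaken for a stale duplicate of the previous one;
+	 * solicited_flag selects between "notify only on solicited
+	 * completions" and "notify on the next completion".
+	 */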
HNS_ROCE_V2_CQ_DB_REQ_SOL : + HNS_ROCE_V2_CQ_DB_REQ_NEXT; + + roce_set_field(cq_db.byte_4, DB_BYTE_4_TAG_M, DB_BYTE_4_TAG_S, cq->cqn); + roce_set_field(cq_db.byte_4, DB_BYTE_4_CMD_M, DB_BYTE_4_CMD_S, + HNS_ROCE_V2_CQ_DB_NTR); + + roce_set_field(cq_db.parameter, DB_PARAM_CQ_CONSUMER_IDX_M, + DB_PARAM_CQ_CONSUMER_IDX_S, ci); + + roce_set_field(cq_db.parameter, DB_PARAM_CQ_CMD_SN_M, + DB_PARAM_CQ_CMD_SN_S, cmd_sn); + roce_set_bit(cq_db.parameter, DB_PARAM_CQ_NOTIFY_S, solicited_flag); + + hns_roce_write64((uint32_t *)&cq_db, to_hr_ctx(ibvcq->context), + ROCEE_VF_DB_CFG0_OFFSET); + return 0; +} + +static void set_sge(struct hns_roce_v2_wqe_data_seg *dseg, + struct hns_roce_qp *qp, struct ibv_send_wr *wr, + struct hns_roce_sge_info *sge_info) +{ + int i; + + sge_info->valid_num = 0; + sge_info->total_len = 0; + + for (i = 0; i < wr->num_sge; i++) { + if (unlikely(!wr->sg_list[i].length)) + continue; + + sge_info->total_len += wr->sg_list[i].length; + sge_info->valid_num++; + + /* No inner sge in UD wqe */ + if (sge_info->valid_num <= HNS_ROCE_SGE_IN_WQE && + qp->ibv_qp.qp_type != IBV_QPT_UD) { + set_data_seg_v2(dseg, wr->sg_list + i); + dseg++; + } else { + dseg = get_send_sge_ex(qp, sge_info->start_idx & + (qp->ex_sge.sge_cnt - 1)); + set_data_seg_v2(dseg, wr->sg_list + i); + sge_info->start_idx++; + } + } +} + +static int set_rc_wqe(void *wqe, struct hns_roce_qp *qp, struct ibv_send_wr *wr, + int nreq, struct hns_roce_sge_info *sge_info) +{ + struct hns_roce_rc_sq_wqe *rc_sq_wqe = wqe; + struct hns_roce_v2_wqe_data_seg *dseg; + int hr_op; + int i; + + memset(rc_sq_wqe, 0, sizeof(struct hns_roce_rc_sq_wqe)); + + switch (wr->opcode) { + case IBV_WR_RDMA_READ: + hr_op = HNS_ROCE_WQE_OP_RDMA_READ; + rc_sq_wqe->va = htole64(wr->wr.rdma.remote_addr); + rc_sq_wqe->rkey = htole32(wr->wr.rdma.rkey); + break; + case IBV_WR_RDMA_WRITE: + hr_op = HNS_ROCE_WQE_OP_RDMA_WRITE; + rc_sq_wqe->va = htole64(wr->wr.rdma.remote_addr); + rc_sq_wqe->rkey = htole32(wr->wr.rdma.rkey); + break; + case IBV_WR_RDMA_WRITE_WITH_IMM: + hr_op = HNS_ROCE_WQE_OP_RDMA_WRITE_WITH_IMM; + rc_sq_wqe->va = htole64(wr->wr.rdma.remote_addr); + rc_sq_wqe->rkey = htole32(wr->wr.rdma.rkey); + rc_sq_wqe->immtdata = htole32(be32toh(wr->imm_data)); + break; + case IBV_WR_SEND: + hr_op = HNS_ROCE_WQE_OP_SEND; + break; + case IBV_WR_SEND_WITH_INV: + hr_op = HNS_ROCE_WQE_OP_SEND_WITH_INV; + rc_sq_wqe->inv_key = htole32(wr->invalidate_rkey); + break; + case IBV_WR_SEND_WITH_IMM: + hr_op = HNS_ROCE_WQE_OP_SEND_WITH_IMM; + rc_sq_wqe->immtdata = htole32(be32toh(wr->imm_data)); + break; + case IBV_WR_LOCAL_INV: + hr_op = HNS_ROCE_WQE_OP_LOCAL_INV; + roce_set_bit(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_SO_S, 1); + rc_sq_wqe->inv_key = htole32(wr->invalidate_rkey); + break; + case IBV_WR_BIND_MW: + hr_op = HNS_ROCE_WQE_OP_BIND_MW_TYPE; + roce_set_bit(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_MW_TYPE_S, + wr->bind_mw.mw->type - 1); + roce_set_bit(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_ATOMIC_S, + (wr->bind_mw.bind_info.mw_access_flags & + IBV_ACCESS_REMOTE_ATOMIC) ? 1 : 0); + roce_set_bit(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_RDMA_READ_S, + (wr->bind_mw.bind_info.mw_access_flags & + IBV_ACCESS_REMOTE_READ) ? 1 : 0); + roce_set_bit(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_RDMA_WRITE_S, + (wr->bind_mw.bind_info.mw_access_flags & + IBV_ACCESS_REMOTE_WRITE) ? 
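+		/*
+		 * Note on the data path: set_sge() above keeps at most
+		 * HNS_ROCE_SGE_IN_WQE data segments inside the WQE itself
+		 * (none for UD QPs) and spills the rest into the extended
+		 * SGE ring; byte_20's MSG_START_SGE_IDX, filled in further
+		 * down, tells hardware where that spill begins.
+		 */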
1 : 0); + rc_sq_wqe->new_rkey = htole32(wr->bind_mw.rkey); + rc_sq_wqe->byte_16 = htole32(wr->bind_mw.bind_info.length & + HNS_ROCE_ADDRESS_MASK); + rc_sq_wqe->byte_20 = htole32(wr->bind_mw.bind_info.length >> + HNS_ROCE_ADDRESS_SHIFT); + rc_sq_wqe->rkey = htole32(wr->bind_mw.bind_info.mr->rkey); + rc_sq_wqe->va = htole64(wr->bind_mw.bind_info.addr); + break; + case IBV_WR_ATOMIC_CMP_AND_SWP: + hr_op = HNS_ROCE_WQE_OP_ATOMIC_COM_AND_SWAP; + rc_sq_wqe->rkey = htole32(wr->wr.atomic.rkey); + rc_sq_wqe->va = htole64(wr->wr.atomic.remote_addr); + roce_set_field(rc_sq_wqe->byte_16, RC_SQ_WQE_BYTE_16_SGE_NUM_M, + RC_SQ_WQE_BYTE_16_SGE_NUM_S, + sge_info->valid_num); + break; + case IBV_WR_ATOMIC_FETCH_AND_ADD: + hr_op = HNS_ROCE_WQE_OP_ATOMIC_FETCH_AND_ADD; + rc_sq_wqe->rkey = htole32(wr->wr.atomic.rkey); + rc_sq_wqe->va = htole64(wr->wr.atomic.remote_addr); + roce_set_field(rc_sq_wqe->byte_16, RC_SQ_WQE_BYTE_16_SGE_NUM_M, + RC_SQ_WQE_BYTE_16_SGE_NUM_S, + sge_info->valid_num); + break; + default: + hr_op = HNS_ROCE_WQE_OP_MASK; + return EINVAL; + } + + roce_set_field(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_OPCODE_M, + RC_SQ_WQE_BYTE_4_OPCODE_S, hr_op); + + roce_set_bit(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_CQE_S, + (wr->send_flags & IBV_SEND_SIGNALED) ? 1 : 0); + + roce_set_bit(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_FENCE_S, + (wr->send_flags & IBV_SEND_FENCE) ? 1 : 0); + + roce_set_bit(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_SE_S, + (wr->send_flags & IBV_SEND_SOLICITED) ? 1 : 0); + + roce_set_bit(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_OWNER_S, + ~(((qp->sq.head + nreq) >> qp->sq.shift) & 0x1)); + + roce_set_field(rc_sq_wqe->byte_20, + RC_SQ_WQE_BYTE_20_MSG_START_SGE_IDX_M, + RC_SQ_WQE_BYTE_20_MSG_START_SGE_IDX_S, + sge_info->start_idx & (qp->ex_sge.sge_cnt - 1)); + + if (wr->opcode == IBV_WR_BIND_MW) + return 0; + + wqe += sizeof(struct hns_roce_rc_sq_wqe); + dseg = wqe; + + set_sge(dseg, qp, wr, sge_info); + + rc_sq_wqe->msg_len = htole32(sge_info->total_len); + + roce_set_field(rc_sq_wqe->byte_16, RC_SQ_WQE_BYTE_16_SGE_NUM_M, + RC_SQ_WQE_BYTE_16_SGE_NUM_S, sge_info->valid_num); + + if (wr->opcode == IBV_WR_ATOMIC_FETCH_AND_ADD || + wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) { + dseg++; + return set_atomic_seg(qp, wr, le32toh(rc_sq_wqe->msg_len), + dseg, sge_info); + } + + if (wr->send_flags & IBV_SEND_INLINE) { + if (wr->opcode == IBV_WR_RDMA_READ) + return EINVAL; + + if (sge_info->total_len > qp->max_inline_data) + return EINVAL; + + for (i = 0; i < wr->num_sge; i++) { + memcpy(dseg, (void *)(uintptr_t)(wr->sg_list[i].addr), + wr->sg_list[i].length); + dseg += wr->sg_list[i].length; + } + roce_set_bit(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_INLINE_S, 1); + } + + return 0; +} + +int hns_roce_u_v2_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr) +{ + struct hns_roce_context *ctx = to_hr_ctx(ibvqp->context); + struct hns_roce_qp *qp = to_hr_qp(ibvqp); + struct hns_roce_sge_info sge_info = {}; + struct ibv_qp_attr attr; + unsigned int wqe_idx; + int attr_mask; + int ret = 0; + void *wqe; + int nreq; + + /* check that state is OK to post send */ + if (ibvqp->state == IBV_QPS_RESET || ibvqp->state == IBV_QPS_INIT || + ibvqp->state == IBV_QPS_RTR) { + *bad_wr = wr; + return EINVAL; + } + + pthread_spin_lock(&qp->sq.lock); + + sge_info.start_idx = qp->next_sge; /* start index of extend sge */ + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (hns_roce_v2_wq_overflow(&qp->sq, nreq, + to_hr_cq(qp->ibv_qp.send_cq))) { + ret = ENOMEM; + *bad_wr = wr; + goto out; + } + + if (wr->num_sge 
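+		/*
+		 * For reference, a minimal (hypothetical) caller reaching
+		 * this path through the generic verbs API; "qp", "mr",
+		 * "buf", "len" and the remote address/rkey stand in for
+		 * resources set up elsewhere:
+		 *
+		 *	struct ibv_sge sge = {
+		 *		.addr   = (uintptr_t)buf,
+		 *		.length = len,
+		 *		.lkey   = mr->lkey,
+		 *	};
+		 *	struct ibv_send_wr swr = {
+		 *		.wr_id      = 1,
+		 *		.sg_list    = &sge,
+		 *		.num_sge    = 1,
+		 *		.opcode     = IBV_WR_RDMA_WRITE,
+		 *		.send_flags = IBV_SEND_SIGNALED,
+		 *	}, *bad;
+		 *
+		 *	swr.wr.rdma.remote_addr = remote_addr;
+		 *	swr.wr.rdma.rkey        = remote_rkey;
+		 *	ret = ibv_post_send(qp, &swr, &bad);
+		 */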
> qp->sq.max_gs) { + ret = EINVAL; + *bad_wr = wr; + goto out; + } + + wqe_idx = (qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1); + wqe = get_send_wqe(qp, wqe_idx); + qp->sq.wrid[wqe_idx] = wr->wr_id; + + switch (ibvqp->qp_type) { + case IBV_QPT_RC: + ret = set_rc_wqe(wqe, qp, wr, nreq, &sge_info); + if (ret) { + *bad_wr = wr; + goto out; + } + break; + case IBV_QPT_UC: + case IBV_QPT_UD: + default: + ret = EINVAL; + *bad_wr = wr; + goto out; + } + } + +out: + if (likely(nreq)) { + qp->sq.head += nreq; + qp->next_sge = sge_info.start_idx; + + hns_roce_update_sq_db(ctx, qp->ibv_qp.qp_num, qp->sl, + qp->sq.head & ((qp->sq.wqe_cnt << 1) - 1)); + + if (qp->flags & HNS_ROCE_SUPPORT_SQ_RECORD_DB) + *(qp->sdb) = qp->sq.head & 0xffff; + } + + pthread_spin_unlock(&qp->sq.lock); + + if (ibvqp->state == IBV_QPS_ERR) { + attr_mask = IBV_QP_STATE; + attr.qp_state = IBV_QPS_ERR; + + hns_roce_u_v2_modify_qp(ibvqp, &attr, attr_mask); + } + + return ret; +} + +static int hns_roce_u_v2_post_recv(struct ibv_qp *ibvqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct hns_roce_qp *qp = to_hr_qp(ibvqp); + struct hns_roce_context *ctx = to_hr_ctx(ibvqp->context); + struct hns_roce_v2_wqe_data_seg *dseg; + struct hns_roce_rinl_sge *sge_list; + struct ibv_qp_attr attr; + int attr_mask; + int ret = 0; + int wqe_idx; + void *wqe; + int nreq; + int i; + + /* check that state is OK to post receive */ + if (ibvqp->state == IBV_QPS_RESET) { + *bad_wr = wr; + return EINVAL; + } + + pthread_spin_lock(&qp->rq.lock); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (hns_roce_v2_wq_overflow(&qp->rq, nreq, + to_hr_cq(qp->ibv_qp.recv_cq))) { + ret = ENOMEM; + *bad_wr = wr; + goto out; + } + + wqe_idx = (qp->rq.head + nreq) & (qp->rq.wqe_cnt - 1); + + if (wr->num_sge > qp->rq.max_gs) { + ret = EINVAL; + *bad_wr = wr; + goto out; + } + + wqe = get_recv_wqe_v2(qp, wqe_idx); + if (!wqe) { + ret = EINVAL; + *bad_wr = wr; + goto out; + } + + dseg = (struct hns_roce_v2_wqe_data_seg *)wqe; + + for (i = 0; i < wr->num_sge; i++) { + if (!wr->sg_list[i].length) + continue; + set_data_seg_v2(dseg, wr->sg_list + i); + dseg++; + } + + /* hw stop reading when identify the last one */ + if (i < qp->rq.max_gs) { + dseg->lkey = htole32(0x100); + dseg->addr = 0; + } + + /* QP support receive inline wqe */ + sge_list = qp->rq_rinl_buf.wqe_list[wqe_idx].sg_list; + qp->rq_rinl_buf.wqe_list[wqe_idx].sge_cnt = + (unsigned int)wr->num_sge; + + for (i = 0; i < wr->num_sge; i++) { + sge_list[i].addr = + (void *)(uintptr_t)wr->sg_list[i].addr; + sge_list[i].len = wr->sg_list[i].length; + } + + qp->rq.wrid[wqe_idx] = wr->wr_id; + } + +out: + if (nreq) { + qp->rq.head += nreq; + + udma_to_device_barrier(); + + if (qp->flags & HNS_ROCE_SUPPORT_RQ_RECORD_DB) + *qp->rdb = qp->rq.head & 0xffff; + else + hns_roce_update_rq_db(ctx, qp->ibv_qp.qp_num, + qp->rq.head & ((qp->rq.wqe_cnt << 1) - 1)); + } + + pthread_spin_unlock(&qp->rq.lock); + + if (ibvqp->state == IBV_QPS_ERR) { + attr_mask = IBV_QP_STATE; + attr.qp_state = IBV_QPS_ERR; + + hns_roce_u_v2_modify_qp(ibvqp, &attr, attr_mask); + } + + return ret; +} + +static void __hns_roce_v2_cq_clean(struct hns_roce_cq *cq, uint32_t qpn, + struct hns_roce_srq *srq) +{ + int nfreed = 0; + bool is_recv_cqe; + uint16_t wqe_index; + uint32_t prod_index; + uint8_t owner_bit = 0; + struct hns_roce_v2_cqe *cqe, *dest; + struct hns_roce_context *ctx = to_hr_ctx(cq->ibv_cq.context); + + for (prod_index = cq->cons_index; get_sw_cqe_v2(cq, prod_index); + ++prod_index) + if (prod_index > cq->cons_index + 
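+		    /* bound the scan to one whole ring past the consumer */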
cq->ibv_cq.cqe) + break; + + while ((int) --prod_index - (int) cq->cons_index >= 0) { + cqe = get_cqe_v2(cq, prod_index & cq->ibv_cq.cqe); + if ((roce_get_field(cqe->byte_16, CQE_BYTE_16_LCL_QPN_M, + CQE_BYTE_16_LCL_QPN_S) & 0xffffff) == qpn) { + is_recv_cqe = roce_get_bit(cqe->byte_4, + CQE_BYTE_4_S_R_S); + + if (srq && is_recv_cqe) { + wqe_index = roce_get_field(cqe->byte_4, + CQE_BYTE_4_WQE_IDX_M, + CQE_BYTE_4_WQE_IDX_S); + hns_roce_free_srq_wqe(srq, wqe_index); + } + ++nfreed; + } else if (nfreed) { + dest = get_cqe_v2(cq, + (prod_index + nfreed) & cq->ibv_cq.cqe); + owner_bit = roce_get_bit(dest->byte_4, + CQE_BYTE_4_OWNER_S); + memcpy(dest, cqe, sizeof(*cqe)); + roce_set_bit(dest->byte_4, CQE_BYTE_4_OWNER_S, + owner_bit); + } + } + + if (nfreed) { + cq->cons_index += nfreed; + udma_to_device_barrier(); + hns_roce_v2_update_cq_cons_index(ctx, cq); + } +} + +static void hns_roce_v2_cq_clean(struct hns_roce_cq *cq, unsigned int qpn, + struct hns_roce_srq *srq) +{ + pthread_spin_lock(&cq->lock); + __hns_roce_v2_cq_clean(cq, qpn, srq); + pthread_spin_unlock(&cq->lock); +} + +static int hns_roce_u_v2_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask) +{ + int ret; + struct ibv_modify_qp cmd; + struct hns_roce_qp *hr_qp = to_hr_qp(qp); + bool flag = false; /* modify qp to error */ + + if ((attr_mask & IBV_QP_STATE) && (attr->qp_state == IBV_QPS_ERR)) { + pthread_spin_lock(&hr_qp->sq.lock); + pthread_spin_lock(&hr_qp->rq.lock); + flag = true; + } + + ret = ibv_cmd_modify_qp(qp, attr, attr_mask, &cmd, sizeof(cmd)); + + if (flag) { + pthread_spin_unlock(&hr_qp->rq.lock); + pthread_spin_unlock(&hr_qp->sq.lock); + } + + if (ret) + return ret; + + if (attr_mask & IBV_QP_STATE) + qp->state = attr->qp_state; + + if ((attr_mask & IBV_QP_STATE) && attr->qp_state == IBV_QPS_RESET) { + hns_roce_v2_cq_clean(to_hr_cq(qp->recv_cq), qp->qp_num, + qp->srq ? 
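+	/*
+	 * The sq/rq spinlocks were held across ibv_cmd_modify_qp() above
+	 * whenever the target state is IBV_QPS_ERR, so no post_send or
+	 * post_recv can observe a half-completed transition to error.
+	 */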
to_hr_srq(qp->srq) : NULL); + if (qp->send_cq != qp->recv_cq) + hns_roce_v2_cq_clean(to_hr_cq(qp->send_cq), qp->qp_num, + NULL); + + hns_roce_init_qp_indices(to_hr_qp(qp)); + } + + if (attr_mask & IBV_QP_PORT) + hr_qp->port_num = attr->port_num; + + if (attr_mask & IBV_QP_AV) + hr_qp->sl = attr->ah_attr.sl; + + return ret; +} + +static void hns_roce_lock_cqs(struct ibv_qp *qp) +{ + struct hns_roce_cq *send_cq = to_hr_cq(qp->send_cq); + struct hns_roce_cq *recv_cq = to_hr_cq(qp->recv_cq); + + if (send_cq == recv_cq) { + pthread_spin_lock(&send_cq->lock); + } else if (send_cq->cqn < recv_cq->cqn) { + pthread_spin_lock(&send_cq->lock); + pthread_spin_lock(&recv_cq->lock); + } else { + pthread_spin_lock(&recv_cq->lock); + pthread_spin_lock(&send_cq->lock); + } +} + +static void hns_roce_unlock_cqs(struct ibv_qp *qp) +{ + struct hns_roce_cq *send_cq = to_hr_cq(qp->send_cq); + struct hns_roce_cq *recv_cq = to_hr_cq(qp->recv_cq); + + if (send_cq == recv_cq) { + pthread_spin_unlock(&send_cq->lock); + } else if (send_cq->cqn < recv_cq->cqn) { + pthread_spin_unlock(&recv_cq->lock); + pthread_spin_unlock(&send_cq->lock); + } else { + pthread_spin_unlock(&send_cq->lock); + pthread_spin_unlock(&recv_cq->lock); + } +} + +static int hns_roce_u_v2_destroy_qp(struct ibv_qp *ibqp) +{ + int ret; + struct hns_roce_qp *qp = to_hr_qp(ibqp); + + pthread_mutex_lock(&to_hr_ctx(ibqp->context)->qp_table_mutex); + ret = ibv_cmd_destroy_qp(ibqp); + if (ret) { + pthread_mutex_unlock(&to_hr_ctx(ibqp->context)->qp_table_mutex); + return ret; + } + + hns_roce_lock_cqs(ibqp); + + if (ibqp->recv_cq) + __hns_roce_v2_cq_clean(to_hr_cq(ibqp->recv_cq), ibqp->qp_num, + ibqp->srq ? to_hr_srq(ibqp->srq) : NULL); + + if (ibqp->send_cq && ibqp->send_cq != ibqp->recv_cq) + __hns_roce_v2_cq_clean(to_hr_cq(ibqp->send_cq), ibqp->qp_num, + NULL); + + hns_roce_v2_clear_qp(to_hr_ctx(ibqp->context), ibqp->qp_num); + + hns_roce_unlock_cqs(ibqp); + pthread_mutex_unlock(&to_hr_ctx(ibqp->context)->qp_table_mutex); + + if (qp->rq.max_gs) + hns_roce_free_db(to_hr_ctx(ibqp->context), qp->rdb, + HNS_ROCE_QP_TYPE_DB); + if (qp->sq.wqe_cnt) + hns_roce_free_db(to_hr_ctx(ibqp->context), qp->sdb, + HNS_ROCE_QP_TYPE_DB); + + hns_roce_free_buf(&qp->buf); + if (qp->rq_rinl_buf.wqe_list) { + if (qp->rq_rinl_buf.wqe_list[0].sg_list) { + free(qp->rq_rinl_buf.wqe_list[0].sg_list); + qp->rq_rinl_buf.wqe_list[0].sg_list = NULL; + } + + free(qp->rq_rinl_buf.wqe_list); + qp->rq_rinl_buf.wqe_list = NULL; + } + + free(qp->sq.wrid); + if (qp->rq.wqe_cnt) + free(qp->rq.wrid); + + free(qp); + + return ret; +} + +static void fill_idx_queue(struct hns_roce_idx_que *idx_que, + int cur_idx, int wqe_idx) +{ + unsigned int *addr; + + addr = idx_que->buf.buf + cur_idx * idx_que->entry_sz; + *addr = wqe_idx; +} + +static int find_empty_entry(struct hns_roce_idx_que *idx_que) +{ + int bit_num; + int i; + + /* bitmap[i] is set zero if all bits are allocated */ + for (i = 0; idx_que->bitmap[i] == 0; ++i) + ; + bit_num = ffsl(idx_que->bitmap[i]); + idx_que->bitmap[i] &= ~(1ULL << (bit_num - 1)); + + return i * BIT_CNT_PER_U64 + (bit_num - 1); +} + +static int hns_roce_u_v2_post_srq_recv(struct ibv_srq *ib_srq, + struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct hns_roce_context *ctx = to_hr_ctx(ib_srq->context); + struct hns_roce_srq *srq = to_hr_srq(ib_srq); + struct hns_roce_v2_wqe_data_seg *dseg; + struct hns_roce_db srq_db; + int ret = 0; + int wqe_idx; + void *wqe; + int nreq; + int ind; + int i; + + pthread_spin_lock(&srq->lock); + + /* current idx of srqwq 
*/ + ind = srq->head & (srq->max_wqe - 1); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (wr->num_sge > srq->max_gs) { + ret = -1; + *bad_wr = wr; + break; + } + + if (srq->head == srq->tail) { + /* SRQ is full */ + ret = -1; + *bad_wr = wr; + break; + } + + wqe_idx = find_empty_entry(&srq->idx_que); + fill_idx_queue(&srq->idx_que, ind, wqe_idx); + + wqe = get_srq_wqe(srq, wqe_idx); + dseg = (struct hns_roce_v2_wqe_data_seg *)wqe; + + for (i = 0; i < wr->num_sge; ++i) { + dseg[i].len = htole32(wr->sg_list[i].length); + dseg[i].lkey = htole32(wr->sg_list[i].lkey); + dseg[i].addr = htole64(wr->sg_list[i].addr); + } + + /* hw stop reading when identify the last one */ + if (i < srq->max_gs) { + dseg[i].len = 0; + dseg[i].lkey = htole32(0x100); + dseg[i].addr = 0; + } + + srq->wrid[wqe_idx] = wr->wr_id; + ind = (ind + 1) & (srq->max_wqe - 1); + } + + if (nreq) { + srq->head += nreq; + + /* + * Make sure that descriptors are written before + * we write doorbell record. + */ + udma_to_device_barrier(); + + srq_db.byte_4 = htole32(HNS_ROCE_V2_SRQ_DB << DB_BYTE_4_CMD_S + | srq->srqn); + srq_db.parameter = htole32(srq->head); + + hns_roce_write64((uint32_t *)&srq_db, ctx, + ROCEE_VF_DB_CFG0_OFFSET); + } + + pthread_spin_unlock(&srq->lock); + + return ret; +} + +const struct hns_roce_u_hw hns_roce_u_hw_v2 = { + .hw_version = HNS_ROCE_HW_VER2, + .hw_ops = { + .poll_cq = hns_roce_u_v2_poll_cq, + .req_notify_cq = hns_roce_u_v2_arm_cq, + .post_send = hns_roce_u_v2_post_send, + .post_recv = hns_roce_u_v2_post_recv, + .modify_qp = hns_roce_u_v2_modify_qp, + .destroy_qp = hns_roce_u_v2_destroy_qp, + .post_srq_recv = hns_roce_u_v2_post_srq_recv, + }, +}; diff --git a/providers/hns/hns_roce_u_hw_v2.h b/providers/hns/hns_roce_u_hw_v2.h new file mode 100644 index 0000000..366bc13 --- /dev/null +++ b/providers/hns/hns_roce_u_hw_v2.h @@ -0,0 +1,299 @@ +/* + * Copyright (c) 2016-2017 Hisilicon Limited. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _HNS_ROCE_U_HW_V2_H +#define _HNS_ROCE_U_HW_V2_H + +#define HNS_ROCE_V2_CQE_IS_SQ 0 + +#define HNS_ROCE_V2_CQ_DB_REQ_SOL 1 +#define HNS_ROCE_V2_CQ_DB_REQ_NEXT 0 + +#define HNS_ROCE_CMDSN_MASK 0x3 + +enum { + HNS_ROCE_SUPPORT_RQ_RECORD_DB = 1 << 0, + HNS_ROCE_SUPPORT_SQ_RECORD_DB = 1 << 1, +}; + +enum { + HNS_ROCE_SUPPORT_CQ_RECORD_DB = 1 << 0, +}; + +/* V2 REG DEFINITION */ +#define ROCEE_VF_DB_CFG0_OFFSET 0x0230 + +#define HNS_ROCE_IDX_QUE_ENTRY_SZ 4 + +enum { + HNS_ROCE_WQE_OP_SEND = 0x0, + HNS_ROCE_WQE_OP_SEND_WITH_INV = 0x1, + HNS_ROCE_WQE_OP_SEND_WITH_IMM = 0x2, + HNS_ROCE_WQE_OP_RDMA_WRITE = 0x3, + HNS_ROCE_WQE_OP_RDMA_WRITE_WITH_IMM = 0x4, + HNS_ROCE_WQE_OP_RDMA_READ = 0x5, + HNS_ROCE_WQE_OP_ATOMIC_COM_AND_SWAP = 0x6, + HNS_ROCE_WQE_OP_ATOMIC_FETCH_AND_ADD = 0x7, + HNS_ROCE_WQE_OP_ATOMIC_MASK_COMP_AND_SWAP = 0x8, + HNS_ROCE_WQE_OP_ATOMIC_MASK_FETCH_AND_ADD = 0x9, + HNS_ROCE_WQE_OP_FAST_REG_PMR = 0xa, + HNS_ROCE_WQE_OP_LOCAL_INV = 0xb, + HNS_ROCE_WQE_OP_BIND_MW_TYPE = 0xc, + HNS_ROCE_WQE_OP_MASK = 0x1f +}; + +enum { + /* rq operations */ + HNS_ROCE_RECV_OP_RDMA_WRITE_IMM = 0x0, + HNS_ROCE_RECV_OP_SEND = 0x1, + HNS_ROCE_RECV_OP_SEND_WITH_IMM = 0x2, + HNS_ROCE_RECV_OP_SEND_WITH_INV = 0x3, +}; + +enum { + HNS_ROCE_SQ_OP_SEND = 0x0, + HNS_ROCE_SQ_OP_SEND_WITH_INV = 0x1, + HNS_ROCE_SQ_OP_SEND_WITH_IMM = 0x2, + HNS_ROCE_SQ_OP_RDMA_WRITE = 0x3, + HNS_ROCE_SQ_OP_RDMA_WRITE_WITH_IMM = 0x4, + HNS_ROCE_SQ_OP_RDMA_READ = 0x5, + HNS_ROCE_SQ_OP_ATOMIC_COMP_AND_SWAP = 0x6, + HNS_ROCE_SQ_OP_ATOMIC_FETCH_AND_ADD = 0x7, + HNS_ROCE_SQ_OP_ATOMIC_MASK_COMP_AND_SWAP = 0x8, + HNS_ROCE_SQ_OP_ATOMIC_MASK_FETCH_AND_ADD = 0x9, + HNS_ROCE_SQ_OP_FAST_REG_PMR = 0xa, + HNS_ROCE_SQ_OP_LOCAL_INV = 0xb, + HNS_ROCE_SQ_OP_BIND_MW = 0xc, +}; + +enum { + V2_CQ_OK = 0, + V2_CQ_EMPTY = -1, + V2_CQ_POLL_ERR = -2, +}; + +enum { + HNS_ROCE_V2_CQE_STATUS_MASK = 0xff, + HNS_ROCE_V2_CQE_OPCODE_MASK = 0x1f, +}; + +enum { + HNS_ROCE_V2_CQE_SUCCESS = 0x00, + HNS_ROCE_V2_CQE_LOCAL_LENGTH_ERR = 0x01, + HNS_ROCE_V2_CQE_LOCAL_QP_OP_ERR = 0x02, + HNS_ROCE_V2_CQE_LOCAL_PROT_ERR = 0x04, + HNS_ROCE_V2_CQE_WR_FLUSH_ERR = 0x05, + HNS_ROCE_V2_CQE_MEM_MANAGERENT_OP_ERR = 0x06, + HNS_ROCE_V2_CQE_BAD_RESP_ERR = 0x10, + HNS_ROCE_V2_CQE_LOCAL_ACCESS_ERR = 0x11, + HNS_ROCE_V2_CQE_REMOTE_INVAL_REQ_ERR = 0x12, + HNS_ROCE_V2_CQE_REMOTE_ACCESS_ERR = 0x13, + HNS_ROCE_V2_CQE_REMOTE_OP_ERR = 0x14, + HNS_ROCE_V2_CQE_TRANSPORT_RETRY_EXC_ERR = 0x15, + HNS_ROCE_V2_CQE_RNR_RETRY_EXC_ERR = 0x16, + HNS_ROCE_V2_CQE_REMOTE_ABORTED_ERR = 0x22, +}; + +enum { + HNS_ROCE_V2_SQ_DB, + HNS_ROCE_V2_RQ_DB, + HNS_ROCE_V2_SRQ_DB, + HNS_ROCE_V2_CQ_DB_PTR, + HNS_ROCE_V2_CQ_DB_NTR, +}; + +struct hns_roce_db { + __le32 byte_4; + __le32 parameter; +}; +#define DB_BYTE_4_TAG_S 0 +#define DB_BYTE_4_TAG_M (((1UL << 23) - 1) << DB_BYTE_4_TAG_S) + +#define DB_BYTE_4_CMD_S 24 +#define DB_BYTE_4_CMD_M (((1UL << 4) - 1) << DB_BYTE_4_CMD_S) + +#define DB_PARAM_SQ_PRODUCER_IDX_S 0 +#define DB_PARAM_SQ_PRODUCER_IDX_M \ + (((1UL << 16) - 1) << DB_PARAM_SQ_PRODUCER_IDX_S) + +#define DB_PARAM_RQ_PRODUCER_IDX_S 0 +#define DB_PARAM_RQ_PRODUCER_IDX_M \ + (((1UL << 16) - 1) << DB_PARAM_RQ_PRODUCER_IDX_S) + +#define DB_PARAM_SRQ_PRODUCER_COUNTER_S 0 +#define DB_PARAM_SRQ_PRODUCER_COUNTER_M \ + (((1UL << 16) - 1) << DB_PARAM_SRQ_PRODUCER_COUNTER_S) + +#define DB_PARAM_SL_S 16 +#define DB_PARAM_SL_M \ + (((1UL << 3) - 1) << DB_PARAM_SL_S) + +#define DB_PARAM_CQ_CONSUMER_IDX_S 0 +#define DB_PARAM_CQ_CONSUMER_IDX_M \ + (((1UL << 24) - 1) << DB_PARAM_CQ_CONSUMER_IDX_S) + +#define 
DB_PARAM_CQ_NOTIFY_S 24 + +#define DB_PARAM_CQ_CMD_SN_S 25 +#define DB_PARAM_CQ_CMD_SN_M \ + (((1UL << 2) - 1) << DB_PARAM_CQ_CMD_SN_S) + +struct hns_roce_v2_cqe { + __le32 byte_4; + union { + __le32 rkey; + __le32 immtdata; + }; + __le32 byte_12; + __le32 byte_16; + __le32 byte_cnt; + __le32 smac; + __le32 byte_28; + __le32 byte_32; +}; + +#define CQE_BYTE_4_OPCODE_S 0 +#define CQE_BYTE_4_OPCODE_M (((1UL << 5) - 1) << CQE_BYTE_4_OPCODE_S) + +#define CQE_BYTE_4_RQ_INLINE_S 5 + +#define CQE_BYTE_4_S_R_S 6 +#define CQE_BYTE_4_OWNER_S 7 + +#define CQE_BYTE_4_STATUS_S 8 +#define CQE_BYTE_4_STATUS_M (((1UL << 8) - 1) << CQE_BYTE_4_STATUS_S) + +#define CQE_BYTE_4_WQE_IDX_S 16 +#define CQE_BYTE_4_WQE_IDX_M (((1UL << 16) - 1) << CQE_BYTE_4_WQE_IDX_S) + +#define CQE_BYTE_12_XRC_SRQN_S 0 +#define CQE_BYTE_12_XRC_SRQN_M (((1UL << 24) - 1) << CQE_BYTE_12_XRC_SRQN_S) + +#define CQE_BYTE_16_LCL_QPN_S 0 +#define CQE_BYTE_16_LCL_QPN_M (((1UL << 24) - 1) << CQE_BYTE_16_LCL_QPN_S) + +#define CQE_BYTE_28_SMAC_S 0 +#define CQE_BYTE_28_SMAC_M (((1UL << 16) - 1) << CQE_BYTE_28_SMAC_S) + +#define CQE_BYTE_28_PORT_TYPE_S 16 +#define CQE_BYTE_28_PORT_TYPE_M (((1UL << 2) - 1) << CQE_BYTE_28_PORT_TYPE_S) + +#define CQE_BYTE_32_RMT_QPN_S 0 +#define CQE_BYTE_32_RMT_QPN_M (((1UL << 24) - 1) << CQE_BYTE_32_RMT_QPN_S) + +#define CQE_BYTE_32_SL_S 24 +#define CQE_BYTE_32_SL_M (((1UL << 3) - 1) << CQE_BYTE_32_SL_S) + +#define CQE_BYTE_32_PORTN_S 27 +#define CQE_BYTE_32_PORTN_M (((1UL << 3) - 1) << CQE_BYTE_32_PORTN_S) + +#define CQE_BYTE_32_GRH_S 30 + +#define CQE_BYTE_32_LPK_S 31 + +struct hns_roce_rc_sq_wqe { + __le32 byte_4; + __le32 msg_len; + union { + __le32 inv_key; + __le32 immtdata; + __le32 new_rkey; + }; + __le32 byte_16; + __le32 byte_20; + __le32 rkey; + __le64 va; +}; + +#define RC_SQ_WQE_BYTE_4_OPCODE_S 0 +#define RC_SQ_WQE_BYTE_4_OPCODE_M \ + (((1UL << 5) - 1) << RC_SQ_WQE_BYTE_4_OPCODE_S) + +#define RC_SQ_WQE_BYTE_4_OWNER_S 7 + +#define RC_SQ_WQE_BYTE_4_CQE_S 8 + +#define RC_SQ_WQE_BYTE_4_FENCE_S 9 + +#define RC_SQ_WQE_BYTE_4_SO_S 10 + +#define RC_SQ_WQE_BYTE_4_SE_S 11 + +#define RC_SQ_WQE_BYTE_4_INLINE_S 12 + +#define RC_SQ_WQE_BYTE_4_MW_TYPE_S 14 + +#define RC_SQ_WQE_BYTE_4_ATOMIC_S 20 + +#define RC_SQ_WQE_BYTE_4_RDMA_READ_S 21 + +#define RC_SQ_WQE_BYTE_4_RDMA_WRITE_S 22 + +#define RC_SQ_WQE_BYTE_16_XRC_SRQN_S 0 +#define RC_SQ_WQE_BYTE_16_XRC_SRQN_M \ + (((1UL << 24) - 1) << RC_SQ_WQE_BYTE_16_XRC_SRQN_S) + +#define RC_SQ_WQE_BYTE_16_SGE_NUM_S 24 +#define RC_SQ_WQE_BYTE_16_SGE_NUM_M \ + (((1UL << 8) - 1) << RC_SQ_WQE_BYTE_16_SGE_NUM_S) + +#define RC_SQ_WQE_BYTE_20_MSG_START_SGE_IDX_S 0 +#define RC_SQ_WQE_BYTE_20_MSG_START_SGE_IDX_M \ + (((1UL << 24) - 1) << RC_SQ_WQE_BYTE_20_MSG_START_SGE_IDX_S) + +struct hns_roce_v2_wqe_data_seg { + __le32 len; + __le32 lkey; + __le64 addr; +}; + +struct hns_roce_v2_wqe_raddr_seg { + __le32 rkey; + __le32 len; + __le64 raddr; +}; + +struct hns_roce_wqe_atomic_seg { + __le64 fetchadd_swap_data; + __le64 cmp_data; +}; + +int hns_roce_u_v2_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr); + +#define DATA_TYPE_NUM 2 +#define STANDARD_ATOMIC_U_BYTE_8 0x8 +#define EXTEND_ATOMIC_U_BYTE_16 0x10 +#define EXTEND_ATOMIC_U_BYTE_32 0x20 +#define EXTEND_ATOMIC_U_BYTE_64 0x40 + +#endif /* _HNS_ROCE_U_HW_V2_H */ diff --git a/providers/hns/hns_roce_u_verbs.c b/providers/hns/hns_roce_u_verbs.c new file mode 100644 index 0000000..bbc307a --- /dev/null +++ b/providers/hns/hns_roce_u_verbs.c @@ -0,0 +1,953 @@ +/* + * Copyright (c) 2016-2017 Hisilicon 
Limited. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <pthread.h> +#include <sys/mman.h> +#include <ccan/ilog.h> +#include <ccan/minmax.h> +#include <util/util.h> +#include "hns_roce_u.h" +#include "hns_roce_u_abi.h" +#include "hns_roce_u_db.h" +#include "hns_roce_u_hw_v1.h" +#include "hns_roce_u_hw_v2.h" + +void hns_roce_init_qp_indices(struct hns_roce_qp *qp) +{ + qp->sq.head = 0; + qp->sq.tail = 0; + qp->rq.head = 0; + qp->rq.tail = 0; + qp->next_sge = 0; +} + +int hns_roce_u_query_device(struct ibv_context *context, + struct ibv_device_attr *attr) +{ + int ret; + struct ibv_query_device cmd; + uint64_t raw_fw_ver; + unsigned int major, minor, sub_minor; + + ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, &cmd, + sizeof(cmd)); + if (ret) + return ret; + + major = (raw_fw_ver >> 32) & 0xffff; + minor = (raw_fw_ver >> 16) & 0xffff; + sub_minor = raw_fw_ver & 0xffff; + + snprintf(attr->fw_ver, sizeof(attr->fw_ver), "%d.%d.%03d", major, minor, + sub_minor); + + return 0; +} + +int hns_roce_u_query_port(struct ibv_context *context, uint8_t port, + struct ibv_port_attr *attr) +{ + struct ibv_query_port cmd; + + return ibv_cmd_query_port(context, port, attr, &cmd, sizeof(cmd)); +} + +struct ibv_pd *hns_roce_u_alloc_pd(struct ibv_context *context) +{ + struct ibv_alloc_pd cmd; + struct hns_roce_pd *pd; + struct hns_roce_alloc_pd_resp resp = {}; + + pd = malloc(sizeof(*pd)); + if (!pd) + return NULL; + + if (ibv_cmd_alloc_pd(context, &pd->ibv_pd, &cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp))) { + free(pd); + return NULL; + } + + pd->pdn = resp.pdn; + + return &pd->ibv_pd; +} + +int hns_roce_u_free_pd(struct ibv_pd *pd) +{ + int ret; + + ret = ibv_cmd_dealloc_pd(pd); + if (ret) + return ret; + + free(to_hr_pd(pd)); + + return ret; +} + +struct ibv_mr *hns_roce_u_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + uint64_t hca_va, int access) +{ + int ret; + struct verbs_mr *vmr; + struct ibv_reg_mr cmd; + struct ib_uverbs_reg_mr_resp resp; + + if (!addr) { + fprintf(stderr, "2nd parm addr is NULL!\n"); + return NULL; + } + + if (!length) { + fprintf(stderr, "3st parm length is 0!\n"); + return NULL; + } + + vmr = 
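+	/*
+	 * Typical consumer flow for the PD/MR verbs above (a sketch; error
+	 * handling omitted, "ctx", "buf" and "len" are hypothetical):
+	 *
+	 *	struct ibv_pd *pd = ibv_alloc_pd(ctx);
+	 *	struct ibv_mr *mr = ibv_reg_mr(pd, buf, len,
+	 *				       IBV_ACCESS_LOCAL_WRITE |
+	 *				       IBV_ACCESS_REMOTE_READ);
+	 *	...
+	 *	ibv_dereg_mr(mr);
+	 *	ibv_dealloc_pd(pd);
+	 */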
malloc(sizeof(*vmr)); + if (!vmr) + return NULL; + + ret = ibv_cmd_reg_mr(pd, addr, length, hca_va, access, vmr, &cmd, + sizeof(cmd), &resp, sizeof(resp)); + if (ret) { + free(vmr); + return NULL; + } + + return &vmr->ibv_mr; +} + +int hns_roce_u_rereg_mr(struct verbs_mr *vmr, int flags, struct ibv_pd *pd, + void *addr, size_t length, int access) +{ + struct ibv_rereg_mr cmd; + struct ib_uverbs_rereg_mr_resp resp; + + return ibv_cmd_rereg_mr(vmr, flags, addr, length, (uintptr_t)addr, + access, pd, &cmd, sizeof(cmd), &resp, + sizeof(resp)); +} + +int hns_roce_u_dereg_mr(struct verbs_mr *vmr) +{ + int ret; + + ret = ibv_cmd_dereg_mr(vmr); + if (ret) + return ret; + + free(vmr); + + return ret; +} + +int hns_roce_u_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw, + struct ibv_mw_bind *mw_bind) +{ + struct ibv_mw_bind_info *bind_info = &mw_bind->bind_info; + struct ibv_send_wr *bad_wr = NULL; + struct ibv_send_wr wr = {}; + int ret; + + if (!bind_info->mr && bind_info->length) + return EINVAL; + + if (mw->pd != qp->pd) + return EINVAL; + + if (bind_info->mr && (mw->pd != bind_info->mr->pd)) + return EINVAL; + + if (mw->type != IBV_MW_TYPE_1) + return EINVAL; + + if (bind_info->mw_access_flags & ~(IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC)) + return EINVAL; + + wr.opcode = IBV_WR_BIND_MW; + wr.next = NULL; + + wr.wr_id = mw_bind->wr_id; + wr.send_flags = mw_bind->send_flags; + + wr.bind_mw.mw = mw; + wr.bind_mw.rkey = ibv_inc_rkey(mw->rkey); + wr.bind_mw.bind_info = mw_bind->bind_info; + + ret = hns_roce_u_v2_post_send(qp, &wr, &bad_wr); + if (ret) + return ret; + + mw->rkey = wr.bind_mw.rkey; + + return 0; +} + +struct ibv_mw *hns_roce_u_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type) +{ + struct ibv_mw *mw; + struct ibv_alloc_mw cmd = {}; + struct ib_uverbs_alloc_mw_resp resp = {}; + + mw = malloc(sizeof(*mw)); + if (!mw) + return NULL; + + if (ibv_cmd_alloc_mw(pd, type, mw, &cmd, sizeof(cmd), + &resp, sizeof(resp))) { + free(mw); + return NULL; + } + + return mw; +} + +int hns_roce_u_dealloc_mw(struct ibv_mw *mw) +{ + int ret; + + ret = ibv_cmd_dealloc_mw(mw); + if (ret) + return ret; + + free(mw); + + return 0; +} + +static int align_cq_size(int req) +{ + int nent; + + for (nent = HNS_ROCE_MIN_CQE_NUM; nent < req; nent <<= 1) + ; + + return nent; +} + +/* must check min depth before align */ +static int align_qp_size(int req) +{ + int nent; + + for (nent = HNS_ROCE_V1_MIN_WQE_NUM; nent < req; nent <<= 1) + ; + + return nent; +} + +static uint64_t align_queue_size(uint64_t req) +{ + return roundup_pow_of_two(req); +} + +static int hns_roce_verify_cq(int *cqe, struct hns_roce_context *context) +{ + if (*cqe < 1 || *cqe > context->max_cqe) + return -1; + + if (*cqe < HNS_ROCE_MIN_CQE_NUM) + *cqe = HNS_ROCE_MIN_CQE_NUM; + + return 0; +} + +static int hns_roce_alloc_cq_buf(struct hns_roce_device *dev, + struct hns_roce_buf *buf, int nent) +{ + if (hns_roce_alloc_buf(buf, + align(nent * HNS_ROCE_CQE_ENTRY_SIZE, dev->page_size), + dev->page_size)) + return -1; + + return 0; +} + +struct ibv_cq *hns_roce_u_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector) +{ + struct hns_roce_device *hr_dev = to_hr_dev(context->device); + struct hns_roce_create_cq cmd = {}; + struct hns_roce_create_cq_resp resp = {}; + struct hns_roce_cq *cq; + int ret; + + if (hns_roce_verify_cq(&cqe, to_hr_ctx(context))) + return NULL; + + cq = malloc(sizeof(*cq)); + if (!cq) + return NULL; + + cq->cons_index = 0; + + if 
(pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE)) + goto err; + + if (hr_dev->hw_version == HNS_ROCE_HW_VER1) + cqe = align_cq_size(cqe); + else + cqe = align_queue_size(cqe); + + if (hns_roce_alloc_cq_buf(hr_dev, &cq->buf, cqe)) + goto err; + + cmd.buf_addr = (uintptr_t) cq->buf.buf; + + if (hr_dev->hw_version != HNS_ROCE_HW_VER1) { + cq->set_ci_db = hns_roce_alloc_db(to_hr_ctx(context), + HNS_ROCE_CQ_TYPE_DB); + if (!cq->set_ci_db) + goto err_buf; + + cmd.db_addr = (uintptr_t) cq->set_ci_db; + } + + ret = ibv_cmd_create_cq(context, cqe, channel, comp_vector, + &cq->ibv_cq, &cmd.ibv_cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp)); + if (ret) + goto err_db; + + cq->cqn = resp.cqn; + cq->cq_depth = cqe; + cq->flags = resp.cap_flags; + + if (hr_dev->hw_version == HNS_ROCE_HW_VER1) + cq->set_ci_db = to_hr_ctx(context)->cq_tptr_base + cq->cqn * 2; + + cq->arm_db = cq->set_ci_db; + cq->arm_sn = 1; + *(cq->set_ci_db) = 0; + *(cq->arm_db) = 0; + + return &cq->ibv_cq; + +err_db: + if (hr_dev->hw_version != HNS_ROCE_HW_VER1) + hns_roce_free_db(to_hr_ctx(context), cq->set_ci_db, + HNS_ROCE_CQ_TYPE_DB); + +err_buf: + hns_roce_free_buf(&cq->buf); + +err: + free(cq); + + return NULL; +} + +void hns_roce_u_cq_event(struct ibv_cq *cq) +{ + to_hr_cq(cq)->arm_sn++; +} + +int hns_roce_u_modify_cq(struct ibv_cq *cq, struct ibv_modify_cq_attr *attr) +{ + struct ibv_modify_cq cmd = {}; + + return ibv_cmd_modify_cq(cq, attr, &cmd, sizeof(cmd)); +} + +int hns_roce_u_destroy_cq(struct ibv_cq *cq) +{ + int ret; + + ret = ibv_cmd_destroy_cq(cq); + if (ret) + return ret; + + if (to_hr_dev(cq->context->device)->hw_version != HNS_ROCE_HW_VER1) + hns_roce_free_db(to_hr_ctx(cq->context), + to_hr_cq(cq)->set_ci_db, HNS_ROCE_CQ_TYPE_DB); + hns_roce_free_buf(&to_hr_cq(cq)->buf); + free(to_hr_cq(cq)); + + return ret; +} + +static int hns_roce_create_idx_que(struct ibv_pd *pd, struct hns_roce_srq *srq) +{ + struct hns_roce_idx_que *idx_que = &srq->idx_que; + uint32_t bitmap_num; + int i; + + idx_que->entry_sz = HNS_ROCE_IDX_QUE_ENTRY_SZ; + + /* bits needed in bitmap */ + bitmap_num = align(srq->max_wqe, BIT_CNT_PER_U64); + + idx_que->bitmap = calloc(1, bitmap_num / BIT_CNT_PER_BYTE); + if (!idx_que->bitmap) + return -1; + + /* bitmap_num indicates amount of u64 */ + bitmap_num = bitmap_num / BIT_CNT_PER_U64; + + idx_que->buf_size = srq->max_wqe * idx_que->entry_sz; + if (hns_roce_alloc_buf(&idx_que->buf, idx_que->buf_size, + to_hr_dev(pd->context->device)->page_size)) { + free(idx_que->bitmap); + idx_que->bitmap = NULL; + return -1; + } + + /* init the idx_que bitmap */ + for (i = 0; i < bitmap_num; ++i) + idx_que->bitmap[i] = ~(0UL); + + return 0; +} + +static int hns_roce_alloc_srq_buf(struct ibv_pd *pd, struct ibv_srq_attr *attr, + struct hns_roce_srq *srq) +{ + int srq_buf_size; + int srq_size; + + srq->wrid = calloc(srq->max_wqe, sizeof(unsigned long)); + if (!srq->wrid) + return -1; + + /* srq size */ + srq_size = srq->max_gs * sizeof(struct hns_roce_v2_wqe_data_seg); + + for (srq->wqe_shift = HNS_ROCE_SGE_SHIFT; + 1 << srq->wqe_shift < srq_size; ++srq->wqe_shift) + ; /* nothing */ + + srq_buf_size = srq->max_wqe << srq->wqe_shift; + + /* allocate srq wqe buf */ + if (hns_roce_alloc_buf(&srq->buf, srq_buf_size, + to_hr_dev(pd->context->device)->page_size)) { + free(srq->wrid); + return -1; + } + + srq->head = 0; + srq->tail = srq->max_wqe - 1; + + return 0; +} + +struct ibv_srq *hns_roce_u_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *init_attr) +{ + struct hns_roce_create_srq cmd; + struct 
hns_roce_create_srq_resp resp; + struct hns_roce_srq *srq; + int ret; + + if (init_attr->attr.max_wr > HNS_ROCE_MAX_SRQWQE_NUM || + init_attr->attr.max_sge > HNS_ROCE_MAX_SRQSGE_NUM) + return NULL; + + srq = calloc(1, sizeof(*srq)); + if (!srq) + return NULL; + + if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE)) + goto out; + + srq->max_wqe = align_queue_size(init_attr->attr.max_wr + 1); + srq->max_gs = init_attr->attr.max_sge; + + ret = hns_roce_create_idx_que(pd, srq); + if (ret) { + fprintf(stderr, "hns_roce_create_idx_que failed!\n"); + goto out; + } + + if (hns_roce_alloc_srq_buf(pd, &init_attr->attr, srq)) { + fprintf(stderr, "hns_roce_alloc_srq_buf failed!\n"); + goto err_idx_que; + } + + srq->db = hns_roce_alloc_db(to_hr_ctx(pd->context), + HNS_ROCE_QP_TYPE_DB); + if (!srq->db) + goto err_srq_buf; + + *(srq->db) = 0; + cmd.buf_addr = (uintptr_t)srq->buf.buf; + cmd.que_addr = (uintptr_t)srq->idx_que.buf.buf; + cmd.db_addr = (uintptr_t)srq->db; + + ret = ibv_cmd_create_srq(pd, &srq->verbs_srq.srq, init_attr, + &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, + sizeof(resp)); + if (ret) + goto err_srq_db; + + srq->srqn = resp.srqn; + return &srq->verbs_srq.srq; + +err_srq_db: + hns_roce_free_db(to_hr_ctx(pd->context), srq->db, HNS_ROCE_QP_TYPE_DB); + +err_srq_buf: + free(srq->wrid); + hns_roce_free_buf(&srq->buf); + +err_idx_que: + free(srq->idx_que.bitmap); + hns_roce_free_buf(&srq->idx_que.buf); +out: + free(srq); + return NULL; +} + +int hns_roce_u_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr, + int srq_attr_mask) +{ + struct ibv_modify_srq cmd; + + return ibv_cmd_modify_srq(srq, srq_attr, srq_attr_mask, &cmd, + sizeof(cmd)); +} + +int hns_roce_u_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr) +{ + struct ibv_query_srq cmd; + + return ibv_cmd_query_srq(srq, srq_attr, &cmd, sizeof(cmd)); +} + +int hns_roce_u_destroy_srq(struct ibv_srq *srq) +{ + int ret; + + ret = ibv_cmd_destroy_srq(srq); + if (ret) + return ret; + + hns_roce_free_db(to_hr_ctx(srq->context), to_hr_srq(srq)->db, + HNS_ROCE_QP_TYPE_DB); + hns_roce_free_buf(&to_hr_srq(srq)->buf); + free(to_hr_srq(srq)->wrid); + hns_roce_free_buf(&to_hr_srq(srq)->idx_que.buf); + free(to_hr_srq(srq)->idx_que.bitmap); + free(to_hr_srq(srq)); + + return 0; +} + +static int hns_roce_verify_qp(struct ibv_qp_init_attr *attr, + struct hns_roce_context *context) +{ + struct hns_roce_device *hr_dev = + to_hr_dev(context->ibv_ctx.context.device); + uint32_t min_wqe_num = hr_dev->hw_version == HNS_ROCE_HW_VER1 ? 
+ HNS_ROCE_V1_MIN_WQE_NUM : + HNS_ROCE_V2_MIN_WQE_NUM; + + if (!attr->cap.max_send_wr || + attr->cap.max_send_wr > context->max_qp_wr || + attr->cap.max_recv_wr > context->max_qp_wr || + attr->cap.max_send_sge > context->max_sge || + attr->cap.max_recv_sge > context->max_sge) + return EINVAL; + + if (attr->cap.max_send_wr < min_wqe_num) + attr->cap.max_send_wr = min_wqe_num; + + if (attr->cap.max_recv_wr && attr->cap.max_recv_wr < min_wqe_num) + attr->cap.max_recv_wr = min_wqe_num; + + if (attr->cap.max_recv_sge < 1) + attr->cap.max_recv_sge = 1; + + if ((attr->qp_type != IBV_QPT_RC) && (attr->qp_type != IBV_QPT_UD)) + return EINVAL; + + if ((attr->qp_type == IBV_QPT_RC) && + (attr->cap.max_inline_data > HNS_ROCE_MAX_INLINE_DATA_LEN)) + return EINVAL; + + return 0; +} + +static int hns_roce_alloc_recv_inl_buf(struct ibv_qp_cap *cap, + struct hns_roce_qp *qp) +{ + int i; + + qp->rq_rinl_buf.wqe_list = calloc(qp->rq.wqe_cnt, + sizeof(struct hns_roce_rinl_wqe)); + if (!qp->rq_rinl_buf.wqe_list) + return -1; + + qp->rq_rinl_buf.wqe_cnt = qp->rq.wqe_cnt; + + qp->rq_rinl_buf.wqe_list[0].sg_list = + calloc(qp->rq.wqe_cnt * cap->max_recv_sge, + sizeof(struct hns_roce_rinl_sge)); + if (!qp->rq_rinl_buf.wqe_list[0].sg_list) { + free(qp->rq_rinl_buf.wqe_list); + return -1; + } + + for (i = 0; i < qp->rq_rinl_buf.wqe_cnt; i++) { + int wqe_size = i * cap->max_recv_sge; + + qp->rq_rinl_buf.wqe_list[i].sg_list = + &(qp->rq_rinl_buf.wqe_list[0].sg_list[wqe_size]); + } + + return 0; +} + +static int hns_roce_calc_qp_buff_size(struct ibv_pd *pd, struct ibv_qp_cap *cap, + enum ibv_qp_type type, + struct hns_roce_qp *qp) +{ + int page_size = to_hr_dev(pd->context->device)->page_size; + + if (to_hr_dev(pd->context->device)->hw_version == HNS_ROCE_HW_VER1) { + qp->rq.wqe_shift = hr_ilog32(sizeof(struct hns_roce_rc_rq_wqe)); + + qp->buf_size = align((qp->sq.wqe_cnt << qp->sq.wqe_shift), + page_size) + + (qp->rq.wqe_cnt << qp->rq.wqe_shift); + + if (qp->rq.wqe_shift > qp->sq.wqe_shift) { + qp->rq.offset = 0; + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + } else { + qp->rq.offset = align((qp->sq.wqe_cnt << + qp->sq.wqe_shift), page_size); + qp->sq.offset = 0; + } + } else { + unsigned int rqwqe_size = HNS_ROCE_SGE_SIZE * cap->max_recv_sge; + + qp->rq.wqe_shift = hr_ilog32(rqwqe_size); + + if (qp->sq.max_gs > HNS_ROCE_SGE_IN_WQE || type == IBV_QPT_UD) + qp->ex_sge.sge_shift = HNS_ROCE_SGE_SHIFT; + else + qp->ex_sge.sge_shift = 0; + + /* alloc recv inline buf */ + if (hns_roce_alloc_recv_inl_buf(cap, qp)) + return -1; + + qp->buf_size = align((qp->sq.wqe_cnt << qp->sq.wqe_shift), + page_size) + + align((qp->ex_sge.sge_cnt << + qp->ex_sge.sge_shift), + page_size) + + (qp->rq.wqe_cnt << qp->rq.wqe_shift); + + if (qp->ex_sge.sge_cnt) { + qp->sq.offset = 0; + qp->ex_sge.offset = align((qp->sq.wqe_cnt << + qp->sq.wqe_shift), + page_size); + qp->rq.offset = qp->ex_sge.offset + + align((qp->ex_sge.sge_cnt << + qp->ex_sge.sge_shift), + page_size); + } else { + qp->sq.offset = 0; + qp->ex_sge.offset = 0; + qp->rq.offset = align((qp->sq.wqe_cnt << + qp->sq.wqe_shift), page_size); + } + } + + return 0; +} + +static int hns_roce_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap, + enum ibv_qp_type type, struct hns_roce_qp *qp) +{ + int page_size = to_hr_dev(pd->context->device)->page_size; + + qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof(uint64_t)); + if (!qp->sq.wrid) + return -1; + + if (qp->rq.wqe_cnt) { + qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof(uint64_t)); + if (!qp->rq.wrid) { + free(qp->sq.wrid); + 
return -1; + } + } + + if (hns_roce_calc_qp_buff_size(pd, cap, type, qp)) { + if (qp->rq.wqe_cnt) + free(qp->rq.wrid); + free(qp->sq.wrid); + return -1; + } + + if (hns_roce_alloc_buf(&qp->buf, align(qp->buf_size, page_size), + to_hr_dev(pd->context->device)->page_size)) { + if (qp->rq.wqe_cnt) + free(qp->rq.wrid); + free(qp->sq.wrid); + return -1; + } + + return 0; +} + +static void hns_roce_set_qp_params(struct ibv_pd *pd, + struct ibv_qp_init_attr *attr, + struct hns_roce_qp *qp, + struct hns_roce_context *ctx) +{ + unsigned int sge_ex_count; + + if (to_hr_dev(pd->context->device)->hw_version == HNS_ROCE_HW_VER1) { + qp->sq.wqe_cnt = align_qp_size(attr->cap.max_send_wr); + qp->rq.wqe_cnt = align_qp_size(attr->cap.max_recv_wr); + } else { + qp->sq.wqe_cnt = align_queue_size(attr->cap.max_send_wr); + qp->rq.wqe_cnt = align_queue_size(attr->cap.max_recv_wr); + } + + qp->sq.wqe_shift = hr_ilog32(sizeof(struct hns_roce_rc_send_wqe)); + qp->sq.shift = hr_ilog32(qp->sq.wqe_cnt); + qp->rq.max_gs = attr->cap.max_recv_sge; + + if (to_hr_dev(pd->context->device)->hw_version == HNS_ROCE_HW_VER1) { + qp->sq.max_gs = HNS_ROCE_SGE_IN_WQE; + } else { + qp->sq.max_gs = attr->cap.max_send_sge; + if (qp->sq.max_gs > HNS_ROCE_SGE_IN_WQE) { + sge_ex_count = qp->sq.wqe_cnt * + (qp->sq.max_gs - HNS_ROCE_SGE_IN_WQE); + qp->ex_sge.sge_cnt = align_queue_size(sge_ex_count); + } else { + qp->ex_sge.sge_cnt = 0; + } + } + + /* limit by the context queried during alloc context */ + qp->sq.max_post = min(ctx->max_qp_wr, qp->sq.wqe_cnt); + qp->sq.max_gs = min(ctx->max_sge, qp->sq.max_gs); + + qp->sq_signal_bits = attr->sq_sig_all ? 0 : 1; + qp->max_inline_data = HNS_ROCE_MAX_INLINE_DATA_LEN; + + /* update attr for creating qp */ + attr->cap.max_send_wr = qp->sq.max_post; + attr->cap.max_inline_data = qp->max_inline_data; +} + +static int get_sq_db_addr(struct ibv_pd *pd, struct ibv_qp_init_attr *attr, + struct hns_roce_qp *qp, + struct hns_roce_context *context, + struct hns_roce_create_qp *cmd) +{ + if ((to_hr_dev(pd->context->device)->hw_version != HNS_ROCE_HW_VER1) && + attr->cap.max_send_wr) { + qp->sdb = hns_roce_alloc_db(context, HNS_ROCE_QP_TYPE_DB); + if (!qp->sdb) + return ENOMEM; + + *(qp->sdb) = 0; + cmd->sdb_addr = (uintptr_t)qp->sdb; + } else + cmd->sdb_addr = 0; + + return 0; +} + +static int get_rq_db_addr(struct ibv_pd *pd, struct ibv_qp_init_attr *attr, + struct hns_roce_qp *qp, + struct hns_roce_context *context, + struct hns_roce_create_qp *cmd) +{ + if ((to_hr_dev(pd->context->device)->hw_version != HNS_ROCE_HW_VER1) && + attr->cap.max_recv_sge) { + qp->rdb = hns_roce_alloc_db(context, HNS_ROCE_QP_TYPE_DB); + if (!qp->rdb) + return ENOMEM; + + *(qp->rdb) = 0; + cmd->db_addr = (uintptr_t) qp->rdb; + } else + cmd->db_addr = 0; + + return 0; +} + +static int hns_roce_store_qp(struct hns_roce_context *ctx, uint32_t qpn, + struct hns_roce_qp *qp) +{ + int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift; + + if (!ctx->qp_table[tind].refcnt) { + ctx->qp_table[tind].table = calloc(ctx->qp_table_mask + 1, + sizeof(struct hns_roce_qp *)); + if (!ctx->qp_table[tind].table) + return -1; + } + + ++ctx->qp_table[tind].refcnt; + ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = qp; + + return 0; +} + +struct ibv_qp *hns_roce_u_create_qp(struct ibv_pd *pd, + struct ibv_qp_init_attr *attr) +{ + int ret; + struct hns_roce_qp *qp; + struct hns_roce_create_qp cmd = {}; + struct hns_roce_create_qp_resp resp = {}; + struct hns_roce_context *context = to_hr_ctx(pd->context); + + if (hns_roce_verify_qp(attr, 
context)) {
+		fprintf(stderr, "hns_roce_verify_qp failed!\n");
+		return NULL;
+	}
+
+	qp = malloc(sizeof(*qp));
+	if (!qp) {
+		fprintf(stderr, "malloc failed!\n");
+		return NULL;
+	}
+
+	hns_roce_set_qp_params(pd, attr, qp, context);
+
+	if (hns_roce_alloc_qp_buf(pd, &attr->cap, attr->qp_type, qp)) {
+		fprintf(stderr, "hns_roce_alloc_qp_buf failed!\n");
+		goto err_buf;
+	}
+
+	hns_roce_init_qp_indices(qp);
+
+	if (pthread_spin_init(&qp->sq.lock, PTHREAD_PROCESS_PRIVATE) ||
+	    pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE)) {
+		fprintf(stderr, "pthread_spin_init failed!\n");
+		goto err_free;
+	}
+
+	ret = get_sq_db_addr(pd, attr, qp, context, &cmd);
+	if (ret)
+		goto err_free;
+
+	ret = get_rq_db_addr(pd, attr, qp, context, &cmd);
+	if (ret)
+		goto err_sq_db;
+
+	cmd.buf_addr = (uintptr_t) qp->buf.buf;
+	cmd.log_sq_stride = qp->sq.wqe_shift;
+	cmd.log_sq_bb_count = hr_ilog32(qp->sq.wqe_cnt);
+
+	pthread_mutex_lock(&context->qp_table_mutex);
+
+	ret = ibv_cmd_create_qp(pd, &qp->ibv_qp, attr, &cmd.ibv_cmd,
+				sizeof(cmd), &resp.ibv_resp, sizeof(resp));
+	if (ret) {
+		fprintf(stderr, "ibv_cmd_create_qp failed!\n");
+		goto err_rq_db;
+	}
+
+	ret = hns_roce_store_qp(context, qp->ibv_qp.qp_num, qp);
+	if (ret) {
+		fprintf(stderr, "hns_roce_store_qp failed!\n");
+		goto err_destroy;
+	}
+	pthread_mutex_unlock(&context->qp_table_mutex);
+
+	/* adjust rq maxima to not exceed reported device maxima */
+	attr->cap.max_recv_wr = min(context->max_qp_wr, attr->cap.max_recv_wr);
+	attr->cap.max_recv_sge = min(context->max_sge, attr->cap.max_recv_sge);
+	qp->rq.wqe_cnt = attr->cap.max_recv_wr;
+	qp->rq.max_gs = attr->cap.max_recv_sge;
+	qp->rq.max_post = attr->cap.max_recv_wr;
+
+	qp->flags = resp.cap_flags;
+
+	return &qp->ibv_qp;
+
+err_destroy:
+	ibv_cmd_destroy_qp(&qp->ibv_qp);
+
+err_rq_db:
+	pthread_mutex_unlock(&context->qp_table_mutex);
+	if ((to_hr_dev(pd->context->device)->hw_version != HNS_ROCE_HW_VER1) &&
+	    attr->cap.max_recv_sge)
+		hns_roce_free_db(context, qp->rdb, HNS_ROCE_QP_TYPE_DB);
+
+err_sq_db:
+	if ((to_hr_dev(pd->context->device)->hw_version != HNS_ROCE_HW_VER1) &&
+	    attr->cap.max_send_wr)
+		hns_roce_free_db(context, qp->sdb, HNS_ROCE_QP_TYPE_DB);
+
+err_free:
+	free(qp->sq.wrid);
+	if (qp->rq.wqe_cnt)
+		free(qp->rq.wrid);
+	hns_roce_free_buf(&qp->buf);
+
+err_buf:
+	free(qp);
+
+	return NULL;
+}
+
+int hns_roce_u_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr,
+			int attr_mask, struct ibv_qp_init_attr *init_attr)
+{
+	int ret;
+	struct ibv_query_qp cmd;
+	struct hns_roce_qp *qp = to_hr_qp(ibqp);
+
+	ret = ibv_cmd_query_qp(ibqp, attr, attr_mask, init_attr, &cmd,
+			       sizeof(cmd));
+	if (ret)
+		return ret;
+
+	init_attr->cap.max_send_wr = qp->sq.max_post;
+	init_attr->cap.max_send_sge = qp->sq.max_gs;
+	init_attr->cap.max_inline_data = qp->max_inline_data;
+
+	attr->cap = init_attr->cap;
+
+	return ret;
+}
diff --git a/providers/i40iw/CMakeLists.txt b/providers/i40iw/CMakeLists.txt
new file mode 100644
index 0000000..d8a3a3c
--- /dev/null
+++ b/providers/i40iw/CMakeLists.txt
@@ -0,0 +1,5 @@
+rdma_provider(i40iw
+  i40iw_uk.c
+  i40iw_umain.c
+  i40iw_uverbs.c
+)
diff --git a/providers/i40iw/i40e_devids.h b/providers/i40iw/i40e_devids.h
new file mode 100644
index 0000000..6c09a66
--- /dev/null
+++ b/providers/i40iw/i40e_devids.h
@@ -0,0 +1,72 @@
+/*******************************************************************************
+*
+* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
+*
+* This software is available to you under a choice of one of two
+* licenses. 
You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +* +*******************************************************************************/ + +#ifndef _I40E_DEVIDS_H_ +#define _I40E_DEVIDS_H_ + +/* Vendor ID */ +#define I40E_INTEL_VENDOR_ID 0x8086 + +/* Device IDs */ +#define I40E_DEV_ID_SFP_XL710 0x1572 +#define I40E_DEV_ID_QEMU 0x1574 +#define I40E_DEV_ID_KX_B 0x1580 +#define I40E_DEV_ID_KX_C 0x1581 +#define I40E_DEV_ID_QSFP_A 0x1583 +#define I40E_DEV_ID_QSFP_B 0x1584 +#define I40E_DEV_ID_QSFP_C 0x1585 +#define I40E_DEV_ID_10G_BASE_T 0x1586 +#define I40E_DEV_ID_20G_KR2 0x1587 +#define I40E_DEV_ID_20G_KR2_A 0x1588 +#define I40E_DEV_ID_10G_BASE_T4 0x1589 +#define I40E_DEV_ID_25G_B 0x158A +#define I40E_DEV_ID_25G_SFP28 0x158B +#define I40E_DEV_ID_VF 0x154C +#define I40E_DEV_ID_VF_HV 0x1571 +#define I40E_DEV_ID_X722_A0 0x374C +#define I40E_DEV_ID_X722_A0_VF 0x374D +#define I40E_DEV_ID_KX_X722 0x37CE +#define I40E_DEV_ID_QSFP_X722 0x37CF +#define I40E_DEV_ID_SFP_X722 0x37D0 +#define I40E_DEV_ID_1G_BASE_T_X722 0x37D1 +#define I40E_DEV_ID_10G_BASE_T_X722 0x37D2 +#define I40E_DEV_ID_SFP_I_X722 0x37D3 +#define I40E_DEV_ID_X722_VF 0x37CD +#define I40E_DEV_ID_X722_VF_HV 0x37D9 + +#define i40e_is_40G_device(d) ((d) == I40E_DEV_ID_QSFP_A || \ + (d) == I40E_DEV_ID_QSFP_B || \ + (d) == I40E_DEV_ID_QSFP_C) + +#endif /* _I40E_DEVIDS_H_ */ diff --git a/providers/i40iw/i40iw-abi.h b/providers/i40iw/i40iw-abi.h new file mode 100644 index 0000000..1fee917 --- /dev/null +++ b/providers/i40iw/i40iw-abi.h @@ -0,0 +1,55 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. 
+*
+* - Redistributions in binary form must reproduce the above
+*   copyright notice, this list of conditions and the following
+*   disclaimer in the documentation and/or other materials
+*   provided with the distribution.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*
+*******************************************************************************/
+
+#ifndef PROVIDER_I40IW_ABI_H
+#define PROVIDER_I40IW_ABI_H
+
+#include <infiniband/kern-abi.h>
+#include <rdma/i40iw-abi.h>
+#include <kernel-abi/i40iw-abi.h>
+
+#define I40IW_ABI_VER 5
+
+DECLARE_DRV_CMD(i40iw_ualloc_pd, IB_USER_VERBS_CMD_ALLOC_PD,
+		empty, i40iw_alloc_pd_resp);
+DECLARE_DRV_CMD(i40iw_ucreate_cq, IB_USER_VERBS_CMD_CREATE_CQ,
+		i40iw_create_cq_req, i40iw_create_cq_resp);
+DECLARE_DRV_CMD(i40iw_ucreate_qp, IB_USER_VERBS_CMD_CREATE_QP,
+		i40iw_create_qp_req, i40iw_create_qp_resp);
+DECLARE_DRV_CMD(i40iw_get_context, IB_USER_VERBS_CMD_GET_CONTEXT,
+		i40iw_alloc_ucontext_req, i40iw_alloc_ucontext_resp);
+DECLARE_DRV_CMD(i40iw_ureg_mr, IB_USER_VERBS_CMD_REG_MR,
+		i40iw_mem_reg_req, empty);
+
+#endif /* PROVIDER_I40IW_ABI_H */
diff --git a/providers/i40iw/i40iw_d.h b/providers/i40iw/i40iw_d.h
new file mode 100644
index 0000000..4652dcb
--- /dev/null
+++ b/providers/i40iw/i40iw_d.h
@@ -0,0 +1,1746 @@
+/*******************************************************************************
+*
+* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
+*
+* This software is available to you under a choice of one of two
+* licenses.  You may choose to be licensed under the terms of the GNU
+* General Public License (GPL) Version 2, available from the file
+* COPYING in the main directory of this source tree, or the
+* OpenFabrics.org BSD license below:
+*
+*   Redistribution and use in source and binary forms, with or
+*   without modification, are permitted provided that the following
+*   conditions are met:
+*
+*   - Redistributions of source code must retain the above
+*     copyright notice, this list of conditions and the following
+*     disclaimer.
+*
+*   - Redistributions in binary form must reproduce the above
+*     copyright notice, this list of conditions and the following
+*     disclaimer in the documentation and/or other materials
+*     provided with the distribution.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+* +*******************************************************************************/ + +#ifndef I40IW_D_H +#define I40IW_D_H + +#define I40IW_DB_ADDR_OFFSET (4 * 1024 * 1024 - 64 * 1024) +#define I40IW_VF_DB_ADDR_OFFSET (64 * 1024) + +#define I40IW_PUSH_OFFSET (4 * 1024 * 1024) +#define I40IW_PF_FIRST_PUSH_PAGE_INDEX 16 +#define I40IW_VF_PUSH_OFFSET ((8 + 64) * 1024) +#define I40IW_VF_FIRST_PUSH_PAGE_INDEX 2 + +#define I40IW_PE_DB_SIZE_4M 1 +#define I40IW_PE_DB_SIZE_8M 2 + +#define I40IW_DDP_VER 1 +#define I40IW_RDMAP_VER 1 + +#define I40IW_RDMA_MODE_RDMAC 0 +#define I40IW_RDMA_MODE_IETF 1 + +#define I40IW_QP_STATE_INVALID 0 +#define I40IW_QP_STATE_IDLE 1 +#define I40IW_QP_STATE_RTS 2 +#define I40IW_QP_STATE_CLOSING 3 +#define I40IW_QP_STATE_RESERVED 4 +#define I40IW_QP_STATE_TERMINATE 5 +#define I40IW_QP_STATE_ERROR 6 + +#define I40IW_STAG_STATE_INVALID 0 +#define I40IW_STAG_STATE_VALID 1 + +#define I40IW_STAG_TYPE_SHARED 0 +#define I40IW_STAG_TYPE_NONSHARED 1 + +#define I40IW_MAX_USER_PRIORITY 8 + +#define LS_64_1(val, bits) ((u64)(uintptr_t)val << bits) +#define RS_64_1(val, bits) ((u64)(uintptr_t)val >> bits) +#define LS_32_1(val, bits) (u32)(val << bits) +#define RS_32_1(val, bits) (u32)(val >> bits) +#define I40E_HI_DWORD(x) ((u32)((((x) >> 16) >> 16) & 0xFFFFFFFF)) + +#define LS_64(val, field) (((u64)val << field ## _SHIFT) & (field ## _MASK)) + +#define RS_64(val, field) ((u64)(val & field ## _MASK) >> field ## _SHIFT) +#define LS_32(val, field) ((val << field ## _SHIFT) & (field ## _MASK)) +#define RS_32(val, field) ((val & field ## _MASK) >> field ## _SHIFT) + +#define TERM_DDP_LEN_TAGGED 14 +#define TERM_DDP_LEN_UNTAGGED 18 +#define TERM_RDMA_LEN 28 +#define RDMA_OPCODE_MASK 0x0f +#define RDMA_READ_REQ_OPCODE 1 +#define Q2_BAD_FRAME_OFFSET 72 +#define CQE_MAJOR_DRV 0x8000 + +#define I40IW_TERM_SENT 0x01 +#define I40IW_TERM_RCVD 0x02 +#define I40IW_TERM_DONE 0x04 +#define I40IW_MAC_HLEN 14 +#define I40IW_BYTE_0 0 +#define I40IW_BYTE_8 8 +#define I40IW_BYTE_16 16 +#define I40IW_BYTE_24 24 +#define I40IW_BYTE_32 32 +#define I40IW_BYTE_40 40 +#define I40IW_BYTE_48 48 +#define I40IW_BYTE_56 56 +#define I40IW_BYTE_64 64 +#define I40IW_BYTE_72 72 +#define I40IW_BYTE_80 80 +#define I40IW_BYTE_88 88 +#define I40IW_BYTE_96 96 +#define I40IW_BYTE_104 104 +#define I40IW_BYTE_112 112 +#define I40IW_BYTE_120 120 +#define I40IW_BYTE_128 128 +#define I40IW_BYTE_136 136 +#define I40IW_BYTE_144 144 +#define I40IW_BYTE_152 152 +#define I40IW_BYTE_160 160 +#define I40IW_BYTE_168 168 +#define I40IW_BYTE_176 176 +#define I40IW_BYTE_184 184 +#define I40IW_BYTE_192 192 +#define I40IW_BYTE_200 200 +#define I40IW_BYTE_208 208 + +#define I40IW_INVALID_WQE_INDEX 0xffffffff + +#define I40IW_CQP_WAIT_POLL_REGS 1 +#define I40IW_CQP_WAIT_POLL_CQ 2 +#define I40IW_CQP_WAIT_EVENT 3 + +#define I40IW_CQP_INIT_WQE(wqe) memset(wqe, 0, 64) + +#define I40IW_GET_CURRENT_CQ_ELEMENT(_cq) \ + ( \ + &((_cq)->cq_base[I40IW_RING_GETCURRENT_HEAD((_cq)->cq_ring)]) \ + ) +#define I40IW_GET_CURRENT_EXTENDED_CQ_ELEMENT(_cq) \ + ( \ + &(((struct i40iw_extended_cqe *) \ + ((_cq)->cq_base))[I40IW_RING_GETCURRENT_HEAD((_cq)->cq_ring)]) \ + ) + +#define I40IW_GET_CURRENT_AEQ_ELEMENT(_aeq) \ + ( \ + &_aeq->aeqe_base[I40IW_RING_GETCURRENT_TAIL(_aeq->aeq_ring)] \ + ) + +#define I40IW_GET_CURRENT_CEQ_ELEMENT(_ceq) \ + ( \ + &_ceq->ceqe_base[I40IW_RING_GETCURRENT_TAIL(_ceq->ceq_ring)] \ + ) + +#define I40IW_AE_SOURCE_RQ 0x1 +#define I40IW_AE_SOURCE_RQ_0011 0x3 + +#define I40IW_AE_SOURCE_CQ 0x2 +#define I40IW_AE_SOURCE_CQ_0110 0x6 +#define 
I40IW_AE_SOURCE_CQ_1010 0xA +#define I40IW_AE_SOURCE_CQ_1110 0xE + +#define I40IW_AE_SOURCE_SQ 0x5 +#define I40IW_AE_SOURCE_SQ_0111 0x7 + +#define I40IW_AE_SOURCE_IN_RR_WR 0x9 +#define I40IW_AE_SOURCE_IN_RR_WR_1011 0xB +#define I40IW_AE_SOURCE_OUT_RR 0xD +#define I40IW_AE_SOURCE_OUT_RR_1111 0xF + +#define I40IW_TCP_STATE_NON_EXISTENT 0 +#define I40IW_TCP_STATE_CLOSED 1 +#define I40IW_TCP_STATE_LISTEN 2 +#define I40IW_STATE_SYN_SEND 3 +#define I40IW_TCP_STATE_SYN_RECEIVED 4 +#define I40IW_TCP_STATE_ESTABLISHED 5 +#define I40IW_TCP_STATE_CLOSE_WAIT 6 +#define I40IW_TCP_STATE_FIN_WAIT_1 7 +#define I40IW_TCP_STATE_CLOSING 8 +#define I40IW_TCP_STATE_LAST_ACK 9 +#define I40IW_TCP_STATE_FIN_WAIT_2 10 +#define I40IW_TCP_STATE_TIME_WAIT 11 +#define I40IW_TCP_STATE_RESERVED_1 12 +#define I40IW_TCP_STATE_RESERVED_2 13 +#define I40IW_TCP_STATE_RESERVED_3 14 +#define I40IW_TCP_STATE_RESERVED_4 15 + +/* ILQ CQP hash table fields */ +#define I40IW_CQPSQ_QHASH_VLANID_SHIFT 32 +#define I40IW_CQPSQ_QHASH_VLANID_MASK \ + ((u64)0xfff << I40IW_CQPSQ_QHASH_VLANID_SHIFT) + +#define I40IW_CQPSQ_QHASH_QPN_SHIFT 32 +#define I40IW_CQPSQ_QHASH_QPN_MASK \ + ((u64)0x3ffff << I40IW_CQPSQ_QHASH_QPN_SHIFT) + +#define I40IW_CQPSQ_QHASH_QS_HANDLE_SHIFT 0 +#define I40IW_CQPSQ_QHASH_QS_HANDLE_MASK ((u64)0x3ff << I40IW_CQPSQ_QHASH_QS_HANDLE_SHIFT) + +#define I40IW_CQPSQ_QHASH_SRC_PORT_SHIFT 16 +#define I40IW_CQPSQ_QHASH_SRC_PORT_MASK \ + ((u64)0xffff << I40IW_CQPSQ_QHASH_SRC_PORT_SHIFT) + +#define I40IW_CQPSQ_QHASH_DEST_PORT_SHIFT 0 +#define I40IW_CQPSQ_QHASH_DEST_PORT_MASK \ + ((u64)0xffff << I40IW_CQPSQ_QHASH_DEST_PORT_SHIFT) + +#define I40IW_CQPSQ_QHASH_ADDR0_SHIFT 32 +#define I40IW_CQPSQ_QHASH_ADDR0_MASK \ + ((u64)0xffffffff << I40IW_CQPSQ_QHASH_ADDR0_SHIFT) + +#define I40IW_CQPSQ_QHASH_ADDR1_SHIFT 0 +#define I40IW_CQPSQ_QHASH_ADDR1_MASK \ + ((u64)0xffffffff << I40IW_CQPSQ_QHASH_ADDR1_SHIFT) + +#define I40IW_CQPSQ_QHASH_ADDR2_SHIFT 32 +#define I40IW_CQPSQ_QHASH_ADDR2_MASK \ + ((u64)0xffffffff << I40IW_CQPSQ_QHASH_ADDR2_SHIFT) + +#define I40IW_CQPSQ_QHASH_ADDR3_SHIFT 0 +#define I40IW_CQPSQ_QHASH_ADDR3_MASK \ + ((u64)0xffffffff << I40IW_CQPSQ_QHASH_ADDR3_SHIFT) + +#define I40IW_CQPSQ_QHASH_WQEVALID_SHIFT 63 +#define I40IW_CQPSQ_QHASH_WQEVALID_MASK \ + ((u64)0x1 << I40IW_CQPSQ_QHASH_WQEVALID_SHIFT) +#define I40IW_CQPSQ_QHASH_OPCODE_SHIFT 32 +#define I40IW_CQPSQ_QHASH_OPCODE_MASK \ + ((u64)0x3f << I40IW_CQPSQ_QHASH_OPCODE_SHIFT) + +#define I40IW_CQPSQ_QHASH_MANAGE_SHIFT 61 +#define I40IW_CQPSQ_QHASH_MANAGE_MASK \ + ((u64)0x3 << I40IW_CQPSQ_QHASH_MANAGE_SHIFT) + +#define I40IW_CQPSQ_QHASH_IPV4VALID_SHIFT 60 +#define I40IW_CQPSQ_QHASH_IPV4VALID_MASK \ + ((u64)0x1 << I40IW_CQPSQ_QHASH_IPV4VALID_SHIFT) + +#define I40IW_CQPSQ_QHASH_VLANVALID_SHIFT 59 +#define I40IW_CQPSQ_QHASH_VLANVALID_MASK \ + ((u64)0x1 << I40IW_CQPSQ_QHASH_VLANVALID_SHIFT) + +#define I40IW_CQPSQ_QHASH_ENTRYTYPE_SHIFT 42 +#define I40IW_CQPSQ_QHASH_ENTRYTYPE_MASK \ + ((u64)0x7 << I40IW_CQPSQ_QHASH_ENTRYTYPE_SHIFT) +/* CQP Host Context */ +#define I40IW_CQPHC_EN_DC_TCP_SHIFT 0 +#define I40IW_CQPHC_EN_DC_TCP_MASK (1UL << I40IW_CQPHC_EN_DC_TCP_SHIFT) + +#define I40IW_CQPHC_SQSIZE_SHIFT 8 +#define I40IW_CQPHC_SQSIZE_MASK (0xfUL << I40IW_CQPHC_SQSIZE_SHIFT) + +#define I40IW_CQPHC_DISABLE_PFPDUS_SHIFT 1 +#define I40IW_CQPHC_DISABLE_PFPDUS_MASK (0x1UL << I40IW_CQPHC_DISABLE_PFPDUS_SHIFT) + +#define I40IW_CQPHC_ENABLED_VFS_SHIFT 32 +#define I40IW_CQPHC_ENABLED_VFS_MASK (0x3fULL << I40IW_CQPHC_ENABLED_VFS_SHIFT) + +#define I40IW_CQPHC_HMC_PROFILE_SHIFT 0 +#define 
I40IW_CQPHC_HMC_PROFILE_MASK (0x7ULL << I40IW_CQPHC_HMC_PROFILE_SHIFT) + +#define I40IW_CQPHC_SVER_SHIFT 24 +#define I40IW_CQPHC_SVER_MASK (0xffUL << I40IW_CQPHC_SVER_SHIFT) + +#define I40IW_CQPHC_SQBASE_SHIFT 9 +#define I40IW_CQPHC_SQBASE_MASK \ + (0xfffffffffffffeULL << I40IW_CQPHC_SQBASE_SHIFT) + +#define I40IW_CQPHC_QPCTX_SHIFT 0 +#define I40IW_CQPHC_QPCTX_MASK \ + (0xffffffffffffffffULL << I40IW_CQPHC_QPCTX_SHIFT) +#define I40IW_CQPHC_SVER 1 + +#define I40IW_CQP_SW_SQSIZE_4 4 +#define I40IW_CQP_SW_SQSIZE_2048 2048 + +/* iWARP QP Doorbell shadow area */ +#define I40IW_QP_DBSA_HW_SQ_TAIL_SHIFT 0 +#define I40IW_QP_DBSA_HW_SQ_TAIL_MASK \ + (0x3fffUL << I40IW_QP_DBSA_HW_SQ_TAIL_SHIFT) + +/* Completion Queue Doorbell shadow area */ +#define I40IW_CQ_DBSA_CQEIDX_SHIFT 0 +#define I40IW_CQ_DBSA_CQEIDX_MASK (0xfffffUL << I40IW_CQ_DBSA_CQEIDX_SHIFT) + +#define I40IW_CQ_DBSA_SW_CQ_SELECT_SHIFT 0 +#define I40IW_CQ_DBSA_SW_CQ_SELECT_MASK \ + (0x3fffUL << I40IW_CQ_DBSA_SW_CQ_SELECT_SHIFT) + +#define I40IW_CQ_DBSA_ARM_NEXT_SHIFT 14 +#define I40IW_CQ_DBSA_ARM_NEXT_MASK (1UL << I40IW_CQ_DBSA_ARM_NEXT_SHIFT) + +#define I40IW_CQ_DBSA_ARM_NEXT_SE_SHIFT 15 +#define I40IW_CQ_DBSA_ARM_NEXT_SE_MASK (1UL << I40IW_CQ_DBSA_ARM_NEXT_SE_SHIFT) + +#define I40IW_CQ_DBSA_ARM_SEQ_NUM_SHIFT 16 +#define I40IW_CQ_DBSA_ARM_SEQ_NUM_MASK \ + (0x3UL << I40IW_CQ_DBSA_ARM_SEQ_NUM_SHIFT) + +/* CQP and iWARP Completion Queue */ +#define I40IW_CQ_QPCTX_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IW_CQ_QPCTX_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IW_CCQ_OPRETVAL_SHIFT 0 +#define I40IW_CCQ_OPRETVAL_MASK (0xffffffffUL << I40IW_CCQ_OPRETVAL_SHIFT) + +#define I40IW_CQ_MINERR_SHIFT 0 +#define I40IW_CQ_MINERR_MASK (0xffffUL << I40IW_CQ_MINERR_SHIFT) + +#define I40IW_CQ_MAJERR_SHIFT 16 +#define I40IW_CQ_MAJERR_MASK (0xffffUL << I40IW_CQ_MAJERR_SHIFT) + +#define I40IW_CQ_WQEIDX_SHIFT 32 +#define I40IW_CQ_WQEIDX_MASK (0x3fffULL << I40IW_CQ_WQEIDX_SHIFT) + +#define I40IW_CQ_ERROR_SHIFT 55 +#define I40IW_CQ_ERROR_MASK (1ULL << I40IW_CQ_ERROR_SHIFT) + +#define I40IW_CQ_SQ_SHIFT 62 +#define I40IW_CQ_SQ_MASK (1ULL << I40IW_CQ_SQ_SHIFT) + +#define I40IW_CQ_VALID_SHIFT 63 +#define I40IW_CQ_VALID_MASK (1ULL << I40IW_CQ_VALID_SHIFT) + +#define I40IWCQ_PAYLDLEN_SHIFT 0 +#define I40IWCQ_PAYLDLEN_MASK (0xffffffffUL << I40IWCQ_PAYLDLEN_SHIFT) + +#define I40IWCQ_TCPSEQNUM_SHIFT 32 +#define I40IWCQ_TCPSEQNUM_MASK (0xffffffffULL << I40IWCQ_TCPSEQNUM_SHIFT) + +#define I40IWCQ_INVSTAG_SHIFT 0 +#define I40IWCQ_INVSTAG_MASK (0xffffffffUL << I40IWCQ_INVSTAG_SHIFT) + +#define I40IWCQ_QPID_SHIFT 32 +#define I40IWCQ_QPID_MASK (0x3ffffULL << I40IWCQ_QPID_SHIFT) + +#define I40IWCQ_PSHDROP_SHIFT 51 +#define I40IWCQ_PSHDROP_MASK (1ULL << I40IWCQ_PSHDROP_SHIFT) + +#define I40IWCQ_SRQ_SHIFT 52 +#define I40IWCQ_SRQ_MASK (1ULL << I40IWCQ_SRQ_SHIFT) + +#define I40IWCQ_STAG_SHIFT 53 +#define I40IWCQ_STAG_MASK (1ULL << I40IWCQ_STAG_SHIFT) + +#define I40IWCQ_SOEVENT_SHIFT 54 +#define I40IWCQ_SOEVENT_MASK (1ULL << I40IWCQ_SOEVENT_SHIFT) + +#define I40IWCQ_OP_SHIFT 56 +#define I40IWCQ_OP_MASK (0x3fULL << I40IWCQ_OP_SHIFT) + +/* CEQE format */ +#define I40IW_CEQE_CQCTX_SHIFT 0 +#define I40IW_CEQE_CQCTX_MASK \ + (0x7fffffffffffffffULL << I40IW_CEQE_CQCTX_SHIFT) + +#define I40IW_CEQE_VALID_SHIFT 63 +#define I40IW_CEQE_VALID_MASK (1ULL << I40IW_CEQE_VALID_SHIFT) + +/* AEQE format */ +#define I40IW_AEQE_COMPCTX_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IW_AEQE_COMPCTX_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IW_AEQE_QPCQID_SHIFT 0 +#define I40IW_AEQE_QPCQID_MASK (0x3ffffUL << 
I40IW_AEQE_QPCQID_SHIFT) + +#define I40IW_AEQE_WQDESCIDX_SHIFT 18 +#define I40IW_AEQE_WQDESCIDX_MASK (0x3fffULL << I40IW_AEQE_WQDESCIDX_SHIFT) + +#define I40IW_AEQE_OVERFLOW_SHIFT 33 +#define I40IW_AEQE_OVERFLOW_MASK (1ULL << I40IW_AEQE_OVERFLOW_SHIFT) + +#define I40IW_AEQE_AECODE_SHIFT 34 +#define I40IW_AEQE_AECODE_MASK (0xffffULL << I40IW_AEQE_AECODE_SHIFT) + +#define I40IW_AEQE_AESRC_SHIFT 50 +#define I40IW_AEQE_AESRC_MASK (0xfULL << I40IW_AEQE_AESRC_SHIFT) + +#define I40IW_AEQE_IWSTATE_SHIFT 54 +#define I40IW_AEQE_IWSTATE_MASK (0x7ULL << I40IW_AEQE_IWSTATE_SHIFT) + +#define I40IW_AEQE_TCPSTATE_SHIFT 57 +#define I40IW_AEQE_TCPSTATE_MASK (0xfULL << I40IW_AEQE_TCPSTATE_SHIFT) + +#define I40IW_AEQE_Q2DATA_SHIFT 61 +#define I40IW_AEQE_Q2DATA_MASK (0x3ULL << I40IW_AEQE_Q2DATA_SHIFT) + +#define I40IW_AEQE_VALID_SHIFT 63 +#define I40IW_AEQE_VALID_MASK (1ULL << I40IW_AEQE_VALID_SHIFT) + +/* CQP SQ WQES */ +#define I40IW_QP_TYPE_IWARP 1 +#define I40IW_QP_TYPE_UDA 2 +#define I40IW_QP_TYPE_CQP 4 + +#define I40IW_CQ_TYPE_IWARP 1 +#define I40IW_CQ_TYPE_ILQ 2 +#define I40IW_CQ_TYPE_IEQ 3 +#define I40IW_CQ_TYPE_CQP 4 + +#define I40IWQP_TERM_SEND_TERM_AND_FIN 0 +#define I40IWQP_TERM_SEND_TERM_ONLY 1 +#define I40IWQP_TERM_SEND_FIN_ONLY 2 +#define I40IWQP_TERM_DONOT_SEND_TERM_OR_FIN 3 + +#define I40IW_CQP_OP_CREATE_QP 0 +#define I40IW_CQP_OP_MODIFY_QP 0x1 +#define I40IW_CQP_OP_DESTROY_QP 0x02 +#define I40IW_CQP_OP_CREATE_CQ 0x03 +#define I40IW_CQP_OP_MODIFY_CQ 0x04 +#define I40IW_CQP_OP_DESTROY_CQ 0x05 +#define I40IW_CQP_OP_CREATE_SRQ 0x06 +#define I40IW_CQP_OP_MODIFY_SRQ 0x07 +#define I40IW_CQP_OP_DESTROY_SRQ 0x08 +#define I40IW_CQP_OP_ALLOC_STAG 0x09 +#define I40IW_CQP_OP_REG_MR 0x0a +#define I40IW_CQP_OP_QUERY_STAG 0x0b +#define I40IW_CQP_OP_REG_SMR 0x0c +#define I40IW_CQP_OP_DEALLOC_STAG 0x0d +#define I40IW_CQP_OP_MANAGE_LOC_MAC_IP_TABLE 0x0e +#define I40IW_CQP_OP_MANAGE_ARP 0x0f +#define I40IW_CQP_OP_MANAGE_VF_PBLE_BP 0x10 +#define I40IW_CQP_OP_MANAGE_PUSH_PAGES 0x11 +#define I40IW_CQP_OP_MANAGE_PE_TEAM 0x12 +#define I40IW_CQP_OP_UPLOAD_CONTEXT 0x13 +#define I40IW_CQP_OP_ALLOCATE_LOC_MAC_IP_TABLE_ENTRY 0x14 +#define I40IW_CQP_OP_MANAGE_HMC_PM_FUNC_TABLE 0x15 +#define I40IW_CQP_OP_CREATE_CEQ 0x16 +#define I40IW_CQP_OP_DESTROY_CEQ 0x18 +#define I40IW_CQP_OP_CREATE_AEQ 0x19 +#define I40IW_CQP_OP_DESTROY_AEQ 0x1b +#define I40IW_CQP_OP_CREATE_ADDR_VECT 0x1c +#define I40IW_CQP_OP_MODIFY_ADDR_VECT 0x1d +#define I40IW_CQP_OP_DESTROY_ADDR_VECT 0x1e +#define I40IW_CQP_OP_UPDATE_PE_SDS 0x1f +#define I40IW_CQP_OP_QUERY_FPM_VALUES 0x20 +#define I40IW_CQP_OP_COMMIT_FPM_VALUES 0x21 +#define I40IW_CQP_OP_FLUSH_WQES 0x22 +#define I40IW_CQP_OP_MANAGE_APBVT 0x23 +#define I40IW_CQP_OP_NOP 0x24 +#define I40IW_CQP_OP_MANAGE_QUAD_HASH_TABLE_ENTRY 0x25 +#define I40IW_CQP_OP_CREATE_UDA_MCAST_GROUP 0x26 +#define I40IW_CQP_OP_MODIFY_UDA_MCAST_GROUP 0x27 +#define I40IW_CQP_OP_DESTROY_UDA_MCAST_GROUP 0x28 +#define I40IW_CQP_OP_SUSPEND_QP 0x29 +#define I40IW_CQP_OP_RESUME_QP 0x2a +#define I40IW_CQP_OP_SHMC_PAGES_ALLOCATED 0x2b +#define I40IW_CQP_OP_SET_HMC_RESOURCE_PROFILE 0x2d + +#define I40IW_UDA_QPSQ_NEXT_HEADER_SHIFT 16 +#define I40IW_UDA_QPSQ_NEXT_HEADER_MASK ((u64)0xff << I40IW_UDA_QPSQ_NEXT_HEADER_SHIFT) + +#define I40IW_UDA_QPSQ_OPCODE_SHIFT 32 +#define I40IW_UDA_QPSQ_OPCODE_MASK ((u64)0x3f << I40IW_UDA_QPSQ_OPCODE_SHIFT) + +#define I40IW_UDA_QPSQ_MACLEN_SHIFT 56 +#define I40IW_UDA_QPSQ_MACLEN_MASK \ + ((u64)0x7f << I40IW_UDA_QPSQ_MACLEN_SHIFT) + +#define I40IW_UDA_QPSQ_IPLEN_SHIFT 48 +#define I40IW_UDA_QPSQ_IPLEN_MASK \ 
+ ((u64)0x7f << I40IW_UDA_QPSQ_IPLEN_SHIFT) + +#define I40IW_UDA_QPSQ_L4T_SHIFT 30 +#define I40IW_UDA_QPSQ_L4T_MASK \ + ((u64)0x3 << I40IW_UDA_QPSQ_L4T_SHIFT) + +#define I40IW_UDA_QPSQ_IIPT_SHIFT 28 +#define I40IW_UDA_QPSQ_IIPT_MASK \ + ((u64)0x3 << I40IW_UDA_QPSQ_IIPT_SHIFT) + +#define I40IW_UDA_QPSQ_L4LEN_SHIFT 24 +#define I40IW_UDA_QPSQ_L4LEN_MASK ((u64)0xf << I40IW_UDA_QPSQ_L4LEN_SHIFT) + +#define I40IW_UDA_QPSQ_AVIDX_SHIFT 0 +#define I40IW_UDA_QPSQ_AVIDX_MASK ((u64)0xffff << I40IW_UDA_QPSQ_AVIDX_SHIFT) + +#define I40IW_UDA_QPSQ_VALID_SHIFT 63 +#define I40IW_UDA_QPSQ_VALID_MASK \ + ((u64)0x1 << I40IW_UDA_QPSQ_VALID_SHIFT) + +#define I40IW_UDA_QPSQ_SIGCOMPL_SHIFT 62 +#define I40IW_UDA_QPSQ_SIGCOMPL_MASK ((u64)0x1 << I40IW_UDA_QPSQ_SIGCOMPL_SHIFT) + +#define I40IW_UDA_PAYLOADLEN_SHIFT 0 +#define I40IW_UDA_PAYLOADLEN_MASK ((u64)0x3fff << I40IW_UDA_PAYLOADLEN_SHIFT) + +#define I40IW_UDA_HDRLEN_SHIFT 16 +#define I40IW_UDA_HDRLEN_MASK ((u64)0x1ff << I40IW_UDA_HDRLEN_SHIFT) + +#define I40IW_VLAN_TAG_VALID_SHIFT 50 +#define I40IW_VLAN_TAG_VALID_MASK ((u64)0x1 << I40IW_VLAN_TAG_VALID_SHIFT) + +#define I40IW_UDA_L3PROTO_SHIFT 0 +#define I40IW_UDA_L3PROTO_MASK ((u64)0x3 << I40IW_UDA_L3PROTO_SHIFT) + +#define I40IW_UDA_L4PROTO_SHIFT 16 +#define I40IW_UDA_L4PROTO_MASK ((u64)0x3 << I40IW_UDA_L4PROTO_SHIFT) + +#define I40IW_UDA_QPSQ_DOLOOPBACK_SHIFT 44 +#define I40IW_UDA_QPSQ_DOLOOPBACK_MASK \ + ((u64)0x1 << I40IW_UDA_QPSQ_DOLOOPBACK_SHIFT) + +/* CQP SQ WQE common fields */ +#define I40IW_CQPSQ_OPCODE_SHIFT 32 +#define I40IW_CQPSQ_OPCODE_MASK (0x3fULL << I40IW_CQPSQ_OPCODE_SHIFT) + +#define I40IW_CQPSQ_WQEVALID_SHIFT 63 +#define I40IW_CQPSQ_WQEVALID_MASK (1ULL << I40IW_CQPSQ_WQEVALID_SHIFT) + +#define I40IW_CQPSQ_TPHVAL_SHIFT 0 +#define I40IW_CQPSQ_TPHVAL_MASK (0xffUL << I40IW_CQPSQ_TPHVAL_SHIFT) + +#define I40IW_CQPSQ_TPHEN_SHIFT 60 +#define I40IW_CQPSQ_TPHEN_MASK (1ULL << I40IW_CQPSQ_TPHEN_SHIFT) + +#define I40IW_CQPSQ_PBUFADDR_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IW_CQPSQ_PBUFADDR_MASK I40IW_CQPHC_QPCTX_MASK + +/* Create/Modify/Destroy QP */ + +#define I40IW_CQPSQ_QP_NEWMSS_SHIFT 32 +#define I40IW_CQPSQ_QP_NEWMSS_MASK (0x3fffULL << I40IW_CQPSQ_QP_NEWMSS_SHIFT) + +#define I40IW_CQPSQ_QP_TERMLEN_SHIFT 48 +#define I40IW_CQPSQ_QP_TERMLEN_MASK (0xfULL << I40IW_CQPSQ_QP_TERMLEN_SHIFT) + +#define I40IW_CQPSQ_QP_QPCTX_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IW_CQPSQ_QP_QPCTX_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IW_CQPSQ_QP_QPID_SHIFT 0 +#define I40IW_CQPSQ_QP_QPID_MASK (0x3FFFFUL) +/* I40IWCQ_QPID_MASK */ + +#define I40IW_CQPSQ_QP_OP_SHIFT 32 +#define I40IW_CQPSQ_QP_OP_MASK I40IWCQ_OP_MASK + +#define I40IW_CQPSQ_QP_ORDVALID_SHIFT 42 +#define I40IW_CQPSQ_QP_ORDVALID_MASK (1ULL << I40IW_CQPSQ_QP_ORDVALID_SHIFT) + +#define I40IW_CQPSQ_QP_TOECTXVALID_SHIFT 43 +#define I40IW_CQPSQ_QP_TOECTXVALID_MASK \ + (1ULL << I40IW_CQPSQ_QP_TOECTXVALID_SHIFT) + +#define I40IW_CQPSQ_QP_CACHEDVARVALID_SHIFT 44 +#define I40IW_CQPSQ_QP_CACHEDVARVALID_MASK \ + (1ULL << I40IW_CQPSQ_QP_CACHEDVARVALID_SHIFT) + +#define I40IW_CQPSQ_QP_VQ_SHIFT 45 +#define I40IW_CQPSQ_QP_VQ_MASK (1ULL << I40IW_CQPSQ_QP_VQ_SHIFT) + +#define I40IW_CQPSQ_QP_FORCELOOPBACK_SHIFT 46 +#define I40IW_CQPSQ_QP_FORCELOOPBACK_MASK \ + (1ULL << I40IW_CQPSQ_QP_FORCELOOPBACK_SHIFT) + +#define I40IW_CQPSQ_QP_CQNUMVALID_SHIFT 47 +#define I40IW_CQPSQ_QP_CQNUMVALID_MASK \ + (1ULL << I40IW_CQPSQ_QP_CQNUMVALID_SHIFT) + +#define I40IW_CQPSQ_QP_QPTYPE_SHIFT 48 +#define I40IW_CQPSQ_QP_QPTYPE_MASK (0x3ULL << I40IW_CQPSQ_QP_QPTYPE_SHIFT) + +#define 
I40IW_CQPSQ_QP_MSSCHANGE_SHIFT 52
+#define I40IW_CQPSQ_QP_MSSCHANGE_MASK (1ULL << I40IW_CQPSQ_QP_MSSCHANGE_SHIFT)
+
+#define I40IW_CQPSQ_QP_STATRSRC_SHIFT 53
+#define I40IW_CQPSQ_QP_STATRSRC_MASK (1ULL << I40IW_CQPSQ_QP_STATRSRC_SHIFT)
+
+#define I40IW_CQPSQ_QP_IGNOREMWBOUND_SHIFT 54
+#define I40IW_CQPSQ_QP_IGNOREMWBOUND_MASK \
+	(1ULL << I40IW_CQPSQ_QP_IGNOREMWBOUND_SHIFT)
+
+#define I40IW_CQPSQ_QP_REMOVEHASHENTRY_SHIFT 55
+#define I40IW_CQPSQ_QP_REMOVEHASHENTRY_MASK \
+	(1ULL << I40IW_CQPSQ_QP_REMOVEHASHENTRY_SHIFT)
+
+#define I40IW_CQPSQ_QP_TERMACT_SHIFT 56
+#define I40IW_CQPSQ_QP_TERMACT_MASK (0x3ULL << I40IW_CQPSQ_QP_TERMACT_SHIFT)
+
+#define I40IW_CQPSQ_QP_RESETCON_SHIFT 58
+#define I40IW_CQPSQ_QP_RESETCON_MASK (1ULL << I40IW_CQPSQ_QP_RESETCON_SHIFT)
+
+#define I40IW_CQPSQ_QP_ARPTABIDXVALID_SHIFT 59
+#define I40IW_CQPSQ_QP_ARPTABIDXVALID_MASK \
+	(1ULL << I40IW_CQPSQ_QP_ARPTABIDXVALID_SHIFT)
+
+#define I40IW_CQPSQ_QP_NEXTIWSTATE_SHIFT 60
+#define I40IW_CQPSQ_QP_NEXTIWSTATE_MASK \
+	(0x7ULL << I40IW_CQPSQ_QP_NEXTIWSTATE_SHIFT)
+
+#define I40IW_CQPSQ_QP_DBSHADOWADDR_SHIFT I40IW_CQPHC_QPCTX_SHIFT
+#define I40IW_CQPSQ_QP_DBSHADOWADDR_MASK I40IW_CQPHC_QPCTX_MASK
+
+/* Create/Modify/Destroy CQ */
+#define I40IW_CQPSQ_CQ_CQSIZE_SHIFT 0
+#define I40IW_CQPSQ_CQ_CQSIZE_MASK (0x3ffffUL << I40IW_CQPSQ_CQ_CQSIZE_SHIFT)
+
+#define I40IW_CQPSQ_CQ_CQCTX_SHIFT 0
+#define I40IW_CQPSQ_CQ_CQCTX_MASK \
+	(0x7fffffffffffffffULL << I40IW_CQPSQ_CQ_CQCTX_SHIFT)
+
+#define I40IW_CQPSQ_CQ_SHADOW_READ_THRESHOLD_SHIFT 0
+#define I40IW_CQPSQ_CQ_SHADOW_READ_THRESHOLD_MASK \
+	(0x3ffff << I40IW_CQPSQ_CQ_SHADOW_READ_THRESHOLD_SHIFT)
+
+#define I40IW_CQPSQ_CQ_CEQID_SHIFT 24
+#define I40IW_CQPSQ_CQ_CEQID_MASK (0x7fUL << I40IW_CQPSQ_CQ_CEQID_SHIFT)
+
+#define I40IW_CQPSQ_CQ_OP_SHIFT 32
+#define I40IW_CQPSQ_CQ_OP_MASK (0x3fULL << I40IW_CQPSQ_CQ_OP_SHIFT)
+
+#define I40IW_CQPSQ_CQ_CQRESIZE_SHIFT 43
+#define I40IW_CQPSQ_CQ_CQRESIZE_MASK (1ULL << I40IW_CQPSQ_CQ_CQRESIZE_SHIFT)
+
+#define I40IW_CQPSQ_CQ_LPBLSIZE_SHIFT 44
+#define I40IW_CQPSQ_CQ_LPBLSIZE_MASK (3ULL << I40IW_CQPSQ_CQ_LPBLSIZE_SHIFT)
+
+#define I40IW_CQPSQ_CQ_CHKOVERFLOW_SHIFT 46
+#define I40IW_CQPSQ_CQ_CHKOVERFLOW_MASK \
+	(1ULL << I40IW_CQPSQ_CQ_CHKOVERFLOW_SHIFT)
+
+#define I40IW_CQPSQ_CQ_VIRTMAP_SHIFT 47
+#define I40IW_CQPSQ_CQ_VIRTMAP_MASK (1ULL << I40IW_CQPSQ_CQ_VIRTMAP_SHIFT)
+
+#define I40IW_CQPSQ_CQ_ENCEQEMASK_SHIFT 48
+#define I40IW_CQPSQ_CQ_ENCEQEMASK_MASK \
+	(1ULL << I40IW_CQPSQ_CQ_ENCEQEMASK_SHIFT)
+
+#define I40IW_CQPSQ_CQ_CEQIDVALID_SHIFT 49
+#define I40IW_CQPSQ_CQ_CEQIDVALID_MASK \
+	(1ULL << I40IW_CQPSQ_CQ_CEQIDVALID_SHIFT)
+
+#define I40IW_CQPSQ_CQ_AVOIDMEMCNFLCT_SHIFT 61
+#define I40IW_CQPSQ_CQ_AVOIDMEMCNFLCT_MASK \
+	(1ULL << I40IW_CQPSQ_CQ_AVOIDMEMCNFLCT_SHIFT)
+
+/* Create/Modify/Destroy Shared Receive Queue */
+
+#define I40IW_CQPSQ_SRQ_RQSIZE_SHIFT 0
+#define I40IW_CQPSQ_SRQ_RQSIZE_MASK (0xfUL << I40IW_CQPSQ_SRQ_RQSIZE_SHIFT)
+
+#define I40IW_CQPSQ_SRQ_RQWQESIZE_SHIFT 4
+#define I40IW_CQPSQ_SRQ_RQWQESIZE_MASK \
+	(0x7UL << I40IW_CQPSQ_SRQ_RQWQESIZE_SHIFT)
+
+#define I40IW_CQPSQ_SRQ_SRQLIMIT_SHIFT 32
+#define I40IW_CQPSQ_SRQ_SRQLIMIT_MASK \
+	(0xfffULL << I40IW_CQPSQ_SRQ_SRQLIMIT_SHIFT)
+
+#define I40IW_CQPSQ_SRQ_SRQCTX_SHIFT I40IW_CQPHC_QPCTX_SHIFT
+#define I40IW_CQPSQ_SRQ_SRQCTX_MASK I40IW_CQPHC_QPCTX_MASK
+
+#define I40IW_CQPSQ_SRQ_PDID_SHIFT 16
+#define I40IW_CQPSQ_SRQ_PDID_MASK \
+	(0x7fffULL << 
I40IW_CQPSQ_SRQ_PDID_SHIFT) + +#define I40IW_CQPSQ_SRQ_SRQID_SHIFT 0 +#define I40IW_CQPSQ_SRQ_SRQID_MASK (0x7fffUL << I40IW_CQPSQ_SRQ_SRQID_SHIFT) + +#define I40IW_CQPSQ_SRQ_LPBLSIZE_SHIFT I40IW_CQPSQ_CQ_LPBLSIZE_SHIFT +#define I40IW_CQPSQ_SRQ_LPBLSIZE_MASK I40IW_CQPSQ_CQ_LPBLSIZE_MASK + +#define I40IW_CQPSQ_SRQ_VIRTMAP_SHIFT I40IW_CQPSQ_CQ_VIRTMAP_SHIFT +#define I40IW_CQPSQ_SRQ_VIRTMAP_MASK I40IW_CQPSQ_CQ_VIRTMAP_MASK + +#define I40IW_CQPSQ_SRQ_TPHEN_SHIFT I40IW_CQPSQ_TPHEN_SHIFT +#define I40IW_CQPSQ_SRQ_TPHEN_MASK I40IW_CQPSQ_TPHEN_MASK + +#define I40IW_CQPSQ_SRQ_ARMLIMITEVENT_SHIFT 61 +#define I40IW_CQPSQ_SRQ_ARMLIMITEVENT_MASK \ + (1ULL << I40IW_CQPSQ_SRQ_ARMLIMITEVENT_SHIFT) + +#define I40IW_CQPSQ_SRQ_DBSHADOWAREA_SHIFT 6 +#define I40IW_CQPSQ_SRQ_DBSHADOWAREA_MASK \ + (0x3ffffffffffffffULL << I40IW_CQPSQ_SRQ_DBSHADOWAREA_SHIFT) + +#define I40IW_CQPSQ_SRQ_FIRSTPMPBLIDX_SHIFT 0 +#define I40IW_CQPSQ_SRQ_FIRSTPMPBLIDX_MASK \ + (0xfffffffUL << I40IW_CQPSQ_SRQ_FIRSTPMPBLIDX_SHIFT) + +/* Allocate/Register/Register Shared/Deallocate Stag */ +#define I40IW_CQPSQ_STAG_VA_FBO_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IW_CQPSQ_STAG_VA_FBO_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IW_CQPSQ_STAG_STAGLEN_SHIFT 0 +#define I40IW_CQPSQ_STAG_STAGLEN_MASK \ + (0x3fffffffffffULL << I40IW_CQPSQ_STAG_STAGLEN_SHIFT) + +#define I40IW_CQPSQ_STAG_PDID_SHIFT 48 +#define I40IW_CQPSQ_STAG_PDID_MASK (0x7fffULL << I40IW_CQPSQ_STAG_PDID_SHIFT) + +#define I40IW_CQPSQ_STAG_KEY_SHIFT 0 +#define I40IW_CQPSQ_STAG_KEY_MASK (0xffUL << I40IW_CQPSQ_STAG_KEY_SHIFT) + +#define I40IW_CQPSQ_STAG_IDX_SHIFT 8 +#define I40IW_CQPSQ_STAG_IDX_MASK (0xffffffUL << I40IW_CQPSQ_STAG_IDX_SHIFT) + +#define I40IW_CQPSQ_STAG_PARENTSTAGIDX_SHIFT 32 +#define I40IW_CQPSQ_STAG_PARENTSTAGIDX_MASK \ + (0xffffffULL << I40IW_CQPSQ_STAG_PARENTSTAGIDX_SHIFT) + +#define I40IW_CQPSQ_STAG_MR_SHIFT 43 +#define I40IW_CQPSQ_STAG_MR_MASK (1ULL << I40IW_CQPSQ_STAG_MR_SHIFT) + +#define I40IW_CQPSQ_STAG_LPBLSIZE_SHIFT I40IW_CQPSQ_CQ_LPBLSIZE_SHIFT +#define I40IW_CQPSQ_STAG_LPBLSIZE_MASK I40IW_CQPSQ_CQ_LPBLSIZE_MASK + +#define I40IW_CQPSQ_STAG_HPAGESIZE_SHIFT 46 +#define I40IW_CQPSQ_STAG_HPAGESIZE_MASK \ + (1ULL << I40IW_CQPSQ_STAG_HPAGESIZE_SHIFT) + +#define I40IW_CQPSQ_STAG_ARIGHTS_SHIFT 48 +#define I40IW_CQPSQ_STAG_ARIGHTS_MASK \ + (0x1fULL << I40IW_CQPSQ_STAG_ARIGHTS_SHIFT) + +#define I40IW_CQPSQ_STAG_REMACCENABLED_SHIFT 53 +#define I40IW_CQPSQ_STAG_REMACCENABLED_MASK \ + (1ULL << I40IW_CQPSQ_STAG_REMACCENABLED_SHIFT) + +#define I40IW_CQPSQ_STAG_VABASEDTO_SHIFT 59 +#define I40IW_CQPSQ_STAG_VABASEDTO_MASK \ + (1ULL << I40IW_CQPSQ_STAG_VABASEDTO_SHIFT) + +#define I40IW_CQPSQ_STAG_USEHMCFNIDX_SHIFT 60 +#define I40IW_CQPSQ_STAG_USEHMCFNIDX_MASK \ + (1ULL << I40IW_CQPSQ_STAG_USEHMCFNIDX_SHIFT) + +#define I40IW_CQPSQ_STAG_USEPFRID_SHIFT 61 +#define I40IW_CQPSQ_STAG_USEPFRID_MASK \ + (1ULL << I40IW_CQPSQ_STAG_USEPFRID_SHIFT) + +#define I40IW_CQPSQ_STAG_PBA_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IW_CQPSQ_STAG_PBA_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IW_CQPSQ_STAG_HMCFNIDX_SHIFT 0 +#define I40IW_CQPSQ_STAG_HMCFNIDX_MASK \ + (0x3fUL << I40IW_CQPSQ_STAG_HMCFNIDX_SHIFT) + +#define I40IW_CQPSQ_STAG_FIRSTPMPBLIDX_SHIFT 0 +#define I40IW_CQPSQ_STAG_FIRSTPMPBLIDX_MASK \ + (0xfffffffUL << I40IW_CQPSQ_STAG_FIRSTPMPBLIDX_SHIFT) + +/* Query stag */ +#define I40IW_CQPSQ_QUERYSTAG_IDX_SHIFT I40IW_CQPSQ_STAG_IDX_SHIFT +#define I40IW_CQPSQ_QUERYSTAG_IDX_MASK I40IW_CQPSQ_STAG_IDX_MASK + +/* Allocate Local IP Address Entry */ + +/* Manage Local IP Address Table - MLIPA */ 
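+/*
+ * Usage sketch: the six one-byte MAC fields defined below pack a full
+ * Ethernet address into a single 64-bit CQP WQE word.  LS_64() is the
+ * real helper defined near the top of this header; example_pack_mac()
+ * and its "mac" parameter are hypothetical names for illustration only:
+ *
+ *	static inline u64 example_pack_mac(const u8 mac[6])
+ *	{
+ *		return LS_64(mac[0], I40IW_CQPSQ_MLIPA_MAC0) |
+ *		       LS_64(mac[1], I40IW_CQPSQ_MLIPA_MAC1) |
+ *		       LS_64(mac[2], I40IW_CQPSQ_MLIPA_MAC2) |
+ *		       LS_64(mac[3], I40IW_CQPSQ_MLIPA_MAC3) |
+ *		       LS_64(mac[4], I40IW_CQPSQ_MLIPA_MAC4) |
+ *		       LS_64(mac[5], I40IW_CQPSQ_MLIPA_MAC5);
+ *	}
+ *
+ * Each field is an 8-bit mask at a shift that is a multiple of 8
+ * (0, 8, ... 40), so the six bytes land in the low 48 bits of the word.
+ */
+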
+#define I40IW_CQPSQ_MLIPA_IPV6LO_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IW_CQPSQ_MLIPA_IPV6LO_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IW_CQPSQ_MLIPA_IPV6HI_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IW_CQPSQ_MLIPA_IPV6HI_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IW_CQPSQ_MLIPA_IPV4_SHIFT 0 +#define I40IW_CQPSQ_MLIPA_IPV4_MASK \ + (0xffffffffUL << I40IW_CQPSQ_MLIPA_IPV4_SHIFT) + +#define I40IW_CQPSQ_MLIPA_IPTABLEIDX_SHIFT 0 +#define I40IW_CQPSQ_MLIPA_IPTABLEIDX_MASK \ + (0x3fUL << I40IW_CQPSQ_MLIPA_IPTABLEIDX_SHIFT) + +#define I40IW_CQPSQ_MLIPA_IPV4VALID_SHIFT 42 +#define I40IW_CQPSQ_MLIPA_IPV4VALID_MASK \ + (1ULL << I40IW_CQPSQ_MLIPA_IPV4VALID_SHIFT) + +#define I40IW_CQPSQ_MLIPA_IPV6VALID_SHIFT 43 +#define I40IW_CQPSQ_MLIPA_IPV6VALID_MASK \ + (1ULL << I40IW_CQPSQ_MLIPA_IPV6VALID_SHIFT) + +#define I40IW_CQPSQ_MLIPA_FREEENTRY_SHIFT 62 +#define I40IW_CQPSQ_MLIPA_FREEENTRY_MASK \ + (1ULL << I40IW_CQPSQ_MLIPA_FREEENTRY_SHIFT) + +#define I40IW_CQPSQ_MLIPA_IGNORE_REF_CNT_SHIFT 61 +#define I40IW_CQPSQ_MLIPA_IGNORE_REF_CNT_MASK \ + (1ULL << I40IW_CQPSQ_MLIPA_IGNORE_REF_CNT_SHIFT) + +#define I40IW_CQPSQ_MLIPA_MAC0_SHIFT 0 +#define I40IW_CQPSQ_MLIPA_MAC0_MASK (0xffUL << I40IW_CQPSQ_MLIPA_MAC0_SHIFT) + +#define I40IW_CQPSQ_MLIPA_MAC1_SHIFT 8 +#define I40IW_CQPSQ_MLIPA_MAC1_MASK (0xffUL << I40IW_CQPSQ_MLIPA_MAC1_SHIFT) + +#define I40IW_CQPSQ_MLIPA_MAC2_SHIFT 16 +#define I40IW_CQPSQ_MLIPA_MAC2_MASK (0xffUL << I40IW_CQPSQ_MLIPA_MAC2_SHIFT) + +#define I40IW_CQPSQ_MLIPA_MAC3_SHIFT 24 +#define I40IW_CQPSQ_MLIPA_MAC3_MASK (0xffUL << I40IW_CQPSQ_MLIPA_MAC3_SHIFT) + +#define I40IW_CQPSQ_MLIPA_MAC4_SHIFT 32 +#define I40IW_CQPSQ_MLIPA_MAC4_MASK (0xffULL << I40IW_CQPSQ_MLIPA_MAC4_SHIFT) + +#define I40IW_CQPSQ_MLIPA_MAC5_SHIFT 40 +#define I40IW_CQPSQ_MLIPA_MAC5_MASK (0xffULL << I40IW_CQPSQ_MLIPA_MAC5_SHIFT) + +/* Manage ARP Table - MAT */ +#define I40IW_CQPSQ_MAT_REACHMAX_SHIFT 0 +#define I40IW_CQPSQ_MAT_REACHMAX_MASK \ + (0xffffffffUL << I40IW_CQPSQ_MAT_REACHMAX_SHIFT) + +#define I40IW_CQPSQ_MAT_MACADDR_SHIFT 0 +#define I40IW_CQPSQ_MAT_MACADDR_MASK \ + (0xffffffffffffULL << I40IW_CQPSQ_MAT_MACADDR_SHIFT) + +#define I40IW_CQPSQ_MAT_ARPENTRYIDX_SHIFT 0 +#define I40IW_CQPSQ_MAT_ARPENTRYIDX_MASK \ + (0xfffUL << I40IW_CQPSQ_MAT_ARPENTRYIDX_SHIFT) + +#define I40IW_CQPSQ_MAT_ENTRYVALID_SHIFT 42 +#define I40IW_CQPSQ_MAT_ENTRYVALID_MASK \ + (1ULL << I40IW_CQPSQ_MAT_ENTRYVALID_SHIFT) + +#define I40IW_CQPSQ_MAT_PERMANENT_SHIFT 43 +#define I40IW_CQPSQ_MAT_PERMANENT_MASK \ + (1ULL << I40IW_CQPSQ_MAT_PERMANENT_SHIFT) + +#define I40IW_CQPSQ_MAT_QUERY_SHIFT 44 +#define I40IW_CQPSQ_MAT_QUERY_MASK (1ULL << I40IW_CQPSQ_MAT_QUERY_SHIFT) + +/* Manage VF PBLE Backing Pages - MVPBP*/ +#define I40IW_CQPSQ_MVPBP_PD_ENTRY_CNT_SHIFT 0 +#define I40IW_CQPSQ_MVPBP_PD_ENTRY_CNT_MASK \ + (0x3ffULL << I40IW_CQPSQ_MVPBP_PD_ENTRY_CNT_SHIFT) + +#define I40IW_CQPSQ_MVPBP_FIRST_PD_INX_SHIFT 16 +#define I40IW_CQPSQ_MVPBP_FIRST_PD_INX_MASK \ + (0x1ffULL << I40IW_CQPSQ_MVPBP_FIRST_PD_INX_SHIFT) + +#define I40IW_CQPSQ_MVPBP_SD_INX_SHIFT 32 +#define I40IW_CQPSQ_MVPBP_SD_INX_MASK \ + (0xfffULL << I40IW_CQPSQ_MVPBP_SD_INX_SHIFT) + +#define I40IW_CQPSQ_MVPBP_INV_PD_ENT_SHIFT 62 +#define I40IW_CQPSQ_MVPBP_INV_PD_ENT_MASK \ + (0x1ULL << I40IW_CQPSQ_MVPBP_INV_PD_ENT_SHIFT) + +#define I40IW_CQPSQ_MVPBP_PD_PLPBA_SHIFT 3 +#define I40IW_CQPSQ_MVPBP_PD_PLPBA_MASK \ + (0x1fffffffffffffffULL << I40IW_CQPSQ_MVPBP_PD_PLPBA_SHIFT) + +/* Manage Push Page - MPP */ +#define I40IW_INVALID_PUSH_PAGE_INDEX 0xffff + +#define I40IW_CQPSQ_MPP_QS_HANDLE_SHIFT 0 +#define 
I40IW_CQPSQ_MPP_QS_HANDLE_MASK (0xffffUL << \ + I40IW_CQPSQ_MPP_QS_HANDLE_SHIFT) + +#define I40IW_CQPSQ_MPP_PPIDX_SHIFT 0 +#define I40IW_CQPSQ_MPP_PPIDX_MASK (0x3ffUL << I40IW_CQPSQ_MPP_PPIDX_SHIFT) + +#define I40IW_CQPSQ_MPP_FREE_PAGE_SHIFT 62 +#define I40IW_CQPSQ_MPP_FREE_PAGE_MASK (1ULL << I40IW_CQPSQ_MPP_FREE_PAGE_SHIFT) + +/* Upload Context - UCTX */ +#define I40IW_CQPSQ_UCTX_QPCTXADDR_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IW_CQPSQ_UCTX_QPCTXADDR_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IW_CQPSQ_UCTX_QPID_SHIFT 0 +#define I40IW_CQPSQ_UCTX_QPID_MASK (0x3ffffUL << I40IW_CQPSQ_UCTX_QPID_SHIFT) + +#define I40IW_CQPSQ_UCTX_QPTYPE_SHIFT 48 +#define I40IW_CQPSQ_UCTX_QPTYPE_MASK (0xfULL << I40IW_CQPSQ_UCTX_QPTYPE_SHIFT) + +#define I40IW_CQPSQ_UCTX_RAWFORMAT_SHIFT 61 +#define I40IW_CQPSQ_UCTX_RAWFORMAT_MASK \ + (1ULL << I40IW_CQPSQ_UCTX_RAWFORMAT_SHIFT) + +#define I40IW_CQPSQ_UCTX_FREEZEQP_SHIFT 62 +#define I40IW_CQPSQ_UCTX_FREEZEQP_MASK \ + (1ULL << I40IW_CQPSQ_UCTX_FREEZEQP_SHIFT) + +/* Manage HMC PM Function Table - MHMC */ +#define I40IW_CQPSQ_MHMC_VFIDX_SHIFT 0 +#define I40IW_CQPSQ_MHMC_VFIDX_MASK (0x7fUL << I40IW_CQPSQ_MHMC_VFIDX_SHIFT) + +#define I40IW_CQPSQ_MHMC_FREEPMFN_SHIFT 62 +#define I40IW_CQPSQ_MHMC_FREEPMFN_MASK \ + (1ULL << I40IW_CQPSQ_MHMC_FREEPMFN_SHIFT) + +/* Set HMC Resource Profile - SHMCRP */ +#define I40IW_CQPSQ_SHMCRP_HMC_PROFILE_SHIFT 0 +#define I40IW_CQPSQ_SHMCRP_HMC_PROFILE_MASK \ + (0x7ULL << I40IW_CQPSQ_SHMCRP_HMC_PROFILE_SHIFT) +#define I40IW_CQPSQ_SHMCRP_VFNUM_SHIFT 32 +#define I40IW_CQPSQ_SHMCRP_VFNUM_MASK (0x3fULL << I40IW_CQPSQ_SHMCRP_VFNUM_SHIFT) + +/* Create/Destroy CEQ */ +#define I40IW_CQPSQ_CEQ_CEQSIZE_SHIFT 0 +#define I40IW_CQPSQ_CEQ_CEQSIZE_MASK \ + (0x1ffffUL << I40IW_CQPSQ_CEQ_CEQSIZE_SHIFT) + +#define I40IW_CQPSQ_CEQ_CEQID_SHIFT 0 +#define I40IW_CQPSQ_CEQ_CEQID_MASK (0x7fUL << I40IW_CQPSQ_CEQ_CEQID_SHIFT) + +#define I40IW_CQPSQ_CEQ_LPBLSIZE_SHIFT I40IW_CQPSQ_CQ_LPBLSIZE_SHIFT +#define I40IW_CQPSQ_CEQ_LPBLSIZE_MASK I40IW_CQPSQ_CQ_LPBLSIZE_MASK + +#define I40IW_CQPSQ_CEQ_VMAP_SHIFT 47 +#define I40IW_CQPSQ_CEQ_VMAP_MASK (1ULL << I40IW_CQPSQ_CEQ_VMAP_SHIFT) + +#define I40IW_CQPSQ_CEQ_FIRSTPMPBLIDX_SHIFT 0 +#define I40IW_CQPSQ_CEQ_FIRSTPMPBLIDX_MASK \ + (0xfffffffUL << I40IW_CQPSQ_CEQ_FIRSTPMPBLIDX_SHIFT) + +/* Create/Destroy AEQ */ +#define I40IW_CQPSQ_AEQ_AEQECNT_SHIFT 0 +#define I40IW_CQPSQ_AEQ_AEQECNT_MASK \ + (0x7ffffUL << I40IW_CQPSQ_AEQ_AEQECNT_SHIFT) + +#define I40IW_CQPSQ_AEQ_LPBLSIZE_SHIFT I40IW_CQPSQ_CQ_LPBLSIZE_SHIFT +#define I40IW_CQPSQ_AEQ_LPBLSIZE_MASK I40IW_CQPSQ_CQ_LPBLSIZE_MASK + +#define I40IW_CQPSQ_AEQ_VMAP_SHIFT 47 +#define I40IW_CQPSQ_AEQ_VMAP_MASK (1ULL << I40IW_CQPSQ_AEQ_VMAP_SHIFT) + +#define I40IW_CQPSQ_AEQ_FIRSTPMPBLIDX_SHIFT 0 +#define I40IW_CQPSQ_AEQ_FIRSTPMPBLIDX_MASK \ + (0xfffffffUL << I40IW_CQPSQ_AEQ_FIRSTPMPBLIDX_SHIFT) + +/* Commit FPM Values - CFPM */ +#define I40IW_CQPSQ_CFPM_HMCFNID_SHIFT 0 +#define I40IW_CQPSQ_CFPM_HMCFNID_MASK (0x3fUL << I40IW_CQPSQ_CFPM_HMCFNID_SHIFT) + +/* Flush WQEs - FWQE */ +#define I40IW_CQPSQ_FWQE_AECODE_SHIFT 0 +#define I40IW_CQPSQ_FWQE_AECODE_MASK (0xffffUL << I40IW_CQPSQ_FWQE_AECODE_SHIFT) + +#define I40IW_CQPSQ_FWQE_AESOURCE_SHIFT 16 +#define I40IW_CQPSQ_FWQE_AESOURCE_MASK \ + (0xfUL << I40IW_CQPSQ_FWQE_AESOURCE_SHIFT) + +#define I40IW_CQPSQ_FWQE_RQMNERR_SHIFT 0 +#define I40IW_CQPSQ_FWQE_RQMNERR_MASK \ + (0xffffUL << I40IW_CQPSQ_FWQE_RQMNERR_SHIFT) + +#define I40IW_CQPSQ_FWQE_RQMJERR_SHIFT 16 +#define I40IW_CQPSQ_FWQE_RQMJERR_MASK \ + (0xffffUL << I40IW_CQPSQ_FWQE_RQMJERR_SHIFT) + 
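+/*
+ * Usage sketch: how these _SHIFT/_MASK pairs combine with the
+ * LS_64()/RS_64() helpers defined near the top of this header.  The
+ * field names below are the real flush-WQE definitions from this
+ * section; example_pack_flush_err() and its locals are hypothetical:
+ *
+ *	static inline u64 example_pack_flush_err(u16 minerr, u16 majerr)
+ *	{
+ *		u64 qword = 0;
+ *
+ *		qword |= LS_64(minerr, I40IW_CQPSQ_FWQE_RQMNERR);
+ *		qword |= LS_64(majerr, I40IW_CQPSQ_FWQE_RQMJERR);
+ *		return qword;
+ *	}
+ *
+ * LS_64() shifts the value into position and then ands it with the mask,
+ * so an oversized value is clipped instead of spilling into the adjacent
+ * field.  RS_64() reverses the transform when decoding:
+ *
+ *	u16 minerr = (u16)RS_64(qword, I40IW_CQPSQ_FWQE_RQMNERR);
+ */
+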
+#define I40IW_CQPSQ_FWQE_SQMNERR_SHIFT 32 +#define I40IW_CQPSQ_FWQE_SQMNERR_MASK \ + (0xffffULL << I40IW_CQPSQ_FWQE_SQMNERR_SHIFT) + +#define I40IW_CQPSQ_FWQE_SQMJERR_SHIFT 48 +#define I40IW_CQPSQ_FWQE_SQMJERR_MASK \ + (0xffffULL << I40IW_CQPSQ_FWQE_SQMJERR_SHIFT) + +#define I40IW_CQPSQ_FWQE_QPID_SHIFT 0 +#define I40IW_CQPSQ_FWQE_QPID_MASK (0x3ffffULL << I40IW_CQPSQ_FWQE_QPID_SHIFT) + +#define I40IW_CQPSQ_FWQE_GENERATE_AE_SHIFT 59 +#define I40IW_CQPSQ_FWQE_GENERATE_AE_MASK (1ULL << \ + I40IW_CQPSQ_FWQE_GENERATE_AE_SHIFT) + +#define I40IW_CQPSQ_FWQE_USERFLCODE_SHIFT 60 +#define I40IW_CQPSQ_FWQE_USERFLCODE_MASK \ + (1ULL << I40IW_CQPSQ_FWQE_USERFLCODE_SHIFT) + +#define I40IW_CQPSQ_FWQE_FLUSHSQ_SHIFT 61 +#define I40IW_CQPSQ_FWQE_FLUSHSQ_MASK (1ULL << I40IW_CQPSQ_FWQE_FLUSHSQ_SHIFT) + +#define I40IW_CQPSQ_FWQE_FLUSHRQ_SHIFT 62 +#define I40IW_CQPSQ_FWQE_FLUSHRQ_MASK (1ULL << I40IW_CQPSQ_FWQE_FLUSHRQ_SHIFT) + +/* Manage Accelerated Port Table - MAPT */ +#define I40IW_CQPSQ_MAPT_PORT_SHIFT 0 +#define I40IW_CQPSQ_MAPT_PORT_MASK (0xffffUL << I40IW_CQPSQ_MAPT_PORT_SHIFT) + +#define I40IW_CQPSQ_MAPT_ADDPORT_SHIFT 62 +#define I40IW_CQPSQ_MAPT_ADDPORT_MASK (1ULL << I40IW_CQPSQ_MAPT_ADDPORT_SHIFT) + +/* Update Protocol Engine SDs */ +#define I40IW_CQPSQ_UPESD_SDCMD_SHIFT 0 +#define I40IW_CQPSQ_UPESD_SDCMD_MASK (0xffffffffUL << I40IW_CQPSQ_UPESD_SDCMD_SHIFT) + +#define I40IW_CQPSQ_UPESD_SDDATALOW_SHIFT 0 +#define I40IW_CQPSQ_UPESD_SDDATALOW_MASK \ + (0xffffffffUL << I40IW_CQPSQ_UPESD_SDDATALOW_SHIFT) + +#define I40IW_CQPSQ_UPESD_SDDATAHI_SHIFT 32 +#define I40IW_CQPSQ_UPESD_SDDATAHI_MASK \ + (0xffffffffULL << I40IW_CQPSQ_UPESD_SDDATAHI_SHIFT) +#define I40IW_CQPSQ_UPESD_HMCFNID_SHIFT 0 +#define I40IW_CQPSQ_UPESD_HMCFNID_MASK \ + (0x3fUL << I40IW_CQPSQ_UPESD_HMCFNID_SHIFT) + +#define I40IW_CQPSQ_UPESD_ENTRY_VALID_SHIFT 63 +#define I40IW_CQPSQ_UPESD_ENTRY_VALID_MASK \ + ((u64)1 << I40IW_CQPSQ_UPESD_ENTRY_VALID_SHIFT) + +#define I40IW_CQPSQ_UPESD_ENTRY_COUNT_SHIFT 0 +#define I40IW_CQPSQ_UPESD_ENTRY_COUNT_MASK \ + (0xfUL << I40IW_CQPSQ_UPESD_ENTRY_COUNT_SHIFT) + +#define I40IW_CQPSQ_UPESD_SKIP_ENTRY_SHIFT 7 +#define I40IW_CQPSQ_UPESD_SKIP_ENTRY_MASK \ + (0x1UL << I40IW_CQPSQ_UPESD_SKIP_ENTRY_SHIFT) + +/* Suspend QP */ +#define I40IW_CQPSQ_SUSPENDQP_QPID_SHIFT 0 +#define I40IW_CQPSQ_SUSPENDQP_QPID_MASK (0x3FFFFUL) +/* I40IWCQ_QPID_MASK */ + +/* Resume QP */ +#define I40IW_CQPSQ_RESUMEQP_QSHANDLE_SHIFT 0 +#define I40IW_CQPSQ_RESUMEQP_QSHANDLE_MASK \ + (0xffffffffUL << I40IW_CQPSQ_RESUMEQP_QSHANDLE_SHIFT) + +#define I40IW_CQPSQ_RESUMEQP_QPID_SHIFT 0 +#define I40IW_CQPSQ_RESUMEQP_QPID_MASK (0x3FFFFUL) +/* I40IWCQ_QPID_MASK */ + +/* IW QP Context */ +#define I40IWQPC_DDP_VER_SHIFT 0 +#define I40IWQPC_DDP_VER_MASK (3UL << I40IWQPC_DDP_VER_SHIFT) + +#define I40IWQPC_SNAP_SHIFT 2 +#define I40IWQPC_SNAP_MASK (1UL << I40IWQPC_SNAP_SHIFT) + +#define I40IWQPC_IPV4_SHIFT 3 +#define I40IWQPC_IPV4_MASK (1UL << I40IWQPC_IPV4_SHIFT) + +#define I40IWQPC_NONAGLE_SHIFT 4 +#define I40IWQPC_NONAGLE_MASK (1UL << I40IWQPC_NONAGLE_SHIFT) + +#define I40IWQPC_INSERTVLANTAG_SHIFT 5 +#define I40IWQPC_INSERTVLANTAG_MASK (1 << I40IWQPC_INSERTVLANTAG_SHIFT) + +#define I40IWQPC_USESRQ_SHIFT 6 +#define I40IWQPC_USESRQ_MASK (1UL << I40IWQPC_USESRQ_SHIFT) + +#define I40IWQPC_TIMESTAMP_SHIFT 7 +#define I40IWQPC_TIMESTAMP_MASK (1UL << I40IWQPC_TIMESTAMP_SHIFT) + +#define I40IWQPC_RQWQESIZE_SHIFT 8 +#define I40IWQPC_RQWQESIZE_MASK (3UL << I40IWQPC_RQWQESIZE_SHIFT) + +#define I40IWQPC_INSERTL2TAG2_SHIFT 11 +#define I40IWQPC_INSERTL2TAG2_MASK (1UL 
<< I40IWQPC_INSERTL2TAG2_SHIFT) + +#define I40IWQPC_LIMIT_SHIFT 12 +#define I40IWQPC_LIMIT_MASK (3UL << I40IWQPC_LIMIT_SHIFT) + +#define I40IWQPC_DROPOOOSEG_SHIFT 15 +#define I40IWQPC_DROPOOOSEG_MASK (1UL << I40IWQPC_DROPOOOSEG_SHIFT) + +#define I40IWQPC_DUPACK_THRESH_SHIFT 16 +#define I40IWQPC_DUPACK_THRESH_MASK (7UL << I40IWQPC_DUPACK_THRESH_SHIFT) + +#define I40IWQPC_ERR_RQ_IDX_VALID_SHIFT 19 +#define I40IWQPC_ERR_RQ_IDX_VALID_MASK (1UL << I40IWQPC_ERR_RQ_IDX_VALID_SHIFT) + +#define I40IWQPC_DIS_VLAN_CHECKS_SHIFT 19 +#define I40IWQPC_DIS_VLAN_CHECKS_MASK (7UL << I40IWQPC_DIS_VLAN_CHECKS_SHIFT) + +#define I40IWQPC_RCVTPHEN_SHIFT 28 +#define I40IWQPC_RCVTPHEN_MASK (1UL << I40IWQPC_RCVTPHEN_SHIFT) + +#define I40IWQPC_XMITTPHEN_SHIFT 29 +#define I40IWQPC_XMITTPHEN_MASK (1ULL << I40IWQPC_XMITTPHEN_SHIFT) + +#define I40IWQPC_RQTPHEN_SHIFT 30 +#define I40IWQPC_RQTPHEN_MASK (1UL << I40IWQPC_RQTPHEN_SHIFT) + +#define I40IWQPC_SQTPHEN_SHIFT 31 +#define I40IWQPC_SQTPHEN_MASK (1ULL << I40IWQPC_SQTPHEN_SHIFT) + +#define I40IWQPC_PPIDX_SHIFT 32 +#define I40IWQPC_PPIDX_MASK (0x3ffULL << I40IWQPC_PPIDX_SHIFT) + +#define I40IWQPC_PMENA_SHIFT 47 +#define I40IWQPC_PMENA_MASK (1ULL << I40IWQPC_PMENA_SHIFT) + +#define I40IWQPC_RDMAP_VER_SHIFT 62 +#define I40IWQPC_RDMAP_VER_MASK (3ULL << I40IWQPC_RDMAP_VER_SHIFT) + +#define I40IWQPC_SQADDR_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IWQPC_SQADDR_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IWQPC_RQADDR_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IWQPC_RQADDR_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IWQPC_TTL_SHIFT 0 +#define I40IWQPC_TTL_MASK (0xffUL << I40IWQPC_TTL_SHIFT) + +#define I40IWQPC_RQSIZE_SHIFT 8 +#define I40IWQPC_RQSIZE_MASK (0xfUL << I40IWQPC_RQSIZE_SHIFT) + +#define I40IWQPC_SQSIZE_SHIFT 12 +#define I40IWQPC_SQSIZE_MASK (0xfUL << I40IWQPC_SQSIZE_SHIFT) + +#define I40IWQPC_SRCMACADDRIDX_SHIFT 16 +#define I40IWQPC_SRCMACADDRIDX_MASK (0x3fUL << I40IWQPC_SRCMACADDRIDX_SHIFT) + +#define I40IWQPC_AVOIDSTRETCHACK_SHIFT 23 +#define I40IWQPC_AVOIDSTRETCHACK_MASK (1UL << I40IWQPC_AVOIDSTRETCHACK_SHIFT) + +#define I40IWQPC_TOS_SHIFT 24 +#define I40IWQPC_TOS_MASK (0xffUL << I40IWQPC_TOS_SHIFT) + +#define I40IWQPC_SRCPORTNUM_SHIFT 32 +#define I40IWQPC_SRCPORTNUM_MASK (0xffffULL << I40IWQPC_SRCPORTNUM_SHIFT) + +#define I40IWQPC_DESTPORTNUM_SHIFT 48 +#define I40IWQPC_DESTPORTNUM_MASK (0xffffULL << I40IWQPC_DESTPORTNUM_SHIFT) + +#define I40IWQPC_DESTIPADDR0_SHIFT 32 +#define I40IWQPC_DESTIPADDR0_MASK \ + (0xffffffffULL << I40IWQPC_DESTIPADDR0_SHIFT) + +#define I40IWQPC_DESTIPADDR1_SHIFT 0 +#define I40IWQPC_DESTIPADDR1_MASK \ + (0xffffffffULL << I40IWQPC_DESTIPADDR1_SHIFT) + +#define I40IWQPC_DESTIPADDR2_SHIFT 32 +#define I40IWQPC_DESTIPADDR2_MASK \ + (0xffffffffULL << I40IWQPC_DESTIPADDR2_SHIFT) + +#define I40IWQPC_DESTIPADDR3_SHIFT 0 +#define I40IWQPC_DESTIPADDR3_MASK \ + (0xffffffffULL << I40IWQPC_DESTIPADDR3_SHIFT) + +#define I40IWQPC_SNDMSS_SHIFT 16 +#define I40IWQPC_SNDMSS_MASK (0x3fffUL << I40IWQPC_SNDMSS_SHIFT) + +#define I40IWQPC_VLANTAG_SHIFT 32 +#define I40IWQPC_VLANTAG_MASK (0xffffULL << I40IWQPC_VLANTAG_SHIFT) + +#define I40IWQPC_ARPIDX_SHIFT 48 +#define I40IWQPC_ARPIDX_MASK (0xfffULL << I40IWQPC_ARPIDX_SHIFT) + +#define I40IWQPC_FLOWLABEL_SHIFT 0 +#define I40IWQPC_FLOWLABEL_MASK (0xfffffUL << I40IWQPC_FLOWLABEL_SHIFT) + +#define I40IWQPC_WSCALE_SHIFT 20 +#define I40IWQPC_WSCALE_MASK (1UL << I40IWQPC_WSCALE_SHIFT) + +#define I40IWQPC_KEEPALIVE_SHIFT 21 +#define I40IWQPC_KEEPALIVE_MASK (1UL << I40IWQPC_KEEPALIVE_SHIFT) + +#define 
I40IWQPC_IGNORE_TCP_OPT_SHIFT 22 +#define I40IWQPC_IGNORE_TCP_OPT_MASK (1UL << I40IWQPC_IGNORE_TCP_OPT_SHIFT) + +#define I40IWQPC_IGNORE_TCP_UNS_OPT_SHIFT 23 +#define I40IWQPC_IGNORE_TCP_UNS_OPT_MASK \ + (1UL << I40IWQPC_IGNORE_TCP_UNS_OPT_SHIFT) + +#define I40IWQPC_TCPSTATE_SHIFT 28 +#define I40IWQPC_TCPSTATE_MASK (0xfUL << I40IWQPC_TCPSTATE_SHIFT) + +#define I40IWQPC_RCVSCALE_SHIFT 32 +#define I40IWQPC_RCVSCALE_MASK (0xfULL << I40IWQPC_RCVSCALE_SHIFT) + +#define I40IWQPC_SNDSCALE_SHIFT 40 +#define I40IWQPC_SNDSCALE_MASK (0xfULL << I40IWQPC_SNDSCALE_SHIFT) + +#define I40IWQPC_PDIDX_SHIFT 48 +#define I40IWQPC_PDIDX_MASK (0x7fffULL << I40IWQPC_PDIDX_SHIFT) + +#define I40IWQPC_KALIVE_TIMER_MAX_PROBES_SHIFT 16 +#define I40IWQPC_KALIVE_TIMER_MAX_PROBES_MASK \ + (0xffUL << I40IWQPC_KALIVE_TIMER_MAX_PROBES_SHIFT) + +#define I40IWQPC_KEEPALIVE_INTERVAL_SHIFT 24 +#define I40IWQPC_KEEPALIVE_INTERVAL_MASK \ + (0xffUL << I40IWQPC_KEEPALIVE_INTERVAL_SHIFT) + +#define I40IWQPC_TIMESTAMP_RECENT_SHIFT 0 +#define I40IWQPC_TIMESTAMP_RECENT_MASK \ + (0xffffffffUL << I40IWQPC_TIMESTAMP_RECENT_SHIFT) + +#define I40IWQPC_TIMESTAMP_AGE_SHIFT 32 +#define I40IWQPC_TIMESTAMP_AGE_MASK \ + (0xffffffffULL << I40IWQPC_TIMESTAMP_AGE_SHIFT) + +#define I40IWQPC_SNDNXT_SHIFT 0 +#define I40IWQPC_SNDNXT_MASK (0xffffffffUL << I40IWQPC_SNDNXT_SHIFT) + +#define I40IWQPC_SNDWND_SHIFT 32 +#define I40IWQPC_SNDWND_MASK (0xffffffffULL << I40IWQPC_SNDWND_SHIFT) + +#define I40IWQPC_RCVNXT_SHIFT 0 +#define I40IWQPC_RCVNXT_MASK (0xffffffffUL << I40IWQPC_RCVNXT_SHIFT) + +#define I40IWQPC_RCVWND_SHIFT 32 +#define I40IWQPC_RCVWND_MASK (0xffffffffULL << I40IWQPC_RCVWND_SHIFT) + +#define I40IWQPC_SNDMAX_SHIFT 0 +#define I40IWQPC_SNDMAX_MASK (0xffffffffUL << I40IWQPC_SNDMAX_SHIFT) + +#define I40IWQPC_SNDUNA_SHIFT 32 +#define I40IWQPC_SNDUNA_MASK (0xffffffffULL << I40IWQPC_SNDUNA_SHIFT) + +#define I40IWQPC_SRTT_SHIFT 0 +#define I40IWQPC_SRTT_MASK (0xffffffffUL << I40IWQPC_SRTT_SHIFT) + +#define I40IWQPC_RTTVAR_SHIFT 32 +#define I40IWQPC_RTTVAR_MASK (0xffffffffULL << I40IWQPC_RTTVAR_SHIFT) + +#define I40IWQPC_SSTHRESH_SHIFT 0 +#define I40IWQPC_SSTHRESH_MASK (0xffffffffUL << I40IWQPC_SSTHRESH_SHIFT) + +#define I40IWQPC_CWND_SHIFT 32 +#define I40IWQPC_CWND_MASK (0xffffffffULL << I40IWQPC_CWND_SHIFT) + +#define I40IWQPC_SNDWL1_SHIFT 0 +#define I40IWQPC_SNDWL1_MASK (0xffffffffUL << I40IWQPC_SNDWL1_SHIFT) + +#define I40IWQPC_SNDWL2_SHIFT 32 +#define I40IWQPC_SNDWL2_MASK (0xffffffffULL << I40IWQPC_SNDWL2_SHIFT) + +#define I40IWQPC_ERR_RQ_IDX_SHIFT 32 +#define I40IWQPC_ERR_RQ_IDX_MASK (0x3fffULL << I40IWQPC_ERR_RQ_IDX_SHIFT) + +#define I40IWQPC_MAXSNDWND_SHIFT 0 +#define I40IWQPC_MAXSNDWND_MASK (0xffffffffUL << I40IWQPC_MAXSNDWND_SHIFT) + +#define I40IWQPC_REXMIT_THRESH_SHIFT 48 +#define I40IWQPC_REXMIT_THRESH_MASK (0x3fULL << I40IWQPC_REXMIT_THRESH_SHIFT) + +#define I40IWQPC_TXCQNUM_SHIFT 0 +#define I40IWQPC_TXCQNUM_MASK (0x1ffffUL << I40IWQPC_TXCQNUM_SHIFT) + +#define I40IWQPC_RXCQNUM_SHIFT 32 +#define I40IWQPC_RXCQNUM_MASK (0x1ffffULL << I40IWQPC_RXCQNUM_SHIFT) + +#define I40IWQPC_Q2ADDR_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IWQPC_Q2ADDR_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IWQPC_LASTBYTESENT_SHIFT 0 +#define I40IWQPC_LASTBYTESENT_MASK (0xffUL << I40IWQPC_LASTBYTESENT_SHIFT) + +#define I40IWQPC_SRQID_SHIFT 32 +#define I40IWQPC_SRQID_MASK (0xffULL << I40IWQPC_SRQID_SHIFT) + +#define I40IWQPC_ORDSIZE_SHIFT 0 +#define I40IWQPC_ORDSIZE_MASK (0x7fUL << I40IWQPC_ORDSIZE_SHIFT) + +#define I40IWQPC_IRDSIZE_SHIFT 16 +#define I40IWQPC_IRDSIZE_MASK 
(0x3UL << I40IWQPC_IRDSIZE_SHIFT) + +#define I40IWQPC_WRRDRSPOK_SHIFT 20 +#define I40IWQPC_WRRDRSPOK_MASK (1UL << I40IWQPC_WRRDRSPOK_SHIFT) + +#define I40IWQPC_RDOK_SHIFT 21 +#define I40IWQPC_RDOK_MASK (1UL << I40IWQPC_RDOK_SHIFT) + +#define I40IWQPC_SNDMARKERS_SHIFT 22 +#define I40IWQPC_SNDMARKERS_MASK (1UL << I40IWQPC_SNDMARKERS_SHIFT) + +#define I40IWQPC_BINDEN_SHIFT 23 +#define I40IWQPC_BINDEN_MASK (1UL << I40IWQPC_BINDEN_SHIFT) + +#define I40IWQPC_FASTREGEN_SHIFT 24 +#define I40IWQPC_FASTREGEN_MASK (1UL << I40IWQPC_FASTREGEN_SHIFT) + +#define I40IWQPC_PRIVEN_SHIFT 25 +#define I40IWQPC_PRIVEN_MASK (1UL << I40IWQPC_PRIVEN_SHIFT) + +#define I40IWQPC_LSMMPRESENT_SHIFT 26 +#define I40IWQPC_LSMMPRESENT_MASK (1UL << I40IWQPC_LSMMPRESENT_SHIFT) + +#define I40IWQPC_ADJUSTFORLSMM_SHIFT 27 +#define I40IWQPC_ADJUSTFORLSMM_MASK (1UL << I40IWQPC_ADJUSTFORLSMM_SHIFT) + +#define I40IWQPC_IWARPMODE_SHIFT 28 +#define I40IWQPC_IWARPMODE_MASK (1UL << I40IWQPC_IWARPMODE_SHIFT) + +#define I40IWQPC_RCVMARKERS_SHIFT 29 +#define I40IWQPC_RCVMARKERS_MASK (1UL << I40IWQPC_RCVMARKERS_SHIFT) + +#define I40IWQPC_ALIGNHDRS_SHIFT 30 +#define I40IWQPC_ALIGNHDRS_MASK (1UL << I40IWQPC_ALIGNHDRS_SHIFT) + +#define I40IWQPC_RCVNOMPACRC_SHIFT 31 +#define I40IWQPC_RCVNOMPACRC_MASK (1UL << I40IWQPC_RCVNOMPACRC_SHIFT) + +#define I40IWQPC_RCVMARKOFFSET_SHIFT 33 +#define I40IWQPC_RCVMARKOFFSET_MASK (0x1ffULL << I40IWQPC_RCVMARKOFFSET_SHIFT) + +#define I40IWQPC_SNDMARKOFFSET_SHIFT 48 +#define I40IWQPC_SNDMARKOFFSET_MASK (0x1ffULL << I40IWQPC_SNDMARKOFFSET_SHIFT) + +#define I40IWQPC_QPCOMPCTX_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IWQPC_QPCOMPCTX_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IWQPC_SQTPHVAL_SHIFT 0 +#define I40IWQPC_SQTPHVAL_MASK (0xffUL << I40IWQPC_SQTPHVAL_SHIFT) + +#define I40IWQPC_RQTPHVAL_SHIFT 8 +#define I40IWQPC_RQTPHVAL_MASK (0xffUL << I40IWQPC_RQTPHVAL_SHIFT) + +#define I40IWQPC_QSHANDLE_SHIFT 16 +#define I40IWQPC_QSHANDLE_MASK (0x3ffUL << I40IWQPC_QSHANDLE_SHIFT) + +#define I40IWQPC_EXCEPTION_LAN_QUEUE_SHIFT 32 +#define I40IWQPC_EXCEPTION_LAN_QUEUE_MASK (0xfffULL << \ + I40IWQPC_EXCEPTION_LAN_QUEUE_SHIFT) + +#define I40IWQPC_LOCAL_IPADDR3_SHIFT 0 +#define I40IWQPC_LOCAL_IPADDR3_MASK \ + (0xffffffffUL << I40IWQPC_LOCAL_IPADDR3_SHIFT) + +#define I40IWQPC_LOCAL_IPADDR2_SHIFT 32 +#define I40IWQPC_LOCAL_IPADDR2_MASK \ + (0xffffffffULL << I40IWQPC_LOCAL_IPADDR2_SHIFT) + +#define I40IWQPC_LOCAL_IPADDR1_SHIFT 0 +#define I40IWQPC_LOCAL_IPADDR1_MASK \ + (0xffffffffUL << I40IWQPC_LOCAL_IPADDR1_SHIFT) + +#define I40IWQPC_LOCAL_IPADDR0_SHIFT 32 +#define I40IWQPC_LOCAL_IPADDR0_MASK \ + (0xffffffffULL << I40IWQPC_LOCAL_IPADDR0_SHIFT) + +#define I40IW_QP_SW_MIN_WQSIZE 4 /*in WRs*/ +#define I40IW_SQ_RSVD 2 +#define I40IW_RQ_RSVD 1 +#define I40IW_QP_SW_MAX_SQ_QUANTAS 2048 +#define I40IW_QP_SW_MAX_RQ_QUANTAS 16384 +#define I40IWQP_OP_RDMA_WRITE 0 +#define I40IWQP_OP_RDMA_READ 1 +#define I40IWQP_OP_RDMA_SEND 3 +#define I40IWQP_OP_RDMA_SEND_INV 4 +#define I40IWQP_OP_RDMA_SEND_SOL_EVENT 5 +#define I40IWQP_OP_RDMA_SEND_SOL_EVENT_INV 6 +#define I40IWQP_OP_BIND_MW 8 +#define I40IWQP_OP_FAST_REGISTER 9 +#define I40IWQP_OP_LOCAL_INVALIDATE 10 +#define I40IWQP_OP_RDMA_READ_LOC_INV 11 +#define I40IWQP_OP_NOP 12 + +#define I40IW_RSVD_SHIFT 41 +#define I40IW_RSVD_MASK (0x7fffULL << I40IW_RSVD_SHIFT) + +/* iwarp QP SQ WQE common fields */ +#define I40IWQPSQ_OPCODE_SHIFT 32 +#define I40IWQPSQ_OPCODE_MASK (0x3fULL << I40IWQPSQ_OPCODE_SHIFT) + +#define I40IWQPSQ_ADDFRAGCNT_SHIFT 38 +#define I40IWQPSQ_ADDFRAGCNT_MASK (0x7ULL << 
I40IWQPSQ_ADDFRAGCNT_SHIFT) + +#define I40IWQPSQ_PUSHWQE_SHIFT 56 +#define I40IWQPSQ_PUSHWQE_MASK (1ULL << I40IWQPSQ_PUSHWQE_SHIFT) + +#define I40IWQPSQ_STREAMMODE_SHIFT 58 +#define I40IWQPSQ_STREAMMODE_MASK (1ULL << I40IWQPSQ_STREAMMODE_SHIFT) + +#define I40IWQPSQ_WAITFORRCVPDU_SHIFT 59 +#define I40IWQPSQ_WAITFORRCVPDU_MASK (1ULL << I40IWQPSQ_WAITFORRCVPDU_SHIFT) + +#define I40IWQPSQ_READFENCE_SHIFT 60 +#define I40IWQPSQ_READFENCE_MASK (1ULL << I40IWQPSQ_READFENCE_SHIFT) + +#define I40IWQPSQ_LOCALFENCE_SHIFT 61 +#define I40IWQPSQ_LOCALFENCE_MASK (1ULL << I40IWQPSQ_LOCALFENCE_SHIFT) + +#define I40IWQPSQ_SIGCOMPL_SHIFT 62 +#define I40IWQPSQ_SIGCOMPL_MASK (1ULL << I40IWQPSQ_SIGCOMPL_SHIFT) + +#define I40IWQPSQ_VALID_SHIFT 63 +#define I40IWQPSQ_VALID_MASK (1ULL << I40IWQPSQ_VALID_SHIFT) + +#define I40IWQPSQ_FRAG_TO_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IWQPSQ_FRAG_TO_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IWQPSQ_FRAG_LEN_SHIFT 0 +#define I40IWQPSQ_FRAG_LEN_MASK (0xffffffffUL << I40IWQPSQ_FRAG_LEN_SHIFT) + +#define I40IWQPSQ_FRAG_STAG_SHIFT 32 +#define I40IWQPSQ_FRAG_STAG_MASK (0xffffffffULL << I40IWQPSQ_FRAG_STAG_SHIFT) + +#define I40IWQPSQ_REMSTAGINV_SHIFT 0 +#define I40IWQPSQ_REMSTAGINV_MASK (0xffffffffUL << I40IWQPSQ_REMSTAGINV_SHIFT) + +#define I40IWQPSQ_INLINEDATAFLAG_SHIFT 57 +#define I40IWQPSQ_INLINEDATAFLAG_MASK (1ULL << I40IWQPSQ_INLINEDATAFLAG_SHIFT) + +#define I40IWQPSQ_INLINEDATALEN_SHIFT 48 +#define I40IWQPSQ_INLINEDATALEN_MASK \ + (0x7fULL << I40IWQPSQ_INLINEDATALEN_SHIFT) + +/* iwarp send with push mode */ +#define I40IWQPSQ_WQDESCIDX_SHIFT 0 +#define I40IWQPSQ_WQDESCIDX_MASK (0x3fffUL << I40IWQPSQ_WQDESCIDX_SHIFT) + +/* rdma write */ +#define I40IWQPSQ_REMSTAG_SHIFT 0 +#define I40IWQPSQ_REMSTAG_MASK (0xffffffffUL << I40IWQPSQ_REMSTAG_SHIFT) + +#define I40IWQPSQ_REMTO_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IWQPSQ_REMTO_MASK I40IW_CQPHC_QPCTX_MASK + +/* memory window */ +#define I40IWQPSQ_STAGRIGHTS_SHIFT 48 +#define I40IWQPSQ_STAGRIGHTS_MASK (0x1fULL << I40IWQPSQ_STAGRIGHTS_SHIFT) + +#define I40IWQPSQ_VABASEDTO_SHIFT 53 +#define I40IWQPSQ_VABASEDTO_MASK (1ULL << I40IWQPSQ_VABASEDTO_SHIFT) + +#define I40IWQPSQ_MWLEN_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IWQPSQ_MWLEN_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IWQPSQ_PARENTMRSTAG_SHIFT 0 +#define I40IWQPSQ_PARENTMRSTAG_MASK \ + (0xffffffffUL << I40IWQPSQ_PARENTMRSTAG_SHIFT) + +#define I40IWQPSQ_MWSTAG_SHIFT 32 +#define I40IWQPSQ_MWSTAG_MASK (0xffffffffULL << I40IWQPSQ_MWSTAG_SHIFT) + +#define I40IWQPSQ_BASEVA_TO_FBO_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IWQPSQ_BASEVA_TO_FBO_MASK I40IW_CQPHC_QPCTX_MASK + +/* Local Invalidate */ +#define I40IWQPSQ_LOCSTAG_SHIFT 32 +#define I40IWQPSQ_LOCSTAG_MASK (0xffffffffULL << I40IWQPSQ_LOCSTAG_SHIFT) + +/* Fast Register */ +#define I40IWQPSQ_STAGKEY_SHIFT 0 +#define I40IWQPSQ_STAGKEY_MASK (0xffUL << I40IWQPSQ_STAGKEY_SHIFT) + +#define I40IWQPSQ_STAGINDEX_SHIFT 8 +#define I40IWQPSQ_STAGINDEX_MASK (0xffffffUL << I40IWQPSQ_STAGINDEX_SHIFT) + +#define I40IWQPSQ_COPYHOSTPBLS_SHIFT 43 +#define I40IWQPSQ_COPYHOSTPBLS_MASK (1ULL << I40IWQPSQ_COPYHOSTPBLS_SHIFT) + +#define I40IWQPSQ_LPBLSIZE_SHIFT 44 +#define I40IWQPSQ_LPBLSIZE_MASK (3ULL << I40IWQPSQ_LPBLSIZE_SHIFT) + +#define I40IWQPSQ_HPAGESIZE_SHIFT 46 +#define I40IWQPSQ_HPAGESIZE_MASK (3ULL << I40IWQPSQ_HPAGESIZE_SHIFT) + +#define I40IWQPSQ_STAGLEN_SHIFT 0 +#define I40IWQPSQ_STAGLEN_MASK (0x1ffffffffffULL << I40IWQPSQ_STAGLEN_SHIFT) + +#define I40IWQPSQ_FIRSTPMPBLIDXLO_SHIFT 48 +#define I40IWQPSQ_FIRSTPMPBLIDXLO_MASK \ + (0xffffULL 
<< I40IWQPSQ_FIRSTPMPBLIDXLO_SHIFT) + +#define I40IWQPSQ_FIRSTPMPBLIDXHI_SHIFT 0 +#define I40IWQPSQ_FIRSTPMPBLIDXHI_MASK \ + (0xfffUL << I40IWQPSQ_FIRSTPMPBLIDXHI_SHIFT) + +#define I40IWQPSQ_PBLADDR_SHIFT 12 +#define I40IWQPSQ_PBLADDR_MASK (0xfffffffffffffULL << I40IWQPSQ_PBLADDR_SHIFT) + +/* iwarp QP RQ WQE common fields */ +#define I40IWQPRQ_ADDFRAGCNT_SHIFT I40IWQPSQ_ADDFRAGCNT_SHIFT +#define I40IWQPRQ_ADDFRAGCNT_MASK I40IWQPSQ_ADDFRAGCNT_MASK + +#define I40IWQPRQ_VALID_SHIFT I40IWQPSQ_VALID_SHIFT +#define I40IWQPRQ_VALID_MASK I40IWQPSQ_VALID_MASK + +#define I40IWQPRQ_COMPLCTX_SHIFT I40IW_CQPHC_QPCTX_SHIFT +#define I40IWQPRQ_COMPLCTX_MASK I40IW_CQPHC_QPCTX_MASK + +#define I40IWQPRQ_FRAG_LEN_SHIFT I40IWQPSQ_FRAG_LEN_SHIFT +#define I40IWQPRQ_FRAG_LEN_MASK I40IWQPSQ_FRAG_LEN_MASK + +#define I40IWQPRQ_STAG_SHIFT I40IWQPSQ_FRAG_STAG_SHIFT +#define I40IWQPRQ_STAG_MASK I40IWQPSQ_FRAG_STAG_MASK + +#define I40IWQPRQ_TO_SHIFT I40IWQPSQ_FRAG_TO_SHIFT +#define I40IWQPRQ_TO_MASK I40IWQPSQ_FRAG_TO_MASK + +/* Query FPM CQP buf */ +#define I40IW_QUERY_FPM_MAX_QPS_SHIFT 0 +#define I40IW_QUERY_FPM_MAX_QPS_MASK \ + (0x7ffffUL << I40IW_QUERY_FPM_MAX_QPS_SHIFT) + +#define I40IW_QUERY_FPM_MAX_CQS_SHIFT 0 +#define I40IW_QUERY_FPM_MAX_CQS_MASK \ + (0x3ffffUL << I40IW_QUERY_FPM_MAX_CQS_SHIFT) + +#define I40IW_QUERY_FPM_FIRST_PE_SD_INDEX_SHIFT 0 +#define I40IW_QUERY_FPM_FIRST_PE_SD_INDEX_MASK \ + (0x3fffUL << I40IW_QUERY_FPM_FIRST_PE_SD_INDEX_SHIFT) + +#define I40IW_QUERY_FPM_MAX_PE_SDS_SHIFT 32 +#define I40IW_QUERY_FPM_MAX_PE_SDS_MASK \ + (0x3fffULL << I40IW_QUERY_FPM_MAX_PE_SDS_SHIFT) + +#define I40IW_QUERY_FPM_MAX_QPS_SHIFT 0 +#define I40IW_QUERY_FPM_MAX_QPS_MASK \ + (0x7ffffUL << I40IW_QUERY_FPM_MAX_QPS_SHIFT) + +#define I40IW_QUERY_FPM_MAX_CQS_SHIFT 0 +#define I40IW_QUERY_FPM_MAX_CQS_MASK \ + (0x3ffffUL << I40IW_QUERY_FPM_MAX_CQS_SHIFT) + +#define I40IW_QUERY_FPM_MAX_CEQS_SHIFT 0 +#define I40IW_QUERY_FPM_MAX_CEQS_MASK \ + (0xffUL << I40IW_QUERY_FPM_MAX_CEQS_SHIFT) + +#define I40IW_QUERY_FPM_XFBLOCKSIZE_SHIFT 32 +#define I40IW_QUERY_FPM_XFBLOCKSIZE_MASK \ + (0xffffffffULL << I40IW_QUERY_FPM_XFBLOCKSIZE_SHIFT) + +#define I40IW_QUERY_FPM_Q1BLOCKSIZE_SHIFT 32 +#define I40IW_QUERY_FPM_Q1BLOCKSIZE_MASK \ + (0xffffffffULL << I40IW_QUERY_FPM_Q1BLOCKSIZE_SHIFT) + +#define I40IW_QUERY_FPM_HTMULTIPLIER_SHIFT 16 +#define I40IW_QUERY_FPM_HTMULTIPLIER_MASK \ + (0xfUL << I40IW_QUERY_FPM_HTMULTIPLIER_SHIFT) + +#define I40IW_QUERY_FPM_TIMERBUCKET_SHIFT 32 +#define I40IW_QUERY_FPM_TIMERBUCKET_MASK \ + (0xffFFULL << I40IW_QUERY_FPM_TIMERBUCKET_SHIFT) + +/* Static HMC pages allocated buf */ +#define I40IW_SHMC_PAGE_ALLOCATED_HMC_FN_ID_SHIFT 0 +#define I40IW_SHMC_PAGE_ALLOCATED_HMC_FN_ID_MASK \ + (0x3fUL << I40IW_SHMC_PAGE_ALLOCATED_HMC_FN_ID_SHIFT) + +#define I40IW_HW_PAGE_SIZE 4096 +#define I40IW_DONE_COUNT 1000 +#define I40IW_SLEEP_COUNT 10 + +enum { + I40IW_QUEUES_ALIGNMENT_MASK = (128 - 1), + I40IW_AEQ_ALIGNMENT_MASK = (256 - 1), + I40IW_Q2_ALIGNMENT_MASK = (256 - 1), + I40IW_CEQ_ALIGNMENT_MASK = (256 - 1), + I40IW_CQ0_ALIGNMENT_MASK = (256 - 1), + I40IW_HOST_CTX_ALIGNMENT_MASK = (4 - 1), + I40IW_SHADOWAREA_MASK = (128 - 1), + I40IW_FPM_QUERY_BUF_ALIGNMENT_MASK = 0, + I40IW_FPM_COMMIT_BUF_ALIGNMENT_MASK = 0 +}; + +enum i40iw_alignment { + I40IW_CQP_ALIGNMENT = 0x200, + I40IW_AEQ_ALIGNMENT = 0x100, + I40IW_CEQ_ALIGNMENT = 0x100, + I40IW_CQ0_ALIGNMENT = 0x100, + I40IW_SD_BUF_ALIGNMENT = 0x100 +}; + +#define I40IW_WQE_SIZE_64 64 + +#define I40IW_QP_WQE_MIN_SIZE 32 +#define I40IW_QP_WQE_MAX_SIZE 128 + +#define 
I40IW_CQE_QTYPE_RQ 0 +#define I40IW_CQE_QTYPE_SQ 1 + +#define I40IW_RING_INIT(_ring, _size) \ + { \ + (_ring).head = 0; \ + (_ring).tail = 0; \ + (_ring).size = (_size); \ + } +#define I40IW_RING_GETSIZE(_ring) ((_ring).size) +#define I40IW_RING_GETCURRENT_HEAD(_ring) ((_ring).head) +#define I40IW_RING_GETCURRENT_TAIL(_ring) ((_ring).tail) + +#define I40IW_RING_MOVE_HEAD(_ring, _retcode) \ + { \ + register u32 size; \ + size = (_ring).size; \ + if (!I40IW_RING_FULL_ERR(_ring)) { \ + (_ring).head = ((_ring).head + 1) % size; \ + (_retcode) = 0; \ + } else { \ + (_retcode) = I40IW_ERR_RING_FULL; \ + } \ + } + +#define I40IW_RING_MOVE_HEAD_BY_COUNT(_ring, _count, _retcode) \ + { \ + register u32 size; \ + size = (_ring).size; \ + if ((I40IW_RING_WORK_AVAILABLE(_ring) + (_count)) < size) { \ + (_ring).head = ((_ring).head + (_count)) % size; \ + (_retcode) = 0; \ + } else { \ + (_retcode) = I40IW_ERR_RING_FULL; \ + } \ + } + +#define I40IW_RING_MOVE_TAIL(_ring) \ + (_ring).tail = ((_ring).tail + 1) % (_ring).size + +#define I40IW_RING_MOVE_HEAD_NOCHECK(_ring) \ + (_ring).head = ((_ring).head + 1) % (_ring).size + +#define I40IW_RING_MOVE_TAIL_BY_COUNT(_ring, _count) \ + (_ring).tail = ((_ring).tail + (_count)) % (_ring).size + +#define I40IW_RING_SET_TAIL(_ring, _pos) \ + (_ring).tail = (_pos) % (_ring).size + +#define I40IW_RING_FULL_ERR(_ring) \ + ( \ + (I40IW_RING_WORK_AVAILABLE(_ring) == ((_ring).size - 1)) \ + ) + +#define I40IW_ERR_RING_FULL2(_ring) \ + ( \ + (I40IW_RING_WORK_AVAILABLE(_ring) == ((_ring).size - 2)) \ + ) + +#define I40IW_ERR_RING_FULL3(_ring) \ + ( \ + (I40IW_RING_WORK_AVAILABLE(_ring) == ((_ring).size - 3)) \ + ) + +#define I40IW_RING_MORE_WORK(_ring) \ + ( \ + (I40IW_RING_WORK_AVAILABLE(_ring) != 0) \ + ) + +#define I40IW_RING_WORK_AVAILABLE(_ring) \ + ( \ + (((_ring).head + (_ring).size - (_ring).tail) % (_ring).size) \ + ) + +#define I40IW_RING_GET_WQES_AVAILABLE(_ring) \ + ( \ + ((_ring).size - I40IW_RING_WORK_AVAILABLE(_ring) - 1) \ + ) + +#define I40IW_ATOMIC_RING_MOVE_HEAD(_ring, index, _retcode) \ + { \ + index = I40IW_RING_GETCURRENT_HEAD(_ring); \ + I40IW_RING_MOVE_HEAD(_ring, _retcode); \ + } + +/* Async Events codes */ +#define I40IW_AE_AMP_UNALLOCATED_STAG 0x0102 +#define I40IW_AE_AMP_INVALID_STAG 0x0103 +#define I40IW_AE_AMP_BAD_QP 0x0104 +#define I40IW_AE_AMP_BAD_PD 0x0105 +#define I40IW_AE_AMP_BAD_STAG_KEY 0x0106 +#define I40IW_AE_AMP_BAD_STAG_INDEX 0x0107 +#define I40IW_AE_AMP_BOUNDS_VIOLATION 0x0108 +#define I40IW_AE_AMP_RIGHTS_VIOLATION 0x0109 +#define I40IW_AE_AMP_TO_WRAP 0x010a +#define I40IW_AE_AMP_FASTREG_SHARED 0x010b +#define I40IW_AE_AMP_FASTREG_VALID_STAG 0x010c +#define I40IW_AE_AMP_FASTREG_MW_STAG 0x010d +#define I40IW_AE_AMP_FASTREG_INVALID_RIGHTS 0x010e +#define I40IW_AE_AMP_FASTREG_PBL_TABLE_OVERFLOW 0x010f +#define I40IW_AE_AMP_FASTREG_INVALID_LENGTH 0x0110 +#define I40IW_AE_AMP_INVALIDATE_SHARED 0x0111 +#define I40IW_AE_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS 0x0112 +#define I40IW_AE_AMP_INVALIDATE_MR_WITH_BOUND_WINDOWS 0x0113 +#define I40IW_AE_AMP_MWBIND_VALID_STAG 0x0114 +#define I40IW_AE_AMP_MWBIND_OF_MR_STAG 0x0115 +#define I40IW_AE_AMP_MWBIND_TO_ZERO_BASED_STAG 0x0116 +#define I40IW_AE_AMP_MWBIND_TO_MW_STAG 0x0117 +#define I40IW_AE_AMP_MWBIND_INVALID_RIGHTS 0x0118 +#define I40IW_AE_AMP_MWBIND_INVALID_BOUNDS 0x0119 +#define I40IW_AE_AMP_MWBIND_TO_INVALID_PARENT 0x011a +#define I40IW_AE_AMP_MWBIND_BIND_DISABLED 0x011b +#define I40IW_AE_AMP_WQE_INVALID_PARAMETER 0x0130 +#define I40IW_AE_BAD_CLOSE 0x0201 +#define 
I40IW_AE_RDMAP_ROE_BAD_LLP_CLOSE 0x0202 +#define I40IW_AE_CQ_OPERATION_ERROR 0x0203 +#define I40IW_AE_PRIV_OPERATION_DENIED 0x011c +#define I40IW_AE_RDMA_READ_WHILE_ORD_ZERO 0x0205 +#define I40IW_AE_STAG_ZERO_INVALID 0x0206 +#define I40IW_AE_IB_RREQ_AND_Q1_FULL 0x0207 +#define I40IW_AE_SRQ_LIMIT 0x0209 +#define I40IW_AE_WQE_UNEXPECTED_OPCODE 0x020a +#define I40IW_AE_WQE_INVALID_PARAMETER 0x020b +#define I40IW_AE_WQE_LSMM_TOO_LONG 0x0220 +#define I40IW_AE_DDP_INVALID_MSN_GAP_IN_MSN 0x0301 +#define I40IW_AE_DDP_INVALID_MSN_RANGE_IS_NOT_VALID 0x0302 +#define I40IW_AE_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER 0x0303 +#define I40IW_AE_DDP_UBE_INVALID_DDP_VERSION 0x0304 +#define I40IW_AE_DDP_UBE_INVALID_MO 0x0305 +#define I40IW_AE_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE 0x0306 +#define I40IW_AE_DDP_UBE_INVALID_QN 0x0307 +#define I40IW_AE_DDP_NO_L_BIT 0x0308 +#define I40IW_AE_RDMAP_ROE_INVALID_RDMAP_VERSION 0x0311 +#define I40IW_AE_RDMAP_ROE_UNEXPECTED_OPCODE 0x0312 +#define I40IW_AE_ROE_INVALID_RDMA_READ_REQUEST 0x0313 +#define I40IW_AE_ROE_INVALID_RDMA_WRITE_OR_READ_RESP 0x0314 +#define I40IW_AE_INVALID_ARP_ENTRY 0x0401 +#define I40IW_AE_INVALID_TCP_OPTION_RCVD 0x0402 +#define I40IW_AE_STALE_ARP_ENTRY 0x0403 +#define I40IW_AE_INVALID_WQE_LENGTH 0x0404 +#define I40IW_AE_INVALID_MAC_ENTRY 0x0405 +#define I40IW_AE_LLP_CLOSE_COMPLETE 0x0501 +#define I40IW_AE_LLP_CONNECTION_RESET 0x0502 +#define I40IW_AE_LLP_FIN_RECEIVED 0x0503 +#define I40IW_AE_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH 0x0504 +#define I40IW_AE_LLP_RECEIVED_MPA_CRC_ERROR 0x0505 +#define I40IW_AE_LLP_SEGMENT_TOO_LARGE 0x0506 +#define I40IW_AE_LLP_SEGMENT_TOO_SMALL 0x0507 +#define I40IW_AE_LLP_SYN_RECEIVED 0x0508 +#define I40IW_AE_LLP_TERMINATE_RECEIVED 0x0509 +#define I40IW_AE_LLP_TOO_MANY_RETRIES 0x050a +#define I40IW_AE_LLP_TOO_MANY_KEEPALIVE_RETRIES 0x050b +#define I40IW_AE_LLP_DOUBT_REACHABILITY 0x050c +#define I40IW_AE_LLP_RX_VLAN_MISMATCH 0x050d +#define I40IW_AE_RESOURCE_EXHAUSTION 0x0520 +#define I40IW_AE_RESET_SENT 0x0601 +#define I40IW_AE_TERMINATE_SENT 0x0602 +#define I40IW_AE_RESET_NOT_SENT 0x0603 +#define I40IW_AE_LCE_QP_CATASTROPHIC 0x0700 +#define I40IW_AE_LCE_FUNCTION_CATASTROPHIC 0x0701 +#define I40IW_AE_LCE_CQ_CATASTROPHIC 0x0702 +#define I40IW_AE_UDA_XMIT_FRAG_SEQ 0x0800 +#define I40IW_AE_UDA_XMIT_DGRAM_TOO_LONG 0x0801 +#define I40IW_AE_UDA_XMIT_IPADDR_MISMATCH 0x0802 +#define I40IW_AE_QP_SUSPEND_COMPLETE 0x0900 + +#define OP_DELETE_LOCAL_MAC_IPADDR_ENTRY 1 +#define OP_CEQ_DESTROY 2 +#define OP_AEQ_DESTROY 3 +#define OP_DELETE_ARP_CACHE_ENTRY 4 +#define OP_MANAGE_APBVT_ENTRY 5 +#define OP_CEQ_CREATE 6 +#define OP_AEQ_CREATE 7 +#define OP_ALLOC_LOCAL_MAC_IPADDR_ENTRY 8 +#define OP_ADD_LOCAL_MAC_IPADDR_ENTRY 9 +#define OP_MANAGE_QHASH_TABLE_ENTRY 10 +#define OP_QP_MODIFY 11 +#define OP_QP_UPLOAD_CONTEXT 12 +#define OP_CQ_CREATE 13 +#define OP_CQ_DESTROY 14 +#define OP_QP_CREATE 15 +#define OP_QP_DESTROY 16 +#define OP_ALLOC_STAG 17 +#define OP_MR_REG_NON_SHARED 18 +#define OP_DEALLOC_STAG 19 +#define OP_MW_ALLOC 20 +#define OP_QP_FLUSH_WQES 21 +#define OP_ADD_ARP_CACHE_ENTRY 22 +#define OP_MANAGE_PUSH_PAGE 23 +#define OP_UPDATE_PE_SDS 24 +#define OP_MANAGE_HMC_PM_FUNC_TABLE 25 +#define OP_SUSPEND 26 +#define OP_RESUME 27 +#define OP_MANAGE_VF_PBLE_BP 28 +#define OP_QUERY_FPM_VALUES 29 +#define OP_COMMIT_FPM_VALUES 30 +#define OP_SIZE_CQP_STAT_ARRAY 31 + +#endif diff --git a/providers/i40iw/i40iw_osdep.h b/providers/i40iw/i40iw_osdep.h new file mode 100644 index 0000000..92bedd3 --- /dev/null +++ 
b/providers/i40iw/i40iw_osdep.h
@@ -0,0 +1,108 @@
+/*******************************************************************************
+*
+* Copyright (c) 2015-2016 Intel Corporation. All rights reserved.
+*
+* This software is available to you under a choice of one of two
+* licenses. You may choose to be licensed under the terms of the GNU
+* General Public License (GPL) Version 2, available from the file
+* COPYING in the main directory of this source tree, or the
+* OpenFabrics.org BSD license below:
+*
+* Redistribution and use in source and binary forms, with or
+* without modification, are permitted provided that the following
+* conditions are met:
+*
+* - Redistributions of source code must retain the above
+* copyright notice, this list of conditions and the following
+* disclaimer.
+*
+* - Redistributions in binary form must reproduce the above
+* copyright notice, this list of conditions and the following
+* disclaimer in the documentation and/or other materials
+* provided with the distribution.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*
+*******************************************************************************/
+
+#ifndef I40IW_OSDEP_H
+#define I40IW_OSDEP_H
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <util/udma_barrier.h>
+#include <linux/types.h>
+typedef unsigned char u8;
+typedef unsigned long long u64;
+typedef unsigned int u32;
+typedef unsigned short u16;
+typedef unsigned long i40iw_uintptr;
+typedef unsigned long *i40iw_bits_t;
+typedef __be16 BE16;
+typedef __be32 BE32;
+typedef __be64 BE64;
+typedef __le16 LE16;
+typedef __le32 LE32;
+typedef __le64 LE64;
+
+#define STATS_TIMER_DELAY 1000
+#define INLINE inline
+
+static inline void set_64bit_val(u64 *wqe_words, u32 byte_index, u64 value)
+{
+	wqe_words[byte_index >> 3] = value;
+}
+
+/**
+ * set_32bit_val - set 32 bit value to hw wqe
+ * @wqe_words: wqe addr to write
+ * @byte_index: index in wqe
+ * @value: value to write
+ **/
+static inline void set_32bit_val(u32 *wqe_words, u32 byte_index, u32 value)
+{
+	wqe_words[byte_index >> 2] = value;
+}
+
+/**
+ * get_64bit_val - read 64 bit value from wqe
+ * @wqe_words: wqe addr
+ * @byte_index: index to read from
+ * @value: read value
+ **/
+static inline void get_64bit_val(u64 *wqe_words, u32 byte_index, u64 *value)
+{
+	*value = wqe_words[byte_index >> 3];
+}
+
+/**
+ * get_32bit_val - read 32 bit value from wqe
+ * @wqe_words: wqe addr
+ * @byte_index: index to read from
+ * @value: return 32 bit value
+ **/
+static inline void get_32bit_val(u32 *wqe_words, u32 byte_index, u32 *value)
+{
+	*value = wqe_words[byte_index >> 2];
+}
+
+#define i40iw_get_virt_to_phy
+#define IOMEM
+
+static inline void db_wr32(u32 value, u32 *wqe_word)
+{
+	*wqe_word = value;
+}
+
+#define ACQUIRE_LOCK()
+#define RELEASE_LOCK()
+
+#endif /* I40IW_OSDEP_H */
diff --git a/providers/i40iw/i40iw_register.h b/providers/i40iw/i40iw_register.h
new file mode 100644
index 0000000..5776818
--- /dev/null
+++ b/providers/i40iw/i40iw_register.h
@@ -0,0 +1,1030 @@
+/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +* +*******************************************************************************/ + +#ifndef I40IW_REGISTER_H +#define I40IW_REGISTER_H + +#define I40E_GLGEN_STAT 0x000B612C /* Reset: POR */ + +#define I40E_PFHMC_PDINV 0x000C0300 /* Reset: PFR */ +#define I40E_PFHMC_PDINV_PMSDIDX_SHIFT 0 +#define I40E_PFHMC_PDINV_PMSDIDX_MASK (0xFFF << I40E_PFHMC_PDINV_PMSDIDX_SHIFT) +#define I40E_PFHMC_PDINV_PMPDIDX_SHIFT 16 +#define I40E_PFHMC_PDINV_PMPDIDX_MASK (0x1FF << I40E_PFHMC_PDINV_PMPDIDX_SHIFT) +#define I40E_PFHMC_SDCMD_PMSDWR_SHIFT 31 +#define I40E_PFHMC_SDCMD_PMSDWR_MASK (0x1 << I40E_PFHMC_SDCMD_PMSDWR_SHIFT) +#define I40E_PFHMC_SDDATALOW_PMSDVALID_SHIFT 0 +#define I40E_PFHMC_SDDATALOW_PMSDVALID_MASK (0x1 << I40E_PFHMC_SDDATALOW_PMSDVALID_SHIFT) +#define I40E_PFHMC_SDDATALOW_PMSDTYPE_SHIFT 1 +#define I40E_PFHMC_SDDATALOW_PMSDTYPE_MASK (0x1 << I40E_PFHMC_SDDATALOW_PMSDTYPE_SHIFT) +#define I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_SHIFT 2 +#define I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_MASK (0x3FF << I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_SHIFT) + +#define I40E_PFINT_DYN_CTLN(_INTPF) (0x00034800 + ((_INTPF) * 4)) /* _i=0...511 */ /* Reset: PFR */ +#define I40E_PFINT_DYN_CTLN_INTENA_SHIFT 0 +#define I40E_PFINT_DYN_CTLN_INTENA_MASK (0x1 << I40E_PFINT_DYN_CTLN_INTENA_SHIFT) +#define I40E_PFINT_DYN_CTLN_CLEARPBA_SHIFT 1 +#define I40E_PFINT_DYN_CTLN_CLEARPBA_MASK (0x1 << I40E_PFINT_DYN_CTLN_CLEARPBA_SHIFT) +#define I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT 3 +#define I40E_PFINT_DYN_CTLN_ITR_INDX_MASK (0x3 << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT) + +#define I40E_VFINT_DYN_CTLN1(_INTVF) (0x00003800 + ((_INTVF) * 4)) /* _i=0...15 */ /* Reset: VFR */ +#define I40E_GLHMC_VFPDINV(_i) (0x000C8300 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ + +#define I40E_PFHMC_PDINV_PMSDPARTSEL_SHIFT 15 +#define I40E_PFHMC_PDINV_PMSDPARTSEL_MASK (0x1 << I40E_PFHMC_PDINV_PMSDPARTSEL_SHIFT) +#define I40E_GLPCI_LBARCTRL 0x000BE484 /* Reset: POR */ +#define I40E_GLPCI_LBARCTRL_PE_DB_SIZE_SHIFT 4 +#define I40E_GLPCI_LBARCTRL_PE_DB_SIZE_MASK (0x3 << I40E_GLPCI_LBARCTRL_PE_DB_SIZE_SHIFT) +#define I40E_GLPCI_DREVID 
0x0009C480 /* Reset: PCIR */ +#define I40E_GLPCI_DREVID_DEFAULT_REVID_SHIFT 0 +#define I40E_GLPCI_DREVID_DEFAULT_REVID_MASK 0xFF + +#define I40E_PFPE_AEQALLOC 0x00131180 /* Reset: PFR */ +#define I40E_PFPE_AEQALLOC_AECOUNT_SHIFT 0 +#define I40E_PFPE_AEQALLOC_AECOUNT_MASK (0xFFFFFFFF << I40E_PFPE_AEQALLOC_AECOUNT_SHIFT) +#define I40E_PFPE_CCQPHIGH 0x00008200 /* Reset: PFR */ +#define I40E_PFPE_CCQPHIGH_PECCQPHIGH_SHIFT 0 +#define I40E_PFPE_CCQPHIGH_PECCQPHIGH_MASK (0xFFFFFFFF << I40E_PFPE_CCQPHIGH_PECCQPHIGH_SHIFT) +#define I40E_PFPE_CCQPLOW 0x00008180 /* Reset: PFR */ +#define I40E_PFPE_CCQPLOW_PECCQPLOW_SHIFT 0 +#define I40E_PFPE_CCQPLOW_PECCQPLOW_MASK (0xFFFFFFFF << I40E_PFPE_CCQPLOW_PECCQPLOW_SHIFT) +#define I40E_PFPE_CCQPSTATUS 0x00008100 /* Reset: PFR */ +#define I40E_PFPE_CCQPSTATUS_CCQP_DONE_SHIFT 0 +#define I40E_PFPE_CCQPSTATUS_CCQP_DONE_MASK (0x1 << I40E_PFPE_CCQPSTATUS_CCQP_DONE_SHIFT) +#define I40E_PFPE_CCQPSTATUS_HMC_PROFILE_SHIFT 4 +#define I40E_PFPE_CCQPSTATUS_HMC_PROFILE_MASK (0x7 << I40E_PFPE_CCQPSTATUS_HMC_PROFILE_SHIFT) +#define I40E_PFPE_CCQPSTATUS_RDMA_EN_VFS_SHIFT 16 +#define I40E_PFPE_CCQPSTATUS_RDMA_EN_VFS_MASK (0x3F << I40E_PFPE_CCQPSTATUS_RDMA_EN_VFS_SHIFT) +#define I40E_PFPE_CCQPSTATUS_CCQP_ERR_SHIFT 31 +#define I40E_PFPE_CCQPSTATUS_CCQP_ERR_MASK (0x1 << I40E_PFPE_CCQPSTATUS_CCQP_ERR_SHIFT) +#define I40E_PFPE_CQACK 0x00131100 /* Reset: PFR */ +#define I40E_PFPE_CQACK_PECQID_SHIFT 0 +#define I40E_PFPE_CQACK_PECQID_MASK (0x1FFFF << I40E_PFPE_CQACK_PECQID_SHIFT) +#define I40E_PFPE_CQARM 0x00131080 /* Reset: PFR */ +#define I40E_PFPE_CQARM_PECQID_SHIFT 0 +#define I40E_PFPE_CQARM_PECQID_MASK (0x1FFFF << I40E_PFPE_CQARM_PECQID_SHIFT) +#define I40E_PFPE_CQPDB 0x00008000 /* Reset: PFR */ +#define I40E_PFPE_CQPDB_WQHEAD_SHIFT 0 +#define I40E_PFPE_CQPDB_WQHEAD_MASK (0x7FF << I40E_PFPE_CQPDB_WQHEAD_SHIFT) +#define I40E_PFPE_CQPERRCODES 0x00008880 /* Reset: PFR */ +#define I40E_PFPE_CQPERRCODES_CQP_MINOR_CODE_SHIFT 0 +#define I40E_PFPE_CQPERRCODES_CQP_MINOR_CODE_MASK (0xFFFF << I40E_PFPE_CQPERRCODES_CQP_MINOR_CODE_SHIFT) +#define I40E_PFPE_CQPERRCODES_CQP_MAJOR_CODE_SHIFT 16 +#define I40E_PFPE_CQPERRCODES_CQP_MAJOR_CODE_MASK (0xFFFF << I40E_PFPE_CQPERRCODES_CQP_MAJOR_CODE_SHIFT) +#define I40E_PFPE_CQPTAIL 0x00008080 /* Reset: PFR */ +#define I40E_PFPE_CQPTAIL_WQTAIL_SHIFT 0 +#define I40E_PFPE_CQPTAIL_WQTAIL_MASK (0x7FF << I40E_PFPE_CQPTAIL_WQTAIL_SHIFT) +#define I40E_PFPE_CQPTAIL_CQP_OP_ERR_SHIFT 31 +#define I40E_PFPE_CQPTAIL_CQP_OP_ERR_MASK (0x1 << I40E_PFPE_CQPTAIL_CQP_OP_ERR_SHIFT) +#define I40E_PFPE_FLMQ1ALLOCERR 0x00008980 /* Reset: PFR */ +#define I40E_PFPE_FLMQ1ALLOCERR_ERROR_COUNT_SHIFT 0 +#define I40E_PFPE_FLMQ1ALLOCERR_ERROR_COUNT_MASK (0xFFFF << I40E_PFPE_FLMQ1ALLOCERR_ERROR_COUNT_SHIFT) +#define I40E_PFPE_FLMXMITALLOCERR 0x00008900 /* Reset: PFR */ +#define I40E_PFPE_FLMXMITALLOCERR_ERROR_COUNT_SHIFT 0 +#define I40E_PFPE_FLMXMITALLOCERR_ERROR_COUNT_MASK (0xFFFF << I40E_PFPE_FLMXMITALLOCERR_ERROR_COUNT_SHIFT) +#define I40E_PFPE_IPCONFIG0 0x00008280 /* Reset: PFR */ +#define I40E_PFPE_IPCONFIG0_PEIPID_SHIFT 0 +#define I40E_PFPE_IPCONFIG0_PEIPID_MASK (0xFFFF << I40E_PFPE_IPCONFIG0_PEIPID_SHIFT) +#define I40E_PFPE_IPCONFIG0_USEENTIREIDRANGE_SHIFT 16 +#define I40E_PFPE_IPCONFIG0_USEENTIREIDRANGE_MASK (0x1 << I40E_PFPE_IPCONFIG0_USEENTIREIDRANGE_SHIFT) +#define I40E_PFPE_MRTEIDXMASK 0x00008600 /* Reset: PFR */ +#define I40E_PFPE_MRTEIDXMASK_MRTEIDXMASKBITS_SHIFT 0 +#define I40E_PFPE_MRTEIDXMASK_MRTEIDXMASKBITS_MASK (0x1F << I40E_PFPE_MRTEIDXMASK_MRTEIDXMASKBITS_SHIFT) 
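[Editorial sketch] The CQPERRCODES register above packs a 16-bit minor and a 16-bit major error code into one 32-bit value. A minimal decoding sketch follows; the function name is hypothetical and the register read itself (via a mapped BAR) is assumed to happen elsewhere, with 'val' standing in for it:

#include <stdint.h>
#include <stdio.h>

/* Split a CQPERRCODES value into its major/minor parts using the
 * shift/mask pairs defined above. Illustration only, not an i40iw API. */
static void report_cqp_error(uint32_t val)
{
	uint32_t minor = (val & I40E_PFPE_CQPERRCODES_CQP_MINOR_CODE_MASK) >>
			 I40E_PFPE_CQPERRCODES_CQP_MINOR_CODE_SHIFT;
	uint32_t major = (val & I40E_PFPE_CQPERRCODES_CQP_MAJOR_CODE_MASK) >>
			 I40E_PFPE_CQPERRCODES_CQP_MAJOR_CODE_SHIFT;

	fprintf(stderr, "CQP error: major 0x%x minor 0x%x\n", major, minor);
}
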
+#define I40E_PFPE_RCVUNEXPECTEDERROR 0x00008680 /* Reset: PFR */ +#define I40E_PFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_SHIFT 0 +#define I40E_PFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_MASK (0xFFFFFF << I40E_PFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_SHIFT) +#define I40E_PFPE_TCPNOWTIMER 0x00008580 /* Reset: PFR */ +#define I40E_PFPE_TCPNOWTIMER_TCP_NOW_SHIFT 0 +#define I40E_PFPE_TCPNOWTIMER_TCP_NOW_MASK (0xFFFFFFFF << I40E_PFPE_TCPNOWTIMER_TCP_NOW_SHIFT) + +#define I40E_PFPE_WQEALLOC 0x00138C00 /* Reset: PFR */ +#define I40E_PFPE_WQEALLOC_PEQPID_SHIFT 0 +#define I40E_PFPE_WQEALLOC_PEQPID_MASK (0x3FFFF << I40E_PFPE_WQEALLOC_PEQPID_SHIFT) +#define I40E_PFPE_WQEALLOC_WQE_DESC_INDEX_SHIFT 20 +#define I40E_PFPE_WQEALLOC_WQE_DESC_INDEX_MASK (0xFFF << I40E_PFPE_WQEALLOC_WQE_DESC_INDEX_SHIFT) + +#define I40E_VFPE_AEQALLOC(_VF) (0x00130C00 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_AEQALLOC_MAX_INDEX 127 +#define I40E_VFPE_AEQALLOC_AECOUNT_SHIFT 0 +#define I40E_VFPE_AEQALLOC_AECOUNT_MASK (0xFFFFFFFF << I40E_VFPE_AEQALLOC_AECOUNT_SHIFT) +#define I40E_VFPE_CCQPHIGH(_VF) (0x00001000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CCQPHIGH_MAX_INDEX 127 +#define I40E_VFPE_CCQPHIGH_PECCQPHIGH_SHIFT 0 +#define I40E_VFPE_CCQPHIGH_PECCQPHIGH_MASK (0xFFFFFFFF << I40E_VFPE_CCQPHIGH_PECCQPHIGH_SHIFT) +#define I40E_VFPE_CCQPLOW(_VF) (0x00000C00 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CCQPLOW_MAX_INDEX 127 +#define I40E_VFPE_CCQPLOW_PECCQPLOW_SHIFT 0 +#define I40E_VFPE_CCQPLOW_PECCQPLOW_MASK (0xFFFFFFFF << I40E_VFPE_CCQPLOW_PECCQPLOW_SHIFT) +#define I40E_VFPE_CCQPSTATUS(_VF) (0x00000800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CCQPSTATUS_MAX_INDEX 127 +#define I40E_VFPE_CCQPSTATUS_CCQP_DONE_SHIFT 0 +#define I40E_VFPE_CCQPSTATUS_CCQP_DONE_MASK (0x1 << I40E_VFPE_CCQPSTATUS_CCQP_DONE_SHIFT) +#define I40E_VFPE_CCQPSTATUS_HMC_PROFILE_SHIFT 4 +#define I40E_VFPE_CCQPSTATUS_HMC_PROFILE_MASK (0x7 << I40E_VFPE_CCQPSTATUS_HMC_PROFILE_SHIFT) +#define I40E_VFPE_CCQPSTATUS_RDMA_EN_VFS_SHIFT 16 +#define I40E_VFPE_CCQPSTATUS_RDMA_EN_VFS_MASK (0x3F << I40E_VFPE_CCQPSTATUS_RDMA_EN_VFS_SHIFT) +#define I40E_VFPE_CCQPSTATUS_CCQP_ERR_SHIFT 31 +#define I40E_VFPE_CCQPSTATUS_CCQP_ERR_MASK (0x1 << I40E_VFPE_CCQPSTATUS_CCQP_ERR_SHIFT) +#define I40E_VFPE_CQACK(_VF) (0x00130800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CQACK_MAX_INDEX 127 +#define I40E_VFPE_CQACK_PECQID_SHIFT 0 +#define I40E_VFPE_CQACK_PECQID_MASK (0x1FFFF << I40E_VFPE_CQACK_PECQID_SHIFT) +#define I40E_VFPE_CQARM(_VF) (0x00130400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CQARM_MAX_INDEX 127 +#define I40E_VFPE_CQARM_PECQID_SHIFT 0 +#define I40E_VFPE_CQARM_PECQID_MASK (0x1FFFF << I40E_VFPE_CQARM_PECQID_SHIFT) +#define I40E_VFPE_CQPDB(_VF) (0x00000000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CQPDB_MAX_INDEX 127 +#define I40E_VFPE_CQPDB_WQHEAD_SHIFT 0 +#define I40E_VFPE_CQPDB_WQHEAD_MASK (0x7FF << I40E_VFPE_CQPDB_WQHEAD_SHIFT) +#define I40E_VFPE_CQPERRCODES(_VF) (0x00001800 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CQPERRCODES_MAX_INDEX 127 +#define I40E_VFPE_CQPERRCODES_CQP_MINOR_CODE_SHIFT 0 +#define I40E_VFPE_CQPERRCODES_CQP_MINOR_CODE_MASK (0xFFFF << I40E_VFPE_CQPERRCODES_CQP_MINOR_CODE_SHIFT) +#define I40E_VFPE_CQPERRCODES_CQP_MAJOR_CODE_SHIFT 16 +#define I40E_VFPE_CQPERRCODES_CQP_MAJOR_CODE_MASK (0xFFFF << I40E_VFPE_CQPERRCODES_CQP_MAJOR_CODE_SHIFT) +#define 
I40E_VFPE_CQPTAIL(_VF) (0x00000400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_CQPTAIL_MAX_INDEX 127 +#define I40E_VFPE_CQPTAIL_WQTAIL_SHIFT 0 +#define I40E_VFPE_CQPTAIL_WQTAIL_MASK (0x7FF << I40E_VFPE_CQPTAIL_WQTAIL_SHIFT) +#define I40E_VFPE_CQPTAIL_CQP_OP_ERR_SHIFT 31 +#define I40E_VFPE_CQPTAIL_CQP_OP_ERR_MASK (0x1 << I40E_VFPE_CQPTAIL_CQP_OP_ERR_SHIFT) +#define I40E_VFPE_IPCONFIG0(_VF) (0x00001400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_IPCONFIG0_MAX_INDEX 127 +#define I40E_VFPE_IPCONFIG0_PEIPID_SHIFT 0 +#define I40E_VFPE_IPCONFIG0_PEIPID_MASK (0xFFFF << I40E_VFPE_IPCONFIG0_PEIPID_SHIFT) +#define I40E_VFPE_IPCONFIG0_USEENTIREIDRANGE_SHIFT 16 +#define I40E_VFPE_IPCONFIG0_USEENTIREIDRANGE_MASK (0x1 << I40E_VFPE_IPCONFIG0_USEENTIREIDRANGE_SHIFT) +#define I40E_VFPE_MRTEIDXMASK(_VF) (0x00003000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_MRTEIDXMASK_MAX_INDEX 127 +#define I40E_VFPE_MRTEIDXMASK_MRTEIDXMASKBITS_SHIFT 0 +#define I40E_VFPE_MRTEIDXMASK_MRTEIDXMASKBITS_MASK (0x1F << I40E_VFPE_MRTEIDXMASK_MRTEIDXMASKBITS_SHIFT) +#define I40E_VFPE_RCVUNEXPECTEDERROR(_VF) (0x00003400 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_RCVUNEXPECTEDERROR_MAX_INDEX 127 +#define I40E_VFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_SHIFT 0 +#define I40E_VFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_MASK (0xFFFFFF << I40E_VFPE_RCVUNEXPECTEDERROR_TCP_RX_UNEXP_ERR_SHIFT) +#define I40E_VFPE_TCPNOWTIMER(_VF) (0x00002C00 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_TCPNOWTIMER_MAX_INDEX 127 +#define I40E_VFPE_TCPNOWTIMER_TCP_NOW_SHIFT 0 +#define I40E_VFPE_TCPNOWTIMER_TCP_NOW_MASK (0xFFFFFFFF << I40E_VFPE_TCPNOWTIMER_TCP_NOW_SHIFT) +#define I40E_VFPE_WQEALLOC(_VF) (0x00138000 + ((_VF) * 4)) /* _i=0...127 */ /* Reset: VFR */ +#define I40E_VFPE_WQEALLOC_MAX_INDEX 127 +#define I40E_VFPE_WQEALLOC_PEQPID_SHIFT 0 +#define I40E_VFPE_WQEALLOC_PEQPID_MASK (0x3FFFF << I40E_VFPE_WQEALLOC_PEQPID_SHIFT) +#define I40E_VFPE_WQEALLOC_WQE_DESC_INDEX_SHIFT 20 +#define I40E_VFPE_WQEALLOC_WQE_DESC_INDEX_MASK (0xFFF << I40E_VFPE_WQEALLOC_WQE_DESC_INDEX_SHIFT) + +#define I40E_GLPE_CPUSTATUS0 0x0000D040 /* Reset: PE_CORER */ +#define I40E_GLPE_CPUSTATUS0_PECPUSTATUS0_SHIFT 0 +#define I40E_GLPE_CPUSTATUS0_PECPUSTATUS0_MASK (0xFFFFFFFF << I40E_GLPE_CPUSTATUS0_PECPUSTATUS0_SHIFT) +#define I40E_GLPE_CPUSTATUS1 0x0000D044 /* Reset: PE_CORER */ +#define I40E_GLPE_CPUSTATUS1_PECPUSTATUS1_SHIFT 0 +#define I40E_GLPE_CPUSTATUS1_PECPUSTATUS1_MASK (0xFFFFFFFF << I40E_GLPE_CPUSTATUS1_PECPUSTATUS1_SHIFT) +#define I40E_GLPE_CPUSTATUS2 0x0000D048 /* Reset: PE_CORER */ +#define I40E_GLPE_CPUSTATUS2_PECPUSTATUS2_SHIFT 0 +#define I40E_GLPE_CPUSTATUS2_PECPUSTATUS2_MASK (0xFFFFFFFF << I40E_GLPE_CPUSTATUS2_PECPUSTATUS2_SHIFT) +#define I40E_GLPE_CPUTRIG0 0x0000D060 /* Reset: PE_CORER */ +#define I40E_GLPE_CPUTRIG0_PECPUTRIG0_SHIFT 0 +#define I40E_GLPE_CPUTRIG0_PECPUTRIG0_MASK (0xFFFF << I40E_GLPE_CPUTRIG0_PECPUTRIG0_SHIFT) +#define I40E_GLPE_CPUTRIG0_TEPREQUEST0_SHIFT 17 +#define I40E_GLPE_CPUTRIG0_TEPREQUEST0_MASK (0x1 << I40E_GLPE_CPUTRIG0_TEPREQUEST0_SHIFT) +#define I40E_GLPE_CPUTRIG0_OOPREQUEST0_SHIFT 18 +#define I40E_GLPE_CPUTRIG0_OOPREQUEST0_MASK (0x1 << I40E_GLPE_CPUTRIG0_OOPREQUEST0_SHIFT) +#define I40E_GLPE_DUAL40_RUPM 0x0000DA04 /* Reset: PE_CORER */ +#define I40E_GLPE_DUAL40_RUPM_DUAL_40G_MODE_SHIFT 0 +#define I40E_GLPE_DUAL40_RUPM_DUAL_40G_MODE_MASK (0x1 << I40E_GLPE_DUAL40_RUPM_DUAL_40G_MODE_SHIFT) +#define I40E_GLPE_PFAEQEDROPCNT(_i) 
(0x00131440 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLPE_PFAEQEDROPCNT_MAX_INDEX 15 +#define I40E_GLPE_PFAEQEDROPCNT_AEQEDROPCNT_SHIFT 0 +#define I40E_GLPE_PFAEQEDROPCNT_AEQEDROPCNT_MASK (0xFFFF << I40E_GLPE_PFAEQEDROPCNT_AEQEDROPCNT_SHIFT) +#define I40E_GLPE_PFCEQEDROPCNT(_i) (0x001313C0 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLPE_PFCEQEDROPCNT_MAX_INDEX 15 +#define I40E_GLPE_PFCEQEDROPCNT_CEQEDROPCNT_SHIFT 0 +#define I40E_GLPE_PFCEQEDROPCNT_CEQEDROPCNT_MASK (0xFFFF << I40E_GLPE_PFCEQEDROPCNT_CEQEDROPCNT_SHIFT) +#define I40E_GLPE_PFCQEDROPCNT(_i) (0x00131340 + ((_i) * 4)) /* _i=0...15 */ /* Reset: CORER */ +#define I40E_GLPE_PFCQEDROPCNT_MAX_INDEX 15 +#define I40E_GLPE_PFCQEDROPCNT_CQEDROPCNT_SHIFT 0 +#define I40E_GLPE_PFCQEDROPCNT_CQEDROPCNT_MASK (0xFFFF << I40E_GLPE_PFCQEDROPCNT_CQEDROPCNT_SHIFT) +#define I40E_GLPE_RUPM_CQPPOOL 0x0000DACC /* Reset: PE_CORER */ +#define I40E_GLPE_RUPM_CQPPOOL_CQPSPADS_SHIFT 0 +#define I40E_GLPE_RUPM_CQPPOOL_CQPSPADS_MASK (0xFF << I40E_GLPE_RUPM_CQPPOOL_CQPSPADS_SHIFT) +#define I40E_GLPE_RUPM_FLRPOOL 0x0000DAC4 /* Reset: PE_CORER */ +#define I40E_GLPE_RUPM_FLRPOOL_FLRSPADS_SHIFT 0 +#define I40E_GLPE_RUPM_FLRPOOL_FLRSPADS_MASK (0xFF << I40E_GLPE_RUPM_FLRPOOL_FLRSPADS_SHIFT) +#define I40E_GLPE_RUPM_GCTL 0x0000DA00 /* Reset: PE_CORER */ +#define I40E_GLPE_RUPM_GCTL_ALLOFFTH_SHIFT 0 +#define I40E_GLPE_RUPM_GCTL_ALLOFFTH_MASK (0xFF << I40E_GLPE_RUPM_GCTL_ALLOFFTH_SHIFT) +#define I40E_GLPE_RUPM_GCTL_RUPM_P0_DIS_SHIFT 26 +#define I40E_GLPE_RUPM_GCTL_RUPM_P0_DIS_MASK (0x1 << I40E_GLPE_RUPM_GCTL_RUPM_P0_DIS_SHIFT) +#define I40E_GLPE_RUPM_GCTL_RUPM_P1_DIS_SHIFT 27 +#define I40E_GLPE_RUPM_GCTL_RUPM_P1_DIS_MASK (0x1 << I40E_GLPE_RUPM_GCTL_RUPM_P1_DIS_SHIFT) +#define I40E_GLPE_RUPM_GCTL_RUPM_P2_DIS_SHIFT 28 +#define I40E_GLPE_RUPM_GCTL_RUPM_P2_DIS_MASK (0x1 << I40E_GLPE_RUPM_GCTL_RUPM_P2_DIS_SHIFT) +#define I40E_GLPE_RUPM_GCTL_RUPM_P3_DIS_SHIFT 29 +#define I40E_GLPE_RUPM_GCTL_RUPM_P3_DIS_MASK (0x1 << I40E_GLPE_RUPM_GCTL_RUPM_P3_DIS_SHIFT) +#define I40E_GLPE_RUPM_GCTL_RUPM_DIS_SHIFT 30 +#define I40E_GLPE_RUPM_GCTL_RUPM_DIS_MASK (0x1 << I40E_GLPE_RUPM_GCTL_RUPM_DIS_SHIFT) +#define I40E_GLPE_RUPM_GCTL_SWLB_MODE_SHIFT 31 +#define I40E_GLPE_RUPM_GCTL_SWLB_MODE_MASK (0x1 << I40E_GLPE_RUPM_GCTL_SWLB_MODE_SHIFT) +#define I40E_GLPE_RUPM_PTXPOOL 0x0000DAC8 /* Reset: PE_CORER */ +#define I40E_GLPE_RUPM_PTXPOOL_PTXSPADS_SHIFT 0 +#define I40E_GLPE_RUPM_PTXPOOL_PTXSPADS_MASK (0xFF << I40E_GLPE_RUPM_PTXPOOL_PTXSPADS_SHIFT) +#define I40E_GLPE_RUPM_PUSHPOOL 0x0000DAC0 /* Reset: PE_CORER */ +#define I40E_GLPE_RUPM_PUSHPOOL_PUSHSPADS_SHIFT 0 +#define I40E_GLPE_RUPM_PUSHPOOL_PUSHSPADS_MASK (0xFF << I40E_GLPE_RUPM_PUSHPOOL_PUSHSPADS_SHIFT) +#define I40E_GLPE_RUPM_TXHOST_EN 0x0000DA08 /* Reset: PE_CORER */ +#define I40E_GLPE_RUPM_TXHOST_EN_TXHOST_EN_SHIFT 0 +#define I40E_GLPE_RUPM_TXHOST_EN_TXHOST_EN_MASK (0x1 << I40E_GLPE_RUPM_TXHOST_EN_TXHOST_EN_SHIFT) +#define I40E_GLPE_VFAEQEDROPCNT(_i) (0x00132540 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLPE_VFAEQEDROPCNT_MAX_INDEX 31 +#define I40E_GLPE_VFAEQEDROPCNT_AEQEDROPCNT_SHIFT 0 +#define I40E_GLPE_VFAEQEDROPCNT_AEQEDROPCNT_MASK (0xFFFF << I40E_GLPE_VFAEQEDROPCNT_AEQEDROPCNT_SHIFT) +#define I40E_GLPE_VFCEQEDROPCNT(_i) (0x00132440 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLPE_VFCEQEDROPCNT_MAX_INDEX 31 +#define I40E_GLPE_VFCEQEDROPCNT_CEQEDROPCNT_SHIFT 0 +#define I40E_GLPE_VFCEQEDROPCNT_CEQEDROPCNT_MASK (0xFFFF << 
I40E_GLPE_VFCEQEDROPCNT_CEQEDROPCNT_SHIFT) +#define I40E_GLPE_VFCQEDROPCNT(_i) (0x00132340 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */ +#define I40E_GLPE_VFCQEDROPCNT_MAX_INDEX 31 +#define I40E_GLPE_VFCQEDROPCNT_CQEDROPCNT_SHIFT 0 +#define I40E_GLPE_VFCQEDROPCNT_CQEDROPCNT_MASK (0xFFFF << I40E_GLPE_VFCQEDROPCNT_CQEDROPCNT_SHIFT) +#define I40E_GLPE_VFFLMOBJCTRL(_i) (0x0000D400 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPE_VFFLMOBJCTRL_MAX_INDEX 31 +#define I40E_GLPE_VFFLMOBJCTRL_XMIT_BLOCKSIZE_SHIFT 0 +#define I40E_GLPE_VFFLMOBJCTRL_XMIT_BLOCKSIZE_MASK (0x7 << I40E_GLPE_VFFLMOBJCTRL_XMIT_BLOCKSIZE_SHIFT) +#define I40E_GLPE_VFFLMOBJCTRL_Q1_BLOCKSIZE_SHIFT 8 +#define I40E_GLPE_VFFLMOBJCTRL_Q1_BLOCKSIZE_MASK (0x7 << I40E_GLPE_VFFLMOBJCTRL_Q1_BLOCKSIZE_SHIFT) +#define I40E_GLPE_VFFLMQ1ALLOCERR(_i) (0x0000C700 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPE_VFFLMQ1ALLOCERR_MAX_INDEX 31 +#define I40E_GLPE_VFFLMQ1ALLOCERR_ERROR_COUNT_SHIFT 0 +#define I40E_GLPE_VFFLMQ1ALLOCERR_ERROR_COUNT_MASK (0xFFFF << I40E_GLPE_VFFLMQ1ALLOCERR_ERROR_COUNT_SHIFT) +#define I40E_GLPE_VFFLMXMITALLOCERR(_i) (0x0000C600 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPE_VFFLMXMITALLOCERR_MAX_INDEX 31 +#define I40E_GLPE_VFFLMXMITALLOCERR_ERROR_COUNT_SHIFT 0 +#define I40E_GLPE_VFFLMXMITALLOCERR_ERROR_COUNT_MASK (0xFFFF << I40E_GLPE_VFFLMXMITALLOCERR_ERROR_COUNT_SHIFT) +#define I40E_GLPE_VFUDACTRL(_i) (0x0000C000 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPE_VFUDACTRL_MAX_INDEX 31 +#define I40E_GLPE_VFUDACTRL_IPV4MCFRAGRESBP_SHIFT 0 +#define I40E_GLPE_VFUDACTRL_IPV4MCFRAGRESBP_MASK (0x1 << I40E_GLPE_VFUDACTRL_IPV4MCFRAGRESBP_SHIFT) +#define I40E_GLPE_VFUDACTRL_IPV4UCFRAGRESBP_SHIFT 1 +#define I40E_GLPE_VFUDACTRL_IPV4UCFRAGRESBP_MASK (0x1 << I40E_GLPE_VFUDACTRL_IPV4UCFRAGRESBP_SHIFT) +#define I40E_GLPE_VFUDACTRL_IPV6MCFRAGRESBP_SHIFT 2 +#define I40E_GLPE_VFUDACTRL_IPV6MCFRAGRESBP_MASK (0x1 << I40E_GLPE_VFUDACTRL_IPV6MCFRAGRESBP_SHIFT) +#define I40E_GLPE_VFUDACTRL_IPV6UCFRAGRESBP_SHIFT 3 +#define I40E_GLPE_VFUDACTRL_IPV6UCFRAGRESBP_MASK (0x1 << I40E_GLPE_VFUDACTRL_IPV6UCFRAGRESBP_SHIFT) +#define I40E_GLPE_VFUDACTRL_UDPMCFRAGRESFAIL_SHIFT 4 +#define I40E_GLPE_VFUDACTRL_UDPMCFRAGRESFAIL_MASK (0x1 << I40E_GLPE_VFUDACTRL_UDPMCFRAGRESFAIL_SHIFT) +#define I40E_GLPE_VFUDAUCFBQPN(_i) (0x0000C100 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPE_VFUDAUCFBQPN_MAX_INDEX 31 +#define I40E_GLPE_VFUDAUCFBQPN_QPN_SHIFT 0 +#define I40E_GLPE_VFUDAUCFBQPN_QPN_MASK (0x3FFFF << I40E_GLPE_VFUDAUCFBQPN_QPN_SHIFT) +#define I40E_GLPE_VFUDAUCFBQPN_VALID_SHIFT 31 +#define I40E_GLPE_VFUDAUCFBQPN_VALID_MASK (0x1 << I40E_GLPE_VFUDAUCFBQPN_VALID_SHIFT) + +#define I40E_GLPES_PFIP4RXDISCARD(_i) (0x00010600 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXDISCARD_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXDISCARD_IP4RXDISCARD_SHIFT 0 +#define I40E_GLPES_PFIP4RXDISCARD_IP4RXDISCARD_MASK (0xFFFFFFFF << I40E_GLPES_PFIP4RXDISCARD_IP4RXDISCARD_SHIFT) +#define I40E_GLPES_PFIP4RXFRAGSHI(_i) (0x00010804 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXFRAGSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXFRAGSHI_IP4RXFRAGSHI_SHIFT 0 +#define I40E_GLPES_PFIP4RXFRAGSHI_IP4RXFRAGSHI_MASK (0xFFFF << I40E_GLPES_PFIP4RXFRAGSHI_IP4RXFRAGSHI_SHIFT) +#define I40E_GLPES_PFIP4RXFRAGSLO(_i) (0x00010800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXFRAGSLO_MAX_INDEX 15 +#define 
I40E_GLPES_PFIP4RXFRAGSLO_IP4RXFRAGSLO_SHIFT 0 +#define I40E_GLPES_PFIP4RXFRAGSLO_IP4RXFRAGSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP4RXFRAGSLO_IP4RXFRAGSLO_SHIFT) +#define I40E_GLPES_PFIP4RXMCOCTSHI(_i) (0x00010A04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXMCOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXMCOCTSHI_IP4RXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4RXMCOCTSHI_IP4RXMCOCTSHI_MASK (0xFFFF << I40E_GLPES_PFIP4RXMCOCTSHI_IP4RXMCOCTSHI_SHIFT) +#define I40E_GLPES_PFIP4RXMCOCTSLO(_i) (0x00010A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXMCOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXMCOCTSLO_IP4RXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4RXMCOCTSLO_IP4RXMCOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP4RXMCOCTSLO_IP4RXMCOCTSLO_SHIFT) +#define I40E_GLPES_PFIP4RXMCPKTSHI(_i) (0x00010C04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXMCPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXMCPKTSHI_IP4RXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4RXMCPKTSHI_IP4RXMCPKTSHI_MASK (0xFFFF << I40E_GLPES_PFIP4RXMCPKTSHI_IP4RXMCPKTSHI_SHIFT) +#define I40E_GLPES_PFIP4RXMCPKTSLO(_i) (0x00010C00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXMCPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXMCPKTSLO_IP4RXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4RXMCPKTSLO_IP4RXMCPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP4RXMCPKTSLO_IP4RXMCPKTSLO_SHIFT) +#define I40E_GLPES_PFIP4RXOCTSHI(_i) (0x00010204 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXOCTSHI_IP4RXOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4RXOCTSHI_IP4RXOCTSHI_MASK (0xFFFF << I40E_GLPES_PFIP4RXOCTSHI_IP4RXOCTSHI_SHIFT) +#define I40E_GLPES_PFIP4RXOCTSLO(_i) (0x00010200 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXOCTSLO_IP4RXOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4RXOCTSLO_IP4RXOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP4RXOCTSLO_IP4RXOCTSLO_SHIFT) +#define I40E_GLPES_PFIP4RXPKTSHI(_i) (0x00010404 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXPKTSHI_IP4RXPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4RXPKTSHI_IP4RXPKTSHI_MASK (0xFFFF << I40E_GLPES_PFIP4RXPKTSHI_IP4RXPKTSHI_SHIFT) +#define I40E_GLPES_PFIP4RXPKTSLO(_i) (0x00010400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXPKTSLO_IP4RXPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4RXPKTSLO_IP4RXPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP4RXPKTSLO_IP4RXPKTSLO_SHIFT) +#define I40E_GLPES_PFIP4RXTRUNC(_i) (0x00010700 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4RXTRUNC_MAX_INDEX 15 +#define I40E_GLPES_PFIP4RXTRUNC_IP4RXTRUNC_SHIFT 0 +#define I40E_GLPES_PFIP4RXTRUNC_IP4RXTRUNC_MASK (0xFFFFFFFF << I40E_GLPES_PFIP4RXTRUNC_IP4RXTRUNC_SHIFT) +#define I40E_GLPES_PFIP4TXFRAGSHI(_i) (0x00011E04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXFRAGSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXFRAGSHI_IP4TXFRAGSHI_SHIFT 0 +#define I40E_GLPES_PFIP4TXFRAGSHI_IP4TXFRAGSHI_MASK (0xFFFF << I40E_GLPES_PFIP4TXFRAGSHI_IP4TXFRAGSHI_SHIFT) +#define I40E_GLPES_PFIP4TXFRAGSLO(_i) (0x00011E00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXFRAGSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXFRAGSLO_IP4TXFRAGSLO_SHIFT 0 +#define 
I40E_GLPES_PFIP4TXFRAGSLO_IP4TXFRAGSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP4TXFRAGSLO_IP4TXFRAGSLO_SHIFT) +#define I40E_GLPES_PFIP4TXMCOCTSHI(_i) (0x00012004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXMCOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXMCOCTSHI_IP4TXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4TXMCOCTSHI_IP4TXMCOCTSHI_MASK (0xFFFF << I40E_GLPES_PFIP4TXMCOCTSHI_IP4TXMCOCTSHI_SHIFT) +#define I40E_GLPES_PFIP4TXMCOCTSLO(_i) (0x00012000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXMCOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXMCOCTSLO_IP4TXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4TXMCOCTSLO_IP4TXMCOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP4TXMCOCTSLO_IP4TXMCOCTSLO_SHIFT) +#define I40E_GLPES_PFIP4TXMCPKTSHI(_i) (0x00012204 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXMCPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXMCPKTSHI_IP4TXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4TXMCPKTSHI_IP4TXMCPKTSHI_MASK (0xFFFF << I40E_GLPES_PFIP4TXMCPKTSHI_IP4TXMCPKTSHI_SHIFT) +#define I40E_GLPES_PFIP4TXMCPKTSLO(_i) (0x00012200 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXMCPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXMCPKTSLO_IP4TXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4TXMCPKTSLO_IP4TXMCPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP4TXMCPKTSLO_IP4TXMCPKTSLO_SHIFT) +#define I40E_GLPES_PFIP4TXNOROUTE(_i) (0x00012E00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXNOROUTE_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXNOROUTE_IP4TXNOROUTE_SHIFT 0 +#define I40E_GLPES_PFIP4TXNOROUTE_IP4TXNOROUTE_MASK (0xFFFFFF << I40E_GLPES_PFIP4TXNOROUTE_IP4TXNOROUTE_SHIFT) +#define I40E_GLPES_PFIP4TXOCTSHI(_i) (0x00011A04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXOCTSHI_IP4TXOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4TXOCTSHI_IP4TXOCTSHI_MASK (0xFFFF << I40E_GLPES_PFIP4TXOCTSHI_IP4TXOCTSHI_SHIFT) +#define I40E_GLPES_PFIP4TXOCTSLO(_i) (0x00011A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXOCTSLO_IP4TXOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4TXOCTSLO_IP4TXOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP4TXOCTSLO_IP4TXOCTSLO_SHIFT) +#define I40E_GLPES_PFIP4TXPKTSHI(_i) (0x00011C04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXPKTSHI_IP4TXPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP4TXPKTSHI_IP4TXPKTSHI_MASK (0xFFFF << I40E_GLPES_PFIP4TXPKTSHI_IP4TXPKTSHI_SHIFT) +#define I40E_GLPES_PFIP4TXPKTSLO(_i) (0x00011C00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP4TXPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP4TXPKTSLO_IP4TXPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP4TXPKTSLO_IP4TXPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP4TXPKTSLO_IP4TXPKTSLO_SHIFT) +#define I40E_GLPES_PFIP6RXDISCARD(_i) (0x00011200 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXDISCARD_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXDISCARD_IP6RXDISCARD_SHIFT 0 +#define I40E_GLPES_PFIP6RXDISCARD_IP6RXDISCARD_MASK (0xFFFFFFFF << I40E_GLPES_PFIP6RXDISCARD_IP6RXDISCARD_SHIFT) +#define I40E_GLPES_PFIP6RXFRAGSHI(_i) (0x00011404 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXFRAGSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXFRAGSHI_IP6RXFRAGSHI_SHIFT 0 +#define 
I40E_GLPES_PFIP6RXFRAGSHI_IP6RXFRAGSHI_MASK (0xFFFF << I40E_GLPES_PFIP6RXFRAGSHI_IP6RXFRAGSHI_SHIFT) +#define I40E_GLPES_PFIP6RXFRAGSLO(_i) (0x00011400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXFRAGSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXFRAGSLO_IP6RXFRAGSLO_SHIFT 0 +#define I40E_GLPES_PFIP6RXFRAGSLO_IP6RXFRAGSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP6RXFRAGSLO_IP6RXFRAGSLO_SHIFT) +#define I40E_GLPES_PFIP6RXMCOCTSHI(_i) (0x00011604 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXMCOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXMCOCTSHI_IP6RXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6RXMCOCTSHI_IP6RXMCOCTSHI_MASK (0xFFFF << I40E_GLPES_PFIP6RXMCOCTSHI_IP6RXMCOCTSHI_SHIFT) +#define I40E_GLPES_PFIP6RXMCOCTSLO(_i) (0x00011600 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXMCOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXMCOCTSLO_IP6RXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6RXMCOCTSLO_IP6RXMCOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP6RXMCOCTSLO_IP6RXMCOCTSLO_SHIFT) +#define I40E_GLPES_PFIP6RXMCPKTSHI(_i) (0x00011804 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXMCPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXMCPKTSHI_IP6RXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6RXMCPKTSHI_IP6RXMCPKTSHI_MASK (0xFFFF << I40E_GLPES_PFIP6RXMCPKTSHI_IP6RXMCPKTSHI_SHIFT) +#define I40E_GLPES_PFIP6RXMCPKTSLO(_i) (0x00011800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXMCPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXMCPKTSLO_IP6RXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6RXMCPKTSLO_IP6RXMCPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP6RXMCPKTSLO_IP6RXMCPKTSLO_SHIFT) +#define I40E_GLPES_PFIP6RXOCTSHI(_i) (0x00010E04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXOCTSHI_IP6RXOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6RXOCTSHI_IP6RXOCTSHI_MASK (0xFFFF << I40E_GLPES_PFIP6RXOCTSHI_IP6RXOCTSHI_SHIFT) +#define I40E_GLPES_PFIP6RXOCTSLO(_i) (0x00010E00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXOCTSLO_IP6RXOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6RXOCTSLO_IP6RXOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP6RXOCTSLO_IP6RXOCTSLO_SHIFT) +#define I40E_GLPES_PFIP6RXPKTSHI(_i) (0x00011004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXPKTSHI_IP6RXPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6RXPKTSHI_IP6RXPKTSHI_MASK (0xFFFF << I40E_GLPES_PFIP6RXPKTSHI_IP6RXPKTSHI_SHIFT) +#define I40E_GLPES_PFIP6RXPKTSLO(_i) (0x00011000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXPKTSLO_IP6RXPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6RXPKTSLO_IP6RXPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP6RXPKTSLO_IP6RXPKTSLO_SHIFT) +#define I40E_GLPES_PFIP6RXTRUNC(_i) (0x00011300 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6RXTRUNC_MAX_INDEX 15 +#define I40E_GLPES_PFIP6RXTRUNC_IP6RXTRUNC_SHIFT 0 +#define I40E_GLPES_PFIP6RXTRUNC_IP6RXTRUNC_MASK (0xFFFFFFFF << I40E_GLPES_PFIP6RXTRUNC_IP6RXTRUNC_SHIFT) +#define I40E_GLPES_PFIP6TXFRAGSHI(_i) (0x00012804 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXFRAGSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXFRAGSHI_IP6TXFRAGSHI_SHIFT 0 +#define I40E_GLPES_PFIP6TXFRAGSHI_IP6TXFRAGSHI_MASK (0xFFFF << 
I40E_GLPES_PFIP6TXFRAGSHI_IP6TXFRAGSHI_SHIFT) +#define I40E_GLPES_PFIP6TXFRAGSLO(_i) (0x00012800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXFRAGSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXFRAGSLO_IP6TXFRAGSLO_SHIFT 0 +#define I40E_GLPES_PFIP6TXFRAGSLO_IP6TXFRAGSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP6TXFRAGSLO_IP6TXFRAGSLO_SHIFT) +#define I40E_GLPES_PFIP6TXMCOCTSHI(_i) (0x00012A04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXMCOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXMCOCTSHI_IP6TXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6TXMCOCTSHI_IP6TXMCOCTSHI_MASK (0xFFFF << I40E_GLPES_PFIP6TXMCOCTSHI_IP6TXMCOCTSHI_SHIFT) +#define I40E_GLPES_PFIP6TXMCOCTSLO(_i) (0x00012A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXMCOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXMCOCTSLO_IP6TXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6TXMCOCTSLO_IP6TXMCOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP6TXMCOCTSLO_IP6TXMCOCTSLO_SHIFT) +#define I40E_GLPES_PFIP6TXMCPKTSHI(_i) (0x00012C04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXMCPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXMCPKTSHI_IP6TXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6TXMCPKTSHI_IP6TXMCPKTSHI_MASK (0xFFFF << I40E_GLPES_PFIP6TXMCPKTSHI_IP6TXMCPKTSHI_SHIFT) +#define I40E_GLPES_PFIP6TXMCPKTSLO(_i) (0x00012C00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXMCPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXMCPKTSLO_IP6TXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6TXMCPKTSLO_IP6TXMCPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP6TXMCPKTSLO_IP6TXMCPKTSLO_SHIFT) +#define I40E_GLPES_PFIP6TXNOROUTE(_i) (0x00012F00 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXNOROUTE_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXNOROUTE_IP6TXNOROUTE_SHIFT 0 +#define I40E_GLPES_PFIP6TXNOROUTE_IP6TXNOROUTE_MASK (0xFFFFFF << I40E_GLPES_PFIP6TXNOROUTE_IP6TXNOROUTE_SHIFT) +#define I40E_GLPES_PFIP6TXOCTSHI(_i) (0x00012404 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXOCTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXOCTSHI_IP6TXOCTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6TXOCTSHI_IP6TXOCTSHI_MASK (0xFFFF << I40E_GLPES_PFIP6TXOCTSHI_IP6TXOCTSHI_SHIFT) +#define I40E_GLPES_PFIP6TXOCTSLO(_i) (0x00012400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXOCTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXOCTSLO_IP6TXOCTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6TXOCTSLO_IP6TXOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP6TXOCTSLO_IP6TXOCTSLO_SHIFT) +#define I40E_GLPES_PFIP6TXPKTSHI(_i) (0x00012604 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXPKTSHI_IP6TXPKTSHI_SHIFT 0 +#define I40E_GLPES_PFIP6TXPKTSHI_IP6TXPKTSHI_MASK (0xFFFF << I40E_GLPES_PFIP6TXPKTSHI_IP6TXPKTSHI_SHIFT) +#define I40E_GLPES_PFIP6TXPKTSLO(_i) (0x00012600 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFIP6TXPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFIP6TXPKTSLO_IP6TXPKTSLO_SHIFT 0 +#define I40E_GLPES_PFIP6TXPKTSLO_IP6TXPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFIP6TXPKTSLO_IP6TXPKTSLO_SHIFT) +#define I40E_GLPES_PFRDMARXRDSHI(_i) (0x00013E04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMARXRDSHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMARXRDSHI_RDMARXRDSHI_SHIFT 0 +#define I40E_GLPES_PFRDMARXRDSHI_RDMARXRDSHI_MASK (0xFFFF << I40E_GLPES_PFRDMARXRDSHI_RDMARXRDSHI_SHIFT) 
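Every statistics register in this block follows one pattern: an address macro taking the function index _i, a _SHIFT constant giving the field's bit offset, and a _MASK constant that is already shifted into position. Counters wider than 32 bits are split across a LO register (the low 32 bits) and a HI register (the remaining 16 or 24 bits behind its field mask). As a minimal sketch of how a consumer would combine the PFRDMARXRDS pair (HI above, LO just below), assuming these defines are in scope: rd32() is a hypothetical 32-bit MMIO read helper and read_pf_rdma_rx_rds() is an illustrative name, neither is part of this header.

#include <stdint.h>

extern uint32_t rd32(uint32_t reg); /* hypothetical MMIO read from the mapped BAR */

/* Combine the LO/HI halves of the per-PF "RDMA RX reads" counter. */
static uint64_t read_pf_rdma_rx_rds(uint32_t pf)
{
	uint32_t lo = rd32(I40E_GLPES_PFRDMARXRDSLO(pf));
	uint32_t hi = rd32(I40E_GLPES_PFRDMARXRDSHI(pf));

	/* Isolate the HI field: mask first, then shift down to bit 0. */
	hi = (hi & I40E_GLPES_PFRDMARXRDSHI_RDMARXRDSHI_MASK) >>
	     I40E_GLPES_PFRDMARXRDSHI_RDMARXRDSHI_SHIFT;

	return ((uint64_t)hi << 32) | lo;
}

A production reader would also have to guard against the LO half wrapping between the two reads, for example by re-reading HI and retrying when it changes.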
+#define I40E_GLPES_PFRDMARXRDSLO(_i) (0x00013E00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMARXRDSLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMARXRDSLO_RDMARXRDSLO_SHIFT 0 +#define I40E_GLPES_PFRDMARXRDSLO_RDMARXRDSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFRDMARXRDSLO_RDMARXRDSLO_SHIFT) +#define I40E_GLPES_PFRDMARXSNDSHI(_i) (0x00014004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMARXSNDSHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMARXSNDSHI_RDMARXSNDSHI_SHIFT 0 +#define I40E_GLPES_PFRDMARXSNDSHI_RDMARXSNDSHI_MASK (0xFFFF << I40E_GLPES_PFRDMARXSNDSHI_RDMARXSNDSHI_SHIFT) +#define I40E_GLPES_PFRDMARXSNDSLO(_i) (0x00014000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMARXSNDSLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMARXSNDSLO_RDMARXSNDSLO_SHIFT 0 +#define I40E_GLPES_PFRDMARXSNDSLO_RDMARXSNDSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFRDMARXSNDSLO_RDMARXSNDSLO_SHIFT) +#define I40E_GLPES_PFRDMARXWRSHI(_i) (0x00013C04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMARXWRSHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMARXWRSHI_RDMARXWRSHI_SHIFT 0 +#define I40E_GLPES_PFRDMARXWRSHI_RDMARXWRSHI_MASK (0xFFFF << I40E_GLPES_PFRDMARXWRSHI_RDMARXWRSHI_SHIFT) +#define I40E_GLPES_PFRDMARXWRSLO(_i) (0x00013C00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMARXWRSLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMARXWRSLO_RDMARXWRSLO_SHIFT 0 +#define I40E_GLPES_PFRDMARXWRSLO_RDMARXWRSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFRDMARXWRSLO_RDMARXWRSLO_SHIFT) +#define I40E_GLPES_PFRDMATXRDSHI(_i) (0x00014404 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMATXRDSHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMATXRDSHI_RDMARXRDSHI_SHIFT 0 +#define I40E_GLPES_PFRDMATXRDSHI_RDMARXRDSHI_MASK (0xFFFF << I40E_GLPES_PFRDMATXRDSHI_RDMARXRDSHI_SHIFT) +#define I40E_GLPES_PFRDMATXRDSLO(_i) (0x00014400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMATXRDSLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMATXRDSLO_RDMARXRDSLO_SHIFT 0 +#define I40E_GLPES_PFRDMATXRDSLO_RDMARXRDSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFRDMATXRDSLO_RDMARXRDSLO_SHIFT) +#define I40E_GLPES_PFRDMATXSNDSHI(_i) (0x00014604 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMATXSNDSHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMATXSNDSHI_RDMARXSNDSHI_SHIFT 0 +#define I40E_GLPES_PFRDMATXSNDSHI_RDMARXSNDSHI_MASK (0xFFFF << I40E_GLPES_PFRDMATXSNDSHI_RDMARXSNDSHI_SHIFT) +#define I40E_GLPES_PFRDMATXSNDSLO(_i) (0x00014600 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMATXSNDSLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMATXSNDSLO_RDMARXSNDSLO_SHIFT 0 +#define I40E_GLPES_PFRDMATXSNDSLO_RDMARXSNDSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFRDMATXSNDSLO_RDMARXSNDSLO_SHIFT) +#define I40E_GLPES_PFRDMATXWRSHI(_i) (0x00014204 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMATXWRSHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMATXWRSHI_RDMARXWRSHI_SHIFT 0 +#define I40E_GLPES_PFRDMATXWRSHI_RDMARXWRSHI_MASK (0xFFFF << I40E_GLPES_PFRDMATXWRSHI_RDMARXWRSHI_SHIFT) +#define I40E_GLPES_PFRDMATXWRSLO(_i) (0x00014200 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMATXWRSLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMATXWRSLO_RDMARXWRSLO_SHIFT 0 +#define I40E_GLPES_PFRDMATXWRSLO_RDMARXWRSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFRDMATXWRSLO_RDMARXWRSLO_SHIFT) +#define I40E_GLPES_PFRDMAVBNDHI(_i) (0x00014804 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER 
*/ +#define I40E_GLPES_PFRDMAVBNDHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMAVBNDHI_RDMAVBNDHI_SHIFT 0 +#define I40E_GLPES_PFRDMAVBNDHI_RDMAVBNDHI_MASK (0xFFFFFFFF << I40E_GLPES_PFRDMAVBNDHI_RDMAVBNDHI_SHIFT) +#define I40E_GLPES_PFRDMAVBNDLO(_i) (0x00014800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMAVBNDLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMAVBNDLO_RDMAVBNDLO_SHIFT 0 +#define I40E_GLPES_PFRDMAVBNDLO_RDMAVBNDLO_MASK (0xFFFFFFFF << I40E_GLPES_PFRDMAVBNDLO_RDMAVBNDLO_SHIFT) +#define I40E_GLPES_PFRDMAVINVHI(_i) (0x00014A04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMAVINVHI_MAX_INDEX 15 +#define I40E_GLPES_PFRDMAVINVHI_RDMAVINVHI_SHIFT 0 +#define I40E_GLPES_PFRDMAVINVHI_RDMAVINVHI_MASK (0xFFFFFFFF << I40E_GLPES_PFRDMAVINVHI_RDMAVINVHI_SHIFT) +#define I40E_GLPES_PFRDMAVINVLO(_i) (0x00014A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRDMAVINVLO_MAX_INDEX 15 +#define I40E_GLPES_PFRDMAVINVLO_RDMAVINVLO_SHIFT 0 +#define I40E_GLPES_PFRDMAVINVLO_RDMAVINVLO_MASK (0xFFFFFFFF << I40E_GLPES_PFRDMAVINVLO_RDMAVINVLO_SHIFT) +#define I40E_GLPES_PFRXVLANERR(_i) (0x00010000 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFRXVLANERR_MAX_INDEX 15 +#define I40E_GLPES_PFRXVLANERR_RXVLANERR_SHIFT 0 +#define I40E_GLPES_PFRXVLANERR_RXVLANERR_MASK (0xFFFFFF << I40E_GLPES_PFRXVLANERR_RXVLANERR_SHIFT) +#define I40E_GLPES_PFTCPRTXSEG(_i) (0x00013600 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFTCPRTXSEG_MAX_INDEX 15 +#define I40E_GLPES_PFTCPRTXSEG_TCPRTXSEG_SHIFT 0 +#define I40E_GLPES_PFTCPRTXSEG_TCPRTXSEG_MASK (0xFFFFFFFF << I40E_GLPES_PFTCPRTXSEG_TCPRTXSEG_SHIFT) +#define I40E_GLPES_PFTCPRXOPTERR(_i) (0x00013200 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFTCPRXOPTERR_MAX_INDEX 15 +#define I40E_GLPES_PFTCPRXOPTERR_TCPRXOPTERR_SHIFT 0 +#define I40E_GLPES_PFTCPRXOPTERR_TCPRXOPTERR_MASK (0xFFFFFF << I40E_GLPES_PFTCPRXOPTERR_TCPRXOPTERR_SHIFT) +#define I40E_GLPES_PFTCPRXPROTOERR(_i) (0x00013300 + ((_i) * 4)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFTCPRXPROTOERR_MAX_INDEX 15 +#define I40E_GLPES_PFTCPRXPROTOERR_TCPRXPROTOERR_SHIFT 0 +#define I40E_GLPES_PFTCPRXPROTOERR_TCPRXPROTOERR_MASK (0xFFFFFF << I40E_GLPES_PFTCPRXPROTOERR_TCPRXPROTOERR_SHIFT) +#define I40E_GLPES_PFTCPRXSEGSHI(_i) (0x00013004 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFTCPRXSEGSHI_MAX_INDEX 15 +#define I40E_GLPES_PFTCPRXSEGSHI_TCPRXSEGSHI_SHIFT 0 +#define I40E_GLPES_PFTCPRXSEGSHI_TCPRXSEGSHI_MASK (0xFFFF << I40E_GLPES_PFTCPRXSEGSHI_TCPRXSEGSHI_SHIFT) +#define I40E_GLPES_PFTCPRXSEGSLO(_i) (0x00013000 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFTCPRXSEGSLO_MAX_INDEX 15 +#define I40E_GLPES_PFTCPRXSEGSLO_TCPRXSEGSLO_SHIFT 0 +#define I40E_GLPES_PFTCPRXSEGSLO_TCPRXSEGSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFTCPRXSEGSLO_TCPRXSEGSLO_SHIFT) +#define I40E_GLPES_PFTCPTXSEGHI(_i) (0x00013404 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFTCPTXSEGHI_MAX_INDEX 15 +#define I40E_GLPES_PFTCPTXSEGHI_TCPTXSEGHI_SHIFT 0 +#define I40E_GLPES_PFTCPTXSEGHI_TCPTXSEGHI_MASK (0xFFFF << I40E_GLPES_PFTCPTXSEGHI_TCPTXSEGHI_SHIFT) +#define I40E_GLPES_PFTCPTXSEGLO(_i) (0x00013400 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFTCPTXSEGLO_MAX_INDEX 15 +#define I40E_GLPES_PFTCPTXSEGLO_TCPTXSEGLO_SHIFT 0 +#define I40E_GLPES_PFTCPTXSEGLO_TCPTXSEGLO_MASK (0xFFFFFFFF << 
I40E_GLPES_PFTCPTXSEGLO_TCPTXSEGLO_SHIFT) +#define I40E_GLPES_PFUDPRXPKTSHI(_i) (0x00013804 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFUDPRXPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFUDPRXPKTSHI_UDPRXPKTSHI_SHIFT 0 +#define I40E_GLPES_PFUDPRXPKTSHI_UDPRXPKTSHI_MASK (0xFFFF << I40E_GLPES_PFUDPRXPKTSHI_UDPRXPKTSHI_SHIFT) +#define I40E_GLPES_PFUDPRXPKTSLO(_i) (0x00013800 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFUDPRXPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFUDPRXPKTSLO_UDPRXPKTSLO_SHIFT 0 +#define I40E_GLPES_PFUDPRXPKTSLO_UDPRXPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFUDPRXPKTSLO_UDPRXPKTSLO_SHIFT) +#define I40E_GLPES_PFUDPTXPKTSHI(_i) (0x00013A04 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFUDPTXPKTSHI_MAX_INDEX 15 +#define I40E_GLPES_PFUDPTXPKTSHI_UDPTXPKTSHI_SHIFT 0 +#define I40E_GLPES_PFUDPTXPKTSHI_UDPTXPKTSHI_MASK (0xFFFF << I40E_GLPES_PFUDPTXPKTSHI_UDPTXPKTSHI_SHIFT) +#define I40E_GLPES_PFUDPTXPKTSLO(_i) (0x00013A00 + ((_i) * 8)) /* _i=0...15 */ /* Reset: PE_CORER */ +#define I40E_GLPES_PFUDPTXPKTSLO_MAX_INDEX 15 +#define I40E_GLPES_PFUDPTXPKTSLO_UDPTXPKTSLO_SHIFT 0 +#define I40E_GLPES_PFUDPTXPKTSLO_UDPTXPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_PFUDPTXPKTSLO_UDPTXPKTSLO_SHIFT) +#define I40E_GLPES_RDMARXMULTFPDUSHI 0x0001E014 /* Reset: PE_CORER */ +#define I40E_GLPES_RDMARXMULTFPDUSHI_RDMARXMULTFPDUSHI_SHIFT 0 +#define I40E_GLPES_RDMARXMULTFPDUSHI_RDMARXMULTFPDUSHI_MASK (0xFFFFFF << I40E_GLPES_RDMARXMULTFPDUSHI_RDMARXMULTFPDUSHI_SHIFT) +#define I40E_GLPES_RDMARXMULTFPDUSLO 0x0001E010 /* Reset: PE_CORER */ +#define I40E_GLPES_RDMARXMULTFPDUSLO_RDMARXMULTFPDUSLO_SHIFT 0 +#define I40E_GLPES_RDMARXMULTFPDUSLO_RDMARXMULTFPDUSLO_MASK (0xFFFFFFFF << I40E_GLPES_RDMARXMULTFPDUSLO_RDMARXMULTFPDUSLO_SHIFT) +#define I40E_GLPES_RDMARXOOODDPHI 0x0001E01C /* Reset: PE_CORER */ +#define I40E_GLPES_RDMARXOOODDPHI_RDMARXOOODDPHI_SHIFT 0 +#define I40E_GLPES_RDMARXOOODDPHI_RDMARXOOODDPHI_MASK (0xFFFFFF << I40E_GLPES_RDMARXOOODDPHI_RDMARXOOODDPHI_SHIFT) +#define I40E_GLPES_RDMARXOOODDPLO 0x0001E018 /* Reset: PE_CORER */ +#define I40E_GLPES_RDMARXOOODDPLO_RDMARXOOODDPLO_SHIFT 0 +#define I40E_GLPES_RDMARXOOODDPLO_RDMARXOOODDPLO_MASK (0xFFFFFFFF << I40E_GLPES_RDMARXOOODDPLO_RDMARXOOODDPLO_SHIFT) +#define I40E_GLPES_RDMARXOOONOMARK 0x0001E004 /* Reset: PE_CORER */ +#define I40E_GLPES_RDMARXOOONOMARK_RDMAOOONOMARK_SHIFT 0 +#define I40E_GLPES_RDMARXOOONOMARK_RDMAOOONOMARK_MASK (0xFFFFFFFF << I40E_GLPES_RDMARXOOONOMARK_RDMAOOONOMARK_SHIFT) +#define I40E_GLPES_RDMARXUNALIGN 0x0001E000 /* Reset: PE_CORER */ +#define I40E_GLPES_RDMARXUNALIGN_RDMRXAUNALIGN_SHIFT 0 +#define I40E_GLPES_RDMARXUNALIGN_RDMRXAUNALIGN_MASK (0xFFFFFFFF << I40E_GLPES_RDMARXUNALIGN_RDMRXAUNALIGN_SHIFT) +#define I40E_GLPES_TCPRXFOURHOLEHI 0x0001E044 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXFOURHOLEHI_TCPRXFOURHOLEHI_SHIFT 0 +#define I40E_GLPES_TCPRXFOURHOLEHI_TCPRXFOURHOLEHI_MASK (0xFFFFFF << I40E_GLPES_TCPRXFOURHOLEHI_TCPRXFOURHOLEHI_SHIFT) +#define I40E_GLPES_TCPRXFOURHOLELO 0x0001E040 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXFOURHOLELO_TCPRXFOURHOLELO_SHIFT 0 +#define I40E_GLPES_TCPRXFOURHOLELO_TCPRXFOURHOLELO_MASK (0xFFFFFFFF << I40E_GLPES_TCPRXFOURHOLELO_TCPRXFOURHOLELO_SHIFT) +#define I40E_GLPES_TCPRXONEHOLEHI 0x0001E02C /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXONEHOLEHI_TCPRXONEHOLEHI_SHIFT 0 +#define I40E_GLPES_TCPRXONEHOLEHI_TCPRXONEHOLEHI_MASK (0xFFFFFF << I40E_GLPES_TCPRXONEHOLEHI_TCPRXONEHOLEHI_SHIFT) +#define 
I40E_GLPES_TCPRXONEHOLELO 0x0001E028 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXONEHOLELO_TCPRXONEHOLELO_SHIFT 0 +#define I40E_GLPES_TCPRXONEHOLELO_TCPRXONEHOLELO_MASK (0xFFFFFFFF << I40E_GLPES_TCPRXONEHOLELO_TCPRXONEHOLELO_SHIFT) +#define I40E_GLPES_TCPRXPUREACKHI 0x0001E024 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXPUREACKHI_TCPRXPUREACKSHI_SHIFT 0 +#define I40E_GLPES_TCPRXPUREACKHI_TCPRXPUREACKSHI_MASK (0xFFFFFF << I40E_GLPES_TCPRXPUREACKHI_TCPRXPUREACKSHI_SHIFT) +#define I40E_GLPES_TCPRXPUREACKSLO 0x0001E020 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXPUREACKSLO_TCPRXPUREACKLO_SHIFT 0 +#define I40E_GLPES_TCPRXPUREACKSLO_TCPRXPUREACKLO_MASK (0xFFFFFFFF << I40E_GLPES_TCPRXPUREACKSLO_TCPRXPUREACKLO_SHIFT) +#define I40E_GLPES_TCPRXTHREEHOLEHI 0x0001E03C /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXTHREEHOLEHI_TCPRXTHREEHOLEHI_SHIFT 0 +#define I40E_GLPES_TCPRXTHREEHOLEHI_TCPRXTHREEHOLEHI_MASK (0xFFFFFF << I40E_GLPES_TCPRXTHREEHOLEHI_TCPRXTHREEHOLEHI_SHIFT) +#define I40E_GLPES_TCPRXTHREEHOLELO 0x0001E038 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXTHREEHOLELO_TCPRXTHREEHOLELO_SHIFT 0 +#define I40E_GLPES_TCPRXTHREEHOLELO_TCPRXTHREEHOLELO_MASK (0xFFFFFFFF << I40E_GLPES_TCPRXTHREEHOLELO_TCPRXTHREEHOLELO_SHIFT) +#define I40E_GLPES_TCPRXTWOHOLEHI 0x0001E034 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXTWOHOLEHI_TCPRXTWOHOLEHI_SHIFT 0 +#define I40E_GLPES_TCPRXTWOHOLEHI_TCPRXTWOHOLEHI_MASK (0xFFFFFF << I40E_GLPES_TCPRXTWOHOLEHI_TCPRXTWOHOLEHI_SHIFT) +#define I40E_GLPES_TCPRXTWOHOLELO 0x0001E030 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPRXTWOHOLELO_TCPRXTWOHOLELO_SHIFT 0 +#define I40E_GLPES_TCPRXTWOHOLELO_TCPRXTWOHOLELO_MASK (0xFFFFFFFF << I40E_GLPES_TCPRXTWOHOLELO_TCPRXTWOHOLELO_SHIFT) +#define I40E_GLPES_TCPTXRETRANSFASTHI 0x0001E04C /* Reset: PE_CORER */ +#define I40E_GLPES_TCPTXRETRANSFASTHI_TCPTXRETRANSFASTHI_SHIFT 0 +#define I40E_GLPES_TCPTXRETRANSFASTHI_TCPTXRETRANSFASTHI_MASK (0xFFFFFF << I40E_GLPES_TCPTXRETRANSFASTHI_TCPTXRETRANSFASTHI_SHIFT) +#define I40E_GLPES_TCPTXRETRANSFASTLO 0x0001E048 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPTXRETRANSFASTLO_TCPTXRETRANSFASTLO_SHIFT 0 +#define I40E_GLPES_TCPTXRETRANSFASTLO_TCPTXRETRANSFASTLO_MASK (0xFFFFFFFF << I40E_GLPES_TCPTXRETRANSFASTLO_TCPTXRETRANSFASTLO_SHIFT) +#define I40E_GLPES_TCPTXTOUTSFASTHI 0x0001E054 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPTXTOUTSFASTHI_TCPTXTOUTSFASTHI_SHIFT 0 +#define I40E_GLPES_TCPTXTOUTSFASTHI_TCPTXTOUTSFASTHI_MASK (0xFFFFFF << I40E_GLPES_TCPTXTOUTSFASTHI_TCPTXTOUTSFASTHI_SHIFT) +#define I40E_GLPES_TCPTXTOUTSFASTLO 0x0001E050 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPTXTOUTSFASTLO_TCPTXTOUTSFASTLO_SHIFT 0 +#define I40E_GLPES_TCPTXTOUTSFASTLO_TCPTXTOUTSFASTLO_MASK (0xFFFFFFFF << I40E_GLPES_TCPTXTOUTSFASTLO_TCPTXTOUTSFASTLO_SHIFT) +#define I40E_GLPES_TCPTXTOUTSHI 0x0001E05C /* Reset: PE_CORER */ +#define I40E_GLPES_TCPTXTOUTSHI_TCPTXTOUTSHI_SHIFT 0 +#define I40E_GLPES_TCPTXTOUTSHI_TCPTXTOUTSHI_MASK (0xFFFFFF << I40E_GLPES_TCPTXTOUTSHI_TCPTXTOUTSHI_SHIFT) +#define I40E_GLPES_TCPTXTOUTSLO 0x0001E058 /* Reset: PE_CORER */ +#define I40E_GLPES_TCPTXTOUTSLO_TCPTXTOUTSLO_SHIFT 0 +#define I40E_GLPES_TCPTXTOUTSLO_TCPTXTOUTSLO_MASK (0xFFFFFFFF << I40E_GLPES_TCPTXTOUTSLO_TCPTXTOUTSLO_SHIFT) +#define I40E_GLPES_VFIP4RXDISCARD(_i) (0x00018600 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXDISCARD_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXDISCARD_IP4RXDISCARD_SHIFT 0 +#define I40E_GLPES_VFIP4RXDISCARD_IP4RXDISCARD_MASK (0xFFFFFFFF << 
I40E_GLPES_VFIP4RXDISCARD_IP4RXDISCARD_SHIFT) +#define I40E_GLPES_VFIP4RXFRAGSHI(_i) (0x00018804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXFRAGSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXFRAGSHI_IP4RXFRAGSHI_SHIFT 0 +#define I40E_GLPES_VFIP4RXFRAGSHI_IP4RXFRAGSHI_MASK (0xFFFF << I40E_GLPES_VFIP4RXFRAGSHI_IP4RXFRAGSHI_SHIFT) +#define I40E_GLPES_VFIP4RXFRAGSLO(_i) (0x00018800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXFRAGSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXFRAGSLO_IP4RXFRAGSLO_SHIFT 0 +#define I40E_GLPES_VFIP4RXFRAGSLO_IP4RXFRAGSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP4RXFRAGSLO_IP4RXFRAGSLO_SHIFT) +#define I40E_GLPES_VFIP4RXMCOCTSHI(_i) (0x00018A04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXMCOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXMCOCTSHI_IP4RXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4RXMCOCTSHI_IP4RXMCOCTSHI_MASK (0xFFFF << I40E_GLPES_VFIP4RXMCOCTSHI_IP4RXMCOCTSHI_SHIFT) +#define I40E_GLPES_VFIP4RXMCOCTSLO(_i) (0x00018A00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXMCOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXMCOCTSLO_IP4RXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4RXMCOCTSLO_IP4RXMCOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP4RXMCOCTSLO_IP4RXMCOCTSLO_SHIFT) +#define I40E_GLPES_VFIP4RXMCPKTSHI(_i) (0x00018C04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXMCPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXMCPKTSHI_IP4RXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4RXMCPKTSHI_IP4RXMCPKTSHI_MASK (0xFFFF << I40E_GLPES_VFIP4RXMCPKTSHI_IP4RXMCPKTSHI_SHIFT) +#define I40E_GLPES_VFIP4RXMCPKTSLO(_i) (0x00018C00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXMCPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXMCPKTSLO_IP4RXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4RXMCPKTSLO_IP4RXMCPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP4RXMCPKTSLO_IP4RXMCPKTSLO_SHIFT) +#define I40E_GLPES_VFIP4RXOCTSHI(_i) (0x00018204 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXOCTSHI_IP4RXOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4RXOCTSHI_IP4RXOCTSHI_MASK (0xFFFF << I40E_GLPES_VFIP4RXOCTSHI_IP4RXOCTSHI_SHIFT) +#define I40E_GLPES_VFIP4RXOCTSLO(_i) (0x00018200 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXOCTSLO_IP4RXOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4RXOCTSLO_IP4RXOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP4RXOCTSLO_IP4RXOCTSLO_SHIFT) +#define I40E_GLPES_VFIP4RXPKTSHI(_i) (0x00018404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXPKTSHI_IP4RXPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4RXPKTSHI_IP4RXPKTSHI_MASK (0xFFFF << I40E_GLPES_VFIP4RXPKTSHI_IP4RXPKTSHI_SHIFT) +#define I40E_GLPES_VFIP4RXPKTSLO(_i) (0x00018400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXPKTSLO_IP4RXPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4RXPKTSLO_IP4RXPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP4RXPKTSLO_IP4RXPKTSLO_SHIFT) +#define I40E_GLPES_VFIP4RXTRUNC(_i) (0x00018700 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4RXTRUNC_MAX_INDEX 31 +#define I40E_GLPES_VFIP4RXTRUNC_IP4RXTRUNC_SHIFT 0 +#define I40E_GLPES_VFIP4RXTRUNC_IP4RXTRUNC_MASK (0xFFFFFFFF << I40E_GLPES_VFIP4RXTRUNC_IP4RXTRUNC_SHIFT) +#define 
I40E_GLPES_VFIP4TXFRAGSHI(_i) (0x00019E04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXFRAGSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXFRAGSHI_IP4TXFRAGSHI_SHIFT 0 +#define I40E_GLPES_VFIP4TXFRAGSHI_IP4TXFRAGSHI_MASK (0xFFFF << I40E_GLPES_VFIP4TXFRAGSHI_IP4TXFRAGSHI_SHIFT) +#define I40E_GLPES_VFIP4TXFRAGSLO(_i) (0x00019E00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXFRAGSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXFRAGSLO_IP4TXFRAGSLO_SHIFT 0 +#define I40E_GLPES_VFIP4TXFRAGSLO_IP4TXFRAGSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP4TXFRAGSLO_IP4TXFRAGSLO_SHIFT) +#define I40E_GLPES_VFIP4TXMCOCTSHI(_i) (0x0001A004 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXMCOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXMCOCTSHI_IP4TXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4TXMCOCTSHI_IP4TXMCOCTSHI_MASK (0xFFFF << I40E_GLPES_VFIP4TXMCOCTSHI_IP4TXMCOCTSHI_SHIFT) +#define I40E_GLPES_VFIP4TXMCOCTSLO(_i) (0x0001A000 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXMCOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXMCOCTSLO_IP4TXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4TXMCOCTSLO_IP4TXMCOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP4TXMCOCTSLO_IP4TXMCOCTSLO_SHIFT) +#define I40E_GLPES_VFIP4TXMCPKTSHI(_i) (0x0001A204 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXMCPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXMCPKTSHI_IP4TXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4TXMCPKTSHI_IP4TXMCPKTSHI_MASK (0xFFFF << I40E_GLPES_VFIP4TXMCPKTSHI_IP4TXMCPKTSHI_SHIFT) +#define I40E_GLPES_VFIP4TXMCPKTSLO(_i) (0x0001A200 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXMCPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXMCPKTSLO_IP4TXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4TXMCPKTSLO_IP4TXMCPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP4TXMCPKTSLO_IP4TXMCPKTSLO_SHIFT) +#define I40E_GLPES_VFIP4TXNOROUTE(_i) (0x0001AE00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXNOROUTE_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXNOROUTE_IP4TXNOROUTE_SHIFT 0 +#define I40E_GLPES_VFIP4TXNOROUTE_IP4TXNOROUTE_MASK (0xFFFFFF << I40E_GLPES_VFIP4TXNOROUTE_IP4TXNOROUTE_SHIFT) +#define I40E_GLPES_VFIP4TXOCTSHI(_i) (0x00019A04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXOCTSHI_IP4TXOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4TXOCTSHI_IP4TXOCTSHI_MASK (0xFFFF << I40E_GLPES_VFIP4TXOCTSHI_IP4TXOCTSHI_SHIFT) +#define I40E_GLPES_VFIP4TXOCTSLO(_i) (0x00019A00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXOCTSLO_IP4TXOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4TXOCTSLO_IP4TXOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP4TXOCTSLO_IP4TXOCTSLO_SHIFT) +#define I40E_GLPES_VFIP4TXPKTSHI(_i) (0x00019C04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXPKTSHI_IP4TXPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP4TXPKTSHI_IP4TXPKTSHI_MASK (0xFFFF << I40E_GLPES_VFIP4TXPKTSHI_IP4TXPKTSHI_SHIFT) +#define I40E_GLPES_VFIP4TXPKTSLO(_i) (0x00019C00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP4TXPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP4TXPKTSLO_IP4TXPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP4TXPKTSLO_IP4TXPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP4TXPKTSLO_IP4TXPKTSLO_SHIFT) +#define I40E_GLPES_VFIP6RXDISCARD(_i) (0x00019200 + 
((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXDISCARD_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXDISCARD_IP6RXDISCARD_SHIFT 0 +#define I40E_GLPES_VFIP6RXDISCARD_IP6RXDISCARD_MASK (0xFFFFFFFF << I40E_GLPES_VFIP6RXDISCARD_IP6RXDISCARD_SHIFT) +#define I40E_GLPES_VFIP6RXFRAGSHI(_i) (0x00019404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXFRAGSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXFRAGSHI_IP6RXFRAGSHI_SHIFT 0 +#define I40E_GLPES_VFIP6RXFRAGSHI_IP6RXFRAGSHI_MASK (0xFFFF << I40E_GLPES_VFIP6RXFRAGSHI_IP6RXFRAGSHI_SHIFT) +#define I40E_GLPES_VFIP6RXFRAGSLO(_i) (0x00019400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXFRAGSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXFRAGSLO_IP6RXFRAGSLO_SHIFT 0 +#define I40E_GLPES_VFIP6RXFRAGSLO_IP6RXFRAGSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP6RXFRAGSLO_IP6RXFRAGSLO_SHIFT) +#define I40E_GLPES_VFIP6RXMCOCTSHI(_i) (0x00019604 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXMCOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXMCOCTSHI_IP6RXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6RXMCOCTSHI_IP6RXMCOCTSHI_MASK (0xFFFF << I40E_GLPES_VFIP6RXMCOCTSHI_IP6RXMCOCTSHI_SHIFT) +#define I40E_GLPES_VFIP6RXMCOCTSLO(_i) (0x00019600 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXMCOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXMCOCTSLO_IP6RXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6RXMCOCTSLO_IP6RXMCOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP6RXMCOCTSLO_IP6RXMCOCTSLO_SHIFT) +#define I40E_GLPES_VFIP6RXMCPKTSHI(_i) (0x00019804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXMCPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXMCPKTSHI_IP6RXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6RXMCPKTSHI_IP6RXMCPKTSHI_MASK (0xFFFF << I40E_GLPES_VFIP6RXMCPKTSHI_IP6RXMCPKTSHI_SHIFT) +#define I40E_GLPES_VFIP6RXMCPKTSLO(_i) (0x00019800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXMCPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXMCPKTSLO_IP6RXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6RXMCPKTSLO_IP6RXMCPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP6RXMCPKTSLO_IP6RXMCPKTSLO_SHIFT) +#define I40E_GLPES_VFIP6RXOCTSHI(_i) (0x00018E04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXOCTSHI_IP6RXOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6RXOCTSHI_IP6RXOCTSHI_MASK (0xFFFF << I40E_GLPES_VFIP6RXOCTSHI_IP6RXOCTSHI_SHIFT) +#define I40E_GLPES_VFIP6RXOCTSLO(_i) (0x00018E00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXOCTSLO_IP6RXOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6RXOCTSLO_IP6RXOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP6RXOCTSLO_IP6RXOCTSLO_SHIFT) +#define I40E_GLPES_VFIP6RXPKTSHI(_i) (0x00019004 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXPKTSHI_IP6RXPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6RXPKTSHI_IP6RXPKTSHI_MASK (0xFFFF << I40E_GLPES_VFIP6RXPKTSHI_IP6RXPKTSHI_SHIFT) +#define I40E_GLPES_VFIP6RXPKTSLO(_i) (0x00019000 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6RXPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXPKTSLO_IP6RXPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6RXPKTSLO_IP6RXPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP6RXPKTSLO_IP6RXPKTSLO_SHIFT) +#define I40E_GLPES_VFIP6RXTRUNC(_i) (0x00019300 + ((_i) * 4)) /* _i=0...31 */ /* Reset: 
PE_CORER */ +#define I40E_GLPES_VFIP6RXTRUNC_MAX_INDEX 31 +#define I40E_GLPES_VFIP6RXTRUNC_IP6RXTRUNC_SHIFT 0 +#define I40E_GLPES_VFIP6RXTRUNC_IP6RXTRUNC_MASK (0xFFFFFFFF << I40E_GLPES_VFIP6RXTRUNC_IP6RXTRUNC_SHIFT) +#define I40E_GLPES_VFIP6TXFRAGSHI(_i) (0x0001A804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXFRAGSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXFRAGSHI_IP6TXFRAGSHI_SHIFT 0 +#define I40E_GLPES_VFIP6TXFRAGSHI_IP6TXFRAGSHI_MASK (0xFFFF << I40E_GLPES_VFIP6TXFRAGSHI_IP6TXFRAGSHI_SHIFT) +#define I40E_GLPES_VFIP6TXFRAGSLO(_i) (0x0001A800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXFRAGSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXFRAGSLO_IP6TXFRAGSLO_SHIFT 0 +#define I40E_GLPES_VFIP6TXFRAGSLO_IP6TXFRAGSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP6TXFRAGSLO_IP6TXFRAGSLO_SHIFT) +#define I40E_GLPES_VFIP6TXMCOCTSHI(_i) (0x0001AA04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXMCOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXMCOCTSHI_IP6TXMCOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6TXMCOCTSHI_IP6TXMCOCTSHI_MASK (0xFFFF << I40E_GLPES_VFIP6TXMCOCTSHI_IP6TXMCOCTSHI_SHIFT) +#define I40E_GLPES_VFIP6TXMCOCTSLO(_i) (0x0001AA00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXMCOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXMCOCTSLO_IP6TXMCOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6TXMCOCTSLO_IP6TXMCOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP6TXMCOCTSLO_IP6TXMCOCTSLO_SHIFT) +#define I40E_GLPES_VFIP6TXMCPKTSHI(_i) (0x0001AC04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXMCPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXMCPKTSHI_IP6TXMCPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6TXMCPKTSHI_IP6TXMCPKTSHI_MASK (0xFFFF << I40E_GLPES_VFIP6TXMCPKTSHI_IP6TXMCPKTSHI_SHIFT) +#define I40E_GLPES_VFIP6TXMCPKTSLO(_i) (0x0001AC00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXMCPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXMCPKTSLO_IP6TXMCPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6TXMCPKTSLO_IP6TXMCPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP6TXMCPKTSLO_IP6TXMCPKTSLO_SHIFT) +#define I40E_GLPES_VFIP6TXNOROUTE(_i) (0x0001AF00 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXNOROUTE_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXNOROUTE_IP6TXNOROUTE_SHIFT 0 +#define I40E_GLPES_VFIP6TXNOROUTE_IP6TXNOROUTE_MASK (0xFFFFFF << I40E_GLPES_VFIP6TXNOROUTE_IP6TXNOROUTE_SHIFT) +#define I40E_GLPES_VFIP6TXOCTSHI(_i) (0x0001A404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXOCTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXOCTSHI_IP6TXOCTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6TXOCTSHI_IP6TXOCTSHI_MASK (0xFFFF << I40E_GLPES_VFIP6TXOCTSHI_IP6TXOCTSHI_SHIFT) +#define I40E_GLPES_VFIP6TXOCTSLO(_i) (0x0001A400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXOCTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXOCTSLO_IP6TXOCTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6TXOCTSLO_IP6TXOCTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP6TXOCTSLO_IP6TXOCTSLO_SHIFT) +#define I40E_GLPES_VFIP6TXPKTSHI(_i) (0x0001A604 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFIP6TXPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXPKTSHI_IP6TXPKTSHI_SHIFT 0 +#define I40E_GLPES_VFIP6TXPKTSHI_IP6TXPKTSHI_MASK (0xFFFF << I40E_GLPES_VFIP6TXPKTSHI_IP6TXPKTSHI_SHIFT) +#define I40E_GLPES_VFIP6TXPKTSLO(_i) (0x0001A600 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define 
I40E_GLPES_VFIP6TXPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFIP6TXPKTSLO_IP6TXPKTSLO_SHIFT 0 +#define I40E_GLPES_VFIP6TXPKTSLO_IP6TXPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFIP6TXPKTSLO_IP6TXPKTSLO_SHIFT) +#define I40E_GLPES_VFRDMARXRDSHI(_i) (0x0001BE04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMARXRDSHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMARXRDSHI_RDMARXRDSHI_SHIFT 0 +#define I40E_GLPES_VFRDMARXRDSHI_RDMARXRDSHI_MASK (0xFFFF << I40E_GLPES_VFRDMARXRDSHI_RDMARXRDSHI_SHIFT) +#define I40E_GLPES_VFRDMARXRDSLO(_i) (0x0001BE00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMARXRDSLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMARXRDSLO_RDMARXRDSLO_SHIFT 0 +#define I40E_GLPES_VFRDMARXRDSLO_RDMARXRDSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFRDMARXRDSLO_RDMARXRDSLO_SHIFT) +#define I40E_GLPES_VFRDMARXSNDSHI(_i) (0x0001C004 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMARXSNDSHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMARXSNDSHI_RDMARXSNDSHI_SHIFT 0 +#define I40E_GLPES_VFRDMARXSNDSHI_RDMARXSNDSHI_MASK (0xFFFF << I40E_GLPES_VFRDMARXSNDSHI_RDMARXSNDSHI_SHIFT) +#define I40E_GLPES_VFRDMARXSNDSLO(_i) (0x0001C000 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMARXSNDSLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMARXSNDSLO_RDMARXSNDSLO_SHIFT 0 +#define I40E_GLPES_VFRDMARXSNDSLO_RDMARXSNDSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFRDMARXSNDSLO_RDMARXSNDSLO_SHIFT) +#define I40E_GLPES_VFRDMARXWRSHI(_i) (0x0001BC04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMARXWRSHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMARXWRSHI_RDMARXWRSHI_SHIFT 0 +#define I40E_GLPES_VFRDMARXWRSHI_RDMARXWRSHI_MASK (0xFFFF << I40E_GLPES_VFRDMARXWRSHI_RDMARXWRSHI_SHIFT) +#define I40E_GLPES_VFRDMARXWRSLO(_i) (0x0001BC00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMARXWRSLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMARXWRSLO_RDMARXWRSLO_SHIFT 0 +#define I40E_GLPES_VFRDMARXWRSLO_RDMARXWRSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFRDMARXWRSLO_RDMARXWRSLO_SHIFT) +#define I40E_GLPES_VFRDMATXRDSHI(_i) (0x0001C404 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMATXRDSHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMATXRDSHI_RDMARXRDSHI_SHIFT 0 +#define I40E_GLPES_VFRDMATXRDSHI_RDMARXRDSHI_MASK (0xFFFF << I40E_GLPES_VFRDMATXRDSHI_RDMARXRDSHI_SHIFT) +#define I40E_GLPES_VFRDMATXRDSLO(_i) (0x0001C400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMATXRDSLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMATXRDSLO_RDMARXRDSLO_SHIFT 0 +#define I40E_GLPES_VFRDMATXRDSLO_RDMARXRDSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFRDMATXRDSLO_RDMARXRDSLO_SHIFT) +#define I40E_GLPES_VFRDMATXSNDSHI(_i) (0x0001C604 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMATXSNDSHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMATXSNDSHI_RDMARXSNDSHI_SHIFT 0 +#define I40E_GLPES_VFRDMATXSNDSHI_RDMARXSNDSHI_MASK (0xFFFF << I40E_GLPES_VFRDMATXSNDSHI_RDMARXSNDSHI_SHIFT) +#define I40E_GLPES_VFRDMATXSNDSLO(_i) (0x0001C600 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMATXSNDSLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMATXSNDSLO_RDMARXSNDSLO_SHIFT 0 +#define I40E_GLPES_VFRDMATXSNDSLO_RDMARXSNDSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFRDMATXSNDSLO_RDMARXSNDSLO_SHIFT) +#define I40E_GLPES_VFRDMATXWRSHI(_i) (0x0001C204 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMATXWRSHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMATXWRSHI_RDMARXWRSHI_SHIFT 0 
+#define I40E_GLPES_VFRDMATXWRSHI_RDMARXWRSHI_MASK (0xFFFF << I40E_GLPES_VFRDMATXWRSHI_RDMARXWRSHI_SHIFT) +#define I40E_GLPES_VFRDMATXWRSLO(_i) (0x0001C200 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMATXWRSLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMATXWRSLO_RDMARXWRSLO_SHIFT 0 +#define I40E_GLPES_VFRDMATXWRSLO_RDMARXWRSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFRDMATXWRSLO_RDMARXWRSLO_SHIFT) +#define I40E_GLPES_VFRDMAVBNDHI(_i) (0x0001C804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMAVBNDHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMAVBNDHI_RDMAVBNDHI_SHIFT 0 +#define I40E_GLPES_VFRDMAVBNDHI_RDMAVBNDHI_MASK (0xFFFFFFFF << I40E_GLPES_VFRDMAVBNDHI_RDMAVBNDHI_SHIFT) +#define I40E_GLPES_VFRDMAVBNDLO(_i) (0x0001C800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMAVBNDLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMAVBNDLO_RDMAVBNDLO_SHIFT 0 +#define I40E_GLPES_VFRDMAVBNDLO_RDMAVBNDLO_MASK (0xFFFFFFFF << I40E_GLPES_VFRDMAVBNDLO_RDMAVBNDLO_SHIFT) +#define I40E_GLPES_VFRDMAVINVHI(_i) (0x0001CA04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMAVINVHI_MAX_INDEX 31 +#define I40E_GLPES_VFRDMAVINVHI_RDMAVINVHI_SHIFT 0 +#define I40E_GLPES_VFRDMAVINVHI_RDMAVINVHI_MASK (0xFFFFFFFF << I40E_GLPES_VFRDMAVINVHI_RDMAVINVHI_SHIFT) +#define I40E_GLPES_VFRDMAVINVLO(_i) (0x0001CA00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRDMAVINVLO_MAX_INDEX 31 +#define I40E_GLPES_VFRDMAVINVLO_RDMAVINVLO_SHIFT 0 +#define I40E_GLPES_VFRDMAVINVLO_RDMAVINVLO_MASK (0xFFFFFFFF << I40E_GLPES_VFRDMAVINVLO_RDMAVINVLO_SHIFT) +#define I40E_GLPES_VFRXVLANERR(_i) (0x00018000 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFRXVLANERR_MAX_INDEX 31 +#define I40E_GLPES_VFRXVLANERR_RXVLANERR_SHIFT 0 +#define I40E_GLPES_VFRXVLANERR_RXVLANERR_MASK (0xFFFFFF << I40E_GLPES_VFRXVLANERR_RXVLANERR_SHIFT) +#define I40E_GLPES_VFTCPRTXSEG(_i) (0x0001B600 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFTCPRTXSEG_MAX_INDEX 31 +#define I40E_GLPES_VFTCPRTXSEG_TCPRTXSEG_SHIFT 0 +#define I40E_GLPES_VFTCPRTXSEG_TCPRTXSEG_MASK (0xFFFFFFFF << I40E_GLPES_VFTCPRTXSEG_TCPRTXSEG_SHIFT) +#define I40E_GLPES_VFTCPRXOPTERR(_i) (0x0001B200 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFTCPRXOPTERR_MAX_INDEX 31 +#define I40E_GLPES_VFTCPRXOPTERR_TCPRXOPTERR_SHIFT 0 +#define I40E_GLPES_VFTCPRXOPTERR_TCPRXOPTERR_MASK (0xFFFFFF << I40E_GLPES_VFTCPRXOPTERR_TCPRXOPTERR_SHIFT) +#define I40E_GLPES_VFTCPRXPROTOERR(_i) (0x0001B300 + ((_i) * 4)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFTCPRXPROTOERR_MAX_INDEX 31 +#define I40E_GLPES_VFTCPRXPROTOERR_TCPRXPROTOERR_SHIFT 0 +#define I40E_GLPES_VFTCPRXPROTOERR_TCPRXPROTOERR_MASK (0xFFFFFF << I40E_GLPES_VFTCPRXPROTOERR_TCPRXPROTOERR_SHIFT) +#define I40E_GLPES_VFTCPRXSEGSHI(_i) (0x0001B004 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFTCPRXSEGSHI_MAX_INDEX 31 +#define I40E_GLPES_VFTCPRXSEGSHI_TCPRXSEGSHI_SHIFT 0 +#define I40E_GLPES_VFTCPRXSEGSHI_TCPRXSEGSHI_MASK (0xFFFF << I40E_GLPES_VFTCPRXSEGSHI_TCPRXSEGSHI_SHIFT) +#define I40E_GLPES_VFTCPRXSEGSLO(_i) (0x0001B000 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFTCPRXSEGSLO_MAX_INDEX 31 +#define I40E_GLPES_VFTCPRXSEGSLO_TCPRXSEGSLO_SHIFT 0 +#define I40E_GLPES_VFTCPRXSEGSLO_TCPRXSEGSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFTCPRXSEGSLO_TCPRXSEGSLO_SHIFT) +#define I40E_GLPES_VFTCPTXSEGHI(_i) (0x0001B404 + ((_i) * 8)) 
/* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFTCPTXSEGHI_MAX_INDEX 31 +#define I40E_GLPES_VFTCPTXSEGHI_TCPTXSEGHI_SHIFT 0 +#define I40E_GLPES_VFTCPTXSEGHI_TCPTXSEGHI_MASK (0xFFFF << I40E_GLPES_VFTCPTXSEGHI_TCPTXSEGHI_SHIFT) +#define I40E_GLPES_VFTCPTXSEGLO(_i) (0x0001B400 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFTCPTXSEGLO_MAX_INDEX 31 +#define I40E_GLPES_VFTCPTXSEGLO_TCPTXSEGLO_SHIFT 0 +#define I40E_GLPES_VFTCPTXSEGLO_TCPTXSEGLO_MASK (0xFFFFFFFF << I40E_GLPES_VFTCPTXSEGLO_TCPTXSEGLO_SHIFT) +#define I40E_GLPES_VFUDPRXPKTSHI(_i) (0x0001B804 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFUDPRXPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFUDPRXPKTSHI_UDPRXPKTSHI_SHIFT 0 +#define I40E_GLPES_VFUDPRXPKTSHI_UDPRXPKTSHI_MASK (0xFFFF << I40E_GLPES_VFUDPRXPKTSHI_UDPRXPKTSHI_SHIFT) +#define I40E_GLPES_VFUDPRXPKTSLO(_i) (0x0001B800 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFUDPRXPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFUDPRXPKTSLO_UDPRXPKTSLO_SHIFT 0 +#define I40E_GLPES_VFUDPRXPKTSLO_UDPRXPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFUDPRXPKTSLO_UDPRXPKTSLO_SHIFT) +#define I40E_GLPES_VFUDPTXPKTSHI(_i) (0x0001BA04 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFUDPTXPKTSHI_MAX_INDEX 31 +#define I40E_GLPES_VFUDPTXPKTSHI_UDPTXPKTSHI_SHIFT 0 +#define I40E_GLPES_VFUDPTXPKTSHI_UDPTXPKTSHI_MASK (0xFFFF << I40E_GLPES_VFUDPTXPKTSHI_UDPTXPKTSHI_SHIFT) +#define I40E_GLPES_VFUDPTXPKTSLO(_i) (0x0001BA00 + ((_i) * 8)) /* _i=0...31 */ /* Reset: PE_CORER */ +#define I40E_GLPES_VFUDPTXPKTSLO_MAX_INDEX 31 +#define I40E_GLPES_VFUDPTXPKTSLO_UDPTXPKTSLO_SHIFT 0 +#define I40E_GLPES_VFUDPTXPKTSLO_UDPTXPKTSLO_MASK (0xFFFFFFFF << I40E_GLPES_VFUDPTXPKTSLO_UDPTXPKTSLO_SHIFT) + +#define I40E_VFPE_AEQALLOC1 0x0000A400 /* Reset: VFR */ +#define I40E_VFPE_AEQALLOC1_AECOUNT_SHIFT 0 +#define I40E_VFPE_AEQALLOC1_AECOUNT_MASK (0xFFFFFFFF << I40E_VFPE_AEQALLOC1_AECOUNT_SHIFT) +#define I40E_VFPE_CCQPHIGH1 0x00009800 /* Reset: VFR */ +#define I40E_VFPE_CCQPHIGH1_PECCQPHIGH_SHIFT 0 +#define I40E_VFPE_CCQPHIGH1_PECCQPHIGH_MASK (0xFFFFFFFF << I40E_VFPE_CCQPHIGH1_PECCQPHIGH_SHIFT) +#define I40E_VFPE_CCQPLOW1 0x0000AC00 /* Reset: VFR */ +#define I40E_VFPE_CCQPLOW1_PECCQPLOW_SHIFT 0 +#define I40E_VFPE_CCQPLOW1_PECCQPLOW_MASK (0xFFFFFFFF << I40E_VFPE_CCQPLOW1_PECCQPLOW_SHIFT) +#define I40E_VFPE_CCQPSTATUS1 0x0000B800 /* Reset: VFR */ +#define I40E_VFPE_CCQPSTATUS1_CCQP_DONE_SHIFT 0 +#define I40E_VFPE_CCQPSTATUS1_CCQP_DONE_MASK (0x1 << I40E_VFPE_CCQPSTATUS1_CCQP_DONE_SHIFT) +#define I40E_VFPE_CCQPSTATUS1_HMC_PROFILE_SHIFT 4 +#define I40E_VFPE_CCQPSTATUS1_HMC_PROFILE_MASK (0x7 << I40E_VFPE_CCQPSTATUS1_HMC_PROFILE_SHIFT) +#define I40E_VFPE_CCQPSTATUS1_RDMA_EN_VFS_SHIFT 16 +#define I40E_VFPE_CCQPSTATUS1_RDMA_EN_VFS_MASK (0x3F << I40E_VFPE_CCQPSTATUS1_RDMA_EN_VFS_SHIFT) +#define I40E_VFPE_CCQPSTATUS1_CCQP_ERR_SHIFT 31 +#define I40E_VFPE_CCQPSTATUS1_CCQP_ERR_MASK (0x1 << I40E_VFPE_CCQPSTATUS1_CCQP_ERR_SHIFT) +#define I40E_VFPE_CQACK1 0x0000B000 /* Reset: VFR */ +#define I40E_VFPE_CQACK1_PECQID_SHIFT 0 +#define I40E_VFPE_CQACK1_PECQID_MASK (0x1FFFF << I40E_VFPE_CQACK1_PECQID_SHIFT) +#define I40E_VFPE_CQARM1 0x0000B400 /* Reset: VFR */ +#define I40E_VFPE_CQARM1_PECQID_SHIFT 0 +#define I40E_VFPE_CQARM1_PECQID_MASK (0x1FFFF << I40E_VFPE_CQARM1_PECQID_SHIFT) +#define I40E_VFPE_CQPDB1 0x0000BC00 /* Reset: VFR */ +#define I40E_VFPE_CQPDB1_WQHEAD_SHIFT 0 +#define I40E_VFPE_CQPDB1_WQHEAD_MASK (0x7FF << I40E_VFPE_CQPDB1_WQHEAD_SHIFT) 
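Where the GLPES counters carry one field per register, the VFPE control registers pack several independent fields into a single 32-bit word; CCQPSTATUS1 above holds a done bit, the HMC profile, the count of RDMA-enabled VFs, and an error bit, each with its own SHIFT/MASK pair. A minimal sketch of decoding such a word, assuming val was already read from the mapped BAR; struct ccqp_status and decode_ccqp_status() are illustrative names, not part of the provider:

#include <stdbool.h>
#include <stdint.h>

struct ccqp_status {
	bool done;       /* CCQP_DONE, bit 0 */
	uint8_t profile; /* HMC_PROFILE, bits 4..6 */
	uint8_t en_vfs;  /* RDMA_EN_VFS, bits 16..21 */
	bool err;        /* CCQP_ERR, bit 31 */
};

static struct ccqp_status decode_ccqp_status(uint32_t val)
{
	struct ccqp_status s;

	s.done = (val & I40E_VFPE_CCQPSTATUS1_CCQP_DONE_MASK) >>
		 I40E_VFPE_CCQPSTATUS1_CCQP_DONE_SHIFT;
	s.profile = (val & I40E_VFPE_CCQPSTATUS1_HMC_PROFILE_MASK) >>
		    I40E_VFPE_CCQPSTATUS1_HMC_PROFILE_SHIFT;
	s.en_vfs = (val & I40E_VFPE_CCQPSTATUS1_RDMA_EN_VFS_MASK) >>
		   I40E_VFPE_CCQPSTATUS1_RDMA_EN_VFS_SHIFT;
	s.err = (val & I40E_VFPE_CCQPSTATUS1_CCQP_ERR_MASK) >>
		I40E_VFPE_CCQPSTATUS1_CCQP_ERR_SHIFT;
	return s;
}

Because the masks in this header are pre-shifted, the mask-then-shift order matters: masking first strips the neighbouring fields, and shifting then right-justifies the field of interest.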
+#define I40E_VFPE_CQPERRCODES1 0x00009C00 /* Reset: VFR */ +#define I40E_VFPE_CQPERRCODES1_CQP_MINOR_CODE_SHIFT 0 +#define I40E_VFPE_CQPERRCODES1_CQP_MINOR_CODE_MASK (0xFFFF << I40E_VFPE_CQPERRCODES1_CQP_MINOR_CODE_SHIFT) +#define I40E_VFPE_CQPERRCODES1_CQP_MAJOR_CODE_SHIFT 16 +#define I40E_VFPE_CQPERRCODES1_CQP_MAJOR_CODE_MASK (0xFFFF << I40E_VFPE_CQPERRCODES1_CQP_MAJOR_CODE_SHIFT) +#define I40E_VFPE_CQPTAIL1 0x0000A000 /* Reset: VFR */ +#define I40E_VFPE_CQPTAIL1_WQTAIL_SHIFT 0 +#define I40E_VFPE_CQPTAIL1_WQTAIL_MASK (0x7FF << I40E_VFPE_CQPTAIL1_WQTAIL_SHIFT) +#define I40E_VFPE_CQPTAIL1_CQP_OP_ERR_SHIFT 31 +#define I40E_VFPE_CQPTAIL1_CQP_OP_ERR_MASK (0x1 << I40E_VFPE_CQPTAIL1_CQP_OP_ERR_SHIFT) +#define I40E_VFPE_IPCONFIG01 0x00008C00 /* Reset: VFR */ +#define I40E_VFPE_IPCONFIG01_PEIPID_SHIFT 0 +#define I40E_VFPE_IPCONFIG01_PEIPID_MASK (0xFFFF << I40E_VFPE_IPCONFIG01_PEIPID_SHIFT) +#define I40E_VFPE_IPCONFIG01_USEENTIREIDRANGE_SHIFT 16 +#define I40E_VFPE_IPCONFIG01_USEENTIREIDRANGE_MASK (0x1 << I40E_VFPE_IPCONFIG01_USEENTIREIDRANGE_SHIFT) +#define I40E_VFPE_MRTEIDXMASK1 0x00009000 /* Reset: VFR */ +#define I40E_VFPE_MRTEIDXMASK1_MRTEIDXMASKBITS_SHIFT 0 +#define I40E_VFPE_MRTEIDXMASK1_MRTEIDXMASKBITS_MASK (0x1F << I40E_VFPE_MRTEIDXMASK1_MRTEIDXMASKBITS_SHIFT) +#define I40E_VFPE_RCVUNEXPECTEDERROR1 0x00009400 /* Reset: VFR */ +#define I40E_VFPE_RCVUNEXPECTEDERROR1_TCP_RX_UNEXP_ERR_SHIFT 0 +#define I40E_VFPE_RCVUNEXPECTEDERROR1_TCP_RX_UNEXP_ERR_MASK (0xFFFFFF << I40E_VFPE_RCVUNEXPECTEDERROR1_TCP_RX_UNEXP_ERR_SHIFT) +#define I40E_VFPE_TCPNOWTIMER1 0x0000A800 /* Reset: VFR */ +#define I40E_VFPE_TCPNOWTIMER1_TCP_NOW_SHIFT 0 +#define I40E_VFPE_TCPNOWTIMER1_TCP_NOW_MASK (0xFFFFFFFF << I40E_VFPE_TCPNOWTIMER1_TCP_NOW_SHIFT) +#define I40E_VFPE_WQEALLOC1 0x0000C000 /* Reset: VFR */ +#define I40E_VFPE_WQEALLOC1_PEQPID_SHIFT 0 +#define I40E_VFPE_WQEALLOC1_PEQPID_MASK (0x3FFFF << I40E_VFPE_WQEALLOC1_PEQPID_SHIFT) +#define I40E_VFPE_WQEALLOC1_WQE_DESC_INDEX_SHIFT 20 +#define I40E_VFPE_WQEALLOC1_WQE_DESC_INDEX_MASK (0xFFF << I40E_VFPE_WQEALLOC1_WQE_DESC_INDEX_SHIFT) +#endif /* I40IW_REGISTER_H */ diff --git a/providers/i40iw/i40iw_status.h b/providers/i40iw/i40iw_status.h new file mode 100644 index 0000000..c64ce54 --- /dev/null +++ b/providers/i40iw/i40iw_status.h @@ -0,0 +1,101 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +* +*******************************************************************************/ + +#ifndef I40IW_STATUS_H +#define I40IW_STATUS_H + +/* Error Codes */ +enum i40iw_status_code { + I40IW_SUCCESS = 0, + I40IW_ERR_NVM = -1, + I40IW_ERR_NVM_CHECKSUM = -2, + I40IW_ERR_CONFIG = -4, + I40IW_ERR_PARAM = -5, + I40IW_ERR_DEVICE_NOT_SUPPORTED = -6, + I40IW_ERR_RESET_FAILED = -7, + I40IW_ERR_SWFW_SYNC = -8, + I40IW_ERR_NO_MEMORY = -9, + I40IW_ERR_BAD_PTR = -10, + I40IW_ERR_INVALID_PD_ID = -11, + I40IW_ERR_INVALID_QP_ID = -12, + I40IW_ERR_INVALID_CQ_ID = -13, + I40IW_ERR_INVALID_CEQ_ID = -14, + I40IW_ERR_INVALID_AEQ_ID = -15, + I40IW_ERR_INVALID_SIZE = -16, + I40IW_ERR_INVALID_ARP_INDEX = -17, + I40IW_ERR_INVALID_FPM_FUNC_ID = -18, + I40IW_ERR_QP_INVALID_MSG_SIZE = -19, + I40IW_ERR_QP_TOOMANY_WRS_POSTED = -20, + I40IW_ERR_INVALID_FRAG_COUNT = -21, + I40IW_ERR_QUEUE_EMPTY = -22, + I40IW_ERR_INVALID_ALIGNMENT = -23, + I40IW_ERR_FLUSHED_QUEUE = -24, + I40IW_ERR_INVALID_PUSH_PAGE_INDEX = -25, + I40IW_ERR_INVALID_IMM_DATA_SIZE = -26, + I40IW_ERR_TIMEOUT = -27, + I40IW_ERR_OPCODE_MISMATCH = -28, + I40IW_ERR_CQP_COMPL_ERROR = -29, + I40IW_ERR_INVALID_VF_ID = -30, + I40IW_ERR_INVALID_HMCFN_ID = -31, + I40IW_ERR_BACKING_PAGE_ERROR = -32, + I40IW_ERR_NO_PBLCHUNKS_AVAILABLE = -33, + I40IW_ERR_INVALID_PBLE_INDEX = -34, + I40IW_ERR_INVALID_SD_INDEX = -35, + I40IW_ERR_INVALID_PAGE_DESC_INDEX = -36, + I40IW_ERR_INVALID_SD_TYPE = -37, + I40IW_ERR_MEMCPY_FAILED = -38, + I40IW_ERR_INVALID_HMC_OBJ_INDEX = -39, + I40IW_ERR_INVALID_HMC_OBJ_COUNT = -40, + I40IW_ERR_INVALID_SRQ_ARM_LIMIT = -41, + I40IW_ERR_SRQ_ENABLED = -42, + I40IW_ERR_BUF_TOO_SHORT = -43, + I40IW_ERR_BAD_IWARP_CQE = -44, + I40IW_ERR_NVM_BLANK_MODE = -45, + I40IW_ERR_NOT_IMPLEMENTED = -46, + I40IW_ERR_PE_DOORBELL_NOT_ENABLED = -47, + I40IW_ERR_NOT_READY = -48, + I40IW_NOT_SUPPORTED = -49, + I40IW_ERR_FIRMWARE_API_VERSION = -50, + I40IW_ERR_RING_FULL = -51, + I40IW_ERR_MPA_CRC = -61, + I40IW_ERR_NO_TXBUFS = -62, + I40IW_ERR_SEQ_NUM = -63, + I40IW_ERR_LIST_EMPTY = -64, + I40IW_ERR_INVALID_MAC_ADDR = -65, + I40IW_ERR_BAD_STAG = -66, + I40IW_ERR_CQ_COMPL_ERROR = -67, + I40IW_ERR_QUEUE_DESTROYED = -68 + +}; +#endif diff --git a/providers/i40iw/i40iw_uk.c b/providers/i40iw/i40iw_uk.c new file mode 100644 index 0000000..624b330 --- /dev/null +++ b/providers/i40iw/i40iw_uk.c @@ -0,0 +1,1266 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. 
+* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +* +*******************************************************************************/ + +#include <stdint.h> +#include <stdatomic.h> + +#include "i40iw_osdep.h" +#include "i40iw_status.h" +#include "i40iw_d.h" +#include "i40iw_user.h" +#include "i40iw_register.h" + +static u32 nop_signature = 0x55550000; + +/** + * i40iw_nop_1 - insert a nop wqe and move head. no post work + * @qp: hw qp ptr + */ +static enum i40iw_status_code i40iw_nop_1(struct i40iw_qp_uk *qp) +{ + u64 header, *wqe; + u64 *wqe_0 = NULL; + u32 wqe_idx, peek_head; + bool signaled = false; + + if (!qp->sq_ring.head) + return I40IW_ERR_PARAM; + + wqe_idx = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring); + wqe = qp->sq_base[wqe_idx].elem; + + qp->sq_wrtrk_array[wqe_idx].wqe_size = I40IW_QP_WQE_MIN_SIZE; + + peek_head = (qp->sq_ring.head + 1) % qp->sq_ring.size; + wqe_0 = qp->sq_base[peek_head].elem; + if (peek_head) + wqe_0[3] = LS_64(!qp->swqe_polarity, I40IWQPSQ_VALID); + else + wqe_0[3] = LS_64(qp->swqe_polarity, I40IWQPSQ_VALID); + set_64bit_val(wqe, I40IW_BYTE_0, 0); + set_64bit_val(wqe, I40IW_BYTE_8, 0); + set_64bit_val(wqe, I40IW_BYTE_16, 0); + + header = LS_64(I40IWQP_OP_NOP, I40IWQPSQ_OPCODE) | + LS_64(signaled, I40IWQPSQ_SIGCOMPL) | + LS_64(qp->swqe_polarity, I40IWQPSQ_VALID) | nop_signature++; + + udma_to_device_barrier(); /* Memory barrier to ensure data is written before valid bit is set */ + + set_64bit_val(wqe, I40IW_BYTE_24, header); + return 0; +} + +/** + * i40iw_qp_post_wr - post wr to hardware + * @qp: hw qp ptr + */ +void i40iw_qp_post_wr(struct i40iw_qp_uk *qp) +{ + u64 temp; + u32 hw_sq_tail; + u32 sw_sq_head; + + /* valid bit is written and loads completed before reading shadow + * + * Whatever is happening here does not match our common macros for + * producer/consumer DMA and may not be portable, however on x86-64 + * the required barrier is MFENCE, get a 'portable' version via C11 + * atomic.
+ */ + atomic_thread_fence(memory_order_seq_cst); + + /* read the doorbell shadow area */ + get_64bit_val(qp->shadow_area, I40IW_BYTE_0, &temp); + + hw_sq_tail = (u32)RS_64(temp, I40IW_QP_DBSA_HW_SQ_TAIL); + sw_sq_head = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring); + if (sw_sq_head != hw_sq_tail) { + if (sw_sq_head > qp->initial_ring.head) { + if ((hw_sq_tail >= qp->initial_ring.head) && + (hw_sq_tail < sw_sq_head)) { + db_wr32(qp->qp_id, qp->wqe_alloc_reg); + } + } else if (sw_sq_head != qp->initial_ring.head) { + if ((hw_sq_tail >= qp->initial_ring.head) || + (hw_sq_tail < sw_sq_head)) { + db_wr32(qp->qp_id, qp->wqe_alloc_reg); + } + } + } + + qp->initial_ring.head = qp->sq_ring.head; +} + +/** + * i40iw_qp_ring_push_db - ring qp doorbell + * @qp: hw qp ptr + * @wqe_idx: wqe index + */ +static void i40iw_qp_ring_push_db(struct i40iw_qp_uk *qp, u32 wqe_idx) +{ + set_32bit_val(qp->push_db, 0, LS_32((wqe_idx >> 2), I40E_PFPE_WQEALLOC_WQE_DESC_INDEX) | qp->qp_id); + qp->initial_ring.head = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring); +} + +/** + * i40iw_qp_get_next_send_wqe - return next wqe ptr + * @qp: hw qp ptr + * @wqe_idx: return wqe index + * @wqe_size: size of sq wqe + */ +u64 *i40iw_qp_get_next_send_wqe(struct i40iw_qp_uk *qp, + u32 *wqe_idx, + u8 wqe_size, + u32 total_size, + u64 wr_id + ) +{ + u64 *wqe = NULL; + u64 wqe_ptr; + u32 peek_head = 0; + u16 offset; + enum i40iw_status_code ret_code = 0; + u8 nop_wqe_cnt = 0, i; + u64 *wqe_0 = NULL; + + *wqe_idx = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring); + + if (!*wqe_idx) + qp->swqe_polarity = !qp->swqe_polarity; + wqe_ptr = (uintptr_t)qp->sq_base[*wqe_idx].elem; + offset = (u16)(wqe_ptr) & 0x7F; + if ((offset + wqe_size) > I40IW_QP_WQE_MAX_SIZE) { + nop_wqe_cnt = (u8)(I40IW_QP_WQE_MAX_SIZE - offset) / I40IW_QP_WQE_MIN_SIZE; + for (i = 0; i < nop_wqe_cnt; i++) { + i40iw_nop_1(qp); + I40IW_RING_MOVE_HEAD(qp->sq_ring, ret_code); + if (ret_code) + return NULL; + } + + *wqe_idx = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring); + if (!*wqe_idx) + qp->swqe_polarity = !qp->swqe_polarity; + } + + if (((*wqe_idx & 3) == 1) && (wqe_size == I40IW_WQE_SIZE_64)) { + i40iw_nop_1(qp); + I40IW_RING_MOVE_HEAD(qp->sq_ring, ret_code); + if (ret_code) + return NULL; + *wqe_idx = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring); + if (!*wqe_idx) + qp->swqe_polarity = !qp->swqe_polarity; + } + + I40IW_RING_MOVE_HEAD_BY_COUNT(qp->sq_ring, + (wqe_size / I40IW_QP_WQE_MIN_SIZE), ret_code); + if (ret_code) + return NULL; + + wqe = qp->sq_base[*wqe_idx].elem; + + peek_head = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring); + wqe_0 = qp->sq_base[peek_head].elem; + + if (((peek_head & 3) == 1) || ((peek_head & 3) == 3)) { + if (RS_64(wqe_0[3], I40IWQPSQ_VALID) != !qp->swqe_polarity) + wqe_0[3] = LS_64(!qp->swqe_polarity, I40IWQPSQ_VALID); + } + + qp->sq_wrtrk_array[*wqe_idx].wrid = wr_id; + qp->sq_wrtrk_array[*wqe_idx].wr_len = total_size; + qp->sq_wrtrk_array[*wqe_idx].wqe_size = wqe_size; + return wqe; +} + +/** + * i40iw_set_fragment - set fragment in wqe + * @wqe: wqe for setting fragment + * @offset: offset value + * @sge: sge length and stag + */ +static void i40iw_set_fragment(u64 *wqe, u32 offset, struct i40iw_sge *sge) +{ + if (sge) { + set_64bit_val(wqe, offset, LS_64(sge->tag_off, I40IWQPSQ_FRAG_TO)); + set_64bit_val(wqe, (offset + I40IW_BYTE_8), + (LS_64(sge->len, I40IWQPSQ_FRAG_LEN) | + LS_64(sge->stag, I40IWQPSQ_FRAG_STAG))); + } +} + +/** + * i40iw_qp_get_next_recv_wqe - get next qp's rcv wqe + * @qp: hw qp ptr + * @wqe_idx: return wqe index + */ +u64 
*i40iw_qp_get_next_recv_wqe(struct i40iw_qp_uk *qp, u32 *wqe_idx) +{ + u64 *wqe = NULL; + enum i40iw_status_code ret_code; + + if (I40IW_RING_FULL_ERR(qp->rq_ring)) + return NULL; + + I40IW_ATOMIC_RING_MOVE_HEAD(qp->rq_ring, *wqe_idx, ret_code); + if (ret_code) + return NULL; + if (!*wqe_idx) + qp->rwqe_polarity = !qp->rwqe_polarity; + /* rq_wqe_size_multiplier is no of qwords in one rq wqe */ + wqe = qp->rq_base[*wqe_idx * (qp->rq_wqe_size_multiplier >> 2)].elem; + + return wqe; +} + +/** + * i40iw_rdma_write - rdma write operation + * @qp: hw qp ptr + * @info: post sq information + * @post_sq: flag to post sq + */ +static enum i40iw_status_code i40iw_rdma_write(struct i40iw_qp_uk *qp, + struct i40iw_post_sq_info *info, + bool post_sq) +{ + u64 header; + u64 *wqe; + struct i40iw_rdma_write *op_info; + u32 i, wqe_idx; + u32 total_size = 0, byte_off; + enum i40iw_status_code ret_code; + bool read_fence = false; + u8 wqe_size; + + op_info = &info->op.rdma_write; + if (op_info->num_lo_sges > qp->max_sq_frag_cnt) + return I40IW_ERR_INVALID_FRAG_COUNT; + + for (i = 0; i < op_info->num_lo_sges; i++) + total_size += op_info->lo_sg_list[i].len; + + if (total_size > I40IW_MAX_OUTBOUND_MESSAGE_SIZE) + return I40IW_ERR_QP_INVALID_MSG_SIZE; + + read_fence |= info->read_fence; + + ret_code = i40iw_fragcnt_to_wqesize_sq(op_info->num_lo_sges, &wqe_size); + if (ret_code) + return ret_code; + + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, + total_size,info->wr_id); + if (!wqe) + return I40IW_ERR_QP_TOOMANY_WRS_POSTED; + set_64bit_val(wqe, I40IW_BYTE_16, + LS_64(op_info->rem_addr.tag_off, I40IWQPSQ_FRAG_TO)); + if (!op_info->rem_addr.stag) + return I40IW_ERR_BAD_STAG; + + header = LS_64(op_info->rem_addr.stag, I40IWQPSQ_REMSTAG) | + LS_64(I40IWQP_OP_RDMA_WRITE, I40IWQPSQ_OPCODE) | + LS_64((op_info->num_lo_sges > 1 ? (op_info->num_lo_sges - 1) : 0), I40IWQPSQ_ADDFRAGCNT) | + LS_64(read_fence, I40IWQPSQ_READFENCE) | + LS_64(info->local_fence, I40IWQPSQ_LOCALFENCE) | + LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) | + LS_64(qp->swqe_polarity, I40IWQPSQ_VALID); + + i40iw_set_fragment(wqe, I40IW_BYTE_0, op_info->lo_sg_list); + + for (i = 1, byte_off = I40IW_BYTE_32; i < op_info->num_lo_sges; i++) { + i40iw_set_fragment(wqe, byte_off, &op_info->lo_sg_list[i]); + byte_off += 16; + } + + udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */ + + set_64bit_val(wqe, I40IW_BYTE_24, header); + + if (post_sq) + i40iw_qp_post_wr(qp); + + return 0; +} + +/** + * i40iw_rdma_read - rdma read command + * @qp: hw qp ptr + * @info: post sq information + * @inv_stag: flag for inv_stag + * @post_sq: flag to post sq + */ +static enum i40iw_status_code i40iw_rdma_read(struct i40iw_qp_uk *qp, + struct i40iw_post_sq_info *info, + bool inv_stag, + bool post_sq) +{ + u64 *wqe; + struct i40iw_rdma_read *op_info; + u64 header; + u32 wqe_idx; + enum i40iw_status_code ret_code; + u8 wqe_size; + bool local_fence = false; + + op_info = &info->op.rdma_read; + ret_code = i40iw_fragcnt_to_wqesize_sq(1, &wqe_size); + if (ret_code) + return ret_code; + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, + op_info->lo_addr.len,info->wr_id); + if (!wqe) + return I40IW_ERR_QP_TOOMANY_WRS_POSTED; + local_fence |= info->local_fence; + + set_64bit_val(wqe, I40IW_BYTE_16, LS_64(op_info->rem_addr.tag_off, I40IWQPSQ_FRAG_TO)); + header = LS_64(op_info->rem_addr.stag, I40IWQPSQ_REMSTAG) | + LS_64((inv_stag ? 
I40IWQP_OP_RDMA_READ_LOC_INV : I40IWQP_OP_RDMA_READ), I40IWQPSQ_OPCODE) | + LS_64(info->read_fence, I40IWQPSQ_READFENCE) | + LS_64(local_fence, I40IWQPSQ_LOCALFENCE) | + LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) | + LS_64(qp->swqe_polarity, I40IWQPSQ_VALID); + + i40iw_set_fragment(wqe, I40IW_BYTE_0, &op_info->lo_addr); + + udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */ + + set_64bit_val(wqe, I40IW_BYTE_24, header); + if (post_sq) + i40iw_qp_post_wr(qp); + + return 0; +} + +/** + * i40iw_send - rdma send command + * @qp: hw qp ptr + * @info: post sq information + * @stag_to_inv: stag_to_inv value + * @post_sq: flag to post sq + */ +static enum i40iw_status_code i40iw_send(struct i40iw_qp_uk *qp, + struct i40iw_post_sq_info *info, + u32 stag_to_inv, + bool post_sq) +{ + u64 *wqe; + struct i40iw_post_send *op_info; + u64 header; + u32 i, wqe_idx, total_size = 0, byte_off; + enum i40iw_status_code ret_code; + bool read_fence = false; + u8 wqe_size; + + op_info = &info->op.send; + if (qp->max_sq_frag_cnt < op_info->num_sges) + return I40IW_ERR_INVALID_FRAG_COUNT; + + for (i = 0; i < op_info->num_sges; i++) + total_size += op_info->sg_list[i].len; + ret_code = i40iw_fragcnt_to_wqesize_sq(op_info->num_sges, &wqe_size); + if (ret_code) + return ret_code; + + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, + total_size,info->wr_id); + if (!wqe) + return I40IW_ERR_QP_TOOMANY_WRS_POSTED; + + read_fence |= info->read_fence; + + set_64bit_val(wqe, I40IW_BYTE_16, 0); + header = LS_64(stag_to_inv, I40IWQPSQ_REMSTAG) | + LS_64(info->op_type, I40IWQPSQ_OPCODE) | + LS_64((op_info->num_sges > 1 ? (op_info->num_sges - 1) : 0), + I40IWQPSQ_ADDFRAGCNT) | + LS_64(read_fence, I40IWQPSQ_READFENCE) | + LS_64(info->local_fence, I40IWQPSQ_LOCALFENCE) | + LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) | + LS_64(qp->swqe_polarity, I40IWQPSQ_VALID); + + i40iw_set_fragment(wqe, I40IW_BYTE_0, op_info->sg_list); + + for (i = 1, byte_off = I40IW_BYTE_32; i < op_info->num_sges; i++) { + i40iw_set_fragment(wqe, byte_off, &op_info->sg_list[i]); + byte_off += 16; + } + + udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */ + + set_64bit_val(wqe, I40IW_BYTE_24, header); + if (post_sq) + i40iw_qp_post_wr(qp); + + return 0; +} + +/** + * i40iw_inline_rdma_write - inline rdma write operation + * @qp: hw qp ptr + * @info: post sq information + * @post_sq: flag to post sq + */ +static enum i40iw_status_code i40iw_inline_rdma_write(struct i40iw_qp_uk *qp, + struct i40iw_post_sq_info *info, + bool post_sq) +{ + u64 *wqe; + u8 *dest, *src; + struct i40iw_inline_rdma_write *op_info; + u64 *push; + u64 header = 0; + u32 wqe_idx; + enum i40iw_status_code ret_code; + bool read_fence = false; + u8 wqe_size; + + op_info = &info->op.inline_rdma_write; + if (op_info->len > I40IW_MAX_INLINE_DATA_SIZE) + return I40IW_ERR_INVALID_IMM_DATA_SIZE; + + ret_code = i40iw_inline_data_size_to_wqesize(op_info->len, &wqe_size); + if (ret_code) + return ret_code; + + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, + op_info->len,info->wr_id); + if (!wqe) + return I40IW_ERR_QP_TOOMANY_WRS_POSTED; + + read_fence |= info->read_fence; + set_64bit_val(wqe, I40IW_BYTE_16, + LS_64(op_info->rem_addr.tag_off, I40IWQPSQ_FRAG_TO)); + + header = LS_64(op_info->rem_addr.stag, I40IWQPSQ_REMSTAG) | + LS_64(I40IWQP_OP_RDMA_WRITE, I40IWQPSQ_OPCODE) | + LS_64(op_info->len, I40IWQPSQ_INLINEDATALEN) | + LS_64(1, I40IWQPSQ_INLINEDATAFLAG) | + LS_64((qp->push_db ? 
1 : 0), I40IWQPSQ_PUSHWQE) | + LS_64(read_fence, I40IWQPSQ_READFENCE) | + LS_64(info->local_fence, I40IWQPSQ_LOCALFENCE) | + LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) | + LS_64(qp->swqe_polarity, I40IWQPSQ_VALID); + + dest = (u8 *)wqe; + src = (u8 *)(op_info->data); + + if (op_info->len <= I40IW_BYTE_16) { + memcpy(dest, src, op_info->len); + } else { + memcpy(dest, src, I40IW_BYTE_16); + src += I40IW_BYTE_16; + dest = (u8 *)wqe + I40IW_BYTE_32; + memcpy(dest, src, op_info->len - I40IW_BYTE_16); + } + + udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */ + + set_64bit_val(wqe, I40IW_BYTE_24, header); + + if (qp->push_db) { + push = (u64 *)((uintptr_t)qp->push_wqe + (wqe_idx & 0x3) * 0x20); + memcpy(push, wqe, (op_info->len > 16) ? op_info->len + 16 : 32); + i40iw_qp_ring_push_db(qp, wqe_idx); + } else { + if (post_sq) + i40iw_qp_post_wr(qp); + } + + return 0; +} + +/** + * i40iw_inline_send - inline send operation + * @qp: hw qp ptr + * @info: post sq information + * @stag_to_inv: remote stag + * @post_sq: flag to post sq + */ +static enum i40iw_status_code i40iw_inline_send(struct i40iw_qp_uk *qp, + struct i40iw_post_sq_info *info, + u32 stag_to_inv, + bool post_sq) +{ + u64 *wqe; + u8 *dest, *src; + struct i40iw_post_inline_send *op_info; + u64 header; + u32 wqe_idx; + enum i40iw_status_code ret_code; + bool read_fence = false; + u8 wqe_size; + u64 *push; + + op_info = &info->op.inline_send; + if (op_info->len > I40IW_MAX_INLINE_DATA_SIZE) + return I40IW_ERR_INVALID_IMM_DATA_SIZE; + + ret_code = i40iw_inline_data_size_to_wqesize(op_info->len, &wqe_size); + if (ret_code) + return ret_code; + + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, + op_info->len,info->wr_id); + if (!wqe) + return I40IW_ERR_QP_TOOMANY_WRS_POSTED; + + read_fence |= info->read_fence; + header = LS_64(stag_to_inv, I40IWQPSQ_REMSTAG) | + LS_64(info->op_type, I40IWQPSQ_OPCODE) | + LS_64(op_info->len, I40IWQPSQ_INLINEDATALEN) | + LS_64(1, I40IWQPSQ_INLINEDATAFLAG) | + LS_64((qp->push_db ? 1 : 0), I40IWQPSQ_PUSHWQE) | + LS_64(read_fence, I40IWQPSQ_READFENCE) | + LS_64(info->local_fence, I40IWQPSQ_LOCALFENCE) | + LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) | + LS_64(qp->swqe_polarity, I40IWQPSQ_VALID); + + dest = (u8 *)wqe; + src = (u8 *)(op_info->data); + + if (op_info->len <= I40IW_BYTE_16) { + memcpy(dest, src, op_info->len); + } else { + memcpy(dest, src, I40IW_BYTE_16); + src += I40IW_BYTE_16; + dest = (u8 *)wqe + I40IW_BYTE_32; + memcpy(dest, src, op_info->len - I40IW_BYTE_16); + } + + udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */ + + set_64bit_val(wqe, I40IW_BYTE_24, header); + + if (qp->push_db) { + push = (u64 *)((uintptr_t)qp->push_wqe + (wqe_idx & 0x3) * 0x20); + memcpy(push, wqe, (op_info->len > 16) ? 
op_info->len + 16 : 32); + i40iw_qp_ring_push_db(qp, wqe_idx); + } else { + if (post_sq) + i40iw_qp_post_wr(qp); + } + + return 0; +} + +/** + * i40iw_stag_local_invalidate - stag invalidate operation + * @qp: hw qp ptr + * @info: post sq information + * @post_sq: flag to post sq + */ +static enum i40iw_status_code i40iw_stag_local_invalidate(struct i40iw_qp_uk *qp, + struct i40iw_post_sq_info *info, + bool post_sq) +{ + u64 *wqe; + struct i40iw_inv_local_stag *op_info; + u64 header; + u32 wqe_idx; + bool local_fence = false; + + op_info = &info->op.inv_local_stag; + local_fence = info->local_fence; + + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE, + 0,info->wr_id); + if (!wqe) + return I40IW_ERR_QP_TOOMANY_WRS_POSTED; + set_64bit_val(wqe, I40IW_BYTE_0, 0); + set_64bit_val(wqe, I40IW_BYTE_8, + LS_64(op_info->target_stag, I40IWQPSQ_LOCSTAG)); + set_64bit_val(wqe, I40IW_BYTE_16, 0); + header = LS_64(I40IW_OP_TYPE_INV_STAG, I40IWQPSQ_OPCODE) | + LS_64(info->read_fence, I40IWQPSQ_READFENCE) | + LS_64(local_fence, I40IWQPSQ_LOCALFENCE) | + LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) | + LS_64(qp->swqe_polarity, I40IWQPSQ_VALID); + + udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */ + + set_64bit_val(wqe, I40IW_BYTE_24, header); + + if (post_sq) + i40iw_qp_post_wr(qp); + + return 0; +} + +/** + * i40iw_mw_bind - Memory Window bind operation + * @qp: hw qp ptr + * @info: post sq information + * @post_sq: flag to post sq + */ +static enum i40iw_status_code i40iw_mw_bind(struct i40iw_qp_uk *qp, + struct i40iw_post_sq_info *info, + bool post_sq) +{ + u64 *wqe; + struct i40iw_bind_window *op_info; + u64 header; + u32 wqe_idx; + bool local_fence = false; + + op_info = &info->op.bind_window; + + local_fence |= info->local_fence; + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE, + 0,info->wr_id); + if (!wqe) + return I40IW_ERR_QP_TOOMANY_WRS_POSTED; + set_64bit_val(wqe, I40IW_BYTE_0, (uintptr_t)op_info->va); + set_64bit_val(wqe, I40IW_BYTE_8, + LS_64(op_info->mr_stag, I40IWQPSQ_PARENTMRSTAG) | + LS_64(op_info->mw_stag, I40IWQPSQ_MWSTAG)); + set_64bit_val(wqe, I40IW_BYTE_16, op_info->bind_length); + header = LS_64(I40IW_OP_TYPE_BIND_MW, I40IWQPSQ_OPCODE) | + LS_64(((op_info->enable_reads << 2) | + (op_info->enable_writes << 3)), + I40IWQPSQ_STAGRIGHTS) | + LS_64((op_info->addressing_type == I40IW_ADDR_TYPE_VA_BASED ? 1 : 0), + I40IWQPSQ_VABASEDTO) | + LS_64(info->read_fence, I40IWQPSQ_READFENCE) | + LS_64(local_fence, I40IWQPSQ_LOCALFENCE) | + LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) | + LS_64(qp->swqe_polarity, I40IWQPSQ_VALID); + + udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */ + + set_64bit_val(wqe, I40IW_BYTE_24, header); + + if (post_sq) + i40iw_qp_post_wr(qp); + + return 0; +} + +/** + * i40iw_post_receive - post receive wqe + * @qp: hw qp ptr + * @info: post rq information + */ +static enum i40iw_status_code i40iw_post_receive(struct i40iw_qp_uk *qp, + struct i40iw_post_rq_info *info) +{ + u64 *wqe; + u64 header; + u32 total_size = 0, wqe_idx, i, byte_off; + + if (qp->max_rq_frag_cnt < info->num_sges) + return I40IW_ERR_INVALID_FRAG_COUNT; + for (i = 0; i < info->num_sges; i++) + total_size += info->sg_list[i].len; + wqe = i40iw_qp_get_next_recv_wqe(qp, &wqe_idx); + if (!wqe) + return I40IW_ERR_QP_TOOMANY_WRS_POSTED; + + qp->rq_wrid_array[wqe_idx] = info->wr_id; + set_64bit_val(wqe, I40IW_BYTE_16, 0); + + header = LS_64((info->num_sges > 1 ? 
(info->num_sges - 1) : 0), + I40IWQPSQ_ADDFRAGCNT) | + LS_64(qp->rwqe_polarity, I40IWQPSQ_VALID); + + i40iw_set_fragment(wqe, I40IW_BYTE_0, info->sg_list); + + for (i = 1, byte_off = I40IW_BYTE_32; i < info->num_sges; i++) { + i40iw_set_fragment(wqe, byte_off, &info->sg_list[i]); + byte_off += 16; + } + + udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */ + + set_64bit_val(wqe, I40IW_BYTE_24, header); + + return 0; +} + +/** + * i40iw_cq_request_notification - cq notification request (door bell) + * @cq: hw cq + * @cq_notify: notification type + */ +static void i40iw_cq_request_notification(struct i40iw_cq_uk *cq, + enum i40iw_completion_notify cq_notify) +{ + u64 temp_val; + u16 sw_cq_sel; + u8 arm_next_se = 0; + u8 arm_next = 0; + u8 arm_seq_num; + + get_64bit_val(cq->shadow_area, I40IW_BYTE_32, &temp_val); + arm_seq_num = (u8)RS_64(temp_val, I40IW_CQ_DBSA_ARM_SEQ_NUM); + arm_seq_num++; + + sw_cq_sel = (u16)RS_64(temp_val, I40IW_CQ_DBSA_SW_CQ_SELECT); + arm_next_se = (u8)RS_64(temp_val, I40IW_CQ_DBSA_ARM_NEXT_SE); + arm_next_se |= 1; + if (cq_notify == IW_CQ_COMPL_EVENT) + arm_next = 1; + temp_val = LS_64(arm_seq_num, I40IW_CQ_DBSA_ARM_SEQ_NUM) | + LS_64(sw_cq_sel, I40IW_CQ_DBSA_SW_CQ_SELECT) | + LS_64(arm_next_se, I40IW_CQ_DBSA_ARM_NEXT_SE) | + LS_64(arm_next, I40IW_CQ_DBSA_ARM_NEXT); + + set_64bit_val(cq->shadow_area, I40IW_BYTE_32, temp_val); + + udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */ + + db_wr32(cq->cq_id, cq->cqe_alloc_reg); +} + +/** + * i40iw_cq_post_entries - update tail in shadow memory + * @cq: hw cq + * @count: # of entries processed + */ +static enum i40iw_status_code i40iw_cq_post_entries(struct i40iw_cq_uk *cq, + u8 count) +{ + I40IW_RING_MOVE_TAIL_BY_COUNT(cq->cq_ring, count); + set_64bit_val(cq->shadow_area, I40IW_BYTE_0, + I40IW_RING_GETCURRENT_HEAD(cq->cq_ring)); + return 0; +} + +/** + * i40iw_cq_poll_completion - get cq completion info + * @cq: hw cq + * @info: cq poll information returned + * @post_cq: update cq tail + */ +static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq, + struct i40iw_cq_poll_info *info) +{ + u64 comp_ctx, qword0, qword2, qword3, wqe_qword; + u64 *cqe, *sw_wqe; + struct i40iw_qp_uk *qp; + struct i40iw_ring *pring = NULL; + u32 wqe_idx, q_type, array_idx = 0; + enum i40iw_status_code ret_code = 0; + bool move_cq_head = true; + u8 polarity; + u8 addl_wqes = 0; + + if (cq->avoid_mem_cflct) + cqe = (u64 *)I40IW_GET_CURRENT_EXTENDED_CQ_ELEMENT(cq); + else + cqe = (u64 *)I40IW_GET_CURRENT_CQ_ELEMENT(cq); + + get_64bit_val(cqe, I40IW_BYTE_24, &qword3); + polarity = (u8)RS_64(qword3, I40IW_CQ_VALID); + + if (polarity != cq->polarity) + return I40IW_ERR_QUEUE_EMPTY; + + udma_from_device_barrier(); + + q_type = (u8)RS_64(qword3, I40IW_CQ_SQ); + info->error = (bool)RS_64(qword3, I40IW_CQ_ERROR); + info->push_dropped = (bool)RS_64(qword3, I40IWCQ_PSHDROP); + if (info->error) { + info->comp_status = I40IW_COMPL_STATUS_FLUSHED; + info->major_err = (bool)RS_64(qword3, I40IW_CQ_MAJERR); + info->minor_err = (bool)RS_64(qword3, I40IW_CQ_MINERR); + } else { + info->comp_status = I40IW_COMPL_STATUS_SUCCESS; + } + + get_64bit_val(cqe, I40IW_BYTE_0, &qword0); + get_64bit_val(cqe, I40IW_BYTE_16, &qword2); + + info->tcp_seq_num = (u8)RS_64(qword0, I40IWCQ_TCPSEQNUM); + + info->qp_id = (u32)RS_64(qword2, I40IWCQ_QPID); + + get_64bit_val(cqe, I40IW_BYTE_8, &comp_ctx); + + info->solicited_event = (bool)RS_64(qword3, I40IWCQ_SOEVENT); + info->is_srq = (bool)RS_64(qword3, 
I40IWCQ_SRQ);
+
+	qp = (struct i40iw_qp_uk *)(i40iw_uintptr)comp_ctx;
+	if (!qp) {
+		ret_code = I40IW_ERR_QUEUE_DESTROYED;
+		goto exit;
+	}
+	wqe_idx = (u32)RS_64(qword3, I40IW_CQ_WQEIDX);
+	info->qp_handle = (i40iw_qp_handle)(i40iw_uintptr)qp;
+
+	if (q_type == I40IW_CQE_QTYPE_RQ) {
+		array_idx = (wqe_idx * 4) / qp->rq_wqe_size_multiplier;
+		if (info->comp_status == I40IW_COMPL_STATUS_FLUSHED) {
+			info->wr_id = qp->rq_wrid_array[qp->rq_ring.tail];
+			array_idx = qp->rq_ring.tail;
+		} else {
+			info->wr_id = qp->rq_wrid_array[array_idx];
+		}
+
+		info->op_type = I40IW_OP_TYPE_REC;
+		if (qword3 & I40IWCQ_STAG_MASK) {
+			info->stag_invalid_set = true;
+			info->inv_stag = (u32)RS_64(qword2, I40IWCQ_INVSTAG);
+		} else {
+			info->stag_invalid_set = false;
+		}
+		info->bytes_xfered = (u32)RS_64(qword0, I40IWCQ_PAYLDLEN);
+		I40IW_RING_SET_TAIL(qp->rq_ring, array_idx + 1);
+		pring = &qp->rq_ring;
+	} else {
+		if (qp->first_sq_wq) {
+			qp->first_sq_wq = false;
+			if (!wqe_idx && (qp->sq_ring.head == qp->sq_ring.tail)) {
+				I40IW_RING_MOVE_HEAD_NOCHECK(cq->cq_ring);
+				I40IW_RING_MOVE_TAIL(cq->cq_ring);
+				set_64bit_val(cq->shadow_area, I40IW_BYTE_0,
+					      I40IW_RING_GETCURRENT_HEAD(cq->cq_ring));
+				memset(info, 0, sizeof(struct i40iw_cq_poll_info));
+				return i40iw_cq_poll_completion(cq, info);
+			}
+		}
+
+		if (info->comp_status != I40IW_COMPL_STATUS_FLUSHED) {
+			info->wr_id = qp->sq_wrtrk_array[wqe_idx].wrid;
+			info->bytes_xfered = qp->sq_wrtrk_array[wqe_idx].wr_len;
+
+			info->op_type = (u8)RS_64(qword3, I40IWCQ_OP);
+			sw_wqe = qp->sq_base[wqe_idx].elem;
+			get_64bit_val(sw_wqe, I40IW_BYTE_24, &wqe_qword);
+			addl_wqes = qp->sq_wrtrk_array[wqe_idx].wqe_size/I40IW_QP_WQE_MIN_SIZE;
+			I40IW_RING_SET_TAIL(qp->sq_ring, (wqe_idx + addl_wqes));
+		} else {
+			do {
+				u8 op_type;
+				u32 tail;
+
+				tail = qp->sq_ring.tail;
+				sw_wqe = qp->sq_base[tail].elem;
+				get_64bit_val(sw_wqe, I40IW_BYTE_24, &wqe_qword);
+				op_type = (u8)RS_64(wqe_qword, I40IWQPSQ_OPCODE);
+				info->op_type = op_type;
+				addl_wqes = qp->sq_wrtrk_array[tail].wqe_size/I40IW_QP_WQE_MIN_SIZE;
+				I40IW_RING_SET_TAIL(qp->sq_ring, (tail + addl_wqes));
+				if (op_type != I40IWQP_OP_NOP) {
+					info->wr_id = qp->sq_wrtrk_array[tail].wrid;
+					info->bytes_xfered = qp->sq_wrtrk_array[tail].wr_len;
+					break;
+				}
+			} while (1);
+		}
+		pring = &qp->sq_ring;
+	}
+
+	ret_code = 0;
+
+exit:
+	if (!ret_code &&
+	    (info->comp_status == I40IW_COMPL_STATUS_FLUSHED))
+		if (pring && (I40IW_RING_MORE_WORK(*pring)))
+			move_cq_head = false;
+
+	if (move_cq_head) {
+		I40IW_RING_MOVE_HEAD_NOCHECK(cq->cq_ring);
+
+		if (I40IW_RING_GETCURRENT_HEAD(cq->cq_ring) == 0)
+			cq->polarity ^= 1;
+
+		I40IW_RING_MOVE_TAIL(cq->cq_ring);
+		set_64bit_val(cq->shadow_area, I40IW_BYTE_0,
+			      I40IW_RING_GETCURRENT_HEAD(cq->cq_ring));
+	} else {
+		if (info->is_srq)
+			return ret_code;
+		qword3 &= ~I40IW_CQ_WQEIDX_MASK;
+		qword3 |= LS_64(pring->tail, I40IW_CQ_WQEIDX);
+		set_64bit_val(cqe, I40IW_BYTE_24, qword3);
+	}
+
+	return ret_code;
+}
+
+/**
+ * i40iw_qp_round_up - return rounded-up QP WQ depth
+ * @wqdepth: WQ depth in quantas to round up
+ */
+static int i40iw_qp_round_up(u32 wqdepth)
+{
+	int scount = 1;
+
+	for (wqdepth--; scount <= 16; scount *= 2)
+		wqdepth |= wqdepth >> scount;
+
+	return ++wqdepth;
+}
+
+/**
+ * i40iw_get_wqe_shift - get shift count for maximum wqe size
+ * @sge: maximum number of scatter gather elements in a wqe
+ * @inline_data: maximum inline data size
+ * @shift: returns the shift needed based on sge
+ *
+ * Shift can be used to left shift the wqe size based on number of SGEs and inline data size.
+ * For 1 SGE or inline data <= 16, shift = 0 (wqe size of 32 bytes).
+ * For 2 or 3 SGEs or inline data <= 48, shift = 1 (wqe size of 64 bytes).
+ * Shift of 2 otherwise (wqe size of 128 bytes).
+ */
+void i40iw_get_wqe_shift(u32 sge, u32 inline_data, u8 *shift)
+{
+	*shift = 0;
+	if (sge > 1 || inline_data > 16)
+		*shift = (sge < 4 && inline_data <= 48) ? 1 : 2;
+}
+
+/*
+ * i40iw_get_sqdepth - get SQ depth (quantas)
+ * @sq_size: SQ size
+ * @shift: shift which determines size of WQE
+ * @sqdepth: depth of SQ
+ *
+ */
+enum i40iw_status_code i40iw_get_sqdepth(u32 sq_size, u8 shift, u32 *sqdepth)
+{
+	*sqdepth = i40iw_qp_round_up((sq_size << shift) + I40IW_SQ_RSVD);
+
+	if (*sqdepth < (I40IW_QP_SW_MIN_WQSIZE << shift))
+		*sqdepth = I40IW_QP_SW_MIN_WQSIZE << shift;
+	else if (*sqdepth > I40IW_QP_SW_MAX_SQ_QUANTAS)
+		return I40IW_ERR_INVALID_SIZE;
+
+	return 0;
+}
+
+/*
+ * i40iw_get_rqdepth - get RQ depth (quantas)
+ * @rq_size: RQ size
+ * @shift: shift which determines size of WQE
+ * @rqdepth: depth of RQ
+ *
+ */
+enum i40iw_status_code i40iw_get_rqdepth(u32 rq_size, u8 shift, u32 *rqdepth)
+{
+	*rqdepth = i40iw_qp_round_up((rq_size << shift) + I40IW_RQ_RSVD);
+
+	if (*rqdepth < (I40IW_QP_SW_MIN_WQSIZE << shift))
+		*rqdepth = I40IW_QP_SW_MIN_WQSIZE << shift;
+	else if (*rqdepth > I40IW_QP_SW_MAX_RQ_QUANTAS)
+		return I40IW_ERR_INVALID_SIZE;
+
+	return 0;
+}
+
+static struct i40iw_qp_uk_ops iw_qp_uk_ops = {
+	i40iw_qp_post_wr,
+	i40iw_qp_ring_push_db,
+	i40iw_rdma_write,
+	i40iw_rdma_read,
+	i40iw_send,
+	i40iw_inline_rdma_write,
+	i40iw_inline_send,
+	i40iw_stag_local_invalidate,
+	i40iw_mw_bind,
+	i40iw_post_receive,
+	i40iw_nop
+};
+
+static struct i40iw_cq_ops iw_cq_ops = {
+	i40iw_cq_request_notification,
+	i40iw_cq_poll_completion,
+	i40iw_cq_post_entries,
+	i40iw_clean_cq
+};
+
+static struct i40iw_device_uk_ops iw_device_uk_ops = {
+	i40iw_cq_uk_init,
+	i40iw_qp_uk_init,
+};
+
+/**
+ * i40iw_qp_uk_init - initialize shared qp
+ * @qp: hw qp (user and kernel)
+ * @info: qp initialization info
+ *
+ * initializes the vars used in both user and kernel mode.
+ * size of the wqe depends on the maximum number of fragments
+ * allowed. Then size of wqe * the number of wqes should be the
+ * amount of memory allocated for sq and rq. If srq is used,
+ * then rq_base will point to one rq wqe only (not the whole
+ * array of wqes)
+ */
+enum i40iw_status_code i40iw_qp_uk_init(struct i40iw_qp_uk *qp,
+					struct i40iw_qp_uk_init_info *info)
+{
+	enum i40iw_status_code ret_code = 0;
+	u32 sq_ring_size;
+	u8 sqshift, rqshift;
+
+	if (info->max_sq_frag_cnt > I40IW_MAX_WQ_FRAGMENT_COUNT)
+		return I40IW_ERR_INVALID_FRAG_COUNT;
+
+	if (info->max_rq_frag_cnt > I40IW_MAX_WQ_FRAGMENT_COUNT)
+		return I40IW_ERR_INVALID_FRAG_COUNT;
+	i40iw_get_wqe_shift(info->max_sq_frag_cnt, info->max_inline_data, &sqshift);
+
+	qp->sq_base = info->sq;
+	qp->rq_base = info->rq;
+	qp->shadow_area = info->shadow_area;
+	qp->sq_wrtrk_array = info->sq_wrtrk_array;
+	qp->rq_wrid_array = info->rq_wrid_array;
+
+	qp->wqe_alloc_reg = info->wqe_alloc_reg;
+	qp->qp_id = info->qp_id;
+
+	qp->sq_size = info->sq_size;
+	qp->push_db = info->push_db;
+	qp->push_wqe = info->push_wqe;
+
+	qp->max_sq_frag_cnt = info->max_sq_frag_cnt;
+	sq_ring_size = qp->sq_size << sqshift;
+
+	I40IW_RING_INIT(qp->sq_ring, sq_ring_size);
+	I40IW_RING_INIT(qp->initial_ring, sq_ring_size);
+	I40IW_RING_MOVE_HEAD(qp->sq_ring, ret_code);
+	I40IW_RING_MOVE_TAIL(qp->sq_ring);
+	I40IW_RING_MOVE_HEAD(qp->initial_ring, ret_code);
+	qp->swqe_polarity = 1;
+	qp->first_sq_wq = true;
+	qp->swqe_polarity_deferred = 1;
+	qp->rwqe_polarity = 0;
+
+	if (!qp->use_srq) {
+		qp->rq_size = info->rq_size;
+		qp->max_rq_frag_cnt = info->max_rq_frag_cnt;
+		I40IW_RING_INIT(qp->rq_ring, qp->rq_size);
+		switch (info->abi_ver) {
+		case 4:
+			i40iw_get_wqe_shift(info->max_rq_frag_cnt, 0, &rqshift);
+			break;
+		case 5: /* fallthrough until next ABI version */
+		default:
+			rqshift = I40IW_MAX_RQ_WQE_SHIFT;
+			break;
+		}
+		qp->rq_wqe_size = rqshift;
+		qp->rq_wqe_size_multiplier = 4 << rqshift;
+	}
+	qp->ops = iw_qp_uk_ops;
+
+	return ret_code;
+}
+
+/**
+ * i40iw_cq_uk_init - initialize shared cq (user and kernel)
+ * @cq: hw cq
+ * @info: hw cq initialization info
+ */
+enum i40iw_status_code i40iw_cq_uk_init(struct i40iw_cq_uk *cq,
+					struct i40iw_cq_uk_init_info *info)
+{
+	if ((info->cq_size < I40IW_MIN_CQ_SIZE) ||
+	    (info->cq_size > I40IW_MAX_CQ_SIZE))
+		return I40IW_ERR_INVALID_SIZE;
+	cq->cq_base = (struct i40iw_cqe *)info->cq_base;
+	cq->cq_id = info->cq_id;
+	cq->cq_size = info->cq_size;
+	cq->cqe_alloc_reg = info->cqe_alloc_reg;
+	cq->shadow_area = info->shadow_area;
+	cq->avoid_mem_cflct = info->avoid_mem_cflct;
+
+	I40IW_RING_INIT(cq->cq_ring, cq->cq_size);
+	cq->polarity = 1;
+	cq->ops = iw_cq_ops;
+
+	return 0;
+}
+
+/**
+ * i40iw_device_init_uk - setup routines for iwarp shared device
+ * @dev: iwarp shared (user and kernel)
+ */
+void i40iw_device_init_uk(struct i40iw_dev_uk *dev)
+{
+	dev->ops_uk = iw_device_uk_ops;
+}
+
+/**
+ * i40iw_clean_cq - clean cq entries
+ * @queue: completion queue context
+ * @cq: cq to clean
+ */
+void i40iw_clean_cq(void *queue, struct i40iw_cq_uk *cq)
+{
+	u64 *cqe;
+	u64 qword3, comp_ctx;
+	u32 cq_head;
+	u8 polarity, temp;
+
+	cq_head = cq->cq_ring.head;
+	temp = cq->polarity;
+	do {
+		if (cq->avoid_mem_cflct)
+			cqe = (u64 *)&(((struct i40iw_extended_cqe *)cq->cq_base)[cq_head]);
+		else
+			cqe = (u64 *)&cq->cq_base[cq_head];
+		get_64bit_val(cqe, I40IW_BYTE_24, &qword3);
+		polarity = (u8)RS_64(qword3, I40IW_CQ_VALID);
+
+		if (polarity != temp)
+			break;
+
+		get_64bit_val(cqe, I40IW_BYTE_8, &comp_ctx);
+		if ((void *)(i40iw_uintptr)comp_ctx == queue)
+			set_64bit_val(cqe, I40IW_BYTE_8, 0);
+
+		cq_head = (cq_head + 1) % cq->cq_ring.size;
+		if (!cq_head)
+			temp ^= 1;
+	} while (true);
+}
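
For orientation, the sizing rules described in the i40iw_qp_uk_init() comment above are easier to follow with concrete numbers. The sketch below is illustrative only and not part of this patch: the wrapper function and its argument values (128 WRs, 3 SGEs, 48 bytes of inline data) are hypothetical, while i40iw_get_wqe_shift() and i40iw_get_sqdepth() are the helpers defined earlier in this file.

/* Illustrative sketch only -- not driver code from this patch. */
static enum i40iw_status_code example_size_sq(u32 *sqdepth_quantas)
{
	u8 shift;

	/* 3 SGEs with up to 48 bytes of inline data selects shift = 1,
	 * i.e. 64-byte WQEs (two 32-byte quantas per WQE)
	 */
	i40iw_get_wqe_shift(3, 48, &shift);

	/* (128 << 1) + I40IW_SQ_RSVD is rounded up to the next power of
	 * two, raised to the minimum ring size if needed, and rejected
	 * with I40IW_ERR_INVALID_SIZE above the maximum
	 */
	return i40iw_get_sqdepth(128, shift, sqdepth_quantas);
}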
+ +/** + * i40iw_nop - send a nop + * @qp: hw qp ptr + * @wr_id: work request id + * @signaled: flag if signaled for completion + * @post_sq: flag to post sq + */ +enum i40iw_status_code i40iw_nop(struct i40iw_qp_uk *qp, + u64 wr_id, + bool signaled, + bool post_sq) +{ + u64 header, *wqe; + u32 wqe_idx; + + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE, + 0,wr_id); + if (!wqe) + return I40IW_ERR_QP_TOOMANY_WRS_POSTED; + set_64bit_val(wqe, I40IW_BYTE_0, 0); + set_64bit_val(wqe, I40IW_BYTE_8, 0); + set_64bit_val(wqe, I40IW_BYTE_16, 0); + + header = LS_64(I40IWQP_OP_NOP, I40IWQPSQ_OPCODE) | + LS_64(signaled, I40IWQPSQ_SIGCOMPL) | + LS_64(qp->swqe_polarity, I40IWQPSQ_VALID); + + udma_to_device_barrier(); /* make sure WQE is populated before valid bit is set */ + + set_64bit_val(wqe, I40IW_BYTE_24, header); + if (post_sq) + i40iw_qp_post_wr(qp); + + return 0; +} + +/** + * i40iw_fragcnt_to_wqesize_sq - calculate wqe size based on fragment count for SQ + * @frag_cnt: number of fragments + * @wqe_size: size of sq wqe returned + */ +enum i40iw_status_code i40iw_fragcnt_to_wqesize_sq(u32 frag_cnt, u8 *wqe_size) +{ + switch (frag_cnt) { + case 0: + case 1: + *wqe_size = I40IW_QP_WQE_MIN_SIZE; + break; + case 2: + case 3: + *wqe_size = 64; + break; + case 4: + case 5: + *wqe_size = 96; + break; + case 6: + case 7: + *wqe_size = 128; + break; + default: + return I40IW_ERR_INVALID_FRAG_COUNT; + } + + return 0; +} + +/** + * i40iw_fragcnt_to_wqesize_rq - calculate wqe size based on fragment count for RQ + * @frag_cnt: number of fragments + * @wqe_size: size of rq wqe returned + */ +enum i40iw_status_code i40iw_fragcnt_to_wqesize_rq(u32 frag_cnt, u8 *wqe_size) +{ + switch (frag_cnt) { + case 0: + case 1: + *wqe_size = 32; + break; + case 2: + case 3: + *wqe_size = 64; + break; + case 4: + case 5: + case 6: + case 7: + *wqe_size = 128; + break; + default: + return I40IW_ERR_INVALID_FRAG_COUNT; + } + + return 0; +} + +/** + * i40iw_inline_data_size_to_wqesize - based on inline data, wqe size + * @data_size: data size for inline + * @wqe_size: size of sq wqe returned + */ +enum i40iw_status_code i40iw_inline_data_size_to_wqesize(u32 data_size, + u8 *wqe_size) +{ + if (data_size > I40IW_MAX_INLINE_DATA_SIZE) + return I40IW_ERR_INVALID_IMM_DATA_SIZE; + + if (data_size <= 16) + *wqe_size = I40IW_QP_WQE_MIN_SIZE; + else + *wqe_size = 64; + + return 0; +} diff --git a/providers/i40iw/i40iw_umain.c b/providers/i40iw/i40iw_umain.c new file mode 100644 index 0000000..eef8cd5 --- /dev/null +++ b/providers/i40iw/i40iw_umain.c @@ -0,0 +1,226 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. 
+* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +* +*******************************************************************************/ + +#include <config.h> + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <sys/mman.h> +#include <pthread.h> + +#include "i40e_devids.h" +#include "i40iw_umain.h" +#include "i40iw-abi.h" + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + +static void i40iw_ufree_context(struct ibv_context *ibctx); + +#define INTEL_HCA(v, d) VERBS_PCI_MATCH(v, d, NULL) +static const struct verbs_match_ent hca_table[] = { + VERBS_DRIVER_ID(RDMA_DRIVER_I40IW), +#ifdef I40E_DEV_ID_X722_A0 + INTEL_HCA(I40E_INTEL_VENDOR_ID, I40E_DEV_ID_X722_A0), +#endif +#ifdef I40E_DEV_ID_X722_A0_VF + INTEL_HCA(I40E_INTEL_VENDOR_ID, I40E_DEV_ID_X722_A0_VF), +#endif +#ifdef I40E_DEV_ID_KX_X722 + INTEL_HCA(I40E_INTEL_VENDOR_ID, I40E_DEV_ID_KX_X722), +#endif +#ifdef I40E_DEV_ID_QSFP_X722 + INTEL_HCA(I40E_INTEL_VENDOR_ID, I40E_DEV_ID_QSFP_X722), +#endif +#ifdef I40E_DEV_ID_SFP_X722 + INTEL_HCA(I40E_INTEL_VENDOR_ID, I40E_DEV_ID_SFP_X722), +#endif +#ifdef I40E_DEV_ID_1G_BASE_T_X722 + INTEL_HCA(I40E_INTEL_VENDOR_ID, I40E_DEV_ID_1G_BASE_T_X722), +#endif +#ifdef I40E_DEV_ID_10G_BASE_T_X722 + INTEL_HCA(I40E_INTEL_VENDOR_ID, I40E_DEV_ID_10G_BASE_T_X722), +#endif +#ifdef I40E_DEV_ID_SFP_I_X722 + INTEL_HCA(I40E_INTEL_VENDOR_ID, I40E_DEV_ID_SFP_I_X722), +#endif +#ifdef I40E_DEV_ID_X722_VF + INTEL_HCA(I40E_INTEL_VENDOR_ID, I40E_DEV_ID_X722_VF), +#endif +#ifdef I40E_DEV_ID_X722_VF_HV + INTEL_HCA(I40E_INTEL_VENDOR_ID, I40E_DEV_ID_X722_VF_HV), +#endif +#ifdef I40E_DEV_ID_X722_FPGA + INTEL_HCA(I40E_INTEL_VENDOR_ID, I40E_DEV_ID_X722_FPGA), +#endif +#ifdef I40E_DEV_ID_X722_FPGA_VF + INTEL_HCA(I40E_INTEL_VENDOR_ID, I40E_DEV_ID_X722_FPGA_VF), +#endif + {} +}; + +static const struct verbs_context_ops i40iw_uctx_ops = { + .query_device = i40iw_uquery_device, + .query_port = i40iw_uquery_port, + .alloc_pd = i40iw_ualloc_pd, + .dealloc_pd = i40iw_ufree_pd, + .reg_mr = i40iw_ureg_mr, + .dereg_mr = i40iw_udereg_mr, + .create_cq = i40iw_ucreate_cq, + .poll_cq = i40iw_upoll_cq, + .req_notify_cq = i40iw_uarm_cq, + .cq_event = i40iw_cq_event, + .destroy_cq = i40iw_udestroy_cq, + .create_qp = i40iw_ucreate_qp, + .query_qp = i40iw_uquery_qp, + .modify_qp = i40iw_umodify_qp, + .destroy_qp = i40iw_udestroy_qp, + .post_send = i40iw_upost_send, + .post_recv = i40iw_upost_recv, + .async_event = i40iw_async_event, + .free_context = i40iw_ufree_context, +}; + +/** + * i40iw_ualloc_context - allocate context for user app + * @ibdev: pointer to device created during i40iw_driver_init + * @cmd_fd: save fd for the device + * + * Returns callback routines table and calls driver for allocating + * context and getting back resource information to return as ibv_context. 
+ */ + +static struct verbs_context *i40iw_ualloc_context(struct ibv_device *ibdev, + int cmd_fd, + void *private_data) +{ + struct ibv_pd *ibv_pd; + struct i40iw_uvcontext *iwvctx; + struct i40iw_get_context cmd; + struct i40iw_get_context_resp resp; + + iwvctx = verbs_init_and_alloc_context(ibdev, cmd_fd, iwvctx, ibv_ctx, + RDMA_DRIVER_I40IW); + if (!iwvctx) + return NULL; + + cmd.userspace_ver = I40IW_ABI_VER; + memset(&resp, 0, sizeof(resp)); + if (ibv_cmd_get_context(&iwvctx->ibv_ctx, (struct ibv_get_context *)&cmd, + sizeof(cmd), &resp.ibv_resp, sizeof(resp))) { + + cmd.userspace_ver = 4; + if (ibv_cmd_get_context(&iwvctx->ibv_ctx, (struct ibv_get_context *)&cmd, + sizeof(cmd), &resp.ibv_resp, sizeof(resp))) + goto err_free; + + } + + if (resp.kernel_ver > I40IW_ABI_VER) { + fprintf(stderr, PFX "%s: incompatible kernel driver version: %d. Need version %d\n", + __func__, resp.kernel_ver, I40IW_ABI_VER); + goto err_free; + } + + verbs_set_ops(&iwvctx->ibv_ctx, &i40iw_uctx_ops); + iwvctx->max_pds = resp.max_pds; + iwvctx->max_qps = resp.max_qps; + iwvctx->wq_size = resp.wq_size; + iwvctx->abi_ver = resp.kernel_ver; + + i40iw_device_init_uk(&iwvctx->dev); + ibv_pd = i40iw_ualloc_pd(&iwvctx->ibv_ctx.context); + if (!ibv_pd) + goto err_free; + ibv_pd->context = &iwvctx->ibv_ctx.context; + iwvctx->iwupd = to_i40iw_upd(ibv_pd); + + return &iwvctx->ibv_ctx; + +err_free: + fprintf(stderr, PFX "%s: failed to allocate context for device.\n", __func__); + verbs_uninit_context(&iwvctx->ibv_ctx); + free(iwvctx); + + return NULL; +} + +/** + * i40iw_ufree_context - free context that was allocated + * @ibctx: context allocated ptr + */ +static void i40iw_ufree_context(struct ibv_context *ibctx) +{ + struct i40iw_uvcontext *iwvctx = to_i40iw_uctx(ibctx); + + i40iw_ufree_pd(&iwvctx->iwupd->ibv_pd); + + verbs_uninit_context(&iwvctx->ibv_ctx); + free(iwvctx); +} + +static void i40iw_uninit_device(struct verbs_device *verbs_device) +{ + struct i40iw_udevice *dev = to_i40iw_udev(&verbs_device->device); + + free(dev); +} + +static struct verbs_device * +i40iw_device_alloc(struct verbs_sysfs_dev *sysfs_dev) +{ + struct i40iw_udevice *dev; + + dev = calloc(1, sizeof(*dev)); + if (!dev) + return NULL; + + dev->page_size = I40IW_HW_PAGE_SIZE; + return &dev->ibv_dev; +} + +static const struct verbs_device_ops i40iw_udev_ops = { + .name = "i40iw", + .match_min_abi_version = 0, + .match_max_abi_version = INT_MAX, + .match_table = hca_table, + .alloc_device = i40iw_device_alloc, + .uninit_device = i40iw_uninit_device, + .alloc_context = i40iw_ualloc_context, +}; +PROVIDER_DRIVER(i40iw, i40iw_udev_ops); diff --git a/providers/i40iw/i40iw_umain.h b/providers/i40iw/i40iw_umain.h new file mode 100644 index 0000000..10385df --- /dev/null +++ b/providers/i40iw/i40iw_umain.h @@ -0,0 +1,179 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. 
You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +* +*******************************************************************************/ + +#ifndef I40IW_UMAIN_H +#define I40IW_UMAIN_H + +#include <inttypes.h> +#include <stddef.h> +#include <endian.h> +#include <util/compiler.h> + +#include <infiniband/driver.h> +#include <util/udma_barrier.h> + +#include "i40iw_osdep.h" +#include "i40iw_d.h" +#include "i40iw_status.h" +#include "i40iw_user.h" + +#define PFX "libi40iw-" + +#define I40IW_BASE_PUSH_PAGE 1 +#define I40IW_U_MINCQ_SIZE 4 + +#define I40IW_WC_WITH_VLAN (1 << 3) +#define I40IW_UD_RX_BATCH_SZ 64 +#define I40IW_UD_MAX_SG_LIST_SZ 1 +#define I40IW_CQ_BUF_OV_ERR 0x3 + +#define MAX_WQ_DEPTH 16384 +#define MIN_WQ_DEPTH 4 + +#define I40E_DB_SHADOW_AREA_SIZE 64 +#define I40E_DB_CQ_OFFSET 0x40 + +struct i40iw_udevice { + struct verbs_device ibv_dev; + int page_size; +}; + +struct i40iw_upd { + struct ibv_pd ibv_pd; + void volatile *db; + void volatile *arm_cq_page; + void volatile *arm_cq; + uint32_t pd_id; +}; + +struct i40iw_uvcontext { + struct verbs_context ibv_ctx; + struct i40iw_upd *iwupd; + uint32_t max_pds; /* maximum pds allowed for this user process */ + uint32_t max_qps; /* maximum qps allowed for this user process */ + uint32_t wq_size; /* size of the WQs (sq+rq) + shadow allocated to the mmaped area */ + struct i40iw_dev_uk dev; + int abi_ver; +}; + +struct i40iw_uqp; + +struct i40iw_ucq { + struct ibv_cq ibv_cq; + struct verbs_mr vmr; + struct ibv_mr mr_shadow_area; + pthread_spinlock_t lock; + uint8_t is_armed; + uint8_t skip_arm; + int arm_sol; + int skip_sol; + int comp_vector; + struct i40iw_uqp *udqp; + struct i40iw_cq_uk cq; +}; + +struct i40iw_uqp { + struct ibv_qp ibv_qp; + struct i40iw_ucq *send_cq; + struct i40iw_ucq *recv_cq; + struct verbs_mr vmr; + uint32_t i40iw_drv_opt; + pthread_spinlock_t lock; + u32 *push_db; /* mapped as uncached memory*/ + u64 *push_wqe; /* mapped as write combined memory*/ + uint16_t sq_sig_all; + uint16_t qperr; + uint16_t rsvd; + uint32_t pending_rcvs; + uint32_t wq_size; + struct ibv_recv_wr *pend_rx_wr; + struct i40iw_qp_uk qp; + +}; + +#define to_i40iw_uxxx(xxx, type) \ + container_of(ib##xxx, struct i40iw_u##type, ibv_##xxx) + +static inline struct i40iw_udevice *to_i40iw_udev(struct ibv_device *ibdev) +{ + return container_of(ibdev, struct i40iw_udevice, ibv_dev.device); +} + 
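
The to_i40iw_uxxx() macro above and the inline conversion helpers around it all follow the same container_of pattern: starting from a pointer to an embedded ibv_* member, they recover the enclosing provider structure. The spelled-out form below is for illustration only; the function name is hypothetical, while to_i40iw_ucq() is the real helper defined just below.

/* What to_i40iw_ucq(ibcq) amounts to, spelled out: */
static inline struct i40iw_ucq *example_to_ucq(struct ibv_cq *ibcq)
{
	/* step back by offsetof(struct i40iw_ucq, ibv_cq) */
	return container_of(ibcq, struct i40iw_ucq, ibv_cq);
}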
+static inline struct i40iw_uvcontext *to_i40iw_uctx(struct ibv_context *ibctx) +{ + return container_of(ibctx, struct i40iw_uvcontext, ibv_ctx.context); +} + +static inline struct i40iw_upd *to_i40iw_upd(struct ibv_pd *ibpd) +{ + return to_i40iw_uxxx(pd, pd); +} + +static inline struct i40iw_ucq *to_i40iw_ucq(struct ibv_cq *ibcq) +{ + return to_i40iw_uxxx(cq, cq); +} + +static inline struct i40iw_uqp *to_i40iw_uqp(struct ibv_qp *ibqp) +{ + return to_i40iw_uxxx(qp, qp); +} + +/* i40iw_uverbs.c */ +int i40iw_uquery_device(struct ibv_context *, struct ibv_device_attr *); +int i40iw_uquery_port(struct ibv_context *, uint8_t, struct ibv_port_attr *); +struct ibv_pd *i40iw_ualloc_pd(struct ibv_context *); +int i40iw_ufree_pd(struct ibv_pd *); +struct ibv_mr *i40iw_ureg_mr(struct ibv_pd *pd, void *addr, size_t length, + uint64_t hca_va, int access); +int i40iw_udereg_mr(struct verbs_mr *vmr); +struct ibv_cq *i40iw_ucreate_cq(struct ibv_context *, int, struct ibv_comp_channel *, int); +int i40iw_udestroy_cq(struct ibv_cq *); +int i40iw_upoll_cq(struct ibv_cq *, int, struct ibv_wc *); +int i40iw_uarm_cq(struct ibv_cq *, int); +void i40iw_cq_event(struct ibv_cq *); +struct ibv_srq *i40iw_ucreate_srq(struct ibv_pd *, struct ibv_srq_init_attr *); +int i40iw_umodify_srq(struct ibv_srq *, struct ibv_srq_attr *, int); +int i40iw_udestroy_srq(struct ibv_srq *); +int i40iw_upost_srq_recv(struct ibv_srq *, struct ibv_recv_wr *, struct ibv_recv_wr **); +struct ibv_qp *i40iw_ucreate_qp(struct ibv_pd *, struct ibv_qp_init_attr *); +int i40iw_uquery_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int, struct ibv_qp_init_attr *init_attr); +int i40iw_umodify_qp(struct ibv_qp *, struct ibv_qp_attr *, int); +int i40iw_udestroy_qp(struct ibv_qp *); +int i40iw_upost_send(struct ibv_qp *, struct ibv_send_wr *, struct ibv_send_wr **); +int i40iw_upost_recv(struct ibv_qp *, struct ibv_recv_wr *, struct ibv_recv_wr **); +void i40iw_async_event(struct ibv_context *context, + struct ibv_async_event *event); + +#endif /* i40iw_umain_H */ diff --git a/providers/i40iw/i40iw_user.h b/providers/i40iw/i40iw_user.h new file mode 100644 index 0000000..921848c --- /dev/null +++ b/providers/i40iw/i40iw_user.h @@ -0,0 +1,456 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*
+*******************************************************************************/
+
+#ifndef I40IW_USER_H
+#define I40IW_USER_H
+
+enum i40iw_device_capabilities_const {
+	I40IW_WQE_SIZE = 4,
+	I40IW_CQP_WQE_SIZE = 8,
+	I40IW_CQE_SIZE = 4,
+	I40IW_EXTENDED_CQE_SIZE = 8,
+	I40IW_AEQE_SIZE = 2,
+	I40IW_CEQE_SIZE = 1,
+	I40IW_CQP_CTX_SIZE = 8,
+	I40IW_SHADOW_AREA_SIZE = 8,
+	I40IW_CEQ_MAX_COUNT = 256,
+	I40IW_QUERY_FPM_BUF_SIZE = 128,
+	I40IW_COMMIT_FPM_BUF_SIZE = 128,
+	I40IW_MIN_IW_QP_ID = 1,
+	I40IW_MAX_IW_QP_ID = 262143,
+	I40IW_MIN_CEQID = 0,
+	I40IW_MAX_CEQID = 256,
+	I40IW_MIN_CQID = 0,
+	I40IW_MAX_CQID = 131071,
+	I40IW_MIN_AEQ_ENTRIES = 1,
+	I40IW_MAX_AEQ_ENTRIES = 524287,
+	I40IW_MIN_CEQ_ENTRIES = 1,
+	I40IW_MAX_CEQ_ENTRIES = 131071,
+	I40IW_MIN_CQ_SIZE = 1,
+	I40IW_MAX_CQ_SIZE = 1048575,
+	I40IW_MAX_AEQ_ALLOCATE_COUNT = 255,
+	I40IW_DB_ID_ZERO = 0,
+	I40IW_MAX_WQ_FRAGMENT_COUNT = 3,
+	I40IW_MAX_SGE_RD = 1,
+	I40IW_MAX_OUTBOUND_MESSAGE_SIZE = 2147483647,
+	I40IW_MAX_INBOUND_MESSAGE_SIZE = 2147483647,
+	I40IW_MAX_PUSH_PAGE_COUNT = 4096,
+	I40IW_MAX_PE_ENABLED_VF_COUNT = 32,
+	I40IW_MAX_VF_FPM_ID = 47,
+	I40IW_MAX_VF_PER_PF = 127,
+	I40IW_MAX_SQ_PAYLOAD_SIZE = 2145386496,
+	I40IW_MAX_INLINE_DATA_SIZE = 48,
+	I40IW_MAX_PUSHMODE_INLINE_DATA_SIZE = 48,
+	I40IW_MAX_IRD_SIZE = 32,
+	I40IW_QPCTX_ENCD_MAXIRD = 3,
+	I40IW_MAX_WQ_ENTRIES = 2048,
+	I40IW_MAX_ORD_SIZE = 32,
+	I40IW_Q2_BUFFER_SIZE = (248 + 100),
+	I40IW_MAX_WQE_SIZE_RQ = 128,
+	I40IW_QP_CTX_SIZE = 248
+};
+
+#define i40iw_handle void *
+#define i40iw_adapter_handle i40iw_handle
+#define i40iw_qp_handle i40iw_handle
+#define i40iw_cq_handle i40iw_handle
+#define i40iw_srq_handle i40iw_handle
+#define i40iw_pd_id i40iw_handle
+#define i40iw_stag_handle i40iw_handle
+#define i40iw_stag_index u32
+#define i40iw_stag u32
+#define i40iw_stag_key u8
+
+#define i40iw_tagged_offset u64
+#define i40iw_access_privileges u32
+#define i40iw_physical_fragment u64
+#define i40iw_address_list u64 *
+
+#define I40IW_CREATE_STAG(index, key) (((index) << 8) + (key))
+
+#define I40IW_STAG_KEY_FROM_STAG(stag) ((stag) & 0x000000FF)
+
+#define I40IW_STAG_INDEX_FROM_STAG(stag) (((stag) & 0xFFFFFF00) >> 8)
+
+#define I40IW_MAX_MR_SIZE 0x10000000000L
+#define I40IW_MAX_RQ_WQE_SHIFT 2
+
+struct i40iw_qp_uk;
+struct i40iw_cq_uk;
+struct i40iw_srq_uk;
+struct i40iw_qp_uk_init_info;
+struct i40iw_cq_uk_init_info;
+struct i40iw_srq_uk_init_info;
+
+struct i40iw_sge {
+	i40iw_tagged_offset tag_off;
+	u32 len;
+	i40iw_stag stag;
+};
+
+#define i40iw_sgl struct i40iw_sge *
+
+struct i40iw_ring {
+	volatile u32 head;
+	volatile u32 tail;
+	u32 size;
+};
+
+struct i40iw_cqe {
+	u64 buf[I40IW_CQE_SIZE];
+};
+
+struct i40iw_extended_cqe {
+	u64 buf[I40IW_EXTENDED_CQE_SIZE];
+};
+
+struct i40iw_wqe {
+	u64 buf[I40IW_WQE_SIZE];
+};
+
+struct i40iw_qp_uk_ops;
+
+enum i40iw_addressing_type {
+	I40IW_ADDR_TYPE_ZERO_BASED = 0,
+	I40IW_ADDR_TYPE_VA_BASED = 1,
+};
+
+#define I40IW_ACCESS_FLAGS_LOCALREAD 0x01
+#define I40IW_ACCESS_FLAGS_LOCALWRITE 0x02
+#define I40IW_ACCESS_FLAGS_REMOTEREAD_ONLY 0x04
+#define I40IW_ACCESS_FLAGS_REMOTEREAD 0x05
+#define I40IW_ACCESS_FLAGS_REMOTEWRITE_ONLY 0x08
+#define I40IW_ACCESS_FLAGS_REMOTEWRITE 0x0a
+#define I40IW_ACCESS_FLAGS_BIND_WINDOW 0x10
+#define
I40IW_ACCESS_FLAGS_ALL 0x1F + +#define I40IW_OP_TYPE_RDMA_WRITE 0 +#define I40IW_OP_TYPE_RDMA_READ 1 +#define I40IW_OP_TYPE_SEND 3 +#define I40IW_OP_TYPE_SEND_INV 4 +#define I40IW_OP_TYPE_SEND_SOL 5 +#define I40IW_OP_TYPE_SEND_SOL_INV 6 +#define I40IW_OP_TYPE_REC 7 +#define I40IW_OP_TYPE_BIND_MW 8 +#define I40IW_OP_TYPE_FAST_REG_NSMR 9 +#define I40IW_OP_TYPE_INV_STAG 10 +#define I40IW_OP_TYPE_RDMA_READ_INV_STAG 11 +#define I40IW_OP_TYPE_NOP 12 + +enum i40iw_completion_status { + I40IW_COMPL_STATUS_SUCCESS = 0, + I40IW_COMPL_STATUS_FLUSHED, + I40IW_COMPL_STATUS_INVALID_WQE, + I40IW_COMPL_STATUS_QP_CATASTROPHIC, + I40IW_COMPL_STATUS_REMOTE_TERMINATION, + I40IW_COMPL_STATUS_INVALID_STAG, + I40IW_COMPL_STATUS_BASE_BOUND_VIOLATION, + I40IW_COMPL_STATUS_ACCESS_VIOLATION, + I40IW_COMPL_STATUS_INVALID_PD_ID, + I40IW_COMPL_STATUS_WRAP_ERROR, + I40IW_COMPL_STATUS_STAG_INVALID_PDID, + I40IW_COMPL_STATUS_RDMA_READ_ZERO_ORD, + I40IW_COMPL_STATUS_QP_NOT_PRIVLEDGED, + I40IW_COMPL_STATUS_STAG_NOT_INVALID, + I40IW_COMPL_STATUS_INVALID_PHYS_BUFFER_SIZE, + I40IW_COMPL_STATUS_INVALID_PHYS_BUFFER_ENTRY, + I40IW_COMPL_STATUS_INVALID_FBO, + I40IW_COMPL_STATUS_INVALID_LENGTH, + I40IW_COMPL_STATUS_INVALID_ACCESS, + I40IW_COMPL_STATUS_PHYS_BUFFER_LIST_TOO_LONG, + I40IW_COMPL_STATUS_INVALID_VIRT_ADDRESS, + I40IW_COMPL_STATUS_INVALID_REGION, + I40IW_COMPL_STATUS_INVALID_WINDOW, + I40IW_COMPL_STATUS_INVALID_TOTAL_LENGTH +}; + +enum i40iw_completion_notify { + IW_CQ_COMPL_EVENT = 0, + IW_CQ_COMPL_SOLICITED = 1 +}; + +struct i40iw_post_send { + i40iw_sgl sg_list; + u32 num_sges; +}; + +struct i40iw_post_inline_send { + void *data; + u32 len; +}; + +struct i40iw_post_send_w_inv { + i40iw_sgl sg_list; + u32 num_sges; + i40iw_stag remote_stag_to_inv; +}; + +struct i40iw_post_inline_send_w_inv { + void *data; + u32 len; + i40iw_stag remote_stag_to_inv; +}; + +struct i40iw_rdma_write { + i40iw_sgl lo_sg_list; + u32 num_lo_sges; + struct i40iw_sge rem_addr; +}; + +struct i40iw_inline_rdma_write { + void *data; + u32 len; + struct i40iw_sge rem_addr; +}; + +struct i40iw_rdma_read { + struct i40iw_sge lo_addr; + struct i40iw_sge rem_addr; +}; + +struct i40iw_bind_window { + i40iw_stag mr_stag; + u64 bind_length; + void *va; + enum i40iw_addressing_type addressing_type; + bool enable_reads; + bool enable_writes; + i40iw_stag mw_stag; +}; + +struct i40iw_inv_local_stag { + i40iw_stag target_stag; +}; + +struct i40iw_post_sq_info { + u64 wr_id; + u8 op_type; + bool signaled; + bool read_fence; + bool local_fence; + bool inline_data; + bool defer_flag; + union { + struct i40iw_post_send send; + struct i40iw_post_send send_w_sol; + struct i40iw_post_send_w_inv send_w_inv; + struct i40iw_post_send_w_inv send_w_sol_inv; + struct i40iw_rdma_write rdma_write; + struct i40iw_rdma_read rdma_read; + struct i40iw_rdma_read rdma_read_inv; + struct i40iw_bind_window bind_window; + struct i40iw_inv_local_stag inv_local_stag; + struct i40iw_inline_rdma_write inline_rdma_write; + struct i40iw_post_inline_send inline_send; + struct i40iw_post_inline_send inline_send_w_sol; + struct i40iw_post_inline_send_w_inv inline_send_w_inv; + struct i40iw_post_inline_send_w_inv inline_send_w_sol_inv; + } op; +}; + +struct i40iw_post_rq_info { + u64 wr_id; + i40iw_sgl sg_list; + u32 num_sges; +}; + +struct i40iw_cq_poll_info { + u64 wr_id; + i40iw_qp_handle qp_handle; + u32 bytes_xfered; + u32 tcp_seq_num; + u32 qp_id; + i40iw_stag inv_stag; + enum i40iw_completion_status comp_status; + u16 major_err; + u16 minor_err; + u8 op_type; + bool stag_invalid_set; + 
bool push_dropped; + bool error; + bool is_srq; + bool solicited_event; +}; + +struct i40iw_qp_uk_ops { + void (*iw_qp_post_wr)(struct i40iw_qp_uk *); + void (*iw_qp_ring_push_db)(struct i40iw_qp_uk *, u32); + enum i40iw_status_code (*iw_rdma_write)(struct i40iw_qp_uk *, + struct i40iw_post_sq_info *, bool); + enum i40iw_status_code (*iw_rdma_read)(struct i40iw_qp_uk *, + struct i40iw_post_sq_info *, bool, bool); + enum i40iw_status_code (*iw_send)(struct i40iw_qp_uk *, + struct i40iw_post_sq_info *, u32, bool); + enum i40iw_status_code (*iw_inline_rdma_write)(struct i40iw_qp_uk *, + struct i40iw_post_sq_info *, bool); + enum i40iw_status_code (*iw_inline_send)(struct i40iw_qp_uk *, + struct i40iw_post_sq_info *, u32, bool); + enum i40iw_status_code (*iw_stag_local_invalidate)(struct i40iw_qp_uk *, + struct i40iw_post_sq_info *, bool); + enum i40iw_status_code (*iw_mw_bind)(struct i40iw_qp_uk *, + struct i40iw_post_sq_info *, bool); + enum i40iw_status_code (*iw_post_receive)(struct i40iw_qp_uk *, + struct i40iw_post_rq_info *); + enum i40iw_status_code (*iw_post_nop)(struct i40iw_qp_uk *, u64, bool, bool); +}; + +struct i40iw_cq_ops { + void (*iw_cq_request_notification)(struct i40iw_cq_uk *, + enum i40iw_completion_notify); + enum i40iw_status_code (*iw_cq_poll_completion)(struct i40iw_cq_uk *, + struct i40iw_cq_poll_info *); + enum i40iw_status_code (*iw_cq_post_entries)(struct i40iw_cq_uk *, u8 count); + void (*iw_cq_clean)(void *, struct i40iw_cq_uk *); +}; + +struct i40iw_dev_uk; + +struct i40iw_device_uk_ops { + enum i40iw_status_code (*iwarp_cq_uk_init)(struct i40iw_cq_uk *, + struct i40iw_cq_uk_init_info *); + enum i40iw_status_code (*iwarp_qp_uk_init)(struct i40iw_qp_uk *, + struct i40iw_qp_uk_init_info *); +}; + +struct i40iw_dev_uk { + struct i40iw_device_uk_ops ops_uk; +}; + +struct i40iw_sq_uk_wr_trk_info { + u64 wrid; + u32 wr_len; + u8 wqe_size; + u8 reserved[3]; +}; + +struct i40iw_qp_quanta { + u64 elem[I40IW_WQE_SIZE]; +}; + +struct i40iw_qp_uk { + struct i40iw_qp_quanta *sq_base; + struct i40iw_qp_quanta *rq_base; + u32 IOMEM *wqe_alloc_reg; + struct i40iw_sq_uk_wr_trk_info *sq_wrtrk_array; + u64 *rq_wrid_array; + u64 *shadow_area; + u32 *push_db; + u64 *push_wqe; + struct i40iw_ring sq_ring; + struct i40iw_ring rq_ring; + struct i40iw_ring initial_ring; + u32 qp_id; + u32 sq_size; + u32 rq_size; + u32 max_sq_frag_cnt; + u32 max_rq_frag_cnt; + struct i40iw_qp_uk_ops ops; + bool use_srq; + u8 swqe_polarity; + u8 swqe_polarity_deferred; + u8 rwqe_polarity; + u8 rq_wqe_size; + u8 rq_wqe_size_multiplier; + bool first_sq_wq; + bool deferred_flag; +}; + +struct i40iw_cq_uk { + struct i40iw_cqe *cq_base; + u32 IOMEM *cqe_alloc_reg; + u64 *shadow_area; + u32 cq_id; + u32 cq_size; + struct i40iw_ring cq_ring; + u8 polarity; + bool avoid_mem_cflct; + + struct i40iw_cq_ops ops; +}; + +struct i40iw_qp_uk_init_info { + struct i40iw_qp_quanta *sq; + struct i40iw_qp_quanta *rq; + u32 IOMEM *wqe_alloc_reg; + u64 *shadow_area; + struct i40iw_sq_uk_wr_trk_info *sq_wrtrk_array; + u64 *rq_wrid_array; + u32 *push_db; + u64 *push_wqe; + u32 qp_id; + u32 sq_size; + u32 rq_size; + u32 max_sq_frag_cnt; + u32 max_rq_frag_cnt; + u32 max_inline_data; + int abi_ver; +}; + +struct i40iw_cq_uk_init_info { + u32 IOMEM *cqe_alloc_reg; + struct i40iw_cqe *cq_base; + u64 *shadow_area; + u32 cq_size; + u32 cq_id; + bool avoid_mem_cflct; +}; + +void i40iw_device_init_uk(struct i40iw_dev_uk *dev); + +void i40iw_qp_post_wr(struct i40iw_qp_uk *qp); +u64 *i40iw_qp_get_next_send_wqe(struct i40iw_qp_uk *qp, u32 
*wqe_idx, + u8 wqe_size, + u32 total_size, + u64 wr_id + ); + +u64 *i40iw_qp_get_next_recv_wqe(struct i40iw_qp_uk *qp, u32 *wqe_idx); +u64 *i40iw_qp_get_next_srq_wqe(struct i40iw_srq_uk *srq, u32 *wqe_idx); + +enum i40iw_status_code i40iw_cq_uk_init(struct i40iw_cq_uk *cq, + struct i40iw_cq_uk_init_info *info); +enum i40iw_status_code i40iw_qp_uk_init(struct i40iw_qp_uk *qp, + struct i40iw_qp_uk_init_info *info); + +void i40iw_clean_cq(void *queue, struct i40iw_cq_uk *cq); +enum i40iw_status_code i40iw_nop(struct i40iw_qp_uk *qp, u64 wr_id, + bool signaled, bool post_sq); +enum i40iw_status_code i40iw_fragcnt_to_wqesize_sq(u32 frag_cnt, u8 *wqe_size); +enum i40iw_status_code i40iw_fragcnt_to_wqesize_rq(u32 frag_cnt, u8 *wqe_size); +enum i40iw_status_code i40iw_inline_data_size_to_wqesize(u32 data_size, + u8 *wqe_size); +void i40iw_get_wqe_shift(u32 sge, u32 inline_data, u8 *shift); +enum i40iw_status_code i40iw_get_sqdepth(u32 sq_size, u8 shift, u32 *sqdepth); +enum i40iw_status_code i40iw_get_rqdepth(u32 rq_size, u8 shift, u32 *rqdepth); +#endif diff --git a/providers/i40iw/i40iw_uverbs.c b/providers/i40iw/i40iw_uverbs.c new file mode 100644 index 0000000..71b59a7 --- /dev/null +++ b/providers/i40iw/i40iw_uverbs.c @@ -0,0 +1,979 @@ +/******************************************************************************* +* +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenFabrics.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*
+*******************************************************************************/
+
+#include <config.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <signal.h>
+#include <errno.h>
+#include <pthread.h>
+#include <malloc.h>
+#include <sys/mman.h>
+#include <linux/if_ether.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include "i40iw_umain.h"
+#include "i40iw-abi.h"
+
+/**
+ * i40iw_uquery_device - call driver to query device for max resources
+ * @context: user context for the device
+ * @attr: where to save all the max resources from the driver
+ **/
+int i40iw_uquery_device(struct ibv_context *context, struct ibv_device_attr *attr)
+{
+	struct ibv_query_device cmd;
+	uint64_t i40iw_fw_ver;
+	int ret;
+	unsigned int minor, major;
+
+	ret = ibv_cmd_query_device(context, attr, &i40iw_fw_ver, &cmd, sizeof(cmd));
+	if (ret) {
+		fprintf(stderr, PFX "%s: query device failed and returned status code: %d\n", __func__, ret);
+		return ret;
+	}
+
+	major = (i40iw_fw_ver >> 16) & 0xffff;
+	minor = i40iw_fw_ver & 0xffff;
+
+	snprintf(attr->fw_ver, sizeof(attr->fw_ver), "%d.%d", major, minor);
+
+	return 0;
+}
+
+/**
+ * i40iw_uquery_port - get port attributes (msg size, lnk, mtu...)
+ * @context: user context of the device
+ * @port: port for the attributes
+ * @attr: to return port attributes
+ **/
+int i40iw_uquery_port(struct ibv_context *context, uint8_t port, struct ibv_port_attr *attr)
+{
+	struct ibv_query_port cmd;
+
+	return ibv_cmd_query_port(context, port, attr, &cmd, sizeof(cmd));
+}
+
+/**
+ * i40iw_ualloc_pd - allocates protection domain and return pd ptr
+ * @context: user context of the device
+ **/
+struct ibv_pd *i40iw_ualloc_pd(struct ibv_context *context)
+{
+	struct ibv_alloc_pd cmd;
+	struct i40iw_ualloc_pd_resp resp;
+	struct i40iw_upd *iwupd;
+	void *map;
+
+	iwupd = malloc(sizeof(*iwupd));
+	if (!iwupd)
+		return NULL;
+	memset(&resp, 0, sizeof(resp));
+	if (ibv_cmd_alloc_pd(context, &iwupd->ibv_pd, &cmd, sizeof(cmd), &resp.ibv_resp, sizeof(resp)))
+		goto err_free;
+
+	iwupd->pd_id = resp.pd_id;
+	map = mmap(NULL, I40IW_HW_PAGE_SIZE, PROT_WRITE | PROT_READ, MAP_SHARED, context->cmd_fd, 0);
+	if (map == MAP_FAILED) {
+		ibv_cmd_dealloc_pd(&iwupd->ibv_pd);
+		goto err_free;
+	}
+	iwupd->db = map;
+
+	return &iwupd->ibv_pd;
+
+err_free:
+	free(iwupd);
+	return NULL;
+}
+
+/**
+ * i40iw_ufree_pd - free pd resources
+ * @pd: pd to free resources
+ */
+int i40iw_ufree_pd(struct ibv_pd *pd)
+{
+	int ret;
+	struct i40iw_upd *iwupd;
+
+	iwupd = to_i40iw_upd(pd);
+	ret = ibv_cmd_dealloc_pd(pd);
+	if (ret)
+		return ret;
+
+	munmap((void *)iwupd->db, I40IW_HW_PAGE_SIZE);
+	free(iwupd);
+
+	return 0;
+}
+
+/**
+ * i40iw_ureg_mr - register user memory region
+ * @pd: pd for the mr
+ * @addr: user address of the memory region
+ * @length: length of the memory
+ * @access: access allowed on this mr
+ */
+struct ibv_mr *i40iw_ureg_mr(struct ibv_pd *pd, void *addr, size_t length,
+			     uint64_t hca_va, int access)
+{
+	struct verbs_mr *vmr;
+	struct i40iw_ureg_mr cmd;
+	struct ib_uverbs_reg_mr_resp resp;
+
+	vmr = malloc(sizeof(*vmr));
+	if (!vmr)
+		return NULL;
+
+	cmd.reg_type = IW_MEMREG_TYPE_MEM;
+
+	if (ibv_cmd_reg_mr(pd, addr, length, hca_va, access, vmr, &cmd.ibv_cmd,
+			   sizeof(cmd), &resp, sizeof(resp))) {
+		fprintf(stderr, PFX "%s: Failed to register memory\n", __func__);
+		free(vmr);
+		return NULL;
+	}
+	return &vmr->ibv_mr;
+}
+
+/**
+ * i40iw_udereg_mr - deregister memory region
+ * @mr: mr that was allocated
+ */
+int
i40iw_udereg_mr(struct verbs_mr *vmr) +{ + int ret; + + ret = ibv_cmd_dereg_mr(vmr); + if (ret) + return ret; + + free(vmr); + return 0; +} + +/** + * i40iw_num_of_pages - number of pages needed + * @size: size for number of pages + */ +static inline u32 i40iw_num_of_pages(u32 size) +{ + return (size + 4095) >> 12; +} + +/** + * i40iw_ucreate_cq - create completion queue for user app + * @context: user context of the device + * @cqe: number of cq entries in the cq ring + * @channel: channel info (context, refcnt..) + * @comp_vector: save in ucq struct + */ +struct ibv_cq *i40iw_ucreate_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, int comp_vector) +{ + struct i40iw_ucq *iwucq; + struct i40iw_ucreate_cq cmd; + struct i40iw_ucreate_cq_resp resp; + struct i40iw_cq_uk_init_info info; + int ret; + struct i40iw_uvcontext *iwvctx = to_i40iw_uctx(context); + u32 cqe_struct_size; + u32 totalsize; + u32 cq_pages; + + struct i40iw_ureg_mr reg_mr_cmd; + + struct ib_uverbs_reg_mr_resp reg_mr_resp; + + if (cqe > I40IW_MAX_CQ_SIZE) + return NULL; + + cqe++; + memset(&cmd, 0, sizeof(cmd)); + memset(&resp, 0, sizeof(resp)); + memset(&info, 0, sizeof(info)); + memset(&reg_mr_cmd, 0, sizeof(reg_mr_cmd)); + + iwucq = malloc(sizeof(*iwucq)); + if (!iwucq) + return NULL; + memset(iwucq, 0, sizeof(*iwucq)); + + if (pthread_spin_init(&iwucq->lock, PTHREAD_PROCESS_PRIVATE)) { + free(iwucq); + return NULL; + } + if (cqe < I40IW_U_MINCQ_SIZE) + cqe = I40IW_U_MINCQ_SIZE; + + info.cq_size = cqe; + iwucq->comp_vector = comp_vector; + cqe_struct_size = sizeof(struct i40iw_cqe); + cq_pages = i40iw_num_of_pages(info.cq_size * cqe_struct_size); + totalsize = (cq_pages << 12) + I40E_DB_SHADOW_AREA_SIZE; + + info.cq_base = memalign(I40IW_HW_PAGE_SIZE, totalsize); + + if (!info.cq_base) + goto err; + + memset(info.cq_base, 0, totalsize); + info.shadow_area = (u64 *)((u8 *)info.cq_base + (cq_pages << 12)); + reg_mr_cmd.reg_type = IW_MEMREG_TYPE_CQ; + + reg_mr_cmd.cq_pages = cq_pages; + + ret = ibv_cmd_reg_mr(&iwvctx->iwupd->ibv_pd, (void *)info.cq_base, + totalsize, (uintptr_t)info.cq_base, + IBV_ACCESS_LOCAL_WRITE, &iwucq->vmr, + &reg_mr_cmd.ibv_cmd, sizeof(reg_mr_cmd), + &reg_mr_resp, sizeof(reg_mr_resp)); + if (ret) { + fprintf(stderr, PFX "%s: failed to pin memory for CQ\n", __func__); + goto err; + } + + cmd.user_cq_buffer = (__u64)((uintptr_t)info.cq_base); + ret = ibv_cmd_create_cq(context, info.cq_size, channel, comp_vector, + &iwucq->ibv_cq, &cmd.ibv_cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp)); + if (ret) { + ibv_cmd_dereg_mr(&iwucq->vmr); + fprintf(stderr, PFX "%s: failed to create CQ\n", __func__); + goto err; + } + + info.cq_id = (uint16_t)resp.cq_id; + info.shadow_area = (u64 *)((u8 *)info.shadow_area + resp.reserved); + + info.cqe_alloc_reg = (u32 *)((u8 *)iwvctx->iwupd->db + I40E_DB_CQ_OFFSET); + ret = iwvctx->dev.ops_uk.iwarp_cq_uk_init(&iwucq->cq, &info); + if (!ret) + return &iwucq->ibv_cq; + else + fprintf(stderr, PFX "%s: failed to initialize CQ, status %d\n", __func__, ret); +err: + if (info.cq_base) + free(info.cq_base); + if (pthread_spin_destroy(&iwucq->lock)) + return NULL; + free(iwucq); + return NULL; +} + +/** + * i40iw_udestroy_cq - destroys cq + * @cq: ptr to cq to be destroyed + */ +int i40iw_udestroy_cq(struct ibv_cq *cq) +{ + struct i40iw_ucq *iwucq = to_i40iw_ucq(cq); + int ret; + + ret = pthread_spin_destroy(&iwucq->lock); + if (ret) + return ret; + + ret = ibv_cmd_destroy_cq(cq); + if (ret) + return ret; + + ibv_cmd_dereg_mr(&iwucq->vmr); + + 
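/* The ring was pinned via ibv_cmd_reg_mr() at create time; with the CQ destroyed and its MR deregistered, the buffer can simply be freed. */ + 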
free(iwucq->cq.cq_base); + free(iwucq); + + return 0; +} + +/** + * i40iw_upoll_cq - poll cq for user app + * @cq: cq to poll + * @num_entries: max cq entries to poll + * @entry: wc array to fill with completed entries + */ +int i40iw_upoll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *entry) +{ + struct i40iw_ucq *iwucq; + int cqe_count = 0; + struct i40iw_cq_poll_info cq_poll_info; + int ret; + + iwucq = to_i40iw_ucq(cq); + + ret = pthread_spin_lock(&iwucq->lock); + if (ret) + return ret; + while (cqe_count < num_entries) { + ret = iwucq->cq.ops.iw_cq_poll_completion(&iwucq->cq, &cq_poll_info); + if (ret == I40IW_ERR_QUEUE_EMPTY) { + break; + } else if (ret == I40IW_ERR_QUEUE_DESTROYED) { + continue; + } else if (ret) { + fprintf(stderr, PFX "%s: Error polling CQ, status %d\n", __func__, ret); + if (!cqe_count) + /* Indicate error */ + cqe_count = -1; + break; + } + entry->wc_flags = 0; + entry->wr_id = cq_poll_info.wr_id; + + if (cq_poll_info.error) { + entry->status = IBV_WC_WR_FLUSH_ERR; + entry->vendor_err = cq_poll_info.major_err << 16 | cq_poll_info.minor_err; + } else { + entry->status = IBV_WC_SUCCESS; + } + + switch (cq_poll_info.op_type) { + case I40IW_OP_TYPE_RDMA_WRITE: + entry->opcode = IBV_WC_RDMA_WRITE; + break; + case I40IW_OP_TYPE_RDMA_READ_INV_STAG: + case I40IW_OP_TYPE_RDMA_READ: + entry->opcode = IBV_WC_RDMA_READ; + break; + case I40IW_OP_TYPE_SEND_SOL: + case I40IW_OP_TYPE_SEND_SOL_INV: + case I40IW_OP_TYPE_SEND_INV: + case I40IW_OP_TYPE_SEND: + entry->opcode = IBV_WC_SEND; + break; + case I40IW_OP_TYPE_REC: + entry->opcode = IBV_WC_RECV; + break; + default: + entry->opcode = IBV_WC_RECV; + break; + } + + entry->imm_data = 0; + entry->qp_num = cq_poll_info.qp_id; + entry->src_qp = cq_poll_info.qp_id; + entry->byte_len = cq_poll_info.bytes_xfered; + entry++; + cqe_count++; + } + pthread_spin_unlock(&iwucq->lock); + return cqe_count; +} + +/** + * i40iw_arm_cq - arm the cq + * @iwucq: cq to arm + * @cq_notify: notification params + */ +static void i40iw_arm_cq(struct i40iw_ucq *iwucq, enum i40iw_completion_notify cq_notify) +{ + iwucq->is_armed = 1; + iwucq->arm_sol = 1; + iwucq->skip_arm = 0; + iwucq->skip_sol = 1; + + iwucq->cq.ops.iw_cq_request_notification(&iwucq->cq, cq_notify); +} + +/** + * i40iw_uarm_cq - callback to arm the cq + * @cq: cq to arm + * @solicited: non-zero to be notified on solicited completions only + */ +int i40iw_uarm_cq(struct ibv_cq *cq, int solicited) +{ + struct i40iw_ucq *iwucq; + enum i40iw_completion_notify cq_notify = IW_CQ_COMPL_EVENT; + int ret; + + iwucq = to_i40iw_ucq(cq); + if (solicited) + cq_notify = IW_CQ_COMPL_SOLICITED; + + ret = pthread_spin_lock(&iwucq->lock); + if (ret) + return ret; + + if (iwucq->is_armed) { + if ((iwucq->arm_sol) && (!solicited)) { + i40iw_arm_cq(iwucq, cq_notify); + } else { + iwucq->skip_arm = 1; + iwucq->skip_sol &= solicited; + } + } else { + i40iw_arm_cq(iwucq, cq_notify); + } + + pthread_spin_unlock(&iwucq->lock); + + return 0; +} + +/** + * i40iw_cq_event - handle a completion event on the cq + * @cq: cq the event arrived on + */ +void i40iw_cq_event(struct ibv_cq *cq) +{ + struct i40iw_ucq *iwucq; + + iwucq = to_i40iw_ucq(cq); + if (pthread_spin_lock(&iwucq->lock)) + return; + + if (iwucq->skip_arm) + i40iw_arm_cq(iwucq, IW_CQ_COMPL_EVENT); + else + iwucq->is_armed = 0; + + pthread_spin_unlock(&iwucq->lock); +} + +static int i40iw_destroy_vmapped_qp(struct i40iw_uqp *iwuqp, + struct i40iw_qp_quanta *sq_base) +{ + int ret; + + ret = ibv_cmd_destroy_qp(&iwuqp->ibv_qp); + if (ret) + return ret; + + if (iwuqp->push_db) + munmap(iwuqp->push_db, 
I40IW_HW_PAGE_SIZE); + if (iwuqp->push_wqe) + munmap(iwuqp->push_wqe, I40IW_HW_PAGE_SIZE); + + ibv_cmd_dereg_mr(&iwuqp->vmr); + free((void *)sq_base); + + return 0; +} + +/** + * i40iw_vmapped_qp - create resources for qp + * @iwuqp: qp struct for resources + * @pd: pd for the qp + * @attr: attributes of qp passed + * @resp: response back from create qp + * @sqdepth: depth of sq + * @rqdepth: depth of rq + * @info: info for initializing user level qp + */ +static int i40iw_vmapped_qp(struct i40iw_uqp *iwuqp, struct ibv_pd *pd, + struct ibv_qp_init_attr *attr, + struct i40iw_ucreate_qp_resp *resp, int sqdepth, + int rqdepth, struct i40iw_qp_uk_init_info *info) +{ + struct i40iw_ucreate_qp cmd; + int sqsize, rqsize, totalqpsize; + int ret; + struct i40iw_ureg_mr reg_mr_cmd; + u32 sq_pages, rq_pages; + struct ib_uverbs_reg_mr_resp reg_mr_resp; + + memset(&reg_mr_cmd, 0, sizeof(reg_mr_cmd)); + sqsize = sqdepth * I40IW_QP_WQE_MIN_SIZE; + rqsize = rqdepth * I40IW_QP_WQE_MIN_SIZE; + + sq_pages = i40iw_num_of_pages(sqsize); + rq_pages = i40iw_num_of_pages(rqsize); + sqsize = sq_pages << 12; + rqsize = rq_pages << 12; + totalqpsize = rqsize + sqsize + I40E_DB_SHADOW_AREA_SIZE; + info->sq = memalign(I40IW_HW_PAGE_SIZE, totalqpsize); + + if (!info->sq) { + fprintf(stderr, PFX "%s: failed to allocate memory for SQ\n", __func__); + return 0; + } + + memset(info->sq, 0, totalqpsize); + info->rq = &info->sq[sqsize / I40IW_QP_WQE_MIN_SIZE]; + info->shadow_area = info->rq[rqsize / I40IW_QP_WQE_MIN_SIZE].elem; + + reg_mr_cmd.reg_type = IW_MEMREG_TYPE_QP; + reg_mr_cmd.sq_pages = sq_pages; + reg_mr_cmd.rq_pages = rq_pages; + + ret = ibv_cmd_reg_mr(pd, (void *)info->sq, totalqpsize, + (uintptr_t)info->sq, IBV_ACCESS_LOCAL_WRITE, + &iwuqp->vmr, &reg_mr_cmd.ibv_cmd, + sizeof(reg_mr_cmd), &reg_mr_resp, + sizeof(reg_mr_resp)); + if (ret) { + fprintf(stderr, PFX "%s: failed to pin memory for SQ\n", __func__); + free(info->sq); + return 0; + } + cmd.user_wqe_buffers = (__u64)((uintptr_t)info->sq); + cmd.user_compl_ctx = (uintptr_t)&iwuqp->qp; + + ret = ibv_cmd_create_qp(pd, &iwuqp->ibv_qp, attr, &cmd.ibv_cmd, sizeof(cmd), + &resp->ibv_resp, sizeof(struct i40iw_ucreate_qp_resp)); + if (ret) { + fprintf(stderr, PFX "%s: failed to create QP, status %d\n", __func__, ret); + ibv_cmd_dereg_mr(&iwuqp->vmr); + free(info->sq); + return 0; + } + + iwuqp->send_cq = to_i40iw_ucq(attr->send_cq); + iwuqp->recv_cq = to_i40iw_ucq(attr->recv_cq); + info->sq_size = resp->actual_sq_size; + info->rq_size = resp->actual_rq_size; + + if (resp->push_idx != I40IW_INVALID_PUSH_PAGE_INDEX) { + void *map; + u64 offset; + + offset = (resp->push_idx + I40IW_BASE_PUSH_PAGE) * I40IW_HW_PAGE_SIZE; + + map = mmap(NULL, I40IW_HW_PAGE_SIZE, PROT_WRITE | PROT_READ, MAP_SHARED, + pd->context->cmd_fd, offset); + if (map == MAP_FAILED) { + fprintf(stderr, PFX "%s: failed to map push page, errno %d\n", __func__, errno); + info->push_wqe = NULL; + info->push_db = NULL; + } else { + info->push_wqe = map; + + offset += I40IW_HW_PAGE_SIZE; + map = mmap(NULL, I40IW_HW_PAGE_SIZE, PROT_WRITE | PROT_READ, MAP_SHARED, + pd->context->cmd_fd, offset); + if (map == MAP_FAILED) { + fprintf(stderr, PFX "%s: failed to map push doorbell, errno %d\n", __func__, errno); + munmap(info->push_wqe, I40IW_HW_PAGE_SIZE); + info->push_wqe = NULL; + info->push_db = NULL; + } else { + info->push_db = map; + } + iwuqp->push_db = info->push_db; + iwuqp->push_wqe = info->push_wqe; + } + } + return 1; +} + +/** + * i40iw_ucreate_qp - create qp for user app + * @pd: pd for the qp + * @attr: 
attributes of the qp to be created (sizes, sge, cq) + */ +struct ibv_qp *i40iw_ucreate_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr) +{ + struct i40iw_ucreate_qp_resp resp; + struct i40iw_uvcontext *iwvctx = to_i40iw_uctx(pd->context); + struct i40iw_uqp *iwuqp; + struct i40iw_qp_uk_init_info info; + u32 sqdepth, rqdepth; + u8 sqshift, rqshift; + + if (attr->qp_type != IBV_QPT_RC) { + fprintf(stderr, PFX "%s: failed to create QP, unsupported QP type: 0x%x\n", __func__, attr->qp_type); + return NULL; + } + + if (attr->cap.max_send_sge > I40IW_MAX_WQ_FRAGMENT_COUNT) + attr->cap.max_send_sge = I40IW_MAX_WQ_FRAGMENT_COUNT; + + if (attr->cap.max_recv_sge > I40IW_MAX_WQ_FRAGMENT_COUNT) + attr->cap.max_recv_sge = I40IW_MAX_WQ_FRAGMENT_COUNT; + + if (attr->cap.max_inline_data > I40IW_MAX_INLINE_DATA_SIZE) + attr->cap.max_inline_data = I40IW_MAX_INLINE_DATA_SIZE; + + i40iw_get_wqe_shift(attr->cap.max_send_sge, attr->cap.max_inline_data, &sqshift); + if (i40iw_get_sqdepth(attr->cap.max_send_wr, sqshift, &sqdepth)) { + fprintf(stderr, PFX "invalid SQ attributes, max_send_wr=%d max_send_sge=%d max_inline=%d\n", + attr->cap.max_send_wr, attr->cap.max_send_sge, attr->cap.max_inline_data); + return NULL; + } + + switch (iwvctx->abi_ver) { + case 4: + i40iw_get_wqe_shift(attr->cap.max_recv_sge, 0, &rqshift); + break; + case 5: /* fallthrough until next ABI version */ + default: + rqshift = I40IW_MAX_RQ_WQE_SHIFT; + break; + } + + if (i40iw_get_rqdepth(attr->cap.max_recv_wr, rqshift, &rqdepth)) { + fprintf(stderr, PFX "invalid RQ attributes, max_recv_wr=%d max_recv_sge=%d\n", + attr->cap.max_recv_wr, attr->cap.max_recv_sge); + return NULL; + } + + iwuqp = memalign(1024, sizeof(*iwuqp)); + if (!iwuqp) + return NULL; + memset(iwuqp, 0, sizeof(*iwuqp)); + + if (pthread_spin_init(&iwuqp->lock, PTHREAD_PROCESS_PRIVATE)) + goto err_free_qp; + + memset(&info, 0, sizeof(info)); + + info.sq_size = sqdepth >> sqshift; + info.rq_size = rqdepth >> rqshift; + attr->cap.max_send_wr = info.sq_size; + attr->cap.max_recv_wr = info.rq_size; + + info.max_sq_frag_cnt = attr->cap.max_send_sge; + info.max_rq_frag_cnt = attr->cap.max_recv_sge; + + info.wqe_alloc_reg = (u32 *)iwvctx->iwupd->db; + info.sq_wrtrk_array = calloc(sqdepth, sizeof(*info.sq_wrtrk_array)); + info.abi_ver = iwvctx->abi_ver; + + if (!info.sq_wrtrk_array) { + fprintf(stderr, PFX "%s: failed to allocate memory for SQ work array\n", __func__); + goto err_destroy_lock; + } + + info.rq_wrid_array = calloc(rqdepth, sizeof(*info.rq_wrid_array)); + if (!info.rq_wrid_array) { + fprintf(stderr, PFX "%s: failed to allocate memory for RQ work array\n", __func__); + goto err_free_sq_wrtrk; + } + + iwuqp->sq_sig_all = attr->sq_sig_all; + memset(&resp, 0, sizeof(resp)); + if (!i40iw_vmapped_qp(iwuqp, pd, attr, &resp, sqdepth, rqdepth, &info)) { + fprintf(stderr, PFX "%s: failed to map QP\n", __func__); + goto err_free_rq_wrid; + } + info.qp_id = resp.qp_id; + iwuqp->i40iw_drv_opt = resp.i40iw_drv_opt; + iwuqp->ibv_qp.qp_num = resp.qp_id; + + info.max_sq_frag_cnt = attr->cap.max_send_sge; + info.max_rq_frag_cnt = attr->cap.max_recv_sge; + info.max_inline_data = attr->cap.max_inline_data; + + if (!iwvctx->dev.ops_uk.iwarp_qp_uk_init(&iwuqp->qp, &info)) { + attr->cap.max_send_wr = (sqdepth - I40IW_SQ_RSVD) >> sqshift; + attr->cap.max_recv_wr = (rqdepth - I40IW_RQ_RSVD) >> rqshift; + return &iwuqp->ibv_qp; + } + + i40iw_destroy_vmapped_qp(iwuqp, info.sq); +err_free_rq_wrid: + free(info.rq_wrid_array); +err_free_sq_wrtrk: + free(info.sq_wrtrk_array); +err_destroy_lock: 
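+ /* Error labels unwind in reverse order of setup; only the QP lock and the QP struct itself remain at this point. */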
+ pthread_spin_destroy(&iwuqp->lock); +err_free_qp: + free(iwuqp); + return NULL; +} + +/** + * i40iw_uquery_qp - query qp for some attribute + * @qp: qp for the attributes query + * @attr: to return the attributes + * @attr_mask: mask of what is query for + * @init_attr: initial attributes during create_qp + */ +int i40iw_uquery_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, + struct ibv_qp_init_attr *init_attr) +{ + struct ibv_query_qp cmd; + + return ibv_cmd_query_qp(qp, attr, attr_mask, init_attr, &cmd, sizeof(cmd)); +} + +/** + * i40iw_umodify_qp - send qp modify to driver + * @qp: qp to modify + * @attr: attribute to modify + * @attr_mask: mask of the attribute + */ +int i40iw_umodify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask) +{ + struct ibv_modify_qp cmd = {}; + + return ibv_cmd_modify_qp(qp, attr, attr_mask, &cmd, sizeof(cmd)); +} + +/** + * i40iw_udestroy_qp - destroy qp + * @qp: qp to destroy + */ +int i40iw_udestroy_qp(struct ibv_qp *qp) +{ + struct i40iw_uqp *iwuqp = to_i40iw_uqp(qp); + int ret; + + ret = pthread_spin_destroy(&iwuqp->lock); + if (ret) + return ret; + + ret = i40iw_destroy_vmapped_qp(iwuqp, iwuqp->qp.sq_base); + if (ret) + return ret; + + if (iwuqp->qp.sq_wrtrk_array) + free(iwuqp->qp.sq_wrtrk_array); + if (iwuqp->qp.rq_wrid_array) + free(iwuqp->qp.rq_wrid_array); + /* Clean any pending completions from the cq(s) */ + if (iwuqp->send_cq) + i40iw_clean_cq((void *)&iwuqp->qp, &iwuqp->send_cq->cq); + + if ((iwuqp->recv_cq) && (iwuqp->recv_cq != iwuqp->send_cq)) + i40iw_clean_cq((void *)&iwuqp->qp, &iwuqp->recv_cq->cq); + + free(iwuqp); + + return 0; +} + +/** + * i40iw_copy_sg_list - copy sg list for qp + * @sg_list: copied into sg_list + * @sgl: copy from sgl + * @num_sges: count of sg entries + */ +static void i40iw_copy_sg_list(struct i40iw_sge *sg_list, struct ibv_sge *sgl, + int num_sges) +{ + unsigned int i; + + for (i = 0; (i < num_sges) && (i < I40IW_MAX_WQ_FRAGMENT_COUNT); i++) { + sg_list[i].tag_off = sgl[i].addr; + sg_list[i].len = sgl[i].length; + sg_list[i].stag = sgl[i].lkey; + } +} + +/** + * i40iw_post_send - post send wr for user application + * @ib_qp: qp ptr for wr + * @ib_wr: work request ptr + * @bad_wr: return of bad wr if err + */ +int i40iw_upost_send(struct ibv_qp *ib_qp, struct ibv_send_wr *ib_wr, struct ibv_send_wr **bad_wr) +{ + struct i40iw_uqp *iwuqp; + struct i40iw_post_sq_info info; + enum i40iw_status_code ret = 0; + int err = 0; + + iwuqp = (struct i40iw_uqp *)ib_qp; + + err = pthread_spin_lock(&iwuqp->lock); + if (err) + return err; + while (ib_wr) { + memset(&info, 0, sizeof(info)); + info.wr_id = (u64)(ib_wr->wr_id); + if ((ib_wr->send_flags & IBV_SEND_SIGNALED) || iwuqp->sq_sig_all) + info.signaled = true; + if (ib_wr->send_flags & IBV_SEND_FENCE) + info.read_fence = true; + + switch (ib_wr->opcode) { + case IBV_WR_SEND: + /* fall-through */ + case IBV_WR_SEND_WITH_INV: + if (ib_wr->opcode == IBV_WR_SEND) { + if (ib_wr->send_flags & IBV_SEND_SOLICITED) + info.op_type = I40IW_OP_TYPE_SEND_SOL; + else + info.op_type = I40IW_OP_TYPE_SEND; + } else { + if (ib_wr->send_flags & IBV_SEND_SOLICITED) + info.op_type = I40IW_OP_TYPE_SEND_SOL_INV; + else + info.op_type = I40IW_OP_TYPE_SEND_INV; + } + + if (ib_wr->send_flags & IBV_SEND_INLINE) { + info.op.inline_send.data = (void *)(uintptr_t)ib_wr->sg_list[0].addr; + info.op.inline_send.len = ib_wr->sg_list[0].length; + ret = iwuqp->qp.ops.iw_inline_send(&iwuqp->qp, &info, + ib_wr->invalidate_rkey, false); + } else { + info.op.send.num_sges = 
ib_wr->num_sge; + info.op.send.sg_list = (struct i40iw_sge *)ib_wr->sg_list; + ret = iwuqp->qp.ops.iw_send(&iwuqp->qp, &info, + ib_wr->invalidate_rkey, false); + } + + if (ret) { + if (ret == I40IW_ERR_QP_TOOMANY_WRS_POSTED) + err = -ENOMEM; + else + err = -EINVAL; + } + break; + + case IBV_WR_RDMA_WRITE: + info.op_type = I40IW_OP_TYPE_RDMA_WRITE; + + if (ib_wr->send_flags & IBV_SEND_INLINE) { + info.op.inline_rdma_write.data = (void *)(uintptr_t)ib_wr->sg_list[0].addr; + info.op.inline_rdma_write.len = ib_wr->sg_list[0].length; + info.op.inline_rdma_write.rem_addr.tag_off = ib_wr->wr.rdma.remote_addr; + info.op.inline_rdma_write.rem_addr.stag = ib_wr->wr.rdma.rkey; + ret = iwuqp->qp.ops.iw_inline_rdma_write(&iwuqp->qp, &info, false); + } else { + info.op.rdma_write.lo_sg_list = (void *)ib_wr->sg_list; + info.op.rdma_write.num_lo_sges = ib_wr->num_sge; + info.op.rdma_write.rem_addr.tag_off = ib_wr->wr.rdma.remote_addr; + info.op.rdma_write.rem_addr.stag = ib_wr->wr.rdma.rkey; + ret = iwuqp->qp.ops.iw_rdma_write(&iwuqp->qp, &info, false); + } + + if (ret) { + if (ret == I40IW_ERR_QP_TOOMANY_WRS_POSTED) + err = -ENOMEM; + else + err = -EINVAL; + } + break; + + case IBV_WR_RDMA_READ: + if (ib_wr->num_sge > I40IW_MAX_SGE_RD) { + err = -EINVAL; + break; + } + info.op_type = I40IW_OP_TYPE_RDMA_READ; + info.op.rdma_read.rem_addr.tag_off = ib_wr->wr.rdma.remote_addr; + info.op.rdma_read.rem_addr.stag = ib_wr->wr.rdma.rkey; + info.op.rdma_read.lo_addr.tag_off = ib_wr->sg_list->addr; + info.op.rdma_read.lo_addr.stag = ib_wr->sg_list->lkey; + info.op.rdma_read.lo_addr.len = ib_wr->sg_list->length; + ret = iwuqp->qp.ops.iw_rdma_read(&iwuqp->qp, &info, false, false); + if (ret) { + if (ret == I40IW_ERR_QP_TOOMANY_WRS_POSTED) + err = -ENOMEM; + else + err = -EINVAL; + } + break; + + default: + /* error */ + err = -EINVAL; + fprintf(stderr, PFX "%s: post work request failed, invalid opcode: 0x%x\n", __func__, ib_wr->opcode); + break; + } + + if (err) + break; + + ib_wr = ib_wr->next; + } + + if (err) + *bad_wr = ib_wr; + else + iwuqp->qp.ops.iw_qp_post_wr(&iwuqp->qp); + + pthread_spin_unlock(&iwuqp->lock); + + return err; +} + +/** + * i40iw_post_recv - post receive wr for user application + * @ib_wr: work request for receive + * @bad_wr: bad wr caused an error + */ +int i40iw_upost_recv(struct ibv_qp *ib_qp, struct ibv_recv_wr *ib_wr, struct ibv_recv_wr **bad_wr) +{ + struct i40iw_uqp *iwuqp = to_i40iw_uqp(ib_qp); + enum i40iw_status_code ret = 0; + int err = 0; + struct i40iw_post_rq_info post_recv; + struct i40iw_sge sg_list[I40IW_MAX_WQ_FRAGMENT_COUNT]; + + memset(&post_recv, 0, sizeof(post_recv)); + err = pthread_spin_lock(&iwuqp->lock); + if (err) + return err; + while (ib_wr) { + post_recv.num_sges = ib_wr->num_sge; + post_recv.wr_id = ib_wr->wr_id; + i40iw_copy_sg_list(sg_list, ib_wr->sg_list, ib_wr->num_sge); + post_recv.sg_list = sg_list; + ret = iwuqp->qp.ops.iw_post_receive(&iwuqp->qp, &post_recv); + if (ret) { + fprintf(stderr, PFX "%s: failed to post receives, status %d\n", __func__, ret); + if (ret == I40IW_ERR_QP_TOOMANY_WRS_POSTED) + err = -ENOMEM; + else + err = -EINVAL; + *bad_wr = ib_wr; + goto error; + } + ib_wr = ib_wr->next; + } + +error: + pthread_spin_unlock(&iwuqp->lock); + return err; +} + +/** + * i40iw_async_event - handle async events from driver + * @context: ibv_context + * @event: event received + */ +void i40iw_async_event(struct ibv_context *context, + struct ibv_async_event *event) +{ + struct i40iw_uqp *iwuqp; + + switch (event->event_type) { + case 
IBV_EVENT_QP_FATAL: + case IBV_EVENT_QP_ACCESS_ERR: + iwuqp = to_i40iw_uqp(event->element.qp); + iwuqp->qperr = 1; + break; + + default: + break; + } +} diff --git a/providers/ipathverbs/CMakeLists.txt b/providers/ipathverbs/CMakeLists.txt new file mode 100644 index 0000000..9031b86 --- /dev/null +++ b/providers/ipathverbs/CMakeLists.txt @@ -0,0 +1,11 @@ +rdma_provider(ipathverbs + ipathverbs.c + verbs.c + ) + +rdma_subst_install(FILES "truescale.conf.in" + DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}/modprobe.d/" + RENAME "truescale.conf") +install(FILES truescale-serdes.cmds + DESTINATION "${CMAKE_INSTALL_LIBEXECDIR}" + PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ OWNER_EXECUTE GROUP_EXECUTE WORLD_EXECUTE) diff --git a/providers/ipathverbs/COPYING b/providers/ipathverbs/COPYING new file mode 100644 index 0000000..cf55023 --- /dev/null +++ b/providers/ipathverbs/COPYING @@ -0,0 +1,35 @@ +Copyright (c) 2013. Intel Corporation. All rights reserved. +Copyright (c) 2007. QLogic Corp. All rights reserved. +Copyright (c) 2005. PathScale, Inc. All rights reserved. + +This software is available to you under a choice of one of two +licenses. You may choose to be licensed under the terms of the GNU +General Public License (GPL) Version 2, available from the file +COPYING in the main directory of this source tree, or the +OpenIB.org BSD license below: + + Redistribution and use in source and binary forms, with or + without modification, are permitted provided that the following + conditions are met: + + - Redistributions of source code must retain the above + copyright notice, this list of conditions and the following + disclaimer. + + - Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials + provided with the distribution. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +Patent licenses, if any, provided herein do not apply to +combinations of this program with other software, or any other +product whatsoever. diff --git a/providers/ipathverbs/dracut_check b/providers/ipathverbs/dracut_check new file mode 100644 index 0000000..fbed81f --- /dev/null +++ b/providers/ipathverbs/dracut_check @@ -0,0 +1,8 @@ +#!/bin/bash + +if [ -n "$hostonly" ]; then + lspci -n 2>/dev/null | grep -q -i "1077\|1fc1" + exit $? 
+fi + +exit 0 diff --git a/providers/ipathverbs/dracut_install b/providers/ipathverbs/dracut_install new file mode 100644 index 0000000..a7ef490 --- /dev/null +++ b/providers/ipathverbs/dracut_install @@ -0,0 +1,13 @@ +#!/bin/bash + +inst /etc/modprobe.d/truescale.conf +inst /usr/libexec/truescale-serdes.cmds + +# All files needed by truescale-serdes.cmds need to be present here +inst /sbin/lspci +inst /bin/grep +inst /bin/sed +inst /usr/bin/logger +inst /usr/sbin/dmidecode +inst /bin/readlink +inst /bin/echo diff --git a/providers/ipathverbs/dracut_kmod b/providers/ipathverbs/dracut_kmod new file mode 100644 index 0000000..d76ae80 --- /dev/null +++ b/providers/ipathverbs/dracut_kmod @@ -0,0 +1,4 @@ +#!/bin/bash + +instmods ib_qib + diff --git a/providers/ipathverbs/ipath-abi.h b/providers/ipathverbs/ipath-abi.h new file mode 100644 index 0000000..2b2e329 --- /dev/null +++ b/providers/ipathverbs/ipath-abi.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Patent licenses, if any, provided herein do not apply to + * combinations of this program with other software, or any other + * product whatsoever. + */ + +#ifndef IPATH_ABI_H +#define IPATH_ABI_H + +#include <infiniband/kern-abi.h> + +struct ipath_get_context_resp { + struct ib_uverbs_get_context_resp ibv_resp; + __u32 version; +}; + +struct ipath_create_cq_resp { + struct ib_uverbs_create_cq_resp ibv_resp; + __u64 offset; +}; + +struct ipath_resize_cq_resp { + struct ib_uverbs_resize_cq_resp ibv_resp; + __u64 offset; +}; + +struct ipath_create_qp_resp { + struct ib_uverbs_create_qp_resp ibv_resp; + __u64 offset; +}; + +struct ipath_create_srq_resp { + struct ib_uverbs_create_srq_resp ibv_resp; + __u64 offset; +}; + +struct ipath_modify_srq_cmd { + struct ibv_modify_srq ibv_cmd; + __u64 offset_addr; +}; + +#endif /* IPATH_ABI_H */ diff --git a/providers/ipathverbs/ipathverbs.c b/providers/ipathverbs/ipathverbs.c new file mode 100644 index 0000000..0e1a584 --- /dev/null +++ b/providers/ipathverbs/ipathverbs.c @@ -0,0 +1,210 @@ +/* + * Copyright (C) 2006-2007 QLogic Corporation, All rights reserved. + * Copyright (c) 2005. PathScale, Inc. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Patent licenses, if any, provided herein do not apply to + * combinations of this program with other software, or any other + * product whatsoever. + */ + +#include <config.h> + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> + +#include "ipathverbs.h" +#include "ipath-abi.h" + +static void ipath_free_context(struct ibv_context *ibctx); + +#ifndef PCI_VENDOR_ID_PATHSCALE +#define PCI_VENDOR_ID_PATHSCALE 0x1fc1 +#endif + +#ifndef PCI_VENDOR_ID_QLOGIC +#define PCI_VENDOR_ID_QLOGIC 0x1077 +#endif + +#ifndef PCI_DEVICE_ID_INFINIPATH_HT +#define PCI_DEVICE_ID_INFINIPATH_HT 0x000d +#endif + +#ifndef PCI_DEVICE_ID_INFINIPATH_PE800 +#define PCI_DEVICE_ID_INFINIPATH_PE800 0x0010 +#endif + +#ifndef PCI_DEVICE_ID_INFINIPATH_6220 +#define PCI_DEVICE_ID_INFINIPATH_6220 0x6220 +#endif + +#ifndef PCI_DEVICE_ID_INFINIPATH_7220 +#define PCI_DEVICE_ID_INFINIPATH_7220 0x7220 +#endif + +#ifndef PCI_DEVICE_ID_INFINIPATH_7322 +#define PCI_DEVICE_ID_INFINIPATH_7322 0x7322 +#endif + +#define HCA(v, d) \ + VERBS_PCI_MATCH(PCI_VENDOR_ID_##v, PCI_DEVICE_ID_INFINIPATH_##d, NULL) +static const struct verbs_match_ent hca_table[] = { + VERBS_DRIVER_ID(RDMA_DRIVER_QIB), + HCA(PATHSCALE, HT), + HCA(PATHSCALE, PE800), + HCA(QLOGIC, 6220), + HCA(QLOGIC, 7220), + HCA(QLOGIC, 7322), + {} +}; + +static const struct verbs_context_ops ipath_ctx_common_ops = { + .free_context = ipath_free_context, + .query_device = ipath_query_device, + .query_port = ipath_query_port, + + .alloc_pd = ipath_alloc_pd, + .dealloc_pd = ipath_free_pd, + + .reg_mr = ipath_reg_mr, + .dereg_mr = ipath_dereg_mr, + + .create_cq = ipath_create_cq, + .poll_cq = ipath_poll_cq, + .req_notify_cq = ibv_cmd_req_notify_cq, + .resize_cq = ipath_resize_cq, + .destroy_cq = ipath_destroy_cq, + + .create_srq = ipath_create_srq, + .modify_srq = ipath_modify_srq, + .query_srq = ipath_query_srq, + .destroy_srq = ipath_destroy_srq, + .post_srq_recv = ipath_post_srq_recv, + + .create_qp = ipath_create_qp, + .query_qp = ipath_query_qp, + .modify_qp = ipath_modify_qp, + .destroy_qp = ipath_destroy_qp, + + .post_send = ipath_post_send, + .post_recv = ipath_post_recv, + + .create_ah = ipath_create_ah, + .destroy_ah = 
ipath_destroy_ah, + + .attach_mcast = ibv_cmd_attach_mcast, + .detach_mcast = ibv_cmd_detach_mcast +}; + +static const struct verbs_context_ops ipath_ctx_v1_ops = { + .create_cq = ipath_create_cq_v1, + .poll_cq = ibv_cmd_poll_cq, + .resize_cq = ipath_resize_cq_v1, + .destroy_cq = ipath_destroy_cq_v1, + .create_srq = ipath_create_srq_v1, + .destroy_srq = ipath_destroy_srq_v1, + .modify_srq = ipath_modify_srq_v1, + .post_srq_recv = ibv_cmd_post_srq_recv, + .create_qp = ipath_create_qp_v1, + .destroy_qp = ipath_destroy_qp_v1, + .post_recv = ibv_cmd_post_recv, +}; + +static struct verbs_context *ipath_alloc_context(struct ibv_device *ibdev, + int cmd_fd, + void *private_data) +{ + struct ipath_context *context; + struct ibv_get_context cmd; + struct ib_uverbs_get_context_resp resp; + struct ipath_device *dev; + + context = verbs_init_and_alloc_context(ibdev, cmd_fd, context, ibv_ctx, + RDMA_DRIVER_QIB); + if (!context) + return NULL; + + if (ibv_cmd_get_context(&context->ibv_ctx, &cmd, + sizeof cmd, &resp, sizeof resp)) + goto err_free; + + verbs_set_ops(&context->ibv_ctx, &ipath_ctx_common_ops); + dev = to_idev(ibdev); + if (dev->abi_version == 1) + verbs_set_ops(&context->ibv_ctx, &ipath_ctx_v1_ops); + return &context->ibv_ctx; + +err_free: + verbs_uninit_context(&context->ibv_ctx); + free(context); + return NULL; +} + +static void ipath_free_context(struct ibv_context *ibctx) +{ + struct ipath_context *context = to_ictx(ibctx); + + verbs_uninit_context(&context->ibv_ctx); + free(context); +} + +static void ipath_uninit_device(struct verbs_device *verbs_device) +{ + struct ipath_device *dev = to_idev(&verbs_device->device); + + free(dev); +} + +static struct verbs_device * +ipath_device_alloc(struct verbs_sysfs_dev *sysfs_dev) +{ + struct ipath_device *dev; + + dev = calloc(1, sizeof(*dev)); + if (!dev) + return NULL; + + dev->abi_version = sysfs_dev->abi_ver; + + return &dev->ibv_dev; +} + +static const struct verbs_device_ops ipath_dev_ops = { + .name = "ipathverbs", + .match_min_abi_version = 0, + .match_max_abi_version = INT_MAX, + .match_table = hca_table, + .alloc_device = ipath_device_alloc, + .uninit_device = ipath_uninit_device, + .alloc_context = ipath_alloc_context, +}; +PROVIDER_DRIVER(ipathverbs, ipath_dev_ops); diff --git a/providers/ipathverbs/ipathverbs.h b/providers/ipathverbs/ipathverbs.h new file mode 100644 index 0000000..694f1f4 --- /dev/null +++ b/providers/ipathverbs/ipathverbs.h @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2006-2009 QLogic Corp. All rights reserved. + * Copyright (c) 2005. PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Patent licenses, if any, provided herein do not apply to + * combinations of this program with other software, or any other + * product whatsoever. + */ + +#ifndef IPATH_H +#define IPATH_H + +#include <endian.h> +#include <pthread.h> +#include <stddef.h> +#include <stdatomic.h> + +#include <infiniband/driver.h> +#include <infiniband/verbs.h> + +#define PFX "ipath: " + +struct ipath_device { + struct verbs_device ibv_dev; + int abi_version; +}; + +struct ipath_context { + struct verbs_context ibv_ctx; +}; + +/* + * This structure needs to have the same size and offsets as + * the kernel's ib_wc structure since it is memory mapped. + */ +struct ipath_wc { + uint64_t wr_id; + enum ibv_wc_status status; + enum ibv_wc_opcode opcode; + uint32_t vendor_err; + uint32_t byte_len; + uint32_t imm_data; /* in network byte order */ + uint32_t qp_num; + uint32_t src_qp; + enum ibv_wc_flags wc_flags; + uint16_t pkey_index; + uint16_t slid; + uint8_t sl; + uint8_t dlid_path_bits; + uint8_t port_num; +}; + +struct ipath_cq_wc { + _Atomic(uint32_t) head; + _Atomic(uint32_t) tail; + struct ipath_wc queue[1]; +}; + +struct ipath_cq { + struct ibv_cq ibv_cq; + struct ipath_cq_wc *queue; + pthread_spinlock_t lock; +}; + +/* + * Receive work request queue entry. + * The size of the sg_list is determined when the QP is created and stored + * in qp->r_max_sge. + */ +struct ipath_rwqe { + uint64_t wr_id; + uint8_t num_sge; + uint8_t padding[7]; + struct ibv_sge sg_list[0]; +}; + +/* + * This structure is used to contain the head pointer, tail pointer, + * and receive work queue entries as a single memory allocation so + * it can be mmap'ed into user space. + * Note that the wq array elements are variable size so you can't + * just index into the array to get the N'th element; + * use get_rwqe_ptr() instead. + */ +struct ipath_rwq { + _Atomic(uint32_t) head; /* new requests posted to the head. */ + _Atomic(uint32_t) tail; /* receives pull requests from here. */ + struct ipath_rwqe wq[0]; +}; + +struct ipath_rq { + struct ipath_rwq *rwq; + pthread_spinlock_t lock; + uint32_t size; + uint32_t max_sge; +}; + +struct ipath_qp { + struct ibv_qp ibv_qp; + struct ipath_rq rq; +}; + +struct ipath_srq { + struct ibv_srq ibv_srq; + struct ipath_rq rq; +}; + +#define to_ixxx(xxx, type) container_of(ib##xxx, struct ipath_##type, ibv_##xxx) + +static inline struct ipath_context *to_ictx(struct ibv_context *ibctx) +{ + return container_of(ibctx, struct ipath_context, ibv_ctx.context); +} + +static inline struct ipath_device *to_idev(struct ibv_device *ibdev) +{ + return container_of(ibdev, struct ipath_device, ibv_dev.device); +} + +static inline struct ipath_cq *to_icq(struct ibv_cq *ibcq) +{ + return to_ixxx(cq, cq); +} + +static inline struct ipath_qp *to_iqp(struct ibv_qp *ibqp) +{ + return to_ixxx(qp, qp); +} + +static inline struct ipath_srq *to_isrq(struct ibv_srq *ibsrq) +{ + return to_ixxx(srq, srq); +} + +/* + * Since struct ipath_rwqe is not a fixed size, we can't simply index into + * struct ipath_rq.wq. 
This function does the array index computation. + */ +static inline struct ipath_rwqe *get_rwqe_ptr(struct ipath_rq *rq, + unsigned n) +{ + return (struct ipath_rwqe *) + ((char *) rq->rwq->wq + + (sizeof(struct ipath_rwqe) + + rq->max_sge * sizeof(struct ibv_sge)) * n); +} + +extern int ipath_query_device(struct ibv_context *context, + struct ibv_device_attr *attr); + +extern int ipath_query_port(struct ibv_context *context, uint8_t port, + struct ibv_port_attr *attr); + +struct ibv_pd *ipath_alloc_pd(struct ibv_context *pd); + +int ipath_free_pd(struct ibv_pd *pd); + +struct ibv_mr *ipath_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + uint64_t hca_va, int access); + +int ipath_dereg_mr(struct verbs_mr *vmr); + +struct ibv_cq *ipath_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector); + +struct ibv_cq *ipath_create_cq_v1(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector); + +int ipath_resize_cq(struct ibv_cq *cq, int cqe); + +int ipath_resize_cq_v1(struct ibv_cq *cq, int cqe); + +int ipath_destroy_cq(struct ibv_cq *cq); + +int ipath_destroy_cq_v1(struct ibv_cq *cq); + +int ipath_poll_cq(struct ibv_cq *cq, int ne, struct ibv_wc *wc); + +struct ibv_qp *ipath_create_qp(struct ibv_pd *pd, + struct ibv_qp_init_attr *attr); + +struct ibv_qp *ipath_create_qp_v1(struct ibv_pd *pd, + struct ibv_qp_init_attr *attr); + +int ipath_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr); + +int ipath_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask); + +int ipath_destroy_qp(struct ibv_qp *qp); + +int ipath_destroy_qp_v1(struct ibv_qp *qp); + +int ipath_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr); + +int ipath_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); + +struct ibv_srq *ipath_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *attr); + +struct ibv_srq *ipath_create_srq_v1(struct ibv_pd *pd, + struct ibv_srq_init_attr *attr); + +int ipath_modify_srq(struct ibv_srq *srq, + struct ibv_srq_attr *attr, + int attr_mask); + +int ipath_modify_srq_v1(struct ibv_srq *srq, + struct ibv_srq_attr *attr, + int attr_mask); + +int ipath_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr); + +int ipath_destroy_srq(struct ibv_srq *srq); + +int ipath_destroy_srq_v1(struct ibv_srq *srq); + +int ipath_post_srq_recv(struct ibv_srq *srq, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); + +struct ibv_ah *ipath_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr); + +int ipath_destroy_ah(struct ibv_ah *ah); + +#endif /* IPATH_H */ diff --git a/providers/ipathverbs/truescale-serdes.cmds b/providers/ipathverbs/truescale-serdes.cmds new file mode 100755 index 0000000..0f89337 --- /dev/null +++ b/providers/ipathverbs/truescale-serdes.cmds @@ -0,0 +1,257 @@ +#!/bin/bash +# Copyright (c) 2013 Intel Corporation. All rights reserved. +# Copyright (c) 2010 QLogic Corporation. +# All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. 
You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# This script does truescale (qib) adapter-specific actions, and is +# sourced during boot after the ib_qib module is loaded. The stop +# operation is deprecated. It isn't intended for standalone use. + +# base name in /sys/class +PATH=/sbin:/bin:/usr/sbin:/usr/bin:$PATH +export PATH +qb=/sys/class/infiniband/qib +serdes_parm=txselect + +if [ -r /etc/rdma/rdma.conf ]; then + IB_CONFIG=/etc/rdma/rdma.conf +else + IB_CONFIG=/etc/infiniband/openib.conf +fi +if [ -f $IB_CONFIG ]; then + . $IB_CONFIG +fi + +# If the user specifies an override, or the setting is omitted from the config file, +# then default to the new back plane version. +if [ -z $QIB_QME_BPVER ]; then + QIB_QME_BPVER=1 +fi + +warn_and_log() +{ + echo "$0: $@" + logger -t infinipath "$@" +} + +setup_qmh() +{ + local -i nunit=0 bay bl2xB=0 full=0 + local parmf sysinfo bayinfo mez1bus mez2bus mez3bus=0 tbay + local -a parm bay_h1 + for parm in parameters/${serdes_parm} ${serdes_parm}; do + if [ -e /sys/module/ib_qib/$parm ]; then + parmf=/sys/module/ib_qib/$parm + break; + fi + done + if [ ! "$parmf" ]; then + warn_and_log Unable to find ${serdes_parm} parameter + return + fi + sysinfo="$(PATH=/sbin:/usr/sbin:$PATH; dmidecode -t system | \ + sed -e '/^Handle/d' -e '/^[ \t]*$/d' -e 's/[ \t]*$//' )" + if [ ! "$sysinfo" ]; then + warn_and_log Unable to determine system type + return + fi + bayinfo="$(PATH=/sbin:/usr/sbin:$PATH; dmidecode -t 204)" + if [ ! 
"$bayinfo" ]; then + warn_and_log Unable to determine bay + return + fi + case "${bayinfo}" in + *Server*Bay:*) tbay=$(PATH=/sbin:/usr/sbin:$PATH; dmidecode -t 204 | \ + sed -n -e 's/[ \t]*$//' -e 's/[ \t]*Server Bay:[ \t]*//p') ;; + *) tbay=$(PATH=/sbin:/usr/sbin:$PATH; dmidecode -t 204 | \ + sed -n -e '1,/BladeSystem/d' -e 's/ *$//' -e 's/^\t\t*//' \ + -e '/^[0-9][AB]*$/p' -e '/^[0-9][0-9][AB]*$/p') ;; + esac + + read pbase < $parmf + parm=($(echo ${qb}*)) + nunit=${#parm[*]} + + # [0] is a dummy in these arrays, bay #'ing starts at 1 + # H1 value, per bay (same for both ports) + m1_bay_h1=(0 8 7 7 7 7 6 6 6 8 7 7 7 7 6 6 7) + m2_bay_h1=(0 11 11 11 11 11 11 10 11 11 11 11 11 10 10 10 10) + m3_bay_h1=(0 11 11 11 11 10 10 10 10) + + # tx serdes index per bay for mez1 (either port) + mez1p1_idx=(0 2 2 17 17 17 1 1 1 2 1 17 17 16 2 18 16) + # tx serdes setting for mez1 p2 (only used on full-height blades) + mez1p2_idx=(0 4 4 3 3 3 2 4 4) + # tx serdes index per bay for mez2 port 1 + mez2p1_idx=(0 2 2 17 17 17 1 1 1 2 1 17 17 16 2 18 1) + # tx serdes index per bay for mez2 port 2 + mez2p2_idx=(0 2 2 19 1 1 1 1 1 2 1 18 17 1 19 1 1) + # tx serdes index per bay for mez3 port 1 (mez3 only on full-height blades) + mez3p1_idx=(0 2 1 18 17 1 19 1 1) + # tx serdes index per bay for mez3 port 2 (mez3 only on full-height blades) + mez3p2_idx=(0 2 1 17 17 16 2 18 1) + + case "${sysinfo}" in + *BL280[cC]*) mez1bus=3 mez2bus=6 bay=$tbay ;; + # both nodes on the 2x220 blade have bus 3, only one mez, but + # they connect to different switches through different paths + # so A and B have different parameters. They connect to + # the switch as if they were the mez2 on other blade types, + # with port 1 on mez2 for A node and port 2 on mez2 + # for the B node + *BL2x220[cC]*) + mez1bus=3 mez2bus=3 bay=${tbay%[AB]} + case "${tbay}" in + *A) bl2xB=${mez2p1_idx[$bay]} ;; + *B) bl2xB=${mez2p2_idx[$bay]} ;; + esac + ;; + *BL460[cC]*) mez1bus=6 mez2bus=9 bay=$tbay ;; + *BL465[cC]*) mez1bus=5 mez2bus=8 bay=$tbay ;; + *BL490[cC]*) mez1bus=6 mez2bus=7 bay=$tbay ;; + *BL685[cC]*) mez1bus=41 mez2bus=6 mez3bus=44 full=1 bay=$(($tbay % 9)) ;; + *) warn_and_log Unknown blade type "$sysinfo" + return ;; + esac + + # mez1 only has port1 connected, mez2, mez3 can have both ports + + # If only one card, and two mez possible, we have to figure out which + # mez we are plugged into. + # On RHEL4U8, we look in the driver subdir, all others + # in the device/driver subdir for the pcie bus. 
+ pciprefix="[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]:" + if [ ${bl2xB} -ne 0 ]; then + pbase="${pbase} 0,1=${bl2xB},${m2_bay_h1[$bay]}" + else while [ $nunit -ne 0 ]; do + (( nunit-- )) + buspath=$(readlink -m ${qb}${nunit}/device) + if [ -n "$(echo ${buspath} | grep "${pciprefix}$(printf "%02d" ${mez1bus}):")" ]; then + pbase="${pbase} ${nunit},1=${mez1p1_idx[$bay]},${m1_bay_h1[$bay]}" + if [ ${full} -eq 1 ]; then + pbase="${pbase} ${nunit},2=${mez1p2_idx[$bay]},${m1_bay_h1[$bay]}" + fi + elif [ -n "$(echo ${buspath} | grep "${pciprefix}$(printf "%02d" ${mez2bus}):")" ]; then + pbase="${pbase} ${nunit},1=${mez2p1_idx[$bay]},${m2_bay_h1[$bay]}" + pbase="${pbase} ${nunit},2=${mez2p2_idx[$bay]},${m2_bay_h1[$bay]}" + elif [ -n "$(echo ${buspath} | grep "${pciprefix}$(printf "%02d" ${mez3bus}):")" ]; then + pbase="${pbase} ${nunit},1=${mez3p1_idx[$bay]},${m3_bay_h1[$bay]}" + pbase="${pbase} ${nunit},2=${mez3p2_idx[$bay]},${m3_bay_h1[$bay]}" + else + warn_and_log Mismatch on mezbus ${mez1bus},${mez2bus},${mez3bus} \ + and unit ${nunit}, no serdes setup + fi + done + fi + echo -n ${pbase} > $parmf +} + + + +setup_qme() +{ + local parm parmf sn pbase + local -i nunit=0 bay idx bpver=${QIB_QME_BPVER:-1} + local -a bp0_idx bp1_idx set + + # tx settings for Dell Backplane v1.0 + bp0_idx=( 0 22 23 24 25 26 24 27 28 22 23 24 25 26 24 27 28 ) + # tx settings for Dell Backplane v1.1 + bp1_idx=( 0 29 29 30 31 32 33 30 29 29 29 30 31 32 33 30 29 ) + + for parm in parameters/${serdes_parm} ${serdes_parm}; do + if [ -e /sys/module/ib_qib/$parm ]; then + parmf=/sys/module/ib_qib/$parm + break; + fi + done + if [ ! "$parmf" ]; then + warn_and_log Unable to find ${serdes_parm} parameter + return + fi + + read pbase < $parmf + parm=( $(echo ${qb}*) ) + nunit=${#parm[*]} + + if [ -e /sys/module/ib_qib/parameters/qme_bp ]; then + read bpver < /sys/module/ib_qib/parameters/qme_bp + if [ ${bpver} -ne 0 -a ${bpver} -ne 1 ]; then + warn_and_log "Invalid Dell backplane version (${bpver}). Defaulting to 1." + bpver=1 + fi + fi + eval 'set=( ${bp'${bpver}'_idx[@]} )' + + # we get two serial numbers normally, use 2nd if present, else first + sn="$(dmidecode -t 2 | grep -i serial | tail -1)" + case ${sn} in + *[sS]erial\ [nN]umber*) + bay="$(echo $sn | sed -e 's/\.$//' -e 's/.*\.0*//' -e 's/[abcd]$//')" + if [ ${bay} -gt ${#set[@]} ]; then + warn_and_log Unexpected QME7342 bay info: ${sn}, no Tx params + return + fi + idx=${set[bay]} + # H1 is same for all QME bays, so no need to specify. + while [ $nunit -ne 0 ]; do + (( nunit-- )) + pbase="${pbase} ${nunit},1=${idx} ${nunit},2=${idx}" + done + echo -n ${pbase} > $parmf + ;; + *) warn_and_log No QME7342 bay information, no Tx params + return;; + esac +} + +has_qib=$(lspci -n 2>/dev/null | grep -i "1077\|1fc1") +if [ ! 
"${has_qib}" ]; then + exit 0 +fi + +case "$1" in +start) + has_qmh7342=$(grep QMH7342 ${qb}*/hca_type 2>/dev/null) + if [ "${has_qmh7342}" ]; then + setup_qmh + else + has_qme7342=$(grep QME7342 ${qb}*/hca_type 2>/dev/null) + if [ "${has_qme7342}" ]; then + setup_qme + fi + fi + + ;; +stop) + warn_and_log stop operation deprecated + ;; +esac diff --git a/providers/ipathverbs/truescale.conf.in b/providers/ipathverbs/truescale.conf.in new file mode 100644 index 0000000..e2827d9 --- /dev/null +++ b/providers/ipathverbs/truescale.conf.in @@ -0,0 +1 @@ +install ib_qib modprobe -i ib_qib $CMDLINE_OPTS && @CMAKE_INSTALL_FULL_LIBEXECDIR@/truescale-serdes.cmds start diff --git a/providers/ipathverbs/verbs.c b/providers/ipathverbs/verbs.c new file mode 100644 index 0000000..505ea58 --- /dev/null +++ b/providers/ipathverbs/verbs.c @@ -0,0 +1,679 @@ +/* + * Copyright (c) 2006-2009 QLogic Corp. All rights reserved. + * Copyright (c) 2005. PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Patent licenses, if any, provided herein do not apply to + * combinations of this program with other software, or any other + * product whatsoever. 
+ */ + +#include <config.h> + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <pthread.h> +#include <sys/mman.h> +#include <errno.h> + +#include "ipathverbs.h" +#include "ipath-abi.h" + +int ipath_query_device(struct ibv_context *context, + struct ibv_device_attr *attr) +{ + struct ibv_query_device cmd; + uint64_t raw_fw_ver; + unsigned major, minor, sub_minor; + int ret; + + ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, + &cmd, sizeof cmd); + if (ret) + return ret; + + major = (raw_fw_ver >> 32) & 0xffff; + minor = (raw_fw_ver >> 16) & 0xffff; + sub_minor = raw_fw_ver & 0xffff; + + snprintf(attr->fw_ver, sizeof attr->fw_ver, + "%d.%d.%d", major, minor, sub_minor); + + return 0; +} + +int ipath_query_port(struct ibv_context *context, uint8_t port, + struct ibv_port_attr *attr) +{ + struct ibv_query_port cmd; + + return ibv_cmd_query_port(context, port, attr, &cmd, sizeof cmd); +} + +struct ibv_pd *ipath_alloc_pd(struct ibv_context *context) +{ + struct ibv_alloc_pd cmd; + struct ib_uverbs_alloc_pd_resp resp; + struct ibv_pd *pd; + + pd = malloc(sizeof *pd); + if (!pd) + return NULL; + + if (ibv_cmd_alloc_pd(context, pd, &cmd, sizeof cmd, + &resp, sizeof resp)) { + free(pd); + return NULL; + } + + return pd; +} + +int ipath_free_pd(struct ibv_pd *pd) +{ + int ret; + + ret = ibv_cmd_dealloc_pd(pd); + if (ret) + return ret; + + free(pd); + return 0; +} + +struct ibv_mr *ipath_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + uint64_t hca_va, int access) +{ + struct verbs_mr *vmr; + struct ibv_reg_mr cmd; + struct ib_uverbs_reg_mr_resp resp; + int ret; + + vmr = malloc(sizeof(*vmr)); + if (!vmr) + return NULL; + + ret = ibv_cmd_reg_mr(pd, addr, length, hca_va, access, vmr, &cmd, + sizeof(cmd), &resp, sizeof(resp)); + if (ret) { + free(vmr); + return NULL; + } + + return &vmr->ibv_mr; +} + +int ipath_dereg_mr(struct verbs_mr *vmr) +{ + int ret; + + ret = ibv_cmd_dereg_mr(vmr); + if (ret) + return ret; + + free(vmr); + return 0; +} + +struct ibv_cq *ipath_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector) +{ + struct ipath_cq *cq; + struct ipath_create_cq_resp resp; + int ret; + size_t size; + + cq = malloc(sizeof *cq); + if (!cq) + return NULL; + + ret = ibv_cmd_create_cq(context, cqe, channel, comp_vector, + &cq->ibv_cq, NULL, 0, + &resp.ibv_resp, sizeof resp); + if (ret) { + free(cq); + return NULL; + } + + size = sizeof(struct ipath_cq_wc) + sizeof(struct ipath_wc) * cqe; + cq->queue = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, + context->cmd_fd, resp.offset); + if ((void *) cq->queue == MAP_FAILED) { + ibv_cmd_destroy_cq(&cq->ibv_cq); + free(cq); + return NULL; + } + + pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE); + return &cq->ibv_cq; +} + +struct ibv_cq *ipath_create_cq_v1(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector) +{ + struct ibv_cq *cq; + int ret; + + cq = malloc(sizeof *cq); + if (!cq) + return NULL; + + ret = ibv_cmd_create_cq(context, cqe, channel, comp_vector, + cq, NULL, 0, NULL, 0); + if (ret) { + free(cq); + return NULL; + } + + return cq; +} + +int ipath_resize_cq(struct ibv_cq *ibcq, int cqe) +{ + struct ipath_cq *cq = to_icq(ibcq); + struct ibv_resize_cq cmd; + struct ipath_resize_cq_resp resp; + size_t size; + int ret; + + pthread_spin_lock(&cq->lock); + /* Save the old size so we can unmap the queue. 
*/ + size = sizeof(struct ipath_cq_wc) + + (sizeof(struct ipath_wc) * cq->ibv_cq.cqe); + ret = ibv_cmd_resize_cq(ibcq, cqe, &cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp); + if (ret) { + pthread_spin_unlock(&cq->lock); + return ret; + } + (void) munmap(cq->queue, size); + size = sizeof(struct ipath_cq_wc) + + (sizeof(struct ipath_wc) * cq->ibv_cq.cqe); + cq->queue = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, + ibcq->context->cmd_fd, resp.offset); + ret = errno; + pthread_spin_unlock(&cq->lock); + if ((void *) cq->queue == MAP_FAILED) + return ret; + return 0; +} + +int ipath_resize_cq_v1(struct ibv_cq *ibcq, int cqe) +{ + struct ibv_resize_cq cmd; + struct ib_uverbs_resize_cq_resp resp; + + return ibv_cmd_resize_cq(ibcq, cqe, &cmd, sizeof cmd, + &resp, sizeof resp); +} + +int ipath_destroy_cq(struct ibv_cq *ibcq) +{ + struct ipath_cq *cq = to_icq(ibcq); + int ret; + + ret = ibv_cmd_destroy_cq(ibcq); + if (ret) + return ret; + + (void) munmap(cq->queue, sizeof(struct ipath_cq_wc) + + (sizeof(struct ipath_wc) * cq->ibv_cq.cqe)); + free(cq); + return 0; +} + +int ipath_destroy_cq_v1(struct ibv_cq *ibcq) +{ + int ret; + + ret = ibv_cmd_destroy_cq(ibcq); + if (!ret) + free(ibcq); + return ret; +} + +int ipath_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc) +{ + struct ipath_cq *cq = to_icq(ibcq); + struct ipath_cq_wc *q; + int npolled; + uint32_t tail; + + pthread_spin_lock(&cq->lock); + q = cq->queue; + tail = atomic_load_explicit(&q->tail, memory_order_relaxed); + for (npolled = 0; npolled < ne; ++npolled, ++wc) { + if (tail == atomic_load(&q->head)) + break; + + /* Make sure entry is read after head index is read. */ + atomic_thread_fence(memory_order_acquire); + memcpy(wc, &q->queue[tail], sizeof(*wc)); + if (tail == cq->ibv_cq.cqe) + tail = 0; + else + tail++; + } + atomic_store(&q->tail, tail); + pthread_spin_unlock(&cq->lock); + + return npolled; +} + +struct ibv_qp *ipath_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr) +{ + struct ibv_create_qp cmd; + struct ipath_create_qp_resp resp; + struct ipath_qp *qp; + int ret; + size_t size; + + qp = malloc(sizeof *qp); + if (!qp) + return NULL; + + ret = ibv_cmd_create_qp(pd, &qp->ibv_qp, attr, &cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp); + if (ret) { + free(qp); + return NULL; + } + + if (attr->srq) { + qp->rq.size = 0; + qp->rq.max_sge = 0; + qp->rq.rwq = NULL; + } else { + qp->rq.size = attr->cap.max_recv_wr + 1; + qp->rq.max_sge = attr->cap.max_recv_sge; + size = sizeof(struct ipath_rwq) + + (sizeof(struct ipath_rwqe) + + (sizeof(struct ibv_sge) * qp->rq.max_sge)) * + qp->rq.size; + qp->rq.rwq = mmap(NULL, size, + PROT_READ | PROT_WRITE, MAP_SHARED, + pd->context->cmd_fd, resp.offset); + if ((void *) qp->rq.rwq == MAP_FAILED) { + ibv_cmd_destroy_qp(&qp->ibv_qp); + free(qp); + return NULL; + } + } + + pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE); + return &qp->ibv_qp; +} + +struct ibv_qp *ipath_create_qp_v1(struct ibv_pd *pd, + struct ibv_qp_init_attr *attr) +{ + struct ibv_create_qp cmd; + struct ib_uverbs_create_qp_resp resp; + struct ibv_qp *qp; + int ret; + + qp = malloc(sizeof *qp); + if (!qp) + return NULL; + + ret = ibv_cmd_create_qp(pd, qp, attr, &cmd, sizeof cmd, + &resp, sizeof resp); + if (ret) { + free(qp); + return NULL; + } + + return qp; +} + +int ipath_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr) +{ + struct ibv_query_qp cmd; + + return ibv_cmd_query_qp(qp, attr, attr_mask, init_attr, + &cmd, sizeof cmd); +} + +int 
ipath_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask) +{ + struct ibv_modify_qp cmd = {}; + + return ibv_cmd_modify_qp(qp, attr, attr_mask, &cmd, sizeof cmd); +} + +int ipath_destroy_qp(struct ibv_qp *ibqp) +{ + struct ipath_qp *qp = to_iqp(ibqp); + int ret; + + ret = ibv_cmd_destroy_qp(ibqp); + if (ret) + return ret; + + if (qp->rq.rwq) { + size_t size; + + size = sizeof(struct ipath_rwq) + + (sizeof(struct ipath_rwqe) + + (sizeof(struct ibv_sge) * qp->rq.max_sge)) * + qp->rq.size; + (void) munmap(qp->rq.rwq, size); + } + free(qp); + return 0; +} + +int ipath_destroy_qp_v1(struct ibv_qp *ibqp) +{ + int ret; + + ret = ibv_cmd_destroy_qp(ibqp); + if (!ret) + free(ibqp); + return ret; +} + +int ipath_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr) +{ + unsigned wr_count; + struct ibv_send_wr *i; + + /* Sanity check the number of WRs being posted */ + for (i = wr, wr_count = 0; i; i = i->next) + if (++wr_count > 10) + goto iter; + + return ibv_cmd_post_send(qp, wr, bad_wr); + +iter: + do { + struct ibv_send_wr *next; + int ret; + + next = i->next; + i->next = NULL; + ret = ibv_cmd_post_send(qp, wr, bad_wr); + i->next = next; + if (ret) + return ret; + if (next == NULL) + break; + wr = next; + for (i = wr, wr_count = 0; i->next; i = i->next) + if (++wr_count > 2) + break; + } while (1); + return 0; +} + +static int post_recv(struct ipath_rq *rq, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct ibv_recv_wr *i; + struct ipath_rwq *rwq; + struct ipath_rwqe *wqe; + uint32_t head; + int n, ret; + + pthread_spin_lock(&rq->lock); + rwq = rq->rwq; + head = atomic_load_explicit(&rwq->head, memory_order_relaxed); + for (i = wr; i; i = i->next) { + if ((unsigned) i->num_sge > rq->max_sge) { + ret = EINVAL; + goto bad; + } + wqe = get_rwqe_ptr(rq, head); + if (++head >= rq->size) + head = 0; + if (head == atomic_load(&rwq->tail)) { + ret = ENOMEM; + goto bad; + } + wqe->wr_id = i->wr_id; + wqe->num_sge = i->num_sge; + for (n = 0; n < wqe->num_sge; n++) + wqe->sg_list[n] = i->sg_list[n]; + + /* Make sure queue entry is written before the head index. 
*/ + atomic_thread_fence(memory_order_release); + atomic_store(&rwq->head, head); + } + ret = 0; + goto done; + +bad: + if (bad_wr) + *bad_wr = i; +done: + pthread_spin_unlock(&rq->lock); + return ret; +} + +int ipath_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct ipath_qp *qp = to_iqp(ibqp); + + return post_recv(&qp->rq, wr, bad_wr); +} + +struct ibv_srq *ipath_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *attr) +{ + struct ipath_srq *srq; + struct ibv_create_srq cmd; + struct ipath_create_srq_resp resp; + int ret; + size_t size; + + srq = malloc(sizeof *srq); + if (srq == NULL) + return NULL; + + ret = ibv_cmd_create_srq(pd, &srq->ibv_srq, attr, &cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp); + if (ret) { + free(srq); + return NULL; + } + + srq->rq.size = attr->attr.max_wr + 1; + srq->rq.max_sge = attr->attr.max_sge; + size = sizeof(struct ipath_rwq) + + (sizeof(struct ipath_rwqe) + + (sizeof(struct ibv_sge) * srq->rq.max_sge)) * srq->rq.size; + srq->rq.rwq = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, + pd->context->cmd_fd, resp.offset); + if ((void *) srq->rq.rwq == MAP_FAILED) { + ibv_cmd_destroy_srq(&srq->ibv_srq); + free(srq); + return NULL; + } + + pthread_spin_init(&srq->rq.lock, PTHREAD_PROCESS_PRIVATE); + return &srq->ibv_srq; +} + +struct ibv_srq *ipath_create_srq_v1(struct ibv_pd *pd, + struct ibv_srq_init_attr *attr) +{ + struct ibv_srq *srq; + struct ibv_create_srq cmd; + struct ib_uverbs_create_srq_resp resp; + int ret; + + srq = malloc(sizeof *srq); + if (srq == NULL) + return NULL; + + ret = ibv_cmd_create_srq(pd, srq, attr, &cmd, sizeof cmd, + &resp, sizeof resp); + if (ret) { + free(srq); + return NULL; + } + + return srq; +} + +int ipath_modify_srq(struct ibv_srq *ibsrq, + struct ibv_srq_attr *attr, + int attr_mask) +{ + struct ipath_srq *srq = to_isrq(ibsrq); + struct ipath_modify_srq_cmd cmd; + __u64 offset; + size_t size = 0; /* Shut up gcc */ + int ret; + + if (attr_mask & IBV_SRQ_MAX_WR) { + pthread_spin_lock(&srq->rq.lock); + /* Save the old size so we can unmmap the queue. */ + size = sizeof(struct ipath_rwq) + + (sizeof(struct ipath_rwqe) + + (sizeof(struct ibv_sge) * srq->rq.max_sge)) * + srq->rq.size; + } + cmd.offset_addr = (uintptr_t) &offset; + ret = ibv_cmd_modify_srq(ibsrq, attr, attr_mask, + &cmd.ibv_cmd, sizeof cmd); + if (ret) { + if (attr_mask & IBV_SRQ_MAX_WR) + pthread_spin_unlock(&srq->rq.lock); + return ret; + } + if (attr_mask & IBV_SRQ_MAX_WR) { + (void) munmap(srq->rq.rwq, size); + srq->rq.size = attr->max_wr + 1; + size = sizeof(struct ipath_rwq) + + (sizeof(struct ipath_rwqe) + + (sizeof(struct ibv_sge) * srq->rq.max_sge)) * + srq->rq.size; + srq->rq.rwq = mmap(NULL, size, + PROT_READ | PROT_WRITE, MAP_SHARED, + ibsrq->context->cmd_fd, offset); + pthread_spin_unlock(&srq->rq.lock); + /* XXX Now we have no receive queue. 
*/ + if ((void *) srq->rq.rwq == MAP_FAILED) + return errno; + } + return 0; +} + +int ipath_modify_srq_v1(struct ibv_srq *ibsrq, + struct ibv_srq_attr *attr, + int attr_mask) +{ + struct ibv_modify_srq cmd; + + return ibv_cmd_modify_srq(ibsrq, attr, attr_mask, + &cmd, sizeof cmd); +} + +int ipath_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr) +{ + struct ibv_query_srq cmd; + + return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd); +} + +int ipath_destroy_srq(struct ibv_srq *ibsrq) +{ + struct ipath_srq *srq = to_isrq(ibsrq); + size_t size; + int ret; + + ret = ibv_cmd_destroy_srq(ibsrq); + if (ret) + return ret; + + size = sizeof(struct ipath_rwq) + + (sizeof(struct ipath_rwqe) + + (sizeof(struct ibv_sge) * srq->rq.max_sge)) * srq->rq.size; + (void) munmap(srq->rq.rwq, size); + free(srq); + return 0; +} + +int ipath_destroy_srq_v1(struct ibv_srq *ibsrq) +{ + int ret; + + ret = ibv_cmd_destroy_srq(ibsrq); + if (!ret) + free(ibsrq); + return ret; +} + +int ipath_post_srq_recv(struct ibv_srq *ibsrq, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct ipath_srq *srq = to_isrq(ibsrq); + + return post_recv(&srq->rq, wr, bad_wr); +} + +struct ibv_ah *ipath_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) +{ + struct ibv_ah *ah; + struct ib_uverbs_create_ah_resp resp; + + ah = malloc(sizeof *ah); + if (ah == NULL) + return NULL; + + memset(&resp, 0, sizeof(resp)); + if (ibv_cmd_create_ah(pd, ah, attr, &resp, sizeof(resp))) { + free(ah); + return NULL; + } + + return ah; +} + +int ipath_destroy_ah(struct ibv_ah *ah) +{ + int ret; + + ret = ibv_cmd_destroy_ah(ah); + if (ret) + return ret; + + free(ah); + return 0; +} diff --git a/providers/mlx4/CMakeLists.txt b/providers/mlx4/CMakeLists.txt new file mode 100644 index 0000000..ad849f1 --- /dev/null +++ b/providers/mlx4/CMakeLists.txt @@ -0,0 +1,18 @@ +rdma_shared_provider(mlx4 libmlx4.map + 1 1.0.${PACKAGE_VERSION} + buf.c + cq.c + dbrec.c + mlx4.c + qp.c + srq.c + verbs.c +) + +publish_headers(infiniband + mlx4dv.h +) + +install(FILES "mlx4.conf" DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}/modprobe.d/") + +rdma_pkg_config("mlx4" "libibverbs" "${CMAKE_THREAD_LIBS_INIT}") diff --git a/providers/mlx4/buf.c b/providers/mlx4/buf.c new file mode 100644 index 0000000..2983d5d --- /dev/null +++ b/providers/mlx4/buf.c @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2006, 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <config.h> + +#include <stdlib.h> +#include <errno.h> +#include <sys/mman.h> + +#include "mlx4.h" + +static void mlx4_free_buf_extern(struct mlx4_context *ctx, struct mlx4_buf *buf) +{ + ibv_dofork_range(buf->buf, buf->length); + ctx->extern_alloc.free(buf->buf, ctx->extern_alloc.data); +} + +static int mlx4_alloc_buf_extern(struct mlx4_context *ctx, struct mlx4_buf *buf, + size_t size) +{ + void *addr; + + addr = ctx->extern_alloc.alloc(size, ctx->extern_alloc.data); + if (addr || size == 0) { + if (ibv_dontfork_range(addr, size)) { + ctx->extern_alloc.free(addr, + ctx->extern_alloc.data); + return -1; + } + buf->buf = addr; + buf->length = size; + return 0; + } + + return -1; +} + +static bool mlx4_is_extern_alloc(struct mlx4_context *context) +{ + return context->extern_alloc.alloc && context->extern_alloc.free; +} + +int mlx4_alloc_buf(struct mlx4_context *ctx, struct mlx4_buf *buf, + size_t size, int page_size) +{ + int ret; + + if (mlx4_is_extern_alloc(ctx)) + return mlx4_alloc_buf_extern(ctx, buf, size); + + buf->length = align(size, page_size); + buf->buf = mmap(NULL, buf->length, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (buf->buf == MAP_FAILED) + return errno; + + ret = ibv_dontfork_range(buf->buf, size); + if (ret) + munmap(buf->buf, buf->length); + + return ret; +} + +void mlx4_free_buf(struct mlx4_context *context, struct mlx4_buf *buf) +{ + if (mlx4_is_extern_alloc(context)) + return mlx4_free_buf_extern(context, buf); + + if (buf->length) { + ibv_dofork_range(buf->buf, buf->length); + munmap(buf->buf, buf->length); + } +} diff --git a/providers/mlx4/cq.c b/providers/mlx4/cq.c new file mode 100644 index 0000000..be3009c --- /dev/null +++ b/providers/mlx4/cq.c @@ -0,0 +1,785 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2006, 2007 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <config.h> + +#include <stdio.h> +#include <stdlib.h> +#include <pthread.h> +#include <string.h> + +#include <util/compiler.h> +#include <util/mmio.h> +#include <infiniband/opcode.h> + +#include "mlx4.h" + +enum { + CQ_OK = 0, + CQ_EMPTY = -1, + CQ_POLL_ERR = -2 +}; + +static struct mlx4_cqe *get_cqe(struct mlx4_cq *cq, int entry) +{ + return cq->buf.buf + entry * cq->cqe_size; +} + +static void *get_sw_cqe(struct mlx4_cq *cq, int n) +{ + struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibv_cq.cqe); + struct mlx4_cqe *tcqe = cq->cqe_size == 64 ? cqe + 1 : cqe; + + return (!!(tcqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^ + !!(n & (cq->ibv_cq.cqe + 1))) ? NULL : cqe; +} + +static struct mlx4_cqe *next_cqe_sw(struct mlx4_cq *cq) +{ + return get_sw_cqe(cq, cq->cons_index); +} + +static enum ibv_wc_status mlx4_handle_error_cqe(struct mlx4_err_cqe *cqe) +{ + if (cqe->syndrome == MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR) + printf(PFX "local QP operation err " + "(QPN %06x, WQE index %x, vendor syndrome %02x, " + "opcode = %02x)\n", + htobe32(cqe->vlan_my_qpn), htobe32(cqe->wqe_index), + cqe->vendor_err, + cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK); + + switch (cqe->syndrome) { + case MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR: + return IBV_WC_LOC_LEN_ERR; + case MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR: + return IBV_WC_LOC_QP_OP_ERR; + case MLX4_CQE_SYNDROME_LOCAL_PROT_ERR: + return IBV_WC_LOC_PROT_ERR; + case MLX4_CQE_SYNDROME_WR_FLUSH_ERR: + return IBV_WC_WR_FLUSH_ERR; + case MLX4_CQE_SYNDROME_MW_BIND_ERR: + return IBV_WC_MW_BIND_ERR; + case MLX4_CQE_SYNDROME_BAD_RESP_ERR: + return IBV_WC_BAD_RESP_ERR; + case MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR: + return IBV_WC_LOC_ACCESS_ERR; + case MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR: + return IBV_WC_REM_INV_REQ_ERR; + case MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR: + return IBV_WC_REM_ACCESS_ERR; + case MLX4_CQE_SYNDROME_REMOTE_OP_ERR: + return IBV_WC_REM_OP_ERR; + case MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR: + return IBV_WC_RETRY_EXC_ERR; + case MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR: + return IBV_WC_RNR_RETRY_EXC_ERR; + case MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR: + return IBV_WC_REM_ABORT_ERR; + default: + return IBV_WC_GENERAL_ERR; + } +} + +static inline void handle_good_req(struct ibv_wc *wc, struct mlx4_cqe *cqe) +{ + wc->wc_flags = 0; + switch (mlx4dv_get_cqe_opcode(cqe)) { + case MLX4_OPCODE_RDMA_WRITE_IMM: + wc->wc_flags |= IBV_WC_WITH_IMM; + SWITCH_FALLTHROUGH; + case MLX4_OPCODE_RDMA_WRITE: + wc->opcode = IBV_WC_RDMA_WRITE; + break; + case MLX4_OPCODE_SEND_IMM: + wc->wc_flags |= IBV_WC_WITH_IMM; + SWITCH_FALLTHROUGH; + case MLX4_OPCODE_SEND: + case MLX4_OPCODE_SEND_INVAL: + wc->opcode = IBV_WC_SEND; + break; + case MLX4_OPCODE_RDMA_READ: + wc->opcode = IBV_WC_RDMA_READ; + wc->byte_len = be32toh(cqe->byte_cnt); + break; + case MLX4_OPCODE_ATOMIC_CS: + wc->opcode = IBV_WC_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX4_OPCODE_ATOMIC_FA: + wc->opcode = IBV_WC_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX4_OPCODE_LOCAL_INVAL: + wc->opcode = IBV_WC_LOCAL_INV; + break; + case MLX4_OPCODE_BIND_MW: + wc->opcode = IBV_WC_BIND_MW; + break; + default: + /* assume it's a send completion */ + wc->opcode = IBV_WC_SEND; + break; + } +} + +static inline int mlx4_get_next_cqe(struct mlx4_cq *cq, + 
struct mlx4_cqe **pcqe) + ALWAYS_INLINE; +static inline int mlx4_get_next_cqe(struct mlx4_cq *cq, + struct mlx4_cqe **pcqe) +{ + struct mlx4_cqe *cqe; + + cqe = next_cqe_sw(cq); + if (!cqe) + return CQ_EMPTY; + + if (cq->cqe_size == 64) + ++cqe; + + ++cq->cons_index; + + VALGRIND_MAKE_MEM_DEFINED(cqe, sizeof *cqe); + + /* + * Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + udma_from_device_barrier(); + + *pcqe = cqe; + + return CQ_OK; +} + +static inline int mlx4_parse_cqe(struct mlx4_cq *cq, + struct mlx4_cqe *cqe, + struct mlx4_qp **cur_qp, + struct ibv_wc *wc, int lazy) + ALWAYS_INLINE; +static inline int mlx4_parse_cqe(struct mlx4_cq *cq, + struct mlx4_cqe *cqe, + struct mlx4_qp **cur_qp, + struct ibv_wc *wc, int lazy) +{ + struct mlx4_wq *wq; + struct mlx4_srq *srq; + uint32_t qpn; + uint32_t g_mlpath_rqpn; + uint64_t *pwr_id; + uint16_t wqe_index; + struct mlx4_err_cqe *ecqe; + struct mlx4_context *mctx; + int is_error; + int is_send; + enum ibv_wc_status *pstatus; + + mctx = to_mctx(cq->ibv_cq.context); + qpn = be32toh(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK; + if (lazy) { + cq->cqe = cqe; + cq->flags &= (~MLX4_CQ_FLAGS_RX_CSUM_VALID); + } else + wc->qp_num = qpn; + + is_send = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK; + is_error = (mlx4dv_get_cqe_opcode(cqe)) == + MLX4_CQE_OPCODE_ERROR; + + if ((qpn & MLX4_XRC_QPN_BIT) && !is_send) { + /* + * We do not have to take the XSRQ table lock here, + * because CQs will be locked while SRQs are removed + * from the table. + */ + srq = mlx4_find_xsrq(&mctx->xsrq_table, + be32toh(cqe->g_mlpath_rqpn) & MLX4_CQE_QPN_MASK); + if (!srq) + return CQ_POLL_ERR; + } else { + if (!*cur_qp || (qpn != (*cur_qp)->qpn_cache)) { + /* + * We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. + */ + *cur_qp = mlx4_find_qp(mctx, qpn); + if (!*cur_qp) + return CQ_POLL_ERR; + } + srq = ((*cur_qp)->type == MLX4_RSC_TYPE_SRQ) ? + to_msrq((*cur_qp)->verbs_qp.qp.srq) : NULL; + } + + pwr_id = lazy ? &cq->ibv_cq.wr_id : &wc->wr_id; + if (is_send) { + wq = &(*cur_qp)->sq; + wqe_index = be16toh(cqe->wqe_index); + wq->tail += (uint16_t) (wqe_index - (uint16_t) wq->tail); + *pwr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } else if (srq) { + wqe_index = be16toh(cqe->wqe_index); + *pwr_id = srq->wrid[wqe_index]; + mlx4_free_srq_wqe(srq, wqe_index); + } else { + wq = &(*cur_qp)->rq; + *pwr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } + + pstatus = lazy ? 
&cq->ibv_cq.status : &wc->status; + if (is_error) { + ecqe = (struct mlx4_err_cqe *)cqe; + *pstatus = mlx4_handle_error_cqe(ecqe); + if (!lazy) + wc->vendor_err = ecqe->vendor_err; + return CQ_OK; + } + + *pstatus = IBV_WC_SUCCESS; + if (lazy) { + if (!is_send) + if ((*cur_qp) && ((*cur_qp)->qp_cap_cache & MLX4_RX_CSUM_VALID)) + cq->flags |= MLX4_CQ_FLAGS_RX_CSUM_VALID; + } else if (is_send) { + handle_good_req(wc, cqe); + } else { + wc->byte_len = be32toh(cqe->byte_cnt); + + switch (mlx4dv_get_cqe_opcode(cqe)) { + case MLX4_RECV_OPCODE_RDMA_WRITE_IMM: + wc->opcode = IBV_WC_RECV_RDMA_WITH_IMM; + wc->wc_flags = IBV_WC_WITH_IMM; + wc->imm_data = cqe->immed_rss_invalid; + break; + case MLX4_RECV_OPCODE_SEND_INVAL: + wc->opcode = IBV_WC_RECV; + wc->wc_flags |= IBV_WC_WITH_INV; + wc->invalidated_rkey = be32toh(cqe->immed_rss_invalid); + break; + case MLX4_RECV_OPCODE_SEND: + wc->opcode = IBV_WC_RECV; + wc->wc_flags = 0; + break; + case MLX4_RECV_OPCODE_SEND_IMM: + wc->opcode = IBV_WC_RECV; + wc->wc_flags = IBV_WC_WITH_IMM; + wc->imm_data = cqe->immed_rss_invalid; + break; + } + + wc->slid = be16toh(cqe->rlid); + g_mlpath_rqpn = be32toh(cqe->g_mlpath_rqpn); + wc->src_qp = g_mlpath_rqpn & 0xffffff; + wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f; + wc->wc_flags |= g_mlpath_rqpn & 0x80000000 ? IBV_WC_GRH : 0; + wc->pkey_index = be32toh(cqe->immed_rss_invalid) & 0x7f; + /* When working with xrc srqs, don't have qp to check link layer. + * Using IB SL, should consider Roce. (TBD) + */ + if ((*cur_qp) && (*cur_qp)->link_layer == IBV_LINK_LAYER_ETHERNET) + wc->sl = be16toh(cqe->sl_vid) >> 13; + else + wc->sl = be16toh(cqe->sl_vid) >> 12; + + if ((*cur_qp) && ((*cur_qp)->qp_cap_cache & MLX4_RX_CSUM_VALID)) { + wc->wc_flags |= ((cqe->status & htobe32(MLX4_CQE_STATUS_IPV4_CSUM_OK)) == + htobe32(MLX4_CQE_STATUS_IPV4_CSUM_OK)) << + IBV_WC_IP_CSUM_OK_SHIFT; + } + } + + return CQ_OK; +} + +static inline int mlx4_parse_lazy_cqe(struct mlx4_cq *cq, + struct mlx4_cqe *cqe) + ALWAYS_INLINE; +static inline int mlx4_parse_lazy_cqe(struct mlx4_cq *cq, + struct mlx4_cqe *cqe) +{ + return mlx4_parse_cqe(cq, cqe, &cq->cur_qp, NULL, 1); +} + +static inline int mlx4_poll_one(struct mlx4_cq *cq, + struct mlx4_qp **cur_qp, + struct ibv_wc *wc) + ALWAYS_INLINE; +static inline int mlx4_poll_one(struct mlx4_cq *cq, + struct mlx4_qp **cur_qp, + struct ibv_wc *wc) +{ + struct mlx4_cqe *cqe; + int err; + + err = mlx4_get_next_cqe(cq, &cqe); + if (err == CQ_EMPTY) + return err; + + return mlx4_parse_cqe(cq, cqe, cur_qp, wc, 0); +} + +int mlx4_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc) +{ + struct mlx4_cq *cq = to_mcq(ibcq); + struct mlx4_qp *qp = NULL; + int npolled; + int err = CQ_OK; + + pthread_spin_lock(&cq->lock); + + for (npolled = 0; npolled < ne; ++npolled) { + err = mlx4_poll_one(cq, &qp, wc + npolled); + if (err != CQ_OK) + break; + } + + if (npolled || err == CQ_POLL_ERR) + mlx4_update_cons_index(cq); + + pthread_spin_unlock(&cq->lock); + + return err == CQ_POLL_ERR ? 
err : npolled; +} + +static inline void _mlx4_end_poll(struct ibv_cq_ex *ibcq, int lock) + ALWAYS_INLINE; +static inline void _mlx4_end_poll(struct ibv_cq_ex *ibcq, int lock) +{ + struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + mlx4_update_cons_index(cq); + + if (lock) + pthread_spin_unlock(&cq->lock); +} + +static inline int _mlx4_start_poll(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr, + int lock) + ALWAYS_INLINE; +static inline int _mlx4_start_poll(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr, + int lock) +{ + struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + struct mlx4_cqe *cqe; + int err; + + if (unlikely(attr->comp_mask)) + return EINVAL; + + if (lock) + pthread_spin_lock(&cq->lock); + + cq->cur_qp = NULL; + + err = mlx4_get_next_cqe(cq, &cqe); + if (err == CQ_EMPTY) { + if (lock) + pthread_spin_unlock(&cq->lock); + return ENOENT; + } + + err = mlx4_parse_lazy_cqe(cq, cqe); + if (lock && err) + pthread_spin_unlock(&cq->lock); + + return err; +} + +static int mlx4_next_poll(struct ibv_cq_ex *ibcq) +{ + struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + struct mlx4_cqe *cqe; + int err; + + err = mlx4_get_next_cqe(cq, &cqe); + if (err == CQ_EMPTY) + return ENOENT; + + return mlx4_parse_lazy_cqe(cq, cqe); +} + +static void mlx4_end_poll(struct ibv_cq_ex *ibcq) +{ + _mlx4_end_poll(ibcq, 0); +} + +static void mlx4_end_poll_lock(struct ibv_cq_ex *ibcq) +{ + _mlx4_end_poll(ibcq, 1); +} + +static int mlx4_start_poll(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr) +{ + return _mlx4_start_poll(ibcq, attr, 0); +} + +static int mlx4_start_poll_lock(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr) +{ + return _mlx4_start_poll(ibcq, attr, 1); +} + +static enum ibv_wc_opcode mlx4_cq_read_wc_opcode(struct ibv_cq_ex *ibcq) +{ + struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + if (cq->cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK) { + switch (mlx4dv_get_cqe_opcode(cq->cqe)) { + case MLX4_OPCODE_RDMA_WRITE_IMM: + case MLX4_OPCODE_RDMA_WRITE: + return IBV_WC_RDMA_WRITE; + case MLX4_OPCODE_SEND_INVAL: + case MLX4_OPCODE_SEND_IMM: + case MLX4_OPCODE_SEND: + return IBV_WC_SEND; + case MLX4_OPCODE_RDMA_READ: + return IBV_WC_RDMA_READ; + case MLX4_OPCODE_ATOMIC_CS: + return IBV_WC_COMP_SWAP; + case MLX4_OPCODE_ATOMIC_FA: + return IBV_WC_FETCH_ADD; + case MLX4_OPCODE_LOCAL_INVAL: + return IBV_WC_LOCAL_INV; + case MLX4_OPCODE_BIND_MW: + return IBV_WC_BIND_MW; + } + } else { + switch (mlx4dv_get_cqe_opcode(cq->cqe)) { + case MLX4_RECV_OPCODE_RDMA_WRITE_IMM: + return IBV_WC_RECV_RDMA_WITH_IMM; + case MLX4_RECV_OPCODE_SEND_INVAL: + case MLX4_RECV_OPCODE_SEND_IMM: + case MLX4_RECV_OPCODE_SEND: + return IBV_WC_RECV; + } + } + + return 0; +} + +static uint32_t mlx4_cq_read_wc_qp_num(struct ibv_cq_ex *ibcq) +{ + struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + return be32toh(cq->cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK; +} + +static unsigned int mlx4_cq_read_wc_flags(struct ibv_cq_ex *ibcq) +{ + struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + int is_send = cq->cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK; + int wc_flags = 0; + + if (is_send) { + switch (mlx4dv_get_cqe_opcode(cq->cqe)) { + case MLX4_OPCODE_RDMA_WRITE_IMM: + case MLX4_OPCODE_SEND_IMM: + wc_flags |= IBV_WC_WITH_IMM; + break; + } + } else { + if (cq->flags & MLX4_CQ_FLAGS_RX_CSUM_VALID) + wc_flags |= ((cq->cqe->status & + htobe32(MLX4_CQE_STATUS_IPV4_CSUM_OK)) == + htobe32(MLX4_CQE_STATUS_IPV4_CSUM_OK)) << + IBV_WC_IP_CSUM_OK_SHIFT; + + switch (mlx4dv_get_cqe_opcode(cq->cqe)) { + case 
MLX4_RECV_OPCODE_RDMA_WRITE_IMM: + case MLX4_RECV_OPCODE_SEND_IMM: + wc_flags |= IBV_WC_WITH_IMM; + break; + case MLX4_RECV_OPCODE_SEND_INVAL: + wc_flags |= IBV_WC_WITH_INV; + break; + } + wc_flags |= (be32toh(cq->cqe->g_mlpath_rqpn) & 0x80000000) ? IBV_WC_GRH : 0; + } + + return wc_flags; +} + +static uint32_t mlx4_cq_read_wc_byte_len(struct ibv_cq_ex *ibcq) +{ + struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + return be32toh(cq->cqe->byte_cnt); +} + +static uint32_t mlx4_cq_read_wc_vendor_err(struct ibv_cq_ex *ibcq) +{ + struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + struct mlx4_err_cqe *ecqe = (struct mlx4_err_cqe *)cq->cqe; + + return ecqe->vendor_err; +} + +static __be32 mlx4_cq_read_wc_imm_data(struct ibv_cq_ex *ibcq) +{ + struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + switch (mlx4dv_get_cqe_opcode(cq->cqe)) { + case MLX4_RECV_OPCODE_SEND_INVAL: + /* This is returning invalidate_rkey which is in host order, see + * ibv_wc_read_invalidated_rkey + */ + return (__force __be32)be32toh(cq->cqe->immed_rss_invalid); + default: + return cq->cqe->immed_rss_invalid; + } +} + +static uint32_t mlx4_cq_read_wc_slid(struct ibv_cq_ex *ibcq) +{ + struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + return (uint32_t)be16toh(cq->cqe->rlid); +} + +static uint8_t mlx4_cq_read_wc_sl(struct ibv_cq_ex *ibcq) +{ + struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + if ((cq->cur_qp) && (cq->cur_qp->link_layer == IBV_LINK_LAYER_ETHERNET)) + return be16toh(cq->cqe->sl_vid) >> 13; + else + return be16toh(cq->cqe->sl_vid) >> 12; +} + +static uint32_t mlx4_cq_read_wc_src_qp(struct ibv_cq_ex *ibcq) +{ + struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + return be32toh(cq->cqe->g_mlpath_rqpn) & 0xffffff; +} + +static uint8_t mlx4_cq_read_wc_dlid_path_bits(struct ibv_cq_ex *ibcq) +{ + struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + return (be32toh(cq->cqe->g_mlpath_rqpn) >> 24) & 0x7f; +} + +static uint64_t mlx4_cq_read_wc_completion_ts(struct ibv_cq_ex *ibcq) +{ + struct mlx4_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + return ((uint64_t)be32toh(cq->cqe->ts_47_16) << 16) | + (cq->cqe->ts_15_8 << 8) | + (cq->cqe->ts_7_0); +} + +void mlx4_cq_fill_pfns(struct mlx4_cq *cq, const struct ibv_cq_init_attr_ex *cq_attr) +{ + + if (cq->flags & MLX4_CQ_FLAGS_SINGLE_THREADED) { + cq->ibv_cq.start_poll = mlx4_start_poll; + cq->ibv_cq.end_poll = mlx4_end_poll; + } else { + cq->ibv_cq.start_poll = mlx4_start_poll_lock; + cq->ibv_cq.end_poll = mlx4_end_poll_lock; + } + cq->ibv_cq.next_poll = mlx4_next_poll; + + cq->ibv_cq.read_opcode = mlx4_cq_read_wc_opcode; + cq->ibv_cq.read_vendor_err = mlx4_cq_read_wc_vendor_err; + cq->ibv_cq.read_wc_flags = mlx4_cq_read_wc_flags; + if (cq_attr->wc_flags & IBV_WC_EX_WITH_BYTE_LEN) + cq->ibv_cq.read_byte_len = mlx4_cq_read_wc_byte_len; + if (cq_attr->wc_flags & IBV_WC_EX_WITH_IMM) + cq->ibv_cq.read_imm_data = mlx4_cq_read_wc_imm_data; + if (cq_attr->wc_flags & IBV_WC_EX_WITH_QP_NUM) + cq->ibv_cq.read_qp_num = mlx4_cq_read_wc_qp_num; + if (cq_attr->wc_flags & IBV_WC_EX_WITH_SRC_QP) + cq->ibv_cq.read_src_qp = mlx4_cq_read_wc_src_qp; + if (cq_attr->wc_flags & IBV_WC_EX_WITH_SLID) + cq->ibv_cq.read_slid = mlx4_cq_read_wc_slid; + if (cq_attr->wc_flags & IBV_WC_EX_WITH_SL) + cq->ibv_cq.read_sl = mlx4_cq_read_wc_sl; + if (cq_attr->wc_flags & IBV_WC_EX_WITH_DLID_PATH_BITS) + cq->ibv_cq.read_dlid_path_bits = mlx4_cq_read_wc_dlid_path_bits; + if (cq_attr->wc_flags & IBV_WC_EX_WITH_COMPLETION_TIMESTAMP) + cq->ibv_cq.read_completion_ts = 
mlx4_cq_read_wc_completion_ts; +} + +int mlx4_arm_cq(struct ibv_cq *ibvcq, int solicited) +{ + struct mlx4_cq *cq = to_mcq(ibvcq); + uint64_t doorbell; + uint32_t sn; + uint32_t ci; + uint32_t cmd; + + sn = cq->arm_sn & 3; + ci = cq->cons_index & 0xffffff; + cmd = solicited ? MLX4_CQ_DB_REQ_NOT_SOL : MLX4_CQ_DB_REQ_NOT; + + doorbell = sn << 28 | cmd | cq->cqn; + doorbell <<= 32; + doorbell |= ci; + + *cq->arm_db = htobe32(sn << 28 | cmd | ci); + + /* + * Make sure that the doorbell record in host memory is + * written before ringing the doorbell via PCI MMIO. + */ + udma_to_device_barrier(); + + mmio_write64_be(to_mctx(ibvcq->context)->uar + MLX4_CQ_DOORBELL, + htobe64(doorbell)); + + return 0; +} + +void mlx4_cq_event(struct ibv_cq *cq) +{ + to_mcq(cq)->arm_sn++; +} + +void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq) +{ + struct mlx4_cqe *cqe, *dest; + uint32_t prod_index; + uint8_t owner_bit; + int nfreed = 0; + int cqe_inc = cq->cqe_size == 64 ? 1 : 0; + + if (!cq || cq->flags & MLX4_CQ_FLAGS_DV_OWNED) + return; + + /* + * First we need to find the current producer index, so we + * know where to start cleaning from. It doesn't matter if HW + * adds new entries after this loop -- the QP we're worried + * about is already in RESET, so the new entries won't come + * from our QP and therefore don't need to be checked. + */ + for (prod_index = cq->cons_index; get_sw_cqe(cq, prod_index); ++prod_index) + if (prod_index == cq->cons_index + cq->ibv_cq.cqe) + break; + + /* + * Now sweep backwards through the CQ, removing CQ entries + * that match our QP by copying older entries on top of them. + */ + while ((int) --prod_index - (int) cq->cons_index >= 0) { + cqe = get_cqe(cq, prod_index & cq->ibv_cq.cqe); + cqe += cqe_inc; + if (srq && srq->ext_srq && + (be32toh(cqe->g_mlpath_rqpn) & MLX4_CQE_QPN_MASK) == srq->verbs_srq.srq_num && + !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) { + mlx4_free_srq_wqe(srq, be16toh(cqe->wqe_index)); + ++nfreed; + } else if ((be32toh(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) { + if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) + mlx4_free_srq_wqe(srq, be16toh(cqe->wqe_index)); + ++nfreed; + } else if (nfreed) { + dest = get_cqe(cq, (prod_index + nfreed) & cq->ibv_cq.cqe); + dest += cqe_inc; + owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK; + memcpy(dest, cqe, sizeof *cqe); + dest->owner_sr_opcode = owner_bit | + (dest->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK); + } + } + + if (nfreed) { + cq->cons_index += nfreed; + /* + * Make sure update of buffer contents is done before + * updating consumer index. + */ + udma_to_device_barrier(); + mlx4_update_cons_index(cq); + } +} + +void mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq) +{ + pthread_spin_lock(&cq->lock); + __mlx4_cq_clean(cq, qpn, srq); + pthread_spin_unlock(&cq->lock); +} + +int mlx4_get_outstanding_cqes(struct mlx4_cq *cq) +{ + uint32_t i; + + for (i = cq->cons_index; get_sw_cqe(cq, i); ++i) + ; + + return i - cq->cons_index; +} + +void mlx4_cq_resize_copy_cqes(struct mlx4_cq *cq, void *buf, int old_cqe) +{ + struct mlx4_cqe *cqe; + int i; + int cqe_inc = cq->cqe_size == 64 ? 1 : 0; + + i = cq->cons_index; + cqe = get_cqe(cq, (i & old_cqe)); + cqe += cqe_inc; + + while ((mlx4dv_get_cqe_opcode(cqe)) != MLX4_CQE_OPCODE_RESIZE) { + cqe->owner_sr_opcode = (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK) | + (((i + 1) & (cq->ibv_cq.cqe + 1)) ? 
MLX4_CQE_OWNER_MASK : 0); + memcpy(buf + ((i + 1) & cq->ibv_cq.cqe) * cq->cqe_size, + cqe - cqe_inc, cq->cqe_size); + ++i; + cqe = get_cqe(cq, (i & old_cqe)); + cqe += cqe_inc; + } + + ++cq->cons_index; +} + +int mlx4_alloc_cq_buf(struct mlx4_device *dev, struct mlx4_context *ctx, + struct mlx4_buf *buf, int nent, int entry_size) +{ + if (mlx4_alloc_buf(ctx, buf, align(nent * entry_size, dev->page_size), + dev->page_size)) + return -1; + memset(buf->buf, 0, nent * entry_size); + + return 0; +} diff --git a/providers/mlx4/dbrec.c b/providers/mlx4/dbrec.c new file mode 100644 index 0000000..97c616a --- /dev/null +++ b/providers/mlx4/dbrec.c @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#define _GNU_SOURCE +#include <config.h> + +#include <stdlib.h> +#include <pthread.h> +#include <string.h> + +#include "mlx4.h" + +struct mlx4_db_page { + struct mlx4_db_page *prev, *next; + struct mlx4_buf buf; + int num_db; + int use_cnt; + unsigned long free[0]; +}; + +static const int db_size[] = { + [MLX4_DB_TYPE_CQ] = 8, + [MLX4_DB_TYPE_RQ] = 4, +}; + +static struct mlx4_db_page *__add_page(struct mlx4_context *context, + enum mlx4_db_type type) +{ + struct mlx4_db_page *page; + int ps = to_mdev(context->ibv_ctx.context.device)->page_size; + int pp; + int i; + + pp = ps / db_size[type]; + + page = malloc(sizeof *page + pp / 8); + if (!page) + return NULL; + + if (mlx4_alloc_buf(context, &page->buf, ps, ps)) { + free(page); + return NULL; + } + + page->num_db = pp; + page->use_cnt = 0; + for (i = 0; i < pp / (sizeof (long) * 8); ++i) + page->free[i] = ~0; + + page->prev = NULL; + page->next = context->db_list[type]; + context->db_list[type] = page; + if (page->next) + page->next->prev = page; + + return page; +} + +__be32 *mlx4_alloc_db(struct mlx4_context *context, enum mlx4_db_type type) +{ + struct mlx4_db_page *page; + __be32 *db = NULL; + int i, j; + + pthread_mutex_lock(&context->db_list_mutex); + + for (page = context->db_list[type]; page; page = page->next) + if (page->use_cnt < page->num_db) + goto found; + + page = __add_page(context, type); + if (!page) + goto out; + +found: + ++page->use_cnt; + + for (i = 0; !page->free[i]; ++i) + /* nothing */; + + j = ffsl(page->free[i]); + page->free[i] &= ~(1UL << (j - 1)); + db = page->buf.buf + (i * 8 * sizeof (long) + (j - 1)) * db_size[type]; + +out: + pthread_mutex_unlock(&context->db_list_mutex); + + return db; +} + +void mlx4_free_db(struct mlx4_context *context, enum mlx4_db_type type, + __be32 *db) +{ + struct mlx4_db_page *page; + uintptr_t ps = to_mdev(context->ibv_ctx.context.device)->page_size; + int i; + + pthread_mutex_lock(&context->db_list_mutex); + + for (page = context->db_list[type]; page; page = page->next) + if (((uintptr_t) db & ~(ps - 1)) == (uintptr_t) page->buf.buf) + break; + + if (!page) + goto out; + + i = ((void *) db - page->buf.buf) / db_size[type]; + page->free[i / (8 * sizeof (long))] |= 1UL << (i % (8 * sizeof (long))); + + if (!--page->use_cnt) { + if (page->prev) + page->prev->next = page->next; + else + context->db_list[type] = page->next; + if (page->next) + page->next->prev = page->prev; + + mlx4_free_buf(context, &page->buf); + free(page); + } + +out: + pthread_mutex_unlock(&context->db_list_mutex); +} diff --git a/providers/mlx4/libmlx4.map b/providers/mlx4/libmlx4.map new file mode 100644 index 0000000..ac2c772 --- /dev/null +++ b/providers/mlx4/libmlx4.map @@ -0,0 +1,10 @@ +/* Export symbols should be added below according to + Documentation/versioning.md document. */ +MLX4_1.0 { + global: + mlx4dv_init_obj; + mlx4dv_query_device; + mlx4dv_create_qp; + mlx4dv_set_context_attr; + local: *; +}; diff --git a/providers/mlx4/man/CMakeLists.txt b/providers/mlx4/man/CMakeLists.txt new file mode 100644 index 0000000..3df77ca --- /dev/null +++ b/providers/mlx4/man/CMakeLists.txt @@ -0,0 +1,6 @@ +rdma_man_pages( + mlx4dv_init_obj.3 + mlx4dv_query_device.3 + mlx4dv_set_context_attr.3.md + mlx4dv.7 +) diff --git a/providers/mlx4/man/mlx4dv.7 b/providers/mlx4/man/mlx4dv.7 new file mode 100644 index 0000000..e3e4cdc --- /dev/null +++ b/providers/mlx4/man/mlx4dv.7 @@ -0,0 +1,40 @@ +.\" -*- nroff -*- +.\" Copyright (c) 2017 Mellanox Technologies, Inc. 
+.\" Licensed under the OpenIB.org (MIT) - See COPYING.md +.\" +.TH MLX4DV 7 2017-04-19 1.0.0 +.SH "NAME" +mlx4dv \- Direct verbs for mlx4 devices +.br +This is low level access to mlx4 devices to perform data path operations, +without general branching performed by \fBibv_post_send\fR(3). + +.SH "DESCRIPTION" +The libibverbs API is an abstract one. It is agnostic to any underlying +provider specific implementation. While this abstraction has the advantage +of user applications portability it has a performance penalty. For some +applications optimizing performance is more important than portability. + +The mlx4 direct verbs API is intended for such applications. +It exposes mlx4 specific low level data path (send/receive/completion) +operations, allowing the application to bypass the libibverbs data path API. + +This interface consists from one hardware specific header file +with relevant inline functions and conversion logic from ibverbs structures +to mlx4 specific structures. + +The direct include of mlx4dv.h together with linkage to mlx4 library will +allow usage of this new interface. + +Once an application uses the direct flow the locking scheme is fully managed +by itself. There is an expectation that no mixed flows in the data path for both +direct/non-direct access will be by same application. + +.SH "NOTES" +.SH "SEE ALSO" +.BR ibv_post_send (3), +.BR verbs (7) + +.SH "AUTHORS" +.TP +Maor Gottlieb <maorg@mellanox.com> diff --git a/providers/mlx4/man/mlx4dv_init_obj.3 b/providers/mlx4/man/mlx4dv_init_obj.3 new file mode 100644 index 0000000..7d35cc3 --- /dev/null +++ b/providers/mlx4/man/mlx4dv_init_obj.3 @@ -0,0 +1,140 @@ +.\" -*- nroff -*- +.\" Copyright (c) 2017 Mellanox Technologies, Inc. +.\" Licensed under the OpenIB.org (MIT) - See COPYING.md +.\" +.TH MLX4DV_INIT_OBJ 3 2017-02-02 1.0.0 +.SH "NAME" +mlx4dv_init_obj \- Initialize mlx4 direct verbs object from ibv_xxx structures +.SH "SYNOPSIS" +.nf +.B #include <infiniband/mlx4dv.h> +.sp +.BI "int mlx4dv_init_obj(struct mlx4dv_obj *obj, uint64_t obj_type); +.fi +.SH "DESCRIPTION" +.B mlx4dv_init_obj() +This function will initialize mlx4dv_xxx structs based on supplied type. The information +for initialization is taken from ibv_xx structs supplied as part of input. + +Request information of CQ marks its owned by direct verbs for all consumer index +related actions. The initialization type can be combination of several types together. 
+.PP
+.nf
+struct mlx4dv_qp {
+.in +8
+uint32_t *rdb;
+uint32_t *sdb;
+struct {
+.in +8
+uint32_t wqe_cnt;
+int wqe_shift;
+int offset;
+.in -8
+} sq;
+struct {
+.in +8
+uint32_t wqe_cnt;
+int wqe_shift;
+int offset;
+.in -8
+} rq;
+struct {
+.in +8
+void *buf;
+size_t length;
+.in -8
+} buf;
+uint64_t comp_mask; /* Use enum mlx4dv_qp_comp_mask */
+off_t uar_mmap_offset; /* If MLX4DV_QP_MASK_UAR_MMAP_OFFSET is set in comp_mask, this will contain the mmap offset of *sdb* */
+.in -8
+};
+
+struct mlx4dv_cq {
+.in +8
+struct {
+.in +8
+void *buf;
+size_t length;
+.in -8
+} buf;
+uint32_t cqe_cnt;
+uint32_t cqn;
+uint32_t *set_ci_db;
+uint32_t *arm_db;
+int arm_sn;
+int cqe_size;
+uint64_t comp_mask; /* Use enum mlx4dv_cq_comp_mask */
+void *cq_uar;
+.in -8
+};
+
+struct mlx4dv_srq {
+.in +8
+struct {
+.in +8
+void *buf;
+size_t length;
+.in -8
+} buf;
+int wqe_shift;
+int head;
+int tail;
+uint32_t *db;
+uint64_t comp_mask;
+.in -8
+};
+
+struct mlx4dv_rwq {
+.in +8
+__be32 *rdb;
+struct {
+.in +8
+uint32_t wqe_cnt;
+int wqe_shift;
+int offset;
+.in -8
+} rq;
+struct {
+.in +8
+void *buf;
+size_t length;
+.in -8
+} buf;
+uint64_t comp_mask;
+.in -8
+};
+
+struct mlx4dv_obj {
+.in +8
+struct {
+.in +8
+struct ibv_qp *in;
+struct mlx4dv_qp *out;
+.in -8
+} qp;
+struct {
+.in +8
+struct ibv_cq *in;
+struct mlx4dv_cq *out;
+.in -8
+} cq;
+struct {
+.in +8
+struct ibv_srq *in;
+struct mlx4dv_srq *out;
+.in -8
+} srq;
+struct {
+.in +8
+struct ibv_wq *in;
+struct mlx4dv_rwq *out;
+.in -8
+} rwq;
+.in -8
+};
+
+enum mlx4dv_obj_type {
+.in +8
+MLX4DV_OBJ_QP = 1 << 0,
+MLX4DV_OBJ_CQ = 1 << 1,
+MLX4DV_OBJ_SRQ = 1 << 2,
+MLX4DV_OBJ_RWQ = 1 << 3,
+.in -8
+};
+.fi
+.SH "RETURN VALUE"
+0 on success or the value of errno on failure (which indicates the failure reason).
+.SH "NOTES"
+ * Compatibility masks (comp_mask) are in/out fields.
+.SH "SEE ALSO"
+.BR mlx4dv (7)
+.SH "AUTHORS"
+.TP
+Maor Gottlieb <maorg@mellanox.com>
diff --git a/providers/mlx4/man/mlx4dv_query_device.3 b/providers/mlx4/man/mlx4dv_query_device.3
new file mode 100644
index 0000000..c5ec881
--- /dev/null
+++ b/providers/mlx4/man/mlx4dv_query_device.3
@@ -0,0 +1,42 @@
+.\" -*- nroff -*-
+.\" Licensed under the OpenIB.org (MIT) - See COPYING.md
+.\"
+.TH MLX4DV_QUERY_DEVICE 3 2017-06-27 1.0.0
+.SH "NAME"
+mlx4dv_query_device \- Query device capabilities specific to mlx4
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/mlx4dv.h>
+.sp
+.BI "int mlx4dv_query_device(struct ibv_context *ctx_in,
+.BI "                        struct mlx4dv_context *attrs_out);
+.fi
+.SH "DESCRIPTION"
+.B mlx4dv_query_device()
+queries mlx4-specific device information that is usable via the direct verbs
+interface.
+.PP
+This function returns a version and compatibility mask. The version represents
+the format of the internal hardware structures that mlx4dv.h exposes.
+Future additions of new fields to the existing structures are handled by
+the comp_mask field.
+.PP
+.nf
+struct mlx4dv_context {
+.in +8
+uint8_t version;
+uint32_t max_inl_recv_sz; /* Maximum supported size of inline receive */
+uint64_t comp_mask;
+.in -8
+};
+
+.fi
+.SH "RETURN VALUE"
+0 on success or the value of errno on failure (which indicates the failure reason).
+.SH "NOTES"
+ * Compatibility mask (comp_mask) is an in/out field.
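+.SH "EXAMPLE"
+A minimal usage sketch (assuming \fIctx\fR is a context obtained from
+\fBibv_open_device\fR(3); error handling omitted):
+.PP
+.nf
+struct mlx4dv_context attrs = {0};
+int err;
+
+/* comp_mask is in/out; zero requests no optional fields */
+err = mlx4dv_query_device(ctx, &attrs);
+if (err)
+        return err;
+/* attrs.version and attrs.max_inl_recv_sz are now valid */
+.fi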
+.SH "SEE ALSO" +.BR mlx4dv (7), +.BR ibv_query_device (3) +.SH "AUTHORS" +.TP +Maor Gottlieb <maorg@mellanox.com> diff --git a/providers/mlx4/man/mlx4dv_set_context_attr.3.md b/providers/mlx4/man/mlx4dv_set_context_attr.3.md new file mode 100644 index 0000000..1056d6f --- /dev/null +++ b/providers/mlx4/man/mlx4dv_set_context_attr.3.md @@ -0,0 +1,73 @@ +--- +layout: page +title: mlx4dv_set_context_attr +section: 3 +tagline: Verbs +--- + +# NAME + +mlx4dv_set_context_attr - Set context attributes + +# SYNOPSIS + +```c +#include <infiniband/mlx4dv.h> + +int mlx4dv_set_context_attr(struct ibv_context *context, + enum mlx4dv_set_ctx_attr_type attr_type, + void *attr); +``` + +# DESCRIPTION + +mlx4dv_set_context_attr gives the ability to set vendor specific attributes on +the RDMA context. + +# ARGUMENTS +*context* +: RDMA device context to work on. + +*attr_type* +: The type of the provided attribute. + +*attr* +: Pointer to the attribute to be set. +## attr_type + +```c +enum mlx4dv_set_ctx_attr_type { + /* Attribute type uint8_t */ + MLX4DV_SET_CTX_ATTR_LOG_WQS_RANGE_SZ = 0, + MLX4DV_SET_CTX_ATTR_BUF_ALLOCATORS = 1, +}; +``` +*MLX4DV_SET_CTX_ATTR_LOG_WQS_RANGE_SZ* +: Change the LOG WQs Range size for RSS + +*MLX4DV_SET_CTX_ATTR_BUF_ALLOCATORS* +: Provide an external buffer allocator + +```c +struct mlx4dv_ctx_allocators { + void *(*alloc)(size_t size, void *priv_data); + void (*free)(void *ptr, void *priv_data); + void *data; +}; +``` +*alloc* +: Function used for buffer allocation instead of libmlx4 internal method + +*free* +: Function used to free buffers allocated by alloc function + +*data* +: Metadata that can be used by alloc and free functions + +# RETURN VALUE +Returns 0 on success, or the value of errno on failure +(which indicates the failure reason). + +#AUTHOR + +Majd Dibbiny <majd@mellanox.com> diff --git a/providers/mlx4/mlx4-abi.h b/providers/mlx4/mlx4-abi.h new file mode 100644 index 0000000..e1d8327 --- /dev/null +++ b/providers/mlx4/mlx4-abi.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef MLX4_ABI_H +#define MLX4_ABI_H + +#include <infiniband/kern-abi.h> +#include <rdma/mlx4-abi.h> +#include <kernel-abi/mlx4-abi.h> + +#define MLX4_UVERBS_MIN_ABI_VERSION 2 +#define MLX4_UVERBS_MAX_ABI_VERSION 4 + +#define MLX4_UVERBS_NO_DEV_CAPS_ABI_VERSION 3 + +DECLARE_DRV_CMD(mlx4_alloc_pd, IB_USER_VERBS_CMD_ALLOC_PD, + empty, mlx4_ib_alloc_pd_resp); +DECLARE_DRV_CMD(mlx4_create_cq, IB_USER_VERBS_CMD_CREATE_CQ, + mlx4_ib_create_cq, mlx4_ib_create_cq_resp); +DECLARE_DRV_CMD(mlx4_create_cq_ex, IB_USER_VERBS_EX_CMD_CREATE_CQ, + mlx4_ib_create_cq, mlx4_ib_create_cq_resp); +DECLARE_DRV_CMD(mlx4_create_qp, IB_USER_VERBS_CMD_CREATE_QP, + mlx4_ib_create_qp, empty); +DECLARE_DRV_CMD(mlx4_create_qp_ex, IB_USER_VERBS_EX_CMD_CREATE_QP, + mlx4_ib_create_qp, empty); +DECLARE_DRV_CMD(mlx4_create_qp_ex_rss, IB_USER_VERBS_EX_CMD_CREATE_QP, + mlx4_ib_create_qp_rss, empty); +DECLARE_DRV_CMD(mlx4_create_srq, IB_USER_VERBS_CMD_CREATE_SRQ, + mlx4_ib_create_srq, mlx4_ib_create_srq_resp); +DECLARE_DRV_CMD(mlx4_create_wq, IB_USER_VERBS_EX_CMD_CREATE_WQ, + mlx4_ib_create_wq, empty); +DECLARE_DRV_CMD(mlx4_create_xsrq, IB_USER_VERBS_CMD_CREATE_XSRQ, + mlx4_ib_create_srq, mlx4_ib_create_srq_resp); +DECLARE_DRV_CMD(mlx4_alloc_ucontext_v3, IB_USER_VERBS_CMD_GET_CONTEXT, + empty, mlx4_ib_alloc_ucontext_resp_v3); +DECLARE_DRV_CMD(mlx4_alloc_ucontext, IB_USER_VERBS_CMD_GET_CONTEXT, + empty, mlx4_ib_alloc_ucontext_resp); +DECLARE_DRV_CMD(mlx4_modify_wq, IB_USER_VERBS_EX_CMD_MODIFY_WQ, + mlx4_ib_modify_wq, empty); +DECLARE_DRV_CMD(mlx4_query_device_ex, IB_USER_VERBS_EX_CMD_QUERY_DEVICE, + empty, mlx4_uverbs_ex_query_device_resp); +DECLARE_DRV_CMD(mlx4_resize_cq, IB_USER_VERBS_CMD_RESIZE_CQ, + mlx4_ib_resize_cq, empty); + +#endif /* MLX4_ABI_H */ diff --git a/providers/mlx4/mlx4.c b/providers/mlx4/mlx4.c new file mode 100644 index 0000000..0842ff0 --- /dev/null +++ b/providers/mlx4/mlx4.c @@ -0,0 +1,451 @@ +/* + * Copyright (c) 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <config.h> + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> +#include <sys/mman.h> +#include <pthread.h> +#include <string.h> + +#include "mlx4.h" +#include "mlx4-abi.h" + +static void mlx4_free_context(struct ibv_context *ibv_ctx); + +#ifndef PCI_VENDOR_ID_MELLANOX +#define PCI_VENDOR_ID_MELLANOX 0x15b3 +#endif + +#define HCA(v, d) VERBS_PCI_MATCH(PCI_VENDOR_ID_##v, d, NULL) +static const struct verbs_match_ent hca_table[] = { + VERBS_DRIVER_ID(RDMA_DRIVER_MLX4), + HCA(MELLANOX, 0x6340), /* MT25408 "Hermon" SDR */ + HCA(MELLANOX, 0x634a), /* MT25408 "Hermon" DDR */ + HCA(MELLANOX, 0x6354), /* MT25408 "Hermon" QDR */ + HCA(MELLANOX, 0x6732), /* MT25408 "Hermon" DDR PCIe gen2 */ + HCA(MELLANOX, 0x673c), /* MT25408 "Hermon" QDR PCIe gen2 */ + HCA(MELLANOX, 0x6368), /* MT25408 "Hermon" EN 10GigE */ + HCA(MELLANOX, 0x6750), /* MT25408 "Hermon" EN 10GigE PCIe gen2 */ + HCA(MELLANOX, 0x6372), /* MT25458 ConnectX EN 10GBASE-T 10GigE */ + HCA(MELLANOX, 0x675a), /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */ + HCA(MELLANOX, 0x6764), /* MT26468 ConnectX EN 10GigE PCIe gen2*/ + HCA(MELLANOX, 0x6746), /* MT26438 ConnectX EN 40GigE PCIe gen2 5GT/s */ + HCA(MELLANOX, 0x676e), /* MT26478 ConnectX2 40GigE PCIe gen2 */ + HCA(MELLANOX, 0x1002), /* MT25400 Family [ConnectX-2 Virtual Function] */ + HCA(MELLANOX, 0x1003), /* MT27500 Family [ConnectX-3] */ + HCA(MELLANOX, 0x1004), /* MT27500 Family [ConnectX-3 Virtual Function] */ + HCA(MELLANOX, 0x1005), /* MT27510 Family */ + HCA(MELLANOX, 0x1006), /* MT27511 Family */ + HCA(MELLANOX, 0x1007), /* MT27520 Family */ + HCA(MELLANOX, 0x1008), /* MT27521 Family */ + HCA(MELLANOX, 0x1009), /* MT27530 Family */ + HCA(MELLANOX, 0x100a), /* MT27531 Family */ + HCA(MELLANOX, 0x100b), /* MT27540 Family */ + HCA(MELLANOX, 0x100c), /* MT27541 Family */ + HCA(MELLANOX, 0x100d), /* MT27550 Family */ + HCA(MELLANOX, 0x100e), /* MT27551 Family */ + HCA(MELLANOX, 0x100f), /* MT27560 Family */ + HCA(MELLANOX, 0x1010), /* MT27561 Family */ + VERBS_MODALIAS_MATCH("vmbus:3daf2e8ca732094bab99bd1f1c86b501", NULL), /* Microsoft Azure Network Direct */ + {} +}; + +static const struct verbs_context_ops mlx4_ctx_ops = { + .query_device = mlx4_query_device, + .query_port = mlx4_query_port, + .alloc_pd = mlx4_alloc_pd, + .dealloc_pd = mlx4_free_pd, + .reg_mr = mlx4_reg_mr, + .rereg_mr = mlx4_rereg_mr, + .dereg_mr = mlx4_dereg_mr, + .alloc_mw = mlx4_alloc_mw, + .dealloc_mw = mlx4_dealloc_mw, + .bind_mw = mlx4_bind_mw, + .create_cq = mlx4_create_cq, + .poll_cq = mlx4_poll_cq, + .req_notify_cq = mlx4_arm_cq, + .cq_event = mlx4_cq_event, + .resize_cq = mlx4_resize_cq, + .destroy_cq = mlx4_destroy_cq, + .create_srq = mlx4_create_srq, + .modify_srq = mlx4_modify_srq, + .query_srq = mlx4_query_srq, + .destroy_srq = mlx4_destroy_srq, + .post_srq_recv = mlx4_post_srq_recv, + .create_qp = mlx4_create_qp, + .query_qp = mlx4_query_qp, + .modify_qp = mlx4_modify_qp, + .destroy_qp = mlx4_destroy_qp, + .post_send = mlx4_post_send, + .post_recv = mlx4_post_recv, + .create_ah = mlx4_create_ah, + .destroy_ah = mlx4_destroy_ah, + .attach_mcast = ibv_cmd_attach_mcast, + .detach_mcast = ibv_cmd_detach_mcast, + + .close_xrcd = mlx4_close_xrcd, + .create_cq_ex = mlx4_create_cq_ex, + .create_flow = mlx4_create_flow, + .create_qp_ex = mlx4_create_qp_ex, + .create_rwq_ind_table = mlx4_create_rwq_ind_table, + .create_srq_ex = mlx4_create_srq_ex, + .create_wq = mlx4_create_wq, + .destroy_flow = mlx4_destroy_flow, + .destroy_rwq_ind_table = 
mlx4_destroy_rwq_ind_table, + .destroy_wq = mlx4_destroy_wq, + .get_srq_num = verbs_get_srq_num, + .modify_cq = mlx4_modify_cq, + .modify_wq = mlx4_modify_wq, + .open_qp = mlx4_open_qp, + .open_xrcd = mlx4_open_xrcd, + .query_device_ex = mlx4_query_device_ex, + .query_rt_values = mlx4_query_rt_values, + .free_context = mlx4_free_context, +}; + +static int mlx4_map_internal_clock(struct mlx4_device *mdev, + struct ibv_context *ibv_ctx) +{ + struct mlx4_context *context = to_mctx(ibv_ctx); + void *hca_clock_page; + + hca_clock_page = mmap(NULL, mdev->page_size, + PROT_READ, MAP_SHARED, ibv_ctx->cmd_fd, + mdev->page_size * 3); + + if (hca_clock_page == MAP_FAILED) { + fprintf(stderr, PFX + "Warning: Timestamp available,\n" + "but failed to mmap() hca core clock page.\n"); + return -1; + } + + context->hca_core_clock = hca_clock_page + + (context->core_clock.offset & (mdev->page_size - 1)); + return 0; +} + +static struct verbs_context *mlx4_alloc_context(struct ibv_device *ibdev, + int cmd_fd, + void *private_data) +{ + struct mlx4_context *context; + struct ibv_get_context cmd; + struct mlx4_alloc_ucontext_resp resp; + int i; + struct mlx4_alloc_ucontext_v3_resp resp_v3; + __u16 bf_reg_size; + struct mlx4_device *dev = to_mdev(ibdev); + struct verbs_context *verbs_ctx; + struct ibv_device_attr_ex dev_attrs; + + context = verbs_init_and_alloc_context(ibdev, cmd_fd, context, ibv_ctx, + RDMA_DRIVER_MLX4); + if (!context) + return NULL; + + verbs_ctx = &context->ibv_ctx; + + if (dev->abi_version <= MLX4_UVERBS_NO_DEV_CAPS_ABI_VERSION) { + if (ibv_cmd_get_context(verbs_ctx, &cmd, sizeof(cmd), + &resp_v3.ibv_resp, sizeof(resp_v3))) + goto failed; + + context->num_qps = resp_v3.qp_tab_size; + bf_reg_size = resp_v3.bf_reg_size; + context->cqe_size = sizeof (struct mlx4_cqe); + } else { + if (ibv_cmd_get_context(verbs_ctx, &cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp))) + goto failed; + + context->num_qps = resp.qp_tab_size; + bf_reg_size = resp.bf_reg_size; + if (resp.dev_caps & MLX4_USER_DEV_CAP_LARGE_CQE) + context->cqe_size = resp.cqe_size; + else + context->cqe_size = sizeof (struct mlx4_cqe); + } + + context->qp_table_shift = ffs(context->num_qps) - 1 - MLX4_QP_TABLE_BITS; + context->qp_table_mask = (1 << context->qp_table_shift) - 1; + for (i = 0; i < MLX4_PORTS_NUM; ++i) + context->port_query_cache[i].valid = 0; + + pthread_mutex_init(&context->qp_table_mutex, NULL); + for (i = 0; i < MLX4_QP_TABLE_SIZE; ++i) + context->qp_table[i].refcnt = 0; + + for (i = 0; i < MLX4_NUM_DB_TYPE; ++i) + context->db_list[i] = NULL; + + mlx4_init_xsrq_table(&context->xsrq_table, context->num_qps); + pthread_mutex_init(&context->db_list_mutex, NULL); + + context->uar_mmap_offset = 0; + context->uar = mmap(NULL, dev->page_size, PROT_WRITE, + MAP_SHARED, cmd_fd, context->uar_mmap_offset); + if (context->uar == MAP_FAILED) + goto failed; + + if (bf_reg_size) { + context->bf_page = mmap(NULL, dev->page_size, + PROT_WRITE, MAP_SHARED, cmd_fd, + dev->page_size); + if (context->bf_page == MAP_FAILED) { + fprintf(stderr, PFX "Warning: BlueFlame available, " + "but failed to mmap() BlueFlame page.\n"); + context->bf_page = NULL; + context->bf_buf_size = 0; + } else { + context->bf_buf_size = bf_reg_size / 2; + context->bf_offset = 0; + pthread_spin_init(&context->bf_lock, PTHREAD_PROCESS_PRIVATE); + } + } else { + context->bf_page = NULL; + context->bf_buf_size = 0; + } + + verbs_set_ops(verbs_ctx, &mlx4_ctx_ops); + + context->hca_core_clock = NULL; + memset(&dev_attrs, 0, sizeof(dev_attrs)); + if 
(!mlx4_query_device_ex(&verbs_ctx->context, NULL, &dev_attrs, + sizeof(struct ibv_device_attr_ex))) { + context->max_qp_wr = dev_attrs.orig_attr.max_qp_wr; + context->max_sge = dev_attrs.orig_attr.max_sge; + if (context->core_clock.offset_valid) + mlx4_map_internal_clock(dev, &verbs_ctx->context); + } + + return verbs_ctx; + +failed: + verbs_uninit_context(&context->ibv_ctx); + free(context); + return NULL; +} + +static void mlx4_free_context(struct ibv_context *ibv_ctx) +{ + struct mlx4_context *context = to_mctx(ibv_ctx); + struct mlx4_device *mdev = to_mdev(ibv_ctx->device); + + munmap(context->uar, mdev->page_size); + if (context->bf_page) + munmap(context->bf_page, mdev->page_size); + if (context->hca_core_clock) + munmap(context->hca_core_clock - context->core_clock.offset, + mdev->page_size); + + verbs_uninit_context(&context->ibv_ctx); + free(context); +} + +static void mlx4_uninit_device(struct verbs_device *verbs_device) +{ + struct mlx4_device *dev = to_mdev(&verbs_device->device); + + free(dev); +} + +static struct verbs_device *mlx4_device_alloc(struct verbs_sysfs_dev *sysfs_dev) +{ + struct mlx4_device *dev; + + dev = calloc(1, sizeof *dev); + if (!dev) + return NULL; + + dev->page_size = sysconf(_SC_PAGESIZE); + dev->abi_version = sysfs_dev->abi_ver; + + return &dev->verbs_dev; +} + +static const struct verbs_device_ops mlx4_dev_ops = { + .name = "mlx4", + .match_min_abi_version = MLX4_UVERBS_MIN_ABI_VERSION, + .match_max_abi_version = MLX4_UVERBS_MAX_ABI_VERSION, + .match_table = hca_table, + .alloc_device = mlx4_device_alloc, + .uninit_device = mlx4_uninit_device, + .alloc_context = mlx4_alloc_context, +}; +PROVIDER_DRIVER(mlx4, mlx4_dev_ops); + +static int mlx4dv_get_qp(struct ibv_qp *qp_in, + struct mlx4dv_qp *qp_out) +{ + struct mlx4_qp *mqp = to_mqp(qp_in); + struct mlx4_context *ctx = to_mctx(qp_in->context); + uint64_t mask_out = 0; + + qp_out->buf.buf = mqp->buf.buf; + qp_out->buf.length = mqp->buf.length; + + qp_out->rdb = mqp->db; + qp_out->sdb = (uint32_t *) (ctx->uar + MLX4_SEND_DOORBELL); + qp_out->doorbell_qpn = mqp->doorbell_qpn; + + qp_out->sq.wqe_cnt = mqp->sq.wqe_cnt; + qp_out->sq.wqe_shift = mqp->sq.wqe_shift; + qp_out->sq.offset = mqp->sq.offset; + + qp_out->rq.wqe_cnt = mqp->rq.wqe_cnt; + qp_out->rq.wqe_shift = mqp->rq.wqe_shift; + qp_out->rq.offset = mqp->rq.offset; + + if (qp_out->comp_mask & MLX4DV_QP_MASK_UAR_MMAP_OFFSET) { + qp_out->uar_mmap_offset = ctx->uar_mmap_offset; + mask_out |= MLX4DV_QP_MASK_UAR_MMAP_OFFSET; + } + + qp_out->comp_mask = mask_out; + + return 0; +} + +static int mlx4dv_get_cq(struct ibv_cq *cq_in, + struct mlx4dv_cq *cq_out) +{ + struct mlx4_cq *mcq = to_mcq(cq_in); + struct mlx4_context *mctx = to_mctx(cq_in->context); + uint64_t mask_out = 0; + + cq_out->buf.buf = mcq->buf.buf; + cq_out->buf.length = mcq->buf.length; + cq_out->cqn = mcq->cqn; + cq_out->set_ci_db = mcq->set_ci_db; + cq_out->arm_db = mcq->arm_db; + cq_out->arm_sn = mcq->arm_sn; + cq_out->cqe_size = mcq->cqe_size; + cq_out->cqe_cnt = mcq->ibv_cq.cqe + 1; + + mcq->flags |= MLX4_CQ_FLAGS_DV_OWNED; + + if (cq_out->comp_mask & MLX4DV_CQ_MASK_UAR) { + cq_out->cq_uar = mctx->uar; + mask_out |= MLX4DV_CQ_MASK_UAR; + } + + cq_out->comp_mask = mask_out; + return 0; +} + +static int mlx4dv_get_srq(struct ibv_srq *srq_in, + struct mlx4dv_srq *srq_out) +{ + struct mlx4_srq *msrq = to_msrq(srq_in); + + srq_out->comp_mask = 0; + srq_out->buf.buf = msrq->buf.buf; + srq_out->buf.length = msrq->buf.length; + srq_out->wqe_shift = msrq->wqe_shift; + srq_out->head = msrq->head; + 
srq_out->tail = msrq->tail;
+	srq_out->db = msrq->db;
+
+	return 0;
+}
+
+static int mlx4dv_get_rwq(struct ibv_wq *wq_in, struct mlx4dv_rwq *wq_out)
+{
+	struct mlx4_qp *mqp = wq_to_mqp(wq_in);
+
+	wq_out->comp_mask = 0;
+
+	wq_out->buf.buf = mqp->buf.buf;
+	wq_out->buf.length = mqp->buf.length;
+
+	wq_out->rdb = mqp->db;
+
+	wq_out->rq.wqe_cnt = mqp->rq.wqe_cnt;
+	wq_out->rq.wqe_shift = mqp->rq.wqe_shift;
+	wq_out->rq.offset = mqp->rq.offset;
+
+	return 0;
+}
+
+int mlx4dv_init_obj(struct mlx4dv_obj *obj, uint64_t obj_type)
+{
+	int ret = 0;
+
+	if (obj_type & MLX4DV_OBJ_QP)
+		ret = mlx4dv_get_qp(obj->qp.in, obj->qp.out);
+	if (!ret && (obj_type & MLX4DV_OBJ_CQ))
+		ret = mlx4dv_get_cq(obj->cq.in, obj->cq.out);
+	if (!ret && (obj_type & MLX4DV_OBJ_SRQ))
+		ret = mlx4dv_get_srq(obj->srq.in, obj->srq.out);
+	if (!ret && (obj_type & MLX4DV_OBJ_RWQ))
+		ret = mlx4dv_get_rwq(obj->rwq.in, obj->rwq.out);
+
+	return ret;
+}
+
+int mlx4dv_query_device(struct ibv_context *ctx_in,
+			struct mlx4dv_context *attrs_out)
+{
+	struct mlx4_context *mctx = to_mctx(ctx_in);
+
+	attrs_out->version = 0;
+	attrs_out->comp_mask = 0;
+
+	attrs_out->max_inl_recv_sz = mctx->max_inl_recv_sz;
+
+	return 0;
+}
+
+int mlx4dv_set_context_attr(struct ibv_context *context,
+			    enum mlx4dv_set_ctx_attr_type attr_type,
+			    void *attr)
+{
+	struct mlx4_context *ctx = to_mctx(context);
+
+	switch (attr_type) {
+	case MLX4DV_SET_CTX_ATTR_LOG_WQS_RANGE_SZ:
+		ctx->log_wqs_range_sz = *((uint8_t *)attr);
+		break;
+	case MLX4DV_SET_CTX_ATTR_BUF_ALLOCATORS:
+		ctx->extern_alloc = *((struct mlx4dv_ctx_allocators *)attr);
+		break;
+	default:
+		return ENOTSUP;
+	}
+
+	return 0;
+}
diff --git a/providers/mlx4/mlx4.conf b/providers/mlx4/mlx4.conf
new file mode 100644
index 0000000..c8b4cce
--- /dev/null
+++ b/providers/mlx4/mlx4.conf
@@ -0,0 +1,21 @@
+# This file is intended for users to select the various module options
+# they need for the mlx4 driver. On upgrade of the rdma package,
+# any user-made changes to this file are preserved. Any changes made
+# to the libmlx4.conf file in this directory are overwritten on
+# package upgrade.
+#
+# Some sample options and what they would do:
+# Enable debugging output, device managed flow steering, and disable SRIOV
+#options mlx4_core debug_level=1 log_num_mgm_entry_size=-1 probe_vf=0 num_vfs=0
+#
+# Enable debugging output and create SRIOV devices, but don't attach any of
+# the child devices to the host, only the parent device
+#options mlx4_core debug_level=1 probe_vf=0 num_vfs=7
+#
+# Enable debugging output, SRIOV, and attach one of the SRIOV child devices
+# in addition to the parent device to the host
+#options mlx4_core debug_level=1 probe_vf=1 num_vfs=7
+#
+# Enable per-priority flow control for send and receive, setting both priority
+# 1 and 2 as no-drop priorities
+#options mlx4_en pfctx=3 pfcrx=3
diff --git a/providers/mlx4/mlx4.h b/providers/mlx4/mlx4.h
new file mode 100644
index 0000000..3c161e8
--- /dev/null
+++ b/providers/mlx4/mlx4.h
@@ -0,0 +1,421 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_H +#define MLX4_H + +#include <endian.h> +#include <stddef.h> +#include <util/compiler.h> + +#include <infiniband/driver.h> +#include <util/udma_barrier.h> +#include <util/util.h> +#include <infiniband/verbs.h> + +#include "mlx4dv.h" + +#define MLX4_PORTS_NUM 2 + +#include <valgrind/memcheck.h> + +#define PFX "mlx4: " + +enum { + MLX4_STAT_RATE_OFFSET = 5 +}; + +enum { + MLX4_QP_TABLE_BITS = 8, + MLX4_QP_TABLE_SIZE = 1 << MLX4_QP_TABLE_BITS, + MLX4_QP_TABLE_MASK = MLX4_QP_TABLE_SIZE - 1 +}; + +#define MLX4_REMOTE_SRQN_FLAGS(wr) htobe32(wr->qp_type.xrc.remote_srqn << 8) + +enum { + MLX4_XSRQ_TABLE_BITS = 8, + MLX4_XSRQ_TABLE_SIZE = 1 << MLX4_XSRQ_TABLE_BITS, + MLX4_XSRQ_TABLE_MASK = MLX4_XSRQ_TABLE_SIZE - 1 +}; + +struct mlx4_xsrq_table { + struct { + struct mlx4_srq **table; + int refcnt; + } xsrq_table[MLX4_XSRQ_TABLE_SIZE]; + + pthread_mutex_t mutex; + int num_xsrq; + int shift; + int mask; +}; + +enum { + MLX4_XRC_QPN_BIT = (1 << 23) +}; + +enum mlx4_db_type { + MLX4_DB_TYPE_CQ, + MLX4_DB_TYPE_RQ, + MLX4_NUM_DB_TYPE +}; + +struct mlx4_device { + struct verbs_device verbs_dev; + int page_size; + int abi_version; +}; + +struct mlx4_db_page; + +struct mlx4_context { + struct verbs_context ibv_ctx; + + void *uar; + off_t uar_mmap_offset; + + void *bf_page; + int bf_buf_size; + int bf_offset; + pthread_spinlock_t bf_lock; + + struct { + struct mlx4_qp **table; + int refcnt; + } qp_table[MLX4_QP_TABLE_SIZE]; + pthread_mutex_t qp_table_mutex; + int num_qps; + int qp_table_shift; + int qp_table_mask; + int max_qp_wr; + int max_sge; + + struct mlx4_db_page *db_list[MLX4_NUM_DB_TYPE]; + pthread_mutex_t db_list_mutex; + int cqe_size; + struct mlx4_xsrq_table xsrq_table; + struct { + uint8_t valid; + uint8_t link_layer; + uint8_t flags; + enum ibv_port_cap_flags caps; + } port_query_cache[MLX4_PORTS_NUM]; + struct { + uint64_t offset; + uint8_t offset_valid; + } core_clock; + void *hca_core_clock; + uint32_t max_inl_recv_sz; + uint8_t log_wqs_range_sz; + struct mlx4dv_ctx_allocators extern_alloc; +}; + +struct mlx4_buf { + void *buf; + size_t length; +}; + +struct mlx4_pd { + struct ibv_pd ibv_pd; + uint32_t pdn; +}; + +enum { + MLX4_CQ_FLAGS_RX_CSUM_VALID = 1 << 0, + MLX4_CQ_FLAGS_EXTENDED = 1 << 1, + MLX4_CQ_FLAGS_SINGLE_THREADED = 1 << 2, + 
MLX4_CQ_FLAGS_DV_OWNED = 1 << 3, +}; + +struct mlx4_cq { + struct ibv_cq_ex ibv_cq; + struct mlx4_buf buf; + struct mlx4_buf resize_buf; + pthread_spinlock_t lock; + uint32_t cqn; + uint32_t cons_index; + __be32 *set_ci_db; + __be32 *arm_db; + int arm_sn; + int cqe_size; + struct mlx4_qp *cur_qp; + struct mlx4_cqe *cqe; + uint32_t flags; +}; + +struct mlx4_srq { + struct verbs_srq verbs_srq; + struct mlx4_buf buf; + pthread_spinlock_t lock; + uint64_t *wrid; + uint32_t srqn; + int max; + int max_gs; + int wqe_shift; + int head; + int tail; + __be32 *db; + uint16_t counter; + uint8_t ext_srq; +}; + +struct mlx4_wq { + uint64_t *wrid; + pthread_spinlock_t lock; + int wqe_cnt; + int max_post; + unsigned head; + unsigned tail; + int max_gs; + int wqe_shift; + int offset; +}; + +enum mlx4_rsc_type { + MLX4_RSC_TYPE_QP = 0, + MLX4_RSC_TYPE_RSS_QP = 1, + MLX4_RSC_TYPE_SRQ = 2, +}; + +struct mlx4_qp { + union { + struct verbs_qp verbs_qp; + struct ibv_wq wq; + }; + struct mlx4_buf buf; + int max_inline_data; + int buf_size; + + __be32 doorbell_qpn; + __be32 sq_signal_bits; + int sq_spare_wqes; + struct mlx4_wq sq; + + __be32 *db; + struct mlx4_wq rq; + + uint8_t link_layer; + uint8_t type; /* enum mlx4_rsc_type */ + uint32_t qp_cap_cache; + uint32_t qpn_cache; +}; + +struct mlx4_ah { + struct ibv_ah ibv_ah; + struct mlx4_av av; + uint16_t vlan; + uint8_t mac[6]; +}; + +enum { + MLX4_CSUM_SUPPORT_UD_OVER_IB = (1 << 0), + MLX4_CSUM_SUPPORT_RAW_OVER_ETH = (1 << 1), + /* Only report rx checksum when the validation is valid */ + MLX4_RX_CSUM_VALID = (1 << 16), +}; + +#define to_mxxx(xxx, type) \ + container_of(ib##xxx, struct mlx4_##type, ibv_##xxx) + +static inline struct mlx4_device *to_mdev(struct ibv_device *ibdev) +{ + /* ibv_device is first field of verbs_device + * see try_driver() in libibverbs. 
+ */ + return container_of(ibdev, struct mlx4_device, verbs_dev.device); +} + +static inline struct mlx4_context *to_mctx(struct ibv_context *ibctx) +{ + return container_of(ibctx, struct mlx4_context, ibv_ctx.context); +} + +static inline struct mlx4_pd *to_mpd(struct ibv_pd *ibpd) +{ + return to_mxxx(pd, pd); +} + +static inline struct mlx4_cq *to_mcq(struct ibv_cq *ibcq) +{ + return container_of((struct ibv_cq_ex *)ibcq, struct mlx4_cq, ibv_cq); +} + +static inline struct mlx4_srq *to_msrq(struct ibv_srq *ibsrq) +{ + return container_of(ibsrq, struct mlx4_srq, verbs_srq.srq); +} + +static inline struct mlx4_qp *to_mqp(struct ibv_qp *ibqp) +{ + return container_of(ibqp, struct mlx4_qp, verbs_qp.qp); +} + +static inline struct mlx4_qp *wq_to_mqp(struct ibv_wq *ibwq) +{ + return container_of(ibwq, struct mlx4_qp, wq); +} + +static inline struct mlx4_ah *to_mah(struct ibv_ah *ibah) +{ + return to_mxxx(ah, ah); +} + +static inline void mlx4_update_cons_index(struct mlx4_cq *cq) +{ + *cq->set_ci_db = htobe32(cq->cons_index & 0xffffff); +} + +int mlx4_alloc_buf(struct mlx4_context *ctx, struct mlx4_buf *buf, size_t size, + int page_size); +void mlx4_free_buf(struct mlx4_context *ctx, struct mlx4_buf *buf); + +__be32 *mlx4_alloc_db(struct mlx4_context *context, enum mlx4_db_type type); +void mlx4_free_db(struct mlx4_context *context, enum mlx4_db_type type, + __be32 *db); + +int mlx4_query_device(struct ibv_context *context, + struct ibv_device_attr *attr); +int mlx4_query_device_ex(struct ibv_context *context, + const struct ibv_query_device_ex_input *input, + struct ibv_device_attr_ex *attr, + size_t attr_size); +int mlx4_query_port(struct ibv_context *context, uint8_t port, + struct ibv_port_attr *attr); +int mlx4_query_rt_values(struct ibv_context *context, + struct ibv_values_ex *values); +struct ibv_pd *mlx4_alloc_pd(struct ibv_context *context); +int mlx4_free_pd(struct ibv_pd *pd); +struct ibv_xrcd *mlx4_open_xrcd(struct ibv_context *context, + struct ibv_xrcd_init_attr *attr); +int mlx4_close_xrcd(struct ibv_xrcd *xrcd); + +struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + uint64_t hca_va, int access); +int mlx4_rereg_mr(struct verbs_mr *vmr, int flags, struct ibv_pd *pd, + void *addr, size_t length, int access); +int mlx4_dereg_mr(struct verbs_mr *vmr); + +struct ibv_mw *mlx4_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type); +int mlx4_dealloc_mw(struct ibv_mw *mw); +int mlx4_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw, + struct ibv_mw_bind *mw_bind); + +struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector); +struct ibv_cq_ex *mlx4_create_cq_ex(struct ibv_context *context, + struct ibv_cq_init_attr_ex *cq_attr); +void mlx4_cq_fill_pfns(struct mlx4_cq *cq, const struct ibv_cq_init_attr_ex *cq_attr); +int mlx4_alloc_cq_buf(struct mlx4_device *dev, struct mlx4_context *ctx, + struct mlx4_buf *buf, int nent, int entry_size); +int mlx4_resize_cq(struct ibv_cq *cq, int cqe); +int mlx4_modify_cq(struct ibv_cq *cq, struct ibv_modify_cq_attr *attr); +int mlx4_destroy_cq(struct ibv_cq *cq); +int mlx4_poll_cq(struct ibv_cq *cq, int ne, struct ibv_wc *wc); +int mlx4_arm_cq(struct ibv_cq *cq, int solicited); +void mlx4_cq_event(struct ibv_cq *cq); +void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq); +void mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq); +int mlx4_get_outstanding_cqes(struct mlx4_cq *cq); +void mlx4_cq_resize_copy_cqes(struct mlx4_cq *cq, 
void *buf, int new_cqe); + +struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *attr); +struct ibv_srq *mlx4_create_srq_ex(struct ibv_context *context, + struct ibv_srq_init_attr_ex *attr_ex); +struct ibv_srq *mlx4_create_xrc_srq(struct ibv_context *context, + struct ibv_srq_init_attr_ex *attr_ex); +int mlx4_modify_srq(struct ibv_srq *srq, + struct ibv_srq_attr *attr, + int mask); +int mlx4_query_srq(struct ibv_srq *srq, + struct ibv_srq_attr *attr); +int mlx4_destroy_srq(struct ibv_srq *srq); +int mlx4_destroy_xrc_srq(struct ibv_srq *srq); +int mlx4_alloc_srq_buf(struct ibv_pd *pd, struct ibv_srq_attr *attr, + struct mlx4_srq *srq); +void mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size); +struct mlx4_srq *mlx4_find_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn); +int mlx4_store_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn, + struct mlx4_srq *srq); +void mlx4_clear_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn); +void mlx4_free_srq_wqe(struct mlx4_srq *srq, int ind); +int mlx4_post_srq_recv(struct ibv_srq *ibsrq, + struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); + +struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr); +struct ibv_qp *mlx4_create_qp_ex(struct ibv_context *context, + struct ibv_qp_init_attr_ex *attr); +struct ibv_qp *mlx4_open_qp(struct ibv_context *context, struct ibv_qp_open_attr *attr); +int mlx4_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr); +int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask); +int mlx4_destroy_qp(struct ibv_qp *qp); +void mlx4_init_qp_indices(struct mlx4_qp *qp); +void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp); +int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr); +int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); +void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type, + struct mlx4_qp *qp, struct ibv_qp_init_attr_ex *attr); +int mlx4_alloc_qp_buf(struct ibv_context *context, uint32_t max_recv_sge, + enum ibv_qp_type type, struct mlx4_qp *qp, + struct mlx4dv_qp_init_attr *mlx4qp_attr); +void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap, + enum ibv_qp_type type); +struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn); +int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp); +void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn); +struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr); +int mlx4_destroy_ah(struct ibv_ah *ah); +int mlx4_alloc_av(struct mlx4_pd *pd, struct ibv_ah_attr *attr, + struct mlx4_ah *ah); +void mlx4_free_av(struct mlx4_ah *ah); +struct ibv_wq *mlx4_create_wq(struct ibv_context *context, + struct ibv_wq_init_attr *attr); +int mlx4_modify_wq(struct ibv_wq *wq, struct ibv_wq_attr *attr); +int mlx4_destroy_wq(struct ibv_wq *wq); +struct ibv_rwq_ind_table *mlx4_create_rwq_ind_table(struct ibv_context *context, + struct ibv_rwq_ind_table_init_attr *init_attr); +int mlx4_destroy_rwq_ind_table(struct ibv_rwq_ind_table *rwq_ind_table); +int mlx4_post_wq_recv(struct ibv_wq *ibwq, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); +struct ibv_flow *mlx4_create_flow(struct ibv_qp *qp, struct ibv_flow_attr *flow_attr); +int mlx4_destroy_flow(struct ibv_flow *flow_id); + +#endif /* MLX4_H */ diff --git a/providers/mlx4/mlx4dv.h b/providers/mlx4/mlx4dv.h 
new file mode 100644 index 0000000..d01c10b --- /dev/null +++ b/providers/mlx4/mlx4dv.h @@ -0,0 +1,562 @@ +/* + * Copyright (c) 2017 Mellanox Technologies, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _MLX4DV_H_ +#define _MLX4DV_H_ + +#include <stdio.h> +#include <linux/types.h> +#include <endian.h> +#include <infiniband/verbs.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* Always inline the functions */ +#ifdef __GNUC__ +#define MLX4DV_ALWAYS_INLINE inline __attribute__((always_inline)) +#else +#define MLX4DV_ALWAYS_INLINE inline +#endif + +enum { + MLX4_OPCODE_NOP = 0x00, + MLX4_OPCODE_SEND_INVAL = 0x01, + MLX4_OPCODE_RDMA_WRITE = 0x08, + MLX4_OPCODE_RDMA_WRITE_IMM = 0x09, + MLX4_OPCODE_SEND = 0x0a, + MLX4_OPCODE_SEND_IMM = 0x0b, + MLX4_OPCODE_LSO = 0x0e, + MLX4_OPCODE_RDMA_READ = 0x10, + MLX4_OPCODE_ATOMIC_CS = 0x11, + MLX4_OPCODE_ATOMIC_FA = 0x12, + MLX4_OPCODE_MASKED_ATOMIC_CS = 0x14, + MLX4_OPCODE_MASKED_ATOMIC_FA = 0x15, + MLX4_OPCODE_BIND_MW = 0x18, + MLX4_OPCODE_FMR = 0x19, + MLX4_OPCODE_LOCAL_INVAL = 0x1b, + MLX4_OPCODE_CONFIG_CMD = 0x1f, + + MLX4_RECV_OPCODE_RDMA_WRITE_IMM = 0x00, + MLX4_RECV_OPCODE_SEND = 0x01, + MLX4_RECV_OPCODE_SEND_IMM = 0x02, + MLX4_RECV_OPCODE_SEND_INVAL = 0x03, + + MLX4_CQE_OPCODE_ERROR = 0x1e, + MLX4_CQE_OPCODE_RESIZE = 0x16, +}; + +enum { + MLX4_CQ_DOORBELL = 0x20 +}; + +#define MLX4_CQ_DB_REQ_NOT_SOL (1 << 24) +#define MLX4_CQ_DB_REQ_NOT (2 << 24) + +enum { + MLX4_CQE_VLAN_PRESENT_MASK = 1 << 29, + MLX4_CQE_QPN_MASK = 0xffffff, +}; + +enum { + MLX4_CQE_OWNER_MASK = 0x80, + MLX4_CQE_IS_SEND_MASK = 0x40, + MLX4_CQE_OPCODE_MASK = 0x1f +}; + +enum { + MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR = 0x01, + MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR = 0x02, + MLX4_CQE_SYNDROME_LOCAL_PROT_ERR = 0x04, + MLX4_CQE_SYNDROME_WR_FLUSH_ERR = 0x05, + MLX4_CQE_SYNDROME_MW_BIND_ERR = 0x06, + MLX4_CQE_SYNDROME_BAD_RESP_ERR = 0x10, + MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR = 0x11, + MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR = 0x12, + MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR = 0x13, + MLX4_CQE_SYNDROME_REMOTE_OP_ERR = 0x14, + MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR = 0x15, + MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR = 0x16, + MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR = 0x22, +}; + +struct mlx4_err_cqe { + 
uint32_t	vlan_my_qpn;
+	uint32_t	reserved1[5];
+	uint16_t	wqe_index;
+	uint8_t		vendor_err;
+	uint8_t		syndrome;
+	uint8_t		reserved2[3];
+	uint8_t		owner_sr_opcode;
+};
+
+enum mlx4_cqe_status {
+	MLX4_CQE_STATUS_TCP_UDP_CSUM_OK	= (1 << 2),
+	MLX4_CQE_STATUS_IPV4_PKT	= (1 << 22),
+	MLX4_CQE_STATUS_IP_HDR_CSUM_OK	= (1 << 28),
+	MLX4_CQE_STATUS_IPV4_CSUM_OK	= MLX4_CQE_STATUS_IPV4_PKT |
+					  MLX4_CQE_STATUS_IP_HDR_CSUM_OK |
+					  MLX4_CQE_STATUS_TCP_UDP_CSUM_OK
+};
+
+struct mlx4_cqe {
+	__be32		vlan_my_qpn;
+	__be32		immed_rss_invalid;
+	__be32		g_mlpath_rqpn;
+	union {
+		struct {
+			__be16	sl_vid;
+			__be16	rlid;
+		};
+		__be32	ts_47_16;
+	};
+	__be32		status;
+	__be32		byte_cnt;
+	__be16		wqe_index;
+	__be16		checksum;
+	uint8_t		reserved3;
+	uint8_t		ts_15_8;
+	uint8_t		ts_7_0;
+	uint8_t		owner_sr_opcode;
+};
+
+enum mlx4dv_qp_comp_mask {
+	MLX4DV_QP_MASK_UAR_MMAP_OFFSET = 1 << 0,
+};
+
+struct mlx4dv_qp {
+	__be32		*rdb;
+	uint32_t	*sdb;
+	__be32		doorbell_qpn;
+	struct {
+		uint32_t	wqe_cnt;
+		int		wqe_shift;
+		int		offset;
+	} sq;
+	struct {
+		uint32_t	wqe_cnt;
+		int		wqe_shift;
+		int		offset;
+	} rq;
+	struct {
+		void		*buf;
+		size_t		length;
+	} buf;
+	uint64_t	comp_mask;
+	off_t		uar_mmap_offset;
+};
+
+enum mlx4dv_cq_comp_mask {
+	MLX4DV_CQ_MASK_UAR = 1 << 0,
+};
+
+struct mlx4dv_cq {
+	struct {
+		void		*buf;
+		size_t		length;
+	} buf;
+	uint32_t	cqe_cnt;
+	uint32_t	cqn;
+	__be32		*set_ci_db;
+	__be32		*arm_db;
+	int		arm_sn;
+	int		cqe_size;
+	uint64_t	comp_mask;
+	void		*cq_uar;
+};
+
+struct mlx4dv_srq {
+	struct {
+		void		*buf;
+		size_t		length;
+	} buf;
+	int		wqe_shift;
+	int		head;
+	int		tail;
+	__be32		*db;
+	uint64_t	comp_mask;
+};
+
+struct mlx4dv_rwq {
+	__be32		*rdb;
+	struct {
+		uint32_t	wqe_cnt;
+		int		wqe_shift;
+		int		offset;
+	} rq;
+	struct {
+		void		*buf;
+		size_t		length;
+	} buf;
+	uint64_t	comp_mask;
+};
+
+struct mlx4dv_obj {
+	struct {
+		struct ibv_qp		*in;
+		struct mlx4dv_qp	*out;
+	} qp;
+	struct {
+		struct ibv_cq		*in;
+		struct mlx4dv_cq	*out;
+	} cq;
+	struct {
+		struct ibv_srq		*in;
+		struct mlx4dv_srq	*out;
+	} srq;
+	struct {
+		struct ibv_wq		*in;
+		struct mlx4dv_rwq	*out;
+	} rwq;
+};
+
+enum mlx4dv_obj_type {
+	MLX4DV_OBJ_QP	= 1 << 0,
+	MLX4DV_OBJ_CQ	= 1 << 1,
+	MLX4DV_OBJ_SRQ	= 1 << 2,
+	MLX4DV_OBJ_RWQ	= 1 << 3,
+};
+
+/*
+ * This function initializes the mlx4dv_xxx structs selected by the supplied
+ * object type. The information used for initialization is taken from the
+ * ibv_xx structs supplied as part of the input.
+ *
+ * Requesting information for a CQ marks it as owned by DV for all
+ * consumer-index related actions.
+ *
+ * The initialization type can be a combination of several types together.
+ *
+ * Return: 0 in case of success.
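+ *
+ * Editorial usage sketch (not part of the original header); "qp" and "cq"
+ * are assumed to have been created earlier through the regular verbs API:
+ *
+ *	struct mlx4dv_qp dv_qp = { .comp_mask = 0 };
+ *	struct mlx4dv_cq dv_cq = { .comp_mask = 0 };
+ *	struct mlx4dv_obj obj = {
+ *		.qp = { .in = qp, .out = &dv_qp },
+ *		.cq = { .in = cq, .out = &dv_cq },
+ *	};
+ *
+ *	if (mlx4dv_init_obj(&obj, MLX4DV_OBJ_QP | MLX4DV_OBJ_CQ))
+ *		return;	/* handle the error */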
+ */ +int mlx4dv_init_obj(struct mlx4dv_obj *obj, uint64_t obj_type); + +static MLX4DV_ALWAYS_INLINE +uint8_t mlx4dv_get_cqe_owner(struct mlx4_cqe *cqe) +{ + return cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK; +} + +static MLX4DV_ALWAYS_INLINE +void mlx4dv_set_cqe_owner(struct mlx4_cqe *cqe, uint8_t val) +{ + cqe->owner_sr_opcode = (val & MLX4_CQE_OWNER_MASK) | + (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK); +} + +static MLX4DV_ALWAYS_INLINE +uint8_t mlx4dv_get_cqe_opcode(struct mlx4_cqe *cqe) +{ + return cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK; +} + +/* + * WQE related part + */ + +enum { + MLX4_SEND_DOORBELL = 0x14, +}; + +enum { + MLX4_WQE_CTRL_SOLICIT = 1 << 1, + MLX4_WQE_CTRL_CQ_UPDATE = 3 << 2, + MLX4_WQE_CTRL_IP_HDR_CSUM = 1 << 4, + MLX4_WQE_CTRL_TCP_UDP_CSUM = 1 << 5, + MLX4_WQE_CTRL_FENCE = 1 << 6, + MLX4_WQE_CTRL_STRONG_ORDER = 1 << 7 +}; + +enum { + MLX4_WQE_BIND_TYPE_2 = (1UL<<31), + MLX4_WQE_BIND_ZERO_BASED = (1<<30), +}; + +enum { + MLX4_INLINE_SEG = 1UL << 31, + MLX4_INLINE_ALIGN = 64, +}; + +enum { + MLX4_INVALID_LKEY = 0x100, +}; + +enum { + MLX4_WQE_MW_REMOTE_READ = 1 << 29, + MLX4_WQE_MW_REMOTE_WRITE = 1 << 30, + MLX4_WQE_MW_ATOMIC = 1UL << 31 +}; + +struct mlx4_wqe_local_inval_seg { + uint64_t reserved1; + __be32 mem_key; + uint32_t reserved2; + uint64_t reserved3[2]; +}; + +struct mlx4_wqe_bind_seg { + __be32 flags1; + __be32 flags2; + __be32 new_rkey; + __be32 lkey; + __be64 addr; + __be64 length; +}; + +struct mlx4_wqe_ctrl_seg { + __be32 owner_opcode; + union { + struct { + uint8_t reserved[3]; + uint8_t fence_size; + }; + __be32 bf_qpn; + }; + /* + * High 24 bits are SRC remote buffer; low 8 bits are flags: + * [7] SO (strong ordering) + * [5] TCP/UDP checksum + * [4] IP checksum + * [3:2] C (generate completion queue entry) + * [1] SE (solicited event) + * [0] FL (force loopback) + */ + union { + __be32 srcrb_flags; + __be16 srcrb_flags16[2]; + }; + + /* + * imm is immediate data for send/RDMA write w/ immediate; + * also invalidation key for send with invalidate; input + * modifier for WQEs on CCQs. 
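+ *
+ * (Editorial note: mlx4_post_send() in qp.c stores wr->imm_data here
+ * as-is for IBV_WR_SEND_WITH_IMM and IBV_WR_RDMA_WRITE_WITH_IMM, but
+ * uses htobe32(wr->invalidate_rkey) for IBV_WR_SEND_WITH_INV; the
+ * comment inside mlx4dv_set_ctrl_seg() below makes the same point.)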
+ */
+	__be32			imm;
+};
+
+struct mlx4_av {
+	__be32			port_pd;
+	uint8_t			reserved1;
+	uint8_t			g_slid;
+	__be16			dlid;
+	uint8_t			reserved2;
+	uint8_t			gid_index;
+	uint8_t			stat_rate;
+	uint8_t			hop_limit;
+	__be32			sl_tclass_flowlabel;
+	uint8_t			dgid[16];
+};
+
+struct mlx4_wqe_datagram_seg {
+	struct mlx4_av		av;
+	__be32			dqpn;
+	__be32			qkey;
+	__be16			vlan;
+	uint8_t			mac[6];
+};
+
+struct mlx4_wqe_data_seg {
+	__be32			byte_count;
+	__be32			lkey;
+	__be64			addr;
+};
+
+struct mlx4_wqe_inline_seg {
+	__be32			byte_count;
+};
+
+struct mlx4_wqe_srq_next_seg {
+	uint16_t		reserved1;
+	__be16			next_wqe_index;
+	uint32_t		reserved2[3];
+};
+
+struct mlx4_wqe_raddr_seg {
+	__be64			raddr;
+	__be32			rkey;
+	__be32			reserved;
+};
+
+struct mlx4_wqe_lso_seg {
+	__be32			mss_hdr_size;
+	__be32			header[0];
+};
+
+struct mlx4_wqe_atomic_seg {
+	__be64			swap_add;
+	__be64			compare;
+};
+
+enum mlx4dv_qp_init_attr_mask {
+	MLX4DV_QP_INIT_ATTR_MASK_INL_RECV	= 1 << 0,
+	MLX4DV_QP_INIT_ATTR_MASK_RESERVED	= 1 << 1,
+};
+
+struct mlx4dv_qp_init_attr {
+	uint64_t comp_mask;	/* Use enum mlx4dv_qp_init_attr_mask */
+	uint32_t inl_recv_sz;
+};
+
+struct ibv_qp *mlx4dv_create_qp(struct ibv_context *context,
+				struct ibv_qp_init_attr_ex *attr,
+				struct mlx4dv_qp_init_attr *mlx4_qp_attr);
+
+/*
+ * Direct verbs device-specific attributes
+ */
+struct mlx4dv_context {
+	uint8_t		version;
+	uint32_t	max_inl_recv_sz;
+	uint64_t	comp_mask;
+};
+
+/*
+ * Control segment - contains some control information for the current WQE.
+ *
+ * Output:
+ *	seg - control segment to be filled
+ * Input:
+ *	owner_opcode - Opcode of this WQE (encodes the type of operation
+ *		to be executed on the QP) and owner bit.
+ *	wqe_cnt - Number of queue entries.
+ *	ind - WQEBB number of the first block of this WQE.
+ *	fence_size - Fence bit and WQE size in octowords.
+ *	srcrb_flags - High 24 bits are SRC remote buffer; low 8 bits are
+ *		flags, which are described in the mlx4_wqe_ctrl_seg struct.
+ *	imm - Immediate data/Invalidation key.
+ */
+static MLX4DV_ALWAYS_INLINE
+void mlx4dv_set_ctrl_seg(struct mlx4_wqe_ctrl_seg *seg, uint32_t owner_opcode,
+			 uint8_t fence_size, uint32_t srcrb_flags, uint32_t imm)
+{
+	seg->owner_opcode = htobe32(owner_opcode);
+	seg->fence_size = fence_size;
+	seg->srcrb_flags = htobe32(srcrb_flags);
+	/*
+	 * The caller should prepare "imm" in advance based on WR opcode.
+	 * For IBV_WR_SEND_WITH_IMM and IBV_WR_RDMA_WRITE_WITH_IMM,
+	 * the "imm" should be assigned as is.
+	 * For the IBV_WR_SEND_WITH_INV, it should be htobe32(imm).
+	 */
+	seg->imm = imm;
+}
+
+/*
+ * Datagram Segment - contains address information required in order
+ * to form a datagram message.
+ *
+ * Output:
+ *	seg - datagram segment to be filled.
+ * Input:
+ *	port_pd - Port number and protection domain.
+ *	g_slid - GRH and source LID for IB port only.
+ *	dlid - Remote LID.
+ *	gid_index - Index to port GID table.
+ *	stat_rate - Maximum static rate control.
+ *	hop_limit - IPv6 hop limit.
+ *	sl_tclass_flowlabel - Service Level, IPv6 TClass and flow label.
+ *	dgid - Remote GID for IB port only.
+ *	dqpn - Destination QP.
+ *	qkey - QKey.
+ *	vlan - VLAN for RAW ETHERNET QP only.
+ *	mac - Destination MAC for RAW ETHERNET QP only.
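+ *
+ * Editorial usage sketch (hypothetical variable names, not part of the
+ * original header): filling the segment for a UD send on an IB port,
+ * where "seg" points just past the control segment of the WQE and port,
+ * pdn, dlid, dgid, sl, remote_qpn, qkey and dmac describe the peer:
+ *
+ *	mlx4dv_set_dgram_seg(seg,
+ *			     (port << 24) | pdn,	/* port_pd */
+ *			     0x80,			/* g_slid: GRH present */
+ *			     dlid,
+ *			     0,				/* gid_index */
+ *			     0,				/* stat_rate */
+ *			     64,			/* hop_limit */
+ *			     sl << 28,			/* sl_tclass_flowlabel */
+ *			     dgid, remote_qpn, qkey,
+ *			     0,				/* vlan (ETH only) */
+ *			     dmac);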
+ */ +static MLX4DV_ALWAYS_INLINE +void mlx4dv_set_dgram_seg(struct mlx4_wqe_datagram_seg *seg, uint32_t port_pd, + uint8_t g_slid, uint16_t dlid, uint8_t gid_index, + uint8_t stat_rate, uint8_t hop_limit, uint32_t + sl_tclass_flowlabel, uint8_t *dgid, uint32_t dqpn, + uint32_t qkey, uint16_t vlan, uint8_t *mac) +{ + seg->av.port_pd = htobe32(port_pd); + seg->av.g_slid = g_slid; + seg->av.dlid = htobe16(dlid); + seg->av.gid_index = gid_index; + seg->av.stat_rate = stat_rate; + seg->av.hop_limit = hop_limit; + seg->av.sl_tclass_flowlabel = htobe32(sl_tclass_flowlabel); + memcpy(seg->av.dgid, dgid, 16); + seg->dqpn = htobe32(dqpn); + seg->qkey = htobe32(qkey); + seg->vlan = htobe16(vlan); + memcpy(seg->mac, mac, 6); +} + +/* + * Data Segments - contain pointers and a byte count for the scatter/gather list. + * They can optionally contain data, which will save a memory read access for + * gather Work Requests. + */ +static MLX4DV_ALWAYS_INLINE +void mlx4dv_set_data_seg(struct mlx4_wqe_data_seg *seg, + uint32_t length, uint32_t lkey, + uintptr_t address) +{ + seg->byte_count = htobe32(length); + seg->lkey = htobe32(lkey); + seg->addr = htobe64(address); +} + +/* Most device capabilities are exported by ibv_query_device(...), + * but there is HW device-specific information which is important + * for data-path, but isn't provided. + * + * Return 0 on success. + */ +int mlx4dv_query_device(struct ibv_context *ctx_in, + struct mlx4dv_context *attrs_out); + +enum mlx4dv_set_ctx_attr_type { + /* Attribute type uint8_t */ + MLX4DV_SET_CTX_ATTR_LOG_WQS_RANGE_SZ = 0, + MLX4DV_SET_CTX_ATTR_BUF_ALLOCATORS = 1, +}; + +struct mlx4dv_ctx_allocators { + void *(*alloc)(size_t size, void *priv_data); + void (*free)(void *ptr, void *priv_data); + void *data; +}; +/* + * Returns 0 on success, or the value of errno on failure + * (which indicates the failure reason). + */ +int mlx4dv_set_context_attr(struct ibv_context *context, + enum mlx4dv_set_ctx_attr_type attr_type, + void *attr); + +#ifdef __cplusplus +} +#endif + +#endif /* _MLX4DV_H_ */ diff --git a/providers/mlx4/qp.c b/providers/mlx4/qp.c new file mode 100644 index 0000000..7e55d25 --- /dev/null +++ b/providers/mlx4/qp.c @@ -0,0 +1,817 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <config.h> + +#include <stdlib.h> +#include <pthread.h> +#include <string.h> +#include <errno.h> +#include <util/mmio.h> +#include <util/compiler.h> + +#include "mlx4.h" + +static const uint32_t mlx4_ib_opcode[] = { + [IBV_WR_SEND] = MLX4_OPCODE_SEND, + [IBV_WR_SEND_WITH_IMM] = MLX4_OPCODE_SEND_IMM, + [IBV_WR_RDMA_WRITE] = MLX4_OPCODE_RDMA_WRITE, + [IBV_WR_RDMA_WRITE_WITH_IMM] = MLX4_OPCODE_RDMA_WRITE_IMM, + [IBV_WR_RDMA_READ] = MLX4_OPCODE_RDMA_READ, + [IBV_WR_ATOMIC_CMP_AND_SWP] = MLX4_OPCODE_ATOMIC_CS, + [IBV_WR_ATOMIC_FETCH_AND_ADD] = MLX4_OPCODE_ATOMIC_FA, + [IBV_WR_LOCAL_INV] = MLX4_OPCODE_LOCAL_INVAL, + [IBV_WR_BIND_MW] = MLX4_OPCODE_BIND_MW, + [IBV_WR_SEND_WITH_INV] = MLX4_OPCODE_SEND_INVAL, +}; + +static void *get_recv_wqe(struct mlx4_qp *qp, int n) +{ + return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift); +} + +static void *get_send_wqe(struct mlx4_qp *qp, int n) +{ + return qp->buf.buf + qp->sq.offset + (n << qp->sq.wqe_shift); +} + +/* + * Stamp a SQ WQE so that it is invalid if prefetched by marking the + * first four bytes of every 64 byte chunk with 0xffffffff, except for + * the very first chunk of the WQE. + */ +static void stamp_send_wqe(struct mlx4_qp *qp, int n) +{ + uint32_t *wqe = get_send_wqe(qp, n); + int i; + int ds = (((struct mlx4_wqe_ctrl_seg *)wqe)->fence_size & 0x3f) << 2; + + for (i = 16; i < ds; i += 16) + wqe[i] = 0xffffffff; +} + +void mlx4_init_qp_indices(struct mlx4_qp *qp) +{ + qp->sq.head = 0; + qp->sq.tail = 0; + qp->rq.head = 0; + qp->rq.tail = 0; +} + +void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp) +{ + struct mlx4_wqe_ctrl_seg *ctrl; + int i; + + for (i = 0; i < qp->sq.wqe_cnt; ++i) { + ctrl = get_send_wqe(qp, i); + ctrl->owner_opcode = htobe32(1 << 31); + ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4); + + stamp_send_wqe(qp, i); + } +} + +static int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_cq *cq) +{ + unsigned cur; + + cur = wq->head - wq->tail; + if (cur + nreq < wq->max_post) + return 0; + + pthread_spin_lock(&cq->lock); + cur = wq->head - wq->tail; + pthread_spin_unlock(&cq->lock); + + return cur + nreq >= wq->max_post; +} + +static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg, struct ibv_send_wr *wr) +{ + int acc = wr->bind_mw.bind_info.mw_access_flags; + bseg->flags1 = 0; + if (acc & IBV_ACCESS_REMOTE_ATOMIC) + bseg->flags1 |= htobe32(MLX4_WQE_MW_ATOMIC); + if (acc & IBV_ACCESS_REMOTE_WRITE) + bseg->flags1 |= htobe32(MLX4_WQE_MW_REMOTE_WRITE); + if (acc & IBV_ACCESS_REMOTE_READ) + bseg->flags1 |= htobe32(MLX4_WQE_MW_REMOTE_READ); + + bseg->flags2 = 0; + if (((struct ibv_mw *)(wr->bind_mw.mw))->type == IBV_MW_TYPE_2) + bseg->flags2 |= htobe32(MLX4_WQE_BIND_TYPE_2); + if (acc & IBV_ACCESS_ZERO_BASED) + bseg->flags2 |= htobe32(MLX4_WQE_BIND_ZERO_BASED); + + bseg->new_rkey = htobe32(wr->bind_mw.rkey); + bseg->lkey = htobe32(wr->bind_mw.bind_info.mr->lkey); + bseg->addr = htobe64((uint64_t) wr->bind_mw.bind_info.addr); + bseg->length = htobe64(wr->bind_mw.bind_info.length); +} + +static inline void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, + uint32_t rkey) +{ + iseg->mem_key = htobe32(rkey); + + iseg->reserved1 = 0; + iseg->reserved2 = 0; + iseg->reserved3[0] = 0; + iseg->reserved3[1] = 0; +} + +static inline void 
set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg, + uint64_t remote_addr, uint32_t rkey) +{ + rseg->raddr = htobe64(remote_addr); + rseg->rkey = htobe32(rkey); + rseg->reserved = 0; +} + +static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ibv_send_wr *wr) +{ + if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) { + aseg->swap_add = htobe64(wr->wr.atomic.swap); + aseg->compare = htobe64(wr->wr.atomic.compare_add); + } else { + aseg->swap_add = htobe64(wr->wr.atomic.compare_add); + aseg->compare = 0; + } + +} + +static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, + struct ibv_send_wr *wr) +{ + memcpy(&dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av)); + dseg->dqpn = htobe32(wr->wr.ud.remote_qpn); + dseg->qkey = htobe32(wr->wr.ud.remote_qkey); + dseg->vlan = htobe16(to_mah(wr->wr.ud.ah)->vlan); + memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->mac, 6); +} + +static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg) +{ + dseg->byte_count = htobe32(sg->length); + dseg->lkey = htobe32(sg->lkey); + dseg->addr = htobe64(sg->addr); +} + +static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg) +{ + dseg->lkey = htobe32(sg->lkey); + dseg->addr = htobe64(sg->addr); + + /* + * Need a barrier here before writing the byte_count field to + * make sure that all the data is visible before the + * byte_count field is set. Otherwise, if the segment begins + * a new cacheline, the HCA prefetcher could grab the 64-byte + * chunk and get a valid (!= * 0xffffffff) byte count but + * stale data, and end up sending the wrong data. + */ + udma_to_device_barrier(); + + if (likely(sg->length)) + dseg->byte_count = htobe32(sg->length); + else + dseg->byte_count = htobe32(0x80000000); +} + +int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr) +{ + struct mlx4_context *ctx; + struct mlx4_qp *qp = to_mqp(ibqp); + void *wqe; + struct mlx4_wqe_ctrl_seg *uninitialized_var(ctrl); + int ind; + int nreq; + int inl = 0; + int ret = 0; + int size = 0; + int i; + + pthread_spin_lock(&qp->sq.lock); + + /* XXX check that state is OK to post send */ + + ind = qp->sq.head; + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (wq_overflow(&qp->sq, nreq, to_mcq(ibqp->send_cq))) { + ret = ENOMEM; + *bad_wr = wr; + goto out; + } + + if (wr->num_sge > qp->sq.max_gs) { + ret = ENOMEM; + *bad_wr = wr; + goto out; + } + + if (wr->opcode >= sizeof mlx4_ib_opcode / sizeof mlx4_ib_opcode[0]) { + ret = EINVAL; + *bad_wr = wr; + goto out; + } + + ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); + qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id; + + ctrl->srcrb_flags = + (wr->send_flags & IBV_SEND_SIGNALED ? + htobe32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) | + (wr->send_flags & IBV_SEND_SOLICITED ? 
+ htobe32(MLX4_WQE_CTRL_SOLICIT) : 0) | + qp->sq_signal_bits; + + if (wr->opcode == IBV_WR_SEND_WITH_IMM || + wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM) + ctrl->imm = wr->imm_data; + else + ctrl->imm = 0; + + wqe += sizeof *ctrl; + size = sizeof *ctrl / 16; + + switch (ibqp->qp_type) { + case IBV_QPT_XRC_SEND: + ctrl->srcrb_flags |= MLX4_REMOTE_SRQN_FLAGS(wr); + /* fall through */ + case IBV_QPT_RC: + case IBV_QPT_UC: + switch (wr->opcode) { + case IBV_WR_ATOMIC_CMP_AND_SWP: + case IBV_WR_ATOMIC_FETCH_AND_ADD: + set_raddr_seg(wqe, wr->wr.atomic.remote_addr, + wr->wr.atomic.rkey); + wqe += sizeof (struct mlx4_wqe_raddr_seg); + + set_atomic_seg(wqe, wr); + wqe += sizeof (struct mlx4_wqe_atomic_seg); + size += (sizeof (struct mlx4_wqe_raddr_seg) + + sizeof (struct mlx4_wqe_atomic_seg)) / 16; + + break; + + case IBV_WR_RDMA_READ: + inl = 1; + /* fall through */ + case IBV_WR_RDMA_WRITE: + case IBV_WR_RDMA_WRITE_WITH_IMM: + if (!wr->num_sge) + inl = 1; + set_raddr_seg(wqe, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + wqe += sizeof (struct mlx4_wqe_raddr_seg); + size += sizeof (struct mlx4_wqe_raddr_seg) / 16; + + break; + case IBV_WR_LOCAL_INV: + ctrl->srcrb_flags |= + htobe32(MLX4_WQE_CTRL_STRONG_ORDER); + set_local_inv_seg(wqe, wr->invalidate_rkey); + wqe += sizeof + (struct mlx4_wqe_local_inval_seg); + size += sizeof + (struct mlx4_wqe_local_inval_seg) / 16; + break; + case IBV_WR_BIND_MW: + ctrl->srcrb_flags |= + htobe32(MLX4_WQE_CTRL_STRONG_ORDER); + set_bind_seg(wqe, wr); + wqe += sizeof + (struct mlx4_wqe_bind_seg); + size += sizeof + (struct mlx4_wqe_bind_seg) / 16; + break; + case IBV_WR_SEND_WITH_INV: + ctrl->imm = htobe32(wr->invalidate_rkey); + break; + + default: + /* No extra segments required for sends */ + break; + } + break; + + case IBV_QPT_UD: + set_datagram_seg(wqe, wr); + wqe += sizeof (struct mlx4_wqe_datagram_seg); + size += sizeof (struct mlx4_wqe_datagram_seg) / 16; + + if (wr->send_flags & IBV_SEND_IP_CSUM) { + if (!(qp->qp_cap_cache & MLX4_CSUM_SUPPORT_UD_OVER_IB)) { + ret = EINVAL; + *bad_wr = wr; + goto out; + } + ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_IP_HDR_CSUM | + MLX4_WQE_CTRL_TCP_UDP_CSUM); + } + break; + + case IBV_QPT_RAW_PACKET: + /* For raw eth, the MLX4_WQE_CTRL_SOLICIT flag is used + * to indicate that no icrc should be calculated */ + ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_SOLICIT); + if (wr->send_flags & IBV_SEND_IP_CSUM) { + if (!(qp->qp_cap_cache & MLX4_CSUM_SUPPORT_RAW_OVER_ETH)) { + ret = EINVAL; + *bad_wr = wr; + goto out; + } + ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_IP_HDR_CSUM | + MLX4_WQE_CTRL_TCP_UDP_CSUM); + } + /* Take the dmac from the payload - needed for loopback */ + if (qp->link_layer == IBV_LINK_LAYER_ETHERNET) { + ctrl->srcrb_flags16[0] = *(__be16 *)(uintptr_t)wr->sg_list[0].addr; + ctrl->imm = *(__be32 *)((uintptr_t)(wr->sg_list[0].addr) + 2); + } + + break; + + default: + break; + } + + if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) { + struct mlx4_wqe_inline_seg *seg; + void *addr; + int len, seg_len; + int num_seg; + int off, to_copy; + + inl = 0; + + seg = wqe; + wqe += sizeof *seg; + off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1); + num_seg = 0; + seg_len = 0; + + for (i = 0; i < wr->num_sge; ++i) { + addr = (void *) (uintptr_t) wr->sg_list[i].addr; + len = wr->sg_list[i].length; + inl += len; + + if (inl > qp->max_inline_data) { + inl = 0; + ret = ENOMEM; + *bad_wr = wr; + goto out; + } + + while (len >= MLX4_INLINE_ALIGN - off) { + to_copy = MLX4_INLINE_ALIGN - off; + memcpy(wqe, addr, to_copy); + len -= 
to_copy; + wqe += to_copy; + addr += to_copy; + seg_len += to_copy; + udma_to_device_barrier(); /* see comment below */ + seg->byte_count = htobe32(MLX4_INLINE_SEG | seg_len); + seg_len = 0; + seg = wqe; + wqe += sizeof *seg; + off = sizeof *seg; + ++num_seg; + } + + memcpy(wqe, addr, len); + wqe += len; + seg_len += len; + off += len; + } + + if (seg_len) { + ++num_seg; + /* + * Need a barrier here to make sure + * all the data is visible before the + * byte_count field is set. Otherwise + * the HCA prefetcher could grab the + * 64-byte chunk with this inline + * segment and get a valid (!= + * 0xffffffff) byte count but stale + * data, and end up sending the wrong + * data. + */ + udma_to_device_barrier(); + seg->byte_count = htobe32(MLX4_INLINE_SEG | seg_len); + } + + size += (inl + num_seg * sizeof * seg + 15) / 16; + } else { + struct mlx4_wqe_data_seg *seg = wqe; + + for (i = wr->num_sge - 1; i >= 0 ; --i) + set_data_seg(seg + i, wr->sg_list + i); + + size += wr->num_sge * (sizeof *seg / 16); + } + + ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ? + MLX4_WQE_CTRL_FENCE : 0) | size; + + /* + * Make sure descriptor is fully written before + * setting ownership bit (because HW can start + * executing as soon as we do). + */ + udma_to_device_barrier(); + + ctrl->owner_opcode = htobe32(mlx4_ib_opcode[wr->opcode]) | + (ind & qp->sq.wqe_cnt ? htobe32(1 << 31) : 0); + + /* + * We can improve latency by not stamping the last + * send queue WQE until after ringing the doorbell, so + * only stamp here if there are still more WQEs to post. + */ + if (wr->next) + stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) & + (qp->sq.wqe_cnt - 1)); + + ++ind; + } + +out: + ctx = to_mctx(ibqp->context); + + if (nreq == 1 && inl && size > 1 && size <= ctx->bf_buf_size / 16) { + ctrl->owner_opcode |= htobe32((qp->sq.head & 0xffff) << 8); + + ctrl->bf_qpn |= qp->doorbell_qpn; + ++qp->sq.head; + /* + * Make sure that descriptor is written to memory + * before writing to BlueFlame page. + */ + mmio_wc_spinlock(&ctx->bf_lock); + + mmio_memcpy_x64(ctx->bf_page + ctx->bf_offset, ctrl, + align(size * 16, 64)); + /* Flush before toggling bf_offset to be latency oriented */ + mmio_flush_writes(); + + ctx->bf_offset ^= ctx->bf_buf_size; + + pthread_spin_unlock(&ctx->bf_lock); + } else if (nreq) { + qp->sq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. 
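+		 * (Editorial note: on this path the "doorbell" is the send
+		 * doorbell register in the UAR page, i.e. the
+		 * mmio_write32_be() just below, so the barrier orders the
+		 * WQE stores against that MMIO write.)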
+ */ + udma_to_device_barrier(); + + mmio_write32_be(ctx->uar + MLX4_SEND_DOORBELL, + qp->doorbell_qpn); + } + + if (nreq) + stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) & + (qp->sq.wqe_cnt - 1)); + + pthread_spin_unlock(&qp->sq.lock); + + return ret; +} + +static inline int _mlx4_post_recv(struct mlx4_qp *qp, struct mlx4_cq *cq, + struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) + ALWAYS_INLINE; +static inline int _mlx4_post_recv(struct mlx4_qp *qp, struct mlx4_cq *cq, + struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct mlx4_wqe_data_seg *scat; + int ret = 0; + int nreq; + int ind; + int i; + + pthread_spin_lock(&qp->rq.lock); + + /* XXX check that state is OK to post receive */ + + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (wq_overflow(&qp->rq, nreq, cq)) { + ret = ENOMEM; + *bad_wr = wr; + goto out; + } + + if (wr->num_sge > qp->rq.max_gs) { + ret = ENOMEM; + *bad_wr = wr; + goto out; + } + + scat = get_recv_wqe(qp, ind); + + for (i = 0; i < wr->num_sge; ++i) + __set_data_seg(scat + i, wr->sg_list + i); + + if (i < qp->rq.max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = htobe32(MLX4_INVALID_LKEY); + scat[i].addr = 0; + } + + qp->rq.wrid[ind] = wr->wr_id; + + ind = (ind + 1) & (qp->rq.wqe_cnt - 1); + } + +out: + if (nreq) { + qp->rq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + udma_to_device_barrier(); + + *qp->db = htobe32(qp->rq.head & 0xffff); + } + + pthread_spin_unlock(&qp->rq.lock); + + return ret; +} + +int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct mlx4_qp *qp = to_mqp(ibqp); + struct mlx4_cq *cq = to_mcq(ibqp->recv_cq); + + return _mlx4_post_recv(qp, cq, wr, bad_wr); +} + +int mlx4_post_wq_recv(struct ibv_wq *ibwq, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct mlx4_qp *qp = wq_to_mqp(ibwq); + struct mlx4_cq *cq = to_mcq(ibwq->cq); + + return _mlx4_post_recv(qp, cq, wr, bad_wr); +} + +static int num_inline_segs(int data, enum ibv_qp_type type) +{ + /* + * Inline data segments are not allowed to cross 64 byte + * boundaries. For UD QPs, the data segments always start + * aligned to 64 bytes (16 byte control segment + 48 byte + * datagram segment); for other QPs, there will be a 16 byte + * control segment and possibly a 16 byte remote address + * segment, so in the worst case there will be only 32 bytes + * available for the first data segment. 
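+	 *
+	 * (Editorial worked example: with MLX4_INLINE_ALIGN = 64 and a
+	 * 4 byte inline segment header, each chunk carries at most 60
+	 * payload bytes, so 128 bytes of UD inline data need
+	 * (128 + 59) / 60 = 3 inline segments; on an RC QP the headers
+	 * shift the data by 32 bytes, giving (128 + 32 + 59) / 60 = 3
+	 * segments as well.)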
+ */ + if (type == IBV_QPT_UD) + data += (sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_datagram_seg)) % + MLX4_INLINE_ALIGN; + else + data += (sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_raddr_seg)) % + MLX4_INLINE_ALIGN; + + return (data + MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg) - 1) / + (MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg)); +} + +void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type, + struct mlx4_qp *qp, struct ibv_qp_init_attr_ex *attr) +{ + int size; + int max_sq_sge; + + max_sq_sge = align(cap->max_inline_data + + num_inline_segs(cap->max_inline_data, type) * + sizeof (struct mlx4_wqe_inline_seg), + sizeof (struct mlx4_wqe_data_seg)) / + sizeof (struct mlx4_wqe_data_seg); + if (max_sq_sge < cap->max_send_sge) + max_sq_sge = cap->max_send_sge; + + size = max_sq_sge * sizeof (struct mlx4_wqe_data_seg); + switch (type) { + case IBV_QPT_UD: + size += sizeof (struct mlx4_wqe_datagram_seg); + break; + + case IBV_QPT_UC: + size += sizeof (struct mlx4_wqe_raddr_seg); + break; + + case IBV_QPT_XRC_SEND: + case IBV_QPT_RC: + size += sizeof (struct mlx4_wqe_raddr_seg); + /* + * An atomic op will require an atomic segment, a + * remote address segment and one scatter entry. + */ + if (size < (sizeof (struct mlx4_wqe_atomic_seg) + + sizeof (struct mlx4_wqe_raddr_seg) + + sizeof (struct mlx4_wqe_data_seg))) + size = (sizeof (struct mlx4_wqe_atomic_seg) + + sizeof (struct mlx4_wqe_raddr_seg) + + sizeof (struct mlx4_wqe_data_seg)); + break; + + default: + break; + } + + /* Make sure that we have enough space for a bind request */ + if (size < sizeof (struct mlx4_wqe_bind_seg)) + size = sizeof (struct mlx4_wqe_bind_seg); + + size += sizeof (struct mlx4_wqe_ctrl_seg); + + if (attr->comp_mask & IBV_QP_INIT_ATTR_MAX_TSO_HEADER) + size += align(sizeof (struct mlx4_wqe_lso_seg) + attr->max_tso_header, 16); + + for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size; + qp->sq.wqe_shift++) + ; /* nothing */ +} + +int mlx4_alloc_qp_buf(struct ibv_context *context, uint32_t max_recv_sge, + enum ibv_qp_type type, struct mlx4_qp *qp, + struct mlx4dv_qp_init_attr *mlx4qp_attr) +{ + int wqe_size; + + qp->rq.max_gs = max_recv_sge; + wqe_size = qp->rq.max_gs * sizeof(struct mlx4_wqe_data_seg); + if (mlx4qp_attr && + mlx4qp_attr->comp_mask & MLX4DV_QP_INIT_ATTR_MASK_INL_RECV && + mlx4qp_attr->inl_recv_sz > wqe_size) + wqe_size = mlx4qp_attr->inl_recv_sz; + + if (qp->sq.wqe_cnt) { + qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t)); + if (!qp->sq.wrid) + return -1; + } + + if (qp->rq.wqe_cnt) { + qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t)); + if (!qp->rq.wrid) { + free(qp->sq.wrid); + return -1; + } + } + + for (qp->rq.wqe_shift = 4; + 1 << qp->rq.wqe_shift < wqe_size; + qp->rq.wqe_shift++) + ; /* nothing */ + if (mlx4qp_attr) + mlx4qp_attr->inl_recv_sz = 1 << qp->rq.wqe_shift; + + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << qp->sq.wqe_shift); + if (qp->rq.wqe_shift > qp->sq.wqe_shift) { + qp->rq.offset = 0; + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + } else { + qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift; + qp->sq.offset = 0; + } + + if (qp->buf_size) { + if (mlx4_alloc_buf(to_mctx(context), &qp->buf, + align(qp->buf_size, to_mdev(context->device)->page_size), + to_mdev(context->device)->page_size)) { + free(qp->sq.wrid); + free(qp->rq.wrid); + return -1; + } + + memset(qp->buf.buf, 0, qp->buf_size); + } else { + qp->buf.buf = NULL; + } + + return 0; +} + 
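+/*
+ * (Editorial worked example for the sizing above: a receive WQE with
+ * max_gs = 3 needs 3 * 16 = 48 bytes, so the shift loop settles on
+ * rq.wqe_shift = 6, a 64 byte stride; with rq.wqe_cnt = 256 the RQ then
+ * occupies 256 << 6 = 16 KB of the QP buffer, and the ring with the
+ * larger stride (the SQ on ties) is placed at offset 0 of the buffer.)
+ */
+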
+void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap, + enum ibv_qp_type type) +{ + int wqe_size; + + wqe_size = (1 << qp->sq.wqe_shift) - sizeof (struct mlx4_wqe_ctrl_seg); + switch (type) { + case IBV_QPT_UD: + wqe_size -= sizeof (struct mlx4_wqe_datagram_seg); + break; + + case IBV_QPT_XRC_SEND: + case IBV_QPT_UC: + case IBV_QPT_RC: + wqe_size -= sizeof (struct mlx4_wqe_raddr_seg); + break; + + default: + break; + } + + qp->sq.max_gs = wqe_size / sizeof (struct mlx4_wqe_data_seg); + cap->max_send_sge = qp->sq.max_gs; + qp->sq.max_post = qp->sq.wqe_cnt - qp->sq_spare_wqes; + cap->max_send_wr = qp->sq.max_post; + + /* + * Inline data segments can't cross a 64 byte boundary. So + * subtract off one segment header for each 64-byte chunk, + * taking into account the fact that wqe_size will be 32 mod + * 64 for non-UD QPs. + */ + qp->max_inline_data = wqe_size - + sizeof (struct mlx4_wqe_inline_seg) * + (align(wqe_size, MLX4_INLINE_ALIGN) / MLX4_INLINE_ALIGN); + cap->max_inline_data = qp->max_inline_data; +} + +struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn) +{ + int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift; + + if (ctx->qp_table[tind].refcnt) + return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask]; + else + return NULL; +} + +int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp) +{ + int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift; + + if (!ctx->qp_table[tind].refcnt) { + ctx->qp_table[tind].table = calloc(ctx->qp_table_mask + 1, + sizeof (struct mlx4_qp *)); + if (!ctx->qp_table[tind].table) + return -1; + } + + ++ctx->qp_table[tind].refcnt; + ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = qp; + return 0; +} + +void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn) +{ + int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift; + + if (!--ctx->qp_table[tind].refcnt) + free(ctx->qp_table[tind].table); + else + ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL; +} diff --git a/providers/mlx4/srq.c b/providers/mlx4/srq.c new file mode 100644 index 0000000..a02a932 --- /dev/null +++ b/providers/mlx4/srq.c @@ -0,0 +1,323 @@ +/* + * Copyright (c) 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <config.h>
+
+#include <stdlib.h>
+#include <pthread.h>
+#include <string.h>
+
+#include "mlx4.h"
+#include "mlx4-abi.h"
+
+static void *get_wqe(struct mlx4_srq *srq, int n)
+{
+	return srq->buf.buf + (n << srq->wqe_shift);
+}
+
+void mlx4_free_srq_wqe(struct mlx4_srq *srq, int ind)
+{
+	struct mlx4_wqe_srq_next_seg *next;
+
+	pthread_spin_lock(&srq->lock);
+
+	next = get_wqe(srq, srq->tail);
+	next->next_wqe_index = htobe16(ind);
+	srq->tail = ind;
+
+	pthread_spin_unlock(&srq->lock);
+}
+
+int mlx4_post_srq_recv(struct ibv_srq *ibsrq,
+		       struct ibv_recv_wr *wr,
+		       struct ibv_recv_wr **bad_wr)
+{
+	struct mlx4_srq *srq = to_msrq(ibsrq);
+	struct mlx4_wqe_srq_next_seg *next;
+	struct mlx4_wqe_data_seg *scat;
+	int err = 0;
+	int nreq;
+	int i;
+
+	pthread_spin_lock(&srq->lock);
+
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		if (wr->num_sge > srq->max_gs) {
+			err = -1;
+			*bad_wr = wr;
+			break;
+		}
+
+		if (srq->head == srq->tail) {
+			/* SRQ is full */
+			err = -1;
+			*bad_wr = wr;
+			break;
+		}
+
+		srq->wrid[srq->head] = wr->wr_id;
+
+		next = get_wqe(srq, srq->head);
+		srq->head = be16toh(next->next_wqe_index);
+		scat = (struct mlx4_wqe_data_seg *) (next + 1);
+
+		for (i = 0; i < wr->num_sge; ++i) {
+			scat[i].byte_count = htobe32(wr->sg_list[i].length);
+			scat[i].lkey = htobe32(wr->sg_list[i].lkey);
+			scat[i].addr = htobe64(wr->sg_list[i].addr);
+		}
+
+		if (i < srq->max_gs) {
+			scat[i].byte_count = 0;
+			scat[i].lkey = htobe32(MLX4_INVALID_LKEY);
+			scat[i].addr = 0;
+		}
+	}
+
+	if (nreq) {
+		srq->counter += nreq;
+
+		/*
+		 * Make sure that descriptors are written before
+		 * we write the doorbell record.
+		 */
+		udma_to_device_barrier();
+
+		*srq->db = htobe32(srq->counter);
+	}
+
+	pthread_spin_unlock(&srq->lock);
+
+	return err;
+}
+
+int mlx4_alloc_srq_buf(struct ibv_pd *pd, struct ibv_srq_attr *attr,
+		       struct mlx4_srq *srq)
+{
+	struct mlx4_wqe_srq_next_seg *next;
+	struct mlx4_wqe_data_seg *scatter;
+	int size;
+	int buf_size;
+	int i;
+
+	srq->wrid = malloc(srq->max * sizeof (uint64_t));
+	if (!srq->wrid)
+		return -1;
+
+	size = sizeof (struct mlx4_wqe_srq_next_seg) +
+		srq->max_gs * sizeof (struct mlx4_wqe_data_seg);
+
+	for (srq->wqe_shift = 5; 1 << srq->wqe_shift < size; ++srq->wqe_shift)
+		; /* nothing */
+
+	buf_size = srq->max << srq->wqe_shift;
+
+	if (mlx4_alloc_buf(to_mctx(pd->context), &srq->buf, buf_size,
+			   to_mdev(pd->context->device)->page_size)) {
+		free(srq->wrid);
+		return -1;
+	}
+
+	memset(srq->buf.buf, 0, buf_size);
+
+	/*
+	 * Now initialize the SRQ buffer so that all of the WQEs are
+	 * linked into the list of free WQEs.
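+	 * Each entry's next_wqe_index points at (i + 1) mod srq->max;
+	 * head is the first free WQE and tail the last one.
+	 * mlx4_post_srq_recv() takes entries from head, while
+	 * mlx4_free_srq_wqe() links completed ones back in at tail, and
+	 * head == tail means the SRQ is full.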
+ */ + + for (i = 0; i < srq->max; ++i) { + next = get_wqe(srq, i); + next->next_wqe_index = htobe16((i + 1) & (srq->max - 1)); + + for (scatter = (void *) (next + 1); + (void *) scatter < (void *) next + (1 << srq->wqe_shift); + ++scatter) + scatter->lkey = htobe32(MLX4_INVALID_LKEY); + } + + srq->head = 0; + srq->tail = srq->max - 1; + + return 0; +} + +void mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size) +{ + memset(xsrq_table, 0, sizeof *xsrq_table); + xsrq_table->num_xsrq = size; + xsrq_table->shift = ffs(size) - 1 - MLX4_XSRQ_TABLE_BITS; + xsrq_table->mask = (1 << xsrq_table->shift) - 1; + + pthread_mutex_init(&xsrq_table->mutex, NULL); +} + +struct mlx4_srq *mlx4_find_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn) +{ + int index; + + index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift; + if (xsrq_table->xsrq_table[index].refcnt) + return xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask]; + + return NULL; +} + +int mlx4_store_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn, + struct mlx4_srq *srq) +{ + int index, ret = 0; + + index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift; + pthread_mutex_lock(&xsrq_table->mutex); + if (!xsrq_table->xsrq_table[index].refcnt) { + xsrq_table->xsrq_table[index].table = calloc(xsrq_table->mask + 1, + sizeof(struct mlx4_srq *)); + if (!xsrq_table->xsrq_table[index].table) { + ret = -1; + goto out; + } + } + + xsrq_table->xsrq_table[index].refcnt++; + xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask] = srq; + +out: + pthread_mutex_unlock(&xsrq_table->mutex); + return ret; +} + +void mlx4_clear_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn) +{ + int index; + + index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift; + pthread_mutex_lock(&xsrq_table->mutex); + + if (--xsrq_table->xsrq_table[index].refcnt) + xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask] = NULL; + else + free(xsrq_table->xsrq_table[index].table); + + pthread_mutex_unlock(&xsrq_table->mutex); +} + +struct ibv_srq *mlx4_create_xrc_srq(struct ibv_context *context, + struct ibv_srq_init_attr_ex *attr_ex) +{ + struct mlx4_create_xsrq cmd; + struct mlx4_create_xsrq_resp resp; + struct mlx4_srq *srq; + int ret; + + /* Sanity check SRQ size before proceeding */ + if (attr_ex->attr.max_wr > 1 << 16 || attr_ex->attr.max_sge > 64) + return NULL; + + srq = calloc(1, sizeof *srq); + if (!srq) + return NULL; + + if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE)) + goto err; + + srq->max = roundup_pow_of_two(attr_ex->attr.max_wr + 1); + srq->max_gs = attr_ex->attr.max_sge; + srq->counter = 0; + srq->ext_srq = 1; + + if (mlx4_alloc_srq_buf(attr_ex->pd, &attr_ex->attr, srq)) + goto err; + + srq->db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_RQ); + if (!srq->db) + goto err_free; + + *srq->db = 0; + + cmd.buf_addr = (uintptr_t) srq->buf.buf; + cmd.db_addr = (uintptr_t) srq->db; + + ret = ibv_cmd_create_srq_ex(context, &srq->verbs_srq, + sizeof(srq->verbs_srq), + attr_ex, + &cmd.ibv_cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp); + if (ret) + goto err_db; + + ret = mlx4_store_xsrq(&to_mctx(context)->xsrq_table, + srq->verbs_srq.srq_num, srq); + if (ret) + goto err_destroy; + + return &srq->verbs_srq.srq; + +err_destroy: + ibv_cmd_destroy_srq(&srq->verbs_srq.srq); +err_db: + mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_RQ, srq->db); +err_free: + free(srq->wrid); + mlx4_free_buf(to_mctx(context), &srq->buf); +err: + free(srq); + return NULL; +} + +int mlx4_destroy_xrc_srq(struct 
ibv_srq *srq) +{ + struct mlx4_context *mctx = to_mctx(srq->context); + struct mlx4_srq *msrq = to_msrq(srq); + struct mlx4_cq *mcq; + int ret; + + mcq = to_mcq(msrq->verbs_srq.cq); + mlx4_cq_clean(mcq, 0, msrq); + pthread_spin_lock(&mcq->lock); + mlx4_clear_xsrq(&mctx->xsrq_table, msrq->verbs_srq.srq_num); + pthread_spin_unlock(&mcq->lock); + + ret = ibv_cmd_destroy_srq(srq); + if (ret) { + pthread_spin_lock(&mcq->lock); + mlx4_store_xsrq(&mctx->xsrq_table, msrq->verbs_srq.srq_num, msrq); + pthread_spin_unlock(&mcq->lock); + return ret; + } + + mlx4_free_db(mctx, MLX4_DB_TYPE_RQ, msrq->db); + mlx4_free_buf(mctx, &msrq->buf); + free(msrq->wrid); + free(msrq); + + return 0; +} diff --git a/providers/mlx4/verbs.c b/providers/mlx4/verbs.c new file mode 100644 index 0000000..9f39ecd --- /dev/null +++ b/providers/mlx4/verbs.c @@ -0,0 +1,1656 @@ +/* + * Copyright (c) 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <config.h> + +#include <endian.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <pthread.h> +#include <errno.h> + +#include <util/mmio.h> + +#include "mlx4.h" +#include "mlx4-abi.h" + +int mlx4_query_device(struct ibv_context *context, struct ibv_device_attr *attr) +{ + struct ibv_query_device cmd; + uint64_t raw_fw_ver; + unsigned major, minor, sub_minor; + int ret; + + ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, &cmd, sizeof cmd); + if (ret) + return ret; + + major = (raw_fw_ver >> 32) & 0xffff; + minor = (raw_fw_ver >> 16) & 0xffff; + sub_minor = raw_fw_ver & 0xffff; + + snprintf(attr->fw_ver, sizeof attr->fw_ver, + "%d.%d.%03d", major, minor, sub_minor); + + return 0; +} + +int mlx4_query_device_ex(struct ibv_context *context, + const struct ibv_query_device_ex_input *input, + struct ibv_device_attr_ex *attr, + size_t attr_size) +{ + struct mlx4_context *mctx = to_mctx(context); + struct mlx4_query_device_ex_resp resp = {}; + struct mlx4_query_device_ex cmd = {}; + uint64_t raw_fw_ver; + unsigned sub_minor; + unsigned major; + unsigned minor; + int err; + + err = ibv_cmd_query_device_ex(context, input, attr, attr_size, + &raw_fw_ver, &cmd.ibv_cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp)); + if (err) + return err; + + attr->rss_caps.rx_hash_fields_mask = resp.rss_caps.rx_hash_fields_mask; + attr->rss_caps.rx_hash_function = resp.rss_caps.rx_hash_function; + attr->tso_caps.max_tso = resp.tso_caps.max_tso; + attr->tso_caps.supported_qpts = resp.tso_caps.supported_qpts; + + if (resp.comp_mask & MLX4_IB_QUERY_DEV_RESP_MASK_CORE_CLOCK_OFFSET) { + mctx->core_clock.offset = resp.hca_core_clock_offset; + mctx->core_clock.offset_valid = 1; + } + mctx->max_inl_recv_sz = resp.max_inl_recv_sz; + + major = (raw_fw_ver >> 32) & 0xffff; + minor = (raw_fw_ver >> 16) & 0xffff; + sub_minor = raw_fw_ver & 0xffff; + + snprintf(attr->orig_attr.fw_ver, sizeof attr->orig_attr.fw_ver, + "%d.%d.%03d", major, minor, sub_minor); + + return 0; +} + +static int mlx4_read_clock(struct ibv_context *context, uint64_t *cycles) +{ + uint32_t clockhi, clocklo, clockhi1; + int i; + struct mlx4_context *ctx = to_mctx(context); + + if (!ctx->hca_core_clock) + return -EOPNOTSUPP; + + /* Handle wraparound */ + for (i = 0; i < 2; i++) { + clockhi = be32toh(mmio_read32_be(ctx->hca_core_clock)); + clocklo = be32toh(mmio_read32_be(ctx->hca_core_clock + 4)); + clockhi1 = be32toh(mmio_read32_be(ctx->hca_core_clock)); + if (clockhi == clockhi1) + break; + } + + *cycles = (uint64_t)clockhi << 32 | (uint64_t)clocklo; + + return 0; +} + +int mlx4_query_rt_values(struct ibv_context *context, + struct ibv_values_ex *values) +{ + uint32_t comp_mask = 0; + int err = 0; + + if (!check_comp_mask(values->comp_mask, IBV_VALUES_MASK_RAW_CLOCK)) + return EINVAL; + + if (values->comp_mask & IBV_VALUES_MASK_RAW_CLOCK) { + uint64_t cycles; + + err = mlx4_read_clock(context, &cycles); + if (!err) { + values->raw_clock.tv_sec = 0; + values->raw_clock.tv_nsec = cycles; + comp_mask |= IBV_VALUES_MASK_RAW_CLOCK; + } + } + + values->comp_mask = comp_mask; + + return err; +} + +int mlx4_query_port(struct ibv_context *context, uint8_t port, + struct ibv_port_attr *attr) +{ + struct ibv_query_port cmd; + int err; + + err = ibv_cmd_query_port(context, port, attr, &cmd, sizeof(cmd)); + if (!err && port <= MLX4_PORTS_NUM && port > 0) { + struct mlx4_context *mctx = to_mctx(context); + if (!mctx->port_query_cache[port - 1].valid) { + mctx->port_query_cache[port - 1].link_layer = + attr->link_layer; + 
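/* cached so query_port_cache() below can answer without a kernel command */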
mctx->port_query_cache[port - 1].caps = + attr->port_cap_flags; + mctx->port_query_cache[port - 1].flags = + attr->flags; + mctx->port_query_cache[port - 1].valid = 1; + } + } + + return err; +} + +/* Only the fields in the port cache will be valid */ +static int query_port_cache(struct ibv_context *context, uint8_t port_num, + struct ibv_port_attr *port_attr) +{ + struct mlx4_context *mctx = to_mctx(context); + if (port_num <= 0 || port_num > MLX4_PORTS_NUM) + return -EINVAL; + if (mctx->port_query_cache[port_num - 1].valid) { + port_attr->link_layer = + mctx-> + port_query_cache[port_num - 1]. + link_layer; + port_attr->port_cap_flags = + mctx-> + port_query_cache[port_num - 1]. + caps; + port_attr->flags = + mctx-> + port_query_cache[port_num - 1]. + flags; + return 0; + } + return mlx4_query_port(context, port_num, + (struct ibv_port_attr *)port_attr); + +} + +struct ibv_pd *mlx4_alloc_pd(struct ibv_context *context) +{ + struct ibv_alloc_pd cmd; + struct mlx4_alloc_pd_resp resp; + struct mlx4_pd *pd; + + pd = malloc(sizeof *pd); + if (!pd) + return NULL; + + if (ibv_cmd_alloc_pd(context, &pd->ibv_pd, &cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp)) { + free(pd); + return NULL; + } + + pd->pdn = resp.pdn; + + return &pd->ibv_pd; +} + +int mlx4_free_pd(struct ibv_pd *pd) +{ + int ret; + + ret = ibv_cmd_dealloc_pd(pd); + if (ret) + return ret; + + free(to_mpd(pd)); + return 0; +} + +struct ibv_xrcd *mlx4_open_xrcd(struct ibv_context *context, + struct ibv_xrcd_init_attr *attr) +{ + struct ibv_open_xrcd cmd; + struct ib_uverbs_open_xrcd_resp resp; + struct verbs_xrcd *xrcd; + int ret; + + xrcd = calloc(1, sizeof *xrcd); + if (!xrcd) + return NULL; + + ret = ibv_cmd_open_xrcd(context, xrcd, sizeof(*xrcd), attr, + &cmd, sizeof cmd, &resp, sizeof resp); + if (ret) + goto err; + + return &xrcd->xrcd; + +err: + free(xrcd); + return NULL; +} + +int mlx4_close_xrcd(struct ibv_xrcd *ib_xrcd) +{ + struct verbs_xrcd *xrcd = container_of(ib_xrcd, struct verbs_xrcd, xrcd); + int ret; + + ret = ibv_cmd_close_xrcd(xrcd); + if (ret) + return ret; + + free(xrcd); + return 0; +} + +struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + uint64_t hca_va, int access) +{ + struct verbs_mr *vmr; + struct ibv_reg_mr cmd; + struct ib_uverbs_reg_mr_resp resp; + int ret; + + vmr = malloc(sizeof(*vmr)); + if (!vmr) + return NULL; + + ret = ibv_cmd_reg_mr(pd, addr, length, hca_va, access, vmr, &cmd, + sizeof(cmd), &resp, sizeof(resp)); + if (ret) { + free(vmr); + return NULL; + } + + return &vmr->ibv_mr; +} + +int mlx4_rereg_mr(struct verbs_mr *vmr, + int flags, + struct ibv_pd *pd, void *addr, + size_t length, int access) +{ + struct ibv_rereg_mr cmd; + struct ib_uverbs_rereg_mr_resp resp; + + if (flags & IBV_REREG_MR_KEEP_VALID) + return ENOTSUP; + + return ibv_cmd_rereg_mr(vmr, flags, addr, length, + (uintptr_t)addr, + access, pd, + &cmd, sizeof(cmd), + &resp, sizeof(resp)); +} + +int mlx4_dereg_mr(struct verbs_mr *vmr) +{ + int ret; + + ret = ibv_cmd_dereg_mr(vmr); + if (ret) + return ret; + + free(vmr); + return 0; +} + +struct ibv_mw *mlx4_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type) +{ + struct ibv_mw *mw; + struct ibv_alloc_mw cmd; + struct ib_uverbs_alloc_mw_resp resp; + int ret; + + mw = calloc(1, sizeof(*mw)); + if (!mw) + return NULL; + + ret = ibv_cmd_alloc_mw(pd, type, mw, &cmd, sizeof(cmd), + &resp, sizeof(resp)); + + if (ret) { + free(mw); + return NULL; + } + + return mw; +} + +int mlx4_dealloc_mw(struct ibv_mw *mw) +{ + int ret; + + ret = ibv_cmd_dealloc_mw(mw); + if 
(ret) + return ret; + + free(mw); + return 0; +} + +int mlx4_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw, + struct ibv_mw_bind *mw_bind) +{ + struct ibv_send_wr *bad_wr = NULL; + struct ibv_send_wr wr = { }; + int ret; + + + wr.opcode = IBV_WR_BIND_MW; + wr.next = NULL; + + wr.wr_id = mw_bind->wr_id; + wr.send_flags = mw_bind->send_flags; + + wr.bind_mw.mw = mw; + wr.bind_mw.rkey = ibv_inc_rkey(mw->rkey); + wr.bind_mw.bind_info = mw_bind->bind_info; + + ret = mlx4_post_send(qp, &wr, &bad_wr); + + if (ret) + return ret; + + /* updating the mw with the latest rkey. */ + mw->rkey = wr.bind_mw.rkey; + + return 0; +} + +enum { + CREATE_CQ_SUPPORTED_WC_FLAGS = IBV_WC_STANDARD_FLAGS | + IBV_WC_EX_WITH_COMPLETION_TIMESTAMP +}; + +enum { + CREATE_CQ_SUPPORTED_COMP_MASK = IBV_CQ_INIT_ATTR_MASK_FLAGS +}; + +enum { + CREATE_CQ_SUPPORTED_FLAGS = IBV_CREATE_CQ_ATTR_SINGLE_THREADED +}; + + +static int mlx4_cmd_create_cq(struct ibv_context *context, + struct ibv_cq_init_attr_ex *cq_attr, + struct mlx4_cq *cq) +{ + struct mlx4_create_cq cmd; + struct mlx4_create_cq_resp resp; + int ret; + + cmd.buf_addr = (uintptr_t) cq->buf.buf; + cmd.db_addr = (uintptr_t) cq->set_ci_db; + + ret = ibv_cmd_create_cq(context, cq_attr->cqe, cq_attr->channel, + cq_attr->comp_vector, + ibv_cq_ex_to_cq(&cq->ibv_cq), + &cmd.ibv_cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp)); + if (!ret) + cq->cqn = resp.cqn; + + return ret; + +} + +static int mlx4_cmd_create_cq_ex(struct ibv_context *context, + struct ibv_cq_init_attr_ex *cq_attr, + struct mlx4_cq *cq) +{ + struct mlx4_create_cq_ex cmd; + struct mlx4_create_cq_ex_resp resp; + int ret; + + cmd.buf_addr = (uintptr_t) cq->buf.buf; + cmd.db_addr = (uintptr_t) cq->set_ci_db; + + ret = ibv_cmd_create_cq_ex(context, cq_attr, + &cq->ibv_cq, &cmd.ibv_cmd, + sizeof(cmd), + &resp.ibv_resp, + sizeof(resp)); + if (!ret) + cq->cqn = resp.cqn; + + return ret; +} + +static struct ibv_cq_ex *create_cq(struct ibv_context *context, + struct ibv_cq_init_attr_ex *cq_attr, + int cq_alloc_flags) +{ + struct mlx4_cq *cq; + int ret; + struct mlx4_context *mctx = to_mctx(context); + + /* Sanity check CQ size before proceeding */ + if (cq_attr->cqe > 0x3fffff) { + errno = EINVAL; + return NULL; + } + + if (cq_attr->comp_mask & ~CREATE_CQ_SUPPORTED_COMP_MASK) { + errno = ENOTSUP; + return NULL; + } + + if (cq_attr->comp_mask & IBV_CQ_INIT_ATTR_MASK_FLAGS && + cq_attr->flags & ~CREATE_CQ_SUPPORTED_FLAGS) { + errno = ENOTSUP; + return NULL; + } + + if (cq_attr->wc_flags & ~CREATE_CQ_SUPPORTED_WC_FLAGS) { + errno = ENOTSUP; + return NULL; + } + + /* mlx4 devices don't support slid and sl in cqe when completion + * timestamp is enabled in the CQ + */ + if ((cq_attr->wc_flags & (IBV_WC_EX_WITH_SLID | IBV_WC_EX_WITH_SL)) && + (cq_attr->wc_flags & IBV_WC_EX_WITH_COMPLETION_TIMESTAMP)) { + errno = ENOTSUP; + return NULL; + } + + cq = malloc(sizeof *cq); + if (!cq) + return NULL; + + cq->cons_index = 0; + + if (pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE)) + goto err; + + cq_attr->cqe = roundup_pow_of_two(cq_attr->cqe + 1); + + if (mlx4_alloc_cq_buf(to_mdev(context->device), mctx, &cq->buf, + cq_attr->cqe, mctx->cqe_size)) + goto err; + + cq->cqe_size = mctx->cqe_size; + cq->set_ci_db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_CQ); + if (!cq->set_ci_db) + goto err_buf; + + cq->arm_db = cq->set_ci_db + 1; + *cq->arm_db = 0; + cq->arm_sn = 1; + *cq->set_ci_db = 0; + cq->flags = cq_alloc_flags; + + if (cq_attr->comp_mask & IBV_CQ_INIT_ATTR_MASK_FLAGS && + cq_attr->flags & 
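/* a single-threaded CQ lets the CQ poll path skip its spinlock */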
IBV_CREATE_CQ_ATTR_SINGLE_THREADED) + cq->flags |= MLX4_CQ_FLAGS_SINGLE_THREADED; + + --cq_attr->cqe; + if (cq_alloc_flags & MLX4_CQ_FLAGS_EXTENDED) + ret = mlx4_cmd_create_cq_ex(context, cq_attr, cq); + else + ret = mlx4_cmd_create_cq(context, cq_attr, cq); + + if (ret) + goto err_db; + + + if (cq_alloc_flags & MLX4_CQ_FLAGS_EXTENDED) + mlx4_cq_fill_pfns(cq, cq_attr); + + return &cq->ibv_cq; + +err_db: + mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_CQ, cq->set_ci_db); + +err_buf: + mlx4_free_buf(to_mctx(context), &cq->buf); + +err: + free(cq); + + return NULL; +} + +struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector) +{ + struct ibv_cq_ex *cq; + struct ibv_cq_init_attr_ex cq_attr = {.cqe = cqe, .channel = channel, + .comp_vector = comp_vector, + .wc_flags = IBV_WC_STANDARD_FLAGS}; + + cq = create_cq(context, &cq_attr, 0); + return cq ? ibv_cq_ex_to_cq(cq) : NULL; +} + +struct ibv_cq_ex *mlx4_create_cq_ex(struct ibv_context *context, + struct ibv_cq_init_attr_ex *cq_attr) +{ + /* + * Make local copy since some attributes might be adjusted + * for internal use. + */ + struct ibv_cq_init_attr_ex cq_attr_c = {.cqe = cq_attr->cqe, + .channel = cq_attr->channel, + .comp_vector = cq_attr->comp_vector, + .wc_flags = cq_attr->wc_flags, + .comp_mask = cq_attr->comp_mask, + .flags = cq_attr->flags}; + + if (!check_comp_mask(cq_attr_c.comp_mask, + IBV_CQ_INIT_ATTR_MASK_FLAGS)) { + errno = EINVAL; + return NULL; + } + + return create_cq(context, &cq_attr_c, MLX4_CQ_FLAGS_EXTENDED); +} + +int mlx4_resize_cq(struct ibv_cq *ibcq, int cqe) +{ + struct mlx4_cq *cq = to_mcq(ibcq); + struct mlx4_resize_cq cmd; + struct ib_uverbs_resize_cq_resp resp; + struct mlx4_buf buf; + int old_cqe, outst_cqe, ret; + + /* Sanity check CQ size before proceeding */ + if (cqe > 0x3fffff) + return EINVAL; + + pthread_spin_lock(&cq->lock); + + cqe = roundup_pow_of_two(cqe + 1); + if (cqe == ibcq->cqe + 1) { + ret = 0; + goto out; + } + + /* Can't be smaller then the number of outstanding CQEs */ + outst_cqe = mlx4_get_outstanding_cqes(cq); + if (cqe < outst_cqe + 1) { + ret = EINVAL; + goto out; + } + + ret = mlx4_alloc_cq_buf(to_mdev(ibcq->context->device), + to_mctx(ibcq->context), &buf, cqe, + cq->cqe_size); + if (ret) + goto out; + + old_cqe = ibcq->cqe; + cmd.buf_addr = (uintptr_t) buf.buf; + + ret = ibv_cmd_resize_cq(ibcq, cqe - 1, &cmd.ibv_cmd, sizeof cmd, + &resp, sizeof resp); + if (ret) { + mlx4_free_buf(to_mctx(ibcq->context), &buf); + goto out; + } + + mlx4_cq_resize_copy_cqes(cq, buf.buf, old_cqe); + + mlx4_free_buf(to_mctx(ibcq->context), &cq->buf); + cq->buf = buf; + mlx4_update_cons_index(cq); + +out: + pthread_spin_unlock(&cq->lock); + return ret; +} + +int mlx4_destroy_cq(struct ibv_cq *cq) +{ + int ret; + + ret = ibv_cmd_destroy_cq(cq); + if (ret) + return ret; + + mlx4_free_db(to_mctx(cq->context), MLX4_DB_TYPE_CQ, to_mcq(cq)->set_ci_db); + mlx4_free_buf(to_mctx(cq->context), &to_mcq(cq)->buf); + free(to_mcq(cq)); + + return 0; +} + +struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *attr) +{ + struct mlx4_create_srq cmd; + struct mlx4_create_srq_resp resp; + struct mlx4_srq *srq; + int ret; + + /* Sanity check SRQ size before proceeding */ + if (attr->attr.max_wr > 1 << 16 || attr->attr.max_sge > 64) { + errno = EINVAL; + return NULL; + } + + srq = malloc(sizeof *srq); + if (!srq) + return NULL; + + if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE)) + goto err; + + srq->max = 
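/* one extra slot so head == tail means full; a power-of-two count allows mask-based index wrapping */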
roundup_pow_of_two(attr->attr.max_wr + 1); + srq->max_gs = attr->attr.max_sge; + srq->counter = 0; + srq->ext_srq = 0; + + if (mlx4_alloc_srq_buf(pd, &attr->attr, srq)) + goto err; + + srq->db = mlx4_alloc_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ); + if (!srq->db) + goto err_free; + + *srq->db = 0; + + cmd.buf_addr = (uintptr_t) srq->buf.buf; + cmd.db_addr = (uintptr_t) srq->db; + + ret = ibv_cmd_create_srq(pd, &srq->verbs_srq.srq, attr, + &cmd.ibv_cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp); + if (ret) + goto err_db; + + return &srq->verbs_srq.srq; + +err_db: + mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, srq->db); + +err_free: + free(srq->wrid); + mlx4_free_buf(to_mctx(pd->context), &srq->buf); + +err: + free(srq); + + return NULL; +} + +struct ibv_srq *mlx4_create_srq_ex(struct ibv_context *context, + struct ibv_srq_init_attr_ex *attr_ex) +{ + if (!(attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_TYPE) || + (attr_ex->srq_type == IBV_SRQT_BASIC)) + return mlx4_create_srq(attr_ex->pd, (struct ibv_srq_init_attr *) attr_ex); + else if (attr_ex->srq_type == IBV_SRQT_XRC) + return mlx4_create_xrc_srq(context, attr_ex); + + return NULL; +} + +int mlx4_modify_srq(struct ibv_srq *srq, + struct ibv_srq_attr *attr, + int attr_mask) +{ + struct ibv_modify_srq cmd; + + return ibv_cmd_modify_srq(srq, attr, attr_mask, &cmd, sizeof cmd); +} + +int mlx4_query_srq(struct ibv_srq *srq, + struct ibv_srq_attr *attr) +{ + struct ibv_query_srq cmd; + + return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd); +} + +int mlx4_destroy_srq(struct ibv_srq *srq) +{ + int ret; + + if (to_msrq(srq)->ext_srq) + return mlx4_destroy_xrc_srq(srq); + + ret = ibv_cmd_destroy_srq(srq); + if (ret) + return ret; + + mlx4_free_db(to_mctx(srq->context), MLX4_DB_TYPE_RQ, to_msrq(srq)->db); + mlx4_free_buf(to_mctx(srq->context), &to_msrq(srq)->buf); + free(to_msrq(srq)->wrid); + free(to_msrq(srq)); + + return 0; +} + +static int mlx4_cmd_create_qp_ex_rss(struct ibv_context *context, + struct ibv_qp_init_attr_ex *attr, + struct mlx4_create_qp *cmd, + struct mlx4_qp *qp) +{ + struct mlx4_create_qp_ex_rss cmd_ex = {}; + struct mlx4_create_qp_ex_resp resp; + int ret; + + if (attr->rx_hash_conf.rx_hash_key_len != + sizeof(cmd_ex.rx_hash_key)) { + errno = ENOTSUP; + return errno; + } + + cmd_ex.rx_hash_fields_mask = + attr->rx_hash_conf.rx_hash_fields_mask; + cmd_ex.rx_hash_function = + attr->rx_hash_conf.rx_hash_function; + memcpy(cmd_ex.rx_hash_key, attr->rx_hash_conf.rx_hash_key, + sizeof(cmd_ex.rx_hash_key)); + + ret = ibv_cmd_create_qp_ex2(context, &qp->verbs_qp, + sizeof(qp->verbs_qp), attr, &cmd_ex.ibv_cmd, + sizeof(cmd_ex), &resp.ibv_resp, + sizeof(resp)); + return ret; +} + +static struct ibv_qp *_mlx4_create_qp_ex_rss(struct ibv_context *context, + struct ibv_qp_init_attr_ex *attr) +{ + struct mlx4_create_qp cmd = {}; + struct mlx4_qp *qp; + int ret; + + if (!(attr->comp_mask & IBV_QP_INIT_ATTR_RX_HASH) || + !(attr->comp_mask & IBV_QP_INIT_ATTR_IND_TABLE)) { + errno = EINVAL; + return NULL; + } + + if (attr->qp_type != IBV_QPT_RAW_PACKET) { + errno = EINVAL; + return NULL; + } + + qp = calloc(1, sizeof(*qp)); + if (!qp) + return NULL; + + if (pthread_spin_init(&qp->sq.lock, PTHREAD_PROCESS_PRIVATE) || + pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE)) + goto err; + + ret = mlx4_cmd_create_qp_ex_rss(context, attr, &cmd, qp); + if (ret) + goto err; + + qp->type = MLX4_RSC_TYPE_RSS_QP; + + return &qp->verbs_qp.qp; +err: + free(qp); + return NULL; +} + +static int mlx4_cmd_create_qp_ex(struct ibv_context *context, + 
struct ibv_qp_init_attr_ex *attr, + struct mlx4_create_qp *cmd, + struct mlx4_qp *qp) +{ + struct mlx4_create_qp_ex cmd_ex; + struct mlx4_create_qp_ex_resp resp; + int ret; + + memset(&cmd_ex, 0, sizeof(cmd_ex)); + *ibv_create_qp_ex_to_reg(&cmd_ex.ibv_cmd) = cmd->ibv_cmd.core_payload; + + cmd_ex.drv_payload = cmd->drv_payload; + + ret = ibv_cmd_create_qp_ex2(context, &qp->verbs_qp, + sizeof(qp->verbs_qp), attr, &cmd_ex.ibv_cmd, + sizeof(cmd_ex), &resp.ibv_resp, + sizeof(resp)); + return ret; +} + +enum { + MLX4_CREATE_QP_SUP_COMP_MASK = (IBV_QP_INIT_ATTR_PD | + IBV_QP_INIT_ATTR_XRCD | + IBV_QP_INIT_ATTR_CREATE_FLAGS | + IBV_QP_INIT_ATTR_MAX_TSO_HEADER), +}; + +enum { + MLX4_CREATE_QP_EX2_COMP_MASK = (IBV_QP_INIT_ATTR_CREATE_FLAGS | + IBV_QP_INIT_ATTR_MAX_TSO_HEADER), +}; + +static struct ibv_qp *create_qp_ex(struct ibv_context *context, + struct ibv_qp_init_attr_ex *attr, + struct mlx4dv_qp_init_attr *mlx4qp_attr) +{ + struct mlx4_context *ctx = to_mctx(context); + struct mlx4_create_qp cmd = {}; + struct ib_uverbs_create_qp_resp resp = {}; + struct mlx4_qp *qp; + int ret; + + if (attr->comp_mask & (IBV_QP_INIT_ATTR_RX_HASH | + IBV_QP_INIT_ATTR_IND_TABLE)) { + return _mlx4_create_qp_ex_rss(context, attr); + } + + /* Sanity check QP size before proceeding */ + if (ctx->max_qp_wr) { /* mlx4_query_device succeeded */ + if (attr->cap.max_send_wr > ctx->max_qp_wr || + attr->cap.max_recv_wr > ctx->max_qp_wr || + attr->cap.max_send_sge > ctx->max_sge || + attr->cap.max_recv_sge > ctx->max_sge) { + errno = EINVAL; + return NULL; + } + } else { + if (attr->cap.max_send_wr > 65536 || + attr->cap.max_recv_wr > 65536 || + attr->cap.max_send_sge > 64 || + attr->cap.max_recv_sge > 64) { + errno = EINVAL; + return NULL; + } + } + if (attr->cap.max_inline_data > 1024) { + errno = EINVAL; + return NULL; + } + + if (attr->comp_mask & ~MLX4_CREATE_QP_SUP_COMP_MASK) { + errno = ENOTSUP; + return NULL; + } + + qp = calloc(1, sizeof *qp); + if (!qp) + return NULL; + + if (attr->qp_type == IBV_QPT_XRC_RECV) { + attr->cap.max_send_wr = qp->sq.wqe_cnt = 0; + } else { + mlx4_calc_sq_wqe_size(&attr->cap, attr->qp_type, qp, attr); + /* + * We need to leave 2 KB + 1 WQE of headroom in the SQ to + * allow HW to prefetch. 
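+		 * sq_spare_wqes below converts that headroom into a WQE count:
+		 * 2048 bytes divided by the stride (1 << wqe_shift), plus one;
+		 * e.g. with a 64-byte stride (wqe_shift == 6) this reserves
+		 * (2048 >> 6) + 1 = 33 spare WQEs.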
+ */ + qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1; + qp->sq.wqe_cnt = roundup_pow_of_two(attr->cap.max_send_wr + qp->sq_spare_wqes); + } + + if (attr->srq || attr->qp_type == IBV_QPT_XRC_SEND || + attr->qp_type == IBV_QPT_XRC_RECV) { + attr->cap.max_recv_wr = qp->rq.wqe_cnt = attr->cap.max_recv_sge = 0; + } else { + qp->rq.wqe_cnt = roundup_pow_of_two(attr->cap.max_recv_wr); + if (attr->cap.max_recv_sge < 1) + attr->cap.max_recv_sge = 1; + if (attr->cap.max_recv_wr < 1) + attr->cap.max_recv_wr = 1; + } + + if (mlx4_alloc_qp_buf(context, attr->cap.max_recv_sge, attr->qp_type, qp, + mlx4qp_attr)) + goto err; + + mlx4_init_qp_indices(qp); + + if (pthread_spin_init(&qp->sq.lock, PTHREAD_PROCESS_PRIVATE) || + pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE)) + goto err_free; + + if (mlx4qp_attr) { + if (!check_comp_mask(mlx4qp_attr->comp_mask, + MLX4DV_QP_INIT_ATTR_MASK_RESERVED - 1)) { + errno = EINVAL; + goto err_free; + } + if (mlx4qp_attr->comp_mask & MLX4DV_QP_INIT_ATTR_MASK_INL_RECV) + cmd.inl_recv_sz = mlx4qp_attr->inl_recv_sz; + } + if (attr->cap.max_recv_sge) { + qp->db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_RQ); + if (!qp->db) + goto err_free; + + *qp->db = 0; + cmd.db_addr = (uintptr_t) qp->db; + } else { + cmd.db_addr = 0; + } + + cmd.buf_addr = (uintptr_t) qp->buf.buf; + cmd.log_sq_stride = qp->sq.wqe_shift; + for (cmd.log_sq_bb_count = 0; + qp->sq.wqe_cnt > 1 << cmd.log_sq_bb_count; + ++cmd.log_sq_bb_count) + ; /* nothing */ + cmd.sq_no_prefetch = 0; /* OK for ABI 2: just a reserved field */ + pthread_mutex_lock(&to_mctx(context)->qp_table_mutex); + + + if (attr->comp_mask & MLX4_CREATE_QP_EX2_COMP_MASK) + ret = mlx4_cmd_create_qp_ex(context, attr, &cmd, qp); + else + ret = ibv_cmd_create_qp_ex(context, &qp->verbs_qp, + sizeof(qp->verbs_qp), attr, + &cmd.ibv_cmd, sizeof(cmd), &resp, + sizeof(resp)); + if (ret) + goto err_rq_db; + + if (qp->sq.wqe_cnt || qp->rq.wqe_cnt) { + ret = mlx4_store_qp(to_mctx(context), qp->verbs_qp.qp.qp_num, qp); + if (ret) + goto err_destroy; + } + pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex); + + qp->rq.wqe_cnt = qp->rq.max_post = attr->cap.max_recv_wr; + qp->rq.max_gs = attr->cap.max_recv_sge; + if (attr->qp_type != IBV_QPT_XRC_RECV) + mlx4_set_sq_sizes(qp, &attr->cap, attr->qp_type); + + qp->doorbell_qpn = htobe32(qp->verbs_qp.qp.qp_num << 8); + if (attr->sq_sig_all) + qp->sq_signal_bits = htobe32(MLX4_WQE_CTRL_CQ_UPDATE); + else + qp->sq_signal_bits = 0; + + qp->qpn_cache = qp->verbs_qp.qp.qp_num; + qp->type = attr->srq ? 
MLX4_RSC_TYPE_SRQ : MLX4_RSC_TYPE_QP; + + return &qp->verbs_qp.qp; + +err_destroy: + ibv_cmd_destroy_qp(&qp->verbs_qp.qp); + +err_rq_db: + pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex); + if (attr->cap.max_recv_sge) + mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_RQ, qp->db); + +err_free: + free(qp->sq.wrid); + if (qp->rq.wqe_cnt) + free(qp->rq.wrid); + mlx4_free_buf(ctx, &qp->buf); + +err: + free(qp); + + return NULL; +} + +struct ibv_qp *mlx4_create_qp_ex(struct ibv_context *context, + struct ibv_qp_init_attr_ex *attr) +{ + return create_qp_ex(context, attr, NULL); +} + +struct ibv_qp *mlx4dv_create_qp(struct ibv_context *context, + struct ibv_qp_init_attr_ex *attr, + struct mlx4dv_qp_init_attr *mlx4_qp_attr) +{ + return create_qp_ex(context, attr, mlx4_qp_attr); +} + +struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr) +{ + struct ibv_qp_init_attr_ex attr_ex; + struct ibv_qp *qp; + + memcpy(&attr_ex, attr, sizeof *attr); + attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD; + attr_ex.pd = pd; + qp = mlx4_create_qp_ex(pd->context, &attr_ex); + if (qp) + memcpy(attr, &attr_ex, sizeof *attr); + return qp; +} + +struct ibv_qp *mlx4_open_qp(struct ibv_context *context, struct ibv_qp_open_attr *attr) +{ + struct ibv_open_qp cmd; + struct ib_uverbs_create_qp_resp resp; + struct mlx4_qp *qp; + int ret; + + qp = calloc(1, sizeof *qp); + if (!qp) + return NULL; + + ret = ibv_cmd_open_qp(context, &qp->verbs_qp, sizeof(qp->verbs_qp), attr, + &cmd, sizeof cmd, &resp, sizeof resp); + if (ret) + goto err; + + return &qp->verbs_qp.qp; + +err: + free(qp); + return NULL; +} + +int mlx4_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr) +{ + struct ibv_query_qp cmd; + struct mlx4_qp *qp = to_mqp(ibqp); + int ret; + + if (qp->type == MLX4_RSC_TYPE_RSS_QP) + return ENOTSUP; + + ret = ibv_cmd_query_qp(ibqp, attr, attr_mask, init_attr, &cmd, sizeof cmd); + if (ret) + return ret; + + init_attr->cap.max_send_wr = qp->sq.max_post; + init_attr->cap.max_send_sge = qp->sq.max_gs; + init_attr->cap.max_inline_data = qp->max_inline_data; + + attr->cap = init_attr->cap; + + return 0; +} + +static int _mlx4_modify_qp_rss(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask) +{ + struct ibv_modify_qp cmd = {}; + + if (attr_mask & ~(IBV_QP_STATE | IBV_QP_PORT)) + return ENOTSUP; + + if (attr->qp_state > IBV_QPS_RTR) + return ENOTSUP; + + return ibv_cmd_modify_qp(qp, attr, attr_mask, &cmd, sizeof(cmd)); +} + +int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask) +{ + struct ibv_modify_qp cmd = {}; + struct ibv_port_attr port_attr; + struct mlx4_qp *mqp = to_mqp(qp); + struct ibv_device_attr device_attr; + int ret; + + if (mqp->type == MLX4_RSC_TYPE_RSS_QP) + return _mlx4_modify_qp_rss(qp, attr, attr_mask); + + memset(&device_attr, 0, sizeof(device_attr)); + if (attr_mask & IBV_QP_PORT) { + ret = ibv_query_port(qp->context, attr->port_num, + &port_attr); + if (ret) + return ret; + mqp->link_layer = port_attr.link_layer; + + ret = ibv_query_device(qp->context, &device_attr); + if (ret) + return ret; + + switch(qp->qp_type) { + case IBV_QPT_UD: + if ((mqp->link_layer == IBV_LINK_LAYER_INFINIBAND) && + (device_attr.device_cap_flags & IBV_DEVICE_UD_IP_CSUM)) + mqp->qp_cap_cache |= MLX4_CSUM_SUPPORT_UD_OVER_IB | + MLX4_RX_CSUM_VALID; + break; + case IBV_QPT_RAW_PACKET: + if ((mqp->link_layer == IBV_LINK_LAYER_ETHERNET) && + (device_attr.device_cap_flags & IBV_DEVICE_RAW_IP_CSUM)) + mqp->qp_cap_cache |= 
MLX4_CSUM_SUPPORT_RAW_OVER_ETH | + MLX4_RX_CSUM_VALID; + break; + default: + break; + } + + } + + if (qp->state == IBV_QPS_RESET && + attr_mask & IBV_QP_STATE && + attr->qp_state == IBV_QPS_INIT) { + mlx4_qp_init_sq_ownership(to_mqp(qp)); + } + + ret = ibv_cmd_modify_qp(qp, attr, attr_mask, &cmd, sizeof cmd); + + if (!ret && + (attr_mask & IBV_QP_STATE) && + attr->qp_state == IBV_QPS_RESET) { + if (qp->recv_cq) + mlx4_cq_clean(to_mcq(qp->recv_cq), qp->qp_num, + qp->srq ? to_msrq(qp->srq) : NULL); + if (qp->send_cq && qp->send_cq != qp->recv_cq) + mlx4_cq_clean(to_mcq(qp->send_cq), qp->qp_num, NULL); + + mlx4_init_qp_indices(to_mqp(qp)); + if (to_mqp(qp)->rq.wqe_cnt) + *to_mqp(qp)->db = 0; + } + + return ret; +} + +static void mlx4_lock_cqs(struct ibv_qp *qp) +{ + struct mlx4_cq *send_cq = to_mcq(qp->send_cq); + struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq); + + if (!qp->send_cq || !qp->recv_cq) { + if (qp->send_cq) + pthread_spin_lock(&send_cq->lock); + else if (qp->recv_cq) + pthread_spin_lock(&recv_cq->lock); + } else if (send_cq == recv_cq) { + pthread_spin_lock(&send_cq->lock); + } else if (send_cq->cqn < recv_cq->cqn) { + pthread_spin_lock(&send_cq->lock); + pthread_spin_lock(&recv_cq->lock); + } else { + pthread_spin_lock(&recv_cq->lock); + pthread_spin_lock(&send_cq->lock); + } +} + +static void mlx4_unlock_cqs(struct ibv_qp *qp) +{ + struct mlx4_cq *send_cq = to_mcq(qp->send_cq); + struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq); + + + if (!qp->send_cq || !qp->recv_cq) { + if (qp->send_cq) + pthread_spin_unlock(&send_cq->lock); + else if (qp->recv_cq) + pthread_spin_unlock(&recv_cq->lock); + } else if (send_cq == recv_cq) { + pthread_spin_unlock(&send_cq->lock); + } else if (send_cq->cqn < recv_cq->cqn) { + pthread_spin_unlock(&recv_cq->lock); + pthread_spin_unlock(&send_cq->lock); + } else { + pthread_spin_unlock(&send_cq->lock); + pthread_spin_unlock(&recv_cq->lock); + } +} + +static int _mlx4_destroy_qp_rss(struct ibv_qp *ibqp) +{ + struct mlx4_qp *qp = to_mqp(ibqp); + int ret; + + ret = ibv_cmd_destroy_qp(ibqp); + if (ret) + return ret; + + free(qp); + + return 0; +} + +int mlx4_destroy_qp(struct ibv_qp *ibqp) +{ + struct mlx4_qp *qp = to_mqp(ibqp); + int ret; + + if (qp->type == MLX4_RSC_TYPE_RSS_QP) + return _mlx4_destroy_qp_rss(ibqp); + + pthread_mutex_lock(&to_mctx(ibqp->context)->qp_table_mutex); + ret = ibv_cmd_destroy_qp(ibqp); + if (ret) { + pthread_mutex_unlock(&to_mctx(ibqp->context)->qp_table_mutex); + return ret; + } + + mlx4_lock_cqs(ibqp); + + if (ibqp->recv_cq) + __mlx4_cq_clean(to_mcq(ibqp->recv_cq), ibqp->qp_num, + ibqp->srq ? to_msrq(ibqp->srq) : NULL); + if (ibqp->send_cq && ibqp->send_cq != ibqp->recv_cq) + __mlx4_cq_clean(to_mcq(ibqp->send_cq), ibqp->qp_num, NULL); + + if (qp->sq.wqe_cnt || qp->rq.wqe_cnt) + mlx4_clear_qp(to_mctx(ibqp->context), ibqp->qp_num); + + mlx4_unlock_cqs(ibqp); + pthread_mutex_unlock(&to_mctx(ibqp->context)->qp_table_mutex); + + if (qp->rq.wqe_cnt) { + mlx4_free_db(to_mctx(ibqp->context), MLX4_DB_TYPE_RQ, qp->db); + free(qp->rq.wrid); + } + if (qp->sq.wqe_cnt) + free(qp->sq.wrid); + mlx4_free_buf(to_mctx(ibqp->context), &qp->buf); + free(qp); + + return 0; +} + +static int link_local_gid(const union ibv_gid *gid) +{ + return gid->global.subnet_prefix == htobe64(0xfe80000000000000ULL); +} + +static int is_multicast_gid(const union ibv_gid *gid) +{ + return gid->raw[0] == 0xff; +} + +static uint16_t get_vlan_id(union ibv_gid *gid) +{ + uint16_t vid; + vid = gid->raw[11] << 8 | gid->raw[12]; + return vid < 0x1000 ? 
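/* VLAN IDs are 12 bits, so 0x1000 and up is invalid; 0xffff tells the callers "no VLAN" */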
vid : 0xffff; +} + +static int mlx4_resolve_grh_to_l2(struct ibv_pd *pd, struct mlx4_ah *ah, + struct ibv_ah_attr *attr) +{ + int err, i; + uint16_t vid; + union ibv_gid sgid; + + if (link_local_gid(&attr->grh.dgid)) { + memcpy(ah->mac, &attr->grh.dgid.raw[8], 3); + memcpy(ah->mac + 3, &attr->grh.dgid.raw[13], 3); + ah->mac[0] ^= 2; + + vid = get_vlan_id(&attr->grh.dgid); + } else if (is_multicast_gid(&attr->grh.dgid)) { + ah->mac[0] = 0x33; + ah->mac[1] = 0x33; + for (i = 2; i < 6; ++i) + ah->mac[i] = attr->grh.dgid.raw[i + 10]; + + err = ibv_query_gid(pd->context, attr->port_num, + attr->grh.sgid_index, &sgid); + if (err) + return err; + + ah->av.dlid = htobe16(0xc000); + ah->av.port_pd |= htobe32(1 << 31); + + vid = get_vlan_id(&sgid); + } else + return 1; + + if (vid != 0xffff) { + ah->av.port_pd |= htobe32(1 << 29); + ah->vlan = vid | ((attr->sl & 7) << 13); + } + + return 0; +} + +struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) +{ + struct mlx4_ah *ah; + struct ibv_port_attr port_attr; + + if (query_port_cache(pd->context, attr->port_num, &port_attr)) + return NULL; + + if (port_attr.flags & IBV_QPF_GRH_REQUIRED && + !attr->is_global) { + errno = EINVAL; + return NULL; + } + + ah = malloc(sizeof *ah); + if (!ah) + return NULL; + + memset(&ah->av, 0, sizeof ah->av); + + ah->av.port_pd = htobe32(to_mpd(pd)->pdn | (attr->port_num << 24)); + + if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { + ah->av.g_slid = attr->src_path_bits; + ah->av.dlid = htobe16(attr->dlid); + ah->av.sl_tclass_flowlabel = htobe32(attr->sl << 28); + } else + ah->av.sl_tclass_flowlabel = htobe32(attr->sl << 29); + + if (attr->static_rate) { + ah->av.stat_rate = attr->static_rate + MLX4_STAT_RATE_OFFSET; + /* XXX check rate cap? */ + } + if (attr->is_global) { + ah->av.g_slid |= 0x80; + ah->av.gid_index = attr->grh.sgid_index; + ah->av.hop_limit = attr->grh.hop_limit; + ah->av.sl_tclass_flowlabel |= + htobe32((attr->grh.traffic_class << 20) | + attr->grh.flow_label); + memcpy(ah->av.dgid, attr->grh.dgid.raw, 16); + } + + if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) { + if (port_attr.port_cap_flags & IBV_PORT_IP_BASED_GIDS) { + uint16_t vid; + + if (ibv_resolve_eth_l2_from_gid(pd->context, attr, + ah->mac, &vid)) { + free(ah); + return NULL; + } + + if (vid <= 0xfff) { + ah->av.port_pd |= htobe32(1 << 29); + ah->vlan = vid | + ((attr->sl & 7) << 13); + } + + } else { + if (mlx4_resolve_grh_to_l2(pd, ah, attr)) { + free(ah); + return NULL; + } + } + } + + return &ah->ibv_ah; +} + +int mlx4_destroy_ah(struct ibv_ah *ah) +{ + free(to_mah(ah)); + + return 0; +} + +struct ibv_wq *mlx4_create_wq(struct ibv_context *context, + struct ibv_wq_init_attr *attr) +{ + struct mlx4_context *ctx = to_mctx(context); + struct mlx4_create_wq cmd = {}; + struct ib_uverbs_ex_create_wq_resp resp = {}; + struct mlx4_qp *qp; + int ret; + + if (attr->wq_type != IBV_WQT_RQ) { + errno = ENOTSUP; + return NULL; + } + + /* Sanity check QP size before proceeding */ + if (ctx->max_qp_wr) { /* mlx4_query_device succeeded */ + if (attr->max_wr > ctx->max_qp_wr || + attr->max_sge > ctx->max_sge) { + errno = EINVAL; + return NULL; + } + } else { + if (attr->max_wr > 65536 || + attr->max_sge > 64) { + errno = EINVAL; + return NULL; + } + } + + if (!check_comp_mask(attr->comp_mask, IBV_WQ_INIT_ATTR_FLAGS)) { + errno = ENOTSUP; + return NULL; + } + + if ((attr->comp_mask & IBV_WQ_INIT_ATTR_FLAGS) && + (attr->create_flags & ~IBV_WQ_FLAGS_SCATTER_FCS)) { + errno = ENOTSUP; + return NULL; + } + + qp = calloc(1, 
sizeof(*qp)); + if (!qp) + return NULL; + + if (attr->max_sge < 1) + attr->max_sge = 1; + + if (attr->max_wr < 1) + attr->max_wr = 1; + + /* Kernel driver requires a dummy SQ with minimum properties */ + qp->sq.wqe_shift = 6; + qp->sq.wqe_cnt = 1; + + qp->rq.wqe_cnt = roundup_pow_of_two(attr->max_wr); + + if (mlx4_alloc_qp_buf(context, attr->max_sge, IBV_QPT_RAW_PACKET, qp, NULL)) + goto err; + + mlx4_init_qp_indices(qp); + mlx4_qp_init_sq_ownership(qp); /* For dummy SQ */ + + if (pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE)) + goto err_free; + + qp->db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_RQ); + if (!qp->db) + goto err_free; + + *qp->db = 0; + cmd.db_addr = (uintptr_t)qp->db; + + cmd.buf_addr = (uintptr_t)qp->buf.buf; + + cmd.log_range_size = ctx->log_wqs_range_sz; + + pthread_mutex_lock(&to_mctx(context)->qp_table_mutex); + + ret = ibv_cmd_create_wq(context, attr, &qp->wq, &cmd.ibv_cmd, + sizeof(cmd), &resp, sizeof(resp)); + if (ret) + goto err_rq_db; + + ret = mlx4_store_qp(to_mctx(context), qp->wq.wq_num, qp); + if (ret) + goto err_destroy; + + pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex); + + ctx->log_wqs_range_sz = 0; + + qp->rq.max_post = attr->max_wr; + qp->rq.wqe_cnt = attr->max_wr; + qp->rq.max_gs = attr->max_sge; + + qp->wq.state = IBV_WQS_RESET; + + qp->wq.post_recv = mlx4_post_wq_recv; + + qp->qpn_cache = qp->wq.wq_num; + + return &qp->wq; + +err_destroy: + ibv_cmd_destroy_wq(&qp->wq); + +err_rq_db: + pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex); + mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_RQ, qp->db); + +err_free: + free(qp->rq.wrid); + mlx4_free_buf(to_mctx(context), &qp->buf); + +err: + free(qp); + + return NULL; +} + +int mlx4_modify_wq(struct ibv_wq *ibwq, struct ibv_wq_attr *attr) +{ + struct mlx4_qp *qp = wq_to_mqp(ibwq); + struct mlx4_modify_wq cmd = {}; + int ret; + + ret = ibv_cmd_modify_wq(ibwq, attr, &cmd.ibv_cmd, sizeof(cmd)); + + if (!ret && (attr->attr_mask & IBV_WQ_ATTR_STATE) && + (ibwq->state == IBV_WQS_RESET)) { + mlx4_cq_clean(to_mcq(ibwq->cq), ibwq->wq_num, NULL); + + mlx4_init_qp_indices(qp); + *qp->db = 0; + } + + return ret; +} + +struct ibv_flow *mlx4_create_flow(struct ibv_qp *qp, struct ibv_flow_attr *flow_attr) +{ + struct ibv_flow *flow_id; + int ret; + + flow_id = calloc(1, sizeof *flow_id); + if (!flow_id) + return NULL; + + ret = ibv_cmd_create_flow(qp, flow_id, flow_attr, + NULL, 0); + if (!ret) + return flow_id; + + free(flow_id); + return NULL; +} + +int mlx4_destroy_flow(struct ibv_flow *flow_id) +{ + int ret; + + ret = ibv_cmd_destroy_flow(flow_id); + + if (ret) + return ret; + + free(flow_id); + return 0; +} + +int mlx4_destroy_wq(struct ibv_wq *ibwq) +{ + struct mlx4_context *mcontext = to_mctx(ibwq->context); + struct mlx4_qp *qp = wq_to_mqp(ibwq); + struct mlx4_cq *cq = NULL; + int ret; + + pthread_mutex_lock(&mcontext->qp_table_mutex); + + ret = ibv_cmd_destroy_wq(ibwq); + if (ret) { + pthread_mutex_unlock(&mcontext->qp_table_mutex); + return ret; + } + + cq = to_mcq(ibwq->cq); + pthread_spin_lock(&cq->lock); + __mlx4_cq_clean(cq, ibwq->wq_num, NULL); + + mlx4_clear_qp(mcontext, ibwq->wq_num); + + pthread_spin_unlock(&cq->lock); + + pthread_mutex_unlock(&mcontext->qp_table_mutex); + + mlx4_free_db(mcontext, MLX4_DB_TYPE_RQ, qp->db); + free(qp->rq.wrid); + free(qp->sq.wrid); + + mlx4_free_buf(mcontext, &qp->buf); + + free(qp); + + return 0; +} + +struct ibv_rwq_ind_table *mlx4_create_rwq_ind_table(struct ibv_context *context, + struct ibv_rwq_ind_table_init_attr *init_attr) +{ + struct 
ib_uverbs_ex_create_rwq_ind_table_resp resp = {}; + struct ibv_rwq_ind_table *ind_table; + int err; + + ind_table = calloc(1, sizeof(*ind_table)); + if (!ind_table) + return NULL; + + err = ibv_cmd_create_rwq_ind_table(context, init_attr, ind_table, &resp, + sizeof(resp)); + if (err) + goto err; + + return ind_table; + +err: + free(ind_table); + return NULL; +} + +int mlx4_destroy_rwq_ind_table(struct ibv_rwq_ind_table *rwq_ind_table) +{ + int ret; + + ret = ibv_cmd_destroy_rwq_ind_table(rwq_ind_table); + + if (ret) + return ret; + + free(rwq_ind_table); + return 0; +} + +int mlx4_modify_cq(struct ibv_cq *cq, struct ibv_modify_cq_attr *attr) +{ + struct ibv_modify_cq cmd = {}; + + return ibv_cmd_modify_cq(cq, attr, &cmd, sizeof(cmd)); +} diff --git a/providers/mlx5/CMakeLists.txt b/providers/mlx5/CMakeLists.txt new file mode 100644 index 0000000..dc97642 --- /dev/null +++ b/providers/mlx5/CMakeLists.txt @@ -0,0 +1,41 @@ +set(MLX5_DEBUG "FALSE" CACHE BOOL + "Enable expensive runtime logging options for the mlx5 verbs provider") +if (MLX5_DEBUG) + add_definitions("-DMLX5_DEBUG") +endif() + +set(MLX5_MW_DEBUG "FALSE" CACHE BOOL + "Enable extra validation of memory windows for the mlx5 verbs provider") +if (MLX5_MW_DEBUG) + add_definitions("-DMW_DEBUG") +endif() + +rdma_shared_provider(mlx5 libmlx5.map + 1 1.13.${PACKAGE_VERSION} + buf.c + cq.c + dbrec.c + dr_action.c + dr_crc32.c + dr_dbg.c + dr_devx.c + dr_icm_pool.c + dr_matcher.c + dr_domain.c + dr_rule.c + dr_ste.c + dr_table.c + dr_send.c + mlx5.c + qp.c + srq.c + verbs.c +) + +publish_headers(infiniband + ../../kernel-headers/rdma/mlx5_user_ioctl_verbs.h + mlx5_api.h + mlx5dv.h +) + +rdma_pkg_config("mlx5" "libibverbs" "${CMAKE_THREAD_LIBS_INIT}") diff --git a/providers/mlx5/bitmap.h b/providers/mlx5/bitmap.h new file mode 100644 index 0000000..c0cee3c --- /dev/null +++ b/providers/mlx5/bitmap.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2000, 2011 Mellanox Technology Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef BITMAP_H +#define BITMAP_H + +#include <stdlib.h> +#include <stdio.h> +#include <pthread.h> +#include <string.h> +#include <sys/types.h> +#include <sys/ipc.h> +#include <sys/shm.h> +#include <sys/mman.h> +#include <linux/errno.h> +#include "mlx5.h" + +/* Only ia64 requires this */ +#ifdef __ia64__ +#define MLX5_SHM_ADDR ((void *)0x8000000000000000UL) +#define MLX5_SHMAT_FLAGS (SHM_RND) +#else +#define MLX5_SHM_ADDR NULL +#define MLX5_SHMAT_FLAGS 0 +#endif + +#define BITS_PER_LONG (8 * sizeof(long)) +#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) + +#ifndef HPAGE_SIZE +#define HPAGE_SIZE (2UL * 1024 * 1024) +#endif + +#define MLX5_SHM_LENGTH HPAGE_SIZE +#define MLX5_Q_CHUNK_SIZE 32768 +#define MLX5_SHM_NUM_REGION 64 + +static inline unsigned long mlx5_ffz(uint32_t word) +{ + return __builtin_ffs(~word) - 1; +} + +static inline uint32_t mlx5_find_first_zero_bit(const unsigned long *addr, + uint32_t size) +{ + const unsigned long *p = addr; + uint32_t result = 0; + unsigned long tmp; + + while (size & ~(BITS_PER_LONG - 1)) { + tmp = *(p++); + if (~tmp) + goto found; + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) + return result; + + tmp = (*p) | (~0UL << size); + if (tmp == (uint32_t)~0UL) /* Are any bits zero? */ + return result + size; /* Nope. */ +found: + return result + mlx5_ffz(tmp); +} + +static inline void mlx5_set_bit(unsigned int nr, unsigned long *addr) +{ + addr[(nr / BITS_PER_LONG)] |= (1 << (nr % BITS_PER_LONG)); +} + +static inline void mlx5_clear_bit(unsigned int nr, unsigned long *addr) +{ + addr[(nr / BITS_PER_LONG)] &= ~(1 << (nr % BITS_PER_LONG)); +} + +static inline int mlx5_test_bit(unsigned int nr, const unsigned long *addr) +{ + return !!(addr[(nr / BITS_PER_LONG)] & (1 << (nr % BITS_PER_LONG))); +} + +#endif diff --git a/providers/mlx5/buf.c b/providers/mlx5/buf.c new file mode 100644 index 0000000..b5cf391 --- /dev/null +++ b/providers/mlx5/buf.c @@ -0,0 +1,685 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <config.h> + +#include <signal.h> +#include <sys/ipc.h> +#include <sys/shm.h> +#include <stdio.h> +#include <stdlib.h> +#include <errno.h> + +#include "mlx5.h" +#include "bitmap.h" + +static int mlx5_bitmap_init(struct mlx5_bitmap *bitmap, uint32_t num, + uint32_t mask) +{ + bitmap->last = 0; + bitmap->top = 0; + bitmap->max = num; + bitmap->avail = num; + bitmap->mask = mask; + bitmap->avail = bitmap->max; + bitmap->table = calloc(BITS_TO_LONGS(bitmap->max), sizeof(*bitmap->table)); + if (!bitmap->table) + return -ENOMEM; + + return 0; +} + +static void bitmap_free_range(struct mlx5_bitmap *bitmap, uint32_t obj, + int cnt) +{ + int i; + + obj &= bitmap->max - 1; + + for (i = 0; i < cnt; i++) + mlx5_clear_bit(obj + i, bitmap->table); + bitmap->last = min(bitmap->last, obj); + bitmap->top = (bitmap->top + bitmap->max) & bitmap->mask; + bitmap->avail += cnt; +} + +static int bitmap_empty(struct mlx5_bitmap *bitmap) +{ + return (bitmap->avail == bitmap->max) ? 1 : 0; +} + +static int bitmap_avail(struct mlx5_bitmap *bitmap) +{ + return bitmap->avail; +} + +static void mlx5_bitmap_cleanup(struct mlx5_bitmap *bitmap) +{ + if (bitmap->table) + free(bitmap->table); +} + +static void free_huge_mem(struct mlx5_hugetlb_mem *hmem) +{ + mlx5_bitmap_cleanup(&hmem->bitmap); + if (shmdt(hmem->shmaddr) == -1) + mlx5_dbg(stderr, MLX5_DBG_CONTIG, "%s\n", strerror(errno)); + shmctl(hmem->shmid, IPC_RMID, NULL); + free(hmem); +} + +static int mlx5_bitmap_alloc(struct mlx5_bitmap *bitmap) +{ + uint32_t obj; + int ret; + + obj = mlx5_find_first_zero_bit(bitmap->table, bitmap->max); + if (obj < bitmap->max) { + mlx5_set_bit(obj, bitmap->table); + bitmap->last = (obj + 1); + if (bitmap->last == bitmap->max) + bitmap->last = 0; + obj |= bitmap->top; + ret = obj; + } else + ret = -1; + + if (ret != -1) + --bitmap->avail; + + return ret; +} + +static uint32_t find_aligned_range(unsigned long *bitmap, + uint32_t start, uint32_t nbits, + int len, int alignment) +{ + uint32_t end, i; + +again: + start = align(start, alignment); + + while ((start < nbits) && mlx5_test_bit(start, bitmap)) + start += alignment; + + if (start >= nbits) + return -1; + + end = start + len; + if (end > nbits) + return -1; + + for (i = start + 1; i < end; i++) { + if (mlx5_test_bit(i, bitmap)) { + start = i + 1; + goto again; + } + } + + return start; +} + +static int bitmap_alloc_range(struct mlx5_bitmap *bitmap, int cnt, + int align) +{ + uint32_t obj; + int ret, i; + + if (cnt == 1 && align == 1) + return mlx5_bitmap_alloc(bitmap); + + if (cnt > bitmap->max) + return -1; + + obj = find_aligned_range(bitmap->table, bitmap->last, + bitmap->max, cnt, align); + if (obj >= bitmap->max) { + bitmap->top = (bitmap->top + bitmap->max) & bitmap->mask; + obj = find_aligned_range(bitmap->table, 0, bitmap->max, + cnt, align); + } + + if (obj < bitmap->max) { + for (i = 0; i < cnt; i++) + mlx5_set_bit(obj + i, bitmap->table); + if (obj == bitmap->last) { + bitmap->last = (obj + cnt); + if (bitmap->last >= bitmap->max) + bitmap->last = 0; + } + obj |= bitmap->top; + ret = obj; + } else + ret = -1; + + if (ret != -1) + bitmap->avail -= cnt; + + return obj; +} + +static struct mlx5_hugetlb_mem *alloc_huge_mem(size_t size) +{ + struct mlx5_hugetlb_mem *hmem; + size_t shm_len; + + hmem = malloc(sizeof(*hmem)); + if (!hmem) + return NULL; + + shm_len = align(size, MLX5_SHM_LENGTH); + hmem->shmid = shmget(IPC_PRIVATE, shm_len, SHM_HUGETLB | SHM_R | SHM_W); + if (hmem->shmid == -1) { + mlx5_dbg(stderr, MLX5_DBG_CONTIG, "%s\n", strerror(errno)); 
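+		/* commonly fails with ENOMEM when no huge pages are
+		 * reserved, e.g. via /proc/sys/vm/nr_hugepages */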
+ goto out_free; + } + + hmem->shmaddr = shmat(hmem->shmid, MLX5_SHM_ADDR, MLX5_SHMAT_FLAGS); + if (hmem->shmaddr == (void *)-1) { + mlx5_dbg(stderr, MLX5_DBG_CONTIG, "%s\n", strerror(errno)); + goto out_rmid; + } + + if (mlx5_bitmap_init(&hmem->bitmap, shm_len / MLX5_Q_CHUNK_SIZE, + shm_len / MLX5_Q_CHUNK_SIZE - 1)) { + mlx5_dbg(stderr, MLX5_DBG_CONTIG, "%s\n", strerror(errno)); + goto out_shmdt; + } + + /* + * Marked to be destroyed when process detaches from shmget segment + */ + shmctl(hmem->shmid, IPC_RMID, NULL); + + return hmem; + +out_shmdt: + if (shmdt(hmem->shmaddr) == -1) + mlx5_dbg(stderr, MLX5_DBG_CONTIG, "%s\n", strerror(errno)); + +out_rmid: + shmctl(hmem->shmid, IPC_RMID, NULL); + +out_free: + free(hmem); + return NULL; +} + +static int alloc_huge_buf(struct mlx5_context *mctx, struct mlx5_buf *buf, + size_t size, int page_size) +{ + int found = 0; + int nchunk; + struct mlx5_hugetlb_mem *hmem; + int ret; + + buf->length = align(size, MLX5_Q_CHUNK_SIZE); + nchunk = buf->length / MLX5_Q_CHUNK_SIZE; + + if (!nchunk) + return 0; + + mlx5_spin_lock(&mctx->hugetlb_lock); + list_for_each(&mctx->hugetlb_list, hmem, entry) { + if (bitmap_avail(&hmem->bitmap)) { + buf->base = bitmap_alloc_range(&hmem->bitmap, nchunk, 1); + if (buf->base != -1) { + buf->hmem = hmem; + found = 1; + break; + } + } + } + mlx5_spin_unlock(&mctx->hugetlb_lock); + + if (!found) { + hmem = alloc_huge_mem(buf->length); + if (!hmem) + return -1; + + buf->base = bitmap_alloc_range(&hmem->bitmap, nchunk, 1); + if (buf->base == -1) { + free_huge_mem(hmem); + /* TBD: remove after proven stability */ + fprintf(stderr, "BUG: huge allocation\n"); + return -1; + } + + buf->hmem = hmem; + + mlx5_spin_lock(&mctx->hugetlb_lock); + if (bitmap_avail(&hmem->bitmap)) + list_add(&mctx->hugetlb_list, &hmem->entry); + else + list_add_tail(&mctx->hugetlb_list, &hmem->entry); + mlx5_spin_unlock(&mctx->hugetlb_lock); + } + + buf->buf = hmem->shmaddr + buf->base * MLX5_Q_CHUNK_SIZE; + + ret = ibv_dontfork_range(buf->buf, buf->length); + if (ret) { + mlx5_dbg(stderr, MLX5_DBG_CONTIG, "\n"); + goto out_fork; + } + buf->type = MLX5_ALLOC_TYPE_HUGE; + + return 0; + +out_fork: + mlx5_spin_lock(&mctx->hugetlb_lock); + bitmap_free_range(&hmem->bitmap, buf->base, nchunk); + if (bitmap_empty(&hmem->bitmap)) { + list_del(&hmem->entry); + mlx5_spin_unlock(&mctx->hugetlb_lock); + free_huge_mem(hmem); + } else + mlx5_spin_unlock(&mctx->hugetlb_lock); + + return -1; +} + +static void free_huge_buf(struct mlx5_context *ctx, struct mlx5_buf *buf) +{ + int nchunk; + + nchunk = buf->length / MLX5_Q_CHUNK_SIZE; + if (!nchunk) + return; + + mlx5_spin_lock(&ctx->hugetlb_lock); + bitmap_free_range(&buf->hmem->bitmap, buf->base, nchunk); + if (bitmap_empty(&buf->hmem->bitmap)) { + list_del(&buf->hmem->entry); + mlx5_spin_unlock(&ctx->hugetlb_lock); + free_huge_mem(buf->hmem); + } else + mlx5_spin_unlock(&ctx->hugetlb_lock); +} + +void mlx5_free_buf_extern(struct mlx5_context *ctx, struct mlx5_buf *buf) +{ + ibv_dofork_range(buf->buf, buf->length); + ctx->extern_alloc.free(buf->buf, ctx->extern_alloc.data); +} + +int mlx5_alloc_buf_extern(struct mlx5_context *ctx, struct mlx5_buf *buf, + size_t size) +{ + void *addr; + + addr = ctx->extern_alloc.alloc(size, ctx->extern_alloc.data); + if (addr || size == 0) { + if (ibv_dontfork_range(addr, size)) { + mlx5_dbg(stderr, MLX5_DBG_CONTIG, + "External mode dontfork_range failed\n"); + ctx->extern_alloc.free(addr, + ctx->extern_alloc.data); + return -1; + } + buf->buf = addr; + buf->length = size; + buf->type = 
MLX5_ALLOC_TYPE_EXTERNAL;
+		return 0;
+	}
+
+	mlx5_dbg(stderr, MLX5_DBG_CONTIG, "External alloc failed\n");
+	return -1;
+}
+
+static void mlx5_free_buf_custom(struct mlx5_context *ctx,
+				 struct mlx5_buf *buf)
+{
+	struct mlx5_parent_domain *mparent_domain = buf->mparent_domain;
+
+	mparent_domain->free(&mparent_domain->mpd.ibv_pd,
+			     mparent_domain->pd_context,
+			     buf->buf,
+			     buf->resource_type);
+}
+
+static int mlx5_alloc_buf_custom(struct mlx5_context *ctx,
+				 struct mlx5_buf *buf, size_t size)
+{
+	struct mlx5_parent_domain *mparent_domain = buf->mparent_domain;
+	void *addr;
+
+	addr = mparent_domain->alloc(&mparent_domain->mpd.ibv_pd,
+				     mparent_domain->pd_context, size,
+				     buf->req_alignment,
+				     buf->resource_type);
+	if (addr == IBV_ALLOCATOR_USE_DEFAULT)
+		return 1;
+
+	if (addr || size == 0) {
+		buf->buf = addr;
+		buf->length = size;
+		buf->type = MLX5_ALLOC_TYPE_CUSTOM;
+		return 0;
+	}
+
+	return -1;
+}
+
+int mlx5_alloc_prefered_buf(struct mlx5_context *mctx,
+			    struct mlx5_buf *buf,
+			    size_t size, int page_size,
+			    enum mlx5_alloc_type type,
+			    const char *component)
+{
+	int ret;
+
+	if (type == MLX5_ALLOC_TYPE_CUSTOM) {
+		ret = mlx5_alloc_buf_custom(mctx, buf, size);
+		if (ret <= 0)
+			return ret;
+
+		/* Fallback - default allocation is required */
+	}
+
+	/*
+	 * Fallback mechanism priority:
+	 *	huge pages
+	 *	contig pages
+	 *	default
+	 */
+	if (type == MLX5_ALLOC_TYPE_HUGE ||
+	    type == MLX5_ALLOC_TYPE_PREFER_HUGE ||
+	    type == MLX5_ALLOC_TYPE_ALL) {
+		ret = alloc_huge_buf(mctx, buf, size, page_size);
+		if (!ret)
+			return 0;
+
+		if (type == MLX5_ALLOC_TYPE_HUGE)
+			return -1;
+
+		mlx5_dbg(stderr, MLX5_DBG_CONTIG,
+			 "Huge mode allocation failed, fallback to %s mode\n",
+			 type == MLX5_ALLOC_TYPE_ALL ? "contig" : "default");
+	}
+
+	if (type == MLX5_ALLOC_TYPE_CONTIG ||
+	    type == MLX5_ALLOC_TYPE_PREFER_CONTIG ||
+	    type == MLX5_ALLOC_TYPE_ALL) {
+		ret = mlx5_alloc_buf_contig(mctx, buf, size, page_size, component);
+		if (!ret)
+			return 0;
+
+		if (type == MLX5_ALLOC_TYPE_CONTIG)
+			return -1;
+		mlx5_dbg(stderr, MLX5_DBG_CONTIG,
+			 "Contig allocation failed, fallback to default mode\n");
+	}
+
+	if (type == MLX5_ALLOC_TYPE_EXTERNAL)
+		return mlx5_alloc_buf_extern(mctx, buf, size);
+
+	return mlx5_alloc_buf(buf, size, page_size);
+}
+
+int mlx5_free_actual_buf(struct mlx5_context *ctx, struct mlx5_buf *buf)
+{
+	int err = 0;
+
+	switch (buf->type) {
+	case MLX5_ALLOC_TYPE_ANON:
+		mlx5_free_buf(buf);
+		break;
+
+	case MLX5_ALLOC_TYPE_HUGE:
+		free_huge_buf(ctx, buf);
+		break;
+
+	case MLX5_ALLOC_TYPE_CONTIG:
+		mlx5_free_buf_contig(ctx, buf);
+		break;
+
+	case MLX5_ALLOC_TYPE_EXTERNAL:
+		mlx5_free_buf_extern(ctx, buf);
+		break;
+
+	case MLX5_ALLOC_TYPE_CUSTOM:
+		mlx5_free_buf_custom(ctx, buf);
+		break;
+
+	default:
+		fprintf(stderr, "Bad allocation type\n");
+	}
+
+	return err;
+}
+
+/* This function computes log2(v) rounded up.
+   We don't want to have a dependency on libm which exposes ceil & log2 APIs.
+   Code was written based on public domain code:
+   URL: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog.
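+   The loop below is a binary search over bit groups: each mask in
+   bits_arr tests whether any bit at or above a given position is set,
+   the matching shift accumulates into r to give floor(log2(v)), and the
+   final increment rounds up whenever v is not an exact power of two.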
+*/ +static uint32_t mlx5_get_block_order(uint32_t v) +{ + static const uint32_t bits_arr[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; + static const uint32_t shift_arr[] = {1, 2, 4, 8, 16}; + int i; + uint32_t input_val = v; + + register uint32_t r = 0;/* result of log2(v) will go here */ + for (i = 4; i >= 0; i--) { + if (v & bits_arr[i]) { + v >>= shift_arr[i]; + r |= shift_arr[i]; + } + } + /* Rounding up if required */ + r += !!(input_val & ((1 << r) - 1)); + + return r; +} + +bool mlx5_is_custom_alloc(struct ibv_pd *pd) +{ + struct mlx5_parent_domain *mparent_domain = to_mparent_domain(pd); + + return (mparent_domain && mparent_domain->alloc && mparent_domain->free); +} + +bool mlx5_is_extern_alloc(struct mlx5_context *context) +{ + return context->extern_alloc.alloc && context->extern_alloc.free; +} + +void mlx5_get_alloc_type(struct mlx5_context *context, + struct ibv_pd *pd, + const char *component, + enum mlx5_alloc_type *alloc_type, + enum mlx5_alloc_type default_type) + +{ + char *env_value; + char name[128]; + + if (mlx5_is_custom_alloc(pd)) { + *alloc_type = MLX5_ALLOC_TYPE_CUSTOM; + return; + } + + if (mlx5_is_extern_alloc(context)) { + *alloc_type = MLX5_ALLOC_TYPE_EXTERNAL; + return; + } + + snprintf(name, sizeof(name), "%s_ALLOC_TYPE", component); + + *alloc_type = default_type; + + env_value = getenv(name); + if (env_value) { + if (!strcasecmp(env_value, "ANON")) + *alloc_type = MLX5_ALLOC_TYPE_ANON; + else if (!strcasecmp(env_value, "HUGE")) + *alloc_type = MLX5_ALLOC_TYPE_HUGE; + else if (!strcasecmp(env_value, "CONTIG")) + *alloc_type = MLX5_ALLOC_TYPE_CONTIG; + else if (!strcasecmp(env_value, "PREFER_CONTIG")) + *alloc_type = MLX5_ALLOC_TYPE_PREFER_CONTIG; + else if (!strcasecmp(env_value, "PREFER_HUGE")) + *alloc_type = MLX5_ALLOC_TYPE_PREFER_HUGE; + else if (!strcasecmp(env_value, "ALL")) + *alloc_type = MLX5_ALLOC_TYPE_ALL; + } +} + +static void mlx5_alloc_get_env_info(int *max_block_log, + int *min_block_log, + const char *component) + +{ + char *env; + int value; + char name[128]; + + /* First set defaults */ + *max_block_log = MLX5_MAX_LOG2_CONTIG_BLOCK_SIZE; + *min_block_log = MLX5_MIN_LOG2_CONTIG_BLOCK_SIZE; + + snprintf(name, sizeof(name), "%s_MAX_LOG2_CONTIG_BSIZE", component); + env = getenv(name); + if (env) { + value = atoi(env); + if (value <= MLX5_MAX_LOG2_CONTIG_BLOCK_SIZE && + value >= MLX5_MIN_LOG2_CONTIG_BLOCK_SIZE) + *max_block_log = value; + else + fprintf(stderr, "Invalid value %d for %s\n", + value, name); + } + sprintf(name, "%s_MIN_LOG2_CONTIG_BSIZE", component); + env = getenv(name); + if (env) { + value = atoi(env); + if (value >= MLX5_MIN_LOG2_CONTIG_BLOCK_SIZE && + value <= *max_block_log) + *min_block_log = value; + else + fprintf(stderr, "Invalid value %d for %s\n", + value, name); + } +} + +int mlx5_alloc_buf_contig(struct mlx5_context *mctx, + struct mlx5_buf *buf, size_t size, + int page_size, + const char *component) +{ + void *addr = MAP_FAILED; + int block_size_exp; + int max_block_log; + int min_block_log; + struct ibv_context *context = &mctx->ibv_ctx.context; + off_t offset; + + mlx5_alloc_get_env_info(&max_block_log, + &min_block_log, + component); + + block_size_exp = mlx5_get_block_order(size); + + if (block_size_exp > max_block_log) + block_size_exp = max_block_log; + + do { + offset = 0; + set_command(MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES, &offset); + set_order(block_size_exp, &offset); + addr = mmap(NULL , size, PROT_WRITE | PROT_READ, MAP_SHARED, + context->cmd_fd, page_size * offset); + if (addr != MAP_FAILED) + break; + + /* 
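+		 * The mmap offset encodes both the command
+		 * (MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES) and the block order,
+		 * so each retry below asks for a smaller contiguous block.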
+ * The kernel returns EINVAL if not supported + */ + if (errno == EINVAL) + return -1; + + block_size_exp -= 1; + } while (block_size_exp >= min_block_log); + mlx5_dbg(mctx->dbg_fp, MLX5_DBG_CONTIG, "block order %d, addr %p\n", + block_size_exp, addr); + + if (addr == MAP_FAILED) + return -1; + + if (ibv_dontfork_range(addr, size)) { + munmap(addr, size); + return -1; + } + + buf->buf = addr; + buf->length = size; + buf->type = MLX5_ALLOC_TYPE_CONTIG; + + return 0; +} + +void mlx5_free_buf_contig(struct mlx5_context *mctx, struct mlx5_buf *buf) +{ + ibv_dofork_range(buf->buf, buf->length); + munmap(buf->buf, buf->length); +} + +int mlx5_alloc_buf(struct mlx5_buf *buf, size_t size, int page_size) +{ + int ret; + int al_size; + + al_size = align(size, page_size); + ret = posix_memalign(&buf->buf, page_size, al_size); + if (ret) + return ret; + + ret = ibv_dontfork_range(buf->buf, al_size); + if (ret) + free(buf->buf); + + if (!ret) { + buf->length = al_size; + buf->type = MLX5_ALLOC_TYPE_ANON; + } + + return ret; +} + +void mlx5_free_buf(struct mlx5_buf *buf) +{ + ibv_dofork_range(buf->buf, buf->length); + free(buf->buf); +} diff --git a/providers/mlx5/cq.c b/providers/mlx5/cq.c new file mode 100644 index 0000000..2b4f189 --- /dev/null +++ b/providers/mlx5/cq.c @@ -0,0 +1,1897 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <config.h> + +#include <stdio.h> +#include <stdlib.h> +#include <pthread.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> + +#include <util/compiler.h> +#include <util/mmio.h> +#include <infiniband/opcode.h> + +#include "mlx5.h" +#include "wqe.h" + +enum { + CQ_OK = 0, + CQ_EMPTY = -1, + CQ_POLL_ERR = -2, + CQ_POLL_NODATA = ENOENT +}; + +enum { + MLX5_CQ_MODIFY_RESEIZE = 0, + MLX5_CQ_MODIFY_MODER = 1, + MLX5_CQ_MODIFY_MAPPING = 2, +}; + +enum { + MLX5_CQE_APP_TAG_MATCHING = 1, +}; + +enum { + MLX5_CQE_APP_OP_TM_CONSUMED = 0x1, + MLX5_CQE_APP_OP_TM_EXPECTED = 0x2, + MLX5_CQE_APP_OP_TM_UNEXPECTED = 0x3, + MLX5_CQE_APP_OP_TM_NO_TAG = 0x4, + MLX5_CQE_APP_OP_TM_APPEND = 0x5, + MLX5_CQE_APP_OP_TM_REMOVE = 0x6, + MLX5_CQE_APP_OP_TM_NOOP = 0x7, + MLX5_CQE_APP_OP_TM_CONSUMED_SW_RDNV = 0x9, + MLX5_CQE_APP_OP_TM_CONSUMED_MSG = 0xA, + MLX5_CQE_APP_OP_TM_CONSUMED_MSG_SW_RDNV = 0xB, + MLX5_CQE_APP_OP_TM_MSG_COMPLETION_CANCELED = 0xC, +}; + + +/* When larger messages or rendezvous transfers are involved, matching and + * data transfer completion are distinct events that generate 2 completion + * events for the same recv_wr_id. + */ +static inline bool mlx5_cqe_app_op_tm_is_complete(int op) +{ + return op != MLX5_CQE_APP_OP_TM_CONSUMED && + op != MLX5_CQE_APP_OP_TM_CONSUMED_SW_RDNV; +} + +enum { + MLX5_CQ_LAZY_FLAGS = + MLX5_CQ_FLAGS_RX_CSUM_VALID | + MLX5_CQ_FLAGS_TM_SYNC_REQ +}; + +int mlx5_stall_num_loop = 60; +int mlx5_stall_cq_poll_min = 60; +int mlx5_stall_cq_poll_max = 100000; +int mlx5_stall_cq_inc_step = 100; +int mlx5_stall_cq_dec_step = 10; + +enum { + MLX5_TM_MAX_SYNC_DIFF = 0x3fff +}; + +static inline uint8_t get_cqe_l3_hdr_type(struct mlx5_cqe64 *cqe) +{ + return (cqe->l4_hdr_type_etc >> 2) & 0x3; +} + +static void *get_buf_cqe(struct mlx5_buf *buf, int n, int cqe_sz) +{ + return buf->buf + n * cqe_sz; +} + +static void *get_cqe(struct mlx5_cq *cq, int n) +{ + return cq->active_buf->buf + n * cq->cqe_sz; +} + +static void *get_sw_cqe(struct mlx5_cq *cq, int n) +{ + void *cqe = get_cqe(cq, n & cq->ibv_cq.cqe); + struct mlx5_cqe64 *cqe64; + + cqe64 = (cq->cqe_sz == 64) ? 
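+	/* for 128-byte CQEs the mlx5_cqe64 sits in the second 64 bytes */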
cqe : cqe + 64; + + if (likely(mlx5dv_get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) && + !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ibv_cq.cqe + 1)))) { + return cqe; + } else { + return NULL; + } +} + +static void *next_cqe_sw(struct mlx5_cq *cq) +{ + return get_sw_cqe(cq, cq->cons_index); +} + +static void update_cons_index(struct mlx5_cq *cq) +{ + cq->dbrec[MLX5_CQ_SET_CI] = htobe32(cq->cons_index & 0xffffff); +} + +static inline void handle_good_req(struct ibv_wc *wc, struct mlx5_cqe64 *cqe, struct mlx5_wq *wq, int idx) +{ + switch (be32toh(cqe->sop_drop_qpn) >> 24) { + case MLX5_OPCODE_RDMA_WRITE_IMM: + wc->wc_flags |= IBV_WC_WITH_IMM; + SWITCH_FALLTHROUGH; + case MLX5_OPCODE_RDMA_WRITE: + wc->opcode = IBV_WC_RDMA_WRITE; + break; + case MLX5_OPCODE_SEND_IMM: + wc->wc_flags |= IBV_WC_WITH_IMM; + SWITCH_FALLTHROUGH; + case MLX5_OPCODE_SEND: + case MLX5_OPCODE_SEND_INVAL: + wc->opcode = IBV_WC_SEND; + break; + case MLX5_OPCODE_RDMA_READ: + wc->opcode = IBV_WC_RDMA_READ; + wc->byte_len = be32toh(cqe->byte_cnt); + break; + case MLX5_OPCODE_ATOMIC_CS: + wc->opcode = IBV_WC_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX5_OPCODE_ATOMIC_FA: + wc->opcode = IBV_WC_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX5_OPCODE_UMR: + wc->opcode = wq->wr_data[idx]; + break; + case MLX5_OPCODE_TSO: + wc->opcode = IBV_WC_TSO; + break; + } +} + +static inline int handle_responder_lazy(struct mlx5_cq *cq, struct mlx5_cqe64 *cqe, + struct mlx5_resource *cur_rsc, struct mlx5_srq *srq) +{ + uint16_t wqe_ctr; + struct mlx5_wq *wq; + struct mlx5_qp *qp = rsc_to_mqp(cur_rsc); + int err = IBV_WC_SUCCESS; + + if (srq) { + wqe_ctr = be16toh(cqe->wqe_counter); + cq->ibv_cq.wr_id = srq->wrid[wqe_ctr]; + mlx5_free_srq_wqe(srq, wqe_ctr); + if (cqe->op_own & MLX5_INLINE_SCATTER_32) + err = mlx5_copy_to_recv_srq(srq, wqe_ctr, cqe, + be32toh(cqe->byte_cnt)); + else if (cqe->op_own & MLX5_INLINE_SCATTER_64) + err = mlx5_copy_to_recv_srq(srq, wqe_ctr, cqe - 1, + be32toh(cqe->byte_cnt)); + } else { + if (likely(cur_rsc->type == MLX5_RSC_TYPE_QP)) { + wq = &qp->rq; + if (qp->qp_cap_cache & MLX5_RX_CSUM_VALID) + cq->flags |= MLX5_CQ_FLAGS_RX_CSUM_VALID; + } else { + wq = &(rsc_to_mrwq(cur_rsc)->rq); + } + + wqe_ctr = wq->tail & (wq->wqe_cnt - 1); + cq->ibv_cq.wr_id = wq->wrid[wqe_ctr]; + ++wq->tail; + if (cqe->op_own & MLX5_INLINE_SCATTER_32) + err = mlx5_copy_to_recv_wqe(qp, wqe_ctr, cqe, + be32toh(cqe->byte_cnt)); + else if (cqe->op_own & MLX5_INLINE_SCATTER_64) + err = mlx5_copy_to_recv_wqe(qp, wqe_ctr, cqe - 1, + be32toh(cqe->byte_cnt)); + } + + return err; +} + +/* Returns IBV_WC_IP_CSUM_OK or 0 */ +static inline int get_csum_ok(struct mlx5_cqe64 *cqe) +{ + return (((cqe->hds_ip_ext & (MLX5_CQE_L4_OK | MLX5_CQE_L3_OK)) == + (MLX5_CQE_L4_OK | MLX5_CQE_L3_OK)) & + (get_cqe_l3_hdr_type(cqe) == MLX5_CQE_L3_HDR_TYPE_IPV4)) + << IBV_WC_IP_CSUM_OK_SHIFT; +} + +static inline int handle_responder(struct ibv_wc *wc, struct mlx5_cqe64 *cqe, + struct mlx5_resource *cur_rsc, struct mlx5_srq *srq) +{ + uint16_t wqe_ctr; + struct mlx5_wq *wq; + struct mlx5_qp *qp = rsc_to_mqp(cur_rsc); + uint8_t g; + int err = 0; + + wc->byte_len = be32toh(cqe->byte_cnt); + if (srq) { + wqe_ctr = be16toh(cqe->wqe_counter); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx5_free_srq_wqe(srq, wqe_ctr); + if (cqe->op_own & MLX5_INLINE_SCATTER_32) + err = mlx5_copy_to_recv_srq(srq, wqe_ctr, cqe, + wc->byte_len); + else if (cqe->op_own & MLX5_INLINE_SCATTER_64) + err = mlx5_copy_to_recv_srq(srq, wqe_ctr, cqe - 1, + wc->byte_len); + } else { + if 
(likely(cur_rsc->type == MLX5_RSC_TYPE_QP)) { + wq = &qp->rq; + if (qp->qp_cap_cache & MLX5_RX_CSUM_VALID) + wc->wc_flags |= get_csum_ok(cqe); + } else { + wq = &(rsc_to_mrwq(cur_rsc)->rq); + } + + wqe_ctr = wq->tail & (wq->wqe_cnt - 1); + wc->wr_id = wq->wrid[wqe_ctr]; + ++wq->tail; + if (cqe->op_own & MLX5_INLINE_SCATTER_32) + err = mlx5_copy_to_recv_wqe(qp, wqe_ctr, cqe, + wc->byte_len); + else if (cqe->op_own & MLX5_INLINE_SCATTER_64) + err = mlx5_copy_to_recv_wqe(qp, wqe_ctr, cqe - 1, + wc->byte_len); + } + if (err) + return err; + + switch (cqe->op_own >> 4) { + case MLX5_CQE_RESP_WR_IMM: + wc->opcode = IBV_WC_RECV_RDMA_WITH_IMM; + wc->wc_flags |= IBV_WC_WITH_IMM; + wc->imm_data = cqe->imm_inval_pkey; + break; + case MLX5_CQE_RESP_SEND: + wc->opcode = IBV_WC_RECV; + break; + case MLX5_CQE_RESP_SEND_IMM: + wc->opcode = IBV_WC_RECV; + wc->wc_flags |= IBV_WC_WITH_IMM; + wc->imm_data = cqe->imm_inval_pkey; + break; + case MLX5_CQE_RESP_SEND_INV: + wc->opcode = IBV_WC_RECV; + wc->wc_flags |= IBV_WC_WITH_INV; + wc->invalidated_rkey = be32toh(cqe->imm_inval_pkey); + break; + } + wc->slid = be16toh(cqe->slid); + wc->sl = (be32toh(cqe->flags_rqpn) >> 24) & 0xf; + wc->src_qp = be32toh(cqe->flags_rqpn) & 0xffffff; + wc->dlid_path_bits = cqe->ml_path & 0x7f; + g = (be32toh(cqe->flags_rqpn) >> 28) & 3; + wc->wc_flags |= g ? IBV_WC_GRH : 0; + wc->pkey_index = be32toh(cqe->imm_inval_pkey) & 0xffff; + + return IBV_WC_SUCCESS; +} + +static void dump_cqe(FILE *fp, void *buf) +{ + __be32 *p = buf; + int i; + + for (i = 0; i < 16; i += 4) + fprintf(fp, "%08x %08x %08x %08x\n", be32toh(p[i]), be32toh(p[i + 1]), + be32toh(p[i + 2]), be32toh(p[i + 3])); +} + +static enum ibv_wc_status mlx5_handle_error_cqe(struct mlx5_err_cqe *cqe) +{ + switch (cqe->syndrome) { + case MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR: + return IBV_WC_LOC_LEN_ERR; + case MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR: + return IBV_WC_LOC_QP_OP_ERR; + case MLX5_CQE_SYNDROME_LOCAL_PROT_ERR: + return IBV_WC_LOC_PROT_ERR; + case MLX5_CQE_SYNDROME_WR_FLUSH_ERR: + return IBV_WC_WR_FLUSH_ERR; + case MLX5_CQE_SYNDROME_MW_BIND_ERR: + return IBV_WC_MW_BIND_ERR; + case MLX5_CQE_SYNDROME_BAD_RESP_ERR: + return IBV_WC_BAD_RESP_ERR; + case MLX5_CQE_SYNDROME_LOCAL_ACCESS_ERR: + return IBV_WC_LOC_ACCESS_ERR; + case MLX5_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR: + return IBV_WC_REM_INV_REQ_ERR; + case MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR: + return IBV_WC_REM_ACCESS_ERR; + case MLX5_CQE_SYNDROME_REMOTE_OP_ERR: + return IBV_WC_REM_OP_ERR; + case MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR: + return IBV_WC_RETRY_EXC_ERR; + case MLX5_CQE_SYNDROME_RNR_RETRY_EXC_ERR: + return IBV_WC_RNR_RETRY_EXC_ERR; + case MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR: + return IBV_WC_REM_ABORT_ERR; + default: + return IBV_WC_GENERAL_ERR; + } +} + +#if defined(__x86_64__) || defined (__i386__) +static inline unsigned long get_cycles(void) +{ + uint32_t low, high; + uint64_t val; + asm volatile ("rdtsc" : "=a" (low), "=d" (high)); + val = high; + val = (val << 32) | low; + return val; +} + +static void mlx5_stall_poll_cq(void) +{ + int i; + + for (i = 0; i < mlx5_stall_num_loop; i++) + (void)get_cycles(); +} +static void mlx5_stall_cycles_poll_cq(uint64_t cycles) +{ + while (get_cycles() < cycles) + ; /* Nothing */ +} +static void mlx5_get_cycles(uint64_t *cycles) +{ + *cycles = get_cycles(); +} +#else +static void mlx5_stall_poll_cq(void) +{ +} +static void mlx5_stall_cycles_poll_cq(uint64_t cycles) +{ +} +static void mlx5_get_cycles(uint64_t *cycles) +{ +} +#endif + +static inline struct mlx5_qp 
*get_req_context(struct mlx5_context *mctx, + struct mlx5_resource **cur_rsc, + uint32_t rsn, int cqe_ver) + ALWAYS_INLINE; +static inline struct mlx5_qp *get_req_context(struct mlx5_context *mctx, + struct mlx5_resource **cur_rsc, + uint32_t rsn, int cqe_ver) +{ + if (!*cur_rsc || (rsn != (*cur_rsc)->rsn)) + *cur_rsc = cqe_ver ? mlx5_find_uidx(mctx, rsn) : + (struct mlx5_resource *)mlx5_find_qp(mctx, rsn); + + return rsc_to_mqp(*cur_rsc); +} + +static inline int get_resp_ctx_v1(struct mlx5_context *mctx, + struct mlx5_resource **cur_rsc, + struct mlx5_srq **cur_srq, + uint32_t uidx, uint8_t *is_srq) + ALWAYS_INLINE; +static inline int get_resp_ctx_v1(struct mlx5_context *mctx, + struct mlx5_resource **cur_rsc, + struct mlx5_srq **cur_srq, + uint32_t uidx, uint8_t *is_srq) +{ + struct mlx5_qp *mqp; + + if (!*cur_rsc || (uidx != (*cur_rsc)->rsn)) { + *cur_rsc = mlx5_find_uidx(mctx, uidx); + if (unlikely(!*cur_rsc)) + return CQ_POLL_ERR; + } + + switch ((*cur_rsc)->type) { + case MLX5_RSC_TYPE_QP: + mqp = rsc_to_mqp(*cur_rsc); + if (mqp->verbs_qp.qp.srq) { + *cur_srq = to_msrq(mqp->verbs_qp.qp.srq); + *is_srq = 1; + } + break; + case MLX5_RSC_TYPE_XSRQ: + *cur_srq = rsc_to_msrq(*cur_rsc); + *is_srq = 1; + break; + case MLX5_RSC_TYPE_RWQ: + break; + default: + return CQ_POLL_ERR; + } + + return CQ_OK; +} + +static inline int get_qp_ctx(struct mlx5_context *mctx, + struct mlx5_resource **cur_rsc, + uint32_t qpn) + ALWAYS_INLINE; +static inline int get_qp_ctx(struct mlx5_context *mctx, + struct mlx5_resource **cur_rsc, + uint32_t qpn) +{ + if (!*cur_rsc || (qpn != (*cur_rsc)->rsn)) { + /* + * We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. + */ + *cur_rsc = (struct mlx5_resource *)mlx5_find_qp(mctx, qpn); + if (unlikely(!*cur_rsc)) + return CQ_POLL_ERR; + } + + return CQ_OK; +} + +static inline int get_srq_ctx(struct mlx5_context *mctx, + struct mlx5_srq **cur_srq, + uint32_t srqn_uidx) + ALWAYS_INLINE; +static inline int get_srq_ctx(struct mlx5_context *mctx, + struct mlx5_srq **cur_srq, + uint32_t srqn) +{ + if (!*cur_srq || (srqn != (*cur_srq)->srqn)) { + *cur_srq = mlx5_find_srq(mctx, srqn); + if (unlikely(!*cur_srq)) + return CQ_POLL_ERR; + } + + return CQ_OK; +} + +static inline int get_cur_rsc(struct mlx5_context *mctx, + int cqe_ver, + uint32_t qpn, + uint32_t srqn_uidx, + struct mlx5_resource **cur_rsc, + struct mlx5_srq **cur_srq, + uint8_t *is_srq) +{ + int err; + + if (cqe_ver) { + err = get_resp_ctx_v1(mctx, cur_rsc, cur_srq, srqn_uidx, + is_srq); + } else { + if (srqn_uidx) { + *is_srq = 1; + err = get_srq_ctx(mctx, cur_srq, srqn_uidx); + } else { + err = get_qp_ctx(mctx, cur_rsc, qpn); + } + } + + return err; + +} + +static inline int mlx5_get_next_cqe(struct mlx5_cq *cq, + struct mlx5_cqe64 **pcqe64, + void **pcqe) + ALWAYS_INLINE; +static inline int mlx5_get_next_cqe(struct mlx5_cq *cq, + struct mlx5_cqe64 **pcqe64, + void **pcqe) +{ + void *cqe; + struct mlx5_cqe64 *cqe64; + + cqe = next_cqe_sw(cq); + if (!cqe) + return CQ_EMPTY; + + cqe64 = (cq->cqe_sz == 64) ? cqe : cqe + 64; + + ++cq->cons_index; + + VALGRIND_MAKE_MEM_DEFINED(cqe64, sizeof *cqe64); + + /* + * Make sure we read CQ entry contents after we've checked the + * ownership bit. 
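+	 * Otherwise the CPU could speculatively load fields from a CQE
+	 * that the hardware is still writing.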
+ */ + udma_from_device_barrier(); + +#ifdef MLX5_DEBUG + { + struct mlx5_context *mctx = to_mctx(cq->ibv_cq.context); + + if (mlx5_debug_mask & MLX5_DBG_CQ_CQE) { + FILE *fp = mctx->dbg_fp; + + mlx5_dbg(fp, MLX5_DBG_CQ_CQE, "dump cqe for cqn 0x%x:\n", cq->cqn); + dump_cqe(fp, cqe64); + } + } +#endif + *pcqe64 = cqe64; + *pcqe = cqe; + + return CQ_OK; +} + +static int handle_tag_matching(struct mlx5_cq *cq, + struct mlx5_cqe64 *cqe64, + struct mlx5_srq *srq) +{ + FILE *fp = to_mctx(srq->vsrq.srq.context)->dbg_fp; + struct mlx5_tag_entry *tag; + struct mlx5_srq_op *op; + uint16_t wqe_ctr; + + cq->ibv_cq.status = IBV_WC_SUCCESS; + switch (cqe64->app_op) { + case MLX5_CQE_APP_OP_TM_CONSUMED_MSG_SW_RDNV: + case MLX5_CQE_APP_OP_TM_CONSUMED_SW_RDNV: + case MLX5_CQE_APP_OP_TM_MSG_COMPLETION_CANCELED: + cq->ibv_cq.status = IBV_WC_TM_RNDV_INCOMPLETE; + SWITCH_FALLTHROUGH; + + case MLX5_CQE_APP_OP_TM_CONSUMED_MSG: + case MLX5_CQE_APP_OP_TM_CONSUMED: + case MLX5_CQE_APP_OP_TM_EXPECTED: + mlx5_spin_lock(&srq->lock); + tag = &srq->tm_list[be16toh(cqe64->app_info)]; + if (!tag->expect_cqe) { + mlx5_dbg(fp, MLX5_DBG_CQ, "got idx %d which wasn't added\n", + be16toh(cqe64->app_info)); + cq->ibv_cq.status = IBV_WC_GENERAL_ERR; + mlx5_spin_unlock(&srq->lock); + return CQ_OK; + } + cq->ibv_cq.wr_id = tag->wr_id; + if (mlx5_cqe_app_op_tm_is_complete(cqe64->app_op)) + mlx5_tm_release_tag(srq, tag); + /* inline scatter 32 not supported for TM */ + if (cqe64->op_own & MLX5_INLINE_SCATTER_64) { + if (be32toh(cqe64->byte_cnt) > tag->size) + cq->ibv_cq.status = IBV_WC_LOC_LEN_ERR; + else + memcpy(tag->ptr, cqe64 - 1, + be32toh(cqe64->byte_cnt)); + } + mlx5_spin_unlock(&srq->lock); + break; + + case MLX5_CQE_APP_OP_TM_REMOVE: + if (!(be32toh(cqe64->tm_cqe.success) & MLX5_TMC_SUCCESS)) + cq->ibv_cq.status = IBV_WC_TM_ERR; + SWITCH_FALLTHROUGH; + + case MLX5_CQE_APP_OP_TM_APPEND: + case MLX5_CQE_APP_OP_TM_NOOP: + mlx5_spin_lock(&srq->lock); +#ifdef MLX5_DEBUG + if (srq->op_tail == srq->op_head) { + mlx5_dbg(fp, MLX5_DBG_CQ, "got unexpected list op CQE\n"); + cq->ibv_cq.status = IBV_WC_GENERAL_ERR; + mlx5_spin_unlock(&srq->lock); + return CQ_OK; + } +#endif + op = srq->op + (srq->op_head++ & + (to_mqp(srq->cmd_qp)->sq.wqe_cnt - 1)); + if (op->tag) { /* APPEND or REMOVE */ + mlx5_tm_release_tag(srq, op->tag); + if (cqe64->app_op == MLX5_CQE_APP_OP_TM_REMOVE && + cq->ibv_cq.status == IBV_WC_SUCCESS) + /* + * If tag entry was successfully removed we + * don't expect consumption completion for it + * anymore. Remove reports failure if tag was + * consumed meanwhile. 
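+				 * The extra release below drops the
+				 * reference that was being held for that
+				 * consumption completion.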
+				 */
+				mlx5_tm_release_tag(srq, op->tag);
+			if (be16toh(cqe64->tm_cqe.hw_phase_cnt) !=
+			    op->tag->phase_cnt)
+				cq->flags |= MLX5_CQ_FLAGS_TM_SYNC_REQ;
+		}
+
+		to_mqp(srq->cmd_qp)->sq.tail = op->wqe_head + 1;
+		cq->ibv_cq.wr_id = op->wr_id;
+
+		mlx5_spin_unlock(&srq->lock);
+		break;
+
+	case MLX5_CQE_APP_OP_TM_UNEXPECTED:
+		srq->unexp_in++;
+		if (srq->unexp_in - srq->unexp_out > MLX5_TM_MAX_SYNC_DIFF)
+			cq->flags |= MLX5_CQ_FLAGS_TM_SYNC_REQ;
+		SWITCH_FALLTHROUGH;
+
+	case MLX5_CQE_APP_OP_TM_NO_TAG:
+		wqe_ctr = be16toh(cqe64->wqe_counter);
+		cq->ibv_cq.wr_id = srq->wrid[wqe_ctr];
+		mlx5_free_srq_wqe(srq, wqe_ctr);
+		if (cqe64->op_own & MLX5_INLINE_SCATTER_32)
+			return mlx5_copy_to_recv_srq(srq, wqe_ctr, cqe64,
+						     be32toh(cqe64->byte_cnt));
+		else if (cqe64->op_own & MLX5_INLINE_SCATTER_64)
+			return mlx5_copy_to_recv_srq(srq, wqe_ctr, cqe64 - 1,
+						     be32toh(cqe64->byte_cnt));
+		break;
+#ifdef MLX5_DEBUG
+	default:
+		mlx5_dbg(fp, MLX5_DBG_CQ, "unexpected TM opcode in cqe\n");
+#endif
+	}
+
+	return CQ_OK;
+}
+
+static inline int is_odp_pfault_err(struct mlx5_err_cqe *ecqe)
+{
+	return ecqe->syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR &&
+		ecqe->vendor_err_synd == MLX5_CQE_VENDOR_SYNDROME_ODP_PFAULT;
+}
+
+static inline int mlx5_parse_cqe(struct mlx5_cq *cq,
+				 struct mlx5_cqe64 *cqe64,
+				 void *cqe,
+				 struct mlx5_resource **cur_rsc,
+				 struct mlx5_srq **cur_srq,
+				 struct ibv_wc *wc,
+				 int cqe_ver, int lazy)
+				 ALWAYS_INLINE;
+static inline int mlx5_parse_cqe(struct mlx5_cq *cq,
+				 struct mlx5_cqe64 *cqe64,
+				 void *cqe,
+				 struct mlx5_resource **cur_rsc,
+				 struct mlx5_srq **cur_srq,
+				 struct ibv_wc *wc,
+				 int cqe_ver, int lazy)
+{
+	struct mlx5_wq *wq;
+	uint16_t wqe_ctr;
+	uint32_t qpn;
+	uint32_t srqn_uidx;
+	int idx;
+	uint8_t opcode;
+	struct mlx5_err_cqe *ecqe;
+	int err;
+	struct mlx5_qp *mqp;
+	struct mlx5_context *mctx;
+	uint8_t is_srq;
+
+again:
+	is_srq = 0;
+	err = 0;
+
+	mctx = to_mctx(ibv_cq_ex_to_cq(&cq->ibv_cq)->context);
+	qpn = be32toh(cqe64->sop_drop_qpn) & 0xffffff;
+	if (lazy) {
+		cq->cqe64 = cqe64;
+		cq->flags &= (~MLX5_CQ_LAZY_FLAGS);
+	} else {
+		wc->wc_flags = 0;
+		wc->qp_num = qpn;
+	}
+
+	opcode = mlx5dv_get_cqe_opcode(cqe64);
+	switch (opcode) {
+	case MLX5_CQE_REQ:
+	{
+		mqp = get_req_context(mctx, cur_rsc,
+				      (cqe_ver ? 
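+				      /* CQE version 1 looks the QP up by user index (srqn_uidx); version 0 by QP number */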
(be32toh(cqe64->srqn_uidx) & 0xffffff) : qpn), + cqe_ver); + if (unlikely(!mqp)) + return CQ_POLL_ERR; + wq = &mqp->sq; + wqe_ctr = be16toh(cqe64->wqe_counter); + idx = wqe_ctr & (wq->wqe_cnt - 1); + if (lazy) { + uint32_t wc_byte_len; + + switch (be32toh(cqe64->sop_drop_qpn) >> 24) { + case MLX5_OPCODE_UMR: + cq->umr_opcode = wq->wr_data[idx]; + break; + + case MLX5_OPCODE_RDMA_READ: + wc_byte_len = be32toh(cqe64->byte_cnt); + goto scatter_out; + case MLX5_OPCODE_ATOMIC_CS: + case MLX5_OPCODE_ATOMIC_FA: + wc_byte_len = 8; + + scatter_out: + if (cqe64->op_own & MLX5_INLINE_SCATTER_32) + err = mlx5_copy_to_send_wqe( + mqp, wqe_ctr, cqe, wc_byte_len); + else if (cqe64->op_own & MLX5_INLINE_SCATTER_64) + err = mlx5_copy_to_send_wqe( + mqp, wqe_ctr, cqe - 1, wc_byte_len); + break; + } + + cq->ibv_cq.wr_id = wq->wrid[idx]; + cq->ibv_cq.status = err; + } else { + handle_good_req(wc, cqe64, wq, idx); + + if (cqe64->op_own & MLX5_INLINE_SCATTER_32) + err = mlx5_copy_to_send_wqe(mqp, wqe_ctr, cqe, + wc->byte_len); + else if (cqe64->op_own & MLX5_INLINE_SCATTER_64) + err = mlx5_copy_to_send_wqe( + mqp, wqe_ctr, cqe - 1, wc->byte_len); + + wc->wr_id = wq->wrid[idx]; + wc->status = err; + } + + wq->tail = wq->wqe_head[idx] + 1; + break; + } + case MLX5_CQE_RESP_WR_IMM: + case MLX5_CQE_RESP_SEND: + case MLX5_CQE_RESP_SEND_IMM: + case MLX5_CQE_RESP_SEND_INV: + srqn_uidx = be32toh(cqe64->srqn_uidx) & 0xffffff; + err = get_cur_rsc(mctx, cqe_ver, qpn, srqn_uidx, cur_rsc, + cur_srq, &is_srq); + if (unlikely(err)) + return CQ_POLL_ERR; + + if (lazy) { + if (likely(cqe64->app != MLX5_CQE_APP_TAG_MATCHING)) { + cq->ibv_cq.status = handle_responder_lazy + (cq, cqe64, *cur_rsc, + is_srq ? *cur_srq : NULL); + } else { + if (unlikely(!is_srq)) + return CQ_POLL_ERR; + + err = handle_tag_matching(cq, cqe64, *cur_srq); + if (unlikely(err)) + return CQ_POLL_ERR; + } + } else { + wc->status = handle_responder(wc, cqe64, *cur_rsc, + is_srq ? *cur_srq : NULL); + } + break; + + case MLX5_CQE_NO_PACKET: + if (unlikely(cqe64->app != MLX5_CQE_APP_TAG_MATCHING)) + return CQ_POLL_ERR; + srqn_uidx = be32toh(cqe64->srqn_uidx) & 0xffffff; + err = get_cur_rsc(mctx, cqe_ver, qpn, srqn_uidx, cur_rsc, + cur_srq, &is_srq); + if (unlikely(err || !is_srq)) + return CQ_POLL_ERR; + err = handle_tag_matching(cq, cqe64, *cur_srq); + if (unlikely(err)) + return CQ_POLL_ERR; + break; + + case MLX5_CQE_RESIZE_CQ: + break; + case MLX5_CQE_REQ_ERR: + case MLX5_CQE_RESP_ERR: + srqn_uidx = be32toh(cqe64->srqn_uidx) & 0xffffff; + ecqe = (struct mlx5_err_cqe *)cqe64; + { + enum ibv_wc_status *pstatus = lazy ? &cq->ibv_cq.status : &wc->status; + + *pstatus = mlx5_handle_error_cqe(ecqe); + } + + if (!lazy) + wc->vendor_err = ecqe->vendor_err_synd; + + if (unlikely(ecqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR && + ecqe->syndrome != MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR && + !is_odp_pfault_err(ecqe))) { + FILE *fp = mctx->dbg_fp; + fprintf(fp, PFX "%s: got completion with error:\n", + mctx->hostname); + dump_cqe(fp, ecqe); + if (mlx5_freeze_on_error_cqe) { + fprintf(fp, PFX "freezing at poll cq..."); + while (1) + sleep(10); + } + } + + if (opcode == MLX5_CQE_REQ_ERR) { + mqp = get_req_context(mctx, cur_rsc, + (cqe_ver ? 
srqn_uidx : qpn), cqe_ver);
+			if (unlikely(!mqp))
+				return CQ_POLL_ERR;
+			wq = &mqp->sq;
+			wqe_ctr = be16toh(cqe64->wqe_counter);
+			idx = wqe_ctr & (wq->wqe_cnt - 1);
+			if (lazy)
+				cq->ibv_cq.wr_id = wq->wrid[idx];
+			else
+				wc->wr_id = wq->wrid[idx];
+			wq->tail = wq->wqe_head[idx] + 1;
+		} else {
+			err = get_cur_rsc(mctx, cqe_ver, qpn, srqn_uidx,
+					  cur_rsc, cur_srq, &is_srq);
+			if (unlikely(err))
+				return CQ_POLL_ERR;
+
+			if (is_srq) {
+				wqe_ctr = be16toh(cqe64->wqe_counter);
+				if (is_odp_pfault_err(ecqe)) {
+					mlx5_complete_odp_fault(*cur_srq, wqe_ctr);
+					err = mlx5_get_next_cqe(cq, &cqe64, &cqe);
+					/* CQ_POLL_NODATA indicates that CQ was not empty but the polled CQE
+					 * was handled internally and should not be processed by the caller.
+					 */
+					if (err == CQ_EMPTY)
+						return CQ_POLL_NODATA;
+					goto again;
+				}
+
+				if (lazy)
+					cq->ibv_cq.wr_id = (*cur_srq)->wrid[wqe_ctr];
+				else
+					wc->wr_id = (*cur_srq)->wrid[wqe_ctr];
+				mlx5_free_srq_wqe(*cur_srq, wqe_ctr);
+			} else {
+				switch ((*cur_rsc)->type) {
+				case MLX5_RSC_TYPE_RWQ:
+					wq = &(rsc_to_mrwq(*cur_rsc)->rq);
+					break;
+				default:
+					wq = &(rsc_to_mqp(*cur_rsc)->rq);
+					break;
+				}
+
+				if (lazy)
+					cq->ibv_cq.wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+				else
+					wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+				++wq->tail;
+			}
+		}
+		break;
+	}
+
+	return CQ_OK;
+}
+
+static inline int mlx5_parse_lazy_cqe(struct mlx5_cq *cq,
+				      struct mlx5_cqe64 *cqe64,
+				      void *cqe, int cqe_ver)
+				      ALWAYS_INLINE;
+static inline int mlx5_parse_lazy_cqe(struct mlx5_cq *cq,
+				      struct mlx5_cqe64 *cqe64,
+				      void *cqe, int cqe_ver)
+{
+	return mlx5_parse_cqe(cq, cqe64, cqe, &cq->cur_rsc, &cq->cur_srq, NULL, cqe_ver, 1);
+}
+
+static inline int mlx5_poll_one(struct mlx5_cq *cq,
+				struct mlx5_resource **cur_rsc,
+				struct mlx5_srq **cur_srq,
+				struct ibv_wc *wc, int cqe_ver)
+				ALWAYS_INLINE;
+static inline int mlx5_poll_one(struct mlx5_cq *cq,
+				struct mlx5_resource **cur_rsc,
+				struct mlx5_srq **cur_srq,
+				struct ibv_wc *wc, int cqe_ver)
+{
+	struct mlx5_cqe64 *cqe64;
+	void *cqe;
+	int err;
+
+	err = mlx5_get_next_cqe(cq, &cqe64, &cqe);
+	if (err == CQ_EMPTY)
+		return err;
+
+	return mlx5_parse_cqe(cq, cqe64, cqe, cur_rsc, cur_srq, wc, cqe_ver, 0);
+}
+
+static inline int poll_cq(struct ibv_cq *ibcq, int ne,
+			  struct ibv_wc *wc, int cqe_ver)
+			  ALWAYS_INLINE;
+static inline int poll_cq(struct ibv_cq *ibcq, int ne,
+			  struct ibv_wc *wc, int cqe_ver)
+{
+	struct mlx5_cq *cq = to_mcq(ibcq);
+	struct mlx5_resource *rsc = NULL;
+	struct mlx5_srq *srq = NULL;
+	int npolled;
+	int err = CQ_OK;
+
+	if (cq->stall_enable) {
+		if (cq->stall_adaptive_enable) {
+			if (cq->stall_last_count)
+				mlx5_stall_cycles_poll_cq(cq->stall_last_count + cq->stall_cycles);
+		} else if (cq->stall_next_poll) {
+			cq->stall_next_poll = 0;
+			mlx5_stall_poll_cq();
+		}
+	}
+
+	mlx5_spin_lock(&cq->lock);
+
+	for (npolled = 0; npolled < ne; ++npolled) {
+		err = mlx5_poll_one(cq, &rsc, &srq, wc + npolled, cqe_ver);
+		if (err != CQ_OK)
+			break;
+	}
+
+	update_cons_index(cq);
+
+	mlx5_spin_unlock(&cq->lock);
+
+	if (cq->stall_enable) {
+		if (cq->stall_adaptive_enable) {
+			if (npolled == 0) {
+				cq->stall_cycles = max(cq->stall_cycles-mlx5_stall_cq_dec_step,
+						       mlx5_stall_cq_poll_min);
+				mlx5_get_cycles(&cq->stall_last_count);
+			} else if (npolled < ne) {
+				cq->stall_cycles = min(cq->stall_cycles+mlx5_stall_cq_inc_step,
+						       mlx5_stall_cq_poll_max);
+				mlx5_get_cycles(&cq->stall_last_count);
+			} else {
+				cq->stall_cycles = max(cq->stall_cycles-mlx5_stall_cq_dec_step,
+						       mlx5_stall_cq_poll_min);
+				cq->stall_last_count = 0;
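+				/* a full batch was polled: back off and skip the cycle wait on the next poll */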
} + } else if (err == CQ_EMPTY) { + cq->stall_next_poll = 1; + } + } + + return err == CQ_POLL_ERR ? err : npolled; +} + +enum polling_mode { + POLLING_MODE_NO_STALL, + POLLING_MODE_STALL, + POLLING_MODE_STALL_ADAPTIVE +}; + +static inline void _mlx5_end_poll(struct ibv_cq_ex *ibcq, + int lock, enum polling_mode stall) + ALWAYS_INLINE; +static inline void _mlx5_end_poll(struct ibv_cq_ex *ibcq, + int lock, enum polling_mode stall) +{ + struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + update_cons_index(cq); + + if (lock) + mlx5_spin_unlock(&cq->lock); + + if (stall) { + if (stall == POLLING_MODE_STALL_ADAPTIVE) { + if (!(cq->flags & MLX5_CQ_FLAGS_FOUND_CQES)) { + cq->stall_cycles = max(cq->stall_cycles - mlx5_stall_cq_dec_step, + mlx5_stall_cq_poll_min); + mlx5_get_cycles(&cq->stall_last_count); + } else if (cq->flags & MLX5_CQ_FLAGS_EMPTY_DURING_POLL) { + cq->stall_cycles = min(cq->stall_cycles + mlx5_stall_cq_inc_step, + mlx5_stall_cq_poll_max); + mlx5_get_cycles(&cq->stall_last_count); + } else { + cq->stall_cycles = max(cq->stall_cycles - mlx5_stall_cq_dec_step, + mlx5_stall_cq_poll_min); + cq->stall_last_count = 0; + } + } else if (!(cq->flags & MLX5_CQ_FLAGS_FOUND_CQES)) { + cq->stall_next_poll = 1; + } + + cq->flags &= ~(MLX5_CQ_FLAGS_FOUND_CQES | MLX5_CQ_FLAGS_EMPTY_DURING_POLL); + } +} + +static inline int mlx5_start_poll(struct ibv_cq_ex *ibcq, struct ibv_poll_cq_attr *attr, + int lock, enum polling_mode stall, + int cqe_version, int clock_update) + ALWAYS_INLINE; +static inline int mlx5_start_poll(struct ibv_cq_ex *ibcq, struct ibv_poll_cq_attr *attr, + int lock, enum polling_mode stall, + int cqe_version, int clock_update) +{ + struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + struct mlx5_cqe64 *cqe64; + void *cqe; + int err; + + if (unlikely(attr->comp_mask)) + return EINVAL; + + if (stall) { + if (stall == POLLING_MODE_STALL_ADAPTIVE) { + if (cq->stall_last_count) + mlx5_stall_cycles_poll_cq(cq->stall_last_count + cq->stall_cycles); + } else if (cq->stall_next_poll) { + cq->stall_next_poll = 0; + mlx5_stall_poll_cq(); + } + } + + if (lock) + mlx5_spin_lock(&cq->lock); + + cq->cur_rsc = NULL; + cq->cur_srq = NULL; + + err = mlx5_get_next_cqe(cq, &cqe64, &cqe); + if (err == CQ_EMPTY) { + if (lock) + mlx5_spin_unlock(&cq->lock); + + if (stall) { + if (stall == POLLING_MODE_STALL_ADAPTIVE) { + cq->stall_cycles = max(cq->stall_cycles - mlx5_stall_cq_dec_step, + mlx5_stall_cq_poll_min); + mlx5_get_cycles(&cq->stall_last_count); + } else { + cq->stall_next_poll = 1; + } + } + + return ENOENT; + } + + if (stall) + cq->flags |= MLX5_CQ_FLAGS_FOUND_CQES; + + err = mlx5_parse_lazy_cqe(cq, cqe64, cqe, cqe_version); + if (lock && err) + mlx5_spin_unlock(&cq->lock); + + if (stall && err == CQ_POLL_ERR) { + if (stall == POLLING_MODE_STALL_ADAPTIVE) { + cq->stall_cycles = max(cq->stall_cycles - mlx5_stall_cq_dec_step, + mlx5_stall_cq_poll_min); + cq->stall_last_count = 0; + } + + cq->flags &= ~(MLX5_CQ_FLAGS_FOUND_CQES); + + goto out; + } + + if (clock_update && !err) + err = mlx5dv_get_clock_info(ibcq->context, &cq->last_clock_info); + +out: + return err; +} + +static inline int mlx5_next_poll(struct ibv_cq_ex *ibcq, + enum polling_mode stall, int cqe_version) + ALWAYS_INLINE; +static inline int mlx5_next_poll(struct ibv_cq_ex *ibcq, + enum polling_mode stall, + int cqe_version) +{ + struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + struct mlx5_cqe64 *cqe64; + void *cqe; + int err; + + err = mlx5_get_next_cqe(cq, &cqe64, &cqe); + if (err == CQ_EMPTY) { + if (stall == 
POLLING_MODE_STALL_ADAPTIVE) + cq->flags |= MLX5_CQ_FLAGS_EMPTY_DURING_POLL; + + return ENOENT; + } + + return mlx5_parse_lazy_cqe(cq, cqe64, cqe, cqe_version); +} + +static inline int mlx5_next_poll_adaptive_v0(struct ibv_cq_ex *ibcq) +{ + return mlx5_next_poll(ibcq, POLLING_MODE_STALL_ADAPTIVE, 0); +} + +static inline int mlx5_next_poll_adaptive_v1(struct ibv_cq_ex *ibcq) +{ + return mlx5_next_poll(ibcq, POLLING_MODE_STALL_ADAPTIVE, 1); +} + +static inline int mlx5_next_poll_v0(struct ibv_cq_ex *ibcq) +{ + return mlx5_next_poll(ibcq, 0, 0); +} + +static inline int mlx5_next_poll_v1(struct ibv_cq_ex *ibcq) +{ + return mlx5_next_poll(ibcq, 0, 1); +} + +static inline int mlx5_start_poll_v0(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr) +{ + return mlx5_start_poll(ibcq, attr, 0, 0, 0, 0); +} + +static inline int mlx5_start_poll_v1(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr) +{ + return mlx5_start_poll(ibcq, attr, 0, 0, 1, 0); +} + +static inline int mlx5_start_poll_v0_lock(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr) +{ + return mlx5_start_poll(ibcq, attr, 1, 0, 0, 0); +} + +static inline int mlx5_start_poll_v1_lock(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr) +{ + return mlx5_start_poll(ibcq, attr, 1, 0, 1, 0); +} + +static inline int mlx5_start_poll_adaptive_stall_v0_lock(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr) +{ + return mlx5_start_poll(ibcq, attr, 1, POLLING_MODE_STALL_ADAPTIVE, 0, 0); +} + +static inline int mlx5_start_poll_stall_v0_lock(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr) +{ + return mlx5_start_poll(ibcq, attr, 1, POLLING_MODE_STALL, 0, 0); +} + +static inline int mlx5_start_poll_adaptive_stall_v1_lock(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr) +{ + return mlx5_start_poll(ibcq, attr, 1, POLLING_MODE_STALL_ADAPTIVE, 1, 0); +} + +static inline int mlx5_start_poll_stall_v1_lock(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr) +{ + return mlx5_start_poll(ibcq, attr, 1, POLLING_MODE_STALL, 1, 0); +} + +static inline int mlx5_start_poll_stall_v0(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr) +{ + return mlx5_start_poll(ibcq, attr, 0, POLLING_MODE_STALL, 0, 0); +} + +static inline int mlx5_start_poll_adaptive_stall_v0(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr) +{ + return mlx5_start_poll(ibcq, attr, 0, POLLING_MODE_STALL_ADAPTIVE, 0, 0); +} + +static inline int mlx5_start_poll_adaptive_stall_v1(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr) +{ + return mlx5_start_poll(ibcq, attr, 0, POLLING_MODE_STALL_ADAPTIVE, 1, 0); +} + +static inline int mlx5_start_poll_stall_v1(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr) +{ + return mlx5_start_poll(ibcq, attr, 0, POLLING_MODE_STALL, 1, 0); +} + +static inline int mlx5_start_poll_v0_lock_clock_update(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr) +{ + return mlx5_start_poll(ibcq, attr, 1, 0, 0, 1); +} + +static inline int mlx5_start_poll_v1_lock_clock_update(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr) +{ + return mlx5_start_poll(ibcq, attr, 1, 0, 1, 1); +} + +static inline int mlx5_start_poll_v1_clock_update(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr) +{ + return mlx5_start_poll(ibcq, attr, 0, 0, 1, 1); +} + +static inline int mlx5_start_poll_v0_clock_update(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr) +{ + return mlx5_start_poll(ibcq, attr, 0, 0, 0, 1); +} + +static inline int mlx5_start_poll_stall_v1_lock_clock_update(struct ibv_cq_ex *ibcq, + struct 
ibv_poll_cq_attr *attr) +{ + return mlx5_start_poll(ibcq, attr, 1, POLLING_MODE_STALL, 1, 1); +} + +static inline int mlx5_start_poll_stall_v0_lock_clock_update(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr) +{ + return mlx5_start_poll(ibcq, attr, 1, POLLING_MODE_STALL, 0, 1); +} + +static inline int mlx5_start_poll_stall_v1_clock_update(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr) +{ + return mlx5_start_poll(ibcq, attr, 0, POLLING_MODE_STALL, 1, 1); +} + +static inline int mlx5_start_poll_stall_v0_clock_update(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr) +{ + return mlx5_start_poll(ibcq, attr, 0, POLLING_MODE_STALL, 0, 1); +} + +static inline int mlx5_start_poll_adaptive_stall_v0_lock_clock_update(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr) +{ + return mlx5_start_poll(ibcq, attr, 1, POLLING_MODE_STALL_ADAPTIVE, 0, 1); +} + +static inline int mlx5_start_poll_adaptive_stall_v1_lock_clock_update(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr) +{ + return mlx5_start_poll(ibcq, attr, 1, POLLING_MODE_STALL_ADAPTIVE, 1, 1); +} + +static inline int mlx5_start_poll_adaptive_stall_v0_clock_update(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr) +{ + return mlx5_start_poll(ibcq, attr, 0, POLLING_MODE_STALL_ADAPTIVE, 0, 1); +} + +static inline int mlx5_start_poll_adaptive_stall_v1_clock_update(struct ibv_cq_ex *ibcq, + struct ibv_poll_cq_attr *attr) +{ + return mlx5_start_poll(ibcq, attr, 0, POLLING_MODE_STALL_ADAPTIVE, 1, 1); +} + +static inline void mlx5_end_poll_adaptive_stall_lock(struct ibv_cq_ex *ibcq) +{ + _mlx5_end_poll(ibcq, 1, POLLING_MODE_STALL_ADAPTIVE); +} + +static inline void mlx5_end_poll_stall_lock(struct ibv_cq_ex *ibcq) +{ + _mlx5_end_poll(ibcq, 1, POLLING_MODE_STALL); +} + +static inline void mlx5_end_poll_adaptive_stall(struct ibv_cq_ex *ibcq) +{ + _mlx5_end_poll(ibcq, 0, POLLING_MODE_STALL_ADAPTIVE); +} + +static inline void mlx5_end_poll_stall(struct ibv_cq_ex *ibcq) +{ + _mlx5_end_poll(ibcq, 0, POLLING_MODE_STALL); +} + +static inline void mlx5_end_poll(struct ibv_cq_ex *ibcq) +{ + _mlx5_end_poll(ibcq, 0, 0); +} + +static inline void mlx5_end_poll_lock(struct ibv_cq_ex *ibcq) +{ + _mlx5_end_poll(ibcq, 1, 0); +} + +int mlx5_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc) +{ + return poll_cq(ibcq, ne, wc, 0); +} + +int mlx5_poll_cq_v1(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc) +{ + return poll_cq(ibcq, ne, wc, 1); +} + +static inline enum ibv_wc_opcode mlx5_cq_read_wc_opcode(struct ibv_cq_ex *ibcq) +{ + struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + switch (mlx5dv_get_cqe_opcode(cq->cqe64)) { + case MLX5_CQE_RESP_WR_IMM: + return IBV_WC_RECV_RDMA_WITH_IMM; + case MLX5_CQE_RESP_SEND: + case MLX5_CQE_RESP_SEND_IMM: + case MLX5_CQE_RESP_SEND_INV: + if (unlikely(cq->cqe64->app == MLX5_CQE_APP_TAG_MATCHING)) { + switch (cq->cqe64->app_op) { + case MLX5_CQE_APP_OP_TM_CONSUMED_MSG_SW_RDNV: + case MLX5_CQE_APP_OP_TM_CONSUMED_MSG: + case MLX5_CQE_APP_OP_TM_CONSUMED_SW_RDNV: + case MLX5_CQE_APP_OP_TM_EXPECTED: + case MLX5_CQE_APP_OP_TM_UNEXPECTED: + return IBV_WC_TM_RECV; + case MLX5_CQE_APP_OP_TM_NO_TAG: + return IBV_WC_TM_NO_TAG; + } + } + return IBV_WC_RECV; + case MLX5_CQE_NO_PACKET: + switch (cq->cqe64->app_op) { + case MLX5_CQE_APP_OP_TM_REMOVE: + return IBV_WC_TM_DEL; + case MLX5_CQE_APP_OP_TM_APPEND: + return IBV_WC_TM_ADD; + case MLX5_CQE_APP_OP_TM_NOOP: + return IBV_WC_TM_SYNC; + case MLX5_CQE_APP_OP_TM_CONSUMED: + return IBV_WC_TM_RECV; + } + break; + case MLX5_CQE_REQ: + switch 
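+		/* the top byte of sop_drop_qpn carries the send WQE opcode */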
(be32toh(cq->cqe64->sop_drop_qpn) >> 24) {
+		case MLX5_OPCODE_RDMA_WRITE_IMM:
+		case MLX5_OPCODE_RDMA_WRITE:
+			return IBV_WC_RDMA_WRITE;
+		case MLX5_OPCODE_SEND_IMM:
+		case MLX5_OPCODE_SEND:
+		case MLX5_OPCODE_SEND_INVAL:
+			return IBV_WC_SEND;
+		case MLX5_OPCODE_RDMA_READ:
+			return IBV_WC_RDMA_READ;
+		case MLX5_OPCODE_ATOMIC_CS:
+			return IBV_WC_COMP_SWAP;
+		case MLX5_OPCODE_ATOMIC_FA:
+			return IBV_WC_FETCH_ADD;
+		case MLX5_OPCODE_UMR:
+			return cq->umr_opcode;
+		case MLX5_OPCODE_TSO:
+			return IBV_WC_TSO;
+		}
+	}
+
+#ifdef MLX5_DEBUG
+{
+	struct mlx5_context *ctx = to_mctx(ibcq->context);
+
+	mlx5_dbg(ctx->dbg_fp, MLX5_DBG_CQ_CQE, "unexpected opcode in cqe\n");
+}
+#endif
+	return 0;
+}
+
+static inline uint32_t mlx5_cq_read_wc_qp_num(struct ibv_cq_ex *ibcq)
+{
+	struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+
+	return be32toh(cq->cqe64->sop_drop_qpn) & 0xffffff;
+}
+
+static inline unsigned int mlx5_cq_read_wc_flags(struct ibv_cq_ex *ibcq)
+{
+	struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
+	int wc_flags = 0;
+
+	if (cq->flags & MLX5_CQ_FLAGS_RX_CSUM_VALID)
+		wc_flags = get_csum_ok(cq->cqe64);
+
+	switch (mlx5dv_get_cqe_opcode(cq->cqe64)) {
+	case MLX5_CQE_RESP_WR_IMM:
+	case MLX5_CQE_RESP_SEND_IMM:
+		wc_flags |= IBV_WC_WITH_IMM;
+		break;
+	case MLX5_CQE_RESP_SEND_INV:
+		wc_flags |= IBV_WC_WITH_INV;
+		break;
+	}
+
+	if (cq->flags & MLX5_CQ_FLAGS_TM_SYNC_REQ)
+		wc_flags |= IBV_WC_TM_SYNC_REQ;
+
+	if (unlikely(cq->cqe64->app == MLX5_CQE_APP_TAG_MATCHING)) {
+		switch (cq->cqe64->app_op) {
+		case MLX5_CQE_APP_OP_TM_CONSUMED_MSG_SW_RDNV:
+		case MLX5_CQE_APP_OP_TM_CONSUMED_MSG:
+		case MLX5_CQE_APP_OP_TM_MSG_COMPLETION_CANCELED:
+			/* Full completion */
+			wc_flags |= (IBV_WC_TM_MATCH | IBV_WC_TM_DATA_VALID);
+			break;
+		case MLX5_CQE_APP_OP_TM_CONSUMED_SW_RDNV:
+		case MLX5_CQE_APP_OP_TM_CONSUMED: /* First completion */
+			wc_flags |= IBV_WC_TM_MATCH;
+			break;
+		case MLX5_CQE_APP_OP_TM_EXPECTED: /* Second completion */
+			wc_flags |= IBV_WC_TM_DATA_VALID;
+			break;
+		}
+	}
+
+	wc_flags |= ((be32toh(cq->cqe64->flags_rqpn) >> 28) & 3) ? 
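+		    /* a non-zero value in bits 28-29 of flags_rqpn indicates a GRH */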
IBV_WC_GRH : 0; + return wc_flags; +} + +static inline uint32_t mlx5_cq_read_wc_byte_len(struct ibv_cq_ex *ibcq) +{ + struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + return be32toh(cq->cqe64->byte_cnt); +} + +static inline uint32_t mlx5_cq_read_wc_vendor_err(struct ibv_cq_ex *ibcq) +{ + struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + struct mlx5_err_cqe *ecqe = (struct mlx5_err_cqe *)cq->cqe64; + + return ecqe->vendor_err_synd; +} + +static inline __be32 mlx5_cq_read_wc_imm_data(struct ibv_cq_ex *ibcq) +{ + struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + switch (mlx5dv_get_cqe_opcode(cq->cqe64)) { + case MLX5_CQE_RESP_SEND_INV: + /* This is returning invalidate_rkey which is in host order, see + * ibv_wc_read_invalidated_rkey + */ + return (__force __be32)be32toh(cq->cqe64->imm_inval_pkey); + default: + return cq->cqe64->imm_inval_pkey; + } +} + +static inline uint32_t mlx5_cq_read_wc_slid(struct ibv_cq_ex *ibcq) +{ + struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + return (uint32_t)be16toh(cq->cqe64->slid); +} + +static inline uint8_t mlx5_cq_read_wc_sl(struct ibv_cq_ex *ibcq) +{ + struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + return (be32toh(cq->cqe64->flags_rqpn) >> 24) & 0xf; +} + +static inline uint32_t mlx5_cq_read_wc_src_qp(struct ibv_cq_ex *ibcq) +{ + struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + return be32toh(cq->cqe64->flags_rqpn) & 0xffffff; +} + +static inline uint8_t mlx5_cq_read_wc_dlid_path_bits(struct ibv_cq_ex *ibcq) +{ + struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + return cq->cqe64->ml_path & 0x7f; +} + +static inline uint64_t mlx5_cq_read_wc_completion_ts(struct ibv_cq_ex *ibcq) +{ + struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + return be64toh(cq->cqe64->timestamp); +} + +static inline uint64_t +mlx5_cq_read_wc_completion_wallclock_ns(struct ibv_cq_ex *ibcq) +{ + struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + return mlx5dv_ts_to_ns(&cq->last_clock_info, + mlx5_cq_read_wc_completion_ts(ibcq)); +} + +static inline uint16_t mlx5_cq_read_wc_cvlan(struct ibv_cq_ex *ibcq) +{ + struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + return be16toh(cq->cqe64->vlan_info); +} + +static inline uint32_t mlx5_cq_read_flow_tag(struct ibv_cq_ex *ibcq) +{ + struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + return be32toh(cq->cqe64->sop_drop_qpn) & MLX5_FLOW_TAG_MASK; +} + +static inline void mlx5_cq_read_wc_tm_info(struct ibv_cq_ex *ibcq, + struct ibv_wc_tm_info *tm_info) +{ + struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq)); + + tm_info->tag = be64toh(cq->cqe64->tmh.tag); + tm_info->priv = be32toh(cq->cqe64->tmh.app_ctx); +} + +#define BIT(i) (1UL << (i)) + +#define SINGLE_THREADED BIT(0) +#define STALL BIT(1) +#define V1 BIT(2) +#define ADAPTIVE BIT(3) +#define CLOCK_UPDATE BIT(4) + +#define mlx5_start_poll_name(cqe_ver, lock, stall, adaptive, clock_update) \ + mlx5_start_poll##adaptive##stall##cqe_ver##lock##clock_update +#define mlx5_next_poll_name(cqe_ver, adaptive) \ + mlx5_next_poll##adaptive##cqe_ver +#define mlx5_end_poll_name(lock, stall, adaptive) \ + mlx5_end_poll##adaptive##stall##lock + +#define POLL_FN_ENTRY(cqe_ver, lock, stall, adaptive, clock_update) { \ + .start_poll = &mlx5_start_poll_name(cqe_ver, lock, stall, adaptive, clock_update), \ + .next_poll = &mlx5_next_poll_name(cqe_ver, adaptive), \ + .end_poll = &mlx5_end_poll_name(lock, stall, adaptive), \ + } + +static const struct op +{ + int (*start_poll)(struct ibv_cq_ex *ibcq, struct ibv_poll_cq_attr *attr); + int 
(*next_poll)(struct ibv_cq_ex *ibcq); + void (*end_poll)(struct ibv_cq_ex *ibcq); +} ops[ADAPTIVE + V1 + STALL + SINGLE_THREADED + CLOCK_UPDATE + 1] = { + [V1] = POLL_FN_ENTRY(_v1, _lock, , ,), + [0] = POLL_FN_ENTRY(_v0, _lock, , ,), + [V1 | SINGLE_THREADED] = POLL_FN_ENTRY(_v1, , , , ), + [SINGLE_THREADED] = POLL_FN_ENTRY(_v0, , , , ), + [V1 | STALL] = POLL_FN_ENTRY(_v1, _lock, _stall, , ), + [STALL] = POLL_FN_ENTRY(_v0, _lock, _stall, , ), + [V1 | SINGLE_THREADED | STALL] = POLL_FN_ENTRY(_v1, , _stall, , ), + [SINGLE_THREADED | STALL] = POLL_FN_ENTRY(_v0, , _stall, , ), + [V1 | STALL | ADAPTIVE] = POLL_FN_ENTRY(_v1, _lock, _stall, _adaptive, ), + [STALL | ADAPTIVE] = POLL_FN_ENTRY(_v0, _lock, _stall, _adaptive, ), + [V1 | SINGLE_THREADED | STALL | ADAPTIVE] = POLL_FN_ENTRY(_v1, , _stall, _adaptive, ), + [SINGLE_THREADED | STALL | ADAPTIVE] = POLL_FN_ENTRY(_v0, , _stall, _adaptive, ), + [V1 | CLOCK_UPDATE] = POLL_FN_ENTRY(_v1, _lock, , , _clock_update), + [0 | CLOCK_UPDATE] = POLL_FN_ENTRY(_v0, _lock, , , _clock_update), + [V1 | SINGLE_THREADED | CLOCK_UPDATE] = POLL_FN_ENTRY(_v1, , , , _clock_update), + [SINGLE_THREADED | CLOCK_UPDATE] = POLL_FN_ENTRY(_v0, , , , _clock_update), + [V1 | STALL | CLOCK_UPDATE] = POLL_FN_ENTRY(_v1, _lock, _stall, , _clock_update), + [STALL | CLOCK_UPDATE] = POLL_FN_ENTRY(_v0, _lock, _stall, , _clock_update), + [V1 | SINGLE_THREADED | STALL | CLOCK_UPDATE] = POLL_FN_ENTRY(_v1, , _stall, , _clock_update), + [SINGLE_THREADED | STALL | CLOCK_UPDATE] = POLL_FN_ENTRY(_v0, , _stall, , _clock_update), + [V1 | STALL | ADAPTIVE | CLOCK_UPDATE] = POLL_FN_ENTRY(_v1, _lock, _stall, _adaptive, _clock_update), + [STALL | ADAPTIVE | CLOCK_UPDATE] = POLL_FN_ENTRY(_v0, _lock, _stall, _adaptive, _clock_update), + [V1 | SINGLE_THREADED | STALL | ADAPTIVE | CLOCK_UPDATE] = POLL_FN_ENTRY(_v1, , _stall, _adaptive, _clock_update), + [SINGLE_THREADED | STALL | ADAPTIVE | CLOCK_UPDATE] = POLL_FN_ENTRY(_v0, , _stall, _adaptive, _clock_update), +}; + +int mlx5_cq_fill_pfns(struct mlx5_cq *cq, + const struct ibv_cq_init_attr_ex *cq_attr, + struct mlx5_context *mctx) +{ + const struct op *poll_ops = &ops[((cq->stall_enable && cq->stall_adaptive_enable) ? ADAPTIVE : 0) | + (mctx->cqe_version ? V1 : 0) | + (cq->flags & MLX5_CQ_FLAGS_SINGLE_THREADED ? + SINGLE_THREADED : 0) | + (cq->stall_enable ? STALL : 0) | + ((cq_attr->wc_flags & IBV_WC_EX_WITH_COMPLETION_TIMESTAMP_WALLCLOCK) ? 
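+				  /* wallclock reads require refreshing the clock info at start_poll */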
+ CLOCK_UPDATE : 0)]; + + cq->ibv_cq.start_poll = poll_ops->start_poll; + cq->ibv_cq.next_poll = poll_ops->next_poll; + cq->ibv_cq.end_poll = poll_ops->end_poll; + + cq->ibv_cq.read_opcode = mlx5_cq_read_wc_opcode; + cq->ibv_cq.read_vendor_err = mlx5_cq_read_wc_vendor_err; + cq->ibv_cq.read_wc_flags = mlx5_cq_read_wc_flags; + if (cq_attr->wc_flags & IBV_WC_EX_WITH_BYTE_LEN) + cq->ibv_cq.read_byte_len = mlx5_cq_read_wc_byte_len; + if (cq_attr->wc_flags & IBV_WC_EX_WITH_IMM) + cq->ibv_cq.read_imm_data = mlx5_cq_read_wc_imm_data; + if (cq_attr->wc_flags & IBV_WC_EX_WITH_QP_NUM) + cq->ibv_cq.read_qp_num = mlx5_cq_read_wc_qp_num; + if (cq_attr->wc_flags & IBV_WC_EX_WITH_SRC_QP) + cq->ibv_cq.read_src_qp = mlx5_cq_read_wc_src_qp; + if (cq_attr->wc_flags & IBV_WC_EX_WITH_SLID) + cq->ibv_cq.read_slid = mlx5_cq_read_wc_slid; + if (cq_attr->wc_flags & IBV_WC_EX_WITH_SL) + cq->ibv_cq.read_sl = mlx5_cq_read_wc_sl; + if (cq_attr->wc_flags & IBV_WC_EX_WITH_DLID_PATH_BITS) + cq->ibv_cq.read_dlid_path_bits = mlx5_cq_read_wc_dlid_path_bits; + if (cq_attr->wc_flags & IBV_WC_EX_WITH_COMPLETION_TIMESTAMP) + cq->ibv_cq.read_completion_ts = mlx5_cq_read_wc_completion_ts; + if (cq_attr->wc_flags & IBV_WC_EX_WITH_CVLAN) + cq->ibv_cq.read_cvlan = mlx5_cq_read_wc_cvlan; + if (cq_attr->wc_flags & IBV_WC_EX_WITH_FLOW_TAG) + cq->ibv_cq.read_flow_tag = mlx5_cq_read_flow_tag; + if (cq_attr->wc_flags & IBV_WC_EX_WITH_TM_INFO) + cq->ibv_cq.read_tm_info = mlx5_cq_read_wc_tm_info; + if (cq_attr->wc_flags & IBV_WC_EX_WITH_COMPLETION_TIMESTAMP_WALLCLOCK) { + if (!mctx->clock_info_page) + return EOPNOTSUPP; + cq->ibv_cq.read_completion_wallclock_ns = + mlx5_cq_read_wc_completion_wallclock_ns; + } + + return 0; +} + +int mlx5_arm_cq(struct ibv_cq *ibvcq, int solicited) +{ + struct mlx5_cq *cq = to_mcq(ibvcq); + struct mlx5_context *ctx = to_mctx(ibvcq->context); + uint64_t doorbell; + uint32_t sn; + uint32_t ci; + uint32_t cmd; + + sn = cq->arm_sn & 3; + ci = cq->cons_index & 0xffffff; + cmd = solicited ? MLX5_CQ_DB_REQ_NOT_SOL : MLX5_CQ_DB_REQ_NOT; + + doorbell = sn << 28 | cmd | ci; + doorbell <<= 32; + doorbell |= cq->cqn; + + cq->dbrec[MLX5_CQ_ARM_DB] = htobe32(sn << 28 | cmd | ci); + + /* + * Make sure that the doorbell record in host memory is + * written before ringing the doorbell via PCI WC MMIO. 
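+	 * mmio_wc_start() provides that ordering and mmio_flush_writes()
+	 * pushes the doorbell out of the write-combining buffer.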
+ */ + mmio_wc_start(); + + mmio_write64_be(ctx->cq_uar_reg + MLX5_CQ_DOORBELL, htobe64(doorbell)); + + mmio_flush_writes(); + + return 0; +} + +void mlx5_cq_event(struct ibv_cq *cq) +{ + to_mcq(cq)->arm_sn++; +} + +static int is_equal_rsn(struct mlx5_cqe64 *cqe64, uint32_t rsn) +{ + return rsn == (be32toh(cqe64->sop_drop_qpn) & 0xffffff); +} + +static inline int is_equal_uidx(struct mlx5_cqe64 *cqe64, uint32_t uidx) +{ + return uidx == (be32toh(cqe64->srqn_uidx) & 0xffffff); +} + +static inline int is_responder(uint8_t opcode) +{ + switch (opcode) { + case MLX5_CQE_RESP_WR_IMM: + case MLX5_CQE_RESP_SEND: + case MLX5_CQE_RESP_SEND_IMM: + case MLX5_CQE_RESP_SEND_INV: + case MLX5_CQE_RESP_ERR: + return 1; + } + + return 0; +} + +static inline int free_res_cqe(struct mlx5_cqe64 *cqe64, uint32_t rsn, + struct mlx5_srq *srq, int cqe_version) +{ + if (cqe_version) { + if (is_equal_uidx(cqe64, rsn)) { + if (srq && is_responder(mlx5dv_get_cqe_opcode(cqe64))) + mlx5_free_srq_wqe(srq, + be16toh(cqe64->wqe_counter)); + return 1; + } + } else { + if (is_equal_rsn(cqe64, rsn)) { + if (srq && (be32toh(cqe64->srqn_uidx) & 0xffffff)) + mlx5_free_srq_wqe(srq, + be16toh(cqe64->wqe_counter)); + return 1; + } + } + + return 0; +} + +void __mlx5_cq_clean(struct mlx5_cq *cq, uint32_t rsn, struct mlx5_srq *srq) +{ + uint32_t prod_index; + int nfreed = 0; + struct mlx5_cqe64 *cqe64, *dest64; + void *cqe, *dest; + uint8_t owner_bit; + int cqe_version; + + if (!cq || cq->flags & MLX5_CQ_FLAGS_DV_OWNED) + return; + + /* + * First we need to find the current producer index, so we + * know where to start cleaning from. It doesn't matter if HW + * adds new entries after this loop -- the QP we're worried + * about is already in RESET, so the new entries won't come + * from our QP and therefore don't need to be checked. + */ + for (prod_index = cq->cons_index; get_sw_cqe(cq, prod_index); ++prod_index) + if (prod_index == cq->cons_index + cq->ibv_cq.cqe) + break; + + /* + * Now sweep backwards through the CQ, removing CQ entries + * that match our QP by copying older entries on top of them. + */ + cqe_version = (to_mctx(cq->ibv_cq.context))->cqe_version; + while ((int) --prod_index - (int) cq->cons_index >= 0) { + cqe = get_cqe(cq, prod_index & cq->ibv_cq.cqe); + cqe64 = (cq->cqe_sz == 64) ? cqe : cqe + 64; + if (free_res_cqe(cqe64, rsn, srq, cqe_version)) { + ++nfreed; + } else if (nfreed) { + dest = get_cqe(cq, (prod_index + nfreed) & cq->ibv_cq.cqe); + dest64 = (cq->cqe_sz == 64) ? dest : dest + 64; + owner_bit = dest64->op_own & MLX5_CQE_OWNER_MASK; + memcpy(dest, cqe, cq->cqe_sz); + dest64->op_own = owner_bit | + (dest64->op_own & ~MLX5_CQE_OWNER_MASK); + } + } + + if (nfreed) { + cq->cons_index += nfreed; + /* + * Make sure update of buffer contents is done before + * updating consumer index. + */ + udma_to_device_barrier(); + update_cons_index(cq); + } +} + +void mlx5_cq_clean(struct mlx5_cq *cq, uint32_t qpn, struct mlx5_srq *srq) +{ + mlx5_spin_lock(&cq->lock); + __mlx5_cq_clean(cq, qpn, srq); + mlx5_spin_unlock(&cq->lock); +} + +static uint8_t sw_ownership_bit(int n, int nent) +{ + return (n & nent) ? 
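+	/* the SW ownership value alternates on every pass through the CQ buffer */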
1 : 0; +} + +static int is_hw(uint8_t own, int n, int mask) +{ + return (own & MLX5_CQE_OWNER_MASK) ^ !!(n & (mask + 1)); +} + +void mlx5_cq_resize_copy_cqes(struct mlx5_cq *cq) +{ + struct mlx5_cqe64 *scqe64; + struct mlx5_cqe64 *dcqe64; + void *start_cqe; + void *scqe; + void *dcqe; + int ssize; + int dsize; + int i; + uint8_t sw_own; + + ssize = cq->cqe_sz; + dsize = cq->resize_cqe_sz; + + i = cq->cons_index; + scqe = get_buf_cqe(cq->active_buf, i & cq->active_cqes, ssize); + scqe64 = ssize == 64 ? scqe : scqe + 64; + start_cqe = scqe; + if (is_hw(scqe64->op_own, i, cq->active_cqes)) { + fprintf(stderr, "expected cqe in sw ownership\n"); + return; + } + + while ((scqe64->op_own >> 4) != MLX5_CQE_RESIZE_CQ) { + dcqe = get_buf_cqe(cq->resize_buf, (i + 1) & (cq->resize_cqes - 1), dsize); + dcqe64 = dsize == 64 ? dcqe : dcqe + 64; + sw_own = sw_ownership_bit(i + 1, cq->resize_cqes); + memcpy(dcqe, scqe, ssize); + dcqe64->op_own = (dcqe64->op_own & ~MLX5_CQE_OWNER_MASK) | sw_own; + + ++i; + scqe = get_buf_cqe(cq->active_buf, i & cq->active_cqes, ssize); + scqe64 = ssize == 64 ? scqe : scqe + 64; + if (is_hw(scqe64->op_own, i, cq->active_cqes)) { + fprintf(stderr, "expected cqe in sw ownership\n"); + return; + } + + if (scqe == start_cqe) { + fprintf(stderr, "resize CQ failed to get resize CQE\n"); + return; + } + } + ++cq->cons_index; +} + +int mlx5_alloc_cq_buf(struct mlx5_context *mctx, struct mlx5_cq *cq, + struct mlx5_buf *buf, int nent, int cqe_sz) +{ + struct mlx5_cqe64 *cqe; + int i; + struct mlx5_device *dev = to_mdev(mctx->ibv_ctx.context.device); + int ret; + enum mlx5_alloc_type type; + enum mlx5_alloc_type default_type = MLX5_ALLOC_TYPE_ANON; + + if (mlx5_use_huge("HUGE_CQ")) + default_type = MLX5_ALLOC_TYPE_HUGE; + + mlx5_get_alloc_type(mctx, cq->parent_domain, + MLX5_CQ_PREFIX, &type, default_type); + + if (type == MLX5_ALLOC_TYPE_CUSTOM) { + buf->mparent_domain = to_mparent_domain(cq->parent_domain); + buf->req_alignment = dev->page_size; + buf->resource_type = MLX5DV_RES_TYPE_CQ; + } + + ret = mlx5_alloc_prefered_buf(mctx, buf, + align(nent * cqe_sz, dev->page_size), + dev->page_size, + type, + MLX5_CQ_PREFIX); + + if (ret) + return -1; + + if (buf->type != MLX5_ALLOC_TYPE_CUSTOM) + memset(buf->buf, 0, nent * cqe_sz); + + for (i = 0; i < nent; ++i) { + cqe = buf->buf + i * cqe_sz; + cqe += cqe_sz == 128 ? 1 : 0; + cqe->op_own = MLX5_CQE_INVALID << 4; + } + + return 0; +} + +int mlx5_free_cq_buf(struct mlx5_context *ctx, struct mlx5_buf *buf) +{ + return mlx5_free_actual_buf(ctx, buf); +} diff --git a/providers/mlx5/dbrec.c b/providers/mlx5/dbrec.c new file mode 100644 index 0000000..5ef3d16 --- /dev/null +++ b/providers/mlx5/dbrec.c @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#define _GNU_SOURCE +#include <config.h> + +#include <stdlib.h> +#include <pthread.h> +#include <string.h> + +#include "mlx5.h" + +struct mlx5_db_page { + struct mlx5_db_page *prev, *next; + struct mlx5_buf buf; + int num_db; + int use_cnt; + unsigned long free[0]; +}; + +static struct mlx5_db_page *__add_page(struct mlx5_context *context) +{ + struct mlx5_db_page *page; + int ps = to_mdev(context->ibv_ctx.context.device)->page_size; + int pp; + int i; + int nlong; + int ret; + + pp = ps / context->cache_line_size; + nlong = (pp + 8 * sizeof(long) - 1) / (8 * sizeof(long)); + + page = malloc(sizeof *page + nlong * sizeof(long)); + if (!page) + return NULL; + + if (mlx5_is_extern_alloc(context)) + ret = mlx5_alloc_buf_extern(context, &page->buf, ps); + else + ret = mlx5_alloc_buf(&page->buf, ps, ps); + if (ret) { + free(page); + return NULL; + } + + page->num_db = pp; + page->use_cnt = 0; + for (i = 0; i < nlong; ++i) + page->free[i] = ~0; + + page->prev = NULL; + page->next = context->db_list; + context->db_list = page; + if (page->next) + page->next->prev = page; + + return page; +} + +__be32 *mlx5_alloc_dbrec(struct mlx5_context *context, struct ibv_pd *pd, + bool *custom_alloc) +{ + struct mlx5_db_page *page; + __be32 *db = NULL; + int i, j; + + if (mlx5_is_custom_alloc(pd)) { + struct mlx5_parent_domain *mparent_domain = to_mparent_domain(pd); + + db = mparent_domain->alloc(&mparent_domain->mpd.ibv_pd, + mparent_domain->pd_context, 8, 8, + MLX5DV_RES_TYPE_DBR); + + if (db == IBV_ALLOCATOR_USE_DEFAULT) + goto default_alloc; + + if (!db) + return NULL; + + *custom_alloc = true; + return db; + } + +default_alloc: + pthread_mutex_lock(&context->db_list_mutex); + + for (page = context->db_list; page; page = page->next) + if (page->use_cnt < page->num_db) + goto found; + + page = __add_page(context); + if (!page) + goto out; + +found: + ++page->use_cnt; + + for (i = 0; !page->free[i]; ++i) + /* nothing */; + + j = ffsl(page->free[i]); + --j; + page->free[i] &= ~(1UL << j); + db = page->buf.buf + (i * 8 * sizeof(long) + j) * context->cache_line_size; + +out: + pthread_mutex_unlock(&context->db_list_mutex); + + return db; +} + +void mlx5_free_db(struct mlx5_context *context, __be32 *db, struct ibv_pd *pd, + bool custom_alloc) +{ + struct mlx5_db_page *page; + uintptr_t ps = to_mdev(context->ibv_ctx.context.device)->page_size; + int i; + + if (custom_alloc) { + struct mlx5_parent_domain *mparent_domain = to_mparent_domain(pd); + + mparent_domain->free(&mparent_domain->mpd.ibv_pd, + mparent_domain->pd_context, + db, + MLX5DV_RES_TYPE_DBR); + return; + } + + pthread_mutex_lock(&context->db_list_mutex); + + for (page = context->db_list; page; page = page->next) + if (((uintptr_t) db & ~(ps - 1)) == (uintptr_t) page->buf.buf) + break; + + if (!page) + goto out; + + i = ((void *) db - 
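+	    /* doorbell records are handed out one per cache line, so the offset in cache lines is the bitmap index */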
page->buf.buf) / context->cache_line_size; + page->free[i / (8 * sizeof(long))] |= 1UL << (i % (8 * sizeof(long))); + + if (!--page->use_cnt) { + if (page->prev) + page->prev->next = page->next; + else + context->db_list = page->next; + if (page->next) + page->next->prev = page->prev; + + if (page->buf.type == MLX5_ALLOC_TYPE_EXTERNAL) + mlx5_free_buf_extern(context, &page->buf); + else + mlx5_free_buf(&page->buf); + + free(page); + } + +out: + pthread_mutex_unlock(&context->db_list_mutex); +} diff --git a/providers/mlx5/dr_action.c b/providers/mlx5/dr_action.c new file mode 100644 index 0000000..5f457fa --- /dev/null +++ b/providers/mlx5/dr_action.c @@ -0,0 +1,1875 @@ +/* + * Copyright (c) 2019, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <unistd.h> +#include <arpa/inet.h> +#include <ccan/ilog.h> +#include <ccan/array_size.h> +#include "mlx5dv_dr.h" + +enum dr_action_domain { + DR_ACTION_DOMAIN_NIC_INGRESS, + DR_ACTION_DOMAIN_NIC_EGRESS, + DR_ACTION_DOMAIN_FDB_INGRESS, + DR_ACTION_DOMAIN_FDB_EGRESS, + DR_ACTION_DOMAIN_MAX, +}; + +enum dr_action_valid_state { + DR_ACTION_STATE_ERR, + DR_ACTION_STATE_NO_ACTION, + DR_ACTION_STATE_REFORMAT, + DR_ACTION_STATE_MODIFY_HDR, + DR_ACTION_STATE_NON_TERM, + DR_ACTION_STATE_TERM, + DR_ACTION_STATE_MAX, +}; + +static const enum dr_action_valid_state next_action_state[DR_ACTION_DOMAIN_MAX] + [DR_ACTION_STATE_MAX] + [DR_ACTION_TYP_MAX] = { + [DR_ACTION_DOMAIN_NIC_INGRESS] = { + [DR_ACTION_STATE_NO_ACTION] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_QP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_TAG] = DR_ACTION_STATE_NON_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_NON_TERM, + [DR_ACTION_TYP_METER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_TNL_L2_TO_L2] = DR_ACTION_STATE_REFORMAT, + [DR_ACTION_TYP_TNL_L3_TO_L2] = DR_ACTION_STATE_REFORMAT, + [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + }, + [DR_ACTION_STATE_REFORMAT] = { + [DR_ACTION_TYP_QP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_TAG] = DR_ACTION_STATE_REFORMAT, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_REFORMAT, + [DR_ACTION_TYP_METER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + }, + [DR_ACTION_STATE_MODIFY_HDR] = { + [DR_ACTION_TYP_QP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_TAG] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_METER] = DR_ACTION_STATE_TERM, + }, + [DR_ACTION_STATE_NON_TERM] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_QP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_TAG] = DR_ACTION_STATE_NON_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_NON_TERM, + [DR_ACTION_TYP_METER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_TNL_L2_TO_L2] = DR_ACTION_STATE_REFORMAT, + [DR_ACTION_TYP_TNL_L3_TO_L2] = DR_ACTION_STATE_REFORMAT, + [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + }, + [DR_ACTION_STATE_TERM] = { + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_TERM, + }, + }, + [DR_ACTION_DOMAIN_NIC_EGRESS] = { + [DR_ACTION_STATE_NO_ACTION] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_NON_TERM, + [DR_ACTION_TYP_METER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_REFORMAT, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_REFORMAT, + [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + }, + [DR_ACTION_STATE_REFORMAT] = { + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_REFORMAT, + [DR_ACTION_TYP_METER] = DR_ACTION_STATE_TERM, + }, + [DR_ACTION_STATE_MODIFY_HDR] = { + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_METER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_REFORMAT, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_REFORMAT, + }, + [DR_ACTION_STATE_NON_TERM] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_NON_TERM, + [DR_ACTION_TYP_METER] = DR_ACTION_STATE_TERM, 
+ [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_REFORMAT, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_REFORMAT, + [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + }, + [DR_ACTION_STATE_TERM] = { + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_TERM, + }, + }, + [DR_ACTION_DOMAIN_FDB_INGRESS] = { + [DR_ACTION_STATE_NO_ACTION] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_NON_TERM, + [DR_ACTION_TYP_METER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_TNL_L2_TO_L2] = DR_ACTION_STATE_REFORMAT, + [DR_ACTION_TYP_TNL_L3_TO_L2] = DR_ACTION_STATE_REFORMAT, + [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + }, + [DR_ACTION_STATE_REFORMAT] = { + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_REFORMAT, + [DR_ACTION_TYP_METER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + }, + [DR_ACTION_STATE_MODIFY_HDR] = { + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_METER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + }, + [DR_ACTION_STATE_NON_TERM] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_NON_TERM, + [DR_ACTION_TYP_METER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_TNL_L2_TO_L2] = DR_ACTION_STATE_REFORMAT, + [DR_ACTION_TYP_TNL_L3_TO_L2] = DR_ACTION_STATE_REFORMAT, + [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + }, + [DR_ACTION_STATE_TERM] = { + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_TERM, + }, + }, + [DR_ACTION_DOMAIN_FDB_EGRESS] = { + [DR_ACTION_STATE_NO_ACTION] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_NON_TERM, + [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_METER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_REFORMAT, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_REFORMAT, + [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + }, + [DR_ACTION_STATE_REFORMAT] = { + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_REFORMAT, + [DR_ACTION_TYP_METER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + }, + [DR_ACTION_STATE_MODIFY_HDR] = { + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_METER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_REFORMAT, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_REFORMAT, + [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + }, + [DR_ACTION_STATE_NON_TERM] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_NON_TERM, + [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_METER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_REFORMAT, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_REFORMAT, + [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + }, + [DR_ACTION_STATE_TERM] = { + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_TERM, + }, + }, +}; + +struct dr_action_modify_field_conv { + uint16_t hw_field; + uint8_t start; + uint8_t end; + uint8_t l3_type; + uint8_t 
l4_type; +}; + +static const struct dr_action_modify_field_conv dr_action_conv_arr[] = { + [MLX5_ACTION_IN_FIELD_OUT_SMAC_47_16] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L2_1, .start = 16, .end = 47, + }, + [MLX5_ACTION_IN_FIELD_OUT_SMAC_15_0] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L2_1, .start = 0, .end = 15, + }, + [MLX5_ACTION_IN_FIELD_OUT_ETHERTYPE] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L2_2, .start = 32, .end = 47, + }, + [MLX5_ACTION_IN_FIELD_OUT_DMAC_47_16] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L2_0, .start = 16, .end = 47, + }, + [MLX5_ACTION_IN_FIELD_OUT_DMAC_15_0] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L2_0, .start = 0, .end = 15, + }, + [MLX5_ACTION_IN_FIELD_OUT_IP_DSCP] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L3_1, .start = 0, .end = 5, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_FLAGS] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L4_0, .start = 48, .end = 56, + .l4_type = MLX5_DR_ACTION_MDFY_HW_HDR_L4_TCP, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_SPORT] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L4_0, .start = 0, .end = 15, + .l4_type = MLX5_DR_ACTION_MDFY_HW_HDR_L4_TCP, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_DPORT] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L4_0, .start = 16, .end = 31, + .l4_type = MLX5_DR_ACTION_MDFY_HW_HDR_L4_TCP, + }, + [MLX5_ACTION_IN_FIELD_OUT_IP_TTL] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L3_1, .start = 8, .end = 15, + .l3_type = MLX5_DR_ACTION_MDFY_HW_HDR_L3_IPV4, + }, + [MLX5_ACTION_IN_FIELD_OUT_IPV6_HOPLIMIT] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L3_1, .start = 8, .end = 15, + .l3_type = MLX5_DR_ACTION_MDFY_HW_HDR_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_UDP_SPORT] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L4_0, .start = 0, .end = 15, + .l4_type = MLX5_DR_ACTION_MDFY_HW_HDR_L4_UDP, + }, + [MLX5_ACTION_IN_FIELD_OUT_UDP_DPORT] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L4_0, .start = 16, .end = 31, + .l4_type = MLX5_DR_ACTION_MDFY_HW_HDR_L4_UDP, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_127_96] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L3_3, .start = 32, .end = 63, + .l3_type = MLX5_DR_ACTION_MDFY_HW_HDR_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_95_64] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L3_3, .start = 0, .end = 31, + .l3_type = MLX5_DR_ACTION_MDFY_HW_HDR_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_63_32] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L3_4, .start = 32, .end = 63, + .l3_type = MLX5_DR_ACTION_MDFY_HW_HDR_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_31_0] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L3_4, .start = 0, .end = 31, + .l3_type = MLX5_DR_ACTION_MDFY_HW_HDR_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_127_96] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L3_0, .start = 32, .end = 63, + .l3_type = MLX5_DR_ACTION_MDFY_HW_HDR_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_95_64] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L3_0, .start = 0, .end = 31, + .l3_type = MLX5_DR_ACTION_MDFY_HW_HDR_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_63_32] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L3_2, .start = 32, .end = 63, + .l3_type = MLX5_DR_ACTION_MDFY_HW_HDR_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_31_0] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L3_2, .start = 0, .end = 31, + .l3_type = MLX5_DR_ACTION_MDFY_HW_HDR_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV4] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L3_0, .start = 0, .end = 31, + .l3_type = MLX5_DR_ACTION_MDFY_HW_HDR_L3_IPV4, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV4] = { + .hw_field = 
MLX5_DR_ACTION_MDFY_HW_FLD_L3_0, .start = 32, .end = 63, + .l3_type = MLX5_DR_ACTION_MDFY_HW_HDR_L3_IPV4, + }, + [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGA] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_METADATA, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGB] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_METADATA, .start = 32, .end = 63, + }, + [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_0] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_REG_0, .start = 32, .end = 63, + }, + [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_1] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_REG_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_2] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_REG_1, .start = 32, .end = 63, + }, + [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_3] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_REG_1, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_4] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_REG_2, .start = 32, .end = 63, + }, + [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_5] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_REG_2, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_SEQ_NUM] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L4_1, .start = 32, .end = 63, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_ACK_NUM] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L4_1, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_FIRST_VID] = { + .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L2_2, .start = 0, .end = 15, + }, +}; + +struct dr_action_apply_attr { + uint32_t modify_index; + uint16_t modify_actions; + uint32_t decap_index; + uint16_t decap_actions; + bool decap_with_vlan; + uint64_t final_icm_addr; + uint32_t flow_tag; + uint32_t ctr_id; + uint16_t gvmi; + uint32_t reformat_id; + uint32_t reformat_size; +}; + +static enum mlx5dv_flow_action_packet_reformat_type +dr_action_type_to_reformat_enum(enum dr_action_type action_type) +{ + switch (action_type) { + case DR_ACTION_TYP_TNL_L2_TO_L2: + return MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2; + case DR_ACTION_TYP_L2_TO_TNL_L2: + return MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL; + case DR_ACTION_TYP_TNL_L3_TO_L2: + return MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2; + case DR_ACTION_TYP_L2_TO_TNL_L3: + return MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL; + default: + assert(false); + return 0; + } +} + +static enum dr_action_type +dr_action_reformat_to_action_type(enum mlx5dv_flow_action_packet_reformat_type type) +{ + switch (type) { + case MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2: + return DR_ACTION_TYP_TNL_L2_TO_L2; + case MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL: + return DR_ACTION_TYP_L2_TO_TNL_L2; + case MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2: + return DR_ACTION_TYP_TNL_L3_TO_L2; + case MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL: + return DR_ACTION_TYP_L2_TO_TNL_L3; + default: + assert(false); + return 0; + } +} + +static void dr_actions_init_next_ste(uint8_t **last_ste, + uint32_t *added_stes, + enum dr_ste_entry_type entry_type, + uint16_t gvmi) +{ + (*added_stes)++; + *last_ste += DR_STE_SIZE; + dr_ste_init(*last_ste, DR_STE_LU_TYPE_DONT_CARE, entry_type, gvmi); +} + +static void dr_actions_apply_tx(uint8_t *action_type_set, + uint8_t *last_ste, + struct dr_action_apply_attr *attr, + uint32_t *added_stes) +{ + /* We want to make sure the modify header comes before L2 + * encapsulation. 
The reason is that we support
+	 * modify headers for outer headers only
+	 */
+	if (action_type_set[DR_ACTION_TYP_MODIFY_HDR]) {
+		dr_ste_set_entry_type(last_ste, DR_STE_TYPE_MODIFY_PKT);
+		dr_ste_set_rewrite_actions(last_ste,
+					   attr->modify_actions,
+					   attr->modify_index);
+	}
+
+	if (action_type_set[DR_ACTION_TYP_L2_TO_TNL_L2] ||
+	    action_type_set[DR_ACTION_TYP_L2_TO_TNL_L3]) {
+		/* Modify header and encapsulation require different STEs,
+		 * since the modify header STE format doesn't support the
+		 * encapsulation tunneling_action.
+		 */
+		if (action_type_set[DR_ACTION_TYP_MODIFY_HDR])
+			dr_actions_init_next_ste(&last_ste,
+						 added_stes,
+						 DR_STE_TYPE_TX,
+						 attr->gvmi);
+
+		dr_ste_set_tx_encap(last_ste,
+				    attr->reformat_id,
+				    attr->reformat_size,
+				    action_type_set[DR_ACTION_TYP_L2_TO_TNL_L3]);
+	}
+
+	if (action_type_set[DR_ACTION_TYP_CTR])
+		dr_ste_set_counter_id(last_ste, attr->ctr_id);
+}
+
+static void dr_actions_apply_rx(uint8_t *action_type_set,
+				uint8_t *last_ste,
+				struct dr_action_apply_attr *attr,
+				uint32_t *added_stes)
+{
+	if (action_type_set[DR_ACTION_TYP_CTR])
+		dr_ste_set_counter_id(last_ste, attr->ctr_id);
+
+	if (action_type_set[DR_ACTION_TYP_TNL_L3_TO_L2]) {
+		dr_ste_set_entry_type(last_ste, DR_STE_TYPE_MODIFY_PKT);
+		dr_ste_set_rx_decap_l3(last_ste, attr->decap_with_vlan);
+		dr_ste_set_rewrite_actions(last_ste,
+					   attr->decap_actions,
+					   attr->decap_index);
+	}
+
+	if (action_type_set[DR_ACTION_TYP_TNL_L2_TO_L2])
+		dr_ste_set_rx_decap(last_ste);
+
+	if (action_type_set[DR_ACTION_TYP_MODIFY_HDR]) {
+		if (dr_ste_get_entry_type(last_ste) == DR_STE_TYPE_MODIFY_PKT)
+			dr_actions_init_next_ste(&last_ste,
+						 added_stes,
+						 DR_STE_TYPE_MODIFY_PKT,
+						 attr->gvmi);
+		else
+			dr_ste_set_entry_type(last_ste, DR_STE_TYPE_MODIFY_PKT);
+
+		dr_ste_set_rewrite_actions(last_ste,
+					   attr->modify_actions,
+					   attr->modify_index);
+	}
+
+	if (action_type_set[DR_ACTION_TYP_TAG]) {
+		if (dr_ste_get_entry_type(last_ste) == DR_STE_TYPE_MODIFY_PKT)
+			dr_actions_init_next_ste(&last_ste,
+						 added_stes,
+						 DR_STE_TYPE_RX,
+						 attr->gvmi);
+
+		dr_ste_rx_set_flow_tag(last_ste, attr->flow_tag);
+	}
+}
+
+/* Apply the actions on the rule STE array starting from the last_ste.
+ * Actions might require more than one STE; new_num_stes returns the
+ * new size of the STE array (the match STEs plus the action STEs added here).
*/ +static void dr_actions_apply(enum dr_ste_entry_type ste_type, + uint8_t *action_type_set, + uint8_t *last_ste, + struct dr_action_apply_attr *attr, + uint32_t *new_num_stes) +{ + uint32_t added_stes = 0; + + if (ste_type == DR_STE_TYPE_RX) + dr_actions_apply_rx(action_type_set, last_ste, attr, &added_stes); + else + dr_actions_apply_tx(action_type_set, last_ste, attr, &added_stes); + + last_ste += added_stes * DR_STE_SIZE; + *new_num_stes += added_stes; + + dr_ste_set_hit_addr(last_ste, attr->final_icm_addr, 1); +} + +static enum dr_action_domain +dr_action_get_action_domain(enum mlx5dv_dr_domain_type domain, + enum dr_ste_entry_type ste_type) +{ + if (domain == MLX5DV_DR_DOMAIN_TYPE_NIC_RX) { + return DR_ACTION_DOMAIN_NIC_INGRESS; + } else if (domain == MLX5DV_DR_DOMAIN_TYPE_NIC_TX) { + return DR_ACTION_DOMAIN_NIC_EGRESS; + } else { + /* FDB domain */ + if (ste_type == DR_STE_TYPE_RX) + return DR_ACTION_DOMAIN_FDB_INGRESS; + else + return DR_ACTION_DOMAIN_FDB_EGRESS; + } +} + +static int +dr_action_validate_and_get_next_state(enum dr_action_domain action_domain, + uint32_t action_type, + uint32_t *state) +{ + uint32_t cur_state = *state; + + /* Check action state machine is valid */ + *state = next_action_state[action_domain][cur_state][action_type]; + + if (*state == DR_ACTION_STATE_ERR) { + errno = EOPNOTSUPP; + return errno; + } + + return 0; +} + +#define WITH_VLAN_NUM_HW_ACTIONS 6 + +int dr_actions_build_ste_arr(struct mlx5dv_dr_matcher *matcher, + struct dr_matcher_rx_tx *nic_matcher, + struct mlx5dv_dr_action *actions[], + uint32_t num_actions, + uint8_t *ste_arr, + uint32_t *new_hw_ste_arr_sz) +{ + struct dr_domain_rx_tx *nic_dmn = nic_matcher->nic_tbl->nic_dmn; + bool rx_rule = nic_dmn->ste_type == DR_STE_TYPE_RX; + struct mlx5dv_dr_domain *dmn = matcher->tbl->dmn; + uint8_t action_type_set[DR_ACTION_TYP_MAX] = {}; + uint32_t state = DR_ACTION_STATE_NO_ACTION; + struct dr_action_apply_attr attr = {}; + enum dr_action_domain action_domain; + uint8_t *last_ste; + int i; + + attr.gvmi = dmn->info.caps.gvmi; + attr.final_icm_addr = nic_dmn->default_icm_addr; + action_domain = dr_action_get_action_domain(dmn->type, nic_dmn->ste_type); + + for (i = 0; i < num_actions; i++) { + struct mlx5dv_dr_action *action; + uint32_t action_type; + + action = actions[i]; + action_type = action->action_type; + + switch (action_type) { + case DR_ACTION_TYP_DROP: + attr.final_icm_addr = nic_dmn->drop_icm_addr; + break; + case DR_ACTION_TYP_FT: + if (action->dest_tbl->dmn != dmn) { + dr_dbg(dmn, "Destination table belongs to a different domain\n"); + goto out_invalid_arg; + } + if (action->dest_tbl->level <= matcher->tbl->level) { + dr_dbg(dmn, "Destination table level should be higher than source table\n"); + goto out_invalid_arg; + } + attr.final_icm_addr = rx_rule ? 
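+				/* the hit address is the destination table's per-direction anchor STE */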
+				action->dest_tbl->rx.s_anchor->chunk->icm_addr :
+				action->dest_tbl->tx.s_anchor->chunk->icm_addr;
+			break;
+		case DR_ACTION_TYP_QP:
+		{
+			struct mlx5_qp *mlx5_qp = to_mqp(action->qp);
+
+			if (!mlx5_qp->tir_icm_addr) {
+				dr_dbg(dmn, "Unsupported QP for action\n");
+				goto out_invalid_arg;
+			}
+			attr.final_icm_addr = mlx5_qp->tir_icm_addr;
+		}
+		break;
+		case DR_ACTION_TYP_CTR:
+			attr.ctr_id = action->ctr.devx_obj->object_id +
+				action->ctr.offset;
+			break;
+		case DR_ACTION_TYP_TAG:
+			attr.flow_tag = action->flow_tag;
+			break;
+		case DR_ACTION_TYP_TNL_L2_TO_L2:
+			break;
+		case DR_ACTION_TYP_TNL_L3_TO_L2:
+			if (action->rewrite.is_root_level) {
+				dr_dbg(dmn, "Root decap L3 action cannot be used on current table\n");
+				goto out_invalid_arg;
+			}
+			attr.decap_index = action->rewrite.index;
+			attr.decap_actions = action->rewrite.num_of_actions;
+			attr.decap_with_vlan =
+				attr.decap_actions == WITH_VLAN_NUM_HW_ACTIONS;
+			break;
+		case DR_ACTION_TYP_MODIFY_HDR:
+			if (action->rewrite.is_root_level) {
+				dr_dbg(dmn, "Root modify header action cannot be used on current table\n");
+				goto out_invalid_arg;
+			}
+			attr.modify_index = action->rewrite.index;
+			attr.modify_actions = action->rewrite.num_of_actions;
+			break;
+		case DR_ACTION_TYP_L2_TO_TNL_L2:
+		case DR_ACTION_TYP_L2_TO_TNL_L3:
+			if (action->reformat.is_root_level) {
+				dr_dbg(dmn, "Root encap action cannot be used on current table\n");
+				goto out_invalid_arg;
+			}
+			attr.reformat_size = action->reformat.reformat_size;
+			attr.reformat_id = action->reformat.dvo->object_id;
+			break;
+		case DR_ACTION_TYP_METER:
+			if (action->meter.next_ft->dmn != dmn) {
+				dr_dbg(dmn, "Next table belongs to a different domain\n");
+				goto out_invalid_arg;
+			}
+			if (action->meter.next_ft->level <=
+			    matcher->tbl->level) {
+				dr_dbg(dmn, "Next table level should be higher than source table\n");
+				goto out_invalid_arg;
+			}
+			attr.final_icm_addr = rx_rule ?
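+				/* meter objects expose separate RX and TX ICM addresses */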
+ action->meter.rx_icm_addr : + action->meter.tx_icm_addr; + break; + case DR_ACTION_TYP_VPORT: + if (action->vport.dmn != dmn) { + dr_dbg(dmn, "Destination vport belongs to a different domain\n"); + goto out_invalid_arg; + } + if (rx_rule) { + /* Loopback on WIRE vport is not supported */ + if (action->vport.num == WIRE_PORT) + goto out_invalid_arg; + + attr.final_icm_addr = action->vport.caps->icm_address_rx; + } else { + attr.final_icm_addr = action->vport.caps->icm_address_tx; + } + break; + default: + goto out_invalid_arg; + } + + /* Check action duplication */ + if (++action_type_set[action_type] > 1) { + dr_dbg(dmn, "Duplicate action type provided\n"); + goto out_invalid_arg; + } + + /* Check action state machine is valid */ + if (dr_action_validate_and_get_next_state(action_domain, + action_type, + &state)) { + dr_dbg(dmn, "Invalid action sequence provided\n"); + goto out_errno; + } + } + + *new_hw_ste_arr_sz = nic_matcher->num_of_builders; + last_ste = ste_arr + DR_STE_SIZE * (nic_matcher->num_of_builders - 1); + + dr_actions_apply(nic_dmn->ste_type, + action_type_set, + last_ste, + &attr, + new_hw_ste_arr_sz); + + return 0; + +out_invalid_arg: + errno = EINVAL; +out_errno: + return errno; +} + +int dr_actions_build_attr(struct mlx5dv_dr_matcher *matcher, + struct mlx5dv_dr_action *actions[], + size_t num_actions, + struct mlx5dv_flow_action_attr *attr, + struct mlx5_flow_action_attr_aux *attr_aux) +{ + struct mlx5dv_dr_domain *dmn = matcher->tbl->dmn; + int i; + + for (i = 0; i < num_actions; i++) { + switch (actions[i]->action_type) { + case DR_ACTION_TYP_FT: + if (actions[i]->dest_tbl->dmn != dmn) { + dr_dbg(dmn, "Destination table belongs to a different domain\n"); + errno = EINVAL; + return errno; + } + attr[i].type = MLX5DV_FLOW_ACTION_DEST_DEVX; + attr[i].obj = actions[i]->dest_tbl->devx_obj; + break; + case DR_ACTION_TYP_TNL_L2_TO_L2: + case DR_ACTION_TYP_L2_TO_TNL_L2: + case DR_ACTION_TYP_TNL_L3_TO_L2: + case DR_ACTION_TYP_L2_TO_TNL_L3: + attr[i].type = MLX5DV_FLOW_ACTION_IBV_FLOW_ACTION; + attr[i].action = actions[i]->reformat.flow_action; + break; + case DR_ACTION_TYP_MODIFY_HDR: + attr[i].type = MLX5DV_FLOW_ACTION_IBV_FLOW_ACTION; + attr[i].action = actions[i]->rewrite.flow_action; + break; + case DR_ACTION_TYP_QP: + attr[i].type = MLX5DV_FLOW_ACTION_DEST_IBV_QP; + attr[i].qp = actions[i]->qp; + break; + case DR_ACTION_TYP_CTR: + attr[i].type = MLX5DV_FLOW_ACTION_COUNTERS_DEVX; + attr[i].obj = actions[i]->ctr.devx_obj; + + if (actions[i]->ctr.offset) { + attr_aux[i].type = MLX5_FLOW_ACTION_COUNTER_OFFSET; + attr_aux[i].offset = actions[i]->ctr.offset; + } + break; + case DR_ACTION_TYP_TAG: + attr[i].type = MLX5DV_FLOW_ACTION_TAG; + attr[i].tag_value = actions[i]->flow_tag; + break; + default: + dr_dbg(dmn, "Found unsupported action type: %d\n", + actions[i]->action_type); + errno = ENOTSUP; + return errno; + } + } + return 0; +} + +#define SVLAN_ETHERTYPE 0x88a8 +#define HDR_LEN_L2_ONLY 14 +#define HDR_LEN_L2_VLAN 18 +#define REWRITE_HW_ACTION_NUM 6 + +static int dr_actions_l2_rewrite(struct mlx5dv_dr_domain *dmn, + struct mlx5dv_dr_action *action, + void *data, size_t data_sz) +{ + struct mlx5_ifc_l2_hdr_bits *l2_hdr = data; + uint64_t ops[REWRITE_HW_ACTION_NUM] = {}; + uint32_t hdr_fld_4b; + uint16_t hdr_fld_2b; + uint16_t vlan_type; + bool vlan; + int i = 0; + int ret; + + vlan = (data_sz != HDR_LEN_L2_ONLY); + + /* dmac_47_16 */ + DEVX_SET(dr_action_hw_set, ops + i, opcode, MLX5_DR_ACTION_MDFY_HW_OP_SET); + DEVX_SET(dr_action_hw_set, ops + i, destination_length, 
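+		 /* a destination_length of 0 encodes a full 32-bit set, per the device specification */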
0); + DEVX_SET(dr_action_hw_set, ops + i, destination_field_code, MLX5_DR_ACTION_MDFY_HW_FLD_L2_0); + DEVX_SET(dr_action_hw_set, ops + i, destination_left_shifter, 16); + hdr_fld_4b = DEVX_GET(l2_hdr, l2_hdr, dmac_47_16); + DEVX_SET(dr_action_hw_set, ops + i, inline_data, hdr_fld_4b); + i++; + + /* smac_47_16 */ + DEVX_SET(dr_action_hw_set, ops + i, opcode, MLX5_DR_ACTION_MDFY_HW_OP_SET); + DEVX_SET(dr_action_hw_set, ops + i, destination_length, 0); + DEVX_SET(dr_action_hw_set, ops + i, destination_field_code, MLX5_DR_ACTION_MDFY_HW_FLD_L2_1); + DEVX_SET(dr_action_hw_set, ops + i, destination_left_shifter, 16); + hdr_fld_4b = (DEVX_GET(l2_hdr, l2_hdr, smac_31_0) >> 16 | + DEVX_GET(l2_hdr, l2_hdr, smac_47_32) << 16); + DEVX_SET(dr_action_hw_set, ops + i, inline_data, hdr_fld_4b); + i++; + + /* dmac_15_0 */ + DEVX_SET(dr_action_hw_set, ops + i, opcode, MLX5_DR_ACTION_MDFY_HW_OP_SET); + DEVX_SET(dr_action_hw_set, ops + i, destination_length, 16); + DEVX_SET(dr_action_hw_set, ops + i, destination_field_code, MLX5_DR_ACTION_MDFY_HW_FLD_L2_0); + DEVX_SET(dr_action_hw_set, ops + i, destination_left_shifter, 0); + hdr_fld_2b = DEVX_GET(l2_hdr, l2_hdr, dmac_15_0); + DEVX_SET(dr_action_hw_set, ops + i, inline_data, hdr_fld_2b); + i++; + + /* ethertype + (optional) vlan */ + DEVX_SET(dr_action_hw_set, ops + i, opcode, MLX5_DR_ACTION_MDFY_HW_OP_SET); + DEVX_SET(dr_action_hw_set, ops + i, destination_field_code, MLX5_DR_ACTION_MDFY_HW_FLD_L2_2); + DEVX_SET(dr_action_hw_set, ops + i, destination_left_shifter, 32); + if (!vlan) { + hdr_fld_2b = DEVX_GET(l2_hdr, l2_hdr, ethertype); + DEVX_SET(dr_action_hw_set, ops + i, inline_data, hdr_fld_2b); + DEVX_SET(dr_action_hw_set, ops + i, destination_length, 16); + } else { + hdr_fld_2b = DEVX_GET(l2_hdr, l2_hdr, ethertype); + vlan_type = hdr_fld_2b == SVLAN_ETHERTYPE ? 
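+			    /* ethertype 0x88a8 marks a service (S-)VLAN; anything else is treated as a customer (C-)VLAN */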
DR_STE_SVLAN : DR_STE_CVLAN; + hdr_fld_2b = DEVX_GET(l2_hdr, l2_hdr, vlan); + hdr_fld_4b = (vlan_type << 16) | hdr_fld_2b; + DEVX_SET(dr_action_hw_set, ops + i, inline_data, hdr_fld_4b); + DEVX_SET(dr_action_hw_set, ops + i, destination_length, 18); + } + i++; + + /* smac_15_0 */ + DEVX_SET(dr_action_hw_set, ops + i, opcode, MLX5_DR_ACTION_MDFY_HW_OP_SET); + DEVX_SET(dr_action_hw_set, ops + i, destination_length, 16); + DEVX_SET(dr_action_hw_set, ops + i, destination_field_code, MLX5_DR_ACTION_MDFY_HW_FLD_L2_1); + DEVX_SET(dr_action_hw_set, ops + i, destination_left_shifter, 0); + hdr_fld_2b = DEVX_GET(l2_hdr, l2_hdr, smac_31_0); + DEVX_SET(dr_action_hw_set, ops + i, inline_data, hdr_fld_2b); + i++; + + if (vlan) { + DEVX_SET(dr_action_hw_set, ops + i, opcode, MLX5_DR_ACTION_MDFY_HW_OP_SET); + hdr_fld_2b = DEVX_GET(l2_hdr, l2_hdr, vlan_type); + DEVX_SET(dr_action_hw_set, ops + i, inline_data, hdr_fld_2b); + DEVX_SET(dr_action_hw_set, ops + i, destination_length, 16); + DEVX_SET(dr_action_hw_set, ops + i, destination_field_code, MLX5_DR_ACTION_MDFY_HW_FLD_L2_2); + DEVX_SET(dr_action_hw_set, ops + i, destination_left_shifter, 0); + i++; + } + + action->rewrite.data = (void *)ops; + action->rewrite.num_of_actions = i; + action->rewrite.chunk->byte_size = i * sizeof(*ops); + + ret = dr_send_postsend_action(dmn, action); + if (ret) { + dr_dbg(dmn, "Writing encapsulation action to ICM failed\n"); + return ret; + } + + return 0; +} + +static struct mlx5dv_dr_action * +dr_action_create_generic(enum dr_action_type action_type) +{ + struct mlx5dv_dr_action *action; + + action = calloc(1, sizeof(struct mlx5dv_dr_action)); + if (!action) { + errno = ENOMEM; + return NULL; + } + + action->action_type = action_type; + atomic_init(&action->refcount, 1); + + return action; +} + +struct mlx5dv_dr_action *mlx5dv_dr_action_create_drop(void) +{ + return dr_action_create_generic(DR_ACTION_TYP_DROP); +} + +struct mlx5dv_dr_action * +mlx5dv_dr_action_create_dest_ibv_qp(struct ibv_qp *ibqp) +{ + struct mlx5dv_dr_action *action; + + if (ibqp->qp_type != IBV_QPT_RAW_PACKET) { + errno = EINVAL; + return NULL; + } + + action = dr_action_create_generic(DR_ACTION_TYP_QP); + if (!action) + return NULL; + + action->qp = ibqp; + + return action; +} + +struct mlx5dv_dr_action * +mlx5dv_dr_action_create_dest_table(struct mlx5dv_dr_table *tbl) +{ + struct mlx5dv_dr_action *action; + + atomic_fetch_add(&tbl->refcount, 1); + + if (dr_is_root_table(tbl)) { + dr_dbg(tbl->dmn, "Root table cannot be used as a destination\n"); + errno = EINVAL; + goto dec_ref; + } + + action = dr_action_create_generic(DR_ACTION_TYP_FT); + if (!action) + goto dec_ref; + + action->dest_tbl = tbl; + + return action; + +dec_ref: + atomic_fetch_sub(&tbl->refcount, 1); + return NULL; +} + +struct mlx5dv_dr_action * +mlx5dv_dr_action_create_flow_counter(struct mlx5dv_devx_obj *devx_obj, + uint32_t offset) +{ + struct mlx5dv_dr_action *action; + + if (devx_obj->type != MLX5_DEVX_FLOW_COUNTER) { + errno = EINVAL; + return NULL; + } + + action = dr_action_create_generic(DR_ACTION_TYP_CTR); + if (!action) + return NULL; + + action->ctr.devx_obj = devx_obj; + action->ctr.offset = offset; + + return action; +} + +struct mlx5dv_dr_action *mlx5dv_dr_action_create_tag(uint32_t tag_value) +{ + struct mlx5dv_dr_action *action; + + action = dr_action_create_generic(DR_ACTION_TYP_TAG); + if (!action) + return NULL; + + action->flow_tag = tag_value & 0xffffff; + + return action; +} + +static int +dr_action_create_reformat_action_root(struct mlx5dv_dr_domain *dmn, + 
size_t data_sz,
+				       void *data,
+				       struct mlx5dv_dr_action *action)
+{
+	enum mlx5dv_flow_action_packet_reformat_type reformat_type;
+	struct ibv_flow_action *flow_action;
+	enum mlx5dv_flow_table_type type;
+
+	if (dmn->type == MLX5DV_DR_DOMAIN_TYPE_NIC_RX)
+		type = MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX;
+	else if (dmn->type == MLX5DV_DR_DOMAIN_TYPE_NIC_TX)
+		type = MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX;
+	else
+		type = MLX5_IB_UAPI_FLOW_TABLE_TYPE_FDB;
+
+	reformat_type = dr_action_type_to_reformat_enum(action->action_type);
+	flow_action = mlx5dv_create_flow_action_packet_reformat(dmn->ctx,
+								data_sz,
+								data,
+								reformat_type,
+								type);
+	if (!flow_action)
+		return errno;
+
+	action->reformat.flow_action = flow_action;
+	return 0;
+}
+
+static int
+dr_action_verify_reformat_params(enum mlx5dv_flow_action_packet_reformat_type reformat_type,
+				 struct mlx5dv_dr_domain *dmn,
+				 size_t data_sz,
+				 void *data)
+{
+	if ((!data && data_sz) || (data && !data_sz) || reformat_type >
+	    MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL) {
+		dr_dbg(dmn, "Invalid reformat parameter!\n");
+		goto out_err;
+	}
+
+	if (dmn->type == MLX5DV_DR_DOMAIN_TYPE_FDB)
+		return 0;
+
+	if (dmn->type == MLX5DV_DR_DOMAIN_TYPE_NIC_RX) {
+		if (reformat_type != MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2 &&
+		    reformat_type != MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2) {
+			dr_dbg(dmn, "Action reformat type not supported on RX domain\n");
+			goto out_err;
+		}
+	} else if (dmn->type == MLX5DV_DR_DOMAIN_TYPE_NIC_TX) {
+		if (reformat_type != MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL &&
+		    reformat_type != MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL) {
+			dr_dbg(dmn, "Action reformat type not supported on TX domain\n");
+			goto out_err;
+		}
+	}
+
+	return 0;
+
+out_err:
+	errno = EINVAL;
+	return errno;
+}
+
+#define ACTION_CACHE_LINE_SIZE 64
+
+static int
+dr_action_create_reformat_action(struct mlx5dv_dr_domain *dmn,
+				 size_t data_sz, void *data,
+				 struct mlx5dv_dr_action *action)
+{
+	struct mlx5dv_devx_obj *obj;
+
+	switch (action->action_type) {
+	case DR_ACTION_TYP_L2_TO_TNL_L2:
+	case DR_ACTION_TYP_L2_TO_TNL_L3:
+	{
+		enum reformat_type rt;
+
+		if (action->action_type == DR_ACTION_TYP_L2_TO_TNL_L2)
+			rt = MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL;
+		else
+			rt = MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL;
+
+		obj = dr_devx_create_reformat_ctx(dmn->ctx, rt, data_sz, data);
+		if (!obj)
+			return errno;
+
+		action->reformat.dvo = obj;
+		action->reformat.reformat_size = data_sz;
+		return 0;
+	}
+	case DR_ACTION_TYP_TNL_L2_TO_L2:
+	{
+		return 0;
+	}
+	case DR_ACTION_TYP_TNL_L3_TO_L2:
+	{
+		int ret;
+
+		/* Only Ethernet frames are supported, with VLAN (18 bytes) or without (14) */
+		if (data_sz != HDR_LEN_L2_ONLY && data_sz != HDR_LEN_L2_VLAN) {
+			errno = EINVAL;
+			return errno;
+		}
+
+		action->rewrite.chunk = dr_icm_alloc_chunk(dmn->action_icm_pool,
+							   DR_CHUNK_SIZE_8);
+		if (!action->rewrite.chunk)
+			return errno;
+
+		action->rewrite.index = (action->rewrite.chunk->icm_addr -
+					 dmn->info.caps.hdr_modify_icm_addr) /
+					 ACTION_CACHE_LINE_SIZE;
+
+		ret = dr_actions_l2_rewrite(dmn, action, data, data_sz);
+		if (ret) {
+			dr_icm_free_chunk(action->rewrite.chunk);
+			return ret;
+		}
+		return 0;
+	}
+	default:
+		dr_dbg(dmn, "Reformat type is not supported %d\n", action->action_type);
+		errno = ENOTSUP;
+		return errno;
+	}
+}
+
+struct mlx5dv_dr_action *
+mlx5dv_dr_action_create_packet_reformat(struct mlx5dv_dr_domain *dmn,
+					uint32_t flags,
+					enum mlx5dv_flow_action_packet_reformat_type
reformat_type, + size_t data_sz, + void *data) +{ + struct mlx5dv_dr_action *action; + enum dr_action_type action_type; + int ret; + + atomic_fetch_add(&dmn->refcount, 1); + + if (!check_comp_mask(flags, MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL)) { + errno = EINVAL; + goto dec_ref; + } + + if (!dmn->info.supp_sw_steering && + !(flags & MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL)) { + dr_dbg(dmn, "Only root actions are supported on current domain\n"); + errno = EOPNOTSUPP; + goto dec_ref; + } + + /* General checks */ + ret = dr_action_verify_reformat_params(reformat_type, dmn, data_sz, data); + if (ret) + goto dec_ref; + + action_type = dr_action_reformat_to_action_type(reformat_type); + action = dr_action_create_generic(action_type); + if (!action) + goto dec_ref; + + action->reformat.dmn = dmn; + + /* Create the action according to the table type */ + if (flags & MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL) { + action->reformat.is_root_level = true; + ret = dr_action_create_reformat_action_root(dmn, + data_sz, + data, + action); + } else { + action->reformat.is_root_level = false; + ret = dr_action_create_reformat_action(dmn, + data_sz, + data, + action); + } + + if (ret) { + dr_dbg(dmn, "Failed creating reformat action %d\n", ret); + goto free_action; + } + + return action; + +free_action: + free(action); +dec_ref: + atomic_fetch_sub(&dmn->refcount, 1); + return NULL; +} + +static const struct dr_action_modify_field_conv * +dr_action_modify_get_hw_info(uint16_t sw_field) +{ + const struct dr_action_modify_field_conv *hw_action_info; + + if (sw_field >= ARRAY_SIZE(dr_action_conv_arr)) + goto not_found; + + hw_action_info = &dr_action_conv_arr[sw_field]; + if (!hw_action_info->end && !hw_action_info->start) + goto not_found; + + return hw_action_info; + +not_found: + errno = EINVAL; + return NULL; +} + +static int +dr_action_modify_sw_to_hw_add(struct mlx5dv_dr_domain *dmn, + __be64 *sw_action, + __be64 *hw_action, + const struct dr_action_modify_field_conv **ret_hw_info) +{ + const struct dr_action_modify_field_conv *hw_action_info; + uint8_t max_length; + uint16_t sw_field; + uint32_t data; + + /* Get SW modify action data */ + sw_field = DEVX_GET(set_action_in, sw_action, field); + data = DEVX_GET(set_action_in, sw_action, data); + + /* Convert SW data to HW modify action format */ + hw_action_info = dr_action_modify_get_hw_info(sw_field); + if (!hw_action_info) { + dr_dbg(dmn, "Modify ADD action invalid field given\n"); + errno = EINVAL; + return errno; + } + + max_length = hw_action_info->end - hw_action_info->start + 1; + + DEVX_SET(dr_action_hw_set, hw_action, opcode, + MLX5_DR_ACTION_MDFY_HW_OP_ADD); + DEVX_SET(dr_action_hw_set, hw_action, destination_field_code, + hw_action_info->hw_field); + DEVX_SET(dr_action_hw_set, hw_action, destination_left_shifter, + hw_action_info->start); + DEVX_SET(dr_action_hw_set, hw_action, destination_length, + max_length == 32 ? 
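+		 /* per the device specification, 0 encodes the full 32-bit length */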
0 : max_length); + DEVX_SET(dr_action_hw_set, hw_action, inline_data, data); + + *ret_hw_info = hw_action_info; + + return 0; +} + +static int +dr_action_modify_sw_to_hw_set(struct mlx5dv_dr_domain *dmn, + __be64 *sw_action, + __be64 *hw_action, + const struct dr_action_modify_field_conv **ret_hw_info) +{ + const struct dr_action_modify_field_conv *hw_action_info; + uint8_t offset, length, max_length; + uint16_t sw_field; + uint32_t data; + + /* Get SW modify action data */ + sw_field = DEVX_GET(set_action_in, sw_action, field); + offset = DEVX_GET(set_action_in, sw_action, offset); + length = DEVX_GET(set_action_in, sw_action, length); + data = DEVX_GET(set_action_in, sw_action, data); + + /* Convert SW data to HW modify action format */ + hw_action_info = dr_action_modify_get_hw_info(sw_field); + if (!hw_action_info) { + dr_dbg(dmn, "Modify SET action invalid field given\n"); + errno = EINVAL; + return errno; + } + + /* Based on device specification value of 0 means 32 */ + length = length ? length : 32; + max_length = hw_action_info->end - hw_action_info->start + 1; + + if (length + offset > max_length) { + dr_dbg(dmn, "Modify action length + offset exceeds limit\n"); + errno = EINVAL; + return errno; + } + + DEVX_SET(dr_action_hw_set, hw_action, opcode, + MLX5_DR_ACTION_MDFY_HW_OP_SET); + DEVX_SET(dr_action_hw_set, hw_action, destination_field_code, + hw_action_info->hw_field); + DEVX_SET(dr_action_hw_set, hw_action, destination_left_shifter, + hw_action_info->start + offset); + DEVX_SET(dr_action_hw_set, hw_action, destination_length, + length == 32 ? 0 : length); + DEVX_SET(dr_action_hw_set, hw_action, inline_data, data); + + *ret_hw_info = hw_action_info; + + return 0; +} + +static int +dr_action_modify_sw_to_hw_copy(struct mlx5dv_dr_domain *dmn, + __be64 *sw_action, + __be64 *hw_action, + const struct dr_action_modify_field_conv **ret_dst_hw_info, + const struct dr_action_modify_field_conv **ret_src_hw_info) +{ + uint8_t src_offset, dst_offset, src_max_length, dst_max_length, length; + const struct dr_action_modify_field_conv *src_hw_action_info; + const struct dr_action_modify_field_conv *dst_hw_action_info; + uint16_t src_field, dst_field; + + /* Get SW modify action data */ + src_field = DEVX_GET(copy_action_in, sw_action, src_field); + dst_field = DEVX_GET(copy_action_in, sw_action, dst_field); + src_offset = DEVX_GET(copy_action_in, sw_action, src_offset); + dst_offset = DEVX_GET(copy_action_in, sw_action, dst_offset); + length = DEVX_GET(copy_action_in, sw_action, length); + + /* Convert SW data to HW modify action format */ + src_hw_action_info = dr_action_modify_get_hw_info(src_field); + dst_hw_action_info = dr_action_modify_get_hw_info(dst_field); + if (!src_hw_action_info || !dst_hw_action_info) { + dr_dbg(dmn, "Modify COPY action invalid src/dst field given\n"); + errno = EINVAL; + return errno; + } + + /* Based on device specification value of 0 means 32 */ + length = length ? 
length : 32; + src_max_length = src_hw_action_info->end - + src_hw_action_info->start + 1; + dst_max_length = dst_hw_action_info->end - + dst_hw_action_info->start + 1; + if (length + src_offset > src_max_length || + length + dst_offset > dst_max_length) { + dr_dbg(dmn, "Modify action length exceeds limit\n"); + errno = EINVAL; + return errno; + } + + DEVX_SET(dr_action_hw_copy, hw_action, opcode, + MLX5_DR_ACTION_MDFY_HW_OP_COPY); + DEVX_SET(dr_action_hw_copy, hw_action, destination_field_code, + dst_hw_action_info->hw_field); + DEVX_SET(dr_action_hw_copy, hw_action, destination_left_shifter, + dst_hw_action_info->start + dst_offset); + DEVX_SET(dr_action_hw_copy, hw_action, destination_length, length); + DEVX_SET(dr_action_hw_copy, hw_action, source_field_code, + src_hw_action_info->hw_field); + DEVX_SET(dr_action_hw_copy, hw_action, source_left_shifter, + src_hw_action_info->start + src_offset); + + *ret_dst_hw_info = dst_hw_action_info; + *ret_src_hw_info = src_hw_action_info; + + return 0; +} + +static int +dr_action_modify_sw_to_hw(struct mlx5dv_dr_domain *dmn, + __be64 *sw_action, + __be64 *hw_action, + const struct dr_action_modify_field_conv **ret_dst_hw_info, + const struct dr_action_modify_field_conv **ret_src_hw_info) +{ + uint8_t action = DEVX_GET(set_action_in, sw_action, action_type); + int ret = 0; + + *hw_action = 0; + *ret_src_hw_info = NULL; + + switch (action) { + case MLX5_ACTION_TYPE_SET: + ret = dr_action_modify_sw_to_hw_set(dmn, + sw_action, + hw_action, + ret_dst_hw_info); + break; + case MLX5_ACTION_TYPE_ADD: + ret = dr_action_modify_sw_to_hw_add(dmn, + sw_action, + hw_action, + ret_dst_hw_info); + break; + case MLX5_ACTION_TYPE_COPY: + ret = dr_action_modify_sw_to_hw_copy(dmn, + sw_action, + hw_action, + ret_dst_hw_info, + ret_src_hw_info); + break; + default: + dr_dbg(dmn, "Unsupported action type %d for modify action\n", + action); + errno = EOPNOTSUPP; + ret = errno; + break; + } + + return ret; +} + +static int +dr_action_modify_check_field_limitation_set(struct mlx5dv_dr_action *action, + const __be64 *sw_action) +{ + uint16_t sw_field = DEVX_GET(set_action_in, sw_action, field); + struct mlx5dv_dr_domain *dmn = action->rewrite.dmn; + + if (sw_field == MLX5_ACTION_IN_FIELD_OUT_METADATA_REGA) { + action->rewrite.allow_rx = false; + if (dmn->type != MLX5DV_DR_DOMAIN_TYPE_NIC_TX) { + dr_dbg(dmn, "Unsupported field %d for RX/FDB set action\n", + sw_field); + errno = EINVAL; + return errno; + } + } else if (sw_field == MLX5_ACTION_IN_FIELD_OUT_METADATA_REGB) { + action->rewrite.allow_tx = false; + if (dmn->type != MLX5DV_DR_DOMAIN_TYPE_NIC_RX) { + dr_dbg(dmn, "Unsupported field %d for TX/FDB set action\n", + sw_field); + errno = EINVAL; + return errno; + } + } + + if (!action->rewrite.allow_rx && !action->rewrite.allow_tx) { + dr_dbg(dmn, "Modify SET actions not supported on both RX and TX\n"); + errno = EINVAL; + return errno; + } + + return 0; +} + +static int +dr_action_modify_check_field_limitation_add(struct mlx5dv_dr_action *action, + const __be64 *sw_action) +{ + uint16_t sw_field = DEVX_GET(add_action_in, sw_action, field); + + if (sw_field != MLX5_ACTION_IN_FIELD_OUT_IP_TTL && + sw_field != MLX5_ACTION_IN_FIELD_OUT_IPV6_HOPLIMIT && + sw_field != MLX5_ACTION_IN_FIELD_OUT_TCP_SEQ_NUM && + sw_field != MLX5_ACTION_IN_FIELD_OUT_TCP_ACK_NUM) { + dr_dbg(action->rewrite.dmn, + "Unsupported field %d for ADD action\n", sw_field); + errno = EINVAL; + return errno; + } + + return 0; +} + +static int +dr_action_modify_check_field_limitation_copy(struct 
mlx5dv_dr_action *action, + const __be64 *sw_action) +{ + struct mlx5dv_dr_domain *dmn = action->rewrite.dmn; + uint16_t sw_fields[2]; + int i; + + sw_fields[0] = DEVX_GET(copy_action_in, sw_action, src_field); + sw_fields[1] = DEVX_GET(copy_action_in, sw_action, dst_field); + + for (i = 0; i < 2; i++) { + if (sw_fields[i] == MLX5_ACTION_IN_FIELD_OUT_METADATA_REGA) { + action->rewrite.allow_rx = false; + if (dmn->type != MLX5DV_DR_DOMAIN_TYPE_NIC_TX) { + dr_dbg(dmn, "Unsupported field %d for RX/FDB COPY action\n", + sw_fields[i]); + errno = EINVAL; + return errno; + } + } else if (sw_fields[i] == MLX5_ACTION_IN_FIELD_OUT_METADATA_REGB) { + action->rewrite.allow_tx = false; + if (dmn->type != MLX5DV_DR_DOMAIN_TYPE_NIC_RX) { + dr_dbg(dmn, "Unsupported field %d for TX/FDB COPY action\n", + sw_fields[i]); + errno = EINVAL; + return errno; + } + } + } + + if (!action->rewrite.allow_rx && !action->rewrite.allow_tx) { + dr_dbg(dmn, "Modify actions combination is not supported on both RX and TX\n"); + errno = EINVAL; + return errno; + } + + return 0; +} + +static int +dr_action_modify_check_field_limitation(struct mlx5dv_dr_action *action, + const __be64 *sw_action) +{ + uint8_t action_type = DEVX_GET(set_action_in, sw_action, action_type); + struct mlx5dv_dr_domain *dmn = action->rewrite.dmn; + int ret; + + switch (action_type) { + case MLX5_ACTION_TYPE_SET: + ret = dr_action_modify_check_field_limitation_set(action, + sw_action); + break; + case MLX5_ACTION_TYPE_ADD: + ret = dr_action_modify_check_field_limitation_add(action, + sw_action); + break; + case MLX5_ACTION_TYPE_COPY: + ret = dr_action_modify_check_field_limitation_copy(action, + sw_action); + break; + default: + dr_dbg(dmn, "Unsupported modify action %d\n", + action_type); + errno = EOPNOTSUPP; + ret = errno; + break; + } + + return ret; +} + +static int dr_actions_convert_modify_header(struct mlx5dv_dr_action *action, + uint32_t max_hw_actions, + uint32_t num_sw_actions, + __be64 sw_actions[], + __be64 hw_actions[], + uint32_t *num_hw_actions) +{ + const struct dr_action_modify_field_conv *hw_dst_action_info; + const struct dr_action_modify_field_conv *hw_src_action_info; + uint16_t hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_RESERVED; + uint32_t l3_type = MLX5_DR_ACTION_MDFY_HW_HDR_L3_NONE; + uint32_t l4_type = MLX5_DR_ACTION_MDFY_HW_HDR_L4_NONE; + struct mlx5dv_dr_domain *dmn = action->rewrite.dmn; + int ret, i, hw_idx = 0; + __be64 *sw_action; + __be64 hw_action; + + action->rewrite.allow_rx = true; + action->rewrite.allow_tx = true; + + for (i = 0; i < num_sw_actions; i++) { + sw_action = &sw_actions[i]; + + ret = dr_action_modify_check_field_limitation(action, + sw_action); + if (ret) + return ret; + + /* Convert SW action to HW action */ + ret = dr_action_modify_sw_to_hw(dmn, + sw_action, + &hw_action, + &hw_dst_action_info, + &hw_src_action_info); + if (ret) + return ret; + + /* Due to a HW limitation we cannot modify 2 different L3 types */ + if (l3_type && hw_dst_action_info->l3_type && + (hw_dst_action_info->l3_type != l3_type)) { + dr_dbg(dmn, "Action list can't support two different L3 types\n"); + errno = ENOTSUP; + return errno; + } + if (hw_dst_action_info->l3_type) + l3_type = hw_dst_action_info->l3_type; + + /* Due to a HW limitation we cannot modify two different L4 types */ + if (l4_type && hw_dst_action_info->l4_type && + (hw_dst_action_info->l4_type != l4_type)) { + dr_dbg(dmn, "Action list can't support two different L4 types\n"); + errno = EINVAL; + return errno; + } + if (hw_dst_action_info->l4_type) + l4_type = 
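+				/* remember this L4 type; subsequent actions are checked against it */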
hw_dst_action_info->l4_type;
+
+		/* HW reads and executes two actions at once; this means we
+		 * need to create a gap if two consecutive actions access the
+		 * same field
+		 */
+		if ((hw_idx % 2) && (hw_field == hw_dst_action_info->hw_field ||
+				     (hw_src_action_info &&
+				      hw_field == hw_src_action_info->hw_field))) {
+			/* Check that after gap insertion the total number of HW
+			 * modify actions doesn't exceed the limit
+			 */
+			hw_idx++;
+			if ((num_sw_actions + hw_idx - i) >= max_hw_actions) {
+				dr_dbg(dmn, "Modify header action number exceeds HW limit\n");
+				errno = EINVAL;
+				return errno;
+			}
+		}
+		hw_field = hw_dst_action_info->hw_field;
+
+		hw_actions[hw_idx] = hw_action;
+		hw_idx++;
+	}
+
+	*num_hw_actions = hw_idx;
+
+	return 0;
+}
+
+static int
+dr_action_create_modify_action_root(struct mlx5dv_dr_domain *dmn,
+				    size_t actions_sz,
+				    __be64 actions[],
+				    struct mlx5dv_dr_action *action)
+{
+	struct ibv_flow_action *flow_action;
+	enum mlx5dv_flow_table_type type;
+
+	if (dmn->type == MLX5DV_DR_DOMAIN_TYPE_NIC_RX)
+		type = MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX;
+	else if (dmn->type == MLX5DV_DR_DOMAIN_TYPE_NIC_TX)
+		type = MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX;
+	else
+		type = MLX5_IB_UAPI_FLOW_TABLE_TYPE_FDB;
+
+	flow_action = mlx5dv_create_flow_action_modify_header(dmn->ctx,
+							      actions_sz,
+							      (__force uint64_t *)actions,
+							      type);
+	if (!flow_action)
+		return errno;
+
+	action->rewrite.flow_action = flow_action;
+	return 0;
+}
+
+static int dr_action_create_modify_action(struct mlx5dv_dr_domain *dmn,
+					  size_t actions_sz,
+					  __be64 actions[],
+					  struct mlx5dv_dr_action *action)
+{
+	uint32_t dynamic_chunk_size;
+	struct dr_icm_chunk *chunk;
+	uint32_t num_hw_actions;
+	uint32_t num_sw_actions;
+	__be64 *hw_actions;
+	int ret;
+
+	num_sw_actions = actions_sz / DR_MODIFY_ACTION_SIZE;
+	if (num_sw_actions == 0) {
+		dr_dbg(dmn, "Invalid number of actions %u\n", num_sw_actions);
+		errno = EINVAL;
+		return errno;
+	}
+
+	hw_actions = calloc(1, 2 * num_sw_actions * DR_MODIFY_ACTION_SIZE);
+	if (!hw_actions) {
+		errno = ENOMEM;
+		return errno;
+	}
+
+	ret = dr_actions_convert_modify_header(action,
+					       2 * num_sw_actions,
+					       num_sw_actions,
+					       actions,
+					       hw_actions,
+					       &num_hw_actions);
+	if (ret)
+		goto free_hw_actions;
+
+	dynamic_chunk_size = ilog32(num_hw_actions - 1);
+
+	/* HW modify action index granularity is at least 64B */
+	dynamic_chunk_size = max_t(uint32_t, dynamic_chunk_size,
+				   DR_CHUNK_SIZE_8);
+
+	chunk = dr_icm_alloc_chunk(dmn->action_icm_pool, dynamic_chunk_size);
+	if (!chunk)
+		goto free_hw_actions;
+
+	action->rewrite.chunk = chunk;
+	action->rewrite.data = (uint8_t *)hw_actions;
+	action->rewrite.num_of_actions = num_hw_actions;
+	action->rewrite.index = (chunk->icm_addr -
+				 dmn->info.caps.hdr_modify_icm_addr) /
+				 ACTION_CACHE_LINE_SIZE;
+
+	ret = dr_send_postsend_action(dmn, action);
+	if (ret)
+		goto free_chunk;
+
+	return 0;
+
+free_chunk:
+	dr_icm_free_chunk(chunk);
+free_hw_actions:
+	free(hw_actions);
+	return errno;
+}
+
+struct mlx5dv_dr_action *
+mlx5dv_dr_action_create_modify_header(struct mlx5dv_dr_domain *dmn,
+				      uint32_t flags,
+				      size_t actions_sz,
+				      __be64 actions[])
+{
+	struct mlx5dv_dr_action *action;
+	int ret = 0;
+
+	atomic_fetch_add(&dmn->refcount, 1);
+
+	if (!check_comp_mask(flags, MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL)) {
+		errno = EINVAL;
+		goto dec_ref;
+	}
+
+	if (actions_sz % DR_MODIFY_ACTION_SIZE) {
+		dr_dbg(dmn, "Invalid modify actions size provided\n");
+		errno = EINVAL;
+		goto dec_ref;
+	}
+
+	if (!dmn->info.supp_sw_steering &&
+	    !(flags & MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL)) {
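+		/* non-root actions require SW steering support on the device */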
+ dr_dbg(dmn, "Only root actions are supported on current domain\n"); + errno = EOPNOTSUPP; + goto dec_ref; + } + + action = dr_action_create_generic(DR_ACTION_TYP_MODIFY_HDR); + if (!action) + goto dec_ref; + + action->rewrite.dmn = dmn; + + /* Create the action according to the table type */ + if (flags & MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL) { + action->rewrite.is_root_level = true; + ret = dr_action_create_modify_action_root(dmn, + actions_sz, + actions, + action); + } else { + action->rewrite.is_root_level = false; + ret = dr_action_create_modify_action(dmn, + actions_sz, + actions, + action); + } + + if (ret) { + dr_dbg(dmn, "Failed creating modify header action %d\n", ret); + goto free_action; + } + + return action; + +free_action: + free(action); +dec_ref: + atomic_fetch_sub(&dmn->refcount, 1); + return NULL; +} + +int mlx5dv_dr_action_modify_flow_meter(struct mlx5dv_dr_action *action, + struct mlx5dv_dr_flow_meter_attr *attr, + __be64 modify_field_select) +{ + int ret; + + if (action->action_type != DR_ACTION_TYP_METER) { + errno = EINVAL; + return errno; + } + + ret = dr_devx_modify_meter(action->meter.devx_obj, attr, + modify_field_select); + return ret; +} + +struct mlx5dv_dr_action * +mlx5dv_dr_action_create_flow_meter(struct mlx5dv_dr_flow_meter_attr *attr) +{ + struct mlx5dv_dr_domain *dmn = attr->next_table->dmn; + uint64_t rx_icm_addr, tx_icm_addr; + struct mlx5dv_devx_obj *devx_obj; + struct mlx5dv_dr_action *action; + int ret; + + if (!dmn->info.supp_sw_steering) { + dr_dbg(dmn, "Meter action is not supported on current domain\n"); + errno = EOPNOTSUPP; + return NULL; + } + + if (dr_is_root_table(attr->next_table)) { + dr_dbg(dmn, "Next table cannot be root\n"); + errno = EOPNOTSUPP; + return NULL; + } + + devx_obj = dr_devx_create_meter(dmn->ctx, attr); + if (!devx_obj) + return NULL; + + ret = dr_devx_query_meter(devx_obj, &rx_icm_addr, &tx_icm_addr); + if (ret) + goto destroy_obj; + + action = dr_action_create_generic(DR_ACTION_TYP_METER); + if (!action) + goto destroy_obj; + + action->meter.devx_obj = devx_obj; + action->meter.next_ft = attr->next_table; + action->meter.rx_icm_addr = rx_icm_addr; + action->meter.tx_icm_addr = tx_icm_addr; + + atomic_fetch_add(&attr->next_table->refcount, 1); + + return action; + +destroy_obj: + mlx5dv_devx_obj_destroy(devx_obj); + return NULL; +} + +struct mlx5dv_dr_action +*mlx5dv_dr_action_create_dest_vport(struct mlx5dv_dr_domain *dmn, uint32_t vport) +{ + struct mlx5dv_dr_action *action; + struct dr_devx_vport_cap *vport_cap; + + if (!dmn->info.supp_sw_steering || + dmn->type != MLX5DV_DR_DOMAIN_TYPE_FDB) { + dr_dbg(dmn, "Domain doesn't support vport actions\n"); + errno = EOPNOTSUPP; + return NULL; + } + + vport_cap = dr_get_vport_cap(&dmn->info.caps, vport); + if (!vport_cap) { + dr_dbg(dmn, "Failed to get vport %d caps\n", vport); + return NULL; + } + + action = dr_action_create_generic(DR_ACTION_TYP_VPORT); + if (!action) + return NULL; + + action->vport.dmn = dmn; + action->vport.num = vport; + action->vport.caps = vport_cap; + + return action; +} + +int mlx5dv_dr_action_destroy(struct mlx5dv_dr_action *action) +{ + if (atomic_load(&action->refcount) > 1) + return EBUSY; + + switch (action->action_type) { + case DR_ACTION_TYP_FT: + atomic_fetch_sub(&action->dest_tbl->refcount, 1); + break; + case DR_ACTION_TYP_TNL_L2_TO_L2: + if (action->reformat.is_root_level) + mlx5_destroy_flow_action(action->reformat.flow_action); + atomic_fetch_sub(&action->reformat.dmn->refcount, 1); + break; + case DR_ACTION_TYP_TNL_L3_TO_L2: + if 
(action->reformat.is_root_level)
+			mlx5_destroy_flow_action(action->reformat.flow_action);
+		else
+			dr_icm_free_chunk(action->rewrite.chunk);
+		atomic_fetch_sub(&action->reformat.dmn->refcount, 1);
+		break;
+	case DR_ACTION_TYP_L2_TO_TNL_L2:
+	case DR_ACTION_TYP_L2_TO_TNL_L3:
+		if (action->reformat.is_root_level)
+			mlx5_destroy_flow_action(action->reformat.flow_action);
+		else
+			mlx5dv_devx_obj_destroy(action->reformat.dvo);
+		atomic_fetch_sub(&action->reformat.dmn->refcount, 1);
+		break;
+	case DR_ACTION_TYP_MODIFY_HDR:
+		if (action->rewrite.is_root_level) {
+			mlx5_destroy_flow_action(action->rewrite.flow_action);
+		} else {
+			dr_icm_free_chunk(action->rewrite.chunk);
+			free(action->rewrite.data);
+		}
+		atomic_fetch_sub(&action->rewrite.dmn->refcount, 1);
+		break;
+	case DR_ACTION_TYP_METER:
+		mlx5dv_devx_obj_destroy(action->meter.devx_obj);
+		atomic_fetch_sub(&action->meter.next_ft->refcount, 1);
+		break;
+	default:
+		break;
+	}
+
+	free(action);
+	return 0;
+}
diff --git a/providers/mlx5/dr_crc32.c b/providers/mlx5/dr_crc32.c
new file mode 100644
index 0000000..39a8c01
--- /dev/null
+++ b/providers/mlx5/dr_crc32.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2019, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials
+ *   provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * Copyright (c) 2011-2015 Stephan Brumme. All rights reserved.
+ * Slicing-by-16 contributed by Bulat Ziganshin
+ *
+ * This software is provided 'as-is', without any express or implied warranty.
+ * In no event will the author be held liable for any damages arising from the
+ * use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software.
+ * 2. If you use this software in a product, an acknowledgment in the product
+ *    documentation would be appreciated but is not required.
+ * 3. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ *
+ * Taken from http://create.stephan-brumme.com/crc32/ and adapted.
+ */ + +#include <stdlib.h> +#include <string.h> +#include "mlx5dv_dr.h" + +#define DR_STE_CRC_POLY 0xEDB88320L + +static uint32_t dr_ste_crc_tab32[8][256]; + +static void dr_crc32_calc_lookup_entry(uint32_t (*tbl)[256], uint8_t i, + uint8_t j) +{ + tbl[i][j] = (tbl[i-1][j] >> 8) ^ tbl[0][tbl[i-1][j] & 0xff]; +} + +void dr_crc32_init_table(void) +{ + uint32_t crc, i, j; + + for (i = 0; i < 256; i++) { + crc = i; + for (j = 0; j < 8; j++) { + if (crc & 0x00000001L) + crc = (crc >> 1) ^ DR_STE_CRC_POLY; + else + crc = crc >> 1; + } + dr_ste_crc_tab32[0][i] = crc; + } + + /* Init CRC lookup tables according to crc_slice_8 algorithm */ + for (i = 0; i < 256; i++) { + dr_crc32_calc_lookup_entry(dr_ste_crc_tab32, 1, i); + dr_crc32_calc_lookup_entry(dr_ste_crc_tab32, 2, i); + dr_crc32_calc_lookup_entry(dr_ste_crc_tab32, 3, i); + dr_crc32_calc_lookup_entry(dr_ste_crc_tab32, 4, i); + dr_crc32_calc_lookup_entry(dr_ste_crc_tab32, 5, i); + dr_crc32_calc_lookup_entry(dr_ste_crc_tab32, 6, i); + dr_crc32_calc_lookup_entry(dr_ste_crc_tab32, 7, i); + } +} + +/* Compute CRC32 (Slicing-by-8 algorithm) */ +uint32_t dr_crc32_slice8_calc(const void *input_data, size_t length) +{ + const uint32_t *current = (const uint32_t *)input_data; + const uint8_t *current_char; + uint32_t crc = 0, one, two; + + if (!input_data) + return 0; + + /* Process eight bytes at once (Slicing-by-8) */ + while (length >= 8) { + one = *current++ ^ crc; + two = *current++; + + crc = dr_ste_crc_tab32[0][(two >> 24) & 0xff] + ^ dr_ste_crc_tab32[1][(two >> 16) & 0xff] + ^ dr_ste_crc_tab32[2][(two >> 8) & 0xff] + ^ dr_ste_crc_tab32[3][two & 0xff] + ^ dr_ste_crc_tab32[4][(one >> 24) & 0xff] + ^ dr_ste_crc_tab32[5][(one >> 16) & 0xff] + ^ dr_ste_crc_tab32[6][(one >> 8) & 0xff] + ^ dr_ste_crc_tab32[7][one & 0xff]; + + length -= 8; + } + + current_char = (const uint8_t *)current; + /* Remaining 1 to 7 bytes (standard algorithm) */ + while (length-- != 0) + crc = (crc >> 8) ^ dr_ste_crc_tab32[0][(crc & 0xff) + ^ *current_char++]; + + return ((crc>>24) & 0xff) | ((crc<<8) & 0xff0000) | + ((crc>>8) & 0xff00) | ((crc<<24) & 0xff000000); +} diff --git a/providers/mlx5/dr_dbg.c b/providers/mlx5/dr_dbg.c new file mode 100644 index 0000000..12b76bb --- /dev/null +++ b/providers/mlx5/dr_dbg.c @@ -0,0 +1,703 @@ +/* + * Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <unistd.h>
+#include <inttypes.h>
+#include "mlx5dv_dr.h"
+
+#define BUFF_SIZE 1024
+
+enum dr_dump_rec_type {
+	DR_DUMP_REC_TYPE_DOMAIN = 3000,
+	DR_DUMP_REC_TYPE_DOMAIN_INFO_FLEX_PARSER = 3001,
+	DR_DUMP_REC_TYPE_DOMAIN_INFO_DEV_ATTR = 3002,
+	DR_DUMP_REC_TYPE_DOMAIN_INFO_VPORT = 3003,
+	DR_DUMP_REC_TYPE_DOMAIN_INFO_CAPS = 3004,
+	DR_DUMP_REC_TYPE_DOMAIN_SEND_RING = 3005,
+
+	DR_DUMP_REC_TYPE_TABLE = 3100,
+	DR_DUMP_REC_TYPE_TABLE_RX = 3101,
+	DR_DUMP_REC_TYPE_TABLE_TX = 3102,
+
+	DR_DUMP_REC_TYPE_MATCHER = 3200,
+	DR_DUMP_REC_TYPE_MATCHER_MASK = 3201,
+	DR_DUMP_REC_TYPE_MATCHER_RX = 3202,
+	DR_DUMP_REC_TYPE_MATCHER_TX = 3203,
+	DR_DUMP_REC_TYPE_MATCHER_BUILDER = 3204,
+
+	DR_DUMP_REC_TYPE_RULE = 3300,
+	DR_DUMP_REC_TYPE_RULE_RX_ENTRY = 3301,
+	DR_DUMP_REC_TYPE_RULE_TX_ENTRY = 3302,
+
+	DR_DUMP_REC_TYPE_ACTION_ENCAP_L2 = 3400,
+	DR_DUMP_REC_TYPE_ACTION_ENCAP_L3 = 3401,
+	DR_DUMP_REC_TYPE_ACTION_MODIFY_HDR = 3402,
+	DR_DUMP_REC_TYPE_ACTION_DROP = 3403,
+	DR_DUMP_REC_TYPE_ACTION_QP = 3404,
+	DR_DUMP_REC_TYPE_ACTION_FT = 3405,
+	DR_DUMP_REC_TYPE_ACTION_CTR = 3406,
+	DR_DUMP_REC_TYPE_ACTION_TAG = 3407,
+	DR_DUMP_REC_TYPE_ACTION_VPORT = 3408,
+	DR_DUMP_REC_TYPE_ACTION_DECAP_L2 = 3409,
+	DR_DUMP_REC_TYPE_ACTION_DECAP_L3 = 3410,
+};
+
+static uint64_t dr_dump_icm_to_idx(uint64_t icm_addr)
+{
+	return (icm_addr >> 6) & 0xffffffff;
+}
+
+static void dump_hex_print(char *dest, char *src, uint32_t size)
+{
+	int i;
+
+	for (i = 0; i < size; i++)
+		sprintf(&dest[2 * i], "%02x", (uint8_t)src[i]);
+}
+
+static int dr_dump_rule_action_mem(FILE *f, const uint64_t rule_id,
+				   struct dr_rule_action_member *action_mem)
+{
+	struct mlx5dv_dr_action *action = action_mem->action;
+	const uint64_t action_id = (uint64_t) (uintptr_t) action;
+	int ret;
+
+	switch (action->action_type) {
+	case DR_ACTION_TYP_DROP:
+		ret = fprintf(f, "%d,0x%" PRIx64 ",0x%" PRIx64 "\n",
+			      DR_DUMP_REC_TYPE_ACTION_DROP, action_id, rule_id);
+		break;
+	case DR_ACTION_TYP_FT:
+		ret = fprintf(f, "%d,0x%" PRIx64 ",0x%" PRIx64 ",0x%x\n",
+			      DR_DUMP_REC_TYPE_ACTION_FT, action_id, rule_id,
+			      action->dest_tbl->devx_obj->object_id);
+		break;
+	case DR_ACTION_TYP_QP:
+		ret = fprintf(f, "%d,0x%" PRIx64 ",0x%" PRIx64 ",0x%x\n",
+			      DR_DUMP_REC_TYPE_ACTION_QP, action_id, rule_id,
+			      action->qp->qp_num);
+		break;
+	case DR_ACTION_TYP_CTR:
+		ret = fprintf(f, "%d,0x%" PRIx64 ",0x%" PRIx64 ",0x%x\n",
+			      DR_DUMP_REC_TYPE_ACTION_CTR, action_id, rule_id,
+			      action->ctr.devx_obj->object_id +
+			      action->ctr.offset);
+		break;
+	case DR_ACTION_TYP_TAG:
+		ret = fprintf(f, "%d,0x%" PRIx64 ",0x%" PRIx64 ",0x%x\n",
+			      DR_DUMP_REC_TYPE_ACTION_TAG, action_id, rule_id,
+			      action->flow_tag);
+		break;
+	case DR_ACTION_TYP_MODIFY_HDR:
+		ret = fprintf(f, "%d,0x%" PRIx64 ",0x%" PRIx64 ",0x%x\n",
+			      DR_DUMP_REC_TYPE_ACTION_MODIFY_HDR, action_id,
+			      rule_id, action->rewrite.index);
+		break;
+	case DR_ACTION_TYP_VPORT:
+		ret = fprintf(f, "%d,0x%" PRIx64 ",0x%" PRIx64 ",0x%x\n",
+			      DR_DUMP_REC_TYPE_ACTION_VPORT, action_id, rule_id,
+			      action->vport.num);
+		break;
+	case DR_ACTION_TYP_TNL_L2_TO_L2:
+		ret = fprintf(f, "%d,0x%" PRIx64 ",0x%" PRIx64 "\n",
+			      DR_DUMP_REC_TYPE_ACTION_DECAP_L2, action_id,
+			      rule_id);
+		break;
+	case DR_ACTION_TYP_TNL_L3_TO_L2:
+		ret = fprintf(f, "%d,0x%" PRIx64 ",0x%" PRIx64
",0x%x\n", + DR_DUMP_REC_TYPE_ACTION_DECAP_L3, action_id, + rule_id, action->rewrite.index); + break; + case DR_ACTION_TYP_L2_TO_TNL_L2: + ret = fprintf(f, "%d,0x%" PRIx64 ",0x%" PRIx64 ",0x%x\n", + DR_DUMP_REC_TYPE_ACTION_ENCAP_L2, action_id, + rule_id, action->reformat.dvo->object_id); + break; + case DR_ACTION_TYP_L2_TO_TNL_L3: + ret = fprintf(f, "%d,0x%" PRIx64 ",0x%" PRIx64 ",0x%x\n", + DR_DUMP_REC_TYPE_ACTION_ENCAP_L3, action_id, + rule_id, action->reformat.dvo->object_id); + break; + default: + return 0; + } + + if (ret < 0) + return ret; + + return 0; +} + +static int dr_dump_rule_mem(FILE *f, struct dr_rule_member *rule_mem, + bool is_rx, const uint64_t rule_id) +{ + char hw_ste_dump[BUFF_SIZE] = {}; + enum dr_dump_rec_type mem_rec_type; + int ret; + + mem_rec_type = is_rx ? DR_DUMP_REC_TYPE_RULE_RX_ENTRY : + DR_DUMP_REC_TYPE_RULE_TX_ENTRY; + + dump_hex_print(hw_ste_dump, (char *)rule_mem->ste->hw_ste, DR_STE_SIZE_REDUCED); + ret = fprintf(f, "%d,0x%" PRIx64 ",0x%" PRIx64 ",%s\n", + mem_rec_type, + dr_dump_icm_to_idx(dr_ste_get_icm_addr(rule_mem->ste)), + rule_id, + hw_ste_dump); + if (ret < 0) + return ret; + + return 0; +} + +static int dr_dump_rule_rx_tx(FILE *f, struct dr_rule_rx_tx *rule_rx_tx, + bool is_rx, const uint64_t rule_id) +{ + struct dr_rule_member *rule_mem; + int ret; + + list_for_each(&rule_rx_tx->rule_members_list, rule_mem, list) { + ret = dr_dump_rule_mem(f, rule_mem, is_rx, rule_id); + if (ret < 0) + return ret; + } + return 0; +} + +static int dr_dump_rule(FILE *f, struct mlx5dv_dr_rule *rule) +{ + struct dr_rule_action_member *action_mem; + const uint64_t rule_id = (uint64_t) (uintptr_t) rule; + struct dr_rule_rx_tx *rx = &rule->rx; + struct dr_rule_rx_tx *tx = &rule->tx; + int ret; + + ret = fprintf(f, "%d,0x%" PRIx64 ",0x%" PRIx64 "\n", + DR_DUMP_REC_TYPE_RULE, + rule_id, + (uint64_t) (uintptr_t) rule->matcher); + if (ret < 0) + return ret; + + if (!dr_is_root_table(rule->matcher->tbl)) { + if (rx->nic_matcher) { + ret = dr_dump_rule_rx_tx(f, rx, true, rule_id); + if (ret < 0) + return ret; + } + + if (tx->nic_matcher) { + ret = dr_dump_rule_rx_tx(f, tx, false, rule_id); + if (ret < 0) + return ret; + } + } + + list_for_each(&rule->rule_actions_list, action_mem, list) { + ret = dr_dump_rule_action_mem(f, rule_id, action_mem); + if (ret < 0) + return ret; + } + + return 0; +} + +int mlx5dv_dump_dr_rule(FILE *fout, struct mlx5dv_dr_rule *rule) +{ + int ret; + + if (!fout || !rule) + return -EINVAL; + + pthread_mutex_lock(&rule->matcher->tbl->dmn->mutex); + + ret = dr_dump_rule(fout, rule); + + pthread_mutex_unlock(&rule->matcher->tbl->dmn->mutex); + + return ret; +} + +static int dr_dump_matcher_mask(FILE *f, struct dr_match_param *mask, + uint8_t criteria, const uint64_t matcher_id) +{ + char dump[BUFF_SIZE] = {}; + int ret; + + ret = fprintf(f, "%d,0x%" PRIx64 ",", DR_DUMP_REC_TYPE_MATCHER_MASK, matcher_id); + if (ret < 0) + return ret; + + if (criteria & DR_MATCHER_CRITERIA_OUTER) { + dump_hex_print(dump, (char *)&mask->outer, sizeof(mask->outer)); + ret = fprintf(f, "%s,", dump); + } else { + ret = fprintf(f, ","); + } + + if (ret < 0) + return ret; + + if (criteria & DR_MATCHER_CRITERIA_INNER) { + dump_hex_print(dump, (char *)&mask->inner, sizeof(mask->inner)); + ret = fprintf(f, "%s,", dump); + } else { + ret = fprintf(f, ","); + } + + + if (ret < 0) + return ret; + + if (criteria & DR_MATCHER_CRITERIA_MISC) { + dump_hex_print(dump, (char *)&mask->misc, sizeof(mask->misc)); + ret = fprintf(f, "%s,", dump); + } else { + ret = fprintf(f, ","); + } + + 
if (ret < 0)
+		return ret;
+
+	if (criteria & DR_MATCHER_CRITERIA_MISC2) {
+		dump_hex_print(dump, (char *)&mask->misc2, sizeof(mask->misc2));
+		ret = fprintf(f, "%s,", dump);
+	} else {
+		ret = fprintf(f, ",");
+	}
+
+	if (ret < 0)
+		return ret;
+
+	if (criteria & DR_MATCHER_CRITERIA_MISC3) {
+		dump_hex_print(dump, (char *)&mask->misc3, sizeof(mask->misc3));
+		ret = fprintf(f, "%s\n", dump);
+	} else {
+		ret = fprintf(f, ",\n");
+	}
+
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+static int dr_dump_matcher_builder(FILE *f, struct dr_ste_build *builder,
+				   uint32_t index, bool is_rx,
+				   const uint64_t matcher_id)
+{
+	int ret;
+
+	ret = fprintf(f, "%d,0x%" PRIx64 ",%d,%d,0x%x\n",
+		      DR_DUMP_REC_TYPE_MATCHER_BUILDER,
+		      matcher_id,
+		      index,
+		      is_rx,
+		      builder->lu_type);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+static int dr_dump_matcher_rx_tx(FILE *f, bool is_rx,
+				 struct dr_matcher_rx_tx *matcher_rx_tx,
+				 const uint64_t matcher_id)
+{
+	enum dr_dump_rec_type rec_type;
+	int i, ret;
+
+	rec_type = is_rx ? DR_DUMP_REC_TYPE_MATCHER_RX :
+			   DR_DUMP_REC_TYPE_MATCHER_TX;
+
+	ret = fprintf(f, "%d,0x%" PRIx64 ",0x%" PRIx64 ",%d,0x%" PRIx64 ",0x%" PRIx64 "\n",
+		      rec_type,
+		      (uint64_t) (uintptr_t) matcher_rx_tx,
+		      matcher_id,
+		      matcher_rx_tx->num_of_builders,
+		      dr_dump_icm_to_idx(matcher_rx_tx->s_htbl->chunk->icm_addr),
+		      dr_dump_icm_to_idx(matcher_rx_tx->e_anchor->chunk->icm_addr));
+	if (ret < 0)
+		return ret;
+
+	for (i = 0; i < matcher_rx_tx->num_of_builders; i++) {
+		ret = dr_dump_matcher_builder(f, &matcher_rx_tx->ste_builder[i],
+					      i, is_rx, matcher_id);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int dr_dump_matcher(FILE *f, struct mlx5dv_dr_matcher *matcher)
+{
+	struct dr_matcher_rx_tx *rx = &matcher->rx;
+	struct dr_matcher_rx_tx *tx = &matcher->tx;
+	uint64_t matcher_id;
+	int ret;
+
+	matcher_id = (uint64_t) (uintptr_t) matcher;
+
+	ret = fprintf(f, "%d,0x%" PRIx64 ",0x%" PRIx64 ",%d\n",
+		      DR_DUMP_REC_TYPE_MATCHER,
+		      matcher_id,
+		      (uint64_t) (uintptr_t) matcher->tbl,
+		      matcher->prio);
+	if (ret < 0)
+		return ret;
+
+	if (!dr_is_root_table(matcher->tbl)) {
+		ret = dr_dump_matcher_mask(f, &matcher->mask, matcher->match_criteria, matcher_id);
+		if (ret < 0)
+			return ret;
+
+		if (rx->nic_matcher) {
+			ret = dr_dump_matcher_rx_tx(f, true, rx, matcher_id);
+			if (ret < 0)
+				return ret;
+		}
+
+		if (tx->nic_matcher) {
+			ret = dr_dump_matcher_rx_tx(f, false, tx, matcher_id);
+			if (ret < 0)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+static int dr_dump_matcher_all(FILE *fout, struct mlx5dv_dr_matcher *matcher)
+{
+	struct mlx5dv_dr_rule *rule;
+	int ret;
+
+	ret = dr_dump_matcher(fout, matcher);
+	if (ret < 0)
+		return ret;
+
+	list_for_each(&matcher->rule_list, rule, rule_list) {
+		ret = dr_dump_rule(fout, rule);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+int mlx5dv_dump_dr_matcher(FILE *fout, struct mlx5dv_dr_matcher *matcher)
+{
+	int ret;
+
+	if (!fout || !matcher)
+		return -EINVAL;
+
+	pthread_mutex_lock(&matcher->tbl->dmn->mutex);
+
+	ret = dr_dump_matcher_all(fout, matcher);
+
+	pthread_mutex_unlock(&matcher->tbl->dmn->mutex);
+
+	return ret;
+}
+
+static uint64_t dr_domain_id_calc(enum mlx5dv_dr_domain_type type)
+{
+	return (getpid() << 8) | (type & 0xff);
+}
+
+static int dr_dump_table_rx_tx(FILE *f, bool is_rx,
+			       struct dr_table_rx_tx *table_rx_tx,
+			       const uint64_t table_id)
+{
+	enum dr_dump_rec_type rec_type;
+	int ret;
+
+	rec_type = is_rx ?
DR_DUMP_REC_TYPE_TABLE_RX : DR_DUMP_REC_TYPE_TABLE_TX; + + ret = fprintf(f, "%d,0x%" PRIx64 ",0x%" PRIx64 "\n", + rec_type, + table_id, + dr_dump_icm_to_idx(table_rx_tx->s_anchor->chunk->icm_addr)); + if (ret < 0) + return ret; + + return 0; +} + +static int dr_dump_table(FILE *f, struct mlx5dv_dr_table *table) +{ + struct dr_table_rx_tx *rx = &table->rx; + struct dr_table_rx_tx *tx = &table->tx; + int ret; + + ret = fprintf(f, "%d,0x%" PRIx64 ",0x%" PRIx64 ",%d,%d\n", + DR_DUMP_REC_TYPE_TABLE, + (uint64_t) (uintptr_t) table, + dr_domain_id_calc(table->dmn->type), + table->table_type, + table->level); + if (ret < 0) + return ret; + + if (!dr_is_root_table(table)) { + if (rx->nic_dmn) { + ret = dr_dump_table_rx_tx(f, true, rx, (uint64_t) (uintptr_t) table); + if (ret < 0) + return ret; + } + + if (tx->nic_dmn) { + ret = dr_dump_table_rx_tx(f, false, tx, (uint64_t) (uintptr_t) table); + if (ret < 0) + return ret; + } + } + return 0; +} + +static int dr_dump_table_all(FILE *fout, struct mlx5dv_dr_table *tbl) +{ + struct mlx5dv_dr_matcher *matcher; + int ret; + + ret = dr_dump_table(fout, tbl); + if (ret < 0) + return ret; + + if (!dr_is_root_table(tbl)) { + list_for_each(&tbl->matcher_list, matcher, matcher_list) { + ret = dr_dump_matcher_all(fout, matcher); + if (ret < 0) + return ret; + } + } + return 0; +} + +int mlx5dv_dump_dr_table(FILE *fout, struct mlx5dv_dr_table *tbl) +{ + int ret; + + if (!fout || !tbl) + return -EINVAL; + + pthread_mutex_lock(&tbl->dmn->mutex); + + ret = dr_dump_table_all(fout, tbl); + + pthread_mutex_unlock(&tbl->dmn->mutex); + + return ret; +} + +static int dr_dump_send_ring(FILE *f, struct dr_send_ring *ring, + const uint64_t domain_id) +{ + int ret; + + ret = fprintf(f, "%d,0x%" PRIx64 ",0x%" PRIx64 ",0x%x,0x%x\n", + DR_DUMP_REC_TYPE_DOMAIN_SEND_RING, + (uint64_t) (uintptr_t) ring, + domain_id, + ring->cq.cqn, + ring->qp->obj->object_id); + if (ret < 0) + return ret; + + return 0; +} + +static int dr_dump_domain_info_flex_parser(FILE *f, const char *flex_parser_name, + const uint8_t flex_parser_value, + const uint64_t domain_id) +{ + int ret; + + ret = fprintf(f, "%d,0x%" PRIx64 ",%s,0x%x\n", + DR_DUMP_REC_TYPE_DOMAIN_INFO_FLEX_PARSER, + domain_id, + flex_parser_name, + flex_parser_value); + if (ret < 0) + return ret; + + return 0; +} + +static int dr_dump_domain_info_caps(FILE *f, struct dr_devx_caps *caps, + const uint64_t domain_id) +{ + int i, ret; + + ret = fprintf(f, "%d,0x%" PRIx64 ",0x%x,0x%" PRIx64 ",0x%" PRIx64 ",0x%x,%d,%d\n", + DR_DUMP_REC_TYPE_DOMAIN_INFO_CAPS, + domain_id, + caps->gvmi, + caps->nic_rx_drop_address, + caps->nic_tx_drop_address, + caps->flex_protocols, + caps->num_vports, + caps->eswitch_manager); + if (ret < 0) + return ret; + + for (i = 0; i < caps->num_vports; i++) { + ret = fprintf(f, "%d,0x%" PRIx64 ",%d,0x%x,0x%" PRIx64 ",0x%" PRIx64 "\n", + DR_DUMP_REC_TYPE_DOMAIN_INFO_VPORT, + domain_id, + i, + caps->vports_caps[i].gvmi, + caps->vports_caps[i].icm_address_rx, + caps->vports_caps[i].icm_address_tx); + if (ret < 0) + return ret; + } + return 0; +} + +static int dr_dump_domain_info_dev_attr(FILE *f, struct ibv_device_attr *attr, + const uint64_t domain_id) +{ + int ret; + + ret = fprintf(f, "%d,0x%" PRIx64 ",%d,%s\n", + DR_DUMP_REC_TYPE_DOMAIN_INFO_DEV_ATTR, + domain_id, + attr->phys_port_cnt, + attr->fw_ver); + if (ret < 0) + return ret; + + return 0; +} +static int dr_dump_domain_info(FILE *f, struct dr_domain_info *info, + const uint64_t domain_id) +{ + int ret; + + ret = dr_dump_domain_info_dev_attr(f, &info->attr, 
domain_id);
+	if (ret < 0)
+		return ret;
+
+	ret = dr_dump_domain_info_caps(f, &info->caps, domain_id);
+	if (ret < 0)
+		return ret;
+
+	ret = dr_dump_domain_info_flex_parser(f, "icmp_dw0", info->caps.flex_parser_id_icmp_dw0, domain_id);
+	if (ret < 0)
+		return ret;
+
+	ret = dr_dump_domain_info_flex_parser(f, "icmp_dw1", info->caps.flex_parser_id_icmp_dw1, domain_id);
+	if (ret < 0)
+		return ret;
+
+	ret = dr_dump_domain_info_flex_parser(f, "icmpv6_dw0", info->caps.flex_parser_id_icmpv6_dw0, domain_id);
+	if (ret < 0)
+		return ret;
+
+	ret = dr_dump_domain_info_flex_parser(f, "icmpv6_dw1", info->caps.flex_parser_id_icmpv6_dw1, domain_id);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+static int dr_dump_domain(FILE *f, struct mlx5dv_dr_domain *dmn)
+{
+	enum mlx5dv_dr_domain_type dmn_type = dmn->type;
+	char *dev_name = dmn->ctx->device->dev_name;
+	uint64_t domain_id;
+	int ret;
+
+	domain_id = dr_domain_id_calc(dmn_type);
+
+	ret = fprintf(f, "%d,0x%" PRIx64 ",%d,0x%x,%d,%s,%s\n",
+		      DR_DUMP_REC_TYPE_DOMAIN,
+		      domain_id,
+		      dmn_type,
+		      dmn->info.caps.gvmi,
+		      dmn->info.supp_sw_steering,
+		      PACKAGE_VERSION,
+		      dev_name);
+	if (ret < 0)
+		return ret;
+
+	ret = dr_dump_domain_info(f, &dmn->info, domain_id);
+	if (ret < 0)
+		return ret;
+
+	if (dmn->info.supp_sw_steering) {
+		ret = dr_dump_send_ring(f, dmn->send_ring, domain_id);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int dr_dump_domain_all(FILE *fout, struct mlx5dv_dr_domain *dmn)
+{
+	struct mlx5dv_dr_table *tbl;
+	int ret;
+
+	ret = dr_dump_domain(fout, dmn);
+	if (ret < 0)
+		return ret;
+
+	list_for_each(&dmn->tbl_list, tbl, tbl_list) {
+		ret = dr_dump_table_all(fout, tbl);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+int mlx5dv_dump_dr_domain(FILE *fout, struct mlx5dv_dr_domain *dmn)
+{
+	int ret;
+
+	if (!fout || !dmn)
+		return -EINVAL;
+
+	pthread_mutex_lock(&dmn->mutex);
+
+	ret = dr_dump_domain_all(fout, dmn);
+
+	pthread_mutex_unlock(&dmn->mutex);
+
+	return ret;
+}
+
diff --git a/providers/mlx5/dr_devx.c b/providers/mlx5/dr_devx.c
new file mode 100644
index 0000000..61157b8
--- /dev/null
+++ b/providers/mlx5/dr_devx.c
@@ -0,0 +1,540 @@
+/*
+ * Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials
+ *   provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <infiniband/verbs.h> +#include <infiniband/cmd_ioctl.h> +#include <rdma/mlx5_user_ioctl_cmds.h> +#include "mlx5dv_dr.h" + +int dr_devx_query_esw_vport_context(struct ibv_context *ctx, + bool other_vport, uint16_t vport_number, + uint64_t *icm_address_rx, + uint64_t *icm_address_tx) +{ + uint32_t out[DEVX_ST_SZ_DW(query_esw_vport_context_out)] = {}; + uint32_t in[DEVX_ST_SZ_DW(query_esw_vport_context_in)] = {}; + int err; + + DEVX_SET(query_esw_vport_context_in, in, opcode, + MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT); + DEVX_SET(query_esw_vport_context_in, in, other_vport, other_vport); + DEVX_SET(query_esw_vport_context_in, in, vport_number, vport_number); + + err = mlx5dv_devx_general_cmd(ctx, in, sizeof(in), out, sizeof(out)); + if (err) { + dr_dbg_ctx(ctx, "Query eswitch vport context failed %d\n", err); + return err; + } + + *icm_address_rx = + DEVX_GET64(query_esw_vport_context_out, out, + esw_vport_context.sw_steering_vport_icm_address_rx); + *icm_address_tx = + DEVX_GET64(query_esw_vport_context_out, out, + esw_vport_context.sw_steering_vport_icm_address_tx); + return 0; +} + +int dr_devx_query_gvmi(struct ibv_context *ctx, bool other_vport, + uint16_t vport_number, uint16_t *gvmi) +{ + uint32_t out[DEVX_ST_SZ_DW(query_hca_cap_out)] = {}; + uint32_t in[DEVX_ST_SZ_DW(query_hca_cap_in)] = {}; + int err; + + DEVX_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); + DEVX_SET(query_hca_cap_in, in, other_function, other_vport); + DEVX_SET(query_hca_cap_in, in, function_id, vport_number); + DEVX_SET(query_hca_cap_in, in, op_mod, + MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE | + HCA_CAP_OPMOD_GET_CUR); + + err = mlx5dv_devx_general_cmd(ctx, in, sizeof(in), out, sizeof(out)); + if (err) { + dr_dbg_ctx(ctx, "Query general failed %d\n", err); + return err; + } + + *gvmi = DEVX_GET(query_hca_cap_out, out, capability.cmd_hca_cap.vhca_id); + + return 0; +} + +int dr_devx_query_esw_caps(struct ibv_context *ctx, struct dr_esw_caps *caps) +{ + uint32_t out[DEVX_ST_SZ_DW(query_hca_cap_out)] = {}; + uint32_t in[DEVX_ST_SZ_DW(query_hca_cap_in)] = {}; + void *esw_caps; + int err; + + DEVX_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); + DEVX_SET(query_hca_cap_in, in, op_mod, + MLX5_SET_HCA_CAP_OP_MOD_ESW_FLOW_TABLE | + HCA_CAP_OPMOD_GET_CUR); + + err = mlx5dv_devx_general_cmd(ctx, in, sizeof(in), out, sizeof(out)); + if (err) { + dr_dbg_ctx(ctx, "Query general failed %d\n", err); + return err; + } + + esw_caps = DEVX_ADDR_OF(query_hca_cap_out, out, + capability.flow_table_eswitch_cap); + caps->drop_icm_address_rx = + DEVX_GET64(flow_table_eswitch_cap, esw_caps, + sw_steering_fdb_action_drop_icm_address_rx); + caps->drop_icm_address_tx = + DEVX_GET64(flow_table_eswitch_cap, esw_caps, + sw_steering_fdb_action_drop_icm_address_tx); + caps->uplink_icm_address_rx = + DEVX_GET64(flow_table_eswitch_cap, esw_caps, + sw_steering_uplink_icm_address_rx); + caps->uplink_icm_address_tx = + DEVX_GET64(flow_table_eswitch_cap, esw_caps, + sw_steering_uplink_icm_address_tx); + caps->sw_owner = + DEVX_GET(flow_table_eswitch_cap, esw_caps, + flow_table_properties_nic_esw_fdb.sw_owner); + return 0; +} + +int dr_devx_query_device(struct ibv_context *ctx, struct dr_devx_caps *caps) 
+{ + uint32_t out[DEVX_ST_SZ_DW(query_hca_cap_out)] = {}; + uint32_t in[DEVX_ST_SZ_DW(query_hca_cap_in)] = {}; + int err; + + DEVX_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); + DEVX_SET(query_hca_cap_in, in, op_mod, + MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE | + HCA_CAP_OPMOD_GET_CUR); + + err = mlx5dv_devx_general_cmd(ctx, in, sizeof(in), out, sizeof(out)); + if (err) { + dr_dbg_ctx(ctx, "Query general failed %d\n", err); + return err; + } + + caps->eswitch_manager = DEVX_GET(query_hca_cap_out, out, + capability.cmd_hca_cap.eswitch_manager); + caps->gvmi = DEVX_GET(query_hca_cap_out, out, capability.cmd_hca_cap.vhca_id); + caps->flex_protocols = DEVX_GET(query_hca_cap_out, out, + capability.cmd_hca_cap.flex_parser_protocols); + + if (dr_matcher_supp_flex_parser_icmp_v4(caps)) { + caps->flex_parser_id_icmp_dw0 = + DEVX_GET(query_hca_cap_out, + out, + capability.cmd_hca_cap.flex_parser_id_icmp_dw0); + caps->flex_parser_id_icmp_dw1 = + DEVX_GET(query_hca_cap_out, + out, + capability.cmd_hca_cap.flex_parser_id_icmp_dw1); + } + + if (dr_matcher_supp_flex_parser_icmp_v6(caps)) { + caps->flex_parser_id_icmpv6_dw0 = + DEVX_GET(query_hca_cap_out, + out, + capability.cmd_hca_cap.flex_parser_id_icmpv6_dw0); + caps->flex_parser_id_icmpv6_dw1 = + DEVX_GET(query_hca_cap_out, + out, + capability.cmd_hca_cap.flex_parser_id_icmpv6_dw1); + } + + DEVX_SET(query_hca_cap_in, in, op_mod, + MLX5_SET_HCA_CAP_OP_MOD_NIC_FLOW_TABLE | + HCA_CAP_OPMOD_GET_CUR); + + err = mlx5dv_devx_general_cmd(ctx, in, sizeof(in), out, sizeof(out)); + if (err) { + dr_dbg_ctx(ctx, "Query flow tables failed %d\n", err); + return err; + } + + caps->nic_rx_drop_address = DEVX_GET64(query_hca_cap_out, out, + capability.flow_table_nic_cap. + sw_steering_nic_rx_action_drop_icm_address); + caps->nic_tx_drop_address = DEVX_GET64(query_hca_cap_out, out, + capability.flow_table_nic_cap. + sw_steering_nic_tx_action_drop_icm_address); + caps->nic_tx_allow_address = DEVX_GET64(query_hca_cap_out, out, + capability.flow_table_nic_cap. + sw_steering_nic_tx_action_allow_icm_address); + caps->rx_sw_owner = DEVX_GET(query_hca_cap_out, out, + capability.flow_table_nic_cap. + flow_table_properties_nic_receive.sw_owner); + caps->tx_sw_owner = DEVX_GET(query_hca_cap_out, out, + capability.flow_table_nic_cap. + flow_table_properties_nic_transmit.sw_owner); + caps->max_ft_level = DEVX_GET(query_hca_cap_out, out, + capability.flow_table_nic_cap. + flow_table_properties_nic_receive.max_ft_level); + DEVX_SET(query_hca_cap_in, in, op_mod, + MLX5_SET_HCA_CAP_OP_MOD_DEVICE_MEMORY | + HCA_CAP_OPMOD_GET_CUR); + + err = mlx5dv_devx_general_cmd(ctx, in, sizeof(in), out, sizeof(out)); + if (err) { + dr_dbg_ctx(ctx, "Query flow device memory caps failed %d\n", err); + return err; + } + + caps->log_icm_size = DEVX_GET(query_hca_cap_out, out, + capability.device_mem_cap.log_steering_sw_icm_size); + caps->hdr_modify_icm_addr = DEVX_GET64(query_hca_cap_out, out, + capability.device_mem_cap. 
+ header_modify_sw_icm_start_address); + + return 0; +} + +int dr_devx_sync_steering(struct ibv_context *ctx) +{ + uint32_t out[DEVX_ST_SZ_DW(sync_steering_out)] = {}; + uint32_t in[DEVX_ST_SZ_DW(sync_steering_in)] = {}; + int err; + + DEVX_SET(sync_steering_in, in, opcode, MLX5_CMD_OP_SYNC_STEERING); + + err = mlx5dv_devx_general_cmd(ctx, in, sizeof(in), out, sizeof(out)); + if (err) + dr_dbg_ctx(ctx, "Sync steering failed %d\n", err); + + return err; +} + +struct mlx5dv_devx_obj *dr_devx_create_flow_table(struct ibv_context *ctx, + uint32_t table_type, + uint64_t icm_addr_rx, + uint64_t icm_addr_tx, + u8 level) +{ + uint32_t out[DEVX_ST_SZ_DW(create_flow_table_out)] = {}; + uint32_t in[DEVX_ST_SZ_DW(create_flow_table_in)] = {}; + void *ft_ctx; + + DEVX_SET(create_flow_table_in, in, opcode, MLX5_CMD_OP_CREATE_FLOW_TABLE); + DEVX_SET(create_flow_table_in, in, table_type, table_type); + + ft_ctx = DEVX_ADDR_OF(create_flow_table_in, in, flow_table_context); + DEVX_SET(flow_table_context, ft_ctx, sw_owner, 1); + + DEVX_SET(flow_table_context, ft_ctx, level, level); + /* + * icm_addr_0 used for FDB RX / NIC TX / NIC_RX + * icm_addr_1 used for FDB TX + */ + if (table_type == FS_FT_NIC_RX) { + DEVX_SET64(flow_table_context, ft_ctx, sw_owner_icm_root_0, icm_addr_rx); + } else if (table_type == FS_FT_NIC_TX) { + DEVX_SET64(flow_table_context, ft_ctx, sw_owner_icm_root_0, icm_addr_tx); + } else if (table_type == FS_FT_FDB) { + DEVX_SET64(flow_table_context, ft_ctx, sw_owner_icm_root_0, icm_addr_rx); + DEVX_SET64(flow_table_context, ft_ctx, sw_owner_icm_root_1, icm_addr_tx); + } else { + assert(false); + } + + return mlx5dv_devx_obj_create(ctx, in, sizeof(in), out, sizeof(out)); +} + +struct mlx5dv_devx_obj *dr_devx_create_reformat_ctx(struct ibv_context *ctx, + enum reformat_type rt, + size_t reformat_size, + void *reformat_data) +{ + uint32_t out[DEVX_ST_SZ_DW(alloc_packet_reformat_context_out)] = {}; + size_t insz, cmd_data_sz, cmd_total_sz; + struct mlx5dv_devx_obj *obj; + void *prctx; + void *pdata; + void *in; + + cmd_total_sz = DEVX_ST_SZ_BYTES(alloc_packet_reformat_context_in); + cmd_data_sz = DEVX_FLD_SZ_BYTES(alloc_packet_reformat_context_in, + packet_reformat_context.reformat_data); + insz = align(cmd_total_sz + reformat_size - cmd_data_sz, 4); + in = calloc(1, insz); + if (!in) { + errno = ENOMEM; + return NULL; + } + + DEVX_SET(alloc_packet_reformat_context_in, in, opcode, + MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT); + + prctx = DEVX_ADDR_OF(alloc_packet_reformat_context_in, in, packet_reformat_context); + pdata = DEVX_ADDR_OF(packet_reformat_context_in, prctx, reformat_data); + + DEVX_SET(packet_reformat_context_in, prctx, reformat_type, rt); + DEVX_SET(packet_reformat_context_in, prctx, reformat_data_size, reformat_size); + memcpy(pdata, reformat_data, reformat_size); + + obj = mlx5dv_devx_obj_create(ctx, in, insz, out, sizeof(out)); + free(in); + + return obj; +} + +struct mlx5dv_devx_obj *dr_devx_create_meter(struct ibv_context *ctx, + struct mlx5dv_dr_flow_meter_attr + *meter_attr) +{ + uint32_t out[DEVX_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; + uint32_t in[DEVX_ST_SZ_DW(create_flow_meter_in)] = {}; + void *attr; + + if (meter_attr->flow_meter_parameter_sz > + DEVX_FLD_SZ_BYTES(flow_meter, flow_meter_params)) { + errno = EINVAL; + return NULL; + } + + attr = DEVX_ADDR_OF(create_flow_meter_in, in, hdr); + DEVX_SET(general_obj_in_cmd_hdr, + attr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT); + DEVX_SET(general_obj_in_cmd_hdr, + attr, obj_type, MLX5_OBJ_TYPE_FLOW_METER); + + attr = 
DEVX_ADDR_OF(create_flow_meter_in, in, meter); + DEVX_SET(flow_meter, attr, active, meter_attr->active); + DEVX_SET(flow_meter, attr, return_reg_id, meter_attr->reg_c_index); + DEVX_SET(flow_meter, attr, table_type, + meter_attr->next_table->table_type); + DEVX_SET(flow_meter, attr, destination_table_id, + meter_attr->next_table->devx_obj->object_id); + + attr = DEVX_ADDR_OF(flow_meter, attr, flow_meter_params); + memcpy(attr, meter_attr->flow_meter_parameter, + meter_attr->flow_meter_parameter_sz); + + return mlx5dv_devx_obj_create(ctx, in, sizeof(in), out, sizeof(out)); +} + +int dr_devx_query_meter(struct mlx5dv_devx_obj *obj, uint64_t *rx_icm_addr, + uint64_t *tx_icm_addr) +{ + uint32_t in[DEVX_ST_SZ_DW(general_obj_in_cmd_hdr)] = {}; + uint32_t out[DEVX_ST_SZ_DW(query_flow_meter_out)] = {}; + void *attr; + int ret; + + DEVX_SET(general_obj_in_cmd_hdr, in, opcode, + MLX5_CMD_OP_QUERY_GENERAL_OBJECT); + DEVX_SET(general_obj_in_cmd_hdr, in, obj_type, + MLX5_OBJ_TYPE_FLOW_METER); + DEVX_SET(general_obj_in_cmd_hdr, in, obj_id, obj->object_id); + + ret = mlx5dv_devx_obj_query(obj, in, sizeof(in), out, sizeof(out)); + if (ret) { + dr_dbg_ctx(obj->context, "Failed to query flow meter id %u\n", + obj->object_id); + return ret; + } + + attr = DEVX_ADDR_OF(query_flow_meter_out, out, obj); + *rx_icm_addr = DEVX_GET64(flow_meter, attr, sw_steering_icm_address_rx); + *tx_icm_addr = DEVX_GET64(flow_meter, attr, sw_steering_icm_address_tx); + + return 0; +} + +int dr_devx_modify_meter(struct mlx5dv_devx_obj *obj, + struct mlx5dv_dr_flow_meter_attr *meter_attr, + __be64 modify_bits) +{ + uint32_t out[DEVX_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; + uint32_t in[DEVX_ST_SZ_DW(create_flow_meter_in)] = {}; + void *attr; + + if (meter_attr->flow_meter_parameter_sz > + DEVX_FLD_SZ_BYTES(flow_meter, flow_meter_params)) { + errno = EINVAL; + return errno; + } + + attr = DEVX_ADDR_OF(create_flow_meter_in, in, hdr); + DEVX_SET(general_obj_in_cmd_hdr, + attr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT); + DEVX_SET(general_obj_in_cmd_hdr, + attr, obj_type, MLX5_OBJ_TYPE_FLOW_METER); + DEVX_SET(general_obj_in_cmd_hdr, in, obj_id, obj->object_id); + + attr = DEVX_ADDR_OF(create_flow_meter_in, in, meter); + memcpy(DEVX_ADDR_OF(flow_meter, attr, modify_field_select), + &modify_bits, sizeof(modify_bits)); + + DEVX_SET(flow_meter, attr, active, meter_attr->active); + + attr = DEVX_ADDR_OF(flow_meter, attr, flow_meter_params); + memcpy(attr, meter_attr->flow_meter_parameter, + meter_attr->flow_meter_parameter_sz); + + return mlx5dv_devx_obj_modify(obj, in, sizeof(in), out, sizeof(out)); +} + +struct mlx5dv_devx_obj *dr_devx_create_qp(struct ibv_context *ctx, + struct dr_devx_qp_create_attr *attr) +{ + uint32_t in[DEVX_ST_SZ_DW(create_qp_in)] = {}; + uint32_t out[DEVX_ST_SZ_DW(create_qp_out)] = {}; + void *qpc = DEVX_ADDR_OF(create_qp_in, in, qpc); + + DEVX_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP); + + DEVX_SET(qpc, qpc, st, attr->service_type); + DEVX_SET(qpc, qpc, pm_state, attr->pm_state); + DEVX_SET(qpc, qpc, pd, attr->pdn); + DEVX_SET(qpc, qpc, uar_page, attr->page_id); + DEVX_SET(qpc, qpc, cqn_snd, attr->cqn); + DEVX_SET(qpc, qpc, cqn_rcv, attr->cqn); + DEVX_SET(qpc, qpc, log_sq_size, ilog32(attr->sq_wqe_cnt - 1)); + DEVX_SET(qpc, qpc, log_rq_stride, attr->rq_wqe_shift - 4); + DEVX_SET(qpc, qpc, log_rq_size, ilog32(attr->rq_wqe_cnt - 1)); + DEVX_SET(qpc, qpc, dbr_umem_id, attr->db_umem_id); + + DEVX_SET(create_qp_in, in, wq_umem_id, attr->buff_umem_id); + + return mlx5dv_devx_obj_create(ctx, in, 
sizeof(in), out, sizeof(out)); +} + +int dr_devx_modify_qp_rst2init(struct ibv_context *ctx, + struct mlx5dv_devx_obj *qp_obj, + uint16_t port) +{ + uint32_t in[DEVX_ST_SZ_DW(rst2init_qp_in)] = {}; + uint32_t out[DEVX_ST_SZ_DW(rst2init_qp_out)] = {}; + void *qpc = DEVX_ADDR_OF(rst2init_qp_in, in, qpc); + + DEVX_SET(rst2init_qp_in, in, opcode, MLX5_CMD_OP_RST2INIT_QP); + DEVX_SET(rst2init_qp_in, in, qpn, qp_obj->object_id); + + DEVX_SET(qpc, qpc, primary_address_path.vhca_port_num, port); + DEVX_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED); + DEVX_SET(qpc, qpc, rre, 1); + DEVX_SET(qpc, qpc, rwe, 1); + + return mlx5dv_devx_obj_modify(qp_obj, in, + sizeof(in), out, sizeof(out)); +} + +#define DR_DEVX_ICM_UDP_PORT 49999 + +int dr_devx_modify_qp_init2rtr(struct ibv_context *ctx, + struct mlx5dv_devx_obj *qp_obj, + struct dr_devx_qp_rtr_attr *attr) +{ + uint32_t in[DEVX_ST_SZ_DW(init2rtr_qp_in)] = {}; + uint32_t out[DEVX_ST_SZ_DW(init2rtr_qp_out)] = {}; + void *qpc = DEVX_ADDR_OF(init2rtr_qp_in, in, qpc); + + DEVX_SET(init2rtr_qp_in, in, opcode, MLX5_CMD_OP_INIT2RTR_QP); + DEVX_SET(init2rtr_qp_in, in, qpn, qp_obj->object_id); + + DEVX_SET(qpc, qpc, mtu, attr->mtu); + DEVX_SET(qpc, qpc, log_msg_max, DR_CHUNK_SIZE_MAX - 1); + DEVX_SET(qpc, qpc, remote_qpn, attr->qp_num); + memcpy(DEVX_ADDR_OF(qpc, qpc, primary_address_path.rmac_47_32), + attr->dgid_attr.mac, sizeof(attr->dgid_attr.mac)); + memcpy(DEVX_ADDR_OF(qpc, qpc, primary_address_path.rgid_rip), + attr->dgid_attr.gid.raw, sizeof(attr->dgid_attr.gid.raw)); + DEVX_SET(qpc, qpc, primary_address_path.src_addr_index, + attr->sgid_index); + + if (attr->dgid_attr.roce_ver == MLX5_ROCE_VERSION_2) + DEVX_SET(qpc, qpc, primary_address_path.udp_sport, + DR_DEVX_ICM_UDP_PORT); + + DEVX_SET(qpc, qpc, primary_address_path.vhca_port_num, attr->port_num); + DEVX_SET(qpc, qpc, min_rnr_nak, 1); + + return mlx5dv_devx_obj_modify(qp_obj, in, + sizeof(in), out, sizeof(out)); +} + +int dr_devx_modify_qp_rtr2rts(struct ibv_context *ctx, + struct mlx5dv_devx_obj *qp_obj, + struct dr_devx_qp_rts_attr *attr) +{ + uint32_t in[DEVX_ST_SZ_DW(rtr2rts_qp_in)] = {}; + uint32_t out[DEVX_ST_SZ_DW(rtr2rts_qp_out)] = {}; + void *qpc = DEVX_ADDR_OF(rtr2rts_qp_in, in, qpc); + + DEVX_SET(rtr2rts_qp_in, in, opcode, MLX5_CMD_OP_RTR2RTS_QP); + DEVX_SET(rtr2rts_qp_in, in, qpn, qp_obj->object_id); + + DEVX_SET(qpc, qpc, log_ack_req_freq, 0); + DEVX_SET(qpc, qpc, retry_count, attr->retry_cnt); + DEVX_SET(qpc, qpc, rnr_retry, attr->rnr_retry); + + return mlx5dv_devx_obj_modify(qp_obj, in, + sizeof(in), out, sizeof(out)); +} + +int dr_devx_query_gid(struct ibv_context *ctx, uint8_t vhca_port_num, + uint16_t index, struct dr_gid_attr *attr) +{ + uint32_t out[DEVX_ST_SZ_DW(query_roce_address_out)] = {}; + uint32_t in[DEVX_ST_SZ_DW(query_roce_address_in)] = {}; + int ret; + + DEVX_SET(query_roce_address_in, in, opcode, + MLX5_CMD_OP_QUERY_ROCE_ADDRESS); + + DEVX_SET(query_roce_address_in, in, roce_address_index, index); + DEVX_SET(query_roce_address_in, in, vhca_port_num, vhca_port_num); + + ret = mlx5dv_devx_general_cmd(ctx, in, sizeof(in), out, sizeof(out)); + if (ret) + return ret; + + memcpy(&attr->gid, + DEVX_ADDR_OF(query_roce_address_out, + out, roce_address.source_l3_address), + sizeof(attr->gid)); + memcpy(attr->mac, + DEVX_ADDR_OF(query_roce_address_out, out, + roce_address.source_mac_47_32), + sizeof(attr->mac)); + + if (DEVX_GET(query_roce_address_out, out, + roce_address.roce_version) == MLX5_ROCE_VERSION_2) + attr->roce_ver = MLX5_ROCE_VERSION_2; + else + attr->roce_ver 
= MLX5_ROCE_VERSION_1; + + return 0; +} diff --git a/providers/mlx5/dr_domain.c b/providers/mlx5/dr_domain.c new file mode 100644 index 0000000..339208e --- /dev/null +++ b/providers/mlx5/dr_domain.c @@ -0,0 +1,362 @@ +/* + * Copyright (c) 2019, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <unistd.h> +#include <stdlib.h> +#include "mlx5dv_dr.h" + +enum { + MLX5DV_DR_DOMAIN_SYNC_SUP_FLAGS = + (MLX5DV_DR_DOMAIN_SYNC_FLAGS_SW | + MLX5DV_DR_DOMAIN_SYNC_FLAGS_HW), +}; + +static int dr_domain_init_resources(struct mlx5dv_dr_domain *dmn) +{ + int ret = -1; + + dmn->pd = ibv_alloc_pd(dmn->ctx); + if (!dmn->pd) { + dr_dbg(dmn, "Couldn't allocate PD\n"); + return ret; + } + + dmn->uar = mlx5dv_devx_alloc_uar(dmn->ctx, 0); + if (!dmn->uar) { + dr_dbg(dmn, "Can't allocate UAR\n"); + goto clean_pd; + } + + dmn->ste_icm_pool = dr_icm_pool_create(dmn, DR_ICM_TYPE_STE); + if (!dmn->ste_icm_pool) { + dr_dbg(dmn, "Couldn't get icm memory for %s\n", + ibv_get_device_name(dmn->ctx->device)); + goto clean_uar; + } + + dmn->action_icm_pool = dr_icm_pool_create(dmn, DR_ICM_TYPE_MODIFY_ACTION); + if (!dmn->action_icm_pool) { + dr_dbg(dmn, "Couldn't get action icm memory for %s\n", + ibv_get_device_name(dmn->ctx->device)); + goto free_ste_icm_pool; + } + + ret = dr_send_ring_alloc(dmn); + if (ret) { + dr_dbg(dmn, "Couldn't create send-ring for %s\n", + ibv_get_device_name(dmn->ctx->device)); + goto free_action_icm_pool; + } + + return 0; + +free_action_icm_pool: + dr_icm_pool_destroy(dmn->action_icm_pool); +free_ste_icm_pool: + dr_icm_pool_destroy(dmn->ste_icm_pool); +clean_uar: + mlx5dv_devx_free_uar(dmn->uar); +clean_pd: + ibv_dealloc_pd(dmn->pd); + + return ret; +} + +static void dr_free_resources(struct mlx5dv_dr_domain *dmn) +{ + dr_send_ring_free(dmn->send_ring); + dr_icm_pool_destroy(dmn->action_icm_pool); + dr_icm_pool_destroy(dmn->ste_icm_pool); + mlx5dv_devx_free_uar(dmn->uar); + ibv_dealloc_pd(dmn->pd); +} + +static int dr_query_vport_cap(struct ibv_context *ctx, uint16_t vport_number, + struct dr_devx_vport_cap *cap) +{ + bool other_vport = vport_number ? 
true : false;
+	int ret;
+
+	ret = dr_devx_query_esw_vport_context(ctx, other_vport, vport_number,
+					      &cap->icm_address_rx,
+					      &cap->icm_address_tx);
+	if (ret)
+		return ret;
+
+	ret = dr_devx_query_gvmi(ctx, other_vport, vport_number, &cap->gvmi);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int dr_domain_query_fdb_caps(struct ibv_context *ctx,
+				    struct mlx5dv_dr_domain *dmn)
+{
+	struct dr_esw_caps esw_caps = {};
+	int num_vports;
+	int ret;
+	int i;
+
+	if (!dmn->info.caps.eswitch_manager)
+		return 0;
+
+	num_vports = dmn->info.attr.phys_port_cnt - 1;
+	dmn->info.caps.vports_caps = calloc(num_vports + 1,
+					    sizeof(struct dr_devx_vport_cap));
+	if (!dmn->info.caps.vports_caps) {
+		errno = ENOMEM;
+		return errno;
+	}
+
+	/* Query vports */
+	for (i = 0; i < num_vports; i++) {
+		ret = dr_query_vport_cap(ctx, i, &dmn->info.caps.vports_caps[i]);
+		if (ret)
+			goto err;
+	}
+
+	/* Query uplink */
+	ret = dr_devx_query_esw_caps(ctx, &esw_caps);
+	if (ret)
+		goto err;
+
+	dmn->info.caps.fdb_sw_owner = esw_caps.sw_owner;
+	dmn->info.caps.vports_caps[i].icm_address_rx = esw_caps.uplink_icm_address_rx;
+	dmn->info.caps.vports_caps[i].icm_address_tx = esw_caps.uplink_icm_address_tx;
+	dmn->info.caps.esw_rx_drop_address = esw_caps.drop_icm_address_rx;
+	dmn->info.caps.esw_tx_drop_address = esw_caps.drop_icm_address_tx;
+	dmn->info.caps.num_vports = num_vports;
+
+	return 0;
+
+err:
+	free(dmn->info.caps.vports_caps);
+	dmn->info.caps.vports_caps = NULL;
+	return ret;
+}
+
+static int dr_domain_caps_init(struct ibv_context *ctx,
+			       struct mlx5dv_dr_domain *dmn)
+{
+	struct dr_devx_vport_cap *vport_cap;
+	struct ibv_port_attr port_attr = {};
+	int ret;
+
+	ret = ibv_query_port(ctx, 1, &port_attr);
+	if (ret) {
+		dr_dbg(dmn, "Failed to query port\n");
+		return ret;
+	}
+
+	if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
+		dr_dbg(dmn, "Failed to allocate domain, bad link type\n");
+		errno = EOPNOTSUPP;
+		return errno;
+	}
+
+	ret = ibv_query_device(ctx, &dmn->info.attr);
+	if (ret)
+		return ret;
+
+	ret = dr_devx_query_device(ctx, &dmn->info.caps);
+	if (ret)
+		/* Ignore devx query failure to allow steering on root level
+		 * tables in case devx is not supported over mlx5dv_dr API
+		 */
+		return 0;
+
+	ret = dr_domain_query_fdb_caps(ctx, dmn);
+	if (ret)
+		return ret;
+
+	switch (dmn->type) {
+	case MLX5DV_DR_DOMAIN_TYPE_NIC_RX:
+		if (!dmn->info.caps.rx_sw_owner)
+			return 0;
+
+		dmn->info.supp_sw_steering = true;
+		dmn->info.rx.ste_type = DR_STE_TYPE_RX;
+		dmn->info.rx.default_icm_addr = dmn->info.caps.nic_rx_drop_address;
+		dmn->info.rx.drop_icm_addr = dmn->info.caps.nic_rx_drop_address;
+		break;
+	case MLX5DV_DR_DOMAIN_TYPE_NIC_TX:
+		if (!dmn->info.caps.tx_sw_owner)
+			return 0;
+
+		dmn->info.supp_sw_steering = true;
+		dmn->info.tx.ste_type = DR_STE_TYPE_TX;
+		dmn->info.tx.default_icm_addr = dmn->info.caps.nic_tx_allow_address;
+		dmn->info.tx.drop_icm_addr = dmn->info.caps.nic_tx_drop_address;
+		break;
+	case MLX5DV_DR_DOMAIN_TYPE_FDB:
+		if (!dmn->info.caps.eswitch_manager)
+			return 0;
+
+		if (!dmn->info.caps.fdb_sw_owner)
+			return 0;
+
+		dmn->info.rx.ste_type = DR_STE_TYPE_RX;
+		dmn->info.tx.ste_type = DR_STE_TYPE_TX;
+		vport_cap = dr_get_vport_cap(&dmn->info.caps, 0);
+		if (!vport_cap) {
+			dr_dbg(dmn, "Failed to get eswitch manager vport\n");
+			return errno;
+		}
+
+		dmn->info.supp_sw_steering = true;
+		dmn->info.tx.default_icm_addr = vport_cap->icm_address_tx;
dmn->info.rx.default_icm_addr = vport_cap->icm_address_rx; + dmn->info.rx.drop_icm_addr = dmn->info.caps.esw_rx_drop_address; + dmn->info.tx.drop_icm_addr = dmn->info.caps.esw_tx_drop_address; + break; + default: + dr_dbg(dmn, "Invalid domain\n"); + ret = EINVAL; + break; + } + + return ret; +} + +static void dr_domain_caps_uninit(struct mlx5dv_dr_domain *dmn) +{ + if (dmn->info.caps.vports_caps) + free(dmn->info.caps.vports_caps); +} + +struct mlx5dv_dr_domain * +mlx5dv_dr_domain_create(struct ibv_context *ctx, + enum mlx5dv_dr_domain_type type) +{ + struct mlx5dv_dr_domain *dmn; + int ret; + + if (type > MLX5DV_DR_DOMAIN_TYPE_FDB) { + errno = EINVAL; + return NULL; + } + + dmn = calloc(1, sizeof(*dmn)); + if (!dmn) { + errno = ENOMEM; + return NULL; + } + + dmn->ctx = ctx; + dmn->type = type; + atomic_init(&dmn->refcount, 1); + list_head_init(&dmn->tbl_list); + + if (dr_domain_caps_init(ctx, dmn)) { + dr_dbg(dmn, "Failed init domain, no caps\n"); + goto free_domain; + } + + dmn->info.max_log_action_icm_sz = DR_CHUNK_SIZE_4K; + dmn->info.max_log_sw_icm_sz = min_t(uint32_t, DR_CHUNK_SIZE_1024K, + dmn->info.caps.log_icm_size); + + /* Allocate resources */ + if (dmn->info.supp_sw_steering) { + ret = dr_domain_init_resources(dmn); + if (ret) { + dr_dbg(dmn, "Failed init domain resources for %s\n", + ibv_get_device_name(ctx->device)); + goto uninit_caps; + } + + /* Init CRC table for htbl CRC calculation */ + dr_crc32_init_table(); + } + return dmn; + +uninit_caps: + dr_domain_caps_uninit(dmn); +free_domain: + free(dmn); + return NULL; +} + +/* + * Assure synchronization of the device steering tables with updates made by SW + * insertion. + */ +int mlx5dv_dr_domain_sync(struct mlx5dv_dr_domain *dmn, uint32_t flags) +{ + int ret = 0; + + if (!dmn->info.supp_sw_steering || + !check_comp_mask(flags, MLX5DV_DR_DOMAIN_SYNC_SUP_FLAGS)) { + errno = EOPNOTSUPP; + return errno; + } + + if (flags & MLX5DV_DR_DOMAIN_SYNC_FLAGS_SW) { + pthread_mutex_lock(&dmn->mutex); + ret = dr_send_ring_force_drain(dmn); + if (ret) + goto out_unlock; + + pthread_mutex_unlock(&dmn->mutex); + } + + if (flags & MLX5DV_DR_DOMAIN_SYNC_FLAGS_HW) + ret = dr_devx_sync_steering(dmn->ctx); + + return ret; + +out_unlock: + pthread_mutex_unlock(&dmn->mutex); + return ret; +} + +int mlx5dv_dr_domain_destroy(struct mlx5dv_dr_domain *dmn) +{ + if (atomic_load(&dmn->refcount) > 1) + return EBUSY; + + if (dmn->info.supp_sw_steering) { + /* make sure resources are not used by the hardware */ + dr_devx_sync_steering(dmn->ctx); + dr_free_resources(dmn); + } + + dr_domain_caps_uninit(dmn); + + free(dmn); + return 0; +} diff --git a/providers/mlx5/dr_icm_pool.c b/providers/mlx5/dr_icm_pool.c new file mode 100644 index 0000000..1e28539 --- /dev/null +++ b/providers/mlx5/dr_icm_pool.c @@ -0,0 +1,562 @@ +/* + * Copyright (c) 2019, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials
+ *   provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include "mlx5dv_dr.h"
+
+#define DR_ICM_MODIFY_HDR_ALIGN_BASE 64
+
+struct dr_icm_pool;
+
+#define DR_ICM_SYNC_THRESHOLD (64 * 1024 * 1024)
+
+struct dr_icm_bucket {
+	struct dr_icm_pool *pool;
+
+	/* It is safe to allocate chunks from this list, now HW is guaranteed
+	 * to not access this memory
+	 */
+	struct list_head free_list;
+	unsigned int free_list_count;
+
+	/* This is the list of used chunks, HW may be accessing this memory */
+	struct list_head used_list;
+	unsigned int used_list_count;
+
+	/* HW may be accessing this memory but at some future,
+	 * undetermined time, it might cease to do so. Before deciding to call
+	 * sync_ste, this list is moved to tmp_list
+	 */
+	struct list_head hot_list;
+	unsigned int hot_list_count;
+
+	/* Temporary list, entries from the hot list are moved to this list.
+	 * sync_ste is executed and then tmp_list is concatenated to the free list
+	 */
+	struct list_head tmp_list;
+	unsigned int tmp_list_count;
+
+	uint32_t total_chunks;
+	uint32_t num_of_entries;
+	uint32_t entry_size;
+	pthread_mutex_t mutex;
+};
+
+struct dr_icm_pool {
+	struct dr_icm_bucket *buckets;
+	enum dr_icm_type icm_type;
+	enum dr_icm_chunk_size max_log_chunk_sz;
+	enum dr_icm_chunk_size num_of_buckets;
+	struct list_head icm_mr_list;
+	pthread_mutex_t mr_mutex;
+	struct mlx5dv_dr_domain *dmn;
+};
+
+struct dr_icm_mr {
+	struct dr_icm_pool *pool;
+	struct ibv_mr *mr;
+	struct ibv_dm *dm;
+	size_t used_length;
+	uint64_t icm_start_addr;
+	struct list_node mr_list;
+};
+
+static struct dr_icm_mr *
+dr_icm_pool_mr_create(struct dr_icm_pool *pool,
+		      enum mlx5_ib_uapi_dm_type dm_type,
+		      size_t align_base)
+{
+	struct mlx5dv_alloc_dm_attr mlx5_dm_attr = {};
+	struct ibv_alloc_dm_attr dm_attr = {};
+	struct dr_icm_mr *icm_mr;
+	struct mlx5_dm *dm;
+	size_t align_diff;
+
+	icm_mr = calloc(1, sizeof(struct dr_icm_mr));
+	if (!icm_mr) {
+		errno = ENOMEM;
+		return NULL;
+	}
+
+	icm_mr->pool = pool;
+	list_node_init(&icm_mr->mr_list);
+
+	mlx5_dm_attr.type = dm_type;
+
+	/* 2^log_biggest_table * entry-size * double-for-alignment */
+	dm_attr.length = dr_icm_pool_chunk_size_to_byte(pool->max_log_chunk_sz,
+							pool->icm_type) * 2;
+
+	icm_mr->dm = mlx5dv_alloc_dm(pool->dmn->ctx, &dm_attr, &mlx5_dm_attr);
+	if (!icm_mr->dm) {
+		dr_dbg(pool->dmn, "Failed allocating DM\n");
+		goto free_icm_mr;
+	}
+
+	/* Register device memory */
+	icm_mr->mr = ibv_reg_dm_mr(pool->dmn->pd, icm_mr->dm, 0,
+				   dm_attr.length,
+				   IBV_ACCESS_ZERO_BASED |
+				   IBV_ACCESS_REMOTE_WRITE |
+				   IBV_ACCESS_LOCAL_WRITE |
+				   IBV_ACCESS_REMOTE_READ);
+	if (!icm_mr->mr) {
+		dr_dbg(pool->dmn, "Failed DM registration\n");
+		goto free_dm;
+	}
+
+	dm = to_mdm(icm_mr->dm);
+	icm_mr->icm_start_addr = dm->remote_va;
+
+	align_diff = icm_mr->icm_start_addr % align_base;
+	if (align_diff)
+		icm_mr->used_length =
align_base - align_diff; + + list_add_tail(&pool->icm_mr_list, &icm_mr->mr_list); + + return icm_mr; + +free_dm: + mlx5_free_dm(icm_mr->dm); +free_icm_mr: + free(icm_mr); + return NULL; +} + +static void dr_icm_pool_mr_destroy(struct dr_icm_mr *icm_mr) +{ + list_del(&icm_mr->mr_list); + ibv_dereg_mr(icm_mr->mr); + mlx5_free_dm(icm_mr->dm); + free(icm_mr); +} + +static int dr_icm_chunk_ste_init(struct dr_icm_chunk *chunk) +{ + struct dr_icm_bucket *bucket = chunk->bucket; + struct dr_icm_pool *pool = bucket->pool; + + chunk->ste_arr = calloc(bucket->num_of_entries, sizeof(struct dr_ste)); + if (!chunk->ste_arr) { + dr_dbg(pool->dmn, "Failed allocating ste_arr for chunk\n"); + errno = ENOMEM; + return errno; + } + + chunk->hw_ste_arr = calloc(bucket->num_of_entries, DR_STE_SIZE_REDUCED); + if (!chunk->hw_ste_arr) { + dr_dbg(pool->dmn, "Failed allocating hw_ste_arr for chunk\n"); + errno = ENOMEM; + goto out_free_ste_arr; + } + + chunk->miss_list = malloc(bucket->num_of_entries * + sizeof(struct list_head)); + if (!chunk->miss_list) { + dr_dbg(pool->dmn, "Failed allocating miss_list for chunk\n"); + errno = ENOMEM; + goto out_free_hw_ste_arr; + } + + return 0; + +out_free_hw_ste_arr: + free(chunk->hw_ste_arr); +out_free_ste_arr: + free(chunk->ste_arr); + return errno; +} + +static int dr_icm_chunks_create(struct dr_icm_bucket *bucket) +{ + size_t mr_free_size, mr_req_size, mr_row_size; + struct dr_icm_pool *pool = bucket->pool; + enum mlx5_ib_uapi_dm_type dm_type; + struct dr_icm_chunk *chunk; + struct dr_icm_mr *icm_mr; + size_t align_base; + int i; + + mr_req_size = bucket->num_of_entries * bucket->entry_size; + mr_row_size = dr_icm_pool_chunk_size_to_byte(pool->max_log_chunk_sz, + pool->icm_type); + + if (pool->icm_type == DR_ICM_TYPE_STE) { + dm_type = MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM; + /* Align base is the biggest chunk size / row size */ + align_base = mr_row_size; + } else { + dm_type = MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM; + /* Align base is 64B */ + align_base = DR_ICM_MODIFY_HDR_ALIGN_BASE; + } + + pthread_mutex_lock(&pool->mr_mutex); + icm_mr = list_tail(&pool->icm_mr_list, struct dr_icm_mr, mr_list); + if (icm_mr) + mr_free_size = icm_mr->mr->length - icm_mr->used_length; + + if (!icm_mr || mr_free_size < mr_row_size) { + icm_mr = dr_icm_pool_mr_create(pool, dm_type, align_base); + if (!icm_mr) + goto out_err; + } + + /* Create memory aligned chunks */ + for (i = 0; i < mr_row_size / mr_req_size; i++) { + chunk = calloc(1, sizeof(struct dr_icm_chunk)); + if (!chunk) { + errno = ENOMEM; + goto out_err; + } + + chunk->bucket = bucket; + chunk->rkey = icm_mr->mr->rkey; + chunk->mr_addr = (uintptr_t)icm_mr->mr->addr + icm_mr->used_length; + chunk->icm_addr = (uintptr_t)icm_mr->icm_start_addr + icm_mr->used_length; + icm_mr->used_length += mr_req_size; + chunk->num_of_entries = bucket->num_of_entries; + chunk->byte_size = chunk->num_of_entries * bucket->entry_size; + + if (pool->icm_type == DR_ICM_TYPE_STE) + if (dr_icm_chunk_ste_init(chunk)) + goto out_free_chunk; + + list_node_init(&chunk->chunk_list); + list_add(&bucket->free_list, &chunk->chunk_list); + bucket->free_list_count++; + bucket->total_chunks++; + } + pthread_mutex_unlock(&pool->mr_mutex); + return 0; + +out_free_chunk: + free(chunk); +out_err: + pthread_mutex_unlock(&pool->mr_mutex); + return errno; +} + +static void dr_icm_chunk_ste_cleanup(struct dr_icm_chunk *chunk) +{ + free(chunk->miss_list); + free(chunk->hw_ste_arr); + free(chunk->ste_arr); +} + +static void dr_icm_chunk_destroy(struct dr_icm_chunk 
*chunk) +{ + struct dr_icm_bucket *bucket = chunk->bucket; + + list_del(&chunk->chunk_list); + bucket->total_chunks--; + + if (bucket->pool->icm_type == DR_ICM_TYPE_STE) + dr_icm_chunk_ste_cleanup(chunk); + + free(chunk); +} + +static void dr_icm_bucket_init(struct dr_icm_pool *pool, + struct dr_icm_bucket *bucket, + enum dr_icm_chunk_size chunk_size) +{ + if (pool->icm_type == DR_ICM_TYPE_STE) + bucket->entry_size = DR_STE_SIZE; + else + bucket->entry_size = DR_MODIFY_ACTION_SIZE; + + bucket->num_of_entries = dr_icm_pool_chunk_size_to_entries(chunk_size); + bucket->pool = pool; + pthread_mutex_init(&bucket->mutex, NULL); + list_head_init(&bucket->free_list); + list_head_init(&bucket->used_list); + list_head_init(&bucket->hot_list); + list_head_init(&bucket->tmp_list); +} + +static void dr_icm_bucket_cleanup(struct dr_icm_bucket *bucket) +{ + struct dr_icm_chunk *chunk, *next; + + pthread_mutex_destroy(&bucket->mutex); + list_append_list(&bucket->free_list, &bucket->tmp_list); + list_append_list(&bucket->free_list, &bucket->hot_list); + + list_for_each_safe(&bucket->free_list, chunk, next, chunk_list) + dr_icm_chunk_destroy(chunk); + + assert(bucket->total_chunks == 0); + + /* Cleanup of unreturned chunks */ + list_for_each_safe(&bucket->used_list, chunk, next, chunk_list) + dr_icm_chunk_destroy(chunk); +} + +static uint64_t dr_icm_hot_mem_size(struct dr_icm_pool *pool) +{ + uint64_t hot_size = 0; + int i; + + for (i = 0; i < pool->num_of_buckets; i++) + hot_size += pool->buckets[i].hot_list_count * + dr_icm_pool_chunk_size_to_byte(i, pool->icm_type); + + return hot_size; +} + +static bool dr_icm_reuse_hot_entries(struct dr_icm_pool *pool, + struct dr_icm_bucket *bucket) +{ + uint64_t bytes_for_sync; + + bytes_for_sync = dr_icm_hot_mem_size(pool); + if (bytes_for_sync < DR_ICM_SYNC_THRESHOLD || !bucket->hot_list_count) + return false; + + return true; +} + +static void dr_icm_chill_bucket_start(struct dr_icm_bucket *bucket) +{ + list_append_list(&bucket->tmp_list, &bucket->hot_list); + bucket->tmp_list_count += bucket->hot_list_count; + bucket->hot_list_count = 0; +} + +static void dr_icm_chill_bucket_end(struct dr_icm_bucket *bucket) +{ + list_append_list(&bucket->free_list, &bucket->tmp_list); + bucket->free_list_count += bucket->tmp_list_count; + bucket->tmp_list_count = 0; +} + +static void dr_icm_chill_bucket_abort(struct dr_icm_bucket *bucket) +{ + list_append_list(&bucket->hot_list, &bucket->tmp_list); + bucket->hot_list_count += bucket->tmp_list_count; + bucket->tmp_list_count = 0; +} + +static void dr_icm_chill_buckets_start(struct dr_icm_pool *pool, + struct dr_icm_bucket *cb, + bool bucks[DR_CHUNK_SIZE_MAX]) +{ + struct dr_icm_bucket *bucket; + int i; + + for (i = 0; i < pool->num_of_buckets; i++) { + bucket = &pool->buckets[i]; + if (bucket == cb) { + dr_icm_chill_bucket_start(bucket); + continue; + } + + /* Freeing the mutex is done at the end of that process, after + * sync_ste was executed at dr_icm_chill_buckets_end func. 
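		 * Buckets whose mutex is already held by a concurrent caller
		 * are simply skipped by the trylock below; only the buckets
		 * marked in bucks[] take part in the sync and are unlocked
		 * afterwards.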
+ */ + if (!pthread_mutex_trylock(&bucket->mutex)) { + dr_icm_chill_bucket_start(bucket); + bucks[i] = true; + } + } +} + +static void dr_icm_chill_buckets_end(struct dr_icm_pool *pool, + struct dr_icm_bucket *cb, + bool bucks[DR_CHUNK_SIZE_MAX]) +{ + struct dr_icm_bucket *bucket; + int i; + + for (i = 0; i < pool->num_of_buckets; i++) { + bucket = &pool->buckets[i]; + if (bucket == cb) { + dr_icm_chill_bucket_end(bucket); + continue; + } + + if (!bucks[i]) + continue; + + dr_icm_chill_bucket_end(bucket); + pthread_mutex_unlock(&bucket->mutex); + } +} + +static void dr_icm_chill_buckets_abort(struct dr_icm_pool *pool, + struct dr_icm_bucket *cb, + bool bucks[DR_CHUNK_SIZE_MAX]) +{ + struct dr_icm_bucket *bucket; + int i; + + for (i = 0; i < pool->num_of_buckets; i++) { + bucket = &pool->buckets[i]; + if (bucket == cb) { + dr_icm_chill_bucket_abort(bucket); + continue; + } + + if (!bucks[i]) + continue; + + dr_icm_chill_bucket_abort(bucket); + pthread_mutex_unlock(&bucket->mutex); + } +} + +/* Allocate an ICM chunk, each chunk holds a piece of ICM memory and + * also memory used for HW STE management for optimisations. + */ +struct dr_icm_chunk *dr_icm_alloc_chunk(struct dr_icm_pool *pool, + enum dr_icm_chunk_size chunk_size) +{ + bool bucks[DR_CHUNK_SIZE_MAX] = {}; + struct dr_icm_bucket *bucket; + struct dr_icm_chunk *chunk; + int err; + + if (chunk_size > pool->max_log_chunk_sz) { + errno = EINVAL; + return NULL; + } + + bucket = &pool->buckets[chunk_size]; + + pthread_mutex_lock(&bucket->mutex); + + /* Take chunk from pool if available, otherwise allocate new chunks */ + if (list_empty(&bucket->free_list)) { + if (dr_icm_reuse_hot_entries(pool, bucket)) { + dr_icm_chill_buckets_start(pool, bucket, bucks); + err = dr_devx_sync_steering(pool->dmn->ctx); + if (err) { + dr_icm_chill_buckets_abort(pool, bucket, bucks); + dr_dbg(pool->dmn, "Sync_steering failed\n"); + chunk = NULL; + goto out; + } + dr_icm_chill_buckets_end(pool, bucket, bucks); + } else { + dr_icm_chunks_create(bucket); + } + } + + chunk = list_tail(&bucket->free_list, struct dr_icm_chunk, chunk_list); + if (chunk) { + list_del_init(&chunk->chunk_list); + list_add_tail(&bucket->used_list, &chunk->chunk_list); + bucket->free_list_count--; + bucket->used_list_count++; + } +out: + pthread_mutex_unlock(&bucket->mutex); + return chunk; +} + +void dr_icm_free_chunk(struct dr_icm_chunk *chunk) +{ + struct dr_icm_bucket *bucket = chunk->bucket; + + if (bucket->pool->icm_type == DR_ICM_TYPE_STE) { + memset(chunk->ste_arr, 0, + bucket->num_of_entries * sizeof(struct dr_ste)); + memset(chunk->hw_ste_arr, 0, + bucket->num_of_entries * DR_STE_SIZE_REDUCED); + } + + pthread_mutex_lock(&bucket->mutex); + list_del_init(&chunk->chunk_list); + list_add_tail(&bucket->hot_list, &chunk->chunk_list); + bucket->hot_list_count++; + bucket->used_list_count--; + pthread_mutex_unlock(&bucket->mutex); +} + +struct dr_icm_pool *dr_icm_pool_create(struct mlx5dv_dr_domain *dmn, + enum dr_icm_type icm_type) +{ + enum dr_icm_chunk_size max_log_chunk_sz; + struct dr_icm_pool *pool; + int i; + + if (icm_type == DR_ICM_TYPE_STE) + max_log_chunk_sz = dmn->info.max_log_sw_icm_sz; + else + max_log_chunk_sz = dmn->info.max_log_action_icm_sz; + + pool = calloc(1, sizeof(struct dr_icm_pool)); + if (!pool) { + errno = ENOMEM; + return NULL; + } + + pool->buckets = calloc(max_log_chunk_sz + 1, sizeof(struct dr_icm_bucket)); + if (!pool->buckets) { + errno = ENOMEM; + goto free_pool; + } + + pool->dmn = dmn; + pool->icm_type = icm_type; + pool->max_log_chunk_sz = 
max_log_chunk_sz; + pool->num_of_buckets = max_log_chunk_sz + 1; + list_head_init(&pool->icm_mr_list); + + for (i = 0; i < pool->num_of_buckets; i++) + dr_icm_bucket_init(pool, &pool->buckets[i], i); + + pthread_mutex_init(&pool->mr_mutex, NULL); + + return pool; + +free_pool: + free(pool); + return NULL; +} + +void dr_icm_pool_destroy(struct dr_icm_pool *pool) +{ + struct dr_icm_mr *icm_mr, *next; + int i; + + pthread_mutex_destroy(&pool->mr_mutex); + + list_for_each_safe(&pool->icm_mr_list, icm_mr, next, mr_list) + dr_icm_pool_mr_destroy(icm_mr); + + for (i = 0; i < pool->num_of_buckets; i++) + dr_icm_bucket_cleanup(&pool->buckets[i]); + + free(pool->buckets); + free(pool); +} diff --git a/providers/mlx5/dr_matcher.c b/providers/mlx5/dr_matcher.c new file mode 100644 index 0000000..717ee9b --- /dev/null +++ b/providers/mlx5/dr_matcher.c @@ -0,0 +1,867 @@ +/* + * Copyright (c) 2019, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <stdlib.h> +#include "mlx5dv_dr.h" + +#define DR_MASK_IPV4_ETHERTYPE 0x0800 +#define DR_MASK_IPV6_ETHERTYPE 0x86DD +#define DR_MASK_IP_VERSION_IPV4 0x4 +#define DR_MASK_IP_VERSION_IPV6 0x6 + +static bool dr_mask_is_smac_set(struct dr_match_spec *spec) +{ + return (spec->smac_47_16 || spec->smac_15_0); +} + +static bool dr_mask_is_dmac_set(struct dr_match_spec *spec) +{ + return (spec->dmac_47_16 || spec->dmac_15_0); +} + +static bool dr_mask_is_src_addr_set(struct dr_match_spec *spec) +{ + return (spec->src_ip_127_96 || spec->src_ip_95_64 || + spec->src_ip_63_32 || spec->src_ip_31_0); +} + +static bool dr_mask_is_dst_addr_set(struct dr_match_spec *spec) +{ + return (spec->dst_ip_127_96 || spec->dst_ip_95_64 || + spec->dst_ip_63_32 || spec->dst_ip_31_0); +} + +static bool dr_mask_is_l3_base_set(struct dr_match_spec *spec) +{ + return (spec->ip_protocol || spec->frag || spec->tcp_flags || + spec->ip_ecn || spec->ip_dscp); +} + +static bool dr_mask_is_tcp_udp_base_set(struct dr_match_spec *spec) +{ + return (spec->tcp_sport || spec->tcp_dport || + spec->udp_sport || spec->udp_dport); +} + +static bool dr_mask_is_ipv4_set(struct dr_match_spec *spec) +{ + return (spec->dst_ip_31_0 || spec->src_ip_31_0); +} + +static bool dr_mask_is_ipv4_5_tuple_set(struct dr_match_spec *spec) +{ + return (dr_mask_is_l3_base_set(spec) || + dr_mask_is_tcp_udp_base_set(spec) || + dr_mask_is_ipv4_set(spec)); +} + +static bool dr_mask_is_eth_l2_tnl_set(struct dr_match_misc *misc) +{ + return misc->vxlan_vni; +} + +static bool dr_mask_is_ttl_set(struct dr_match_spec *spec) +{ + return spec->ip_ttl_hoplimit; +} + +#define DR_MASK_IS_L2_DST(_spec, _misc, _inner_outer) (_spec.first_vid || \ + (_spec).first_cfi || (_spec).first_prio || (_spec).cvlan_tag || \ + (_spec).svlan_tag || (_spec).dmac_47_16 || (_spec).dmac_15_0 || \ + (_spec).ethertype || (_spec).ip_version || \ + (_misc)._inner_outer##_second_vid || \ + (_misc)._inner_outer##_second_cfi || \ + (_misc)._inner_outer##_second_prio || \ + (_misc)._inner_outer##_second_cvlan_tag || \ + (_misc)._inner_outer##_second_svlan_tag) + +#define DR_MASK_IS_ETH_L4_SET(_spec, _misc, _inner_outer) ( \ + dr_mask_is_l3_base_set(&(_spec)) || \ + dr_mask_is_tcp_udp_base_set(&(_spec)) || \ + dr_mask_is_ttl_set(&(_spec)) || \ + (_misc)._inner_outer##_ipv6_flow_label) + +#define DR_MASK_IS_ETH_L4_MISC_SET(_misc3, _inner_outer) ( \ + (_misc3)._inner_outer##_tcp_seq_num || \ + (_misc3)._inner_outer##_tcp_ack_num) + +#define DR_MASK_IS_FIRST_MPLS_SET(_misc2, _inner_outer) ( \ + (_misc2)._inner_outer##_first_mpls_label || \ + (_misc2)._inner_outer##_first_mpls_exp || \ + (_misc2)._inner_outer##_first_mpls_s_bos || \ + (_misc2)._inner_outer##_first_mpls_ttl) + +static bool dr_mask_is_gre_set(struct dr_match_misc *misc) +{ + return (misc->gre_key_h || misc->gre_key_l || + misc->gre_protocol || misc->gre_c_present || + misc->gre_k_present || misc->gre_s_present); +} + +#define DR_MASK_IS_OUTER_MPLS_OVER_GRE_UDP_SET(_misc2, gre_udp) ( \ + (_misc2).outer_first_mpls_over_##gre_udp##_label || \ + (_misc2).outer_first_mpls_over_##gre_udp##_exp || \ + (_misc2).outer_first_mpls_over_##gre_udp##_s_bos || \ + (_misc2).outer_first_mpls_over_##gre_udp##_ttl) + +#define DR_MASK_IS_FLEX_PARSER_0_SET(_misc2) ( \ + DR_MASK_IS_OUTER_MPLS_OVER_GRE_UDP_SET(_misc2, gre) || \ + DR_MASK_IS_OUTER_MPLS_OVER_GRE_UDP_SET(_misc2, udp)) + +static bool +dr_mask_is_misc3_vxlan_gpe_set(struct dr_match_misc3 *misc3) +{ + return misc3->outer_vxlan_gpe_vni || + misc3->outer_vxlan_gpe_next_protocol || + 
misc3->outer_vxlan_gpe_flags; +} + +static bool +dr_matcher_supp_flex_parser_vxlan_gpe(struct dr_devx_caps *caps) +{ + return caps->flex_protocols & + MLX5_FLEX_PARSER_VXLAN_GPE_ENABLED; +} + +static bool +dr_mask_is_flex_parser_tnl_vxlan_gpe_set(struct dr_match_param *mask, + struct mlx5dv_dr_domain *dmn) +{ + return dr_mask_is_misc3_vxlan_gpe_set(&mask->misc3) && + dr_matcher_supp_flex_parser_vxlan_gpe(&dmn->info.caps); +} + +static bool dr_mask_is_misc_geneve_set(struct dr_match_misc *misc) +{ + return misc->geneve_vni || + misc->geneve_oam || + misc->geneve_protocol_type || + misc->geneve_opt_len; +} + +static bool +dr_matcher_supp_flex_parser_geneve(struct dr_devx_caps *caps) +{ + return caps->flex_protocols & + MLX5_FLEX_PARSER_GENEVE_ENABLED; +} + +static bool +dr_mask_is_flex_parser_tnl_geneve_set(struct dr_match_param *mask, + struct mlx5dv_dr_domain *dmn) +{ + return dr_mask_is_misc_geneve_set(&mask->misc) && + dr_matcher_supp_flex_parser_geneve(&dmn->info.caps); +} + +static bool dr_mask_is_misc3_gtpu_set(struct dr_match_misc3 *misc3) +{ + return misc3->gtpu_flags || + misc3->gtpu_msg_type || + misc3->gtpu_teid; +} + +static bool dr_matcher_supp_flex_parser_gtpu(struct dr_devx_caps *caps) +{ + return caps->flex_protocols & + MLX5_FLEX_PARSER_GTPU_ENABLED; +} + +static bool dr_mask_is_flex_parser_tnl_gtpu_set(struct dr_match_param *mask, + struct mlx5dv_dr_domain *dmn) +{ + return dr_mask_is_misc3_gtpu_set(&mask->misc3) && + dr_matcher_supp_flex_parser_gtpu(&dmn->info.caps); +} + +static bool dr_mask_is_flex_parser_icmpv6_set(struct dr_match_misc3 *misc3) +{ + return (misc3->icmpv6_type || misc3->icmpv6_code || + misc3->icmpv6_header_data); +} + +static bool dr_mask_is_wqe_metadata_set(struct dr_match_misc2 *misc2) +{ + return misc2->metadata_reg_a; +} + +static bool dr_mask_is_reg_c_0_3_set(struct dr_match_misc2 *misc2) +{ + return (misc2->metadata_reg_c_0 || misc2->metadata_reg_c_1 || + misc2->metadata_reg_c_2 || misc2->metadata_reg_c_3); +} + +static bool dr_mask_is_reg_c_4_7_set(struct dr_match_misc2 *misc2) +{ + return (misc2->metadata_reg_c_4 || misc2->metadata_reg_c_5 || + misc2->metadata_reg_c_6 || misc2->metadata_reg_c_7); +} + +static bool dr_mask_is_gvmi_or_qpn_set(struct dr_match_misc *misc) +{ + return (misc->source_sqn || misc->source_port); +} + +static int dr_matcher_set_ste_builders(struct mlx5dv_dr_matcher *matcher, + struct dr_matcher_rx_tx *nic_matcher) +{ + struct dr_domain_rx_tx *nic_dmn = nic_matcher->nic_tbl->nic_dmn; + struct dr_ste_build *sb = nic_matcher->ste_builder; + struct mlx5dv_dr_domain *dmn = matcher->tbl->dmn; + struct dr_match_param mask = {}; + struct dr_match_misc3 *misc3; + bool inner, rx; + uint8_t ipv; + int idx = 0; + int ret, i; + + rx = nic_dmn->ste_type == DR_STE_TYPE_RX; + + /* Create a temporary mask to track and clear used mask fields */ + if (matcher->match_criteria & DR_MATCHER_CRITERIA_OUTER) + mask.outer = matcher->mask.outer; + + if (matcher->match_criteria & DR_MATCHER_CRITERIA_MISC) + mask.misc = matcher->mask.misc; + + if (matcher->match_criteria & DR_MATCHER_CRITERIA_INNER) + mask.inner = matcher->mask.inner; + + if (matcher->match_criteria & DR_MATCHER_CRITERIA_MISC2) + mask.misc2 = matcher->mask.misc2; + + if (matcher->match_criteria & DR_MATCHER_CRITERIA_MISC3) + mask.misc3 = matcher->mask.misc3; + + ret = dr_ste_build_pre_check(dmn, matcher->match_criteria, + &matcher->mask, NULL); + if (ret) + return ret; + + /* Outer */ + if (matcher->match_criteria & (DR_MATCHER_CRITERIA_OUTER | + DR_MATCHER_CRITERIA_MISC | + 
DR_MATCHER_CRITERIA_MISC2 | + DR_MATCHER_CRITERIA_MISC3)) { + inner = false; + ipv = mask.outer.ip_version; + + if (dr_mask_is_wqe_metadata_set(&mask.misc2)) + dr_ste_build_general_purpose(&sb[idx++], &mask, inner, rx); + + if (dr_mask_is_reg_c_0_3_set(&mask.misc2)) + dr_ste_build_register_0(&sb[idx++], &mask, inner, rx); + + if (dr_mask_is_reg_c_4_7_set(&mask.misc2)) + dr_ste_build_register_1(&sb[idx++], &mask, inner, rx); + + if (dr_mask_is_gvmi_or_qpn_set(&mask.misc) && + (dmn->type == MLX5DV_DR_DOMAIN_TYPE_FDB || + dmn->type == MLX5DV_DR_DOMAIN_TYPE_NIC_RX)) { + ret = dr_ste_build_src_gvmi_qpn(&sb[idx++], &mask, + &dmn->info.caps, + inner, rx); + if (ret) + return ret; + } + + if (dr_mask_is_smac_set(&mask.outer) && + dr_mask_is_dmac_set(&mask.outer)) { + ret = dr_ste_build_eth_l2_src_des(&sb[idx++], &mask, + inner, rx); + if (ret) + return ret; + } + + if (dr_mask_is_smac_set(&mask.outer)) + dr_ste_build_eth_l2_src(&sb[idx++], &mask, inner, rx); + + if (DR_MASK_IS_L2_DST(mask.outer, mask.misc, outer)) + dr_ste_build_eth_l2_dst(&sb[idx++], &mask, inner, rx); + + if (ipv == 4) { + if (dr_mask_is_ipv4_5_tuple_set(&mask.outer)) + dr_ste_build_eth_l3_ipv4_5_tuple(&sb[idx++], &mask, + inner, rx); + + if (dr_mask_is_ttl_set(&mask.outer)) + dr_ste_build_eth_l3_ipv4_misc(&sb[idx++], &mask, + inner, rx); + } else if (ipv == 6) { + if (dr_mask_is_dst_addr_set(&mask.outer)) + dr_ste_build_eth_l3_ipv6_dst(&sb[idx++], &mask, + inner, rx); + + if (dr_mask_is_src_addr_set(&mask.outer)) + dr_ste_build_eth_l3_ipv6_src(&sb[idx++], &mask, + inner, rx); + + if (DR_MASK_IS_ETH_L4_SET(mask.outer, mask.misc, outer)) + dr_ste_build_ipv6_l3_l4(&sb[idx++], &mask, + inner, rx); + } + + if (dr_mask_is_flex_parser_tnl_vxlan_gpe_set(&mask, dmn)) + dr_ste_build_flex_parser_tnl_vxlan_gpe(&sb[idx++], &mask, + inner, rx); + else if (dr_mask_is_flex_parser_tnl_geneve_set(&mask, dmn)) + dr_ste_build_flex_parser_tnl_geneve(&sb[idx++], &mask, + inner, rx); + else if (dr_mask_is_flex_parser_tnl_gtpu_set(&mask, dmn)) + dr_ste_build_flex_parser_tnl_gtpu(&sb[idx++], &mask, + inner, rx); + + if (DR_MASK_IS_ETH_L4_MISC_SET(mask.misc3, outer)) + dr_ste_build_eth_l4_misc(&sb[idx++], &mask, inner, rx); + + if (DR_MASK_IS_FIRST_MPLS_SET(mask.misc2, outer)) + dr_ste_build_mpls(&sb[idx++], &mask, inner, rx); + + if (DR_MASK_IS_FLEX_PARSER_0_SET(mask.misc2)) + dr_ste_build_flex_parser_0(&sb[idx++], &mask, inner, rx); + + misc3 = &mask.misc3; + if ((DR_MASK_IS_FLEX_PARSER_ICMPV4_SET(misc3) && + dr_matcher_supp_flex_parser_icmp_v4(&dmn->info.caps)) || + (dr_mask_is_flex_parser_icmpv6_set(&mask.misc3) && + dr_matcher_supp_flex_parser_icmp_v6(&dmn->info.caps))) { + ret = dr_ste_build_flex_parser_1(&sb[idx++], + &mask, &dmn->info.caps, + inner, rx); + if (ret) + return ret; + } + if (dr_mask_is_gre_set(&mask.misc)) + dr_ste_build_gre(&sb[idx++], &mask, inner, rx); + } + + /* Inner */ + if (matcher->match_criteria & (DR_MATCHER_CRITERIA_INNER | + DR_MATCHER_CRITERIA_MISC | + DR_MATCHER_CRITERIA_MISC2 | + DR_MATCHER_CRITERIA_MISC3)) { + inner = true; + ipv = mask.inner.ip_version; + + if (dr_mask_is_eth_l2_tnl_set(&mask.misc)) + dr_ste_build_eth_l2_tnl(&sb[idx++], &mask, inner, rx); + + if (dr_mask_is_smac_set(&mask.inner) && + dr_mask_is_dmac_set(&mask.inner)) { + ret = dr_ste_build_eth_l2_src_des(&sb[idx++], + &mask, inner, rx); + if (ret) + return ret; + } + + if (dr_mask_is_smac_set(&mask.inner)) + dr_ste_build_eth_l2_src(&sb[idx++], &mask, inner, rx); + + if (DR_MASK_IS_L2_DST(mask.inner, mask.misc, inner)) + 
dr_ste_build_eth_l2_dst(&sb[idx++], &mask, inner, rx); + + if (ipv == 4) { + if (dr_mask_is_ipv4_5_tuple_set(&mask.inner)) + dr_ste_build_eth_l3_ipv4_5_tuple(&sb[idx++], &mask, + inner, rx); + + if (dr_mask_is_ttl_set(&mask.inner)) + dr_ste_build_eth_l3_ipv4_misc(&sb[idx++], &mask, + inner, rx); + } else if (ipv == 6) { + if (dr_mask_is_dst_addr_set(&mask.inner)) + dr_ste_build_eth_l3_ipv6_dst(&sb[idx++], &mask, + inner, rx); + + if (dr_mask_is_src_addr_set(&mask.inner)) + dr_ste_build_eth_l3_ipv6_src(&sb[idx++], &mask, + inner, rx); + + if (DR_MASK_IS_ETH_L4_SET(mask.inner, mask.misc, inner)) + dr_ste_build_ipv6_l3_l4(&sb[idx++], &mask, + inner, rx); + } + + if (DR_MASK_IS_ETH_L4_MISC_SET(mask.misc3, inner)) + dr_ste_build_eth_l4_misc(&sb[idx++], &mask, inner, rx); + + if (DR_MASK_IS_FIRST_MPLS_SET(mask.misc2, inner)) + dr_ste_build_mpls(&sb[idx++], &mask, inner, rx); + + if (DR_MASK_IS_FLEX_PARSER_0_SET(mask.misc2)) + dr_ste_build_flex_parser_0(&sb[idx++], &mask, inner, rx); + } + /* Empty matcher, takes all */ + if (matcher->match_criteria == DR_MATCHER_CRITERIA_EMPTY) + dr_ste_build_empty_always_hit(&sb[idx++], rx); + + if (idx == 0) { + dr_dbg(dmn, "Cannot generate any valid rules from mask\n"); + errno = EINVAL; + return errno; + } + + nic_matcher->num_of_builders = idx; + + /* Check that all mask fields were consumed */ + for (i = 0; i < sizeof(struct dr_match_param); i++) { + if (((uint8_t *)&mask)[i] != 0) { + dr_dbg(dmn, "Mask contains unsupported parameters\n"); + errno = EOPNOTSUPP; + return errno; + } + } + return 0; +} + +static int dr_matcher_connect(struct mlx5dv_dr_domain *dmn, + struct dr_matcher_rx_tx *curr_nic_matcher, + struct dr_matcher_rx_tx *next_nic_matcher, + struct dr_matcher_rx_tx *prev_nic_matcher) +{ + struct dr_table_rx_tx *nic_tbl = curr_nic_matcher->nic_tbl; + struct dr_domain_rx_tx *nic_dmn = nic_tbl->nic_dmn; + struct dr_htbl_connect_info info; + struct dr_ste_htbl *prev_htbl; + int ret; + + /* Connect end anchor hash table to next_htbl or to the default address */ + if (next_nic_matcher) { + info.type = CONNECT_HIT; + info.hit_next_htbl = next_nic_matcher->s_htbl; + } else { + info.type = CONNECT_MISS; + info.miss_icm_addr = nic_dmn->default_icm_addr; + } + ret = dr_ste_htbl_init_and_postsend(dmn, nic_dmn, + curr_nic_matcher->e_anchor, + &info, info.type == CONNECT_HIT); + if (ret) + return ret; + + /* Connect start hash table to end anchor */ + info.type = CONNECT_MISS; + info.miss_icm_addr = curr_nic_matcher->e_anchor->chunk->icm_addr; + ret = dr_ste_htbl_init_and_postsend(dmn, nic_dmn, + curr_nic_matcher->s_htbl, + &info, false); + if (ret) + return ret; + + /* Connect previous hash table to matcher start hash table */ + if (prev_nic_matcher) + prev_htbl = prev_nic_matcher->e_anchor; + else + prev_htbl = nic_tbl->s_anchor; + + info.type = CONNECT_HIT; + info.hit_next_htbl = curr_nic_matcher->s_htbl; + ret = dr_ste_htbl_init_and_postsend(dmn, nic_dmn, prev_htbl, + &info, true); + if (ret) + return ret; + + /* Update the pointing ste and next hash table */ + curr_nic_matcher->s_htbl->pointing_ste = prev_htbl->ste_arr; + prev_htbl->ste_arr[0].next_htbl = curr_nic_matcher->s_htbl; + + if (next_nic_matcher) { + next_nic_matcher->s_htbl->pointing_ste = curr_nic_matcher->e_anchor->ste_arr; + curr_nic_matcher->e_anchor->ste_arr[0].next_htbl = next_nic_matcher->s_htbl; + } + + return 0; +} + +static int dr_matcher_add_to_tbl(struct mlx5dv_dr_matcher *matcher) +{ + struct mlx5dv_dr_matcher *next_matcher, *prev_matcher, *tmp_matcher; + struct mlx5dv_dr_table 
*tbl = matcher->tbl; + struct mlx5dv_dr_domain *dmn = tbl->dmn; + int ret; + + if (dr_is_root_table(matcher->tbl)) + return 0; + + next_matcher = NULL; + + list_for_each(&tbl->matcher_list, tmp_matcher, matcher_list) + if (tmp_matcher->prio >= matcher->prio) { + next_matcher = tmp_matcher; + break; + } + + if (next_matcher) + prev_matcher = list_prev(&tbl->matcher_list, + next_matcher, + matcher_list); + else + prev_matcher = list_tail(&tbl->matcher_list, + struct mlx5dv_dr_matcher, + matcher_list); + + if (dmn->type == MLX5DV_DR_DOMAIN_TYPE_FDB || + dmn->type == MLX5DV_DR_DOMAIN_TYPE_NIC_RX) { + ret = dr_matcher_connect(dmn, &matcher->rx, + next_matcher ? &next_matcher->rx : NULL, + prev_matcher ? &prev_matcher->rx : NULL); + if (ret) + return ret; + } + + if (dmn->type == MLX5DV_DR_DOMAIN_TYPE_FDB || + dmn->type == MLX5DV_DR_DOMAIN_TYPE_NIC_TX) { + ret = dr_matcher_connect(dmn, &matcher->tx, + next_matcher ? &next_matcher->tx : NULL, + prev_matcher ? &prev_matcher->tx : NULL); + if (ret) + return ret; + } + + if (prev_matcher) + list_add_after(&tbl->matcher_list, + &prev_matcher->matcher_list, + &matcher->matcher_list); + else if (next_matcher) + list_add_before(&tbl->matcher_list, + &next_matcher->matcher_list, + &matcher->matcher_list); + else + list_add(&tbl->matcher_list, &matcher->matcher_list); + + return 0; +} + +static void dr_matcher_uninit_nic(struct dr_matcher_rx_tx *nic_matcher) +{ + dr_htbl_put(nic_matcher->s_htbl); + dr_htbl_put(nic_matcher->e_anchor); +} + +static void dr_matcher_uninit_fdb(struct mlx5dv_dr_matcher *matcher) +{ + dr_matcher_uninit_nic(&matcher->rx); + dr_matcher_uninit_nic(&matcher->tx); +} + +static int dr_matcher_uninit_root(struct mlx5dv_dr_matcher *matcher) +{ + return mlx5dv_destroy_flow_matcher(matcher->dv_matcher); +} + +static void dr_matcher_uninit(struct mlx5dv_dr_matcher *matcher) +{ + struct mlx5dv_dr_domain *dmn = matcher->tbl->dmn; + + if (dr_is_root_table(matcher->tbl)) { + dr_matcher_uninit_root(matcher); + return; + } + + switch (dmn->type) { + case MLX5DV_DR_DOMAIN_TYPE_NIC_RX: + dr_matcher_uninit_nic(&matcher->rx); + break; + case MLX5DV_DR_DOMAIN_TYPE_NIC_TX: + dr_matcher_uninit_nic(&matcher->tx); + break; + case MLX5DV_DR_DOMAIN_TYPE_FDB: + dr_matcher_uninit_fdb(matcher); + break; + default: + assert(false); + break; + } +} + +static int dr_matcher_init_nic(struct mlx5dv_dr_matcher *matcher, + struct dr_matcher_rx_tx *nic_matcher) +{ + struct mlx5dv_dr_domain *dmn = matcher->tbl->dmn; + int ret; + + ret = dr_matcher_set_ste_builders(matcher, nic_matcher); + if (ret) + return ret; + + nic_matcher->e_anchor = dr_ste_htbl_alloc(dmn->ste_icm_pool, + DR_CHUNK_SIZE_1, + DR_STE_LU_TYPE_DONT_CARE, + 0); + if (!nic_matcher->e_anchor) + return errno; + + nic_matcher->s_htbl = dr_ste_htbl_alloc(dmn->ste_icm_pool, + DR_CHUNK_SIZE_1, + nic_matcher->ste_builder[0].lu_type, + nic_matcher->ste_builder[0].byte_mask); + if (!nic_matcher->s_htbl) + goto free_e_htbl; + + /* make sure the tables exist while empty */ + dr_htbl_get(nic_matcher->s_htbl); + dr_htbl_get(nic_matcher->e_anchor); + + return 0; + +free_e_htbl: + dr_ste_htbl_free(nic_matcher->e_anchor); + return errno; +} + +static int dr_matcher_init_fdb(struct mlx5dv_dr_matcher *matcher) +{ + int ret; + + ret = dr_matcher_init_nic(matcher, &matcher->rx); + if (ret) + return ret; + + ret = dr_matcher_init_nic(matcher, &matcher->tx); + if (ret) + goto uninit_nic_rx; + + return 0; + +uninit_nic_rx: + dr_matcher_uninit_nic(&matcher->rx); + return ret; +} + +static int dr_matcher_init_root(struct 
mlx5dv_dr_matcher *matcher, + struct mlx5dv_flow_match_parameters *mask) +{ + struct mlx5dv_dr_domain *dmn = matcher->tbl->dmn; + struct mlx5dv_flow_matcher_attr attr = {}; + enum mlx5dv_flow_table_type type; + + if (dmn->type == MLX5DV_DR_DOMAIN_TYPE_NIC_RX) + type = MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX; + else if (dmn->type == MLX5DV_DR_DOMAIN_TYPE_NIC_TX) + type = MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX; + else + type = MLX5_IB_UAPI_FLOW_TABLE_TYPE_FDB; + + attr.match_mask = mask; + attr.priority = matcher->prio; + attr.type = IBV_FLOW_ATTR_NORMAL; + attr.match_criteria_enable = matcher->match_criteria; + attr.ft_type = type; + attr.comp_mask = MLX5DV_FLOW_MATCHER_MASK_FT_TYPE; + + matcher->dv_matcher = mlx5dv_create_flow_matcher(dmn->ctx, &attr); + if (!matcher->dv_matcher) + return errno; + + return 0; +} + +static int dr_matcher_init(struct mlx5dv_dr_matcher *matcher, + struct mlx5dv_flow_match_parameters *mask) +{ + struct mlx5dv_dr_table *tbl = matcher->tbl; + struct mlx5dv_dr_domain *dmn = tbl->dmn; + int ret; + + if (dr_is_root_table(matcher->tbl)) + return dr_matcher_init_root(matcher, mask); + + if (matcher->match_criteria >= DR_MATCHER_CRITERIA_MAX) { + dr_dbg(dmn, "Invalid match criteria attribute\n"); + errno = EINVAL; + return errno; + } + + if (mask) { + if (mask->match_sz > DEVX_ST_SZ_BYTES(dr_match_param)) { + dr_dbg(dmn, "Invalid match size attribute\n"); + errno = EINVAL; + return errno; + } + dr_ste_copy_param(matcher->match_criteria, &matcher->mask, mask); + } + + switch (dmn->type) { + case MLX5DV_DR_DOMAIN_TYPE_NIC_RX: + matcher->rx.nic_tbl = &tbl->rx; + ret = dr_matcher_init_nic(matcher, &matcher->rx); + break; + case MLX5DV_DR_DOMAIN_TYPE_NIC_TX: + matcher->tx.nic_tbl = &tbl->tx; + ret = dr_matcher_init_nic(matcher, &matcher->tx); + break; + case MLX5DV_DR_DOMAIN_TYPE_FDB: + matcher->rx.nic_tbl = &tbl->rx; + matcher->tx.nic_tbl = &tbl->tx; + ret = dr_matcher_init_fdb(matcher); + break; + default: + assert(false); + errno = EINVAL; + return errno; + } + + return ret; +} + +struct mlx5dv_dr_matcher * +mlx5dv_dr_matcher_create(struct mlx5dv_dr_table *tbl, + uint16_t priority, + uint8_t match_criteria_enable, + struct mlx5dv_flow_match_parameters *mask) +{ + struct mlx5dv_dr_matcher *matcher; + int ret; + + atomic_fetch_add(&tbl->refcount, 1); + + matcher = calloc(1, sizeof(*matcher)); + if (!matcher) { + errno = ENOMEM; + goto dec_ref; + } + + matcher->tbl = tbl; + matcher->prio = priority; + matcher->match_criteria = match_criteria_enable; + atomic_init(&matcher->refcount, 1); + list_node_init(&matcher->matcher_list); + list_head_init(&matcher->rule_list); + + pthread_mutex_lock(&tbl->dmn->mutex); + + ret = dr_matcher_init(matcher, mask); + if (ret) + goto free_matcher; + + ret = dr_matcher_add_to_tbl(matcher); + if (ret) + goto matcher_uninit; + + pthread_mutex_unlock(&tbl->dmn->mutex); + + return matcher; + +matcher_uninit: + dr_matcher_uninit(matcher); +free_matcher: + pthread_mutex_unlock(&tbl->dmn->mutex); + free(matcher); +dec_ref: + atomic_fetch_sub(&tbl->refcount, 1); + return NULL; +} + +static int dr_matcher_disconnect(struct mlx5dv_dr_domain *dmn, + struct dr_table_rx_tx *nic_tbl, + struct dr_matcher_rx_tx *next_nic_matcher, + struct dr_matcher_rx_tx *prev_nic_matcher) +{ + struct dr_domain_rx_tx *nic_dmn = nic_tbl->nic_dmn; + struct dr_htbl_connect_info info; + struct dr_ste_htbl *prev_anchor; + + if (prev_nic_matcher) + prev_anchor = prev_nic_matcher->e_anchor; + else + prev_anchor = nic_tbl->s_anchor; + + /* Connect previous anchor hash table to next 
matcher or to the default address */ + if (next_nic_matcher) { + info.type = CONNECT_HIT; + info.hit_next_htbl = next_nic_matcher->s_htbl; + next_nic_matcher->s_htbl->pointing_ste = prev_anchor->ste_arr; + prev_anchor->ste_arr[0].next_htbl = next_nic_matcher->s_htbl; + } else { + info.type = CONNECT_MISS; + info.miss_icm_addr = nic_dmn->default_icm_addr; + prev_anchor->ste_arr[0].next_htbl = NULL; + } + + return dr_ste_htbl_init_and_postsend(dmn, nic_dmn, prev_anchor, + &info, true); +} + +static int dr_matcher_remove_from_tbl(struct mlx5dv_dr_matcher *matcher) +{ + struct mlx5dv_dr_matcher *prev_matcher, *next_matcher; + struct mlx5dv_dr_table *tbl = matcher->tbl; + struct mlx5dv_dr_domain *dmn = tbl->dmn; + int ret = 0; + + if (dr_is_root_table(matcher->tbl)) + return 0; + + prev_matcher = list_prev(&tbl->matcher_list, matcher, matcher_list); + next_matcher = list_next(&tbl->matcher_list, matcher, matcher_list); + + if (dmn->type == MLX5DV_DR_DOMAIN_TYPE_FDB || + dmn->type == MLX5DV_DR_DOMAIN_TYPE_NIC_RX) { + ret = dr_matcher_disconnect(dmn, &tbl->rx, + next_matcher ? &next_matcher->rx : NULL, + prev_matcher ? &prev_matcher->rx : NULL); + if (ret) + return ret; + } + + if (dmn->type == MLX5DV_DR_DOMAIN_TYPE_FDB || + dmn->type == MLX5DV_DR_DOMAIN_TYPE_NIC_TX) { + ret = dr_matcher_disconnect(dmn, &tbl->tx, + next_matcher ? &next_matcher->tx : NULL, + prev_matcher ? &prev_matcher->tx : NULL); + if (ret) + return ret; + } + + list_del(&matcher->matcher_list); + + return 0; +} + +int mlx5dv_dr_matcher_destroy(struct mlx5dv_dr_matcher *matcher) +{ + struct mlx5dv_dr_table *tbl = matcher->tbl; + + if (atomic_load(&matcher->refcount) > 1) + return EBUSY; + + pthread_mutex_lock(&tbl->dmn->mutex); + + dr_matcher_remove_from_tbl(matcher); + dr_matcher_uninit(matcher); + atomic_fetch_sub(&matcher->tbl->refcount, 1); + + pthread_mutex_unlock(&tbl->dmn->mutex); + free(matcher); + + return 0; +} diff --git a/providers/mlx5/dr_rule.c b/providers/mlx5/dr_rule.c new file mode 100644 index 0000000..1870a8b --- /dev/null +++ b/providers/mlx5/dr_rule.c @@ -0,0 +1,1338 @@ +/* + * Copyright (c) 2019, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <stdlib.h> +#include <ccan/minmax.h> +#include "mlx5dv_dr.h" + +#define DR_RULE_MAX_STE_CHAIN (DR_RULE_MAX_STES + DR_ACTION_MAX_STES) + +static int dr_rule_append_to_miss_list(struct dr_ste *new_last_ste, + struct list_head *miss_list, + struct list_head *send_list) +{ + struct dr_ste_send_info *ste_info_last; + struct dr_ste *last_ste; + + /* The new entry will be inserted after the last */ + last_ste = list_tail(miss_list, struct dr_ste, miss_list_node); + assert(last_ste); + + ste_info_last = calloc(1, sizeof(*ste_info_last)); + if (!ste_info_last) { + errno = ENOMEM; + return errno; + } + + dr_ste_set_miss_addr(last_ste->hw_ste, dr_ste_get_icm_addr(new_last_ste)); + list_add_tail(miss_list, &new_last_ste->miss_list_node); + + dr_send_fill_and_append_ste_send_info(last_ste, DR_STE_SIZE_REDUCED, + 0, last_ste->hw_ste, + ste_info_last, send_list, true); + + return 0; +} + +static struct dr_ste +*dr_rule_create_collision_htbl(struct mlx5dv_dr_matcher *matcher, + struct dr_matcher_rx_tx *nic_matcher, + uint8_t *hw_ste) +{ + struct mlx5dv_dr_domain *dmn = matcher->tbl->dmn; + struct dr_ste_htbl *new_htbl; + struct dr_ste *ste; + + /* Create new table for miss entry */ + new_htbl = dr_ste_htbl_alloc(dmn->ste_icm_pool, + DR_CHUNK_SIZE_1, + DR_STE_LU_TYPE_DONT_CARE, + 0); + if (!new_htbl) { + dr_dbg(dmn, "Failed allocating collision table\n"); + return NULL; + } + + /* One and only entry, never grows */ + ste = new_htbl->ste_arr; + dr_ste_set_miss_addr(hw_ste, nic_matcher->e_anchor->chunk->icm_addr); + dr_htbl_get(new_htbl); + + return ste; +} + +static struct dr_ste *dr_rule_create_collision_entry(struct mlx5dv_dr_matcher *matcher, + struct dr_matcher_rx_tx *nic_matcher, + uint8_t *hw_ste, + struct dr_ste *orig_ste) +{ + struct dr_ste *ste; + + ste = dr_rule_create_collision_htbl(matcher, nic_matcher, hw_ste); + if (!ste) { + dr_dbg(matcher->tbl->dmn, "Failed creating collision entry\n"); + return NULL; + } + + ste->ste_chain_location = orig_ste->ste_chain_location; + + /* In collision entry, all members share the same miss_list_head */ + ste->htbl->miss_list = dr_ste_get_miss_list(orig_ste); + + /* Next table */ + if (dr_ste_create_next_htbl(matcher, nic_matcher, ste, hw_ste, + DR_CHUNK_SIZE_1)) { + dr_dbg(matcher->tbl->dmn, "Failed allocating table\n"); + goto free_tbl; + } + + return ste; + +free_tbl: + dr_ste_free(ste, matcher, nic_matcher); + return NULL; +} + +static int dr_rule_handle_one_ste_in_update_list(struct dr_ste_send_info *ste_info, + struct mlx5dv_dr_domain *dmn) +{ + int ret; + + list_del(&ste_info->send_list); + ret = dr_send_postsend_ste(dmn, ste_info->ste, ste_info->data, + ste_info->size, ste_info->offset); + if (ret) + goto out; + /* Copy data to ste, only reduced size, the last 16B (mask) + * is already written to the hw. 
+ */ + memcpy(ste_info->ste->hw_ste, ste_info->data, DR_STE_SIZE_REDUCED); + +out: + free(ste_info); + return ret; +} + +static int dr_rule_send_update_list(struct list_head *send_ste_list, + struct mlx5dv_dr_domain *dmn, + bool is_reverse) +{ + struct dr_ste_send_info *ste_info, *tmp_ste_info; + int ret; + + if (is_reverse) { + list_for_each_rev_safe(send_ste_list, ste_info, tmp_ste_info, + send_list) { + ret = dr_rule_handle_one_ste_in_update_list(ste_info, + dmn); + if (ret) + return ret; + } + } else { + list_for_each_safe(send_ste_list, ste_info, tmp_ste_info, + send_list) { + ret = dr_rule_handle_one_ste_in_update_list(ste_info, + dmn); + if (ret) + return ret; + } + } + + return 0; +} + +static struct dr_ste *dr_rule_find_ste_in_miss_list(struct list_head *miss_list, + uint8_t *hw_ste) +{ + struct dr_ste *ste; + + /* Check if hw_ste is present in the list */ + list_for_each(miss_list, ste, miss_list_node) + if (dr_ste_equal_tag(ste->hw_ste, hw_ste)) + return ste; + + return NULL; +} + +static struct dr_ste * +dr_rule_rehash_handle_collision(struct mlx5dv_dr_matcher *matcher, + struct dr_matcher_rx_tx *nic_matcher, + struct list_head *update_list, + struct dr_ste *col_ste, + uint8_t *hw_ste) +{ + struct dr_ste *new_ste; + int ret; + + new_ste = dr_rule_create_collision_htbl(matcher, nic_matcher, hw_ste); + if (!new_ste) + return NULL; + + /* In collision entry, all members share the same miss_list_head */ + new_ste->htbl->miss_list = dr_ste_get_miss_list(col_ste); + + /* Update the previous from the list */ + ret = dr_rule_append_to_miss_list(new_ste, + dr_ste_get_miss_list(col_ste), + update_list); + if (ret) { + dr_dbg(matcher->tbl->dmn, "Failed update dup entry\n"); + goto err_exit; + } + + return new_ste; + +err_exit: + dr_ste_free(new_ste, matcher, nic_matcher); + return NULL; +} + +static void dr_rule_rehash_copy_ste_ctrl(struct mlx5dv_dr_matcher *matcher, + struct dr_matcher_rx_tx *nic_matcher, + struct dr_ste *cur_ste, + struct dr_ste *new_ste) +{ + new_ste->next_htbl = cur_ste->next_htbl; + new_ste->ste_chain_location = cur_ste->ste_chain_location; + + if (!dr_ste_is_last_in_rule(nic_matcher, new_ste->ste_chain_location)) + new_ste->next_htbl->pointing_ste = new_ste; + + /* + * We need to copy the refcount since this ste + * may have been traversed several times + */ + atomic_init(&new_ste->refcount, atomic_load(&cur_ste->refcount)); + + /* Link old STEs rule_mem list to the new ste */ + dr_rule_update_rule_member(cur_ste, new_ste); + list_head_init(&new_ste->rule_list); + list_append_list(&new_ste->rule_list, &cur_ste->rule_list); +} + +static struct dr_ste *dr_rule_rehash_copy_ste(struct mlx5dv_dr_matcher *matcher, + struct dr_matcher_rx_tx *nic_matcher, + struct dr_ste *cur_ste, + struct dr_ste_htbl *new_htbl, + struct list_head *update_list) +{ + uint8_t hw_ste[DR_STE_SIZE] = {}; + struct dr_ste_send_info *ste_info; + bool use_update_list = false; + struct dr_ste *new_ste; + uint8_t sb_idx; + int new_idx; + + /* Copy STE mask from the matcher */ + sb_idx = cur_ste->ste_chain_location - 1; + dr_ste_set_bit_mask(hw_ste, nic_matcher->ste_builder[sb_idx].bit_mask); + + /* Copy STE control and tag */ + memcpy(hw_ste, cur_ste->hw_ste, DR_STE_SIZE_REDUCED); + dr_ste_set_miss_addr(hw_ste, nic_matcher->e_anchor->chunk->icm_addr); + + new_idx = dr_ste_calc_hash_index(hw_ste, new_htbl); + new_ste = &new_htbl->ste_arr[new_idx]; + + if (dr_ste_not_used_ste(new_ste)) { + dr_htbl_get(new_htbl); + list_add_tail(dr_ste_get_miss_list(new_ste), &new_ste->miss_list_node); + } else { + 
new_ste = dr_rule_rehash_handle_collision(matcher, + nic_matcher, + update_list, + new_ste, + hw_ste); + if (!new_ste) { + dr_dbg(matcher->tbl->dmn, "Failed adding collision entry, index: %d\n", + new_idx); + return NULL; + } + new_htbl->ctrl.num_of_collisions++; + use_update_list = true; + } + + memcpy(new_ste->hw_ste, hw_ste, DR_STE_SIZE_REDUCED); + + new_htbl->ctrl.num_of_valid_entries++; + + if (use_update_list) { + ste_info = calloc(1, sizeof(*ste_info)); + if (!ste_info) { + dr_dbg(matcher->tbl->dmn, "Failed allocating ste_info\n"); + errno = ENOMEM; + goto err_exit; + } + dr_send_fill_and_append_ste_send_info(new_ste, DR_STE_SIZE, 0, + hw_ste, ste_info, + update_list, true); + } + + dr_rule_rehash_copy_ste_ctrl(matcher, nic_matcher, cur_ste, new_ste); + + return new_ste; + +err_exit: + dr_ste_free(new_ste, matcher, nic_matcher); + return NULL; +} + +static int dr_rule_rehash_copy_miss_list(struct mlx5dv_dr_matcher *matcher, + struct dr_matcher_rx_tx *nic_matcher, + struct list_head *cur_miss_list, + struct dr_ste_htbl *new_htbl, + struct list_head *update_list) +{ + struct dr_ste *tmp_ste, *cur_ste, *new_ste; + + list_for_each_safe(cur_miss_list, cur_ste, tmp_ste, miss_list_node) { + new_ste = dr_rule_rehash_copy_ste(matcher, + nic_matcher, + cur_ste, + new_htbl, + update_list); + if (!new_ste) + goto err_insert; + + list_del(&cur_ste->miss_list_node); + dr_htbl_put(cur_ste->htbl); + } + return 0; + +err_insert: + dr_dbg(matcher->tbl->dmn, "Fatal error during resize\n"); + assert(false); + return errno; +} + +static int dr_rule_rehash_copy_htbl(struct mlx5dv_dr_matcher *matcher, + struct dr_matcher_rx_tx *nic_matcher, + struct dr_ste_htbl *cur_htbl, + struct dr_ste_htbl *new_htbl, + struct list_head *update_list) +{ + struct dr_ste *cur_ste; + int cur_entries; + int err = 0; + int i; + + cur_entries = dr_icm_pool_chunk_size_to_entries(cur_htbl->chunk_size); + + for (i = 0; i < cur_entries; i++) { + cur_ste = &cur_htbl->ste_arr[i]; + if (dr_ste_not_used_ste(cur_ste)) /* Empty, nothing to copy */ + continue; + + err = dr_rule_rehash_copy_miss_list(matcher, + nic_matcher, + dr_ste_get_miss_list(cur_ste), + new_htbl, + update_list); + if (err) + goto clean_copy; + } + +clean_copy: + return err; +} + +static struct dr_ste_htbl *dr_rule_rehash_htbl(struct mlx5dv_dr_rule *rule, + struct dr_rule_rx_tx *nic_rule, + struct dr_ste_htbl *cur_htbl, + uint8_t ste_location, + struct list_head *update_list, + enum dr_icm_chunk_size new_size) +{ + struct dr_matcher_rx_tx *nic_matcher = nic_rule->nic_matcher; + struct dr_domain_rx_tx *nic_dmn = nic_matcher->nic_tbl->nic_dmn; + struct mlx5dv_dr_matcher *matcher = rule->matcher; + struct mlx5dv_dr_domain *dmn = matcher->tbl->dmn; + struct dr_ste_send_info *del_ste_info, *tmp_ste_info; + uint8_t formated_ste[DR_STE_SIZE] = {}; + struct dr_ste_send_info *ste_info; + struct dr_htbl_connect_info info; + LIST_HEAD(rehash_table_send_list); + struct dr_ste_htbl *new_htbl; + struct dr_ste *ste_to_update; + int err; + + ste_info = calloc(1, sizeof(*ste_info)); + if (!ste_info) { + errno = ENOMEM; + return NULL; + } + + new_htbl = dr_ste_htbl_alloc(dmn->ste_icm_pool, + new_size, + cur_htbl->lu_type, + cur_htbl->byte_mask); + if (!new_htbl) { + dr_dbg(dmn, "Failed to allocate new hash table\n"); + goto free_ste_info; + } + + /* Write new table to HW */ + info.type = CONNECT_MISS; + info.miss_icm_addr = nic_matcher->e_anchor->chunk->icm_addr; + dr_ste_set_formated_ste(dmn->info.caps.gvmi, + nic_dmn, + new_htbl, + formated_ste, + &info); + + new_htbl->pointing_ste 
= cur_htbl->pointing_ste;
+	new_htbl->pointing_ste->next_htbl = new_htbl;
+	err = dr_rule_rehash_copy_htbl(matcher,
+				       nic_matcher,
+				       cur_htbl,
+				       new_htbl,
+				       &rehash_table_send_list);
+	if (err)
+		goto free_new_htbl;
+
+	if (dr_send_postsend_htbl(dmn, new_htbl, formated_ste,
+				  nic_matcher->ste_builder[ste_location - 1].bit_mask)) {
+		dr_dbg(dmn, "Failed writing table to HW\n");
+		goto free_new_htbl;
+	}
+
+	/*
+	 * Writing to the hw is done in the regular order of rehash_table_send_list,
+	 * so that the original data is written before the miss addresses of
+	 * collision entries, if any exist.
+	 */
+	if (dr_rule_send_update_list(&rehash_table_send_list, dmn, false)) {
+		dr_dbg(dmn, "Failed updating table to HW\n");
+		goto free_ste_list;
+	}
+
+	/* Connect previous hash table to current */
+	if (ste_location == 1) {
+		/* The previous table is an anchor; an anchor's size is always one STE */
+		struct dr_ste_htbl *prev_htbl = cur_htbl->pointing_ste->htbl;
+
+		/* On matcher s_anchor we keep an extra refcount */
+		dr_htbl_get(new_htbl);
+		dr_htbl_put(cur_htbl);
+
+		nic_matcher->s_htbl = new_htbl;
+
+		/*
+		 * It is safe to run dr_ste_set_hit_addr on the hw_ste here
+		 * (48B len), since it works only on the first 32B
+		 */
+		dr_ste_set_hit_addr(prev_htbl->ste_arr[0].hw_ste,
+				    new_htbl->chunk->icm_addr,
+				    new_htbl->chunk->num_of_entries);
+
+		ste_to_update = &prev_htbl->ste_arr[0];
+	} else {
+		dr_ste_set_hit_addr_by_next_htbl(cur_htbl->pointing_ste->hw_ste,
+						 new_htbl);
+		ste_to_update = cur_htbl->pointing_ste;
+	}
+
+	dr_send_fill_and_append_ste_send_info(ste_to_update, DR_STE_SIZE_REDUCED,
+					      0, ste_to_update->hw_ste, ste_info,
+					      update_list, false);
+
+	return new_htbl;
+
+free_ste_list:
+	/* Clean all ste_info's from the new table */
+	list_for_each_safe(&rehash_table_send_list, del_ste_info, tmp_ste_info,
+			   send_list) {
+		list_del(&del_ste_info->send_list);
+		free(del_ste_info);
+	}
+
+free_new_htbl:
+	dr_ste_htbl_free(new_htbl);
+free_ste_info:
+	free(ste_info);
+	return NULL;
+}
+
+static struct dr_ste_htbl *dr_rule_rehash(struct mlx5dv_dr_rule *rule,
+					  struct dr_rule_rx_tx *nic_rule,
+					  struct dr_ste_htbl *cur_htbl,
+					  uint8_t ste_location,
+					  struct list_head *update_list)
+{
+	struct mlx5dv_dr_domain *dmn = rule->matcher->tbl->dmn;
+	enum dr_icm_chunk_size new_size;
+
+	new_size = dr_icm_next_higher_chunk(cur_htbl->chunk_size);
+	new_size = min_t(uint32_t, new_size, dmn->info.max_log_sw_icm_sz);
+
+	if (new_size == cur_htbl->chunk_size)
+		return NULL; /* Skip rehash, we are already at the max size */
+
+	return dr_rule_rehash_htbl(rule, nic_rule, cur_htbl, ste_location,
+				   update_list, new_size);
+}
+
+static struct dr_ste *dr_rule_handle_collision(struct mlx5dv_dr_matcher *matcher,
+					       struct dr_matcher_rx_tx *nic_matcher,
+					       struct dr_ste *ste,
+					       uint8_t *hw_ste,
+					       struct list_head *miss_list,
+					       struct list_head *send_list)
+{
+	struct dr_ste_send_info *ste_info;
+	struct dr_ste *new_ste;
+
+	ste_info = calloc(1, sizeof(*ste_info));
+	if (!ste_info) {
+		dr_dbg(matcher->tbl->dmn, "Failed allocating ste_info\n");
+		errno = ENOMEM;
+		return NULL;
+	}
+
+	new_ste = dr_rule_create_collision_entry(matcher, nic_matcher, hw_ste, ste);
+	if (!new_ste) {
+		dr_dbg(matcher->tbl->dmn, "Failed creating collision entry\n");
+		goto free_send_info;
+	}
+
+	if (dr_rule_append_to_miss_list(new_ste, miss_list, send_list)) {
+		dr_dbg(matcher->tbl->dmn, "Failed to update prev miss_list\n");
+		goto err_exit;
+	}
+
+	dr_send_fill_and_append_ste_send_info(new_ste, DR_STE_SIZE, 0, hw_ste,
+					      ste_info, send_list, false);
+
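+	/*
+	 * Account for the new collision entry: dr_rule_need_enlarge_hash()
+	 * later weighs num_of_collisions and num_of_valid_entries against
+	 * the table's increase_threshold to decide whether an insertion
+	 * should first rehash into a larger table.
+	 */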
+	ste->htbl->ctrl.num_of_collisions++;
+	ste->htbl->ctrl.num_of_valid_entries++;
+
+	return new_ste;
+
+err_exit:
+	dr_ste_free(new_ste, matcher, nic_matcher);
+free_send_info:
+	free(ste_info);
+	return NULL;
+}
+
+static void dr_rule_remove_action_members(struct mlx5dv_dr_rule *rule)
+{
+	struct dr_rule_action_member *action_mem;
+	struct dr_rule_action_member *tmp;
+
+	list_for_each_safe(&rule->rule_actions_list, action_mem, tmp, list) {
+		list_del(&action_mem->list);
+		atomic_fetch_sub(&action_mem->action->refcount, 1);
+		free(action_mem);
+	}
+}
+
+static int dr_rule_add_action_members(struct mlx5dv_dr_rule *rule,
+				      size_t num_actions,
+				      struct mlx5dv_dr_action *actions[])
+{
+	struct dr_rule_action_member *action_mem;
+	int i;
+
+	for (i = 0; i < num_actions; i++) {
+		action_mem = calloc(1, sizeof(*action_mem));
+		if (!action_mem) {
+			errno = ENOMEM;
+			goto free_action_members;
+		}
+
+		action_mem->action = actions[i];
+		list_node_init(&action_mem->list);
+		list_add_tail(&rule->rule_actions_list, &action_mem->list);
+		atomic_fetch_add(&action_mem->action->refcount, 1);
+	}
+
+	return 0;
+
+free_action_members:
+	dr_rule_remove_action_members(rule);
+	return errno;
+}
+
+/*
+ * When an ste pointer becomes invalid, e.g. when the ste is moved to be
+ * first in the miss_list or into the origin table, all rule members that
+ * are attached to this ste must update their ste member to the new pointer.
+ */
+void dr_rule_update_rule_member(struct dr_ste *ste, struct dr_ste *new_ste)
+{
+	struct dr_rule_member *rule_mem;
+
+	list_for_each(&ste->rule_list, rule_mem, use_ste_list)
+		rule_mem->ste = new_ste;
+}
+
+static void dr_rule_clean_rule_members(struct mlx5dv_dr_rule *rule,
+				       struct dr_rule_rx_tx *nic_rule)
+{
+	struct dr_rule_member *rule_mem;
+	struct dr_rule_member *tmp_mem;
+
+	list_for_each_safe(&nic_rule->rule_members_list, rule_mem, tmp_mem, list) {
+		list_del(&rule_mem->list);
+		list_del(&rule_mem->use_ste_list);
+		dr_ste_put(rule_mem->ste, rule->matcher, nic_rule->nic_matcher);
+		free(rule_mem);
+	}
+}
+
+static uint16_t dr_get_bits_per_mask(uint16_t byte_mask)
+{
+	uint16_t bits = 0;
+
+	while (byte_mask) {
+		byte_mask = byte_mask & (byte_mask - 1);
+		bits++;
+	}
+
+	return bits;
+}
+
+static bool dr_rule_need_enlarge_hash(struct dr_ste_htbl *htbl,
+				      struct mlx5dv_dr_domain *dmn,
+				      struct dr_domain_rx_tx *nic_dmn)
+{
+	struct dr_ste_htbl_ctrl *ctrl = &htbl->ctrl;
+
+	if (dmn->info.max_log_sw_icm_sz <= htbl->chunk_size)
+		return false;
+
+	if (!ctrl->may_grow)
+		return false;
+
+	if (dr_get_bits_per_mask(htbl->byte_mask) * CHAR_BIT <= htbl->chunk_size)
+		return false;
+
+	if (ctrl->num_of_collisions >= ctrl->increase_threshold &&
+	    (ctrl->num_of_valid_entries - ctrl->num_of_collisions) >= ctrl->increase_threshold)
+		return true;
+
+	return false;
+}
+
+static int dr_rule_add_member(struct dr_rule_rx_tx *nic_rule,
+			      struct dr_ste *ste)
+{
+	struct dr_rule_member *rule_mem;
+
+	rule_mem = calloc(1, sizeof(*rule_mem));
+	if (!rule_mem) {
+		errno = ENOMEM;
+		return errno;
+	}
+
+	rule_mem->ste = ste;
+	list_add_tail(&nic_rule->rule_members_list, &rule_mem->list);
+
+	list_add_tail(&ste->rule_list, &rule_mem->use_ste_list);
+
+	return 0;
+}
+
+static int dr_rule_handle_action_stes(struct mlx5dv_dr_rule *rule,
+				      struct dr_rule_rx_tx *nic_rule,
+				      struct list_head *send_ste_list,
+				      struct dr_ste *last_ste,
+				      uint8_t *hw_ste_arr,
+				      uint32_t new_hw_ste_arr_sz)
+{
+	struct dr_matcher_rx_tx *nic_matcher = nic_rule->nic_matcher;
+	struct dr_ste_send_info *ste_info_arr[DR_ACTION_MAX_STES];
+	uint8_t num_of_builders = nic_matcher->num_of_builders;
+	struct mlx5dv_dr_matcher *matcher = rule->matcher;
+	uint8_t *curr_hw_ste, *prev_hw_ste;
+	struct dr_ste *action_ste;
+	int i, k, ret;
+
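+	/*
+	 * Layout of hw_ste_arr at this point, as a sketch:
+	 *   [0 .. num_of_builders - 1]                 match STEs, already placed
+	 *   [num_of_builders .. new_hw_ste_arr_sz - 1] extra STEs carrying only
+	 *                                              actions, chained below
+	 */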
+	/* Two cases:
+	 * 1. num_of_builders is equal to new_hw_ste_arr_sz, the action is in the ste
+	 * 2. num_of_builders is less than new_hw_ste_arr_sz, a new ste was added
+	 *    to support the action.
+	 */
+	if (num_of_builders == new_hw_ste_arr_sz)
+		return 0;
+
+	for (i = num_of_builders, k = 0; i < new_hw_ste_arr_sz; i++, k++) {
+		curr_hw_ste = hw_ste_arr + i * DR_STE_SIZE;
+		prev_hw_ste = (i == 0) ? curr_hw_ste : hw_ste_arr + ((i - 1) * DR_STE_SIZE);
+		action_ste = dr_rule_create_collision_htbl(matcher,
+							   nic_matcher,
+							   curr_hw_ste);
+		if (!action_ste)
+			return errno;
+
+		dr_ste_get(action_ste);
+
+		/* While freeing the ste we go over the miss list, so add this ste to the list */
+		list_add_tail(dr_ste_get_miss_list(action_ste),
+			      &action_ste->miss_list_node);
+
+		ste_info_arr[k] = calloc(1, sizeof(struct dr_ste_send_info));
+		if (!ste_info_arr[k]) {
+			dr_dbg(matcher->tbl->dmn, "Failed allocate ste_info, k: %d\n", k);
+			errno = ENOMEM;
+			ret = errno;
+			goto err_exit;
+		}
+
+		/* Point current ste to the new action */
+		dr_ste_set_hit_addr_by_next_htbl(prev_hw_ste, action_ste->htbl);
+		ret = dr_rule_add_member(nic_rule, action_ste);
+		if (ret) {
+			dr_dbg(matcher->tbl->dmn, "Failed adding rule member\n");
+			goto free_ste_info;
+		}
+		dr_send_fill_and_append_ste_send_info(action_ste, DR_STE_SIZE, 0,
+						      curr_hw_ste,
+						      ste_info_arr[k],
+						      send_ste_list, false);
+	}
+
+	return 0;
+
+free_ste_info:
+	free(ste_info_arr[k]);
+err_exit:
+	dr_ste_put(action_ste, matcher, nic_matcher);
+	return ret;
+}
+
+static int dr_rule_handle_empty_entry(struct mlx5dv_dr_matcher *matcher,
+				      struct dr_matcher_rx_tx *nic_matcher,
+				      struct dr_ste_htbl *cur_htbl,
+				      struct dr_ste *ste,
+				      uint8_t ste_location,
+				      uint8_t *hw_ste,
+				      struct list_head *miss_list,
+				      struct list_head *send_list)
+{
+	struct dr_ste_send_info *ste_info;
+
+	/* Take a ref on the table, only on the first time this ste is used */
+	dr_htbl_get(cur_htbl);
+
+	/* new entry -> new branch */
+	list_add_tail(miss_list, &ste->miss_list_node);
+
+	dr_ste_set_miss_addr(hw_ste, nic_matcher->e_anchor->chunk->icm_addr);
+
+	ste->ste_chain_location = ste_location;
+
+	ste_info = calloc(1, sizeof(*ste_info));
+	if (!ste_info) {
+		dr_dbg(matcher->tbl->dmn, "Failed allocating ste_info\n");
+		errno = ENOMEM;
+		goto clean_ste_setting;
+	}
+
+	if (dr_ste_create_next_htbl(matcher,
+				    nic_matcher,
+				    ste,
+				    hw_ste,
+				    DR_CHUNK_SIZE_1)) {
+		dr_dbg(matcher->tbl->dmn, "Failed allocating table\n");
+		goto clean_ste_info;
+	}
+
+	cur_htbl->ctrl.num_of_valid_entries++;
+
+	dr_send_fill_and_append_ste_send_info(ste, DR_STE_SIZE, 0, hw_ste,
+					      ste_info, send_list, false);
+
+	return 0;
+
+clean_ste_info:
+	free(ste_info);
+
+clean_ste_setting:
+	list_del_init(&ste->miss_list_node);
+	dr_htbl_put(cur_htbl);
+
+	return ENOMEM;
+}
+
+static struct dr_ste *dr_rule_handle_ste_branch(struct mlx5dv_dr_rule *rule,
+						struct dr_rule_rx_tx *nic_rule,
+						struct list_head *send_ste_list,
+						struct dr_ste_htbl *cur_htbl,
+						uint8_t *hw_ste,
+						uint8_t ste_location,
+						struct dr_ste_htbl **put_htbl)
+{
+	struct dr_matcher_rx_tx *nic_matcher = nic_rule->nic_matcher;
+	struct dr_domain_rx_tx *nic_dmn = nic_matcher->nic_tbl->nic_dmn;
+	struct mlx5dv_dr_matcher *matcher = rule->matcher;
+	struct mlx5dv_dr_domain *dmn = matcher->tbl->dmn;
+	struct dr_ste_htbl *new_htbl;
+	struct list_head *miss_list;
+	struct dr_ste *matched_ste;
+	bool skip_rehash = false;
+	struct dr_ste *ste;
+	int index;
+
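+	/*
+	 * Branch handling below, in short:
+	 *   1. hash hw_ste into cur_htbl; an unused slot opens a new branch
+	 *      (dr_rule_handle_empty_entry());
+	 *   2. a slot whose miss list holds the same tag is reused, or
+	 *      reported as a duplicate on the last STE of the rule;
+	 *   3. otherwise the table is grown once via dr_rule_rehash(), and
+	 *      failing that a collision entry is chained on the miss list.
+	 */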
+again:
+	index = dr_ste_calc_hash_index(hw_ste, cur_htbl);
+	miss_list = &cur_htbl->chunk->miss_list[index];
+	ste = &cur_htbl->ste_arr[index];
+
+	if (dr_ste_not_used_ste(ste)) {
+		if (dr_rule_handle_empty_entry(matcher, nic_matcher, cur_htbl,
+					       ste, ste_location,
+					       hw_ste, miss_list,
+					       send_ste_list))
+			return NULL;
+	} else {
+		/* Hash table index in use, check if this ste is in the miss list */
+		matched_ste = dr_rule_find_ste_in_miss_list(miss_list, hw_ste);
+		if (matched_ste) {
+			/*
+			 * If this is the last STE in the chain and it has the
+			 * same tag, all the previous STEs are the same and
+			 * this rule is a duplicate.
+			 */
+			if (!dr_ste_is_last_in_rule(nic_matcher, ste_location))
+				return matched_ste;
+
+			dr_dbg(dmn, "Duplicate rule inserted\n");
+		}
+
+		if (!skip_rehash && dr_rule_need_enlarge_hash(cur_htbl, dmn, nic_dmn)) {
+			/* Hash table index in use, try to resize the hash */
+			skip_rehash = true;
+
+			/*
+			 * Hold the table till we update.
+			 * Released in dr_rule_create_rule_nic()
+			 */
+			*put_htbl = cur_htbl;
+			dr_htbl_get(cur_htbl);
+
+			new_htbl = dr_rule_rehash(rule, nic_rule, cur_htbl,
+						  ste_location, send_ste_list);
+			if (!new_htbl) {
+				dr_htbl_put(cur_htbl);
+				dr_dbg(dmn, "Failed creating rehash table, htbl-log_size: %d\n",
+				       cur_htbl->chunk_size);
+			} else {
+				cur_htbl = new_htbl;
+			}
+			goto again;
+		} else {
+			/* Hash table index in use, add another collision (miss) */
+			ste = dr_rule_handle_collision(matcher,
+						       nic_matcher,
+						       ste,
+						       hw_ste,
+						       miss_list,
+						       send_ste_list);
+			if (!ste) {
+				dr_dbg(dmn, "Failed adding collision entry, index: %d\n",
+				       index);
+				return NULL;
+			}
+		}
+	}
+	return ste;
+}
+
+static bool dr_rule_cmp_value_to_mask(uint8_t *mask, uint8_t *value,
+				      uint32_t s_idx, uint32_t e_idx)
+{
+	uint32_t i;
+
+	for (i = s_idx; i < e_idx; i++) {
+		if (value[i] & ~mask[i]) {
+			errno = EINVAL;
+			return false;
+		}
+	}
+	return true;
+}
+
+static bool dr_rule_verify(struct mlx5dv_dr_matcher *matcher,
+			   struct mlx5dv_flow_match_parameters *value,
+			   struct dr_match_param *param)
+{
+	uint8_t match_criteria = matcher->match_criteria;
+	struct mlx5dv_dr_domain *dmn = matcher->tbl->dmn;
+	uint8_t *mask_p = (uint8_t *)&matcher->mask;
+	uint8_t *param_p = (uint8_t *)param;
+	size_t value_size = value->match_sz;
+	uint32_t s_idx, e_idx;
+
+	if (!value_size ||
+	    (value_size > DEVX_ST_SZ_BYTES(dr_match_param) ||
+	     (value_size % sizeof(uint32_t)))) {
+		dr_dbg(dmn, "Rule parameters length is incorrect\n");
+		errno = EINVAL;
+		return false;
+	}
+
+	dr_ste_copy_param(matcher->match_criteria, param, value);
+
+	if (match_criteria & DR_MATCHER_CRITERIA_OUTER) {
+		s_idx = offsetof(struct dr_match_param, outer);
+		e_idx = min(s_idx + sizeof(param->outer), value_size);
+
+		if (!dr_rule_cmp_value_to_mask(mask_p, param_p, s_idx, e_idx)) {
+			dr_dbg(dmn, "Rule outer parameters contains a value not specified by mask\n");
+			return false;
+		}
+	}
+
+	if (match_criteria & DR_MATCHER_CRITERIA_MISC) {
+		s_idx = offsetof(struct dr_match_param, misc);
+		e_idx = min(s_idx + sizeof(param->misc), value_size);
+
+		if (!dr_rule_cmp_value_to_mask(mask_p, param_p, s_idx, e_idx)) {
+			dr_dbg(dmn, "Rule misc parameters contains a value not specified by mask\n");
+			return false;
+		}
+	}
+
+	if (match_criteria & DR_MATCHER_CRITERIA_INNER) {
+		s_idx = offsetof(struct dr_match_param, inner);
+		e_idx = min(s_idx + sizeof(param->inner), value_size);
+
+		if (!dr_rule_cmp_value_to_mask(mask_p, param_p, s_idx,
e_idx)) { + dr_dbg(dmn, "Rule inner parameters contains a value not specified by mask\n"); + return false; + } + } + + if (match_criteria & DR_MATCHER_CRITERIA_MISC2) { + s_idx = offsetof(struct dr_match_param, misc2); + e_idx = min(s_idx + sizeof(param->misc2), value_size); + + if (!dr_rule_cmp_value_to_mask(mask_p, param_p, s_idx, e_idx)) { + dr_dbg(dmn, "Rule misc2 parameters contains a value not specified by mask\n"); + return false; + } + } + + if (match_criteria & DR_MATCHER_CRITERIA_MISC3) { + s_idx = offsetof(struct dr_match_param, misc3); + e_idx = min(s_idx + sizeof(param->misc3), value_size); + + if (!dr_rule_cmp_value_to_mask(mask_p, param_p, s_idx, e_idx)) { + dr_dbg(dmn, "Rule misc3 parameters contains a value not specified by mask\n"); + return false; + } + } + return true; +} + +static int dr_rule_destroy_rule_nic(struct mlx5dv_dr_rule *rule, + struct dr_rule_rx_tx *nic_rule) +{ + dr_rule_clean_rule_members(rule, nic_rule); + return 0; +} + +static int dr_rule_destroy_rule_fdb(struct mlx5dv_dr_rule *rule) +{ + dr_rule_destroy_rule_nic(rule, &rule->rx); + dr_rule_destroy_rule_nic(rule, &rule->tx); + return 0; +} + +static int dr_rule_destroy_rule(struct mlx5dv_dr_rule *rule) +{ + struct mlx5dv_dr_domain *dmn = rule->matcher->tbl->dmn; + + switch (dmn->type) { + case MLX5DV_DR_DOMAIN_TYPE_NIC_RX: + dr_rule_destroy_rule_nic(rule, &rule->rx); + break; + case MLX5DV_DR_DOMAIN_TYPE_NIC_TX: + dr_rule_destroy_rule_nic(rule, &rule->tx); + break; + case MLX5DV_DR_DOMAIN_TYPE_FDB: + dr_rule_destroy_rule_fdb(rule); + break; + default: + errno = EINVAL; + return errno; + } + + dr_rule_remove_action_members(rule); + list_del(&rule->rule_list); + free(rule); + return 0; +} + +static int dr_rule_destroy_rule_root(struct mlx5dv_dr_rule *rule) +{ + int ret; + + ret = ibv_destroy_flow(rule->flow); + if (ret) + return ret; + + dr_rule_remove_action_members(rule); + free(rule); + return 0; +} + +static int dr_rule_skip(enum mlx5dv_dr_domain_type domain, + enum dr_ste_entry_type ste_type, + struct dr_match_param *mask, + struct dr_match_param *value) +{ + if (domain == MLX5DV_DR_DOMAIN_TYPE_FDB) { + if (mask->misc.source_port) { + if (ste_type == DR_STE_TYPE_RX) + if (value->misc.source_port != WIRE_PORT) + return 1; + + if (ste_type == DR_STE_TYPE_TX) + if (value->misc.source_port == WIRE_PORT) + return 1; + } + } + + return 0; +} + +static int +dr_rule_create_rule_nic(struct mlx5dv_dr_rule *rule, + struct dr_rule_rx_tx *nic_rule, + struct dr_match_param *param, + size_t num_actions, + struct mlx5dv_dr_action *actions[]) +{ + uint8_t hw_ste_arr[DR_RULE_MAX_STE_CHAIN * DR_STE_SIZE] = {}; + struct dr_matcher_rx_tx *nic_matcher = nic_rule->nic_matcher; + struct dr_domain_rx_tx *nic_dmn = nic_matcher->nic_tbl->nic_dmn; + struct mlx5dv_dr_matcher *matcher = rule->matcher; + struct mlx5dv_dr_domain *dmn = matcher->tbl->dmn; + struct dr_ste_send_info *ste_info, *tmp_ste_info; + struct dr_ste_htbl *htbl = NULL; + struct dr_ste_htbl *cur_htbl; + uint32_t new_hw_ste_arr_sz; + LIST_HEAD(send_ste_list); + struct dr_ste *ste = NULL; /* Fix compilation warning */ + int ret, i; + + list_head_init(&nic_rule->rule_members_list); + + if (dr_rule_skip(dmn->type, nic_dmn->ste_type, &matcher->mask, param)) + return 0; + + /* Set the tag values inside the ste array */ + ret = dr_ste_build_ste_arr(matcher, nic_matcher, param, hw_ste_arr); + if (ret) + goto out_err; + + /* Set the actions values/addresses inside the ste array */ + ret = dr_actions_build_ste_arr(matcher, nic_matcher, actions, + num_actions, hw_ste_arr, 
+ &new_hw_ste_arr_sz);
+ if (ret)
+ goto out_err;
+
+ cur_htbl = nic_matcher->s_htbl;
+
+ /*
+ * Go over the array of STEs, and build dr_ste accordingly.
+ * The loop covers only the builders, which are equal to or fewer than
+ * the number of STEs, since some actions may live in additional STEs.
+ */
+ for (i = 0; i < nic_matcher->num_of_builders; i++) {
+ /* Calculate CRC and keep new ste entry */
+ uint8_t *cur_hw_ste_ent = hw_ste_arr + (i * DR_STE_SIZE);
+
+ ste = dr_rule_handle_ste_branch(rule,
+ nic_rule,
+ &send_ste_list,
+ cur_htbl,
+ cur_hw_ste_ent,
+ i + 1,
+ &htbl);
+ if (!ste) {
+ dr_dbg(dmn, "Failed creating next branch\n");
+ ret = errno;
+ goto free_rule;
+ }
+
+ cur_htbl = ste->next_htbl;
+
+ /* Keep all STEs in the rule struct */
+ ret = dr_rule_add_member(nic_rule, ste);
+ if (ret) {
+ dr_dbg(dmn, "Failed adding rule member index %d\n", i);
+ goto free_ste;
+ }
+
+ dr_ste_get(ste);
+ }
+
+ /* Connect actions */
+ ret = dr_rule_handle_action_stes(rule, nic_rule, &send_ste_list,
+ ste, hw_ste_arr, new_hw_ste_arr_sz);
+ if (ret) {
+ dr_dbg(dmn, "Failed applying actions\n");
+ goto free_rule;
+ }
+ ret = dr_rule_send_update_list(&send_ste_list, dmn, true);
+ if (ret) {
+ dr_dbg(dmn, "Failed sending ste!\n");
+ goto free_rule;
+ }
+
+ if (htbl)
+ dr_htbl_put(htbl);
+
+ return 0;
+
+free_ste:
+ dr_ste_put(ste, matcher, nic_matcher);
+free_rule:
+ dr_rule_clean_rule_members(rule, nic_rule);
+ /* Clean all ste_info's */
+ list_for_each_safe(&send_ste_list, ste_info, tmp_ste_info, send_list) {
+ list_del(&ste_info->send_list);
+ free(ste_info);
+ }
+out_err:
+ return ret;
+}
+
+static int
+dr_rule_create_rule_fdb(struct mlx5dv_dr_rule *rule,
+ struct dr_match_param *param,
+ size_t num_actions,
+ struct mlx5dv_dr_action *actions[])
+{
+ struct dr_match_param copy_param = {};
+ int ret;
+
+ /*
+ * Copy match_param since it will be consumed during the first
+ * nic_rule insertion.
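+ * The RX insertion below consumes the original 'param', so the TX
+ * insertion must be given the untouched 'copy_param'.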
+ */
+ memcpy(&copy_param, param, sizeof(struct dr_match_param));
+
+ ret = dr_rule_create_rule_nic(rule, &rule->rx, param,
+ num_actions, actions);
+ if (ret)
+ return ret;
+
+ ret = dr_rule_create_rule_nic(rule, &rule->tx, &copy_param,
+ num_actions, actions);
+ if (ret)
+ goto destroy_rule_nic_rx;
+
+ return 0;
+
+destroy_rule_nic_rx:
+ dr_rule_destroy_rule_nic(rule, &rule->rx);
+ return ret;
+}
+
+static struct mlx5dv_dr_rule *
+dr_rule_create_rule(struct mlx5dv_dr_matcher *matcher,
+ struct mlx5dv_flow_match_parameters *value,
+ size_t num_actions,
+ struct mlx5dv_dr_action *actions[])
+{
+ struct mlx5dv_dr_domain *dmn = matcher->tbl->dmn;
+ struct dr_match_param param = {};
+ struct mlx5dv_dr_rule *rule;
+ int ret;
+
+ if (!dr_rule_verify(matcher, value, &param))
+ return NULL;
+
+ rule = calloc(1, sizeof(*rule));
+ if (!rule) {
+ errno = ENOMEM;
+ return NULL;
+ }
+
+ rule->matcher = matcher;
+ list_head_init(&rule->rule_actions_list);
+ list_node_init(&rule->rule_list);
+
+ ret = dr_rule_add_action_members(rule, num_actions, actions);
+ if (ret)
+ goto free_rule;
+
+ switch (dmn->type) {
+ case MLX5DV_DR_DOMAIN_TYPE_NIC_RX:
+ rule->rx.nic_matcher = &matcher->rx;
+ ret = dr_rule_create_rule_nic(rule, &rule->rx, &param,
+ num_actions, actions);
+ break;
+ case MLX5DV_DR_DOMAIN_TYPE_NIC_TX:
+ rule->tx.nic_matcher = &matcher->tx;
+ ret = dr_rule_create_rule_nic(rule, &rule->tx, &param,
+ num_actions, actions);
+ break;
+ case MLX5DV_DR_DOMAIN_TYPE_FDB:
+ rule->rx.nic_matcher = &matcher->rx;
+ rule->tx.nic_matcher = &matcher->tx;
+ ret = dr_rule_create_rule_fdb(rule, &param,
+ num_actions, actions);
+ break;
+ default:
+ ret = EINVAL;
+ errno = ret;
+ break;
+ }
+
+ if (ret)
+ goto remove_action_members;
+
+ list_add_tail(&matcher->rule_list, &rule->rule_list);
+ return rule;
+
+remove_action_members:
+ dr_rule_remove_action_members(rule);
+free_rule:
+ free(rule);
+
+ return NULL;
+}
+
+static struct mlx5dv_dr_rule *
+dr_rule_create_rule_root(struct mlx5dv_dr_matcher *matcher,
+ struct mlx5dv_flow_match_parameters *value,
+ size_t num_actions,
+ struct mlx5dv_dr_action *actions[])
+{
+ struct mlx5dv_flow_action_attr *attr;
+ struct mlx5_flow_action_attr_aux *attr_aux;
+ struct mlx5dv_dr_rule *rule;
+ int ret;
+
+ rule = calloc(1, sizeof(*rule));
+ if (!rule) {
+ errno = ENOMEM;
+ return NULL;
+ }
+
+ rule->matcher = matcher;
+ list_head_init(&rule->rule_actions_list);
+
+ attr = calloc(num_actions, sizeof(*attr));
+ if (!attr) {
+ errno = ENOMEM;
+ goto free_rule;
+ }
+
+ attr_aux = calloc(num_actions, sizeof(*attr_aux));
+ if (!attr_aux) {
+ errno = ENOMEM;
+ goto free_attr;
+ }
+
+ ret = dr_actions_build_attr(matcher, actions, num_actions, attr, attr_aux);
+ if (ret)
+ goto free_attr_aux;
+
+ ret = dr_rule_add_action_members(rule, num_actions, actions);
+ if (ret)
+ goto free_attr_aux;
+
+ rule->flow = __mlx5dv_create_flow(matcher->dv_matcher,
+ value,
+ num_actions,
+ attr,
+ attr_aux);
+ if (!rule->flow)
+ goto remove_action_members;
+
+ free(attr);
+ free(attr_aux);
+
+ return rule;
+
+remove_action_members:
+ dr_rule_remove_action_members(rule);
+free_attr_aux:
+ free(attr_aux);
+free_attr:
+ free(attr);
+free_rule:
+ free(rule);
+ return NULL;
+}
+
+struct mlx5dv_dr_rule *mlx5dv_dr_rule_create(struct mlx5dv_dr_matcher *matcher,
+ struct mlx5dv_flow_match_parameters *value,
+ size_t num_actions,
+ struct mlx5dv_dr_action *actions[])
+{
+ struct mlx5dv_dr_rule *rule;
+
+ pthread_mutex_lock(&matcher->tbl->dmn->mutex);
+ atomic_fetch_add(&matcher->refcount, 1);
+
+ if (dr_is_root_table(matcher->tbl))
+
rule = dr_rule_create_rule_root(matcher, value, num_actions, actions); + else + rule = dr_rule_create_rule(matcher, value, num_actions, actions); + + if (!rule) + atomic_fetch_sub(&matcher->refcount, 1); + + pthread_mutex_unlock(&matcher->tbl->dmn->mutex); + + return rule; +} + +int mlx5dv_dr_rule_destroy(struct mlx5dv_dr_rule *rule) +{ + struct mlx5dv_dr_matcher *matcher = rule->matcher; + struct mlx5dv_dr_table *tbl = rule->matcher->tbl; + int ret; + + pthread_mutex_lock(&tbl->dmn->mutex); + + if (dr_is_root_table(tbl)) + ret = dr_rule_destroy_rule_root(rule); + else + ret = dr_rule_destroy_rule(rule); + + pthread_mutex_unlock(&tbl->dmn->mutex); + + if (!ret) + atomic_fetch_sub(&matcher->refcount, 1); + return ret; +} diff --git a/providers/mlx5/dr_send.c b/providers/mlx5/dr_send.c new file mode 100644 index 0000000..a0237ac --- /dev/null +++ b/providers/mlx5/dr_send.c @@ -0,0 +1,1041 @@ +/* + * Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#include <util/mmio.h> +#include "mlx5dv_dr.h" +#include "wqe.h" + +#define QUEUE_SIZE 128 +#define SIGNAL_PER_DIV_QUEUE 16 +#define TH_NUMS_TO_DRAIN 2 + +enum { + CQ_OK = 0, + CQ_EMPTY = -1, + CQ_POLL_ERR = -2 +}; + +struct dr_qp_init_attr { + uint32_t cqn; + uint32_t pdn; + struct mlx5dv_devx_uar *uar; + struct ibv_qp_cap cap; +}; + +static void *dr_cq_get_cqe(struct dr_cq *dr_cq, int n) +{ + return dr_cq->buf + n * dr_cq->cqe_sz; +} + +static void *dr_cq_get_sw_cqe(struct dr_cq *dr_cq, int n) +{ + void *cqe = dr_cq_get_cqe(dr_cq, n & (dr_cq->ncqe - 1)); + struct mlx5_cqe64 *cqe64; + + cqe64 = (dr_cq->cqe_sz == 64) ? cqe : cqe + 64; + + if (likely(mlx5dv_get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) && + !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ + !!(n & dr_cq->ncqe))) + return cqe64; + else + return NULL; +} + +static int dr_get_next_cqe(struct dr_cq *dr_cq, + struct mlx5_cqe64 **pcqe64) +{ + struct mlx5_cqe64 *cqe64; + + cqe64 = dr_cq_get_sw_cqe(dr_cq, dr_cq->cons_index); + if (!cqe64) + return CQ_EMPTY; + + ++dr_cq->cons_index; + /* + * Make sure we read CQ entry contents after we've checked the + * ownership bit. 
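+ * The udma_from_device_barrier() below enforces that ordering.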
+ */ + udma_from_device_barrier(); + + *pcqe64 = cqe64; + + return CQ_OK; +} + +static int dr_parse_cqe(struct dr_cq *dr_cq, struct mlx5_cqe64 *cqe64) +{ + uint16_t wqe_ctr; + uint8_t opcode; + int idx; + + wqe_ctr = be16toh(cqe64->wqe_counter); + opcode = mlx5dv_get_cqe_opcode(cqe64); + if (opcode == MLX5_CQE_REQ_ERR) { + idx = wqe_ctr & (dr_cq->qp->sq.wqe_cnt - 1); + dr_cq->qp->sq.tail = dr_cq->qp->sq.wqe_head[idx] + 1; + } else if (opcode == MLX5_CQE_RESP_ERR) { + ++dr_cq->qp->sq.tail; + } else { + idx = wqe_ctr & (dr_cq->qp->sq.wqe_cnt - 1); + dr_cq->qp->sq.tail = dr_cq->qp->sq.wqe_head[idx] + 1; + + return CQ_OK; + } + + return CQ_POLL_ERR; +} + +static int dr_cq_poll_one(struct dr_cq *dr_cq) +{ + struct mlx5_cqe64 *cqe64; + int err; + + err = dr_get_next_cqe(dr_cq, &cqe64); + if (err == CQ_EMPTY) + return err; + + return dr_parse_cqe(dr_cq, cqe64); +} + +static int dr_poll_cq(struct dr_cq *dr_cq, int ne) +{ + int npolled; + int err = 0; + + for (npolled = 0; npolled < ne; ++npolled) { + err = dr_cq_poll_one(dr_cq); + if (err != CQ_OK) + break; + } + dr_cq->db[MLX5_CQ_SET_CI] = htobe32(dr_cq->cons_index & + 0xffffff); + return err == CQ_POLL_ERR ? err : npolled; +} + +/* We calculate for specific RC QP with the required functionality */ +static int dr_qp_calc_rc_send_wqe(struct dr_qp_init_attr *attr) +{ + int size; + int inl_size = 0; + int tot_size; + + size = sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_raddr_seg); + if (attr->cap.max_inline_data) + inl_size = size + align(sizeof(struct mlx5_wqe_inl_data_seg) + + attr->cap.max_inline_data, 16); + + size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg); + tot_size = max_int(size, inl_size); + + return align(tot_size, MLX5_SEND_WQE_BB); +} + +static int dr_calc_sq_size(struct dr_qp *dr_qp, + struct dr_qp_init_attr *attr) +{ + int wqe_size; + int wq_size; + + wqe_size = dr_qp_calc_rc_send_wqe(attr); + + dr_qp->max_inline_data = wqe_size - + (sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_raddr_seg)) - + sizeof(struct mlx5_wqe_inl_data_seg); + + wq_size = roundup_pow_of_two(attr->cap.max_send_wr * wqe_size); + dr_qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB; + dr_qp->sq.wqe_shift = STATIC_ILOG_32(MLX5_SEND_WQE_BB) - 1; + dr_qp->sq.max_gs = attr->cap.max_send_sge; + dr_qp->sq.max_post = wq_size / wqe_size; + + return wq_size; +} + +static int dr_qp_calc_recv_wqe(struct dr_qp_init_attr *attr) +{ + uint32_t size; + int num_scatter; + + num_scatter = max_t(uint32_t, attr->cap.max_recv_sge, 1); + size = sizeof(struct mlx5_wqe_data_seg) * num_scatter; + + size = roundup_pow_of_two(size); + + return size; +} + +static int dr_calc_rq_size(struct dr_qp *dr_qp, + struct dr_qp_init_attr *attr) +{ + int wqe_size; + int wq_size; + + wqe_size = dr_qp_calc_recv_wqe(attr); + + wq_size = roundup_pow_of_two(attr->cap.max_recv_wr) * wqe_size; + wq_size = max(wq_size, MLX5_SEND_WQE_BB); + dr_qp->rq.wqe_cnt = wq_size / wqe_size; + dr_qp->rq.wqe_shift = ilog32(wqe_size - 1); + dr_qp->rq.max_post = 1 << ilog32(wq_size / wqe_size - 1); + dr_qp->rq.max_gs = wqe_size / sizeof(struct mlx5_wqe_data_seg); + + return wq_size; +} + +static int dr_calc_wq_size(struct dr_qp *dr_qp, struct dr_qp_init_attr *attr) +{ + int result; + int ret; + + result = dr_calc_sq_size(dr_qp, attr); + + ret = dr_calc_rq_size(dr_qp, attr); + + result += ret; + dr_qp->sq.offset = ret; + dr_qp->rq.offset = 0; + + return result; +} + +static int dr_qp_alloc_buf(struct dr_qp *dr_qp, int size) +{ + int al_size; + int ret; + + dr_qp->sq.wqe_head = 
malloc(dr_qp->sq.wqe_cnt * + sizeof(*dr_qp->sq.wqe_head)); + if (!dr_qp->sq.wqe_head) { + errno = ENOMEM; + return errno; + } + + al_size = align(size, sysconf(_SC_PAGESIZE)); + ret = posix_memalign(&dr_qp->buf.buf, sysconf(_SC_PAGESIZE), al_size); + if (ret) { + errno = ret; + goto free_wqe_head; + } + + dr_qp->buf.length = al_size; + dr_qp->buf.type = MLX5_ALLOC_TYPE_ANON; + memset(dr_qp->buf.buf, 0, dr_qp->buf.length); + + return 0; + +free_wqe_head: + free(dr_qp->sq.wqe_head); + return ret; +} + +static struct dr_qp *dr_create_rc_qp(struct ibv_context *ctx, + struct dr_qp_init_attr *attr) +{ + struct dr_devx_qp_create_attr qp_create_attr; + struct mlx5dv_devx_obj *obj; + struct dr_qp *dr_qp; + int size; + int ret; + + dr_qp = calloc(1, sizeof(*dr_qp)); + if (!dr_qp) { + errno = ENOMEM; + return NULL; + } + + size = dr_calc_wq_size(dr_qp, attr); + + if (dr_qp_alloc_buf(dr_qp, size)) + goto err_alloc_bufs; + + dr_qp->sq_start = dr_qp->buf.buf + dr_qp->sq.offset; + dr_qp->sq.qend = dr_qp->buf.buf + dr_qp->sq.offset + + (dr_qp->sq.wqe_cnt << dr_qp->sq.wqe_shift); + dr_qp->rq.head = 0; + dr_qp->rq.tail = 0; + dr_qp->sq.cur_post = 0; + + ret = posix_memalign((void **)&dr_qp->db, 8, 8); + if (ret) { + errno = ret; + goto err_db_alloc; + } + + dr_qp->db[MLX5_RCV_DBR] = 0; + dr_qp->db[MLX5_SND_DBR] = 0; + dr_qp->db_umem = mlx5dv_devx_umem_reg(ctx, dr_qp->db, 8, + IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_READ); + if (!dr_qp->db_umem) + goto err_db_umem; + + dr_qp->buf_umem = mlx5dv_devx_umem_reg(ctx, dr_qp->buf.buf, + dr_qp->buf.length, + IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_READ); + if (!dr_qp->buf_umem) + goto err_buf_umem; + + qp_create_attr.page_id = attr->uar->page_id; + qp_create_attr.pdn = attr->pdn; + qp_create_attr.cqn = attr->cqn; + qp_create_attr.pm_state = MLX5_QPC_PM_STATE_MIGRATED; + qp_create_attr.service_type = MLX5_QPC_ST_RC; + qp_create_attr.buff_umem_id = dr_qp->buf_umem->umem_id; + qp_create_attr.db_umem_id = dr_qp->db_umem->umem_id; + qp_create_attr.sq_wqe_cnt = dr_qp->sq.wqe_cnt; + qp_create_attr.rq_wqe_cnt = dr_qp->rq.wqe_cnt; + qp_create_attr.rq_wqe_shift = dr_qp->rq.wqe_shift; + + obj = dr_devx_create_qp(ctx, &qp_create_attr); + if (!obj) + goto err_qp_create; + + dr_qp->uar = attr->uar; + dr_qp->obj = obj; + + return dr_qp; + +err_qp_create: + mlx5dv_devx_umem_dereg(dr_qp->buf_umem); +err_buf_umem: + mlx5dv_devx_umem_dereg(dr_qp->db_umem); +err_db_umem: + free(dr_qp->db); +err_db_alloc: + free(dr_qp->sq.wqe_head); + free(dr_qp->buf.buf); +err_alloc_bufs: + free(dr_qp); + return NULL; +} + +static int dr_destroy_qp(struct dr_qp *dr_qp) +{ + int ret; + + ret = mlx5dv_devx_obj_destroy(dr_qp->obj); + if (ret) + return ret; + + ret = mlx5dv_devx_umem_dereg(dr_qp->buf_umem); + if (ret) + return ret; + + ret = mlx5dv_devx_umem_dereg(dr_qp->db_umem); + if (ret) + return ret; + + free(dr_qp->db); + free(dr_qp->sq.wqe_head); + free(dr_qp->buf.buf); + free(dr_qp); + + return 0; +} + +static void dr_set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg, + uint64_t remote_addr, uint32_t rkey) +{ + rseg->raddr = htobe64(remote_addr); + rseg->rkey = htobe32(rkey); + rseg->reserved = 0; +} + +static void dr_post_send_db(struct dr_qp *dr_qp, int size, void *ctrl) +{ + dr_qp->sq.head += 2; /* RDMA_WRITE + RDMA_READ */ + + /* + * Make sure that descriptors are written before + * updating doorbell record and ringing the doorbell + */ + udma_to_device_barrier(); + dr_qp->db[MLX5_SND_DBR] = htobe32(dr_qp->sq.cur_post & 0xffff); + + 
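+ /*
+ * Ring the doorbell: write the first 8 bytes of the WQE ctrl segment
+ * to the UAR register so the HW processes the new WQE.
+ */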
/* Make sure that the doorbell write happens before the memcpy
+ * to WC memory below
+ */
+ mmio_wc_start();
+ mmio_write64_be((uint8_t *)dr_qp->uar->reg_addr, *(__be64 *)ctrl);
+ mmio_flush_writes();
+}
+
+static void dr_set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg,
+ struct dr_data_seg *data_seg)
+{
+ dseg->byte_count = htobe32(data_seg->length);
+ dseg->lkey = htobe32(data_seg->lkey);
+ dseg->addr = htobe64(data_seg->addr);
+}
+
+static int dr_set_data_inl_seg(struct dr_qp *dr_qp,
+ struct dr_data_seg *data_seg,
+ void *wqe, uint32_t opcode, int *sz)
+{
+ struct mlx5_wqe_inline_seg *seg;
+ void *qend = dr_qp->sq.qend;
+ int inl = 0;
+ void *addr;
+ int copy;
+ int len;
+
+ seg = wqe;
+ wqe += sizeof(*seg);
+ addr = (void *)(unsigned long)(data_seg->addr);
+ len = data_seg->length;
+ inl += len;
+
+ if (unlikely(wqe + len > qend)) {
+ copy = qend - wqe;
+ memcpy(wqe, addr, copy);
+ addr += copy;
+ len -= copy;
+ wqe = dr_qp->sq_start;
+ }
+ memcpy(wqe, addr, len);
+ wqe += len;
+
+ if (likely(inl)) {
+ seg->byte_count = htobe32(inl | MLX5_INLINE_SEG);
+ *sz = align(inl + sizeof(seg->byte_count), 16) / 16;
+ } else {
+ *sz = 0;
+ }
+
+ return 0;
+}
+
+static void dr_set_ctrl_seg(struct mlx5_wqe_ctrl_seg *ctrl,
+ struct dr_data_seg *data_seg)
+{
+ *(uint32_t *)((void *)ctrl + 8) = 0;
+ ctrl->imm = 0;
+ ctrl->fm_ce_se = data_seg->send_flags & IBV_SEND_SIGNALED ?
+ MLX5_WQE_CTRL_CQ_UPDATE : 0;
+}
+
+static void dr_rdma_segments(struct dr_qp *dr_qp, uint64_t remote_addr,
+ uint32_t rkey, struct dr_data_seg *data_seg,
+ uint32_t opcode, int nreq)
+{
+ struct mlx5_wqe_ctrl_seg *ctrl = NULL;
+ void *qend = dr_qp->sq.qend;
+ unsigned idx;
+ int size = 0;
+ void *seg;
+
+ idx = dr_qp->sq.cur_post & (dr_qp->sq.wqe_cnt - 1);
+ ctrl = dr_qp->sq_start + (idx << MLX5_SEND_WQE_SHIFT);
+ seg = ctrl;
+ dr_set_ctrl_seg(ctrl, data_seg);
+ seg += sizeof(*ctrl);
+ size = sizeof(*ctrl) / 16;
+
+ dr_set_raddr_seg(seg, remote_addr, rkey);
+ seg += sizeof(struct mlx5_wqe_raddr_seg);
+ size += sizeof(struct mlx5_wqe_raddr_seg) / 16;
+
+ if (data_seg->send_flags & IBV_SEND_INLINE) {
+ int uninitialized_var(sz);
+
+ dr_set_data_inl_seg(dr_qp, data_seg, seg, opcode, &sz);
+ size += sz;
+ } else {
+ if (unlikely(seg == qend))
+ seg = dr_qp->sq_start;
+ dr_set_data_ptr_seg(seg, data_seg);
+ size += sizeof(struct mlx5_wqe_data_seg) / 16;
+ }
+ ctrl->opmod_idx_opcode =
+ htobe32(((dr_qp->sq.cur_post & 0xffff) << 8) | opcode);
+ ctrl->qpn_ds = htobe32(size | (dr_qp->obj->object_id << 8));
+ dr_qp->sq.wqe_head[idx] = dr_qp->sq.head + nreq;
+ dr_qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB);
+
+ if (nreq)
+ dr_post_send_db(dr_qp, size, ctrl);
+}
+
+static void dr_post_send(struct dr_qp *dr_qp, struct postsend_info *send_info)
+{
+ dr_rdma_segments(dr_qp, send_info->remote_addr, send_info->rkey,
+ &send_info->write, MLX5_OPCODE_RDMA_WRITE, 0);
+ dr_rdma_segments(dr_qp, send_info->remote_addr, send_info->rkey,
+ &send_info->read, MLX5_OPCODE_RDMA_READ, 1);
+}
+
+/*
+ * dr_send_fill_and_append_ste_send_info: Add data to be sent with send_list
+ * parameters:
+ * @ste - The ste that this data is attached to
+ * @size - Size of the data to write
+ * @offset - Offset of the data from the start of the hw_ste entry
+ * @data - The data
+ * @ste_info - ste_info to be sent with send_list
+ * @send_list - The list to append to
+ * @copy_data - If true, the data should be copied and kept, because it is
+ * not backed up anywhere (as in re-hash).
+ * If false, the data may be updated after it was added to
+ * the list.
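+ * Entries queued on @send_list are later written to the device in
+ * list order (see dr_rule_send_update_list() and dr_ste_free()).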
+ */
+void dr_send_fill_and_append_ste_send_info(struct dr_ste *ste, uint16_t size,
+ uint16_t offset, uint8_t *data,
+ struct dr_ste_send_info *ste_info,
+ struct list_head *send_list,
+ bool copy_data)
+{
+ ste_info->size = size;
+ ste_info->ste = ste;
+ ste_info->offset = offset;
+
+ if (copy_data) {
+ memcpy(ste_info->data_cont, data, size);
+ ste_info->data = ste_info->data_cont;
+ } else {
+ ste_info->data = data;
+ }
+
+ list_add_tail(send_list, &ste_info->send_list);
+}
+
+static bool dr_is_device_fatal(struct mlx5dv_dr_domain *dmn)
+{
+ struct mlx5_context *mlx5_ctx = to_mctx(dmn->ctx);
+
+ if (mlx5_ctx->flags & MLX5_CTX_FLAGS_FATAL_STATE)
+ return true;
+ return false;
+}
+
+/*
+ * The function consumes one WC at a time, unless the queue is full, which
+ * means the HW is a full queue length behind the SW; in that case the
+ * function drains the CQ until it is empty.
+ */
+static int dr_handle_pending_wc(struct mlx5dv_dr_domain *dmn,
+ struct dr_send_ring *send_ring)
+{
+ bool is_drain = false;
+ int ne;
+
+ if (send_ring->pending_wqe >= send_ring->signal_th) {
+ /* Queue is full, start draining it */
+ if (send_ring->pending_wqe >= dmn->send_ring->signal_th * TH_NUMS_TO_DRAIN)
+ is_drain = true;
+
+ do {
+ /*
+ * On IBV_EVENT_DEVICE_FATAL a success is returned to
+ * let the application free its resources successfully
+ */
+ if (dr_is_device_fatal(dmn))
+ return 0;
+
+ ne = dr_poll_cq(&send_ring->cq, 1);
+ if (ne < 0) {
+ dr_dbg(dmn, "poll CQ failed\n");
+ return ne;
+ } else if (ne == 1) {
+ send_ring->pending_wqe -= send_ring->signal_th;
+ }
+ } while (is_drain && send_ring->pending_wqe);
+ }
+
+ return 0;
+}
+
+static void dr_fill_data_segs(struct dr_send_ring *send_ring,
+ struct postsend_info *send_info)
+{
+ unsigned int inline_flag;
+
+ send_ring->pending_wqe++;
+ if (!send_info->write.lkey)
+ inline_flag = IBV_SEND_INLINE;
+ else
+ inline_flag = 0;
+
+ send_info->write.send_flags = inline_flag;
+
+ if (send_ring->pending_wqe % send_ring->signal_th == 0)
+ send_info->write.send_flags |= IBV_SEND_SIGNALED;
+
+ send_ring->pending_wqe++;
+ send_info->read.length = send_info->write.length;
+ if (inline_flag) {
+ /* Read into dedicated buffer */
+ send_info->read.addr = (uintptr_t)send_ring->sync_buff;
+ send_info->read.lkey = send_ring->sync_mr->lkey;
+ } else {
+ /* Read into the same write area */
+ send_info->read.addr = (uintptr_t)send_info->write.addr;
+ send_info->read.lkey = send_ring->mr->lkey;
+ }
+
+ if (send_ring->pending_wqe % send_ring->signal_th == 0)
+ send_info->read.send_flags = IBV_SEND_SIGNALED;
+ else
+ send_info->read.send_flags = 0;
+}
+
+static int dr_postsend_icm_data(struct mlx5dv_dr_domain *dmn,
+ struct postsend_info *send_info)
+{
+ struct dr_send_ring *send_ring = dmn->send_ring;
+ uint32_t buff_offset;
+ int ret;
+
+ ret = dr_handle_pending_wc(dmn, send_ring);
+ if (ret)
+ return ret;
+
+ if (send_info->write.length > dmn->info.max_inline_size) {
+ buff_offset = (send_ring->tx_head & (dmn->send_ring->signal_th - 1)) *
+ send_ring->max_post_send_size;
+ /* Copy to ring mr */
+ memcpy(send_ring->buf + buff_offset,
+ (void *) (uintptr_t)send_info->write.addr,
+ send_info->write.length);
+ send_info->write.addr = (uintptr_t)send_ring->buf + buff_offset;
+ send_info->write.lkey = send_ring->mr->lkey;
+ }
+
+ send_ring->tx_head++;
+ dr_fill_data_segs(send_ring, send_info);
+ dr_post_send(send_ring->qp, send_info);
+
+ return 0;
+}
+
+static int dr_get_tbl_copy_details(struct mlx5dv_dr_domain *dmn,
+ struct dr_ste_htbl *htbl,
+ uint8_t **data,
+ uint32_t *byte_size,
+ int *iterations,
+ int *num_stes)
+{
+ int alloc_size;
+
+ if (htbl->chunk->byte_size > dmn->send_ring->max_post_send_size) {
+ *iterations = htbl->chunk->byte_size / dmn->send_ring->max_post_send_size;
+ *byte_size = dmn->send_ring->max_post_send_size;
+ alloc_size = *byte_size;
+ *num_stes = *byte_size / DR_STE_SIZE;
+ } else {
+ *iterations = 1;
+ *num_stes = htbl->chunk->num_of_entries;
+ alloc_size = *num_stes * DR_STE_SIZE;
+ }
+
+ *data = calloc(1, alloc_size);
+ if (!*data) {
+ errno = ENOMEM;
+ return errno;
+ }
+
+ return 0;
+}
+
+/*
+ * dr_send_postsend_ste: write 'size' bytes at 'offset' into the HW ICM.
+ *
+ * Input:
+ * dmn - Domain
+ * ste - The ste struct that contains the data (at least part of it)
+ * data - The real data to send
+ * size - Data size to write.
+ * offset - The offset from the ICM-mapped data at which to start writing;
+ * this allows writing only part of the buffer.
+ *
+ * Return: 0 on success.
+ */
+int dr_send_postsend_ste(struct mlx5dv_dr_domain *dmn, struct dr_ste *ste,
+ uint8_t *data, uint16_t size, uint16_t offset)
+{
+ struct postsend_info send_info = {};
+
+ send_info.write.addr = (uintptr_t) data;
+ send_info.write.length = size;
+ send_info.write.lkey = 0;
+ send_info.remote_addr = dr_ste_get_mr_addr(ste) + offset;
+ send_info.rkey = ste->htbl->chunk->rkey;
+
+ return dr_postsend_icm_data(dmn, &send_info);
+}
+
+int dr_send_postsend_htbl(struct mlx5dv_dr_domain *dmn, struct dr_ste_htbl *htbl,
+ uint8_t *formated_ste, uint8_t *mask)
+{
+ uint32_t byte_size = htbl->chunk->byte_size;
+ int i, j, num_stes_per_iter, iterations;
+ uint8_t *data;
+ int ret;
+
+ ret = dr_get_tbl_copy_details(dmn, htbl, &data, &byte_size,
+ &iterations, &num_stes_per_iter);
+ if (ret)
+ return ret;
+
+ /* Send the data 'iterations' times */
+ for (i = 0; i < iterations; i++) {
+ uint32_t ste_index = i * (byte_size / DR_STE_SIZE);
+ struct postsend_info send_info = {};
+
+ /* Copy all STEs into the data buffer, adding the bit_mask */
+ for (j = 0; j < num_stes_per_iter; j++) {
+ if (dr_ste_is_not_valid_entry(htbl->ste_arr[ste_index + j].hw_ste)) {
+ memcpy(data + (j * DR_STE_SIZE),
+ formated_ste, DR_STE_SIZE);
+ } else {
+ /* Copy data */
+ memcpy(data + (j * DR_STE_SIZE), htbl->ste_arr[ste_index + j].hw_ste,
+ DR_STE_SIZE_REDUCED);
+ /* Copy bit_mask */
+ memcpy(data + (j * DR_STE_SIZE) + DR_STE_SIZE_REDUCED,
+ mask, DR_STE_SIZE_MASK);
+ }
+ }
+
+ send_info.write.addr = (uintptr_t) data;
+ send_info.write.length = byte_size;
+ send_info.write.lkey = 0;
+ send_info.remote_addr = dr_ste_get_mr_addr(htbl->ste_arr + ste_index);
+ send_info.rkey = htbl->chunk->rkey;
+
+ ret = dr_postsend_icm_data(dmn, &send_info);
+ if (ret)
+ goto out_free;
+ }
+
+out_free:
+ free(data);
+ return ret;
+}
+
+/* Initialize htbl with default STEs */
+int dr_send_postsend_formated_htbl(struct mlx5dv_dr_domain *dmn,
+ struct dr_ste_htbl *htbl,
+ uint8_t *ste_init_data,
+ bool update_hw_ste)
+{
+ uint32_t byte_size = htbl->chunk->byte_size;
+ int i, num_stes, iterations, ret;
+ uint8_t *data;
+
+ ret = dr_get_tbl_copy_details(dmn, htbl, &data, &byte_size,
+ &iterations, &num_stes);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < num_stes; i++) {
+ uint8_t *copy_dst;
+
+ /* Copy the same ste into the data buffer */
+ copy_dst = data + i * DR_STE_SIZE;
+ memcpy(copy_dst, ste_init_data, DR_STE_SIZE);
+
+ if (update_hw_ste) {
+ /* Copy the reduced ste to hash table ste_arr */
+ copy_dst = htbl->hw_ste_arr + i * DR_STE_SIZE_REDUCED;
+ memcpy(copy_dst, ste_init_data, DR_STE_SIZE_REDUCED);
+ }
+
} + + /* Send the data iteration times */ + for (i = 0; i < iterations; i++) { + uint32_t ste_index = i * (byte_size / DR_STE_SIZE); + struct postsend_info send_info = {}; + + send_info.write.addr = (uintptr_t) data; + send_info.write.length = byte_size; + send_info.write.lkey = 0; + send_info.remote_addr = dr_ste_get_mr_addr(htbl->ste_arr + ste_index); + send_info.rkey = htbl->chunk->rkey; + + ret = dr_postsend_icm_data(dmn, &send_info); + if (ret) + goto out_free; + } + +out_free: + free(data); + return ret; +} + +int dr_send_postsend_action(struct mlx5dv_dr_domain *dmn, + struct mlx5dv_dr_action *action) +{ + struct postsend_info send_info = {}; + int ret; + + send_info.write.addr = (uintptr_t) action->rewrite.data; + send_info.write.length = action->rewrite.num_of_actions * + DR_MODIFY_ACTION_SIZE; + send_info.write.lkey = 0; + send_info.remote_addr = action->rewrite.chunk->mr_addr; + send_info.rkey = action->rewrite.chunk->rkey; + + pthread_mutex_lock(&dmn->mutex); + ret = dr_postsend_icm_data(dmn, &send_info); + pthread_mutex_unlock(&dmn->mutex); + + return ret; +} + +static int dr_prepare_qp_to_rts(struct mlx5dv_dr_domain *dmn) +{ + struct dr_devx_qp_rts_attr rts_attr = {}; + struct dr_devx_qp_rtr_attr rtr_attr = {}; + struct dr_qp *dr_qp = dmn->send_ring->qp; + enum ibv_mtu mtu = IBV_MTU_1024; + uint16_t gid_index = 0; + int port = 1; + int ret; + + /* Init */ + ret = dr_devx_modify_qp_rst2init(dmn->ctx, dr_qp->obj, port); + if (ret) { + dr_dbg(dmn, "Failed to modify QP to INIT, ret: %d\n", ret); + return ret; + } + + /* RTR */ + ret = dr_devx_query_gid(dmn->ctx, port, gid_index, &rtr_attr.dgid_attr); + if (ret) { + dr_dbg(dmn, "can't read sgid of index %d\n", gid_index); + return ret; + } + + rtr_attr.mtu = mtu; + rtr_attr.qp_num = dr_qp->obj->object_id; + rtr_attr.min_rnr_timer = 12; + rtr_attr.port_num = port; + rtr_attr.sgid_index = gid_index; + + ret = dr_devx_modify_qp_init2rtr(dmn->ctx, dr_qp->obj, &rtr_attr); + if (ret) { + dr_dbg(dmn, "Failed to modify QP to RTR, ret: %d\n", ret); + return ret; + } + + /* RTS */ + rts_attr.timeout = 14; + rts_attr.retry_cnt = 7; + rts_attr.rnr_retry = 7; + + ret = dr_devx_modify_qp_rtr2rts(dmn->ctx, dr_qp->obj, &rts_attr); + if (ret) { + dr_dbg(dmn, "Failed to modify QP to RTS, ret: %d\n", ret); + return ret; + } + + return 0; +} + +/* Each domain has its own ib resources */ +int dr_send_ring_alloc(struct mlx5dv_dr_domain *dmn) +{ + struct dr_qp_init_attr init_attr = {}; + struct mlx5dv_pd mlx5_pd = {}; + struct mlx5dv_cq mlx5_cq = {}; + int cq_size, page_size; + struct mlx5dv_obj obj; + int size; + int access_flags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_READ; + int ret; + + dmn->send_ring = calloc(1, sizeof(*dmn->send_ring)); + if (!dmn->send_ring) { + dr_dbg(dmn, "Couldn't allocate send-ring\n"); + errno = ENOMEM; + return errno; + } + + cq_size = QUEUE_SIZE + 1; + dmn->send_ring->cq.ibv_cq = ibv_create_cq(dmn->ctx, cq_size, NULL, NULL, 0); + if (!dmn->send_ring->cq.ibv_cq) { + dr_dbg(dmn, "Failed to create CQ with %u entries\n", cq_size); + ret = ENODEV; + errno = ENODEV; + goto free_send_ring; + } + + obj.cq.in = dmn->send_ring->cq.ibv_cq; + obj.cq.out = &mlx5_cq; + + ret = mlx5dv_init_obj(&obj, MLX5DV_OBJ_CQ); + if (ret) + goto clean_cq; + + dmn->send_ring->cq.buf = mlx5_cq.buf; + dmn->send_ring->cq.db = mlx5_cq.dbrec; + dmn->send_ring->cq.ncqe = mlx5_cq.cqe_cnt; + dmn->send_ring->cq.cqe_sz = mlx5_cq.cqe_size; + + obj.pd.in = dmn->pd; + obj.pd.out = &mlx5_pd; + + ret = mlx5dv_init_obj(&obj, 
MLX5DV_OBJ_PD); + if (ret) + goto clean_cq; + + init_attr.cqn = mlx5_cq.cqn; + init_attr.pdn = mlx5_pd.pdn; + init_attr.uar = dmn->uar; + init_attr.cap.max_send_wr = QUEUE_SIZE; + init_attr.cap.max_recv_wr = 1; + init_attr.cap.max_send_sge = 1; + init_attr.cap.max_recv_sge = 1; + init_attr.cap.max_inline_data = DR_STE_SIZE; + + dmn->send_ring->qp = dr_create_rc_qp(dmn->ctx, &init_attr); + if (!dmn->send_ring->qp) { + dr_dbg(dmn, "Couldn't create QP\n"); + ret = errno; + goto clean_cq; + } + dmn->send_ring->cq.qp = dmn->send_ring->qp; + + dmn->info.max_send_wr = QUEUE_SIZE; + dmn->info.max_inline_size = min(dmn->send_ring->qp->max_inline_data, + DR_STE_SIZE); + + dmn->send_ring->signal_th = dmn->info.max_send_wr / SIGNAL_PER_DIV_QUEUE; + + /* Prepare qp to be used */ + ret = dr_prepare_qp_to_rts(dmn); + if (ret) { + dr_dbg(dmn, "Couldn't prepare QP\n"); + goto clean_qp; + } + + dmn->send_ring->max_post_send_size = + dr_icm_pool_chunk_size_to_byte(DR_CHUNK_SIZE_1K, DR_ICM_TYPE_STE); + + /* Allocating the max size as a buffer for writing */ + size = dmn->send_ring->signal_th * dmn->send_ring->max_post_send_size; + page_size = sysconf(_SC_PAGESIZE); + ret = posix_memalign(&dmn->send_ring->buf, page_size, size); + if (ret) { + dr_dbg(dmn, "Couldn't allocate send-ring buf.\n"); + errno = ret; + goto clean_qp; + } + + memset(dmn->send_ring->buf, 0, size); + dmn->send_ring->buf_size = size; + + dmn->send_ring->mr = ibv_reg_mr(dmn->pd, dmn->send_ring->buf, size, + access_flags); + if (!dmn->send_ring->mr) { + dr_dbg(dmn, "Couldn't register send-ring MR\n"); + ret = errno; + goto free_mem; + } + + dmn->send_ring->sync_mr = ibv_reg_mr(dmn->pd, dmn->send_ring->sync_buff, + MIN_READ_SYNC, + IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_WRITE); + if (!dmn->send_ring->sync_mr) { + dr_dbg(dmn, "Couldn't register sync mr\n"); + ret = errno; + goto clean_mr; + } + + return 0; + +clean_mr: + ibv_dereg_mr(dmn->send_ring->mr); +free_mem: + free(dmn->send_ring->buf); +clean_qp: + dr_destroy_qp(dmn->send_ring->qp); +clean_cq: + ibv_destroy_cq(dmn->send_ring->cq.ibv_cq); +free_send_ring: + free(dmn->send_ring); + + return ret; +} + +void dr_send_ring_free(struct dr_send_ring *send_ring) +{ + dr_destroy_qp(send_ring->qp); + ibv_destroy_cq(send_ring->cq.ibv_cq); + ibv_dereg_mr(send_ring->sync_mr); + ibv_dereg_mr(send_ring->mr); + free(send_ring->buf); + free(send_ring); +} + +int dr_send_ring_force_drain(struct mlx5dv_dr_domain *dmn) +{ + struct dr_send_ring *send_ring = dmn->send_ring; + struct postsend_info send_info = {}; + uint8_t data[DR_STE_SIZE]; + int i, num_of_sends_req; + int ret; + + /* Sending this amount of requests makes sure we will get drain */ + num_of_sends_req = send_ring->signal_th * TH_NUMS_TO_DRAIN / 2; + + /* Send fake requests forcing the last to be signaled */ + send_info.write.addr = (uintptr_t) data; + send_info.write.length = DR_STE_SIZE; + send_info.write.lkey = 0; + /* Using the sync_mr in order to write/read */ + send_info.remote_addr = (uintptr_t) send_ring->sync_mr->addr; + send_info.rkey = send_ring->sync_mr->rkey; + + + for (i = 0; i < num_of_sends_req; i++) { + ret = dr_postsend_icm_data(dmn, &send_info); + if (ret) + return ret; + } + + ret = dr_handle_pending_wc(dmn, send_ring); + + return ret; +} diff --git a/providers/mlx5/dr_ste.c b/providers/mlx5/dr_ste.c new file mode 100644 index 0000000..2c401fd --- /dev/null +++ b/providers/mlx5/dr_ste.c @@ -0,0 +1,2428 @@ +/* + * Copyright (c) 2019, Mellanox Technologies. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdlib.h> +#include <string.h> +#include "mlx5dv_dr.h" + +#define IPV4_ETHERTYPE 0x0800 +#define IPV6_ETHERTYPE 0x86DD +#define STE_IPV4 0x1 +#define STE_IPV6 0x2 +#define STE_TCP 0x1 +#define STE_UDP 0x2 +#define STE_SPI 0x3 +#define IP_VERSION_IPV4 0x4 +#define IP_VERSION_IPV6 0x6 +#define IP_PROTOCOL_UDP 0x11 +#define IP_PROTOCOL_TCP 0x06 +#define IP_PROTOCOL_IPSEC 0x33 +#define TCP_PROTOCOL 0x6 +#define UDP_PROTOCOL 0x11 +#define IPSEC_PROTOCOL 0x33 + +#define DR_STE_ENABLE_FLOW_TAG (1 << 31) + +/* Read from layout struct */ +#define DR_STE_GET(typ, p, fld) DEVX_GET(ste_##typ, p, fld) + +/* Write to layout a value */ +#define DR_STE_SET(typ, p, fld, v) DEVX_SET(ste_##typ, p, fld, v) + +#define DR_STE_SET_BOOL(typ, p, fld, v) DEVX_SET(ste_##typ, p, fld, !!(v)) + +/* Set to STE a specific value using DR_STE_SET */ +#define DR_STE_SET_VAL(lookup_type, tag, t_fname, spec, s_fname, value) do { \ + if ((spec)->s_fname) { \ + DR_STE_SET(lookup_type, tag, t_fname, value); \ + (spec)->s_fname = 0; \ + } \ +} while (0) + +/* Set to STE spec->s_fname to tag->t_fname */ +#define DR_STE_SET_TAG(lookup_type, tag, t_fname, spec, s_fname) \ + DR_STE_SET_VAL(lookup_type, tag, t_fname, spec, s_fname, (spec)->s_fname); + +/* Set to STE -1 to bit_mask->bm_fname and set spec->s_fname as used */ +#define DR_STE_SET_MASK(lookup_type, bit_mask, bm_fname, spec, s_fname) \ + DR_STE_SET_VAL(lookup_type, bit_mask, bm_fname, spec, s_fname, -1); + +/* Set to STE spec->s_fname to bit_mask->bm_fname and set spec->s_fname as used */ +#define DR_STE_SET_MASK_V(lookup_type, bit_mask, bm_fname, spec, s_fname) \ + DR_STE_SET_VAL(lookup_type, bit_mask, bm_fname, spec, s_fname, (spec)->s_fname); + +#define DR_STE_SET_TCP_FLAGS(lookup_type, tag, spec) do { \ + DR_STE_SET_BOOL(lookup_type, tag, tcp_ns, (spec)->tcp_flags & (1 << 8)); \ + DR_STE_SET_BOOL(lookup_type, tag, tcp_cwr, (spec)->tcp_flags & (1 << 7)); \ + DR_STE_SET_BOOL(lookup_type, tag, tcp_ece, (spec)->tcp_flags & (1 << 6)); \ + DR_STE_SET_BOOL(lookup_type, tag, tcp_urg, (spec)->tcp_flags & (1 << 5)); \ + DR_STE_SET_BOOL(lookup_type, tag, tcp_ack, (spec)->tcp_flags & (1 << 4)); \ + DR_STE_SET_BOOL(lookup_type, tag, tcp_psh, (spec)->tcp_flags & 
(1 << 3)); \ + DR_STE_SET_BOOL(lookup_type, tag, tcp_rst, (spec)->tcp_flags & (1 << 2)); \ + DR_STE_SET_BOOL(lookup_type, tag, tcp_syn, (spec)->tcp_flags & (1 << 1)); \ + DR_STE_SET_BOOL(lookup_type, tag, tcp_fin, (spec)->tcp_flags & (1 << 0)); \ +} while (0) + +#define DR_STE_SET_MPLS_MASK(lookup_type, mask, in_out, bit_mask) do { \ + DR_STE_SET_MASK_V(lookup_type, mask, mpls0_label, mask, \ + in_out##_first_mpls_label);\ + DR_STE_SET_MASK_V(lookup_type, mask, mpls0_s_bos, mask, \ + in_out##_first_mpls_s_bos); \ + DR_STE_SET_MASK_V(lookup_type, mask, mpls0_exp, mask, \ + in_out##_first_mpls_exp); \ + DR_STE_SET_MASK_V(lookup_type, mask, mpls0_ttl, mask, \ + in_out##_first_mpls_ttl); \ +} while (0) + +#define DR_STE_SET_MPLS_TAG(lookup_type, mask, in_out, tag) do { \ + DR_STE_SET_TAG(lookup_type, tag, mpls0_label, mask, \ + in_out##_first_mpls_label);\ + DR_STE_SET_TAG(lookup_type, tag, mpls0_s_bos, mask, \ + in_out##_first_mpls_s_bos); \ + DR_STE_SET_TAG(lookup_type, tag, mpls0_exp, mask, \ + in_out##_first_mpls_exp); \ + DR_STE_SET_TAG(lookup_type, tag, mpls0_ttl, mask, \ + in_out##_first_mpls_ttl); \ +} while (0) + +#define DR_STE_IS_OUTER_MPLS_OVER_GRE_SET(_misc) (\ + (_misc)->outer_first_mpls_over_gre_label || \ + (_misc)->outer_first_mpls_over_gre_exp || \ + (_misc)->outer_first_mpls_over_gre_s_bos || \ + (_misc)->outer_first_mpls_over_gre_ttl) +#define DR_STE_IS_OUTER_MPLS_OVER_UDP_SET(_misc) (\ + (_misc)->outer_first_mpls_over_udp_label || \ + (_misc)->outer_first_mpls_over_udp_exp || \ + (_misc)->outer_first_mpls_over_udp_s_bos || \ + (_misc)->outer_first_mpls_over_udp_ttl) + +#define DR_STE_CALC_LU_TYPE(lookup_type, rx, inner) \ + ((inner) ? DR_STE_LU_TYPE_##lookup_type##_I : \ + (rx) ? DR_STE_LU_TYPE_##lookup_type##_D : \ + DR_STE_LU_TYPE_##lookup_type##_O) + +enum dr_ste_tunl_action { + DR_STE_TUNL_ACTION_NONE = 0, + DR_STE_TUNL_ACTION_ENABLE = 1, + DR_STE_TUNL_ACTION_DECAP = 2, + DR_STE_TUNL_ACTION_L3_DECAP = 3, +}; + +enum dr_ste_action_type { + DR_STE_ACTION_TYPE_ENCAP_L3 = 3, + DR_STE_ACTION_TYPE_ENCAP = 4, +}; + +struct dr_hw_ste_format { + uint8_t ctrl[DR_STE_SIZE_CTRL]; + uint8_t tag[DR_STE_SIZE_TAG]; + uint8_t mask[DR_STE_SIZE_MASK]; +}; + +uint32_t dr_ste_calc_hash_index(uint8_t *hw_ste_p, + struct dr_ste_htbl *htbl) +{ + struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; + uint8_t masked[DR_STE_SIZE_TAG] = {}; + uint32_t crc32, index; + uint16_t bit; + int i; + + /* Don't calculate CRC if the result is predicted */ + if (htbl->chunk->num_of_entries == 1 || htbl->byte_mask == 0) + return 0; + + /* Mask tag using byte mask, bit per byte */ + bit = 1 << (DR_STE_SIZE_TAG - 1); + for (i = 0; i < DR_STE_SIZE_TAG; i++) { + if (htbl->byte_mask & bit) + masked[i] = hw_ste->tag[i]; + + bit = bit >> 1; + } + + crc32 = dr_crc32_slice8_calc(masked, DR_STE_SIZE_TAG); + index = crc32 % htbl->chunk->num_of_entries; + + return index; +} + +static uint16_t dr_ste_conv_bit_to_byte_mask(uint8_t *bit_mask) +{ + uint16_t byte_mask = 0; + int i; + + for (i = 0; i < DR_STE_SIZE_MASK; i++) { + byte_mask = byte_mask << 1; + if (bit_mask[i] == 0xff) + byte_mask |= 1; + } + return byte_mask; +} + +void dr_ste_set_bit_mask(uint8_t *hw_ste_p, uint8_t *bit_mask) +{ + struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; + + memcpy(hw_ste->mask, bit_mask, DR_STE_SIZE_MASK); +} + +void dr_ste_rx_set_flow_tag(uint8_t *hw_ste_p, uint32_t flow_tag) +{ + DR_STE_SET(rx_steering_mult, hw_ste_p, qp_list_pointer, + DR_STE_ENABLE_FLOW_TAG | flow_tag); +} + +void 
dr_ste_set_counter_id(uint8_t *hw_ste_p, uint32_t ctr_id)
+{
+ /* This can be used for both rx_steering_mult and for sx_transmit */
+ DR_STE_SET(rx_steering_mult, hw_ste_p, counter_trigger_15_0, ctr_id);
+ DR_STE_SET(rx_steering_mult, hw_ste_p, counter_trigger_23_16, ctr_id >> 16);
+}
+
+void dr_ste_set_tx_encap(void *hw_ste_p, uint32_t reformat_id, int size, bool encap_l3)
+{
+ DR_STE_SET(sx_transmit, hw_ste_p, action_type,
+ encap_l3 ? DR_STE_ACTION_TYPE_ENCAP_L3 : DR_STE_ACTION_TYPE_ENCAP);
+ /* The hardware expects the size here in words (2 bytes) */
+ DR_STE_SET(sx_transmit, hw_ste_p, action_description, size / 2);
+ DR_STE_SET(sx_transmit, hw_ste_p, encap_pointer_vlan_data, reformat_id);
+}
+
+void dr_ste_set_rx_decap(uint8_t *hw_ste_p)
+{
+ DR_STE_SET(rx_steering_mult, hw_ste_p, tunneling_action,
+ DR_STE_TUNL_ACTION_DECAP);
+}
+
+void dr_ste_set_rx_decap_l3(uint8_t *hw_ste_p, bool vlan)
+{
+ DR_STE_SET(rx_steering_mult, hw_ste_p, tunneling_action,
+ DR_STE_TUNL_ACTION_L3_DECAP);
+ DR_STE_SET(modify_packet, hw_ste_p, action_description, vlan ? 1 : 0);
+}
+
+void dr_ste_set_entry_type(uint8_t *hw_ste_p, uint8_t entry_type)
+{
+ DR_STE_SET(general, hw_ste_p, entry_type, entry_type);
+}
+
+uint8_t dr_ste_get_entry_type(uint8_t *hw_ste_p)
+{
+ return DR_STE_GET(general, hw_ste_p, entry_type);
+}
+
+void dr_ste_set_rewrite_actions(uint8_t *hw_ste_p, uint16_t num_of_actions,
+ uint32_t re_write_index)
+{
+ DR_STE_SET(modify_packet, hw_ste_p, number_of_re_write_actions,
+ num_of_actions);
+ DR_STE_SET(modify_packet, hw_ste_p, header_re_write_actions_pointer,
+ re_write_index);
+}
+
+void dr_ste_init(uint8_t *hw_ste_p, uint8_t lu_type, uint8_t entry_type,
+ uint16_t gvmi)
+{
+ DR_STE_SET(general, hw_ste_p, entry_type, entry_type);
+ DR_STE_SET(general, hw_ste_p, entry_sub_type, lu_type);
+ DR_STE_SET(general, hw_ste_p, next_lu_type, DR_STE_LU_TYPE_DONT_CARE);
+
+ /* Set GVMI once, since it is the same for RX/TX;
+ * bits 63_48 of the next table base / miss address encode the next GVMI
+ */
+ DR_STE_SET(rx_steering_mult, hw_ste_p, gvmi, gvmi);
+ DR_STE_SET(rx_steering_mult, hw_ste_p, next_table_base_63_48, gvmi);
+ DR_STE_SET(rx_steering_mult, hw_ste_p, miss_address_63_48, gvmi);
+}
+
+static void dr_ste_set_always_hit(struct dr_hw_ste_format *hw_ste)
+{
+ memset(&hw_ste->tag, 0, sizeof(hw_ste->tag));
+ memset(&hw_ste->mask, 0, sizeof(hw_ste->mask));
+}
+
+static void dr_ste_set_always_miss(struct dr_hw_ste_format *hw_ste)
+{
+ hw_ste->tag[0] = 0xdc;
+ hw_ste->mask[0] = 0;
+}
+
+uint64_t dr_ste_get_miss_addr(uint8_t *hw_ste)
+{
+ uint64_t index =
+ (DR_STE_GET(rx_steering_mult, hw_ste, miss_address_31_6) |
+ DR_STE_GET(rx_steering_mult, hw_ste, miss_address_39_32) << 26);
+
+ return index << 6;
+}
+
+void dr_ste_set_hit_addr(uint8_t *hw_ste, uint64_t icm_addr, uint32_t ht_size)
+{
+ uint64_t index = (icm_addr >> 5) | ht_size;
+
+ DR_STE_SET(general, hw_ste, next_table_base_39_32_size, index >> 27);
+ DR_STE_SET(general, hw_ste, next_table_base_31_5_size, index);
+}
+
+uint64_t dr_ste_get_icm_addr(struct dr_ste *ste)
+{
+ uint32_t index = ste - ste->htbl->ste_arr;
+
+ return ste->htbl->chunk->icm_addr + DR_STE_SIZE * index;
+}
+
+uint64_t dr_ste_get_mr_addr(struct dr_ste *ste)
+{
+ uint32_t index = ste - ste->htbl->ste_arr;
+
+ return ste->htbl->chunk->mr_addr + DR_STE_SIZE * index;
+}
+
+struct list_head *dr_ste_get_miss_list(struct dr_ste *ste)
+{
+ uint32_t index = ste - ste->htbl->ste_arr;
+
+ return &ste->htbl->miss_list[index];
+}
+
+void dr_ste_always_hit_htbl(struct dr_ste *ste, struct dr_ste_htbl
*next_htbl)
+{
+ struct dr_icm_chunk *chunk = next_htbl->chunk;
+ uint8_t *hw_ste = ste->hw_ste;
+
+ DR_STE_SET(general, hw_ste, byte_mask, next_htbl->byte_mask);
+ DR_STE_SET(general, hw_ste, next_lu_type, next_htbl->lu_type);
+ dr_ste_set_hit_addr(hw_ste, chunk->icm_addr, chunk->num_of_entries);
+
+ dr_ste_set_always_hit((struct dr_hw_ste_format *)ste->hw_ste);
+}
+
+bool dr_ste_is_last_in_rule(struct dr_matcher_rx_tx *nic_matcher,
+ uint8_t ste_location)
+{
+ return ste_location == nic_matcher->num_of_builders;
+}
+
+/*
+ * Replace relevant fields, except for:
+ * htbl - keep the origin htbl
+ * miss_list + list - the src was already taken from the list.
+ * icm_addr/mr_addr - depends on the hosting table.
+ *
+ * Before:
+ * | a | -> | b | -> | c | ->
+ *
+ * After:
+ * | a | -> | c | ->
+ * The data that was in b is copied into a.
+ */
+static void dr_ste_replace(struct dr_ste *dst, struct dr_ste *src)
+{
+ memcpy(dst->hw_ste, src->hw_ste, DR_STE_SIZE_REDUCED);
+ dst->next_htbl = src->next_htbl;
+ if (dst->next_htbl)
+ dst->next_htbl->pointing_ste = dst;
+
+ atomic_init(&dst->refcount, atomic_load(&src->refcount));
+
+ list_head_init(&dst->rule_list);
+ list_append_list(&dst->rule_list, &src->rule_list);
+}
+
+/* Free ste which is the head and the only one in miss_list */
+static void
+dr_ste_remove_head_ste(struct dr_ste *ste,
+ struct dr_matcher_rx_tx *nic_matcher,
+ struct dr_ste_send_info *ste_info_head,
+ struct list_head *send_ste_list,
+ struct dr_ste_htbl *stats_tbl)
+{
+ uint8_t tmp_data_ste[DR_STE_SIZE] = {};
+ struct dr_ste tmp_ste = {};
+ uint64_t miss_addr;
+
+ tmp_ste.hw_ste = tmp_data_ste;
+ /*
+ * Use a temp ste because dr_ste_always_miss_addr
+ * touches the bit_mask area, which doesn't exist at ste->hw_ste.
+ */
+ memcpy(tmp_ste.hw_ste, ste->hw_ste, DR_STE_SIZE_REDUCED);
+ miss_addr = nic_matcher->e_anchor->chunk->icm_addr;
+ dr_ste_always_miss_addr(&tmp_ste, miss_addr);
+ memcpy(ste->hw_ste, tmp_ste.hw_ste, DR_STE_SIZE_REDUCED);
+
+ list_del_init(&ste->miss_list_node);
+
+ /* Write full STE size in order to have "always_miss" */
+ dr_send_fill_and_append_ste_send_info(ste, DR_STE_SIZE,
+ 0, tmp_data_ste,
+ ste_info_head,
+ send_ste_list,
+ true /* Copy data */);
+
+ stats_tbl->ctrl.num_of_valid_entries--;
+}
+
+/*
+ * Free ste which is the head but NOT the only one in miss_list:
+ * |_ste_| --> |_next_ste_| -->|__| -->|__| -->/0
+ */
+static void
+dr_ste_replace_head_ste(struct dr_ste *ste, struct dr_ste *next_ste,
+ struct dr_ste_send_info *ste_info_head,
+ struct list_head *send_ste_list,
+ struct dr_ste_htbl *stats_tbl)
+
+{
+ struct dr_ste_htbl *next_miss_htbl;
+
+ next_miss_htbl = next_ste->htbl;
+
+ /* Remove the next_ste from the miss_list before the copy */
+ list_del_init(&next_ste->miss_list_node);
+
+ /* All rule-members that use next_ste should know about that */
+ dr_rule_update_rule_member(next_ste, ste);
+
+ /* Move data from next into ste */
+ dr_ste_replace(ste, next_ste);
+
+ /*
+ * Delete the htbl that contains the next_ste.
+ * The origin htbl stays with the same number of entries.
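+ * (Only next_miss_htbl, which hosted next_ste, drops a reference here.)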
+ */ + dr_htbl_put(next_miss_htbl); + + dr_send_fill_and_append_ste_send_info(ste, DR_STE_SIZE_REDUCED, + 0, ste->hw_ste, + ste_info_head, + send_ste_list, + true /* Copy data */); + + stats_tbl->ctrl.num_of_collisions--; + stats_tbl->ctrl.num_of_valid_entries--; +} + +/* + * Free ste that is located in the middle of the miss list: + * |__| -->|_prev_ste_|->|_ste_|-->|_next_ste_| + */ +static void dr_ste_remove_middle_ste(struct dr_ste *ste, + struct dr_ste_send_info *ste_info, + struct list_head *send_ste_list, + struct dr_ste_htbl *stats_tbl) +{ + struct dr_ste *prev_ste; + uint64_t miss_addr; + + prev_ste = list_prev(dr_ste_get_miss_list(ste), ste, miss_list_node); + assert(prev_ste); + + miss_addr = dr_ste_get_miss_addr(ste->hw_ste); + dr_ste_set_miss_addr(prev_ste->hw_ste, miss_addr); + + dr_send_fill_and_append_ste_send_info(prev_ste, DR_STE_SIZE_REDUCED, 0, + prev_ste->hw_ste, ste_info, + send_ste_list, true /* Copy data*/); + + list_del_init(&ste->miss_list_node); + + stats_tbl->ctrl.num_of_valid_entries--; + stats_tbl->ctrl.num_of_collisions--; +} + +void dr_ste_free(struct dr_ste *ste, + struct mlx5dv_dr_matcher *matcher, + struct dr_matcher_rx_tx *nic_matcher) +{ + struct dr_ste_send_info *cur_ste_info, *tmp_ste_info; + struct mlx5dv_dr_domain *dmn = matcher->tbl->dmn; + struct dr_ste_send_info ste_info_head; + struct dr_ste *next_ste, *first_ste; + LIST_HEAD(send_ste_list); + bool put_on_origin_table = true; + struct dr_ste_htbl *stats_tbl; + + first_ste = list_top(dr_ste_get_miss_list(ste), struct dr_ste, miss_list_node); + stats_tbl = first_ste->htbl; + /* + * Two options: + * 1. ste is head: + * a. head ste is the only ste in the miss list + * b. head ste is not the only ste in the miss-list + * 2. ste is not head + */ + if (first_ste == ste) { /* Ste is the head */ + next_ste = list_next(dr_ste_get_miss_list(ste), ste, miss_list_node); + if (!next_ste) { + /* One and only entry in the list */ + dr_ste_remove_head_ste(ste, nic_matcher, + &ste_info_head, + &send_ste_list, + stats_tbl); + } else { + /* First but not only entry in the list */ + dr_ste_replace_head_ste(ste, next_ste, &ste_info_head, + &send_ste_list, stats_tbl); + put_on_origin_table = false; + } + } else { /* Ste in the middle of the list */ + dr_ste_remove_middle_ste(ste, &ste_info_head, &send_ste_list, stats_tbl); + } + + /* Update HW */ + list_for_each_safe(&send_ste_list, cur_ste_info, tmp_ste_info, send_list) { + list_del(&cur_ste_info->send_list); + dr_send_postsend_ste(dmn, cur_ste_info->ste, + cur_ste_info->data, cur_ste_info->size, + cur_ste_info->offset); + } + + if (put_on_origin_table) + dr_htbl_put(ste->htbl); +} + +bool dr_ste_equal_tag(void *src, void *dst) +{ + struct dr_hw_ste_format *s_hw_ste = (struct dr_hw_ste_format *)src; + struct dr_hw_ste_format *d_hw_ste = (struct dr_hw_ste_format *)dst; + + return !memcmp(s_hw_ste->tag, d_hw_ste->tag, DR_STE_SIZE_TAG); +} + +void dr_ste_set_hit_addr_by_next_htbl(uint8_t *hw_ste, + struct dr_ste_htbl *next_htbl) +{ + struct dr_icm_chunk *chunk = next_htbl->chunk; + + dr_ste_set_hit_addr(hw_ste, chunk->icm_addr, chunk->num_of_entries); +} + +void dr_ste_set_miss_addr(uint8_t *hw_ste_p, uint64_t miss_addr) +{ + uint64_t index = miss_addr >> 6; + + /* Miss address for TX and RX STEs located in the same offsets */ + DR_STE_SET(rx_steering_mult, hw_ste_p, miss_address_39_32, index >> 26); + DR_STE_SET(rx_steering_mult, hw_ste_p, miss_address_31_6, index); +} + +void dr_ste_always_miss_addr(struct dr_ste *ste, uint64_t miss_addr) +{ + uint8_t *hw_ste = 
ste->hw_ste;
+
+ DR_STE_SET(rx_steering_mult, hw_ste, next_lu_type, DR_STE_LU_TYPE_DONT_CARE);
+ dr_ste_set_miss_addr(hw_ste, miss_addr);
+ dr_ste_set_always_miss((struct dr_hw_ste_format *)ste->hw_ste);
+}
+
+/*
+ * The assumption here is that we don't update ste->hw_ste for an unused
+ * ste, so it will be all zeros; we therefore check next_lu_type.
+ */
+bool dr_ste_is_not_valid_entry(uint8_t *p_hw_ste)
+{
+ struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)p_hw_ste;
+
+ if (DR_STE_GET(general, hw_ste, next_lu_type) ==
+ DR_STE_LU_TYPE_NOP)
+ return true;
+
+ return false;
+}
+
+bool dr_ste_not_used_ste(struct dr_ste *ste)
+{
+ return !atomic_load(&ste->refcount);
+}
+
+/* Init one ste as a pattern for the ste data array */
+void dr_ste_set_formated_ste(uint16_t gvmi,
+ struct dr_domain_rx_tx *nic_dmn,
+ struct dr_ste_htbl *htbl,
+ uint8_t *formated_ste,
+ struct dr_htbl_connect_info *connect_info)
+{
+ struct dr_ste ste = {};
+
+ dr_ste_init(formated_ste, htbl->lu_type, nic_dmn->ste_type, gvmi);
+ ste.hw_ste = formated_ste;
+
+ if (connect_info->type == CONNECT_HIT)
+ dr_ste_always_hit_htbl(&ste, connect_info->hit_next_htbl);
+ else
+ dr_ste_always_miss_addr(&ste, connect_info->miss_icm_addr);
+}
+
+int dr_ste_htbl_init_and_postsend(struct mlx5dv_dr_domain *dmn,
+ struct dr_domain_rx_tx *nic_dmn,
+ struct dr_ste_htbl *htbl,
+ struct dr_htbl_connect_info *connect_info,
+ bool update_hw_ste)
+{
+ uint8_t formated_ste[DR_STE_SIZE] = {};
+
+ dr_ste_set_formated_ste(dmn->info.caps.gvmi,
+ nic_dmn,
+ htbl,
+ formated_ste,
+ connect_info);
+
+ return dr_send_postsend_formated_htbl(dmn, htbl, formated_ste, update_hw_ste);
+}
+
+int dr_ste_create_next_htbl(struct mlx5dv_dr_matcher *matcher,
+ struct dr_matcher_rx_tx *nic_matcher,
+ struct dr_ste *ste,
+ uint8_t *cur_hw_ste,
+ enum dr_icm_chunk_size log_table_size)
+{
+ struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)cur_hw_ste;
+ struct dr_domain_rx_tx *nic_dmn = nic_matcher->nic_tbl->nic_dmn;
+ struct mlx5dv_dr_domain *dmn = matcher->tbl->dmn;
+ struct dr_htbl_connect_info info;
+ struct dr_ste_htbl *next_htbl;
+
+ if (!dr_ste_is_last_in_rule(nic_matcher, ste->ste_chain_location)) {
+ uint8_t next_lu_type;
+ uint16_t byte_mask;
+
+ next_lu_type = DR_STE_GET(general, hw_ste, next_lu_type);
+ byte_mask = DR_STE_GET(general, hw_ste, byte_mask);
+
+ next_htbl = dr_ste_htbl_alloc(dmn->ste_icm_pool,
+ log_table_size,
+ next_lu_type,
+ byte_mask);
+ if (!next_htbl) {
+ dr_dbg(dmn, "Failed allocating next hash table\n");
+ return errno;
+ }
+
+ /* Write new table to HW */
+ info.type = CONNECT_MISS;
+ info.miss_icm_addr = nic_matcher->e_anchor->chunk->icm_addr;
+ if (dr_ste_htbl_init_and_postsend(dmn, nic_dmn, next_htbl,
+ &info, false)) {
+ dr_dbg(dmn, "Failed writing table to HW\n");
+ goto free_table;
+ }
+
+ dr_ste_set_hit_addr_by_next_htbl(cur_hw_ste, next_htbl);
+ ste->next_htbl = next_htbl;
+ next_htbl->pointing_ste = ste;
+ }
+
+ return 0;
+
+free_table:
+ dr_ste_htbl_free(next_htbl);
+ return ENOENT;
+}
+
+static void dr_ste_set_ctrl(struct dr_ste_htbl *htbl)
+{
+ struct dr_ste_htbl_ctrl *ctrl = &htbl->ctrl;
+ int num_of_entries;
+
+ htbl->ctrl.may_grow = true;
+
+ if (htbl->chunk_size == DR_CHUNK_SIZE_MAX - 1 || !htbl->byte_mask)
+ htbl->ctrl.may_grow = false;
+
+ /* The threshold is 50%; one is added for a table of size 1 */
+ num_of_entries = dr_icm_pool_chunk_size_to_entries(htbl->chunk_size);
+ ctrl->increase_threshold = (num_of_entries + 1) / 2;
+}
+
+struct dr_ste_htbl *dr_ste_htbl_alloc(struct dr_icm_pool *pool,
+ enum
dr_icm_chunk_size chunk_size, + uint8_t lu_type, uint16_t byte_mask) +{ + struct dr_icm_chunk *chunk; + struct dr_ste_htbl *htbl; + int i; + + htbl = calloc(1, sizeof(struct dr_ste_htbl)); + if (!htbl) { + errno = ENOMEM; + return NULL; + } + + chunk = dr_icm_alloc_chunk(pool, chunk_size); + if (!chunk) + goto out_free_htbl; + + htbl->chunk = chunk; + htbl->lu_type = lu_type; + htbl->byte_mask = byte_mask; + htbl->ste_arr = chunk->ste_arr; + htbl->hw_ste_arr = chunk->hw_ste_arr; + htbl->miss_list = chunk->miss_list; + atomic_init(&htbl->refcount, 0); + + for (i = 0; i < chunk->num_of_entries; i++) { + struct dr_ste *ste = &htbl->ste_arr[i]; + + ste->hw_ste = htbl->hw_ste_arr + i * DR_STE_SIZE_REDUCED; + ste->htbl = htbl; + atomic_init(&ste->refcount, 0); + list_node_init(&ste->miss_list_node); + list_head_init(&htbl->miss_list[i]); + list_head_init(&ste->rule_list); + } + + htbl->chunk_size = chunk_size; + dr_ste_set_ctrl(htbl); + return htbl; + +out_free_htbl: + free(htbl); + return NULL; +} + +int dr_ste_htbl_free(struct dr_ste_htbl *htbl) +{ + if (atomic_load(&htbl->refcount)) + return EBUSY; + + dr_icm_free_chunk(htbl->chunk); + free(htbl); + return 0; +} + +static int dr_ste_build_pre_check_spec(struct mlx5dv_dr_domain *dmn, + struct dr_match_spec *m_spec, + struct dr_match_spec *v_spec) +{ + if (m_spec->ip_version) { + if (m_spec->ip_version != 4 && m_spec->ip_version != 6) { + dr_dbg(dmn, "IP version must be specified v4 or v6\n"); + errno = EOPNOTSUPP; + return errno; + } + + if (v_spec && (v_spec->ip_version != m_spec->ip_version)) { + dr_dbg(dmn, "Mask and value IP version must be equal\n"); + errno = EOPNOTSUPP; + return errno; + } + } + return 0; +} + +int dr_ste_build_pre_check(struct mlx5dv_dr_domain *dmn, + uint8_t match_criteria, + struct dr_match_param *mask, + struct dr_match_param *value) +{ + int ret; + + if (match_criteria & DR_MATCHER_CRITERIA_OUTER) { + ret = dr_ste_build_pre_check_spec(dmn, + &mask->outer, + value ? &value->outer : NULL); + if (ret) + return ret; + } + + if (match_criteria & DR_MATCHER_CRITERIA_INNER) { + ret = dr_ste_build_pre_check_spec(dmn, + &mask->inner, + value ? &value->inner : NULL); + if (ret) + return ret; + } + + if (!value && (match_criteria & DR_MATCHER_CRITERIA_MISC)) { + if (mask->misc.source_port && mask->misc.source_port != 0xffff) { + dr_dbg(dmn, "Partial mask source_port is not supported\n"); + errno = ENOTSUP; + return errno; + } + } + + return 0; +} + +int dr_ste_build_ste_arr(struct mlx5dv_dr_matcher *matcher, + struct dr_matcher_rx_tx *nic_matcher, + struct dr_match_param *value, + uint8_t *ste_arr) +{ + struct dr_domain_rx_tx *nic_dmn = nic_matcher->nic_tbl->nic_dmn; + struct mlx5dv_dr_domain *dmn = matcher->tbl->dmn; + struct dr_ste_build *sb; + int ret, i; + + ret = dr_ste_build_pre_check(dmn, matcher->match_criteria, + &matcher->mask, value); + if (ret) + return ret; + + sb = nic_matcher->ste_builder; + for (i = 0; i < nic_matcher->num_of_builders; i++) { + dr_ste_init(ste_arr, + sb->lu_type, + nic_dmn->ste_type, + dmn->info.caps.gvmi); + + dr_ste_set_bit_mask(ste_arr, sb->bit_mask); + + ret = sb->ste_build_tag_func(value, sb, ste_arr); + if (ret) + return ret; + + /* Connect the STEs */ + if (i < (nic_matcher->num_of_builders - 1)) { + /* Need the next builder for these fields, + * not relevant for the last ste in the chain. 
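+ * The last STE keeps the next_lu_type of DR_STE_LU_TYPE_DONT_CARE
+ * that dr_ste_init() already set.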
+ */ + sb++; + DR_STE_SET(general, ste_arr, next_lu_type, sb->lu_type); + DR_STE_SET(general, ste_arr, byte_mask, sb->byte_mask); + } + ste_arr += DR_STE_SIZE; + } + return 0; +} + +static int dr_ste_build_eth_l2_src_des_bit_mask(struct dr_match_param *value, + bool inner, uint8_t *bit_mask) +{ + struct dr_match_spec *mask = inner ? &value->inner : &value->outer; + + DR_STE_SET_MASK_V(eth_l2_src_dst, bit_mask, dmac_47_16, mask, dmac_47_16); + DR_STE_SET_MASK_V(eth_l2_src_dst, bit_mask, dmac_15_0, mask, dmac_15_0); + + if (mask->smac_47_16 || mask->smac_15_0) { + DR_STE_SET(eth_l2_src_dst, bit_mask, smac_47_32, + mask->smac_47_16 >> 16); + DR_STE_SET(eth_l2_src_dst, bit_mask, smac_31_0, + mask->smac_47_16 << 16 | mask->smac_15_0); + mask->smac_47_16 = 0; + mask->smac_15_0 = 0; + } + + DR_STE_SET_MASK_V(eth_l2_src_dst, bit_mask, first_vlan_id, mask, first_vid); + DR_STE_SET_MASK_V(eth_l2_src_dst, bit_mask, first_cfi, mask, first_cfi); + DR_STE_SET_MASK_V(eth_l2_src_dst, bit_mask, first_priority, mask, first_prio); + DR_STE_SET_MASK(eth_l2_src_dst, bit_mask, l3_type, mask, ip_version); + + if (mask->cvlan_tag) { + DR_STE_SET(eth_l2_src_dst, bit_mask, first_vlan_qualifier, -1); + mask->cvlan_tag = 0; + } else if (mask->svlan_tag) { + DR_STE_SET(eth_l2_src_dst, bit_mask, first_vlan_qualifier, -1); + mask->svlan_tag = 0; + } + + if (mask->cvlan_tag || mask->svlan_tag) { + errno = EINVAL; + return errno; + } + + return 0; +} + +static void dr_ste_copy_mask_misc(char *mask, struct dr_match_misc *spec) +{ + spec->gre_c_present = DEVX_GET(dr_match_set_misc, mask, gre_c_present); + spec->gre_k_present = DEVX_GET(dr_match_set_misc, mask, gre_k_present); + spec->gre_s_present = DEVX_GET(dr_match_set_misc, mask, gre_s_present); + spec->source_vhca_port = DEVX_GET(dr_match_set_misc, mask, source_vhca_port); + spec->source_sqn = DEVX_GET(dr_match_set_misc, mask, source_sqn); + + spec->source_port = DEVX_GET(dr_match_set_misc, mask, source_port); + + spec->outer_second_prio = DEVX_GET(dr_match_set_misc, mask, outer_second_prio); + spec->outer_second_cfi = DEVX_GET(dr_match_set_misc, mask, outer_second_cfi); + spec->outer_second_vid = DEVX_GET(dr_match_set_misc, mask, outer_second_vid); + spec->inner_second_prio = DEVX_GET(dr_match_set_misc, mask, inner_second_prio); + spec->inner_second_cfi = DEVX_GET(dr_match_set_misc, mask, inner_second_cfi); + spec->inner_second_vid = DEVX_GET(dr_match_set_misc, mask, inner_second_vid); + + spec->outer_second_cvlan_tag = + DEVX_GET(dr_match_set_misc, mask, outer_second_cvlan_tag); + spec->inner_second_cvlan_tag = + DEVX_GET(dr_match_set_misc, mask, inner_second_cvlan_tag); + spec->outer_second_svlan_tag = + DEVX_GET(dr_match_set_misc, mask, outer_second_svlan_tag); + spec->inner_second_svlan_tag = + DEVX_GET(dr_match_set_misc, mask, inner_second_svlan_tag); + + spec->gre_protocol = DEVX_GET(dr_match_set_misc, mask, gre_protocol); + + spec->gre_key_h = DEVX_GET(dr_match_set_misc, mask, gre_key_h); + spec->gre_key_l = DEVX_GET(dr_match_set_misc, mask, gre_key_l); + + spec->vxlan_vni = DEVX_GET(dr_match_set_misc, mask, vxlan_vni); + + spec->geneve_vni = DEVX_GET(dr_match_set_misc, mask, geneve_vni); + spec->geneve_oam = DEVX_GET(dr_match_set_misc, mask, geneve_oam); + + spec->outer_ipv6_flow_label = + DEVX_GET(dr_match_set_misc, mask, outer_ipv6_flow_label); + + spec->inner_ipv6_flow_label = + DEVX_GET(dr_match_set_misc, mask, inner_ipv6_flow_label); + + spec->geneve_opt_len = DEVX_GET(dr_match_set_misc, mask, geneve_opt_len); + spec->geneve_protocol_type = + 
DEVX_GET(dr_match_set_misc, mask, geneve_protocol_type); + + spec->bth_dst_qp = DEVX_GET(dr_match_set_misc, mask, bth_dst_qp); +} + +static void dr_ste_copy_mask_spec(char *mask, struct dr_match_spec *spec) +{ + spec->smac_47_16 = DEVX_GET(dr_match_spec, mask, smac_47_16); + + spec->smac_15_0 = DEVX_GET(dr_match_spec, mask, smac_15_0); + spec->ethertype = DEVX_GET(dr_match_spec, mask, ethertype); + + spec->dmac_47_16 = DEVX_GET(dr_match_spec, mask, dmac_47_16); + + spec->dmac_15_0 = DEVX_GET(dr_match_spec, mask, dmac_15_0); + spec->first_prio = DEVX_GET(dr_match_spec, mask, first_prio); + spec->first_cfi = DEVX_GET(dr_match_spec, mask, first_cfi); + spec->first_vid = DEVX_GET(dr_match_spec, mask, first_vid); + + spec->ip_protocol = DEVX_GET(dr_match_spec, mask, ip_protocol); + spec->ip_dscp = DEVX_GET(dr_match_spec, mask, ip_dscp); + spec->ip_ecn = DEVX_GET(dr_match_spec, mask, ip_ecn); + spec->cvlan_tag = DEVX_GET(dr_match_spec, mask, cvlan_tag); + spec->svlan_tag = DEVX_GET(dr_match_spec, mask, svlan_tag); + spec->frag = DEVX_GET(dr_match_spec, mask, frag); + spec->ip_version = DEVX_GET(dr_match_spec, mask, ip_version); + spec->tcp_flags = DEVX_GET(dr_match_spec, mask, tcp_flags); + spec->tcp_sport = DEVX_GET(dr_match_spec, mask, tcp_sport); + spec->tcp_dport = DEVX_GET(dr_match_spec, mask, tcp_dport); + + spec->ip_ttl_hoplimit = DEVX_GET(dr_match_spec, mask, ip_ttl_hoplimit); + + spec->udp_sport = DEVX_GET(dr_match_spec, mask, udp_sport); + spec->udp_dport = DEVX_GET(dr_match_spec, mask, udp_dport); + + spec->src_ip_127_96 = DEVX_GET(dr_match_spec, mask, src_ip_127_96); + + spec->src_ip_95_64 = DEVX_GET(dr_match_spec, mask, src_ip_95_64); + + spec->src_ip_63_32 = DEVX_GET(dr_match_spec, mask, src_ip_63_32); + + spec->src_ip_31_0 = DEVX_GET(dr_match_spec, mask, src_ip_31_0); + + spec->dst_ip_127_96 = DEVX_GET(dr_match_spec, mask, dst_ip_127_96); + + spec->dst_ip_95_64 = DEVX_GET(dr_match_spec, mask, dst_ip_95_64); + + spec->dst_ip_63_32 = DEVX_GET(dr_match_spec, mask, dst_ip_63_32); + + spec->dst_ip_31_0 = DEVX_GET(dr_match_spec, mask, dst_ip_31_0); +} + +static void dr_ste_copy_mask_misc2(char *mask, struct dr_match_misc2 *spec) +{ + spec->outer_first_mpls_label = + DEVX_GET(dr_match_set_misc2, mask, outer_first_mpls_label); + spec->outer_first_mpls_exp = + DEVX_GET(dr_match_set_misc2, mask, outer_first_mpls_exp); + spec->outer_first_mpls_s_bos = + DEVX_GET(dr_match_set_misc2, mask, outer_first_mpls_s_bos); + spec->outer_first_mpls_ttl = + DEVX_GET(dr_match_set_misc2, mask, outer_first_mpls_ttl); + spec->inner_first_mpls_label = + DEVX_GET(dr_match_set_misc2, mask, inner_first_mpls_label); + spec->inner_first_mpls_exp = + DEVX_GET(dr_match_set_misc2, mask, inner_first_mpls_exp); + spec->inner_first_mpls_s_bos = + DEVX_GET(dr_match_set_misc2, mask, inner_first_mpls_s_bos); + spec->inner_first_mpls_ttl = + DEVX_GET(dr_match_set_misc2, mask, inner_first_mpls_ttl); + spec->outer_first_mpls_over_gre_label = + DEVX_GET(dr_match_set_misc2, mask, outer_first_mpls_over_gre_label); + spec->outer_first_mpls_over_gre_exp = + DEVX_GET(dr_match_set_misc2, mask, outer_first_mpls_over_gre_exp); + spec->outer_first_mpls_over_gre_s_bos = + DEVX_GET(dr_match_set_misc2, mask, outer_first_mpls_over_gre_s_bos); + spec->outer_first_mpls_over_gre_ttl = + DEVX_GET(dr_match_set_misc2, mask, outer_first_mpls_over_gre_ttl); + spec->outer_first_mpls_over_udp_label = + DEVX_GET(dr_match_set_misc2, mask, outer_first_mpls_over_udp_label); + spec->outer_first_mpls_over_udp_exp = + DEVX_GET(dr_match_set_misc2, mask, 
outer_first_mpls_over_udp_exp); + spec->outer_first_mpls_over_udp_s_bos = + DEVX_GET(dr_match_set_misc2, mask, outer_first_mpls_over_udp_s_bos); + spec->outer_first_mpls_over_udp_ttl = + DEVX_GET(dr_match_set_misc2, mask, outer_first_mpls_over_udp_ttl); + spec->metadata_reg_c_7 = DEVX_GET(dr_match_set_misc2, mask, metadata_reg_c_7); + spec->metadata_reg_c_6 = DEVX_GET(dr_match_set_misc2, mask, metadata_reg_c_6); + spec->metadata_reg_c_5 = DEVX_GET(dr_match_set_misc2, mask, metadata_reg_c_5); + spec->metadata_reg_c_4 = DEVX_GET(dr_match_set_misc2, mask, metadata_reg_c_4); + spec->metadata_reg_c_3 = DEVX_GET(dr_match_set_misc2, mask, metadata_reg_c_3); + spec->metadata_reg_c_2 = DEVX_GET(dr_match_set_misc2, mask, metadata_reg_c_2); + spec->metadata_reg_c_1 = DEVX_GET(dr_match_set_misc2, mask, metadata_reg_c_1); + spec->metadata_reg_c_0 = DEVX_GET(dr_match_set_misc2, mask, metadata_reg_c_0); + spec->metadata_reg_a = DEVX_GET(dr_match_set_misc2, mask, metadata_reg_a); + spec->metadata_reg_b = DEVX_GET(dr_match_set_misc2, mask, metadata_reg_b); +} + +static void dr_ste_copy_mask_misc3(char *mask, struct dr_match_misc3 *spec) +{ + spec->inner_tcp_seq_num = DEVX_GET(dr_match_set_misc3, mask, inner_tcp_seq_num); + spec->outer_tcp_seq_num = DEVX_GET(dr_match_set_misc3, mask, outer_tcp_seq_num); + spec->inner_tcp_ack_num = DEVX_GET(dr_match_set_misc3, mask, inner_tcp_ack_num); + spec->outer_tcp_ack_num = DEVX_GET(dr_match_set_misc3, mask, outer_tcp_ack_num); + spec->outer_vxlan_gpe_vni = + DEVX_GET(dr_match_set_misc3, mask, outer_vxlan_gpe_vni); + spec->outer_vxlan_gpe_next_protocol = + DEVX_GET(dr_match_set_misc3, mask, outer_vxlan_gpe_next_protocol); + spec->outer_vxlan_gpe_flags = + DEVX_GET(dr_match_set_misc3, mask, outer_vxlan_gpe_flags); + spec->icmpv4_header_data = DEVX_GET(dr_match_set_misc3, mask, icmp_header_data); + spec->icmpv6_header_data = + DEVX_GET(dr_match_set_misc3, mask, icmpv6_header_data); + spec->icmpv4_type = DEVX_GET(dr_match_set_misc3, mask, icmp_type); + spec->icmpv4_code = DEVX_GET(dr_match_set_misc3, mask, icmp_code); + spec->icmpv6_type = DEVX_GET(dr_match_set_misc3, mask, icmpv6_type); + spec->icmpv6_code = DEVX_GET(dr_match_set_misc3, mask, icmpv6_code); + spec->gtpu_flags = DEVX_GET(dr_match_set_misc3, mask, gtpu_flags); + spec->gtpu_msg_type = DEVX_GET(dr_match_set_misc3, mask, gtpu_msg_type); + spec->gtpu_teid = DEVX_GET(dr_match_set_misc3, mask, gtpu_teid); +} + +#define MAX_PARAM_SIZE 512 + +void dr_ste_copy_param(uint8_t match_criteria, + struct dr_match_param *set_param, + struct mlx5dv_flow_match_parameters *mask) +{ + char tail_param[MAX_PARAM_SIZE] = {}; + size_t param_location; + uint8_t *data = (uint8_t *)mask->match_buf; + void *buff; + + if (match_criteria & DR_MATCHER_CRITERIA_OUTER) { + if (mask->match_sz < DEVX_ST_SZ_BYTES(dr_match_spec)) { + memcpy(tail_param, data, mask->match_sz); + buff = tail_param; + } else { + buff = mask->match_buf; + } + dr_ste_copy_mask_spec(buff, &set_param->outer); + } + param_location = DEVX_ST_SZ_BYTES(dr_match_spec); + + if (match_criteria & DR_MATCHER_CRITERIA_MISC) { + if (mask->match_sz < param_location + + DEVX_ST_SZ_BYTES(dr_match_set_misc)) { + memcpy(tail_param, data + param_location, + mask->match_sz - param_location); + buff = tail_param; + } else { + buff = data + param_location; + } + dr_ste_copy_mask_misc(buff, &set_param->misc); + } + param_location += DEVX_ST_SZ_BYTES(dr_match_set_misc); + + if (match_criteria & DR_MATCHER_CRITERIA_INNER) { + if (mask->match_sz < param_location + + 
DEVX_ST_SZ_BYTES(dr_match_spec)) { + memcpy(tail_param, data + param_location, + mask->match_sz - param_location); + buff = tail_param; + } else { + buff = data + param_location; + } + dr_ste_copy_mask_spec(buff, &set_param->inner); + } + param_location += DEVX_ST_SZ_BYTES(dr_match_spec); + + if (match_criteria & DR_MATCHER_CRITERIA_MISC2) { + if (mask->match_sz < param_location + + DEVX_ST_SZ_BYTES(dr_match_set_misc2)) { + memcpy(tail_param, data + param_location, + mask->match_sz - param_location); + buff = tail_param; + } else { + buff = data + param_location; + } + dr_ste_copy_mask_misc2(buff, &set_param->misc2); + } + + param_location += DEVX_ST_SZ_BYTES(dr_match_set_misc2); + + if (match_criteria & DR_MATCHER_CRITERIA_MISC3) { + if (mask->match_sz < param_location + + DEVX_ST_SZ_BYTES(dr_match_set_misc3)) { + memcpy(tail_param, data + param_location, + mask->match_sz - param_location); + buff = tail_param; + } else { + buff = data + param_location; + } + dr_ste_copy_mask_misc3(buff, &set_param->misc3); + } +} + +static int dr_ste_build_eth_l2_src_des_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *hw_ste_p) +{ + struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; + struct dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; + uint8_t *tag = hw_ste->tag; + + DR_STE_SET_TAG(eth_l2_src_dst, tag, dmac_47_16, spec, dmac_47_16); + DR_STE_SET_TAG(eth_l2_src_dst, tag, dmac_15_0, spec, dmac_15_0); + + if (spec->smac_47_16 || spec->smac_15_0) { + DR_STE_SET(eth_l2_src_dst, tag, smac_47_32, + spec->smac_47_16 >> 16); + DR_STE_SET(eth_l2_src_dst, tag, smac_31_0, + spec->smac_47_16 << 16 | spec->smac_15_0); + spec->smac_47_16 = 0; + spec->smac_15_0 = 0; + } + + if (spec->ip_version) { + if (spec->ip_version == IP_VERSION_IPV4) { + DR_STE_SET(eth_l2_src_dst, tag, l3_type, STE_IPV4); + spec->ip_version = 0; + } else if (spec->ip_version == IP_VERSION_IPV6) { + DR_STE_SET(eth_l2_src_dst, tag, l3_type, STE_IPV6); + spec->ip_version = 0; + } else { + errno = EINVAL; + return errno; + } + } + + DR_STE_SET_TAG(eth_l2_src_dst, tag, first_vlan_id, spec, first_vid); + DR_STE_SET_TAG(eth_l2_src_dst, tag, first_cfi, spec, first_cfi); + DR_STE_SET_TAG(eth_l2_src_dst, tag, first_priority, spec, first_prio); + + if (spec->cvlan_tag) { + DR_STE_SET(eth_l2_src_dst, tag, first_vlan_qualifier, DR_STE_CVLAN); + spec->cvlan_tag = 0; + } else if (spec->svlan_tag) { + DR_STE_SET(eth_l2_src_dst, tag, first_vlan_qualifier, DR_STE_SVLAN); + spec->svlan_tag = 0; + } + return 0; +} + +int dr_ste_build_eth_l2_src_des(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx) +{ + int ret; + + ret = dr_ste_build_eth_l2_src_des_bit_mask(mask, inner, sb->bit_mask); + if (ret) + return ret; + + sb->rx = rx; + sb->inner = inner; + sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL2_SRC_DST, rx, inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_build_eth_l2_src_des_tag; + + return 0; +} + +static void dr_ste_build_eth_l3_ipv6_dst_bit_mask(struct dr_match_param *value, + bool inner, uint8_t *bit_mask) +{ + struct dr_match_spec *mask = inner ? 
&value->inner : &value->outer; + + DR_STE_SET_MASK_V(eth_l3_ipv6_dst, bit_mask, dst_ip_127_96, mask, dst_ip_127_96); + DR_STE_SET_MASK_V(eth_l3_ipv6_dst, bit_mask, dst_ip_95_64, mask, dst_ip_95_64); + DR_STE_SET_MASK_V(eth_l3_ipv6_dst, bit_mask, dst_ip_63_32, mask, dst_ip_63_32); + DR_STE_SET_MASK_V(eth_l3_ipv6_dst, bit_mask, dst_ip_31_0, mask, dst_ip_31_0); +} + +static int dr_ste_build_eth_l3_ipv6_dst_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *hw_ste_p) +{ + struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; + struct dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; + uint8_t *tag = hw_ste->tag; + + DR_STE_SET_TAG(eth_l3_ipv6_dst, tag, dst_ip_127_96, spec, dst_ip_127_96); + DR_STE_SET_TAG(eth_l3_ipv6_dst, tag, dst_ip_95_64, spec, dst_ip_95_64); + DR_STE_SET_TAG(eth_l3_ipv6_dst, tag, dst_ip_63_32, spec, dst_ip_63_32); + DR_STE_SET_TAG(eth_l3_ipv6_dst, tag, dst_ip_31_0, spec, dst_ip_31_0); + + return 0; +} + +void dr_ste_build_eth_l3_ipv6_dst(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx) +{ + dr_ste_build_eth_l3_ipv6_dst_bit_mask(mask, inner, sb->bit_mask); + + sb->rx = rx; + sb->inner = inner; + sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL3_IPV6_DST, rx, inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_build_eth_l3_ipv6_dst_tag; +} + +static void dr_ste_build_eth_l3_ipv6_src_bit_mask(struct dr_match_param *value, + bool inner, uint8_t *bit_mask) +{ + struct dr_match_spec *mask = inner ? &value->inner : &value->outer; + + DR_STE_SET_MASK_V(eth_l3_ipv6_src, bit_mask, src_ip_127_96, mask, src_ip_127_96); + DR_STE_SET_MASK_V(eth_l3_ipv6_src, bit_mask, src_ip_95_64, mask, src_ip_95_64); + DR_STE_SET_MASK_V(eth_l3_ipv6_src, bit_mask, src_ip_63_32, mask, src_ip_63_32); + DR_STE_SET_MASK_V(eth_l3_ipv6_src, bit_mask, src_ip_31_0, mask, src_ip_31_0); +} + +static int dr_ste_build_eth_l3_ipv6_src_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *hw_ste_p) +{ + struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; + struct dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; + uint8_t *tag = hw_ste->tag; + + DR_STE_SET_TAG(eth_l3_ipv6_src, tag, src_ip_127_96, spec, src_ip_127_96); + DR_STE_SET_TAG(eth_l3_ipv6_src, tag, src_ip_95_64, spec, src_ip_95_64); + DR_STE_SET_TAG(eth_l3_ipv6_src, tag, src_ip_63_32, spec, src_ip_63_32); + DR_STE_SET_TAG(eth_l3_ipv6_src, tag, src_ip_31_0, spec, src_ip_31_0); + + return 0; +} + +void dr_ste_build_eth_l3_ipv6_src(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx) +{ + dr_ste_build_eth_l3_ipv6_src_bit_mask(mask, inner, sb->bit_mask); + + sb->rx = rx; + sb->inner = inner; + sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL3_IPV6_SRC, rx, inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_build_eth_l3_ipv6_src_tag; +} + +static void dr_ste_build_eth_l3_ipv4_5_tuple_bit_mask(struct dr_match_param *value, + bool inner, + uint8_t *bit_mask) +{ + struct dr_match_spec *mask = inner ? 
&value->inner : &value->outer; + + DR_STE_SET_MASK_V(eth_l3_ipv4_5_tuple, bit_mask, destination_address, mask, dst_ip_31_0); + DR_STE_SET_MASK_V(eth_l3_ipv4_5_tuple, bit_mask, source_address, mask, src_ip_31_0); + DR_STE_SET_MASK_V(eth_l3_ipv4_5_tuple, bit_mask, destination_port, mask, tcp_dport); + DR_STE_SET_MASK_V(eth_l3_ipv4_5_tuple, bit_mask, destination_port, mask, udp_dport); + DR_STE_SET_MASK_V(eth_l3_ipv4_5_tuple, bit_mask, source_port, mask, tcp_sport); + DR_STE_SET_MASK_V(eth_l3_ipv4_5_tuple, bit_mask, source_port, mask, udp_sport); + DR_STE_SET_MASK_V(eth_l3_ipv4_5_tuple, bit_mask, protocol, mask, ip_protocol); + DR_STE_SET_MASK_V(eth_l3_ipv4_5_tuple, bit_mask, fragmented, mask, frag); + DR_STE_SET_MASK_V(eth_l3_ipv4_5_tuple, bit_mask, dscp, mask, ip_dscp); + DR_STE_SET_MASK_V(eth_l3_ipv4_5_tuple, bit_mask, ecn, mask, ip_ecn); + + if (mask->tcp_flags) { + DR_STE_SET_TCP_FLAGS(eth_l3_ipv4_5_tuple, bit_mask, mask); + mask->tcp_flags = 0; + } +} + +static int dr_ste_build_eth_l3_ipv4_5_tuple_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *hw_ste_p) +{ + struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; + struct dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; + uint8_t *tag = hw_ste->tag; + + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, destination_address, spec, dst_ip_31_0); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, source_address, spec, src_ip_31_0); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, destination_port, spec, tcp_dport); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, destination_port, spec, udp_dport); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, source_port, spec, tcp_sport); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, source_port, spec, udp_sport); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, protocol, spec, ip_protocol); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, fragmented, spec, frag); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, dscp, spec, ip_dscp); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, ecn, spec, ip_ecn); + + if (spec->tcp_flags) { + DR_STE_SET_TCP_FLAGS(eth_l3_ipv4_5_tuple, tag, spec); + spec->tcp_flags = 0; + } + + return 0; +} + +void dr_ste_build_eth_l3_ipv4_5_tuple(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx) +{ + dr_ste_build_eth_l3_ipv4_5_tuple_bit_mask(mask, inner, sb->bit_mask); + + sb->rx = rx; + sb->inner = inner; + sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL3_IPV4_5_TUPLE, rx, inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_build_eth_l3_ipv4_5_tuple_tag; +} + +static void +dr_ste_build_eth_l2_src_or_dst_bit_mask(struct dr_match_param *value, + bool inner, uint8_t *bit_mask) +{ + struct dr_match_spec *mask = inner ? 
&value->inner : &value->outer; + struct dr_match_misc *misc_mask = &value->misc; + + DR_STE_SET_MASK_V(eth_l2_src, bit_mask, first_vlan_id, mask, first_vid); + DR_STE_SET_MASK_V(eth_l2_src, bit_mask, first_cfi, mask, first_cfi); + DR_STE_SET_MASK_V(eth_l2_src, bit_mask, first_priority, mask, first_prio); + DR_STE_SET_MASK_V(eth_l2_src, bit_mask, ip_fragmented, mask, frag); + DR_STE_SET_MASK_V(eth_l2_src, bit_mask, l3_ethertype, mask, ethertype); + DR_STE_SET_MASK(eth_l2_src, bit_mask, l3_type, mask, ip_version); + + if (mask->svlan_tag || mask->cvlan_tag) { + DR_STE_SET(eth_l2_src, bit_mask, first_vlan_qualifier, -1); + mask->cvlan_tag = 0; + mask->svlan_tag = 0; + } + + if (inner) { + if (misc_mask->inner_second_cvlan_tag || + misc_mask->inner_second_svlan_tag) { + DR_STE_SET(eth_l2_src, bit_mask, second_vlan_qualifier, -1); + misc_mask->inner_second_cvlan_tag = 0; + misc_mask->inner_second_svlan_tag = 0; + } + + DR_STE_SET_MASK_V(eth_l2_src, bit_mask, second_vlan_id, misc_mask, inner_second_vid); + DR_STE_SET_MASK_V(eth_l2_src, bit_mask, second_cfi, misc_mask, inner_second_cfi); + DR_STE_SET_MASK_V(eth_l2_src, bit_mask, second_priority, misc_mask, inner_second_prio); + } else { + if (misc_mask->outer_second_cvlan_tag || + misc_mask->outer_second_svlan_tag) { + DR_STE_SET(eth_l2_src, bit_mask, second_vlan_qualifier, -1); + misc_mask->outer_second_cvlan_tag = 0; + misc_mask->outer_second_svlan_tag = 0; + } + + DR_STE_SET_MASK_V(eth_l2_src, bit_mask, second_vlan_id, misc_mask, outer_second_vid); + DR_STE_SET_MASK_V(eth_l2_src, bit_mask, second_cfi, misc_mask, outer_second_cfi); + DR_STE_SET_MASK_V(eth_l2_src, bit_mask, second_priority, misc_mask, outer_second_prio); + } +} + +static int dr_ste_build_eth_l2_src_or_dst_tag(struct dr_match_param *value, + bool inner, uint8_t *hw_ste_p) +{ + struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; + struct dr_match_spec *spec = inner ? 
&value->inner : &value->outer; + struct dr_match_misc *misc_spec = &value->misc; + uint8_t *tag = hw_ste->tag; + + DR_STE_SET_TAG(eth_l2_src, tag, first_vlan_id, spec, first_vid); + DR_STE_SET_TAG(eth_l2_src, tag, first_cfi, spec, first_cfi); + DR_STE_SET_TAG(eth_l2_src, tag, first_priority, spec, first_prio); + DR_STE_SET_TAG(eth_l2_src, tag, ip_fragmented, spec, frag); + DR_STE_SET_TAG(eth_l2_src, tag, l3_ethertype, spec, ethertype); + + if (spec->ip_version) { + if (spec->ip_version == IP_VERSION_IPV4) { + DR_STE_SET(eth_l2_src, tag, l3_type, STE_IPV4); + spec->ip_version = 0; + } else if (spec->ip_version == IP_VERSION_IPV6) { + DR_STE_SET(eth_l2_src, tag, l3_type, STE_IPV6); + spec->ip_version = 0; + } else { + errno = EINVAL; + return errno; + } + } + + if (spec->cvlan_tag) { + DR_STE_SET(eth_l2_src, tag, first_vlan_qualifier, DR_STE_CVLAN); + spec->cvlan_tag = 0; + } else if (spec->svlan_tag) { + DR_STE_SET(eth_l2_src, tag, first_vlan_qualifier, DR_STE_SVLAN); + spec->svlan_tag = 0; + } + + if (inner) { + if (misc_spec->inner_second_cvlan_tag) { + DR_STE_SET(eth_l2_src, tag, second_vlan_qualifier, DR_STE_CVLAN); + misc_spec->inner_second_cvlan_tag = 0; + } else if (misc_spec->inner_second_svlan_tag) { + DR_STE_SET(eth_l2_src, tag, second_vlan_qualifier, DR_STE_SVLAN); + misc_spec->inner_second_svlan_tag = 0; + } + + DR_STE_SET_TAG(eth_l2_src, tag, second_vlan_id, misc_spec, inner_second_vid); + DR_STE_SET_TAG(eth_l2_src, tag, second_cfi, misc_spec, inner_second_cfi); + DR_STE_SET_TAG(eth_l2_src, tag, second_priority, misc_spec, inner_second_prio); + } else { + if (misc_spec->outer_second_cvlan_tag) { + DR_STE_SET(eth_l2_src, tag, second_vlan_qualifier, DR_STE_CVLAN); + misc_spec->outer_second_cvlan_tag = 0; + } else if (misc_spec->outer_second_svlan_tag) { + DR_STE_SET(eth_l2_src, tag, second_vlan_qualifier, DR_STE_SVLAN); + misc_spec->outer_second_svlan_tag = 0; + } + DR_STE_SET_TAG(eth_l2_src, tag, second_vlan_id, misc_spec, outer_second_vid); + DR_STE_SET_TAG(eth_l2_src, tag, second_cfi, misc_spec, outer_second_cfi); + DR_STE_SET_TAG(eth_l2_src, tag, second_priority, misc_spec, outer_second_prio); + } + + return 0; +} + +static void dr_ste_build_eth_l2_src_bit_mask(struct dr_match_param *value, + bool inner, uint8_t *bit_mask) +{ + struct dr_match_spec *mask = inner ? &value->inner : &value->outer; + + DR_STE_SET_MASK_V(eth_l2_src, bit_mask, smac_47_16, mask, smac_47_16); + DR_STE_SET_MASK_V(eth_l2_src, bit_mask, smac_15_0, mask, smac_15_0); + + dr_ste_build_eth_l2_src_or_dst_bit_mask(value, inner, bit_mask); +} + +static int dr_ste_build_eth_l2_src_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *hw_ste_p) +{ + struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; + struct dr_match_spec *spec = sb->inner ? 
&value->inner : &value->outer; + uint8_t *tag = hw_ste->tag; + + DR_STE_SET_TAG(eth_l2_src, tag, smac_47_16, spec, smac_47_16); + DR_STE_SET_TAG(eth_l2_src, tag, smac_15_0, spec, smac_15_0); + + return dr_ste_build_eth_l2_src_or_dst_tag(value, sb->inner, hw_ste_p); +} + +void dr_ste_build_eth_l2_src(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx) +{ + dr_ste_build_eth_l2_src_bit_mask(mask, inner, sb->bit_mask); + + sb->rx = rx; + sb->inner = inner; + sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL2_SRC, rx, inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_build_eth_l2_src_tag; +} + +static void dr_ste_build_eth_l2_dst_bit_mask(struct dr_match_param *value, + bool inner, uint8_t *bit_mask) +{ + struct dr_match_spec *mask = inner ? &value->inner : &value->outer; + + DR_STE_SET_MASK_V(eth_l2_dst, bit_mask, dmac_47_16, mask, dmac_47_16); + DR_STE_SET_MASK_V(eth_l2_dst, bit_mask, dmac_15_0, mask, dmac_15_0); + + dr_ste_build_eth_l2_src_or_dst_bit_mask(value, inner, bit_mask); +} + +static int dr_ste_build_eth_l2_dst_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *hw_ste_p) +{ + struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; + struct dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; + uint8_t *tag = hw_ste->tag; + + DR_STE_SET_TAG(eth_l2_dst, tag, dmac_47_16, spec, dmac_47_16); + DR_STE_SET_TAG(eth_l2_dst, tag, dmac_15_0, spec, dmac_15_0); + + return dr_ste_build_eth_l2_src_or_dst_tag(value, sb->inner, hw_ste_p); +} + +void dr_ste_build_eth_l2_dst(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx) +{ + dr_ste_build_eth_l2_dst_bit_mask(mask, inner, sb->bit_mask); + + sb->rx = rx; + sb->inner = inner; + sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL2_DST, rx, inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_build_eth_l2_dst_tag; +} + +static void dr_ste_build_eth_l2_tnl_bit_mask(struct dr_match_param *value, + bool inner, uint8_t *bit_mask) +{ + struct dr_match_spec *mask = inner ? &value->inner : &value->outer; + struct dr_match_misc *misc = &value->misc; + + DR_STE_SET_MASK_V(eth_l2_tnl, bit_mask, dmac_47_16, mask, dmac_47_16); + DR_STE_SET_MASK_V(eth_l2_tnl, bit_mask, dmac_15_0, mask, dmac_15_0); + DR_STE_SET_MASK_V(eth_l2_tnl, bit_mask, first_vlan_id, mask, first_vid); + DR_STE_SET_MASK_V(eth_l2_tnl, bit_mask, first_cfi, mask, first_cfi); + DR_STE_SET_MASK_V(eth_l2_tnl, bit_mask, first_priority, mask, first_prio); + DR_STE_SET_MASK_V(eth_l2_tnl, bit_mask, ip_fragmented, mask, frag); + DR_STE_SET_MASK_V(eth_l2_tnl, bit_mask, l3_ethertype, mask, ethertype); + DR_STE_SET_MASK(eth_l2_tnl, bit_mask, l3_type, mask, ip_version); + + if (misc->vxlan_vni) { + DR_STE_SET(eth_l2_tnl, bit_mask, l2_tunneling_network_id, (misc->vxlan_vni << 8)); + misc->vxlan_vni = 0; + } + + if (mask->svlan_tag || mask->cvlan_tag) { + DR_STE_SET(eth_l2_tnl, bit_mask, first_vlan_qualifier, -1); + mask->cvlan_tag = 0; + mask->svlan_tag = 0; + } +} + +static int dr_ste_build_eth_l2_tnl_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *hw_ste_p) +{ + struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; + struct dr_match_spec *spec = sb->inner ? 
&value->inner : &value->outer; + struct dr_match_misc *misc = &value->misc; + uint8_t *tag = hw_ste->tag; + + DR_STE_SET_TAG(eth_l2_tnl, tag, dmac_47_16, spec, dmac_47_16); + DR_STE_SET_TAG(eth_l2_tnl, tag, dmac_15_0, spec, dmac_15_0); + DR_STE_SET_TAG(eth_l2_tnl, tag, first_vlan_id, spec, first_vid); + DR_STE_SET_TAG(eth_l2_tnl, tag, first_cfi, spec, first_cfi); + DR_STE_SET_TAG(eth_l2_tnl, tag, ip_fragmented, spec, frag); + DR_STE_SET_TAG(eth_l2_tnl, tag, first_priority, spec, first_prio); + DR_STE_SET_TAG(eth_l2_tnl, tag, l3_ethertype, spec, ethertype); + + if (misc->vxlan_vni) { + DR_STE_SET(eth_l2_tnl, tag, l2_tunneling_network_id, + (misc->vxlan_vni << 8)); + misc->vxlan_vni = 0; + } + + if (spec->cvlan_tag) { + DR_STE_SET(eth_l2_tnl, tag, first_vlan_qualifier, DR_STE_CVLAN); + spec->cvlan_tag = 0; + } else if (spec->svlan_tag) { + DR_STE_SET(eth_l2_tnl, tag, first_vlan_qualifier, DR_STE_SVLAN); + spec->svlan_tag = 0; + } + + if (spec->ip_version) { + if (spec->ip_version == IP_VERSION_IPV4) { + DR_STE_SET(eth_l2_tnl, tag, l3_type, STE_IPV4); + spec->ip_version = 0; + } else if (spec->ip_version == IP_VERSION_IPV6) { + DR_STE_SET(eth_l2_tnl, tag, l3_type, STE_IPV6); + spec->ip_version = 0; + } else { + errno = EINVAL; + return errno; + } + } + + return 0; +} + +void dr_ste_build_eth_l2_tnl(struct dr_ste_build *sb, + struct dr_match_param *mask, bool inner, bool rx) +{ + dr_ste_build_eth_l2_tnl_bit_mask(mask, inner, sb->bit_mask); + + sb->rx = rx; + sb->inner = inner; + sb->lu_type = DR_STE_LU_TYPE_ETHL2_TUNNELING_I; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_build_eth_l2_tnl_tag; +} + +static void dr_ste_build_eth_l3_ipv4_misc_bit_mask(struct dr_match_param *value, + bool inner, uint8_t *bit_mask) +{ + struct dr_match_spec *mask = inner ? &value->inner : &value->outer; + + DR_STE_SET_MASK_V(eth_l3_ipv4_misc, bit_mask, time_to_live, mask, ip_ttl_hoplimit); +} + +static int dr_ste_build_eth_l3_ipv4_misc_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *hw_ste_p) +{ + struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; + struct dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; + uint8_t *tag = hw_ste->tag; + + DR_STE_SET_TAG(eth_l3_ipv4_misc, tag, time_to_live, spec, ip_ttl_hoplimit); + + return 0; +} + +void dr_ste_build_eth_l3_ipv4_misc(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx) +{ + dr_ste_build_eth_l3_ipv4_misc_bit_mask(mask, inner, sb->bit_mask); + + sb->rx = rx; + sb->inner = inner; + sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL3_IPV4_MISC, rx, inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_build_eth_l3_ipv4_misc_tag; +} + +static void dr_ste_build_ipv6_l3_l4_bit_mask(struct dr_match_param *value, + bool inner, uint8_t *bit_mask) +{ + struct dr_match_spec *mask = inner ? 
&value->inner : &value->outer; + struct dr_match_misc *misc = &value->misc; + + DR_STE_SET_MASK_V(eth_l4, bit_mask, dst_port, mask, tcp_dport); + DR_STE_SET_MASK_V(eth_l4, bit_mask, src_port, mask, tcp_sport); + DR_STE_SET_MASK_V(eth_l4, bit_mask, dst_port, mask, udp_dport); + DR_STE_SET_MASK_V(eth_l4, bit_mask, src_port, mask, udp_sport); + DR_STE_SET_MASK_V(eth_l4, bit_mask, protocol, mask, ip_protocol); + DR_STE_SET_MASK_V(eth_l4, bit_mask, fragmented, mask, frag); + DR_STE_SET_MASK_V(eth_l4, bit_mask, dscp, mask, ip_dscp); + DR_STE_SET_MASK_V(eth_l4, bit_mask, ecn, mask, ip_ecn); + DR_STE_SET_MASK_V(eth_l4, bit_mask, ipv6_hop_limit, mask, ip_ttl_hoplimit); + if (inner) { + DR_STE_SET_MASK_V(eth_l4, bit_mask, flow_label, + misc, inner_ipv6_flow_label); + } else { + DR_STE_SET_MASK_V(eth_l4, bit_mask, flow_label, + misc, outer_ipv6_flow_label); + } + + if (mask->tcp_flags) { + DR_STE_SET_TCP_FLAGS(eth_l4, bit_mask, mask); + mask->tcp_flags = 0; + } +} + +static int dr_ste_build_ipv6_l3_l4_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *hw_ste_p) +{ + struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; + struct dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; + struct dr_match_misc *misc = &value->misc; + uint8_t *tag = hw_ste->tag; + + DR_STE_SET_TAG(eth_l4, tag, dst_port, spec, tcp_dport); + DR_STE_SET_TAG(eth_l4, tag, src_port, spec, tcp_sport); + DR_STE_SET_TAG(eth_l4, tag, dst_port, spec, udp_dport); + DR_STE_SET_TAG(eth_l4, tag, src_port, spec, udp_sport); + DR_STE_SET_TAG(eth_l4, tag, protocol, spec, ip_protocol); + DR_STE_SET_TAG(eth_l4, tag, fragmented, spec, frag); + DR_STE_SET_TAG(eth_l4, tag, dscp, spec, ip_dscp); + DR_STE_SET_TAG(eth_l4, tag, ecn, spec, ip_ecn); + DR_STE_SET_TAG(eth_l4, tag, ipv6_hop_limit, spec, ip_ttl_hoplimit); + if (sb->inner) { + DR_STE_SET_TAG(eth_l4, tag, flow_label, + misc, inner_ipv6_flow_label); + } else { + DR_STE_SET_TAG(eth_l4, tag, flow_label, + misc, outer_ipv6_flow_label); + } + + if (spec->tcp_flags) { + DR_STE_SET_TCP_FLAGS(eth_l4, tag, spec); + spec->tcp_flags = 0; + } + + return 0; +} + +void dr_ste_build_ipv6_l3_l4(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx) +{ + dr_ste_build_ipv6_l3_l4_bit_mask(mask, inner, sb->bit_mask); + + sb->rx = rx; + sb->inner = inner; + sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL4, rx, inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_build_ipv6_l3_l4_tag; +} + +static int dr_ste_build_empty_always_hit_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *hw_ste_p) +{ + return 0; +} + +void dr_ste_build_empty_always_hit(struct dr_ste_build *sb, bool rx) +{ + sb->rx = rx; + sb->lu_type = DR_STE_LU_TYPE_DONT_CARE; + sb->byte_mask = 0; + sb->ste_build_tag_func = &dr_ste_build_empty_always_hit_tag; +} + +static void dr_ste_build_mpls_bit_mask(struct dr_match_param *value, + bool inner, uint8_t *bit_mask) +{ + struct dr_match_misc2 *misc2_mask = &value->misc2; + + if (inner) + DR_STE_SET_MPLS_MASK(mpls, misc2_mask, inner, bit_mask); + else + DR_STE_SET_MPLS_MASK(mpls, misc2_mask, outer, bit_mask); +} + +static int dr_ste_build_mpls_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *hw_ste_p) +{ + struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; + struct dr_match_misc2 *misc2_mask = &value->misc2; + uint8_t *tag = hw_ste->tag; + + if (sb->inner) + DR_STE_SET_MPLS_TAG(mpls, misc2_mask, inner, tag); + else + 
DR_STE_SET_MPLS_TAG(mpls, misc2_mask, outer, tag); + + return 0; +} + +void dr_ste_build_mpls(struct dr_ste_build *sb, struct dr_match_param *mask, + bool inner, bool rx) +{ + dr_ste_build_mpls_bit_mask(mask, inner, sb->bit_mask); + + sb->rx = rx; + sb->inner = inner; + sb->lu_type = DR_STE_CALC_LU_TYPE(MPLS_FIRST, rx, inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_build_mpls_tag; +} + +static void dr_ste_build_gre_bit_mask(struct dr_match_param *value, + bool inner, uint8_t *bit_mask) +{ + struct dr_match_misc *misc_mask = &value->misc; + + DR_STE_SET_MASK_V(gre, bit_mask, gre_protocol, misc_mask, gre_protocol); + DR_STE_SET_MASK_V(gre, bit_mask, gre_k_present, misc_mask, gre_k_present); + DR_STE_SET_MASK_V(gre, bit_mask, gre_key_h, misc_mask, gre_key_h); + DR_STE_SET_MASK_V(gre, bit_mask, gre_key_l, misc_mask, gre_key_l); + + DR_STE_SET_MASK_V(gre, bit_mask, gre_c_present, misc_mask, gre_c_present); + DR_STE_SET_MASK_V(gre, bit_mask, gre_s_present, misc_mask, gre_s_present); +} + +static int dr_ste_build_gre_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *hw_ste_p) +{ + struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; + struct dr_match_misc *misc = &value->misc; + uint8_t *tag = hw_ste->tag; + + DR_STE_SET_TAG(gre, tag, gre_protocol, misc, gre_protocol); + + DR_STE_SET_TAG(gre, tag, gre_k_present, misc, gre_k_present); + DR_STE_SET_TAG(gre, tag, gre_key_h, misc, gre_key_h); + DR_STE_SET_TAG(gre, tag, gre_key_l, misc, gre_key_l); + + DR_STE_SET_TAG(gre, tag, gre_c_present, misc, gre_c_present); + + DR_STE_SET_TAG(gre, tag, gre_s_present, misc, gre_s_present); + + return 0; +} + +void dr_ste_build_gre(struct dr_ste_build *sb, struct dr_match_param *mask, + bool inner, bool rx) +{ + dr_ste_build_gre_bit_mask(mask, inner, sb->bit_mask); + + sb->rx = rx; + sb->inner = inner; + sb->lu_type = DR_STE_LU_TYPE_GRE; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_build_gre_tag; +} + +static void dr_ste_build_flex_parser_0_bit_mask(struct dr_match_param *value, + bool inner, uint8_t *bit_mask) +{ + struct dr_match_misc2 *misc_2_mask = &value->misc2; + + if (DR_STE_IS_OUTER_MPLS_OVER_GRE_SET(misc_2_mask)) { + DR_STE_SET_MASK_V(flex_parser_0, bit_mask, parser_3_label, + misc_2_mask, outer_first_mpls_over_gre_label); + + DR_STE_SET_MASK_V(flex_parser_0, bit_mask, parser_3_exp, + misc_2_mask, outer_first_mpls_over_gre_exp); + + DR_STE_SET_MASK_V(flex_parser_0, bit_mask, parser_3_s_bos, + misc_2_mask, outer_first_mpls_over_gre_s_bos); + + DR_STE_SET_MASK_V(flex_parser_0, bit_mask, parser_3_ttl, + misc_2_mask, outer_first_mpls_over_gre_ttl); + } else { + DR_STE_SET_MASK_V(flex_parser_0, bit_mask, parser_3_label, + misc_2_mask, outer_first_mpls_over_udp_label); + + DR_STE_SET_MASK_V(flex_parser_0, bit_mask, parser_3_exp, + misc_2_mask, outer_first_mpls_over_udp_exp); + + DR_STE_SET_MASK_V(flex_parser_0, bit_mask, parser_3_s_bos, + misc_2_mask, outer_first_mpls_over_udp_s_bos); + + DR_STE_SET_MASK_V(flex_parser_0, bit_mask, parser_3_ttl, + misc_2_mask, outer_first_mpls_over_udp_ttl); + } +} + +static int dr_ste_build_flex_parser_0_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *hw_ste_p) +{ + struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; + struct dr_match_misc2 *misc_2_mask = &value->misc2; + uint8_t *tag = hw_ste->tag; + + if (DR_STE_IS_OUTER_MPLS_OVER_GRE_SET(misc_2_mask)) { + DR_STE_SET_TAG(flex_parser_0, 
tag, parser_3_label, + misc_2_mask, outer_first_mpls_over_gre_label); + + DR_STE_SET_TAG(flex_parser_0, tag, parser_3_exp, + misc_2_mask, outer_first_mpls_over_gre_exp); + + DR_STE_SET_TAG(flex_parser_0, tag, parser_3_s_bos, + misc_2_mask, outer_first_mpls_over_gre_s_bos); + + DR_STE_SET_TAG(flex_parser_0, tag, parser_3_ttl, + misc_2_mask, outer_first_mpls_over_gre_ttl); + } else { + DR_STE_SET_TAG(flex_parser_0, tag, parser_3_label, + misc_2_mask, outer_first_mpls_over_udp_label); + + DR_STE_SET_TAG(flex_parser_0, tag, parser_3_exp, + misc_2_mask, outer_first_mpls_over_udp_exp); + + DR_STE_SET_TAG(flex_parser_0, tag, parser_3_s_bos, + misc_2_mask, outer_first_mpls_over_udp_s_bos); + + DR_STE_SET_TAG(flex_parser_0, tag, parser_3_ttl, + misc_2_mask, outer_first_mpls_over_udp_ttl); + } + return 0; +} + +void dr_ste_build_flex_parser_0(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx) +{ + dr_ste_build_flex_parser_0_bit_mask(mask, inner, sb->bit_mask); + + sb->rx = rx; + sb->inner = inner; + sb->lu_type = DR_STE_LU_TYPE_FLEX_PARSER_0; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_build_flex_parser_0_tag; +} + +#define ICMP_TYPE_OFFSET_FIRST_DW 24 +#define ICMP_CODE_OFFSET_FIRST_DW 16 +#define ICMP_HEADER_DATA_OFFSET_SECOND_DW 0 + +static int dr_ste_build_flex_parser_1_bit_mask(struct dr_match_param *mask, + struct dr_devx_caps *caps, + uint8_t *bit_mask) +{ + struct dr_match_misc3 *misc_3_mask = &mask->misc3; + bool is_ipv4_mask = DR_MASK_IS_FLEX_PARSER_ICMPV4_SET(misc_3_mask); + uint32_t icmp_header_data_mask; + uint32_t icmp_type_mask; + uint32_t icmp_code_mask; + int dw0_location; + int dw1_location; + + if (is_ipv4_mask) { + icmp_header_data_mask = misc_3_mask->icmpv4_header_data; + icmp_type_mask = misc_3_mask->icmpv4_type; + icmp_code_mask = misc_3_mask->icmpv4_code; + dw0_location = caps->flex_parser_id_icmp_dw0; + dw1_location = caps->flex_parser_id_icmp_dw1; + } else { + icmp_header_data_mask = misc_3_mask->icmpv6_header_data; + icmp_type_mask = misc_3_mask->icmpv6_type; + icmp_code_mask = misc_3_mask->icmpv6_code; + dw0_location = caps->flex_parser_id_icmpv6_dw0; + dw1_location = caps->flex_parser_id_icmpv6_dw1; + } + + switch (dw0_location) { + case 4: + if (icmp_type_mask) { + DR_STE_SET(flex_parser_1, bit_mask, flex_parser_4, + (icmp_type_mask << ICMP_TYPE_OFFSET_FIRST_DW)); + if (is_ipv4_mask) + misc_3_mask->icmpv4_type = 0; + else + misc_3_mask->icmpv6_type = 0; + } + if (icmp_code_mask) { + uint32_t cur_val = DR_STE_GET(flex_parser_1, bit_mask, + flex_parser_4); + DR_STE_SET(flex_parser_1, bit_mask, flex_parser_4, + cur_val | (icmp_code_mask << ICMP_CODE_OFFSET_FIRST_DW)); + if (is_ipv4_mask) + misc_3_mask->icmpv4_code = 0; + else + misc_3_mask->icmpv6_code = 0; + } + break; + default: + errno = ENOTSUP; + return errno; + } + + switch (dw1_location) { + case 5: + if (icmp_header_data_mask) { + DR_STE_SET(flex_parser_1, bit_mask, flex_parser_5, + (icmp_header_data_mask << ICMP_HEADER_DATA_OFFSET_SECOND_DW)); + if (is_ipv4_mask) + misc_3_mask->icmpv4_header_data = 0; + else + misc_3_mask->icmpv6_header_data = 0; + } + break; + default: + errno = ENOTSUP; + return errno; + } + + return 0; +} + +static int dr_ste_build_flex_parser_1_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *hw_ste_p) +{ + struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; + struct dr_match_misc3 *misc_3 = &value->misc3; + bool is_ipv4 = DR_MASK_IS_FLEX_PARSER_ICMPV4_SET(misc_3); + 
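+	/*
+	 * ICMPv4 and ICMPv6 fields live in different flex parser dwords;
+	 * the dword locations are taken from the device caps (sb->caps),
+	 * and only parser dwords 4 and 5 are handled by the switches below.
+	 */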
uint8_t *tag = hw_ste->tag; + uint32_t icmp_header_data; + uint32_t icmp_type; + uint32_t icmp_code; + int dw0_location; + int dw1_location; + + if (is_ipv4) { + icmp_header_data = misc_3->icmpv4_header_data; + icmp_type = misc_3->icmpv4_type; + icmp_code = misc_3->icmpv4_code; + dw0_location = sb->caps->flex_parser_id_icmp_dw0; + dw1_location = sb->caps->flex_parser_id_icmp_dw1; + } else { + icmp_header_data = misc_3->icmpv6_header_data; + icmp_type = misc_3->icmpv6_type; + icmp_code = misc_3->icmpv6_code; + dw0_location = sb->caps->flex_parser_id_icmpv6_dw0; + dw1_location = sb->caps->flex_parser_id_icmpv6_dw1; + } + + switch (dw0_location) { + case 4: + if (icmp_type) { + DR_STE_SET(flex_parser_1, tag, flex_parser_4, + (icmp_type << ICMP_TYPE_OFFSET_FIRST_DW)); + if (is_ipv4) + misc_3->icmpv4_type = 0; + else + misc_3->icmpv6_type = 0; + } + + if (icmp_code) { + uint32_t cur_val = DR_STE_GET(flex_parser_1, tag, + flex_parser_4); + DR_STE_SET(flex_parser_1, tag, flex_parser_4, + cur_val | (icmp_code << ICMP_CODE_OFFSET_FIRST_DW)); + if (is_ipv4) + misc_3->icmpv4_code = 0; + else + misc_3->icmpv6_code = 0; + } + break; + default: + errno = ENOTSUP; + return errno; + } + + switch (dw1_location) { + case 5: + if (icmp_header_data) { + DR_STE_SET(flex_parser_1, tag, flex_parser_5, + (icmp_header_data << ICMP_HEADER_DATA_OFFSET_SECOND_DW)); + if (is_ipv4) + misc_3->icmpv4_header_data = 0; + else + misc_3->icmpv6_header_data = 0; + } + break; + default: + errno = ENOTSUP; + return errno; + } + + return 0; +} + +int dr_ste_build_flex_parser_1(struct dr_ste_build *sb, + struct dr_match_param *mask, + struct dr_devx_caps *caps, + bool inner, bool rx) +{ + int ret; + + ret = dr_ste_build_flex_parser_1_bit_mask(mask, caps, sb->bit_mask); + if (ret) + return ret; + + sb->rx = rx; + sb->inner = inner; + sb->caps = caps; + sb->lu_type = DR_STE_LU_TYPE_FLEX_PARSER_1; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_build_flex_parser_1_tag; + + return 0; +} + +static void dr_ste_build_general_purpose_bit_mask(struct dr_match_param *value, + bool inner, uint8_t *bit_mask) +{ + struct dr_match_misc2 *misc_2_mask = &value->misc2; + + DR_STE_SET_MASK_V(general_purpose, bit_mask, + general_purpose_lookup_field, misc_2_mask, + metadata_reg_a); +} + +static int dr_ste_build_general_purpose_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *hw_ste_p) +{ + struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; + struct dr_match_misc2 *misc_2_mask = &value->misc2; + uint8_t *tag = hw_ste->tag; + + DR_STE_SET_TAG(general_purpose, tag, general_purpose_lookup_field, + misc_2_mask, metadata_reg_a); + + return 0; +} + +void dr_ste_build_general_purpose(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx) +{ + dr_ste_build_general_purpose_bit_mask(mask, inner, sb->bit_mask); + + sb->rx = rx; + sb->inner = inner; + sb->lu_type = DR_STE_LU_TYPE_GENERAL_PURPOSE; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_build_general_purpose_tag; +} + +static void dr_ste_build_eth_l4_misc_bit_mask(struct dr_match_param *value, + bool inner, uint8_t *bit_mask) +{ + struct dr_match_misc3 *misc_3_mask = &value->misc3; + + if (inner) { + DR_STE_SET_MASK_V(eth_l4_misc, bit_mask, seq_num, misc_3_mask, + inner_tcp_seq_num); + DR_STE_SET_MASK_V(eth_l4_misc, bit_mask, ack_num, misc_3_mask, + inner_tcp_ack_num); + } else { + DR_STE_SET_MASK_V(eth_l4_misc, bit_mask, seq_num, 
misc_3_mask, + outer_tcp_seq_num); + DR_STE_SET_MASK_V(eth_l4_misc, bit_mask, ack_num, misc_3_mask, + outer_tcp_ack_num); + } +} + +static int dr_ste_build_eth_l4_misc_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *hw_ste_p) +{ + struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; + struct dr_match_misc3 *misc3 = &value->misc3; + uint8_t *tag = hw_ste->tag; + + if (sb->inner) { + DR_STE_SET_TAG(eth_l4_misc, tag, seq_num, misc3, inner_tcp_seq_num); + DR_STE_SET_TAG(eth_l4_misc, tag, ack_num, misc3, inner_tcp_ack_num); + } else { + DR_STE_SET_TAG(eth_l4_misc, tag, seq_num, misc3, outer_tcp_seq_num); + DR_STE_SET_TAG(eth_l4_misc, tag, ack_num, misc3, outer_tcp_ack_num); + } + + return 0; +} + +void dr_ste_build_eth_l4_misc(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx) +{ + dr_ste_build_eth_l4_misc_bit_mask(mask, inner, sb->bit_mask); + + sb->rx = rx; + sb->inner = inner; + sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL4_MISC, rx, inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_build_eth_l4_misc_tag; +} + +static void +dr_ste_build_flex_parser_tnl_vxlan_gpe_bit_mask(struct dr_match_param *value, + bool inner, uint8_t *bit_mask) +{ + struct dr_match_misc3 *misc_3_mask = &value->misc3; + + DR_STE_SET_MASK_V(flex_parser_tnl_vxlan_gpe, bit_mask, + outer_vxlan_gpe_flags, + misc_3_mask, outer_vxlan_gpe_flags); + DR_STE_SET_MASK_V(flex_parser_tnl_vxlan_gpe, bit_mask, + outer_vxlan_gpe_next_protocol, + misc_3_mask, outer_vxlan_gpe_next_protocol); + DR_STE_SET_MASK_V(flex_parser_tnl_vxlan_gpe, bit_mask, + outer_vxlan_gpe_vni, + misc_3_mask, outer_vxlan_gpe_vni); +} + +static int +dr_ste_build_flex_parser_tnl_vxlan_gpe_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *hw_ste_p) +{ + struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; + struct dr_match_misc3 *misc3 = &value->misc3; + uint8_t *tag = hw_ste->tag; + + DR_STE_SET_TAG(flex_parser_tnl_vxlan_gpe, tag, + outer_vxlan_gpe_flags, misc3, + outer_vxlan_gpe_flags); + DR_STE_SET_TAG(flex_parser_tnl_vxlan_gpe, tag, + outer_vxlan_gpe_next_protocol, misc3, + outer_vxlan_gpe_next_protocol); + DR_STE_SET_TAG(flex_parser_tnl_vxlan_gpe, tag, + outer_vxlan_gpe_vni, misc3, + outer_vxlan_gpe_vni); + + return 0; +} + +void dr_ste_build_flex_parser_tnl_vxlan_gpe(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx) +{ + dr_ste_build_flex_parser_tnl_vxlan_gpe_bit_mask(mask, inner, + sb->bit_mask); + sb->rx = rx; + sb->inner = inner; + sb->lu_type = DR_STE_LU_TYPE_FLEX_PARSER_TNL_HEADER; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_build_flex_parser_tnl_vxlan_gpe_tag; +} + +static void +dr_ste_build_flex_parser_tnl_geneve_bit_mask(struct dr_match_param *value, + uint8_t *bit_mask) +{ + struct dr_match_misc *misc_mask = &value->misc; + + DR_STE_SET_MASK_V(flex_parser_tnl_geneve, bit_mask, + geneve_protocol_type, + misc_mask, geneve_protocol_type); + DR_STE_SET_MASK_V(flex_parser_tnl_geneve, bit_mask, + geneve_oam, + misc_mask, geneve_oam); + DR_STE_SET_MASK_V(flex_parser_tnl_geneve, bit_mask, + geneve_opt_len, + misc_mask, geneve_opt_len); + DR_STE_SET_MASK_V(flex_parser_tnl_geneve, bit_mask, + geneve_vni, + misc_mask, geneve_vni); +} + +static int +dr_ste_build_flex_parser_tnl_geneve_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *hw_ste_p) +{ + struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format 
*)hw_ste_p; + struct dr_match_misc *misc = &value->misc; + uint8_t *tag = hw_ste->tag; + + DR_STE_SET_TAG(flex_parser_tnl_geneve, tag, + geneve_protocol_type, misc, geneve_protocol_type); + DR_STE_SET_TAG(flex_parser_tnl_geneve, tag, + geneve_oam, misc, geneve_oam); + DR_STE_SET_TAG(flex_parser_tnl_geneve, tag, + geneve_opt_len, misc, geneve_opt_len); + DR_STE_SET_TAG(flex_parser_tnl_geneve, tag, + geneve_vni, misc, geneve_vni); + + return 0; +} + +void dr_ste_build_flex_parser_tnl_geneve(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx) +{ + dr_ste_build_flex_parser_tnl_geneve_bit_mask(mask, sb->bit_mask); + sb->rx = rx; + sb->inner = inner; + sb->lu_type = DR_STE_LU_TYPE_FLEX_PARSER_TNL_HEADER; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_build_flex_parser_tnl_geneve_tag; +} + +static void +dr_ste_build_flex_parser_tnl_gtpu_bit_mask(struct dr_match_param *value, + uint8_t *bit_mask) +{ + struct dr_match_misc3 *misc3 = &value->misc3; + + DR_STE_SET_MASK_V(flex_parser_tnl_gtpu, bit_mask, + gtpu_flags, misc3, + gtpu_flags); + DR_STE_SET_MASK_V(flex_parser_tnl_gtpu, bit_mask, + gtpu_msg_type, misc3, + gtpu_msg_type); + DR_STE_SET_MASK_V(flex_parser_tnl_gtpu, bit_mask, + gtpu_teid, misc3, + gtpu_teid); +} + +static int +dr_ste_build_flex_parser_tnl_gtpu_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *hw_ste_p) +{ + struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; + struct dr_match_misc3 *misc3 = &value->misc3; + uint8_t *tag = hw_ste->tag; + + DR_STE_SET_TAG(flex_parser_tnl_gtpu, tag, + gtpu_flags, misc3, + gtpu_flags); + DR_STE_SET_TAG(flex_parser_tnl_gtpu, tag, + gtpu_msg_type, misc3, + gtpu_msg_type); + DR_STE_SET_TAG(flex_parser_tnl_gtpu, tag, + gtpu_teid, misc3, + gtpu_teid); + + return 0; +} + +void dr_ste_build_flex_parser_tnl_gtpu(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx) +{ + dr_ste_build_flex_parser_tnl_gtpu_bit_mask(mask, sb->bit_mask); + sb->rx = rx; + sb->inner = inner; + sb->lu_type = DR_STE_LU_TYPE_FLEX_PARSER_TNL_HEADER; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_build_flex_parser_tnl_gtpu_tag; +} + +static void dr_ste_build_register_0_bit_mask(struct dr_match_param *value, + uint8_t *bit_mask) +{ + struct dr_match_misc2 *misc_2_mask = &value->misc2; + + DR_STE_SET_MASK_V(register_0, bit_mask, register_0_h, + misc_2_mask, metadata_reg_c_0); + DR_STE_SET_MASK_V(register_0, bit_mask, register_0_l, + misc_2_mask, metadata_reg_c_1); + DR_STE_SET_MASK_V(register_0, bit_mask, register_1_h, + misc_2_mask, metadata_reg_c_2); + DR_STE_SET_MASK_V(register_0, bit_mask, register_1_l, + misc_2_mask, metadata_reg_c_3); +} + +static int dr_ste_build_register_0_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *hw_ste_p) +{ + struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; + struct dr_match_misc2 *misc2 = &value->misc2; + uint8_t *tag = hw_ste->tag; + + DR_STE_SET_TAG(register_0, tag, register_0_h, misc2, metadata_reg_c_0); + DR_STE_SET_TAG(register_0, tag, register_0_l, misc2, metadata_reg_c_1); + DR_STE_SET_TAG(register_0, tag, register_1_h, misc2, metadata_reg_c_2); + DR_STE_SET_TAG(register_0, tag, register_1_l, misc2, metadata_reg_c_3); + + return 0; +} + +void dr_ste_build_register_0(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx) +{ + dr_ste_build_register_0_bit_mask(mask, sb->bit_mask); + + sb->rx 
= rx; + sb->inner = inner; + sb->lu_type = DR_STE_LU_TYPE_STEERING_REGISTERS_0; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_build_register_0_tag; +} + +static void dr_ste_build_register_1_bit_mask(struct dr_match_param *value, + uint8_t *bit_mask) +{ + struct dr_match_misc2 *misc_2_mask = &value->misc2; + + DR_STE_SET_MASK_V(register_1, bit_mask, register_2_h, + misc_2_mask, metadata_reg_c_4); + DR_STE_SET_MASK_V(register_1, bit_mask, register_2_l, + misc_2_mask, metadata_reg_c_5); + DR_STE_SET_MASK_V(register_1, bit_mask, register_3_h, + misc_2_mask, metadata_reg_c_6); + DR_STE_SET_MASK_V(register_1, bit_mask, register_3_l, + misc_2_mask, metadata_reg_c_7); +} + +static int dr_ste_build_register_1_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *hw_ste_p) +{ + struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; + struct dr_match_misc2 *misc2 = &value->misc2; + uint8_t *tag = hw_ste->tag; + + DR_STE_SET_TAG(register_1, tag, register_2_h, misc2, metadata_reg_c_4); + DR_STE_SET_TAG(register_1, tag, register_2_l, misc2, metadata_reg_c_5); + DR_STE_SET_TAG(register_1, tag, register_3_h, misc2, metadata_reg_c_6); + DR_STE_SET_TAG(register_1, tag, register_3_l, misc2, metadata_reg_c_7); + + return 0; +} + +void dr_ste_build_register_1(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx) +{ + dr_ste_build_register_1_bit_mask(mask, sb->bit_mask); + + sb->rx = rx; + sb->inner = inner; + sb->lu_type = DR_STE_LU_TYPE_STEERING_REGISTERS_1; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_build_register_1_tag; +} + +static int dr_ste_build_src_gvmi_qpn_bit_mask(struct dr_match_param *value, + uint8_t *bit_mask) +{ + struct dr_match_misc *misc_mask = &value->misc; + + if (misc_mask->source_port && misc_mask->source_port != 0xffff) { + errno = EINVAL; + return errno; + } + DR_STE_SET_MASK(src_gvmi_qp, bit_mask, source_gvmi, misc_mask, source_port); + DR_STE_SET_MASK(src_gvmi_qp, bit_mask, source_qp, misc_mask, source_sqn); + + return 0; +} + +static int dr_ste_build_src_gvmi_qpn_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *hw_ste_p) +{ + struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; + struct dr_match_misc *misc = &value->misc; + struct dr_devx_vport_cap *vport_cap; + uint8_t *tag = hw_ste->tag; + + DR_STE_SET_TAG(src_gvmi_qp, tag, source_qp, misc, source_sqn); + + vport_cap = dr_get_vport_cap(sb->caps, misc->source_port); + if (!vport_cap) + return errno; + + if (vport_cap->gvmi) + DR_STE_SET(src_gvmi_qp, tag, source_gvmi, vport_cap->gvmi); + + misc->source_port = 0; + + return 0; +} + +int dr_ste_build_src_gvmi_qpn(struct dr_ste_build *sb, + struct dr_match_param *mask, + struct dr_devx_caps *caps, + bool inner, bool rx) +{ + int ret; + + ret = dr_ste_build_src_gvmi_qpn_bit_mask(mask, sb->bit_mask); + if (ret) + return ret; + + sb->rx = rx; + sb->caps = caps; + sb->inner = inner; + sb->lu_type = DR_STE_LU_TYPE_SRC_GVMI_AND_QP; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_build_src_gvmi_qpn_tag; + + return 0; +} diff --git a/providers/mlx5/dr_table.c b/providers/mlx5/dr_table.c new file mode 100644 index 0000000..3f3a065 --- /dev/null +++ b/providers/mlx5/dr_table.c @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2019, Mellanox Technologies. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdlib.h> +#include "mlx5dv_dr.h" + +static void dr_table_uninit_nic(struct dr_table_rx_tx *nic_tbl) +{ + dr_htbl_put(nic_tbl->s_anchor); +} + +static void dr_table_uninit_fdb(struct mlx5dv_dr_table *tbl) +{ + dr_table_uninit_nic(&tbl->rx); + dr_table_uninit_nic(&tbl->tx); +} + +static void dr_table_uninit(struct mlx5dv_dr_table *tbl) +{ + pthread_mutex_lock(&tbl->dmn->mutex); + + switch (tbl->dmn->type) { + case MLX5DV_DR_DOMAIN_TYPE_NIC_RX: + dr_table_uninit_nic(&tbl->rx); + break; + case MLX5DV_DR_DOMAIN_TYPE_NIC_TX: + dr_table_uninit_nic(&tbl->tx); + break; + case MLX5DV_DR_DOMAIN_TYPE_FDB: + dr_table_uninit_fdb(tbl); + break; + default: + break; + } + + pthread_mutex_unlock(&tbl->dmn->mutex); +} + +static int dr_table_init_nic(struct mlx5dv_dr_domain *dmn, + struct dr_table_rx_tx *nic_tbl) +{ + struct dr_domain_rx_tx *nic_dmn = nic_tbl->nic_dmn; + struct dr_htbl_connect_info info; + int ret; + + nic_tbl->s_anchor = dr_ste_htbl_alloc(dmn->ste_icm_pool, + DR_CHUNK_SIZE_1, + DR_STE_LU_TYPE_DONT_CARE, + 0); + if (!nic_tbl->s_anchor) + return errno; + + info.type = CONNECT_MISS; + info.miss_icm_addr = nic_dmn->default_icm_addr; + ret = dr_ste_htbl_init_and_postsend(dmn, nic_dmn, nic_tbl->s_anchor, + &info, true); + if (ret) + goto free_s_anchor; + + dr_htbl_get(nic_tbl->s_anchor); + + return 0; + +free_s_anchor: + dr_ste_htbl_free(nic_tbl->s_anchor); + return ret; +} + +static int dr_table_init_fdb(struct mlx5dv_dr_table *tbl) +{ + int ret; + + ret = dr_table_init_nic(tbl->dmn, &tbl->rx); + if (ret) + return ret; + + ret = dr_table_init_nic(tbl->dmn, &tbl->tx); + if (ret) + goto destroy_rx; + + return 0; + +destroy_rx: + dr_table_uninit_nic(&tbl->rx); + return ret; +} + +static int dr_table_init(struct mlx5dv_dr_table *tbl) +{ + int ret = 0; + + list_head_init(&tbl->matcher_list); + + pthread_mutex_lock(&tbl->dmn->mutex); + + switch (tbl->dmn->type) { + case MLX5DV_DR_DOMAIN_TYPE_NIC_RX: + tbl->table_type = FS_FT_NIC_RX; + tbl->rx.nic_dmn = &tbl->dmn->info.rx; + ret = dr_table_init_nic(tbl->dmn, &tbl->rx); + break; + case MLX5DV_DR_DOMAIN_TYPE_NIC_TX: + tbl->table_type = FS_FT_NIC_TX; + tbl->tx.nic_dmn = &tbl->dmn->info.tx; + ret = dr_table_init_nic(tbl->dmn, 
&tbl->tx); + break; + case MLX5DV_DR_DOMAIN_TYPE_FDB: + tbl->table_type = FS_FT_FDB; + tbl->rx.nic_dmn = &tbl->dmn->info.rx; + tbl->tx.nic_dmn = &tbl->dmn->info.tx; + ret = dr_table_init_fdb(tbl); + break; + default: + assert(false); + break; + } + + pthread_mutex_unlock(&tbl->dmn->mutex); + + return ret; +} + +static int dr_table_create_devx_tbl(struct mlx5dv_dr_table *tbl) +{ + uint64_t icm_addr_rx = 0; + uint64_t icm_addr_tx = 0; + + if (tbl->rx.s_anchor) + icm_addr_rx = tbl->rx.s_anchor->chunk->icm_addr; + + if (tbl->tx.s_anchor) + icm_addr_tx = tbl->tx.s_anchor->chunk->icm_addr; + + tbl->devx_obj = dr_devx_create_flow_table(tbl->dmn->ctx, + tbl->table_type, + icm_addr_rx, + icm_addr_tx, + tbl->dmn->info.caps.max_ft_level - 1); + if (!tbl->devx_obj) + return errno; + + return 0; +} + +struct mlx5dv_dr_table *mlx5dv_dr_table_create(struct mlx5dv_dr_domain *dmn, + uint32_t level) +{ + struct mlx5dv_dr_table *tbl; + int ret; + + atomic_fetch_add(&dmn->refcount, 1); + + if (level && !dmn->info.supp_sw_steering) { + errno = EOPNOTSUPP; + goto dec_ref; + } + + tbl = calloc(1, sizeof(*tbl)); + if (!tbl) { + errno = ENOMEM; + goto dec_ref; + } + + tbl->dmn = dmn; + tbl->level = level; + atomic_init(&tbl->refcount, 1); + + if (!dr_is_root_table(tbl)) { + ret = dr_table_init(tbl); + if (ret) + goto free_tbl; + + ret = dr_table_create_devx_tbl(tbl); + if (ret) + goto uninit_tbl; + } + + list_node_init(&tbl->tbl_list); + list_add_tail(&dmn->tbl_list, &tbl->tbl_list); + + return tbl; + +uninit_tbl: + dr_table_uninit(tbl); +free_tbl: + free(tbl); +dec_ref: + atomic_fetch_sub(&dmn->refcount, 1); + return NULL; +} + +int mlx5dv_dr_table_destroy(struct mlx5dv_dr_table *tbl) +{ + int ret = 0; + + if (atomic_load(&tbl->refcount) > 1) + return EBUSY; + + if (!dr_is_root_table(tbl)) { + ret = mlx5dv_devx_obj_destroy(tbl->devx_obj); + if (ret) + return ret; + + dr_table_uninit(tbl); + } + + list_del(&tbl->tbl_list); + atomic_fetch_sub(&tbl->dmn->refcount, 1); + free(tbl); + + return ret; +} diff --git a/providers/mlx5/libmlx5.map b/providers/mlx5/libmlx5.map new file mode 100644 index 0000000..ef5930c --- /dev/null +++ b/providers/mlx5/libmlx5.map @@ -0,0 +1,135 @@ +/* Export symbols should be added below according to + Documentation/versioning.md document. 
*/ +MLX5_1.0 { + global: + mlx5dv_query_device; + mlx5dv_init_obj; + local: *; +}; + +MLX5_1.1 { + global: + mlx5dv_create_cq; +} MLX5_1.0; + +MLX5_1.2 { + global: + mlx5dv_init_obj; + mlx5dv_set_context_attr; +} MLX5_1.1; + +MLX5_1.3 { + global: + mlx5dv_create_qp; + mlx5dv_create_wq; +} MLX5_1.2; + +MLX5_1.4 { + global: + mlx5dv_get_clock_info; +} MLX5_1.3; + +MLX5_1.5 { + global: + mlx5dv_create_flow_action_esp; +} MLX5_1.4; + +MLX5_1.6 { + global: + mlx5dv_create_flow_matcher; + mlx5dv_destroy_flow_matcher; + mlx5dv_create_flow; +} MLX5_1.5; + +MLX5_1.7 { + global: + mlx5dv_create_flow_action_modify_header; + mlx5dv_create_flow_action_packet_reformat; + mlx5dv_devx_alloc_uar; + mlx5dv_devx_free_uar; + mlx5dv_devx_general_cmd; + mlx5dv_devx_obj_create; + mlx5dv_devx_obj_destroy; + mlx5dv_devx_obj_modify; + mlx5dv_devx_obj_query; + mlx5dv_devx_query_eqn; + mlx5dv_devx_umem_dereg; + mlx5dv_devx_umem_reg; + mlx5dv_open_device; +} MLX5_1.6; + +MLX5_1.8 { + global: + mlx5dv_devx_cq_modify; + mlx5dv_devx_cq_query; + mlx5dv_devx_ind_tbl_modify; + mlx5dv_devx_ind_tbl_query; + mlx5dv_devx_qp_modify; + mlx5dv_devx_qp_query; + mlx5dv_devx_srq_modify; + mlx5dv_devx_srq_query; + mlx5dv_devx_wq_modify; + mlx5dv_devx_wq_query; + mlx5dv_is_supported; +} MLX5_1.7; + +MLX5_1.9 { + global: + mlx5dv_devx_create_cmd_comp; + mlx5dv_devx_destroy_cmd_comp; + mlx5dv_devx_get_async_cmd_comp; + mlx5dv_devx_obj_query_async; +} MLX5_1.8; + +MLX5_1.10 { + global: + mlx5dv_alloc_dm; + mlx5dv_create_mkey; + mlx5dv_destroy_mkey; + mlx5dv_dr_action_create_dest_table; + mlx5dv_dr_action_create_dest_ibv_qp; + mlx5dv_dr_action_create_dest_vport; + mlx5dv_dr_action_create_flow_counter; + mlx5dv_dr_action_create_drop; + mlx5dv_dr_action_create_modify_header; + mlx5dv_dr_action_create_packet_reformat; + mlx5dv_dr_action_create_tag; + mlx5dv_dr_action_destroy; + mlx5dv_dr_domain_create; + mlx5dv_dr_domain_destroy; + mlx5dv_dr_domain_sync; + mlx5dv_dr_matcher_create; + mlx5dv_dr_matcher_destroy; + mlx5dv_dr_rule_create; + mlx5dv_dr_rule_destroy; + mlx5dv_dr_table_create; + mlx5dv_dr_table_destroy; + mlx5dv_qp_ex_from_ibv_qp_ex; +} MLX5_1.9; + +MLX5_1.11 { + global: + mlx5dv_devx_create_event_channel; + mlx5dv_devx_destroy_event_channel; + mlx5dv_devx_get_event; + mlx5dv_devx_subscribe_devx_event; + mlx5dv_devx_subscribe_devx_event_fd; +} MLX5_1.10; + +MLX5_1.12 { + global: + mlx5dv_alloc_var; + mlx5dv_dr_action_create_flow_meter; + mlx5dv_dr_action_modify_flow_meter; + mlx5dv_dump_dr_domain; + mlx5dv_dump_dr_matcher; + mlx5dv_dump_dr_rule; + mlx5dv_dump_dr_table; + mlx5dv_free_var; +} MLX5_1.11; + +MLX5_1.13 { + global: + mlx5dv_pp_alloc; + mlx5dv_pp_free; +} MLX5_1.12; diff --git a/providers/mlx5/man/CMakeLists.txt b/providers/mlx5/man/CMakeLists.txt new file mode 100644 index 0000000..d5f8b86 --- /dev/null +++ b/providers/mlx5/man/CMakeLists.txt @@ -0,0 +1,85 @@ +rdma_man_pages( + mlx5dv_alloc_dm.3.md + mlx5dv_alloc_var.3.md + mlx5dv_create_cq.3.md + mlx5dv_create_flow.3.md + mlx5dv_create_flow_action_modify_header.3.md + mlx5dv_create_flow_action_packet_reformat.3.md + mlx5dv_create_flow_matcher.3.md + mlx5dv_create_mkey.3.md + mlx5dv_create_qp.3.md + mlx5dv_devx_alloc_uar.3.md + mlx5dv_devx_create_cmd_comp.3.md + mlx5dv_devx_create_event_channel.3.md + mlx5dv_devx_get_event.3.md + mlx5dv_devx_obj_create.3.md + mlx5dv_devx_qp_modify.3.md + mlx5dv_devx_query_eqn.3.md + mlx5dv_devx_subscribe_devx_event.3.md + mlx5dv_devx_umem_reg.3.md + mlx5dv_dr_flow.3.md + mlx5dv_dump.3.md + mlx5dv_flow_action_esp.3.md + 
mlx5dv_get_clock_info.3 + mlx5dv_init_obj.3 + mlx5dv_is_supported.3.md + mlx5dv_open_device.3.md + mlx5dv_pp_alloc.3.md + mlx5dv_query_device.3 + mlx5dv_ts_to_ns.3 + mlx5dv_wr_post.3.md + mlx5dv.7 +) +rdma_alias_man_pages( + mlx5dv_alloc_var.3 mlx5dv_free_var.3 + mlx5dv_create_mkey.3 mlx5dv_destroy_mkey.3 + mlx5dv_devx_alloc_uar.3 mlx5dv_devx_free_uar.3 + mlx5dv_devx_create_cmd_comp.3 mlx5dv_devx_destroy_cmd_comp.3 + mlx5dv_devx_create_event_channel.3 mlx5dv_devx_destroy_event_channel.3 + mlx5dv_devx_create_cmd_comp.3 mlx5dv_devx_get_async_cmd_comp.3 + mlx5dv_devx_obj_create.3 mlx5dv_devx_general_cmd.3 + mlx5dv_devx_obj_create.3 mlx5dv_devx_obj_destroy.3 + mlx5dv_devx_obj_create.3 mlx5dv_devx_obj_query.3 + mlx5dv_devx_obj_create.3 mlx5dv_devx_obj_query_async.3 + mlx5dv_devx_obj_create.3 mlx5dv_devx_obj_modify.3 + mlx5dv_devx_qp_modify.3 mlx5dv_devx_qp_query.3 + mlx5dv_devx_qp_modify.3 mlx5dv_devx_cq_modify.3 + mlx5dv_devx_qp_modify.3 mlx5dv_devx_cq_query.3 + mlx5dv_devx_qp_modify.3 mlx5dv_devx_wq_modify.3 + mlx5dv_devx_qp_modify.3 mlx5dv_devx_wq_query.3 + mlx5dv_devx_qp_modify.3 mlx5dv_devx_srq_modify.3 + mlx5dv_devx_qp_modify.3 mlx5dv_devx_srq_query.3 + mlx5dv_devx_qp_modify.3 mlx5dv_devx_ind_tbl_modify.3 + mlx5dv_devx_qp_modify.3 mlx5dv_devx_ind_tbl_query.3 + mlx5dv_devx_subscribe_devx_event.3 mlx5dv_devx_subscribe_devx_event_fd.3 + mlx5dv_devx_umem_reg.3 mlx5dv_devx_umem_dereg.3 + mlx5dv_dr_flow.3 mlx5dv_dr_action_create_dest_table.3 + mlx5dv_dr_flow.3 mlx5dv_dr_action_create_dest_ibv_qp.3 + mlx5dv_dr_flow.3 mlx5dv_dr_action_create_dest_vport.3 + mlx5dv_dr_flow.3 mlx5dv_dr_action_create_flow_counter.3 + mlx5dv_dr_flow.3 mlx5dv_dr_action_create_drop.3 + mlx5dv_dr_flow.3 mlx5dv_dr_action_create_flow_meter.3 + mlx5dv_dr_flow.3 mlx5dv_dr_action_create_modify_header.3 + mlx5dv_dr_flow.3 mlx5dv_dr_action_create_packet_reformat.3 + mlx5dv_dr_flow.3 mlx5dv_dr_action_create_tag.3 + mlx5dv_dr_flow.3 mlx5dv_dr_action_destroy.3 + mlx5dv_dr_flow.3 mlx5dv_dr_action_modify_flow_meter.3 + mlx5dv_dr_flow.3 mlx5dv_dr_domain_create.3 + mlx5dv_dr_flow.3 mlx5dv_dr_domain_destroy.3 + mlx5dv_dr_flow.3 mlx5dv_dr_domain_sync.3 + mlx5dv_dr_flow.3 mlx5dv_dr_matcher_create.3 + mlx5dv_dr_flow.3 mlx5dv_dr_matcher_destroy.3 + mlx5dv_dr_flow.3 mlx5dv_dr_rule_create.3 + mlx5dv_dr_flow.3 mlx5dv_dr_rule_destroy.3 + mlx5dv_dr_flow.3 mlx5dv_dr_table_create.3 + mlx5dv_dr_flow.3 mlx5dv_dr_table_destroy.3 + mlx5dv_dump.3 mlx5dv_dump_dr_domain.3 + mlx5dv_dump.3 mlx5dv_dump_dr_matcher.3 + mlx5dv_dump.3 mlx5dv_dump_dr_rule.3 + mlx5dv_dump.3 mlx5dv_dump_dr_table.3 + mlx5dv_pp_alloc.3 mlx5dv_pp_free.3 + mlx5dv_wr_post.3 mlx5dv_wr_set_dc_addr.3 + mlx5dv_wr_post.3 mlx5dv_qp_ex_from_ibv_qp_ex.3 + mlx5dv_wr_post.3 mlx5dv_wr_mr_interleaved.3 + mlx5dv_wr_post.3 mlx5dv_wr_mr_list.3 +) diff --git a/providers/mlx5/man/mlx5dv.7 b/providers/mlx5/man/mlx5dv.7 new file mode 100644 index 0000000..012becc --- /dev/null +++ b/providers/mlx5/man/mlx5dv.7 @@ -0,0 +1,45 @@ +.\" -*- nroff -*- +.\" Licensed under the OpenIB.org (MIT) - See COPYING.md +.\" +.TH MLX5DV 7 2017-02-02 1.0.0 +.SH "NAME" +mlx5dv \- Direct verbs for mlx5 devices +.br +This is low level access to mlx5 devices to perform data path operations, +without general branching performed by \fBibv_post_send\fR(3). + +.SH "DESCRIPTION" +The libibverbs API is an abstract one. It is agnostic to any underlying +provider specific implementation. While this abstraction has the advantage +of user applications portability it has a performance penalty. 
For some
+applications optimizing performance is more important than portability.
+
+The mlx5 direct verbs API is intended for such applications.
+It exposes mlx5 specific low level data path (send/receive/completion)
+operations, allowing the application to bypass the libibverbs data path API.
+
+This interface consists of one hardware specific header file
+with relevant inline functions and conversion logic from ibverbs structures
+to mlx5 specific structures.
+
+Including mlx5dv.h directly and linking against the mlx5 library
+enables usage of this interface.
+
+Once an application uses the direct flow, the locking scheme is fully managed
+by the application itself. An application is expected not to mix direct and
+non-direct access on the same data path.
+
+.SH "NOTES"
+All Mellanox NIC devices starting from Connect-IB (Connect-IB,
+ConnectX-4, ConnectX-4Lx, ConnectX-5, ...) implement the mlx5 API,
+so using the mlx5 direct verbs does not limit the applications
+to a single NIC HW device and keeps some level of portability.
+
+.SH "SEE ALSO"
+.BR ibv_post_send (3),
+.BR verbs (7),
+.BR mlx5dv_is_supported (3)
+
+.SH "AUTHORS"
+.TP
+Leon Romanovsky <leonro@mellanox.com>
diff --git a/providers/mlx5/man/mlx5dv_alloc_dm.3.md b/providers/mlx5/man/mlx5dv_alloc_dm.3.md
new file mode 100644
index 0000000..4db281b
--- /dev/null
+++ b/providers/mlx5/man/mlx5dv_alloc_dm.3.md
@@ -0,0 +1,80 @@
+---
+layout: page
+title: mlx5dv_alloc_dm
+section: 3
+tagline: Verbs
+date: 2018-9-1
+header: "mlx5 Programmer's Manual"
+footer: mlx5
+---
+
+# NAME
+
+mlx5dv_alloc_dm - allocates device memory (DM)
+
+# SYNOPSIS
+
+```c
+#include <infiniband/mlx5dv.h>
+
+struct ibv_dm *mlx5dv_alloc_dm(struct ibv_context *context,
+			       struct ibv_alloc_dm_attr *dm_attr,
+			       struct mlx5dv_alloc_dm_attr *mlx5_dm_attr)
+```
+
+# DESCRIPTION
+
+**mlx5dv_alloc_dm()** allocates device memory (DM) with specific driver properties.
+
+# ARGUMENTS
+
+Please see *ibv_alloc_dm(3)* man page for *context* and *dm_attr*.
+
+## mlx5_dm_attr
+
+```c
+struct mlx5dv_alloc_dm_attr {
+	enum mlx5dv_alloc_dm_type type;
+	uint64_t comp_mask;
+};
+```
+
+*type*
+:	The device memory type the user wishes to allocate:
+
+	MLX5DV_DM_TYPE_MEMIC
+	Device memory of type MEMIC - on-chip memory that
+	can be allocated and used as a memory region for
+	transmitting/receiving packets directly from/to the
+	memory on the chip.
+
+	MLX5DV_DM_TYPE_STEERING_SW_ICM
+	Device memory of type STEERING SW ICM - this memory
+	is used by the device to store the packet steering
+	tables and rules. Can be used for direct table and steering
+	rules creation when allocated by a privileged user.
+
+	MLX5DV_DM_TYPE_HEADER_MODIFY_SW_ICM
+	Device memory of type HEADER MODIFY SW ICM - this memory
+	is used by the device to store the packet header modification
+	tables and rules. Can be used for direct table and header modification
+	rules creation when allocated by a privileged user.
+
+*comp_mask*
+:	Bitmask specifying what fields in the structure are valid:
+	Currently reserved and should be set to 0.
+
+# RETURN VALUE
+
+**mlx5dv_alloc_dm()**
+returns a pointer to the created DM; on error NULL will be returned and errno will be set.
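+
+A minimal usage sketch (hedged: *context* is assumed to be an open RDMA
+device context, and the 4KB MEMIC allocation is an illustrative choice):
+
+```c
+/* Allocate a 4KB MEMIC device memory region. */
+struct ibv_alloc_dm_attr dm_attr = { .length = 4096 };
+struct mlx5dv_alloc_dm_attr mlx5_dm_attr = {
+	.type = MLX5DV_DM_TYPE_MEMIC,
+	.comp_mask = 0,
+};
+
+struct ibv_dm *dm = mlx5dv_alloc_dm(context, &dm_attr, &mlx5_dm_attr);
+if (!dm)
+	return;	/* errno holds the failure reason */
+```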
+
+# SEE ALSO
+
+**ibv_alloc_dm**(3),
+
+# AUTHOR
+
+Ariel Levkovich <lariel@mellanox.com>
diff --git a/providers/mlx5/man/mlx5dv_alloc_var.3.md b/providers/mlx5/man/mlx5dv_alloc_var.3.md
new file mode 100644
index 0000000..cf46e1b
--- /dev/null
+++ b/providers/mlx5/man/mlx5dv_alloc_var.3.md
@@ -0,0 +1,70 @@
+---
+layout: page
+title: mlx5dv_alloc_var / mlx5dv_free_var
+section: 3
+tagline: Verbs
+---
+
+# NAME
+
+mlx5dv_alloc_var - Allocates a VAR
+
+mlx5dv_free_var - Frees a VAR
+
+# SYNOPSIS
+
+```c
+#include <infiniband/mlx5dv.h>
+
+struct mlx5dv_var *
+mlx5dv_alloc_var(struct ibv_context *context, uint32_t flags);
+
+void mlx5dv_free_var(struct mlx5dv_var *dv_var);
+```
+
+# DESCRIPTION
+
+Create / free a VAR which can be used for some device commands over the DEVX interface.
+
+The DEVX API enables direct access from the user space area to the mlx5 device
+driver; the VAR information is needed for a few commands related to Virtio.
+
+# ARGUMENTS
+*context*
+:	RDMA device context to work on.
+
+*flags*
+:	Allocation flags for the VAR.
+
+## dv_var
+
+```c
+struct mlx5dv_var {
+	uint32_t page_id;
+	uint32_t length;
+	off_t mmap_off;
+	uint64_t comp_mask;
+};
+```
+*page_id*
+:	The device page id to be used.
+
+*length*
+:	The mmap length parameter to be used for mapping a VA to the allocated VAR entry.
+
+*mmap_off*
+:	The mmap offset parameter to be used for mapping a VA to the allocated VAR entry.
+
+# RETURN VALUE
+
+Upon success *mlx5dv_alloc_var* returns a pointer to the created VAR;
+on error NULL will be returned and errno will be set.
+
+# SEE ALSO
+
+**mlx5dv_open_device**, **mlx5dv_devx_obj_create**
+
+# AUTHOR
+
+Yishai Hadas <yishaih@mellanox.com>
diff --git a/providers/mlx5/man/mlx5dv_create_cq.3.md b/providers/mlx5/man/mlx5dv_create_cq.3.md
new file mode 100644
index 0000000..c07cdd8
--- /dev/null
+++ b/providers/mlx5/man/mlx5dv_create_cq.3.md
@@ -0,0 +1,93 @@
+---
+layout: page
+title: mlx5dv_create_cq
+section: 3
+tagline: Verbs
+date: 2018-9-1
+header: "mlx5 Programmer's Manual"
+footer: mlx5
+---
+
+# NAME
+
+mlx5dv_create_cq - creates a completion queue (CQ)
+
+# SYNOPSIS
+
+```c
+#include <infiniband/mlx5dv.h>
+
+struct ibv_cq_ex *mlx5dv_create_cq(struct ibv_context *context,
+				   struct ibv_cq_init_attr_ex *cq_attr,
+				   struct mlx5dv_cq_init_attr *mlx5_cq_attr);
+```
+
+# DESCRIPTION
+
+**mlx5dv_create_cq()** creates a completion queue (CQ) with specific driver properties.
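+
+A minimal usage sketch (hedged: *context* is assumed to be an open RDMA
+device context; the CQE count and 128B CQE size are illustrative choices):
+
+```c
+struct ibv_cq_init_attr_ex cq_attr = { .cqe = 256 };
+struct mlx5dv_cq_init_attr mlx5_cq_attr = {
+	.comp_mask = MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE,
+	.cqe_size = 128,
+};
+
+struct ibv_cq_ex *cq = mlx5dv_create_cq(context, &cq_attr, &mlx5_cq_attr);
+if (!cq)
+	return;	/* errno holds the failure reason */
+```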
+ +# ARGUMENTS + +Please see **ibv_create_cq_ex(3)** man page for **context** and **cq_attr** + +## mlx5_cq_attr + +```c +struct mlx5dv_cq_init_attr { + uint64_t comp_mask; + uint8_t cqe_comp_res_format; + uint32_t flags; + uint16_t cqe_size; +}; +``` + +*comp_mask* +: Bitmask specifying what fields in the structure are valid: + + MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE + enables creating a CQ in a mode that few CQEs may be compressed into + a single CQE, valid values in *cqe_comp_res_format* + + MLX5DV_CQ_INIT_ATTR_MASK_FLAGS + valid values in *flags* + + MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE + valid values in *cqe_size* + +*cqe_comp_res_format* +: A bitwise OR of the various CQE response formats of the responder side: + + MLX5DV_CQE_RES_FORMAT_HASH + CQE compression with hash + + MLX5DV_CQE_RES_FORMAT_CSUM + CQE compression with RX checksum + + MLX5DV_CQE_RES_FORMAT_CSUM_STRIDX + CQE compression with stride index + +*flags* +: A bitwise OR of the various values described below: + + MLX5DV_CQ_INIT_ATTR_FLAGS_CQE_PAD + create a padded 128B CQE + +*cqe_size* +: configure the CQE size to be 64 or 128 bytes + other values will fail mlx5dv_create_cq. + +# RETURN VALUE + +**mlx5dv_create_cq()** +returns a pointer to the created CQ, or NULL if the request fails +and errno will be set. + + +# SEE ALSO + +**ibv_create_cq_ex**(3), + +# AUTHOR + +Yonatan Cohen <yonatanc@mellanox.com> diff --git a/providers/mlx5/man/mlx5dv_create_flow.3.md b/providers/mlx5/man/mlx5dv_create_flow.3.md new file mode 100644 index 0000000..bc423a8 --- /dev/null +++ b/providers/mlx5/man/mlx5dv_create_flow.3.md @@ -0,0 +1,92 @@ +--- +layout: page +title: mlx5dv_create_flow +section: 3 +tagline: Verbs +date: 2018-9-19 +header: "mlx5 Programmer's Manual" +footer: mlx5 +--- + +# NAME +mlx5dv_create_flow - creates a steering flow rule + +# SYNOPSIS + +```c +#include <infiniband/mlx5dv.h> + +struct ibv_flow * +mlx5dv_create_flow(struct mlx5dv_flow_matcher *flow_matcher, + struct mlx5dv_flow_match_parameters *match_value, + size_t num_actions, + struct mlx5dv_flow_action_attr actions_attr[]) +``` + + +# DESCRIPTION +**mlx5dv_create_flow()** creates a steering flow rule with the ability +to specify specific driver properties. + +# ARGUMENTS + +Please see *mlx5dv_create_flow_matcher(3)* for *flow_matcher* and *match_value*. + +*num_actions* +: Specifies how many actions are passed in *actions_attr* + +## *actions_attr* + +```c +struct mlx5dv_flow_action_attr { + enum mlx5dv_flow_action_type type; + union { + struct ibv_qp *qp; + struct ibv_counters *counter; + struct ibv_flow_action *action; + uint32_t tag_value; + struct mlx5dv_devx_obj *obj; + }; +}; +``` + +*type* +: MLX5DV_FLOW_ACTION_DEST_IBV_QP + The QP passed will receive the matched packets. + MLX5DV_FLOW_ACTION_IBV_FLOW_ACTION + The flow action to be applied. + MLX5DV_FLOW_ACTION_TAG + Flow tag to be provided in work completion. + MLX5DV_FLOW_ACTION_DEST_DEVX + The DEVX destination object for the matched packets. + MLX5DV_FLOW_ACTION_COUNTERS_DEVX + The DEVX counter object for the matched packets. + +*qp* +: QP passed, to be used with *type* *MLX5DV_FLOW_ACTION_DEST_IBV_QP*. + +*action* +: Flow action, to be used with *type* *MLX5DV_FLOW_ACTION_IBV_FLOW_ACTION* + see *mlx5dv_create_flow_action_modify_header(3)* and *mlx5dv_create_flow_action_packet_reformat(3)*. + +*tag_value* +: tag value to be passed in the work completion, to be used with *type* + *MLX5DV_FLOW_ACTION_TAG* see *ibv_create_cq_ex(3)*. 
+ +*obj* +: DEVX object, to be used with *type* *MLX5DV_FLOW_ACTION_DEST_DEVX* or by *MLX5DV_FLOW_ACTION_COUNTERS_DEVX*. + +# RETURN VALUE + +**mlx5dv_create_flow** +returns a pointer to the created flow rule, on error NULL will be returned and errno will be set. + +# SEE ALSO + +*mlx5dv_create_flow_action_modify_header(3)*, *mlx5dv_create_flow_action_packet_reformat(3)*, +*mlx5dv_create_flow_matcher(3)*, *mlx5dv_create_qp(3)*, *ibv_create_qp_ex(3)* +*ibv_create_cq_ex(3)* *ibv_create_counters(3)* + +# AUTHOR + +Mark Bloch <marb@mellanox.com> diff --git a/providers/mlx5/man/mlx5dv_create_flow_action_modify_header.3.md b/providers/mlx5/man/mlx5dv_create_flow_action_modify_header.3.md new file mode 100644 index 0000000..f89665f --- /dev/null +++ b/providers/mlx5/man/mlx5dv_create_flow_action_modify_header.3.md @@ -0,0 +1,54 @@ +--- +layout: page +title: mlx5dv_create_flow_action_modify_header +section: 3 +tagline: Verbs +--- + +# NAME + +mlx5dv_create_flow_action_modify_header - Flow action modify header for mlx5 provider + +# SYNOPSIS + +```c +#include <infiniband/mlx5dv.h> + +struct ibv_flow_action * +mlx5dv_create_flow_action_modify_header(struct ibv_context *ctx, + size_t actions_sz, + uint64_t actions[], + enum mlx5dv_flow_table_type ft_type) +``` + +# DESCRIPTION + +Create a modify header flow steering action, it allows mutating a packet header. + +# ARGUMENTS + +*ctx* +: RDMA device context to create the action on. + +*actions_sz* +: The size of *actions* buffer in bytes. + +*actions* +: A buffer which contains modify actions provided in device spec format (i.e. be64). + +*ft_type* +: Defines the flow table type to which the modify header action will be attached. + + MLX5DV_FLOW_TABLE_TYPE_NIC_RX: RX FLOW TABLE + + MLX5DV_FLOW_TABLE_TYPE_NIC_TX: TX FLOW TABLE + +# RETURN VALUE + +Upon success *mlx5dv_create_flow_action_modify_header* will return a new *struct +ibv_flow_action* object, on error NULL will be returned and errno will be set. + +# SEE ALSO + +*ibv_create_flow(3)*, *ibv_create_flow_action(3)* + diff --git a/providers/mlx5/man/mlx5dv_create_flow_action_packet_reformat.3.md b/providers/mlx5/man/mlx5dv_create_flow_action_packet_reformat.3.md new file mode 100644 index 0000000..424c376 --- /dev/null +++ b/providers/mlx5/man/mlx5dv_create_flow_action_packet_reformat.3.md @@ -0,0 +1,68 @@ + +--- +layout: page +title: mlx5dv_create_flow_action_packet_reformat +section: 3 +tagline: Verbs +--- + +# NAME + +mlx5dv_create_flow_action_packet_reformat - Flow action reformat packet for mlx5 provider + +# SYNOPSIS + +```c +#include <infiniband/mlx5dv.h> + +struct ibv_flow_action * +mlx5dv_create_flow_action_packet_reformat(struct ibv_context *ctx, + size_t data_sz, + void *data, + enum mlx5dv_flow_action_packet_reformat_type reformat_type, + enum mlx5dv_flow_table_type ft_type) +``` + +# DESCRIPTION + +Create a packet reformat flow steering action. +It allows adding/removing packet headers. + +# ARGUMENTS +*ctx* +: RDMA device context to create the action on. + +*data_sz* +: The size of *data* buffer. + +*data* +: A buffer which contains headers in case the actions requires them. + +*reformat_type* +: The reformat type to be create. Use enum mlx5dv_flow_action_packet_reformat_type. + MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2: Decap a generic L2 + tunneled packet up to inner L2. + + MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL: Generic encap, *data* + should contain the encapsulating headers. 
+
+	MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2: Decap where the
+	inner packet starts at L3. *data* should be a MAC or MAC + VLAN header
+	(14 or 18 bytes) to be appended to the packet after the decap action.
+
+	MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL: Encap where the
+	L2 of the original packet is not included. *data* should be the encapsulating header.
+
+*ft_type*
+:	Defines the flow table type to which the packet reformat action
+	will be attached.
+
+# RETURN VALUE
+
+Upon success *mlx5dv_create_flow_action_packet_reformat* will return a new *struct
+ibv_flow_action* object; on error NULL will be returned and errno will be set.
+
+# SEE ALSO
+
+*ibv_create_flow(3)*, *ibv_create_flow_action(3)*
+
diff --git a/providers/mlx5/man/mlx5dv_create_flow_matcher.3.md b/providers/mlx5/man/mlx5dv_create_flow_matcher.3.md
new file mode 100644
index 0000000..7b222b1
--- /dev/null
+++ b/providers/mlx5/man/mlx5dv_create_flow_matcher.3.md
@@ -0,0 +1,103 @@
+---
+layout: page
+title: mlx5dv_create_flow_matcher
+section: 3
+tagline: Verbs
+date: 2018-9-19
+header: "mlx5 Programmer's Manual"
+footer: mlx5
+---
+
+# NAME
+mlx5dv_create_flow_matcher - creates a matcher to be used with *mlx5dv_create_flow(3)*
+
+# SYNOPSIS
+
+```c
+#include <infiniband/mlx5dv.h>
+
+struct mlx5dv_flow_matcher *
+mlx5dv_create_flow_matcher(struct ibv_context *context,
+			   struct mlx5dv_flow_matcher_attr *attr)
+```
+
+# DESCRIPTION
+
+**mlx5dv_create_flow_matcher()** creates a flow matcher (mask) to be used
+with *mlx5dv_create_flow(3)*.
+
+# ARGUMENTS
+
+Please see *ibv_open_device(3)* for *context*.
+
+## *attr*
+
+```c
+struct mlx5dv_flow_matcher_attr {
+	enum ibv_flow_attr_type type;
+	uint32_t flags; /* From enum ibv_flow_flags */
+	uint16_t priority;
+	uint8_t match_criteria_enable; /* Device spec format */
+	struct mlx5dv_flow_match_parameters *match_mask;
+	uint64_t comp_mask;
+	enum mlx5dv_flow_table_type ft_type;
+};
+```
+
+*type*
+:	Type of matcher to be created:
+	IBV_FLOW_ATTR_NORMAL:
+	Normal rule according to specification.
+
+*flags*
+:	Special flags to control the rule:
+	0:
+	Nothing, or a zero value, means the matcher will store ingress flow rules.
+	IBV_FLOW_ATTR_FLAGS_EGRESS:
+	Specifies that this matcher will store egress flow rules.
+
+*priority*
+:	See *ibv_create_flow(3)*.
+
+*match_criteria_enable*
+:	Which match criteria are configured in *match_mask*, passed in
+	device spec format.
+
+## *match_mask*
+```c
+struct mlx5dv_flow_match_parameters {
+	size_t match_sz;
+	uint64_t match_buf[]; /* Device spec format */
+};
+```
+
+*match_sz*
+:	Size in bytes of *match_buf*.
+
+*match_buf*
+:	Sets which mask is to be used, passed in
+	device spec format.
+
+*comp_mask*
+:	MLX5DV_FLOW_MATCHER_MASK_FT_TYPE for *ft_type*
+
+## *ft_type*
+Specifies the flow table type in which the matcher will store the flow rules:
+	MLX5DV_FLOW_TABLE_TYPE_NIC_RX: This matcher will store ingress flow rules.
+	MLX5DV_FLOW_TABLE_TYPE_NIC_TX: This matcher will store egress flow rules.
+	MLX5DV_FLOW_TABLE_TYPE_FDB: This matcher will store FDB rules.
+	MLX5DV_FLOW_TABLE_TYPE_RDMA_RX: This matcher will store ingress RDMA flow rules.
+	MLX5DV_FLOW_TABLE_TYPE_RDMA_TX: This matcher will store egress RDMA flow rules.
+
+# RETURN VALUE
+
+**mlx5dv_create_flow_matcher**
+returns a pointer to *mlx5dv_flow_matcher*; on error NULL will be returned and errno will be set.
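+
+A hedged usage sketch (MATCH_BUF_SZ and the zeroed mask are placeholder
+assumptions; a real mask must be filled in device spec format):
+
+```c
+struct mlx5dv_flow_match_parameters *mask;
+
+mask = calloc(1, sizeof(*mask) + MATCH_BUF_SZ);
+if (!mask)
+	return;
+mask->match_sz = MATCH_BUF_SZ;
+/* Fill mask->match_buf per the device spec before creating the matcher. */
+
+struct mlx5dv_flow_matcher_attr attr = {
+	.type = IBV_FLOW_ATTR_NORMAL,
+	.priority = 0,
+	.match_criteria_enable = 0,	/* device spec format */
+	.match_mask = mask,
+};
+
+struct mlx5dv_flow_matcher *matcher =
+	mlx5dv_create_flow_matcher(context, &attr);
+```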
+
+# SEE ALSO
+
+*ibv_open_device(3)*, *ibv_create_flow(3)*
+
+# AUTHOR
+
+Mark Bloch <markb@mellanox.com>
diff --git a/providers/mlx5/man/mlx5dv_create_mkey.3.md b/providers/mlx5/man/mlx5dv_create_mkey.3.md
new file mode 100644
index 0000000..d35fa4c
--- /dev/null
+++ b/providers/mlx5/man/mlx5dv_create_mkey.3.md
@@ -0,0 +1,77 @@
+---
+layout: page
+title: mlx5dv_create_mkey / mlx5dv_destroy_mkey
+section: 3
+tagline: Verbs
+---
+
+# NAME
+
+mlx5dv_create_mkey - Creates an indirect mkey
+
+mlx5dv_destroy_mkey - Destroys an indirect mkey
+
+# SYNOPSIS
+
+```c
+#include <infiniband/mlx5dv.h>
+
+struct mlx5dv_mkey_init_attr {
+	struct ibv_pd *pd;
+	uint32_t create_flags;
+	uint16_t max_entries;
+};
+
+struct mlx5dv_mkey {
+	uint32_t lkey;
+	uint32_t rkey;
+};
+
+struct mlx5dv_mkey *
+mlx5dv_create_mkey(struct mlx5dv_mkey_init_attr *mkey_init_attr);
+
+int mlx5dv_destroy_mkey(struct mlx5dv_mkey *mkey);
+```
+
+# DESCRIPTION
+
+Create / destroy an indirect mkey.
+
+Creating an indirect mkey enables an application to use device-specific functionality.
+
+# ARGUMENTS
+
+## mkey_init_attr
+
+*pd*
+:	ibv protection domain.
+
+*create_flags*
+:	MLX5DV_MKEY_INIT_ATTR_FLAGS_INDIRECT:
+	Indirect mkey is being created.
+
+*max_entries*
+:	Requested maximum number of entries pointed to by this indirect mkey.
+	The function will update *mkey_init_attr->max_entries* with the actual
+	value for the mkey that was created; it will be greater than or equal
+	to the value requested.
+
+# RETURN VALUE
+
+Upon success *mlx5dv_create_mkey* will return a new *struct
+mlx5dv_mkey*; on error NULL will be returned and errno will be set.
+
+Upon success destroy returns 0; on failure, the value of errno is returned.
+
+# NOTES
+
+For this functionality to work, a DEVX context should be opened by using *mlx5dv_open_device*.
+
+The created indirect mkey cannot work with the scatter to CQE feature; consider *mlx5dv_create_qp()* with MLX5DV_QP_CREATE_DISABLE_SCATTER_TO_CQE for small messages.
+
+# SEE ALSO
+
+**mlx5dv_open_device**(3), **mlx5dv_create_qp**(3)
+
+# AUTHOR
+
+Yishai Hadas <yishaih@mellanox.com>
diff --git a/providers/mlx5/man/mlx5dv_create_qp.3.md b/providers/mlx5/man/mlx5dv_create_qp.3.md
new file mode 100644
index 0000000..856c69a
--- /dev/null
+++ b/providers/mlx5/man/mlx5dv_create_qp.3.md
@@ -0,0 +1,127 @@
+---
+layout: page
+title: mlx5dv_create_qp
+section: 3
+tagline: Verbs
+date: 2018-9-1
+header: "mlx5 Programmer's Manual"
+footer: mlx5
+---
+
+# NAME
+
+mlx5dv_create_qp - creates a queue pair (QP)
+
+# SYNOPSIS
+
+```c
+#include <infiniband/mlx5dv.h>
+
+struct ibv_qp *mlx5dv_create_qp(struct ibv_context *context,
+				struct ibv_qp_init_attr_ex *qp_attr,
+				struct mlx5dv_qp_init_attr *mlx5_qp_attr)
+```
+
+# DESCRIPTION
+
+**mlx5dv_create_qp()** creates a queue pair (QP) with specific driver properties.
+
+# ARGUMENTS
+
+Please see *ibv_create_qp_ex(3)* man page for *context* and *qp_attr*.
+
+## mlx5_qp_attr
+
+```c
+struct mlx5dv_qp_init_attr {
+	uint64_t comp_mask;
+	uint32_t create_flags;
+	struct mlx5dv_dc_init_attr dc_init_attr;
+	uint64_t send_ops_flags;
+};
+```
+
+*comp_mask*
+:	Bitmask specifying what fields in the structure are valid:
+	MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS:
+	valid values in *create_flags*
+	MLX5DV_QP_INIT_ATTR_MASK_DC:
+	valid values in *dc_init_attr*
+	MLX5DV_QP_INIT_ATTR_MASK_SEND_OPS_FLAGS:
+	valid values in *send_ops_flags*
+
+*create_flags*
+:	A bitwise OR of the various values described below.
+
+	MLX5DV_QP_CREATE_TUNNEL_OFFLOADS:
+	Enable offloading such as checksum and LRO for incoming
+	tunneling traffic.
+
+	MLX5DV_QP_CREATE_TIR_ALLOW_SELF_LOOPBACK_UC:
+	Allow receiving loopback unicast traffic.
+
+	MLX5DV_QP_CREATE_TIR_ALLOW_SELF_LOOPBACK_MC:
+	Allow receiving loopback multicast traffic.
+
+	MLX5DV_QP_CREATE_DISABLE_SCATTER_TO_CQE:
+	Disable the scatter to CQE feature, which is enabled by default.
+
+	MLX5DV_QP_CREATE_ALLOW_SCATTER_TO_CQE:
+	Allow scatter to CQE for the requester even if the QP was not
+	configured to signal all WRs.
+
+	MLX5DV_QP_CREATE_PACKET_BASED_CREDIT_MODE:
+	Set the QP to work in end-to-end packet-based credit mode,
+	instead of the default message-based credits (IB spec. section 9.7.7.2). \
+	It is the application's responsibility to make sure that the peer QP is configured with the same mode.
+
+*dc_init_attr*
+:	DC init attributes.
+
+## *dc_init_attr*
+
+```c
+struct mlx5dv_dc_init_attr {
+	enum mlx5dv_dc_type	dc_type;
+	uint64_t dct_access_key;
+};
+```
+
+*dc_type*
+:	MLX5DV_DCTYPE_DCT
+	QP type: Target DC.
+	MLX5DV_DCTYPE_DCI
+	QP type: Initiator DC.
+
+*dct_access_key*
+:	Used to create a DCT QP.
+
+*send_ops_flags*
+:	A bitwise OR of the various values described below.
+
+	MLX5DV_QP_EX_WITH_MR_INTERLEAVED:
+	Enables the mlx5dv_wr_mr_interleaved() work request on this QP.
+
+	MLX5DV_QP_EX_WITH_MR_LIST:
+	Enables the mlx5dv_wr_mr_list() work request on this QP.
+
+# NOTES
+
+**mlx5dv_qp_ex_from_ibv_qp_ex()** is used to get *struct mlx5dv_qp_ex* for
+accessing the send ops interfaces when IBV_QP_INIT_ATTR_SEND_OPS_FLAGS is used.
+
+# RETURN VALUE
+
+**mlx5dv_create_qp()**
+returns a pointer to the created QP; on error NULL will be returned and errno will be set.
+
+# SEE ALSO
+
+**ibv_query_device_ex**(3), **ibv_create_qp_ex**(3),
+
+# AUTHOR
+
+Yonatan Cohen <yonatanc@mellanox.com>
diff --git a/providers/mlx5/man/mlx5dv_devx_alloc_uar.3.md b/providers/mlx5/man/mlx5dv_devx_alloc_uar.3.md
new file mode 100644
index 0000000..32fe9ad
--- /dev/null
+++ b/providers/mlx5/man/mlx5dv_devx_alloc_uar.3.md
@@ -0,0 +1,78 @@
+---
+layout: page
+title: mlx5dv_devx_alloc_uar / mlx5dv_devx_free_uar
+section: 3
+tagline: Verbs
+---
+
+# NAME
+
+mlx5dv_devx_alloc_uar - Allocates a DEVX UAR
+
+mlx5dv_devx_free_uar - Frees a DEVX UAR
+
+# SYNOPSIS
+
+```c
+#include <infiniband/mlx5dv.h>
+
+struct mlx5dv_devx_uar *mlx5dv_devx_alloc_uar(struct ibv_context *context,
+					      uint32_t flags);
+
+void mlx5dv_devx_free_uar(struct mlx5dv_devx_uar *devx_uar);
+```
+
+# DESCRIPTION
+
+Create / free a DEVX UAR which is needed for other device commands over the DEVX interface.
+
+The DEVX API enables direct access from the user space area to the mlx5 device
+driver; the UAR information is needed for a few commands, such as QP creation.
+
+# ARGUMENTS
+*context*
+:	RDMA device context to work on.
+
+*flags*
+:	Allocation flags for the UAR.
+	MLX5DV_UAR_ALLOC_TYPE_BF:
+	Allocate UAR with Blueflame properties.
+	MLX5DV_UAR_ALLOC_TYPE_NC:
+	Allocate UAR with non-cache properties.
+
+## devx_uar
+
+```c
+struct mlx5dv_devx_uar {
+	void *reg_addr;
+	void *base_addr;
+	uint32_t page_id;
+	off_t mmap_off;
+	uint64_t comp_mask;
+};
+```
+*reg_addr*
+:	The write address of DB/BF.
+
+*base_addr*
+:	The base address of the UAR.
+
+*page_id*
+:	The device page id to be used.
+
+*mmap_off*
+:	The mmap offset parameter, to be used by a secondary process for re-mapping.
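+
+A minimal usage sketch (hedged: *context* is assumed to be a device context
+opened with the DEVX flag):
+
+```c
+struct mlx5dv_devx_uar *uar;
+
+uar = mlx5dv_devx_alloc_uar(context, MLX5DV_UAR_ALLOC_TYPE_NC);
+if (!uar)
+	return;	/* errno holds the failure reason */
+
+/* uar->page_id can be used in DEVX object creation commands. */
+mlx5dv_devx_free_uar(uar);
+```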
+ +# RETURN VALUE + +Upon success *mlx5dv_devx_alloc_uar* will return a new *struct +mlx5dv_devx_uar*, on error NULL will be returned and errno will be set. + +# SEE ALSO + +**mlx5dv_open_device**, **mlx5dv_devx_obj_create** + +#AUTHOR + +Yishai Hadas <yishaih@mellanox.com> diff --git a/providers/mlx5/man/mlx5dv_devx_create_cmd_comp.3.md b/providers/mlx5/man/mlx5dv_devx_create_cmd_comp.3.md new file mode 100644 index 0000000..2b1be5e --- /dev/null +++ b/providers/mlx5/man/mlx5dv_devx_create_cmd_comp.3.md @@ -0,0 +1,78 @@ +--- +layout: page +title: mlx5dv_devx_create_cmd_comp, mlx5dv_devx_destroy_cmd_comp, get_async +section: 3 +tagline: Verbs +--- + +# NAME + +mlx5dv_devx_create_cmd_comp - Create a command completion to be used for DEVX asynchronous commands. + +mlx5dv_devx_destroy_cmd_comp - Destroy a devx command completion. + +mlx5dv_devx_get_async_cmd_comp - Get an asynchronous command completion. +# SYNOPSIS + +```c +#include <infiniband/mlx5dv.h> + +struct mlx5dv_devx_cmd_comp { + int fd; +}; + +struct mlx5dv_devx_cmd_comp * +mlx5dv_devx_create_cmd_comp(struct ibv_context *context) + +void mlx5dv_devx_destroy_cmd_comp(struct mlx5dv_devx_cmd_comp *cmd_comp) + +struct mlx5dv_devx_async_cmd_hdr { + uint64_t wr_id; + uint8_t out_data[]; +}; + +int mlx5dv_devx_get_async_cmd_comp(struct mlx5dv_devx_cmd_comp *cmd_comp, + struct mlx5dv_devx_async_cmd_hdr *cmd_resp, + size_t cmd_resp_len) +``` + +# DESCRIPTION + +Create or destroy a command completion to be used for DEVX asynchronous commands. + +The create verb exposes an mlx5dv_devx_cmd_comp object that can be used as part +of asynchronous DEVX commands. This lets an application run asynchronously +without blocking and once the response is ready read it from this object. + +The response can be read by the mlx5dv_devx_get_async_cmd_comp() API, upon response the *wr_id* that was supplied +upon the asynchronous command is returned and the *out_data* includes the data itself. +The application must supply a large enough buffer to match any command that was issued on the *cmd_comp*, its size +is given by the input *cmd_resp_len* parameter. + +# ARGUMENTS +*context* +: RDMA device context to create the action on. + +*cmd_comp* +: The command completion object. + +*cmd_resp* +: The output data from the asynchronous command. + +*cmd_resp_len* +: The output buffer size to hold the response. + +# RETURN VALUE + +Upon success *mlx5dv_devx_create_cmd_comp* will return a new *struct +mlx5dv_devx_cmd_comp* object, on error NULL will be returned and errno will be set. + +Upon success *mlx5dv_devx_get_async_cmd_comp* will return 0, otherwise errno will be returned. + +# SEE ALSO + +*mlx5dv_open_device(3)*, *mlx5dv_devx_obj_create(3)* + +#AUTHOR + +Yishai Hadas <yishaih@mellanox.com> diff --git a/providers/mlx5/man/mlx5dv_devx_create_event_channel.3.md b/providers/mlx5/man/mlx5dv_devx_create_event_channel.3.md new file mode 100644 index 0000000..f4a4013 --- /dev/null +++ b/providers/mlx5/man/mlx5dv_devx_create_event_channel.3.md @@ -0,0 +1,58 @@ +--- +layout: page +title: mlx5dv_devx_create_event_channel, mlx5dv_devx_destroy_event_channel +section: 3 +tagline: Verbs +--- + +# NAME + +mlx5dv_devx_create_event_channel - Create an event channel to be used for DEVX asynchronous events. + +mlx5dv_devx_destroy_event_channel - Destroy a DEVX event channel. 
+
+# SYNOPSIS
+
+```c
+#include <infiniband/mlx5dv.h>
+
+struct mlx5dv_devx_event_channel {
+	int fd;
+};
+
+struct mlx5dv_devx_event_channel *
+mlx5dv_devx_create_event_channel(struct ibv_context *context,
+				 enum mlx5dv_devx_create_event_channel_flags flags)
+
+void mlx5dv_devx_destroy_event_channel(struct mlx5dv_devx_event_channel *event_channel)
+```
+
+# DESCRIPTION
+
+Create or destroy a channel to be used for DEVX asynchronous events.
+
+The create verb exposes an mlx5dv_devx_event_channel object that can be used to
+read asynchronous DEVX events. This lets an application subscribe to device
+events and, once an event occurs, read it from this object.
+
+# ARGUMENTS
+*context*
+:	RDMA device context to create the channel on.
+
+*flags*
+:	MLX5DV_DEVX_CREATE_EVENT_CHANNEL_FLAGS_OMIT_EV_DATA:
+	omit the event data on this channel.
+
+# RETURN VALUE
+
+Upon success *mlx5dv_devx_create_event_channel* will return a new *struct
+mlx5dv_devx_event_channel* object; on error NULL will be returned and errno will be set.
+
+# SEE ALSO
+
+*mlx5dv_open_device(3)*, *mlx5dv_devx_obj_create(3)*
+
+# AUTHOR
+
+Yishai Hadas <yishaih@mellanox.com>
diff --git a/providers/mlx5/man/mlx5dv_devx_get_event.3.md b/providers/mlx5/man/mlx5dv_devx_get_event.3.md
new file mode 100644
index 0000000..ae15998
--- /dev/null
+++ b/providers/mlx5/man/mlx5dv_devx_get_event.3.md
@@ -0,0 +1,67 @@
+---
+layout: page
+title: mlx5dv_devx_get_event
+section: 3
+tagline: Verbs
+---
+
+# NAME
+
+mlx5dv_devx_get_event - Get an asynchronous event.
+
+# SYNOPSIS
+
+```c
+#include <infiniband/mlx5dv.h>
+
+struct mlx5dv_devx_async_event_hdr {
+	uint64_t	cookie;
+	uint8_t		out_data[];
+};
+
+ssize_t mlx5dv_devx_get_event(struct mlx5dv_devx_event_channel *event_channel,
+			      struct mlx5dv_devx_async_event_hdr *event_data,
+			      size_t event_resp_len)
+```
+
+# DESCRIPTION
+
+Get a device event on the given *event_channel*.
+After a successful subscription over the event channel, made by calling
+mlx5dv_devx_subscribe_devx_event(), the application should use this API to
+read the response once an event has occurred.
+
+Upon response, the *cookie* that was supplied upon subscription is returned and the *out_data* includes the data itself.
+The *out_data* may be omitted in case the channel was created with the omit data flag.
+
+The application must supply a large enough buffer to hold the event according to the device specification; the buffer size
+is given by the input *event_resp_len* parameter.
+
+# ARGUMENTS
+*event_channel*
+:	The channel to get the event over.
+
+*event_data*
+:	The output data from the asynchronous event.
+
+*event_resp_len*
+:	The output buffer size to hold the response.
+
+# RETURN VALUE
+
+Upon success *mlx5dv_devx_get_event* will return the number of bytes read; otherwise -1 will be returned and errno will be set.
+
+# NOTES
+
+In case the *event_channel* was created with the omit data flag, events having the same type may be combined per subscription and be reported once with the matching *cookie*.
+In that mode, ordering is not preserved between these events and others on this channel.
+
+On the other hand, when each event carries the device data, ordering is preserved; however, events might be lost due to a lack of kernel memory, in which case EOVERFLOW will be reported.
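+
+A hedged read sketch (RESP_SZ is an assumed upper bound for the subscribed
+event, taken from the device specification):
+
+```c
+size_t len = sizeof(struct mlx5dv_devx_async_event_hdr) + RESP_SZ;
+struct mlx5dv_devx_async_event_hdr *ev = malloc(len);
+ssize_t n;
+
+if (!ev)
+	return;
+
+n = mlx5dv_devx_get_event(event_channel, ev, len);
+if (n >= 0) {
+	/* ev->cookie identifies the subscription; ev->out_data holds the event. */
+} /* else errno holds the failure reason, e.g. EOVERFLOW */
+free(ev);
+```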
+ +# SEE ALSO + +*mlx5dv_open_device(3)*, *mlx5dv_devx_subscribe_devx_event(3)* + +#AUTHOR + +Yishai Hadas <yishaih@mellanox.com> diff --git a/providers/mlx5/man/mlx5dv_devx_obj_create.3.md b/providers/mlx5/man/mlx5dv_devx_obj_create.3.md new file mode 100644 index 0000000..0a77f35 --- /dev/null +++ b/providers/mlx5/man/mlx5dv_devx_obj_create.3.md @@ -0,0 +1,127 @@ +--- +layout: page +title: mlx5dv_devx_obj_create / destroy / modify /query / general +section: 3 +tagline: Verbs +--- + +# NAME + +mlx5dv_devx_obj_create - Creates a devx object + +mlx5dv_devx_obj_destroy - Destroys a devx object + +mlx5dv_devx_obj_modify - Modifies a devx object + +mlx5dv_devx_obj_query - Queries a devx object + +mlx5dv_devx_obj_query_async - Queries a devx object in an asynchronous mode + +mlx5dv_devx_general_cmd - Issues a general command over the devx interface + +# SYNOPSIS + +```c +#include <infiniband/mlx5dv.h> + +struct mlx5dv_devx_obj * +mlx5dv_devx_obj_create(struct ibv_context *context, const void *in, size_t inlen, + void *out, size_t outlen); +int mlx5dv_devx_obj_query(struct mlx5dv_devx_obj *obj, const void *in, size_t inlen, + void *out, size_t outlen); +int mlx5dv_devx_obj_query_async(struct mlx5dv_devx_obj *obj, const void *in, + size_t inlen, size_t outlen, + uint64_t wr_id, + struct mlx5dv_devx_cmd_comp *cmd_comp); +int mlx5dv_devx_obj_modify(struct mlx5dv_devx_obj *obj, const void *in, size_t inlen, + void *out, size_t outlen); +int mlx5dv_devx_obj_destroy(struct mlx5dv_devx_obj *obj); +int mlx5dv_devx_general_cmd(struct ibv_context *context, const void *in, size_t inlen, + void *out, size_t outlen); +``` + +# DESCRIPTION + +Create / destroy / modify / query a devx object, issue a general command over the devx interface. + +The DEVX API enables direct access from the user space area to the mlx5 device +driver by using the KABI mechanism. The main purpose is to make the user +space driver as independent as possible from the kernel so that future device +functionality and commands can be activated with minimal to none kernel changes. + +A DEVX object represents some underlay firmware object, the input command to +create it is some raw data given by the user application which should match the +device specification. Upon successful creation the output buffer includes the +raw data from the device according to its specification, this data +can be used as part of related firmware commands to this object. + +Once the DEVX object is created it can be queried/modified/destroyed by the +matching mlx5dv_devx_obj_xxx() API. Both the input and the output for those APIs +need to match the device specification as well. + +The mlx5dv_devx_general_cmd() API enables issuing some general command which is +not related to an object such as query device capabilities. + +The mlx5dv_devx_obj_query_async() API is similar to the query object API, +however, it runs asynchronously without blocking. The input includes an +mlx5dv_devx_cmd_comp object and an identifier named 'wr_id' for this command. +The response should be read upon success with the mlx5dv_devx_get_async_cmd_comp() API. +The 'wr_id' that was supplied as an input is returned as part of the response +to let application knows for which command the response is related to. + +An application can gradually migrate to use DEVX according to its needs, it is +not all or nothing. For example it can create an ibv_cq via ibv_create_cq() +verb and then use the returned cqn to create a DEVX QP object by the +mlx5dv_devx_obj_create() API which needs that cqn. 
+ +The above example can enable an application to create a QP with some driver +specific attributes that are not exposed in the ibv_create_qp() API, in that +case no user or kernel change may be needed at all as the command input reaches +directly to the firmware. + +The expected users for the DEVX APIs are application that use the mlx5 DV APIs +and are familiar with the device specification in both control and data path. + +To successfully create a DEVX object and work on, a DEVX context must be +created, this is done by the mlx5dv_open_device() API with the +*MLX5DV_CONTEXT_FLAGS_DEVX* flag. + +# ARGUMENTS +*context* +: RDMA device context to create the action on. + +*in* +: A buffer which contains the command's input data provided in a device specification format. + +*inlen* +: The size of *in* buffer in bytes. + +*out* +: A buffer which contains the command's output data according to the device specification format. + +*outlen* +: The size of *out* buffer in bytes. + +*obj* +: For query, modify, destroy: the devx object to work on. + +*wr_id* +: The command identifier when working in asynchronous mode. + +*cmd_comp* +: The command completion object to read the response from in asynchronous mode. + +# RETURN VALUE + +Upon success *mlx5dv_devx_create_obj* will return a new *struct +mlx5dv_devx_obj* on error NULL will be returned and errno will be set. + +Upon success query, modify, destroy, general commands, 0 is returned or the value of errno on a failure. + +# SEE ALSO + +**mlx5dv_open_device**, **mlx5dv_devx_create_cmd_comp**, **mlx5dv_devx_get_async_cmd_comp** + +#AUTHOR + +Yishai Hadas <yishaih@mellanox.com> diff --git a/providers/mlx5/man/mlx5dv_devx_qp_modify.3.md b/providers/mlx5/man/mlx5dv_devx_qp_modify.3.md new file mode 100644 index 0000000..370fe1c --- /dev/null +++ b/providers/mlx5/man/mlx5dv_devx_qp_modify.3.md @@ -0,0 +1,99 @@ +--- +layout: page +title: mlx5dv_devx_qp[/cq/srq/wq/ind_tbl]_modify / query +section: 3 +tagline: Verbs +--- + +# NAME + +mlx5dv_devx_qp_modify - Modifies a verbs QP via DEVX + +mlx5dv_devx_qp_query - Queries a verbs QP via DEVX + +mlx5dv_devx_cq_modify - Modifies a verbs CQ via DEVX + +mlx5dv_devx_cq_query - Queries a verbs CQ via DEVX + +mlx5dv_devx_srq_modify - Modifies a verbs SRQ via DEVX + +mlx5dv_devx_srq_query - Queries a verbs SRQ via DEVX + +mlx5dv_devx_wq_modify - Modifies a verbs WQ via DEVX + +mlx5dv_devx_wq_query - Queries a verbs WQ via DEVX + +mlx5dv_devx_ind_tbl_modify - Modifies a verbs indirection table via DEVX + +mlx5dv_devx_ind_tbl_query - Queries a verbs indirection table via DEVX + +# SYNOPSIS + +```c +#include <infiniband/mlx5dv.h> +int mlx5dv_devx_qp_modify(struct ibv_qp *qp, const void *in, size_t inlen, + void *out, size_t outlen); +int mlx5dv_devx_qp_query(struct ibv_qp *qp, const void *in, size_t inlen, + void *out, size_t outlen); +int mlx5dv_devx_cq_modify(struct ibv_cq *cq, const void *in, size_t inlen, + void *out, size_t outlen); +int mlx5dv_devx_cq_query(struct ibv_cq *cq, const void *in, size_t inlen, + void *out, size_t outlen); +int mlx5dv_devx_srq_modify(struct ibv_srq *srq, const void *in, size_t inlen, + void *out, size_t outlen); +int mlx5dv_devx_srq_query(struct ibv_srq *srq, const void *in, size_t inlen, + void *out, size_t outlen); +int mlx5dv_devx_wq_modify(struct ibv_wq *wq, const void *in, size_t inlen, + void *out, size_t outlen); +int mlx5dv_devx_wq_query(struct ibv_wq *wq, const void *in, size_t inlen, + void *out, size_t outlen); +int mlx5dv_devx_ind_tbl_modify(struct ibv_rwq_ind_table *ind_tbl, 
+ const void *in, size_t inlen, + void *out, size_t outlen); +int mlx5dv_devx_ind_tbl_query(struct ibv_rwq_ind_table *ind_tbl, + const void *in, size_t inlen, + void *out, size_t outlen); +``` + +# DESCRIPTION + +Modify / query a verb object over the DEVX interface. + +The DEVX API enables direct access from the user space area to the mlx5 device +driver by using the KABI mechanism. The main purpose is to make the user +space driver as independent as possible from the kernel so that future device +functionality and commands can be activated with minimal to none kernel changes. + +The above APIs enables modifying/querying a verb object via the DEVX interface. +This enables interoperability between verbs and DEVX. As such an application +can use the create method from verbs (e.g. ibv_create_qp) and modify and query the created +object via DEVX (e.g. mlx5dv_devx_qp_modify). + +# ARGUMENTS +*qp/cq/wq/srq/ind_tbl* +: The ibv_xxx object to issue the action on. + +*in* +: A buffer which contains the command's input data provided in a device specification format. + +*inlen* +: The size of *in* buffer in bytes. + +*out* +: A buffer which contains the command's output data according to the device specification format. + +*outlen* +: The size of *out* buffer in bytes. + + +# RETURN VALUE + +Upon success 0 is returned or the value of errno on a failure. + +# SEE ALSO + +**mlx5dv_open_device**, **mlx5dv_devx_obj_create** + +#AUTHOR + +Yishai Hadas <yishaih@mellanox.com> diff --git a/providers/mlx5/man/mlx5dv_devx_query_eqn.3.md b/providers/mlx5/man/mlx5dv_devx_query_eqn.3.md new file mode 100644 index 0000000..8632aca --- /dev/null +++ b/providers/mlx5/man/mlx5dv_devx_query_eqn.3.md @@ -0,0 +1,49 @@ +--- +layout: page +title: mlx5dv_devx_query_eqn +section: 3 +tagline: Verbs +--- + +# NAME + +mlx5dv_devx_query_eqn - Query EQN for a given vector id. + +# SYNOPSIS + +```c +#include <infiniband/mlx5dv.h> + +int mlx5dv_devx_query_eqn(struct ibv_context *context, uint32_t vector, + uint32_t *eqn); +``` + +# DESCRIPTION + +Query EQN for a given input vector, the EQN is needed for other device commands over the DEVX interface. + +The DEVX API enables direct access from the user space area to the mlx5 device +driver, the EQN information is needed for few commands such as CQ creation. + + +# ARGUMENTS +*context* +: RDMA device context to work on. + +*vector* +: Completion vector number. + +*eqn* +: The device EQ number which relates to the given input vector. + +# RETURN VALUE + +returns 0 on success, or the value of errno on failure (which indicates the failure reason). + +# SEE ALSO + +**mlx5dv_open_device**, **mlx5dv_devx_obj_create** + +#AUTHOR + +Yishai Hadas <yishaih@mellanox.com> diff --git a/providers/mlx5/man/mlx5dv_devx_subscribe_devx_event.3.md b/providers/mlx5/man/mlx5dv_devx_subscribe_devx_event.3.md new file mode 100644 index 0000000..0191184 --- /dev/null +++ b/providers/mlx5/man/mlx5dv_devx_subscribe_devx_event.3.md @@ -0,0 +1,64 @@ +--- +layout: page +title: mlx5dv_devx_subscribe_devx_event, mlx5dv_devx_subscribe_devx_event_fd +section: 3 +tagline: Verbs +--- + +# NAME + +mlx5dv_devx_subscribe_devx_event - Subscribe over an event channel for device events. + +mlx5dv_devx_subscribe_devx_event_fd - Subscribe over an event channel for device events to signal eventfd. 
+ +# SYNOPSIS + +```c +#include <infiniband/mlx5dv.h> + +int mlx5dv_devx_subscribe_devx_event(struct mlx5dv_devx_event_channel *dv_event_channel, + struct mlx5dv_devx_obj *obj, + uint16_t events_sz, + uint16_t events_num[], + uint64_t cookie) + +int mlx5dv_devx_subscribe_devx_event_fd(struct mlx5dv_devx_event_channel *dv_event_channel, + int fd, + struct mlx5dv_devx_obj *obj, + uint16_t event_num) +``` + +# DESCRIPTION + +Subscribe over a DEVX event channel for device events. + +# ARGUMENTS +*dv_event_channel* +: Event channel to subscribe over. + +*fd* +: A file descriptor that previously was opened by the eventfd() system call. + +*obj* +: DEVX object that *events_num* relates to, can be NULL for unaffiliated events. + +*events_sz* +: Size of the *events_num* buffer that holds the events to subscribe for. + +*events_num* +: Holds the required event numbers to subscribe for, numbers are according to the device specification. + +*cookie* +: The value to be returned back when reading the event, can be used as an ID for application use. + +# NOTES +When mlx5dv_devx_subscribe_devx_event_fd will be used the *fd* will be signaled once an event has occurred. + + +# SEE ALSO + +*mlx5dv_open_device(3)*, *mlx5dv_devx_create_event_channel(3)*, *mlx5dv_devx_get_event(3)* + +#AUTHOR + +Yishai Hadas <yishaih@mellanox.com> diff --git a/providers/mlx5/man/mlx5dv_devx_umem_reg.3.md b/providers/mlx5/man/mlx5dv_devx_umem_reg.3.md new file mode 100644 index 0000000..8dbd6f6 --- /dev/null +++ b/providers/mlx5/man/mlx5dv_devx_umem_reg.3.md @@ -0,0 +1,70 @@ +--- +layout: page +title: mlx5dv_devx_umem_reg, mlx5dv_devx_umem_dereg +section: 3 +tagline: Verbs +--- + +# NAME + +mlx5dv_devx_umem_reg - Register a user memory to be used by the devx interface + +mlx5dv_devx_umem_dereg - Deregister a devx umem object + +# SYNOPSIS + +```c +#include <infiniband/mlx5dv.h> + +struct mlx5dv_devx_umem { + uint32_t umem_id; +}; + +struct mlx5dv_devx_umem * +mlx5dv_devx_umem_reg(struct ibv_context *context, void *addr, size_t size, + uint32_t access) + +int mlx5dv_devx_umem_dereg(struct mlx5dv_devx_umem *dv_devx_umem) +``` + +# DESCRIPTION + +Register or deregister a user memory to be used by the devx interface. + +The register verb exposes a UMEM DEVX object for user memory registration for +DMA. The API to register the user memory gets as input the user address, +length and access flags, and provides to the user as output an object which +holds the UMEM ID returned by the firmware to this registered memory. + +The user will use that UMEM ID in device direct commands that use this memory +instead of the physical addresses list, for example upon +*mlx5dv_devx_obj_create* to create a QP. + +# ARGUMENTS +*context* +: RDMA device context to create the action on. + +*addr* +: The memory start address to register. + +*size* +: The size of *addr* buffer. + +*access* +: The desired memory protection attributes; it is either 0 or the bitwise OR of one or more of *enum ibv_access_flags*. + + +# RETURN VALUE + +Upon success *mlx5dv_devx_umem_reg* will return a new *struct +mlx5dv_devx_umem* object, on error NULL will be returned and errno will be set. + +*mlx5dv_devx_umem_dereg* returns 0 on success, or the value of errno on failure (which indicates the failure reason). 
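+
+A minimal usage sketch (hedged: *context* is assumed to be a DEVX-enabled
+device context; the page-aligned 4KB buffer is an illustrative choice):
+
+```c
+void *buf;
+struct mlx5dv_devx_umem *umem;
+
+if (posix_memalign(&buf, sysconf(_SC_PAGESIZE), 4096))
+	return;
+
+umem = mlx5dv_devx_umem_reg(context, buf, 4096, IBV_ACCESS_LOCAL_WRITE);
+if (!umem)
+	return;	/* errno holds the failure reason */
+
+/* umem->umem_id can now be referenced in DEVX object creation commands. */
+mlx5dv_devx_umem_dereg(umem);
+free(buf);
+```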
+ +# SEE ALSO + +*mlx5dv_open_device(3)*, *ibv_reg_mr(3)*, *mlx5dv_devx_obj_create(3)* + +#AUTHOR + +Yishai Hadas <yishaih@mellanox.com> diff --git a/providers/mlx5/man/mlx5dv_dr_flow.3.md b/providers/mlx5/man/mlx5dv_dr_flow.3.md new file mode 100644 index 0000000..6bba15d --- /dev/null +++ b/providers/mlx5/man/mlx5dv_dr_flow.3.md @@ -0,0 +1,222 @@ +--- +date: 2019-03-28 +layout: page +title: MLX5DV_DR API +section: 3 +license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md' +header: "mlx5 Programmer's Manual" +footer: mlx5 +--- + +# NAME + +mlx5dv_dr_domain_create, mlx5dv_dr_domain_sync, mlx5dv_dr_domain_destroy - Manage flow domains + +mlx5dv_dr_table_create, mlx5dv_dr_table_destroy - Manage flow tables + +mlx5dv_dr_matcher_create, mlx5dv_dr_matcher_destroy - Manage flow matchers + +mlx5dv_dr_rule_create, mlx5dv_dr_rule_destroy - Manage flow rules + +mlx5dv_dr_action_create_drop - Create drop action + +mlx5dv_dr_action_create_tag - Create tag actions + +mlx5dv_dr_action_create_dest_ibv_qp, mlx5dv_dr_action_create_dest_table, mlx5dv_dr_action_create_dest_vport - Create packet destination actions + +mlx5dv_dr_action_create_packet_reformat - Create packet reformat actions + +mlx5dv_dr_action_create_modify_header - Create modify header actions + +mlx5dv_dr_action_create_flow_counter - Create devx flow counter actions + +mlx5dv_dr_action_create_flow_meter, mlx5dv_dr_action_modify_flow_meter - Create and modify meter action + +mlx5dv_dr_action_destroy - Destroy actions + +# SYNOPSIS + +```c +#include <infiniband/mlx5dv.h> + +struct mlx5dv_dr_domain *mlx5dv_dr_domain_create( + struct ibv_context *ctx, + enum mlx5dv_dr_domain_type type); + +int mlx5dv_dr_domain_sync( + struct mlx5dv_dr_domain *domain, + uint32_t flags); + +int mlx5dv_dr_domain_destroy(struct mlx5dv_dr_domain *domain); + +struct mlx5dv_dr_table *mlx5dv_dr_table_create( + struct mlx5dv_dr_domain *domain, + uint32_t level); + +int mlx5dv_dr_table_destroy(struct mlx5dv_dr_table *table); + +struct mlx5dv_dr_matcher *mlx5dv_dr_matcher_create( + struct mlx5dv_dr_table *table, + uint16_t priority, + uint8_t match_criteria_enable, + struct mlx5dv_flow_match_parameters *mask); + +int mlx5dv_dr_matcher_destroy(struct mlx5dv_dr_matcher *matcher); + +struct mlx5dv_dr_rule *mlx5dv_dr_rule_create( + struct mlx5dv_dr_matcher *matcher, + struct mlx5dv_flow_match_parameters *value, + size_t num_actions, + struct mlx5dv_dr_action *actions[]); + +void mlx5dv_dr_rule_destroy(struct mlx5dv_dr_rule *rule); + +struct mlx5dv_dr_action *mlx5dv_dr_action_create_drop(void); + +struct mlx5dv_dr_action *mlx5dv_dr_action_create_tag( + uint32_t tag_value); + +struct mlx5dv_dr_action *mlx5dv_dr_action_create_dest_ibv_qp( + struct ibv_qp *ibqp); + +struct mlx5dv_dr_action *mlx5dv_dr_action_create_dest_table( + struct mlx5dv_dr_table *table); + +struct mlx5dv_dr_action *mlx5dv_dr_action_create_dest_vport( + struct mlx5dv_dr_domain *domain, + uint32_t vport); + +struct mlx5dv_dr_action *mlx5dv_dr_action_create_packet_reformat( + struct mlx5dv_dr_domain *domain, + uint32_t flags, + enum mlx5dv_flow_action_packet_reformat_type reformat_type, + size_t data_sz, void *data); + +struct mlx5dv_dr_action *mlx5dv_dr_action_create_modify_header( + struct mlx5dv_dr_domain *domain, + uint32_t flags, + size_t actions_sz, + __be64 actions[]); + +struct mlx5dv_dr_action *mlx5dv_dr_action_create_flow_counter( + struct mlx5dv_devx_obj *devx_obj, + uint32_t offset); + +struct mlx5dv_dr_action * +mlx5dv_dr_action_create_flow_meter(struct 
mlx5dv_dr_flow_meter_attr *attr);
+
+int mlx5dv_dr_action_modify_flow_meter(struct mlx5dv_dr_action *action,
+				       struct mlx5dv_dr_flow_meter_attr *attr,
+				       __be64 modify_field_select);
+
+int mlx5dv_dr_action_destroy(struct mlx5dv_dr_action *action);
+```
+
+# DESCRIPTION
+
+The Direct Rule API (mlx5dv_dr_\*) allows a verbs application complete access to the device's packet steering functionality.
+
+Steering flow rules are the combination of attributes with a match pattern and a list of actions.
+Rules can have several distinct actions (such as counting, encapsulating, decapsulating before redirecting packets to a particular queue or port, etc.).
+To manage the rule execution order for packet processing matched by HW, multiple flow tables in an ordered chain and multiple flow matchers sorted by priority are defined.
+
+## Domain
+*mlx5dv_dr_domain_create()* creates a DR domain object to be used with *mlx5dv_dr_table_create()* and *mlx5dv_dr_action_create_\*()*.
+
+A domain should be destroyed by calling *mlx5dv_dr_domain_destroy()* once all dependent resources are released.
+
+The device supports the following domain types:
+
+**MLX5DV_DR_DOMAIN_TYPE_NIC_RX**
+Manage Ethernet packets received on the NIC. Packets in this domain can be dropped, dispatched to QPs, modified or redirected to additional tables inside the domain.
+Default behavior: Drop packet.
+
+**MLX5DV_DR_DOMAIN_TYPE_NIC_TX**
+Manage Ethernet packets transmitted on the NIC. Packets in this domain can be dropped, modified or redirected to additional tables inside the domain.
+Default behavior: Forward packet to NIC vport (to eSwitch or wire).
+
+**MLX5DV_DR_DOMAIN_TYPE_FDB**
+Manage Ethernet packets in the eSwitch Forwarding Data Base for packets received from the wire or from any other vport. Packets in this domain can be dropped, dispatched to a vport, modified or redirected to additional tables inside the domain.
+Default behavior: Forward packet to eSwitch manager vport.
+
+*mlx5dv_dr_domain_sync()* is used in order to flush the rule submission queue. By default, rules in a domain are updated in HW asynchronously. **flags** should be a set of type *enum mlx5dv_dr_domain_sync_flags*:
+
+**MLX5DV_DR_DOMAIN_SYNC_FLAGS_SW**: block until completion of all software queued tasks.
+
+**MLX5DV_DR_DOMAIN_SYNC_FLAGS_HW**: clear the steering HW cache to enforce that the next packet hits the latest rules, in addition to the SW SYNC handling.
+
+## Table
+*mlx5dv_dr_table_create()* creates a DR table in the **domain**, at the appropriate **level**, and can be used with *mlx5dv_dr_matcher_create()* and *mlx5dv_dr_action_create_dest_table()*.
+All packets start traversing the steering domain tree at table **level** zero (0).
+Using rules and actions, packets can be redirected to other tables in the domain.
+
+A table should be destroyed by calling *mlx5dv_dr_table_destroy()* once all dependent resources are released.
+
+## Matcher
+*mlx5dv_dr_matcher_create()* creates a matcher object in **table**, at sorted **priority** (lower values are checked first). A matcher can hold multiple rules, all with an identical **mask** of type *struct mlx5dv_flow_match_parameters*, which represents the exact attributes to be compared by HW steering. The **match_criteria_enable** and **mask** are defined in a device spec format. Only the fields that were masked in the *matcher* should be filled by the rule in *mlx5dv_dr_rule_create()*.
+
+A matcher should be destroyed by calling *mlx5dv_dr_matcher_destroy()* once all dependent resources are released.
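+
+A hedged sketch of the chain described so far (MATCH_SZ, the zeroed mask and
+the criteria value are placeholder assumptions; error handling is elided); the
+action and rule calls continue after the Actions and Rule sections below:
+
+```c
+struct mlx5dv_dr_domain *dmn =
+	mlx5dv_dr_domain_create(ctx, MLX5DV_DR_DOMAIN_TYPE_NIC_RX);
+struct mlx5dv_dr_table *tbl = mlx5dv_dr_table_create(dmn, 0);
+
+struct mlx5dv_flow_match_parameters *mask =
+	calloc(1, sizeof(*mask) + MATCH_SZ);
+mask->match_sz = MATCH_SZ;
+/* Fill mask->match_buf per the device spec before creating the matcher. */
+
+struct mlx5dv_dr_matcher *matcher =
+	mlx5dv_dr_matcher_create(tbl, 0 /* priority */,
+				 0 /* match_criteria_enable */, mask);
+```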
+
+## Actions
+A set of action creation APIs is defined by *mlx5dv_dr_action_create_\*()*. All actions are created as *struct mlx5dv_dr_action*.
+An action should be destroyed by calling *mlx5dv_dr_action_destroy()* once all dependent rules are destroyed.
+
+When an action handle is reused for multiple rules, the same action will be executed. For example, a 'count' action will count multiple flow rules on the same HW flow counter context, and a 'drop' action will drop packets of different rules from any matcher.
+
+Action: Drop
+*mlx5dv_dr_action_create_drop* creates a terminating action which drops packets. It cannot be mixed with Destination actions.
+
+Action: Tag
+*mlx5dv_dr_action_create_tag* creates a non-terminating action which tags packets with **tag_value**. The **tag_value** is available in the CQE of the received packet. Valid only on domain type NIC_RX.
+
+Action: Destination
+*mlx5dv_dr_action_create_dest_ibv_qp* creates a terminating action delivering the packet to a QP, defined by **ibqp**. Valid only on domain type NIC_RX.
+*mlx5dv_dr_action_create_dest_table* creates a forwarding action to another flow table, defined by **table**. The destination **table** must be from the same domain and have a level higher than zero.
+*mlx5dv_dr_action_create_dest_vport* creates a forwarding action to a **vport** on the same **domain**. Valid only on domain type FDB.
+
+Action: Packet Reformat
+*mlx5dv_dr_action_create_packet_reformat* creates a packet reformat context and action in the **domain**. The **reformat_type**, **data_sz** and **data** are defined in *man mlx5dv_create_flow_action_packet_reformat*.
+
+Action: Modify Header
+*mlx5dv_dr_action_create_modify_header* creates a modify header context and action in the **domain**. The **actions_sz** and **actions** are defined in *man mlx5dv_create_flow_action_modify_header*.
+
+Action: Flow Count
+*mlx5dv_dr_action_create_flow_counter* creates a flow counter action from a DEVX flow counter object, based on **devx_obj** and a specific counter index from **offset** in the counter bulk.
+
+Action: Meter
+*mlx5dv_dr_action_create_flow_meter* creates a meter action based on the flow meter parameters. The parameters are according to the device specification.
+*mlx5dv_dr_action_modify_flow_meter* modifies an existing flow meter **action** based on **modify_field_select**. **modify_field_select** is according to the device specification.
+
+Action Flags: action **flags** can be set to one of the types of *enum mlx5dv_dr_action_flags*:
+
+**MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL**: is used to indicate that the action is targeted for a flow table at level=0 (ROOT) of the specific domain.
+
+## Rule
+*mlx5dv_dr_rule_create()* creates a HW steering rule entry in **matcher**. The **value** of type *struct mlx5dv_flow_match_parameters* holds the exact attribute values of the steering rule to be matched, in a device spec format. Only the fields that were masked in the *matcher* should be filled.
+Once a packet matches the exact **value** of the rule (referred to as a 'hit'), HW will perform the set of **num_actions** from the **actions** array of type *struct mlx5dv_dr_action*; see the EXAMPLE section below.
+
+*mlx5dv_dr_rule_destroy()* destroys the rule.
+
+# RETURN VALUE
+The create API calls will return a pointer to the relevant object: table, matcher, action, rule. On failure, NULL will be returned and errno will be set.
+
+The destroy API calls will return 0 on success, or the value of errno on failure (which indicates the failure reason).
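+
+# EXAMPLE
+
+A minimal end-to-end sketch (error handling omitted; the *matcher* and the
+device-spec formatted *value* are prepared as described in the Matcher and
+Rule sections above):
+
+```c
+struct mlx5dv_dr_domain *dmn;
+struct mlx5dv_dr_table *tbl;
+struct mlx5dv_dr_action *actions[1];
+struct mlx5dv_dr_rule *rule;
+
+dmn = mlx5dv_dr_domain_create(ctx, MLX5DV_DR_DOMAIN_TYPE_NIC_RX);
+tbl = mlx5dv_dr_table_create(dmn, 0);
+
+/* ... create a matcher on tbl and fill value, as shown above ... */
+
+actions[0] = mlx5dv_dr_action_create_drop();
+rule = mlx5dv_dr_rule_create(matcher, value, 1, actions);
+
+/* teardown, in reverse dependency order */
+mlx5dv_dr_rule_destroy(rule);
+mlx5dv_dr_action_destroy(actions[0]);
+mlx5dv_dr_matcher_destroy(matcher);
+mlx5dv_dr_table_destroy(tbl);
+mlx5dv_dr_domain_destroy(dmn);
+```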
+
+# LIMITATIONS
+Applications can verify whether a feature is supported by *trial and error*. No capabilities are exposed, since the number of combinations of all the exposed options is too large to define.
+
+Tables have no fixed size by definition. They are expected to grow and shrink to accommodate all rules, according to driver capabilities. Once a limit is reached, an error is returned.
+
+Matchers with the same priority, in the same table, have an undefined order between them.
+
+A rule with a value pattern identical to that of another rule on a given matcher is rejected.
+
+The IP version in the matcher mask and rule should be equal and set to 4, 6 or 0.
+
+# SEE ALSO
+
+**mlx5dv_open_device(3)**, **mlx5dv_create_flow_action_packet_reformat(3)**, **mlx5dv_create_flow_action_modify_header(3)**.
+
+# AUTHOR
+
+Alex Rosenbaum <alexr@mellanox.com>
+Alex Vesker <valex@mellanox.com>
diff --git a/providers/mlx5/man/mlx5dv_dump.3.md b/providers/mlx5/man/mlx5dv_dump.3.md
new file mode 100644
index 0000000..a11175e
--- /dev/null
+++ b/providers/mlx5/man/mlx5dv_dump.3.md
@@ -0,0 +1,52 @@
+---
+date: 2019-11-18
+layout: page
+title: MLX5DV_DUMP API
+section: 3
+license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md'
+header: "mlx5 Programmer's Manual"
+footer: mlx5
+---
+
+# NAME
+
+mlx5dv_dump_dr_domain - Dump DR Domain
+
+mlx5dv_dump_dr_table - Dump DR Table
+
+mlx5dv_dump_dr_matcher - Dump DR Matcher
+
+mlx5dv_dump_dr_rule - Dump DR Rule
+
+# SYNOPSIS
+
+```c
+#include <infiniband/mlx5dv.h>
+
+int mlx5dv_dump_dr_domain(FILE *fout, struct mlx5dv_dr_domain *domain);
+int mlx5dv_dump_dr_table(FILE *fout, struct mlx5dv_dr_table *table);
+int mlx5dv_dump_dr_matcher(FILE *fout, struct mlx5dv_dr_matcher *matcher);
+int mlx5dv_dump_dr_rule(FILE *fout, struct mlx5dv_dr_rule *rule);
+```
+
+# DESCRIPTION
+
+The Dump API (mlx5dv_dump_\*) allows dumping the existing rdma-core resources to the provided file.
+The output file format is vendor specific.
+
+*mlx5dv_dump_dr_domain()* dumps a DR Domain object's properties to the specified file.
+
+*mlx5dv_dump_dr_table()* dumps a DR Table object's properties to the specified file.
+
+*mlx5dv_dump_dr_matcher()* dumps a DR Matcher object's properties to the specified file.
+
+*mlx5dv_dump_dr_rule()* dumps a DR Rule object's properties to the specified file.
+
+# RETURN VALUE
+The API calls return 0 on success, or the value of errno on failure (which indicates the failure reason).
+The calls are blocking - a call returns only after all related resource information has been written to the file.
+
+# AUTHOR
+
+Yevgeny Kliteynik <kliteyn@mellanox.com>
+Muhammad Sammar <muhammads@mellanox.com>
diff --git a/providers/mlx5/man/mlx5dv_flow_action_esp.3.md b/providers/mlx5/man/mlx5dv_flow_action_esp.3.md
new file mode 100644
index 0000000..cd40a94
--- /dev/null
+++ b/providers/mlx5/man/mlx5dv_flow_action_esp.3.md
@@ -0,0 +1,60 @@
+---
+layout: page
+title: mlx5dv_flow_action_esp
+section: 3
+tagline: Verbs
+---
+
+# NAME
+
+mlx5dv_flow_action_esp - Flow action ESP for the mlx5 provider
+
+# SYNOPSIS
+
+```c
+#include <infiniband/mlx5dv.h>
+
+struct ibv_flow_action *
+mlx5dv_create_flow_action_esp(struct ibv_context *ctx,
+                              struct ibv_flow_action_esp_attr *esp,
+                              struct mlx5dv_flow_action_esp *mlx5_attr);
+```
+
+# DESCRIPTION
+
+Create an IPSEC ESP flow steering action.
+This verb is identical to the *ibv_create_flow_action_esp* verb, but allows mlx5-specific flags.
+
+# ARGUMENTS
+
+Please see the *ibv_flow_action_esp(3)* man page for *ctx* and *esp*.
+
+## *mlx5_attr* argument
+
+```c
+struct mlx5dv_flow_action_esp {
+        uint64_t comp_mask;  /* Use enum mlx5dv_flow_action_esp_mask */
+        uint32_t action_flags; /* Use enum mlx5dv_flow_action_flags */
+};
+```
+
+*comp_mask*
+: Bitmask specifying what fields in the structure are valid (*enum mlx5dv_flow_action_esp_mask*).
+
+*action_flags*
+: A bitwise OR of the various values described below.
+
+  *MLX5DV_FLOW_ACTION_FLAGS_REQUIRE_METADATA*:
+  Each received and transmitted packet using the offload is expected to carry metadata in the form of an L2 header
+  with ethernet type 0x8CE4, followed by 6 bytes of data and the original packet ethertype.
+
+# NOTE
+
+The ESN is expected to be placed in the IV field for egress packets.
+The 64 bit sequence number is written in big-endian over the 64 bit IV field.
+There is no need to call modify to update the ESN window on egress when this DV is used.
+
+# SEE ALSO
+
+*ibv_flow_action_esp(3)*, *RFC 4106*
+
diff --git a/providers/mlx5/man/mlx5dv_get_clock_info.3 b/providers/mlx5/man/mlx5dv_get_clock_info.3
new file mode 100644
index 0000000..194a32e
--- /dev/null
+++ b/providers/mlx5/man/mlx5dv_get_clock_info.3
@@ -0,0 +1,37 @@
+.\" -*- nroff -*-
+.\" Licensed under the OpenIB.org (MIT) - See COPYING.md
+.\"
+.TH MLX5DV_GET_CLOCK_INFO 3 2017-11-08 1.0.0
+.SH "NAME"
+mlx5dv_get_clock_info \- Get device clock information
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/mlx5dv.h>
+.sp
+.BI "int mlx5dv_get_clock_info(struct ibv_context *ctx_in,
+.BI "                          struct mlx5dv_clock_info *clock_info);
+.fi
+.SH "DESCRIPTION"
+Get the updated core
+.I clock_info
+from the device driver. This information will be used later to translate the
+completion timestamp from HCA core clock to nanoseconds. The values of the clock are
+updated from the driver's PTP clock; therefore, without a running PTP
+client on the machine, the wall clock conversion will not be accurate.
+.PP
+Pass the latest \fBstruct mlx5dv_clock_info\fR to \fBmlx5dv_ts_to_ns(3)\fR in order to translate
+the completion timestamp from HCA core clock to nanoseconds.
+.PP
+If the clock_info becomes too old, the time conversion will produce inaccurate results.
+The user must ensure that \fBmlx5dv_get_clock_info(3)\fR is called at least once every
+\fBmax_clock_info_update_nsec\fR as returned by the \fBmlx5dv_query_device(3)\fR function.
+.PP
+.fi
+.SH "RETURN VALUE"
+0 on success or the value of errno on failure (which indicates the failure reason).
+.SH "SEE ALSO"
+.BR mlx5dv (7),
+.BR mlx5dv_ts_to_ns (3)
+.SH "AUTHORS"
+.TP
+Feras Daoud <ferasda@mellanox.com>
diff --git a/providers/mlx5/man/mlx5dv_init_obj.3 b/providers/mlx5/man/mlx5dv_init_obj.3
new file mode 100644
index 0000000..c6cdff5
--- /dev/null
+++ b/providers/mlx5/man/mlx5dv_init_obj.3
@@ -0,0 +1,181 @@
+.\" -*- nroff -*-
+.\" Licensed under the OpenIB.org (MIT) - See COPYING.md
+.\"
+.TH MLX5DV_INIT_OBJ 3 2017-02-02 1.0.0
+.SH "NAME"
+mlx5dv_init_obj \- Initialize mlx5 direct verbs object from ibv_xxx structures
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/mlx5dv.h>
+.sp
+.BI "int mlx5dv_init_obj(struct mlx5dv_obj *obj, uint64_t obj_type);
+.fi
+.SH "DESCRIPTION"
+.B mlx5dv_init_obj()
+This function initializes the mlx5dv_xxx structs based on the supplied type. The information
+for the initialization is taken from the ibv_xx structs supplied as part of the input.
+
+Requesting information on a CQ marks it as owned by direct verbs for all consumer index
+related actions. The initialization type can be a combination of several types together.
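+.PP
+For example, a minimal sketch (error handling omitted) that retrieves both the
+QP and the CQ information in a single call:
+.PP
+.nf
+struct mlx5dv_qp qp_out = {};
+struct mlx5dv_cq cq_out = {};
+struct mlx5dv_obj obj;
+int ret;
+
+obj.qp.in = qp;       /* a struct ibv_qp * */
+obj.qp.out = &qp_out;
+obj.cq.in = cq;       /* a struct ibv_cq * */
+obj.cq.out = &cq_out;
+ret = mlx5dv_init_obj(&obj, MLX5DV_OBJ_QP | MLX5DV_OBJ_CQ);
+.fi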
+.PP
+.nf
+struct mlx5dv_qp {
+.in +8
+uint32_t *dbrec;
+struct {
+.in +8
+void *buf;
+uint32_t wqe_cnt;
+uint32_t stride;
+.in -8
+} sq;
+struct {
+.in +8
+void *buf;
+uint32_t wqe_cnt;
+uint32_t stride;
+.in -8
+} rq;
+struct {
+.in +8
+void *reg;
+uint32_t size;
+.in -8
+} bf;
+uint64_t comp_mask;
+off_t uar_mmap_offset;
+uint32_t tirn;
+uint32_t tisn;
+uint32_t rqn;
+uint32_t sqn;
+uint64_t tir_icm_address;
+.in -8
+};
+
+struct mlx5dv_cq {
+.in +8
+void *buf;
+uint32_t *dbrec;
+uint32_t cqe_cnt;
+uint32_t cqe_size;
+void *cq_uar;
+uint32_t cqn;
+uint64_t comp_mask;
+.in -8
+};
+
+struct mlx5dv_srq {
+.in +8
+void *buf;
+uint32_t *dbrec;
+uint32_t stride;
+uint32_t head;
+uint32_t tail;
+uint64_t comp_mask;
+uint32_t srqn;
+.in -8
+};
+
+struct mlx5dv_rwq {
+.in +8
+void *buf;
+uint32_t *dbrec;
+uint32_t wqe_cnt;
+uint32_t stride;
+uint64_t comp_mask;
+.in -8
+};
+
+struct mlx5dv_dm {
+.in +8
+void *buf;
+uint64_t length;
+uint64_t comp_mask;
+uint64_t remote_va;
+.in -8
+};
+
+struct mlx5dv_ah {
+.in +8
+struct mlx5_wqe_av *av;
+uint64_t comp_mask;
+.in -8
+};
+
+struct mlx5dv_pd {
+.in +8
+uint32_t pdn;
+uint64_t comp_mask;
+.in -8
+};
+
+struct mlx5dv_obj {
+.in +8
+struct {
+.in +8
+struct ibv_qp *in;
+struct mlx5dv_qp *out;
+.in -8
+} qp;
+struct {
+.in +8
+struct ibv_cq *in;
+struct mlx5dv_cq *out;
+.in -8
+} cq;
+struct {
+.in +8
+struct ibv_srq *in;
+struct mlx5dv_srq *out;
+.in -8
+} srq;
+struct {
+.in +8
+struct ibv_wq *in;
+struct mlx5dv_rwq *out;
+.in -8
+} rwq;
+struct {
+.in +8
+struct ibv_dm *in;
+struct mlx5dv_dm *out;
+.in -8
+} dm;
+struct {
+.in +8
+struct ibv_ah *in;
+struct mlx5dv_ah *out;
+.in -8
+} ah;
+struct {
+.in +8
+struct ibv_pd *in;
+struct mlx5dv_pd *out;
+.in -8
+} pd;
+.in -8
+};
+
+enum mlx5dv_obj_type {
+.in +8
+MLX5DV_OBJ_QP = 1 << 0,
+MLX5DV_OBJ_CQ = 1 << 1,
+MLX5DV_OBJ_SRQ = 1 << 2,
+MLX5DV_OBJ_RWQ = 1 << 3,
+MLX5DV_OBJ_DM = 1 << 4,
+MLX5DV_OBJ_AH = 1 << 5,
+MLX5DV_OBJ_PD = 1 << 6,
+.in -8
+};
+.fi
+.SH "RETURN VALUE"
+0 on success or the value of errno on failure (which indicates the failure reason).
+.SH "NOTES"
+ * Whether the doorbell register is BlueFlame is indicated by mlx5dv_qp->bf->size;
+a size of 0 means it is not BlueFlame.
+ * Compatibility masks (comp_mask) are in/out fields.
+.SH "SEE ALSO"
+.BR mlx5dv (7)
+.SH "AUTHORS"
+.TP
+Leon Romanovsky <leonro@mellanox.com>
diff --git a/providers/mlx5/man/mlx5dv_is_supported.3.md b/providers/mlx5/man/mlx5dv_is_supported.3.md
new file mode 100644
index 0000000..71613dd
--- /dev/null
+++ b/providers/mlx5/man/mlx5dv_is_supported.3.md
@@ -0,0 +1,38 @@
+---
+layout: page
+title: mlx5dv_is_supported
+section: 3
+tagline: Verbs
+---
+
+# NAME
+
+mlx5dv_is_supported - Check whether an RDMA device is implemented by the mlx5 provider
+
+# SYNOPSIS
+
+```c
+#include <infiniband/mlx5dv.h>
+
+bool mlx5dv_is_supported(struct ibv_device *device);
+```
+
+# DESCRIPTION
+
+mlx5dv functions may be called only if this function returns true for the RDMA device.
+
+# ARGUMENTS
+
+*device*
+: RDMA device to check.
+
+# RETURN VALUE
+Returns true if the device is implemented by the mlx5 provider.
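+
+# EXAMPLE
+
+A minimal usage sketch (error handling omitted), iterating over the verbs
+device list and picking the devices served by the mlx5 provider:
+
+```c
+#include <infiniband/verbs.h>
+#include <infiniband/mlx5dv.h>
+
+struct ibv_device **list;
+int i;
+
+list = ibv_get_device_list(NULL);
+for (i = 0; list && list[i]; i++) {
+        if (mlx5dv_is_supported(list[i])) {
+                /* mlx5dv_* APIs may be used with this device */
+        }
+}
+if (list)
+        ibv_free_device_list(list);
+```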
+
+# SEE ALSO
+
+*mlx5dv(7)*
+
+# AUTHOR
+
+Artemy Kovalyov <artemyko@mellanox.com>
diff --git a/providers/mlx5/man/mlx5dv_open_device.3.md b/providers/mlx5/man/mlx5dv_open_device.3.md
new file mode 100644
index 0000000..8a5876b
--- /dev/null
+++ b/providers/mlx5/man/mlx5dv_open_device.3.md
@@ -0,0 +1,57 @@
+---
+layout: page
+title: mlx5dv_open_device
+section: 3
+tagline: Verbs
+---
+
+# NAME
+
+mlx5dv_open_device - Open an RDMA device context for the mlx5 provider
+
+# SYNOPSIS
+
+```c
+#include <infiniband/mlx5dv.h>
+
+struct ibv_context *
+mlx5dv_open_device(struct ibv_device *device, struct mlx5dv_context_attr *attr);
+```
+
+# DESCRIPTION
+
+Open an RDMA device context with specific mlx5 provider attributes.
+
+# ARGUMENTS
+
+*device*
+: RDMA device to open.
+
+## *attr* argument
+
+```c
+struct mlx5dv_context_attr {
+        uint32_t flags;
+        uint64_t comp_mask;
+};
+```
+
+*flags*
+: A bitwise OR of the various values described below.
+
+  *MLX5DV_CONTEXT_FLAGS_DEVX*:
+  Allocate a DEVX context
+
+*comp_mask*
+: Bitmask specifying what fields in the structure are valid
+
+# RETURN VALUE
+Returns a pointer to the allocated device context, or NULL if the request fails.
+
+# SEE ALSO
+
+*ibv_open_device(3)*
+
+# AUTHOR
+
+Yishai Hadas <yishaih@mellanox.com>
diff --git a/providers/mlx5/man/mlx5dv_pp_alloc.3.md b/providers/mlx5/man/mlx5dv_pp_alloc.3.md
new file mode 100644
index 0000000..f1137c4
--- /dev/null
+++ b/providers/mlx5/man/mlx5dv_pp_alloc.3.md
@@ -0,0 +1,72 @@
+---
+layout: page
+title: mlx5dv_pp_alloc / mlx5dv_pp_free
+section: 3
+tagline: Verbs
+---
+
+# NAME
+
+mlx5dv_pp_alloc - Allocates a packet pacing entry
+
+mlx5dv_pp_free - Frees a packet pacing entry
+
+# SYNOPSIS
+
+```c
+#include <infiniband/mlx5dv.h>
+
+struct mlx5dv_pp *
+mlx5dv_pp_alloc(struct ibv_context *context,
+                size_t pp_context_sz,
+                const void *pp_context,
+                uint32_t flags);
+
+void mlx5dv_pp_free(struct mlx5dv_pp *dv_pp);
+```
+
+# DESCRIPTION
+
+Create / free a packet pacing entry which can be used for some device commands over the DEVX interface.
+
+The DEVX API enables direct access from user space to the mlx5 device driver;
+the packet pacing information is needed for the few commands that require a packet pacing index.
+
+# ARGUMENTS
+*context*
+: RDMA device context to work on; it must be opened with DEVX support using mlx5dv_open_device().
+
+*pp_context_sz*
+: Length of the *pp_context* input buffer.
+
+*pp_context*
+: Packet pacing context according to the device specification.
+
+*flags*
+: MLX5DV_PP_ALLOC_FLAGS_DEDICATED_INDEX:
+  allocate a dedicated index.
+
+## dv_pp
+
+```c
+struct mlx5dv_pp {
+        uint16_t index;
+};
+```
+
+*index*
+: The device index to be used.
+
+# RETURN VALUE
+
+Upon success *mlx5dv_pp_alloc* returns a pointer to the created packet pacing object;
+on error NULL is returned and errno is set.
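+
+# EXAMPLE
+
+A minimal sketch (error handling omitted; the packet pacing context contents
+and the 16-byte size below are placeholder assumptions - the real layout and
+size come from the device specification):
+
+```c
+struct mlx5dv_context_attr ctx_attr = {
+        .flags = MLX5DV_CONTEXT_FLAGS_DEVX,
+};
+struct ibv_context *ctx;
+struct mlx5dv_pp *pp;
+uint8_t pp_ctx[16] = {};
+
+ctx = mlx5dv_open_device(device, &ctx_attr);
+/* fill pp_ctx according to the device specification, e.g. the rate */
+pp = mlx5dv_pp_alloc(ctx, sizeof(pp_ctx), pp_ctx,
+                     MLX5DV_PP_ALLOC_FLAGS_DEDICATED_INDEX);
+if (pp) {
+        /* pp->index can now be used in device commands */
+        mlx5dv_pp_free(pp);
+}
+```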
+
+# SEE ALSO
+
+**mlx5dv_open_device**, **mlx5dv_devx_obj_create**
+
+# AUTHOR
+
+Yishai Hadas <yishaih@mellanox.com>
diff --git a/providers/mlx5/man/mlx5dv_query_device.3 b/providers/mlx5/man/mlx5dv_query_device.3
new file mode 100644
index 0000000..d005552
--- /dev/null
+++ b/providers/mlx5/man/mlx5dv_query_device.3
@@ -0,0 +1,132 @@
+.\" -*- nroff -*-
+.\" Licensed under the OpenIB.org (MIT) - See COPYING.md
+.\"
+.TH MLX5DV_QUERY_DEVICE 3 2017-02-02 1.0.0
+.SH "NAME"
+mlx5dv_query_device \- Query device capabilities specific to mlx5
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/mlx5dv.h>
+.sp
+.BI "int mlx5dv_query_device(struct ibv_context *ctx_in,
+.BI "                        struct mlx5dv_context *attrs_out);
+.fi
+.SH "DESCRIPTION"
+.B mlx5dv_query_device()
+Query HW device-specific information which is important for the data path, but isn't provided by
+\fBibv_query_device\fR(3).
+.PP
+This function returns the version, flags and compatibility mask. The version represents the format
+of the internal hardware structures that mlx5dv.h represents. Additions of new fields to the existing
+structures are handled by the comp_mask field.
+.PP
+.nf
+struct mlx5dv_sw_parsing_caps {
+.in +8
+uint32_t sw_parsing_offloads; /* Use enum mlx5dv_sw_parsing_offloads */
+uint32_t supported_qpts;
+.in -8
+};
+.PP
+.nf
+struct mlx5dv_striding_rq_caps {
+.in +8
+uint32_t min_single_stride_log_num_of_bytes; /* min log size of each stride */
+uint32_t max_single_stride_log_num_of_bytes; /* max log size of each stride */
+uint32_t min_single_wqe_log_num_of_strides; /* min log number of strides per WQE */
+uint32_t max_single_wqe_log_num_of_strides; /* max log number of strides per WQE */
+uint32_t supported_qpts;
+.in -8
+};
+.PP
+.nf
+struct mlx5dv_context {
+.in +8
+uint8_t version;
+uint64_t flags;
+uint64_t comp_mask; /* Use enum mlx5dv_context_comp_mask */
+struct mlx5dv_cqe_comp_caps cqe_comp_caps;
+struct mlx5dv_sw_parsing_caps sw_parsing_caps;
+uint32_t tunnel_offloads_caps;
+uint32_t max_dynamic_bfregs; /* max blue-flame registers that can be dynamically allocated */
+uint64_t max_clock_info_update_nsec;
+uint32_t flow_action_flags; /* use enum mlx5dv_flow_action_cap_flags */
+uint32_t dc_odp_caps; /* use enum ibv_odp_transport_cap_bits */
+void *hca_core_clock; /* points to a memory location that is mapped to the HCA's core clock */
+.in -8
+};
+
+enum mlx5dv_context_flags {
+.in +8
+/*
+ * This flag indicates if CQE version 0 or 1 is needed.
+ */
+ MLX5DV_CONTEXT_FLAGS_CQE_V1 = (1 << 0),
+ MLX5DV_CONTEXT_FLAGS_OBSOLETE = (1 << 1), /* Obsoleted, don't use */
+ MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED = (1 << 2), /* Multi packet WQE is allowed */
+ MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW = (1 << 3), /* Enhanced multi packet WQE is supported */
+ MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP = (1 << 4), /* Support CQE 128B compression */
+ MLX5DV_CONTEXT_FLAGS_CQE_128B_PAD = (1 << 5), /* Support CQE 128B padding */
+ MLX5DV_CONTEXT_FLAGS_PACKET_BASED_CREDIT_MODE = (1 << 6), /* Support packet based credit mode in RC QP */
+.in -8
+};
+
+.PP
+.nf
+enum mlx5dv_context_comp_mask {
+.in +8
+MLX5DV_CONTEXT_MASK_CQE_COMPRESION = 1 << 0,
+MLX5DV_CONTEXT_MASK_SWP = 1 << 1,
+MLX5DV_CONTEXT_MASK_STRIDING_RQ = 1 << 2,
+MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS = 1 << 3,
+MLX5DV_CONTEXT_MASK_DYN_BFREGS = 1 << 4,
+MLX5DV_CONTEXT_MASK_CLOCK_INFO_UPDATE = 1 << 5,
+MLX5DV_CONTEXT_MASK_FLOW_ACTION_FLAGS = 1 << 6,
+MLX5DV_CONTEXT_MASK_DC_ODP_CAPS = 1 << 7,
+MLX5DV_CONTEXT_MASK_HCA_CORE_CLOCK = 1 << 8,
+.in -8
+};
+
+.PP
+.nf
+enum mlx5dv_sw_parsing_offloads {
+.in +8
+MLX5DV_SW_PARSING = 1 << 0,
+MLX5DV_SW_PARSING_CSUM = 1 << 1,
+MLX5DV_SW_PARSING_LSO = 1 << 2,
+.in -8
+};
+
+.PP
+.nf
+enum mlx5dv_tunnel_offloads {
+.in +8
+MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN = 1 << 0,
+MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE = 1 << 1,
+MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GENEVE = 1 << 2,
+.in -8
+};
+
+.PP
+.nf
+enum mlx5dv_flow_action_cap_flags {
+.in +8
+MLX5DV_FLOW_ACTION_FLAGS_ESP_AES_GCM = 1 << 0, /* Flow action ESP (with AES_GCM keymat) is supported */
+MLX5DV_FLOW_ACTION_FLAGS_ESP_AES_GCM_REQ_METADATA = 1 << 1, /* Flow action ESP always returns metadata in the payload */
+MLX5DV_FLOW_ACTION_FLAGS_ESP_AES_GCM_SPI_STEERING = 1 << 2, /* ESP (with AES_GCM keymat) supports matching by SPI (rather than hashing against SPI) */
+MLX5DV_FLOW_ACTION_FLAGS_ESP_AES_GCM_FULL_OFFLOAD = 1 << 3, /* Flow action ESP supports full offload (with AES_GCM keymat) */
+MLX5DV_FLOW_ACTION_FLAGS_ESP_AES_GCM_TX_IV_IS_ESN = 1 << 4, /* Flow action ESP (with AES_GCM keymat), ESN comes implicitly from IV. */
+.in -8
+};
+
+.fi
+.SH "RETURN VALUE"
+0 on success or the value of errno on failure (which indicates the failure reason).
+.SH "NOTES"
+ * Compatibility mask (comp_mask) is an in/out field.
+.SH "SEE ALSO"
+.BR mlx5dv (7),
+.BR ibv_query_device (3)
+.SH "AUTHORS"
+.TP
+Leon Romanovsky <leonro@mellanox.com>
diff --git a/providers/mlx5/man/mlx5dv_ts_to_ns.3 b/providers/mlx5/man/mlx5dv_ts_to_ns.3
new file mode 100644
index 0000000..197b195
--- /dev/null
+++ b/providers/mlx5/man/mlx5dv_ts_to_ns.3
@@ -0,0 +1,35 @@
+.\" -*- nroff -*-
+.\" Licensed under the OpenIB.org (MIT) - See COPYING.md
+.\"
+.TH MLX5DV_TS_TO_NS 3 2017-11-08 1.0.0
+.SH "NAME"
+mlx5dv_ts_to_ns \- Convert device timestamp from HCA core clock units to
+the corresponding nanosecond counts
+.SH "SYNOPSIS"
+.nf
+.B #include <infiniband/mlx5dv.h>
+.sp
+.BI "uint64_t mlx5dv_ts_to_ns(struct mlx5dv_clock_info *clock_info,
+.BI "                         uint64_t device_timestamp);
+.fi
+.SH "DESCRIPTION"
+.B mlx5dv_ts_to_ns(3)
+Converts a host byte order
+.I device_timestamp
+from HCA core clock units into the corresponding nanosecond wallclock time.
+.PP
+\fBstruct mlx5dv_clock_info\fR can be retrieved using \fBmlx5dv_get_clock_info(3)\fR.
+.PP
+The greater the difference between the device reporting a timestamp and the last
+mlx5dv_clock_info update, the greater the inaccuracy of the clock time conversion.
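+.PP
+A minimal usage sketch (error handling omitted; \fIcompletion_ts\fR is assumed
+to be a raw completion timestamp read in HCA core clock units):
+.PP
+.nf
+struct mlx5dv_clock_info clock_info;
+uint64_t ns;
+
+if (!mlx5dv_get_clock_info(ctx, &clock_info))
+        ns = mlx5dv_ts_to_ns(&clock_info, completion_ts);
+.fi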
+
+.SH "RETURN VALUE"
+Timestamp in nanoseconds
+.SH "SEE ALSO"
+.BR mlx5dv (7),
+.BR mlx5dv_get_clock_info (3),
+.BR mlx5dv_query_device (3)
+.SH "AUTHORS"
+.TP
+Feras Daoud <ferasda@mellanox.com>
diff --git a/providers/mlx5/man/mlx5dv_wr_post.3.md b/providers/mlx5/man/mlx5dv_wr_post.3.md
new file mode 100644
index 0000000..2ff2271
--- /dev/null
+++ b/providers/mlx5/man/mlx5dv_wr_post.3.md
@@ -0,0 +1,150 @@
+---
+date: 2019-02-24
+footer: mlx5
+header: "mlx5 Programmer's Manual"
+tagline: Verbs
+layout: page
+license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md'
+section: 3
+title: MLX5DV_WR
+---
+
+# NAME
+
+mlx5dv_wr_set_dc_addr - Attach DC info to the last work request
+
+# SYNOPSIS
+
+```c
+#include <infiniband/mlx5dv.h>
+
+static inline void mlx5dv_wr_set_dc_addr(struct mlx5dv_qp_ex *mqp,
+                                         struct ibv_ah *ah,
+                                         uint32_t remote_dctn,
+                                         uint64_t remote_dc_key);
+
+struct mlx5dv_mr_interleaved {
+        uint64_t addr;
+        uint32_t bytes_count;
+        uint32_t bytes_skip;
+        uint32_t lkey;
+};
+
+static inline void mlx5dv_wr_mr_interleaved(struct mlx5dv_qp_ex *mqp,
+                                            struct mlx5dv_mkey *mkey,
+                                            uint32_t access_flags, /* use enum ibv_access_flags */
+                                            uint32_t repeat_count,
+                                            uint16_t num_interleaved,
+                                            struct mlx5dv_mr_interleaved *data);
+
+static inline void mlx5dv_wr_mr_list(struct mlx5dv_qp_ex *mqp,
+                                     struct mlx5dv_mkey *mkey,
+                                     uint32_t access_flags, /* use enum ibv_access_flags */
+                                     uint16_t num_sges,
+                                     struct ibv_sge *sge);
+```
+
+# DESCRIPTION
+
+The MLX5DV work request APIs (mlx5dv_wr_\*) are an extension of the IBV work
+request API (ibv_wr_\*) with mlx5-specific features for send work requests.
+They may be used together with or without ibv_wr_* calls.
+
+# USAGE
+
+To use these APIs a QP must be created using mlx5dv_create_qp() with
+*send_ops_flags* of struct ibv_qp_init_attr_ex set.
+
+If the QP does not support all the requested work request types then QP
+creation will fail.
+
+The mlx5dv_qp_ex is extracted from the IBV_QP by ibv_qp_to_qp_ex() and
+mlx5dv_qp_ex_from_ibv_qp_ex(). It should be used to apply the mlx5-specific
+features on the posted WR.
+
+Creating a work request requires using the ibv_qp_ex as described in the
+ibv_wr_post man page, together with the mlx5dv_qp and its available builders
+and setters.
+
+## QP Specific builders
+*RC* QPs
+:   *mlx5dv_wr_mr_interleaved()*
+
+    registers an interleaved memory layout by using an indirect mkey and some interleaved data.
+    The layout of the memory pointed to by the mkey after its registration will be the *data* representation for the *num_interleaved* entries.
+    This single layout representation is repeated by *repeat_count*.
+
+    The *data*, as described by struct mlx5dv_mr_interleaved, will hold real data defined by *bytes_count* followed by a padding of *bytes_skip*.
+    After a successful registration, RDMA operations can use this *mkey*. The hardware will scatter the data according to the pattern.
+    The *mkey* should be used in a zero-based mode. The *addr* field in its *ibv_sge* is an offset in the total data.
+    To create this *mkey*, mlx5dv_create_mkey() should be used.
+
+    The current implementation requires the IBV_SEND_INLINE option to be on in the *ibv_qp_ex->wr_flags* field.
+    To be able to have more than 3 *num_interleaved* entries, the QP should be created with a WQE size large enough to fit them.
+    This should be done using the *max_inline_data* attribute of *struct ibv_qp_cap* upon its creation.
+
+    As one entry will be consumed for the strided header, the *mkey* should be created with one more entry than the required *num_interleaved*.
+
+    In case *ibv_qp_ex->wr_flags* turns on IBV_SEND_SIGNALED, the reported WC opcode will be MLX5DV_WC_UMR.
+    Unregistering the *mkey* to enable another pattern registration should be done via ibv_post_send with the IBV_WR_LOCAL_INV opcode.
+
+:   *mlx5dv_wr_mr_list()*
+
+    registers a memory layout based on a list of ibv_sge.
+    The layout of the memory pointed to by the *mkey* after its registration will be based on the list of *sge* counted by *num_sges*.
+    After a successful registration, RDMA operations can use this *mkey*; the hardware will scatter the data according to the pattern.
+    The *mkey* should be used in a zero-based mode; the *addr* field in its *ibv_sge* is an offset in the total data.
+
+    The current implementation requires the IBV_SEND_INLINE option to be on in the *ibv_qp_ex->wr_flags* field.
+    To be able to have more than 4 *num_sge* entries, the QP should be created with a WQE size large enough to fit them.
+    This should be done using the *max_inline_data* attribute of *struct ibv_qp_cap* upon its creation.
+
+    In case *ibv_qp_ex->wr_flags* turns on IBV_SEND_SIGNALED, the reported WC opcode will be MLX5DV_WC_UMR.
+    Unregistering the *mkey* to enable another pattern registration should be done via ibv_post_send with the IBV_WR_LOCAL_INV opcode.
+
+## QP Specific setters
+
+*DCI* QPs
+:   *mlx5dv_wr_set_dc_addr()* must be called to set the DCI WR properties. The
+    destination address of the work is specified by *ah*, the remote DCT
+    number is specified by *remote_dctn* and the DC key is specified by
+    *remote_dc_key*.
+    This setter is available when the QP transport is DCI and send_ops_flags
+    in struct ibv_qp_init_attr_ex is set.
+    The available builders and setters for DCI QPs are the same as for RC QPs.
+
+# EXAMPLE
+
+```c
+/* create a DC QP and specify the required send opcodes */
+attr_ex.qp_type = IBV_QPT_DRIVER;
+attr_ex.comp_mask |= IBV_QP_INIT_ATTR_SEND_OPS_FLAGS;
+attr_ex.send_ops_flags |= IBV_QP_EX_WITH_RDMA_WRITE;
+
+attr_dv.comp_mask |= MLX5DV_QP_INIT_ATTR_MASK_DC;
+attr_dv.dc_init_attr.dc_type = MLX5DV_DCTYPE_DCI;
+
+struct ibv_qp *qp = mlx5dv_create_qp(ctx, &attr_ex, &attr_dv);
+struct ibv_qp_ex *qpx = ibv_qp_to_qp_ex(qp);
+struct mlx5dv_qp_ex *mqpx = mlx5dv_qp_ex_from_ibv_qp_ex(qpx);
+
+ibv_wr_start(qpx);
+
+/* Use the ibv_qp_ex object to set the generic WR attributes */
+qpx->wr_id = my_wr_id_1;
+qpx->wr_flags = IBV_SEND_SIGNALED;
+ibv_wr_rdma_write(qpx, rkey, remote_addr_1);
+ibv_wr_set_sge(qpx, lkey, local_addr_1, length_1);
+
+/* Use the mlx5 DC setter through the mlx5dv_qp_ex object */
+mlx5dv_wr_set_dc_addr(mqpx, ah, remote_dctn, remote_dc_key);
+
+ret = ibv_wr_complete(qpx);
+```
+
+# SEE ALSO
+
+**ibv_post_send**(3), **ibv_create_qp_ex**(3), **ibv_wr_post**(3), **mlx5dv_create_mkey**(3).
+
+# AUTHOR
+
+Guy Levi <guyle@mellanox.com>
diff --git a/providers/mlx5/mlx5-abi.h b/providers/mlx5/mlx5-abi.h
new file mode 100644
index 0000000..2b66e82
--- /dev/null
+++ b/providers/mlx5/mlx5-abi.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_ABI_H +#define MLX5_ABI_H + +#include <infiniband/kern-abi.h> +#include <infiniband/verbs.h> +#include <rdma/mlx5-abi.h> +#include <kernel-abi/mlx5-abi.h> +#include "mlx5dv.h" + +#define MLX5_UVERBS_MIN_ABI_VERSION 1 +#define MLX5_UVERBS_MAX_ABI_VERSION 1 + +enum { + MLX5_NUM_NON_FP_BFREGS_PER_UAR = 2, + NUM_BFREGS_PER_UAR = 4, + MLX5_MAX_UARS = 1 << 8, + MLX5_MAX_BFREGS = MLX5_MAX_UARS * MLX5_NUM_NON_FP_BFREGS_PER_UAR, + MLX5_DEF_TOT_UUARS = 8 * MLX5_NUM_NON_FP_BFREGS_PER_UAR, + MLX5_MED_BFREGS_TSHOLD = 12, +}; + +DECLARE_DRV_CMD(mlx5_alloc_ucontext, IB_USER_VERBS_CMD_GET_CONTEXT, + mlx5_ib_alloc_ucontext_req_v2, mlx5_ib_alloc_ucontext_resp); +DECLARE_DRV_CMD(mlx5_create_ah, IB_USER_VERBS_CMD_CREATE_AH, + empty, mlx5_ib_create_ah_resp); +DECLARE_DRV_CMD(mlx5_alloc_pd, IB_USER_VERBS_CMD_ALLOC_PD, + empty, mlx5_ib_alloc_pd_resp); +DECLARE_DRV_CMD(mlx5_create_cq, IB_USER_VERBS_CMD_CREATE_CQ, + mlx5_ib_create_cq, mlx5_ib_create_cq_resp); +DECLARE_DRV_CMD(mlx5_create_cq_ex, IB_USER_VERBS_EX_CMD_CREATE_CQ, + mlx5_ib_create_cq, mlx5_ib_create_cq_resp); +DECLARE_DRV_CMD(mlx5_create_srq, IB_USER_VERBS_CMD_CREATE_SRQ, + mlx5_ib_create_srq, mlx5_ib_create_srq_resp); +DECLARE_DRV_CMD(mlx5_create_srq_ex, IB_USER_VERBS_CMD_CREATE_XSRQ, + mlx5_ib_create_srq, mlx5_ib_create_srq_resp); +DECLARE_DRV_CMD(mlx5_create_qp_ex, IB_USER_VERBS_EX_CMD_CREATE_QP, + mlx5_ib_create_qp, mlx5_ib_create_qp_resp); +DECLARE_DRV_CMD(mlx5_create_qp_ex_rss, IB_USER_VERBS_EX_CMD_CREATE_QP, + mlx5_ib_create_qp_rss, mlx5_ib_create_qp_resp); +DECLARE_DRV_CMD(mlx5_create_qp, IB_USER_VERBS_CMD_CREATE_QP, + mlx5_ib_create_qp, mlx5_ib_create_qp_resp); +DECLARE_DRV_CMD(mlx5_create_wq, IB_USER_VERBS_EX_CMD_CREATE_WQ, + mlx5_ib_create_wq, mlx5_ib_create_wq_resp); +DECLARE_DRV_CMD(mlx5_modify_wq, IB_USER_VERBS_EX_CMD_MODIFY_WQ, + mlx5_ib_modify_wq, empty); +DECLARE_DRV_CMD(mlx5_create_rwq_ind_table, IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL, + empty, empty); +DECLARE_DRV_CMD(mlx5_destroy_rwq_ind_table, IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL, + empty, empty); +DECLARE_DRV_CMD(mlx5_resize_cq, IB_USER_VERBS_CMD_RESIZE_CQ, + mlx5_ib_resize_cq, empty); +DECLARE_DRV_CMD(mlx5_query_device_ex, IB_USER_VERBS_EX_CMD_QUERY_DEVICE, + empty, mlx5_ib_query_device_resp); +DECLARE_DRV_CMD(mlx5_modify_qp_ex, 
IB_USER_VERBS_EX_CMD_MODIFY_QP, + empty, mlx5_ib_modify_qp_resp); + +struct mlx5_modify_qp { + struct ibv_modify_qp_ex ibv_cmd; + __u32 comp_mask; + struct mlx5_ib_burst_info burst_info; + __u32 reserved; +}; + +#endif /* MLX5_ABI_H */ diff --git a/providers/mlx5/mlx5.c b/providers/mlx5/mlx5.c new file mode 100644 index 0000000..85ad9af --- /dev/null +++ b/providers/mlx5/mlx5.c @@ -0,0 +1,1481 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#define _GNU_SOURCE +#include <config.h> + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> +#include <sys/mman.h> +#include <pthread.h> +#include <string.h> +#include <sched.h> +#include <sys/param.h> + +#include <util/symver.h> + +#include "mlx5.h" +#include "mlx5-abi.h" +#include "wqe.h" +#include "mlx5_ifc.h" + +static void mlx5_free_context(struct ibv_context *ibctx); + +#ifndef PCI_VENDOR_ID_MELLANOX +#define PCI_VENDOR_ID_MELLANOX 0x15b3 +#endif + +#ifndef CPU_OR +#define CPU_OR(x, y, z) do {} while (0) +#endif + +#ifndef CPU_EQUAL +#define CPU_EQUAL(x, y) 1 +#endif + +#define HCA(v, d) VERBS_PCI_MATCH(PCI_VENDOR_ID_##v, d, NULL) +static const struct verbs_match_ent hca_table[] = { + VERBS_DRIVER_ID(RDMA_DRIVER_MLX5), + HCA(MELLANOX, 0x1011), /* MT4113 Connect-IB */ + HCA(MELLANOX, 0x1012), /* Connect-IB Virtual Function */ + HCA(MELLANOX, 0x1013), /* ConnectX-4 */ + HCA(MELLANOX, 0x1014), /* ConnectX-4 Virtual Function */ + HCA(MELLANOX, 0x1015), /* ConnectX-4LX */ + HCA(MELLANOX, 0x1016), /* ConnectX-4LX Virtual Function */ + HCA(MELLANOX, 0x1017), /* ConnectX-5, PCIe 3.0 */ + HCA(MELLANOX, 0x1018), /* ConnectX-5 Virtual Function */ + HCA(MELLANOX, 0x1019), /* ConnectX-5 Ex */ + HCA(MELLANOX, 0x101a), /* ConnectX-5 Ex VF */ + HCA(MELLANOX, 0x101b), /* ConnectX-6 */ + HCA(MELLANOX, 0x101c), /* ConnectX-6 VF */ + HCA(MELLANOX, 0x101d), /* ConnectX-6 DX */ + HCA(MELLANOX, 0x101e), /* ConnectX family mlx5Gen Virtual Function */ + HCA(MELLANOX, 0x101f), /* ConnectX-6 LX */ + HCA(MELLANOX, 0x1021), /* ConnectX-7 */ + HCA(MELLANOX, 0xa2d2), /* BlueField integrated ConnectX-5 network controller */ + HCA(MELLANOX, 0xa2d3), /* BlueField integrated ConnectX-5 network controller VF */ + HCA(MELLANOX, 0xa2d6), /* BlueField-2 integrated ConnectX-6 Dx network controller */ + {} +}; + +uint32_t mlx5_debug_mask = 0; +int mlx5_freeze_on_error_cqe; + +static const struct verbs_context_ops mlx5_ctx_common_ops = { + .query_device = mlx5_query_device, + .query_port = mlx5_query_port, + .alloc_pd = mlx5_alloc_pd, + .async_event = mlx5_async_event, + .dealloc_pd = mlx5_free_pd, + .reg_mr = mlx5_reg_mr, + .rereg_mr = mlx5_rereg_mr, + .dereg_mr = mlx5_dereg_mr, + .alloc_mw = mlx5_alloc_mw, + .dealloc_mw = mlx5_dealloc_mw, + .bind_mw = mlx5_bind_mw, + .create_cq = mlx5_create_cq, + .poll_cq = mlx5_poll_cq, + .req_notify_cq = mlx5_arm_cq, + .cq_event = mlx5_cq_event, + .resize_cq = mlx5_resize_cq, + .destroy_cq = mlx5_destroy_cq, + .create_srq = mlx5_create_srq, + .modify_srq = mlx5_modify_srq, + .query_srq = mlx5_query_srq, + .destroy_srq = mlx5_destroy_srq, + .post_srq_recv = mlx5_post_srq_recv, + .create_qp = mlx5_create_qp, + .query_qp = mlx5_query_qp, + .modify_qp = mlx5_modify_qp, + .destroy_qp = mlx5_destroy_qp, + .post_send = mlx5_post_send, + .post_recv = mlx5_post_recv, + .create_ah = mlx5_create_ah, + .destroy_ah = mlx5_destroy_ah, + .attach_mcast = mlx5_attach_mcast, + .detach_mcast = mlx5_detach_mcast, + + .advise_mr = mlx5_advise_mr, + .alloc_dm = mlx5_alloc_dm, + .alloc_parent_domain = mlx5_alloc_parent_domain, + .alloc_td = mlx5_alloc_td, + .attach_counters_point_flow = mlx5_attach_counters_point_flow, + .close_xrcd = mlx5_close_xrcd, + .create_counters = mlx5_create_counters, + .create_cq_ex = mlx5_create_cq_ex, + .create_flow = mlx5_create_flow, + .create_flow_action_esp = mlx5_create_flow_action_esp, + .create_qp_ex = mlx5_create_qp_ex, + .create_rwq_ind_table = mlx5_create_rwq_ind_table, + .create_srq_ex = mlx5_create_srq_ex, + .create_wq = 
mlx5_create_wq, + .dealloc_td = mlx5_dealloc_td, + .destroy_counters = mlx5_destroy_counters, + .destroy_flow = mlx5_destroy_flow, + .destroy_flow_action = mlx5_destroy_flow_action, + .destroy_rwq_ind_table = mlx5_destroy_rwq_ind_table, + .destroy_wq = mlx5_destroy_wq, + .free_dm = mlx5_free_dm, + .get_srq_num = mlx5_get_srq_num, + .modify_cq = mlx5_modify_cq, + .modify_flow_action_esp = mlx5_modify_flow_action_esp, + .modify_qp_rate_limit = mlx5_modify_qp_rate_limit, + .modify_wq = mlx5_modify_wq, + .open_qp = mlx5_open_qp, + .open_xrcd = mlx5_open_xrcd, + .post_srq_ops = mlx5_post_srq_ops, + .query_device_ex = mlx5_query_device_ex, + .query_rt_values = mlx5_query_rt_values, + .read_counters = mlx5_read_counters, + .reg_dm_mr = mlx5_reg_dm_mr, + .alloc_null_mr = mlx5_alloc_null_mr, + .free_context = mlx5_free_context, +}; + +static const struct verbs_context_ops mlx5_ctx_cqev1_ops = { + .poll_cq = mlx5_poll_cq_v1, +}; + +static int read_number_from_line(const char *line, int *value) +{ + const char *ptr; + + ptr = strchr(line, ':'); + if (!ptr) + return 1; + + ++ptr; + + *value = atoi(ptr); + return 0; +} +/** + * The function looks for the first free user-index in all the + * user-index tables. If all are used, returns -1, otherwise + * a valid user-index. + * In case the reference count of the table is zero, it means the + * table is not in use and wasn't allocated yet, therefore the + * mlx5_store_uidx allocates the table, and increment the reference + * count on the table. + */ +static int32_t get_free_uidx(struct mlx5_context *ctx) +{ + int32_t tind; + int32_t i; + + for (tind = 0; tind < MLX5_UIDX_TABLE_SIZE; tind++) { + if (ctx->uidx_table[tind].refcnt < MLX5_UIDX_TABLE_MASK) + break; + } + + if (tind == MLX5_UIDX_TABLE_SIZE) + return -1; + + if (!ctx->uidx_table[tind].refcnt) + return tind << MLX5_UIDX_TABLE_SHIFT; + + for (i = 0; i < MLX5_UIDX_TABLE_MASK + 1; i++) { + if (!ctx->uidx_table[tind].table[i]) + break; + } + + return (tind << MLX5_UIDX_TABLE_SHIFT) | i; +} + +int32_t mlx5_store_uidx(struct mlx5_context *ctx, void *rsc) +{ + int32_t tind; + int32_t ret = -1; + int32_t uidx; + + pthread_mutex_lock(&ctx->uidx_table_mutex); + uidx = get_free_uidx(ctx); + if (uidx < 0) + goto out; + + tind = uidx >> MLX5_UIDX_TABLE_SHIFT; + + if (!ctx->uidx_table[tind].refcnt) { + ctx->uidx_table[tind].table = calloc(MLX5_UIDX_TABLE_MASK + 1, + sizeof(struct mlx5_resource *)); + if (!ctx->uidx_table[tind].table) + goto out; + } + + ++ctx->uidx_table[tind].refcnt; + ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = rsc; + ret = uidx; + +out: + pthread_mutex_unlock(&ctx->uidx_table_mutex); + return ret; +} + +void mlx5_clear_uidx(struct mlx5_context *ctx, uint32_t uidx) +{ + int tind = uidx >> MLX5_UIDX_TABLE_SHIFT; + + pthread_mutex_lock(&ctx->uidx_table_mutex); + + if (!--ctx->uidx_table[tind].refcnt) + free(ctx->uidx_table[tind].table); + else + ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK] = NULL; + + pthread_mutex_unlock(&ctx->uidx_table_mutex); +} + +static int mlx5_is_sandy_bridge(int *num_cores) +{ + char line[128]; + FILE *fd; + int rc = 0; + int cur_cpu_family = -1; + int cur_cpu_model = -1; + + fd = fopen("/proc/cpuinfo", "r"); + if (!fd) + return 0; + + *num_cores = 0; + + while (fgets(line, 128, fd)) { + int value; + + /* if this is information on new processor */ + if (!strncmp(line, "processor", 9)) { + ++*num_cores; + + cur_cpu_family = -1; + cur_cpu_model = -1; + } else if (!strncmp(line, "cpu family", 10)) { + if ((cur_cpu_family < 0) && 
(!read_number_from_line(line, &value))) + cur_cpu_family = value; + } else if (!strncmp(line, "model", 5)) { + if ((cur_cpu_model < 0) && (!read_number_from_line(line, &value))) + cur_cpu_model = value; + } + + /* if this is a Sandy Bridge CPU */ + if ((cur_cpu_family == 6) && + (cur_cpu_model == 0x2A || (cur_cpu_model == 0x2D) )) + rc = 1; + } + + fclose(fd); + return rc; +} + +/* +man cpuset + + This format displays each 32-bit word in hexadecimal (using ASCII characters "0" - "9" and "a" - "f"); words + are filled with leading zeros, if required. For masks longer than one word, a comma separator is used between + words. Words are displayed in big-endian order, which has the most significant bit first. The hex digits + within a word are also in big-endian order. + + The number of 32-bit words displayed is the minimum number needed to display all bits of the bitmask, based on + the size of the bitmask. + + Examples of the Mask Format: + + 00000001 # just bit 0 set + 40000000,00000000,00000000 # just bit 94 set + 000000ff,00000000 # bits 32-39 set + 00000000,000E3862 # 1,5,6,11-13,17-19 set + + A mask with bits 0, 1, 2, 4, 8, 16, 32, and 64 set displays as: + + 00000001,00000001,00010117 + + The first "1" is for bit 64, the second for bit 32, the third for bit 16, the fourth for bit 8, the fifth for + bit 4, and the "7" is for bits 2, 1, and 0. +*/ +static void mlx5_local_cpu_set(struct ibv_device *ibdev, cpu_set_t *cpu_set) +{ + char *p, buf[1024] = {}; + char *env_value; + uint32_t word; + int i, k; + + env_value = getenv("MLX5_LOCAL_CPUS"); + if (env_value) + strncpy(buf, env_value, sizeof(buf) - 1); + else { + char fname[MAXPATHLEN]; + FILE *fp; + + snprintf(fname, MAXPATHLEN, "/sys/class/infiniband/%s/device/local_cpus", + ibv_get_device_name(ibdev)); + + fp = fopen(fname, "r"); + if (!fp) { + fprintf(stderr, PFX "Warning: can not get local cpu set: failed to open %s\n", fname); + return; + } + if (!fgets(buf, sizeof(buf), fp)) { + fprintf(stderr, PFX "Warning: can not get local cpu set: failed to read cpu mask\n"); + fclose(fp); + return; + } + fclose(fp); + } + + p = strrchr(buf, ','); + if (!p) + p = buf; + + i = 0; + do { + if (*p == ',') { + *p = 0; + p ++; + } + + word = strtoul(p, NULL, 16); + + for (k = 0; word; ++k, word >>= 1) + if (word & 1) + CPU_SET(k+i, cpu_set); + + if (p == buf) + break; + + p = strrchr(buf, ','); + if (!p) + p = buf; + + i += 32; + } while (i < CPU_SETSIZE); +} + +static int mlx5_enable_sandy_bridge_fix(struct ibv_device *ibdev) +{ + cpu_set_t my_cpus, dev_local_cpus, result_set; + int stall_enable; + int ret; + int num_cores; + + if (!mlx5_is_sandy_bridge(&num_cores)) + return 0; + + /* by default enable stall on sandy bridge arch */ + stall_enable = 1; + + /* + * check if app is bound to cpu set that is inside + * of device local cpu set. Disable stalling if true + */ + + /* use static cpu set - up to CPU_SETSIZE (1024) cpus/node */ + CPU_ZERO(&my_cpus); + CPU_ZERO(&dev_local_cpus); + CPU_ZERO(&result_set); + ret = sched_getaffinity(0, sizeof(my_cpus), &my_cpus); + if (ret == -1) { + if (errno == EINVAL) + fprintf(stderr, PFX "Warning: my cpu set is too small\n"); + else + fprintf(stderr, PFX "Warning: failed to get my cpu set\n"); + goto out; + } + + /* get device local cpu set */ + mlx5_local_cpu_set(ibdev, &dev_local_cpus); + + /* check if my cpu set is in dev cpu */ + CPU_OR(&result_set, &my_cpus, &dev_local_cpus); + stall_enable = CPU_EQUAL(&result_set, &dev_local_cpus) ? 
0 : 1; + +out: + return stall_enable; +} + +static void mlx5_read_env(struct ibv_device *ibdev, struct mlx5_context *ctx) +{ + char *env_value; + + env_value = getenv("MLX5_STALL_CQ_POLL"); + if (env_value) + /* check if cq stall is enforced by user */ + ctx->stall_enable = (strcmp(env_value, "0")) ? 1 : 0; + else + /* autodetect if we need to do cq polling */ + ctx->stall_enable = mlx5_enable_sandy_bridge_fix(ibdev); + + env_value = getenv("MLX5_STALL_NUM_LOOP"); + if (env_value) + mlx5_stall_num_loop = atoi(env_value); + + env_value = getenv("MLX5_STALL_CQ_POLL_MIN"); + if (env_value) + mlx5_stall_cq_poll_min = atoi(env_value); + + env_value = getenv("MLX5_STALL_CQ_POLL_MAX"); + if (env_value) + mlx5_stall_cq_poll_max = atoi(env_value); + + env_value = getenv("MLX5_STALL_CQ_INC_STEP"); + if (env_value) + mlx5_stall_cq_inc_step = atoi(env_value); + + env_value = getenv("MLX5_STALL_CQ_DEC_STEP"); + if (env_value) + mlx5_stall_cq_dec_step = atoi(env_value); + + ctx->stall_adaptive_enable = 0; + ctx->stall_cycles = 0; + + if (mlx5_stall_num_loop < 0) { + ctx->stall_adaptive_enable = 1; + ctx->stall_cycles = mlx5_stall_cq_poll_min; + } + +} + +static int get_total_uuars(int page_size) +{ + int size = MLX5_DEF_TOT_UUARS; + int uuars_in_page; + char *env; + + env = getenv("MLX5_TOTAL_UUARS"); + if (env) + size = atoi(env); + + if (size < 1) + return -EINVAL; + + uuars_in_page = page_size / MLX5_ADAPTER_PAGE_SIZE * MLX5_NUM_NON_FP_BFREGS_PER_UAR; + size = max(uuars_in_page, size); + size = align(size, MLX5_NUM_NON_FP_BFREGS_PER_UAR); + if (size > MLX5_MAX_BFREGS) + return -ENOMEM; + + return size; +} + +static void open_debug_file(struct mlx5_context *ctx) +{ + char *env; + + env = getenv("MLX5_DEBUG_FILE"); + if (!env) { + ctx->dbg_fp = stderr; + return; + } + + ctx->dbg_fp = fopen(env, "aw+"); + if (!ctx->dbg_fp) { + fprintf(stderr, "Failed opening debug file %s, using stderr\n", env); + ctx->dbg_fp = stderr; + return; + } +} + +static void close_debug_file(struct mlx5_context *ctx) +{ + if (ctx->dbg_fp && ctx->dbg_fp != stderr) + fclose(ctx->dbg_fp); +} + +static void set_debug_mask(void) +{ + char *env; + + env = getenv("MLX5_DEBUG_MASK"); + if (env) + mlx5_debug_mask = strtol(env, NULL, 0); +} + +static void set_freeze_on_error(void) +{ + char *env; + + env = getenv("MLX5_FREEZE_ON_ERROR_CQE"); + if (env) + mlx5_freeze_on_error_cqe = strtol(env, NULL, 0); +} + +static int get_always_bf(void) +{ + char *env; + + env = getenv("MLX5_POST_SEND_PREFER_BF"); + if (!env) + return 1; + + return strcmp(env, "0") ? 1 : 0; +} + +static int get_shut_up_bf(void) +{ + char *env; + + env = getenv("MLX5_SHUT_UP_BF"); + if (!env) + return 0; + + return strcmp(env, "0") ? 1 : 0; +} + +static int get_num_low_lat_uuars(int tot_uuars) +{ + char *env; + int num = 4; + + env = getenv("MLX5_NUM_LOW_LAT_UUARS"); + if (env) + num = atoi(env); + + if (num < 0) + return -EINVAL; + + num = max(num, tot_uuars - MLX5_MED_BFREGS_TSHOLD); + return num; +} + +/* The library allocates an array of uuar contexts. The one in index zero does + * not to execersize odd/even policy so it can avoid a lock but it may not use + * blue flame. The upper ones, low_lat_uuars can use blue flame with no lock + * since they are assigned to one QP only. 
The rest can use blue flame but since + * they are shared they need a lock + */ +static int need_uuar_lock(struct mlx5_context *ctx, int uuarn) +{ + int i; + + if (uuarn == 0 || mlx5_single_threaded) + return 0; + + i = (uuarn / 2) + (uuarn % 2); + if (i >= ctx->tot_uuars - ctx->low_lat_uuars) + return 0; + + return 1; +} + +static int single_threaded_app(void) +{ + + char *env; + + env = getenv("MLX5_SINGLE_THREADED"); + if (env) + return strcmp(env, "1") ? 0 : 1; + + return 0; +} + +static int mlx5_cmd_get_context(struct mlx5_context *context, + struct mlx5_alloc_ucontext *req, + size_t req_len, + struct mlx5_alloc_ucontext_resp *resp, + size_t resp_len) +{ + struct verbs_context *verbs_ctx = &context->ibv_ctx; + + if (!ibv_cmd_get_context(verbs_ctx, &req->ibv_cmd, + req_len, &resp->ibv_resp, resp_len)) + return 0; + + /* The ibv_cmd_get_context fails in older kernels when passing + * a request length that the kernel doesn't know. + * To avoid breaking compatibility of new libmlx5 and older + * kernels, when ibv_cmd_get_context fails with the full + * request length, we try once again with the legacy length. + * We repeat this process while reducing requested size based + * on the feature input size. To avoid this in the future, we + * will remove the check in kernel that requires fields unknown + * to the kernel to be cleared. This will require that any new + * feature that involves extending struct mlx5_alloc_ucontext + * will be accompanied by an indication in the form of one or + * more fields in struct mlx5_alloc_ucontext_resp. If the + * response value can be interpreted as feature not supported + * when the returned value is zero, this will suffice to + * indicate to the library that the request was ignored by the + * kernel, either because it is unaware or because it decided + * to do so. If zero is a valid response, we will add a new + * field that indicates whether the request was handled. 
+ */ + if (!ibv_cmd_get_context(verbs_ctx, &req->ibv_cmd, + offsetof(struct mlx5_alloc_ucontext, lib_caps), + &resp->ibv_resp, resp_len)) + return 0; + + return ibv_cmd_get_context(verbs_ctx, &req->ibv_cmd, + offsetof(struct mlx5_alloc_ucontext, + max_cqe_version), + &resp->ibv_resp, resp_len); +} + +static int mlx5_map_internal_clock(struct mlx5_device *mdev, + struct ibv_context *ibv_ctx) +{ + struct mlx5_context *context = to_mctx(ibv_ctx); + void *hca_clock_page; + off_t offset = 0; + + set_command(MLX5_IB_MMAP_CORE_CLOCK, &offset); + hca_clock_page = mmap(NULL, mdev->page_size, + PROT_READ, MAP_SHARED, ibv_ctx->cmd_fd, + mdev->page_size * offset); + + if (hca_clock_page == MAP_FAILED) { + fprintf(stderr, PFX + "Warning: Timestamp available,\n" + "but failed to mmap() hca core clock page.\n"); + return -1; + } + + context->hca_core_clock = hca_clock_page + + (context->core_clock.offset & (mdev->page_size - 1)); + return 0; +} + +static void mlx5_map_clock_info(struct mlx5_device *mdev, + struct ibv_context *ibv_ctx) +{ + struct mlx5_context *context = to_mctx(ibv_ctx); + void *clock_info_page; + off_t offset = 0; + + set_command(MLX5_IB_MMAP_CLOCK_INFO, &offset); + set_index(MLX5_IB_CLOCK_INFO_V1, &offset); + clock_info_page = mmap(NULL, mdev->page_size, + PROT_READ, MAP_SHARED, ibv_ctx->cmd_fd, + offset * mdev->page_size); + + if (clock_info_page != MAP_FAILED) + context->clock_info_page = clock_info_page; +} + +static uint32_t get_dc_odp_caps(struct ibv_context *ctx) +{ + uint32_t in[DEVX_ST_SZ_DW(query_hca_cap_in)] = {}; + uint32_t out[DEVX_ST_SZ_DW(query_hca_cap_out)] = {}; + uint16_t opmod = (MLX5_CAP_ODP << 1) | HCA_CAP_OPMOD_GET_CUR; + uint32_t ret; + + DEVX_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); + DEVX_SET(query_hca_cap_in, in, op_mod, opmod); + + ret = mlx5dv_devx_general_cmd(ctx, in, sizeof(in), out, sizeof(out)); + if (ret) + return 0; + + if (DEVX_GET(query_hca_cap_out, out, + capability.odp_cap.dc_odp_caps.send)) + ret |= IBV_ODP_SUPPORT_SEND; + if (DEVX_GET(query_hca_cap_out, out, + capability.odp_cap.dc_odp_caps.receive)) + ret |= IBV_ODP_SUPPORT_RECV; + if (DEVX_GET(query_hca_cap_out, out, + capability.odp_cap.dc_odp_caps.write)) + ret |= IBV_ODP_SUPPORT_WRITE; + if (DEVX_GET(query_hca_cap_out, out, + capability.odp_cap.dc_odp_caps.read)) + ret |= IBV_ODP_SUPPORT_READ; + if (DEVX_GET(query_hca_cap_out, out, + capability.odp_cap.dc_odp_caps.atomic)) + ret |= IBV_ODP_SUPPORT_ATOMIC; + if (DEVX_GET(query_hca_cap_out, out, + capability.odp_cap.dc_odp_caps.srq_receive)) + ret |= IBV_ODP_SUPPORT_SRQ_RECV; + + return ret; +} + +int mlx5dv_query_device(struct ibv_context *ctx_in, + struct mlx5dv_context *attrs_out) +{ + struct mlx5_context *mctx = to_mctx(ctx_in); + uint64_t comp_mask_out = 0; + + if (!is_mlx5_dev(ctx_in->device)) + return EOPNOTSUPP; + + attrs_out->version = 0; + attrs_out->flags = 0; + + if (mctx->cqe_version == MLX5_CQE_VERSION_V1) + attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_CQE_V1; + + if (mctx->vendor_cap_flags & MLX5_VENDOR_CAP_FLAGS_MPW_ALLOWED) + attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED; + + if (mctx->vendor_cap_flags & MLX5_VENDOR_CAP_FLAGS_CQE_128B_COMP) + attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP; + + if (mctx->vendor_cap_flags & MLX5_VENDOR_CAP_FLAGS_CQE_128B_PAD) + attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_CQE_128B_PAD; + + if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_CQE_COMPRESION) { + attrs_out->cqe_comp_caps = mctx->cqe_comp_caps; + comp_mask_out |= MLX5DV_CONTEXT_MASK_CQE_COMPRESION; + } + + if 
(mctx->vendor_cap_flags & MLX5_VENDOR_CAP_FLAGS_ENHANCED_MPW) + attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW; + + if (mctx->vendor_cap_flags & + MLX5_VENDOR_CAP_FLAGS_PACKET_BASED_CREDIT_MODE) + attrs_out->flags |= MLX5DV_CONTEXT_FLAGS_PACKET_BASED_CREDIT_MODE; + + if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_SWP) { + attrs_out->sw_parsing_caps = mctx->sw_parsing_caps; + comp_mask_out |= MLX5DV_CONTEXT_MASK_SWP; + } + + if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) { + attrs_out->striding_rq_caps = mctx->striding_rq_caps; + comp_mask_out |= MLX5DV_CONTEXT_MASK_STRIDING_RQ; + } + + if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) { + attrs_out->tunnel_offloads_caps = mctx->tunnel_offloads_caps; + comp_mask_out |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS; + } + + if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_DYN_BFREGS) { + attrs_out->max_dynamic_bfregs = mctx->num_dyn_bfregs; + comp_mask_out |= MLX5DV_CONTEXT_MASK_DYN_BFREGS; + } + + if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_CLOCK_INFO_UPDATE) { + if (mctx->clock_info_page) { + attrs_out->max_clock_info_update_nsec = + mctx->clock_info_page->overflow_period; + comp_mask_out |= MLX5DV_CONTEXT_MASK_CLOCK_INFO_UPDATE; + } + } + + if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_FLOW_ACTION_FLAGS) { + attrs_out->flow_action_flags = mctx->flow_action_flags; + comp_mask_out |= MLX5DV_CONTEXT_MASK_FLOW_ACTION_FLAGS; + } + + if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_DC_ODP_CAPS) { + attrs_out->dc_odp_caps = get_dc_odp_caps(ctx_in); + comp_mask_out |= MLX5DV_CONTEXT_MASK_DC_ODP_CAPS; + } + + if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_HCA_CORE_CLOCK) { + if (mctx->hca_core_clock) { + attrs_out->hca_core_clock = mctx->hca_core_clock; + comp_mask_out |= MLX5DV_CONTEXT_MASK_HCA_CORE_CLOCK; + } + } + + attrs_out->comp_mask = comp_mask_out; + + return 0; +} + +static int mlx5dv_get_qp(struct ibv_qp *qp_in, + struct mlx5dv_qp *qp_out) +{ + struct mlx5_qp *mqp = to_mqp(qp_in); + uint64_t mask_out = 0; + + if (!is_mlx5_dev(qp_in->context->device)) + return EOPNOTSUPP; + + qp_out->dbrec = mqp->db; + + if (mqp->sq_buf_size) + /* IBV_QPT_RAW_PACKET */ + qp_out->sq.buf = (void *)((uintptr_t)mqp->sq_buf.buf); + else + qp_out->sq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->sq.offset); + qp_out->sq.wqe_cnt = mqp->sq.wqe_cnt; + qp_out->sq.stride = 1 << mqp->sq.wqe_shift; + + qp_out->rq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->rq.offset); + qp_out->rq.wqe_cnt = mqp->rq.wqe_cnt; + qp_out->rq.stride = 1 << mqp->rq.wqe_shift; + + qp_out->bf.reg = mqp->bf->reg; + + if (qp_out->comp_mask & MLX5DV_QP_MASK_UAR_MMAP_OFFSET) { + qp_out->uar_mmap_offset = mqp->bf->uar_mmap_offset; + mask_out |= MLX5DV_QP_MASK_UAR_MMAP_OFFSET; + } + + if (qp_out->comp_mask & MLX5DV_QP_MASK_RAW_QP_HANDLES) { + qp_out->tirn = mqp->tirn; + qp_out->tisn = mqp->tisn; + qp_out->rqn = mqp->rqn; + qp_out->sqn = mqp->sqn; + mask_out |= MLX5DV_QP_MASK_RAW_QP_HANDLES; + } + + if (qp_out->comp_mask & MLX5DV_QP_MASK_RAW_QP_TIR_ADDR) { + qp_out->tir_icm_addr = mqp->tir_icm_addr; + mask_out |= MLX5DV_QP_MASK_RAW_QP_TIR_ADDR; + } + + if (mqp->bf->uuarn > 0) + qp_out->bf.size = mqp->bf->buf_size; + else + qp_out->bf.size = 0; + + qp_out->comp_mask = mask_out; + + return 0; +} + +static int mlx5dv_get_cq(struct ibv_cq *cq_in, + struct mlx5dv_cq *cq_out) +{ + struct mlx5_cq *mcq = to_mcq(cq_in); + struct mlx5_context *mctx = to_mctx(cq_in->context); + + if (!is_mlx5_dev(cq_in->context->device)) + return EOPNOTSUPP; + + cq_out->comp_mask = 0; + cq_out->cqn = 
mcq->cqn; + cq_out->cqe_cnt = mcq->ibv_cq.cqe + 1; + cq_out->cqe_size = mcq->cqe_sz; + cq_out->buf = mcq->active_buf->buf; + cq_out->dbrec = mcq->dbrec; + cq_out->cq_uar = mctx->cq_uar_reg; + + mcq->flags |= MLX5_CQ_FLAGS_DV_OWNED; + + return 0; +} + +static int mlx5dv_get_rwq(struct ibv_wq *wq_in, + struct mlx5dv_rwq *rwq_out) +{ + struct mlx5_rwq *mrwq = to_mrwq(wq_in); + + if (!is_mlx5_dev(wq_in->context->device)) + return EOPNOTSUPP; + + rwq_out->comp_mask = 0; + rwq_out->buf = mrwq->pbuff; + rwq_out->dbrec = mrwq->recv_db; + rwq_out->wqe_cnt = mrwq->rq.wqe_cnt; + rwq_out->stride = 1 << mrwq->rq.wqe_shift; + + return 0; +} + +static int mlx5dv_get_srq(struct ibv_srq *srq_in, + struct mlx5dv_srq *srq_out) +{ + struct mlx5_srq *msrq; + uint64_t mask_out = 0; + + if (!is_mlx5_dev(srq_in->context->device)) + return EOPNOTSUPP; + + msrq = container_of(srq_in, struct mlx5_srq, vsrq.srq); + + srq_out->buf = msrq->buf.buf; + srq_out->dbrec = msrq->db; + srq_out->stride = 1 << msrq->wqe_shift; + srq_out->head = msrq->head; + srq_out->tail = msrq->tail; + + if (srq_out->comp_mask & MLX5DV_SRQ_MASK_SRQN) { + srq_out->srqn = msrq->srqn; + mask_out |= MLX5DV_SRQ_MASK_SRQN; + } + + srq_out->comp_mask = mask_out; + return 0; +} + +static int mlx5dv_get_dm(struct ibv_dm *dm_in, + struct mlx5dv_dm *dm_out) +{ + struct mlx5_dm *mdm = to_mdm(dm_in); + uint64_t mask_out = 0; + + if (!is_mlx5_dev(dm_in->context->device)) + return EOPNOTSUPP; + + dm_out->buf = mdm->start_va; + dm_out->length = mdm->length; + + if (dm_out->comp_mask & MLX5DV_DM_MASK_REMOTE_VA) { + dm_out->remote_va = mdm->remote_va; + mask_out |= MLX5DV_DM_MASK_REMOTE_VA; + } + + dm_out->comp_mask = mask_out; + + return 0; +} + +static int mlx5dv_get_av(struct ibv_ah *ah_in, + struct mlx5dv_ah *ah_out) +{ + struct mlx5_ah *mah = to_mah(ah_in); + + if (!is_mlx5_dev(ah_in->context->device)) + return EOPNOTSUPP; + + ah_out->comp_mask = 0; + ah_out->av = &mah->av; + + return 0; +} + +static int mlx5dv_get_pd(struct ibv_pd *pd_in, + struct mlx5dv_pd *pd_out) +{ + struct mlx5_pd *mpd = to_mpd(pd_in); + + if (!is_mlx5_dev(pd_in->context->device)) + return EOPNOTSUPP; + + pd_out->comp_mask = 0; + pd_out->pdn = mpd->pdn; + + return 0; +} + +LATEST_SYMVER_FUNC(mlx5dv_init_obj, 1_2, "MLX5_1.2", + int, + struct mlx5dv_obj *obj, uint64_t obj_type) +{ + int ret = 0; + + if (obj_type & MLX5DV_OBJ_QP) + ret = mlx5dv_get_qp(obj->qp.in, obj->qp.out); + if (!ret && (obj_type & MLX5DV_OBJ_CQ)) + ret = mlx5dv_get_cq(obj->cq.in, obj->cq.out); + if (!ret && (obj_type & MLX5DV_OBJ_SRQ)) + ret = mlx5dv_get_srq(obj->srq.in, obj->srq.out); + if (!ret && (obj_type & MLX5DV_OBJ_RWQ)) + ret = mlx5dv_get_rwq(obj->rwq.in, obj->rwq.out); + if (!ret && (obj_type & MLX5DV_OBJ_DM)) + ret = mlx5dv_get_dm(obj->dm.in, obj->dm.out); + if (!ret && (obj_type & MLX5DV_OBJ_AH)) + ret = mlx5dv_get_av(obj->ah.in, obj->ah.out); + if (!ret && (obj_type & MLX5DV_OBJ_PD)) + ret = mlx5dv_get_pd(obj->pd.in, obj->pd.out); + + return ret; +} + +COMPAT_SYMVER_FUNC(mlx5dv_init_obj, 1_0, "MLX5_1.0", + int, + struct mlx5dv_obj *obj, uint64_t obj_type) +{ + int ret = 0; + + ret = __mlx5dv_init_obj_1_2(obj, obj_type); + if (!ret && (obj_type & MLX5DV_OBJ_CQ)) { + /* ABI version 1.0 returns the void ** in this memory + * location + */ + obj->cq.out->cq_uar = &(to_mctx(obj->cq.in->context)->cq_uar_reg); + } + return ret; +} + +off_t get_uar_mmap_offset(int idx, int page_size, int command) +{ + off_t offset = 0; + + set_command(command, &offset); + + if (command == MLX5_IB_MMAP_ALLOC_WC && + idx >= (1 
<< MLX5_IB_MMAP_CMD_SHIFT)) + set_extended_index(idx, &offset); + else + set_index(idx, &offset); + + return offset * page_size; +} + +static off_t uar_type_to_cmd(int uar_type) +{ + return (uar_type == MLX5_UAR_TYPE_NC) ? MLX5_MMAP_GET_NC_PAGES_CMD : + MLX5_MMAP_GET_REGULAR_PAGES_CMD; +} + +void *mlx5_mmap(struct mlx5_uar_info *uar, int index, int cmd_fd, int page_size, + int uar_type) +{ + off_t offset; + + if (uar_type == MLX5_UAR_TYPE_NC) { + offset = get_uar_mmap_offset(index, page_size, + MLX5_MMAP_GET_NC_PAGES_CMD); + uar->reg = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, + cmd_fd, offset); + if (uar->reg != MAP_FAILED) { + uar->type = MLX5_UAR_TYPE_NC; + goto out; + } + } + + /* Backward compatibility for legacy kernels that don't support + * MLX5_MMAP_GET_NC_PAGES_CMD mmap command. + */ + offset = get_uar_mmap_offset(index, page_size, + (uar_type == MLX5_UAR_TYPE_REGULAR_DYN) ? + MLX5_IB_MMAP_ALLOC_WC : + MLX5_MMAP_GET_REGULAR_PAGES_CMD); + uar->reg = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, + cmd_fd, offset); + if (uar->reg != MAP_FAILED) + uar->type = MLX5_UAR_TYPE_REGULAR; + +out: + return uar->reg; +} + +int mlx5dv_set_context_attr(struct ibv_context *ibv_ctx, + enum mlx5dv_set_ctx_attr_type type, void *attr) +{ + struct mlx5_context *ctx = to_mctx(ibv_ctx); + + if (!is_mlx5_dev(ibv_ctx->device)) + return EOPNOTSUPP; + + switch (type) { + case MLX5DV_CTX_ATTR_BUF_ALLOCATORS: + ctx->extern_alloc = *((struct mlx5dv_ctx_allocators *)attr); + break; + default: + return ENOTSUP; + } + + return 0; +} + +int mlx5dv_get_clock_info(struct ibv_context *ctx_in, + struct mlx5dv_clock_info *clock_info) +{ + struct mlx5_context *ctx = to_mctx(ctx_in); + const struct mlx5_ib_clock_info *ci = ctx->clock_info_page; + uint32_t retry, tmp_sig; + atomic_uint32_t *sig; + + if (!ci) + return EINVAL; + + sig = (atomic_uint32_t *)&ci->sign; + + do { + retry = 10; +repeat: + tmp_sig = atomic_load(sig); + if (unlikely(tmp_sig & + MLX5_IB_CLOCK_INFO_KERNEL_UPDATING)) { + if (--retry) + goto repeat; + return EBUSY; + } + clock_info->nsec = ci->nsec; + clock_info->last_cycles = ci->cycles; + clock_info->frac = ci->frac; + clock_info->mult = ci->mult; + clock_info->shift = ci->shift; + clock_info->mask = ci->mask; + } while (unlikely(tmp_sig != atomic_load(sig))); + + return 0; +} + +static void adjust_uar_info(struct mlx5_device *mdev, + struct mlx5_context *context, + struct mlx5_alloc_ucontext_resp resp) +{ + if (!resp.log_uar_size && !resp.num_uars_per_page) { + /* old kernel */ + context->uar_size = mdev->page_size; + context->num_uars_per_page = 1; + return; + } + + context->uar_size = 1 << resp.log_uar_size; + context->num_uars_per_page = resp.num_uars_per_page; +} + +bool mlx5dv_is_supported(struct ibv_device *device) +{ + return is_mlx5_dev(device); +} + +struct ibv_context * +mlx5dv_open_device(struct ibv_device *device, struct mlx5dv_context_attr *attr) +{ + if (!is_mlx5_dev(device)) { + errno = EOPNOTSUPP; + return NULL; + } + + return verbs_open_device(device, attr); +} + +static struct verbs_context *mlx5_alloc_context(struct ibv_device *ibdev, + int cmd_fd, + void *private_data) +{ + struct mlx5_context *context; + struct mlx5_alloc_ucontext req; + struct mlx5_alloc_ucontext_resp resp; + int i; + int page_size; + int tot_uuars; + int low_lat_uuars; + int gross_uuars; + int j; + struct mlx5_device *mdev = to_mdev(ibdev); + struct verbs_context *v_ctx; + struct ibv_port_attr port_attr; + struct ibv_device_attr_ex device_attr; + int k; + int bfi; + int num_sys_page_map; + struct 
mlx5dv_context_attr *ctx_attr = private_data; + bool always_devx = false; + + if (ctx_attr && ctx_attr->comp_mask) { + errno = EINVAL; + return NULL; + } + + context = verbs_init_and_alloc_context(ibdev, cmd_fd, context, ibv_ctx, + RDMA_DRIVER_MLX5); + if (!context) + return NULL; + + v_ctx = &context->ibv_ctx; + page_size = mdev->page_size; + mlx5_single_threaded = single_threaded_app(); + + open_debug_file(context); + set_debug_mask(); + set_freeze_on_error(); + if (gethostname(context->hostname, sizeof(context->hostname))) + strcpy(context->hostname, "host_unknown"); + + tot_uuars = get_total_uuars(page_size); + if (tot_uuars < 0) { + errno = -tot_uuars; + goto err_free; + } + + low_lat_uuars = get_num_low_lat_uuars(tot_uuars); + if (low_lat_uuars < 0) { + errno = -low_lat_uuars; + goto err_free; + } + + if (low_lat_uuars > tot_uuars - 1) { + errno = ENOMEM; + goto err_free; + } + + memset(&req, 0, sizeof(req)); + memset(&resp, 0, sizeof(resp)); + + req.total_num_bfregs = tot_uuars; + req.num_low_latency_bfregs = low_lat_uuars; + req.max_cqe_version = MLX5_CQE_VERSION_V1; + req.lib_caps |= (MLX5_LIB_CAP_4K_UAR | MLX5_LIB_CAP_DYN_UAR); + if (ctx_attr && ctx_attr->flags) { + + if (!check_comp_mask(ctx_attr->flags, + MLX5DV_CONTEXT_FLAGS_DEVX)) { + errno = EINVAL; + goto err_free; + } + + req.flags = MLX5_IB_ALLOC_UCTX_DEVX; + } else { + req.flags = MLX5_IB_ALLOC_UCTX_DEVX; + always_devx = true; + } + +retry_open: + if (mlx5_cmd_get_context(context, &req, sizeof(req), &resp, + sizeof(resp))) { + if (always_devx) { + req.flags &= ~MLX5_IB_ALLOC_UCTX_DEVX; + always_devx = false; + memset(&resp, 0, sizeof(resp)); + goto retry_open; + } else { + goto err_free; + } + } + + context->max_num_qps = resp.qp_tab_size; + context->bf_reg_size = resp.bf_reg_size; + context->tot_uuars = resp.tot_bfregs; + context->low_lat_uuars = low_lat_uuars; + context->cache_line_size = resp.cache_line_size; + context->max_sq_desc_sz = resp.max_sq_desc_sz; + context->max_rq_desc_sz = resp.max_rq_desc_sz; + context->max_send_wqebb = resp.max_send_wqebb; + context->num_ports = resp.num_ports; + context->max_recv_wr = resp.max_recv_wr; + context->max_srq_recv_wr = resp.max_srq_recv_wr; + context->num_dyn_bfregs = resp.num_dyn_bfregs; + + if (resp.comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_DUMP_FILL_MKEY) { + context->dump_fill_mkey = resp.dump_fill_mkey; + /* Have the BE value ready to be used in data path */ + context->dump_fill_mkey_be = htobe32(resp.dump_fill_mkey); + } else { + /* kernel driver will never return MLX5_INVALID_LKEY for + * dump_fill_mkey + */ + context->dump_fill_mkey = MLX5_INVALID_LKEY; + context->dump_fill_mkey_be = htobe32(MLX5_INVALID_LKEY); + } + + context->cqe_version = resp.cqe_version; + + adjust_uar_info(mdev, context, resp); + + context->cmds_supp_uhw = resp.cmds_supp_uhw; + context->vendor_cap_flags = 0; + list_head_init(&context->dyn_uar_bf_list); + list_head_init(&context->dyn_uar_nc_list); + list_head_init(&context->dyn_uar_qp_shared_list); + list_head_init(&context->dyn_uar_qp_dedicated_list); + + if (resp.eth_min_inline) + context->eth_min_inline_size = (resp.eth_min_inline == MLX5_USER_INLINE_MODE_NONE) ? 
+ 0 : MLX5_ETH_L2_INLINE_HEADER_SIZE; + else + context->eth_min_inline_size = MLX5_ETH_L2_INLINE_HEADER_SIZE; + + pthread_mutex_init(&context->qp_table_mutex, NULL); + pthread_mutex_init(&context->srq_table_mutex, NULL); + pthread_mutex_init(&context->uidx_table_mutex, NULL); + pthread_mutex_init(&context->dyn_bfregs_mutex, NULL); + for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i) + context->qp_table[i].refcnt = 0; + + for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i) + context->uidx_table[i].refcnt = 0; + + context->db_list = NULL; + + pthread_mutex_init(&context->db_list_mutex, NULL); + + context->prefer_bf = get_always_bf(); + context->shut_up_bf = get_shut_up_bf(); + + if (context->tot_uuars) { + gross_uuars = context->tot_uuars / MLX5_NUM_NON_FP_BFREGS_PER_UAR * NUM_BFREGS_PER_UAR; + context->bfs = calloc(gross_uuars, sizeof(*context->bfs)); + if (!context->bfs) { + errno = ENOMEM; + goto err_free; + } + context->flags |= MLX5_CTX_FLAGS_NO_KERN_DYN_UAR; + } else { + context->qp_max_dedicated_uuars = low_lat_uuars; + context->qp_max_shared_uuars = tot_uuars - low_lat_uuars; + goto bf_done; + } + + context->max_num_legacy_dyn_uar_sys_page = context->num_dyn_bfregs / + (context->num_uars_per_page * MLX5_NUM_NON_FP_BFREGS_PER_UAR); + num_sys_page_map = context->tot_uuars / (context->num_uars_per_page * MLX5_NUM_NON_FP_BFREGS_PER_UAR); + for (i = 0; i < num_sys_page_map; ++i) { + if (mlx5_mmap(&context->uar[i], i, cmd_fd, page_size, + context->shut_up_bf ? MLX5_UAR_TYPE_NC : + MLX5_UAR_TYPE_REGULAR) == MAP_FAILED) { + context->uar[i].reg = NULL; + goto err_free_bf; + } + } + + for (i = 0; i < num_sys_page_map; i++) { + for (j = 0; j < context->num_uars_per_page; j++) { + for (k = 0; k < NUM_BFREGS_PER_UAR; k++) { + bfi = (i * context->num_uars_per_page + j) * NUM_BFREGS_PER_UAR + k; + context->bfs[bfi].reg = context->uar[i].reg + MLX5_ADAPTER_PAGE_SIZE * j + + MLX5_BF_OFFSET + k * context->bf_reg_size; + context->bfs[bfi].need_lock = need_uuar_lock(context, bfi); + mlx5_spinlock_init(&context->bfs[bfi].lock, context->bfs[bfi].need_lock); + context->bfs[bfi].offset = 0; + if (bfi) + context->bfs[bfi].buf_size = context->bf_reg_size / 2; + context->bfs[bfi].uuarn = bfi; + context->bfs[bfi].uar_mmap_offset = get_uar_mmap_offset(i, + page_size, + uar_type_to_cmd(context->uar[i].type)); + } + } + } + +bf_done: + + context->hca_core_clock = NULL; + if (resp.response_length + sizeof(resp.ibv_resp) >= + offsetof(struct mlx5_alloc_ucontext_resp, hca_core_clock_offset) + + sizeof(resp.hca_core_clock_offset) && + resp.comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET) { + context->core_clock.offset = resp.hca_core_clock_offset; + mlx5_map_internal_clock(mdev, &v_ctx->context); + } + + context->clock_info_page = NULL; + if (resp.response_length + sizeof(resp.ibv_resp) >= + offsetof(struct mlx5_alloc_ucontext_resp, clock_info_versions) + + sizeof(resp.clock_info_versions) && + (resp.clock_info_versions & (1 << MLX5_IB_CLOCK_INFO_V1))) { + mlx5_map_clock_info(mdev, &v_ctx->context); + } + + context->flow_action_flags = resp.flow_action_flags; + + mlx5_read_env(ibdev, context); + + mlx5_spinlock_init(&context->hugetlb_lock, !mlx5_single_threaded); + list_head_init(&context->hugetlb_list); + + verbs_set_ops(v_ctx, &mlx5_ctx_common_ops); + if (context->cqe_version) { + if (context->cqe_version == MLX5_CQE_VERSION_V1) + verbs_set_ops(v_ctx, &mlx5_ctx_cqev1_ops); + else + goto err_free; + } + + memset(&device_attr, 0, sizeof(device_attr)); + if (!mlx5_query_device_ex(&v_ctx->context, NULL, &device_attr, + 
sizeof(struct ibv_device_attr_ex))) { + context->cached_device_cap_flags = + device_attr.orig_attr.device_cap_flags; + context->atomic_cap = device_attr.orig_attr.atomic_cap; + context->cached_tso_caps = device_attr.tso_caps; + context->max_dm_size = device_attr.max_dm_size; + } + + for (j = 0; j < min(MLX5_MAX_PORTS_NUM, context->num_ports); ++j) { + memset(&port_attr, 0, sizeof(port_attr)); + if (!mlx5_query_port(&v_ctx->context, j + 1, &port_attr)) { + context->cached_link_layer[j] = port_attr.link_layer; + context->cached_port_flags[j] = port_attr.flags; + } + } + + context->cq_uar = mlx5_attach_dedicated_uar(&v_ctx->context, + MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC); + context->cq_uar_reg = context->cq_uar ? context->cq_uar->uar : context->uar[0].reg; + + return v_ctx; + +err_free_bf: + free(context->bfs); + +err_free: + for (i = 0; i < MLX5_MAX_UARS; ++i) { + if (context->uar[i].reg) + munmap(context->uar[i].reg, page_size); + } + close_debug_file(context); + + verbs_uninit_context(&context->ibv_ctx); + free(context); + return NULL; +} + +static void mlx5_free_context(struct ibv_context *ibctx) +{ + struct mlx5_context *context = to_mctx(ibctx); + int page_size = to_mdev(ibctx->device)->page_size; + int i; + + free(context->bfs); + for (i = 0; i < MLX5_MAX_UARS; ++i) { + if (context->uar[i].reg) + munmap(context->uar[i].reg, page_size); + } + if (context->hca_core_clock) + munmap(context->hca_core_clock - context->core_clock.offset, + page_size); + if (context->clock_info_page) + munmap((void *)context->clock_info_page, page_size); + close_debug_file(context); + clean_dyn_uars(ibctx); + + verbs_uninit_context(&context->ibv_ctx); + free(context); +} + +static void mlx5_uninit_device(struct verbs_device *verbs_device) +{ + struct mlx5_device *dev = to_mdev(&verbs_device->device); + + free(dev); +} + +static struct verbs_device *mlx5_device_alloc(struct verbs_sysfs_dev *sysfs_dev) +{ + struct mlx5_device *dev; + + dev = calloc(1, sizeof *dev); + if (!dev) + return NULL; + + dev->page_size = sysconf(_SC_PAGESIZE); + dev->driver_abi_ver = sysfs_dev->abi_ver; + + return &dev->verbs_dev; +} + +static const struct verbs_device_ops mlx5_dev_ops = { + .name = "mlx5", + .match_min_abi_version = MLX5_UVERBS_MIN_ABI_VERSION, + .match_max_abi_version = MLX5_UVERBS_MAX_ABI_VERSION, + .match_table = hca_table, + .alloc_device = mlx5_device_alloc, + .uninit_device = mlx5_uninit_device, + .alloc_context = mlx5_alloc_context, +}; + +bool is_mlx5_dev(struct ibv_device *device) +{ + struct verbs_device *verbs_device = verbs_get_device(device); + + return verbs_device->ops == &mlx5_dev_ops; +} +PROVIDER_DRIVER(mlx5, mlx5_dev_ops); diff --git a/providers/mlx5/mlx5.h b/providers/mlx5/mlx5.h new file mode 100644 index 0000000..54a9e1c --- /dev/null +++ b/providers/mlx5/mlx5.h @@ -0,0 +1,1145 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_H +#define MLX5_H + +#include <stddef.h> +#include <stdio.h> +#include <stdatomic.h> +#include <util/compiler.h> + +#include <infiniband/driver.h> +#include <util/udma_barrier.h> +#include <util/util.h> +#include "mlx5-abi.h" +#include <ccan/list.h> +#include "bitmap.h" +#include <ccan/minmax.h> +#include "mlx5dv.h" + +#include <valgrind/memcheck.h> + +#define PFX "mlx5: " + +typedef _Atomic(uint32_t) atomic_uint32_t; + +enum { + MLX5_IB_MMAP_CMD_SHIFT = 8, + MLX5_IB_MMAP_CMD_MASK = 0xff, +}; + +enum { + MLX5_CQE_VERSION_V0 = 0, + MLX5_CQE_VERSION_V1 = 1, +}; + +enum { + MLX5_ADAPTER_PAGE_SIZE = 4096, +}; + +#define MLX5_CQ_PREFIX "MLX_CQ" +#define MLX5_QP_PREFIX "MLX_QP" +#define MLX5_MR_PREFIX "MLX_MR" +#define MLX5_RWQ_PREFIX "MLX_RWQ" +#define MLX5_SRQ_PREFIX "MLX_SRQ" +#define MLX5_MAX_LOG2_CONTIG_BLOCK_SIZE 23 +#define MLX5_MIN_LOG2_CONTIG_BLOCK_SIZE 12 + +enum { + MLX5_DBG_QP = 1 << 0, + MLX5_DBG_CQ = 1 << 1, + MLX5_DBG_QP_SEND = 1 << 2, + MLX5_DBG_QP_SEND_ERR = 1 << 3, + MLX5_DBG_CQ_CQE = 1 << 4, + MLX5_DBG_CONTIG = 1 << 5, + MLX5_DBG_DR = 1 << 6, +}; + +extern uint32_t mlx5_debug_mask; +extern int mlx5_freeze_on_error_cqe; + +#ifdef MLX5_DEBUG +#define mlx5_dbg(fp, mask, format, arg...) \ +do { \ + if (mask & mlx5_debug_mask) { \ + int tmp = errno; \ + fprintf(fp, "%s:%d: " format, __func__, __LINE__, ##arg); \ + errno = tmp; \ + } \ +} while (0) + +#else +static inline void mlx5_dbg(FILE *fp, uint32_t mask, const char *fmt, ...) + __attribute__((format(printf, 3, 4))); +static inline void mlx5_dbg(FILE *fp, uint32_t mask, const char *fmt, ...) +{ +} +#endif + +enum { + MLX5_STAT_RATE_OFFSET = 5 +}; + +enum { + MLX5_QP_TABLE_SHIFT = 12, + MLX5_QP_TABLE_MASK = (1 << MLX5_QP_TABLE_SHIFT) - 1, + MLX5_QP_TABLE_SIZE = 1 << (24 - MLX5_QP_TABLE_SHIFT), +}; + +enum { + MLX5_UIDX_TABLE_SHIFT = 12, + MLX5_UIDX_TABLE_MASK = (1 << MLX5_UIDX_TABLE_SHIFT) - 1, + MLX5_UIDX_TABLE_SIZE = 1 << (24 - MLX5_UIDX_TABLE_SHIFT), +}; + +enum { + MLX5_SRQ_TABLE_SHIFT = 12, + MLX5_SRQ_TABLE_MASK = (1 << MLX5_SRQ_TABLE_SHIFT) - 1, + MLX5_SRQ_TABLE_SIZE = 1 << (24 - MLX5_SRQ_TABLE_SHIFT), +}; + +enum { + MLX5_BF_OFFSET = 0x800 +}; + +enum { + MLX5_TM_OPCODE_NOP = 0x00, + MLX5_TM_OPCODE_APPEND = 0x01, + MLX5_TM_OPCODE_REMOVE = 0x02, +}; + +enum { + MLX5_RECV_OPCODE_RDMA_WRITE_IMM = 0x00, + MLX5_RECV_OPCODE_SEND = 0x01, + MLX5_RECV_OPCODE_SEND_IMM = 0x02, + MLX5_RECV_OPCODE_SEND_INVAL = 0x03, + + MLX5_CQE_OPCODE_ERROR = 0x1e, + MLX5_CQE_OPCODE_RESIZE = 0x16, +}; + +enum { + MLX5_SRQ_FLAG_TM_SW_CNT = (1 << 6), + MLX5_SRQ_FLAG_TM_CQE_REQ = (1 << 7), +}; + +enum { + MLX5_MAX_PORTS_NUM = 2, +}; + +enum { + MLX5_CSUM_SUPPORT_RAW_OVER_ETH = (1 << 0), + MLX5_CSUM_SUPPORT_UNDERLAY_UD = (1 << 1), + /* + * Only report rx checksum when the validation + * is valid. 
+ */ + MLX5_RX_CSUM_VALID = (1 << 16), +}; + +enum mlx5_alloc_type { + MLX5_ALLOC_TYPE_ANON, + MLX5_ALLOC_TYPE_HUGE, + MLX5_ALLOC_TYPE_CONTIG, + MLX5_ALLOC_TYPE_PREFER_HUGE, + MLX5_ALLOC_TYPE_PREFER_CONTIG, + MLX5_ALLOC_TYPE_EXTERNAL, + MLX5_ALLOC_TYPE_CUSTOM, + MLX5_ALLOC_TYPE_ALL +}; + +enum mlx5_rsc_type { + MLX5_RSC_TYPE_QP, + MLX5_RSC_TYPE_XSRQ, + MLX5_RSC_TYPE_SRQ, + MLX5_RSC_TYPE_RWQ, + MLX5_RSC_TYPE_INVAL, +}; + +enum mlx5_vendor_cap_flags { + MLX5_VENDOR_CAP_FLAGS_MPW = 1 << 0, /* Obsoleted */ + MLX5_VENDOR_CAP_FLAGS_MPW_ALLOWED = 1 << 1, + MLX5_VENDOR_CAP_FLAGS_ENHANCED_MPW = 1 << 2, + MLX5_VENDOR_CAP_FLAGS_CQE_128B_COMP = 1 << 3, + MLX5_VENDOR_CAP_FLAGS_CQE_128B_PAD = 1 << 4, + MLX5_VENDOR_CAP_FLAGS_PACKET_BASED_CREDIT_MODE = 1 << 5, + MLX5_VENDOR_CAP_FLAGS_SCAT2CQE_DCT = 1 << 6, +}; + +enum { + MLX5_FLOW_TAG_MASK = 0x00ffffff, +}; + +struct mlx5_resource { + enum mlx5_rsc_type type; + uint32_t rsn; +}; + +struct mlx5_device { + struct verbs_device verbs_dev; + int page_size; + int driver_abi_ver; +}; + +struct mlx5_db_page; + +struct mlx5_spinlock { + pthread_spinlock_t lock; + int in_use; + int need_lock; +}; + +enum mlx5_uar_type { + MLX5_UAR_TYPE_REGULAR, + MLX5_UAR_TYPE_NC, + MLX5_UAR_TYPE_REGULAR_DYN, +}; + +struct mlx5_uar_info { + void *reg; + enum mlx5_uar_type type; +}; + +enum mlx5_ctx_flags { + MLX5_CTX_FLAGS_FATAL_STATE = 1 << 0, + MLX5_CTX_FLAGS_NO_KERN_DYN_UAR = 1 << 1, +}; + +struct mlx5_context { + struct verbs_context ibv_ctx; + int max_num_qps; + int bf_reg_size; + int tot_uuars; + int low_lat_uuars; + int num_uars_per_page; + int bf_regs_per_page; + int num_bf_regs; + int prefer_bf; + int shut_up_bf; + struct { + struct mlx5_qp **table; + int refcnt; + } qp_table[MLX5_QP_TABLE_SIZE]; + pthread_mutex_t qp_table_mutex; + + struct { + struct mlx5_srq **table; + int refcnt; + } srq_table[MLX5_SRQ_TABLE_SIZE]; + pthread_mutex_t srq_table_mutex; + + struct { + struct mlx5_resource **table; + int refcnt; + } uidx_table[MLX5_UIDX_TABLE_SIZE]; + pthread_mutex_t uidx_table_mutex; + + struct mlx5_uar_info uar[MLX5_MAX_UARS]; + struct mlx5_db_page *db_list; + pthread_mutex_t db_list_mutex; + int cache_line_size; + int max_sq_desc_sz; + int max_rq_desc_sz; + int max_send_wqebb; + int max_recv_wr; + unsigned max_srq_recv_wr; + int num_ports; + int stall_enable; + int stall_adaptive_enable; + int stall_cycles; + struct mlx5_bf *bfs; + FILE *dbg_fp; + char hostname[40]; + struct mlx5_spinlock hugetlb_lock; + struct list_head hugetlb_list; + int cqe_version; + uint8_t cached_link_layer[MLX5_MAX_PORTS_NUM]; + uint8_t cached_port_flags[MLX5_MAX_PORTS_NUM]; + unsigned int cached_device_cap_flags; + enum ibv_atomic_cap atomic_cap; + struct { + uint64_t offset; + uint64_t mask; + } core_clock; + void *hca_core_clock; + const struct mlx5_ib_clock_info *clock_info_page; + struct ibv_tso_caps cached_tso_caps; + int cmds_supp_uhw; + uint32_t uar_size; + uint64_t vendor_cap_flags; /* Use enum mlx5_vendor_cap_flags */ + struct mlx5dv_cqe_comp_caps cqe_comp_caps; + struct mlx5dv_ctx_allocators extern_alloc; + struct mlx5dv_sw_parsing_caps sw_parsing_caps; + struct mlx5dv_striding_rq_caps striding_rq_caps; + uint32_t tunnel_offloads_caps; + struct mlx5_packet_pacing_caps packet_pacing_caps; + pthread_mutex_t dyn_bfregs_mutex; /* protects the dynamic bfregs allocation */ + uint32_t num_dyn_bfregs; + uint32_t max_num_legacy_dyn_uar_sys_page; + uint32_t curr_legacy_dyn_sys_uar_page; + uint16_t flow_action_flags; + uint64_t max_dm_size; + uint32_t eth_min_inline_size; + uint32_t 
dump_fill_mkey; + __be32 dump_fill_mkey_be; + uint32_t flags; + struct list_head dyn_uar_bf_list; + struct list_head dyn_uar_nc_list; + struct list_head dyn_uar_qp_shared_list; + struct list_head dyn_uar_qp_dedicated_list; + uint16_t qp_max_dedicated_uuars; + uint16_t qp_alloc_dedicated_uuars; + uint16_t qp_max_shared_uuars; + uint16_t qp_alloc_shared_uuars; + struct mlx5_bf *cq_uar; + void *cq_uar_reg; +}; + +struct mlx5_bitmap { + uint32_t last; + uint32_t top; + uint32_t max; + uint32_t avail; + uint32_t mask; + unsigned long *table; +}; + +struct mlx5_hugetlb_mem { + int shmid; + void *shmaddr; + struct mlx5_bitmap bitmap; + struct list_node entry; +}; + +struct mlx5_buf { + void *buf; + size_t length; + int base; + struct mlx5_hugetlb_mem *hmem; + enum mlx5_alloc_type type; + uint64_t resource_type; + size_t req_alignment; + struct mlx5_parent_domain *mparent_domain; +}; + +struct mlx5_td { + struct ibv_td ibv_td; + struct mlx5_bf *bf; + atomic_int refcount; +}; + +struct mlx5_pd { + struct ibv_pd ibv_pd; + uint32_t pdn; + atomic_int refcount; + struct mlx5_pd *mprotection_domain; +}; + +struct mlx5_parent_domain { + struct mlx5_pd mpd; + struct mlx5_td *mtd; + void *(*alloc)(struct ibv_pd *pd, void *pd_context, size_t size, + size_t alignment, uint64_t resource_type); + void (*free)(struct ibv_pd *pd, void *pd_context, void *ptr, + uint64_t resource_type); + void *pd_context; +}; + +enum { + MLX5_CQ_SET_CI = 0, + MLX5_CQ_ARM_DB = 1, +}; + +enum { + MLX5_CQ_FLAGS_RX_CSUM_VALID = 1 << 0, + MLX5_CQ_FLAGS_EMPTY_DURING_POLL = 1 << 1, + MLX5_CQ_FLAGS_FOUND_CQES = 1 << 2, + MLX5_CQ_FLAGS_EXTENDED = 1 << 3, + MLX5_CQ_FLAGS_SINGLE_THREADED = 1 << 4, + MLX5_CQ_FLAGS_DV_OWNED = 1 << 5, + MLX5_CQ_FLAGS_TM_SYNC_REQ = 1 << 6, +}; + +struct mlx5_cq { + /* ibv_cq should always be subset of ibv_cq_ex */ + struct ibv_cq_ex ibv_cq; + struct mlx5_buf buf_a; + struct mlx5_buf buf_b; + struct mlx5_buf *active_buf; + struct mlx5_buf *resize_buf; + int resize_cqes; + int active_cqes; + struct mlx5_spinlock lock; + uint32_t cqn; + uint32_t cons_index; + __be32 *dbrec; + bool custom_db; + int arm_sn; + int cqe_sz; + int resize_cqe_sz; + int stall_next_poll; + int stall_enable; + uint64_t stall_last_count; + int stall_adaptive_enable; + int stall_cycles; + struct mlx5_resource *cur_rsc; + struct mlx5_srq *cur_srq; + struct mlx5_cqe64 *cqe64; + uint32_t flags; + int umr_opcode; + struct mlx5dv_clock_info last_clock_info; + struct ibv_pd *parent_domain; +}; + +struct mlx5_tag_entry { + struct mlx5_tag_entry *next; + uint64_t wr_id; + int phase_cnt; + void *ptr; + uint32_t size; + int8_t expect_cqe; +}; + +struct mlx5_srq_op { + struct mlx5_tag_entry *tag; + uint64_t wr_id; + /* we need to advance tail pointer */ + uint32_t wqe_head; +}; + +struct mlx5_srq { + struct mlx5_resource rsc; /* This struct must be first */ + struct verbs_srq vsrq; + struct mlx5_buf buf; + struct mlx5_spinlock lock; + uint64_t *wrid; + uint32_t srqn; + int max; + int max_gs; + int wqe_shift; + int head; + int tail; + int waitq_head; + int waitq_tail; + __be32 *db; + bool custom_db; + uint16_t counter; + int wq_sig; + struct ibv_qp *cmd_qp; + struct mlx5_tag_entry *tm_list; /* vector of all tags */ + struct mlx5_tag_entry *tm_head; /* queue of free tags */ + struct mlx5_tag_entry *tm_tail; + struct mlx5_srq_op *op; + int op_head; + int op_tail; + int unexp_in; + int unexp_out; +}; + + +static inline void mlx5_tm_release_tag(struct mlx5_srq *srq, + struct mlx5_tag_entry *tag) +{ + if (!--tag->expect_cqe) { + tag->next = NULL; + 
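+		/*
+		 * No further completions are expected for this tag, so
+		 * append it back to the free-tag queue tracked by the
+		 * tm_head/tm_tail pointers of struct mlx5_srq above.
+		 */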
srq->tm_tail->next = tag; + srq->tm_tail = tag; + } +} + +struct wr_list { + uint16_t opcode; + uint16_t next; +}; + +struct mlx5_wq { + uint64_t *wrid; + unsigned *wqe_head; + struct mlx5_spinlock lock; + unsigned wqe_cnt; + unsigned max_post; + unsigned head; + unsigned tail; + unsigned cur_post; + int max_gs; + int wqe_shift; + int offset; + void *qend; + uint32_t *wr_data; +}; + +struct mlx5_devx_uar { + struct mlx5dv_devx_uar dv_devx_uar; + struct ibv_context *context; +}; + +struct mlx5_bf { + void *reg; + int need_lock; + struct mlx5_spinlock lock; + unsigned offset; + unsigned buf_size; + unsigned uuarn; + off_t uar_mmap_offset; + /* The virtual address of the mmaped uar, applicable for the dynamic use case */ + void *uar; + /* Index in the dynamic bfregs portion */ + uint32_t bfreg_dyn_index; + struct mlx5_devx_uar devx_uar; + uint8_t dyn_alloc_uar : 1; + uint8_t mmaped_entry : 1; + uint8_t nc_mode : 1; + uint8_t qp_dedicated : 1; + uint8_t qp_shared : 1; + uint32_t count; + struct list_node uar_entry; + uint32_t uar_handle; + uint32_t length; + uint32_t page_id; +}; + +struct mlx5_dm { + struct verbs_dm verbs_dm; + size_t length; + void *mmap_va; + void *start_va; + uint64_t remote_va; +}; + +struct mlx5_mr { + struct verbs_mr vmr; + struct mlx5_buf buf; + uint32_t alloc_flags; +}; + +enum mlx5_qp_flags { + MLX5_QP_FLAGS_USE_UNDERLAY = 0x01, +}; + +struct mlx5_qp { + struct mlx5_resource rsc; /* This struct must be first */ + struct verbs_qp verbs_qp; + struct mlx5dv_qp_ex dv_qp; + struct ibv_qp *ibv_qp; + struct mlx5_buf buf; + int max_inline_data; + int buf_size; + /* For Raw Packet QP, use different buffers for the SQ and RQ */ + struct mlx5_buf sq_buf; + int sq_buf_size; + struct mlx5_bf *bf; + + /* Start of new post send API specific fields */ + bool inl_wqe; + uint8_t cur_setters_cnt; + uint8_t fm_cache_rb; + int err; + int nreq; + uint32_t cur_size; + uint32_t cur_post_rb; + void *cur_eth; + void *cur_data; + struct mlx5_wqe_ctrl_seg *cur_ctrl; + /* End of new post send API specific fields */ + + uint8_t fm_cache; + uint8_t sq_signal_bits; + void *sq_start; + struct mlx5_wq sq; + + __be32 *db; + bool custom_db; + struct mlx5_wq rq; + int wq_sig; + uint32_t qp_cap_cache; + int atomics_enabled; + uint32_t max_tso; + uint16_t max_tso_header; + int rss_qp; + uint32_t flags; /* Use enum mlx5_qp_flags */ + enum mlx5dv_dc_type dc_type; + uint32_t tirn; + uint32_t tisn; + uint32_t rqn; + uint32_t sqn; + uint64_t tir_icm_addr; +}; + +struct mlx5_ah { + struct ibv_ah ibv_ah; + struct mlx5_wqe_av av; + bool kern_ah; +}; + +struct mlx5_rwq { + struct mlx5_resource rsc; + struct ibv_wq wq; + struct mlx5_buf buf; + int buf_size; + struct mlx5_wq rq; + __be32 *db; + bool custom_db; + void *pbuff; + __be32 *recv_db; + int wq_sig; +}; + +struct mlx5_counter_node { + uint32_t index; + struct list_node entry; + enum ibv_counter_description desc; +}; + +struct mlx5_counters { + struct verbs_counters vcounters; + struct list_head counters_list; + pthread_mutex_t lock; + uint32_t ncounters; + /* number of bounded objects */ + int refcount; +}; + +struct mlx5_flow { + struct ibv_flow flow_id; + struct mlx5_counters *mcounters; +}; + +struct mlx5dv_flow_matcher { + struct ibv_context *context; + uint32_t handle; +}; + +enum mlx5_devx_obj_type { + MLX5_DEVX_FLOW_TABLE = 1, + MLX5_DEVX_FLOW_COUNTER = 2, + MLX5_DEVX_FLOW_METER = 3, + MLX5_DEVX_QP = 4, + MLX5_DEVX_PKT_REFORMAT_CTX = 5, +}; + +struct mlx5dv_devx_obj { + struct ibv_context *context; + uint32_t handle; + enum mlx5_devx_obj_type type; + 
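+	/* object id reported by the device; distinct from the uverbs handle above */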
uint32_t object_id; +}; + +struct mlx5_var_obj { + struct mlx5dv_var dv_var; + struct ibv_context *context; + uint32_t handle; +}; + +struct mlx5_pp_obj { + struct mlx5dv_pp dv_pp; + struct ibv_context *context; + uint32_t handle; +}; + +struct mlx5_devx_umem { + struct mlx5dv_devx_umem dv_devx_umem; + struct ibv_context *context; + uint32_t handle; +}; + +struct mlx5_mkey { + struct mlx5dv_mkey dv_mkey; + struct mlx5dv_devx_obj *devx_obj; + uint16_t num_desc; +}; + +struct mlx5_devx_event_channel { + struct ibv_context *context; + struct mlx5dv_devx_event_channel dv_event_channel; +}; + +enum mlx5_flow_action_type { + MLX5_FLOW_ACTION_COUNTER_OFFSET = 1, +}; + +struct mlx5_flow_action_attr_aux { + enum mlx5_flow_action_type type; + uint32_t offset; +}; + +struct ibv_flow * +__mlx5dv_create_flow(struct mlx5dv_flow_matcher *flow_matcher, + struct mlx5dv_flow_match_parameters *match_value, + size_t num_actions, + struct mlx5dv_flow_action_attr actions_attr[], + struct mlx5_flow_action_attr_aux actions_attr_aux[]); + +extern int mlx5_stall_num_loop; +extern int mlx5_stall_cq_poll_min; +extern int mlx5_stall_cq_poll_max; +extern int mlx5_stall_cq_inc_step; +extern int mlx5_stall_cq_dec_step; +extern int mlx5_single_threaded; + +#define to_mxxx(xxx, type) container_of(ib##xxx, struct mlx5_##type, ibv_##xxx) + +static inline struct mlx5_device *to_mdev(struct ibv_device *ibdev) +{ + return container_of(ibdev, struct mlx5_device, verbs_dev.device); +} + +static inline struct mlx5_context *to_mctx(struct ibv_context *ibctx) +{ + return container_of(ibctx, struct mlx5_context, ibv_ctx.context); +} + +/* to_mpd always returns the real mlx5_pd object ie the protection domain. */ +static inline struct mlx5_pd *to_mpd(struct ibv_pd *ibpd) +{ + struct mlx5_pd *mpd = to_mxxx(pd, pd); + + if (mpd->mprotection_domain) + return mpd->mprotection_domain; + + return mpd; +} + +static inline struct mlx5_parent_domain *to_mparent_domain(struct ibv_pd *ibpd) +{ + struct mlx5_parent_domain *mparent_domain = + ibpd ? container_of(ibpd, struct mlx5_parent_domain, mpd.ibv_pd) : NULL; + + if (mparent_domain && mparent_domain->mpd.mprotection_domain) + return mparent_domain; + + /* Otherwise ibpd isn't a parent_domain */ + return NULL; +} + +static inline struct mlx5_cq *to_mcq(struct ibv_cq *ibcq) +{ + return container_of((struct ibv_cq_ex *)ibcq, struct mlx5_cq, ibv_cq); +} + +static inline struct mlx5_srq *to_msrq(struct ibv_srq *ibsrq) +{ + struct verbs_srq *vsrq = (struct verbs_srq *)ibsrq; + + return container_of(vsrq, struct mlx5_srq, vsrq); +} + +static inline struct mlx5_td *to_mtd(struct ibv_td *ibtd) +{ + return to_mxxx(td, td); +} + +static inline struct mlx5_qp *to_mqp(struct ibv_qp *ibqp) +{ + struct verbs_qp *vqp = (struct verbs_qp *)ibqp; + + return container_of(vqp, struct mlx5_qp, verbs_qp); +} + +static inline struct mlx5_qp *mqp_from_mlx5dv_qp_ex(struct mlx5dv_qp_ex *dv_qp) +{ + return container_of(dv_qp, struct mlx5_qp, dv_qp); +} + +static inline struct mlx5_rwq *to_mrwq(struct ibv_wq *ibwq) +{ + return container_of(ibwq, struct mlx5_rwq, wq); +} + +static inline struct mlx5_dm *to_mdm(struct ibv_dm *ibdm) +{ + return container_of(ibdm, struct mlx5_dm, verbs_dm.dm); +} + +static inline struct mlx5_mr *to_mmr(struct ibv_mr *ibmr) +{ + return container_of(ibmr, struct mlx5_mr, vmr.ibv_mr); +} + +static inline struct mlx5_ah *to_mah(struct ibv_ah *ibah) +{ + return to_mxxx(ah, ah); +} + +static inline int max_int(int a, int b) +{ + return a > b ? 
a : b; +} + +static inline struct mlx5_qp *rsc_to_mqp(struct mlx5_resource *rsc) +{ + return (struct mlx5_qp *)rsc; +} + +static inline struct mlx5_srq *rsc_to_msrq(struct mlx5_resource *rsc) +{ + return (struct mlx5_srq *)rsc; +} + +static inline struct mlx5_rwq *rsc_to_mrwq(struct mlx5_resource *rsc) +{ + return (struct mlx5_rwq *)rsc; +} + +static inline struct mlx5_counters *to_mcounters(struct ibv_counters *ibcounters) +{ + return container_of(ibcounters, struct mlx5_counters, vcounters.counters); +} + +static inline struct mlx5_flow *to_mflow(struct ibv_flow *flow_id) +{ + return container_of(flow_id, struct mlx5_flow, flow_id); +} + +bool is_mlx5_dev(struct ibv_device *device); + +int mlx5_alloc_buf(struct mlx5_buf *buf, size_t size, int page_size); +void mlx5_free_buf(struct mlx5_buf *buf); +int mlx5_alloc_buf_contig(struct mlx5_context *mctx, struct mlx5_buf *buf, + size_t size, int page_size, const char *component); +void mlx5_free_buf_contig(struct mlx5_context *mctx, struct mlx5_buf *buf); +int mlx5_alloc_prefered_buf(struct mlx5_context *mctx, + struct mlx5_buf *buf, + size_t size, int page_size, + enum mlx5_alloc_type alloc_type, + const char *component); +int mlx5_free_actual_buf(struct mlx5_context *ctx, struct mlx5_buf *buf); +void mlx5_get_alloc_type(struct mlx5_context *context, + struct ibv_pd *pd, + const char *component, + enum mlx5_alloc_type *alloc_type, + enum mlx5_alloc_type default_alloc_type); +int mlx5_use_huge(const char *key); +bool mlx5_is_custom_alloc(struct ibv_pd *pd); +bool mlx5_is_extern_alloc(struct mlx5_context *context); +int mlx5_alloc_buf_extern(struct mlx5_context *ctx, struct mlx5_buf *buf, + size_t size); +void mlx5_free_buf_extern(struct mlx5_context *ctx, struct mlx5_buf *buf); + +__be32 *mlx5_alloc_dbrec(struct mlx5_context *context, struct ibv_pd *pd, + bool *custom_alloc); +void mlx5_free_db(struct mlx5_context *context, __be32 *db, struct ibv_pd *pd, + bool custom_alloc); + +int mlx5_query_device(struct ibv_context *context, + struct ibv_device_attr *attr); +int mlx5_query_device_ex(struct ibv_context *context, + const struct ibv_query_device_ex_input *input, + struct ibv_device_attr_ex *attr, + size_t attr_size); +int mlx5_query_rt_values(struct ibv_context *context, + struct ibv_values_ex *values); +struct ibv_qp *mlx5_create_qp_ex(struct ibv_context *context, + struct ibv_qp_init_attr_ex *attr); +int mlx5_query_port(struct ibv_context *context, uint8_t port, + struct ibv_port_attr *attr); + +struct ibv_pd *mlx5_alloc_pd(struct ibv_context *context); +int mlx5_free_pd(struct ibv_pd *pd); + +void mlx5_async_event(struct ibv_context *context, + struct ibv_async_event *event); + +struct ibv_mr *mlx5_alloc_null_mr(struct ibv_pd *pd); +struct ibv_mr *mlx5_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + uint64_t hca_va, int access); +int mlx5_rereg_mr(struct verbs_mr *mr, int flags, struct ibv_pd *pd, void *addr, + size_t length, int access); +int mlx5_dereg_mr(struct verbs_mr *mr); +struct ibv_mw *mlx5_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type); +int mlx5_dealloc_mw(struct ibv_mw *mw); +int mlx5_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw, + struct ibv_mw_bind *mw_bind); + +struct ibv_cq *mlx5_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector); +struct ibv_cq_ex *mlx5_create_cq_ex(struct ibv_context *context, + struct ibv_cq_init_attr_ex *cq_attr); +int mlx5_cq_fill_pfns(struct mlx5_cq *cq, + const struct ibv_cq_init_attr_ex *cq_attr, + struct mlx5_context *mctx); +int 
mlx5_alloc_cq_buf(struct mlx5_context *mctx, struct mlx5_cq *cq, + struct mlx5_buf *buf, int nent, int cqe_sz); +int mlx5_free_cq_buf(struct mlx5_context *ctx, struct mlx5_buf *buf); +int mlx5_resize_cq(struct ibv_cq *cq, int cqe); +int mlx5_modify_cq(struct ibv_cq *cq, struct ibv_modify_cq_attr *attr); +int mlx5_destroy_cq(struct ibv_cq *cq); +int mlx5_poll_cq(struct ibv_cq *cq, int ne, struct ibv_wc *wc); +int mlx5_poll_cq_v1(struct ibv_cq *cq, int ne, struct ibv_wc *wc); +int mlx5_arm_cq(struct ibv_cq *cq, int solicited); +void mlx5_cq_event(struct ibv_cq *cq); +void __mlx5_cq_clean(struct mlx5_cq *cq, uint32_t qpn, struct mlx5_srq *srq); +void mlx5_cq_clean(struct mlx5_cq *cq, uint32_t qpn, struct mlx5_srq *srq); +void mlx5_cq_resize_copy_cqes(struct mlx5_cq *cq); + +struct ibv_srq *mlx5_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *attr); +int mlx5_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr, + int mask); +int mlx5_query_srq(struct ibv_srq *srq, + struct ibv_srq_attr *attr); +int mlx5_destroy_srq(struct ibv_srq *srq); +int mlx5_alloc_srq_buf(struct ibv_context *context, struct mlx5_srq *srq, + uint32_t nwr, struct ibv_pd *pd); +void mlx5_complete_odp_fault(struct mlx5_srq *srq, int ind); +void mlx5_free_srq_wqe(struct mlx5_srq *srq, int ind); +int mlx5_post_srq_recv(struct ibv_srq *ibsrq, + struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); + +struct ibv_qp *mlx5_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr); +int mlx5_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr); +int mlx5_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask); +int mlx5_modify_qp_rate_limit(struct ibv_qp *qp, + struct ibv_qp_rate_limit_attr *attr); +int mlx5_destroy_qp(struct ibv_qp *qp); +void mlx5_init_qp_indices(struct mlx5_qp *qp); +void mlx5_init_rwq_indices(struct mlx5_rwq *rwq); +int mlx5_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr); +int mlx5_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); +int mlx5_post_wq_recv(struct ibv_wq *ibwq, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); +void mlx5_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type, + struct mlx5_qp *qp); +void mlx5_set_sq_sizes(struct mlx5_qp *qp, struct ibv_qp_cap *cap, + enum ibv_qp_type type); +struct mlx5_qp *mlx5_find_qp(struct mlx5_context *ctx, uint32_t qpn); +int mlx5_store_qp(struct mlx5_context *ctx, uint32_t qpn, struct mlx5_qp *qp); +void mlx5_clear_qp(struct mlx5_context *ctx, uint32_t qpn); +int32_t mlx5_store_uidx(struct mlx5_context *ctx, void *rsc); +void mlx5_clear_uidx(struct mlx5_context *ctx, uint32_t uidx); +struct mlx5_srq *mlx5_find_srq(struct mlx5_context *ctx, uint32_t srqn); +int mlx5_store_srq(struct mlx5_context *ctx, uint32_t srqn, + struct mlx5_srq *srq); +void mlx5_clear_srq(struct mlx5_context *ctx, uint32_t srqn); +struct ibv_ah *mlx5_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr); +int mlx5_destroy_ah(struct ibv_ah *ah); +int mlx5_alloc_av(struct mlx5_pd *pd, struct ibv_ah_attr *attr, + struct mlx5_ah *ah); +void mlx5_free_av(struct mlx5_ah *ah); +int mlx5_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); +int mlx5_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); +void *mlx5_get_atomic_laddr(struct mlx5_qp *qp, uint16_t idx, int *byte_count); +void *mlx5_get_send_wqe(struct mlx5_qp *qp, int n); +int mlx5_copy_to_recv_wqe(struct mlx5_qp 
*qp, int idx, void *buf, int size); +int mlx5_copy_to_send_wqe(struct mlx5_qp *qp, int idx, void *buf, int size); +int mlx5_copy_to_recv_srq(struct mlx5_srq *srq, int idx, void *buf, int size); +struct ibv_xrcd *mlx5_open_xrcd(struct ibv_context *context, + struct ibv_xrcd_init_attr *xrcd_init_attr); +int mlx5_get_srq_num(struct ibv_srq *srq, uint32_t *srq_num); +struct ibv_qp *mlx5_open_qp(struct ibv_context *context, + struct ibv_qp_open_attr *attr); +int mlx5_close_xrcd(struct ibv_xrcd *ib_xrcd); +struct ibv_wq *mlx5_create_wq(struct ibv_context *context, + struct ibv_wq_init_attr *attr); +int mlx5_modify_wq(struct ibv_wq *wq, struct ibv_wq_attr *attr); +int mlx5_destroy_wq(struct ibv_wq *wq); +struct ibv_rwq_ind_table *mlx5_create_rwq_ind_table(struct ibv_context *context, + struct ibv_rwq_ind_table_init_attr *init_attr); +int mlx5_destroy_rwq_ind_table(struct ibv_rwq_ind_table *rwq_ind_table); +struct ibv_flow *mlx5_create_flow(struct ibv_qp *qp, struct ibv_flow_attr *flow_attr); +int mlx5_destroy_flow(struct ibv_flow *flow_id); +struct ibv_srq *mlx5_create_srq_ex(struct ibv_context *context, + struct ibv_srq_init_attr_ex *attr); +int mlx5_post_srq_ops(struct ibv_srq *srq, + struct ibv_ops_wr *wr, + struct ibv_ops_wr **bad_wr); +struct ibv_flow_action *mlx5_create_flow_action_esp(struct ibv_context *ctx, + struct ibv_flow_action_esp_attr *attr); +int mlx5_destroy_flow_action(struct ibv_flow_action *action); +int mlx5_modify_flow_action_esp(struct ibv_flow_action *action, + struct ibv_flow_action_esp_attr *attr); + +struct ibv_dm *mlx5_alloc_dm(struct ibv_context *context, + struct ibv_alloc_dm_attr *dm_attr); +int mlx5_free_dm(struct ibv_dm *ibdm); +struct ibv_mr *mlx5_reg_dm_mr(struct ibv_pd *pd, struct ibv_dm *ibdm, + uint64_t dm_offset, size_t length, + unsigned int acc); + +struct ibv_td *mlx5_alloc_td(struct ibv_context *context, struct ibv_td_init_attr *init_attr); +int mlx5_dealloc_td(struct ibv_td *td); + +struct ibv_pd *mlx5_alloc_parent_domain(struct ibv_context *context, + struct ibv_parent_domain_init_attr *attr); + + +void *mlx5_mmap(struct mlx5_uar_info *uar, int index, + int cmd_fd, int page_size, int uar_type); +off_t get_uar_mmap_offset(int idx, int page_size, int command); + +struct ibv_counters *mlx5_create_counters(struct ibv_context *context, + struct ibv_counters_init_attr *init_attr); +int mlx5_destroy_counters(struct ibv_counters *counters); +int mlx5_attach_counters_point_flow(struct ibv_counters *counters, + struct ibv_counter_attach_attr *attr, + struct ibv_flow *flow); +int mlx5_read_counters(struct ibv_counters *counters, + uint64_t *counters_value, + uint32_t ncounters, + uint32_t flags); +int mlx5_advise_mr(struct ibv_pd *pd, + enum ibv_advise_mr_advice advice, + uint32_t flags, + struct ibv_sge *sg_list, + uint32_t num_sges); +int mlx5_qp_fill_wr_pfns(struct mlx5_qp *mqp, + const struct ibv_qp_init_attr_ex *attr, + const struct mlx5dv_qp_init_attr *mlx5_attr); +void clean_dyn_uars(struct ibv_context *context); +struct mlx5_bf *mlx5_attach_dedicated_uar(struct ibv_context *context, + uint32_t flags); + +static inline void *mlx5_find_uidx(struct mlx5_context *ctx, uint32_t uidx) +{ + int tind = uidx >> MLX5_UIDX_TABLE_SHIFT; + + if (likely(ctx->uidx_table[tind].refcnt)) + return ctx->uidx_table[tind].table[uidx & MLX5_UIDX_TABLE_MASK]; + + return NULL; +} + +static inline int mlx5_spin_lock(struct mlx5_spinlock *lock) +{ + if (lock->need_lock) + return pthread_spin_lock(&lock->lock); + + if (unlikely(lock->in_use)) { + fprintf(stderr, "*** ERROR: 
multithreading violation ***\n"
+			"You are running a multithreaded application but\n"
+			"you set MLX5_SINGLE_THREADED=1. Please unset it.\n");
+		abort();
+	} else {
+		lock->in_use = 1;
+		/*
+		 * This fence is not at all correct, but it increases the
+		 * chance that in_use is detected by another thread without
+		 * much runtime cost. */
+		atomic_thread_fence(memory_order_acq_rel);
+	}
+
+	return 0;
+}
+
+static inline int mlx5_spin_unlock(struct mlx5_spinlock *lock)
+{
+	if (lock->need_lock)
+		return pthread_spin_unlock(&lock->lock);
+
+	lock->in_use = 0;
+
+	return 0;
+}
+
+static inline int mlx5_spinlock_init(struct mlx5_spinlock *lock, int need_lock)
+{
+	lock->in_use = 0;
+	lock->need_lock = need_lock;
+	return pthread_spin_init(&lock->lock, PTHREAD_PROCESS_PRIVATE);
+}
+
+static inline int mlx5_spinlock_init_pd(struct mlx5_spinlock *lock, struct ibv_pd *pd)
+{
+	struct mlx5_parent_domain *mparent_domain;
+	int thread_safe;
+
+	mparent_domain = to_mparent_domain(pd);
+	if (mparent_domain && mparent_domain->mtd)
+		thread_safe = 1;
+	else
+		thread_safe = mlx5_single_threaded;
+
+	return mlx5_spinlock_init(lock, !thread_safe);
+}
+
+static inline int mlx5_spinlock_destroy(struct mlx5_spinlock *lock)
+{
+	return pthread_spin_destroy(&lock->lock);
+}
+
+/* Encode the mmap command in bits 8..15 of the mmap offset. */
+static inline void set_command(int command, off_t *offset)
+{
+	*offset |= (command << MLX5_IB_MMAP_CMD_SHIFT);
+}
+
+static inline void set_arg(int arg, off_t *offset)
+{
+	*offset |= arg;
+}
+
+static inline void set_order(int order, off_t *offset)
+{
+	set_arg(order, offset);
+}
+
+static inline void set_index(int index, off_t *offset)
+{
+	set_arg(index, offset);
+}
+
+/*
+ * Indexes wider than 8 bits do not fit below the command field, so the
+ * low byte stays in bits 0..7 and the remaining bits spill into bits
+ * 16 and up of the mmap offset.
+ */
+static inline void set_extended_index(int index, off_t *offset)
+{
+	*offset |= (index & 0xff) | ((index >> 8) << 16);
+}
+
+/* Inverted XOR checksum over the WQE bytes; see the wq_sig fields above. */
+static inline uint8_t calc_sig(void *wqe, int size)
+{
+	int i;
+	uint8_t *p = wqe;
+	uint8_t res = 0;
+
+	for (i = 0; i < size; ++i)
+		res ^= p[i];
+
+	return ~res;
+}
+
+static inline int align_queue_size(long long req)
+{
+	return roundup_pow_of_two(req);
+}
+
+static inline bool srq_has_waitq(struct mlx5_srq *srq)
+{
+	return srq->waitq_head >= 0;
+}
+
+bool srq_cooldown_wqe(struct mlx5_srq *srq, int ind);
+
+#endif /* MLX5_H */
diff --git a/providers/mlx5/mlx5_api.h b/providers/mlx5/mlx5_api.h
new file mode 100644
index 0000000..a8e3520
--- /dev/null
+++ b/providers/mlx5/mlx5_api.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ *	copyright notice, this list of conditions and the following
+ *	disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *	copyright notice, this list of conditions and the following
+ *	disclaimer in the documentation and/or other materials
+ *	provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_API_H +#define MLX5_API_H + +#include <infiniband/mlx5_user_ioctl_verbs.h> + +#define mlx5dv_flow_action_flags mlx5_ib_uapi_flow_action_flags +#define MLX5DV_FLOW_ACTION_FLAGS_REQUIRE_METADATA MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA +#define mlx5dv_flow_table_type mlx5_ib_uapi_flow_table_type +#define MLX5DV_FLOW_TABLE_TYPE_NIC_RX MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX +#define MLX5DV_FLOW_TABLE_TYPE_NIC_TX MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX +#define MLX5DV_FLOW_TABLE_TYPE_FDB MLX5_IB_UAPI_FLOW_TABLE_TYPE_FDB +#define MLX5DV_FLOW_TABLE_TYPE_RDMA_RX MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_RX +#define MLX5DV_FLOW_TABLE_TYPE_RDMA_TX MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_TX +#define mlx5dv_flow_action_packet_reformat_type mlx5_ib_uapi_flow_action_packet_reformat_type +#define MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2 MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2 +#define MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL +#define MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2 MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2 +#define MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL +#define mlx5dv_devx_async_cmd_hdr mlx5_ib_uapi_devx_async_cmd_hdr +#define mlx5dv_devx_async_event_hdr mlx5_ib_uapi_devx_async_event_hdr +#define mlx5dv_alloc_dm_type mlx5_ib_uapi_dm_type +#define MLX5DV_DM_TYPE_MEMIC MLX5_IB_UAPI_DM_TYPE_MEMIC +#define MLX5DV_DM_TYPE_STEERING_SW_ICM MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM +#define MLX5DV_DM_TYPE_HEADER_MODIFY_SW_ICM MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM +#define mlx5dv_devx_create_event_channel_flags mlx5_ib_uapi_devx_create_event_channel_flags +#define MLX5DV_DEVX_CREATE_EVENT_CHANNEL_FLAGS_OMIT_EV_DATA MLX5_IB_UAPI_DEVX_CR_EV_CH_FLAGS_OMIT_DATA +#define MLX5DV_PP_ALLOC_FLAGS_DEDICATED_INDEX MLX5_IB_UAPI_PP_ALLOC_FLAGS_DEDICATED_INDEX +#define MLX5DV_UAR_ALLOC_TYPE_BF MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF +#define MLX5DV_UAR_ALLOC_TYPE_NC MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC + +#endif diff --git a/providers/mlx5/mlx5_ifc.h b/providers/mlx5/mlx5_ifc.h new file mode 100644 index 0000000..79acde9 --- /dev/null +++ b/providers/mlx5/mlx5_ifc.h @@ -0,0 +1,2379 @@ +/* + * Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_IFC_H +#define MLX5_IFC_H + +#define u8 uint8_t + +enum mlx5_cap_mode { + HCA_CAP_OPMOD_GET_CUR = 1, +}; + +enum { + MLX5_CMD_OP_QUERY_HCA_CAP = 0x100, + MLX5_CMD_OP_CREATE_MKEY = 0x200, + MLX5_CMD_OP_CREATE_QP = 0x500, + MLX5_CMD_OP_RST2INIT_QP = 0x502, + MLX5_CMD_OP_INIT2RTR_QP = 0x503, + MLX5_CMD_OP_RTR2RTS_QP = 0x504, + MLX5_CMD_OP_RTS2RTS_QP = 0x505, + MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT = 0x752, + MLX5_CMD_OP_QUERY_ROCE_ADDRESS = 0x760, + MLX5_CMD_OP_CREATE_FLOW_TABLE = 0x930, + MLX5_CMD_OP_CREATE_FLOW_COUNTER = 0x939, + MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT = 0x93d, + MLX5_CMD_OP_DEALLOC_PACKET_REFORMAT_CONTEXT = 0x93e, + MLX5_CMD_OP_CREATE_GENERAL_OBJECT = 0xa00, + MLX5_CMD_OP_MODIFY_GENERAL_OBJECT = 0xa01, + MLX5_CMD_OP_QUERY_GENERAL_OBJECT = 0xa02, + MLX5_CMD_OP_SYNC_STEERING = 0xb00, +}; + +struct mlx5_ifc_atomic_caps_bits { + u8 reserved_at_0[0x40]; + + u8 atomic_req_8B_endianness_mode[0x2]; + u8 reserved_at_42[0x4]; + u8 supported_atomic_req_8B_endianness_mode_1[0x1]; + + u8 reserved_at_47[0x19]; + + u8 reserved_at_60[0x20]; + + u8 reserved_at_80[0x10]; + u8 atomic_operations[0x10]; + + u8 reserved_at_a0[0x10]; + u8 atomic_size_qp[0x10]; + + u8 reserved_at_c0[0x10]; + u8 atomic_size_dc[0x10]; + + u8 reserved_at_e0[0x1a0]; + + u8 fetch_add_pci_atomic[0x10]; + u8 swap_pci_atomic[0x10]; + u8 compare_swap_pci_atomic[0x10]; + + u8 reserved_at_2b0[0x550]; +}; + +struct mlx5_ifc_flow_table_context_bits { + u8 reformat_en[0x1]; + u8 decap_en[0x1]; + u8 sw_owner[0x1]; + u8 reserved_at_3[0x1]; + u8 table_miss_action[0x4]; + u8 level[0x8]; + u8 reserved_at_10[0x8]; + u8 log_size[0x8]; + + u8 reserved_at_20[0x8]; + u8 table_miss_id[0x18]; + + u8 reserved_at_40[0x8]; + u8 lag_master_next_table_id[0x18]; + + u8 reserved_at_60[0x60]; + + u8 sw_owner_icm_root_1[0x40]; + + u8 sw_owner_icm_root_0[0x40]; +}; + +struct mlx5_ifc_create_flow_table_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x20]; + + u8 table_type[0x8]; + u8 reserved_at_88[0x18]; + + u8 reserved_at_a0[0x20]; + + struct mlx5_ifc_flow_table_context_bits flow_table_context; +}; + +struct mlx5_ifc_create_flow_table_out_bits { + u8 status[0x8]; + u8 icm_address_63_40[0x18]; + + u8 syndrome[0x20]; + + u8 icm_address_39_32[0x8]; + u8 table_id[0x18]; + + u8 icm_address_31_0[0x20]; +}; + +struct mlx5_ifc_sync_steering_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0xc0]; +}; + +struct mlx5_ifc_sync_steering_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_device_mem_cap_bits { + u8 memic[0x1]; + u8 reserved_at_1[0x1f]; + + u8 reserved_at_20[0xb]; + u8 log_min_memic_alloc_size[0x5]; + u8 reserved_at_30[0x8]; + u8 log_max_memic_addr_alignment[0x8]; + + u8 memic_bar_start_addr[0x40]; + + u8 memic_bar_size[0x20]; + + u8 max_memic_size[0x20]; 
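+	/*
+	 * Start addresses of the software-managed ICM areas, exposed to
+	 * applications through MLX5DV_DM_TYPE_STEERING_SW_ICM and
+	 * MLX5DV_DM_TYPE_HEADER_MODIFY_SW_ICM (see mlx5_api.h above).
+	 */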
+ + u8 steering_sw_icm_start_address[0x40]; + + u8 reserved_at_100[0x12]; + u8 log_sw_icm_alloc_granularity[0x6]; + u8 log_steering_sw_icm_size[0x8]; + + u8 reserved_at_120[0x20]; + + u8 header_modify_sw_icm_start_address[0x40]; +}; + +struct mlx5_ifc_flow_table_fields_supported_bits { + u8 outer_dmac[0x1]; + u8 outer_smac[0x1]; + u8 outer_ether_type[0x1]; + u8 outer_ip_version[0x1]; + u8 outer_first_prio[0x1]; + u8 outer_first_cfi[0x1]; + u8 outer_first_vid[0x1]; + u8 outer_ipv4_ttl[0x1]; + u8 outer_second_prio[0x1]; + u8 outer_second_cfi[0x1]; + u8 outer_second_vid[0x1]; + u8 outer_ipv6_flow_label[0x1]; + u8 outer_sip[0x1]; + u8 outer_dip[0x1]; + u8 outer_frag[0x1]; + u8 outer_ip_protocol[0x1]; + u8 outer_ip_ecn[0x1]; + u8 outer_ip_dscp[0x1]; + u8 outer_udp_sport[0x1]; + u8 outer_udp_dport[0x1]; + u8 outer_tcp_sport[0x1]; + u8 outer_tcp_dport[0x1]; + u8 outer_tcp_flags[0x1]; + u8 outer_gre_protocol[0x1]; + u8 outer_gre_key[0x1]; + u8 outer_vxlan_vni[0x1]; + u8 outer_geneve_vni[0x1]; + u8 outer_geneve_oam[0x1]; + u8 outer_geneve_protocol_type[0x1]; + u8 outer_geneve_opt_len[0x1]; + u8 source_vhca_port[0x1]; + u8 source_eswitch_port[0x1]; + + u8 inner_dmac[0x1]; + u8 inner_smac[0x1]; + u8 inner_ether_type[0x1]; + u8 inner_ip_version[0x1]; + u8 inner_first_prio[0x1]; + u8 inner_first_cfi[0x1]; + u8 inner_first_vid[0x1]; + u8 inner_ipv4_ttl[0x1]; + u8 inner_second_prio[0x1]; + u8 inner_second_cfi[0x1]; + u8 inner_second_vid[0x1]; + u8 inner_ipv6_flow_label[0x1]; + u8 inner_sip[0x1]; + u8 inner_dip[0x1]; + u8 inner_frag[0x1]; + u8 inner_ip_protocol[0x1]; + u8 inner_ip_ecn[0x1]; + u8 inner_ip_dscp[0x1]; + u8 inner_udp_sport[0x1]; + u8 inner_udp_dport[0x1]; + u8 inner_tcp_sport[0x1]; + u8 inner_tcp_dport[0x1]; + u8 inner_tcp_flags[0x1]; + u8 reserved_at_37[0x7]; + u8 metadata_reg_b[0x1]; + u8 metadata_reg_a[0x1]; + + u8 reserved_at_40[0x5]; + u8 outer_first_mpls_over_udp_ttl[0x1]; + u8 outer_first_mpls_over_udp_s_bos[0x1]; + u8 outer_first_mpls_over_udp_exp[0x1]; + u8 outer_first_mpls_over_udp_label[0x1]; + u8 outer_first_mpls_over_gre_ttl[0x1]; + u8 outer_first_mpls_over_gre_s_bos[0x1]; + u8 outer_first_mpls_over_gre_exp[0x1]; + u8 outer_first_mpls_over_gre_label[0x1]; + u8 inner_first_mpls_ttl[0x1]; + u8 inner_first_mpls_s_bos[0x1]; + u8 inner_first_mpls_exp[0x1]; + u8 inner_first_mpls_label[0x1]; + u8 outer_first_mpls_ttl[0x1]; + u8 outer_first_mpls_s_bos[0x1]; + u8 outer_first_mpls_exp[0x1]; + u8 outer_first_mpls_label[0x1]; + u8 outer_emd_tag[0x1]; + u8 inner_esp_spi[0x1]; + u8 outer_esp_spi[0x1]; + u8 inner_ipv6_hop_limit[0x1]; + u8 outer_ipv6_hop_limit[0x1]; + u8 bth_dst_qp[0x1]; + u8 inner_first_svlan[0x1]; + u8 inner_second_svlan[0x1]; + u8 outer_first_svlan[0x1]; + u8 outer_second_svlan[0x1]; + u8 source_sqn[0x1]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_dr_match_spec_bits { + u8 smac_47_16[0x20]; + + u8 smac_15_0[0x10]; + u8 ethertype[0x10]; + + u8 dmac_47_16[0x20]; + + u8 dmac_15_0[0x10]; + u8 first_prio[0x3]; + u8 first_cfi[0x1]; + u8 first_vid[0xc]; + + u8 ip_protocol[0x8]; + u8 ip_dscp[0x6]; + u8 ip_ecn[0x2]; + u8 cvlan_tag[0x1]; + u8 svlan_tag[0x1]; + u8 frag[0x1]; + u8 ip_version[0x4]; + u8 tcp_flags[0x9]; + + u8 tcp_sport[0x10]; + u8 tcp_dport[0x10]; + + u8 reserved_at_c0[0x18]; + u8 ip_ttl_hoplimit[0x8]; + + u8 udp_sport[0x10]; + u8 udp_dport[0x10]; + + u8 src_ip_127_96[0x20]; + + u8 src_ip_95_64[0x20]; + + u8 src_ip_63_32[0x20]; + + u8 src_ip_31_0[0x20]; + + u8 dst_ip_127_96[0x20]; + + u8 dst_ip_95_64[0x20]; + + u8 dst_ip_63_32[0x20]; + + u8 dst_ip_31_0[0x20]; +}; 
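+
+/*
+ * The dr_match_* layouts here are concatenated into
+ * mlx5_ifc_dr_match_param_bits below and are accessed only through the
+ * DEVX accessor macros, never by direct member reads or writes.  A
+ * minimal sketch (illustrative only, mirroring the DEVX_ST_SZ_DW()/
+ * DEVX_SET() pattern that get_dc_odp_caps() in mlx5.c uses), setting
+ * two outer-header match fields:
+ *
+ *	uint32_t match[DEVX_ST_SZ_DW(dr_match_param)] = {};
+ *
+ *	DEVX_SET(dr_match_param, match, outer.ethertype, 0x0800);
+ *	DEVX_SET(dr_match_param, match, outer.ip_protocol, 6);
+ */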
+ +struct mlx5_ifc_dr_match_set_misc_bits { + u8 gre_c_present[0x1]; + u8 reserved_auto1[0x1]; + u8 gre_k_present[0x1]; + u8 gre_s_present[0x1]; + u8 source_vhca_port[0x4]; + u8 source_sqn[0x18]; + + u8 source_eswitch_owner_vhca_id[0x10]; + u8 source_port[0x10]; + + u8 outer_second_prio[0x3]; + u8 outer_second_cfi[0x1]; + u8 outer_second_vid[0xc]; + u8 inner_second_prio[0x3]; + u8 inner_second_cfi[0x1]; + u8 inner_second_vid[0xc]; + + u8 outer_second_cvlan_tag[0x1]; + u8 inner_second_cvlan_tag[0x1]; + u8 outer_second_svlan_tag[0x1]; + u8 inner_second_svlan_tag[0x1]; + u8 outer_emd_tag[0x1]; + u8 reserved_at_65[0xb]; + u8 gre_protocol[0x10]; + + u8 gre_key_h[0x18]; + u8 gre_key_l[0x8]; + + u8 vxlan_vni[0x18]; + u8 reserved_at_b8[0x8]; + + u8 geneve_vni[0x18]; + u8 reserved_at_e4[0x7]; + u8 geneve_oam[0x1]; + + u8 reserved_at_ec[0xc]; + u8 outer_ipv6_flow_label[0x14]; + + u8 reserved_at_100[0xc]; + u8 inner_ipv6_flow_label[0x14]; + + u8 reserved_at_120[0xa]; + u8 geneve_opt_len[0x6]; + u8 geneve_protocol_type[0x10]; + + u8 reserved_at_140[0x8]; + u8 bth_dst_qp[0x18]; + + u8 inner_esp_spi[0x20]; + + u8 outer_esp_spi[0x20]; + + u8 reserved_at_1a0[0x60]; +}; + +struct mlx5_ifc_dr_match_set_misc2_bits { + u8 outer_first_mpls_label[0x14]; + u8 outer_first_mpls_exp[0x3]; + u8 outer_first_mpls_s_bos[0x1]; + u8 outer_first_mpls_ttl[0x8]; + + u8 inner_first_mpls_label[0x14]; + u8 inner_first_mpls_exp[0x3]; + u8 inner_first_mpls_s_bos[0x1]; + u8 inner_first_mpls_ttl[0x8]; + + u8 outer_first_mpls_over_gre_label[0x14]; + u8 outer_first_mpls_over_gre_exp[0x3]; + u8 outer_first_mpls_over_gre_s_bos[0x1]; + u8 outer_first_mpls_over_gre_ttl[0x8]; + + u8 outer_first_mpls_over_udp_label[0x14]; + u8 outer_first_mpls_over_udp_exp[0x3]; + u8 outer_first_mpls_over_udp_s_bos[0x1]; + u8 outer_first_mpls_over_udp_ttl[0x8]; + + u8 metadata_reg_c_7[0x20]; + u8 metadata_reg_c_6[0x20]; + u8 metadata_reg_c_5[0x20]; + u8 metadata_reg_c_4[0x20]; + u8 metadata_reg_c_3[0x20]; + u8 metadata_reg_c_2[0x20]; + u8 metadata_reg_c_1[0x20]; + u8 metadata_reg_c_0[0x20]; + + u8 metadata_reg_a[0x20]; + u8 metadata_reg_b[0x20]; + + u8 reserved_at_260[0x40]; +}; + +struct mlx5_ifc_dr_match_set_misc3_bits { + u8 inner_tcp_seq_num[0x20]; + + u8 outer_tcp_seq_num[0x20]; + + u8 inner_tcp_ack_num[0x20]; + + u8 outer_tcp_ack_num[0x20]; + + u8 reserved_at_80[0x8]; + u8 outer_vxlan_gpe_vni[0x18]; + + u8 outer_vxlan_gpe_next_protocol[0x8]; + u8 outer_vxlan_gpe_flags[0x8]; + u8 reserved_at_b0[0x10]; + + u8 icmp_header_data[0x20]; + + u8 icmpv6_header_data[0x20]; + + u8 icmp_type[0x8]; + u8 icmp_code[0x8]; + u8 icmpv6_type[0x8]; + u8 icmpv6_code[0x8]; + + u8 reserved_at_120[0x20]; + + u8 gtpu_teid[0x20]; + + u8 gtpu_msg_type[0x8]; + u8 reserved_at_148[0x5]; + u8 gtpu_flags[0x3]; + u8 reserved_at_150[0x10]; + + u8 reserved_at_160[0x80]; +}; + +struct mlx5_ifc_dr_match_param_bits { + struct mlx5_ifc_dr_match_spec_bits outer; + struct mlx5_ifc_dr_match_set_misc_bits misc; + struct mlx5_ifc_dr_match_spec_bits inner; + struct mlx5_ifc_dr_match_set_misc2_bits misc2; + struct mlx5_ifc_dr_match_set_misc3_bits misc3; +}; + +struct mlx5_ifc_flow_table_prop_layout_bits { + u8 ft_support[0x1]; + u8 flow_tag[0x1]; + u8 flow_counter[0x1]; + u8 flow_modify_en[0x1]; + u8 modify_root[0x1]; + u8 identified_miss_table[0x1]; + u8 flow_table_modify[0x1]; + u8 reformat[0x1]; + u8 decap[0x1]; + u8 reset_root_to_default[0x1]; + u8 pop_vlan[0x1]; + u8 push_vlan[0x1]; + u8 fpga_vendor_acceleration[0x1]; + u8 pop_vlan_2[0x1]; + u8 push_vlan_2[0x1]; + u8 
reformat_and_vlan_action[0x1]; + u8 modify_and_vlan_action[0x1]; + u8 sw_owner[0x1]; + u8 reformat_l3_tunnel_to_l2[0x1]; + u8 reformat_l2_to_l3_tunnel[0x1]; + u8 reformat_and_modify_action[0x1]; + u8 reserved_at_15[0xb]; + + u8 reserved_at_20[0x2]; + u8 log_max_ft_size[0x6]; + u8 log_max_modify_header_context[0x8]; + u8 max_modify_header_actions[0x8]; + u8 max_ft_level[0x8]; + + u8 reserved_at_40[0x10]; + u8 metadata_reg_b_width[0x8]; + u8 metadata_reg_a_width[0x8]; + + u8 reserved_at_60[0x18]; + u8 log_max_ft_num[0x8]; + + u8 reserved_at_80[0x10]; + u8 log_max_flow_counter[0x8]; + u8 log_max_destination[0x8]; + + u8 reserved_at_a0[0x18]; + u8 log_max_flow[0x8]; + + u8 reserved_at_c0[0x40]; + + struct mlx5_ifc_flow_table_fields_supported_bits ft_field_support; + + struct mlx5_ifc_flow_table_fields_supported_bits ft_field_bitmask_support; +}; + +enum { + MLX5_FLEX_PARSER_GENEVE_ENABLED = 1 << 3, + MLX5_FLEX_PARSER_VXLAN_GPE_ENABLED = 1 << 7, + MLX5_FLEX_PARSER_ICMP_V4_ENABLED = 1 << 8, + MLX5_FLEX_PARSER_ICMP_V6_ENABLED = 1 << 9, + MLX5_FLEX_PARSER_GTPU_ENABLED = 1 << 11, +}; + +struct mlx5_ifc_cmd_hca_cap_bits { + u8 access_other_hca_roce[0x1]; + u8 reserved_at_1[0x1e]; + u8 vhca_resource_manager[0x1]; + + u8 reserved_at_20[0x10]; + u8 vhca_id[0x10]; + + u8 reserved_at_40[0x40]; + + u8 log_max_srq_sz[0x8]; + u8 log_max_qp_sz[0x8]; + u8 reserved_at_90[0xb]; + u8 log_max_qp[0x5]; + + u8 reserved_at_a0[0xb]; + u8 log_max_srq[0x5]; + u8 reserved_at_b0[0x10]; + + u8 reserved_at_c0[0x8]; + u8 log_max_cq_sz[0x8]; + u8 reserved_at_d0[0xb]; + u8 log_max_cq[0x5]; + + u8 log_max_eq_sz[0x8]; + u8 relaxed_ordering_write[0x1]; + u8 reserved_at_e9[0x1]; + u8 log_max_mkey[0x6]; + u8 tunneled_atomic[0x1]; + u8 as_notify[0x1]; + u8 m_pci_port[0x1]; + u8 m_vhca_mk[0x1]; + u8 cmd_on_behalf[0x1]; + u8 device_emulation_manager[0x1]; + u8 terminate_scatter_list_mkey[0x1]; + u8 repeated_mkey[0x1]; + u8 dump_fill_mkey[0x1]; + u8 reserved_at_f9[0x3]; + u8 log_max_eq[0x4]; + + u8 max_indirection[0x8]; + u8 fixed_buffer_size[0x1]; + u8 log_max_mrw_sz[0x7]; + u8 force_teardown[0x1]; + u8 fast_teardown[0x1]; + u8 log_max_bsf_list_size[0x6]; + u8 umr_extended_translation_offset[0x1]; + u8 null_mkey[0x1]; + u8 log_max_klm_list_size[0x6]; + + u8 reserved_at_120[0xa]; + u8 log_max_ra_req_dc[0x6]; + u8 reserved_at_130[0xa]; + u8 log_max_ra_res_dc[0x6]; + + u8 reserved_at_140[0xa]; + u8 log_max_ra_req_qp[0x6]; + u8 reserved_at_150[0xa]; + u8 log_max_ra_res_qp[0x6]; + + u8 end_pad[0x1]; + u8 cc_query_allowed[0x1]; + u8 cc_modify_allowed[0x1]; + u8 start_pad[0x1]; + u8 cache_line_128byte[0x1]; + u8 gid_table_size_ro[0x1]; + u8 pkey_table_size_ro[0x1]; + u8 reserved_at_167[0x1]; + u8 rnr_nak_q_counters[0x1]; + u8 rts2rts_qp_counters_set_id[0x1]; + u8 rts2rts_qp_dscp[0x1]; + u8 reserved_at_16b[0x4]; + u8 qcam_reg[0x1]; + u8 gid_table_size[0x10]; + + u8 out_of_seq_cnt[0x1]; + u8 vport_counters[0x1]; + u8 retransmission_q_counters[0x1]; + u8 debug[0x1]; + u8 modify_rq_counters_set_id[0x1]; + u8 rq_delay_drop[0x1]; + u8 max_qp_cnt[0xa]; + u8 pkey_table_size[0x10]; + + u8 vport_group_manager[0x1]; + u8 vhca_group_manager[0x1]; + u8 ib_virt[0x1]; + u8 eth_virt[0x1]; + u8 vnic_env_queue_counters[0x1]; + u8 ets[0x1]; + u8 nic_flow_table[0x1]; + u8 eswitch_manager[0x1]; + u8 device_memory[0x1]; + u8 mcam_reg[0x1]; + u8 pcam_reg[0x1]; + u8 local_ca_ack_delay[0x5]; + u8 port_module_event[0x1]; + u8 enhanced_retransmission_q_counters[0x1]; + u8 port_checks[0x1]; + u8 pulse_gen_control[0x1]; + u8 disable_link_up_by_init_hca[0x1]; + u8 
beacon_led[0x1]; + u8 port_type[0x2]; + u8 num_ports[0x8]; + + u8 reserved_at_1c0[0x1]; + u8 pps[0x1]; + u8 pps_modify[0x1]; + u8 log_max_msg[0x5]; + u8 multi_path_xrc_rdma[0x1]; + u8 multi_path_dc_rdma[0x1]; + u8 multi_path_rc_rdma[0x1]; + u8 traffic_fast_control[0x1]; + u8 max_tc[0x4]; + u8 temp_warn_event[0x1]; + u8 dcbx[0x1]; + u8 general_notification_event[0x1]; + u8 multi_prio_sq[0x1]; + u8 afu_owner[0x1]; + u8 fpga[0x1]; + u8 rol_s[0x1]; + u8 rol_g[0x1]; + u8 ib_port_sniffer[0x1]; + u8 wol_s[0x1]; + u8 wol_g[0x1]; + u8 wol_a[0x1]; + u8 wol_b[0x1]; + u8 wol_m[0x1]; + u8 wol_u[0x1]; + u8 wol_p[0x1]; + + u8 stat_rate_support[0x10]; + u8 reserved_at_1f0[0xc]; + u8 cqe_version[0x4]; + + u8 compact_address_vector[0x1]; + u8 eth_striding_wq[0x1]; + u8 reserved_at_202[0x1]; + u8 ipoib_enhanced_offloads[0x1]; + u8 ipoib_basic_offloads[0x1]; + u8 ib_striding_wq[0x1]; + u8 repeated_block_disabled[0x1]; + u8 umr_modify_entity_size_disabled[0x1]; + u8 umr_modify_atomic_disabled[0x1]; + u8 umr_indirect_mkey_disabled[0x1]; + u8 umr_fence[0x2]; + u8 dc_req_sctr_data_cqe[0x1]; + u8 dc_connect_qp[0x1]; + u8 dc_cnak_trace[0x1]; + u8 drain_sigerr[0x1]; + u8 cmdif_checksum[0x2]; + u8 sigerr_cqe[0x1]; + u8 reserved_at_213[0x1]; + u8 wq_signature[0x1]; + u8 sctr_data_cqe[0x1]; + u8 reserved_at_216[0x1]; + u8 sho[0x1]; + u8 tph[0x1]; + u8 rf[0x1]; + u8 dct[0x1]; + u8 qos[0x1]; + u8 eth_net_offloads[0x1]; + u8 roce[0x1]; + u8 atomic[0x1]; + u8 extended_retry_count[0x1]; + + u8 cq_oi[0x1]; + u8 cq_resize[0x1]; + u8 cq_moderation[0x1]; + u8 cq_period_mode_modify[0x1]; + u8 cq_invalidate[0x1]; + u8 reserved_at_225[0x1]; + u8 cq_eq_remap[0x1]; + u8 pg[0x1]; + u8 block_lb_mc[0x1]; + u8 exponential_backoff[0x1]; + u8 scqe_break_moderation[0x1]; + u8 cq_period_start_from_cqe[0x1]; + u8 cd[0x1]; + u8 atm[0x1]; + u8 apm[0x1]; + u8 vector_calc[0x1]; + u8 umr_ptr_rlkey[0x1]; + u8 imaicl[0x1]; + u8 qp_packet_based[0x1]; + u8 reserved_at_233[0x1]; + u8 ipoib_enhanced_pkey_change[0x1]; + u8 initiator_src_dct_in_cqe[0x1]; + u8 qkv[0x1]; + u8 pkv[0x1]; + u8 set_deth_sqpn[0x1]; + u8 rts2rts_primary_sl[0x1]; + u8 initiator_src_dct[0x1]; + u8 dc_v2[0x1]; + u8 xrc[0x1]; + u8 ud[0x1]; + u8 uc[0x1]; + u8 rc[0x1]; + + u8 uar_4k[0x1]; + u8 reserved_at_241[0x9]; + u8 uar_sz[0x6]; + u8 reserved_at_250[0x3]; + u8 log_max_dc_cnak_qps[0x5]; + u8 log_pg_sz[0x8]; + + u8 bf[0x1]; + u8 driver_version[0x1]; + u8 pad_tx_eth_packet[0x1]; + u8 query_driver_version[0x1]; + u8 max_qp_retry_freq[0x1]; + u8 qp_by_name[0x1]; + u8 mkey_by_name[0x1]; + u8 reserved_at_267[0x1]; + u8 suspend_qp_uc[0x1]; + u8 suspend_qp_ud[0x1]; + u8 suspend_qp_rc[0x1]; + u8 log_bf_reg_size[0x5]; + u8 reserved_at_270[0x6]; + u8 lag_dct[0x2]; + u8 reserved_at_278[0x3]; + u8 lag_master[0x1]; + u8 num_lag_ports[0x4]; + + u8 num_of_diagnostic_counters[0x10]; + u8 max_wqe_sz_sq[0x10]; + + u8 reserved_at_2a0[0x10]; + u8 max_wqe_sz_rq[0x10]; + + u8 max_flow_counter_31_16[0x10]; + u8 max_wqe_sz_sq_dc[0x10]; + + u8 reserved_at_2e0[0x7]; + u8 max_qp_mcg[0x19]; + + u8 mlnx_tag_ethertype[0x10]; + u8 reserved_at_310[0x8]; + u8 log_max_mcg[0x8]; + + u8 reserved_at_320[0x3]; + u8 log_max_transport_domain[0x5]; + u8 reserved_at_328[0x3]; + u8 log_max_pd[0x5]; + u8 reserved_at_330[0xb]; + u8 log_max_xrcd[0x5]; + + u8 nic_receive_steering_discard[0x1]; + u8 receive_discard_vport_down[0x1]; + u8 transmit_discard_vport_down[0x1]; + u8 eq_overrun_count[0x1]; + u8 nic_receive_steering_depth[0x1]; + u8 invalid_command_count[0x1]; + u8 quota_exceeded_count[0x1]; + u8 reserved_at_347[0x1]; + u8 
log_max_flow_counter_bulk[0x8]; + u8 max_flow_counter_15_0[0x10]; + + u8 modify_tis[0x1]; + u8 reserved_at_361[0x2]; + u8 log_max_rq[0x5]; + u8 reserved_at_368[0x3]; + u8 log_max_sq[0x5]; + u8 reserved_at_370[0x3]; + u8 log_max_tir[0x5]; + u8 reserved_at_378[0x3]; + u8 log_max_tis[0x5]; + + u8 basic_cyclic_rcv_wqe[0x1]; + u8 reserved_at_381[0x2]; + u8 log_max_rmp[0x5]; + u8 reserved_at_388[0x3]; + u8 log_max_rqt[0x5]; + u8 reserved_at_390[0x3]; + u8 log_max_rqt_size[0x5]; + u8 reserved_at_398[0x3]; + u8 log_max_tis_per_sq[0x5]; + + u8 ext_stride_num_range[0x1]; + u8 reserved_at_3a1[0x2]; + u8 log_max_stride_sz_rq[0x5]; + u8 reserved_at_3a8[0x3]; + u8 log_min_stride_sz_rq[0x5]; + u8 reserved_at_3b0[0x3]; + u8 log_max_stride_sz_sq[0x5]; + u8 reserved_at_3b8[0x3]; + u8 log_min_stride_sz_sq[0x5]; + + u8 hairpin[0x1]; + u8 reserved_at_3c1[0x2]; + u8 log_max_hairpin_queues[0x5]; + u8 reserved_at_3c8[0x3]; + u8 log_max_hairpin_wq_data_sz[0x5]; + u8 reserved_at_3d0[0x3]; + u8 log_max_hairpin_num_packets[0x5]; + u8 reserved_at_3d8[0x3]; + u8 log_max_wq_sz[0x5]; + + u8 nic_vport_change_event[0x1]; + u8 disable_local_lb_uc[0x1]; + u8 disable_local_lb_mc[0x1]; + u8 log_min_hairpin_wq_data_sz[0x5]; + u8 reserved_at_3e8[0x3]; + u8 log_max_vlan_list[0x5]; + u8 reserved_at_3f0[0x3]; + u8 log_max_current_mc_list[0x5]; + u8 reserved_at_3f8[0x3]; + u8 log_max_current_uc_list[0x5]; + + u8 general_obj_types[0x40]; + + u8 reserved_at_440[0x8]; + u8 create_qp_start_hint[0x18]; + + u8 reserved_at_460[0x10]; + u8 max_num_eqs[0x10]; + + u8 reserved_at_480[0x3]; + u8 log_max_l2_table[0x5]; + u8 reserved_at_488[0x8]; + u8 log_uar_page_sz[0x10]; + + u8 reserved_at_4a0[0x20]; + + u8 device_frequency_mhz[0x20]; + + u8 device_frequency_khz[0x20]; + + u8 capi[0x1]; + u8 create_pec[0x1]; + u8 nvmf_target_offload[0x1]; + u8 capi_invalidate[0x1]; + u8 reserved_at_504[0x17]; + u8 log_max_pasid[0x5]; + + u8 num_of_uars_per_page[0x20]; + + u8 flex_parser_protocols[0x20]; + + u8 reserved_at_560[0x13]; + u8 log_max_guaranteed_connections[0x5]; + u8 reserved_at_578[0x3]; + u8 log_max_dct_connections[0x5]; + + u8 log_max_atomic_size_qp[0x8]; + u8 reserved_at_588[0x10]; + u8 log_max_atomic_size_dc[0x8]; + + u8 reserved_at_5a0[0x1c]; + u8 mini_cqe_resp_stride_index[0x1]; + u8 cqe_128_always[0x1]; + u8 cqe_compression_128b[0x1]; + u8 cqe_compression[0x1]; + + u8 cqe_compression_timeout[0x10]; + u8 cqe_compression_max_num[0x10]; + + u8 reserved_at_5e0[0xc]; + u8 log_max_tm_offloaded_op_size[0x4]; + u8 tag_matching[0x1]; + u8 rndv_offload_rc[0x1]; + u8 rndv_offload_dc[0x1]; + u8 log_tag_matching_list_sz[0x5]; + u8 reserved_at_5f8[0x3]; + u8 log_max_xrq[0x5]; + + u8 affiliate_nic_vport_criteria[0x8]; + u8 native_port_num[0x8]; + u8 num_vhca_ports[0x8]; + u8 reserved_at_618[0x5]; + u8 trusted_vnic_vhca[0x1]; + u8 sw_owner_id[0x1]; + u8 reserve_not_to_use[0x1]; + u8 reserved_at_620[0xa0]; + u8 reserved_at_6c0[0x8]; + u8 flex_parser_id_icmp_dw1[0x4]; + u8 flex_parser_id_icmp_dw0[0x4]; + u8 flex_parser_id_icmpv6_dw1[0x4]; + u8 flex_parser_id_icmpv6_dw0[0x4]; + u8 flex_parser_id_outer_first_mpls_over_gre[0x4]; + u8 flex_parser_id_outer_first_mpls_over_udp_label[0x4]; + u8 reserved_at_6b8[0x120]; +}; + +struct mlx5_ifc_header_modify_cap_properties_bits { + struct mlx5_ifc_flow_table_fields_supported_bits set_action_field_support; + + u8 reserved_at_80[0x80]; + + struct mlx5_ifc_flow_table_fields_supported_bits add_action_field_support; + + u8 reserved_at_180[0x80]; + + u8 copy_action_field_support[8][0x20]; + + u8 reserved_at_300[0x100]; +}; + 
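+/*
+ * Hedged usage sketch, not part of this header: querying the general
+ * device capabilities laid out in mlx5_ifc_cmd_hca_cap_bits over DEVX.
+ * It assumes mlx5dv_devx_general_cmd() from mlx5dv.h; the QUERY_HCA_CAP
+ * opcode (0x100) and the op_mod encoding (capability type from the
+ * MLX5_SET_HCA_CAP_OP_MOD_* enum further below, with bit 0 set to get
+ * the current caps) follow the device spec. Real code would size the
+ * buffers with DEVX_ST_SZ style helpers rather than by hand.
+ */
+static inline int query_general_hca_caps(struct ibv_context *ctx,
+					 void *out, size_t outlen)
+{
+	__be32 in[4] = {0}; /* query_hca_cap_in is 0x80 bits == 16 bytes */
+
+	in[0] = htobe32(0x100 << 16);	   /* opcode = QUERY_HCA_CAP */
+	in[1] = htobe32((0x0 << 1) | 0x1); /* op_mod: general device, cur */
+
+	return mlx5dv_devx_general_cmd(ctx, in, sizeof(in), out, outlen);
+}
+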
+struct mlx5_ifc_flow_table_nic_cap_bits { + u8 nic_rx_multi_path_tirs[0x1]; + u8 nic_rx_multi_path_tirs_fts[0x1]; + u8 allow_sniffer_and_nic_rx_shared_tir[0x1]; + u8 reserved_at_3[0x1]; + u8 nic_rx_flow_tag_multipath_en[0x1]; + u8 reserved_at_5[0x13]; + u8 nic_receive_max_steering_depth[0x8]; + + u8 encap_general_header[0x1]; + u8 reserved_at_21[0xa]; + u8 log_max_packet_reformat_context[0x5]; + u8 reserved_at_30[0x6]; + u8 max_encap_header_size[0xa]; + + u8 reserved_at_40[0x1c0]; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_receive; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_receive_rdma; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_receive_sniffer; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_transmit; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_transmit_rdma; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_transmit_sniffer; + + u8 reserved_at_e00[0x200]; + + struct mlx5_ifc_header_modify_cap_properties_bits header_modify_nic_receive; + + u8 reserved_at_1400[0x800]; + + struct mlx5_ifc_header_modify_cap_properties_bits header_modify_nic_transmit; + + u8 sw_steering_nic_rx_action_drop_icm_address[0x40]; + + u8 sw_steering_nic_tx_action_drop_icm_address[0x40]; + + u8 sw_steering_nic_tx_action_allow_icm_address[0x40]; + + u8 reserved_at_20c0[0x5f40]; +}; + +struct mlx5_ifc_flow_table_eswitch_cap_bits { + u8 reserved_at_0[0x1c]; + u8 fdb_multi_path_to_table[0x1]; + u8 reserved_at_1d[0x1e3]; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_esw_fdb; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_esw_acl_ingress; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_esw_acl_egress; + + u8 reserved_at_800[0x1000]; + + u8 sw_steering_fdb_action_drop_icm_address_rx[0x40]; + u8 sw_steering_fdb_action_drop_icm_address_tx[0x40]; + u8 sw_steering_uplink_icm_address_rx[0x40]; + u8 sw_steering_uplink_icm_address_tx[0x40]; + + u8 reserved_at_1900[0x6700]; +}; + +struct mlx5_ifc_odp_per_transport_service_cap_bits { + u8 send[0x1]; + u8 receive[0x1]; + u8 write[0x1]; + u8 read[0x1]; + u8 atomic[0x1]; + u8 srq_receive[0x1]; + u8 reserved_at_6[0x1a]; +}; + +struct mlx5_ifc_odp_cap_bits { + u8 reserved_at_0[0x40]; + + u8 sig[0x1]; + u8 reserved_at_41[0x1f]; + + u8 reserved_at_60[0x20]; + + struct mlx5_ifc_odp_per_transport_service_cap_bits rc_odp_caps; + + struct mlx5_ifc_odp_per_transport_service_cap_bits uc_odp_caps; + + struct mlx5_ifc_odp_per_transport_service_cap_bits ud_odp_caps; + + struct mlx5_ifc_odp_per_transport_service_cap_bits xrc_odp_caps; + + struct mlx5_ifc_odp_per_transport_service_cap_bits dc_odp_caps; + + u8 reserved_at_120[0x6e0]; +}; + +union mlx5_ifc_hca_cap_union_bits { + struct mlx5_ifc_atomic_caps_bits atomic_caps; + struct mlx5_ifc_cmd_hca_cap_bits cmd_hca_cap; + struct mlx5_ifc_flow_table_nic_cap_bits flow_table_nic_cap; + struct mlx5_ifc_flow_table_eswitch_cap_bits flow_table_eswitch_cap; + struct mlx5_ifc_device_mem_cap_bits device_mem_cap; + struct mlx5_ifc_odp_cap_bits odp_cap; + u8 reserved_at_0[0x8000]; +}; + +struct mlx5_ifc_query_hca_cap_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + union mlx5_ifc_hca_cap_union_bits capability; +}; + +struct mlx5_ifc_query_hca_cap_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 
other_function[0x1]; + u8 reserved_at_41[0xf]; + u8 function_id[0x10]; + + u8 reserved_at_60[0x20]; +}; + +enum mlx5_cap_type { + MLX5_CAP_ODP = 2, + MLX5_CAP_ATOMIC = 3, +}; + +enum { + MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE = 0x0 << 1, + MLX5_SET_HCA_CAP_OP_MOD_NIC_FLOW_TABLE = 0x7 << 1, + MLX5_SET_HCA_CAP_OP_MOD_ESW_FLOW_TABLE = 0x8 << 1, + MLX5_SET_HCA_CAP_OP_MOD_DEVICE_MEMORY = 0xf << 1, +}; + +enum { + MLX5_MKC_ACCESS_MODE_KLMS = 0x2, +}; + +struct mlx5_ifc_mkc_bits { + u8 reserved_at_0[0x1]; + u8 free[0x1]; + u8 reserved_at_2[0x1]; + u8 access_mode_4_2[0x3]; + u8 reserved_at_6[0x7]; + u8 relaxed_ordering_write[0x1]; + u8 reserved_at_e[0x1]; + u8 small_fence_on_rdma_read_response[0x1]; + u8 umr_en[0x1]; + u8 a[0x1]; + u8 rw[0x1]; + u8 rr[0x1]; + u8 lw[0x1]; + u8 lr[0x1]; + u8 access_mode_1_0[0x2]; + u8 reserved_at_18[0x8]; + + u8 qpn[0x18]; + u8 mkey_7_0[0x8]; + + u8 reserved_at_40[0x20]; + + u8 length64[0x1]; + u8 bsf_en[0x1]; + u8 sync_umr[0x1]; + u8 reserved_at_63[0x2]; + u8 expected_sigerr_count[0x1]; + u8 reserved_at_66[0x1]; + u8 en_rinval[0x1]; + u8 pd[0x18]; + + u8 start_addr[0x40]; + + u8 len[0x40]; + + u8 bsf_octword_size[0x20]; + + u8 reserved_at_120[0x80]; + + u8 translations_octword_size[0x20]; + + u8 reserved_at_1c0[0x1b]; + u8 log_page_size[0x5]; + + u8 reserved_at_1e0[0x20]; +}; + +struct mlx5_ifc_create_mkey_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 mkey_index[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_mkey_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x20]; + + u8 pg_access[0x1]; + u8 mkey_umem_valid[0x1]; + u8 reserved_at_62[0x1e]; + + struct mlx5_ifc_mkc_bits memory_key_mkey_entry; + + u8 reserved_at_280[0x80]; + + u8 translations_octword_actual_size[0x20]; + + u8 reserved_at_320[0x560]; + + u8 klm_pas_mtt[0][0x20]; +}; + +struct mlx5_ifc_l2_hdr_bits { + u8 dmac_47_16[0x20]; + u8 dmac_15_0[0x10]; + u8 smac_47_32[0x10]; + u8 smac_31_0[0x20]; + u8 ethertype[0x10]; + u8 vlan_type[0x10]; + u8 vlan[0x10]; +}; + +enum { + FS_FT_NIC_RX = 0x0, + FS_FT_NIC_TX = 0x1, + FS_FT_ESW_EGRESS_ACL = 0x2, + FS_FT_ESW_INGRESS_ACL = 0x3, + FS_FT_FDB = 0X4, + FS_FT_SNIFFER_RX = 0X5, + FS_FT_SNIFFER_TX = 0X6, +}; + +struct mlx5_ifc_ste_general_bits { + u8 entry_type[0x4]; + u8 reserved_at_4[0x4]; + u8 entry_sub_type[0x8]; + u8 byte_mask[0x10]; + u8 next_table_base_63_48[0x10]; + u8 next_lu_type[0x8]; + u8 next_table_base_39_32_size[0x8]; + u8 next_table_base_31_5_size[0x1b]; + u8 linear_hash_enable[0x1]; + u8 reserved_at_5c[0x2]; + u8 next_table_rank[0x2]; + u8 reserved_at_60[0xa0]; + u8 tag_value[0x60]; + u8 bit_mask[0x60]; +}; + +struct mlx5_ifc_ste_sx_transmit_bits { + u8 entry_type[0x4]; + u8 reserved_at_4[0x4]; + u8 entry_sub_type[0x8]; + u8 byte_mask[0x10]; + + u8 next_table_base_63_48[0x10]; + u8 next_lu_type[0x8]; + u8 next_table_base_39_32_size[0x8]; + + u8 next_table_base_31_5_size[0x1b]; + u8 linear_hash_enable[0x1]; + u8 reserved_at_5c[0x2]; + u8 next_table_rank[0x2]; + + u8 sx_wire[0x1]; + u8 sx_func_lb[0x1]; + u8 sx_sniffer[0x1]; + u8 sx_wire_enable[0x1]; + u8 sx_func_lb_enable[0x1]; + u8 sx_sniffer_enable[0x1]; + u8 action_type[0x3]; + u8 reserved_at_69[0x1]; + u8 action_description[0x6]; + u8 gvmi[0x10]; + + u8 encap_pointer_vlan_data[0x20]; + + u8 loopback_syndome_en[0x8]; + u8 loopback_syndome[0x8]; + u8 counter_trigger[0x10]; + + u8 miss_address_63_48[0x10]; + u8 counter_trigger_23_16[0x8]; + u8 
miss_address_39_32[0x8]; + + u8 miss_address_31_6[0x1a]; + u8 learning_point[0x1]; + u8 go_back[0x1]; + u8 match_polarity[0x1]; + u8 mask_mode[0x1]; + u8 miss_rank[0x2]; +}; + +struct mlx5_ifc_ste_rx_steering_mult_bits { + u8 entry_type[0x4]; + u8 reserved_at_4[0x4]; + u8 entry_sub_type[0x8]; + u8 byte_mask[0x10]; + + u8 next_table_base_63_48[0x10]; + u8 next_lu_type[0x8]; + u8 next_table_base_39_32_size[0x8]; + + u8 next_table_base_31_5_size[0x1b]; + u8 linear_hash_enable[0x1]; + u8 reserved_at_5c[0x2]; + u8 next_table_rank[0x2]; + + u8 member_count[0x10]; + u8 gvmi[0x10]; + + u8 qp_list_pointer[0x20]; + + u8 reserved_at_a0[0x1]; + u8 tunneling_action[0x3]; + u8 action_description[0x4]; + u8 reserved_at_a8[0x8]; + u8 counter_trigger_15_0[0x10]; + + u8 miss_address_63_48[0x10]; + u8 counter_trigger_23_16[0x08]; + u8 miss_address_39_32[0x8]; + + u8 miss_address_31_6[0x1a]; + u8 learning_point[0x1]; + u8 fail_on_error[0x1]; + u8 match_polarity[0x1]; + u8 mask_mode[0x1]; + u8 miss_rank[0x2]; +}; + +struct mlx5_ifc_ste_modify_packet_bits { + u8 entry_type[0x4]; + u8 reserved_at_4[0x4]; + u8 entry_sub_type[0x8]; + u8 byte_mask[0x10]; + + u8 next_table_base_63_48[0x10]; + u8 next_lu_type[0x8]; + u8 next_table_base_39_32_size[0x8]; + + u8 next_table_base_31_5_size[0x1b]; + u8 linear_hash_enable[0x1]; + u8 reserved_at_5c[0x2]; + u8 next_table_rank[0x2]; + + u8 number_of_re_write_actions[0x10]; + u8 gvmi[0x10]; + + u8 header_re_write_actions_pointer[0x20]; + + u8 reserved_at_a0[0x1]; + u8 tunneling_action[0x3]; + u8 action_description[0x4]; + u8 reserved_at_a8[0x8]; + u8 counter_trigger_15_0[0x10]; + + u8 miss_address_63_48[0x10]; + u8 counter_trigger_23_16[0x08]; + u8 miss_address_39_32[0x8]; + + u8 miss_address_31_6[0x1a]; + u8 learning_point[0x1]; + u8 fail_on_error[0x1]; + u8 match_polarity[0x1]; + u8 mask_mode[0x1]; + u8 miss_rank[0x2]; +}; + +struct mlx5_ifc_ste_eth_l2_src_bits { + u8 smac_47_16[0x20]; + + u8 smac_15_0[0x10]; + u8 l3_ethertype[0x10]; + + u8 qp_type[0x2]; + u8 ethertype_filter[0x1]; + u8 reserved_at_43[0x1]; + u8 sx_sniffer[0x1]; + u8 force_lb[0x1]; + u8 functional_lb[0x1]; + u8 port[0x1]; + u8 reserved_at_48[0x4]; + u8 first_priority[0x3]; + u8 first_cfi[0x1]; + u8 first_vlan_qualifier[0x2]; + u8 reserved_at_52[0x2]; + u8 first_vlan_id[0xc]; + + u8 ip_fragmented[0x1]; + u8 tcp_syn[0x1]; + u8 encp_type[0x2]; + u8 l3_type[0x2]; + u8 l4_type[0x2]; + u8 reserved_at_68[0x4]; + u8 second_priority[0x3]; + u8 second_cfi[0x1]; + u8 second_vlan_qualifier[0x2]; + u8 reserved_at_72[0x2]; + u8 second_vlan_id[0xc]; +}; + +struct mlx5_ifc_ste_eth_l2_dst_bits { + u8 dmac_47_16[0x20]; + + u8 dmac_15_0[0x10]; + u8 l3_ethertype[0x10]; + + u8 qp_type[0x2]; + u8 ethertype_filter[0x1]; + u8 reserved_at_43[0x1]; + u8 sx_sniffer[0x1]; + u8 force_lb[0x1]; + u8 functional_lb[0x1]; + u8 port[0x1]; + u8 reserved_at_48[0x4]; + u8 first_priority[0x3]; + u8 first_cfi[0x1]; + u8 first_vlan_qualifier[0x2]; + u8 reserved_at_52[0x2]; + u8 first_vlan_id[0xc]; + + u8 ip_fragmented[0x1]; + u8 tcp_syn[0x1]; + u8 encp_type[0x2]; + u8 l3_type[0x2]; + u8 l4_type[0x2]; + u8 reserved_at_68[0x4]; + u8 second_priority[0x3]; + u8 second_cfi[0x1]; + u8 second_vlan_qualifier[0x2]; + u8 reserved_at_72[0x2]; + u8 second_vlan_id[0xc]; +}; + +struct mlx5_ifc_ste_eth_l2_src_dst_bits { + u8 dmac_47_16[0x20]; + + u8 dmac_15_0[0x10]; + u8 smac_47_32[0x10]; + + u8 smac_31_0[0x20]; + + u8 sx_sniffer[0x1]; + u8 force_lb[0x1]; + u8 functional_lb[0x1]; + u8 port[0x1]; + u8 l3_type[0x2]; + u8 reserved_at_66[0x6]; + u8 
first_priority[0x3]; + u8 first_cfi[0x1]; + u8 first_vlan_qualifier[0x2]; + u8 reserved_at_72[0x2]; + u8 first_vlan_id[0xc]; +}; + +struct mlx5_ifc_ste_eth_l3_ipv4_5_tuple_bits { + u8 destination_address[0x20]; + + u8 source_address[0x20]; + + u8 source_port[0x10]; + u8 destination_port[0x10]; + + u8 fragmented[0x1]; + u8 first_fragment[0x1]; + u8 reserved_at_62[0x2]; + u8 reserved_at_64[0x1]; + u8 ecn[0x2]; + u8 tcp_ns[0x1]; + u8 tcp_cwr[0x1]; + u8 tcp_ece[0x1]; + u8 tcp_urg[0x1]; + u8 tcp_ack[0x1]; + u8 tcp_psh[0x1]; + u8 tcp_rst[0x1]; + u8 tcp_syn[0x1]; + u8 tcp_fin[0x1]; + u8 dscp[0x6]; + u8 reserved_at_76[0x2]; + u8 protocol[0x8]; +}; + +struct mlx5_ifc_ste_eth_l3_ipv6_dst_bits { + u8 dst_ip_127_96[0x20]; + + u8 dst_ip_95_64[0x20]; + + u8 dst_ip_63_32[0x20]; + + u8 dst_ip_31_0[0x20]; +}; + +struct mlx5_ifc_ste_eth_l2_tnl_bits { + u8 dmac_47_16[0x20]; + + u8 dmac_15_0[0x10]; + u8 l3_ethertype[0x10]; + + u8 l2_tunneling_network_id[0x20]; + + u8 ip_fragmented[0x1]; + u8 tcp_syn[0x1]; + u8 encp_type[0x2]; + u8 l3_type[0x2]; + u8 l4_type[0x2]; + u8 first_priority[0x3]; + u8 first_cfi[0x1]; + u8 reserved_at_6c[0x3]; + u8 gre_key_flag[0x1]; + u8 first_vlan_qualifier[0x2]; + u8 reserved_at_72[0x2]; + u8 first_vlan_id[0xc]; +}; + +struct mlx5_ifc_ste_eth_l3_ipv6_src_bits { + u8 src_ip_127_96[0x20]; + + u8 src_ip_95_64[0x20]; + + u8 src_ip_63_32[0x20]; + + u8 src_ip_31_0[0x20]; +}; + +struct mlx5_ifc_ste_eth_l3_ipv4_misc_bits { + u8 version[0x4]; + u8 ihl[0x4]; + u8 reserved_at_8[0x8]; + u8 total_length[0x10]; + + u8 identification[0x10]; + u8 flags[0x3]; + u8 fragment_offset[0xd]; + + u8 time_to_live[0x8]; + u8 reserved_at_48[0x8]; + u8 checksum[0x10]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_ste_eth_l4_bits { + u8 fragmented[0x1]; + u8 first_fragment[0x1]; + u8 reserved_at_2[0x6]; + u8 protocol[0x8]; + u8 dst_port[0x10]; + + u8 ipv6_version[0x4]; + u8 reserved_at_24[0x1]; + u8 ecn[0x2]; + u8 tcp_ns[0x1]; + u8 tcp_cwr[0x1]; + u8 tcp_ece[0x1]; + u8 tcp_urg[0x1]; + u8 tcp_ack[0x1]; + u8 tcp_psh[0x1]; + u8 tcp_rst[0x1]; + u8 tcp_syn[0x1]; + u8 tcp_fin[0x1]; + u8 src_port[0x10]; + + u8 ipv6_payload_length[0x10]; + u8 ipv6_hop_limit[0x8]; + u8 dscp[0x6]; + u8 reserved_at_5e[0x2]; + + u8 tcp_data_offset[0x4]; + u8 reserved_at_64[0x8]; + u8 flow_label[0x14]; +}; + +struct mlx5_ifc_ste_eth_l4_misc_bits { + u8 checksum[0x10]; + u8 length[0x10]; + + u8 seq_num[0x20]; + + u8 ack_num[0x20]; + + u8 urgent_pointer[0x10]; + u8 window_size[0x10]; +}; + +struct mlx5_ifc_ste_mpls_bits { + u8 mpls0_label[0x14]; + u8 mpls0_exp[0x3]; + u8 mpls0_s_bos[0x1]; + u8 mpls0_ttl[0x8]; + + u8 mpls1_label[0x20]; + + u8 mpls2_label[0x20]; + + u8 reserved_at_60[0x16]; + u8 mpls4_s_bit[0x1]; + u8 mpls4_qualifier[0x1]; + u8 mpls3_s_bit[0x1]; + u8 mpls3_qualifier[0x1]; + u8 mpls2_s_bit[0x1]; + u8 mpls2_qualifier[0x1]; + u8 mpls1_s_bit[0x1]; + u8 mpls1_qualifier[0x1]; + u8 mpls0_s_bit[0x1]; + u8 mpls0_qualifier[0x1]; +}; + +struct mlx5_ifc_ste_register_0_bits { + u8 register_0_h[0x20]; + + u8 register_0_l[0x20]; + + u8 register_1_h[0x20]; + + u8 register_1_l[0x20]; +}; + +struct mlx5_ifc_ste_register_1_bits { + u8 register_2_h[0x20]; + + u8 register_2_l[0x20]; + + u8 register_3_h[0x20]; + + u8 register_3_l[0x20]; +}; + +struct mlx5_ifc_ste_gre_bits { + u8 gre_c_present[0x1]; + u8 reserved_at_1[0x1]; + u8 gre_k_present[0x1]; + u8 gre_s_present[0x1]; + u8 strict_src_route[0x1]; + u8 recur[0x3]; + u8 flags[0x5]; + u8 version[0x3]; + u8 gre_protocol[0x10]; + + u8 checksum[0x10]; + u8 offset[0x10]; + + u8 gre_key_h[0x18]; + 
u8 gre_key_l[0x8]; + + u8 seq_num[0x20]; +}; + +struct mlx5_ifc_ste_flex_parser_0_bits { + u8 parser_3_label[0x14]; + u8 parser_3_exp[0x3]; + u8 parser_3_s_bos[0x1]; + u8 parser_3_ttl[0x8]; + + u8 flex_parser_2[0x20]; + + u8 flex_parser_1[0x20]; + + u8 flex_parser_0[0x20]; +}; + +struct mlx5_ifc_ste_flex_parser_1_bits { + u8 flex_parser_7[0x20]; + + u8 flex_parser_6[0x20]; + + u8 flex_parser_5[0x20]; + + u8 flex_parser_4[0x20]; +}; + +struct mlx5_ifc_ste_flex_parser_tnl_vxlan_gpe_bits { + u8 outer_vxlan_gpe_flags[0x8]; + u8 reserved_at_8[0x10]; + u8 outer_vxlan_gpe_next_protocol[0x8]; + + u8 outer_vxlan_gpe_vni[0x18]; + u8 reserved_at_38[0x8]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_ste_flex_parser_tnl_geneve_bits { + u8 reserved_at_0[0x2]; + u8 geneve_opt_len[0x6]; + u8 geneve_oam[0x1]; + u8 reserved_at_9[0x7]; + u8 geneve_protocol_type[0x10]; + + u8 geneve_vni[0x18]; + u8 reserved_at_38[0x8]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_ste_flex_parser_tnl_gtpu_bits { + u8 reserved_at_0[0x5]; + u8 gtpu_flags[0x3]; + u8 gtpu_msg_type[0x8]; + u8 reserved_at_10[0x10]; + + u8 gtpu_teid[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_ste_general_purpose_bits { + u8 general_purpose_lookup_field[0x20]; + + u8 reserved_at_20[0x20]; + + u8 reserved_at_40[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_ste_src_gvmi_qp_bits { + u8 loopback_syndrome[0x8]; + u8 reserved_at_8[0x8]; + u8 source_gvmi[0x10]; + + u8 reserved_at_20[0x5]; + u8 force_lb[0x1]; + u8 functional_lb[0x1]; + u8 source_is_requestor[0x1]; + u8 source_qp[0x18]; + + u8 reserved_at_40[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_set_action_in_bits { + u8 action_type[0x4]; + u8 field[0xc]; + u8 reserved_at_10[0x3]; + u8 offset[0x5]; + u8 reserved_at_18[0x3]; + u8 length[0x5]; + + u8 data[0x20]; +}; + +struct mlx5_ifc_add_action_in_bits { + u8 action_type[0x4]; + u8 field[0xc]; + u8 reserved_at_10[0x10]; + + u8 data[0x20]; +}; + +struct mlx5_ifc_copy_action_in_bits { + u8 action_type[0x4]; + u8 src_field[0xc]; + u8 reserved_at_10[0x3]; + u8 src_offset[0x5]; + u8 reserved_at_18[0x3]; + u8 length[0x5]; + + u8 reserved_at_20[0x4]; + u8 dst_field[0xc]; + u8 reserved_at_30[0x3]; + u8 dst_offset[0x5]; + u8 reserved_at_38[0x8]; +}; + +enum { + MLX5_ACTION_TYPE_SET = 0x1, + MLX5_ACTION_TYPE_ADD = 0x2, + MLX5_ACTION_TYPE_COPY = 0x3, +}; + +enum { + MLX5_ACTION_IN_FIELD_OUT_SMAC_47_16 = 0x1, + MLX5_ACTION_IN_FIELD_OUT_SMAC_15_0 = 0x2, + MLX5_ACTION_IN_FIELD_OUT_ETHERTYPE = 0x3, + MLX5_ACTION_IN_FIELD_OUT_DMAC_47_16 = 0x4, + MLX5_ACTION_IN_FIELD_OUT_DMAC_15_0 = 0x5, + MLX5_ACTION_IN_FIELD_OUT_IP_DSCP = 0x6, + MLX5_ACTION_IN_FIELD_OUT_TCP_FLAGS = 0x7, + MLX5_ACTION_IN_FIELD_OUT_TCP_SPORT = 0x8, + MLX5_ACTION_IN_FIELD_OUT_TCP_DPORT = 0x9, + MLX5_ACTION_IN_FIELD_OUT_IP_TTL = 0xa, + MLX5_ACTION_IN_FIELD_OUT_UDP_SPORT = 0xb, + MLX5_ACTION_IN_FIELD_OUT_UDP_DPORT = 0xc, + MLX5_ACTION_IN_FIELD_OUT_SIPV6_127_96 = 0xd, + MLX5_ACTION_IN_FIELD_OUT_SIPV6_95_64 = 0xe, + MLX5_ACTION_IN_FIELD_OUT_SIPV6_63_32 = 0xf, + MLX5_ACTION_IN_FIELD_OUT_SIPV6_31_0 = 0x10, + MLX5_ACTION_IN_FIELD_OUT_DIPV6_127_96 = 0x11, + MLX5_ACTION_IN_FIELD_OUT_DIPV6_95_64 = 0x12, + MLX5_ACTION_IN_FIELD_OUT_DIPV6_63_32 = 0x13, + MLX5_ACTION_IN_FIELD_OUT_DIPV6_31_0 = 0x14, + MLX5_ACTION_IN_FIELD_OUT_SIPV4 = 0x15, + MLX5_ACTION_IN_FIELD_OUT_DIPV4 = 0x16, + MLX5_ACTION_IN_FIELD_OUT_FIRST_VID = 0x17, + MLX5_ACTION_IN_FIELD_OUT_IPV6_HOPLIMIT = 0x47, + MLX5_ACTION_IN_FIELD_OUT_METADATA_REGA = 0x49, + MLX5_ACTION_IN_FIELD_OUT_METADATA_REGB = 
0x50, + MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_0 = 0x51, + MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_1 = 0x52, + MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_2 = 0x53, + MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_3 = 0x54, + MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_4 = 0x55, + MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_5 = 0x56, + MLX5_ACTION_IN_FIELD_OUT_TCP_SEQ_NUM = 0x59, + MLX5_ACTION_IN_FIELD_OUT_TCP_ACK_NUM = 0x5B, +}; + +struct mlx5_ifc_packet_reformat_context_in_bits { + u8 reserved_at_0[0x5]; + u8 reformat_type[0x3]; + u8 reserved_at_8[0xe]; + u8 reformat_data_size[0xa]; + + u8 reserved_at_20[0x10]; + u8 reformat_data[2][0x8]; + + u8 more_reformat_data[0][0x8]; +}; + +struct mlx5_ifc_alloc_packet_reformat_context_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0xa0]; + + struct mlx5_ifc_packet_reformat_context_in_bits packet_reformat_context; +}; + +struct mlx5_ifc_alloc_packet_reformat_context_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 packet_reformat_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_dealloc_packet_reformat_context_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_20[0x10]; + u8 op_mod[0x10]; + + u8 packet_reformat_id[0x20]; + + u8 reserved_60[0x20]; +}; + +struct mlx5_ifc_dealloc_packet_reformat_context_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +enum reformat_type { + MLX5_REFORMAT_TYPE_L2_TO_VXLAN = 0x0, + MLX5_REFORMAT_TYPE_L2_TO_NVGRE = 0x1, + MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL = 0x2, + MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2 = 0x3, + MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x4, +}; + +struct mlx5_ifc_alloc_flow_counter_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_alloc_flow_counter_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 flow_counter_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + +enum { + MLX5_OBJ_TYPE_FLOW_METER = 0x000a, +}; + +struct mlx5_ifc_general_obj_in_cmd_hdr_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 obj_type[0x10]; + + u8 obj_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_general_obj_out_cmd_hdr_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 obj_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_flow_meter_bits { + u8 modify_field_select[0x40]; + + u8 active[0x1]; + u8 reserved_at_41[0x3]; + u8 return_reg_id[0x4]; + u8 table_type[0x8]; + u8 reserved_at_50[0x10]; + + u8 reserved_at_60[0x8]; + u8 destination_table_id[0x18]; + + u8 reserved_at_80[0x80]; + + u8 flow_meter_params[0x100]; + + u8 reserved_at_180[0x180]; + + u8 sw_steering_icm_address_rx[0x40]; + u8 sw_steering_icm_address_tx[0x40]; +}; + +struct mlx5_ifc_create_flow_meter_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; + struct mlx5_ifc_flow_meter_bits meter; +}; + +struct mlx5_ifc_query_flow_meter_out_bits { + struct mlx5_ifc_general_obj_out_cmd_hdr_bits hdr; + struct mlx5_ifc_flow_meter_bits obj; +}; + +struct mlx5_ifc_esw_vport_context_bits { + u8 reserved_at_0[0x3]; + u8 vport_svlan_strip[0x1]; + u8 vport_cvlan_strip[0x1]; + u8 vport_svlan_insert[0x1]; + u8 vport_cvlan_insert[0x2]; + u8 reserved_at_8[0x18]; + + u8 reserved_at_20[0x20]; + + u8 svlan_cfi[0x1]; + u8 svlan_pcp[0x3]; + u8 svlan_id[0xc]; + u8 cvlan_cfi[0x1]; + u8 cvlan_pcp[0x3]; + u8 
cvlan_id[0xc]; + + u8 reserved_at_40[0x720]; + u8 sw_steering_vport_icm_address_rx[0x40]; + u8 sw_steering_vport_icm_address_tx[0x40]; +}; + +struct mlx5_ifc_query_esw_vport_context_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_esw_vport_context_bits esw_vport_context; +}; + +struct mlx5_ifc_query_esw_vport_context_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x20]; +}; + +enum { + MLX5_QPC_ST_RC = 0x0, +}; + +enum { + MLX5_QPC_PM_STATE_MIGRATED = 0x3, +}; + +struct mlx5_ifc_ads_bits { + u8 fl[0x1]; + u8 free_ar[0x1]; + u8 reserved_at_2[0xe]; + u8 pkey_index[0x10]; + + u8 reserved_at_20[0x8]; + u8 grh[0x1]; + u8 mlid[0x7]; + u8 rlid[0x10]; + + u8 ack_timeout[0x5]; + u8 reserved_at_45[0x3]; + u8 src_addr_index[0x8]; + u8 reserved_at_50[0x4]; + u8 stat_rate[0x4]; + u8 hop_limit[0x8]; + + u8 reserved_at_60[0x4]; + u8 tclass[0x8]; + u8 flow_label[0x14]; + + u8 rgid_rip[16][0x8]; + + u8 reserved_at_100[0x4]; + u8 f_dscp[0x1]; + u8 f_ecn[0x1]; + u8 reserved_at_106[0x1]; + u8 f_eth_prio[0x1]; + u8 ecn[0x2]; + u8 dscp[0x6]; + u8 udp_sport[0x10]; + + u8 dei_cfi[0x1]; + u8 eth_prio[0x3]; + u8 sl[0x4]; + u8 vhca_port_num[0x8]; + u8 rmac_47_32[0x10]; + + u8 rmac_31_0[0x20]; +}; + +struct mlx5_ifc_qpc_bits { + u8 state[0x4]; + u8 lag_tx_port_affinity[0x4]; + u8 st[0x8]; + u8 reserved_at_10[0x3]; + u8 pm_state[0x2]; + u8 reserved_at_15[0x1]; + u8 req_e2e_credit_mode[0x2]; + u8 offload_type[0x4]; + u8 end_padding_mode[0x2]; + u8 reserved_at_1e[0x2]; + + u8 wq_signature[0x1]; + u8 block_lb_mc[0x1]; + u8 atomic_like_write_en[0x1]; + u8 latency_sensitive[0x1]; + u8 reserved_at_24[0x1]; + u8 drain_sigerr[0x1]; + u8 reserved_at_26[0x2]; + u8 pd[0x18]; + + u8 mtu[0x3]; + u8 log_msg_max[0x5]; + u8 reserved_at_48[0x1]; + u8 log_rq_size[0x4]; + u8 log_rq_stride[0x3]; + u8 no_sq[0x1]; + u8 log_sq_size[0x4]; + u8 reserved_at_55[0x6]; + u8 rlky[0x1]; + u8 ulp_stateless_offload_mode[0x4]; + + u8 counter_set_id[0x8]; + u8 uar_page[0x18]; + + u8 reserved_at_80[0x8]; + u8 user_index[0x18]; + + u8 reserved_at_a0[0x3]; + u8 log_page_size[0x5]; + u8 remote_qpn[0x18]; + + struct mlx5_ifc_ads_bits primary_address_path; + + struct mlx5_ifc_ads_bits secondary_address_path; + + u8 log_ack_req_freq[0x4]; + u8 reserved_at_384[0x4]; + u8 log_sra_max[0x3]; + u8 reserved_at_38b[0x2]; + u8 retry_count[0x3]; + u8 rnr_retry[0x3]; + u8 reserved_at_393[0x1]; + u8 fre[0x1]; + u8 cur_rnr_retry[0x3]; + u8 cur_retry_count[0x3]; + u8 reserved_at_39b[0x5]; + + u8 reserved_at_3a0[0x20]; + + u8 reserved_at_3c0[0x8]; + u8 next_send_psn[0x18]; + + u8 reserved_at_3e0[0x8]; + u8 cqn_snd[0x18]; + + u8 reserved_at_400[0x8]; + u8 deth_sqpn[0x18]; + + u8 reserved_at_420[0x20]; + + u8 reserved_at_440[0x8]; + u8 last_acked_psn[0x18]; + + u8 reserved_at_460[0x8]; + u8 ssn[0x18]; + + u8 reserved_at_480[0x8]; + u8 log_rra_max[0x3]; + u8 reserved_at_48b[0x1]; + u8 atomic_mode[0x4]; + u8 rre[0x1]; + u8 rwe[0x1]; + u8 rae[0x1]; + u8 reserved_at_493[0x1]; + u8 page_offset[0x6]; + u8 reserved_at_49a[0x3]; + u8 cd_slave_receive[0x1]; + u8 cd_slave_send[0x1]; + u8 cd_master[0x1]; + + u8 reserved_at_4a0[0x3]; + u8 min_rnr_nak[0x5]; + u8 next_rcv_psn[0x18]; + + u8 reserved_at_4c0[0x8]; + u8 xrcd[0x18]; + + u8 reserved_at_4e0[0x8]; + u8 cqn_rcv[0x18]; + + u8 dbr_addr[0x40]; + + u8 q_key[0x20]; + + u8 reserved_at_560[0x5]; + u8 
rq_type[0x3]; + u8 srqn_rmpn_xrqn[0x18]; + + u8 reserved_at_580[0x8]; + u8 rmsn[0x18]; + + u8 hw_sq_wqebb_counter[0x10]; + u8 sw_sq_wqebb_counter[0x10]; + + u8 hw_rq_counter[0x20]; + + u8 sw_rq_counter[0x20]; + + u8 reserved_at_600[0x20]; + + u8 reserved_at_620[0xf]; + u8 cgs[0x1]; + u8 cs_req[0x8]; + u8 cs_res[0x8]; + + u8 dc_access_key[0x40]; + + u8 reserved_at_680[0x3]; + u8 dbr_umem_valid[0x1]; + + u8 reserved_at_684[0x9c]; + + u8 dbr_umem_id[0x20]; +}; + +struct mlx5_ifc_create_qp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_qp_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; + + u8 opt_param_mask[0x20]; + + u8 reserved_at_a0[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_at_800[0x40]; + + u8 wq_umem_id[0x20]; + + u8 wq_umem_valid[0x1]; + u8 reserved_at_861[0x1f]; + + u8 pas[0][0x40]; +}; + +struct mlx5_ifc_init2rtr_qp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_init2rtr_qp_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; + + u8 opt_param_mask[0x20]; + + u8 reserved_at_a0[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_at_800[0x80]; +}; + +struct mlx5_ifc_rtr2rts_qp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_rtr2rts_qp_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; + + u8 opt_param_mask[0x20]; + + u8 reserved_at_a0[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_at_800[0x80]; +}; + +struct mlx5_ifc_rst2init_qp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_rst2init_qp_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; + + u8 opt_param_mask[0x20]; + + u8 reserved_at_a0[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_at_800[0x80]; +}; + +enum roce_version { + MLX5_ROCE_VERSION_1 = 0, + MLX5_ROCE_VERSION_2 = 2, +}; + +struct mlx5_ifc_roce_addr_layout_bits { + u8 source_l3_address[16][0x8]; + + u8 reserved_at_80[0x3]; + u8 vlan_valid[0x1]; + u8 vlan_id[0xc]; + u8 source_mac_47_32[0x10]; + + u8 source_mac_31_0[0x20]; + + u8 reserved_at_c0[0x14]; + u8 roce_l3_type[0x4]; + u8 roce_version[0x8]; + + u8 reserved_at_e0[0x20]; +}; + +struct mlx5_ifc_query_roce_address_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_roce_addr_layout_bits roce_address; +}; + +struct mlx5_ifc_query_roce_address_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 roce_address_index[0x10]; + u8 reserved_at_50[0xc]; + u8 vhca_port_num[0x4]; + + u8 reserved_at_60[0x20]; +}; + +/* Both HW set and HW add share the same HW format with different opcodes */ +struct mlx5_ifc_dr_action_hw_set_bits { + u8 opcode[0x8]; + u8 destination_field_code[0x8]; + u8 reserved_at_10[0x2]; + u8 destination_left_shifter[0x6]; + u8 reserved_at_18[0x3]; + u8 
destination_length[0x5]; + + u8 inline_data[0x20]; +}; + +struct mlx5_ifc_dr_action_hw_copy_bits { + u8 opcode[0x8]; + u8 destination_field_code[0x8]; + u8 reserved_at_10[0x2]; + u8 destination_left_shifter[0x6]; + u8 reserved_at_18[0x2]; + u8 destination_length[0x6]; + + u8 reserved_at_20[0x8]; + u8 source_field_code[0x8]; + u8 reserved_at_30[0x2]; + u8 source_left_shifter[0x6]; + u8 reserved_at_38[0x8]; +}; + +enum { + MLX5_DR_ACTION_MDFY_HW_FLD_L2_0 = 0, + MLX5_DR_ACTION_MDFY_HW_FLD_L2_1 = 1, + MLX5_DR_ACTION_MDFY_HW_FLD_L2_2 = 2, + MLX5_DR_ACTION_MDFY_HW_FLD_L3_0 = 3, + MLX5_DR_ACTION_MDFY_HW_FLD_L3_1 = 4, + MLX5_DR_ACTION_MDFY_HW_FLD_L3_2 = 5, + MLX5_DR_ACTION_MDFY_HW_FLD_L3_3 = 6, + MLX5_DR_ACTION_MDFY_HW_FLD_L3_4 = 7, + MLX5_DR_ACTION_MDFY_HW_FLD_L4_0 = 8, + MLX5_DR_ACTION_MDFY_HW_FLD_L4_1 = 9, + MLX5_DR_ACTION_MDFY_HW_FLD_MPLS = 10, + MLX5_DR_ACTION_MDFY_HW_FLD_L2_TNL_0 = 11, + MLX5_DR_ACTION_MDFY_HW_FLD_REG_0 = 12, + MLX5_DR_ACTION_MDFY_HW_FLD_REG_1 = 13, + MLX5_DR_ACTION_MDFY_HW_FLD_REG_2 = 14, + MLX5_DR_ACTION_MDFY_HW_FLD_REG_3 = 15, + MLX5_DR_ACTION_MDFY_HW_FLD_L4_2 = 16, + MLX5_DR_ACTION_MDFY_HW_FLD_FLEX_0 = 17, + MLX5_DR_ACTION_MDFY_HW_FLD_FLEX_1 = 18, + MLX5_DR_ACTION_MDFY_HW_FLD_FLEX_2 = 19, + MLX5_DR_ACTION_MDFY_HW_FLD_FLEX_3 = 20, + MLX5_DR_ACTION_MDFY_HW_FLD_L2_TNL_1 = 21, + MLX5_DR_ACTION_MDFY_HW_FLD_METADATA = 22, + MLX5_DR_ACTION_MDFY_HW_FLD_RESERVED = 23, +}; + +enum { + MLX5_DR_ACTION_MDFY_HW_OP_COPY = 0x1, + MLX5_DR_ACTION_MDFY_HW_OP_SET = 0x2, + MLX5_DR_ACTION_MDFY_HW_OP_ADD = 0x3, +}; + +enum { + MLX5_DR_ACTION_MDFY_HW_HDR_L3_NONE = 0x0, + MLX5_DR_ACTION_MDFY_HW_HDR_L3_IPV4 = 0x1, + MLX5_DR_ACTION_MDFY_HW_HDR_L3_IPV6 = 0x2, +}; + +enum { + MLX5_DR_ACTION_MDFY_HW_HDR_L4_NONE = 0x0, + MLX5_DR_ACTION_MDFY_HW_HDR_L4_TCP = 0x1, + MLX5_DR_ACTION_MDFY_HW_HDR_L4_UDP = 0x2, +}; +#endif /* MLX5_IFC_H */ diff --git a/providers/mlx5/mlx5dv.h b/providers/mlx5/mlx5dv.h new file mode 100644 index 0000000..27a7170 --- /dev/null +++ b/providers/mlx5/mlx5dv.h @@ -0,0 +1,1543 @@ +/* + * Copyright (c) 2017 Mellanox Technologies, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _MLX5DV_H_ +#define _MLX5DV_H_ + +#include <stdio.h> +#include <stdbool.h> +#include <linux/types.h> /* For the __be64 type */ +#include <sys/types.h> +#include <endian.h> +#if defined(__SSE3__) +#include <limits.h> +#include <emmintrin.h> +#include <tmmintrin.h> +#endif /* defined(__SSE3__) */ + +#include <infiniband/verbs.h> +#include <infiniband/tm_types.h> +#include <infiniband/mlx5_api.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* Always inline the functions */ +#ifdef __GNUC__ +#define MLX5DV_ALWAYS_INLINE inline __attribute__((always_inline)) +#else +#define MLX5DV_ALWAYS_INLINE inline +#endif + + +#define MLX5DV_RES_TYPE_QP ((uint64_t)RDMA_DRIVER_MLX5 << 32 | 1) +#define MLX5DV_RES_TYPE_RWQ ((uint64_t)RDMA_DRIVER_MLX5 << 32 | 2) +#define MLX5DV_RES_TYPE_DBR ((uint64_t)RDMA_DRIVER_MLX5 << 32 | 3) +#define MLX5DV_RES_TYPE_SRQ ((uint64_t)RDMA_DRIVER_MLX5 << 32 | 4) +#define MLX5DV_RES_TYPE_CQ ((uint64_t)RDMA_DRIVER_MLX5 << 32 | 5) + +enum { + MLX5_RCV_DBR = 0, + MLX5_SND_DBR = 1, +}; + +enum mlx5dv_context_comp_mask { + MLX5DV_CONTEXT_MASK_CQE_COMPRESION = 1 << 0, + MLX5DV_CONTEXT_MASK_SWP = 1 << 1, + MLX5DV_CONTEXT_MASK_STRIDING_RQ = 1 << 2, + MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS = 1 << 3, + MLX5DV_CONTEXT_MASK_DYN_BFREGS = 1 << 4, + MLX5DV_CONTEXT_MASK_CLOCK_INFO_UPDATE = 1 << 5, + MLX5DV_CONTEXT_MASK_FLOW_ACTION_FLAGS = 1 << 6, + MLX5DV_CONTEXT_MASK_DC_ODP_CAPS = 1 << 7, + MLX5DV_CONTEXT_MASK_HCA_CORE_CLOCK = 1 << 8, +}; + +struct mlx5dv_cqe_comp_caps { + uint32_t max_num; + uint32_t supported_format; /* enum mlx5dv_cqe_comp_res_format */ +}; + +struct mlx5dv_sw_parsing_caps { + uint32_t sw_parsing_offloads; /* Use enum mlx5dv_sw_parsing_offloads */ + uint32_t supported_qpts; +}; + +struct mlx5dv_striding_rq_caps { + uint32_t min_single_stride_log_num_of_bytes; + uint32_t max_single_stride_log_num_of_bytes; + uint32_t min_single_wqe_log_num_of_strides; + uint32_t max_single_wqe_log_num_of_strides; + uint32_t supported_qpts; +}; + +enum mlx5dv_tunnel_offloads { + MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN = 1 << 0, + MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE = 1 << 1, + MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GENEVE = 1 << 2, + MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE = 1 << 3, + MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP = 1 << 4, +}; + +enum mlx5dv_flow_action_cap_flags { + MLX5DV_FLOW_ACTION_FLAGS_ESP_AES_GCM = 1 << 0, + MLX5DV_FLOW_ACTION_FLAGS_ESP_AES_GCM_REQ_METADATA = 1 << 1, + MLX5DV_FLOW_ACTION_FLAGS_ESP_AES_GCM_SPI_STEERING = 1 << 2, + MLX5DV_FLOW_ACTION_FLAGS_ESP_AES_GCM_FULL_OFFLOAD = 1 << 3, + MLX5DV_FLOW_ACTION_FLAGS_ESP_AES_GCM_TX_IV_IS_ESN = 1 << 4, +}; + +/* + * Direct verbs device-specific attributes + */ +struct mlx5dv_context { + uint8_t version; + uint64_t flags; + uint64_t comp_mask; + struct mlx5dv_cqe_comp_caps cqe_comp_caps; + struct mlx5dv_sw_parsing_caps sw_parsing_caps; + struct mlx5dv_striding_rq_caps striding_rq_caps; + uint32_t tunnel_offloads_caps; + uint32_t max_dynamic_bfregs; + uint64_t max_clock_info_update_nsec; + uint32_t flow_action_flags; /* use enum mlx5dv_flow_action_cap_flags */ + uint32_t dc_odp_caps; /* use enum ibv_odp_transport_cap_bits */ + void *hca_core_clock; +}; + +enum mlx5dv_context_flags { + /* + * This flag indicates if CQE version 0 or 1 is needed. 
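+	 * (mlx5dv_query_device() reports MLX5DV_CONTEXT_FLAGS_CQE_V1 in
+	 * mlx5dv_context.flags when the kernel driver operates with CQE
+	 * version 1.)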
+ */ + MLX5DV_CONTEXT_FLAGS_CQE_V1 = (1 << 0), + MLX5DV_CONTEXT_FLAGS_OBSOLETE = (1 << 1), /* Obsoleted, don't use */ + MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED = (1 << 2), + MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW = (1 << 3), + MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP = (1 << 4), /* Support CQE 128B compression */ + MLX5DV_CONTEXT_FLAGS_CQE_128B_PAD = (1 << 5), /* Support CQE 128B padding */ + MLX5DV_CONTEXT_FLAGS_PACKET_BASED_CREDIT_MODE = (1 << 6), +}; + +enum mlx5dv_cq_init_attr_mask { + MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE = 1 << 0, + MLX5DV_CQ_INIT_ATTR_MASK_FLAGS = 1 << 1, + MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE = 1 << 2, +}; + +enum mlx5dv_cq_init_attr_flags { + MLX5DV_CQ_INIT_ATTR_FLAGS_CQE_PAD = 1 << 0, + MLX5DV_CQ_INIT_ATTR_FLAGS_RESERVED = 1 << 1, +}; + +struct mlx5dv_cq_init_attr { + uint64_t comp_mask; /* Use enum mlx5dv_cq_init_attr_mask */ + uint8_t cqe_comp_res_format; /* Use enum mlx5dv_cqe_comp_res_format */ + uint32_t flags; /* Use enum mlx5dv_cq_init_attr_flags */ + uint16_t cqe_size; /* when MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE set */ +}; + +struct ibv_cq_ex *mlx5dv_create_cq(struct ibv_context *context, + struct ibv_cq_init_attr_ex *cq_attr, + struct mlx5dv_cq_init_attr *mlx5_cq_attr); + +enum mlx5dv_qp_create_flags { + MLX5DV_QP_CREATE_TUNNEL_OFFLOADS = 1 << 0, + MLX5DV_QP_CREATE_TIR_ALLOW_SELF_LOOPBACK_UC = 1 << 1, + MLX5DV_QP_CREATE_TIR_ALLOW_SELF_LOOPBACK_MC = 1 << 2, + MLX5DV_QP_CREATE_DISABLE_SCATTER_TO_CQE = 1 << 3, + MLX5DV_QP_CREATE_ALLOW_SCATTER_TO_CQE = 1 << 4, + MLX5DV_QP_CREATE_PACKET_BASED_CREDIT_MODE = 1 << 5, +}; + +enum mlx5dv_mkey_init_attr_flags { + MLX5DV_MKEY_INIT_ATTR_FLAGS_INDIRECT = 1 << 0, +}; + +struct mlx5dv_mkey_init_attr { + struct ibv_pd *pd; + uint32_t create_flags; /* Use enum mlx5dv_mkey_init_attr_flags */ + uint16_t max_entries; /* Requested max number of pointed entries by this indirect mkey */ +}; + +struct mlx5dv_mkey { + uint32_t lkey; + uint32_t rkey; +}; + +struct mlx5dv_mkey *mlx5dv_create_mkey(struct mlx5dv_mkey_init_attr *mkey_init_attr); +int mlx5dv_destroy_mkey(struct mlx5dv_mkey *mkey); + +enum mlx5dv_qp_init_attr_mask { + MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS = 1 << 0, + MLX5DV_QP_INIT_ATTR_MASK_DC = 1 << 1, + MLX5DV_QP_INIT_ATTR_MASK_SEND_OPS_FLAGS = 1 << 2, +}; + +enum mlx5dv_dc_type { + MLX5DV_DCTYPE_DCT = 1, + MLX5DV_DCTYPE_DCI, +}; + +struct mlx5dv_dc_init_attr { + enum mlx5dv_dc_type dc_type; + uint64_t dct_access_key; +}; + +enum mlx5dv_qp_create_send_ops_flags { + MLX5DV_QP_EX_WITH_MR_INTERLEAVED = 1 << 0, + MLX5DV_QP_EX_WITH_MR_LIST = 1 << 1, +}; + +struct mlx5dv_qp_init_attr { + uint64_t comp_mask; /* Use enum mlx5dv_qp_init_attr_mask */ + uint32_t create_flags; /* Use enum mlx5dv_qp_create_flags */ + struct mlx5dv_dc_init_attr dc_init_attr; + uint64_t send_ops_flags; /* Use enum mlx5dv_qp_create_send_ops_flags */ +}; + +struct ibv_qp *mlx5dv_create_qp(struct ibv_context *context, + struct ibv_qp_init_attr_ex *qp_attr, + struct mlx5dv_qp_init_attr *mlx5_qp_attr); + +struct mlx5dv_mr_interleaved { + uint64_t addr; + uint32_t bytes_count; + uint32_t bytes_skip; + uint32_t lkey; +}; + +enum mlx5dv_wc_opcode { + MLX5DV_WC_UMR = IBV_WC_DRIVER1, +}; + +struct mlx5dv_qp_ex { + uint64_t comp_mask; + /* + * Available just for the MLX5 DC QP type with send opcodes of type: + * rdma, atomic and send. 
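+	 *
+	 * Illustrative DCI post-send flow via the ibv_wr_* work-request
+	 * API (qpx, mqp and the data-path arguments are placeholders):
+	 *
+	 *	mqp = mlx5dv_qp_ex_from_ibv_qp_ex(qpx);
+	 *	ibv_wr_start(qpx);
+	 *	qpx->wr_id = my_wr_id;
+	 *	ibv_wr_rdma_write(qpx, rkey, remote_addr);
+	 *	mlx5dv_wr_set_dc_addr(mqp, ah, remote_dctn, remote_dc_key);
+	 *	ibv_wr_set_sge(qpx, lkey, local_addr, length);
+	 *	ibv_wr_complete(qpx);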
+ */ + void (*wr_set_dc_addr)(struct mlx5dv_qp_ex *mqp, struct ibv_ah *ah, + uint32_t remote_dctn, uint64_t remote_dc_key); + void (*wr_mr_interleaved)(struct mlx5dv_qp_ex *mqp, + struct mlx5dv_mkey *mkey, + uint32_t access_flags, /* use enum ibv_access_flags */ + uint32_t repeat_count, + uint16_t num_interleaved, + struct mlx5dv_mr_interleaved *data); + void (*wr_mr_list)(struct mlx5dv_qp_ex *mqp, + struct mlx5dv_mkey *mkey, + uint32_t access_flags, /* use enum ibv_access_flags */ + uint16_t num_sges, + struct ibv_sge *sge); +}; + +struct mlx5dv_qp_ex *mlx5dv_qp_ex_from_ibv_qp_ex(struct ibv_qp_ex *qp); + +static inline void mlx5dv_wr_set_dc_addr(struct mlx5dv_qp_ex *mqp, + struct ibv_ah *ah, + uint32_t remote_dctn, + uint64_t remote_dc_key) +{ + mqp->wr_set_dc_addr(mqp, ah, remote_dctn, remote_dc_key); +} + +static inline void mlx5dv_wr_mr_interleaved(struct mlx5dv_qp_ex *mqp, + struct mlx5dv_mkey *mkey, + uint32_t access_flags, + uint32_t repeat_count, + uint16_t num_interleaved, + struct mlx5dv_mr_interleaved *data) +{ + mqp->wr_mr_interleaved(mqp, mkey, access_flags, repeat_count, + num_interleaved, data); +} + +static inline void mlx5dv_wr_mr_list(struct mlx5dv_qp_ex *mqp, + struct mlx5dv_mkey *mkey, + uint32_t access_flags, + uint16_t num_sges, + struct ibv_sge *sge) +{ + mqp->wr_mr_list(mqp, mkey, access_flags, num_sges, sge); +} + +enum mlx5dv_flow_action_esp_mask { + MLX5DV_FLOW_ACTION_ESP_MASK_FLAGS = 1 << 0, +}; + +struct mlx5dv_flow_action_esp { + uint64_t comp_mask; /* Use enum mlx5dv_flow_action_esp_mask */ + uint32_t action_flags; /* Use enum mlx5dv_flow_action_flags */ +}; + +struct mlx5dv_flow_match_parameters { + size_t match_sz; + uint64_t match_buf[]; /* Device spec format */ +}; + +enum mlx5dv_flow_matcher_attr_mask { + MLX5DV_FLOW_MATCHER_MASK_FT_TYPE = 1 << 0, +}; + +struct mlx5dv_flow_matcher_attr { + enum ibv_flow_attr_type type; + uint32_t flags; /* From enum ibv_flow_flags */ + uint16_t priority; + uint8_t match_criteria_enable; /* Device spec format */ + struct mlx5dv_flow_match_parameters *match_mask; + uint64_t comp_mask; /* use mlx5dv_flow_matcher_attr_mask */ + enum mlx5dv_flow_table_type ft_type; +}; + +struct mlx5dv_flow_matcher; + +struct mlx5dv_flow_matcher * +mlx5dv_create_flow_matcher(struct ibv_context *context, + struct mlx5dv_flow_matcher_attr *matcher_attr); + +int mlx5dv_destroy_flow_matcher(struct mlx5dv_flow_matcher *matcher); + +enum mlx5dv_flow_action_type { + MLX5DV_FLOW_ACTION_DEST_IBV_QP, + MLX5DV_FLOW_ACTION_DROP, + MLX5DV_FLOW_ACTION_IBV_COUNTER, + MLX5DV_FLOW_ACTION_IBV_FLOW_ACTION, + MLX5DV_FLOW_ACTION_TAG, + MLX5DV_FLOW_ACTION_DEST_DEVX, + MLX5DV_FLOW_ACTION_COUNTERS_DEVX, +}; + +struct mlx5dv_flow_action_attr { + enum mlx5dv_flow_action_type type; + union { + struct ibv_qp *qp; + struct ibv_counters *counter; + struct ibv_flow_action *action; + uint32_t tag_value; + struct mlx5dv_devx_obj *obj; + }; +}; + +struct ibv_flow * +mlx5dv_create_flow(struct mlx5dv_flow_matcher *matcher, + struct mlx5dv_flow_match_parameters *match_value, + size_t num_actions, + struct mlx5dv_flow_action_attr actions_attr[]); + +struct ibv_flow_action *mlx5dv_create_flow_action_esp(struct ibv_context *ctx, + struct ibv_flow_action_esp_attr *esp, + struct mlx5dv_flow_action_esp *mlx5_attr); + +/* + * mlx5dv_create_flow_action_modify_header - Create a flow action which mutates + * a packet. The flow action can be attached to steering rules via + * ibv_create_flow(). + * + * @ctx: RDMA device context to create the action on. 
+ * @actions_sz: The size of *actions* buffer in bytes. + * @actions: A buffer which contains modify actions provided in device spec + * format. + * @ft_type: Defines the flow table type to which the modify + * header action will be attached. + * + * Return a valid ibv_flow_action if successful, NULL otherwise. + */ +struct ibv_flow_action * +mlx5dv_create_flow_action_modify_header(struct ibv_context *ctx, + size_t actions_sz, + uint64_t actions[], + enum mlx5dv_flow_table_type ft_type); + +/* + * mlx5dv_create_flow_action_packet_reformat - Create flow action which can + * encap/decap packets. + */ +struct ibv_flow_action * +mlx5dv_create_flow_action_packet_reformat(struct ibv_context *ctx, + size_t data_sz, + void *data, + enum mlx5dv_flow_action_packet_reformat_type reformat_type, + enum mlx5dv_flow_table_type ft_type); +/* + * Most device capabilities are exported by ibv_query_device(...), + * but there is HW device-specific information which is important + * for data-path, but isn't provided. + * + * Return 0 on success. + */ +int mlx5dv_query_device(struct ibv_context *ctx_in, + struct mlx5dv_context *attrs_out); + +enum mlx5dv_qp_comp_mask { + MLX5DV_QP_MASK_UAR_MMAP_OFFSET = 1 << 0, + MLX5DV_QP_MASK_RAW_QP_HANDLES = 1 << 1, + MLX5DV_QP_MASK_RAW_QP_TIR_ADDR = 1 << 2, +}; + +struct mlx5dv_qp { + __be32 *dbrec; + struct { + void *buf; + uint32_t wqe_cnt; + uint32_t stride; + } sq; + struct { + void *buf; + uint32_t wqe_cnt; + uint32_t stride; + } rq; + struct { + void *reg; + uint32_t size; + } bf; + uint64_t comp_mask; + off_t uar_mmap_offset; + uint32_t tirn; + uint32_t tisn; + uint32_t rqn; + uint32_t sqn; + uint64_t tir_icm_addr; +}; + +struct mlx5dv_cq { + void *buf; + __be32 *dbrec; + uint32_t cqe_cnt; + uint32_t cqe_size; + void *cq_uar; + uint32_t cqn; + uint64_t comp_mask; +}; + +enum mlx5dv_srq_comp_mask { + MLX5DV_SRQ_MASK_SRQN = 1 << 0, +}; + +struct mlx5dv_srq { + void *buf; + __be32 *dbrec; + uint32_t stride; + uint32_t head; + uint32_t tail; + uint64_t comp_mask; + uint32_t srqn; +}; + +struct mlx5dv_rwq { + void *buf; + __be32 *dbrec; + uint32_t wqe_cnt; + uint32_t stride; + uint64_t comp_mask; +}; + +struct mlx5dv_alloc_dm_attr { + enum mlx5dv_alloc_dm_type type; + uint64_t comp_mask; +}; + +enum mlx5dv_dm_comp_mask { + MLX5DV_DM_MASK_REMOTE_VA = 1 << 0, +}; + +struct mlx5dv_dm { + void *buf; + uint64_t length; + uint64_t comp_mask; + uint64_t remote_va; +}; + +struct ibv_dm *mlx5dv_alloc_dm(struct ibv_context *context, + struct ibv_alloc_dm_attr *dm_attr, + struct mlx5dv_alloc_dm_attr *mlx5_dm_attr); + +struct mlx5_wqe_av; + +struct mlx5dv_ah { + struct mlx5_wqe_av *av; + uint64_t comp_mask; +}; + +struct mlx5dv_pd { + uint32_t pdn; + uint64_t comp_mask; +}; + +struct mlx5dv_obj { + struct { + struct ibv_qp *in; + struct mlx5dv_qp *out; + } qp; + struct { + struct ibv_cq *in; + struct mlx5dv_cq *out; + } cq; + struct { + struct ibv_srq *in; + struct mlx5dv_srq *out; + } srq; + struct { + struct ibv_wq *in; + struct mlx5dv_rwq *out; + } rwq; + struct { + struct ibv_dm *in; + struct mlx5dv_dm *out; + } dm; + struct { + struct ibv_ah *in; + struct mlx5dv_ah *out; + } ah; + struct { + struct ibv_pd *in; + struct mlx5dv_pd *out; + } pd; +}; + +enum mlx5dv_obj_type { + MLX5DV_OBJ_QP = 1 << 0, + MLX5DV_OBJ_CQ = 1 << 1, + MLX5DV_OBJ_SRQ = 1 << 2, + MLX5DV_OBJ_RWQ = 1 << 3, + MLX5DV_OBJ_DM = 1 << 4, + MLX5DV_OBJ_AH = 1 << 5, + MLX5DV_OBJ_PD = 1 << 6, +}; + +enum mlx5dv_wq_init_attr_mask { + MLX5DV_WQ_INIT_ATTR_MASK_STRIDING_RQ = 1 << 0, +}; + +struct 
mlx5dv_striding_rq_init_attr { + uint32_t single_stride_log_num_of_bytes; + uint32_t single_wqe_log_num_of_strides; + uint8_t two_byte_shift_en; +}; + +struct mlx5dv_wq_init_attr { + uint64_t comp_mask; /* Use enum mlx5dv_wq_init_attr_mask */ + struct mlx5dv_striding_rq_init_attr striding_rq_attrs; +}; + +/* + * This function creates a work queue object with extra properties + * defined by the mlx5dv_wq_init_attr struct. + * + * For each bit set in comp_mask, a corresponding field in + * mlx5dv_wq_init_attr should be filled. + * + * MLX5DV_WQ_INIT_ATTR_MASK_STRIDING_RQ: Create a work queue with + * striding RQ capabilities. + * - single_stride_log_num_of_bytes represents the size of each stride in the + * WQE and its value should be between min_single_stride_log_num_of_bytes + * and max_single_stride_log_num_of_bytes that are reported in + * mlx5dv_query_device. + * - single_wqe_log_num_of_strides represents the number of strides in each WQE. + * Its value should be between min_single_wqe_log_num_of_strides and + * max_single_wqe_log_num_of_strides that are reported in mlx5dv_query_device. + * - two_byte_shift_en: When enabled, hardware pads 2 bytes of zeroes + * before writing the message to memory (e.g. for IP alignment). + */ +struct ibv_wq *mlx5dv_create_wq(struct ibv_context *context, + struct ibv_wq_init_attr *wq_init_attr, + struct mlx5dv_wq_init_attr *mlx5_wq_attr); +/* + * This function initializes the mlx5dv_xxx structs selected by the supplied + * type. The information used for the initialization is taken from the + * ibv_xxx structs supplied as part of the input. + * + * Requesting the information of a CQ marks it as owned by DV for all + * consumer-index related actions. + * + * The initialization type can be a combination of several types. + * + * Return: 0 in case of success. + */ +int mlx5dv_init_obj(struct mlx5dv_obj *obj, uint64_t obj_type); + +enum { + MLX5_OPCODE_NOP = 0x00, + MLX5_OPCODE_SEND_INVAL = 0x01, + MLX5_OPCODE_RDMA_WRITE = 0x08, + MLX5_OPCODE_RDMA_WRITE_IMM = 0x09, + MLX5_OPCODE_SEND = 0x0a, + MLX5_OPCODE_SEND_IMM = 0x0b, + MLX5_OPCODE_TSO = 0x0e, + MLX5_OPCODE_RDMA_READ = 0x10, + MLX5_OPCODE_ATOMIC_CS = 0x11, + MLX5_OPCODE_ATOMIC_FA = 0x12, + MLX5_OPCODE_ATOMIC_MASKED_CS = 0x14, + MLX5_OPCODE_ATOMIC_MASKED_FA = 0x15, + MLX5_OPCODE_FMR = 0x19, + MLX5_OPCODE_LOCAL_INVAL = 0x1b, + MLX5_OPCODE_CONFIG_CMD = 0x1f, + MLX5_OPCODE_UMR = 0x25, + MLX5_OPCODE_TAG_MATCHING = 0x28 +}; + +/* + * CQE related part + */ + +enum { + MLX5_INLINE_SCATTER_32 = 0x4, + MLX5_INLINE_SCATTER_64 = 0x8, +}; + +enum { + MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR = 0x01, + MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR = 0x02, + MLX5_CQE_SYNDROME_LOCAL_PROT_ERR = 0x04, + MLX5_CQE_SYNDROME_WR_FLUSH_ERR = 0x05, + MLX5_CQE_SYNDROME_MW_BIND_ERR = 0x06, + MLX5_CQE_SYNDROME_BAD_RESP_ERR = 0x10, + MLX5_CQE_SYNDROME_LOCAL_ACCESS_ERR = 0x11, + MLX5_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR = 0x12, + MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR = 0x13, + MLX5_CQE_SYNDROME_REMOTE_OP_ERR = 0x14, + MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR = 0x15, + MLX5_CQE_SYNDROME_RNR_RETRY_EXC_ERR = 0x16, + MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR = 0x22, +}; + +enum { + MLX5_CQE_VENDOR_SYNDROME_ODP_PFAULT = 0x93, +}; + +enum { + MLX5_CQE_L2_OK = 1 << 0, + MLX5_CQE_L3_OK = 1 << 1, + MLX5_CQE_L4_OK = 1 << 2, +}; + +enum { + MLX5_CQE_L3_HDR_TYPE_NONE = 0x0, + MLX5_CQE_L3_HDR_TYPE_IPV6 = 0x1, + MLX5_CQE_L3_HDR_TYPE_IPV4 = 0x2, +}; + +enum { + MLX5_CQE_OWNER_MASK = 1, + MLX5_CQE_REQ = 0, + MLX5_CQE_RESP_WR_IMM = 1, + MLX5_CQE_RESP_SEND = 2, + MLX5_CQE_RESP_SEND_IMM = 3, + MLX5_CQE_RESP_SEND_INV = 4, +
MLX5_CQE_RESIZE_CQ = 5, + MLX5_CQE_NO_PACKET = 6, + MLX5_CQE_REQ_ERR = 13, + MLX5_CQE_RESP_ERR = 14, + MLX5_CQE_INVALID = 15, +}; + +enum { + MLX5_CQ_DOORBELL = 0x20 +}; + +enum { + MLX5_CQ_DB_REQ_NOT_SOL = 1 << 24, + MLX5_CQ_DB_REQ_NOT = 0 << 24, +}; + +struct mlx5_err_cqe { + uint8_t rsvd0[32]; + uint32_t srqn; + uint8_t rsvd1[18]; + uint8_t vendor_err_synd; + uint8_t syndrome; + uint32_t s_wqe_opcode_qpn; + uint16_t wqe_counter; + uint8_t signature; + uint8_t op_own; +}; + +struct mlx5_tm_cqe { + __be32 success; + __be16 hw_phase_cnt; + uint8_t rsvd0[12]; +}; + +struct mlx5_cqe64 { + union { + struct { + uint8_t rsvd0[2]; + __be16 wqe_id; + uint8_t rsvd4[13]; + uint8_t ml_path; + uint8_t rsvd20[4]; + __be16 slid; + __be32 flags_rqpn; + uint8_t hds_ip_ext; + uint8_t l4_hdr_type_etc; + __be16 vlan_info; + }; + struct mlx5_tm_cqe tm_cqe; + /* TMH is scattered to CQE upon match */ + struct ibv_tmh tmh; + }; + __be32 srqn_uidx; + __be32 imm_inval_pkey; + uint8_t app; + uint8_t app_op; + __be16 app_info; + __be32 byte_cnt; + __be64 timestamp; + __be32 sop_drop_qpn; + __be16 wqe_counter; + uint8_t signature; + uint8_t op_own; +}; + +enum { + MLX5_TMC_SUCCESS = 0x80000000U, +}; + +enum mlx5dv_cqe_comp_res_format { + MLX5DV_CQE_RES_FORMAT_HASH = 1 << 0, + MLX5DV_CQE_RES_FORMAT_CSUM = 1 << 1, + MLX5DV_CQE_RES_FORMAT_CSUM_STRIDX = 1 << 2, +}; + +enum mlx5dv_sw_parsing_offloads { + MLX5DV_SW_PARSING = 1 << 0, + MLX5DV_SW_PARSING_CSUM = 1 << 1, + MLX5DV_SW_PARSING_LSO = 1 << 2, +}; + +static MLX5DV_ALWAYS_INLINE +uint8_t mlx5dv_get_cqe_owner(struct mlx5_cqe64 *cqe) +{ + return cqe->op_own & 0x1; +} + +static MLX5DV_ALWAYS_INLINE +void mlx5dv_set_cqe_owner(struct mlx5_cqe64 *cqe, uint8_t val) +{ + cqe->op_own = (val & 0x1) | (cqe->op_own & ~0x1); +} + +/* Solicited event */ +static MLX5DV_ALWAYS_INLINE +uint8_t mlx5dv_get_cqe_se(struct mlx5_cqe64 *cqe) +{ + return (cqe->op_own >> 1) & 0x1; +} + +static MLX5DV_ALWAYS_INLINE +uint8_t mlx5dv_get_cqe_format(struct mlx5_cqe64 *cqe) +{ + return (cqe->op_own >> 2) & 0x3; +} + +static MLX5DV_ALWAYS_INLINE +uint8_t mlx5dv_get_cqe_opcode(struct mlx5_cqe64 *cqe) +{ + return cqe->op_own >> 4; +} + +/* + * WQE related part + */ +enum { + MLX5_INVALID_LKEY = 0x100, +}; + +enum { + MLX5_EXTENDED_UD_AV = 0x80000000, +}; + +enum { + MLX5_WQE_CTRL_CQ_UPDATE = 2 << 2, + MLX5_WQE_CTRL_SOLICITED = 1 << 1, + MLX5_WQE_CTRL_FENCE = 4 << 5, + MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE = 1 << 5, +}; + +enum { + MLX5_SEND_WQE_BB = 64, + MLX5_SEND_WQE_SHIFT = 6, +}; + +enum { + MLX5_INLINE_SEG = 0x80000000, +}; + +enum { + MLX5_ETH_WQE_L3_CSUM = (1 << 6), + MLX5_ETH_WQE_L4_CSUM = (1 << 7), +}; + +struct mlx5_wqe_srq_next_seg { + uint8_t rsvd0[2]; + __be16 next_wqe_index; + uint8_t signature; + uint8_t rsvd1[11]; +}; + +struct mlx5_wqe_data_seg { + __be32 byte_count; + __be32 lkey; + __be64 addr; +}; + +struct mlx5_wqe_ctrl_seg { + __be32 opmod_idx_opcode; + __be32 qpn_ds; + uint8_t signature; + uint8_t rsvd[2]; + uint8_t fm_ce_se; + __be32 imm; +}; + +struct mlx5_mprq_wqe { + struct mlx5_wqe_srq_next_seg nseg; + struct mlx5_wqe_data_seg dseg; +}; + +struct mlx5_wqe_av { + union { + struct { + __be32 qkey; + __be32 reserved; + } qkey; + __be64 dc_key; + } key; + __be32 dqp_dct; + uint8_t stat_rate_sl; + uint8_t fl_mlid; + __be16 rlid; + uint8_t reserved0[4]; + uint8_t rmac[6]; + uint8_t tclass; + uint8_t hop_limit; + __be32 grh_gid_fl; + uint8_t rgid[16]; +}; + +struct mlx5_wqe_datagram_seg { + struct mlx5_wqe_av av; +}; + +struct mlx5_wqe_raddr_seg { + __be64 raddr; + __be32 rkey; + 
__be32 reserved; +}; + +struct mlx5_wqe_atomic_seg { + __be64 swap_add; + __be64 compare; +}; + +struct mlx5_wqe_inl_data_seg { + uint32_t byte_count; +}; + +struct mlx5_wqe_eth_seg { + __be32 rsvd0; + uint8_t cs_flags; + uint8_t rsvd1; + __be16 mss; + __be32 rsvd2; + __be16 inline_hdr_sz; + uint8_t inline_hdr_start[2]; + uint8_t inline_hdr[16]; +}; + +struct mlx5_wqe_tm_seg { + uint8_t opcode; + uint8_t flags; + __be16 index; + uint8_t rsvd0[2]; + __be16 sw_cnt; + uint8_t rsvd1[8]; + __be64 append_tag; + __be64 append_mask; +}; + +enum { + MLX5_WQE_UMR_CTRL_FLAG_INLINE = 1 << 7, + MLX5_WQE_UMR_CTRL_FLAG_CHECK_FREE = 1 << 5, + MLX5_WQE_UMR_CTRL_FLAG_TRNSLATION_OFFSET = 1 << 4, + MLX5_WQE_UMR_CTRL_FLAG_CHECK_QPN = 1 << 3, +}; + +enum { + MLX5_WQE_UMR_CTRL_MKEY_MASK_LEN = 1 << 0, + MLX5_WQE_UMR_CTRL_MKEY_MASK_START_ADDR = 1 << 6, + MLX5_WQE_UMR_CTRL_MKEY_MASK_MKEY = 1 << 13, + MLX5_WQE_UMR_CTRL_MKEY_MASK_QPN = 1 << 14, + MLX5_WQE_UMR_CTRL_MKEY_MASK_ACCESS_LOCAL_WRITE = 1 << 18, + MLX5_WQE_UMR_CTRL_MKEY_MASK_ACCESS_REMOTE_READ = 1 << 19, + MLX5_WQE_UMR_CTRL_MKEY_MASK_ACCESS_REMOTE_WRITE = 1 << 20, + MLX5_WQE_UMR_CTRL_MKEY_MASK_ACCESS_ATOMIC = 1 << 21, + MLX5_WQE_UMR_CTRL_MKEY_MASK_FREE = 1 << 29, +}; + +struct mlx5_wqe_umr_ctrl_seg { + uint8_t flags; + uint8_t rsvd0[3]; + __be16 klm_octowords; + __be16 translation_offset; + __be64 mkey_mask; + uint8_t rsvd1[32]; +}; + +struct mlx5_wqe_umr_klm_seg { + /* up to 2GB */ + __be32 byte_count; + __be32 mkey; + __be64 address; +}; + +union mlx5_wqe_umr_inline_seg { + struct mlx5_wqe_umr_klm_seg klm; +}; + +struct mlx5_wqe_umr_repeat_ent_seg { + __be16 stride; + __be16 byte_count; + __be32 memkey; + __be64 va; +}; + +struct mlx5_wqe_umr_repeat_block_seg { + __be32 byte_count; + __be32 op; + __be32 repeat_count; + __be16 reserved; + __be16 num_ent; + struct mlx5_wqe_umr_repeat_ent_seg entries[0]; +}; + +enum { + MLX5_WQE_MKEY_CONTEXT_FREE = 1 << 6 +}; + +enum { + MLX5_WQE_MKEY_CONTEXT_ACCESS_FLAGS_ATOMIC = 1 << 6, + MLX5_WQE_MKEY_CONTEXT_ACCESS_FLAGS_REMOTE_WRITE = 1 << 5, + MLX5_WQE_MKEY_CONTEXT_ACCESS_FLAGS_REMOTE_READ = 1 << 4, + MLX5_WQE_MKEY_CONTEXT_ACCESS_FLAGS_LOCAL_WRITE = 1 << 3, + MLX5_WQE_MKEY_CONTEXT_ACCESS_FLAGS_LOCAL_READ = 1 << 2 +}; + +struct mlx5_wqe_mkey_context_seg { + uint8_t free; + uint8_t reserved1; + uint8_t access_flags; + uint8_t sf; + __be32 qpn_mkey; + __be32 reserved2; + __be32 flags_pd; + __be64 start_addr; + __be64 len; + __be32 bsf_octword_size; + __be32 reserved3[4]; + __be32 translations_octword_size; + uint8_t reserved4[3]; + uint8_t log_page_size; + __be32 reserved; + union mlx5_wqe_umr_inline_seg inseg[0]; +}; + +/* + * Control segment - contains some control information for the current WQE. + * + * Output: + * seg - control segment to be filled + * Input: + * pi - WQEBB number of the first block of this WQE. + * This number should wrap at 0xffff, regardless of + * size of the WQ. + * opcode - Opcode of this WQE. Encodes the type of operation + * to be executed on the QP. + * opmod - Opcode modifier. + * qp_num - QP/SQ number this WQE is posted to. + * fm_ce_se - FM (fence mode), CE (completion and event mode) + * and SE (solicited event). + * ds - WQE size in octowords (16-byte units). DS accounts for all + * the segments in the WQE as summarized in WQE construction. + * signature - WQE signature. + * imm - Immediate data/Invalidation key/UMR mkey. 
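+ * + * As an illustration only (assuming ctrl, pi and qp_num were prepared by the + * caller), a SEND WQE made of one control segment (16 bytes) followed by one + * data segment (16 bytes) spans 2 octowords, so it would be filled with: + * + * mlx5dv_set_ctrl_seg(ctrl, pi, MLX5_OPCODE_SEND, 0, qp_num, + * MLX5_WQE_CTRL_CQ_UPDATE, 2, 0, 0);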
+ */ +static MLX5DV_ALWAYS_INLINE +void mlx5dv_set_ctrl_seg(struct mlx5_wqe_ctrl_seg *seg, uint16_t pi, + uint8_t opcode, uint8_t opmod, uint32_t qp_num, + uint8_t fm_ce_se, uint8_t ds, + uint8_t signature, uint32_t imm) +{ + seg->opmod_idx_opcode = htobe32(((uint32_t)opmod << 24) | ((uint32_t)pi << 8) | opcode); + seg->qpn_ds = htobe32((qp_num << 8) | ds); + seg->fm_ce_se = fm_ce_se; + seg->signature = signature; + /* + * The caller should prepare "imm" in advance based on the WR opcode. + * For IBV_WR_SEND_WITH_IMM and IBV_WR_RDMA_WRITE_WITH_IMM, + * "imm" should be assigned as is. + * For IBV_WR_SEND_WITH_INV, it should be htobe32(imm). + */ + seg->imm = imm; +} + +/* x86 optimized version of mlx5dv_set_ctrl_seg() + * + * This is useful when building many WQEs in parallel over large data sets. + * + * It is not suitable for serialized algorithms. + */ +#if defined(__SSE3__) +static MLX5DV_ALWAYS_INLINE +void mlx5dv_x86_set_ctrl_seg(struct mlx5_wqe_ctrl_seg *seg, uint16_t pi, + uint8_t opcode, uint8_t opmod, uint32_t qp_num, + uint8_t fm_ce_se, uint8_t ds, + uint8_t signature, uint32_t imm) +{ + __m128i val = _mm_set_epi32(imm, qp_num, (ds << 16) | pi, + (signature << 24) | (opcode << 16) | (opmod << 8) | fm_ce_se); + __m128i mask = _mm_set_epi8(15, 14, 13, 12, /* immediate */ + 0, /* signal/fence_mode */ +#if CHAR_MIN + -128, -128, /* reserved */ +#else + 0x80, 0x80, /* reserved */ +#endif + 3, /* signature */ + 6, /* data size */ + 8, 9, 10, /* QP num */ + 2, /* opcode */ + 4, 5, /* sw_pi in BE */ + 1 /* opmod */ + ); + *(__m128i *) seg = _mm_shuffle_epi8(val, mask); +} +#endif /* defined(__SSE3__) */ + +/* + * Datagram Segment - contains address information required in order + * to form a datagram message. + * + * Output: + * seg - datagram segment to be filled. + * Input: + * key - Q_key/access key. + * dqp_dct - Destination QP number for UD and DCT for DC. + * ext - Address vector extension. + * stat_rate_sl - Maximum static rate control, SL/ethernet priority. + * fl_mlid - Force loopback and source LID for IB. + * rlid - Remote LID. + * rmac - Remote MAC. + * tclass - GRH tclass/IPv6 tclass/IPv4 ToS. + * hop_limit - GRH hop limit/IPv6 hop limit/IPv4 TTL. + * grh_gid_fi - GRH, source GID address and IPv6 flow label. + * rgid - Remote GID/IP address. + */ +static MLX5DV_ALWAYS_INLINE +void mlx5dv_set_dgram_seg(struct mlx5_wqe_datagram_seg *seg, + uint64_t key, uint32_t dqp_dct, + uint8_t ext, uint8_t stat_rate_sl, + uint8_t fl_mlid, uint16_t rlid, + uint8_t *rmac, uint8_t tclass, + uint8_t hop_limit, uint32_t grh_gid_fi, + uint8_t *rgid) +{ + /* Always write all 64 bits; for a q_key, the reserved part will be 0 */ + seg->av.key.dc_key = htobe64(key); + seg->av.dqp_dct = htobe32(((uint32_t)ext << 31) | dqp_dct); + seg->av.stat_rate_sl = stat_rate_sl; + seg->av.fl_mlid = fl_mlid; + seg->av.rlid = htobe16(rlid); + memcpy(seg->av.rmac, rmac, 6); + seg->av.tclass = tclass; + seg->av.hop_limit = hop_limit; + seg->av.grh_gid_fl = htobe32(grh_gid_fi); + memcpy(seg->av.rgid, rgid, 16); +} + +/* + * Data Segments - contain pointers and a byte count for the scatter/gather list. + * They can optionally contain data, which will save a memory read access for + * gather Work Requests.
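+ * + * A minimal usage sketch (assuming buf lies inside memory registered as the + * ibv_mr mr, and dseg points at a free 16-byte slot in the send queue): + * + * mlx5dv_set_data_seg(dseg, length, mr->lkey, (uintptr_t)buf);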
+ */ +static MLX5DV_ALWAYS_INLINE +void mlx5dv_set_data_seg(struct mlx5_wqe_data_seg *seg, + uint32_t length, uint32_t lkey, + uintptr_t address) +{ + seg->byte_count = htobe32(length); + seg->lkey = htobe32(lkey); + seg->addr = htobe64(address); +} +/* + * x86 optimized version of mlx5dv_set_data_seg() + * + * This is useful when building many WQEs in parallel over large data sets. + * + * It is not suitable for serialized algorithms. + */ +#if defined(__SSE3__) +static MLX5DV_ALWAYS_INLINE +void mlx5dv_x86_set_data_seg(struct mlx5_wqe_data_seg *seg, + uint32_t length, uint32_t lkey, + uintptr_t address) +{ + uint64_t address64 = address; + __m128i val = _mm_set_epi32((uint32_t)address64, (uint32_t)(address64 >> 32), lkey, length); + __m128i mask = _mm_set_epi8(12, 13, 14, 15, /* local address low */ + 8, 9, 10, 11, /* local address high */ + 4, 5, 6, 7, /* l_key */ + 0, 1, 2, 3 /* byte count */ + ); + *(__m128i *) seg = _mm_shuffle_epi8(val, mask); +} +#endif /* defined(__SSE3__) */ + +/* + * Eth Segment - contains packet headers and information for stateless L2, L3, L4 offloading. + * + * Output: + * seg - Eth segment to be filled. + * Input: + * cs_flags - l3cs/l3cs_inner/l4cs/l4cs_inner. + * mss - Maximum segment size. For TSO WQEs, the number of bytes + * in the TCP payload to be transmitted in each packet. Must + * be 0 on non-TSO WQEs. + * inline_hdr_sz - Length of the inlined packet headers. + * inline_hdr_start - Inlined packet header. + */ +static MLX5DV_ALWAYS_INLINE +void mlx5dv_set_eth_seg(struct mlx5_wqe_eth_seg *seg, uint8_t cs_flags, + uint16_t mss, uint16_t inline_hdr_sz, + uint8_t *inline_hdr_start) +{ + seg->cs_flags = cs_flags; + seg->mss = htobe16(mss); + seg->inline_hdr_sz = htobe16(inline_hdr_sz); + memcpy(seg->inline_hdr_start, inline_hdr_start, inline_hdr_sz); +} + +enum mlx5dv_set_ctx_attr_type { + MLX5DV_CTX_ATTR_BUF_ALLOCATORS = 1, +}; + +enum { + MLX5_MMAP_GET_REGULAR_PAGES_CMD = 0, + MLX5_MMAP_GET_NC_PAGES_CMD = 3, +}; + +struct mlx5dv_ctx_allocators { + void *(*alloc)(size_t size, void *priv_data); + void (*free)(void *ptr, void *priv_data); + void *data; +}; + +/* + * Generic API for setting context attributes + * + * Returns 0 on success, or the value of errno on failure + * (which indicates the failure reason). + */ +int mlx5dv_set_context_attr(struct ibv_context *context, + enum mlx5dv_set_ctx_attr_type type, void *attr); + +struct mlx5dv_clock_info { + uint64_t nsec; + uint64_t last_cycles; + uint64_t frac; + uint32_t mult; + uint32_t shift; + uint64_t mask; +}; + +/* + * Get mlx5 core clock info + * + * Output: + * clock_info - clock info to be filled + * Input: + * context - device context + * + * Return: 0 on success, or the value of errno on failure + */ +int mlx5dv_get_clock_info(struct ibv_context *context, + struct mlx5dv_clock_info *clock_info); + +/* + * Translate a device timestamp to nanoseconds + * + * Input: + * clock_info - clock info as returned by mlx5dv_get_clock_info() + * device_timestamp - timestamp to translate + * + * Return: the timestamp in nanoseconds + */ +static inline uint64_t mlx5dv_ts_to_ns(struct mlx5dv_clock_info *clock_info, + uint64_t device_timestamp) +{ + uint64_t delta, nsec; + + /* + * device_timestamp & cycles are the free running 'mask' bit counters + * from the hardware hca_core_clock clock. + */ + delta = (device_timestamp - clock_info->last_cycles) & clock_info->mask; + nsec = clock_info->nsec; + + /* + * Guess whether device_timestamp is more recent than + * clock_info->last_cycles; if it seems to lie too far in the future, + * treat it as an old time stamp.
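+ * (For example, with a 31-bit mask, a delta above 2^30 cycles is treated + * as a time stamp from the past rather than the future.)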
This will break every max_clock_info_update_nsec. + */ + + if (delta > clock_info->mask / 2) { + delta = (clock_info->last_cycles - device_timestamp) & + clock_info->mask; + nsec -= ((delta * clock_info->mult) - clock_info->frac) >> + clock_info->shift; + } else { + nsec += ((delta * clock_info->mult) + clock_info->frac) >> + clock_info->shift; + } + + return nsec; +} + +enum mlx5dv_context_attr_flags { + MLX5DV_CONTEXT_FLAGS_DEVX = 1 << 0, +}; + +struct mlx5dv_context_attr { + uint32_t flags; /* Use enum mlx5dv_context_attr_flags */ + uint64_t comp_mask; +}; + +bool mlx5dv_is_supported(struct ibv_device *device); + +struct ibv_context * +mlx5dv_open_device(struct ibv_device *device, struct mlx5dv_context_attr *attr); + +struct mlx5dv_devx_obj; + +struct mlx5dv_devx_obj * +mlx5dv_devx_obj_create(struct ibv_context *context, const void *in, size_t inlen, + void *out, size_t outlen); +int mlx5dv_devx_obj_query(struct mlx5dv_devx_obj *obj, const void *in, size_t inlen, + void *out, size_t outlen); +int mlx5dv_devx_obj_modify(struct mlx5dv_devx_obj *obj, const void *in, size_t inlen, + void *out, size_t outlen); +int mlx5dv_devx_obj_destroy(struct mlx5dv_devx_obj *obj); +int mlx5dv_devx_general_cmd(struct ibv_context *context, const void *in, size_t inlen, + void *out, size_t outlen); + +struct mlx5dv_devx_umem { + uint32_t umem_id; +}; + +struct mlx5dv_devx_umem * +mlx5dv_devx_umem_reg(struct ibv_context *ctx, void *addr, size_t size, uint32_t access); +int mlx5dv_devx_umem_dereg(struct mlx5dv_devx_umem *umem); + +struct mlx5dv_devx_uar { + void *reg_addr; + void *base_addr; + uint32_t page_id; + off_t mmap_off; + uint64_t comp_mask; +}; + +struct mlx5dv_devx_uar *mlx5dv_devx_alloc_uar(struct ibv_context *context, + uint32_t flags); +void mlx5dv_devx_free_uar(struct mlx5dv_devx_uar *devx_uar); + + +struct mlx5dv_var { + uint32_t page_id; + uint32_t length; + off_t mmap_off; + uint64_t comp_mask; +}; + +struct mlx5dv_var * +mlx5dv_alloc_var(struct ibv_context *context, uint32_t flags); +void mlx5dv_free_var(struct mlx5dv_var *dv_var); + +int mlx5dv_devx_query_eqn(struct ibv_context *context, uint32_t vector, + uint32_t *eqn); + +int mlx5dv_devx_cq_query(struct ibv_cq *cq, const void *in, size_t inlen, + void *out, size_t outlen); +int mlx5dv_devx_cq_modify(struct ibv_cq *cq, const void *in, size_t inlen, + void *out, size_t outlen); +int mlx5dv_devx_qp_query(struct ibv_qp *qp, const void *in, size_t inlen, + void *out, size_t outlen); +int mlx5dv_devx_qp_modify(struct ibv_qp *qp, const void *in, size_t inlen, + void *out, size_t outlen); +int mlx5dv_devx_srq_query(struct ibv_srq *srq, const void *in, size_t inlen, + void *out, size_t outlen); +int mlx5dv_devx_srq_modify(struct ibv_srq *srq, const void *in, size_t inlen, + void *out, size_t outlen); +int mlx5dv_devx_wq_query(struct ibv_wq *wq, const void *in, size_t inlen, + void *out, size_t outlen); +int mlx5dv_devx_wq_modify(struct ibv_wq *wq, const void *in, size_t inlen, + void *out, size_t outlen); +int mlx5dv_devx_ind_tbl_query(struct ibv_rwq_ind_table *ind_tbl, + const void *in, size_t inlen, + void *out, size_t outlen); +int mlx5dv_devx_ind_tbl_modify(struct ibv_rwq_ind_table *ind_tbl, + const void *in, size_t inlen, + void *out, size_t outlen); + +struct mlx5dv_devx_cmd_comp { + int fd; +}; + +struct mlx5dv_devx_cmd_comp * +mlx5dv_devx_create_cmd_comp(struct ibv_context *context); +void mlx5dv_devx_destroy_cmd_comp(struct mlx5dv_devx_cmd_comp *cmd_comp); +int mlx5dv_devx_obj_query_async(struct mlx5dv_devx_obj *obj, const void *in, + 
size_t inlen, size_t outlen, + uint64_t wr_id, + struct mlx5dv_devx_cmd_comp *cmd_comp); + +int mlx5dv_devx_get_async_cmd_comp(struct mlx5dv_devx_cmd_comp *cmd_comp, + struct mlx5dv_devx_async_cmd_hdr *cmd_resp, + size_t cmd_resp_len); + +struct mlx5dv_devx_event_channel { + int fd; +}; + +struct mlx5dv_devx_event_channel * +mlx5dv_devx_create_event_channel(struct ibv_context *context, + enum mlx5dv_devx_create_event_channel_flags flags); +void mlx5dv_devx_destroy_event_channel(struct mlx5dv_devx_event_channel *event_channel); + + +int mlx5dv_devx_subscribe_devx_event(struct mlx5dv_devx_event_channel *event_channel, + struct mlx5dv_devx_obj *obj, /* can be NULL for unaffiliated events */ + uint16_t events_sz, + uint16_t events_num[], + uint64_t cookie); + +int mlx5dv_devx_subscribe_devx_event_fd(struct mlx5dv_devx_event_channel *event_channel, + int fd, + struct mlx5dv_devx_obj *obj, /* can be NULL for unaffiliated events */ + uint16_t event_num); + +/* return code: upon success number of bytes read, otherwise -1 and errno was set */ +ssize_t mlx5dv_devx_get_event(struct mlx5dv_devx_event_channel *event_channel, + struct mlx5dv_devx_async_event_hdr *event_data, + size_t event_resp_len); + + +#define __devx_nullp(typ) ((struct mlx5_ifc_##typ##_bits *)NULL) +#define __devx_st_sz_bits(typ) sizeof(struct mlx5_ifc_##typ##_bits) +#define __devx_bit_sz(typ, fld) sizeof(__devx_nullp(typ)->fld) +#define __devx_bit_off(typ, fld) offsetof(struct mlx5_ifc_##typ##_bits, fld) +#define __devx_dw_off(bit_off) ((bit_off) / 32) +#define __devx_64_off(bit_off) ((bit_off) / 64) +#define __devx_dw_bit_off(bit_sz, bit_off) (32 - (bit_sz) - ((bit_off) & 0x1f)) +#define __devx_mask(bit_sz) ((uint32_t)((1ull << (bit_sz)) - 1)) +#define __devx_dw_mask(bit_sz, bit_off) \ + (__devx_mask(bit_sz) << __devx_dw_bit_off(bit_sz, bit_off)) + +#define DEVX_FLD_SZ_BYTES(typ, fld) (__devx_bit_sz(typ, fld) / 8) +#define DEVX_ST_SZ_BYTES(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 8) +#define DEVX_ST_SZ_DW(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 32) +#define DEVX_ST_SZ_QW(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 64) +#define DEVX_UN_SZ_BYTES(typ) (sizeof(union mlx5_ifc_##typ##_bits) / 8) +#define DEVX_UN_SZ_DW(typ) (sizeof(union mlx5_ifc_##typ##_bits) / 32) +#define DEVX_BYTE_OFF(typ, fld) (__devx_bit_off(typ, fld) / 8) +#define DEVX_ADDR_OF(typ, p, fld) \ + ((unsigned char *)(p) + DEVX_BYTE_OFF(typ, fld)) + +static inline void _devx_set(void *p, uint32_t value, size_t bit_off, + size_t bit_sz) +{ + __be32 *fld = (__be32 *)(p) + __devx_dw_off(bit_off); + uint32_t dw_mask = __devx_dw_mask(bit_sz, bit_off); + uint32_t mask = __devx_mask(bit_sz); + + *fld = htobe32((be32toh(*fld) & (~dw_mask)) | + ((value & mask) << __devx_dw_bit_off(bit_sz, bit_off))); +} + +#define DEVX_SET(typ, p, fld, v) \ + _devx_set(p, v, __devx_bit_off(typ, fld), __devx_bit_sz(typ, fld)) + +static inline uint32_t _devx_get(const void *p, size_t bit_off, size_t bit_sz) +{ + return ((be32toh(*((const __be32 *)(p) + __devx_dw_off(bit_off))) >> + __devx_dw_bit_off(bit_sz, bit_off)) & + __devx_mask(bit_sz)); +} + +#define DEVX_GET(typ, p, fld) \ + _devx_get(p, __devx_bit_off(typ, fld), __devx_bit_sz(typ, fld)) + +static inline void _devx_set64(void *p, uint64_t v, size_t bit_off) +{ + *((__be64 *)(p) + __devx_64_off(bit_off)) = htobe64(v); +} + +#define DEVX_SET64(typ, p, fld, v) _devx_set64(p, v, __devx_bit_off(typ, fld)) + +static inline uint64_t _devx_get64(const void *p, size_t bit_off) +{ + return be64toh(*((const __be64 *)(p) + 
__devx_64_off(bit_off))); +} + +#define DEVX_GET64(typ, p, fld) _devx_get64(p, __devx_bit_off(typ, fld)) + +struct mlx5dv_dr_domain; +struct mlx5dv_dr_table; +struct mlx5dv_dr_matcher; +struct mlx5dv_dr_rule; +struct mlx5dv_dr_action; + +enum mlx5dv_dr_domain_type { + MLX5DV_DR_DOMAIN_TYPE_NIC_RX, + MLX5DV_DR_DOMAIN_TYPE_NIC_TX, + MLX5DV_DR_DOMAIN_TYPE_FDB, +}; + +enum mlx5dv_dr_domain_sync_flags { + MLX5DV_DR_DOMAIN_SYNC_FLAGS_SW = 1 << 0, + MLX5DV_DR_DOMAIN_SYNC_FLAGS_HW = 1 << 1, +}; + +struct mlx5dv_dr_flow_meter_attr { + struct mlx5dv_dr_table *next_table; + uint8_t active; + uint8_t reg_c_index; + size_t flow_meter_parameter_sz; + void *flow_meter_parameter; +}; + +struct mlx5dv_dr_domain * +mlx5dv_dr_domain_create(struct ibv_context *ctx, + enum mlx5dv_dr_domain_type type); + +int mlx5dv_dr_domain_destroy(struct mlx5dv_dr_domain *domain); + +int mlx5dv_dr_domain_sync(struct mlx5dv_dr_domain *domain, uint32_t flags); + +struct mlx5dv_dr_table * +mlx5dv_dr_table_create(struct mlx5dv_dr_domain *domain, uint32_t level); + +int mlx5dv_dr_table_destroy(struct mlx5dv_dr_table *table); + +struct mlx5dv_dr_matcher * +mlx5dv_dr_matcher_create(struct mlx5dv_dr_table *table, + uint16_t priority, + uint8_t match_criteria_enable, + struct mlx5dv_flow_match_parameters *mask); + +int mlx5dv_dr_matcher_destroy(struct mlx5dv_dr_matcher *matcher); + +struct mlx5dv_dr_rule * +mlx5dv_dr_rule_create(struct mlx5dv_dr_matcher *matcher, + struct mlx5dv_flow_match_parameters *value, + size_t num_actions, + struct mlx5dv_dr_action *actions[]); + +int mlx5dv_dr_rule_destroy(struct mlx5dv_dr_rule *rule); + +enum mlx5dv_dr_action_flags { + MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL = 1 << 0, +}; + +struct mlx5dv_dr_action * +mlx5dv_dr_action_create_dest_ibv_qp(struct ibv_qp *ibqp); + +struct mlx5dv_dr_action * +mlx5dv_dr_action_create_dest_table(struct mlx5dv_dr_table *table); + +struct mlx5dv_dr_action * +mlx5dv_dr_action_create_dest_vport(struct mlx5dv_dr_domain *domain, + uint32_t vport); + +struct mlx5dv_dr_action *mlx5dv_dr_action_create_drop(void); + +struct mlx5dv_dr_action *mlx5dv_dr_action_create_tag(uint32_t tag_value); + +struct mlx5dv_dr_action * +mlx5dv_dr_action_create_flow_counter(struct mlx5dv_devx_obj *devx_obj, + uint32_t offset); + +struct mlx5dv_dr_action * +mlx5dv_dr_action_create_packet_reformat(struct mlx5dv_dr_domain *domain, + uint32_t flags, + enum mlx5dv_flow_action_packet_reformat_type reformat_type, + size_t data_sz, void *data); + +struct mlx5dv_dr_action * +mlx5dv_dr_action_create_modify_header(struct mlx5dv_dr_domain *domain, + uint32_t flags, + size_t actions_sz, + __be64 actions[]); + +struct mlx5dv_dr_action * +mlx5dv_dr_action_create_flow_meter(struct mlx5dv_dr_flow_meter_attr *attr); + +int mlx5dv_dr_action_modify_flow_meter(struct mlx5dv_dr_action *action, + struct mlx5dv_dr_flow_meter_attr *attr, + __be64 modify_field_select); + +int mlx5dv_dr_action_destroy(struct mlx5dv_dr_action *action); + +int mlx5dv_dump_dr_domain(FILE *fout, struct mlx5dv_dr_domain *domain); +int mlx5dv_dump_dr_table(FILE *fout, struct mlx5dv_dr_table *table); +int mlx5dv_dump_dr_matcher(FILE *fout, struct mlx5dv_dr_matcher *matcher); +int mlx5dv_dump_dr_rule(FILE *fout, struct mlx5dv_dr_rule *rule); + +struct mlx5dv_pp { + uint16_t index; +}; + +struct mlx5dv_pp *mlx5dv_pp_alloc(struct ibv_context *context, + size_t pp_context_sz, + const void *pp_context, + uint32_t flags); + +void mlx5dv_pp_free(struct mlx5dv_pp *pp); + +#ifdef __cplusplus +} +#endif + +#endif /* _MLX5DV_H_ */ diff --git 
a/providers/mlx5/mlx5dv_dr.h b/providers/mlx5/mlx5dv_dr.h new file mode 100644 index 0000000..dc99075 --- /dev/null +++ b/providers/mlx5/mlx5dv_dr.h @@ -0,0 +1,1020 @@ +/* + * Copyright (c) 2019, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _MLX5_DV_DR_ +#define _MLX5_DV_DR_ + +#include <ccan/list.h> +#include <ccan/minmax.h> +#include <stdatomic.h> +#include "mlx5dv.h" +#include "mlx5_ifc.h" +#include "mlx5.h" + +#define DR_RULE_MAX_STES 17 +#define DR_ACTION_MAX_STES 3 +#define WIRE_PORT 0xFFFF +#define DR_STE_SVLAN 0x1 +#define DR_STE_CVLAN 0x2 + +#define dr_dbg(dmn, arg...) dr_dbg_ctx((dmn)->ctx, ##arg) + +#define dr_dbg_ctx(ctx, arg...) 
\ + mlx5_dbg(to_mctx(ctx)->dbg_fp, MLX5_DBG_DR, ##arg); + +enum dr_icm_chunk_size { + DR_CHUNK_SIZE_1, + DR_CHUNK_SIZE_MIN = DR_CHUNK_SIZE_1, /* keep updated when changing */ + DR_CHUNK_SIZE_2, + DR_CHUNK_SIZE_4, + DR_CHUNK_SIZE_8, + DR_CHUNK_SIZE_16, + DR_CHUNK_SIZE_32, + DR_CHUNK_SIZE_64, + DR_CHUNK_SIZE_128, + DR_CHUNK_SIZE_256, + DR_CHUNK_SIZE_512, + DR_CHUNK_SIZE_1K, + DR_CHUNK_SIZE_2K, + DR_CHUNK_SIZE_4K, + DR_CHUNK_SIZE_8K, + DR_CHUNK_SIZE_16K, + DR_CHUNK_SIZE_32K, + DR_CHUNK_SIZE_64K, + DR_CHUNK_SIZE_128K, + DR_CHUNK_SIZE_256K, + DR_CHUNK_SIZE_512K, + DR_CHUNK_SIZE_1024K, + DR_CHUNK_SIZE_2048K, + DR_CHUNK_SIZE_MAX, +}; + +enum dr_icm_type { + DR_ICM_TYPE_STE, + DR_ICM_TYPE_MODIFY_ACTION, +}; + +static inline enum dr_icm_chunk_size +dr_icm_next_higher_chunk(enum dr_icm_chunk_size chunk) +{ + chunk += 2; + if (chunk < DR_CHUNK_SIZE_MAX) + return chunk; + + return DR_CHUNK_SIZE_MAX; +} + +enum dr_ste_lu_type { + DR_STE_LU_TYPE_NOP = 0x00, + DR_STE_LU_TYPE_SRC_GVMI_AND_QP = 0x05, + DR_STE_LU_TYPE_ETHL2_TUNNELING_I = 0x0a, + DR_STE_LU_TYPE_ETHL2_DST_O = 0x06, + DR_STE_LU_TYPE_ETHL2_DST_I = 0x07, + DR_STE_LU_TYPE_ETHL2_DST_D = 0x1b, + DR_STE_LU_TYPE_ETHL2_SRC_O = 0x08, + DR_STE_LU_TYPE_ETHL2_SRC_I = 0x09, + DR_STE_LU_TYPE_ETHL2_SRC_D = 0x1c, + DR_STE_LU_TYPE_ETHL2_SRC_DST_O = 0x36, + DR_STE_LU_TYPE_ETHL2_SRC_DST_I = 0x37, + DR_STE_LU_TYPE_ETHL2_SRC_DST_D = 0x38, + DR_STE_LU_TYPE_ETHL3_IPV6_DST_O = 0x0d, + DR_STE_LU_TYPE_ETHL3_IPV6_DST_I = 0x0e, + DR_STE_LU_TYPE_ETHL3_IPV6_DST_D = 0x1e, + DR_STE_LU_TYPE_ETHL3_IPV6_SRC_O = 0x0f, + DR_STE_LU_TYPE_ETHL3_IPV6_SRC_I = 0x10, + DR_STE_LU_TYPE_ETHL3_IPV6_SRC_D = 0x1f, + DR_STE_LU_TYPE_ETHL3_IPV4_5_TUPLE_O = 0x11, + DR_STE_LU_TYPE_ETHL3_IPV4_5_TUPLE_I = 0x12, + DR_STE_LU_TYPE_ETHL3_IPV4_5_TUPLE_D = 0x20, + DR_STE_LU_TYPE_ETHL3_IPV4_MISC_O = 0x29, + DR_STE_LU_TYPE_ETHL3_IPV4_MISC_I = 0x2a, + DR_STE_LU_TYPE_ETHL3_IPV4_MISC_D = 0x2b, + DR_STE_LU_TYPE_ETHL4_O = 0x13, + DR_STE_LU_TYPE_ETHL4_I = 0x14, + DR_STE_LU_TYPE_ETHL4_D = 0x21, + DR_STE_LU_TYPE_ETHL4_MISC_O = 0x2c, + DR_STE_LU_TYPE_ETHL4_MISC_I = 0x2d, + DR_STE_LU_TYPE_ETHL4_MISC_D = 0x2e, + DR_STE_LU_TYPE_MPLS_FIRST_O = 0x15, + DR_STE_LU_TYPE_MPLS_FIRST_I = 0x24, + DR_STE_LU_TYPE_MPLS_FIRST_D = 0x25, + DR_STE_LU_TYPE_GRE = 0x16, + DR_STE_LU_TYPE_FLEX_PARSER_0 = 0x22, + DR_STE_LU_TYPE_FLEX_PARSER_1 = 0x23, + DR_STE_LU_TYPE_FLEX_PARSER_TNL_HEADER = 0x19, + DR_STE_LU_TYPE_GENERAL_PURPOSE = 0x18, + DR_STE_LU_TYPE_STEERING_REGISTERS_0 = 0x2f, + DR_STE_LU_TYPE_STEERING_REGISTERS_1 = 0x30, + DR_STE_LU_TYPE_DONT_CARE = 0x0f, +}; + +enum dr_ste_entry_type { + DR_STE_TYPE_TX = 1, + DR_STE_TYPE_RX = 2, + DR_STE_TYPE_MODIFY_PKT = 6, +}; + +enum { + DR_STE_SIZE = 64, + DR_STE_SIZE_CTRL = 32, + DR_STE_SIZE_TAG = 16, + DR_STE_SIZE_MASK = 16, +}; + +enum { + DR_STE_SIZE_REDUCED = DR_STE_SIZE - DR_STE_SIZE_MASK, +}; + +enum { + DR_MODIFY_ACTION_SIZE = 8, +}; + +enum dr_matcher_criteria { + DR_MATCHER_CRITERIA_EMPTY = 0, + DR_MATCHER_CRITERIA_OUTER = 1 << 0, + DR_MATCHER_CRITERIA_MISC = 1 << 1, + DR_MATCHER_CRITERIA_INNER = 1 << 2, + DR_MATCHER_CRITERIA_MISC2 = 1 << 3, + DR_MATCHER_CRITERIA_MISC3 = 1 << 4, + DR_MATCHER_CRITERIA_MAX = 1 << 5, +}; + +enum dr_action_type { + DR_ACTION_TYP_TNL_L2_TO_L2, + DR_ACTION_TYP_L2_TO_TNL_L2, + DR_ACTION_TYP_TNL_L3_TO_L2, + DR_ACTION_TYP_L2_TO_TNL_L3, + DR_ACTION_TYP_DROP, + DR_ACTION_TYP_QP, + DR_ACTION_TYP_FT, + DR_ACTION_TYP_CTR, + DR_ACTION_TYP_TAG, + DR_ACTION_TYP_MODIFY_HDR, + DR_ACTION_TYP_VPORT, + DR_ACTION_TYP_METER, + DR_ACTION_TYP_MAX, +}; + +struct dr_icm_pool; 
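+ +/* + * Chunk sizes encode entry counts: DR_CHUNK_SIZE_n provides room for n + * entries (dr_icm_pool_chunk_size_to_entries() below maps a chunk size to + * 1 << chunk_size entries). Note that dr_icm_next_higher_chunk() advances + * by two enum steps, i.e. it returns a chunk holding four times as many + * entries, for example: + * + * dr_icm_next_higher_chunk(DR_CHUNK_SIZE_1) == DR_CHUNK_SIZE_4 + */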
+struct dr_icm_chunk; +struct dr_icm_bucket; +struct dr_ste_htbl; +struct dr_match_param; +struct dr_devx_caps; +struct dr_matcher_rx_tx; + +struct dr_data_seg { + uint64_t addr; + uint32_t length; + uint32_t lkey; + unsigned int send_flags; +}; + +struct postsend_info { + struct dr_data_seg write; + struct dr_data_seg read; + uint64_t remote_addr; + uint32_t rkey; +}; + +struct dr_ste { + uint8_t *hw_ste; + /* refcount: indicates the num of rules that using this ste */ + atomic_int refcount; + + /* attached to the miss_list head at each htbl entry */ + struct list_node miss_list_node; + + /* each rule member that uses this ste attached here */ + struct list_head rule_list; + + /* this ste is member of htbl */ + struct dr_ste_htbl *htbl; + + struct dr_ste_htbl *next_htbl; + + /* this ste is part of a rule, located in ste's chain */ + uint8_t ste_chain_location; +}; + +struct dr_ste_htbl_ctrl { + /* total number of valid entries belonging to this hash table. This + * includes the non collision and collision entries + */ + int num_of_valid_entries; + + /* total number of collisions entries attached to this table */ + int num_of_collisions; + int increase_threshold; + bool may_grow; +}; + +struct dr_ste_htbl { + uint8_t lu_type; + uint16_t byte_mask; + atomic_int refcount; + struct dr_icm_chunk *chunk; + struct dr_ste *ste_arr; + uint8_t *hw_ste_arr; + + struct list_head *miss_list; + + enum dr_icm_chunk_size chunk_size; + struct dr_ste *pointing_ste; + + struct dr_ste_htbl_ctrl ctrl; +}; + +struct dr_ste_send_info { + struct dr_ste *ste; + struct list_node send_list; + uint16_t size; + uint16_t offset; + uint8_t data_cont[DR_STE_SIZE]; + uint8_t *data; +}; + +void dr_send_fill_and_append_ste_send_info(struct dr_ste *ste, uint16_t size, + uint16_t offset, uint8_t *data, + struct dr_ste_send_info *ste_info, + struct list_head *send_list, + bool copy_data); + +struct dr_ste_build { + bool inner; + bool rx; + struct dr_devx_caps *caps; + uint8_t lu_type; + uint16_t byte_mask; + uint8_t bit_mask[DR_STE_SIZE_MASK]; + int (*ste_build_tag_func)(struct dr_match_param *spec, + struct dr_ste_build *sb, + uint8_t *hw_ste_p); +}; + +struct dr_ste_htbl *dr_ste_htbl_alloc(struct dr_icm_pool *pool, + enum dr_icm_chunk_size chunk_size, + uint8_t lu_type, uint16_t byte_mask); +int dr_ste_htbl_free(struct dr_ste_htbl *htbl); + +static inline void dr_htbl_put(struct dr_ste_htbl *htbl) +{ + if (atomic_fetch_sub(&htbl->refcount, 1) == 1) + dr_ste_htbl_free(htbl); +} + +static inline void dr_htbl_get(struct dr_ste_htbl *htbl) +{ + atomic_fetch_add(&htbl->refcount, 1); +} + +/* STE utils */ +uint32_t dr_ste_calc_hash_index(uint8_t *hw_ste_p, struct dr_ste_htbl *htbl); +void dr_ste_init(uint8_t *hw_ste_p, uint8_t lu_type, uint8_t entry_type, uint16_t gvmi); +void dr_ste_always_hit_htbl(struct dr_ste *ste, struct dr_ste_htbl *next_htbl); +void dr_ste_set_miss_addr(uint8_t *hw_ste, uint64_t miss_addr); +uint64_t dr_ste_get_miss_addr(uint8_t *hw_ste); +void dr_ste_set_hit_addr(uint8_t *hw_ste, uint64_t icm_addr, uint32_t ht_size); +void dr_ste_always_miss_addr(struct dr_ste *ste, uint64_t miss_addr); +void dr_ste_set_bit_mask(uint8_t *hw_ste_p, uint8_t *bit_mask); +bool dr_ste_not_used_ste(struct dr_ste *ste); +bool dr_ste_is_last_in_rule(struct dr_matcher_rx_tx *nic_matcher, + uint8_t ste_location); +void dr_ste_rx_set_flow_tag(uint8_t *hw_ste_p, uint32_t flow_tag); +void dr_ste_set_counter_id(uint8_t *hw_ste_p, uint32_t ctr_id); +void dr_ste_set_tx_encap(void *hw_ste_p, uint32_t reformat_id, int size, bool encap_l3); 
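+ +/* + * A short sketch of the reference counting used by dr_htbl_get()/dr_htbl_put() + * above (dr_ste_get()/dr_ste_put() below follow the same pattern): an object + * stays alive while referenced and is freed by the last put, e.g.: + * + * dr_htbl_get(htbl); + * ... use htbl ... + * dr_htbl_put(htbl); + * + * where the final put frees htbl once its refcount drops back to zero. + */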
+void dr_ste_set_rx_decap(uint8_t *hw_ste_p); +void dr_ste_set_rx_decap_l3(uint8_t *hw_ste_p, bool vlan); +void dr_ste_set_entry_type(uint8_t *hw_ste_p, uint8_t entry_type); +uint8_t dr_ste_get_entry_type(uint8_t *hw_ste_p); +void dr_ste_set_rewrite_actions(uint8_t *hw_ste_p, uint16_t num_of_actions, + uint32_t re_write_index); +uint64_t dr_ste_get_icm_addr(struct dr_ste *ste); +uint64_t dr_ste_get_mr_addr(struct dr_ste *ste); +struct list_head *dr_ste_get_miss_list(struct dr_ste *ste); + +void dr_ste_free(struct dr_ste *ste, + struct mlx5dv_dr_matcher *matcher, + struct dr_matcher_rx_tx *nic_matcher); +static inline void dr_ste_put(struct dr_ste *ste, + struct mlx5dv_dr_matcher *matcher, + struct dr_matcher_rx_tx *nic_matcher) +{ + if (atomic_fetch_sub(&ste->refcount, 1) == 1) + dr_ste_free(ste, matcher, nic_matcher); +} + +/* initial as 0, increased only when ste appears in a new rule */ +static inline void dr_ste_get(struct dr_ste *ste) +{ + atomic_fetch_add(&ste->refcount, 1); +} + +void dr_ste_set_hit_addr_by_next_htbl(uint8_t *hw_ste, + struct dr_ste_htbl *next_htbl); +bool dr_ste_equal_tag(void *src, void *dst); +int dr_ste_create_next_htbl(struct mlx5dv_dr_matcher *matcher, + struct dr_matcher_rx_tx *nic_matcher, + struct dr_ste *ste, + uint8_t *cur_hw_ste, + enum dr_icm_chunk_size log_table_size); + +/* STE build functions */ +int dr_ste_build_pre_check(struct mlx5dv_dr_domain *dmn, + uint8_t match_criteria, + struct dr_match_param *mask, + struct dr_match_param *value); +int dr_ste_build_ste_arr(struct mlx5dv_dr_matcher *matcher, + struct dr_matcher_rx_tx *nic_matcher, + struct dr_match_param *value, + uint8_t *ste_arr); +int dr_ste_build_eth_l2_src_des(struct dr_ste_build *builder, + struct dr_match_param *mask, + bool inner, bool rx); +void dr_ste_build_eth_l3_ipv4_5_tuple(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx); +void dr_ste_build_eth_l3_ipv4_misc(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx); +void dr_ste_build_eth_l3_ipv6_dst(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx); +void dr_ste_build_eth_l3_ipv6_src(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx); +void dr_ste_build_eth_l2_src(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx); +void dr_ste_build_eth_l2_dst(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx); +void dr_ste_build_eth_l2_tnl(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx); +void dr_ste_build_ipv6_l3_l4(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx); +void dr_ste_build_eth_l4_misc(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx); +void dr_ste_build_gre(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx); +void dr_ste_build_mpls(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx); +void dr_ste_build_flex_parser_0(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx); +int dr_ste_build_flex_parser_1(struct dr_ste_build *sb, + struct dr_match_param *mask, + struct dr_devx_caps *caps, + bool inner, bool rx); +void dr_ste_build_flex_parser_tnl_vxlan_gpe(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx); +void dr_ste_build_flex_parser_tnl_geneve(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx); +void dr_ste_build_flex_parser_tnl_gtpu(struct 
dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx); +void dr_ste_build_general_purpose(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx); +void dr_ste_build_register_0(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx); +void dr_ste_build_register_1(struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx); +int dr_ste_build_src_gvmi_qpn(struct dr_ste_build *sb, + struct dr_match_param *mask, + struct dr_devx_caps *caps, + bool inner, bool rx); +void dr_ste_build_empty_always_hit(struct dr_ste_build *sb, bool rx); + +/* Actions utils */ +int dr_actions_build_ste_arr(struct mlx5dv_dr_matcher *matcher, + struct dr_matcher_rx_tx *nic_matcher, + struct mlx5dv_dr_action *actions[], + uint32_t num_actions, + uint8_t *ste_arr, + uint32_t *new_hw_ste_arr_sz); +int dr_actions_build_attr(struct mlx5dv_dr_matcher *matcher, + struct mlx5dv_dr_action *actions[], + size_t num_actions, + struct mlx5dv_flow_action_attr *attr, + struct mlx5_flow_action_attr_aux *attr_aux); + +struct dr_match_spec { + uint32_t smac_47_16; /* Source MAC address of incoming packet */ + uint32_t ethertype:16; /* Incoming packet Ethertype - this is the Ethertype following the last ;VLAN tag of the packet */ + uint32_t smac_15_0:16; /* Source MAC address of incoming packet */ + uint32_t dmac_47_16; /* Destination MAC address of incoming packet */ + uint32_t first_vid:12; /* VLAN ID of first VLAN tag in the incoming packet. Valid only ;when cvlan_tag==1 or svlan_tag==1 */ + uint32_t first_cfi:1; /* CFI bit of first VLAN tag in the incoming packet. Valid only when ;cvlan_tag==1 or svlan_tag==1 */ + uint32_t first_prio:3; /* Priority of first VLAN tag in the incoming packet. Valid only when ;cvlan_tag==1 or svlan_tag==1 */ + uint32_t dmac_15_0:16; /* Destination MAC address of incoming packet */ + uint32_t tcp_flags:9; /* TCP flags. ;Bit 0: FIN;Bit 1: SYN;Bit 2: RST;Bit 3: PSH;Bit 4: ACK;Bit 5: URG;Bit 6: ECE;Bit 7: CWR;Bit 8: NS */ + uint32_t ip_version:4; /* IP version */ + uint32_t frag:1; /* Packet is an IP fragment */ + uint32_t svlan_tag:1; /* The first vlan in the packet is s-vlan (0x8a88). cvlan_tag and ;svlan_tag cannot be set together */ + uint32_t cvlan_tag:1; /* The first vlan in the packet is c-vlan (0x8100). cvlan_tag and ;svlan_tag cannot be set together */ + uint32_t ip_ecn:2; /* Explicit Congestion Notification derived from Traffic Class/TOS ;field of IPv6/v4 */ + uint32_t ip_dscp:6; /* Differentiated Services Code Point derived from Traffic Class/;TOS field of IPv6/v4 */ + uint32_t ip_protocol:8; /* IP protocol */ + uint32_t tcp_dport:16; /* TCP destination port. 
;tcp and udp sport/dport are mutually exclusive */ + uint32_t tcp_sport:16; /* TCP source port.;tcp and udp sport/dport are mutually exclusive */ + uint32_t ip_ttl_hoplimit:8; + uint32_t udp_dport:16; /* UDP destination port.;tcp and udp sport/dport are mutually exclusive */ + uint32_t udp_sport:16; /* UDP source port.;tcp and udp sport/dport are mutually exclusive */ + uint32_t src_ip_127_96; /* IPv6 source address of incoming packets ;For IPv4 address use bits 31:0 (rest of the bits are reserved);This field should be qualified by an appropriate ;ethertype */ + uint32_t src_ip_95_64; /* IPv6 source address of incoming packets ;For IPv4 address use bits 31:0 (rest of the bits are reserved);This field should be qualified by an appropriate ;ethertype */ + uint32_t src_ip_63_32; /* IPv6 source address of incoming packets ;For IPv4 address use bits 31:0 (rest of the bits are reserved);This field should be qualified by an appropriate ;ethertype */ + uint32_t src_ip_31_0; /* IPv6 source address of incoming packets ;For IPv4 address use bits 31:0 (rest of the bits are reserved);This field should be qualified by an appropriate ;ethertype */ + uint32_t dst_ip_127_96; /* IPv6 destination address of incoming packets ;For IPv4 address use bits 31:0 (rest of the bits are reserved);This field should be qualified by an appropriate ;ethertype */ + uint32_t dst_ip_95_64; /* IPv6 destination address of incoming packets ;For IPv4 address use bits 31:0 (rest of the bits are reserved);This field should be qualified by an appropriate ;ethertype */ + uint32_t dst_ip_63_32; /* IPv6 destination address of incoming packets ;For IPv4 address use bits 31:0 (rest of the bits are reserved);This field should be qualified by an appropriate ;ethertype */ + uint32_t dst_ip_31_0; /* IPv6 destination address of incoming packets ;For IPv4 address use bits 31:0 (rest of the bits are reserved);This field should be qualified by an appropriate ;ethertype */ +}; + +struct dr_match_misc { + uint32_t source_sqn:24; /* Source SQN */ + uint32_t source_vhca_port:4; + uint32_t gre_s_present:1; /* used with GRE, sequence number exist when gre_s_present == 1 */ + uint32_t gre_k_present:1; /* used with GRE, key exist when gre_k_present == 1 */ + uint32_t gre_c_present:1; /* used with GRE, checksum exist when gre_c_present == 1 */ + uint32_t source_port:16; /* Source port.;0xffff determines wire port */ + uint32_t inner_second_vid:12; /* VLAN ID of first VLAN tag the inner header of the incoming packet. ;Valid only when inner_second_cvlan_tag ==1 or inner_sec;ond_svlan_tag ==1 */ + uint32_t inner_second_cfi:1; /* CFI bit of first VLAN tag in the inner header of the incoming packet. ;Valid only when inner_second_cvlan_tag ==1 or inner_sec;ond_svlan_tag ==1 */ + uint32_t inner_second_prio:3; /* Priority of second VLAN tag in the inner header of the incoming ;packet. Valid only when inner_second_cvlan_tag ==1 or inner_sec;ond_svlan_tag ==1 */ + uint32_t outer_second_vid:12; /* VLAN ID of first VLAN tag the outer header of the incoming packet. ;Valid only when outer_second_cvlan_tag ==1 or outer_sec;ond_svlan_tag ==1 */ + uint32_t outer_second_cfi:1; /* CFI bit of first VLAN tag in the outer header of the incoming packet. ;Valid only when outer_second_cvlan_tag ==1 or outer_sec;ond_svlan_tag ==1 */ + uint32_t outer_second_prio:3; /* Priority of second VLAN tag in the outer header of the incoming ;packet. 
Valid only when outer_second_cvlan_tag ==1 or outer_sec;ond_svlan_tag ==1 */ + uint32_t gre_protocol:16; /* GRE Protocol (outer) */ + uint32_t inner_second_svlan_tag:1; /* The second vlan in the inner header of the packet is s-vlan (0x8a88). ;inner_second_cvlan_tag and inner_second_svlan_tag cannot be set ;together */ + uint32_t outer_second_svlan_tag:1; /* The second vlan in the outer header of the packet is s-vlan (0x8a88). ;outer_second_cvlan_tag and outer_second_svlan_tag cannot be set ;together */ + uint32_t inner_second_cvlan_tag:1; /* The second vlan in the inner header of the packet is c-vlan (0x8100). ;inner_second_cvlan_tag and inner_second_svlan_tag cannot be set ;together */ + uint32_t outer_second_cvlan_tag:1; /* The second vlan in the outer header of the packet is c-vlan (0x8100). ;outer_second_cvlan_tag and outer_second_svlan_tag cannot be set ;together */ + uint32_t gre_key_l:8; /* GRE Key [7:0] (outer) */ + uint32_t gre_key_h:24; /* GRE Key[31:8] (outer) */ + uint32_t vxlan_vni:24; /* VXLAN VNI (outer) */ + uint32_t geneve_oam:1; /* GENEVE OAM field (outer) */ + uint32_t geneve_vni:24; /* GENEVE VNI field (outer) */ + uint32_t outer_ipv6_flow_label:20; /* Flow label of incoming IPv6 packet (outer) */ + uint32_t inner_ipv6_flow_label:20; /* Flow label of incoming IPv6 packet (inner) */ + uint32_t geneve_protocol_type:16; /* GENEVE protocol type (outer) */ + uint32_t geneve_opt_len:6; /* GENEVE OptLen (outer) */ + uint32_t bth_dst_qp:24; /* Destination QP in BTH header */ +}; + +struct dr_match_misc2 { + uint32_t outer_first_mpls_ttl:8; /* First MPLS TTL (outer) */ + uint32_t outer_first_mpls_s_bos:1; /* First MPLS S_BOS (outer) */ + uint32_t outer_first_mpls_exp:3; /* First MPLS EXP (outer) */ + uint32_t outer_first_mpls_label:20; /* First MPLS LABEL (outer) */ + uint32_t inner_first_mpls_ttl:8; /* First MPLS TTL (inner) */ + uint32_t inner_first_mpls_s_bos:1; /* First MPLS S_BOS (inner) */ + uint32_t inner_first_mpls_exp:3; /* First MPLS EXP (inner) */ + uint32_t inner_first_mpls_label:20; /* First MPLS LABEL (inner) */ + uint32_t outer_first_mpls_over_gre_ttl:8; /* last MPLS TTL (outer) */ + uint32_t outer_first_mpls_over_gre_s_bos:1; /* last MPLS S_BOS (outer) */ + uint32_t outer_first_mpls_over_gre_exp:3; /* last MPLS EXP (outer) */ + uint32_t outer_first_mpls_over_gre_label:20; /* last MPLS LABEL (outer) */ + uint32_t outer_first_mpls_over_udp_ttl:8; /* last MPLS TTL (outer) */ + uint32_t outer_first_mpls_over_udp_s_bos:1; /* last MPLS S_BOS (outer) */ + uint32_t outer_first_mpls_over_udp_exp:3; /* last MPLS EXP (outer) */ + uint32_t outer_first_mpls_over_udp_label:20; /* last MPLS LABEL (outer) */ + uint32_t metadata_reg_c_7; /* metadata_reg_c_7 */ + uint32_t metadata_reg_c_6; /* metadata_reg_c_6 */ + uint32_t metadata_reg_c_5; /* metadata_reg_c_5 */ + uint32_t metadata_reg_c_4; /* metadata_reg_c_4 */ + uint32_t metadata_reg_c_3; /* metadata_reg_c_3 */ + uint32_t metadata_reg_c_2; /* metadata_reg_c_2 */ + uint32_t metadata_reg_c_1; /* metadata_reg_c_1 */ + uint32_t metadata_reg_c_0; /* metadata_reg_c_0 */ + uint32_t metadata_reg_a; /* metadata_reg_a */ + uint32_t metadata_reg_b; /* metadata_reg_b */ +}; + +struct dr_match_misc3 { + uint32_t inner_tcp_seq_num; + uint32_t outer_tcp_seq_num; + uint32_t inner_tcp_ack_num; + uint32_t outer_tcp_ack_num; + uint32_t outer_vxlan_gpe_vni:24; + uint32_t outer_vxlan_gpe_flags:8; + uint32_t outer_vxlan_gpe_next_protocol:8; + uint32_t icmpv4_header_data; + uint32_t icmpv6_header_data; + uint32_t icmpv6_code:8; + uint32_t 
icmpv6_type:8; + uint32_t icmpv4_code:8; + uint32_t icmpv4_type:8; + uint32_t gtpu_teid; + uint32_t gtpu_msg_type:8; + uint32_t gtpu_flags:3; +}; + +struct dr_match_param { + struct dr_match_spec outer; + struct dr_match_misc misc; + struct dr_match_spec inner; + struct dr_match_misc2 misc2; + struct dr_match_misc3 misc3; +}; + +#define DR_MASK_IS_FLEX_PARSER_ICMPV4_SET(_misc3) (_misc3->icmpv4_type || \ + _misc3->icmpv4_code || \ + _misc3->icmpv4_header_data) + +struct dr_esw_caps { + uint64_t drop_icm_address_rx; + uint64_t drop_icm_address_tx; + uint64_t uplink_icm_address_rx; + uint64_t uplink_icm_address_tx; + bool sw_owner; +}; + +struct dr_devx_vport_cap { + uint16_t gvmi; + uint64_t icm_address_rx; + uint64_t icm_address_tx; +}; + +struct dr_devx_caps { + uint16_t gvmi; + uint64_t nic_rx_drop_address; + uint64_t nic_tx_drop_address; + uint64_t nic_tx_allow_address; + uint64_t esw_rx_drop_address; + uint64_t esw_tx_drop_address; + uint32_t log_icm_size; + uint64_t hdr_modify_icm_addr; + uint32_t flex_protocols; + uint8_t flex_parser_id_icmp_dw0; + uint8_t flex_parser_id_icmp_dw1; + uint8_t flex_parser_id_icmpv6_dw0; + uint8_t flex_parser_id_icmpv6_dw1; + uint8_t max_ft_level; + bool eswitch_manager; + bool rx_sw_owner; + bool tx_sw_owner; + bool fdb_sw_owner; + uint32_t num_vports; + struct dr_devx_vport_cap *vports_caps; +}; + +struct dr_domain_rx_tx { + uint64_t drop_icm_addr; + uint64_t default_icm_addr; + enum dr_ste_entry_type ste_type; +}; + +struct dr_domain_info { + bool supp_sw_steering; + uint32_t max_inline_size; + uint32_t max_send_wr; + uint32_t max_log_sw_icm_sz; + uint32_t max_log_action_icm_sz; + struct dr_domain_rx_tx rx; + struct dr_domain_rx_tx tx; + struct ibv_device_attr attr; + struct dr_devx_caps caps; +}; + +struct mlx5dv_dr_domain { + struct ibv_context *ctx; + struct ibv_pd *pd; + struct mlx5dv_devx_uar *uar; + enum mlx5dv_dr_domain_type type; + atomic_int refcount; + pthread_mutex_t mutex; + struct dr_icm_pool *ste_icm_pool; + struct dr_icm_pool *action_icm_pool; + struct dr_send_ring *send_ring; + struct dr_domain_info info; + struct list_head tbl_list; +}; + +struct dr_table_rx_tx { + struct dr_ste_htbl *s_anchor; + struct dr_domain_rx_tx *nic_dmn; +}; + +struct mlx5dv_dr_table { + struct mlx5dv_dr_domain *dmn; + struct dr_table_rx_tx rx; + struct dr_table_rx_tx tx; + uint32_t level; + uint32_t table_type; + struct list_head matcher_list; + struct mlx5dv_devx_obj *devx_obj; + atomic_int refcount; + struct list_node tbl_list; +}; + +struct dr_matcher_rx_tx { + struct dr_ste_htbl *s_htbl; + struct dr_ste_htbl *e_anchor; + struct dr_ste_build ste_builder[DR_RULE_MAX_STES]; + uint8_t num_of_builders; + uint64_t default_icm_addr; + struct dr_table_rx_tx *nic_tbl; +}; + +struct mlx5dv_dr_matcher { + struct mlx5dv_dr_table *tbl; + struct dr_matcher_rx_tx rx; + struct dr_matcher_rx_tx tx; + struct list_node matcher_list; + uint16_t prio; + struct dr_match_param mask; + uint8_t match_criteria; + atomic_int refcount; + struct mlx5dv_flow_matcher *dv_matcher; + struct list_head rule_list; +}; + +struct dr_rule_member { + struct dr_ste *ste; + /* attached to dr_rule via this */ + struct list_node list; + /* attached to dr_ste via this */ + struct list_node use_ste_list; +}; + +struct mlx5dv_dr_action { + enum dr_action_type action_type; + atomic_int refcount; + union { + struct { + struct mlx5dv_dr_domain *dmn; + bool is_root_level; + union { + struct ibv_flow_action *flow_action; /* root*/ + struct { + struct dr_icm_chunk *chunk; + uint8_t *data; + uint32_t 
data_size; + uint16_t num_of_actions; + uint32_t index; + bool allow_rx; + bool allow_tx; + }; + }; + } rewrite; + struct { + struct mlx5dv_dr_domain *dmn; + bool is_root_level; + union { + struct ibv_flow_action *flow_action; /* root*/ + struct { + struct mlx5dv_devx_obj *dvo; + uint32_t reformat_size; + }; + }; + } reformat; + struct { + struct mlx5dv_dr_table *next_ft; + struct mlx5dv_devx_obj *devx_obj; + uint64_t rx_icm_addr; + uint64_t tx_icm_addr; + } meter; + struct mlx5dv_dr_table *dest_tbl; + struct { + struct mlx5dv_devx_obj *devx_obj; + uint32_t offset; + } ctr; + struct { + struct mlx5dv_dr_domain *dmn; + struct dr_devx_vport_cap *caps; + uint32_t num; + } vport; + struct ibv_qp *qp; + struct mlx5dv_devx_obj *devx_obj; + uint32_t flow_tag; + }; +}; + +struct dr_rule_action_member { + struct mlx5dv_dr_action *action; + struct list_node list; +}; + +enum dr_connect_type { + CONNECT_HIT = 1, + CONNECT_MISS = 2, +}; + +struct dr_htbl_connect_info { + enum dr_connect_type type; + union { + struct dr_ste_htbl *hit_next_htbl; + uint64_t miss_icm_addr; + }; +}; + + +struct dr_rule_rx_tx { + struct list_head rule_members_list; + struct dr_matcher_rx_tx *nic_matcher; +}; + +struct mlx5dv_dr_rule { + struct mlx5dv_dr_matcher *matcher; + union { + struct { + struct dr_rule_rx_tx rx; + struct dr_rule_rx_tx tx; + }; + struct ibv_flow *flow; + }; + struct list_head rule_actions_list; + struct list_node rule_list; +}; + +void dr_rule_update_rule_member(struct dr_ste *new_ste, struct dr_ste *ste); + +struct dr_icm_chunk { + struct dr_icm_bucket *bucket; + struct list_node chunk_list; + uint32_t rkey; + uint32_t num_of_entries; + uint32_t byte_size; + uint64_t icm_addr; + uint64_t mr_addr; + + /* Memory optimisation */ + struct dr_ste *ste_arr; + uint8_t *hw_ste_arr; + struct list_head *miss_list; +}; + +static inline int dr_matcher_supp_flex_parser_icmp_v4(struct dr_devx_caps *caps) +{ + return caps->flex_protocols & MLX5_FLEX_PARSER_ICMP_V4_ENABLED; +} + +static inline int dr_matcher_supp_flex_parser_icmp_v6(struct dr_devx_caps *caps) +{ + return caps->flex_protocols & MLX5_FLEX_PARSER_ICMP_V6_ENABLED; +} + +static inline uint32_t +dr_icm_pool_chunk_size_to_entries(enum dr_icm_chunk_size chunk_size) +{ + return 1 << chunk_size; +} + +static inline int +dr_icm_pool_chunk_size_to_byte(enum dr_icm_chunk_size chunk_size, + enum dr_icm_type icm_type) +{ + int num_of_entries; + int entry_size; + + if (icm_type == DR_ICM_TYPE_STE) + entry_size = DR_STE_SIZE; + else + entry_size = DR_MODIFY_ACTION_SIZE; + + num_of_entries = dr_icm_pool_chunk_size_to_entries(chunk_size); + + return entry_size * num_of_entries; +} + +static inline struct dr_devx_vport_cap +*dr_get_vport_cap(struct dr_devx_caps *caps, uint32_t vport) +{ + if (!caps->vports_caps || + (vport >= caps->num_vports && vport != WIRE_PORT)) { + errno = EINVAL; + return NULL; + } + + return &caps->vports_caps[vport == WIRE_PORT ? 
caps->num_vports : vport]; +} + +/* internal API functions */ +int dr_devx_query_device(struct ibv_context *ctx, struct dr_devx_caps *caps); +int dr_devx_query_esw_vport_context(struct ibv_context *ctx, + bool other_vport, uint16_t vport_number, + uint64_t *icm_address_rx, + uint64_t *icm_address_tx); +int dr_devx_query_gvmi(struct ibv_context *ctx, + bool other_vport, uint16_t vport_number, uint16_t *gvmi); +int dr_devx_query_esw_caps(struct ibv_context *ctx, + struct dr_esw_caps *caps); +int dr_devx_sync_steering(struct ibv_context *ctx); +struct mlx5dv_devx_obj *dr_devx_create_flow_table(struct ibv_context *ctx, + uint32_t table_type, + uint64_t icm_addr_rx, + uint64_t icm_addr_tx, + u8 level); +struct mlx5dv_devx_obj *dr_devx_create_reformat_ctx(struct ibv_context *ctx, + enum reformat_type rt, + size_t reformat_size, + void *reformat_data); +struct mlx5dv_devx_obj +*dr_devx_create_meter(struct ibv_context *ctx, + struct mlx5dv_dr_flow_meter_attr *attr); +int dr_devx_query_meter(struct mlx5dv_devx_obj *obj, uint64_t *rx_icm_addr, + uint64_t *tx_icm_addr); +int dr_devx_modify_meter(struct mlx5dv_devx_obj *obj, + struct mlx5dv_dr_flow_meter_attr *attr, + __be64 modify_bits); +struct mlx5dv_devx_obj *dr_devx_create_cq(struct ibv_context *ctx, + uint32_t page_id, + uint32_t buff_umem_id, + uint32_t db_umem_id, + uint32_t eqn, + int ncqe, + int cqen); + +struct dr_devx_qp_create_attr { + uint32_t page_id; + uint32_t pdn; + uint32_t cqn; + uint32_t pm_state; + uint32_t service_type; + uint32_t buff_umem_id; + uint32_t db_umem_id; + uint32_t sq_wqe_cnt; + uint32_t rq_wqe_cnt; + uint32_t rq_wqe_shift; +}; + +struct mlx5dv_devx_obj *dr_devx_create_qp(struct ibv_context *ctx, + struct dr_devx_qp_create_attr *attr); + +int dr_devx_modify_qp_rst2init(struct ibv_context *ctx, + struct mlx5dv_devx_obj *qp_obj, + uint16_t port); + +struct dr_gid_attr { + union ibv_gid gid; + enum roce_version roce_ver; + uint8_t mac[6]; +}; + +struct dr_devx_qp_rtr_attr { + struct dr_gid_attr dgid_attr; + enum ibv_mtu mtu; + uint32_t qp_num; + uint16_t port_num; + uint8_t min_rnr_timer; + uint8_t sgid_index; +}; + +int dr_devx_modify_qp_init2rtr(struct ibv_context *ctx, + struct mlx5dv_devx_obj *qp_obj, + struct dr_devx_qp_rtr_attr *attr); + +struct dr_devx_qp_rts_attr { + uint8_t timeout; + uint8_t retry_cnt; + uint8_t rnr_retry; +}; + +int dr_devx_modify_qp_rtr2rts(struct ibv_context *ctx, + struct mlx5dv_devx_obj *qp_obj, + struct dr_devx_qp_rts_attr *attr); +int dr_devx_query_gid(struct ibv_context *ctx, uint8_t vhca_port_num, + uint16_t index, struct dr_gid_attr *attr); + +static inline bool dr_is_root_table(struct mlx5dv_dr_table *tbl) +{ + return tbl->level == 0; +} + +struct dr_icm_pool *dr_icm_pool_create(struct mlx5dv_dr_domain *dmn, + enum dr_icm_type icm_type); +void dr_icm_pool_destroy(struct dr_icm_pool *pool); + +struct dr_icm_chunk *dr_icm_alloc_chunk(struct dr_icm_pool *pool, + enum dr_icm_chunk_size chunk_size); +void dr_icm_free_chunk(struct dr_icm_chunk *chunk); +bool dr_ste_is_not_valid_entry(uint8_t *p_hw_ste); +int dr_ste_htbl_init_and_postsend(struct mlx5dv_dr_domain *dmn, + struct dr_domain_rx_tx *nic_dmn, + struct dr_ste_htbl *htbl, + struct dr_htbl_connect_info *connect_info, + bool update_hw_ste); +void dr_ste_set_formated_ste(uint16_t gvmi, + struct dr_domain_rx_tx *nic_dmn, + struct dr_ste_htbl *htbl, + uint8_t *formated_ste, + struct dr_htbl_connect_info *connect_info); +void dr_ste_copy_param(uint8_t match_criteria, + struct dr_match_param *set_param, + struct 
mlx5dv_flow_match_parameters *mask);
+
+void dr_crc32_init_table(void);
+uint32_t dr_crc32_slice8_calc(const void *input_data, size_t length);
+
+struct dr_wq {
+ unsigned *wqe_head;
+ unsigned wqe_cnt;
+ unsigned max_post;
+ unsigned head;
+ unsigned tail;
+ unsigned cur_post;
+ int max_gs;
+ int wqe_shift;
+ int offset;
+ void *qend;
+};
+
+struct dr_qp {
+ struct mlx5_buf buf;
+ struct dr_wq sq;
+ struct dr_wq rq;
+ int sq_size;
+ void *sq_start;
+ int max_inline_data;
+ __be32 *db;
+ struct mlx5dv_devx_obj *obj;
+ struct mlx5dv_devx_uar *uar;
+ struct mlx5dv_devx_umem *buf_umem;
+ struct mlx5dv_devx_umem *db_umem;
+};
+
+struct dr_cq {
+ uint8_t *buf;
+ uint32_t cons_index;
+ int ncqe;
+ struct dr_qp *qp; /* Assume CQ per QP */
+ __be32 *db;
+ struct ibv_cq *ibv_cq;
+ uint32_t cqn;
+ uint32_t cqe_sz;
+};
+
+#define MAX_SEND_CQE 64
+#define MIN_READ_SYNC 64
+
+struct dr_send_ring {
+ struct dr_cq cq;
+ struct dr_qp *qp;
+ struct ibv_mr *mr;
+ /* How many WQEs are awaiting completion */
+ uint32_t pending_wqe;
+ /* Request a signaled completion once per this threshold of WQEs */
+ uint16_t signal_th;
+ /* Each post_send size must not exceed max_post_send_size */
+ uint32_t max_post_send_size;
+ /* manage the send queue */
+ uint32_t tx_head;
+ void *buf;
+ uint32_t buf_size;
+ struct ibv_wc wc[MAX_SEND_CQE];
+ uint8_t sync_buff[MIN_READ_SYNC];
+ struct ibv_mr *sync_mr;
+};
+
+int dr_send_ring_alloc(struct mlx5dv_dr_domain *dmn);
+void dr_send_ring_free(struct dr_send_ring *send_ring);
+int dr_send_ring_force_drain(struct mlx5dv_dr_domain *dmn);
+int dr_send_postsend_ste(struct mlx5dv_dr_domain *dmn, struct dr_ste *ste,
+ uint8_t *data, uint16_t size, uint16_t offset);
+int dr_send_postsend_htbl(struct mlx5dv_dr_domain *dmn, struct dr_ste_htbl *htbl,
+ uint8_t *formated_ste, uint8_t *mask);
+int dr_send_postsend_formated_htbl(struct mlx5dv_dr_domain *dmn,
+ struct dr_ste_htbl *htbl,
+ uint8_t *ste_init_data,
+ bool update_hw_ste);
+int dr_send_postsend_action(struct mlx5dv_dr_domain *dmn,
+ struct mlx5dv_dr_action *action);
+#endif
diff --git a/providers/mlx5/qp.c b/providers/mlx5/qp.c
new file mode 100644
index 0000000..1e65d8b
--- /dev/null
+++ b/providers/mlx5/qp.c
@@ -0,0 +1,2940 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <config.h>
+
+#include <stdlib.h>
+#include <pthread.h>
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+#include <util/mmio.h>
+#include <util/compiler.h>
+
+#include "mlx5.h"
+#include "wqe.h"
+
+#define MLX5_ATOMIC_SIZE 8
+
+static const uint32_t mlx5_ib_opcode[] = {
+ [IBV_WR_SEND] = MLX5_OPCODE_SEND,
+ [IBV_WR_SEND_WITH_INV] = MLX5_OPCODE_SEND_INVAL,
+ [IBV_WR_SEND_WITH_IMM] = MLX5_OPCODE_SEND_IMM,
+ [IBV_WR_RDMA_WRITE] = MLX5_OPCODE_RDMA_WRITE,
+ [IBV_WR_RDMA_WRITE_WITH_IMM] = MLX5_OPCODE_RDMA_WRITE_IMM,
+ [IBV_WR_RDMA_READ] = MLX5_OPCODE_RDMA_READ,
+ [IBV_WR_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_CS,
+ [IBV_WR_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_FA,
+ [IBV_WR_BIND_MW] = MLX5_OPCODE_UMR,
+ [IBV_WR_LOCAL_INV] = MLX5_OPCODE_UMR,
+ [IBV_WR_TSO] = MLX5_OPCODE_TSO,
+ [IBV_WR_DRIVER1] = MLX5_OPCODE_UMR,
+};
+
+static void *get_recv_wqe(struct mlx5_qp *qp, int n)
+{
+ return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift);
+}
+
+static void *get_wq_recv_wqe(struct mlx5_rwq *rwq, int n)
+{
+ return rwq->pbuff + (n << rwq->rq.wqe_shift);
+}
+
+static int copy_to_scat(struct mlx5_wqe_data_seg *scat, void *buf, int *size,
+ int max, struct mlx5_context *ctx)
+{
+ int copy;
+ int i;
+
+ if (unlikely(!(*size)))
+ return IBV_WC_SUCCESS;
+
+ for (i = 0; i < max; ++i) {
+ copy = min_t(long, *size, be32toh(scat->byte_count));
+
+ /* When the NULL MR is used we can't copy to the target
+ * address, which is expected to be NULL.
+ */
+ if (likely(scat->lkey != ctx->dump_fill_mkey_be))
+ memcpy((void *)(unsigned long)be64toh(scat->addr),
+ buf, copy);
+
+ *size -= copy;
+ if (*size == 0)
+ return IBV_WC_SUCCESS;
+
+ buf += copy;
+ ++scat;
+ }
+ return IBV_WC_LOC_LEN_ERR;
+}
+
+int mlx5_copy_to_recv_wqe(struct mlx5_qp *qp, int idx, void *buf, int size)
+{
+ struct mlx5_context *ctx = to_mctx(qp->ibv_qp->pd->context);
+
+ struct mlx5_wqe_data_seg *scat;
+ int max = 1 << (qp->rq.wqe_shift - 4);
+
+ scat = get_recv_wqe(qp, idx);
+ if (unlikely(qp->wq_sig))
+ ++scat;
+
+ return copy_to_scat(scat, buf, &size, max, ctx);
+}
+
+int mlx5_copy_to_send_wqe(struct mlx5_qp *qp, int idx, void *buf, int size)
+{
+ struct mlx5_context *ctx = to_mctx(qp->ibv_qp->pd->context);
+ struct mlx5_wqe_ctrl_seg *ctrl;
+ struct mlx5_wqe_data_seg *scat;
+ void *p;
+ int max;
+
+ idx &= (qp->sq.wqe_cnt - 1);
+ ctrl = mlx5_get_send_wqe(qp, idx);
+ if (qp->ibv_qp->qp_type != IBV_QPT_RC) {
+ fprintf(stderr, "scatter to CQE is supported only for RC QPs\n");
+ return IBV_WC_GENERAL_ERR;
+ }
+ p = ctrl + 1;
+
+ switch (be32toh(ctrl->opmod_idx_opcode) & 0xff) {
+ case MLX5_OPCODE_RDMA_READ:
+ p = p + sizeof(struct mlx5_wqe_raddr_seg);
+ break;
+
+ case MLX5_OPCODE_ATOMIC_CS:
+ case MLX5_OPCODE_ATOMIC_FA:
+ p = p + sizeof(struct mlx5_wqe_raddr_seg) +
+ sizeof(struct mlx5_wqe_atomic_seg);
+ break;
+
+ default:
+ fprintf(stderr, "scatter to CQE is unsupported for opcode %d\n",
+ be32toh(ctrl->opmod_idx_opcode) & 0xff);
+ return IBV_WC_REM_INV_REQ_ERR;
+ }
+
+ scat = p;
+ max = (be32toh(ctrl->qpn_ds) & 0x3F) - (((void *)scat - (void *)ctrl) >> 4);
+ if (unlikely((void *)(scat + max) > qp->sq.qend)) {
+ int tmp = ((void *)qp->sq.qend - (void *)scat) >> 4;
+ int orig_size = size;
+
+ if (copy_to_scat(scat, buf, &size, tmp, ctx) == IBV_WC_SUCCESS)
+ return IBV_WC_SUCCESS;
+
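+ /* The scatter list wraps past the end of the SQ buffer: the
+ * copy_to_scat() call above consumed the entries up to sq.qend,
+ * so continue with the remaining entries from WQE index 0.
+ */
+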
max = max - tmp; + buf += orig_size - size; + scat = mlx5_get_send_wqe(qp, 0); + } + + return copy_to_scat(scat, buf, &size, max, ctx); +} + +void *mlx5_get_send_wqe(struct mlx5_qp *qp, int n) +{ + return qp->sq_start + (n << MLX5_SEND_WQE_SHIFT); +} + +void mlx5_init_rwq_indices(struct mlx5_rwq *rwq) +{ + rwq->rq.head = 0; + rwq->rq.tail = 0; +} + +void mlx5_init_qp_indices(struct mlx5_qp *qp) +{ + qp->sq.head = 0; + qp->sq.tail = 0; + qp->rq.head = 0; + qp->rq.tail = 0; + qp->sq.cur_post = 0; +} + +static int mlx5_wq_overflow(struct mlx5_wq *wq, int nreq, struct mlx5_cq *cq) +{ + unsigned cur; + + cur = wq->head - wq->tail; + if (cur + nreq < wq->max_post) + return 0; + + mlx5_spin_lock(&cq->lock); + cur = wq->head - wq->tail; + mlx5_spin_unlock(&cq->lock); + + return cur + nreq >= wq->max_post; +} + +static inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg, + uint64_t remote_addr, uint32_t rkey) +{ + rseg->raddr = htobe64(remote_addr); + rseg->rkey = htobe32(rkey); + rseg->reserved = 0; +} + +static void set_tm_seg(struct mlx5_wqe_tm_seg *tmseg, int op, + struct ibv_ops_wr *wr, int index) +{ + tmseg->flags = 0; + if (wr->flags & IBV_OPS_SIGNALED) + tmseg->flags |= MLX5_SRQ_FLAG_TM_CQE_REQ; + if (wr->flags & IBV_OPS_TM_SYNC) { + tmseg->flags |= MLX5_SRQ_FLAG_TM_SW_CNT; + tmseg->sw_cnt = htobe16(wr->tm.unexpected_cnt); + } + tmseg->opcode = op << 4; + if (op == MLX5_TM_OPCODE_NOP) + return; + tmseg->index = htobe16(index); + if (op == MLX5_TM_OPCODE_REMOVE) + return; + tmseg->append_tag = htobe64(wr->tm.add.tag); + tmseg->append_mask = htobe64(wr->tm.add.mask); +} + +static inline void _set_atomic_seg(struct mlx5_wqe_atomic_seg *aseg, + enum ibv_wr_opcode opcode, + uint64_t swap, + uint64_t compare_add) + ALWAYS_INLINE; +static inline void _set_atomic_seg(struct mlx5_wqe_atomic_seg *aseg, + enum ibv_wr_opcode opcode, + uint64_t swap, + uint64_t compare_add) +{ + if (opcode == IBV_WR_ATOMIC_CMP_AND_SWP) { + aseg->swap_add = htobe64(swap); + aseg->compare = htobe64(compare_add); + } else { + aseg->swap_add = htobe64(compare_add); + } +} + +static void set_atomic_seg(struct mlx5_wqe_atomic_seg *aseg, + enum ibv_wr_opcode opcode, + uint64_t swap, + uint64_t compare_add) +{ + _set_atomic_seg(aseg, opcode, swap, compare_add); +} + +static inline void _set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg, + struct mlx5_wqe_av *av, + uint32_t remote_qpn, + uint32_t remote_qkey) +{ + memcpy(&dseg->av, av, sizeof(dseg->av)); + dseg->av.dqp_dct = htobe32(remote_qpn | MLX5_EXTENDED_UD_AV); + dseg->av.key.qkey.qkey = htobe32(remote_qkey); +} + +static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg, + struct ibv_send_wr *wr) +{ + _set_datagram_seg(dseg, &to_mah(wr->wr.ud.ah)->av, wr->wr.ud.remote_qpn, + wr->wr.ud.remote_qkey); +} + +static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ibv_sge *sg, + int offset) +{ + dseg->byte_count = htobe32(sg->length - offset); + dseg->lkey = htobe32(sg->lkey); + dseg->addr = htobe64(sg->addr + offset); +} + +static void set_data_ptr_seg_atomic(struct mlx5_wqe_data_seg *dseg, + struct ibv_sge *sg) +{ + dseg->byte_count = htobe32(MLX5_ATOMIC_SIZE); + dseg->lkey = htobe32(sg->lkey); + dseg->addr = htobe64(sg->addr); +} + +static void set_data_ptr_seg_end(struct mlx5_wqe_data_seg *dseg) +{ + dseg->byte_count = 0; + dseg->lkey = htobe32(MLX5_INVALID_LKEY); + dseg->addr = 0; +} + +/* + * Avoid using memcpy() to copy to BlueFlame page, since memcpy() + * implementations may use move-string-buffer assembler instructions, + * which do not 
guarantee order of copying. + */ +static void mlx5_bf_copy(uint64_t *dst, const uint64_t *src, unsigned bytecnt, + struct mlx5_qp *qp) +{ + do { + mmio_memcpy_x64(dst, src, 64); + bytecnt -= 64; + dst += 8; + src += 8; + if (unlikely(src == qp->sq.qend)) + src = qp->sq_start; + } while (bytecnt > 0); +} + +static __be32 send_ieth(struct ibv_send_wr *wr) +{ + switch (wr->opcode) { + case IBV_WR_SEND_WITH_IMM: + case IBV_WR_RDMA_WRITE_WITH_IMM: + return wr->imm_data; + case IBV_WR_SEND_WITH_INV: + return htobe32(wr->invalidate_rkey); + default: + return 0; + } +} + +static int set_data_inl_seg(struct mlx5_qp *qp, struct ibv_send_wr *wr, + void *wqe, int *sz, + struct mlx5_sg_copy_ptr *sg_copy_ptr) +{ + struct mlx5_wqe_inline_seg *seg; + void *addr; + int len; + int i; + int inl = 0; + void *qend = qp->sq.qend; + int copy; + int offset = sg_copy_ptr->offset; + + seg = wqe; + wqe += sizeof *seg; + for (i = sg_copy_ptr->index; i < wr->num_sge; ++i) { + addr = (void *) (unsigned long)(wr->sg_list[i].addr + offset); + len = wr->sg_list[i].length - offset; + inl += len; + offset = 0; + + if (unlikely(inl > qp->max_inline_data)) + return ENOMEM; + + if (unlikely(wqe + len > qend)) { + copy = qend - wqe; + memcpy(wqe, addr, copy); + addr += copy; + len -= copy; + wqe = mlx5_get_send_wqe(qp, 0); + } + memcpy(wqe, addr, len); + wqe += len; + } + + if (likely(inl)) { + seg->byte_count = htobe32(inl | MLX5_INLINE_SEG); + *sz = align(inl + sizeof seg->byte_count, 16) / 16; + } else + *sz = 0; + + return 0; +} + +static uint8_t wq_sig(struct mlx5_wqe_ctrl_seg *ctrl) +{ + return calc_sig(ctrl, be32toh(ctrl->qpn_ds)); +} + +#ifdef MLX5_DEBUG +static void dump_wqe(FILE *fp, int idx, int size_16, struct mlx5_qp *qp) +{ + uint32_t *uninitialized_var(p); + int i, j; + int tidx = idx; + + fprintf(fp, "dump wqe at %p\n", mlx5_get_send_wqe(qp, tidx)); + for (i = 0, j = 0; i < size_16 * 4; i += 4, j += 4) { + if ((i & 0xf) == 0) { + void *buf = mlx5_get_send_wqe(qp, tidx); + tidx = (tidx + 1) & (qp->sq.wqe_cnt - 1); + p = buf; + j = 0; + } + fprintf(fp, "%08x %08x %08x %08x\n", be32toh(p[j]), be32toh(p[j + 1]), + be32toh(p[j + 2]), be32toh(p[j + 3])); + } +} +#endif /* MLX5_DEBUG */ + + +void *mlx5_get_atomic_laddr(struct mlx5_qp *qp, uint16_t idx, int *byte_count) +{ + struct mlx5_wqe_data_seg *dpseg; + void *addr; + + dpseg = mlx5_get_send_wqe(qp, idx) + sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_raddr_seg) + + sizeof(struct mlx5_wqe_atomic_seg); + addr = (void *)(unsigned long)be64toh(dpseg->addr); + + /* + * Currently byte count is always 8 bytes. 
Fix this when + * we support variable size of atomics + */ + *byte_count = 8; + return addr; +} + +static inline int copy_eth_inline_headers(struct ibv_qp *ibqp, + const void *list, + size_t nelem, + struct mlx5_wqe_eth_seg *eseg, + struct mlx5_sg_copy_ptr *sg_copy_ptr, + bool is_sge) + ALWAYS_INLINE; +static inline int copy_eth_inline_headers(struct ibv_qp *ibqp, + const void *list, + size_t nelem, + struct mlx5_wqe_eth_seg *eseg, + struct mlx5_sg_copy_ptr *sg_copy_ptr, + bool is_sge) +{ + uint32_t inl_hdr_size = to_mctx(ibqp->context)->eth_min_inline_size; + size_t inl_hdr_copy_size = 0; + int j = 0; + FILE *fp = to_mctx(ibqp->context)->dbg_fp; + size_t length; + void *addr; + + if (unlikely(nelem < 1)) { + mlx5_dbg(fp, MLX5_DBG_QP_SEND, + "illegal num_sge: %zu, minimum is 1\n", nelem); + return EINVAL; + } + + if (is_sge) { + addr = (void *)(uintptr_t)((struct ibv_sge *)list)[0].addr; + length = (size_t)((struct ibv_sge *)list)[0].length; + } else { + addr = ((struct ibv_data_buf *)list)[0].addr; + length = ((struct ibv_data_buf *)list)[0].length; + } + + if (likely(length >= MLX5_ETH_L2_INLINE_HEADER_SIZE)) { + inl_hdr_copy_size = inl_hdr_size; + memcpy(eseg->inline_hdr_start, addr, inl_hdr_copy_size); + } else { + uint32_t inl_hdr_size_left = inl_hdr_size; + + for (j = 0; j < nelem && inl_hdr_size_left > 0; ++j) { + if (is_sge) { + addr = (void *)(uintptr_t)((struct ibv_sge *)list)[j].addr; + length = (size_t)((struct ibv_sge *)list)[j].length; + } else { + addr = ((struct ibv_data_buf *)list)[j].addr; + length = ((struct ibv_data_buf *)list)[j].length; + } + + inl_hdr_copy_size = min_t(size_t, length, inl_hdr_size_left); + memcpy(eseg->inline_hdr_start + + (MLX5_ETH_L2_INLINE_HEADER_SIZE - inl_hdr_size_left), + addr, inl_hdr_copy_size); + inl_hdr_size_left -= inl_hdr_copy_size; + } + if (unlikely(inl_hdr_size_left)) { + mlx5_dbg(fp, MLX5_DBG_QP_SEND, "Ethernet headers < 16 bytes\n"); + return EINVAL; + } + if (j) + --j; + } + + eseg->inline_hdr_sz = htobe16(inl_hdr_size); + + /* If we copied all the sge into the inline-headers, then we need to + * start copying from the next sge into the data-segment. + */ + if (unlikely(length == inl_hdr_copy_size)) { + ++j; + inl_hdr_copy_size = 0; + } + + sg_copy_ptr->index = j; + sg_copy_ptr->offset = inl_hdr_copy_size; + + return 0; +} + +#define ALIGN(x, log_a) ((((x) + (1 << (log_a)) - 1)) & ~((1 << (log_a)) - 1)) + +static inline __be16 get_klm_octo(int nentries) +{ + return htobe16(ALIGN(nentries, 3) / 2); +} + +static void set_umr_data_seg(struct mlx5_qp *qp, enum ibv_mw_type type, + int32_t rkey, + const struct ibv_mw_bind_info *bind_info, + uint32_t qpn, void **seg, int *size) +{ + union { + struct mlx5_wqe_umr_klm_seg klm; + uint8_t reserved[64]; + } *data = *seg; + + data->klm.byte_count = htobe32(bind_info->length); + data->klm.mkey = htobe32(bind_info->mr->lkey); + data->klm.address = htobe64(bind_info->addr); + + memset(&data->klm + 1, 0, sizeof(data->reserved) - + sizeof(data->klm)); + + *seg += sizeof(*data); + *size += (sizeof(*data) / 16); +} + +static void set_umr_mkey_seg(struct mlx5_qp *qp, enum ibv_mw_type type, + int32_t rkey, + const struct ibv_mw_bind_info *bind_info, + uint32_t qpn, void **seg, int *size) +{ + struct mlx5_wqe_mkey_context_seg *mkey = *seg; + + mkey->qpn_mkey = htobe32((rkey & 0xFF) | + ((type == IBV_MW_TYPE_1 || !bind_info->length) ? 
+ 0xFFFFFF00 : qpn << 8));
+ if (bind_info->length) {
+ /* Local read is set in kernel */
+ mkey->access_flags = 0;
+ mkey->free = 0;
+ if (bind_info->mw_access_flags & IBV_ACCESS_LOCAL_WRITE)
+ mkey->access_flags |=
+ MLX5_WQE_MKEY_CONTEXT_ACCESS_FLAGS_LOCAL_WRITE;
+ if (bind_info->mw_access_flags & IBV_ACCESS_REMOTE_WRITE)
+ mkey->access_flags |=
+ MLX5_WQE_MKEY_CONTEXT_ACCESS_FLAGS_REMOTE_WRITE;
+ if (bind_info->mw_access_flags & IBV_ACCESS_REMOTE_READ)
+ mkey->access_flags |=
+ MLX5_WQE_MKEY_CONTEXT_ACCESS_FLAGS_REMOTE_READ;
+ if (bind_info->mw_access_flags & IBV_ACCESS_REMOTE_ATOMIC)
+ mkey->access_flags |=
+ MLX5_WQE_MKEY_CONTEXT_ACCESS_FLAGS_ATOMIC;
+ if (bind_info->mw_access_flags & IBV_ACCESS_ZERO_BASED)
+ mkey->start_addr = 0;
+ else
+ mkey->start_addr = htobe64(bind_info->addr);
+ mkey->len = htobe64(bind_info->length);
+ } else {
+ mkey->free = MLX5_WQE_MKEY_CONTEXT_FREE;
+ }
+
+ *seg += sizeof(struct mlx5_wqe_mkey_context_seg);
+ *size += (sizeof(struct mlx5_wqe_mkey_context_seg) / 16);
+}
+
+static inline void set_umr_control_seg(struct mlx5_qp *qp, enum ibv_mw_type type,
+ int32_t rkey,
+ const struct ibv_mw_bind_info *bind_info,
+ uint32_t qpn, void **seg, int *size)
+{
+ struct mlx5_wqe_umr_ctrl_seg *ctrl = *seg;
+
+ ctrl->flags = MLX5_WQE_UMR_CTRL_FLAG_TRNSLATION_OFFSET |
+ MLX5_WQE_UMR_CTRL_FLAG_INLINE;
+ ctrl->mkey_mask = htobe64(MLX5_WQE_UMR_CTRL_MKEY_MASK_FREE |
+ MLX5_WQE_UMR_CTRL_MKEY_MASK_MKEY);
+ ctrl->translation_offset = 0;
+ memset(ctrl->rsvd0, 0, sizeof(ctrl->rsvd0));
+ memset(ctrl->rsvd1, 0, sizeof(ctrl->rsvd1));
+
+ if (type == IBV_MW_TYPE_2)
+ ctrl->mkey_mask |= htobe64(MLX5_WQE_UMR_CTRL_MKEY_MASK_QPN);
+
+ if (bind_info->length) {
+ ctrl->klm_octowords = get_klm_octo(1);
+ if (type == IBV_MW_TYPE_2)
+ ctrl->flags |= MLX5_WQE_UMR_CTRL_FLAG_CHECK_FREE;
+ ctrl->mkey_mask |= htobe64(MLX5_WQE_UMR_CTRL_MKEY_MASK_LEN |
+ MLX5_WQE_UMR_CTRL_MKEY_MASK_START_ADDR |
+ MLX5_WQE_UMR_CTRL_MKEY_MASK_ACCESS_LOCAL_WRITE |
+ MLX5_WQE_UMR_CTRL_MKEY_MASK_ACCESS_REMOTE_READ |
+ MLX5_WQE_UMR_CTRL_MKEY_MASK_ACCESS_REMOTE_WRITE |
+ MLX5_WQE_UMR_CTRL_MKEY_MASK_ACCESS_ATOMIC);
+ } else {
+ ctrl->klm_octowords = get_klm_octo(0);
+ if (type == IBV_MW_TYPE_2)
+ ctrl->flags |= MLX5_WQE_UMR_CTRL_FLAG_CHECK_QPN;
+ }
+
+ *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
+ *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
+}
+
+static inline int set_bind_wr(struct mlx5_qp *qp, enum ibv_mw_type type,
+ int32_t rkey,
+ const struct ibv_mw_bind_info *bind_info,
+ uint32_t qpn, void **seg, int *size)
+{
+ void *qend = qp->sq.qend;
+
+#ifdef MW_DEBUG
+ if (bind_info->mw_access_flags &
+ ~(IBV_ACCESS_REMOTE_ATOMIC | IBV_ACCESS_REMOTE_READ |
+ IBV_ACCESS_REMOTE_WRITE))
+ return EINVAL;
+
+ if (bind_info->mr &&
+ (bind_info->mr->addr > (void *)bind_info->addr ||
+ bind_info->mr->addr + bind_info->mr->length <
+ (void *)bind_info->addr + bind_info->length ||
+ !(to_mmr(bind_info->mr)->alloc_flags & IBV_ACCESS_MW_BIND) ||
+ (bind_info->mw_access_flags &
+ (IBV_ACCESS_REMOTE_ATOMIC | IBV_ACCESS_REMOTE_WRITE) &&
+ !(to_mmr(bind_info->mr)->alloc_flags & IBV_ACCESS_LOCAL_WRITE))))
+ return EINVAL;
+
+#endif
+
+ /* reject lengths above 2GB; KLM supports only up to 2GB */
+ if (bind_info->length > 1UL << 31)
+ return EOPNOTSUPP;
+
+ set_umr_control_seg(qp, type, rkey, bind_info, qpn, seg, size);
+ if (unlikely((*seg == qend)))
+ *seg = mlx5_get_send_wqe(qp, 0);
+
+ set_umr_mkey_seg(qp, type, rkey, bind_info, qpn, seg, size);
+ if (!bind_info->length)
+ return 0;
+
+ if (unlikely((*seg == qend)))
+ *seg =
mlx5_get_send_wqe(qp, 0);
+
+ set_umr_data_seg(qp, type, rkey, bind_info, qpn, seg, size);
+ return 0;
+}
+
+/* Copy the TSO header into the eth segment, taking padding and WQE
+ * wrap-around in the WQ buffer into account.
+ */
+static inline int set_tso_eth_seg(void **seg, void *hdr, uint16_t hdr_sz,
+ uint16_t mss,
+ struct mlx5_qp *qp, int *size)
+{
+ struct mlx5_wqe_eth_seg *eseg = *seg;
+ int size_of_inl_hdr_start = sizeof(eseg->inline_hdr_start);
+ uint64_t left, left_len, copy_sz;
+ FILE *fp = to_mctx(qp->ibv_qp->context)->dbg_fp;
+
+ if (unlikely(hdr_sz < MLX5_ETH_L2_MIN_HEADER_SIZE ||
+ hdr_sz > qp->max_tso_header)) {
+ mlx5_dbg(fp, MLX5_DBG_QP_SEND,
+ "TSO header size should be at least %d and at most %d\n",
+ MLX5_ETH_L2_MIN_HEADER_SIZE,
+ qp->max_tso_header);
+ return EINVAL;
+ }
+
+ left = hdr_sz;
+ eseg->mss = htobe16(mss);
+ eseg->inline_hdr_sz = htobe16(hdr_sz);
+
+ /* Check if there is space up to the end of the queue; if so,
+ * copy everything in one shot, otherwise copy up to the end of
+ * the queue, wrap around and then copy the rest
+ */
+ left_len = qp->sq.qend - (void *)eseg->inline_hdr_start;
+ copy_sz = min(left_len, left);
+
+ memcpy(eseg->inline_hdr_start, hdr, copy_sz);
+
+ /* The -1 is because there are already 16 bytes included in
+ * eseg->inline_hdr[16]
+ */
+ *seg += align(copy_sz - size_of_inl_hdr_start, 16) - 16;
+ *size += align(copy_sz - size_of_inl_hdr_start, 16) / 16 - 1;
+
+ /* The last wqe in the queue */
+ if (unlikely(copy_sz < left)) {
+ *seg = mlx5_get_send_wqe(qp, 0);
+ left -= copy_sz;
+ hdr += copy_sz;
+ memcpy(*seg, hdr, left);
+ *seg += align(left, 16);
+ *size += align(left, 16) / 16;
+ }
+
+ return 0;
+}
+
+static inline int mlx5_post_send_underlay(struct mlx5_qp *qp, struct ibv_send_wr *wr,
+ void **pseg, int *total_size,
+ struct mlx5_sg_copy_ptr *sg_copy_ptr)
+{
+ struct mlx5_wqe_eth_seg *eseg;
+ int inl_hdr_copy_size;
+ void *seg = *pseg;
+ int size = 0;
+
+ if (unlikely(wr->opcode == IBV_WR_SEND_WITH_IMM))
+ return EINVAL;
+
+ memset(seg, 0, sizeof(struct mlx5_wqe_eth_pad));
+ size += sizeof(struct mlx5_wqe_eth_pad);
+ seg += sizeof(struct mlx5_wqe_eth_pad);
+ eseg = seg;
+ *((uint64_t *)eseg) = 0;
+ eseg->rsvd2 = 0;
+
+ if (wr->send_flags & IBV_SEND_IP_CSUM) {
+ if (!(qp->qp_cap_cache & MLX5_CSUM_SUPPORT_UNDERLAY_UD))
+ return EINVAL;
+
+ eseg->cs_flags |= MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
+ }
+
+ if (likely(wr->sg_list[0].length >= MLX5_SOURCE_QPN_INLINE_MAX_HEADER_SIZE))
+ /* Copying the minimum required data unless inline mode is set */
+ inl_hdr_copy_size = (wr->send_flags & IBV_SEND_INLINE) ?
+ MLX5_SOURCE_QPN_INLINE_MAX_HEADER_SIZE :
+ MLX5_IPOIB_INLINE_MIN_HEADER_SIZE;
+ else {
+ inl_hdr_copy_size = MLX5_IPOIB_INLINE_MIN_HEADER_SIZE;
+ /* We expect at least 4 bytes as part of the first entry to hold the IPoIB header */
+ if (unlikely(wr->sg_list[0].length < inl_hdr_copy_size))
+ return EINVAL;
+ }
+
+ memcpy(eseg->inline_hdr_start, (void *)(uintptr_t)wr->sg_list[0].addr,
+ inl_hdr_copy_size);
+ eseg->inline_hdr_sz = htobe16(inl_hdr_copy_size);
+ size += sizeof(struct mlx5_wqe_eth_seg);
+ seg += sizeof(struct mlx5_wqe_eth_seg);
+
+ /* If we copied all of the sge into the inline headers, then we need to
+ * start copying from the next sge into the data segment.
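+ * Otherwise the remaining bytes of that first sge are scattered
+ * from the recorded offset.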
+ */
+ if (unlikely(wr->sg_list[0].length == inl_hdr_copy_size))
+ sg_copy_ptr->index++;
+ else
+ sg_copy_ptr->offset = inl_hdr_copy_size;
+
+ *pseg = seg;
+ *total_size += (size / 16);
+ return 0;
+}
+
+static inline void post_send_db(struct mlx5_qp *qp, struct mlx5_bf *bf,
+ int nreq, int inl, int size, void *ctrl)
+{
+ struct mlx5_context *ctx;
+
+ if (unlikely(!nreq))
+ return;
+
+ qp->sq.head += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * updating doorbell record and ringing the doorbell
+ */
+ udma_to_device_barrier();
+ qp->db[MLX5_SND_DBR] = htobe32(qp->sq.cur_post & 0xffff);
+
+ /* Make sure that the doorbell write happens before the memcpy
+ * to WC memory below
+ */
+ ctx = to_mctx(qp->ibv_qp->context);
+ if (bf->need_lock)
+ mmio_wc_spinlock(&bf->lock.lock);
+ else
+ mmio_wc_start();
+
+ if (!ctx->shut_up_bf && nreq == 1 && bf->uuarn &&
+ (inl || ctx->prefer_bf) && size > 1 &&
+ size <= bf->buf_size / 16)
+ mlx5_bf_copy(bf->reg + bf->offset, ctrl,
+ align(size * 16, 64), qp);
+ else
+ mmio_write64_be(bf->reg + bf->offset, *(__be64 *)ctrl);
+
+ /*
+ * use mmio_flush_writes() to ensure write combining buffers are
+ * flushed out of the running CPU. This must be done inside
+ * the spinlock. Otherwise, there is a potential race. In the
+ * race, CPU A writes doorbell 1, which is waiting in the WC
+ * buffer. CPU B writes doorbell 2, and its write is flushed
+ * earlier. Since the mmio_flush_writes is CPU local, this will
+ * result in the HCA seeing doorbell 2, followed by doorbell 1.
+ * Flush before toggling bf_offset to be latency oriented.
+ */
+ mmio_flush_writes();
+ bf->offset ^= bf->buf_size;
+ if (bf->need_lock)
+ mlx5_spin_unlock(&bf->lock);
+}
+
+static inline int _mlx5_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
+ struct ibv_send_wr **bad_wr)
+{
+ struct mlx5_qp *qp = to_mqp(ibqp);
+ void *seg;
+ struct mlx5_wqe_eth_seg *eseg;
+ struct mlx5_wqe_ctrl_seg *ctrl = NULL;
+ struct mlx5_wqe_data_seg *dpseg;
+ struct mlx5_sg_copy_ptr sg_copy_ptr = {.index = 0, .offset = 0};
+ int nreq;
+ int inl = 0;
+ int err = 0;
+ int size = 0;
+ int i;
+ unsigned idx;
+ uint8_t opmod = 0;
+ struct mlx5_bf *bf = qp->bf;
+ void *qend = qp->sq.qend;
+ uint32_t mlx5_opcode;
+ struct mlx5_wqe_xrc_seg *xrc;
+ uint8_t fence;
+ uint8_t next_fence;
+ uint32_t max_tso = 0;
+ FILE *fp = to_mctx(ibqp->context)->dbg_fp; /* Only used when debug printing is enabled */
+
+ mlx5_spin_lock(&qp->sq.lock);
+
+ next_fence = qp->fm_cache;
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (unlikely(wr->opcode < 0 ||
+ wr->opcode >= sizeof mlx5_ib_opcode / sizeof mlx5_ib_opcode[0])) {
+ mlx5_dbg(fp, MLX5_DBG_QP_SEND, "bad opcode %d\n", wr->opcode);
+ err = EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (unlikely(mlx5_wq_overflow(&qp->sq, nreq,
+ to_mcq(qp->ibv_qp->send_cq)))) {
+ mlx5_dbg(fp, MLX5_DBG_QP_SEND, "work queue overflow\n");
+ err = ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (unlikely(wr->num_sge > qp->sq.max_gs)) {
+ mlx5_dbg(fp, MLX5_DBG_QP_SEND, "max gs exceeded %d (max = %d)\n",
+ wr->num_sge, qp->sq.max_gs);
+ err = ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (wr->send_flags & IBV_SEND_FENCE)
+ fence = MLX5_WQE_CTRL_FENCE;
+ else
+ fence = next_fence;
+ next_fence = 0;
+ idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1);
+ ctrl = seg = mlx5_get_send_wqe(qp, idx);
+ *(uint32_t *)(seg + 8) = 0;
+ ctrl->imm = send_ieth(wr);
+ ctrl->fm_ce_se = qp->sq_signal_bits | fence |
+ (wr->send_flags & IBV_SEND_SIGNALED ?
+ MLX5_WQE_CTRL_CQ_UPDATE : 0) | + (wr->send_flags & IBV_SEND_SOLICITED ? + MLX5_WQE_CTRL_SOLICITED : 0); + + seg += sizeof *ctrl; + size = sizeof *ctrl / 16; + + switch (ibqp->qp_type) { + case IBV_QPT_XRC_SEND: + if (unlikely(wr->opcode != IBV_WR_BIND_MW && + wr->opcode != IBV_WR_LOCAL_INV)) { + xrc = seg; + xrc->xrc_srqn = htobe32(wr->qp_type.xrc.remote_srqn); + seg += sizeof(*xrc); + size += sizeof(*xrc) / 16; + } + /* fall through */ + case IBV_QPT_RC: + switch (wr->opcode) { + case IBV_WR_RDMA_READ: + case IBV_WR_RDMA_WRITE: + case IBV_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(seg, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + seg += sizeof(struct mlx5_wqe_raddr_seg); + size += sizeof(struct mlx5_wqe_raddr_seg) / 16; + break; + + case IBV_WR_ATOMIC_CMP_AND_SWP: + case IBV_WR_ATOMIC_FETCH_AND_ADD: + if (unlikely(!qp->atomics_enabled)) { + mlx5_dbg(fp, MLX5_DBG_QP_SEND, "atomic operations are not supported\n"); + err = EOPNOTSUPP; + *bad_wr = wr; + goto out; + } + set_raddr_seg(seg, wr->wr.atomic.remote_addr, + wr->wr.atomic.rkey); + seg += sizeof(struct mlx5_wqe_raddr_seg); + + set_atomic_seg(seg, wr->opcode, + wr->wr.atomic.swap, + wr->wr.atomic.compare_add); + seg += sizeof(struct mlx5_wqe_atomic_seg); + + size += (sizeof(struct mlx5_wqe_raddr_seg) + + sizeof(struct mlx5_wqe_atomic_seg)) / 16; + break; + + case IBV_WR_BIND_MW: + next_fence = MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE; + ctrl->imm = htobe32(wr->bind_mw.mw->rkey); + err = set_bind_wr(qp, wr->bind_mw.mw->type, + wr->bind_mw.rkey, + &wr->bind_mw.bind_info, + ibqp->qp_num, &seg, &size); + if (err) { + *bad_wr = wr; + goto out; + } + + qp->sq.wr_data[idx] = IBV_WC_BIND_MW; + break; + case IBV_WR_LOCAL_INV: { + struct ibv_mw_bind_info bind_info = {}; + + next_fence = MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE; + ctrl->imm = htobe32(wr->invalidate_rkey); + err = set_bind_wr(qp, IBV_MW_TYPE_2, 0, + &bind_info, ibqp->qp_num, + &seg, &size); + if (err) { + *bad_wr = wr; + goto out; + } + + qp->sq.wr_data[idx] = IBV_WC_LOCAL_INV; + break; + } + + default: + break; + } + break; + + case IBV_QPT_UC: + switch (wr->opcode) { + case IBV_WR_RDMA_WRITE: + case IBV_WR_RDMA_WRITE_WITH_IMM: + set_raddr_seg(seg, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + seg += sizeof(struct mlx5_wqe_raddr_seg); + size += sizeof(struct mlx5_wqe_raddr_seg) / 16; + break; + case IBV_WR_BIND_MW: + next_fence = MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE; + ctrl->imm = htobe32(wr->bind_mw.mw->rkey); + err = set_bind_wr(qp, wr->bind_mw.mw->type, + wr->bind_mw.rkey, + &wr->bind_mw.bind_info, + ibqp->qp_num, &seg, &size); + if (err) { + *bad_wr = wr; + goto out; + } + + qp->sq.wr_data[idx] = IBV_WC_BIND_MW; + break; + case IBV_WR_LOCAL_INV: { + struct ibv_mw_bind_info bind_info = {}; + + next_fence = MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE; + ctrl->imm = htobe32(wr->invalidate_rkey); + err = set_bind_wr(qp, IBV_MW_TYPE_2, 0, + &bind_info, ibqp->qp_num, + &seg, &size); + if (err) { + *bad_wr = wr; + goto out; + } + + qp->sq.wr_data[idx] = IBV_WC_LOCAL_INV; + break; + } + + default: + break; + } + break; + + case IBV_QPT_UD: + set_datagram_seg(seg, wr); + seg += sizeof(struct mlx5_wqe_datagram_seg); + size += sizeof(struct mlx5_wqe_datagram_seg) / 16; + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + + if (unlikely(qp->flags & MLX5_QP_FLAGS_USE_UNDERLAY)) { + err = mlx5_post_send_underlay(qp, wr, &seg, &size, &sg_copy_ptr); + if (unlikely(err)) { + *bad_wr = wr; + goto out; + } + } + break; + + case IBV_QPT_RAW_PACKET: + memset(seg, 0, sizeof(struct 
mlx5_wqe_eth_seg)); + eseg = seg; + + if (wr->send_flags & IBV_SEND_IP_CSUM) { + if (!(qp->qp_cap_cache & MLX5_CSUM_SUPPORT_RAW_OVER_ETH)) { + err = EINVAL; + *bad_wr = wr; + goto out; + } + + eseg->cs_flags |= MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM; + } + + if (wr->opcode == IBV_WR_TSO) { + max_tso = qp->max_tso; + err = set_tso_eth_seg(&seg, wr->tso.hdr, + wr->tso.hdr_sz, + wr->tso.mss, qp, &size); + if (unlikely(err)) { + *bad_wr = wr; + goto out; + } + + /* For TSO WR we always copy at least MLX5_ETH_L2_MIN_HEADER_SIZE + * bytes of inline header which is included in struct mlx5_wqe_eth_seg. + * If additional bytes are copied, 'seg' and 'size' are adjusted + * inside set_tso_eth_seg(). + */ + + seg += sizeof(struct mlx5_wqe_eth_seg); + size += sizeof(struct mlx5_wqe_eth_seg) / 16; + } else { + uint32_t inl_hdr_size = + to_mctx(ibqp->context)->eth_min_inline_size; + + err = copy_eth_inline_headers(ibqp, wr->sg_list, + wr->num_sge, seg, + &sg_copy_ptr, 1); + if (unlikely(err)) { + *bad_wr = wr; + mlx5_dbg(fp, MLX5_DBG_QP_SEND, + "copy_eth_inline_headers failed, err: %d\n", + err); + goto out; + } + + /* The eth segment size depends on the device's min inline + * header requirement which can be 0 or 18. The basic eth segment + * always includes room for first 2 inline header bytes (even if + * copy size is 0) so the additional seg size is adjusted accordingly. + */ + + seg += (offsetof(struct mlx5_wqe_eth_seg, inline_hdr) + + inl_hdr_size) & ~0xf; + size += (offsetof(struct mlx5_wqe_eth_seg, inline_hdr) + + inl_hdr_size) >> 4; + } + break; + + default: + break; + } + + if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) { + int uninitialized_var(sz); + + err = set_data_inl_seg(qp, wr, seg, &sz, &sg_copy_ptr); + if (unlikely(err)) { + *bad_wr = wr; + mlx5_dbg(fp, MLX5_DBG_QP_SEND, + "inline layout failed, err %d\n", err); + goto out; + } + inl = 1; + size += sz; + } else { + dpseg = seg; + for (i = sg_copy_ptr.index; i < wr->num_sge; ++i) { + if (unlikely(dpseg == qend)) { + seg = mlx5_get_send_wqe(qp, 0); + dpseg = seg; + } + if (likely(wr->sg_list[i].length)) { + if (unlikely(wr->opcode == + IBV_WR_ATOMIC_CMP_AND_SWP || + wr->opcode == + IBV_WR_ATOMIC_FETCH_AND_ADD)) + set_data_ptr_seg_atomic(dpseg, wr->sg_list + i); + else { + if (unlikely(wr->opcode == IBV_WR_TSO)) { + if (max_tso < wr->sg_list[i].length) { + err = EINVAL; + *bad_wr = wr; + goto out; + } + max_tso -= wr->sg_list[i].length; + } + set_data_ptr_seg(dpseg, wr->sg_list + i, + sg_copy_ptr.offset); + } + sg_copy_ptr.offset = 0; + ++dpseg; + size += sizeof(struct mlx5_wqe_data_seg) / 16; + } + } + } + + mlx5_opcode = mlx5_ib_opcode[wr->opcode]; + ctrl->opmod_idx_opcode = htobe32(((qp->sq.cur_post & 0xffff) << 8) | + mlx5_opcode | + (opmod << 24)); + ctrl->qpn_ds = htobe32(size | (ibqp->qp_num << 8)); + + if (unlikely(qp->wq_sig)) + ctrl->signature = wq_sig(ctrl); + + qp->sq.wrid[idx] = wr->wr_id; + qp->sq.wqe_head[idx] = qp->sq.head + nreq; + qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB); + +#ifdef MLX5_DEBUG + if (mlx5_debug_mask & MLX5_DBG_QP_SEND) + dump_wqe(to_mctx(ibqp->context)->dbg_fp, idx, size, qp); +#endif + } + +out: + qp->fm_cache = next_fence; + post_send_db(qp, bf, nreq, inl, size, ctrl); + + mlx5_spin_unlock(&qp->sq.lock); + + return err; +} + +int mlx5_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr) +{ +#ifdef MW_DEBUG + if (wr->opcode == IBV_WR_BIND_MW) { + if (wr->bind_mw.mw->type == IBV_MW_TYPE_1) + return EINVAL; + + if (!wr->bind_mw.bind_info.mr || 
+ !wr->bind_mw.bind_info.addr || + !wr->bind_mw.bind_info.length) + return EINVAL; + + if (wr->bind_mw.bind_info.mr->pd != wr->bind_mw.mw->pd) + return EINVAL; + } +#endif + + return _mlx5_post_send(ibqp, wr, bad_wr); +} + +enum { + WQE_REQ_SETTERS_UD_XRC_DC = 2, +}; + +static void mlx5_send_wr_start(struct ibv_qp_ex *ibqp) +{ + struct mlx5_qp *mqp = to_mqp((struct ibv_qp *)ibqp); + + mlx5_spin_lock(&mqp->sq.lock); + + mqp->cur_post_rb = mqp->sq.cur_post; + mqp->fm_cache_rb = mqp->fm_cache; + mqp->err = 0; + mqp->nreq = 0; + mqp->inl_wqe = 0; +} + +static int mlx5_send_wr_complete(struct ibv_qp_ex *ibqp) +{ + struct mlx5_qp *mqp = to_mqp((struct ibv_qp *)ibqp); + int err = mqp->err; + + if (unlikely(err)) { + /* Rolling back */ + mqp->sq.cur_post = mqp->cur_post_rb; + mqp->fm_cache = mqp->fm_cache_rb; + goto out; + } + + post_send_db(mqp, mqp->bf, mqp->nreq, mqp->inl_wqe, mqp->cur_size, + mqp->cur_ctrl); + +out: + mlx5_spin_unlock(&mqp->sq.lock); + + return err; +} + +static void mlx5_send_wr_abort(struct ibv_qp_ex *ibqp) +{ + struct mlx5_qp *mqp = to_mqp((struct ibv_qp *)ibqp); + + /* Rolling back */ + mqp->sq.cur_post = mqp->cur_post_rb; + mqp->fm_cache = mqp->fm_cache_rb; + + mlx5_spin_unlock(&mqp->sq.lock); +} + +static inline void _common_wqe_init(struct ibv_qp_ex *ibqp, + enum ibv_wr_opcode ib_op) + ALWAYS_INLINE; +static inline void _common_wqe_init(struct ibv_qp_ex *ibqp, + enum ibv_wr_opcode ib_op) +{ + struct mlx5_qp *mqp = to_mqp((struct ibv_qp *)ibqp); + struct mlx5_wqe_ctrl_seg *ctrl; + uint8_t fence; + uint32_t idx; + + if (unlikely(mlx5_wq_overflow(&mqp->sq, mqp->nreq, to_mcq(ibqp->qp_base.send_cq)))) { + FILE *fp = to_mctx(((struct ibv_qp *)ibqp)->context)->dbg_fp; + + mlx5_dbg(fp, MLX5_DBG_QP_SEND, "Work queue overflow\n"); + + if (!mqp->err) + mqp->err = ENOMEM; + + return; + } + + idx = mqp->sq.cur_post & (mqp->sq.wqe_cnt - 1); + mqp->sq.wrid[idx] = ibqp->wr_id; + mqp->sq.wqe_head[idx] = mqp->sq.head + mqp->nreq; + if (ib_op == IBV_WR_BIND_MW) + mqp->sq.wr_data[idx] = IBV_WC_BIND_MW; + else if (ib_op == IBV_WR_LOCAL_INV) + mqp->sq.wr_data[idx] = IBV_WC_LOCAL_INV; + else if (ib_op == IBV_WR_DRIVER1) + mqp->sq.wr_data[idx] = IBV_WC_DRIVER1; + + ctrl = mlx5_get_send_wqe(mqp, idx); + *(uint32_t *)((void *)ctrl + 8) = 0; + + fence = (ibqp->wr_flags & IBV_SEND_FENCE) ? MLX5_WQE_CTRL_FENCE : + mqp->fm_cache; + mqp->fm_cache = 0; + + ctrl->fm_ce_se = + mqp->sq_signal_bits | fence | + (ibqp->wr_flags & IBV_SEND_SIGNALED ? + MLX5_WQE_CTRL_CQ_UPDATE : 0) | + (ibqp->wr_flags & IBV_SEND_SOLICITED ? 
+ MLX5_WQE_CTRL_SOLICITED : 0); + + ctrl->opmod_idx_opcode = htobe32(((mqp->sq.cur_post & 0xffff) << 8) | + mlx5_ib_opcode[ib_op]); + + mqp->cur_ctrl = ctrl; +} + +static inline void _common_wqe_finilize(struct mlx5_qp *mqp) +{ + mqp->cur_ctrl->qpn_ds = htobe32(mqp->cur_size | (mqp->ibv_qp->qp_num << 8)); + + if (unlikely(mqp->wq_sig)) + mqp->cur_ctrl->signature = wq_sig(mqp->cur_ctrl); + +#ifdef MLX5_DEBUG + if (mlx5_debug_mask & MLX5_DBG_QP_SEND) { + int idx = mqp->sq.cur_post & (mqp->sq.wqe_cnt - 1); + FILE *fp = to_mctx(mqp->ibv_qp->context)->dbg_fp; + + dump_wqe(fp, idx, mqp->cur_size, mqp); + } +#endif + + mqp->sq.cur_post += DIV_ROUND_UP(mqp->cur_size, 4); +} + +static inline void _mlx5_send_wr_send(struct ibv_qp_ex *ibqp, + enum ibv_wr_opcode ib_op) + ALWAYS_INLINE; +static inline void _mlx5_send_wr_send(struct ibv_qp_ex *ibqp, + enum ibv_wr_opcode ib_op) +{ + struct mlx5_qp *mqp = to_mqp((struct ibv_qp *)ibqp); + size_t transport_seg_sz = 0; + + _common_wqe_init(ibqp, ib_op); + + if (ibqp->qp_base.qp_type == IBV_QPT_UD || + ibqp->qp_base.qp_type == IBV_QPT_DRIVER) + transport_seg_sz = sizeof(struct mlx5_wqe_datagram_seg); + else if (ibqp->qp_base.qp_type == IBV_QPT_XRC_SEND) + transport_seg_sz = sizeof(struct mlx5_wqe_xrc_seg); + + mqp->cur_data = (void *)mqp->cur_ctrl + sizeof(struct mlx5_wqe_ctrl_seg) + + transport_seg_sz; + /* In UD/DC cur_data may overrun the SQ */ + if (unlikely(mqp->cur_data == mqp->sq.qend)) + mqp->cur_data = mlx5_get_send_wqe(mqp, 0); + + mqp->cur_size = (sizeof(struct mlx5_wqe_ctrl_seg) + transport_seg_sz) / 16; + mqp->nreq++; + + /* Relevant just for WQE construction which requires more than 1 setter */ + mqp->cur_setters_cnt = 0; +} + +static void mlx5_send_wr_send_other(struct ibv_qp_ex *ibqp) +{ + _mlx5_send_wr_send(ibqp, IBV_WR_SEND); +} + +static void mlx5_send_wr_send_eth(struct ibv_qp_ex *ibqp) +{ + uint32_t inl_hdr_size = + to_mctx(((struct ibv_qp *)ibqp)->context)->eth_min_inline_size; + struct mlx5_qp *mqp = to_mqp((struct ibv_qp *)ibqp); + struct mlx5_wqe_eth_seg *eseg; + size_t eseg_sz; + + _common_wqe_init(ibqp, IBV_WR_SEND); + + eseg = (void *)mqp->cur_ctrl + sizeof(struct mlx5_wqe_ctrl_seg); + memset(eseg, 0, sizeof(struct mlx5_wqe_eth_seg)); + if (inl_hdr_size) + mqp->cur_eth = eseg; + + if (ibqp->wr_flags & IBV_SEND_IP_CSUM) { + if (unlikely(!(mqp->qp_cap_cache & + MLX5_CSUM_SUPPORT_RAW_OVER_ETH))) { + if (!mqp->err) + mqp->err = EINVAL; + + return; + } + + eseg->cs_flags |= MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM; + } + + /* The eth segment size depends on the device's min inline + * header requirement which can be 0 or 18. The basic eth segment + * always includes room for first 2 inline header bytes (even if + * copy size is 0) so the additional seg size is adjusted accordingly. 
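+ * (eseg_sz below rounds the copied-header room down to the
+ * 16 byte WQE segment granularity)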
+ */ + eseg_sz = (offsetof(struct mlx5_wqe_eth_seg, inline_hdr) + + inl_hdr_size) & ~0xf; + mqp->cur_data = (void *)eseg + eseg_sz; + mqp->cur_size = (sizeof(struct mlx5_wqe_ctrl_seg) + eseg_sz) >> 4; + mqp->nreq++; +} + +static void mlx5_send_wr_send_imm(struct ibv_qp_ex *ibqp, __be32 imm_data) +{ + struct mlx5_qp *mqp = to_mqp((struct ibv_qp *)ibqp); + + _mlx5_send_wr_send(ibqp, IBV_WR_SEND_WITH_IMM); + + mqp->cur_ctrl->imm = imm_data; +} + +static void mlx5_send_wr_send_inv(struct ibv_qp_ex *ibqp, + uint32_t invalidate_rkey) +{ + struct mlx5_qp *mqp = to_mqp((struct ibv_qp *)ibqp); + + _mlx5_send_wr_send(ibqp, IBV_WR_SEND_WITH_INV); + + mqp->cur_ctrl->imm = htobe32(invalidate_rkey); +} + +static void mlx5_send_wr_send_tso(struct ibv_qp_ex *ibqp, void *hdr, + uint16_t hdr_sz, uint16_t mss) +{ + struct mlx5_qp *mqp = to_mqp((struct ibv_qp *)ibqp); + struct mlx5_wqe_eth_seg *eseg; + int size = 0; + int err; + + _common_wqe_init(ibqp, IBV_WR_TSO); + + eseg = (void *)mqp->cur_ctrl + sizeof(struct mlx5_wqe_ctrl_seg); + memset(eseg, 0, sizeof(struct mlx5_wqe_eth_seg)); + + if (ibqp->wr_flags & IBV_SEND_IP_CSUM) { + if (unlikely(!(mqp->qp_cap_cache & MLX5_CSUM_SUPPORT_RAW_OVER_ETH))) { + if (!mqp->err) + mqp->err = EINVAL; + + return; + } + + eseg->cs_flags |= MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM; + } + + err = set_tso_eth_seg((void *)&eseg, hdr, hdr_sz, mss, mqp, &size); + if (unlikely(err)) { + if (!mqp->err) + mqp->err = err; + + return; + } + + /* eseg and cur_size was updated with hdr size inside set_tso_eth_seg */ + mqp->cur_data = (void *)eseg + sizeof(struct mlx5_wqe_eth_seg); + mqp->cur_size = size + + ((sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_eth_seg)) >> 4); + + mqp->cur_eth = NULL; + mqp->nreq++; +} + +static inline void _mlx5_send_wr_rdma(struct ibv_qp_ex *ibqp, + uint32_t rkey, + uint64_t remote_addr, + enum ibv_wr_opcode ib_op) + ALWAYS_INLINE; +static inline void _mlx5_send_wr_rdma(struct ibv_qp_ex *ibqp, + uint32_t rkey, + uint64_t remote_addr, + enum ibv_wr_opcode ib_op) +{ + struct mlx5_qp *mqp = to_mqp((struct ibv_qp *)ibqp); + size_t transport_seg_sz = 0; + void *raddr_seg; + + _common_wqe_init(ibqp, ib_op); + + if (ibqp->qp_base.qp_type == IBV_QPT_DRIVER) + transport_seg_sz = sizeof(struct mlx5_wqe_datagram_seg); + else if (ibqp->qp_base.qp_type == IBV_QPT_XRC_SEND) + transport_seg_sz = sizeof(struct mlx5_wqe_xrc_seg); + + raddr_seg = (void *)mqp->cur_ctrl + sizeof(struct mlx5_wqe_ctrl_seg) + + transport_seg_sz; + /* In DC raddr_seg may overrun the SQ */ + if (unlikely(raddr_seg == mqp->sq.qend)) + raddr_seg = mlx5_get_send_wqe(mqp, 0); + + set_raddr_seg(raddr_seg, remote_addr, rkey); + + mqp->cur_data = raddr_seg + sizeof(struct mlx5_wqe_raddr_seg); + mqp->cur_size = (sizeof(struct mlx5_wqe_ctrl_seg) + transport_seg_sz + + sizeof(struct mlx5_wqe_raddr_seg)) / 16; + mqp->nreq++; + + /* Relevant just for WQE construction which requires more than 1 setter */ + mqp->cur_setters_cnt = 0; +} + +static void mlx5_send_wr_rdma_write(struct ibv_qp_ex *ibqp, uint32_t rkey, + uint64_t remote_addr) +{ + _mlx5_send_wr_rdma(ibqp, rkey, remote_addr, IBV_WR_RDMA_WRITE); +} + +static void mlx5_send_wr_rdma_write_imm(struct ibv_qp_ex *ibqp, uint32_t rkey, + uint64_t remote_addr, __be32 imm_data) +{ + struct mlx5_qp *mqp = to_mqp((struct ibv_qp *)ibqp); + + _mlx5_send_wr_rdma(ibqp, rkey, remote_addr, IBV_WR_RDMA_WRITE_WITH_IMM); + + mqp->cur_ctrl->imm = imm_data; +} + +static void mlx5_send_wr_rdma_read(struct ibv_qp_ex *ibqp, uint32_t rkey, + uint64_t remote_addr) 
+{ + _mlx5_send_wr_rdma(ibqp, rkey, remote_addr, IBV_WR_RDMA_READ); +} + +static inline void _mlx5_send_wr_atomic(struct ibv_qp_ex *ibqp, uint32_t rkey, + uint64_t remote_addr, + uint64_t compare_add, + uint64_t swap, enum ibv_wr_opcode ib_op) + ALWAYS_INLINE; +static inline void _mlx5_send_wr_atomic(struct ibv_qp_ex *ibqp, uint32_t rkey, + uint64_t remote_addr, + uint64_t compare_add, + uint64_t swap, enum ibv_wr_opcode ib_op) +{ + struct mlx5_qp *mqp = to_mqp((struct ibv_qp *)ibqp); + size_t transport_seg_sz = 0; + void *raddr_seg; + + _common_wqe_init(ibqp, ib_op); + + if (ibqp->qp_base.qp_type == IBV_QPT_DRIVER) + transport_seg_sz = sizeof(struct mlx5_wqe_datagram_seg); + else if (ibqp->qp_base.qp_type == IBV_QPT_XRC_SEND) + transport_seg_sz = sizeof(struct mlx5_wqe_xrc_seg); + + raddr_seg = (void *)mqp->cur_ctrl + sizeof(struct mlx5_wqe_ctrl_seg) + + transport_seg_sz; + /* In DC raddr_seg may overrun the SQ */ + if (unlikely(raddr_seg == mqp->sq.qend)) + raddr_seg = mlx5_get_send_wqe(mqp, 0); + + set_raddr_seg(raddr_seg, remote_addr, rkey); + + _set_atomic_seg((struct mlx5_wqe_atomic_seg *)(raddr_seg + sizeof(struct mlx5_wqe_raddr_seg)), + ib_op, swap, compare_add); + + mqp->cur_data = raddr_seg + sizeof(struct mlx5_wqe_raddr_seg) + + sizeof(struct mlx5_wqe_atomic_seg); + /* In XRC, cur_data may overrun the SQ */ + if (unlikely(mqp->cur_data == mqp->sq.qend)) + mqp->cur_data = mlx5_get_send_wqe(mqp, 0); + + mqp->cur_size = (sizeof(struct mlx5_wqe_ctrl_seg) + transport_seg_sz + + sizeof(struct mlx5_wqe_raddr_seg) + + sizeof(struct mlx5_wqe_atomic_seg)) / 16; + mqp->nreq++; + + /* Relevant just for WQE construction which requires more than 1 setter */ + mqp->cur_setters_cnt = 0; +} + +static void mlx5_send_wr_atomic_cmp_swp(struct ibv_qp_ex *ibqp, uint32_t rkey, + uint64_t remote_addr, uint64_t compare, + uint64_t swap) +{ + _mlx5_send_wr_atomic(ibqp, rkey, remote_addr, compare, swap, + IBV_WR_ATOMIC_CMP_AND_SWP); +} + +static void mlx5_send_wr_atomic_fetch_add(struct ibv_qp_ex *ibqp, uint32_t rkey, + uint64_t remote_addr, uint64_t add) +{ + _mlx5_send_wr_atomic(ibqp, rkey, remote_addr, add, 0, + IBV_WR_ATOMIC_FETCH_AND_ADD); +} + +static inline void _build_umr_wqe(struct ibv_qp_ex *ibqp, uint32_t orig_rkey, + uint32_t new_rkey, + const struct ibv_mw_bind_info *bind_info, + enum ibv_wr_opcode ib_op) + ALWAYS_INLINE; +static inline void _build_umr_wqe(struct ibv_qp_ex *ibqp, uint32_t orig_rkey, + uint32_t new_rkey, + const struct ibv_mw_bind_info *bind_info, + enum ibv_wr_opcode ib_op) +{ + struct mlx5_qp *mqp = to_mqp((struct ibv_qp *)ibqp); + void *umr_seg; + int err = 0; + int size = sizeof(struct mlx5_wqe_ctrl_seg) / 16; + + _common_wqe_init(ibqp, ib_op); + + mqp->cur_ctrl->imm = htobe32(orig_rkey); + + umr_seg = (void *)mqp->cur_ctrl + sizeof(struct mlx5_wqe_ctrl_seg); + err = set_bind_wr(mqp, IBV_MW_TYPE_2, new_rkey, bind_info, + ((struct ibv_qp *)ibqp)->qp_num, &umr_seg, &size); + if (unlikely(err)) { + if (!mqp->err) + mqp->err = err; + + return; + } + + mqp->cur_size = size; + mqp->fm_cache = MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE; + mqp->nreq++; + _common_wqe_finilize(mqp); +} + +static void mlx5_send_wr_bind_mw(struct ibv_qp_ex *ibqp, struct ibv_mw *mw, + uint32_t rkey, + const struct ibv_mw_bind_info *bind_info) +{ + _build_umr_wqe(ibqp, mw->rkey, rkey, bind_info, IBV_WR_BIND_MW); +} + +static void mlx5_send_wr_local_inv(struct ibv_qp_ex *ibqp, + uint32_t invalidate_rkey) +{ + const struct ibv_mw_bind_info bind_info = {}; + + _build_umr_wqe(ibqp, invalidate_rkey, 0, &bind_info, 
IBV_WR_LOCAL_INV); +} + +static inline void +_mlx5_send_wr_set_sge(struct mlx5_qp *mqp, uint32_t lkey, uint64_t addr, + uint32_t length) +{ + struct mlx5_wqe_data_seg *dseg; + + if (unlikely(!length)) + return; + + dseg = mqp->cur_data; + dseg->byte_count = htobe32(length); + dseg->lkey = htobe32(lkey); + dseg->addr = htobe64(addr); + mqp->cur_size += sizeof(*dseg) / 16; +} + +static void +mlx5_send_wr_set_sge_rc_uc(struct ibv_qp_ex *ibqp, uint32_t lkey, + uint64_t addr, uint32_t length) +{ + struct mlx5_qp *mqp = to_mqp((struct ibv_qp *)ibqp); + + _mlx5_send_wr_set_sge(mqp, lkey, addr, length); + _common_wqe_finilize(mqp); +} + +static void +mlx5_send_wr_set_sge_ud_xrc_dc(struct ibv_qp_ex *ibqp, uint32_t lkey, + uint64_t addr, uint32_t length) +{ + struct mlx5_qp *mqp = to_mqp((struct ibv_qp *)ibqp); + + _mlx5_send_wr_set_sge(mqp, lkey, addr, length); + + if (mqp->cur_setters_cnt == WQE_REQ_SETTERS_UD_XRC_DC - 1) + _common_wqe_finilize(mqp); + else + mqp->cur_setters_cnt++; +} + +static void +mlx5_send_wr_set_sge_eth(struct ibv_qp_ex *ibqp, uint32_t lkey, + uint64_t addr, uint32_t length) +{ + struct mlx5_qp *mqp = to_mqp((struct ibv_qp *)ibqp); + struct mlx5_wqe_eth_seg *eseg = mqp->cur_eth; + int err; + + if (eseg) { /* Inline-headers was set */ + struct mlx5_sg_copy_ptr sg_copy_ptr = {.index = 0, .offset = 0}; + struct ibv_sge sge = {.addr = addr, .length = length}; + + err = copy_eth_inline_headers((struct ibv_qp *)ibqp, &sge, 1, + eseg, &sg_copy_ptr, 1); + if (unlikely(err)) { + if (!mqp->err) + mqp->err = err; + + return; + } + + addr += sg_copy_ptr.offset; + length -= sg_copy_ptr.offset; + } + + _mlx5_send_wr_set_sge(mqp, lkey, addr, length); + + _common_wqe_finilize(mqp); +} + +static inline void +_mlx5_send_wr_set_sge_list(struct mlx5_qp *mqp, size_t num_sge, + const struct ibv_sge *sg_list) +{ + struct mlx5_wqe_data_seg *dseg = mqp->cur_data; + size_t i; + + if (unlikely(num_sge > mqp->sq.max_gs)) { + FILE *fp = to_mctx(mqp->ibv_qp->context)->dbg_fp; + + mlx5_dbg(fp, MLX5_DBG_QP_SEND, "Num SGEs %zu exceeds the maximum (%d)\n", + num_sge, mqp->sq.max_gs); + + if (!mqp->err) + mqp->err = ENOMEM; + + return; + } + + for (i = 0; i < num_sge; i++) { + if (unlikely(dseg == mqp->sq.qend)) + dseg = mlx5_get_send_wqe(mqp, 0); + + if (unlikely(!sg_list[i].length)) + continue; + + dseg->byte_count = htobe32(sg_list[i].length); + dseg->lkey = htobe32(sg_list[i].lkey); + dseg->addr = htobe64(sg_list[i].addr); + dseg++; + mqp->cur_size += (sizeof(*dseg) / 16); + } +} + +static void +mlx5_send_wr_set_sge_list_rc_uc(struct ibv_qp_ex *ibqp, size_t num_sge, + const struct ibv_sge *sg_list) +{ + struct mlx5_qp *mqp = to_mqp((struct ibv_qp *)ibqp); + + _mlx5_send_wr_set_sge_list(mqp, num_sge, sg_list); + _common_wqe_finilize(mqp); +} + +static void +mlx5_send_wr_set_sge_list_ud_xrc_dc(struct ibv_qp_ex *ibqp, size_t num_sge, + const struct ibv_sge *sg_list) +{ + struct mlx5_qp *mqp = to_mqp((struct ibv_qp *)ibqp); + + _mlx5_send_wr_set_sge_list(mqp, num_sge, sg_list); + + if (mqp->cur_setters_cnt == WQE_REQ_SETTERS_UD_XRC_DC - 1) + _common_wqe_finilize(mqp); + else + mqp->cur_setters_cnt++; +} + +static void +mlx5_send_wr_set_sge_list_eth(struct ibv_qp_ex *ibqp, size_t num_sge, + const struct ibv_sge *sg_list) +{ + struct mlx5_sg_copy_ptr sg_copy_ptr = {.index = 0, .offset = 0}; + struct mlx5_qp *mqp = to_mqp((struct ibv_qp *)ibqp); + struct mlx5_wqe_data_seg *dseg = mqp->cur_data; + struct mlx5_wqe_eth_seg *eseg = mqp->cur_eth; + size_t i; + + if (unlikely(num_sge > mqp->sq.max_gs)) { + FILE *fp = 
to_mctx(mqp->ibv_qp->context)->dbg_fp; + + mlx5_dbg(fp, MLX5_DBG_QP_SEND, "Num SGEs %zu exceeds the maximum (%d)\n", + num_sge, mqp->sq.max_gs); + + if (!mqp->err) + mqp->err = ENOMEM; + + return; + } + + if (eseg) { /* Inline-headers was set */ + int err; + + err = copy_eth_inline_headers((struct ibv_qp *)ibqp, sg_list, + num_sge, eseg, &sg_copy_ptr, 1); + if (unlikely(err)) { + if (!mqp->err) + mqp->err = err; + + return; + } + } + + for (i = sg_copy_ptr.index; i < num_sge; i++) { + uint32_t length = sg_list[i].length - sg_copy_ptr.offset; + + if (unlikely(!length)) + continue; + + if (unlikely(dseg == mqp->sq.qend)) + dseg = mlx5_get_send_wqe(mqp, 0); + + dseg->addr = htobe64(sg_list[i].addr + sg_copy_ptr.offset); + dseg->byte_count = htobe32(length); + dseg->lkey = htobe32(sg_list[i].lkey); + dseg++; + mqp->cur_size += (sizeof(*dseg) / 16); + sg_copy_ptr.offset = 0; + } + + _common_wqe_finilize(mqp); +} + +static inline void memcpy_to_wqe(struct mlx5_qp *mqp, void *dest, void *src, + size_t n) +{ + if (unlikely(dest + n > mqp->sq.qend)) { + size_t copy = mqp->sq.qend - dest; + + memcpy(dest, src, copy); + src += copy; + n -= copy; + dest = mlx5_get_send_wqe(mqp, 0); + } + memcpy(dest, src, n); +} + +static inline void memcpy_to_wqe_and_update(struct mlx5_qp *mqp, void **dest, + void *src, size_t n) +{ + if (unlikely(*dest + n > mqp->sq.qend)) { + size_t copy = mqp->sq.qend - *dest; + + memcpy(*dest, src, copy); + src += copy; + n -= copy; + *dest = mlx5_get_send_wqe(mqp, 0); + } + memcpy(*dest, src, n); + + *dest += n; +} + +static inline void +_mlx5_send_wr_set_inline_data(struct mlx5_qp *mqp, void *addr, size_t length) +{ + struct mlx5_wqe_inline_seg *dseg = mqp->cur_data; + + if (unlikely(length > mqp->max_inline_data)) { + FILE *fp = to_mctx(mqp->ibv_qp->context)->dbg_fp; + + mlx5_dbg(fp, MLX5_DBG_QP_SEND, + "Inline data %zu exceeds the maximum (%d)\n", + length, mqp->max_inline_data); + + if (!mqp->err) + mqp->err = ENOMEM; + + return; + } + + mqp->inl_wqe = 1; /* Encourage a BlueFlame usage */ + + if (unlikely(!length)) + return; + + memcpy_to_wqe(mqp, (void *)dseg + sizeof(*dseg), addr, length); + dseg->byte_count = htobe32(length | MLX5_INLINE_SEG); + mqp->cur_size += DIV_ROUND_UP(length + sizeof(*dseg), 16); +} + +static void +mlx5_send_wr_set_inline_data_rc_uc(struct ibv_qp_ex *ibqp, void *addr, + size_t length) +{ + struct mlx5_qp *mqp = to_mqp((struct ibv_qp *)ibqp); + + _mlx5_send_wr_set_inline_data(mqp, addr, length); + _common_wqe_finilize(mqp); +} + +static void +mlx5_send_wr_set_inline_data_ud_xrc_dc(struct ibv_qp_ex *ibqp, void *addr, + size_t length) +{ + struct mlx5_qp *mqp = to_mqp((struct ibv_qp *)ibqp); + + _mlx5_send_wr_set_inline_data(mqp, addr, length); + + if (mqp->cur_setters_cnt == WQE_REQ_SETTERS_UD_XRC_DC - 1) + _common_wqe_finilize(mqp); + else + mqp->cur_setters_cnt++; +} + +static void +mlx5_send_wr_set_inline_data_eth(struct ibv_qp_ex *ibqp, void *addr, + size_t length) +{ + struct mlx5_qp *mqp = to_mqp((struct ibv_qp *)ibqp); + struct mlx5_wqe_eth_seg *eseg = mqp->cur_eth; + + if (eseg) { /* Inline-headers was set */ + struct mlx5_sg_copy_ptr sg_copy_ptr = {.index = 0, .offset = 0}; + struct ibv_data_buf buf = {.addr = addr, .length = length}; + int err; + + err = copy_eth_inline_headers((struct ibv_qp *)ibqp, &buf, 1, + eseg, &sg_copy_ptr, 0); + if (unlikely(err)) { + if (!mqp->err) + mqp->err = err; + + return; + } + + addr += sg_copy_ptr.offset; + length -= sg_copy_ptr.offset; + } + + _mlx5_send_wr_set_inline_data(mqp, addr, length); + 
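+ /* Eth WQEs take a single data setter, so the WQE is complete here. */
+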
_common_wqe_finilize(mqp); +} + +static inline void +_mlx5_send_wr_set_inline_data_list(struct mlx5_qp *mqp, + size_t num_buf, + const struct ibv_data_buf *buf_list) +{ + struct mlx5_wqe_inline_seg *dseg = mqp->cur_data; + void *wqe = (void *)dseg + sizeof(*dseg); + size_t inl_size = 0; + int i; + + for (i = 0; i < num_buf; i++) { + size_t length = buf_list[i].length; + + inl_size += length; + + if (unlikely(inl_size > mqp->max_inline_data)) { + FILE *fp = to_mctx(mqp->ibv_qp->context)->dbg_fp; + + mlx5_dbg(fp, MLX5_DBG_QP_SEND, + "Inline data %zu exceeds the maximum (%d)\n", + inl_size, mqp->max_inline_data); + + if (!mqp->err) + mqp->err = ENOMEM; + + return; + } + + memcpy_to_wqe_and_update(mqp, &wqe, buf_list[i].addr, length); + } + + mqp->inl_wqe = 1; /* Encourage a BlueFlame usage */ + + if (unlikely(!inl_size)) + return; + + dseg->byte_count = htobe32(inl_size | MLX5_INLINE_SEG); + mqp->cur_size += DIV_ROUND_UP(inl_size + sizeof(*dseg), 16); +} + +static void +mlx5_send_wr_set_inline_data_list_rc_uc(struct ibv_qp_ex *ibqp, + size_t num_buf, + const struct ibv_data_buf *buf_list) +{ + struct mlx5_qp *mqp = to_mqp((struct ibv_qp *)ibqp); + + _mlx5_send_wr_set_inline_data_list(mqp, num_buf, buf_list); + _common_wqe_finilize(mqp); +} + +static void +mlx5_send_wr_set_inline_data_list_ud_xrc_dc(struct ibv_qp_ex *ibqp, + size_t num_buf, + const struct ibv_data_buf *buf_list) +{ + struct mlx5_qp *mqp = to_mqp((struct ibv_qp *)ibqp); + + _mlx5_send_wr_set_inline_data_list(mqp, num_buf, buf_list); + + if (mqp->cur_setters_cnt == WQE_REQ_SETTERS_UD_XRC_DC - 1) + _common_wqe_finilize(mqp); + else + mqp->cur_setters_cnt++; +} + +static void +mlx5_send_wr_set_inline_data_list_eth(struct ibv_qp_ex *ibqp, + size_t num_buf, + const struct ibv_data_buf *buf_list) +{ + struct mlx5_sg_copy_ptr sg_copy_ptr = {.index = 0, .offset = 0}; + struct mlx5_qp *mqp = to_mqp((struct ibv_qp *)ibqp); + struct mlx5_wqe_inline_seg *dseg = mqp->cur_data; + struct mlx5_wqe_eth_seg *eseg = mqp->cur_eth; + void *wqe = (void *)dseg + sizeof(*dseg); + size_t inl_size = 0; + size_t i; + + if (eseg) { /* Inline-headers was set */ + int err; + + err = copy_eth_inline_headers((struct ibv_qp *)ibqp, buf_list, + num_buf, eseg, &sg_copy_ptr, 0); + if (unlikely(err)) { + if (!mqp->err) + mqp->err = err; + + return; + } + } + + for (i = sg_copy_ptr.index; i < num_buf; i++) { + size_t length = buf_list[i].length - sg_copy_ptr.offset; + + inl_size += length; + + if (unlikely(inl_size > mqp->max_inline_data)) { + FILE *fp = to_mctx(mqp->ibv_qp->context)->dbg_fp; + + mlx5_dbg(fp, MLX5_DBG_QP_SEND, + "Inline data %zu exceeds the maximum (%d)\n", + inl_size, mqp->max_inline_data); + + if (!mqp->err) + mqp->err = EINVAL; + + return; + } + + memcpy_to_wqe_and_update(mqp, &wqe, + buf_list[i].addr + sg_copy_ptr.offset, + length); + + sg_copy_ptr.offset = 0; + } + + if (likely(inl_size)) { + dseg->byte_count = htobe32(inl_size | MLX5_INLINE_SEG); + mqp->cur_size += DIV_ROUND_UP(inl_size + sizeof(*dseg), 16); + } + + mqp->inl_wqe = 1; /* Encourage a BlueFlame usage */ + _common_wqe_finilize(mqp); +} + +static void +mlx5_send_wr_set_ud_addr(struct ibv_qp_ex *ibqp, struct ibv_ah *ah, + uint32_t remote_qpn, uint32_t remote_qkey) +{ + struct mlx5_qp *mqp = to_mqp((struct ibv_qp *)ibqp); + struct mlx5_wqe_datagram_seg *dseg = + (void *)mqp->cur_ctrl + sizeof(struct mlx5_wqe_ctrl_seg); + struct mlx5_ah *mah = to_mah(ah); + + _set_datagram_seg(dseg, &mah->av, remote_qpn, remote_qkey); + + if (mqp->cur_setters_cnt == WQE_REQ_SETTERS_UD_XRC_DC - 1) + 
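+ /* the UD address setter and the data setter have both run by now */
+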
+		_common_wqe_finilize(mqp);
+	else
+		mqp->cur_setters_cnt++;
+}
+
+static void
+mlx5_send_wr_set_xrc_srqn(struct ibv_qp_ex *ibqp, uint32_t remote_srqn)
+{
+	struct mlx5_qp *mqp = to_mqp((struct ibv_qp *)ibqp);
+	struct mlx5_wqe_xrc_seg *xrc_seg =
+		(void *)mqp->cur_ctrl + sizeof(struct mlx5_wqe_ctrl_seg);
+
+	xrc_seg->xrc_srqn = htobe32(remote_srqn);
+
+	if (mqp->cur_setters_cnt == WQE_REQ_SETTERS_UD_XRC_DC - 1)
+		_common_wqe_finilize(mqp);
+	else
+		mqp->cur_setters_cnt++;
+}
+
+static uint8_t get_umr_mr_flags(uint32_t acc)
+{
+	return ((acc & IBV_ACCESS_REMOTE_ATOMIC ?
+		MLX5_WQE_MKEY_CONTEXT_ACCESS_FLAGS_ATOMIC : 0) |
+		(acc & IBV_ACCESS_REMOTE_WRITE ?
+		MLX5_WQE_MKEY_CONTEXT_ACCESS_FLAGS_REMOTE_WRITE : 0) |
+		(acc & IBV_ACCESS_REMOTE_READ ?
+		MLX5_WQE_MKEY_CONTEXT_ACCESS_FLAGS_REMOTE_READ : 0) |
+		(acc & IBV_ACCESS_LOCAL_WRITE ?
+		MLX5_WQE_MKEY_CONTEXT_ACCESS_FLAGS_LOCAL_WRITE : 0));
+}
+
+static int umr_sg_list_create(struct mlx5_qp *qp,
+			      uint16_t num_sges,
+			      struct ibv_sge *sge,
+			      void *seg,
+			      void *qend, int *size, int *xlat_size,
+			      uint64_t *reglen)
+{
+	struct mlx5_wqe_data_seg *dseg;
+	int byte_count = 0;
+	int i;
+	size_t tmp;
+
+	dseg = seg;
+
+	for (i = 0; i < num_sges; i++, dseg++) {
+		if (unlikely(dseg == qend))
+			dseg = mlx5_get_send_wqe(qp, 0);
+
+		dseg->addr = htobe64(sge[i].addr);
+		dseg->lkey = htobe32(sge[i].lkey);
+		dseg->byte_count = htobe32(sge[i].length);
+		byte_count += sge[i].length;
+	}
+
+	tmp = align(num_sges, 4) - num_sges;
+	memset(dseg, 0, tmp * sizeof(*dseg));
+
+	*size = align(num_sges * sizeof(*dseg), 64);
+	*reglen = byte_count;
+	*xlat_size = num_sges * sizeof(*dseg);
+
+	return 0;
+}
+
+/* The strided block format is as follows:
+ * | repeat_block | entry_block | entry_block |...| entry_block |
+ * The repeat block describes the list of entry blocks that follows it.
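+ *
+ * For example, with repeat_count = R and two entries A and B, the
+ * resulting mkey gathers memory as R passes over (A, B), where each
+ * pass takes entry->byte_count bytes and then advances that entry by
+ * its stride (bytes_count + bytes_skip).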
+ */ +static void umr_strided_seg_create(struct mlx5_qp *qp, + uint32_t repeat_count, + uint16_t num_interleaved, + struct mlx5dv_mr_interleaved *data, + void *seg, + void *qend, int *wqe_size, int *xlat_size, + uint64_t *reglen) +{ + struct mlx5_wqe_umr_repeat_block_seg *rb = seg; + struct mlx5_wqe_umr_repeat_ent_seg *eb; + int byte_count = 0; + int tmp; + int i; + + rb->op = htobe32(0x400); + rb->reserved = 0; + rb->num_ent = htobe16(num_interleaved); + rb->repeat_count = htobe32(repeat_count); + eb = rb->entries; + + /* + * ------------------------------------------------------------ + * | repeat_block | entry_block | entry_block |...| entry_block + * ------------------------------------------------------------ + */ + for (i = 0; i < num_interleaved; i++, eb++) { + if (unlikely(eb == qend)) + eb = mlx5_get_send_wqe(qp, 0); + + byte_count += data[i].bytes_count; + eb->va = htobe64(data[i].addr); + eb->byte_count = htobe16(data[i].bytes_count); + eb->stride = htobe16(data[i].bytes_count + data[i].bytes_skip); + eb->memkey = htobe32(data[i].lkey); + } + + rb->byte_count = htobe32(byte_count); + *reglen = byte_count * repeat_count; + + tmp = align(num_interleaved + 1, 4) - num_interleaved - 1; + memset(eb, 0, tmp * sizeof(*eb)); + + *wqe_size = align(sizeof(*rb) + sizeof(*eb) * num_interleaved, 64); + *xlat_size = (num_interleaved + 1) * sizeof(*eb); +} + +static void mlx5_send_wr_mr(struct mlx5dv_qp_ex *dv_qp, + struct mlx5dv_mkey *dv_mkey, + uint32_t access_flags, + uint32_t repeat_count, + uint16_t num_entries, + struct mlx5dv_mr_interleaved *data, + struct ibv_sge *sge) +{ + struct mlx5_qp *mqp = mqp_from_mlx5dv_qp_ex(dv_qp); + struct ibv_qp_ex *ibqp = &mqp->verbs_qp.qp_ex; + struct mlx5_wqe_umr_ctrl_seg *umr_ctrl_seg; + struct mlx5_wqe_mkey_context_seg *mk; + struct mlx5_mkey *mkey = container_of(dv_mkey, struct mlx5_mkey, + dv_mkey); + int xlat_size; + int size; + uint64_t reglen = 0; + void *qend = mqp->sq.qend; + void *seg; + uint16_t max_entries; + + if (unlikely(!(ibqp->wr_flags & IBV_SEND_INLINE))) { + mqp->err = EOPNOTSUPP; + return; + } + + max_entries = data ? 
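+		/*
+		 * Capacity of the inline UMR WQE: an interleaved layout
+		 * consumes repeat-entry slots (one slot is reserved for
+		 * the repeat block header), a plain SGE list consumes
+		 * data segments.
+		 */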
+ min_t(size_t, + (mqp->max_inline_data + sizeof(struct mlx5_wqe_inl_data_seg)) / + sizeof(struct mlx5_wqe_umr_repeat_ent_seg) - 1, + mkey->num_desc) : + min_t(size_t, + (mqp->max_inline_data + sizeof(struct mlx5_wqe_inl_data_seg)) / + sizeof(struct mlx5_wqe_data_seg), + mkey->num_desc); + + if (unlikely(num_entries > max_entries)) { + mqp->err = ENOMEM; + return; + } + + if (unlikely(!check_comp_mask(access_flags, + IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_ATOMIC))) { + mqp->err = EINVAL; + return; + } + + _common_wqe_init(ibqp, IBV_WR_DRIVER1); + mqp->cur_size = sizeof(struct mlx5_wqe_ctrl_seg) / 16; + mqp->cur_ctrl->imm = htobe32(dv_mkey->lkey); + seg = umr_ctrl_seg = (void *)mqp->cur_ctrl + sizeof(struct mlx5_wqe_ctrl_seg); + + memset(umr_ctrl_seg, 0, sizeof(*umr_ctrl_seg)); + umr_ctrl_seg->flags = MLX5_WQE_UMR_CTRL_FLAG_INLINE; + umr_ctrl_seg->mkey_mask = htobe64(MLX5_WQE_UMR_CTRL_MKEY_MASK_LEN | + MLX5_WQE_UMR_CTRL_MKEY_MASK_ACCESS_LOCAL_WRITE | + MLX5_WQE_UMR_CTRL_MKEY_MASK_ACCESS_REMOTE_READ | + MLX5_WQE_UMR_CTRL_MKEY_MASK_ACCESS_REMOTE_WRITE | + MLX5_WQE_UMR_CTRL_MKEY_MASK_ACCESS_ATOMIC | + MLX5_WQE_UMR_CTRL_MKEY_MASK_FREE); + + seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + mqp->cur_size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + + if (unlikely(seg == qend)) + seg = mlx5_get_send_wqe(mqp, 0); + + mk = seg; + memset(mk, 0, sizeof(*mk)); + mk->access_flags = get_umr_mr_flags(access_flags); + mk->qpn_mkey = htobe32(0xffffff00 | (dv_mkey->lkey & 0xff)); + + seg += sizeof(*mk); + mqp->cur_size += (sizeof(*mk) / 16); + + if (unlikely(seg == qend)) + seg = mlx5_get_send_wqe(mqp, 0); + + if (data) + umr_strided_seg_create(mqp, repeat_count, num_entries, data, + seg, qend, &size, &xlat_size, ®len); + else + umr_sg_list_create(mqp, num_entries, sge, seg, + qend, &size, &xlat_size, ®len); + + mk->len = htobe64(reglen); + umr_ctrl_seg->klm_octowords = htobe16(align(xlat_size, 64) / 16); + mqp->cur_size += size / 16; + + mqp->fm_cache = MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE; + mqp->nreq++; + mqp->inl_wqe = 1; + + _common_wqe_finilize(mqp); +} + +static void mlx5_send_wr_mr_interleaved(struct mlx5dv_qp_ex *dv_qp, + struct mlx5dv_mkey *mkey, + uint32_t access_flags, + uint32_t repeat_count, + uint16_t num_interleaved, + struct mlx5dv_mr_interleaved *data) +{ + mlx5_send_wr_mr(dv_qp, mkey, access_flags, repeat_count, + num_interleaved, data, NULL); +} + +static inline void mlx5_send_wr_mr_list(struct mlx5dv_qp_ex *dv_qp, + struct mlx5dv_mkey *mkey, + uint32_t access_flags, + uint16_t num_sges, + struct ibv_sge *sge) +{ + mlx5_send_wr_mr(dv_qp, mkey, access_flags, 0, num_sges, NULL, sge); +} + +static void mlx5_send_wr_set_dc_addr(struct mlx5dv_qp_ex *dv_qp, + struct ibv_ah *ah, + uint32_t remote_dctn, + uint64_t remote_dc_key) +{ + struct mlx5_qp *mqp = mqp_from_mlx5dv_qp_ex(dv_qp); + struct mlx5_wqe_datagram_seg *dseg = + (void *)mqp->cur_ctrl + sizeof(struct mlx5_wqe_ctrl_seg); + struct mlx5_ah *mah = to_mah(ah); + + memcpy(&dseg->av, &mah->av, sizeof(dseg->av)); + dseg->av.dqp_dct |= htobe32(remote_dctn | MLX5_EXTENDED_UD_AV); + dseg->av.key.dc_key = htobe64(remote_dc_key); + + if (mqp->cur_setters_cnt == WQE_REQ_SETTERS_UD_XRC_DC - 1) + _common_wqe_finilize(mqp); + else + mqp->cur_setters_cnt++; +} + +enum { + MLX5_SUPPORTED_SEND_OPS_FLAGS_RC = + IBV_QP_EX_WITH_SEND | + IBV_QP_EX_WITH_SEND_WITH_INV | + IBV_QP_EX_WITH_SEND_WITH_IMM | + IBV_QP_EX_WITH_RDMA_WRITE | + IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM | + IBV_QP_EX_WITH_RDMA_READ | + 
IBV_QP_EX_WITH_ATOMIC_CMP_AND_SWP | + IBV_QP_EX_WITH_ATOMIC_FETCH_AND_ADD | + IBV_QP_EX_WITH_LOCAL_INV | + IBV_QP_EX_WITH_BIND_MW, + MLX5_SUPPORTED_SEND_OPS_FLAGS_XRC = + MLX5_SUPPORTED_SEND_OPS_FLAGS_RC, + MLX5_SUPPORTED_SEND_OPS_FLAGS_DCI = + MLX5_SUPPORTED_SEND_OPS_FLAGS_RC, + MLX5_SUPPORTED_SEND_OPS_FLAGS_UD = + IBV_QP_EX_WITH_SEND | + IBV_QP_EX_WITH_SEND_WITH_IMM, + MLX5_SUPPORTED_SEND_OPS_FLAGS_UC = + IBV_QP_EX_WITH_SEND | + IBV_QP_EX_WITH_SEND_WITH_INV | + IBV_QP_EX_WITH_SEND_WITH_IMM | + IBV_QP_EX_WITH_RDMA_WRITE | + IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM | + IBV_QP_EX_WITH_LOCAL_INV | + IBV_QP_EX_WITH_BIND_MW, + MLX5_SUPPORTED_SEND_OPS_FLAGS_RAW_PACKET = + IBV_QP_EX_WITH_SEND | + IBV_QP_EX_WITH_TSO, +}; + +static void fill_wr_builders_rc_xrc_dc(struct ibv_qp_ex *ibqp) +{ + ibqp->wr_send = mlx5_send_wr_send_other; + ibqp->wr_send_imm = mlx5_send_wr_send_imm; + ibqp->wr_send_inv = mlx5_send_wr_send_inv; + ibqp->wr_rdma_write = mlx5_send_wr_rdma_write; + ibqp->wr_rdma_write_imm = mlx5_send_wr_rdma_write_imm; + ibqp->wr_rdma_read = mlx5_send_wr_rdma_read; + ibqp->wr_atomic_cmp_swp = mlx5_send_wr_atomic_cmp_swp; + ibqp->wr_atomic_fetch_add = mlx5_send_wr_atomic_fetch_add; + ibqp->wr_bind_mw = mlx5_send_wr_bind_mw; + ibqp->wr_local_inv = mlx5_send_wr_local_inv; +} + +static void fill_wr_builders_uc(struct ibv_qp_ex *ibqp) +{ + ibqp->wr_send = mlx5_send_wr_send_other; + ibqp->wr_send_imm = mlx5_send_wr_send_imm; + ibqp->wr_send_inv = mlx5_send_wr_send_inv; + ibqp->wr_rdma_write = mlx5_send_wr_rdma_write; + ibqp->wr_rdma_write_imm = mlx5_send_wr_rdma_write_imm; + ibqp->wr_bind_mw = mlx5_send_wr_bind_mw; + ibqp->wr_local_inv = mlx5_send_wr_local_inv; +} + +static void fill_wr_builders_ud(struct ibv_qp_ex *ibqp) +{ + ibqp->wr_send = mlx5_send_wr_send_other; + ibqp->wr_send_imm = mlx5_send_wr_send_imm; +} + +static void fill_wr_builders_eth(struct ibv_qp_ex *ibqp) +{ + ibqp->wr_send = mlx5_send_wr_send_eth; + ibqp->wr_send_tso = mlx5_send_wr_send_tso; +} + +static void fill_wr_setters_rc_uc(struct ibv_qp_ex *ibqp) +{ + ibqp->wr_set_sge = mlx5_send_wr_set_sge_rc_uc; + ibqp->wr_set_sge_list = mlx5_send_wr_set_sge_list_rc_uc; + ibqp->wr_set_inline_data = mlx5_send_wr_set_inline_data_rc_uc; + ibqp->wr_set_inline_data_list = mlx5_send_wr_set_inline_data_list_rc_uc; +} + +static void fill_wr_setters_ud_xrc_dc(struct ibv_qp_ex *ibqp) +{ + ibqp->wr_set_sge = mlx5_send_wr_set_sge_ud_xrc_dc; + ibqp->wr_set_sge_list = mlx5_send_wr_set_sge_list_ud_xrc_dc; + ibqp->wr_set_inline_data = mlx5_send_wr_set_inline_data_ud_xrc_dc; + ibqp->wr_set_inline_data_list = mlx5_send_wr_set_inline_data_list_ud_xrc_dc; +} + +static void fill_wr_setters_eth(struct ibv_qp_ex *ibqp) +{ + ibqp->wr_set_sge = mlx5_send_wr_set_sge_eth; + ibqp->wr_set_sge_list = mlx5_send_wr_set_sge_list_eth; + ibqp->wr_set_inline_data = mlx5_send_wr_set_inline_data_eth; + ibqp->wr_set_inline_data_list = mlx5_send_wr_set_inline_data_list_eth; +} + +int mlx5_qp_fill_wr_pfns(struct mlx5_qp *mqp, + const struct ibv_qp_init_attr_ex *attr, + const struct mlx5dv_qp_init_attr *mlx5_attr) +{ + struct ibv_qp_ex *ibqp = &mqp->verbs_qp.qp_ex; + uint64_t ops = attr->send_ops_flags; + struct mlx5dv_qp_ex *dv_qp; + uint64_t mlx5_ops = 0; + + ibqp->wr_start = mlx5_send_wr_start; + ibqp->wr_complete = mlx5_send_wr_complete; + ibqp->wr_abort = mlx5_send_wr_abort; + + if (!mqp->atomics_enabled && + (ops & IBV_QP_EX_WITH_ATOMIC_CMP_AND_SWP || + ops & IBV_QP_EX_WITH_ATOMIC_FETCH_AND_ADD)) + return EOPNOTSUPP; + + if (mlx5_attr && + mlx5_attr->comp_mask & 
MLX5DV_QP_INIT_ATTR_MASK_SEND_OPS_FLAGS) + mlx5_ops = mlx5_attr->send_ops_flags; + + /* Set all supported micro-functions regardless user request */ + switch (attr->qp_type) { + case IBV_QPT_RC: + if (ops & ~MLX5_SUPPORTED_SEND_OPS_FLAGS_RC) + return EOPNOTSUPP; + + fill_wr_builders_rc_xrc_dc(ibqp); + fill_wr_setters_rc_uc(ibqp); + + if (mlx5_ops) { + if (!check_comp_mask(mlx5_ops, + MLX5DV_QP_EX_WITH_MR_INTERLEAVED | + MLX5DV_QP_EX_WITH_MR_LIST)) + return EOPNOTSUPP; + + dv_qp = &mqp->dv_qp; + dv_qp->wr_mr_interleaved = mlx5_send_wr_mr_interleaved; + dv_qp->wr_mr_list = mlx5_send_wr_mr_list; + } + + break; + + case IBV_QPT_UC: + if (ops & ~MLX5_SUPPORTED_SEND_OPS_FLAGS_UC || mlx5_ops) + return EOPNOTSUPP; + + fill_wr_builders_uc(ibqp); + fill_wr_setters_rc_uc(ibqp); + break; + + case IBV_QPT_XRC_SEND: + if (ops & ~MLX5_SUPPORTED_SEND_OPS_FLAGS_XRC || mlx5_ops) + return EOPNOTSUPP; + + fill_wr_builders_rc_xrc_dc(ibqp); + fill_wr_setters_ud_xrc_dc(ibqp); + ibqp->wr_set_xrc_srqn = mlx5_send_wr_set_xrc_srqn; + break; + + case IBV_QPT_UD: + if (ops & ~MLX5_SUPPORTED_SEND_OPS_FLAGS_UD || mlx5_ops) + return EOPNOTSUPP; + + if (mqp->flags & MLX5_QP_FLAGS_USE_UNDERLAY) + return EOPNOTSUPP; + + fill_wr_builders_ud(ibqp); + fill_wr_setters_ud_xrc_dc(ibqp); + ibqp->wr_set_ud_addr = mlx5_send_wr_set_ud_addr; + break; + + case IBV_QPT_RAW_PACKET: + if (ops & ~MLX5_SUPPORTED_SEND_OPS_FLAGS_RAW_PACKET || mlx5_ops) + return EOPNOTSUPP; + + fill_wr_builders_eth(ibqp); + fill_wr_setters_eth(ibqp); + break; + + case IBV_QPT_DRIVER: + dv_qp = &mqp->dv_qp; + + if (!(mlx5_attr->comp_mask & MLX5DV_QP_INIT_ATTR_MASK_DC && + mlx5_attr->dc_init_attr.dc_type == MLX5DV_DCTYPE_DCI)) + return EOPNOTSUPP; + + if (ops & ~MLX5_SUPPORTED_SEND_OPS_FLAGS_DCI || mlx5_ops) + return EOPNOTSUPP; + + fill_wr_builders_rc_xrc_dc(ibqp); + fill_wr_setters_ud_xrc_dc(ibqp); + dv_qp->wr_set_dc_addr = mlx5_send_wr_set_dc_addr; + break; + + default: + return EOPNOTSUPP; + } + + return 0; +} + +int mlx5_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw, + struct ibv_mw_bind *mw_bind) +{ + struct ibv_mw_bind_info *bind_info = &mw_bind->bind_info; + struct ibv_send_wr wr = {}; + struct ibv_send_wr *bad_wr = NULL; + int ret; + + if (!bind_info->mr && (bind_info->addr || bind_info->length)) { + errno = EINVAL; + return errno; + } + + if (bind_info->mw_access_flags & IBV_ACCESS_ZERO_BASED) { + errno = EINVAL; + return errno; + } + + if (bind_info->mr) { + if (verbs_get_mr(bind_info->mr)->mr_type != IBV_MR_TYPE_MR) { + errno = ENOTSUP; + return errno; + } + + if (to_mmr(bind_info->mr)->alloc_flags & IBV_ACCESS_ZERO_BASED) { + errno = EINVAL; + return errno; + } + + if (mw->pd != bind_info->mr->pd) { + errno = EPERM; + return errno; + } + } + + wr.opcode = IBV_WR_BIND_MW; + wr.next = NULL; + wr.wr_id = mw_bind->wr_id; + wr.send_flags = mw_bind->send_flags; + wr.bind_mw.bind_info = mw_bind->bind_info; + wr.bind_mw.mw = mw; + wr.bind_mw.rkey = ibv_inc_rkey(mw->rkey); + + ret = _mlx5_post_send(qp, &wr, &bad_wr); + if (ret) + return ret; + + mw->rkey = wr.bind_mw.rkey; + + return 0; +} + +static void set_sig_seg(struct mlx5_qp *qp, struct mlx5_rwqe_sig *sig, + int size, uint16_t idx) +{ + uint8_t sign; + uint32_t qpn = qp->ibv_qp->qp_num; + + sign = calc_sig(sig, size); + sign ^= calc_sig(&qpn, 4); + sign ^= calc_sig(&idx, 2); + sig->signature = sign; +} + +static void set_wq_sig_seg(struct mlx5_rwq *rwq, struct mlx5_rwqe_sig *sig, + int size, uint16_t idx) +{ + uint8_t sign; + uint32_t qpn = rwq->wq.wq_num; + + sign = calc_sig(sig, size); + sign ^= 
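+	/*
+	 * Fold the queue number and WQE index into the signature so a
+	 * WQE consumed from the wrong queue or slot is caught:
+	 *	sig = csum(wqe) ^ csum(q_num) ^ csum(idx)
+	 */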
calc_sig(&qpn, 4); + sign ^= calc_sig(&idx, 2); + sig->signature = sign; +} + +int mlx5_post_wq_recv(struct ibv_wq *ibwq, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct mlx5_rwq *rwq = to_mrwq(ibwq); + struct mlx5_wqe_data_seg *scat; + int err = 0; + int nreq; + int ind; + int i, j; + struct mlx5_rwqe_sig *sig; + + mlx5_spin_lock(&rwq->rq.lock); + + ind = rwq->rq.head & (rwq->rq.wqe_cnt - 1); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (unlikely(mlx5_wq_overflow(&rwq->rq, nreq, + to_mcq(rwq->wq.cq)))) { + err = ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > rwq->rq.max_gs)) { + err = EINVAL; + *bad_wr = wr; + goto out; + } + + scat = get_wq_recv_wqe(rwq, ind); + sig = (struct mlx5_rwqe_sig *)scat; + if (unlikely(rwq->wq_sig)) { + memset(sig, 0, 1 << rwq->rq.wqe_shift); + ++scat; + } + + for (i = 0, j = 0; i < wr->num_sge; ++i) { + if (unlikely(!wr->sg_list[i].length)) + continue; + set_data_ptr_seg(scat + j++, wr->sg_list + i, 0); + } + + if (j < rwq->rq.max_gs) { + scat[j].byte_count = 0; + scat[j].lkey = htobe32(MLX5_INVALID_LKEY); + scat[j].addr = 0; + } + + if (unlikely(rwq->wq_sig)) + set_wq_sig_seg(rwq, sig, (wr->num_sge + 1) << 4, + rwq->rq.head & 0xffff); + + rwq->rq.wrid[ind] = wr->wr_id; + + ind = (ind + 1) & (rwq->rq.wqe_cnt - 1); + } + +out: + if (likely(nreq)) { + rwq->rq.head += nreq; + /* + * Make sure that descriptors are written before + * doorbell record. + */ + udma_to_device_barrier(); + *(rwq->recv_db) = htobe32(rwq->rq.head & 0xffff); + } + + mlx5_spin_unlock(&rwq->rq.lock); + + return err; +} + +int mlx5_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct mlx5_qp *qp = to_mqp(ibqp); + struct mlx5_wqe_data_seg *scat; + int err = 0; + int nreq; + int ind; + int i, j; + struct mlx5_rwqe_sig *sig; + + mlx5_spin_lock(&qp->rq.lock); + + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (unlikely(mlx5_wq_overflow(&qp->rq, nreq, + to_mcq(qp->ibv_qp->recv_cq)))) { + err = ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->rq.max_gs)) { + err = EINVAL; + *bad_wr = wr; + goto out; + } + + scat = get_recv_wqe(qp, ind); + sig = (struct mlx5_rwqe_sig *)scat; + if (unlikely(qp->wq_sig)) { + memset(sig, 0, 1 << qp->rq.wqe_shift); + ++scat; + } + + for (i = 0, j = 0; i < wr->num_sge; ++i) { + if (unlikely(!wr->sg_list[i].length)) + continue; + set_data_ptr_seg(scat + j++, wr->sg_list + i, 0); + } + + if (j < qp->rq.max_gs) { + scat[j].byte_count = 0; + scat[j].lkey = htobe32(MLX5_INVALID_LKEY); + scat[j].addr = 0; + } + + if (unlikely(qp->wq_sig)) + set_sig_seg(qp, sig, (wr->num_sge + 1) << 4, + qp->rq.head & 0xffff); + + qp->rq.wrid[ind] = wr->wr_id; + + ind = (ind + 1) & (qp->rq.wqe_cnt - 1); + } + +out: + if (likely(nreq)) { + qp->rq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + udma_to_device_barrier(); + + /* + * For Raw Packet QP, avoid updating the doorbell record + * as long as the QP isn't in RTR state, to avoid receiving + * packets in illegal states. + * This is only for Raw Packet QPs since they are represented + * differently in the hardware. 
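+		 * The WQEs themselves are still written and counted in
+		 * rq.head; only the doorbell record update is deferred
+		 * until the QP reaches RTR.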
+ */ + if (likely(!((ibqp->qp_type == IBV_QPT_RAW_PACKET || + qp->flags & MLX5_QP_FLAGS_USE_UNDERLAY) && + ibqp->state < IBV_QPS_RTR))) + qp->db[MLX5_RCV_DBR] = htobe32(qp->rq.head & 0xffff); + } + + mlx5_spin_unlock(&qp->rq.lock); + + return err; +} + +static void mlx5_tm_add_op(struct mlx5_srq *srq, struct mlx5_tag_entry *tag, + uint64_t wr_id, int nreq) +{ + struct mlx5_qp *qp = to_mqp(srq->cmd_qp); + struct mlx5_srq_op *op; + + op = srq->op + (srq->op_tail++ & (qp->sq.wqe_cnt - 1)); + op->tag = tag; + op->wr_id = wr_id; + /* Will point to next available WQE */ + op->wqe_head = qp->sq.head + nreq; + if (tag) + tag->expect_cqe++; +} + +int mlx5_post_srq_ops(struct ibv_srq *ibsrq, struct ibv_ops_wr *wr, + struct ibv_ops_wr **bad_wr) +{ + struct mlx5_context *ctx = to_mctx(ibsrq->context); + struct mlx5_srq *srq = to_msrq(ibsrq); + struct mlx5_wqe_ctrl_seg *ctrl = NULL; + struct mlx5_tag_entry *tag; + struct mlx5_bf *bf; + struct mlx5_qp *qp; + unsigned int idx; + int size = 0; + int nreq = 0; + int err = 0; + void *qend; + void *seg; + FILE *fp = ctx->dbg_fp; + + if (unlikely(!srq->cmd_qp)) { + *bad_wr = wr; + return EINVAL; + } + + qp = to_mqp(srq->cmd_qp); + bf = qp->bf; + qend = qp->sq.qend; + mlx5_spin_lock(&srq->lock); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, + to_mcq(qp->ibv_qp->send_cq)))) { + mlx5_dbg(fp, MLX5_DBG_QP_SEND, "work queue overflow\n"); + err = ENOMEM; + *bad_wr = wr; + goto out; + } + + idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1); + ctrl = seg = mlx5_get_send_wqe(qp, idx); + *(uint32_t *)(seg + 8) = 0; + ctrl->imm = 0; + ctrl->fm_ce_se = 0; + + seg += sizeof(*ctrl); + size = sizeof(*ctrl) / 16; + + switch (wr->opcode) { + case IBV_WR_TAG_ADD: + if (unlikely(!srq->tm_head->next)) { + mlx5_dbg(fp, MLX5_DBG_QP_SEND, "tag matching list is full\n"); + err = ENOMEM; + *bad_wr = wr; + goto out; + } + tag = srq->tm_head; +#ifdef MLX5_DEBUG + if (wr->tm.add.num_sge > 1) { + mlx5_dbg(fp, MLX5_DBG_QP_SEND, "num_sge must be at most 1\n"); + err = EINVAL; + *bad_wr = wr; + goto out; + } + + if (tag->expect_cqe) { + mlx5_dbg(fp, MLX5_DBG_QP_SEND, "tag matching list is corrupted\n"); + err = ENOMEM; + *bad_wr = wr; + goto out; + } +#endif + srq->tm_head = tag->next; + /* place index of next entry into TM segment */ + set_tm_seg(seg, MLX5_TM_OPCODE_APPEND, wr, + tag->next - srq->tm_list); + tag->next = NULL; + tag->wr_id = wr->tm.add.recv_wr_id; + if (wr->flags & IBV_OPS_TM_SYNC) + srq->unexp_out = wr->tm.unexpected_cnt; + tag->phase_cnt = srq->unexp_out; + tag->expect_cqe++; + + if (wr->flags & IBV_OPS_SIGNALED) + mlx5_tm_add_op(srq, tag, wr->wr_id, nreq); + + wr->tm.handle = tag - srq->tm_list; + seg += sizeof(struct mlx5_wqe_tm_seg); + size += sizeof(struct mlx5_wqe_tm_seg) / 16; + + if (unlikely(seg == qend)) + seg = mlx5_get_send_wqe(qp, 0); + + /* message is allowed to be empty */ + if (wr->tm.add.num_sge && wr->tm.add.sg_list->length) { + set_data_ptr_seg(seg, wr->tm.add.sg_list, 0); + tag->ptr = (void *)(uintptr_t)wr->tm.add.sg_list->addr; + tag->size = wr->tm.add.sg_list->length; + } else { + set_data_ptr_seg_end(seg); + } + size += sizeof(struct mlx5_wqe_data_seg) / 16; + break; + + case IBV_WR_TAG_DEL: + tag = &srq->tm_list[wr->tm.handle]; + +#ifdef MLX5_DEBUG + if (!tag->expect_cqe) { + mlx5_dbg(fp, MLX5_DBG_QP_SEND, "removing tag which isn't in HW ownership\n"); + err = ENOMEM; + *bad_wr = wr; + goto out; + } +#endif + set_tm_seg(seg, MLX5_TM_OPCODE_REMOVE, wr, + wr->tm.handle); + + if (wr->flags & IBV_OPS_SIGNALED) 
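+			/*
+			 * A signaled TAG_DEL produces a CQE, so the tag
+			 * entry must stay allocated until that CQE is
+			 * consumed; otherwise release it immediately.
+			 */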
+ mlx5_tm_add_op(srq, tag, wr->wr_id, nreq); + else + mlx5_tm_release_tag(srq, tag); + + seg += sizeof(struct mlx5_wqe_tm_seg); + size += sizeof(struct mlx5_wqe_tm_seg) / 16; + break; + + case IBV_WR_TAG_SYNC: + set_tm_seg(seg, MLX5_TM_OPCODE_NOP, wr, 0); + + if (wr->flags & IBV_OPS_SIGNALED) + mlx5_tm_add_op(srq, NULL, wr->wr_id, nreq); + + seg += sizeof(struct mlx5_wqe_tm_seg); + size += sizeof(struct mlx5_wqe_tm_seg) / 16; + break; + + default: + mlx5_dbg(fp, MLX5_DBG_QP_SEND, "bad opcode %d\n", + wr->opcode); + err = EINVAL; + *bad_wr = wr; + goto out; + } + + ctrl->opmod_idx_opcode = htobe32(MLX5_OPCODE_TAG_MATCHING | + ((qp->sq.cur_post & 0xffff) << 8)); + ctrl->qpn_ds = htobe32(size | (srq->cmd_qp->qp_num << 8)); + + if (unlikely(qp->wq_sig)) + ctrl->signature = wq_sig(ctrl); + + qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB); + +#ifdef MLX5_DEBUG + if (mlx5_debug_mask & MLX5_DBG_QP_SEND) + dump_wqe(fp, idx, size, qp); +#endif + } + +out: + qp->fm_cache = 0; + post_send_db(qp, bf, nreq, 0, size, ctrl); + + mlx5_spin_unlock(&srq->lock); + + return err; +} + +int mlx5_use_huge(const char *key) +{ + char *e; + e = getenv(key); + if (e && !strcmp(e, "y")) + return 1; + + return 0; +} + +struct mlx5_qp *mlx5_find_qp(struct mlx5_context *ctx, uint32_t qpn) +{ + int tind = qpn >> MLX5_QP_TABLE_SHIFT; + + if (ctx->qp_table[tind].refcnt) + return ctx->qp_table[tind].table[qpn & MLX5_QP_TABLE_MASK]; + else + return NULL; +} + +int mlx5_store_qp(struct mlx5_context *ctx, uint32_t qpn, struct mlx5_qp *qp) +{ + int tind = qpn >> MLX5_QP_TABLE_SHIFT; + + if (!ctx->qp_table[tind].refcnt) { + ctx->qp_table[tind].table = calloc(MLX5_QP_TABLE_MASK + 1, + sizeof(struct mlx5_qp *)); + if (!ctx->qp_table[tind].table) + return -1; + } + + ++ctx->qp_table[tind].refcnt; + ctx->qp_table[tind].table[qpn & MLX5_QP_TABLE_MASK] = qp; + return 0; +} + +void mlx5_clear_qp(struct mlx5_context *ctx, uint32_t qpn) +{ + int tind = qpn >> MLX5_QP_TABLE_SHIFT; + + if (!--ctx->qp_table[tind].refcnt) + free(ctx->qp_table[tind].table); + else + ctx->qp_table[tind].table[qpn & MLX5_QP_TABLE_MASK] = NULL; +} diff --git a/providers/mlx5/srq.c b/providers/mlx5/srq.c new file mode 100644 index 0000000..e9568c6 --- /dev/null +++ b/providers/mlx5/srq.c @@ -0,0 +1,382 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <config.h> + +#include <stdlib.h> +#include <pthread.h> +#include <string.h> +#include <errno.h> + +#include "mlx5.h" +#include "wqe.h" + +static void *get_wqe(struct mlx5_srq *srq, int n) +{ + return srq->buf.buf + (n << srq->wqe_shift); +} + +int mlx5_copy_to_recv_srq(struct mlx5_srq *srq, int idx, void *buf, int size) +{ + struct mlx5_wqe_srq_next_seg *next; + struct mlx5_wqe_data_seg *scat; + int copy; + int i; + int max = 1 << (srq->wqe_shift - 4); + + next = get_wqe(srq, idx); + scat = (struct mlx5_wqe_data_seg *) (next + 1); + + for (i = 0; i < max; ++i) { + copy = min_t(long, size, be32toh(scat->byte_count)); + memcpy((void *)(unsigned long)be64toh(scat->addr), buf, copy); + size -= copy; + if (size <= 0) + return IBV_WC_SUCCESS; + + buf += copy; + ++scat; + } + return IBV_WC_LOC_LEN_ERR; +} + +void mlx5_free_srq_wqe(struct mlx5_srq *srq, int ind) +{ + struct mlx5_wqe_srq_next_seg *next; + + mlx5_spin_lock(&srq->lock); + + next = get_wqe(srq, srq->tail); + next->next_wqe_index = htobe16(ind); + srq->tail = ind; + + mlx5_spin_unlock(&srq->lock); +} + +/* Take an index and put it last in wait queue */ +static void srq_put_in_waitq(struct mlx5_srq *srq, int ind) +{ + struct mlx5_wqe_srq_next_seg *waitq_tail; + + waitq_tail = get_wqe(srq, srq->waitq_tail); + waitq_tail->next_wqe_index = htobe16(ind); + srq->waitq_tail = ind; +} + +/* Take first in wait queue and put in tail of SRQ */ +static void srq_get_from_waitq(struct mlx5_srq *srq) +{ + struct mlx5_wqe_srq_next_seg *tail; + struct mlx5_wqe_srq_next_seg *waitq_head; + + tail = get_wqe(srq, srq->tail); + waitq_head = get_wqe(srq, srq->waitq_head); + + tail->next_wqe_index = htobe16(srq->waitq_head); + srq->tail = srq->waitq_head; + srq->waitq_head = be16toh(waitq_head->next_wqe_index); +} + +/* Put the given WQE that is in SW ownership at the end of the wait queue. + * Take a WQE from the wait queue and add it to WQEs in SW ownership instead. + */ +bool srq_cooldown_wqe(struct mlx5_srq *srq, int ind) +{ + if (!srq_has_waitq(srq)) + return false; + + srq_put_in_waitq(srq, ind); + srq_get_from_waitq(srq); + return true; +} + +/* Post a WQE internally, based on a previous application post. + * Copy a given WQE's data segments to the SRQ head, advance the head + * and ring the HW doorbell. + */ +static void srq_repost(struct mlx5_srq *srq, int ind) +{ + struct mlx5_wqe_srq_next_seg *src, *dst; + struct mlx5_wqe_data_seg *src_scat, *dst_scat; + int i; + + srq->wrid[srq->head] = srq->wrid[ind]; + + src = get_wqe(srq, ind); + dst = get_wqe(srq, srq->head); + src_scat = (struct mlx5_wqe_data_seg *)(src + 1); + dst_scat = (struct mlx5_wqe_data_seg *)(dst + 1); + + for (i = 0; i < srq->max_gs; ++i) { + dst_scat[i] = src_scat[i]; + + if (dst_scat[i].lkey == htobe32(MLX5_INVALID_LKEY)) + break; + } + + srq->head = be16toh(dst->next_wqe_index); + srq->counter++; + /* Flush descriptors */ + udma_to_device_barrier(); + *srq->db = htobe32(srq->counter); +} + +void mlx5_complete_odp_fault(struct mlx5_srq *srq, int ind) +{ + mlx5_spin_lock(&srq->lock); + + if (!srq_cooldown_wqe(srq, ind)) { + struct mlx5_wqe_srq_next_seg *tail = get_wqe(srq, srq->tail); + + /* Without a wait queue put the page-faulted wqe + * back in SRQ tail. 
The repost is still possible but + * the risk of overriding the page-faulted WQE with a future + * post_srq_recv() is now higher. + */ + tail->next_wqe_index = htobe16(ind); + srq->tail = ind; + } + + srq_repost(srq, ind); + + mlx5_spin_unlock(&srq->lock); +} + +int mlx5_post_srq_recv(struct ibv_srq *ibsrq, + struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct mlx5_srq *srq = to_msrq(ibsrq); + struct mlx5_wqe_srq_next_seg *next; + struct mlx5_wqe_data_seg *scat; + int err = 0; + int nreq; + int i; + + mlx5_spin_lock(&srq->lock); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (wr->num_sge > srq->max_gs) { + err = EINVAL; + *bad_wr = wr; + break; + } + + if (srq->head == srq->tail) { + /* SRQ is full*/ + err = ENOMEM; + *bad_wr = wr; + break; + } + + srq->wrid[srq->head] = wr->wr_id; + + next = get_wqe(srq, srq->head); + srq->head = be16toh(next->next_wqe_index); + scat = (struct mlx5_wqe_data_seg *) (next + 1); + + for (i = 0; i < wr->num_sge; ++i) { + scat[i].byte_count = htobe32(wr->sg_list[i].length); + scat[i].lkey = htobe32(wr->sg_list[i].lkey); + scat[i].addr = htobe64(wr->sg_list[i].addr); + } + + if (i < srq->max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = htobe32(MLX5_INVALID_LKEY); + scat[i].addr = 0; + } + } + + if (nreq) { + srq->counter += nreq; + + /* + * Make sure that descriptors are written before + * we write doorbell record. + */ + udma_to_device_barrier(); + + *srq->db = htobe32(srq->counter); + } + + mlx5_spin_unlock(&srq->lock); + + return err; +} + +/* Build a linked list on an array of SRQ WQEs. + * Since WQEs are always added to the tail and taken from the head + * it doesn't matter where the last WQE points to. + */ +static void set_srq_buf_ll(struct mlx5_srq *srq, int start, int end) +{ + struct mlx5_wqe_srq_next_seg *next; + int i; + + for (i = start; i < end; ++i) { + next = get_wqe(srq, i); + next->next_wqe_index = htobe16(i + 1); + } +} + +int mlx5_alloc_srq_buf(struct ibv_context *context, struct mlx5_srq *srq, + uint32_t max_wr, struct ibv_pd *pd) +{ + int size; + int buf_size; + struct mlx5_context *ctx; + uint32_t orig_max_wr = max_wr; + bool have_wq = true; + enum mlx5_alloc_type alloc_type; + + ctx = to_mctx(context); + + if (srq->max_gs < 0) { + errno = EINVAL; + return -1; + } + + /* At first, try to allocate more WQEs than requested so the extra will + * be used for the wait queue. + */ + max_wr = orig_max_wr * 2 + 1; + + if (max_wr > ctx->max_srq_recv_wr) { + /* Device limits are smaller than required + * to provide a wait queue, continue without. 
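+	 * Without a wait queue srq_cooldown_wqe() fails, and an
+	 * ODP-faulted WQE is simply re-linked at the SRQ tail.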
+ */ + max_wr = orig_max_wr + 1; + have_wq = false; + } + + size = sizeof(struct mlx5_wqe_srq_next_seg) + + srq->max_gs * sizeof(struct mlx5_wqe_data_seg); + size = max(32, size); + + size = roundup_pow_of_two(size); + + if (size > ctx->max_recv_wr) { + errno = EINVAL; + return -1; + } + srq->max_gs = (size - sizeof(struct mlx5_wqe_srq_next_seg)) / + sizeof(struct mlx5_wqe_data_seg); + + srq->wqe_shift = ilog32(size - 1); + + srq->max = align_queue_size(max_wr); + buf_size = srq->max * size; + + mlx5_get_alloc_type(ctx, pd, MLX5_SRQ_PREFIX, &alloc_type, + MLX5_ALLOC_TYPE_ANON); + + if (alloc_type == MLX5_ALLOC_TYPE_CUSTOM) { + srq->buf.mparent_domain = to_mparent_domain(pd); + srq->buf.req_alignment = to_mdev(context->device)->page_size; + srq->buf.resource_type = MLX5DV_RES_TYPE_SRQ; + } + + if (mlx5_alloc_prefered_buf(ctx, + &srq->buf, buf_size, + to_mdev(context->device)->page_size, + alloc_type, + MLX5_SRQ_PREFIX)) + return -1; + + if (srq->buf.type != MLX5_ALLOC_TYPE_CUSTOM) + memset(srq->buf.buf, 0, buf_size); + + srq->head = 0; + srq->tail = align_queue_size(orig_max_wr + 1) - 1; + if (have_wq) { + srq->waitq_head = srq->tail + 1; + srq->waitq_tail = srq->max - 1; + } else { + srq->waitq_head = -1; + srq->waitq_tail = -1; + } + + srq->wrid = malloc(srq->max * sizeof(*srq->wrid)); + if (!srq->wrid) { + mlx5_free_actual_buf(ctx, &srq->buf); + return -1; + } + + /* + * Now initialize the SRQ buffer so that all of the WQEs are + * linked into the list of free WQEs. + */ + + set_srq_buf_ll(srq, srq->head, srq->tail); + if (have_wq) + set_srq_buf_ll(srq, srq->waitq_head, srq->waitq_tail); + + return 0; +} + +struct mlx5_srq *mlx5_find_srq(struct mlx5_context *ctx, uint32_t srqn) +{ + int tind = srqn >> MLX5_SRQ_TABLE_SHIFT; + + if (ctx->srq_table[tind].refcnt) + return ctx->srq_table[tind].table[srqn & MLX5_SRQ_TABLE_MASK]; + else + return NULL; +} + +int mlx5_store_srq(struct mlx5_context *ctx, uint32_t srqn, + struct mlx5_srq *srq) +{ + int tind = srqn >> MLX5_SRQ_TABLE_SHIFT; + + if (!ctx->srq_table[tind].refcnt) { + ctx->srq_table[tind].table = calloc(MLX5_SRQ_TABLE_MASK + 1, + sizeof(struct mlx5_srq *)); + if (!ctx->srq_table[tind].table) + return -1; + } + + ++ctx->srq_table[tind].refcnt; + ctx->srq_table[tind].table[srqn & MLX5_SRQ_TABLE_MASK] = srq; + return 0; +} + +void mlx5_clear_srq(struct mlx5_context *ctx, uint32_t srqn) +{ + int tind = srqn >> MLX5_SRQ_TABLE_SHIFT; + + if (!--ctx->srq_table[tind].refcnt) + free(ctx->srq_table[tind].table); + else + ctx->srq_table[tind].table[srqn & MLX5_SRQ_TABLE_MASK] = NULL; +} diff --git a/providers/mlx5/verbs.c b/providers/mlx5/verbs.c new file mode 100644 index 0000000..47e8380 --- /dev/null +++ b/providers/mlx5/verbs.c @@ -0,0 +1,5301 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <config.h> + +#include <stdlib.h> +#include <stdio.h> +#include <stdatomic.h> +#include <string.h> +#include <pthread.h> +#include <errno.h> +#include <limits.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> +#include <sys/mman.h> +#include <ccan/array_size.h> + +#include <util/compiler.h> +#include <util/mmio.h> +#include <rdma/ib_user_ioctl_cmds.h> +#include <rdma/mlx5_user_ioctl_cmds.h> +#include <infiniband/cmd_write.h> + +#include "mlx5.h" +#include "mlx5-abi.h" +#include "wqe.h" +#include "mlx5_ifc.h" + +int mlx5_single_threaded = 0; + +static inline int is_xrc_tgt(int type) +{ + return type == IBV_QPT_XRC_RECV; +} + +int mlx5_query_device(struct ibv_context *context, struct ibv_device_attr *attr) +{ + struct ibv_query_device cmd; + uint64_t raw_fw_ver; + unsigned major, minor, sub_minor; + int ret; + + ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, &cmd, sizeof cmd); + if (ret) + return ret; + + major = (raw_fw_ver >> 32) & 0xffff; + minor = (raw_fw_ver >> 16) & 0xffff; + sub_minor = raw_fw_ver & 0xffff; + + snprintf(attr->fw_ver, sizeof attr->fw_ver, + "%d.%d.%04d", major, minor, sub_minor); + + return 0; +} + +static int mlx5_read_clock(struct ibv_context *context, uint64_t *cycles) +{ + unsigned int clockhi, clocklo, clockhi1; + int i; + struct mlx5_context *ctx = to_mctx(context); + + if (!ctx->hca_core_clock) + return -EOPNOTSUPP; + + /* Handle wraparound */ + for (i = 0; i < 2; i++) { + clockhi = be32toh(mmio_read32_be(ctx->hca_core_clock)); + clocklo = be32toh(mmio_read32_be(ctx->hca_core_clock + 4)); + clockhi1 = be32toh(mmio_read32_be(ctx->hca_core_clock)); + if (clockhi == clockhi1) + break; + } + + *cycles = (uint64_t)clockhi << 32 | (uint64_t)clocklo; + + return 0; +} + +int mlx5_query_rt_values(struct ibv_context *context, + struct ibv_values_ex *values) +{ + uint32_t comp_mask = 0; + int err = 0; + + if (!check_comp_mask(values->comp_mask, IBV_VALUES_MASK_RAW_CLOCK)) + return EINVAL; + + if (values->comp_mask & IBV_VALUES_MASK_RAW_CLOCK) { + uint64_t cycles; + + err = mlx5_read_clock(context, &cycles); + if (!err) { + values->raw_clock.tv_sec = 0; + values->raw_clock.tv_nsec = cycles; + comp_mask |= IBV_VALUES_MASK_RAW_CLOCK; + } + } + + values->comp_mask = comp_mask; + + return err; +} + +int mlx5_query_port(struct ibv_context *context, uint8_t port, + struct ibv_port_attr *attr) +{ + struct ibv_query_port cmd; + + return ibv_cmd_query_port(context, port, attr, &cmd, sizeof cmd); +} + +void mlx5_async_event(struct ibv_context *context, + struct ibv_async_event *event) +{ + struct mlx5_context *ctx; + + switch (event->event_type) { + case IBV_EVENT_DEVICE_FATAL: + ctx = to_mctx(context); + ctx->flags |= MLX5_CTX_FLAGS_FATAL_STATE; + break; + default: + break; + } +} + +struct ibv_pd 
*mlx5_alloc_pd(struct ibv_context *context) +{ + struct ibv_alloc_pd cmd; + struct mlx5_alloc_pd_resp resp; + struct mlx5_pd *pd; + + pd = calloc(1, sizeof *pd); + if (!pd) + return NULL; + + if (ibv_cmd_alloc_pd(context, &pd->ibv_pd, &cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp)) { + free(pd); + return NULL; + } + + atomic_init(&pd->refcount, 1); + pd->pdn = resp.pdn; + + return &pd->ibv_pd; +} + +static void mlx5_free_uar(struct ibv_context *ctx, + struct mlx5_bf *bf) +{ + DECLARE_COMMAND_BUFFER(cmd, + MLX5_IB_OBJECT_UAR, + MLX5_IB_METHOD_UAR_OBJ_DESTROY, + 1); + + if (!bf->length) + goto end; + + if (bf->mmaped_entry && munmap(bf->uar, bf->length)) + assert(false); + + if (!bf->dyn_alloc_uar) + goto end; + + fill_attr_in_obj(cmd, MLX5_IB_ATTR_UAR_OBJ_DESTROY_HANDLE, bf->uar_handle); + if (execute_ioctl(ctx, cmd)) + assert(false); + +end: + free(bf); +} + +static struct mlx5_bf * +mlx5_alloc_dyn_uar(struct ibv_context *context, uint32_t flags) +{ + DECLARE_COMMAND_BUFFER(cmd, + MLX5_IB_OBJECT_UAR, + MLX5_IB_METHOD_UAR_OBJ_ALLOC, + 5); + + struct ib_uverbs_attr *handle; + struct mlx5_context *ctx = to_mctx(context); + struct mlx5_bf *bf; + bool legacy_mode = false; + off_t offset; + int ret; + + if (ctx->flags & MLX5_CTX_FLAGS_NO_KERN_DYN_UAR) { + if (flags == MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC) { + errno = EOPNOTSUPP; + return NULL; + } + + if (ctx->curr_legacy_dyn_sys_uar_page > + ctx->max_num_legacy_dyn_uar_sys_page) { + errno = ENOSPC; + return NULL; + } + + legacy_mode = true; + } + + bf = calloc(1, sizeof(*bf)); + if (!bf) { + errno = ENOMEM; + return NULL; + } + + if (legacy_mode) { + struct mlx5_device *dev = to_mdev(context->device); + + offset = get_uar_mmap_offset(ctx->curr_legacy_dyn_sys_uar_page, dev->page_size, + MLX5_IB_MMAP_ALLOC_WC); + bf->length = dev->page_size; + goto do_mmap; + } + + bf->dyn_alloc_uar = true; + handle = fill_attr_out_obj(cmd, MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE); + fill_attr_const_in(cmd, MLX5_IB_ATTR_UAR_OBJ_ALLOC_TYPE, + flags); + fill_attr_out_ptr(cmd, MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_OFFSET, + &bf->uar_mmap_offset); + fill_attr_out_ptr(cmd, MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_LENGTH, &bf->length); + fill_attr_out_ptr(cmd, MLX5_IB_ATTR_UAR_OBJ_ALLOC_PAGE_ID, &bf->page_id); + + ret = execute_ioctl(context, cmd); + if (ret) { + free(bf); + return NULL; + } + +do_mmap: + bf->uar = mmap(NULL, bf->length, PROT_WRITE, MAP_SHARED, + context->cmd_fd, + legacy_mode ? offset : bf->uar_mmap_offset); + + if (bf->uar == MAP_FAILED) + goto err; + + bf->mmaped_entry = true; + + if (legacy_mode) + ctx->curr_legacy_dyn_sys_uar_page++; + else + bf->uar_handle = read_attr_obj(MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE, + handle); + + bf->nc_mode = (flags == MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC); + + return bf; + +err: + mlx5_free_uar(context, bf); + return NULL; +} + +static void mlx5_insert_dyn_uuars(struct mlx5_context *ctx, + struct mlx5_bf *bf_uar) +{ + int index_in_uar, index_uar_in_page; + int num_bfregs_per_page; + struct list_head *head; + struct mlx5_bf *bf = bf_uar; + int j; + + num_bfregs_per_page = ctx->num_uars_per_page * MLX5_NUM_NON_FP_BFREGS_PER_UAR; + if (bf_uar->qp_dedicated) + head = &ctx->dyn_uar_qp_dedicated_list; + else if (bf_uar->qp_shared) + head = &ctx->dyn_uar_qp_shared_list; + else + head = (bf_uar->nc_mode) ? 
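+		/* Free (non-QP) UARs are recycled on per-mapping-type
+		 * lists: non-cacheable vs. BlueFlame (write-combining). */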
+		&ctx->dyn_uar_nc_list : &ctx->dyn_uar_bf_list;
+
+	for (j = 0; j < num_bfregs_per_page; j++) {
+		if (j != 0) {
+			bf = calloc(1, sizeof(*bf));
+			if (!bf)
+				return;
+		}
+
+		index_uar_in_page = (j % num_bfregs_per_page) /
+			MLX5_NUM_NON_FP_BFREGS_PER_UAR;
+		index_in_uar = j % MLX5_NUM_NON_FP_BFREGS_PER_UAR;
+		bf->reg = bf_uar->uar + (index_uar_in_page * MLX5_ADAPTER_PAGE_SIZE) +
+			MLX5_BF_OFFSET + (index_in_uar * ctx->bf_reg_size);
+		bf->buf_size = bf_uar->nc_mode ? 0 : ctx->bf_reg_size / 2;
+		/* Set to non-zero if this is a BF entry; detected as part of post_send */
+		bf->uuarn = bf_uar->nc_mode ? 0 : 1;
+		list_node_init(&bf->uar_entry);
+		list_add_tail(head, &bf->uar_entry);
+		if (!bf_uar->dyn_alloc_uar)
+			bf->bfreg_dyn_index = (ctx->curr_legacy_dyn_sys_uar_page - 1) * num_bfregs_per_page;
+		bf->dyn_alloc_uar = bf_uar->dyn_alloc_uar;
+		bf->need_lock = bf_uar->qp_shared;
+		mlx5_spinlock_init(&bf->lock, bf->need_lock);
+		if (j != 0) {
+			bf->uar = bf_uar->uar;
+			bf->page_id = bf_uar->page_id + index_uar_in_page;
+			bf->uar_handle = bf_uar->uar_handle;
+			bf->nc_mode = bf_uar->nc_mode;
+		}
+		if (bf_uar->qp_dedicated) {
+			ctx->qp_alloc_dedicated_uuars++;
+			bf->qp_dedicated = true;
+		} else if (bf_uar->qp_shared) {
+			ctx->qp_alloc_shared_uuars++;
+			bf->qp_shared = true;
+		}
+	}
+}
+
+static void mlx5_put_qp_uar(struct mlx5_context *ctx, struct mlx5_bf *bf)
+{
+	if (!bf || (!bf->qp_dedicated && !bf->qp_shared))
+		return;
+
+	pthread_mutex_lock(&ctx->dyn_bfregs_mutex);
+	if (bf->qp_dedicated)
+		list_add_tail(&ctx->dyn_uar_qp_dedicated_list,
+			      &bf->uar_entry);
+	else
+		bf->count--;
+	pthread_mutex_unlock(&ctx->dyn_bfregs_mutex);
+}
+
+static int mlx5_alloc_qp_uar(struct ibv_context *context, bool dedicated)
+{
+	struct mlx5_context *ctx = to_mctx(context);
+	struct mlx5_bf *bf;
+	uint32_t flags;
+
+	flags = (ctx->shut_up_bf || !ctx->bf_reg_size) ?
+		MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC :
+		MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF;
+
+	bf = mlx5_alloc_dyn_uar(context, flags);
+	if (!bf)
+		return -1;
+
+	if (dedicated)
+		bf->qp_dedicated = true;
+	else
+		bf->qp_shared = true;
+
+	mlx5_insert_dyn_uuars(ctx, bf);
+	return 0;
+}
+
+static struct mlx5_bf *mlx5_get_qp_uar(struct ibv_context *context)
+{
+	struct mlx5_context *ctx = to_mctx(context);
+	struct mlx5_bf *bf = NULL, *bf_entry;
+
+	pthread_mutex_lock(&ctx->dyn_bfregs_mutex);
+	do {
+		bf = list_pop(&ctx->dyn_uar_qp_dedicated_list, struct mlx5_bf, uar_entry);
+		if (bf)
+			break;
+
+		if (ctx->qp_alloc_dedicated_uuars < ctx->qp_max_dedicated_uuars) {
+			if (mlx5_alloc_qp_uar(context, true))
+				break;
+			continue;
+		}
+
+		if (ctx->qp_alloc_shared_uuars < ctx->qp_max_shared_uuars) {
+			if (mlx5_alloc_qp_uar(context, false))
+				break;
+		}
+
+		/* Look for the shared uuar with the least concurrent usage */
+		list_for_each(&ctx->dyn_uar_qp_shared_list, bf_entry, uar_entry) {
+			if (!bf) {
+				bf = bf_entry;
+			} else {
+				if (bf_entry->count < bf->count)
+					bf = bf_entry;
+			}
+		}
+		bf->count++;
+	} while (!bf);
+
+	pthread_mutex_unlock(&ctx->dyn_bfregs_mutex);
+	return bf;
+}
+
+/* Returns a dedicated UAR */
+struct mlx5_bf *mlx5_attach_dedicated_uar(struct ibv_context *context,
+					  uint32_t flags)
+{
+	struct mlx5_context *ctx = to_mctx(context);
+	struct mlx5_bf *bf;
+	struct list_head *head;
+
+	pthread_mutex_lock(&ctx->dyn_bfregs_mutex);
+	head = (flags == MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC) ?
+ &ctx->dyn_uar_nc_list : &ctx->dyn_uar_bf_list; + bf = list_pop(head, struct mlx5_bf, uar_entry); + if (!bf) { + bf = mlx5_alloc_dyn_uar(context, flags); + if (!bf) + goto end; + mlx5_insert_dyn_uuars(ctx, bf); + bf = list_pop(head, struct mlx5_bf, uar_entry); + assert(bf); + } +end: + pthread_mutex_unlock(&ctx->dyn_bfregs_mutex); + return bf; +} + +static void mlx5_detach_dedicated_uar(struct ibv_context *context, struct mlx5_bf *bf) +{ + struct mlx5_context *ctx = to_mctx(context); + + pthread_mutex_lock(&ctx->dyn_bfregs_mutex); + list_add_tail(bf->nc_mode ? &ctx->dyn_uar_nc_list : + &ctx->dyn_uar_bf_list, + &bf->uar_entry); + pthread_mutex_unlock(&ctx->dyn_bfregs_mutex); + return; +} + +struct ibv_td *mlx5_alloc_td(struct ibv_context *context, struct ibv_td_init_attr *init_attr) +{ + struct mlx5_td *td; + + if (init_attr->comp_mask) { + errno = EINVAL; + return NULL; + } + + td = calloc(1, sizeof(*td)); + if (!td) { + errno = ENOMEM; + return NULL; + } + + td->bf = mlx5_attach_dedicated_uar(context, 0); + if (!td->bf) { + free(td); + return NULL; + } + + td->ibv_td.context = context; + atomic_init(&td->refcount, 1); + + return &td->ibv_td; +} + +int mlx5_dealloc_td(struct ibv_td *ib_td) +{ + struct mlx5_td *td; + + td = to_mtd(ib_td); + if (atomic_load(&td->refcount) > 1) + return EBUSY; + + mlx5_detach_dedicated_uar(ib_td->context, td->bf); + free(td); + + return 0; +} + +struct ibv_pd * +mlx5_alloc_parent_domain(struct ibv_context *context, + struct ibv_parent_domain_init_attr *attr) +{ + struct mlx5_parent_domain *mparent_domain; + + if (ibv_check_alloc_parent_domain(attr)) + return NULL; + + if (!check_comp_mask(attr->comp_mask, + IBV_PARENT_DOMAIN_INIT_ATTR_ALLOCATORS | + IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT)) { + errno = EINVAL; + return NULL; + } + + mparent_domain = calloc(1, sizeof(*mparent_domain)); + if (!mparent_domain) { + errno = ENOMEM; + return NULL; + } + + if (attr->td) { + mparent_domain->mtd = to_mtd(attr->td); + atomic_fetch_add(&mparent_domain->mtd->refcount, 1); + } + + mparent_domain->mpd.mprotection_domain = to_mpd(attr->pd); + atomic_fetch_add(&mparent_domain->mpd.mprotection_domain->refcount, 1); + atomic_init(&mparent_domain->mpd.refcount, 1); + + ibv_initialize_parent_domain( + &mparent_domain->mpd.ibv_pd, + &mparent_domain->mpd.mprotection_domain->ibv_pd); + + if (attr->comp_mask & IBV_PARENT_DOMAIN_INIT_ATTR_ALLOCATORS) { + mparent_domain->alloc = attr->alloc; + mparent_domain->free = attr->free; + } + + if (attr->comp_mask & IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT) + mparent_domain->pd_context = attr->pd_context; + + return &mparent_domain->mpd.ibv_pd; +} + +static int mlx5_dealloc_parent_domain(struct mlx5_parent_domain *mparent_domain) +{ + if (atomic_load(&mparent_domain->mpd.refcount) > 1) + return EBUSY; + + atomic_fetch_sub(&mparent_domain->mpd.mprotection_domain->refcount, 1); + + if (mparent_domain->mtd) + atomic_fetch_sub(&mparent_domain->mtd->refcount, 1); + + free(mparent_domain); + return 0; +} + +int mlx5_free_pd(struct ibv_pd *pd) +{ + int ret; + struct mlx5_parent_domain *mparent_domain = to_mparent_domain(pd); + struct mlx5_pd *mpd = to_mpd(pd); + + if (mparent_domain) + return mlx5_dealloc_parent_domain(mparent_domain); + + if (atomic_load(&mpd->refcount) > 1) + return EBUSY; + + ret = ibv_cmd_dealloc_pd(pd); + if (ret) + return ret; + + free(mpd); + return 0; +} + +struct ibv_mr *mlx5_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + uint64_t hca_va, int acc) +{ + struct mlx5_mr *mr; + struct ibv_reg_mr cmd; + int ret; + enum 
ibv_access_flags access = (enum ibv_access_flags)acc; + struct ib_uverbs_reg_mr_resp resp; + + mr = calloc(1, sizeof(*mr)); + if (!mr) + return NULL; + + ret = ibv_cmd_reg_mr(pd, addr, length, hca_va, access, &mr->vmr, &cmd, + sizeof(cmd), &resp, sizeof(resp)); + if (ret) { + mlx5_free_buf(&(mr->buf)); + free(mr); + return NULL; + } + mr->alloc_flags = acc; + + return &mr->vmr.ibv_mr; +} + +struct ibv_mr *mlx5_alloc_null_mr(struct ibv_pd *pd) +{ + struct mlx5_mr *mr; + struct mlx5_context *ctx = to_mctx(pd->context); + + if (ctx->dump_fill_mkey == MLX5_INVALID_LKEY) { + errno = ENOTSUP; + return NULL; + } + + mr = calloc(1, sizeof(*mr)); + if (!mr) { + errno = ENOMEM; + return NULL; + } + + mr->vmr.ibv_mr.lkey = ctx->dump_fill_mkey; + + mr->vmr.ibv_mr.context = pd->context; + mr->vmr.ibv_mr.pd = pd; + mr->vmr.ibv_mr.addr = NULL; + mr->vmr.ibv_mr.length = SIZE_MAX; + mr->vmr.mr_type = IBV_MR_TYPE_NULL_MR; + + return &mr->vmr.ibv_mr; +} + +enum { + MLX5_DM_ALLOWED_ACCESS = IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_ATOMIC | + IBV_ACCESS_ZERO_BASED | + IBV_ACCESS_OPTIONAL_RANGE +}; + +struct ibv_mr *mlx5_reg_dm_mr(struct ibv_pd *pd, struct ibv_dm *ibdm, + uint64_t dm_offset, size_t length, + unsigned int acc) +{ + struct mlx5_dm *dm = to_mdm(ibdm); + struct mlx5_mr *mr; + int ret; + + if (acc & ~MLX5_DM_ALLOWED_ACCESS) { + errno = EINVAL; + return NULL; + } + + mr = calloc(1, sizeof(*mr)); + if (!mr) { + errno = ENOMEM; + return NULL; + } + + ret = ibv_cmd_reg_dm_mr(pd, &dm->verbs_dm, dm_offset, length, acc, + &mr->vmr, NULL); + if (ret) { + free(mr); + return NULL; + } + + mr->alloc_flags = acc; + + return &mr->vmr.ibv_mr; +} + +int mlx5_rereg_mr(struct verbs_mr *vmr, int flags, struct ibv_pd *pd, + void *addr, size_t length, int access) +{ + struct ibv_rereg_mr cmd; + struct ib_uverbs_rereg_mr_resp resp; + + if (flags & IBV_REREG_MR_KEEP_VALID) + return ENOTSUP; + + return ibv_cmd_rereg_mr(vmr, flags, addr, length, (uintptr_t)addr, + access, pd, &cmd, sizeof(cmd), &resp, + sizeof(resp)); +} + +int mlx5_dereg_mr(struct verbs_mr *vmr) +{ + int ret; + + if (vmr->mr_type == IBV_MR_TYPE_NULL_MR) + goto free; + + ret = ibv_cmd_dereg_mr(vmr); + if (ret) + return ret; + +free: + free(vmr); + return 0; +} + +int mlx5_advise_mr(struct ibv_pd *pd, + enum ibv_advise_mr_advice advice, + uint32_t flags, + struct ibv_sge *sg_list, + uint32_t num_sge) +{ + return ibv_cmd_advise_mr(pd, advice, flags, sg_list, num_sge); +} + +struct ibv_mw *mlx5_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type) +{ + struct ibv_mw *mw; + struct ibv_alloc_mw cmd; + struct ib_uverbs_alloc_mw_resp resp; + int ret; + + mw = malloc(sizeof(*mw)); + if (!mw) + return NULL; + + memset(mw, 0, sizeof(*mw)); + + ret = ibv_cmd_alloc_mw(pd, type, mw, &cmd, sizeof(cmd), &resp, + sizeof(resp)); + if (ret) { + free(mw); + return NULL; + } + + return mw; +} + +int mlx5_dealloc_mw(struct ibv_mw *mw) +{ + int ret; + + ret = ibv_cmd_dealloc_mw(mw); + if (ret) + return ret; + + free(mw); + return 0; +} + +static int get_cqe_size(struct mlx5dv_cq_init_attr *mlx5cq_attr) +{ + char *env; + int size = 64; + + if (mlx5cq_attr && + (mlx5cq_attr->comp_mask & MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE)) { + size = mlx5cq_attr->cqe_size; + } else { + env = getenv("MLX5_CQE_SIZE"); + if (env) + size = atoi(env); + } + + switch (size) { + case 64: + case 128: + return size; + + default: + return -EINVAL; + } +} + +static int use_scatter_to_cqe(void) +{ + char *env; + + env = getenv("MLX5_SCATTER_TO_CQE"); + 
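+	/*
+	 * Scatter-to-CQE is enabled by default; exporting
+	 * MLX5_SCATTER_TO_CQE=0 disables it.
+	 */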
if (env && !strcmp(env, "0")) + return 0; + + return 1; +} + +static int srq_sig_enabled(void) +{ + char *env; + + env = getenv("MLX5_SRQ_SIGNATURE"); + if (env) + return 1; + + return 0; +} + +static int qp_sig_enabled(void) +{ + char *env; + + env = getenv("MLX5_QP_SIGNATURE"); + if (env) + return 1; + + return 0; +} + +enum { + CREATE_CQ_SUPPORTED_WC_FLAGS = IBV_WC_STANDARD_FLAGS | + IBV_WC_EX_WITH_COMPLETION_TIMESTAMP | + IBV_WC_EX_WITH_CVLAN | + IBV_WC_EX_WITH_FLOW_TAG | + IBV_WC_EX_WITH_TM_INFO | + IBV_WC_EX_WITH_COMPLETION_TIMESTAMP_WALLCLOCK +}; + +enum { + CREATE_CQ_SUPPORTED_COMP_MASK = IBV_CQ_INIT_ATTR_MASK_FLAGS | + IBV_CQ_INIT_ATTR_MASK_PD +}; + +enum { + CREATE_CQ_SUPPORTED_FLAGS = + IBV_CREATE_CQ_ATTR_SINGLE_THREADED | + IBV_CREATE_CQ_ATTR_IGNORE_OVERRUN +}; + +enum { + MLX5_DV_CREATE_CQ_SUP_COMP_MASK = + (MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE | + MLX5DV_CQ_INIT_ATTR_MASK_FLAGS | + MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE), +}; + +static struct ibv_cq_ex *create_cq(struct ibv_context *context, + const struct ibv_cq_init_attr_ex *cq_attr, + int cq_alloc_flags, + struct mlx5dv_cq_init_attr *mlx5cq_attr) +{ + struct mlx5_create_cq cmd = {}; + struct mlx5_create_cq_resp resp = {}; + struct mlx5_create_cq_ex cmd_ex = {}; + struct mlx5_create_cq_ex_resp resp_ex = {}; + struct mlx5_ib_create_cq *cmd_drv; + struct mlx5_ib_create_cq_resp *resp_drv; + struct mlx5_cq *cq; + int cqe_sz; + int ret; + int ncqe; + int rc; + struct mlx5_context *mctx = to_mctx(context); + FILE *fp = to_mctx(context)->dbg_fp; + bool use_ex = false; + + if (!cq_attr->cqe) { + mlx5_dbg(fp, MLX5_DBG_CQ, "CQE invalid\n"); + errno = EINVAL; + return NULL; + } + + if (cq_attr->comp_mask & ~CREATE_CQ_SUPPORTED_COMP_MASK) { + mlx5_dbg(fp, MLX5_DBG_CQ, + "Unsupported comp_mask for create_cq\n"); + errno = EINVAL; + return NULL; + } + + if (cq_attr->comp_mask & IBV_CQ_INIT_ATTR_MASK_FLAGS && + cq_attr->flags & ~CREATE_CQ_SUPPORTED_FLAGS) { + mlx5_dbg(fp, MLX5_DBG_CQ, + "Unsupported creation flags requested for create_cq\n"); + errno = EINVAL; + return NULL; + } + + if (cq_attr->wc_flags & ~CREATE_CQ_SUPPORTED_WC_FLAGS) { + mlx5_dbg(fp, MLX5_DBG_CQ, "\n"); + errno = ENOTSUP; + return NULL; + } + + if (mlx5cq_attr && + !check_comp_mask(mlx5cq_attr->comp_mask, + MLX5_DV_CREATE_CQ_SUP_COMP_MASK)) { + mlx5_dbg(fp, MLX5_DBG_CQ, + "unsupported vendor comp_mask for %s\n", __func__); + errno = EINVAL; + return NULL; + } + + cq = calloc(1, sizeof *cq); + if (!cq) { + mlx5_dbg(fp, MLX5_DBG_CQ, "\n"); + return NULL; + } + + if (cq_attr->comp_mask & IBV_CQ_INIT_ATTR_MASK_FLAGS) { + if (cq_attr->flags & IBV_CREATE_CQ_ATTR_SINGLE_THREADED) + cq->flags |= MLX5_CQ_FLAGS_SINGLE_THREADED; + if (cq_attr->flags & IBV_CREATE_CQ_ATTR_IGNORE_OVERRUN) + use_ex = true; + } + if (cq_attr->comp_mask & IBV_CQ_INIT_ATTR_MASK_PD) { + if (!(to_mparent_domain(cq_attr->parent_domain))) { + errno = EINVAL; + goto err; + } + cq->parent_domain = cq_attr->parent_domain; + } + + cmd_drv = use_ex ? &cmd_ex.drv_payload : &cmd.drv_payload; + resp_drv = use_ex ? 
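+	/*
+	 * IBV_CREATE_CQ_ATTR_IGNORE_OVERRUN is only expressible via the
+	 * extended create_cq command, so pick the command/response
+	 * layouts accordingly.
+	 */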
&resp_ex.drv_payload : &resp.drv_payload; + + if (cq_alloc_flags & MLX5_CQ_FLAGS_EXTENDED) { + rc = mlx5_cq_fill_pfns(cq, cq_attr, mctx); + if (rc) { + errno = rc; + goto err; + } + } + + cq->cons_index = 0; + + if (mlx5_spinlock_init(&cq->lock, !mlx5_single_threaded)) + goto err; + + ncqe = align_queue_size(cq_attr->cqe + 1); + if ((ncqe > (1 << 24)) || (ncqe < (cq_attr->cqe + 1))) { + mlx5_dbg(fp, MLX5_DBG_CQ, "ncqe %d\n", ncqe); + errno = EINVAL; + goto err_spl; + } + + cqe_sz = get_cqe_size(mlx5cq_attr); + if (cqe_sz < 0) { + mlx5_dbg(fp, MLX5_DBG_CQ, "\n"); + errno = -cqe_sz; + goto err_spl; + } + + if (mlx5_alloc_cq_buf(to_mctx(context), cq, &cq->buf_a, ncqe, cqe_sz)) { + mlx5_dbg(fp, MLX5_DBG_CQ, "\n"); + goto err_spl; + } + + cq->dbrec = mlx5_alloc_dbrec(to_mctx(context), cq->parent_domain, + &cq->custom_db); + if (!cq->dbrec) { + mlx5_dbg(fp, MLX5_DBG_CQ, "\n"); + goto err_buf; + } + + cq->dbrec[MLX5_CQ_SET_CI] = 0; + cq->dbrec[MLX5_CQ_ARM_DB] = 0; + cq->arm_sn = 0; + cq->cqe_sz = cqe_sz; + cq->flags = cq_alloc_flags; + + cmd_drv->buf_addr = (uintptr_t) cq->buf_a.buf; + cmd_drv->db_addr = (uintptr_t) cq->dbrec; + cmd_drv->cqe_size = cqe_sz; + + if (mlx5cq_attr) { + if (mlx5cq_attr->comp_mask & MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE) { + if (mctx->cqe_comp_caps.max_num && + (mlx5cq_attr->cqe_comp_res_format & + mctx->cqe_comp_caps.supported_format)) { + cmd_drv->cqe_comp_en = 1; + cmd_drv->cqe_comp_res_format = mlx5cq_attr->cqe_comp_res_format; + } else { + mlx5_dbg(fp, MLX5_DBG_CQ, "CQE Compression is not supported\n"); + errno = EINVAL; + goto err_db; + } + } + + if (mlx5cq_attr->comp_mask & MLX5DV_CQ_INIT_ATTR_MASK_FLAGS) { + if (mlx5cq_attr->flags & ~(MLX5DV_CQ_INIT_ATTR_FLAGS_RESERVED - 1)) { + mlx5_dbg(fp, MLX5_DBG_CQ, + "Unsupported vendor flags for create_cq\n"); + errno = EINVAL; + goto err_db; + } + + if (mlx5cq_attr->flags & MLX5DV_CQ_INIT_ATTR_FLAGS_CQE_PAD) { + if (!(mctx->vendor_cap_flags & + MLX5_VENDOR_CAP_FLAGS_CQE_128B_PAD) || + (cqe_sz != 128)) { + mlx5_dbg(fp, MLX5_DBG_CQ, + "%dB CQE paddind is not supported\n", + cqe_sz); + errno = EINVAL; + goto err_db; + } + + cmd_drv->flags |= MLX5_IB_CREATE_CQ_FLAGS_CQE_128B_PAD; + } + } + } + + if (mctx->cq_uar) { + cmd_drv->flags |= MLX5_IB_CREATE_CQ_FLAGS_UAR_PAGE_INDEX; + cmd_drv->uar_page_index = mctx->cq_uar->page_id; + } + + if (use_ex) { + struct ibv_cq_init_attr_ex cq_attr_ex = *cq_attr; + + cq_attr_ex.cqe = ncqe - 1; + ret = ibv_cmd_create_cq_ex(context, &cq_attr_ex, &cq->ibv_cq, + &cmd_ex.ibv_cmd, sizeof(cmd_ex), + &resp_ex.ibv_resp, sizeof(resp_ex)); + } else { + ret = ibv_cmd_create_cq(context, ncqe - 1, cq_attr->channel, + cq_attr->comp_vector, + ibv_cq_ex_to_cq(&cq->ibv_cq), + &cmd.ibv_cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp)); + } + + + if (ret) { + mlx5_dbg(fp, MLX5_DBG_CQ, "ret %d\n", ret); + goto err_db; + } + + if (cq->parent_domain) + atomic_fetch_add(&to_mparent_domain(cq->parent_domain)->mpd.refcount, 1); + cq->active_buf = &cq->buf_a; + cq->resize_buf = NULL; + cq->cqn = resp_drv->cqn; + cq->stall_enable = to_mctx(context)->stall_enable; + cq->stall_adaptive_enable = to_mctx(context)->stall_adaptive_enable; + cq->stall_cycles = to_mctx(context)->stall_cycles; + + return &cq->ibv_cq; + +err_db: + mlx5_free_db(to_mctx(context), cq->dbrec, cq->parent_domain, cq->custom_db); + +err_buf: + mlx5_free_cq_buf(to_mctx(context), &cq->buf_a); + +err_spl: + mlx5_spinlock_destroy(&cq->lock); + +err: + free(cq); + + return NULL; +} + +struct ibv_cq *mlx5_create_cq(struct ibv_context *context, int cqe, + 
struct ibv_comp_channel *channel, + int comp_vector) +{ + struct ibv_cq_ex *cq; + struct ibv_cq_init_attr_ex cq_attr = {.cqe = cqe, .channel = channel, + .comp_vector = comp_vector, + .wc_flags = IBV_WC_STANDARD_FLAGS}; + + if (cqe <= 0) { + errno = EINVAL; + return NULL; + } + + cq = create_cq(context, &cq_attr, 0, NULL); + return cq ? ibv_cq_ex_to_cq(cq) : NULL; +} + +struct ibv_cq_ex *mlx5_create_cq_ex(struct ibv_context *context, + struct ibv_cq_init_attr_ex *cq_attr) +{ + return create_cq(context, cq_attr, MLX5_CQ_FLAGS_EXTENDED, NULL); +} + +struct ibv_cq_ex *mlx5dv_create_cq(struct ibv_context *context, + struct ibv_cq_init_attr_ex *cq_attr, + struct mlx5dv_cq_init_attr *mlx5_cq_attr) +{ + struct ibv_cq_ex *cq; + + if (!is_mlx5_dev(context->device)) { + errno = EOPNOTSUPP; + return NULL; + } + + cq = create_cq(context, cq_attr, MLX5_CQ_FLAGS_EXTENDED, mlx5_cq_attr); + if (!cq) + return NULL; + + verbs_init_cq(ibv_cq_ex_to_cq(cq), context, + cq_attr->channel, cq_attr->cq_context); + return cq; +} + +int mlx5_resize_cq(struct ibv_cq *ibcq, int cqe) +{ + struct mlx5_cq *cq = to_mcq(ibcq); + struct mlx5_resize_cq_resp resp; + struct mlx5_resize_cq cmd; + struct mlx5_context *mctx = to_mctx(ibcq->context); + int err; + + if (cqe < 0) { + errno = EINVAL; + return errno; + } + + memset(&cmd, 0, sizeof(cmd)); + memset(&resp, 0, sizeof(resp)); + + if (((long long)cqe * 64) > INT_MAX) + return EINVAL; + + mlx5_spin_lock(&cq->lock); + cq->active_cqes = cq->ibv_cq.cqe; + if (cq->active_buf == &cq->buf_a) + cq->resize_buf = &cq->buf_b; + else + cq->resize_buf = &cq->buf_a; + + cqe = align_queue_size(cqe + 1); + if (cqe == ibcq->cqe + 1) { + cq->resize_buf = NULL; + err = 0; + goto out; + } + + /* currently we don't change cqe size */ + cq->resize_cqe_sz = cq->cqe_sz; + cq->resize_cqes = cqe; + err = mlx5_alloc_cq_buf(mctx, cq, cq->resize_buf, cq->resize_cqes, cq->resize_cqe_sz); + if (err) { + cq->resize_buf = NULL; + errno = ENOMEM; + goto out; + } + + cmd.buf_addr = (uintptr_t)cq->resize_buf->buf; + cmd.cqe_size = cq->resize_cqe_sz; + + err = ibv_cmd_resize_cq(ibcq, cqe - 1, &cmd.ibv_cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp)); + if (err) + goto out_buf; + + mlx5_cq_resize_copy_cqes(cq); + mlx5_free_cq_buf(mctx, cq->active_buf); + cq->active_buf = cq->resize_buf; + cq->ibv_cq.cqe = cqe - 1; + mlx5_spin_unlock(&cq->lock); + cq->resize_buf = NULL; + return 0; + +out_buf: + mlx5_free_cq_buf(mctx, cq->resize_buf); + cq->resize_buf = NULL; + +out: + mlx5_spin_unlock(&cq->lock); + return err; +} + +int mlx5_destroy_cq(struct ibv_cq *cq) +{ + int ret; + struct mlx5_cq *mcq = to_mcq(cq); + + ret = ibv_cmd_destroy_cq(cq); + if (ret) + return ret; + + mlx5_free_db(to_mctx(cq->context), mcq->dbrec, mcq->parent_domain, + mcq->custom_db); + mlx5_free_cq_buf(to_mctx(cq->context), mcq->active_buf); + if (mcq->parent_domain) + atomic_fetch_sub(&to_mparent_domain(mcq->parent_domain)->mpd.refcount, 1); + free(mcq); + + return 0; +} + +struct ibv_srq *mlx5_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *attr) +{ + struct mlx5_create_srq cmd; + struct mlx5_create_srq_resp resp; + struct mlx5_srq *srq; + int ret; + struct mlx5_context *ctx; + int max_sge; + struct ibv_srq *ibsrq; + + ctx = to_mctx(pd->context); + srq = calloc(1, sizeof *srq); + if (!srq) { + fprintf(stderr, "%s-%d:\n", __func__, __LINE__); + return NULL; + } + ibsrq = &srq->vsrq.srq; + + memset(&cmd, 0, sizeof cmd); + if (mlx5_spinlock_init_pd(&srq->lock, pd)) { + fprintf(stderr, "%s-%d:\n", __func__, __LINE__); + goto err; + } + + 
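+	/*
+	 * Validate the requested WR and SGE counts against the device limits
+	 * cached in the mlx5 context; the receive WQE itself is sized later,
+	 * in mlx5_alloc_srq_buf().
+	 */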
if (attr->attr.max_wr > ctx->max_srq_recv_wr) {
+		fprintf(stderr, "%s-%d:max_wr %d, max_srq_recv_wr %d\n", __func__, __LINE__,
+			attr->attr.max_wr, ctx->max_srq_recv_wr);
+		errno = EINVAL;
+		goto err;
+	}
+
+	/*
+	 * This calculation does not consider the required control segments;
+	 * the final calculation is redone later. It is done this way to
+	 * avoid intermediate variable overflow.
+	 */
+	max_sge = ctx->max_rq_desc_sz / sizeof(struct mlx5_wqe_data_seg);
+	if (attr->attr.max_sge > max_sge) {
+		fprintf(stderr, "%s-%d:max_sge %d, max supported %d\n", __func__, __LINE__,
+			attr->attr.max_sge, max_sge);
+		errno = EINVAL;
+		goto err;
+	}
+
+	srq->max_gs = attr->attr.max_sge;
+	srq->counter = 0;
+
+	if (mlx5_alloc_srq_buf(pd->context, srq, attr->attr.max_wr, pd)) {
+		fprintf(stderr, "%s-%d:\n", __func__, __LINE__);
+		goto err;
+	}
+
+	srq->db = mlx5_alloc_dbrec(to_mctx(pd->context), pd, &srq->custom_db);
+	if (!srq->db) {
+		fprintf(stderr, "%s-%d:\n", __func__, __LINE__);
+		goto err_free;
+	}
+
+	if (!srq->custom_db)
+		*srq->db = 0;
+
+	cmd.buf_addr = (uintptr_t) srq->buf.buf;
+	cmd.db_addr = (uintptr_t) srq->db;
+	srq->wq_sig = srq_sig_enabled();
+	if (srq->wq_sig)
+		cmd.flags = MLX5_SRQ_FLAG_SIGNATURE;
+
+	attr->attr.max_sge = srq->max_gs;
+	pthread_mutex_lock(&ctx->srq_table_mutex);
+
+	/* Override max_wr to let kernel know about extra WQEs for the
+	 * wait queue.
+	 */
+	attr->attr.max_wr = srq->max - 1;
+
+	ret = ibv_cmd_create_srq(pd, ibsrq, attr, &cmd.ibv_cmd, sizeof(cmd),
+				 &resp.ibv_resp, sizeof(resp));
+	if (ret)
+		goto err_db;
+
+	/* Override kernel response that includes the wait queue with the real
+	 * number of WQEs that are applicable for the application.
+	 */
+	attr->attr.max_wr = srq->tail;
+
+	ret = mlx5_store_srq(ctx, resp.srqn, srq);
+	if (ret)
+		goto err_destroy;
+
+	pthread_mutex_unlock(&ctx->srq_table_mutex);
+
+	srq->srqn = resp.srqn;
+	srq->rsc.rsn = resp.srqn;
+	srq->rsc.type = MLX5_RSC_TYPE_SRQ;
+
+	return ibsrq;
+
+err_destroy:
+	ibv_cmd_destroy_srq(ibsrq);
+
+err_db:
+	pthread_mutex_unlock(&ctx->srq_table_mutex);
+	mlx5_free_db(to_mctx(pd->context), srq->db, pd, srq->custom_db);
+
+err_free:
+	free(srq->wrid);
+	mlx5_free_actual_buf(ctx, &srq->buf);
+
+err:
+	free(srq);
+
+	return NULL;
+}
+
+int mlx5_modify_srq(struct ibv_srq *srq,
+		    struct ibv_srq_attr *attr,
+		    int attr_mask)
+{
+	struct ibv_modify_srq cmd;
+
+	return ibv_cmd_modify_srq(srq, attr, attr_mask, &cmd, sizeof cmd);
+}
+
+int mlx5_query_srq(struct ibv_srq *srq,
+		   struct ibv_srq_attr *attr)
+{
+	struct ibv_query_srq cmd;
+
+	return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd);
+}
+
+int mlx5_destroy_srq(struct ibv_srq *srq)
+{
+	int ret;
+	struct mlx5_srq *msrq = to_msrq(srq);
+	struct mlx5_context *ctx = to_mctx(srq->context);
+
+	if (msrq->cmd_qp) {
+		ret = mlx5_destroy_qp(msrq->cmd_qp);
+		if (ret)
+			return ret;
+		msrq->cmd_qp = NULL;
+	}
+
+	ret = ibv_cmd_destroy_srq(srq);
+	if (ret)
+		return ret;
+
+	if (ctx->cqe_version && msrq->rsc.type == MLX5_RSC_TYPE_XSRQ)
+		mlx5_clear_uidx(ctx, msrq->rsc.rsn);
+	else
+		mlx5_clear_srq(ctx, msrq->srqn);
+
+	mlx5_free_db(ctx, msrq->db, srq->pd, msrq->custom_db);
+	mlx5_free_actual_buf(ctx, &msrq->buf);
+	free(msrq->tm_list);
+	free(msrq->wrid);
+	free(msrq->op);
+	free(msrq);
+
+	return 0;
+}
+
+static int _sq_overhead(struct mlx5_qp *qp,
+			enum ibv_qp_type qp_type,
+			uint64_t ops,
+			uint64_t mlx5_ops)
+{
+	size_t size = sizeof(struct mlx5_wqe_ctrl_seg);
+	size_t rdma_size = 0;
+	size_t atomic_size = 0;
+	size_t mw_size = 0;
+
+	/* Operation overhead */
+	if
(ops & (IBV_QP_EX_WITH_RDMA_WRITE | + IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM | + IBV_QP_EX_WITH_RDMA_READ)) + rdma_size = sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_raddr_seg); + + if (ops & (IBV_QP_EX_WITH_ATOMIC_CMP_AND_SWP | + IBV_QP_EX_WITH_ATOMIC_FETCH_AND_ADD)) + atomic_size = sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_raddr_seg) + + sizeof(struct mlx5_wqe_atomic_seg); + + if (ops & (IBV_QP_EX_WITH_BIND_MW | IBV_QP_EX_WITH_LOCAL_INV) || + (mlx5_ops & (MLX5DV_QP_EX_WITH_MR_INTERLEAVED | + MLX5DV_QP_EX_WITH_MR_LIST))) + mw_size = sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_wqe_mkey_context_seg) + + max_t(size_t, sizeof(struct mlx5_wqe_umr_klm_seg), 64); + + size = max_t(size_t, size, rdma_size); + size = max_t(size_t, size, atomic_size); + size = max_t(size_t, size, mw_size); + + /* Transport overhead */ + switch (qp_type) { + case IBV_QPT_DRIVER: + if (qp->dc_type != MLX5DV_DCTYPE_DCI) + return -EINVAL; + SWITCH_FALLTHROUGH; + + case IBV_QPT_UD: + size += sizeof(struct mlx5_wqe_datagram_seg); + if (qp->flags & MLX5_QP_FLAGS_USE_UNDERLAY) + size += sizeof(struct mlx5_wqe_eth_seg) + + sizeof(struct mlx5_wqe_eth_pad); + break; + + case IBV_QPT_XRC_RECV: + case IBV_QPT_XRC_SEND: + size += sizeof(struct mlx5_wqe_xrc_seg); + break; + + case IBV_QPT_RAW_PACKET: + size += sizeof(struct mlx5_wqe_eth_seg); + break; + + case IBV_QPT_RC: + case IBV_QPT_UC: + break; + + default: + return -EINVAL; + } + + return size; +} + +static int sq_overhead(struct mlx5_qp *qp, struct ibv_qp_init_attr_ex *attr, + struct mlx5dv_qp_init_attr *mlx5_qp_attr) +{ + uint64_t ops; + uint64_t mlx5_ops = 0; + + if (attr->comp_mask & IBV_QP_INIT_ATTR_SEND_OPS_FLAGS) { + ops = attr->send_ops_flags; + } else { + switch (attr->qp_type) { + case IBV_QPT_RC: + case IBV_QPT_UC: + case IBV_QPT_DRIVER: + case IBV_QPT_XRC_RECV: + case IBV_QPT_XRC_SEND: + ops = IBV_QP_EX_WITH_SEND | + IBV_QP_EX_WITH_SEND_WITH_INV | + IBV_QP_EX_WITH_SEND_WITH_IMM | + IBV_QP_EX_WITH_RDMA_WRITE | + IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM | + IBV_QP_EX_WITH_RDMA_READ | + IBV_QP_EX_WITH_ATOMIC_CMP_AND_SWP | + IBV_QP_EX_WITH_ATOMIC_FETCH_AND_ADD | + IBV_QP_EX_WITH_LOCAL_INV | + IBV_QP_EX_WITH_BIND_MW; + break; + + case IBV_QPT_UD: + ops = IBV_QP_EX_WITH_SEND | + IBV_QP_EX_WITH_SEND_WITH_IMM | + IBV_QP_EX_WITH_TSO; + break; + + case IBV_QPT_RAW_PACKET: + ops = IBV_QP_EX_WITH_SEND | + IBV_QP_EX_WITH_TSO; + break; + + default: + return -EINVAL; + } + } + + + if (mlx5_qp_attr && + mlx5_qp_attr->comp_mask & MLX5DV_QP_INIT_ATTR_MASK_SEND_OPS_FLAGS) + mlx5_ops = mlx5_qp_attr->send_ops_flags; + + return _sq_overhead(qp, attr->qp_type, ops, mlx5_ops); +} + +static int mlx5_calc_send_wqe(struct mlx5_context *ctx, + struct ibv_qp_init_attr_ex *attr, + struct mlx5dv_qp_init_attr *mlx5_qp_attr, + struct mlx5_qp *qp) +{ + int size; + int inl_size = 0; + int max_gather; + int tot_size; + + size = sq_overhead(qp, attr, mlx5_qp_attr); + if (size < 0) + return size; + + if (attr->cap.max_inline_data) { + inl_size = size + align(sizeof(struct mlx5_wqe_inl_data_seg) + + attr->cap.max_inline_data, 16); + } + + if (attr->comp_mask & IBV_QP_INIT_ATTR_MAX_TSO_HEADER) { + size += align(attr->max_tso_header, 16); + qp->max_tso_header = attr->max_tso_header; + } + + max_gather = (ctx->max_sq_desc_sz - size) / + sizeof(struct mlx5_wqe_data_seg); + if (attr->cap.max_send_sge > max_gather) + return -EINVAL; + + size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg); + tot_size = max_int(size, 
inl_size); + + if (tot_size > ctx->max_sq_desc_sz) + return -EINVAL; + + return align(tot_size, MLX5_SEND_WQE_BB); +} + +static int mlx5_calc_rcv_wqe(struct mlx5_context *ctx, + struct ibv_qp_init_attr_ex *attr, + struct mlx5_qp *qp) +{ + uint32_t size; + int num_scatter; + + if (attr->srq) + return 0; + + num_scatter = max_t(uint32_t, attr->cap.max_recv_sge, 1); + size = sizeof(struct mlx5_wqe_data_seg) * num_scatter; + if (qp->wq_sig) + size += sizeof(struct mlx5_rwqe_sig); + + if (size > ctx->max_rq_desc_sz) + return -EINVAL; + + size = roundup_pow_of_two(size); + + return size; +} + +static int mlx5_calc_sq_size(struct mlx5_context *ctx, + struct ibv_qp_init_attr_ex *attr, + struct mlx5dv_qp_init_attr *mlx5_qp_attr, + struct mlx5_qp *qp) +{ + int wqe_size; + int wq_size; + FILE *fp = ctx->dbg_fp; + + if (!attr->cap.max_send_wr) + return 0; + + wqe_size = mlx5_calc_send_wqe(ctx, attr, mlx5_qp_attr, qp); + if (wqe_size < 0) { + mlx5_dbg(fp, MLX5_DBG_QP, "\n"); + return wqe_size; + } + + if (wqe_size > ctx->max_sq_desc_sz) { + mlx5_dbg(fp, MLX5_DBG_QP, "\n"); + return -EINVAL; + } + + qp->max_inline_data = wqe_size - sq_overhead(qp, attr, mlx5_qp_attr) - + sizeof(struct mlx5_wqe_inl_data_seg); + attr->cap.max_inline_data = qp->max_inline_data; + + /* + * to avoid overflow, we limit max_send_wr so + * that the multiplication will fit in int + */ + if (attr->cap.max_send_wr > 0x7fffffff / ctx->max_sq_desc_sz) { + mlx5_dbg(fp, MLX5_DBG_QP, "\n"); + return -EINVAL; + } + + wq_size = roundup_pow_of_two(attr->cap.max_send_wr * wqe_size); + qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB; + if (qp->sq.wqe_cnt > ctx->max_send_wqebb) { + mlx5_dbg(fp, MLX5_DBG_QP, "\n"); + return -EINVAL; + } + + qp->sq.wqe_shift = STATIC_ILOG_32(MLX5_SEND_WQE_BB) - 1; + qp->sq.max_gs = attr->cap.max_send_sge; + qp->sq.max_post = wq_size / wqe_size; + + return wq_size; +} + +enum { + DV_CREATE_WQ_SUPPORTED_COMP_MASK = MLX5DV_WQ_INIT_ATTR_MASK_STRIDING_RQ +}; + +static int mlx5_calc_rwq_size(struct mlx5_context *ctx, + struct mlx5_rwq *rwq, + struct ibv_wq_init_attr *attr, + struct mlx5dv_wq_init_attr *mlx5wq_attr) +{ + size_t wqe_size; + int wq_size; + uint32_t num_scatter; + int is_mprq = 0; + int scat_spc; + + if (!attr->max_wr) + return -EINVAL; + if (mlx5wq_attr) { + if (!check_comp_mask(mlx5wq_attr->comp_mask, + DV_CREATE_WQ_SUPPORTED_COMP_MASK)) + return -EINVAL; + + is_mprq = !!(mlx5wq_attr->comp_mask & + MLX5DV_WQ_INIT_ATTR_MASK_STRIDING_RQ); + } + + /* TBD: check caps for RQ */ + num_scatter = max_t(uint32_t, attr->max_sge, 1); + wqe_size = sizeof(struct mlx5_wqe_data_seg) * num_scatter + + sizeof(struct mlx5_wqe_srq_next_seg) * is_mprq; + + if (rwq->wq_sig) + wqe_size += sizeof(struct mlx5_rwqe_sig); + + if (wqe_size <= 0 || wqe_size > ctx->max_rq_desc_sz) + return -EINVAL; + + wqe_size = roundup_pow_of_two(wqe_size); + wq_size = roundup_pow_of_two(attr->max_wr) * wqe_size; + wq_size = max(wq_size, MLX5_SEND_WQE_BB); + rwq->rq.wqe_cnt = wq_size / wqe_size; + rwq->rq.wqe_shift = ilog32(wqe_size - 1); + rwq->rq.max_post = 1 << ilog32(wq_size / wqe_size - 1); + scat_spc = wqe_size - + ((rwq->wq_sig) ? 
sizeof(struct mlx5_rwqe_sig) : 0) - + is_mprq * sizeof(struct mlx5_wqe_srq_next_seg); + rwq->rq.max_gs = scat_spc / sizeof(struct mlx5_wqe_data_seg); + return wq_size; +} + +static int mlx5_calc_rq_size(struct mlx5_context *ctx, + struct ibv_qp_init_attr_ex *attr, + struct mlx5_qp *qp) +{ + int wqe_size; + int wq_size; + int scat_spc; + FILE *fp = ctx->dbg_fp; + + if (!attr->cap.max_recv_wr) + return 0; + + if (attr->cap.max_recv_wr > ctx->max_recv_wr) { + mlx5_dbg(fp, MLX5_DBG_QP, "\n"); + return -EINVAL; + } + + wqe_size = mlx5_calc_rcv_wqe(ctx, attr, qp); + if (wqe_size < 0 || wqe_size > ctx->max_rq_desc_sz) { + mlx5_dbg(fp, MLX5_DBG_QP, "\n"); + return -EINVAL; + } + + wq_size = roundup_pow_of_two(attr->cap.max_recv_wr) * wqe_size; + if (wqe_size) { + wq_size = max(wq_size, MLX5_SEND_WQE_BB); + qp->rq.wqe_cnt = wq_size / wqe_size; + qp->rq.wqe_shift = ilog32(wqe_size - 1); + qp->rq.max_post = 1 << ilog32(wq_size / wqe_size - 1); + scat_spc = wqe_size - + (qp->wq_sig ? sizeof(struct mlx5_rwqe_sig) : 0); + qp->rq.max_gs = scat_spc / sizeof(struct mlx5_wqe_data_seg); + } else { + qp->rq.wqe_cnt = 0; + qp->rq.wqe_shift = 0; + qp->rq.max_post = 0; + qp->rq.max_gs = 0; + } + return wq_size; +} + +static int mlx5_calc_wq_size(struct mlx5_context *ctx, + struct ibv_qp_init_attr_ex *attr, + struct mlx5dv_qp_init_attr *mlx5_qp_attr, + struct mlx5_qp *qp) +{ + int ret; + int result; + + ret = mlx5_calc_sq_size(ctx, attr, mlx5_qp_attr, qp); + if (ret < 0) + return ret; + + result = ret; + ret = mlx5_calc_rq_size(ctx, attr, qp); + if (ret < 0) + return ret; + + result += ret; + + qp->sq.offset = ret; + qp->rq.offset = 0; + + return result; +} + +static void map_uuar(struct ibv_context *context, struct mlx5_qp *qp, + int uuar_index, struct mlx5_bf *dyn_bf) +{ + struct mlx5_context *ctx = to_mctx(context); + + if (!dyn_bf) + qp->bf = &ctx->bfs[uuar_index]; + else + qp->bf = dyn_bf; +} + +static const char *qptype2key(enum ibv_qp_type type) +{ + switch (type) { + case IBV_QPT_RC: return "HUGE_RC"; + case IBV_QPT_UC: return "HUGE_UC"; + case IBV_QPT_UD: return "HUGE_UD"; + case IBV_QPT_RAW_PACKET: return "HUGE_RAW_ETH"; + default: return "HUGE_NA"; + } +} + +static int mlx5_alloc_qp_buf(struct ibv_context *context, + struct ibv_qp_init_attr_ex *attr, + struct mlx5_qp *qp, + int size) +{ + int err; + enum mlx5_alloc_type alloc_type; + enum mlx5_alloc_type default_alloc_type = MLX5_ALLOC_TYPE_ANON; + const char *qp_huge_key; + + if (qp->sq.wqe_cnt) { + qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wrid)); + if (!qp->sq.wrid) { + errno = ENOMEM; + err = -1; + return err; + } + + qp->sq.wr_data = malloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wr_data)); + if (!qp->sq.wr_data) { + errno = ENOMEM; + err = -1; + goto ex_wrid; + } + + qp->sq.wqe_head = malloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wqe_head)); + if (!qp->sq.wqe_head) { + errno = ENOMEM; + err = -1; + goto ex_wrid; + } + } + + if (qp->rq.wqe_cnt) { + qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof(uint64_t)); + if (!qp->rq.wrid) { + errno = ENOMEM; + err = -1; + goto ex_wrid; + } + } + + /* compatibility support */ + qp_huge_key = qptype2key(qp->ibv_qp->qp_type); + if (mlx5_use_huge(qp_huge_key)) + default_alloc_type = MLX5_ALLOC_TYPE_HUGE; + + mlx5_get_alloc_type(to_mctx(context), attr->pd, MLX5_QP_PREFIX, + &alloc_type, default_alloc_type); + + if (alloc_type == MLX5_ALLOC_TYPE_CUSTOM) { + qp->buf.mparent_domain = to_mparent_domain(attr->pd); + qp->buf.req_alignment = to_mdev(context->device)->page_size; + qp->buf.resource_type = MLX5DV_RES_TYPE_QP; + } 
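+	/*
+	 * mlx5_alloc_prefered_buf() honours alloc_type: the parent domain's
+	 * custom allocator when one is registered, otherwise huge pages or
+	 * anonymous memory, as resolved above from the environment and the
+	 * per-QP-type HUGE_* keys.
+	 */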
+ + err = mlx5_alloc_prefered_buf(to_mctx(context), &qp->buf, + align(qp->buf_size, to_mdev + (context->device)->page_size), + to_mdev(context->device)->page_size, + alloc_type, + MLX5_QP_PREFIX); + + if (err) { + err = -ENOMEM; + goto ex_wrid; + } + + if (qp->buf.type != MLX5_ALLOC_TYPE_CUSTOM) + memset(qp->buf.buf, 0, qp->buf_size); + + if (attr->qp_type == IBV_QPT_RAW_PACKET || + qp->flags & MLX5_QP_FLAGS_USE_UNDERLAY) { + size_t aligned_sq_buf_size = align(qp->sq_buf_size, + to_mdev(context->device)->page_size); + + if (alloc_type == MLX5_ALLOC_TYPE_CUSTOM) { + qp->sq_buf.mparent_domain = to_mparent_domain(attr->pd); + qp->sq_buf.req_alignment = to_mdev(context->device)->page_size; + qp->sq_buf.resource_type = MLX5DV_RES_TYPE_QP; + } + + /* For Raw Packet QP, allocate a separate buffer for the SQ */ + err = mlx5_alloc_prefered_buf(to_mctx(context), &qp->sq_buf, + aligned_sq_buf_size, + to_mdev(context->device)->page_size, + alloc_type, + MLX5_QP_PREFIX); + if (err) { + err = -ENOMEM; + goto rq_buf; + } + + if (qp->sq_buf.type != MLX5_ALLOC_TYPE_CUSTOM) + memset(qp->sq_buf.buf, 0, aligned_sq_buf_size); + } + + return 0; +rq_buf: + mlx5_free_actual_buf(to_mctx(context), &qp->buf); +ex_wrid: + if (qp->rq.wrid) + free(qp->rq.wrid); + + if (qp->sq.wqe_head) + free(qp->sq.wqe_head); + + if (qp->sq.wr_data) + free(qp->sq.wr_data); + if (qp->sq.wrid) + free(qp->sq.wrid); + + return err; +} + +static void mlx5_free_qp_buf(struct mlx5_context *ctx, struct mlx5_qp *qp) +{ + mlx5_free_actual_buf(ctx, &qp->buf); + + if (qp->sq_buf.buf) + mlx5_free_actual_buf(ctx, &qp->sq_buf); + + if (qp->rq.wrid) + free(qp->rq.wrid); + + if (qp->sq.wqe_head) + free(qp->sq.wqe_head); + + if (qp->sq.wrid) + free(qp->sq.wrid); + + if (qp->sq.wr_data) + free(qp->sq.wr_data); +} + +static int mlx5_cmd_create_rss_qp(struct ibv_context *context, + struct ibv_qp_init_attr_ex *attr, + struct mlx5_qp *qp, + uint32_t mlx5_create_flags) +{ + struct mlx5_create_qp_ex_rss cmd_ex_rss = {}; + struct mlx5_create_qp_ex_resp resp = {}; + struct mlx5_ib_create_qp_resp *resp_drv; + int ret; + + if (attr->rx_hash_conf.rx_hash_key_len > sizeof(cmd_ex_rss.rx_hash_key)) { + errno = EINVAL; + return errno; + } + + cmd_ex_rss.rx_hash_fields_mask = attr->rx_hash_conf.rx_hash_fields_mask; + cmd_ex_rss.rx_hash_function = attr->rx_hash_conf.rx_hash_function; + cmd_ex_rss.rx_key_len = attr->rx_hash_conf.rx_hash_key_len; + cmd_ex_rss.flags = mlx5_create_flags; + memcpy(cmd_ex_rss.rx_hash_key, attr->rx_hash_conf.rx_hash_key, + attr->rx_hash_conf.rx_hash_key_len); + + ret = ibv_cmd_create_qp_ex2(context, &qp->verbs_qp, + sizeof(qp->verbs_qp), attr, + &cmd_ex_rss.ibv_cmd, sizeof(cmd_ex_rss), + &resp.ibv_resp, sizeof(resp)); + if (ret) + return ret; + + resp_drv = &resp.drv_payload; + + if (resp_drv->comp_mask & MLX5_IB_CREATE_QP_RESP_MASK_TIRN) + qp->tirn = resp_drv->tirn; + + if (resp_drv->comp_mask & MLX5_IB_CREATE_QP_RESP_MASK_TIR_ICM_ADDR) + qp->tir_icm_addr = resp_drv->tir_icm_addr; + + qp->rss_qp = 1; + return 0; +} + +static int mlx5_cmd_create_qp_ex(struct ibv_context *context, + struct ibv_qp_init_attr_ex *attr, + struct mlx5_create_qp *cmd, + struct mlx5_qp *qp, + struct mlx5_create_qp_ex_resp *resp) +{ + struct mlx5_create_qp_ex cmd_ex; + int ret; + + memset(&cmd_ex, 0, sizeof(cmd_ex)); + *ibv_create_qp_ex_to_reg(&cmd_ex.ibv_cmd) = cmd->ibv_cmd.core_payload; + + cmd_ex.drv_payload = cmd->drv_payload; + + ret = ibv_cmd_create_qp_ex2(context, &qp->verbs_qp, + sizeof(qp->verbs_qp), attr, &cmd_ex.ibv_cmd, + sizeof(cmd_ex), &resp->ibv_resp, + 
sizeof(*resp)); + + return ret; +} + +enum { + MLX5_CREATE_QP_SUP_COMP_MASK = (IBV_QP_INIT_ATTR_PD | + IBV_QP_INIT_ATTR_XRCD | + IBV_QP_INIT_ATTR_CREATE_FLAGS | + IBV_QP_INIT_ATTR_MAX_TSO_HEADER | + IBV_QP_INIT_ATTR_IND_TABLE | + IBV_QP_INIT_ATTR_RX_HASH | + IBV_QP_INIT_ATTR_SEND_OPS_FLAGS), +}; + +enum { + MLX5_DV_CREATE_QP_SUP_COMP_MASK = MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS | + MLX5DV_QP_INIT_ATTR_MASK_DC | + MLX5DV_QP_INIT_ATTR_MASK_SEND_OPS_FLAGS +}; + +enum { + MLX5_CREATE_QP_EX2_COMP_MASK = (IBV_QP_INIT_ATTR_CREATE_FLAGS | + IBV_QP_INIT_ATTR_MAX_TSO_HEADER | + IBV_QP_INIT_ATTR_IND_TABLE | + IBV_QP_INIT_ATTR_RX_HASH), +}; + +enum { + MLX5DV_QP_CREATE_SUP_FLAGS = + (MLX5DV_QP_CREATE_TUNNEL_OFFLOADS | + MLX5DV_QP_CREATE_TIR_ALLOW_SELF_LOOPBACK_UC | + MLX5DV_QP_CREATE_TIR_ALLOW_SELF_LOOPBACK_MC | + MLX5DV_QP_CREATE_DISABLE_SCATTER_TO_CQE | + MLX5DV_QP_CREATE_ALLOW_SCATTER_TO_CQE | + MLX5DV_QP_CREATE_PACKET_BASED_CREDIT_MODE), +}; + +static int create_dct(struct ibv_context *context, + struct ibv_qp_init_attr_ex *attr, + struct mlx5dv_qp_init_attr *mlx5_qp_attr, + struct mlx5_qp *qp, uint32_t mlx5_create_flags) +{ + struct mlx5_create_qp cmd = {}; + struct mlx5_create_qp_resp resp = {}; + int ret; + struct mlx5_context *ctx = to_mctx(context); + int32_t usr_idx = 0xffffff; + FILE *fp = ctx->dbg_fp; + + if (!check_comp_mask(attr->comp_mask, IBV_QP_INIT_ATTR_PD)) { + mlx5_dbg(fp, MLX5_DBG_QP, + "Unsupported comp_mask for %s\n", __func__); + errno = EINVAL; + return errno; + } + + if (!check_comp_mask(mlx5_qp_attr->comp_mask, + MLX5DV_QP_INIT_ATTR_MASK_DC | + MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS)) { + mlx5_dbg(fp, MLX5_DBG_QP, + "Unsupported vendor comp_mask for %s\n", __func__); + errno = EINVAL; + return errno; + } + + if (!check_comp_mask(mlx5_create_flags, MLX5_QP_FLAG_SCATTER_CQE)) { + mlx5_dbg(fp, MLX5_DBG_QP, + "Unsupported creation flags requested for DCT QP\n"); + errno = EINVAL; + return errno; + } + + if (!(ctx->vendor_cap_flags & MLX5_VENDOR_CAP_FLAGS_SCAT2CQE_DCT)) + mlx5_create_flags &= ~MLX5_QP_FLAG_SCATTER_CQE; + + cmd.flags = MLX5_QP_FLAG_TYPE_DCT | mlx5_create_flags; + cmd.access_key = mlx5_qp_attr->dc_init_attr.dct_access_key; + + if (ctx->cqe_version) { + usr_idx = mlx5_store_uidx(ctx, qp); + if (usr_idx < 0) { + mlx5_dbg(fp, MLX5_DBG_QP, "Couldn't find free user index\n"); + errno = ENOMEM; + return errno; + } + } + cmd.uidx = usr_idx; + + ret = ibv_cmd_create_qp_ex(context, &qp->verbs_qp, sizeof(qp->verbs_qp), + attr, &cmd.ibv_cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp)); + if (ret) { + mlx5_dbg(fp, MLX5_DBG_QP, "Couldn't create dct, ret %d\n", ret); + if (ctx->cqe_version) + mlx5_clear_uidx(ctx, cmd.uidx); + return ret; + } + + qp->dc_type = MLX5DV_DCTYPE_DCT; + qp->rsc.type = MLX5_RSC_TYPE_QP; + if (ctx->cqe_version) + qp->rsc.rsn = usr_idx; + return 0; +} + +static struct ibv_qp *create_qp(struct ibv_context *context, + struct ibv_qp_init_attr_ex *attr, + struct mlx5dv_qp_init_attr *mlx5_qp_attr) +{ + struct mlx5_create_qp cmd; + struct mlx5_create_qp_resp resp; + struct mlx5_create_qp_ex_resp resp_ex; + struct mlx5_qp *qp; + int ret; + struct mlx5_context *ctx = to_mctx(context); + struct ibv_qp *ibqp; + int32_t usr_idx = 0; + uint32_t mlx5_create_flags = 0; + struct mlx5_bf *bf = NULL; + FILE *fp = ctx->dbg_fp; + struct mlx5_parent_domain *mparent_domain; + struct mlx5_ib_create_qp_resp *resp_drv; + + if (attr->comp_mask & ~MLX5_CREATE_QP_SUP_COMP_MASK) + return NULL; + + if ((attr->comp_mask & IBV_QP_INIT_ATTR_MAX_TSO_HEADER) && + (attr->qp_type != 
IBV_QPT_RAW_PACKET)) + return NULL; + + if (attr->comp_mask & IBV_QP_INIT_ATTR_SEND_OPS_FLAGS && + (attr->comp_mask & IBV_QP_INIT_ATTR_RX_HASH || + (attr->qp_type == IBV_QPT_DRIVER && + mlx5_qp_attr && + mlx5_qp_attr->comp_mask & MLX5DV_QP_INIT_ATTR_MASK_DC && + mlx5_qp_attr->dc_init_attr.dc_type == MLX5DV_DCTYPE_DCT))) { + errno = EINVAL; + return NULL; + } + + qp = calloc(1, sizeof(*qp)); + if (!qp) { + mlx5_dbg(fp, MLX5_DBG_QP, "\n"); + return NULL; + } + + ibqp = &qp->verbs_qp.qp; + qp->ibv_qp = ibqp; + + if ((attr->comp_mask & IBV_QP_INIT_ATTR_CREATE_FLAGS) && + (attr->create_flags & IBV_QP_CREATE_SOURCE_QPN)) { + + if (attr->qp_type != IBV_QPT_UD) { + errno = EINVAL; + goto err; + } + + qp->flags |= MLX5_QP_FLAGS_USE_UNDERLAY; + } + + memset(&cmd, 0, sizeof(cmd)); + memset(&resp, 0, sizeof(resp)); + memset(&resp_ex, 0, sizeof(resp_ex)); + + if (use_scatter_to_cqe()) + mlx5_create_flags |= MLX5_QP_FLAG_SCATTER_CQE; + + if (mlx5_qp_attr) { + if (!check_comp_mask(mlx5_qp_attr->comp_mask, + MLX5_DV_CREATE_QP_SUP_COMP_MASK)) { + mlx5_dbg(fp, MLX5_DBG_QP, + "Unsupported vendor comp_mask for create_qp\n"); + errno = EINVAL; + goto err; + } + + if ((mlx5_qp_attr->comp_mask & MLX5DV_QP_INIT_ATTR_MASK_DC) && + (attr->qp_type != IBV_QPT_DRIVER)) { + mlx5_dbg(fp, MLX5_DBG_QP, "DC QP must be of type IBV_QPT_DRIVER\n"); + errno = EINVAL; + goto err; + } + if (mlx5_qp_attr->comp_mask & + MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS) { + if (!check_comp_mask(mlx5_qp_attr->create_flags, + MLX5DV_QP_CREATE_SUP_FLAGS)) { + mlx5_dbg(fp, MLX5_DBG_QP, + "Unsupported creation flags requested for create_qp\n"); + errno = EINVAL; + goto err; + } + if (mlx5_qp_attr->create_flags & + MLX5DV_QP_CREATE_TUNNEL_OFFLOADS) { + mlx5_create_flags |= MLX5_QP_FLAG_TUNNEL_OFFLOADS; + } + if (mlx5_qp_attr->create_flags & + MLX5DV_QP_CREATE_TIR_ALLOW_SELF_LOOPBACK_UC) { + mlx5_create_flags |= + MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC; + } + if (mlx5_qp_attr->create_flags & + MLX5DV_QP_CREATE_TIR_ALLOW_SELF_LOOPBACK_MC) { + mlx5_create_flags |= + MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC; + } + if (mlx5_qp_attr->create_flags & + MLX5DV_QP_CREATE_DISABLE_SCATTER_TO_CQE) { + if (mlx5_qp_attr->create_flags & + MLX5DV_QP_CREATE_ALLOW_SCATTER_TO_CQE) { + mlx5_dbg(fp, MLX5_DBG_QP, + "Wrong usage of creation flags requested for create_qp\n"); + errno = EINVAL; + goto err; + } + mlx5_create_flags &= ~MLX5_QP_FLAG_SCATTER_CQE; + } + if (mlx5_qp_attr->create_flags & + MLX5DV_QP_CREATE_ALLOW_SCATTER_TO_CQE) { + mlx5_create_flags |= + (MLX5_QP_FLAG_ALLOW_SCATTER_CQE | + MLX5_QP_FLAG_SCATTER_CQE); + } + if (mlx5_qp_attr->create_flags & + MLX5DV_QP_CREATE_PACKET_BASED_CREDIT_MODE) + mlx5_create_flags |= MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE; + + } + + if (attr->qp_type == IBV_QPT_DRIVER) { + if (mlx5_qp_attr->comp_mask & MLX5DV_QP_INIT_ATTR_MASK_DC) { + if (mlx5_qp_attr->dc_init_attr.dc_type == MLX5DV_DCTYPE_DCT) { + ret = create_dct(context, attr, mlx5_qp_attr, + qp, mlx5_create_flags); + if (ret) + goto err; + return ibqp; + } else if (mlx5_qp_attr->dc_init_attr.dc_type == MLX5DV_DCTYPE_DCI) { + mlx5_create_flags |= MLX5_QP_FLAG_TYPE_DCI; + qp->dc_type = MLX5DV_DCTYPE_DCI; + } else { + errno = EINVAL; + goto err; + } + } else { + errno = EINVAL; + goto err; + } + } + + } else { + if (attr->qp_type == IBV_QPT_DRIVER) + goto err; + } + + if (attr->comp_mask & IBV_QP_INIT_ATTR_RX_HASH) { + /* Scatter2CQE is unsupported for RSS QP */ + mlx5_create_flags &= ~MLX5_QP_FLAG_SCATTER_CQE; + + ret = mlx5_cmd_create_rss_qp(context, attr, qp, + mlx5_create_flags); + 
if (ret) + goto err; + + return ibqp; + } + + if (ctx->atomic_cap) + qp->atomics_enabled = 1; + + if (attr->comp_mask & IBV_QP_INIT_ATTR_SEND_OPS_FLAGS || + (mlx5_qp_attr && + mlx5_qp_attr->comp_mask & MLX5DV_QP_INIT_ATTR_MASK_SEND_OPS_FLAGS)) { + /* + * Scatter2cqe, which is a data-path optimization, is disabled + * since driver DC data-path doesn't support it. + */ + if (mlx5_qp_attr && + mlx5_qp_attr->comp_mask & MLX5DV_QP_INIT_ATTR_MASK_DC) { + mlx5_create_flags &= ~MLX5_QP_FLAG_SCATTER_CQE; + } + + ret = mlx5_qp_fill_wr_pfns(qp, attr, mlx5_qp_attr); + if (ret) { + errno = ret; + mlx5_dbg(fp, MLX5_DBG_QP, "Failed to handle operations flags (errno %d)\n", errno); + goto err; + } + } + + cmd.flags = mlx5_create_flags; + qp->wq_sig = qp_sig_enabled(); + if (qp->wq_sig) + cmd.flags |= MLX5_QP_FLAG_SIGNATURE; + + ret = mlx5_calc_wq_size(ctx, attr, mlx5_qp_attr, qp); + if (ret < 0) { + errno = -ret; + goto err; + } + + if (attr->qp_type == IBV_QPT_RAW_PACKET || + qp->flags & MLX5_QP_FLAGS_USE_UNDERLAY) { + qp->buf_size = qp->sq.offset; + qp->sq_buf_size = ret - qp->buf_size; + qp->sq.offset = 0; + } else { + qp->buf_size = ret; + qp->sq_buf_size = 0; + } + + if (mlx5_alloc_qp_buf(context, attr, qp, ret)) { + mlx5_dbg(fp, MLX5_DBG_QP, "\n"); + goto err; + } + + if (attr->qp_type == IBV_QPT_RAW_PACKET || + qp->flags & MLX5_QP_FLAGS_USE_UNDERLAY) { + qp->sq_start = qp->sq_buf.buf; + qp->sq.qend = qp->sq_buf.buf + + (qp->sq.wqe_cnt << qp->sq.wqe_shift); + } else { + qp->sq_start = qp->buf.buf + qp->sq.offset; + qp->sq.qend = qp->buf.buf + qp->sq.offset + + (qp->sq.wqe_cnt << qp->sq.wqe_shift); + } + + mlx5_init_qp_indices(qp); + + if (mlx5_spinlock_init_pd(&qp->sq.lock, attr->pd) || + mlx5_spinlock_init_pd(&qp->rq.lock, attr->pd)) + goto err_free_qp_buf; + + qp->db = mlx5_alloc_dbrec(ctx, attr->pd, &qp->custom_db); + if (!qp->db) { + mlx5_dbg(fp, MLX5_DBG_QP, "\n"); + goto err_free_qp_buf; + } + + if (!qp->custom_db) { + qp->db[MLX5_RCV_DBR] = 0; + qp->db[MLX5_SND_DBR] = 0; + } + + cmd.buf_addr = (uintptr_t) qp->buf.buf; + cmd.sq_buf_addr = (attr->qp_type == IBV_QPT_RAW_PACKET || + qp->flags & MLX5_QP_FLAGS_USE_UNDERLAY) ? + (uintptr_t) qp->sq_buf.buf : 0; + cmd.db_addr = (uintptr_t) qp->db; + cmd.sq_wqe_count = qp->sq.wqe_cnt; + cmd.rq_wqe_count = qp->rq.wqe_cnt; + cmd.rq_wqe_shift = qp->rq.wqe_shift; + + if (!ctx->cqe_version) { + cmd.uidx = 0xffffff; + pthread_mutex_lock(&ctx->qp_table_mutex); + } else if (!is_xrc_tgt(attr->qp_type)) { + usr_idx = mlx5_store_uidx(ctx, qp); + if (usr_idx < 0) { + mlx5_dbg(fp, MLX5_DBG_QP, "Couldn't find free user index\n"); + goto err_rq_db; + } + + cmd.uidx = usr_idx; + } + + mparent_domain = to_mparent_domain(attr->pd); + if (mparent_domain && mparent_domain->mtd) + bf = mparent_domain->mtd->bf; + + if (!bf && !(ctx->flags & MLX5_CTX_FLAGS_NO_KERN_DYN_UAR)) { + bf = mlx5_get_qp_uar(context); + if (!bf) + goto err_free_uidx; + } + + if (bf) { + if (bf->dyn_alloc_uar) { + cmd.bfreg_index = bf->page_id; + cmd.flags |= MLX5_QP_FLAG_UAR_PAGE_INDEX; + } else { + cmd.bfreg_index = bf->bfreg_dyn_index; + cmd.flags |= MLX5_QP_FLAG_BFREG_INDEX; + } + } + + if (attr->comp_mask & MLX5_CREATE_QP_EX2_COMP_MASK) + ret = mlx5_cmd_create_qp_ex(context, attr, &cmd, qp, &resp_ex); + else + ret = ibv_cmd_create_qp_ex(context, &qp->verbs_qp, sizeof(qp->verbs_qp), + attr, &cmd.ibv_cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp)); + if (ret) { + mlx5_dbg(fp, MLX5_DBG_QP, "ret %d\n", ret); + goto err_free_uidx; + } + + resp_drv = attr->comp_mask & MLX5_CREATE_QP_EX2_COMP_MASK ? 
+ &resp_ex.drv_payload : &resp.drv_payload; + if (!ctx->cqe_version) { + if (qp->sq.wqe_cnt || qp->rq.wqe_cnt) { + ret = mlx5_store_qp(ctx, ibqp->qp_num, qp); + if (ret) { + mlx5_dbg(fp, MLX5_DBG_QP, "ret %d\n", ret); + goto err_destroy; + } + } + + pthread_mutex_unlock(&ctx->qp_table_mutex); + } + + map_uuar(context, qp, resp_drv->bfreg_index, bf); + + qp->rq.max_post = qp->rq.wqe_cnt; + if (attr->sq_sig_all) + qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; + else + qp->sq_signal_bits = 0; + + attr->cap.max_send_wr = qp->sq.max_post; + attr->cap.max_recv_wr = qp->rq.max_post; + attr->cap.max_recv_sge = qp->rq.max_gs; + + qp->rsc.type = MLX5_RSC_TYPE_QP; + qp->rsc.rsn = (ctx->cqe_version && !is_xrc_tgt(attr->qp_type)) ? + usr_idx : ibqp->qp_num; + + if (mparent_domain) + atomic_fetch_add(&mparent_domain->mpd.refcount, 1); + + if (resp_drv->comp_mask & MLX5_IB_CREATE_QP_RESP_MASK_TIRN) + qp->tirn = resp_drv->tirn; + + if (resp_drv->comp_mask & MLX5_IB_CREATE_QP_RESP_MASK_TISN) + qp->tisn = resp_drv->tisn; + + if (resp_drv->comp_mask & MLX5_IB_CREATE_QP_RESP_MASK_RQN) + qp->rqn = resp_drv->rqn; + + if (resp_drv->comp_mask & MLX5_IB_CREATE_QP_RESP_MASK_SQN) + qp->sqn = resp_drv->sqn; + + if (resp_drv->comp_mask & MLX5_IB_CREATE_QP_RESP_MASK_TIR_ICM_ADDR) + qp->tir_icm_addr = resp_drv->tir_icm_addr; + + if (attr->comp_mask & IBV_QP_INIT_ATTR_SEND_OPS_FLAGS) + qp->verbs_qp.comp_mask |= VERBS_QP_EX; + + return ibqp; + +err_destroy: + ibv_cmd_destroy_qp(ibqp); + +err_free_uidx: + if (bf) + mlx5_put_qp_uar(ctx, bf); + if (!ctx->cqe_version) + pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex); + else if (!is_xrc_tgt(attr->qp_type)) + mlx5_clear_uidx(ctx, usr_idx); + +err_rq_db: + mlx5_free_db(to_mctx(context), qp->db, attr->pd, qp->custom_db); + +err_free_qp_buf: + mlx5_free_qp_buf(ctx, qp); + +err: + free(qp); + + return NULL; +} + +struct ibv_qp *mlx5_create_qp(struct ibv_pd *pd, + struct ibv_qp_init_attr *attr) +{ + struct ibv_qp *qp; + struct ibv_qp_init_attr_ex attrx; + + memset(&attrx, 0, sizeof(attrx)); + memcpy(&attrx, attr, sizeof(*attr)); + attrx.comp_mask = IBV_QP_INIT_ATTR_PD; + attrx.pd = pd; + qp = create_qp(pd->context, &attrx, NULL); + if (qp) + memcpy(attr, &attrx, sizeof(*attr)); + + return qp; +} + +static void mlx5_lock_cqs(struct ibv_qp *qp) +{ + struct mlx5_cq *send_cq = to_mcq(qp->send_cq); + struct mlx5_cq *recv_cq = to_mcq(qp->recv_cq); + + if (send_cq && recv_cq) { + if (send_cq == recv_cq) { + mlx5_spin_lock(&send_cq->lock); + } else if (send_cq->cqn < recv_cq->cqn) { + mlx5_spin_lock(&send_cq->lock); + mlx5_spin_lock(&recv_cq->lock); + } else { + mlx5_spin_lock(&recv_cq->lock); + mlx5_spin_lock(&send_cq->lock); + } + } else if (send_cq) { + mlx5_spin_lock(&send_cq->lock); + } else if (recv_cq) { + mlx5_spin_lock(&recv_cq->lock); + } +} + +static void mlx5_unlock_cqs(struct ibv_qp *qp) +{ + struct mlx5_cq *send_cq = to_mcq(qp->send_cq); + struct mlx5_cq *recv_cq = to_mcq(qp->recv_cq); + + if (send_cq && recv_cq) { + if (send_cq == recv_cq) { + mlx5_spin_unlock(&send_cq->lock); + } else if (send_cq->cqn < recv_cq->cqn) { + mlx5_spin_unlock(&recv_cq->lock); + mlx5_spin_unlock(&send_cq->lock); + } else { + mlx5_spin_unlock(&send_cq->lock); + mlx5_spin_unlock(&recv_cq->lock); + } + } else if (send_cq) { + mlx5_spin_unlock(&send_cq->lock); + } else if (recv_cq) { + mlx5_spin_unlock(&recv_cq->lock); + } +} + +int mlx5_destroy_qp(struct ibv_qp *ibqp) +{ + struct mlx5_qp *qp = to_mqp(ibqp); + struct mlx5_context *ctx = to_mctx(ibqp->context); + int ret; + struct 
mlx5_parent_domain *mparent_domain = to_mparent_domain(ibqp->pd);
+
+	if (qp->rss_qp) {
+		ret = ibv_cmd_destroy_qp(ibqp);
+		if (ret)
+			return ret;
+		goto free;
+	}
+
+	if (!ctx->cqe_version)
+		pthread_mutex_lock(&ctx->qp_table_mutex);
+
+	ret = ibv_cmd_destroy_qp(ibqp);
+	if (ret) {
+		if (!ctx->cqe_version)
+			pthread_mutex_unlock(&ctx->qp_table_mutex);
+		return ret;
+	}
+
+	mlx5_lock_cqs(ibqp);
+
+	__mlx5_cq_clean(to_mcq(ibqp->recv_cq), qp->rsc.rsn,
+			ibqp->srq ? to_msrq(ibqp->srq) : NULL);
+	if (ibqp->send_cq != ibqp->recv_cq)
+		__mlx5_cq_clean(to_mcq(ibqp->send_cq), qp->rsc.rsn, NULL);
+
+	if (!ctx->cqe_version) {
+		if (qp->dc_type == MLX5DV_DCTYPE_DCT) {
+			/* The QP was inserted into the tracking table only
+			 * after it was modified to RTR
+			 */
+			if (ibqp->state == IBV_QPS_RTR)
+				mlx5_clear_qp(ctx, ibqp->qp_num);
+		} else {
+			if (qp->sq.wqe_cnt || qp->rq.wqe_cnt)
+				mlx5_clear_qp(ctx, ibqp->qp_num);
+		}
+	}
+
+	mlx5_unlock_cqs(ibqp);
+	if (!ctx->cqe_version)
+		pthread_mutex_unlock(&ctx->qp_table_mutex);
+	else if (!is_xrc_tgt(ibqp->qp_type))
+		mlx5_clear_uidx(ctx, qp->rsc.rsn);
+
+	if (qp->dc_type != MLX5DV_DCTYPE_DCT) {
+		mlx5_free_db(ctx, qp->db, ibqp->pd, qp->custom_db);
+		mlx5_free_qp_buf(ctx, qp);
+	}
+free:
+	if (mparent_domain)
+		atomic_fetch_sub(&mparent_domain->mpd.refcount, 1);
+
+	mlx5_put_qp_uar(ctx, qp->bf);
+	free(qp);
+
+	return 0;
+}
+
+int mlx5_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr,
+		  int attr_mask, struct ibv_qp_init_attr *init_attr)
+{
+	struct ibv_query_qp cmd;
+	struct mlx5_qp *qp = to_mqp(ibqp);
+	int ret;
+
+	if (qp->rss_qp)
+		return EOPNOTSUPP;
+
+	ret = ibv_cmd_query_qp(ibqp, attr, attr_mask, init_attr, &cmd, sizeof(cmd));
+	if (ret)
+		return ret;
+
+	init_attr->cap.max_send_wr = qp->sq.max_post;
+	init_attr->cap.max_send_sge = qp->sq.max_gs;
+	init_attr->cap.max_inline_data = qp->max_inline_data;
+
+	attr->cap = init_attr->cap;
+
+	return 0;
+}
+
+enum {
+	MLX5_MODIFY_QP_EX_ATTR_MASK = IBV_QP_RATE_LIMIT,
+};
+
+static int modify_dct(struct ibv_qp *qp, struct ibv_qp_attr *attr,
+		      int attr_mask)
+{
+	struct ibv_modify_qp_ex cmd_ex = {};
+	struct mlx5_modify_qp_ex_resp resp = {};
+	struct mlx5_qp *mqp = to_mqp(qp);
+	struct mlx5_context *context = to_mctx(qp->context);
+	int min_resp_size;
+	bool dct_create;
+	int ret;
+
+	ret = ibv_cmd_modify_qp_ex(qp, attr, attr_mask, &cmd_ex, sizeof(cmd_ex),
+				   &resp.ibv_resp, sizeof(resp));
+	if (ret)
+		return ret;
+
+	/* A DCT is created in hardware and is assigned its unique QP number
+	 * only when the QP is modified to RTR, so operations that require the
+	 * QP number must be deferred until that transition
+	 */
+	dct_create =
+		(attr_mask & IBV_QP_STATE) &&
+		(attr->qp_state == IBV_QPS_RTR);
+
+	if (!dct_create)
+		return 0;
+
+	min_resp_size =
+		offsetof(typeof(resp), dctn) +
+		sizeof(resp.dctn) -
+		sizeof(resp.ibv_resp);
+
+	if (resp.response_length < min_resp_size) {
+		errno = EINVAL;
+		return errno;
+	}
+
+	qp->qp_num = resp.dctn;
+
+	if (!context->cqe_version) {
+		pthread_mutex_lock(&context->qp_table_mutex);
+		ret = mlx5_store_qp(context, qp->qp_num, mqp);
+		if (!ret)
+			mqp->rsc.rsn = qp->qp_num;
+		else
+			errno = ENOMEM;
+		pthread_mutex_unlock(&context->qp_table_mutex);
+		return ret ?
errno : 0; + } + return 0; +} + +int mlx5_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask) +{ + struct ibv_modify_qp cmd = {}; + struct ibv_modify_qp_ex cmd_ex = {}; + struct ib_uverbs_ex_modify_qp_resp resp = {}; + struct mlx5_qp *mqp = to_mqp(qp); + struct mlx5_context *context = to_mctx(qp->context); + int ret; + __be32 *db; + + if (mqp->dc_type == MLX5DV_DCTYPE_DCT) + return modify_dct(qp, attr, attr_mask); + + if (mqp->rss_qp) + return EOPNOTSUPP; + + if (mqp->flags & MLX5_QP_FLAGS_USE_UNDERLAY) { + if (attr_mask & ~(IBV_QP_STATE | IBV_QP_CUR_STATE)) + return EINVAL; + + /* Underlay QP is UD over infiniband */ + if (context->cached_device_cap_flags & IBV_DEVICE_UD_IP_CSUM) + mqp->qp_cap_cache |= MLX5_CSUM_SUPPORT_UNDERLAY_UD | + MLX5_RX_CSUM_VALID; + } + + if (attr_mask & IBV_QP_PORT) { + switch (qp->qp_type) { + case IBV_QPT_RAW_PACKET: + if (context->cached_link_layer[attr->port_num - 1] == + IBV_LINK_LAYER_ETHERNET) { + if (context->cached_device_cap_flags & + IBV_DEVICE_RAW_IP_CSUM) + mqp->qp_cap_cache |= + MLX5_CSUM_SUPPORT_RAW_OVER_ETH | + MLX5_RX_CSUM_VALID; + + if (ibv_is_qpt_supported( + context->cached_tso_caps.supported_qpts, + IBV_QPT_RAW_PACKET)) + mqp->max_tso = + context->cached_tso_caps.max_tso; + } + break; + default: + break; + } + } + + if (attr_mask & MLX5_MODIFY_QP_EX_ATTR_MASK) + ret = ibv_cmd_modify_qp_ex(qp, attr, attr_mask, &cmd_ex, + sizeof(cmd_ex), &resp, sizeof(resp)); + else + ret = ibv_cmd_modify_qp(qp, attr, attr_mask, + &cmd, sizeof(cmd)); + + if (!ret && + (attr_mask & IBV_QP_STATE) && + attr->qp_state == IBV_QPS_RESET) { + if (qp->recv_cq) { + mlx5_cq_clean(to_mcq(qp->recv_cq), mqp->rsc.rsn, + qp->srq ? to_msrq(qp->srq) : NULL); + } + if (qp->send_cq != qp->recv_cq && qp->send_cq) + mlx5_cq_clean(to_mcq(qp->send_cq), + to_mqp(qp)->rsc.rsn, NULL); + + mlx5_init_qp_indices(mqp); + db = mqp->db; + db[MLX5_RCV_DBR] = 0; + db[MLX5_SND_DBR] = 0; + } + + /* + * When the Raw Packet QP is in INIT state, its RQ + * underneath is already in RDY, which means it can + * receive packets. According to the IB spec, a QP can't + * receive packets until moved to RTR state. To achieve this, + * for Raw Packet QPs, we update the doorbell record + * once the QP is moved to RTR. + */ + if (!ret && + (attr_mask & IBV_QP_STATE) && + attr->qp_state == IBV_QPS_RTR && + (qp->qp_type == IBV_QPT_RAW_PACKET || + mqp->flags & MLX5_QP_FLAGS_USE_UNDERLAY)) { + mlx5_spin_lock(&mqp->rq.lock); + mqp->db[MLX5_RCV_DBR] = htobe32(mqp->rq.head & 0xffff); + mlx5_spin_unlock(&mqp->rq.lock); + } + + return ret; +} + +int mlx5_modify_qp_rate_limit(struct ibv_qp *qp, + struct ibv_qp_rate_limit_attr *attr) +{ + struct ibv_qp_attr qp_attr = {}; + struct ib_uverbs_ex_modify_qp_resp resp = {}; + struct mlx5_modify_qp cmd = {}; + struct mlx5_context *mctx = to_mctx(qp->context); + int ret; + + if (attr->comp_mask) + return EINVAL; + + if ((attr->max_burst_sz || + attr->typical_pkt_sz) && + (!attr->rate_limit || + !(mctx->packet_pacing_caps.cap_flags & + MLX5_IB_PP_SUPPORT_BURST))) + return EINVAL; + + cmd.burst_info.max_burst_sz = attr->max_burst_sz; + cmd.burst_info.typical_pkt_sz = attr->typical_pkt_sz; + qp_attr.rate_limit = attr->rate_limit; + + ret = ibv_cmd_modify_qp_ex(qp, &qp_attr, IBV_QP_RATE_LIMIT, + &cmd.ibv_cmd, sizeof(cmd), &resp, + sizeof(resp)); + + return ret; +} + +/* + * IB spec version 1.3. Table 224 Rate to mlx5 rate + * conversion table on best effort basis. 
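+ * Rates that have no exact mlx5 encoding are mapped to the nearest
+ * supported rate above them; an entry of 0 leaves the rate unlimited.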
+ */ +static const uint8_t ib_to_mlx5_rate_table[] = { + 0, /* Invalid to unlimited */ + 0, /* Invalid to unlimited */ + 7, /* 2.5 Gbps */ + 8, /* 10Gbps */ + 9, /* 30Gbps */ + 10, /* 5 Gbps */ + 11, /* 20 Gbps */ + 12, /* 40 Gbps */ + 13, /* 60 Gbps */ + 14, /* 80 Gbps */ + 15, /* 120 Gbps */ + 11, /* 14 Gbps to 20 Gbps */ + 13, /* 56 Gbps to 60 Gbps */ + 15, /* 112 Gbps to 120 Gbps */ + 0, /* 168 Gbps to unlimited */ + 9, /* 25 Gbps to 30 Gbps */ + 15, /* 100 Gbps to 120 Gbps */ + 0, /* 200 Gbps to unlimited */ + 0, /* 300 Gbps to unlimited */ + 9, /* 28 Gbps to 30 Gbps */ + 13, /* 50 Gbps to 60 Gbps */ + 0, /* 400 Gbps to unlimited */ + 0, /* 600 Gbps to unlimited */ +}; + +static uint8_t ah_attr_to_mlx5_rate(enum ibv_rate ah_static_rate) +{ + if (ah_static_rate >= ARRAY_SIZE(ib_to_mlx5_rate_table)) + return 0; + return ib_to_mlx5_rate_table[ah_static_rate]; +} + +static void mlx5_ah_set_udp_sport(struct mlx5_ah *ah, + const struct ibv_ah_attr *attr) +{ + uint16_t sport; + uint32_t fl; + + fl = attr->grh.flow_label & IB_GRH_FLOWLABEL_MASK; + if (fl) + sport = ibv_flow_label_to_udp_sport(fl); + else + sport = rand() % (IB_ROCE_UDP_ENCAP_VALID_PORT_MAX + 1 + - IB_ROCE_UDP_ENCAP_VALID_PORT_MIN) + + IB_ROCE_UDP_ENCAP_VALID_PORT_MIN; + + ah->av.rlid = htobe16(sport); +} + +struct ibv_ah *mlx5_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) +{ + struct mlx5_context *ctx = to_mctx(pd->context); + struct ibv_port_attr port_attr; + struct mlx5_ah *ah; + uint8_t static_rate; + uint32_t gid_type; + __be32 tmp; + uint8_t grh; + bool is_eth; + bool grh_req; + + if (attr->port_num < 1 || attr->port_num > ctx->num_ports) + return NULL; + + if (ctx->cached_link_layer[attr->port_num - 1]) { + is_eth = ctx->cached_link_layer[attr->port_num - 1] == + IBV_LINK_LAYER_ETHERNET; + grh_req = ctx->cached_port_flags[attr->port_num - 1] & + IBV_QPF_GRH_REQUIRED; + } else { + if (ibv_query_port(pd->context, attr->port_num, &port_attr)) + return NULL; + + is_eth = port_attr.link_layer == IBV_LINK_LAYER_ETHERNET; + grh_req = port_attr.flags & IBV_QPF_GRH_REQUIRED; + } + + if (unlikely((!attr->is_global) && (is_eth || grh_req))) { + errno = EINVAL; + return NULL; + } + + ah = calloc(1, sizeof *ah); + if (!ah) + return NULL; + + static_rate = ah_attr_to_mlx5_rate(attr->static_rate); + if (is_eth) { + if (ibv_query_gid_type(pd->context, attr->port_num, + attr->grh.sgid_index, &gid_type)) + goto err; + + if (gid_type == IBV_GID_TYPE_ROCE_V2) + mlx5_ah_set_udp_sport(ah, attr); + + /* Since RoCE packets must contain GRH, this bit is reserved + * for RoCE and shouldn't be set. 
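+	 * For IB the bit is set further below, whenever the address
+	 * handle is global, to request a GRH in the generated packets.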
+ */ + grh = 0; + ah->av.stat_rate_sl = (static_rate << 4) | ((attr->sl & 0x7) << 1); + } else { + ah->av.fl_mlid = attr->src_path_bits & 0x7f; + ah->av.rlid = htobe16(attr->dlid); + grh = 1; + ah->av.stat_rate_sl = (static_rate << 4) | (attr->sl & 0xf); + } + if (attr->is_global) { + ah->av.tclass = attr->grh.traffic_class; + ah->av.hop_limit = attr->grh.hop_limit; + tmp = htobe32((grh << 30) | + ((attr->grh.sgid_index & 0xff) << 20) | + (attr->grh.flow_label & IB_GRH_FLOWLABEL_MASK)); + ah->av.grh_gid_fl = tmp; + memcpy(ah->av.rgid, attr->grh.dgid.raw, 16); + } + + if (is_eth) { + if (ctx->cmds_supp_uhw & MLX5_USER_CMDS_SUPP_UHW_CREATE_AH) { + struct mlx5_create_ah_resp resp = {}; + + if (ibv_cmd_create_ah(pd, &ah->ibv_ah, attr, &resp.ibv_resp, sizeof(resp))) + goto err; + + ah->kern_ah = true; + memcpy(ah->av.rmac, resp.dmac, ETHERNET_LL_SIZE); + } else { + if (ibv_resolve_eth_l2_from_gid(pd->context, attr, + ah->av.rmac, NULL)) + goto err; + } + } + + return &ah->ibv_ah; +err: + free(ah); + return NULL; +} + +int mlx5_destroy_ah(struct ibv_ah *ah) +{ + struct mlx5_ah *mah = to_mah(ah); + int err; + + if (mah->kern_ah) { + err = ibv_cmd_destroy_ah(ah); + if (err) + return err; + } + + free(mah); + return 0; +} + +int mlx5_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid) +{ + return ibv_cmd_attach_mcast(qp, gid, lid); +} + +int mlx5_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid) +{ + return ibv_cmd_detach_mcast(qp, gid, lid); +} + +struct ibv_qp *mlx5_create_qp_ex(struct ibv_context *context, + struct ibv_qp_init_attr_ex *attr) +{ + return create_qp(context, attr, NULL); +} + +struct ibv_qp *mlx5dv_create_qp(struct ibv_context *context, + struct ibv_qp_init_attr_ex *qp_attr, + struct mlx5dv_qp_init_attr *mlx5_qp_attr) +{ + if (!is_mlx5_dev(context->device)) { + errno = EOPNOTSUPP; + return NULL; + } + + return create_qp(context, qp_attr, mlx5_qp_attr); +} + +struct mlx5dv_qp_ex *mlx5dv_qp_ex_from_ibv_qp_ex(struct ibv_qp_ex *qp) +{ + return &(container_of(qp, struct mlx5_qp, verbs_qp.qp_ex))->dv_qp; +} + +int mlx5_get_srq_num(struct ibv_srq *srq, uint32_t *srq_num) +{ + struct mlx5_srq *msrq = to_msrq(srq); + + *srq_num = msrq->srqn; + + return 0; +} + +struct ibv_qp *mlx5_open_qp(struct ibv_context *context, + struct ibv_qp_open_attr *attr) +{ + struct ibv_open_qp cmd; + struct ib_uverbs_create_qp_resp resp; + struct mlx5_qp *qp; + int ret; + + qp = calloc(1, sizeof(*qp)); + if (!qp) + return NULL; + + ret = ibv_cmd_open_qp(context, &qp->verbs_qp, sizeof(qp->verbs_qp), + attr, &cmd, sizeof(cmd), &resp, sizeof(resp)); + if (ret) + goto err; + + return &qp->verbs_qp.qp; + +err: + free(qp); + return NULL; +} + +struct ibv_xrcd * +mlx5_open_xrcd(struct ibv_context *context, + struct ibv_xrcd_init_attr *xrcd_init_attr) +{ + int err; + struct verbs_xrcd *xrcd; + struct ibv_open_xrcd cmd = {}; + struct ib_uverbs_open_xrcd_resp resp = {}; + + xrcd = calloc(1, sizeof(*xrcd)); + if (!xrcd) + return NULL; + + err = ibv_cmd_open_xrcd(context, xrcd, sizeof(*xrcd), xrcd_init_attr, + &cmd, sizeof(cmd), &resp, sizeof(resp)); + if (err) { + free(xrcd); + return NULL; + } + + return &xrcd->xrcd; +} + +int mlx5_close_xrcd(struct ibv_xrcd *ib_xrcd) +{ + struct verbs_xrcd *xrcd = container_of(ib_xrcd, struct verbs_xrcd, xrcd); + int ret; + + ret = ibv_cmd_close_xrcd(xrcd); + if (!ret) + free(xrcd); + + return ret; +} + +static struct ibv_qp * +create_cmd_qp(struct ibv_context *context, + struct ibv_srq_init_attr_ex *srq_attr, + struct ibv_srq *srq) +{ + struct 
ibv_qp_init_attr_ex init_attr = {}; + FILE *fp = to_mctx(context)->dbg_fp; + struct ibv_port_attr port_attr; + struct ibv_modify_qp qcmd = {}; + struct ibv_qp_attr attr = {}; + struct ibv_query_port pcmd; + struct ibv_qp *qp; + int attr_mask; + int port = 1; + int ret; + + ret = ibv_cmd_query_port(context, port, &port_attr, + &pcmd, sizeof(pcmd)); + if (ret) { + mlx5_dbg(fp, MLX5_DBG_QP, "ret %d\n", ret); + return NULL; + } + + init_attr.qp_type = IBV_QPT_RC; + init_attr.srq = srq; + /* Command QP will be used to pass MLX5_OPCODE_TAG_MATCHING messages + * to add/remove tag matching list entries. + * WQ size is based on max_ops parameter holding max number of + * outstanding list operations. + */ + init_attr.cap.max_send_wr = srq_attr->tm_cap.max_ops; + /* Tag matching list entry will point to a single sge buffer */ + init_attr.cap.max_send_sge = 1; + init_attr.comp_mask = IBV_QP_INIT_ATTR_PD; + init_attr.pd = srq_attr->pd; + init_attr.send_cq = srq_attr->cq; + init_attr.recv_cq = srq_attr->cq; + + qp = create_qp(context, &init_attr, NULL); + if (!qp) + return NULL; + + attr.qp_state = IBV_QPS_INIT; + attr.port_num = port; + attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX + | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS; + + ret = ibv_cmd_modify_qp(qp, &attr, attr_mask, &qcmd, sizeof(qcmd)); + if (ret) { + mlx5_dbg(fp, MLX5_DBG_QP, "ret %d\n", ret); + goto err; + } + + attr.qp_state = IBV_QPS_RTR; + attr.path_mtu = IBV_MTU_256; + attr.dest_qp_num = qp->qp_num; /* Loopback */ + attr.ah_attr.dlid = port_attr.lid; + attr.ah_attr.port_num = port; + attr_mask = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU + | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN + | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER; + + ret = ibv_cmd_modify_qp(qp, &attr, attr_mask, &qcmd, sizeof(qcmd)); + if (ret) { + mlx5_dbg(fp, MLX5_DBG_QP, "ret %d\n", ret); + goto err; + } + + attr.qp_state = IBV_QPS_RTS; + attr_mask = IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT + | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN + | IBV_QP_MAX_QP_RD_ATOMIC; + + ret = ibv_cmd_modify_qp(qp, &attr, attr_mask, &qcmd, sizeof(qcmd)); + if (ret) { + mlx5_dbg(fp, MLX5_DBG_QP, "ret %d\n", ret); + goto err; + } + + return qp; + +err: + mlx5_destroy_qp(qp); + return NULL; +} + +struct ibv_srq *mlx5_create_srq_ex(struct ibv_context *context, + struct ibv_srq_init_attr_ex *attr) +{ + int err; + struct mlx5_create_srq_ex cmd; + struct mlx5_create_srq_resp resp; + struct mlx5_srq *msrq; + struct mlx5_context *ctx = to_mctx(context); + int max_sge; + struct ibv_srq *ibsrq; + int uidx; + FILE *fp = ctx->dbg_fp; + + if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_TYPE) || + (attr->srq_type == IBV_SRQT_BASIC)) + return mlx5_create_srq(attr->pd, + (struct ibv_srq_init_attr *)attr); + + if (attr->srq_type != IBV_SRQT_XRC && + attr->srq_type != IBV_SRQT_TM) { + errno = EINVAL; + return NULL; + } + + /* An extended CQ is required to read TM information from */ + if (attr->srq_type == IBV_SRQT_TM && + !(attr->cq && (to_mcq(attr->cq)->flags & MLX5_CQ_FLAGS_EXTENDED))) { + errno = EINVAL; + return NULL; + } + + msrq = calloc(1, sizeof(*msrq)); + if (!msrq) + return NULL; + + ibsrq = (struct ibv_srq *)&msrq->vsrq; + + memset(&cmd, 0, sizeof(cmd)); + memset(&resp, 0, sizeof(resp)); + + if (mlx5_spinlock_init_pd(&msrq->lock, attr->pd)) { + fprintf(stderr, "%s-%d:\n", __func__, __LINE__); + goto err; + } + + if (attr->attr.max_wr > ctx->max_srq_recv_wr) { + fprintf(stderr, "%s-%d:max_wr %d, max_srq_recv_wr %d\n", + __func__, __LINE__, attr->attr.max_wr, + ctx->max_srq_recv_wr); + errno = EINVAL; + goto err; 
+	}
+
+	/*
+	 * This calculation does not consider the required control segments;
+	 * the final calculation is redone later. It is done this way to
+	 * avoid intermediate variable overflow.
+	 */
+	max_sge = ctx->max_recv_wr / sizeof(struct mlx5_wqe_data_seg);
+	if (attr->attr.max_sge > max_sge) {
+		fprintf(stderr, "%s-%d:max_sge %d, max supported %d\n",
+			__func__, __LINE__, attr->attr.max_sge,
+			max_sge);
+		errno = EINVAL;
+		goto err;
+	}
+
+	msrq->max_gs = attr->attr.max_sge;
+	msrq->counter = 0;
+
+	if (mlx5_alloc_srq_buf(context, msrq, attr->attr.max_wr, attr->pd)) {
+		fprintf(stderr, "%s-%d:\n", __func__, __LINE__);
+		goto err;
+	}
+
+	msrq->db = mlx5_alloc_dbrec(ctx, attr->pd, &msrq->custom_db);
+	if (!msrq->db) {
+		fprintf(stderr, "%s-%d:\n", __func__, __LINE__);
+		goto err_free;
+	}
+
+	if (!msrq->custom_db)
+		*msrq->db = 0;
+
+	cmd.buf_addr = (uintptr_t)msrq->buf.buf;
+	cmd.db_addr = (uintptr_t)msrq->db;
+	msrq->wq_sig = srq_sig_enabled();
+	if (msrq->wq_sig)
+		cmd.flags = MLX5_SRQ_FLAG_SIGNATURE;
+
+	attr->attr.max_sge = msrq->max_gs;
+	if (ctx->cqe_version) {
+		uidx = mlx5_store_uidx(ctx, msrq);
+		if (uidx < 0) {
+			mlx5_dbg(fp, MLX5_DBG_QP, "Couldn't find free user index\n");
+			goto err_free_db;
+		}
+		cmd.uidx = uidx;
+	} else {
+		cmd.uidx = 0xffffff;
+		pthread_mutex_lock(&ctx->srq_table_mutex);
+	}
+
+	/* Override max_wr to let kernel know about extra WQEs for the
+	 * wait queue.
+	 */
+	attr->attr.max_wr = msrq->max - 1;
+
+	err = ibv_cmd_create_srq_ex(context, &msrq->vsrq, sizeof(msrq->vsrq),
+				    attr, &cmd.ibv_cmd, sizeof(cmd),
+				    &resp.ibv_resp, sizeof(resp));
+
+	/* Override kernel response that includes the wait queue with the real
+	 * number of WQEs that are applicable for the application.
+	 */
+	attr->attr.max_wr = msrq->tail;
+
+	if (err)
+		goto err_free_uidx;
+
+	if (attr->srq_type == IBV_SRQT_TM) {
+		int i;
+
+		msrq->cmd_qp = create_cmd_qp(context, attr, ibsrq);
+		if (!msrq->cmd_qp)
+			goto err_destroy;
+
+		msrq->tm_list = calloc(attr->tm_cap.max_num_tags + 1,
+				       sizeof(struct mlx5_tag_entry));
+		if (!msrq->tm_list)
+			goto err_free_cmd;
+		for (i = 0; i < attr->tm_cap.max_num_tags; i++)
+			msrq->tm_list[i].next = &msrq->tm_list[i + 1];
+		msrq->tm_head = &msrq->tm_list[0];
+		msrq->tm_tail = &msrq->tm_list[attr->tm_cap.max_num_tags];
+
+		msrq->op = calloc(to_mqp(msrq->cmd_qp)->sq.wqe_cnt,
+				  sizeof(struct mlx5_srq_op));
+		if (!msrq->op)
+			goto err_free_tm;
+		msrq->op_head = 0;
+		msrq->op_tail = 0;
+	}
+
+	if (!ctx->cqe_version) {
+		err = mlx5_store_srq(to_mctx(context), resp.srqn, msrq);
+		if (err)
+			goto err_free_tm;
+
+		pthread_mutex_unlock(&ctx->srq_table_mutex);
+	}
+
+	msrq->srqn = resp.srqn;
+	msrq->rsc.type = MLX5_RSC_TYPE_XSRQ;
+	msrq->rsc.rsn = ctx->cqe_version ?
cmd.uidx : resp.srqn; + + return ibsrq; + +err_free_tm: + free(msrq->tm_list); + free(msrq->op); +err_free_cmd: + if (msrq->cmd_qp) + mlx5_destroy_qp(msrq->cmd_qp); +err_destroy: + ibv_cmd_destroy_srq(ibsrq); + +err_free_uidx: + if (ctx->cqe_version) + mlx5_clear_uidx(ctx, cmd.uidx); + else + pthread_mutex_unlock(&ctx->srq_table_mutex); + +err_free_db: + mlx5_free_db(ctx, msrq->db, attr->pd, msrq->custom_db); + +err_free: + free(msrq->wrid); + mlx5_free_actual_buf(ctx, &msrq->buf); + +err: + free(msrq); + + return NULL; +} + +static void get_pci_atomic_caps(struct ibv_context *context, + struct ibv_device_attr_ex *attr) +{ + uint32_t in[DEVX_ST_SZ_DW(query_hca_cap_in)] = {}; + uint32_t out[DEVX_ST_SZ_DW(query_hca_cap_out)] = {}; + uint16_t opmod = (MLX5_CAP_ATOMIC << 1) | HCA_CAP_OPMOD_GET_CUR; + int ret; + + DEVX_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); + DEVX_SET(query_hca_cap_in, in, op_mod, opmod); + + ret = mlx5dv_devx_general_cmd(context, in, sizeof(in), out, + sizeof(out)); + if (!ret) { + attr->pci_atomic_caps.fetch_add = + DEVX_GET(query_hca_cap_out, out, + capability.atomic_caps.fetch_add_pci_atomic); + attr->pci_atomic_caps.swap = + DEVX_GET(query_hca_cap_out, out, + capability.atomic_caps.swap_pci_atomic); + attr->pci_atomic_caps.compare_swap = + DEVX_GET(query_hca_cap_out, out, + capability.atomic_caps.compare_swap_pci_atomic); + } +} + +int mlx5_query_device_ex(struct ibv_context *context, + const struct ibv_query_device_ex_input *input, + struct ibv_device_attr_ex *attr, + size_t attr_size) +{ + struct mlx5_context *mctx = to_mctx(context); + struct mlx5_query_device_ex_resp resp; + struct mlx5_query_device_ex cmd; + struct ibv_device_attr *a; + uint64_t raw_fw_ver; + unsigned sub_minor; + unsigned major; + unsigned minor; + int err; + int cmd_supp_uhw = mctx->cmds_supp_uhw & + MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE; + + memset(&cmd, 0, sizeof(cmd)); + memset(&resp, 0, sizeof(resp)); + err = ibv_cmd_query_device_ex( + context, input, attr, attr_size, &raw_fw_ver, &cmd.ibv_cmd, + sizeof(cmd), &resp.ibv_resp, + cmd_supp_uhw ? 
sizeof(resp) : sizeof(resp.ibv_resp)); + if (err) + return err; + + attr->tso_caps.max_tso = resp.tso_caps.max_tso; + attr->tso_caps.supported_qpts = resp.tso_caps.supported_qpts; + attr->rss_caps.rx_hash_fields_mask = resp.rss_caps.rx_hash_fields_mask; + attr->rss_caps.rx_hash_function = resp.rss_caps.rx_hash_function; + attr->packet_pacing_caps.qp_rate_limit_min = + resp.packet_pacing_caps.qp_rate_limit_min; + attr->packet_pacing_caps.qp_rate_limit_max = + resp.packet_pacing_caps.qp_rate_limit_max; + attr->packet_pacing_caps.supported_qpts = + resp.packet_pacing_caps.supported_qpts; + + if (resp.mlx5_ib_support_multi_pkt_send_wqes & MLX5_IB_ALLOW_MPW) + mctx->vendor_cap_flags |= MLX5_VENDOR_CAP_FLAGS_MPW_ALLOWED; + + if (resp.mlx5_ib_support_multi_pkt_send_wqes & MLX5_IB_SUPPORT_EMPW) + mctx->vendor_cap_flags |= MLX5_VENDOR_CAP_FLAGS_ENHANCED_MPW; + + mctx->cqe_comp_caps.max_num = resp.cqe_comp_caps.max_num; + mctx->cqe_comp_caps.supported_format = resp.cqe_comp_caps.supported_format; + mctx->sw_parsing_caps.sw_parsing_offloads = + resp.sw_parsing_caps.sw_parsing_offloads; + mctx->sw_parsing_caps.supported_qpts = + resp.sw_parsing_caps.supported_qpts; + mctx->striding_rq_caps.min_single_stride_log_num_of_bytes = + resp.striding_rq_caps.min_single_stride_log_num_of_bytes; + mctx->striding_rq_caps.max_single_stride_log_num_of_bytes = + resp.striding_rq_caps.max_single_stride_log_num_of_bytes; + mctx->striding_rq_caps.min_single_wqe_log_num_of_strides = + resp.striding_rq_caps.min_single_wqe_log_num_of_strides; + mctx->striding_rq_caps.max_single_wqe_log_num_of_strides = + resp.striding_rq_caps.max_single_wqe_log_num_of_strides; + mctx->striding_rq_caps.supported_qpts = + resp.striding_rq_caps.supported_qpts; + mctx->tunnel_offloads_caps = resp.tunnel_offloads_caps; + mctx->packet_pacing_caps = resp.packet_pacing_caps; + + if (resp.flags & MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_COMP) + mctx->vendor_cap_flags |= MLX5_VENDOR_CAP_FLAGS_CQE_128B_COMP; + + if (resp.flags & MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_PAD) + mctx->vendor_cap_flags |= MLX5_VENDOR_CAP_FLAGS_CQE_128B_PAD; + + if (resp.flags & MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE) + mctx->vendor_cap_flags |= MLX5_VENDOR_CAP_FLAGS_PACKET_BASED_CREDIT_MODE; + + if (resp.flags & MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT) + mctx->vendor_cap_flags |= MLX5_VENDOR_CAP_FLAGS_SCAT2CQE_DCT; + + major = (raw_fw_ver >> 32) & 0xffff; + minor = (raw_fw_ver >> 16) & 0xffff; + sub_minor = raw_fw_ver & 0xffff; + a = &attr->orig_attr; + snprintf(a->fw_ver, sizeof(a->fw_ver), "%d.%d.%04d", + major, minor, sub_minor); + + if (attr_size >= offsetof(struct ibv_device_attr_ex, pci_atomic_caps) + + sizeof(attr->pci_atomic_caps)) + get_pci_atomic_caps(context, attr); + + return 0; +} + +static int rwq_sig_enabled(struct ibv_context *context) +{ + char *env; + + env = getenv("MLX5_RWQ_SIGNATURE"); + if (env) + return 1; + + return 0; +} + +static void mlx5_free_rwq_buf(struct mlx5_rwq *rwq, struct ibv_context *context) +{ + struct mlx5_context *ctx = to_mctx(context); + + mlx5_free_actual_buf(ctx, &rwq->buf); + free(rwq->rq.wrid); +} + +static int mlx5_alloc_rwq_buf(struct ibv_context *context, + struct ibv_pd *pd, + struct mlx5_rwq *rwq, + int size) +{ + int err; + enum mlx5_alloc_type alloc_type; + + mlx5_get_alloc_type(to_mctx(context), pd, MLX5_RWQ_PREFIX, + &alloc_type, MLX5_ALLOC_TYPE_ANON); + + rwq->rq.wrid = malloc(rwq->rq.wqe_cnt * sizeof(uint64_t)); + if (!rwq->rq.wrid) { + errno = ENOMEM; + return -1; + } + + if (alloc_type == 
MLX5_ALLOC_TYPE_CUSTOM) { + rwq->buf.mparent_domain = to_mparent_domain(pd); + rwq->buf.req_alignment = to_mdev(context->device)->page_size; + rwq->buf.resource_type = MLX5DV_RES_TYPE_RWQ; + } + + err = mlx5_alloc_prefered_buf(to_mctx(context), &rwq->buf, + align(rwq->buf_size, to_mdev + (context->device)->page_size), + to_mdev(context->device)->page_size, + alloc_type, + MLX5_RWQ_PREFIX); + + if (err) { + free(rwq->rq.wrid); + errno = ENOMEM; + return -1; + } + + return 0; +} + +static struct ibv_wq *create_wq(struct ibv_context *context, + struct ibv_wq_init_attr *attr, + struct mlx5dv_wq_init_attr *mlx5wq_attr) +{ + struct mlx5_create_wq cmd; + struct mlx5_create_wq_resp resp; + int err; + struct mlx5_rwq *rwq; + struct mlx5_context *ctx = to_mctx(context); + int ret; + int32_t usr_idx = 0; + FILE *fp = ctx->dbg_fp; + + if (attr->wq_type != IBV_WQT_RQ) + return NULL; + + memset(&cmd, 0, sizeof(cmd)); + memset(&resp, 0, sizeof(resp)); + + rwq = calloc(1, sizeof(*rwq)); + if (!rwq) + return NULL; + + rwq->wq_sig = rwq_sig_enabled(context); + if (rwq->wq_sig) + cmd.flags = MLX5_WQ_FLAG_SIGNATURE; + + ret = mlx5_calc_rwq_size(ctx, rwq, attr, mlx5wq_attr); + if (ret < 0) { + errno = -ret; + goto err; + } + + rwq->buf_size = ret; + if (mlx5_alloc_rwq_buf(context, attr->pd, rwq, ret)) + goto err; + + mlx5_init_rwq_indices(rwq); + + if (mlx5_spinlock_init_pd(&rwq->rq.lock, attr->pd)) + goto err_free_rwq_buf; + + rwq->db = mlx5_alloc_dbrec(ctx, attr->pd, &rwq->custom_db); + if (!rwq->db) + goto err_free_rwq_buf; + + if (!rwq->custom_db) { + rwq->db[MLX5_RCV_DBR] = 0; + rwq->db[MLX5_SND_DBR] = 0; + } + + rwq->pbuff = rwq->buf.buf + rwq->rq.offset; + rwq->recv_db = &rwq->db[MLX5_RCV_DBR]; + cmd.buf_addr = (uintptr_t)rwq->buf.buf; + cmd.db_addr = (uintptr_t)rwq->db; + cmd.rq_wqe_count = rwq->rq.wqe_cnt; + cmd.rq_wqe_shift = rwq->rq.wqe_shift; + usr_idx = mlx5_store_uidx(ctx, rwq); + if (usr_idx < 0) { + mlx5_dbg(fp, MLX5_DBG_QP, "Couldn't find free user index\n"); + goto err_free_db_rec; + } + + cmd.user_index = usr_idx; + + if (mlx5wq_attr) { + if (mlx5wq_attr->comp_mask & MLX5DV_WQ_INIT_ATTR_MASK_STRIDING_RQ) { + if ((mlx5wq_attr->striding_rq_attrs.single_stride_log_num_of_bytes < + ctx->striding_rq_caps.min_single_stride_log_num_of_bytes) || + (mlx5wq_attr->striding_rq_attrs.single_stride_log_num_of_bytes > + ctx->striding_rq_caps.max_single_stride_log_num_of_bytes)) { + errno = EINVAL; + goto err_create; + } + + if ((mlx5wq_attr->striding_rq_attrs.single_wqe_log_num_of_strides < + ctx->striding_rq_caps.min_single_wqe_log_num_of_strides) || + (mlx5wq_attr->striding_rq_attrs.single_wqe_log_num_of_strides > + ctx->striding_rq_caps.max_single_wqe_log_num_of_strides)) { + errno = EINVAL; + goto err_create; + } + + cmd.single_stride_log_num_of_bytes = + mlx5wq_attr->striding_rq_attrs.single_stride_log_num_of_bytes; + cmd.single_wqe_log_num_of_strides = + mlx5wq_attr->striding_rq_attrs.single_wqe_log_num_of_strides; + cmd.two_byte_shift_en = + mlx5wq_attr->striding_rq_attrs.two_byte_shift_en; + cmd.comp_mask |= MLX5_IB_CREATE_WQ_STRIDING_RQ; + } + } + + err = ibv_cmd_create_wq(context, attr, &rwq->wq, &cmd.ibv_cmd, + sizeof(cmd), &resp.ibv_resp, sizeof(resp)); + if (err) + goto err_create; + + rwq->rsc.type = MLX5_RSC_TYPE_RWQ; + rwq->rsc.rsn = cmd.user_index; + + rwq->wq.post_recv = mlx5_post_wq_recv; + return &rwq->wq; + +err_create: + mlx5_clear_uidx(ctx, cmd.user_index); +err_free_db_rec: + mlx5_free_db(to_mctx(context), rwq->db, attr->pd, rwq->custom_db); +err_free_rwq_buf: + 
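+	/* unwind in reverse order of allocation */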
mlx5_free_rwq_buf(rwq, context); +err: + free(rwq); + return NULL; +} + +struct ibv_wq *mlx5_create_wq(struct ibv_context *context, + struct ibv_wq_init_attr *attr) +{ + return create_wq(context, attr, NULL); +} + +struct ibv_wq *mlx5dv_create_wq(struct ibv_context *context, + struct ibv_wq_init_attr *attr, + struct mlx5dv_wq_init_attr *mlx5_wq_attr) +{ + if (!is_mlx5_dev(context->device)) { + errno = EOPNOTSUPP; + return NULL; + } + + return create_wq(context, attr, mlx5_wq_attr); +} + +int mlx5_modify_wq(struct ibv_wq *wq, struct ibv_wq_attr *attr) +{ + struct mlx5_modify_wq cmd = {}; + struct mlx5_rwq *rwq = to_mrwq(wq); + + if ((attr->attr_mask & IBV_WQ_ATTR_STATE) && + attr->wq_state == IBV_WQS_RDY) { + if ((attr->attr_mask & IBV_WQ_ATTR_CURR_STATE) && + attr->curr_wq_state != wq->state) + return -EINVAL; + + if (wq->state == IBV_WQS_RESET) { + mlx5_spin_lock(&to_mcq(wq->cq)->lock); + __mlx5_cq_clean(to_mcq(wq->cq), + rwq->rsc.rsn, NULL); + mlx5_spin_unlock(&to_mcq(wq->cq)->lock); + mlx5_init_rwq_indices(rwq); + rwq->db[MLX5_RCV_DBR] = 0; + rwq->db[MLX5_SND_DBR] = 0; + } + } + + return ibv_cmd_modify_wq(wq, attr, &cmd.ibv_cmd, sizeof(cmd)); +} + +int mlx5_destroy_wq(struct ibv_wq *wq) +{ + struct mlx5_rwq *rwq = to_mrwq(wq); + int ret; + + ret = ibv_cmd_destroy_wq(wq); + if (ret) + return ret; + + mlx5_spin_lock(&to_mcq(wq->cq)->lock); + __mlx5_cq_clean(to_mcq(wq->cq), rwq->rsc.rsn, NULL); + mlx5_spin_unlock(&to_mcq(wq->cq)->lock); + mlx5_clear_uidx(to_mctx(wq->context), rwq->rsc.rsn); + mlx5_free_db(to_mctx(wq->context), rwq->db, wq->pd, rwq->custom_db); + mlx5_free_rwq_buf(rwq, wq->context); + free(rwq); + + return 0; +} + +static void free_flow_counters_descriptions(struct mlx5_ib_create_flow *cmd) +{ + int i; + + for (i = 0; i < cmd->ncounters_data; i++) + free(cmd->data[i].counters_data); +} + +static int get_flow_mcounters(struct mlx5_flow *mflow, + struct ibv_flow_attr *flow_attr, + struct mlx5_counters **mcounters, + uint32_t *data_size) +{ + struct ibv_flow_spec *ib_spec; + uint32_t ncounters_used = 0; + int i; + + ib_spec = (struct ibv_flow_spec *)(flow_attr + 1); + for (i = 0; i < flow_attr->num_of_specs; i++, ib_spec = (void *)ib_spec + ib_spec->hdr.size) { + if (ib_spec->hdr.type != IBV_FLOW_SPEC_ACTION_COUNT) + continue; + + /* currently support only one counters data */ + if (ncounters_used > 0) + return EINVAL; + + *mcounters = to_mcounters(ib_spec->flow_count.counters); + ncounters_used++; + } + + *data_size = ncounters_used * sizeof(struct mlx5_ib_flow_counters_data); + return 0; +} + +static int allocate_flow_counters_descriptions(struct mlx5_counters *mcounters, + struct mlx5_ib_create_flow *cmd) +{ + struct mlx5_ib_flow_counters_data *mcntrs_data; + struct mlx5_ib_flow_counters_desc *cntrs_data; + struct mlx5_counter_node *cntr_node; + uint32_t ncounters; + int j = 0; + + mcntrs_data = cmd->data; + ncounters = mcounters->ncounters; + + /* mlx5_attach_counters_point_flow was never called */ + if (!ncounters) + return EINVAL; + + /* each counter has both index and description */ + cntrs_data = calloc(ncounters, sizeof(*cntrs_data)); + if (!cntrs_data) + return ENOMEM; + + list_for_each(&mcounters->counters_list, cntr_node, entry) { + cntrs_data[j].description = cntr_node->desc; + cntrs_data[j].index = cntr_node->index; + j++; + } + + scrub_ptr_attr(cntrs_data); + mcntrs_data[cmd->ncounters_data].counters_data = cntrs_data; + mcntrs_data[cmd->ncounters_data].ncounters = ncounters; + cmd->ncounters_data++; + + return 0; +} + +struct ibv_flow *mlx5_create_flow(struct 
ibv_qp *qp, struct ibv_flow_attr *flow_attr) +{ + struct mlx5_ib_create_flow *cmd; + uint32_t required_cmd_size = 0; + struct ibv_flow *flow_id; + struct mlx5_flow *mflow; + int ret; + + mflow = calloc(1, sizeof(*mflow)); + if (!mflow) { + errno = ENOMEM; + return NULL; + } + + ret = get_flow_mcounters(mflow, flow_attr, &mflow->mcounters, &required_cmd_size); + if (ret) { + errno = ret; + goto err_get_mcounters; + } + + required_cmd_size += sizeof(*cmd); + cmd = calloc(1, required_cmd_size); + if (!cmd) { + errno = ENOMEM; + goto err_get_mcounters; + } + + if (mflow->mcounters) { + pthread_mutex_lock(&mflow->mcounters->lock); + /* if the counters already bound no need to pass its description */ + if (!mflow->mcounters->refcount) { + ret = allocate_flow_counters_descriptions(mflow->mcounters, cmd); + if (ret) { + errno = ret; + goto err_desc_alloc; + } + } + } + + flow_id = &mflow->flow_id; + ret = ibv_cmd_create_flow(qp, flow_id, flow_attr, + cmd, required_cmd_size); + if (ret) + goto err_create_flow; + + if (mflow->mcounters) { + free_flow_counters_descriptions(cmd); + mflow->mcounters->refcount++; + pthread_mutex_unlock(&mflow->mcounters->lock); + } + + free(cmd); + + return flow_id; + +err_create_flow: + if (mflow->mcounters) { + free_flow_counters_descriptions(cmd); + pthread_mutex_unlock(&mflow->mcounters->lock); + } +err_desc_alloc: + free(cmd); +err_get_mcounters: + free(mflow); + return NULL; +} + +int mlx5_destroy_flow(struct ibv_flow *flow_id) +{ + struct mlx5_flow *mflow = to_mflow(flow_id); + int ret; + + ret = ibv_cmd_destroy_flow(flow_id); + if (ret) + return ret; + + if (mflow->mcounters) { + pthread_mutex_lock(&mflow->mcounters->lock); + mflow->mcounters->refcount--; + pthread_mutex_unlock(&mflow->mcounters->lock); + } + + free(mflow); + return 0; +} + +struct ibv_rwq_ind_table *mlx5_create_rwq_ind_table(struct ibv_context *context, + struct ibv_rwq_ind_table_init_attr *init_attr) +{ + struct mlx5_create_rwq_ind_table_resp resp; + struct ibv_rwq_ind_table *ind_table; + int err; + + memset(&resp, 0, sizeof(resp)); + ind_table = calloc(1, sizeof(*ind_table)); + if (!ind_table) + return NULL; + + err = ibv_cmd_create_rwq_ind_table(context, init_attr, ind_table, + &resp.ibv_resp, sizeof(resp)); + if (err) + goto err; + + return ind_table; + +err: + free(ind_table); + return NULL; +} + +int mlx5_destroy_rwq_ind_table(struct ibv_rwq_ind_table *rwq_ind_table) +{ + int ret; + + ret = ibv_cmd_destroy_rwq_ind_table(rwq_ind_table); + + if (ret) + return ret; + + free(rwq_ind_table); + return 0; +} + +int mlx5_modify_cq(struct ibv_cq *cq, struct ibv_modify_cq_attr *attr) +{ + struct ibv_modify_cq cmd = {}; + + return ibv_cmd_modify_cq(cq, attr, &cmd, sizeof(cmd)); +} + +static struct ibv_flow_action *_mlx5_create_flow_action_esp(struct ibv_context *ctx, + struct ibv_flow_action_esp_attr *attr, + struct ibv_command_buffer *driver_attr) +{ + struct verbs_flow_action *action; + int ret; + + if (!check_comp_mask(attr->comp_mask, IBV_FLOW_ACTION_ESP_MASK_ESN)) { + errno = EOPNOTSUPP; + return NULL; + } + + action = calloc(1, sizeof(*action)); + if (!action) { + errno = ENOMEM; + return NULL; + } + + ret = ibv_cmd_create_flow_action_esp(ctx, attr, action, driver_attr); + if (ret) { + free(action); + return NULL; + } + + return &action->action; +} + +struct ibv_flow_action *mlx5_create_flow_action_esp(struct ibv_context *ctx, + struct ibv_flow_action_esp_attr *attr) +{ + return _mlx5_create_flow_action_esp(ctx, attr, NULL); +} + +struct ibv_flow_action *mlx5dv_create_flow_action_esp(struct 
ibv_context *ctx, + struct ibv_flow_action_esp_attr *esp, + struct mlx5dv_flow_action_esp *mlx5_attr) +{ + DECLARE_COMMAND_BUFFER_LINK(driver_attr, UVERBS_OBJECT_FLOW_ACTION, + UVERBS_METHOD_FLOW_ACTION_ESP_CREATE, 1, + NULL); + + if (!check_comp_mask(mlx5_attr->comp_mask, + MLX5DV_FLOW_ACTION_ESP_MASK_FLAGS)) { + errno = EOPNOTSUPP; + return NULL; + } + + if (mlx5_attr->comp_mask & MLX5DV_FLOW_ACTION_ESP_MASK_FLAGS) { + if (!check_comp_mask(mlx5_attr->action_flags, + MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA)) { + errno = EOPNOTSUPP; + return NULL; + } + fill_attr_in_uint64(driver_attr, MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS, + mlx5_attr->action_flags); + } + + return _mlx5_create_flow_action_esp(ctx, esp, driver_attr); +} + +int mlx5_modify_flow_action_esp(struct ibv_flow_action *action, + struct ibv_flow_action_esp_attr *attr) +{ + struct verbs_flow_action *vaction = + container_of(action, struct verbs_flow_action, action); + + if (!check_comp_mask(attr->comp_mask, IBV_FLOW_ACTION_ESP_MASK_ESN)) + return EOPNOTSUPP; + + return ibv_cmd_modify_flow_action_esp(vaction, attr, NULL); +} + +struct ibv_flow_action *mlx5dv_create_flow_action_modify_header(struct ibv_context *ctx, + size_t actions_sz, + uint64_t actions[], + enum mlx5dv_flow_table_type ft_type) +{ + DECLARE_COMMAND_BUFFER(cmd, UVERBS_OBJECT_FLOW_ACTION, + MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER, + 3); + struct ib_uverbs_attr *handle = fill_attr_out_obj(cmd, + MLX5_IB_ATTR_CREATE_MODIFY_HEADER_HANDLE); + struct verbs_flow_action *action; + int ret; + + fill_attr_in(cmd, MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM, + actions, actions_sz); + fill_attr_const_in(cmd, MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE, + ft_type); + + action = calloc(1, sizeof(*action)); + if (!action) { + errno = ENOMEM; + return NULL; + } + + ret = execute_ioctl(ctx, cmd); + if (ret) { + free(action); + return NULL; + } + + action->action.context = ctx; + action->type = IBV_FLOW_ACTION_UNSPECIFIED; + action->handle = read_attr_obj(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_HANDLE, + handle); + + return &action->action; +} + +struct ibv_flow_action * +mlx5dv_create_flow_action_packet_reformat(struct ibv_context *ctx, + size_t data_sz, + void *data, + enum mlx5dv_flow_action_packet_reformat_type reformat_type, + enum mlx5dv_flow_table_type ft_type) +{ + DECLARE_COMMAND_BUFFER(cmd, UVERBS_OBJECT_FLOW_ACTION, + MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT, 4); + struct ib_uverbs_attr *handle = fill_attr_out_obj(cmd, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_HANDLE); + struct verbs_flow_action *action; + int ret; + + if ((!data && data_sz) || (data && !data_sz)) { + errno = EINVAL; + return NULL; + } + + if (data && data_sz) + fill_attr_in(cmd, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF, + data, data_sz); + + fill_attr_const_in(cmd, MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_TYPE, + reformat_type); + + fill_attr_const_in(cmd, MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_FT_TYPE, + ft_type); + + action = calloc(1, sizeof(*action)); + if (!action) { + errno = ENOMEM; + return NULL; + } + + ret = execute_ioctl(ctx, cmd); + if (ret) { + free(action); + return NULL; + } + + action->action.context = ctx; + action->type = IBV_FLOW_ACTION_UNSPECIFIED; + action->handle = read_attr_obj(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_HANDLE, + handle); + + return &action->action; +} + +int mlx5_destroy_flow_action(struct ibv_flow_action *action) +{ + struct verbs_flow_action *vaction = + container_of(action, struct verbs_flow_action, action); + int ret = 
ibv_cmd_destroy_flow_action(vaction); + + if (!ret) + free(action); + + return ret; +} + +static inline int mlx5_access_dm(struct ibv_dm *ibdm, uint64_t dm_offset, + void *host_addr, size_t length, + uint32_t read) +{ + struct mlx5_dm *dm = to_mdm(ibdm); + atomic_uint32_t *dm_ptr = + (atomic_uint32_t *)dm->start_va + dm_offset / 4; + uint32_t *host_ptr = host_addr; + const uint32_t *host_end = host_ptr + length / 4; + + if (dm_offset + length > dm->length) + return EFAULT; + + /* Due to HW limitation, DM access address and length must be aligned + * to 4 bytes. + */ + if ((length & 3) || (dm_offset & 3)) + return EINVAL; + + /* Copy granularity should be 4 Bytes since we enforce copy size to be + * a multiple of 4 bytes. + */ + if (read) { + while (host_ptr != host_end) { + *host_ptr = atomic_load_explicit(dm_ptr, + memory_order_relaxed); + host_ptr++; + dm_ptr++; + } + } else { + while (host_ptr != host_end) { + atomic_store_explicit(dm_ptr, *host_ptr, + memory_order_relaxed); + host_ptr++; + dm_ptr++; + } + } + + return 0; +} +static inline int mlx5_memcpy_to_dm(struct ibv_dm *ibdm, uint64_t dm_offset, + const void *host_addr, size_t length) +{ + return mlx5_access_dm(ibdm, dm_offset, (void *)host_addr, length, 0); +} + +static inline int mlx5_memcpy_from_dm(void *host_addr, struct ibv_dm *ibdm, + uint64_t dm_offset, size_t length) +{ + return mlx5_access_dm(ibdm, dm_offset, host_addr, length, 1); +} + +static int alloc_dm_memic(struct ibv_context *ctx, + struct mlx5_dm *dm, + struct ibv_alloc_dm_attr *dm_attr, + struct ibv_command_buffer *cmdb) +{ + int page_size = to_mdev(ctx->device)->page_size; + uint64_t act_size = align(dm_attr->length, page_size); + uint64_t start_offset; + uint16_t page_idx; + off_t offset = 0; + void *va; + + if (dm_attr->length > to_mctx(ctx)->max_dm_size) { + errno = EINVAL; + return errno; + } + + fill_attr_out(cmdb, MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET, + &start_offset, sizeof(start_offset)); + + fill_attr_out(cmdb, MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX, + &page_idx, sizeof(page_idx)); + + if (ibv_cmd_alloc_dm(ctx, dm_attr, &dm->verbs_dm, cmdb)) + return EINVAL; + + set_command(MLX5_IB_MMAP_DEVICE_MEM, &offset); + set_extended_index(page_idx, &offset); + va = mmap(NULL, act_size, PROT_READ | PROT_WRITE, + MAP_SHARED, ctx->cmd_fd, + page_size * offset); + if (va == MAP_FAILED) { + ibv_cmd_free_dm(&dm->verbs_dm); + return ENOMEM; + } + + dm->mmap_va = va; + dm->start_va = va + (start_offset & (page_size - 1)); + dm->verbs_dm.dm.memcpy_to_dm = mlx5_memcpy_to_dm; + dm->verbs_dm.dm.memcpy_from_dm = mlx5_memcpy_from_dm; + + return 0; +} + +static int alloc_dm_steering_sw_icm(struct ibv_context *ctx, + struct mlx5_dm *dm, + struct ibv_alloc_dm_attr *dm_attr, + struct ibv_command_buffer *cmdb) +{ + uint64_t start_offset; + + fill_attr_out(cmdb, MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET, + &start_offset, sizeof(start_offset)); + + if (ibv_cmd_alloc_dm(ctx, dm_attr, &dm->verbs_dm, cmdb)) + return EINVAL; + + /* For SW ICM we get address in the start_offset attribute */ + dm->remote_va = start_offset; + + return 0; +} + +struct ibv_dm * +mlx5dv_alloc_dm(struct ibv_context *context, + struct ibv_alloc_dm_attr *dm_attr, + struct mlx5dv_alloc_dm_attr *mlx5_dm_attr) +{ + DECLARE_COMMAND_BUFFER(cmdb, UVERBS_OBJECT_DM, UVERBS_METHOD_DM_ALLOC, + 3); + struct ib_uverbs_attr *type_attr; + struct mlx5_dm *dm; + int err; + + if ((mlx5_dm_attr->type != MLX5DV_DM_TYPE_MEMIC) && + (mlx5_dm_attr->type != MLX5DV_DM_TYPE_STEERING_SW_ICM) && + (mlx5_dm_attr->type != 
MLX5DV_DM_TYPE_HEADER_MODIFY_SW_ICM)) { + errno = EOPNOTSUPP; + return NULL; + } + + if (!check_comp_mask(dm_attr->comp_mask, 0) || + !check_comp_mask(mlx5_dm_attr->comp_mask, 0)) { + errno = EINVAL; + return NULL; + } + + dm = calloc(1, sizeof(*dm)); + if (!dm) { + errno = ENOMEM; + return NULL; + } + + type_attr = fill_attr_const_in(cmdb, MLX5_IB_ATTR_ALLOC_DM_REQ_TYPE, + mlx5_dm_attr->type); + + if (mlx5_dm_attr->type == MLX5DV_DM_TYPE_MEMIC) { + attr_optional(type_attr); + err = alloc_dm_memic(context, dm, dm_attr, cmdb); + } else { + err = alloc_dm_steering_sw_icm(context, dm, dm_attr, cmdb); + } + + if (err) + goto err_free_mem; + + dm->length = dm_attr->length; + + return &dm->verbs_dm.dm; + +err_free_mem: + free(dm); + + return NULL; +} + +int mlx5_free_dm(struct ibv_dm *ibdm) +{ + struct mlx5_device *mdev = to_mdev(ibdm->context->device); + struct mlx5_dm *dm = to_mdm(ibdm); + size_t act_size = align(dm->length, mdev->page_size); + int ret; + + ret = ibv_cmd_free_dm(&dm->verbs_dm); + + if (ret) + return ret; + + if (dm->mmap_va) + munmap(dm->mmap_va, act_size); + free(dm); + return 0; +} + +struct ibv_dm *mlx5_alloc_dm(struct ibv_context *context, + struct ibv_alloc_dm_attr *dm_attr) +{ + struct mlx5dv_alloc_dm_attr mlx5_attr = { .type = MLX5DV_DM_TYPE_MEMIC }; + + return mlx5dv_alloc_dm(context, dm_attr, &mlx5_attr); +} + +struct ibv_counters *mlx5_create_counters(struct ibv_context *context, + struct ibv_counters_init_attr *init_attr) +{ + struct mlx5_counters *mcntrs; + int ret; + + if (!check_comp_mask(init_attr->comp_mask, 0)) { + errno = EOPNOTSUPP; + return NULL; + } + + mcntrs = calloc(1, sizeof(*mcntrs)); + if (!mcntrs) { + errno = ENOMEM; + return NULL; + } + + pthread_mutex_init(&mcntrs->lock, NULL); + ret = ibv_cmd_create_counters(context, + init_attr, + &mcntrs->vcounters, + NULL); + if (ret) + goto err_create; + + list_head_init(&mcntrs->counters_list); + + return &mcntrs->vcounters.counters; + +err_create: + free(mcntrs); + return NULL; +} + +int mlx5_destroy_counters(struct ibv_counters *counters) +{ + struct mlx5_counters *mcntrs = to_mcounters(counters); + struct mlx5_counter_node *tmp, *cntrs_node; + int ret; + + ret = ibv_cmd_destroy_counters(&mcntrs->vcounters); + if (ret) + return ret; + + list_for_each_safe(&mcntrs->counters_list, cntrs_node, tmp, entry) { + list_del(&cntrs_node->entry); + free(cntrs_node); + } + + free(mcntrs); + return 0; +} + +int mlx5_attach_counters_point_flow(struct ibv_counters *counters, + struct ibv_counter_attach_attr *attr, + struct ibv_flow *flow) +{ + struct mlx5_counters *mcntrs = to_mcounters(counters); + struct mlx5_counter_node *cntrs_node; + int ret; + + /* The driver supports only the static binding mode as part of ibv_create_flow */ + if (flow) + return ENOTSUP; + + if (!check_comp_mask(attr->comp_mask, 0)) + return EOPNOTSUPP; + + /* Check whether the attached counter is supported */ + if (attr->counter_desc < IBV_COUNTER_PACKETS || + attr->counter_desc > IBV_COUNTER_BYTES) + return ENOTSUP; + + cntrs_node = calloc(1, sizeof(*cntrs_node)); + if (!cntrs_node) + return ENOMEM; + + pthread_mutex_lock(&mcntrs->lock); + /* The counter is bound to a flow, attach is not allowed */ + if (mcntrs->refcount) { + ret = EBUSY; + goto err_already_bound; + } + + cntrs_node->index = attr->index; + cntrs_node->desc = attr->counter_desc; + list_add(&mcntrs->counters_list, &cntrs_node->entry); + mcntrs->ncounters++; + pthread_mutex_unlock(&mcntrs->lock); + + return 0; + +err_already_bound: + pthread_mutex_unlock(&mcntrs->lock); + 
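+	/* the node was never linked into counters_list, so just free it */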
free(cntrs_node); + return ret; +} + +int mlx5_read_counters(struct ibv_counters *counters, + uint64_t *counters_value, + uint32_t ncounters, + uint32_t flags) +{ + struct mlx5_counters *mcntrs = to_mcounters(counters); + + return ibv_cmd_read_counters(&mcntrs->vcounters, + counters_value, + ncounters, + flags, + NULL); + +} + +struct mlx5dv_flow_matcher * +mlx5dv_create_flow_matcher(struct ibv_context *context, + struct mlx5dv_flow_matcher_attr *attr) +{ + DECLARE_COMMAND_BUFFER(cmd, MLX5_IB_OBJECT_FLOW_MATCHER, + MLX5_IB_METHOD_FLOW_MATCHER_CREATE, + 6); + struct mlx5dv_flow_matcher *flow_matcher; + struct ib_uverbs_attr *handle; + int ret; + + if (!check_comp_mask(attr->comp_mask, + MLX5DV_FLOW_MATCHER_MASK_FT_TYPE)) { + errno = EOPNOTSUPP; + return NULL; + } + + flow_matcher = calloc(1, sizeof(*flow_matcher)); + if (!flow_matcher) { + errno = ENOMEM; + return NULL; + } + + if (attr->type != IBV_FLOW_ATTR_NORMAL) { + errno = EOPNOTSUPP; + goto err; + } + + handle = fill_attr_out_obj(cmd, MLX5_IB_ATTR_FLOW_MATCHER_CREATE_HANDLE); + fill_attr_in(cmd, MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK, + attr->match_mask->match_buf, + attr->match_mask->match_sz); + fill_attr_in(cmd, MLX5_IB_ATTR_FLOW_MATCHER_MATCH_CRITERIA, + &attr->match_criteria_enable, sizeof(attr->match_criteria_enable)); + fill_attr_in_enum(cmd, MLX5_IB_ATTR_FLOW_MATCHER_FLOW_TYPE, + IBV_FLOW_ATTR_NORMAL, &attr->priority, + sizeof(attr->priority)); + + if (attr->comp_mask & MLX5DV_FLOW_MATCHER_MASK_FT_TYPE) + fill_attr_const_in(cmd, MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE, + attr->ft_type); + if (attr->flags) + fill_attr_const_in(cmd, MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS, + attr->flags); + + ret = execute_ioctl(context, cmd); + if (ret) + goto err; + + flow_matcher->context = context; + flow_matcher->handle = read_attr_obj(MLX5_IB_ATTR_FLOW_MATCHER_CREATE_HANDLE, handle); + + return flow_matcher; + +err: + free(flow_matcher); + return NULL; +} + +int mlx5dv_destroy_flow_matcher(struct mlx5dv_flow_matcher *flow_matcher) +{ + DECLARE_COMMAND_BUFFER(cmd, MLX5_IB_OBJECT_FLOW_MATCHER, + MLX5_IB_METHOD_FLOW_MATCHER_DESTROY, + 1); + int ret; + + fill_attr_in_obj(cmd, MLX5_IB_ATTR_FLOW_MATCHER_DESTROY_HANDLE, flow_matcher->handle); + ret = execute_ioctl(flow_matcher->context, cmd); + verbs_is_destroy_err(&ret); + + if (ret) + return ret; + + free(flow_matcher); + return 0; +} + +#define CREATE_FLOW_MAX_FLOW_ACTIONS_SUPPORTED 8 +struct ibv_flow * +__mlx5dv_create_flow(struct mlx5dv_flow_matcher *flow_matcher, + struct mlx5dv_flow_match_parameters *match_value, + size_t num_actions, + struct mlx5dv_flow_action_attr actions_attr[], + struct mlx5_flow_action_attr_aux actions_attr_aux[]) +{ + uint32_t flow_actions[CREATE_FLOW_MAX_FLOW_ACTIONS_SUPPORTED]; + struct verbs_flow_action *vaction; + int num_flow_actions = 0; + struct mlx5_flow *mflow; + bool have_qp = false; + bool have_dest_devx = false; + bool have_flow_tag = false; + bool have_counter = false; + int ret; + int i; + DECLARE_COMMAND_BUFFER(cmd, UVERBS_OBJECT_FLOW, + MLX5_IB_METHOD_CREATE_FLOW, + 8); + struct ib_uverbs_attr *handle; + enum mlx5dv_flow_action_type type; + + mflow = calloc(1, sizeof(*mflow)); + if (!mflow) { + errno = ENOMEM; + return NULL; + } + + handle = fill_attr_out_obj(cmd, MLX5_IB_ATTR_CREATE_FLOW_HANDLE); + fill_attr_in(cmd, MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE, + match_value->match_buf, + match_value->match_sz); + fill_attr_in_obj(cmd, MLX5_IB_ATTR_CREATE_FLOW_MATCHER, flow_matcher->handle); + + for (i = 0; i < num_actions; i++) { + type = actions_attr[i].type; + 
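+		/*
+		 * Translate each action into its uverbs attribute. A QP
+		 * destination and a DEVX destination are mutually exclusive,
+		 * at most one flow tag and one counter are accepted, and no
+		 * more than CREATE_FLOW_MAX_FLOW_ACTIONS_SUPPORTED generic
+		 * flow actions may be attached to a single flow.
+		 */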
switch (type) { + case MLX5DV_FLOW_ACTION_DEST_IBV_QP: + if (have_qp || have_dest_devx) { + errno = EOPNOTSUPP; + goto err; + } + fill_attr_in_obj(cmd, MLX5_IB_ATTR_CREATE_FLOW_DEST_QP, + actions_attr[i].qp->handle); + have_qp = true; + break; + case MLX5DV_FLOW_ACTION_IBV_FLOW_ACTION: + if (num_flow_actions == + CREATE_FLOW_MAX_FLOW_ACTIONS_SUPPORTED) { + errno = EOPNOTSUPP; + goto err; + } + vaction = container_of(actions_attr[i].action, + struct verbs_flow_action, + action); + + flow_actions[num_flow_actions] = vaction->handle; + num_flow_actions++; + break; + case MLX5DV_FLOW_ACTION_DEST_DEVX: + if (have_dest_devx || have_qp) { + errno = EOPNOTSUPP; + goto err; + } + fill_attr_in_obj(cmd, MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX, + actions_attr[i].obj->handle); + have_dest_devx = true; + break; + case MLX5DV_FLOW_ACTION_TAG: + if (have_flow_tag) { + errno = EINVAL; + goto err; + } + fill_attr_in_uint32(cmd, + MLX5_IB_ATTR_CREATE_FLOW_TAG, + actions_attr[i].tag_value); + have_flow_tag = true; + break; + case MLX5DV_FLOW_ACTION_COUNTERS_DEVX: + if (have_counter) { + errno = EOPNOTSUPP; + goto err; + } + fill_attr_in_objs_arr(cmd, + MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, + &actions_attr[i].obj->handle, 1); + + if (actions_attr_aux && + actions_attr_aux[i].type == MLX5_FLOW_ACTION_COUNTER_OFFSET) + fill_attr_in_ptr_array(cmd, + MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET, + &actions_attr_aux[i].offset, 1); + + have_counter = true; + break; + default: + errno = EOPNOTSUPP; + goto err; + } + } + + if (num_flow_actions) + fill_attr_in_objs_arr(cmd, + MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, + flow_actions, + num_flow_actions); + ret = execute_ioctl(flow_matcher->context, cmd); + if (ret) + goto err; + + mflow->flow_id.handle = read_attr_obj(MLX5_IB_ATTR_CREATE_FLOW_HANDLE, handle); + mflow->flow_id.context = flow_matcher->context; + return &mflow->flow_id; +err: + free(mflow); + return NULL; +} + +struct ibv_flow * +mlx5dv_create_flow(struct mlx5dv_flow_matcher *flow_matcher, + struct mlx5dv_flow_match_parameters *match_value, + size_t num_actions, + struct mlx5dv_flow_action_attr actions_attr[]) +{ + return __mlx5dv_create_flow(flow_matcher, + match_value, + num_actions, + actions_attr, + NULL); +} + +struct mlx5dv_devx_umem * +mlx5dv_devx_umem_reg(struct ibv_context *context, void *addr, size_t size, uint32_t access) +{ + DECLARE_COMMAND_BUFFER(cmd, + MLX5_IB_OBJECT_DEVX_UMEM, + MLX5_IB_METHOD_DEVX_UMEM_REG, + 5); + struct ib_uverbs_attr *handle; + struct mlx5_devx_umem *umem; + int ret; + + umem = calloc(1, sizeof(*umem)); + if (!umem) { + errno = ENOMEM; + return NULL; + } + + fill_attr_in_uint64(cmd, MLX5_IB_ATTR_DEVX_UMEM_REG_ADDR, (intptr_t)addr); + fill_attr_in_uint64(cmd, MLX5_IB_ATTR_DEVX_UMEM_REG_LEN, size); + fill_attr_in_uint32(cmd, MLX5_IB_ATTR_DEVX_UMEM_REG_ACCESS, access); + fill_attr_out(cmd, MLX5_IB_ATTR_DEVX_UMEM_REG_OUT_ID, + &umem->dv_devx_umem.umem_id, + sizeof(umem->dv_devx_umem.umem_id)); + handle = fill_attr_out_obj(cmd, MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE); + + ret = execute_ioctl(context, cmd); + if (ret) + goto err; + + umem->handle = read_attr_obj(MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE, handle); + umem->context = context; + + return &umem->dv_devx_umem; +err: + free(umem); + return NULL; +} + +int mlx5dv_devx_umem_dereg(struct mlx5dv_devx_umem *dv_devx_umem) +{ + DECLARE_COMMAND_BUFFER(cmd, + MLX5_IB_OBJECT_DEVX_UMEM, + MLX5_IB_METHOD_DEVX_UMEM_DEREG, + 1); + int ret; + struct mlx5_devx_umem *umem = container_of(dv_devx_umem, struct mlx5_devx_umem, + 
dv_devx_umem); + + fill_attr_in_obj(cmd, MLX5_IB_ATTR_DEVX_UMEM_DEREG_HANDLE, umem->handle); + ret = execute_ioctl(umem->context, cmd); + if (ret) + return ret; + + free(umem); + return 0; +} + +static void set_devx_obj_info(const void *in, const void *out, + struct mlx5dv_devx_obj *obj) +{ + uint16_t opcode; + uint16_t obj_type; + + opcode = DEVX_GET(general_obj_in_cmd_hdr, in, opcode); + + switch (opcode) { + case MLX5_CMD_OP_CREATE_FLOW_TABLE: + obj->type = MLX5_DEVX_FLOW_TABLE; + obj->object_id = DEVX_GET(create_flow_table_out, out, table_id); + break; + case MLX5_CMD_OP_CREATE_FLOW_COUNTER: + obj->type = MLX5_DEVX_FLOW_COUNTER; + obj->object_id = DEVX_GET(alloc_flow_counter_out, out, flow_counter_id); + break; + case MLX5_CMD_OP_CREATE_GENERAL_OBJECT: + obj_type = DEVX_GET(general_obj_in_cmd_hdr, in, obj_type); + if (obj_type == MLX5_OBJ_TYPE_FLOW_METER) + obj->type = MLX5_DEVX_FLOW_METER; + + obj->object_id = DEVX_GET(general_obj_out_cmd_hdr, out, obj_id); + break; + case MLX5_CMD_OP_CREATE_QP: + obj->type = MLX5_DEVX_QP; + obj->object_id = DEVX_GET(create_qp_out, out, qpn); + break; + case MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT: + obj->type = MLX5_DEVX_PKT_REFORMAT_CTX; + obj->object_id = DEVX_GET(alloc_packet_reformat_context_out, + out, packet_reformat_id); + break; + default: + break; + } +} + +struct mlx5dv_devx_obj * +mlx5dv_devx_obj_create(struct ibv_context *context, const void *in, size_t inlen, + void *out, size_t outlen) +{ + DECLARE_COMMAND_BUFFER(cmd, + MLX5_IB_OBJECT_DEVX_OBJ, + MLX5_IB_METHOD_DEVX_OBJ_CREATE, + 3); + struct ib_uverbs_attr *handle; + struct mlx5dv_devx_obj *obj; + int ret; + + obj = calloc(1, sizeof(*obj)); + if (!obj) { + errno = ENOMEM; + return NULL; + } + + handle = fill_attr_out_obj(cmd, MLX5_IB_ATTR_DEVX_OBJ_CREATE_HANDLE); + fill_attr_in(cmd, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_IN, in, inlen); + fill_attr_out(cmd, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT, out, outlen); + + ret = execute_ioctl(context, cmd); + if (ret) + goto err; + + obj->handle = read_attr_obj(MLX5_IB_ATTR_DEVX_OBJ_CREATE_HANDLE, handle); + obj->context = context; + set_devx_obj_info(in, out, obj); + return obj; +err: + free(obj); + return NULL; +} + +int mlx5dv_devx_obj_query(struct mlx5dv_devx_obj *obj, const void *in, size_t inlen, + void *out, size_t outlen) +{ + DECLARE_COMMAND_BUFFER(cmd, + MLX5_IB_OBJECT_DEVX_OBJ, + MLX5_IB_METHOD_DEVX_OBJ_QUERY, + 3); + + fill_attr_in_obj(cmd, MLX5_IB_ATTR_DEVX_OBJ_QUERY_HANDLE, obj->handle); + fill_attr_in(cmd, MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN, in, inlen); + fill_attr_out(cmd, MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_OUT, out, outlen); + + return execute_ioctl(obj->context, cmd); +} + +int mlx5dv_devx_obj_modify(struct mlx5dv_devx_obj *obj, const void *in, size_t inlen, + void *out, size_t outlen) +{ + DECLARE_COMMAND_BUFFER(cmd, + MLX5_IB_OBJECT_DEVX_OBJ, + MLX5_IB_METHOD_DEVX_OBJ_MODIFY, + 3); + + fill_attr_in_obj(cmd, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_HANDLE, obj->handle); + fill_attr_in(cmd, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_IN, in, inlen); + fill_attr_out(cmd, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_OUT, out, outlen); + + return execute_ioctl(obj->context, cmd); +} + +int mlx5dv_devx_obj_destroy(struct mlx5dv_devx_obj *obj) +{ + DECLARE_COMMAND_BUFFER(cmd, + MLX5_IB_OBJECT_DEVX_OBJ, + MLX5_IB_METHOD_DEVX_OBJ_DESTROY, + 1); + int ret; + + fill_attr_in_obj(cmd, MLX5_IB_ATTR_DEVX_OBJ_DESTROY_HANDLE, obj->handle); + ret = execute_ioctl(obj->context, cmd); + + if (ret) + return ret; + free(obj); + return 0; +} + +int mlx5dv_devx_general_cmd(struct ibv_context 
*context, const void *in, size_t inlen, + void *out, size_t outlen) +{ + DECLARE_COMMAND_BUFFER(cmd, + MLX5_IB_OBJECT_DEVX, + MLX5_IB_METHOD_DEVX_OTHER, + 2); + + fill_attr_in(cmd, MLX5_IB_ATTR_DEVX_OTHER_CMD_IN, in, inlen); + fill_attr_out(cmd, MLX5_IB_ATTR_DEVX_OTHER_CMD_OUT, out, outlen); + + return execute_ioctl(context, cmd); +} + +void clean_dyn_uars(struct ibv_context *context) +{ + struct mlx5_context *ctx = to_mctx(context); + struct mlx5_bf *bf, *tmp_bf; + + list_for_each_safe(&ctx->dyn_uar_nc_list, bf, tmp_bf, uar_entry) { + list_del(&bf->uar_entry); + mlx5_free_uar(context, bf); + } + + list_for_each_safe(&ctx->dyn_uar_bf_list, bf, tmp_bf, uar_entry) { + list_del(&bf->uar_entry); + mlx5_free_uar(context, bf); + } + + list_for_each_safe(&ctx->dyn_uar_qp_dedicated_list, bf, tmp_bf, uar_entry) { + list_del(&bf->uar_entry); + mlx5_free_uar(context, bf); + } + + list_for_each_safe(&ctx->dyn_uar_qp_shared_list, bf, tmp_bf, uar_entry) { + list_del(&bf->uar_entry); + mlx5_free_uar(context, bf); + } + + if (ctx->cq_uar) + mlx5_free_uar(context, ctx->cq_uar); +} + +struct mlx5dv_devx_uar *mlx5dv_devx_alloc_uar(struct ibv_context *context, + uint32_t flags) +{ + DECLARE_COMMAND_BUFFER(cmd, + MLX5_IB_OBJECT_DEVX, + MLX5_IB_METHOD_DEVX_QUERY_UAR, + 2); + + int ret; + struct mlx5_bf *bf; + + if (!is_mlx5_dev(context->device)) { + errno = EOPNOTSUPP; + return NULL; + } + + if (!check_comp_mask(flags, MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC)) { + errno = EOPNOTSUPP; + return NULL; + } + + bf = mlx5_attach_dedicated_uar(context, flags); + if (!bf) + return NULL; + + if (bf->dyn_alloc_uar) + bf->devx_uar.dv_devx_uar.page_id = bf->page_id; + else { + fill_attr_in_uint32(cmd, MLX5_IB_ATTR_DEVX_QUERY_UAR_USER_IDX, + bf->bfreg_dyn_index); + fill_attr_out_ptr(cmd, MLX5_IB_ATTR_DEVX_QUERY_UAR_DEV_IDX, + &bf->devx_uar.dv_devx_uar.page_id); + + ret = execute_ioctl(context, cmd); + if (ret) { + mlx5_detach_dedicated_uar(context, bf); + return NULL; + } + } + + bf->devx_uar.dv_devx_uar.reg_addr = bf->reg; + bf->devx_uar.dv_devx_uar.base_addr = bf->uar; + bf->devx_uar.dv_devx_uar.mmap_off = bf->uar_mmap_offset; + bf->devx_uar.dv_devx_uar.comp_mask = 0; + bf->devx_uar.context = context; + return &bf->devx_uar.dv_devx_uar; +} + +void mlx5dv_devx_free_uar(struct mlx5dv_devx_uar *dv_devx_uar) +{ + struct mlx5_bf *bf = container_of(dv_devx_uar, struct mlx5_bf, + devx_uar.dv_devx_uar); + + mlx5_detach_dedicated_uar(bf->devx_uar.context, bf); +} + +int mlx5dv_devx_query_eqn(struct ibv_context *context, uint32_t vector, + uint32_t *eqn) +{ + DECLARE_COMMAND_BUFFER(cmd, + MLX5_IB_OBJECT_DEVX, + MLX5_IB_METHOD_DEVX_QUERY_EQN, + 2); + + fill_attr_in_uint32(cmd, MLX5_IB_ATTR_DEVX_QUERY_EQN_USER_VEC, vector); + fill_attr_out_ptr(cmd, MLX5_IB_ATTR_DEVX_QUERY_EQN_DEV_EQN, eqn); + + return execute_ioctl(context, cmd); +} + +int mlx5dv_devx_cq_query(struct ibv_cq *cq, const void *in, size_t inlen, + void *out, size_t outlen) +{ + DECLARE_COMMAND_BUFFER(cmd, + MLX5_IB_OBJECT_DEVX_OBJ, + MLX5_IB_METHOD_DEVX_OBJ_QUERY, + 3); + + fill_attr_in_obj(cmd, MLX5_IB_ATTR_DEVX_OBJ_QUERY_HANDLE, cq->handle); + fill_attr_in(cmd, MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN, in, inlen); + fill_attr_out(cmd, MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_OUT, out, outlen); + + return execute_ioctl(cq->context, cmd); +} + +int mlx5dv_devx_cq_modify(struct ibv_cq *cq, const void *in, size_t inlen, + void *out, size_t outlen) +{ + DECLARE_COMMAND_BUFFER(cmd, + MLX5_IB_OBJECT_DEVX_OBJ, + MLX5_IB_METHOD_DEVX_OBJ_MODIFY, + 3); + + fill_attr_in_obj(cmd, 
MLX5_IB_ATTR_DEVX_OBJ_MODIFY_HANDLE, cq->handle); + fill_attr_in(cmd, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_IN, in, inlen); + fill_attr_out(cmd, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_OUT, out, outlen); + + return execute_ioctl(cq->context, cmd); +} + +int mlx5dv_devx_qp_query(struct ibv_qp *qp, const void *in, size_t inlen, + void *out, size_t outlen) +{ + DECLARE_COMMAND_BUFFER(cmd, + MLX5_IB_OBJECT_DEVX_OBJ, + MLX5_IB_METHOD_DEVX_OBJ_QUERY, + 3); + + fill_attr_in_obj(cmd, MLX5_IB_ATTR_DEVX_OBJ_QUERY_HANDLE, qp->handle); + fill_attr_in(cmd, MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN, in, inlen); + fill_attr_out(cmd, MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_OUT, out, outlen); + + return execute_ioctl(qp->context, cmd); +} + +int mlx5dv_devx_qp_modify(struct ibv_qp *qp, const void *in, size_t inlen, + void *out, size_t outlen) +{ + DECLARE_COMMAND_BUFFER(cmd, + MLX5_IB_OBJECT_DEVX_OBJ, + MLX5_IB_METHOD_DEVX_OBJ_MODIFY, + 3); + + fill_attr_in_obj(cmd, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_HANDLE, qp->handle); + fill_attr_in(cmd, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_IN, in, inlen); + fill_attr_out(cmd, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_OUT, out, outlen); + + return execute_ioctl(qp->context, cmd); +} + +int mlx5dv_devx_srq_query(struct ibv_srq *srq, const void *in, size_t inlen, + void *out, size_t outlen) +{ + DECLARE_COMMAND_BUFFER(cmd, + MLX5_IB_OBJECT_DEVX_OBJ, + MLX5_IB_METHOD_DEVX_OBJ_QUERY, + 3); + + fill_attr_in_obj(cmd, MLX5_IB_ATTR_DEVX_OBJ_QUERY_HANDLE, srq->handle); + fill_attr_in(cmd, MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN, in, inlen); + fill_attr_out(cmd, MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_OUT, out, outlen); + + return execute_ioctl(srq->context, cmd); +} + +int mlx5dv_devx_srq_modify(struct ibv_srq *srq, const void *in, size_t inlen, + void *out, size_t outlen) +{ + DECLARE_COMMAND_BUFFER(cmd, + MLX5_IB_OBJECT_DEVX_OBJ, + MLX5_IB_METHOD_DEVX_OBJ_MODIFY, + 3); + + fill_attr_in_obj(cmd, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_HANDLE, srq->handle); + fill_attr_in(cmd, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_IN, in, inlen); + fill_attr_out(cmd, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_OUT, out, outlen); + + return execute_ioctl(srq->context, cmd); +} + +int mlx5dv_devx_wq_query(struct ibv_wq *wq, const void *in, size_t inlen, + void *out, size_t outlen) +{ + DECLARE_COMMAND_BUFFER(cmd, + MLX5_IB_OBJECT_DEVX_OBJ, + MLX5_IB_METHOD_DEVX_OBJ_QUERY, + 3); + + fill_attr_in_obj(cmd, MLX5_IB_ATTR_DEVX_OBJ_QUERY_HANDLE, wq->handle); + fill_attr_in(cmd, MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN, in, inlen); + fill_attr_out(cmd, MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_OUT, out, outlen); + + return execute_ioctl(wq->context, cmd); +} + +int mlx5dv_devx_wq_modify(struct ibv_wq *wq, const void *in, size_t inlen, + void *out, size_t outlen) +{ + DECLARE_COMMAND_BUFFER(cmd, + MLX5_IB_OBJECT_DEVX_OBJ, + MLX5_IB_METHOD_DEVX_OBJ_MODIFY, + 3); + + fill_attr_in_obj(cmd, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_HANDLE, wq->handle); + fill_attr_in(cmd, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_IN, in, inlen); + fill_attr_out(cmd, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_OUT, out, outlen); + + return execute_ioctl(wq->context, cmd); +} + +int mlx5dv_devx_ind_tbl_query(struct ibv_rwq_ind_table *ind_tbl, const void *in, size_t inlen, + void *out, size_t outlen) +{ + DECLARE_COMMAND_BUFFER(cmd, + MLX5_IB_OBJECT_DEVX_OBJ, + MLX5_IB_METHOD_DEVX_OBJ_QUERY, + 3); + + fill_attr_in_obj(cmd, MLX5_IB_ATTR_DEVX_OBJ_QUERY_HANDLE, ind_tbl->ind_tbl_handle); + fill_attr_in(cmd, MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN, in, inlen); + fill_attr_out(cmd, MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_OUT, out, outlen); + + return 
execute_ioctl(ind_tbl->context, cmd); +} + +int mlx5dv_devx_ind_tbl_modify(struct ibv_rwq_ind_table *ind_tbl, const void *in, size_t inlen, + void *out, size_t outlen) +{ + DECLARE_COMMAND_BUFFER(cmd, + MLX5_IB_OBJECT_DEVX_OBJ, + MLX5_IB_METHOD_DEVX_OBJ_MODIFY, + 3); + + fill_attr_in_obj(cmd, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_HANDLE, ind_tbl->ind_tbl_handle); + fill_attr_in(cmd, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_IN, in, inlen); + fill_attr_out(cmd, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_OUT, out, outlen); + + return execute_ioctl(ind_tbl->context, cmd); +} + +struct mlx5dv_devx_cmd_comp * +mlx5dv_devx_create_cmd_comp(struct ibv_context *context) +{ + DECLARE_COMMAND_BUFFER(cmd, + MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD, + MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC, + 1); + struct ib_uverbs_attr *handle; + struct mlx5dv_devx_cmd_comp *cmd_comp; + int ret; + + cmd_comp = calloc(1, sizeof(*cmd_comp)); + if (!cmd_comp) { + errno = ENOMEM; + return NULL; + } + + handle = fill_attr_out_fd(cmd, + MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE, + 0); + + ret = execute_ioctl(context, cmd); + if (ret) + goto err; + + cmd_comp->fd = read_attr_fd( + MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE, handle); + return cmd_comp; +err: + free(cmd_comp); + return NULL; +} + +void mlx5dv_devx_destroy_cmd_comp( + struct mlx5dv_devx_cmd_comp *cmd_comp) +{ + close(cmd_comp->fd); + free(cmd_comp); +} + +struct mlx5dv_devx_event_channel * +mlx5dv_devx_create_event_channel(struct ibv_context *context, + enum mlx5dv_devx_create_event_channel_flags flags) +{ + DECLARE_COMMAND_BUFFER(cmd, + MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD, + MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC, + 2); + struct ib_uverbs_attr *handle; + struct mlx5_devx_event_channel *event_channel; + int ret; + + event_channel = calloc(1, sizeof(*event_channel)); + if (!event_channel) { + errno = ENOMEM; + return NULL; + } + + handle = fill_attr_out_fd(cmd, + MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_HANDLE, + 0); + fill_attr_in_uint32(cmd, MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_FLAGS, + flags); + + ret = execute_ioctl(context, cmd); + if (ret) + goto err; + + event_channel->dv_event_channel.fd = read_attr_fd( + MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_HANDLE, handle); + event_channel->context = context; + return &event_channel->dv_event_channel; +err: + free(event_channel); + return NULL; +} + +void mlx5dv_devx_destroy_event_channel( + struct mlx5dv_devx_event_channel *dv_event_channel) +{ + struct mlx5_devx_event_channel *event_channel = + container_of(dv_event_channel, struct mlx5_devx_event_channel, + dv_event_channel); + + close(dv_event_channel->fd); + free(event_channel); +} + +int mlx5dv_devx_subscribe_devx_event(struct mlx5dv_devx_event_channel *dv_event_channel, + struct mlx5dv_devx_obj *obj, /* can be NULL for unaffiliated events */ + uint16_t events_sz, + uint16_t events_num[], + uint64_t cookie) +{ + struct mlx5_devx_event_channel *event_channel = + container_of(dv_event_channel, struct mlx5_devx_event_channel, + dv_event_channel); + DECLARE_COMMAND_BUFFER(cmd, + MLX5_IB_OBJECT_DEVX, + MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT, + 4); + + fill_attr_in_fd(cmd, MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_HANDLE, dv_event_channel->fd); + fill_attr_in_uint64(cmd, MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE, cookie); + if (obj) + fill_attr_in_obj(cmd, MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_OBJ_HANDLE, obj->handle); + + fill_attr_in(cmd, MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST, events_num, events_sz); + + return execute_ioctl(event_channel->context, cmd); +} + +int 
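+/*
+ * Variant of mlx5dv_devx_subscribe_devx_event(): the subscription covers a
+ * single event type and is reported through the caller-supplied fd (passed
+ * via MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM) instead of through event
+ * data read from the channel.
+ */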
mlx5dv_devx_subscribe_devx_event_fd(struct mlx5dv_devx_event_channel *dv_event_channel,
+					int fd,
+					struct mlx5dv_devx_obj *obj, /* can be NULL for unaffiliated events */
+					uint16_t event_num)
+{
+	struct mlx5_devx_event_channel *event_channel =
+		container_of(dv_event_channel, struct mlx5_devx_event_channel,
+			     dv_event_channel);
+	DECLARE_COMMAND_BUFFER(cmd,
+			       MLX5_IB_OBJECT_DEVX,
+			       MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT,
+			       4);
+
+	fill_attr_in_fd(cmd, MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_HANDLE, dv_event_channel->fd);
+	if (obj)
+		fill_attr_in_obj(cmd, MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_OBJ_HANDLE, obj->handle);
+	fill_attr_in(cmd, MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST,
+		     &event_num, sizeof(event_num));
+	fill_attr_in_uint32(cmd, MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM, fd);
+
+	return execute_ioctl(event_channel->context, cmd);
+}
+
+int mlx5dv_devx_obj_query_async(struct mlx5dv_devx_obj *obj, const void *in,
+				size_t inlen, size_t outlen,
+				uint64_t wr_id,
+				struct mlx5dv_devx_cmd_comp *cmd_comp)
+{
+	DECLARE_COMMAND_BUFFER(cmd,
+			       MLX5_IB_OBJECT_DEVX_OBJ,
+			       MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY,
+			       5);
+
+	fill_attr_in_obj(cmd, MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_HANDLE, obj->handle);
+	fill_attr_in(cmd, MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_CMD_IN, in, inlen);
+	fill_attr_const_in(cmd, MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_OUT_LEN, outlen);
+	fill_attr_in_uint64(cmd, MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_WR_ID, wr_id);
+	fill_attr_in_fd(cmd, MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_FD, cmd_comp->fd);
+
+	return execute_ioctl(obj->context, cmd);
+}
+
+int mlx5dv_devx_get_async_cmd_comp(struct mlx5dv_devx_cmd_comp *cmd_comp,
+				   struct mlx5dv_devx_async_cmd_hdr *cmd_resp,
+				   size_t cmd_resp_len)
+{
+	ssize_t bytes;
+
+	bytes = read(cmd_comp->fd, cmd_resp, cmd_resp_len);
+	if (bytes < 0)
+		return errno;
+
+	if (bytes < sizeof(*cmd_resp))
+		return EINVAL;
+
+	return 0;
+}
+
+ssize_t mlx5dv_devx_get_event(struct mlx5dv_devx_event_channel *event_channel,
+			      struct mlx5dv_devx_async_event_hdr *event_data,
+			      size_t event_resp_len)
+{
+	ssize_t bytes;
+
+	bytes = read(event_channel->fd, event_data, event_resp_len);
+	if (bytes < 0)
+		return -1;
+
+	/* the cookie must always be present */
+	if (bytes < sizeof(*event_data)) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	/* event data may be omitted in case no EQE data exists (e.g. 
completion event on a CQ) */ + return bytes; +} + +struct mlx5dv_mkey *mlx5dv_create_mkey(struct mlx5dv_mkey_init_attr *mkey_init_attr) +{ + uint32_t out[DEVX_ST_SZ_DW(create_mkey_out)] = {}; + uint32_t in[DEVX_ST_SZ_DW(create_mkey_in)] = {}; + struct mlx5_mkey *mkey; + void *mkc; + + if (!mkey_init_attr->create_flags || + !check_comp_mask(mkey_init_attr->create_flags, + MLX5DV_MKEY_INIT_ATTR_FLAGS_INDIRECT)) { + errno = EOPNOTSUPP; + return NULL; + } + + mkey = calloc(1, sizeof(*mkey)); + if (!mkey) { + errno = ENOMEM; + return NULL; + } + + mkey->num_desc = align(mkey_init_attr->max_entries, 4); + DEVX_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY); + mkc = DEVX_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + DEVX_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS); + DEVX_SET(mkc, mkc, free, 1); + DEVX_SET(mkc, mkc, umr_en, 1); + DEVX_SET(mkc, mkc, pd, to_mpd(mkey_init_attr->pd)->pdn); + DEVX_SET(mkc, mkc, translations_octword_size, mkey->num_desc); + DEVX_SET(mkc, mkc, lr, 1); + DEVX_SET(mkc, mkc, qpn, 0xffffff); + DEVX_SET(mkc, mkc, mkey_7_0, 0); + + mkey->devx_obj = mlx5dv_devx_obj_create(mkey_init_attr->pd->context, + in, sizeof(in), out, sizeof(out)); + if (!mkey->devx_obj) + goto end; + + mkey_init_attr->max_entries = mkey->num_desc; + mkey->dv_mkey.lkey = (DEVX_GET(create_mkey_out, out, mkey_index) << 8) | 0; + mkey->dv_mkey.rkey = mkey->dv_mkey.lkey; + + return &mkey->dv_mkey; +end: + free(mkey); + return NULL; +} + +int mlx5dv_destroy_mkey(struct mlx5dv_mkey *dv_mkey) +{ + struct mlx5_mkey *mkey = container_of(dv_mkey, struct mlx5_mkey, + dv_mkey); + int ret; + + ret = mlx5dv_devx_obj_destroy(mkey->devx_obj); + if (ret) + return ret; + + free(mkey); + return 0; +} + +struct mlx5dv_var * +mlx5dv_alloc_var(struct ibv_context *context, uint32_t flags) +{ + DECLARE_COMMAND_BUFFER(cmd, + MLX5_IB_OBJECT_VAR, + MLX5_IB_METHOD_VAR_OBJ_ALLOC, + 4); + + struct ib_uverbs_attr *handle; + struct mlx5_var_obj *obj; + int ret; + + if (!is_mlx5_dev(context->device)) { + errno = EOPNOTSUPP; + return NULL; + } + + if (flags) { + errno = EOPNOTSUPP; + return NULL; + } + + obj = calloc(1, sizeof(*obj)); + if (!obj) { + errno = ENOMEM; + return NULL; + } + + handle = fill_attr_out_obj(cmd, MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE); + fill_attr_out_ptr(cmd, MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_OFFSET, + &obj->dv_var.mmap_off); + fill_attr_out_ptr(cmd, MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_LENGTH, + &obj->dv_var.length); + fill_attr_out_ptr(cmd, MLX5_IB_ATTR_VAR_OBJ_ALLOC_PAGE_ID, + &obj->dv_var.page_id); + + ret = execute_ioctl(context, cmd); + if (ret) + goto err; + + obj->handle = read_attr_obj(MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE, handle); + obj->context = context; + + return &obj->dv_var; + +err: + free(obj); + return NULL; +} + + +void mlx5dv_free_var(struct mlx5dv_var *dv_var) +{ + DECLARE_COMMAND_BUFFER(cmd, + MLX5_IB_OBJECT_VAR, + MLX5_IB_METHOD_VAR_OBJ_DESTROY, + 1); + + struct mlx5_var_obj *obj = container_of(dv_var, struct mlx5_var_obj, + dv_var); + + fill_attr_in_obj(cmd, MLX5_IB_ATTR_VAR_OBJ_DESTROY_HANDLE, obj->handle); + if (execute_ioctl(obj->context, cmd)) + assert(false); + + free(obj); +} + +struct mlx5dv_pp *mlx5dv_pp_alloc(struct ibv_context *context, + size_t pp_context_sz, + const void *pp_context, + uint32_t flags) +{ + DECLARE_COMMAND_BUFFER(cmd, + MLX5_IB_OBJECT_PP, + MLX5_IB_METHOD_PP_OBJ_ALLOC, + 4); + + struct ib_uverbs_attr *handle; + struct mlx5_pp_obj *obj; + int ret; + + if (!is_mlx5_dev(context->device)) { + errno = EOPNOTSUPP; + return NULL; + } + + if 
(!check_comp_mask(flags, + MLX5_IB_UAPI_PP_ALLOC_FLAGS_DEDICATED_INDEX)) { + errno = EOPNOTSUPP; + return NULL; + } + + obj = calloc(1, sizeof(*obj)); + if (!obj) { + errno = ENOMEM; + return NULL; + } + + handle = fill_attr_out_obj(cmd, MLX5_IB_ATTR_PP_OBJ_ALLOC_HANDLE); + fill_attr_in(cmd, MLX5_IB_ATTR_PP_OBJ_ALLOC_CTX, + pp_context, pp_context_sz); + fill_attr_const_in(cmd, MLX5_IB_ATTR_PP_OBJ_ALLOC_FLAGS, flags); + fill_attr_out_ptr(cmd, MLX5_IB_ATTR_PP_OBJ_ALLOC_INDEX, + &obj->dv_pp.index); + + ret = execute_ioctl(context, cmd); + if (ret) + goto err; + + obj->handle = read_attr_obj(MLX5_IB_ATTR_PP_OBJ_ALLOC_HANDLE, handle); + obj->context = context; + + return &obj->dv_pp; + +err: + free(obj); + return NULL; +} + +void mlx5dv_pp_free(struct mlx5dv_pp *dv_pp) +{ + DECLARE_COMMAND_BUFFER(cmd, + MLX5_IB_OBJECT_PP, + MLX5_IB_METHOD_PP_OBJ_DESTROY, + 1); + + struct mlx5_pp_obj *obj = container_of(dv_pp, struct mlx5_pp_obj, + dv_pp); + + fill_attr_in_obj(cmd, MLX5_IB_ATTR_PP_OBJ_DESTROY_HANDLE, obj->handle); + if (execute_ioctl(obj->context, cmd)) + assert(false); + + free(obj); +} diff --git a/providers/mlx5/wqe.h b/providers/mlx5/wqe.h new file mode 100644 index 0000000..5d473a2 --- /dev/null +++ b/providers/mlx5/wqe.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef WQE_H +#define WQE_H + +#include <stdint.h> + +struct mlx5_sg_copy_ptr { + int index; + int offset; +}; + +struct mlx5_eqe_comp { + uint32_t reserved[6]; + uint32_t cqn; +}; + +struct mlx5_eqe_qp_srq { + uint32_t reserved[6]; + uint32_t qp_srq_n; +}; + +struct mlx5_wqe_eth_pad { + uint8_t rsvd0[16]; +}; + +struct mlx5_wqe_xrc_seg { + __be32 xrc_srqn; + uint8_t rsvd[12]; +}; + +struct mlx5_wqe_masked_atomic_seg { + uint64_t swap_add; + uint64_t compare; + uint64_t swap_add_mask; + uint64_t compare_mask; +}; + +enum { + MLX5_IPOIB_INLINE_MIN_HEADER_SIZE = 4, + MLX5_SOURCE_QPN_INLINE_MAX_HEADER_SIZE = 18, + MLX5_ETH_L2_INLINE_HEADER_SIZE = 18, + MLX5_ETH_L2_MIN_HEADER_SIZE = 14, +}; + +struct mlx5_seg_set_psv { + uint8_t rsvd[4]; + uint16_t syndrome; + uint16_t status; + uint16_t block_guard; + uint16_t app_tag; + uint32_t ref_tag; + uint32_t mkey; + uint64_t va; +}; + +struct mlx5_seg_get_psv { + uint8_t rsvd[19]; + uint8_t num_psv; + uint32_t l_key; + uint64_t va; + uint32_t psv_index[4]; +}; + +struct mlx5_seg_check_psv { + uint8_t rsvd0[2]; + uint16_t err_coalescing_op; + uint8_t rsvd1[2]; + uint16_t xport_err_op; + uint8_t rsvd2[2]; + uint16_t xport_err_mask; + uint8_t rsvd3[7]; + uint8_t num_psv; + uint32_t l_key; + uint64_t va; + uint32_t psv_index[4]; +}; + +struct mlx5_rwqe_sig { + uint8_t rsvd0[4]; + uint8_t signature; + uint8_t rsvd1[11]; +}; + +struct mlx5_wqe_signature_seg { + uint8_t rsvd0[4]; + uint8_t signature; + uint8_t rsvd1[11]; +}; + +struct mlx5_wqe_inline_seg { + __be32 byte_count; +}; + + +#endif /* WQE_H */ diff --git a/providers/mthca/CMakeLists.txt b/providers/mthca/CMakeLists.txt new file mode 100644 index 0000000..63d7147 --- /dev/null +++ b/providers/mthca/CMakeLists.txt @@ -0,0 +1,10 @@ +rdma_provider(mthca + ah.c + buf.c + cq.c + memfree.c + mthca.c + qp.c + srq.c + verbs.c +) diff --git a/providers/mthca/ah.c b/providers/mthca/ah.c new file mode 100644 index 0000000..adefb17 --- /dev/null +++ b/providers/mthca/ah.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <config.h> + +#include <endian.h> +#include <stdlib.h> +#include <pthread.h> +#include <string.h> + +#include "mthca.h" + +struct mthca_ah_page { + struct mthca_ah_page *prev, *next; + struct mthca_buf buf; + struct ibv_mr *mr; + int use_cnt; + unsigned free[0]; +}; + +static struct mthca_ah_page *__add_page(struct mthca_pd *pd, int page_size, int per_page) +{ + struct mthca_ah_page *page; + int i; + + page = malloc(sizeof *page + per_page * sizeof (int)); + if (!page) + return NULL; + + if (mthca_alloc_buf(&page->buf, page_size, page_size)) { + free(page); + return NULL; + } + + page->mr = mthca_reg_mr(&pd->ibv_pd, page->buf.buf, page_size, + (uintptr_t) page->buf.buf, 0); + if (!page->mr) { + mthca_free_buf(&page->buf); + free(page); + return NULL; + } + + page->mr->context = pd->ibv_pd.context; + + page->use_cnt = 0; + for (i = 0; i < per_page; ++i) + page->free[i] = ~0; + + page->prev = NULL; + page->next = pd->ah_list; + pd->ah_list = page; + if (page->next) + page->next->prev = page; + + return page; +} + +int mthca_alloc_av(struct mthca_pd *pd, struct ibv_ah_attr *attr, + struct mthca_ah *ah) +{ + if (mthca_is_memfree(pd->ibv_pd.context)) { + ah->av = malloc(sizeof *ah->av); + if (!ah->av) + return -1; + } else { + struct mthca_ah_page *page; + int ps; + int pp; + int i, j; + + ps = to_mdev(pd->ibv_pd.context->device)->page_size; + pp = ps / (sizeof *ah->av * 8 * sizeof (int)); + + pthread_mutex_lock(&pd->ah_mutex); + for (page = pd->ah_list; page; page = page->next) + if (page->use_cnt < ps / sizeof *ah->av) + for (i = 0; i < pp; ++i) + if (page->free[i]) + goto found; + + page = __add_page(pd, ps, pp); + if (!page) { + pthread_mutex_unlock(&pd->ah_mutex); + return -1; + } + + found: + ++page->use_cnt; + + for (i = 0, j = -1; i < pp; ++i) + if (page->free[i]) { + j = ffs(page->free[i]); + page->free[i] &= ~(1 << (j - 1)); + ah->av = page->buf.buf + + (i * 8 * sizeof (int) + (j - 1)) * sizeof *ah->av; + break; + } + + ah->key = page->mr->lkey; + ah->page = page; + + pthread_mutex_unlock(&pd->ah_mutex); + } + + memset(ah->av, 0, sizeof *ah->av); + + ah->av->port_pd = htobe32(pd->pdn | (attr->port_num << 24)); + ah->av->g_slid = attr->src_path_bits; + ah->av->dlid = htobe16(attr->dlid); + ah->av->msg_sr = (3 << 4) | /* 2K message */ + attr->static_rate; + ah->av->sl_tclass_flowlabel = htobe32(attr->sl << 28); + if (attr->is_global) { + ah->av->g_slid |= 0x80; + /* XXX get gid_table length */ + ah->av->gid_index = (attr->port_num - 1) * 32 + + attr->grh.sgid_index; + ah->av->hop_limit = attr->grh.hop_limit; + ah->av->sl_tclass_flowlabel |= + htobe32((attr->grh.traffic_class << 20) | + attr->grh.flow_label); + memcpy(ah->av->dgid, attr->grh.dgid.raw, 16); + } else { + /* Arbel workaround -- low byte of GID must be 2 */ + ah->av->dgid[3] = htobe32(2); + } + + return 0; +} + +void mthca_free_av(struct mthca_ah *ah) +{ + if (mthca_is_memfree(ah->ibv_ah.context)) { + free(ah->av); + } else { + struct mthca_pd *pd = to_mpd(ah->ibv_ah.pd); + struct mthca_ah_page *page; + int i; + + pthread_mutex_lock(&pd->ah_mutex); + + page = ah->page; + i = ((void *) ah->av - page->buf.buf) / sizeof *ah->av; + page->free[i / (8 * sizeof (int))] |= 1 << (i % (8 * sizeof (int))); + + if (!--page->use_cnt) { + if (page->prev) + page->prev->next = page->next; + else + pd->ah_list = page->next; + if (page->next) + page->next->prev = page->prev; + + mthca_dereg_mr(verbs_get_mr(page->mr)); + mthca_free_buf(&page->buf); + free(page); + } + + pthread_mutex_unlock(&pd->ah_mutex); + } +} diff --git 
a/providers/mthca/buf.c b/providers/mthca/buf.c new file mode 100644 index 0000000..c03ee1f --- /dev/null +++ b/providers/mthca/buf.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <config.h> + +#include <stdlib.h> +#include <sys/mman.h> +#include <errno.h> + +#include "mthca.h" + +int mthca_alloc_buf(struct mthca_buf *buf, size_t size, int page_size) +{ + int ret; + + buf->length = align(size, page_size); + buf->buf = mmap(NULL, buf->length, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (buf->buf == MAP_FAILED) + return errno; + + ret = ibv_dontfork_range(buf->buf, size); + if (ret) + munmap(buf->buf, buf->length); + + return ret; +} + +void mthca_free_buf(struct mthca_buf *buf) +{ + ibv_dofork_range(buf->buf, buf->length); + munmap(buf->buf, buf->length); +} diff --git a/providers/mthca/cq.c b/providers/mthca/cq.c new file mode 100644 index 0000000..dd8baca --- /dev/null +++ b/providers/mthca/cq.c @@ -0,0 +1,627 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2006 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <config.h> + +#include <endian.h> +#include <stdio.h> +#include <stdlib.h> +#include <pthread.h> +#include <string.h> + +#include <infiniband/opcode.h> + +#include "mthca.h" +#include "doorbell.h" + +enum { + MTHCA_CQ_DOORBELL = 0x20 +}; + +enum { + CQ_OK = 0, + CQ_EMPTY = -1, + CQ_POLL_ERR = -2 +}; + +#define MTHCA_TAVOR_CQ_DB_INC_CI (1 << 24) +#define MTHCA_TAVOR_CQ_DB_REQ_NOT (2 << 24) +#define MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL (3 << 24) +#define MTHCA_TAVOR_CQ_DB_SET_CI (4 << 24) +#define MTHCA_TAVOR_CQ_DB_REQ_NOT_MULT (5 << 24) + +#define MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL (1 << 24) +#define MTHCA_ARBEL_CQ_DB_REQ_NOT (2 << 24) +#define MTHCA_ARBEL_CQ_DB_REQ_NOT_MULT (3 << 24) + +enum { + MTHCA_CQ_ENTRY_OWNER_SW = 0x00, + MTHCA_CQ_ENTRY_OWNER_HW = 0x80, + MTHCA_ERROR_CQE_OPCODE_MASK = 0xfe +}; + +enum { + SYNDROME_LOCAL_LENGTH_ERR = 0x01, + SYNDROME_LOCAL_QP_OP_ERR = 0x02, + SYNDROME_LOCAL_EEC_OP_ERR = 0x03, + SYNDROME_LOCAL_PROT_ERR = 0x04, + SYNDROME_WR_FLUSH_ERR = 0x05, + SYNDROME_MW_BIND_ERR = 0x06, + SYNDROME_BAD_RESP_ERR = 0x10, + SYNDROME_LOCAL_ACCESS_ERR = 0x11, + SYNDROME_REMOTE_INVAL_REQ_ERR = 0x12, + SYNDROME_REMOTE_ACCESS_ERR = 0x13, + SYNDROME_REMOTE_OP_ERR = 0x14, + SYNDROME_RETRY_EXC_ERR = 0x15, + SYNDROME_RNR_RETRY_EXC_ERR = 0x16, + SYNDROME_LOCAL_RDD_VIOL_ERR = 0x20, + SYNDROME_REMOTE_INVAL_RD_REQ_ERR = 0x21, + SYNDROME_REMOTE_ABORTED_ERR = 0x22, + SYNDROME_INVAL_EECN_ERR = 0x23, + SYNDROME_INVAL_EEC_STATE_ERR = 0x24 +}; + +struct mthca_cqe { + __be32 my_qpn; + __be32 my_ee; + __be32 rqpn; + __be16 sl_g_mlpath; + __be16 rlid; + __be32 imm_etype_pkey_eec; + __be32 byte_cnt; + __be32 wqe; + uint8_t opcode; + uint8_t is_send; + uint8_t reserved; + uint8_t owner; +}; + +struct mthca_err_cqe { + __be32 my_qpn; + __be32 reserved1[3]; + uint8_t syndrome; + uint8_t vendor_err; + __be16 db_cnt; + __be32 reserved2; + __be32 wqe; + uint8_t opcode; + uint8_t reserved3[2]; + uint8_t owner; +}; + +static inline struct mthca_cqe *get_cqe(struct mthca_cq *cq, int entry) +{ + return cq->buf.buf + entry * MTHCA_CQ_ENTRY_SIZE; +} + +static inline struct mthca_cqe *cqe_sw(struct mthca_cq *cq, int i) +{ + struct mthca_cqe *cqe = get_cqe(cq, i); + return MTHCA_CQ_ENTRY_OWNER_HW & cqe->owner ? NULL : cqe; +} + +static inline struct mthca_cqe *next_cqe_sw(struct mthca_cq *cq) +{ + return cqe_sw(cq, cq->cons_index & cq->ibv_cq.cqe); +} + +static inline void set_cqe_hw(struct mthca_cqe *cqe) +{ + VALGRIND_MAKE_MEM_UNDEFINED(cqe, sizeof *cqe); + cqe->owner = MTHCA_CQ_ENTRY_OWNER_HW; +} + +/* + * incr is ignored in native Arbel (mem-free) mode, so cq->cons_index + * should be correct before calling update_cons_index(). 
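+ * + * On Tavor the INC_CI doorbell below tells the HCA how many + * entries were consumed (incr - 1 in the second doorbell word); + * on Arbel the new consumer index is written directly to the + * set_ci doorbell record in host memory, so incr is unused there.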
+ */ +static inline void update_cons_index(struct mthca_cq *cq, int incr) +{ + uint32_t doorbell[2]; + + if (mthca_is_memfree(cq->ibv_cq.context)) { + *cq->set_ci_db = htobe32(cq->cons_index); + mmio_ordered_writes_hack(); + } else { + doorbell[0] = MTHCA_TAVOR_CQ_DB_INC_CI | cq->cqn; + doorbell[1] = incr - 1; + + mthca_write64(doorbell, to_mctx(cq->ibv_cq.context)->uar + MTHCA_CQ_DOORBELL); + } +} + +static void dump_cqe(void *cqe_ptr) +{ + __be32 *cqe = cqe_ptr; + int i; + + for (i = 0; i < 8; ++i) + printf(" [%2x] %08x\n", i * 4, be32toh(cqe[i])); +} + +static int handle_error_cqe(struct mthca_cq *cq, + struct mthca_qp *qp, int wqe_index, int is_send, + struct mthca_err_cqe *cqe, + struct ibv_wc *wc, int *free_cqe) +{ + int err; + int dbd; + __be32 new_wqe; + + if (cqe->syndrome == SYNDROME_LOCAL_QP_OP_ERR) { + printf("local QP operation err " + "(QPN %06x, WQE @ %08x, CQN %06x, index %d)\n", + be32toh(cqe->my_qpn), be32toh(cqe->wqe), + cq->cqn, cq->cons_index); + dump_cqe(cqe); + } + + /* + * For completions in error, only work request ID, status, vendor error + * (and freed resource count for RD) have to be set. + */ + switch (cqe->syndrome) { + case SYNDROME_LOCAL_LENGTH_ERR: + wc->status = IBV_WC_LOC_LEN_ERR; + break; + case SYNDROME_LOCAL_QP_OP_ERR: + wc->status = IBV_WC_LOC_QP_OP_ERR; + break; + case SYNDROME_LOCAL_EEC_OP_ERR: + wc->status = IBV_WC_LOC_EEC_OP_ERR; + break; + case SYNDROME_LOCAL_PROT_ERR: + wc->status = IBV_WC_LOC_PROT_ERR; + break; + case SYNDROME_WR_FLUSH_ERR: + wc->status = IBV_WC_WR_FLUSH_ERR; + break; + case SYNDROME_MW_BIND_ERR: + wc->status = IBV_WC_MW_BIND_ERR; + break; + case SYNDROME_BAD_RESP_ERR: + wc->status = IBV_WC_BAD_RESP_ERR; + break; + case SYNDROME_LOCAL_ACCESS_ERR: + wc->status = IBV_WC_LOC_ACCESS_ERR; + break; + case SYNDROME_REMOTE_INVAL_REQ_ERR: + wc->status = IBV_WC_REM_INV_REQ_ERR; + break; + case SYNDROME_REMOTE_ACCESS_ERR: + wc->status = IBV_WC_REM_ACCESS_ERR; + break; + case SYNDROME_REMOTE_OP_ERR: + wc->status = IBV_WC_REM_OP_ERR; + break; + case SYNDROME_RETRY_EXC_ERR: + wc->status = IBV_WC_RETRY_EXC_ERR; + break; + case SYNDROME_RNR_RETRY_EXC_ERR: + wc->status = IBV_WC_RNR_RETRY_EXC_ERR; + break; + case SYNDROME_LOCAL_RDD_VIOL_ERR: + wc->status = IBV_WC_LOC_RDD_VIOL_ERR; + break; + case SYNDROME_REMOTE_INVAL_RD_REQ_ERR: + wc->status = IBV_WC_REM_INV_RD_REQ_ERR; + break; + case SYNDROME_REMOTE_ABORTED_ERR: + wc->status = IBV_WC_REM_ABORT_ERR; + break; + case SYNDROME_INVAL_EECN_ERR: + wc->status = IBV_WC_INV_EECN_ERR; + break; + case SYNDROME_INVAL_EEC_STATE_ERR: + wc->status = IBV_WC_INV_EEC_STATE_ERR; + break; + default: + wc->status = IBV_WC_GENERAL_ERR; + break; + } + + wc->vendor_err = cqe->vendor_err; + + /* + * Mem-free HCAs always generate one CQE per WQE, even in the + * error case, so we don't have to check the doorbell count, etc. + */ + if (mthca_is_memfree(cq->ibv_cq.context)) + return 0; + + err = mthca_free_err_wqe(qp, is_send, wqe_index, &dbd, &new_wqe); + if (err) + return err; + + /* + * If we're at the end of the WQE chain, or we've used up our + * doorbell count, free the CQE. Otherwise just update it for + * the next poll operation. + * + * This doesn't apply to mem-free HCAs, which never use the + * doorbell count field. In that case we always free the CQE. 
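+ * + * When the CQE is kept, the code below rewrites it as a flush + * error for the next WQE in the chain and charges the doorbell + * count, so the same CQE is seen again on the next poll.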
+ */ + if (mthca_is_memfree(cq->ibv_cq.context) || + !(new_wqe & htobe32(0x3f)) || (!cqe->db_cnt && dbd)) + return 0; + + cqe->db_cnt = htobe16(be16toh(cqe->db_cnt) - dbd); + cqe->wqe = new_wqe; + cqe->syndrome = SYNDROME_WR_FLUSH_ERR; + + *free_cqe = 0; + + return 0; +} + +static inline int mthca_poll_one(struct mthca_cq *cq, + struct mthca_qp **cur_qp, + int *freed, + struct ibv_wc *wc) +{ + struct mthca_wq *wq; + struct mthca_cqe *cqe; + struct mthca_srq *srq; + uint32_t qpn; + int wqe_index; + int is_error; + int is_send; + int free_cqe = 1; + int err = 0; + + cqe = next_cqe_sw(cq); + if (!cqe) + return CQ_EMPTY; + + VALGRIND_MAKE_MEM_DEFINED(cqe, sizeof *cqe); + + /* + * Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + udma_from_device_barrier(); + + qpn = be32toh(cqe->my_qpn); + + is_error = (cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) == + MTHCA_ERROR_CQE_OPCODE_MASK; + is_send = is_error ? cqe->opcode & 0x01 : cqe->is_send & 0x80; + + if (!*cur_qp || qpn != (*cur_qp)->ibv_qp.qp_num) { + /* + * We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. + */ + *cur_qp = mthca_find_qp(to_mctx(cq->ibv_cq.context), qpn); + if (!*cur_qp) { + err = CQ_POLL_ERR; + goto out; + } + } + + wc->qp_num = (*cur_qp)->ibv_qp.qp_num; + + if (is_send) { + wq = &(*cur_qp)->sq; + wqe_index = ((be32toh(cqe->wqe) - (*cur_qp)->send_wqe_offset) >> wq->wqe_shift); + wc->wr_id = (*cur_qp)->wrid[wqe_index + (*cur_qp)->rq.max]; + } else if ((*cur_qp)->ibv_qp.srq) { + uint32_t wqe; + srq = to_msrq((*cur_qp)->ibv_qp.srq); + wqe = be32toh(cqe->wqe); + wq = NULL; + wqe_index = wqe >> srq->wqe_shift; + wc->wr_id = srq->wrid[wqe_index]; + mthca_free_srq_wqe(srq, wqe_index); + } else { + int32_t wqe; + wq = &(*cur_qp)->rq; + wqe = be32toh(cqe->wqe); + wqe_index = wqe >> wq->wqe_shift; + /* + * WQE addr == base - 1 might be reported by Sinai FW + * 1.0.800 and Arbel FW 5.1.400 in receive completion + * with error instead of (rq size - 1). This bug + * should be fixed in later FW revisions. 
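+ * + * Until then, treat a negative index as a completion on the + * last WQE of the receive queue.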
+ */ + if (wqe_index < 0) + wqe_index = wq->max - 1; + wc->wr_id = (*cur_qp)->wrid[wqe_index]; + } + + if (wq) { + if (wq->last_comp < wqe_index) + wq->tail += wqe_index - wq->last_comp; + else + wq->tail += wqe_index + wq->max - wq->last_comp; + + wq->last_comp = wqe_index; + } + + if (is_error) { + err = handle_error_cqe(cq, *cur_qp, wqe_index, is_send, + (struct mthca_err_cqe *) cqe, + wc, &free_cqe); + goto out; + } + + if (is_send) { + wc->wc_flags = 0; + switch (cqe->opcode) { + case MTHCA_OPCODE_RDMA_WRITE: + wc->opcode = IBV_WC_RDMA_WRITE; + break; + case MTHCA_OPCODE_RDMA_WRITE_IMM: + wc->opcode = IBV_WC_RDMA_WRITE; + wc->wc_flags |= IBV_WC_WITH_IMM; + break; + case MTHCA_OPCODE_SEND: + wc->opcode = IBV_WC_SEND; + break; + case MTHCA_OPCODE_SEND_IMM: + wc->opcode = IBV_WC_SEND; + wc->wc_flags |= IBV_WC_WITH_IMM; + break; + case MTHCA_OPCODE_RDMA_READ: + wc->opcode = IBV_WC_RDMA_READ; + wc->byte_len = be32toh(cqe->byte_cnt); + break; + case MTHCA_OPCODE_ATOMIC_CS: + wc->opcode = IBV_WC_COMP_SWAP; + wc->byte_len = be32toh(cqe->byte_cnt); + break; + case MTHCA_OPCODE_ATOMIC_FA: + wc->opcode = IBV_WC_FETCH_ADD; + wc->byte_len = be32toh(cqe->byte_cnt); + break; + case MTHCA_OPCODE_BIND_MW: + wc->opcode = IBV_WC_BIND_MW; + break; + default: + /* assume it's a send completion */ + wc->opcode = IBV_WC_SEND; + break; + } + } else { + wc->byte_len = be32toh(cqe->byte_cnt); + switch (cqe->opcode & 0x1f) { + case IBV_OPCODE_SEND_LAST_WITH_IMMEDIATE: + case IBV_OPCODE_SEND_ONLY_WITH_IMMEDIATE: + wc->wc_flags = IBV_WC_WITH_IMM; + wc->imm_data = cqe->imm_etype_pkey_eec; + wc->opcode = IBV_WC_RECV; + break; + case IBV_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE: + case IBV_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE: + wc->wc_flags = IBV_WC_WITH_IMM; + wc->imm_data = cqe->imm_etype_pkey_eec; + wc->opcode = IBV_WC_RECV_RDMA_WITH_IMM; + break; + default: + wc->wc_flags = 0; + wc->opcode = IBV_WC_RECV; + break; + } + wc->slid = be16toh(cqe->rlid); + wc->sl = be16toh(cqe->sl_g_mlpath) >> 12; + wc->src_qp = be32toh(cqe->rqpn) & 0xffffff; + wc->dlid_path_bits = be16toh(cqe->sl_g_mlpath) & 0x7f; + wc->pkey_index = be32toh(cqe->imm_etype_pkey_eec) >> 16; + wc->wc_flags |= be16toh(cqe->sl_g_mlpath) & 0x80 ? + IBV_WC_GRH : 0; + } + + wc->status = IBV_WC_SUCCESS; + +out: + if (free_cqe) { + set_cqe_hw(cqe); + ++(*freed); + ++cq->cons_index; + } + + return err; +} + +int mthca_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc) +{ + struct mthca_cq *cq = to_mcq(ibcq); + struct mthca_qp *qp = NULL; + int npolled; + int err = CQ_OK; + int freed = 0; + + pthread_spin_lock(&cq->lock); + + for (npolled = 0; npolled < ne; ++npolled) { + err = mthca_poll_one(cq, &qp, &freed, wc + npolled); + if (err != CQ_OK) + break; + } + + if (freed) { + udma_to_device_barrier(); + update_cons_index(cq, freed); + } + + pthread_spin_unlock(&cq->lock); + + return err == CQ_POLL_ERR ? err : npolled; +} + +int mthca_tavor_arm_cq(struct ibv_cq *cq, int solicited) +{ + uint32_t doorbell[2]; + + doorbell[0] = (solicited ? MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL + : MTHCA_TAVOR_CQ_DB_REQ_NOT) | + to_mcq(cq)->cqn; + doorbell[1] = 0xffffffff; + + mthca_write64(doorbell, to_mctx(cq->context)->uar + MTHCA_CQ_DOORBELL); + + return 0; +} + +int mthca_arbel_arm_cq(struct ibv_cq *ibvcq, int solicited) +{ + struct mthca_cq *cq = to_mcq(ibvcq); + uint32_t doorbell[2]; + uint32_t sn; + + sn = cq->arm_sn & 3; + + doorbell[0] = cq->cons_index; + doorbell[1] = + (cq->cqn << 8) | (2 << 5) | (sn << 3) | (solicited ? 
1 : 2); + + mthca_write64(doorbell, cq->arm_db); + + /* + * Make sure that the doorbell record in host memory is + * written before ringing the doorbell via PCI MMIO. + */ + udma_to_device_barrier(); + + doorbell[0] = (sn << 28) | (solicited ? MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL + : MTHCA_ARBEL_CQ_DB_REQ_NOT) | + cq->cqn; + doorbell[1] = cq->cons_index; + + mthca_write64(doorbell, + to_mctx(ibvcq->context)->uar + MTHCA_CQ_DOORBELL); + + return 0; +} + +void mthca_arbel_cq_event(struct ibv_cq *cq) +{ + to_mcq(cq)->arm_sn++; +} + +static inline int is_recv_cqe(struct mthca_cqe *cqe) +{ + if ((cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) == + MTHCA_ERROR_CQE_OPCODE_MASK) + return !(cqe->opcode & 0x01); + else + return !(cqe->is_send & 0x80); +} + +void __mthca_cq_clean(struct mthca_cq *cq, uint32_t qpn, struct mthca_srq *srq) +{ + struct mthca_cqe *cqe; + uint32_t prod_index; + int i, nfreed = 0; + + /* + * First we need to find the current producer index, so we + * know where to start cleaning from. It doesn't matter if HW + * adds new entries after this loop -- the QP we're worried + * about is already in RESET, so the new entries won't come + * from our QP and therefore don't need to be checked. + */ + for (prod_index = cq->cons_index; + cqe_sw(cq, prod_index & cq->ibv_cq.cqe); + ++prod_index) + if (prod_index == cq->cons_index + cq->ibv_cq.cqe) + break; + + /* + * Now sweep backwards through the CQ, removing CQ entries + * that match our QP by copying older entries on top of them. + */ + while ((int) --prod_index - (int) cq->cons_index >= 0) { + cqe = get_cqe(cq, prod_index & cq->ibv_cq.cqe); + if (cqe->my_qpn == htobe32(qpn)) { + if (srq && is_recv_cqe(cqe)) + mthca_free_srq_wqe(srq, + be32toh(cqe->wqe) >> srq->wqe_shift); + ++nfreed; + } else if (nfreed) + memcpy(get_cqe(cq, (prod_index + nfreed) & cq->ibv_cq.cqe), + cqe, MTHCA_CQ_ENTRY_SIZE); + } + + if (nfreed) { + for (i = 0; i < nfreed; ++i) + set_cqe_hw(get_cqe(cq, (cq->cons_index + i) & cq->ibv_cq.cqe)); + udma_to_device_barrier(); + cq->cons_index += nfreed; + update_cons_index(cq, nfreed); + } +} + +void mthca_cq_clean(struct mthca_cq *cq, uint32_t qpn, struct mthca_srq *srq) +{ + pthread_spin_lock(&cq->lock); + __mthca_cq_clean(cq, qpn, srq); + pthread_spin_unlock(&cq->lock); +} + +void mthca_cq_resize_copy_cqes(struct mthca_cq *cq, void *buf, int old_cqe) +{ + int i; + + /* + * In Tavor mode, the hardware keeps the consumer and producer + * indices mod the CQ size. Since we might be making the CQ + * bigger, we need to deal with the case where the producer + * index wrapped around before the CQ was resized. 
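+ * + * If the entry at the end of the old buffer is still owned by + * software, the producer has already wrapped, so back the + * consumer index up by the old CQ size before copying.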
+ */ + if (!mthca_is_memfree(cq->ibv_cq.context) && old_cqe < cq->ibv_cq.cqe) { + cq->cons_index &= old_cqe; + if (cqe_sw(cq, old_cqe)) + cq->cons_index -= old_cqe + 1; + } + + for (i = cq->cons_index; cqe_sw(cq, i & old_cqe); ++i) + memcpy(buf + (i & cq->ibv_cq.cqe) * MTHCA_CQ_ENTRY_SIZE, + get_cqe(cq, i & old_cqe), MTHCA_CQ_ENTRY_SIZE); +} + +int mthca_alloc_cq_buf(struct mthca_device *dev, struct mthca_buf *buf, int nent) +{ + int i; + + if (mthca_alloc_buf(buf, align(nent * MTHCA_CQ_ENTRY_SIZE, dev->page_size), + dev->page_size)) + return -1; + + for (i = 0; i < nent; ++i) + ((struct mthca_cqe *) buf->buf)[i].owner = MTHCA_CQ_ENTRY_OWNER_HW; + + return 0; +} diff --git a/providers/mthca/doorbell.h b/providers/mthca/doorbell.h new file mode 100644 index 0000000..d2411ea --- /dev/null +++ b/providers/mthca/doorbell.h @@ -0,0 +1,14 @@ +/* GPLv2 or OpenIB.org BSD (MIT) See COPYING file */ +#ifndef DOORBELL_H +#define DOORBELL_H + +#include <util/mmio.h> +#include "mthca.h" + +static inline void mthca_write64(uint32_t val[2], void *reg) +{ + uint64_t doorbell = (((uint64_t)val[0]) << 32) | val[1]; + mmio_write64_be(reg, htobe64(doorbell)); +} + +#endif diff --git a/providers/mthca/memfree.c b/providers/mthca/memfree.c new file mode 100644 index 0000000..14edb00 --- /dev/null +++ b/providers/mthca/memfree.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#define _GNU_SOURCE +#include <config.h> + +#include <endian.h> +#include <stdlib.h> +#include <pthread.h> +#include <string.h> +#include <strings.h> + +#include "mthca.h" + +#define MTHCA_FREE_MAP_SIZE (MTHCA_DB_REC_PER_PAGE / (SIZEOF_LONG * 8)) + +struct mthca_db_page { + unsigned long free[MTHCA_FREE_MAP_SIZE]; + struct mthca_buf db_rec; +}; + +struct mthca_db_table { + int npages; + int max_group1; + int min_group2; + pthread_mutex_t mutex; + struct mthca_db_page page[]; +}; + +int mthca_alloc_db(struct mthca_db_table *db_tab, enum mthca_db_type type, + __be32 **db) +{ + int i, j, k; + int group, start, end, dir; + int ret = 0; + + pthread_mutex_lock(&db_tab->mutex); + + switch (type) { + case MTHCA_DB_TYPE_CQ_ARM: + case MTHCA_DB_TYPE_SQ: + group = 0; + start = 0; + end = db_tab->max_group1; + dir = 1; + break; + + case MTHCA_DB_TYPE_CQ_SET_CI: + case MTHCA_DB_TYPE_RQ: + case MTHCA_DB_TYPE_SRQ: + group = 1; + start = db_tab->npages - 1; + end = db_tab->min_group2; + dir = -1; + break; + + default: + ret = -1; + goto out; + } + + for (i = start; i != end; i += dir) + if (db_tab->page[i].db_rec.buf) + for (j = 0; j < MTHCA_FREE_MAP_SIZE; ++j) + if (db_tab->page[i].free[j]) + goto found; + + if (db_tab->max_group1 >= db_tab->min_group2 - 1) { + ret = -1; + goto out; + } + + if (mthca_alloc_buf(&db_tab->page[i].db_rec, + MTHCA_DB_REC_PAGE_SIZE, + MTHCA_DB_REC_PAGE_SIZE)) { + ret = -1; + goto out; + } + + memset(db_tab->page[i].db_rec.buf, 0, MTHCA_DB_REC_PAGE_SIZE); + memset(db_tab->page[i].free, 0xff, sizeof db_tab->page[i].free); + + if (group == 0) + ++db_tab->max_group1; + else + --db_tab->min_group2; + +found: + for (j = 0; j < MTHCA_FREE_MAP_SIZE; ++j) { + k = ffsl(db_tab->page[i].free[j]); + if (k) + break; + } + + if (!k) { + ret = -1; + goto out; + } + + --k; + db_tab->page[i].free[j] &= ~(1UL << k); + + j = j * SIZEOF_LONG * 8 + k; + if (group == 1) + j = MTHCA_DB_REC_PER_PAGE - 1 - j; + + ret = i * MTHCA_DB_REC_PER_PAGE + j; + *db = db_tab->page[i].db_rec.buf + j * 8; + +out: + pthread_mutex_unlock(&db_tab->mutex); + return ret; +} + +void mthca_set_db_qn(__be32 *db, enum mthca_db_type type, uint32_t qn) +{ + db[1] = htobe32((qn << 8) | (type << 5)); +} + +void mthca_free_db(struct mthca_db_table *db_tab, enum mthca_db_type type, int db_index) +{ + int i, j; + struct mthca_db_page *page; + + i = db_index / MTHCA_DB_REC_PER_PAGE; + j = db_index % MTHCA_DB_REC_PER_PAGE; + + page = db_tab->page + i; + + pthread_mutex_lock(&db_tab->mutex); + *(uint64_t *) (page->db_rec.buf + j * 8) = 0; + + if (i >= db_tab->min_group2) + j = MTHCA_DB_REC_PER_PAGE - 1 - j; + + page->free[j / (SIZEOF_LONG * 8)] |= 1UL << (j % (SIZEOF_LONG * 8)); + + pthread_mutex_unlock(&db_tab->mutex); +} + +struct mthca_db_table *mthca_alloc_db_tab(int uarc_size) +{ + struct mthca_db_table *db_tab; + int npages; + int i; + + npages = uarc_size / MTHCA_DB_REC_PAGE_SIZE; + db_tab = malloc(sizeof (struct mthca_db_table) + + npages * sizeof (struct mthca_db_page)); + if (!db_tab) + return NULL; + + pthread_mutex_init(&db_tab->mutex, NULL); + + db_tab->npages = npages; + db_tab->max_group1 = 0; + db_tab->min_group2 = npages - 1; + + for (i = 0; i < npages; ++i) + db_tab->page[i].db_rec.buf = NULL; + + return db_tab; +} + +void mthca_free_db_tab(struct mthca_db_table *db_tab) +{ + int i; + + if (!db_tab) + return; + + for (i = 0; i < db_tab->npages; ++i) + if (db_tab->page[i].db_rec.buf) + mthca_free_buf(&db_tab->page[i].db_rec); + + free(db_tab); +} diff --git a/providers/mthca/mthca-abi.h b/providers/mthca/mthca-abi.h new file mode 100644
index 0000000..ca90954 --- /dev/null +++ b/providers/mthca/mthca-abi.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2006 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MTHCA_ABI_H +#define MTHCA_ABI_H + +#include <infiniband/kern-abi.h> +#include <rdma/mthca-abi.h> +#include <kernel-abi/mthca-abi.h> + +DECLARE_DRV_CMD(umthca_alloc_pd, IB_USER_VERBS_CMD_ALLOC_PD, + empty, mthca_alloc_pd_resp); +DECLARE_DRV_CMD(umthca_create_cq, IB_USER_VERBS_CMD_CREATE_CQ, + mthca_create_cq, mthca_create_cq_resp); +DECLARE_DRV_CMD(umthca_create_qp, IB_USER_VERBS_CMD_CREATE_QP, + mthca_create_qp, empty); +DECLARE_DRV_CMD(umthca_create_srq, IB_USER_VERBS_CMD_CREATE_SRQ, + mthca_create_srq, mthca_create_srq_resp); +DECLARE_DRV_CMD(umthca_alloc_ucontext, IB_USER_VERBS_CMD_GET_CONTEXT, + empty, mthca_alloc_ucontext_resp); +DECLARE_DRV_CMD(umthca_reg_mr, IB_USER_VERBS_CMD_REG_MR, + mthca_reg_mr, empty); +DECLARE_DRV_CMD(umthca_resize_cq, IB_USER_VERBS_CMD_RESIZE_CQ, + mthca_resize_cq, empty); + +#endif /* MTHCA_ABI_H */ diff --git a/providers/mthca/mthca.c b/providers/mthca/mthca.c new file mode 100644 index 0000000..abce486 --- /dev/null +++ b/providers/mthca/mthca.c @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2006 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <config.h> + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> +#include <sys/mman.h> +#include <pthread.h> +#include <string.h> + +#include "mthca.h" +#include "mthca-abi.h" + +static void mthca_free_context(struct ibv_context *ibctx); + +#ifndef PCI_VENDOR_ID_MELLANOX +#define PCI_VENDOR_ID_MELLANOX 0x15b3 +#endif + +#ifndef PCI_DEVICE_ID_MELLANOX_TAVOR +#define PCI_DEVICE_ID_MELLANOX_TAVOR 0x5a44 +#endif + +#ifndef PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT +#define PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT 0x6278 +#endif + +#ifndef PCI_DEVICE_ID_MELLANOX_ARBEL +#define PCI_DEVICE_ID_MELLANOX_ARBEL 0x6282 +#endif + +#ifndef PCI_DEVICE_ID_MELLANOX_SINAI_OLD +#define PCI_DEVICE_ID_MELLANOX_SINAI_OLD 0x5e8c +#endif + +#ifndef PCI_DEVICE_ID_MELLANOX_SINAI +#define PCI_DEVICE_ID_MELLANOX_SINAI 0x6274 +#endif + +#ifndef PCI_VENDOR_ID_TOPSPIN +#define PCI_VENDOR_ID_TOPSPIN 0x1867 +#endif + +#define HCA(v, d, t) \ + VERBS_PCI_MATCH(PCI_VENDOR_ID_##v, PCI_DEVICE_ID_MELLANOX_##d, \ + (void *)(MTHCA_##t)) +static const struct verbs_match_ent hca_table[] = { + HCA(MELLANOX, TAVOR, TAVOR), + HCA(MELLANOX, ARBEL_COMPAT, TAVOR), + HCA(MELLANOX, ARBEL, ARBEL), + HCA(MELLANOX, SINAI_OLD, ARBEL), + HCA(MELLANOX, SINAI, ARBEL), + HCA(TOPSPIN, TAVOR, TAVOR), + HCA(TOPSPIN, ARBEL_COMPAT, TAVOR), + HCA(TOPSPIN, ARBEL, ARBEL), + HCA(TOPSPIN, SINAI_OLD, ARBEL), + HCA(TOPSPIN, SINAI, ARBEL), + {} +}; + +static const struct verbs_context_ops mthca_ctx_common_ops = { + .query_device = mthca_query_device, + .query_port = mthca_query_port, + .alloc_pd = mthca_alloc_pd, + .dealloc_pd = mthca_free_pd, + .reg_mr = mthca_reg_mr, + .dereg_mr = mthca_dereg_mr, + .create_cq = mthca_create_cq, + .poll_cq = mthca_poll_cq, + .resize_cq = mthca_resize_cq, + .destroy_cq = mthca_destroy_cq, + .create_srq = mthca_create_srq, + .modify_srq = mthca_modify_srq, + .query_srq = mthca_query_srq, + .destroy_srq = mthca_destroy_srq, + .create_qp = mthca_create_qp, + .query_qp = mthca_query_qp, + .modify_qp = mthca_modify_qp, + .destroy_qp = mthca_destroy_qp, + .create_ah = mthca_create_ah, + .destroy_ah = mthca_destroy_ah, + .attach_mcast = ibv_cmd_attach_mcast, + .detach_mcast = ibv_cmd_detach_mcast, + .free_context = mthca_free_context, +}; + +static const struct verbs_context_ops mthca_ctx_arbel_ops = { + .cq_event = mthca_arbel_cq_event, + .post_recv = mthca_arbel_post_recv, + .post_send = mthca_arbel_post_send, + .post_srq_recv = mthca_arbel_post_srq_recv, + .req_notify_cq = mthca_arbel_arm_cq, +}; + +static const struct verbs_context_ops mthca_ctx_tavor_ops = { + .post_recv = mthca_tavor_post_recv, + .post_send = mthca_tavor_post_send, + .post_srq_recv = mthca_tavor_post_srq_recv, + .req_notify_cq = mthca_tavor_arm_cq, +}; + +static struct verbs_context *mthca_alloc_context(struct ibv_device *ibdev, + int cmd_fd, + void *private_data) +{ + struct mthca_context *context; + struct ibv_get_context cmd; + struct umthca_alloc_ucontext_resp resp; + int i; + + context = verbs_init_and_alloc_context(ibdev, cmd_fd, 
context, ibv_ctx, + RDMA_DRIVER_MTHCA); + if (!context) + return NULL; + + if (ibv_cmd_get_context(&context->ibv_ctx, &cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp)) + goto err_free; + + context->num_qps = resp.qp_tab_size; + context->qp_table_shift = ffs(context->num_qps) - 1 - MTHCA_QP_TABLE_BITS; + context->qp_table_mask = (1 << context->qp_table_shift) - 1; + + if (mthca_is_memfree(&context->ibv_ctx.context)) { + context->db_tab = mthca_alloc_db_tab(resp.uarc_size); + if (!context->db_tab) + goto err_free; + } else + context->db_tab = NULL; + + pthread_mutex_init(&context->qp_table_mutex, NULL); + for (i = 0; i < MTHCA_QP_TABLE_SIZE; ++i) + context->qp_table[i].refcnt = 0; + + context->uar = mmap(NULL, to_mdev(ibdev)->page_size, PROT_WRITE, + MAP_SHARED, cmd_fd, 0); + if (context->uar == MAP_FAILED) + goto err_db_tab; + + pthread_spin_init(&context->uar_lock, PTHREAD_PROCESS_PRIVATE); + + context->pd = mthca_alloc_pd(&context->ibv_ctx.context); + if (!context->pd) + goto err_unmap; + + context->pd->context = &context->ibv_ctx.context; + + verbs_set_ops(&context->ibv_ctx, &mthca_ctx_common_ops); + if (mthca_is_memfree(&context->ibv_ctx.context)) + verbs_set_ops(&context->ibv_ctx, &mthca_ctx_arbel_ops); + else + verbs_set_ops(&context->ibv_ctx, &mthca_ctx_tavor_ops); + + return &context->ibv_ctx; + +err_unmap: + munmap(context->uar, to_mdev(ibdev)->page_size); + +err_db_tab: + mthca_free_db_tab(context->db_tab); + +err_free: + verbs_uninit_context(&context->ibv_ctx); + free(context); + return NULL; +} + +static void mthca_free_context(struct ibv_context *ibctx) +{ + struct mthca_context *context = to_mctx(ibctx); + + mthca_free_pd(context->pd); + munmap(context->uar, to_mdev(ibctx->device)->page_size); + mthca_free_db_tab(context->db_tab); + + verbs_uninit_context(&context->ibv_ctx); + free(context); +} + +static void mthca_uninit_device(struct verbs_device *verbs_device) +{ + struct mthca_device *dev = to_mdev(&verbs_device->device); + + free(dev); +} + +static struct verbs_device * +mthca_device_alloc(struct verbs_sysfs_dev *sysfs_dev) +{ + struct mthca_device *dev; + + dev = calloc(1, sizeof(*dev)); + if (!dev) + return NULL; + + dev->hca_type = (uintptr_t)sysfs_dev->match->driver_data; + dev->page_size = sysconf(_SC_PAGESIZE); + + return &dev->ibv_dev; +} + +static const struct verbs_device_ops mthca_dev_ops = { + .name = "mthca", + .match_min_abi_version = 0, + .match_max_abi_version = MTHCA_UVERBS_ABI_VERSION, + .match_table = hca_table, + .alloc_device = mthca_device_alloc, + .uninit_device = mthca_uninit_device, + .alloc_context = mthca_alloc_context, +}; +PROVIDER_DRIVER(mthca, mthca_dev_ops); diff --git a/providers/mthca/mthca.h b/providers/mthca/mthca.h new file mode 100644 index 0000000..b7df2f7 --- /dev/null +++ b/providers/mthca/mthca.h @@ -0,0 +1,348 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MTHCA_H +#define MTHCA_H + +#include <stddef.h> + +#include <infiniband/driver.h> +#include <util/udma_barrier.h> + +#include <valgrind/memcheck.h> + +#define PFX "mthca: " + +enum mthca_hca_type { + MTHCA_TAVOR, + MTHCA_ARBEL +}; + +enum { + MTHCA_CQ_ENTRY_SIZE = 0x20 +}; + +enum { + MTHCA_QP_TABLE_BITS = 8, + MTHCA_QP_TABLE_SIZE = 1 << MTHCA_QP_TABLE_BITS, + MTHCA_QP_TABLE_MASK = MTHCA_QP_TABLE_SIZE - 1 +}; + +enum { + MTHCA_DB_REC_PAGE_SIZE = 4096, + MTHCA_DB_REC_PER_PAGE = MTHCA_DB_REC_PAGE_SIZE / 8 +}; + +enum mthca_db_type { + MTHCA_DB_TYPE_INVALID = 0x0, + MTHCA_DB_TYPE_CQ_SET_CI = 0x1, + MTHCA_DB_TYPE_CQ_ARM = 0x2, + MTHCA_DB_TYPE_SQ = 0x3, + MTHCA_DB_TYPE_RQ = 0x4, + MTHCA_DB_TYPE_SRQ = 0x5, + MTHCA_DB_TYPE_GROUP_SEP = 0x7 +}; + +enum { + MTHCA_OPCODE_NOP = 0x00, + MTHCA_OPCODE_RDMA_WRITE = 0x08, + MTHCA_OPCODE_RDMA_WRITE_IMM = 0x09, + MTHCA_OPCODE_SEND = 0x0a, + MTHCA_OPCODE_SEND_IMM = 0x0b, + MTHCA_OPCODE_RDMA_READ = 0x10, + MTHCA_OPCODE_ATOMIC_CS = 0x11, + MTHCA_OPCODE_ATOMIC_FA = 0x12, + MTHCA_OPCODE_BIND_MW = 0x18, + MTHCA_OPCODE_INVALID = 0xff +}; + +struct mthca_ah_page; + +struct mthca_device { + struct verbs_device ibv_dev; + enum mthca_hca_type hca_type; + int page_size; +}; + +struct mthca_db_table; + +struct mthca_context { + struct verbs_context ibv_ctx; + void *uar; + pthread_spinlock_t uar_lock; + struct mthca_db_table *db_tab; + struct ibv_pd *pd; + struct { + struct mthca_qp **table; + int refcnt; + } qp_table[MTHCA_QP_TABLE_SIZE]; + pthread_mutex_t qp_table_mutex; + int num_qps; + int qp_table_shift; + int qp_table_mask; +}; + +struct mthca_buf { + void *buf; + size_t length; +}; + +struct mthca_pd { + struct ibv_pd ibv_pd; + struct mthca_ah_page *ah_list; + pthread_mutex_t ah_mutex; + uint32_t pdn; +}; + +struct mthca_cq { + struct ibv_cq ibv_cq; + struct mthca_buf buf; + pthread_spinlock_t lock; + struct ibv_mr *mr; + uint32_t cqn; + uint32_t cons_index; + + /* Next fields are mem-free only */ + int set_ci_db_index; + __be32 *set_ci_db; + int arm_db_index; + __be32 *arm_db; + int arm_sn; +}; + +struct mthca_srq { + struct ibv_srq ibv_srq; + struct mthca_buf buf; + void *last; + pthread_spinlock_t lock; + struct ibv_mr *mr; + uint64_t *wrid; + uint32_t srqn; + int max; + int max_gs; + int wqe_shift; + int first_free; + int 
last_free; + int buf_size; + + /* Next fields are mem-free only */ + int db_index; + __be32 *db; + uint16_t counter; +}; + +struct mthca_wq { + pthread_spinlock_t lock; + int max; + unsigned next_ind; + unsigned last_comp; + unsigned head; + unsigned tail; + void *last; + int max_gs; + int wqe_shift; + + /* Next fields are mem-free only */ + int db_index; + __be32 *db; +}; + +struct mthca_qp { + struct ibv_qp ibv_qp; + struct mthca_buf buf; + uint64_t *wrid; + int send_wqe_offset; + int max_inline_data; + int buf_size; + struct mthca_wq sq; + struct mthca_wq rq; + struct ibv_mr *mr; + int sq_sig_all; +}; + +struct mthca_av { + __be32 port_pd; + uint8_t reserved1; + uint8_t g_slid; + __be16 dlid; + uint8_t reserved2; + uint8_t gid_index; + uint8_t msg_sr; + uint8_t hop_limit; + __be32 sl_tclass_flowlabel; + __be32 dgid[4]; +}; + +struct mthca_ah { + struct ibv_ah ibv_ah; + struct mthca_av *av; + struct mthca_ah_page *page; + uint32_t key; +}; + +static inline unsigned long align(unsigned long val, unsigned long align) +{ + return (val + align - 1) & ~(align - 1); +} + +static inline uintptr_t db_align(__be32 *db) +{ + return (uintptr_t) db & ~((uintptr_t) MTHCA_DB_REC_PAGE_SIZE - 1); +} + +#define to_mxxx(xxx, type) container_of(ib##xxx, struct mthca_##type, ibv_##xxx) + +static inline struct mthca_device *to_mdev(struct ibv_device *ibdev) +{ + return container_of(ibdev, struct mthca_device, ibv_dev.device); +} + +static inline struct mthca_context *to_mctx(struct ibv_context *ibctx) +{ + return container_of(ibctx, struct mthca_context, ibv_ctx.context); +} + +static inline struct mthca_pd *to_mpd(struct ibv_pd *ibpd) +{ + return to_mxxx(pd, pd); +} + +static inline struct mthca_cq *to_mcq(struct ibv_cq *ibcq) +{ + return to_mxxx(cq, cq); +} + +static inline struct mthca_srq *to_msrq(struct ibv_srq *ibsrq) +{ + return to_mxxx(srq, srq); +} + +static inline struct mthca_qp *to_mqp(struct ibv_qp *ibqp) +{ + return to_mxxx(qp, qp); +} + +static inline struct mthca_ah *to_mah(struct ibv_ah *ibah) +{ + return to_mxxx(ah, ah); +} + +static inline int mthca_is_memfree(struct ibv_context *ibctx) +{ + return to_mdev(ibctx->device)->hca_type == MTHCA_ARBEL; +} + +int mthca_alloc_buf(struct mthca_buf *buf, size_t size, int page_size); +void mthca_free_buf(struct mthca_buf *buf); + +int mthca_alloc_db(struct mthca_db_table *db_tab, enum mthca_db_type type, + __be32 **db); +void mthca_set_db_qn(__be32 *db, enum mthca_db_type type, uint32_t qn); +void mthca_free_db(struct mthca_db_table *db_tab, enum mthca_db_type type, int db_index); +struct mthca_db_table *mthca_alloc_db_tab(int uarc_size); +void mthca_free_db_tab(struct mthca_db_table *db_tab); + +int mthca_query_device(struct ibv_context *context, + struct ibv_device_attr *attr); +int mthca_query_port(struct ibv_context *context, uint8_t port, + struct ibv_port_attr *attr); + +struct ibv_pd *mthca_alloc_pd(struct ibv_context *context); +int mthca_free_pd(struct ibv_pd *pd); + +struct ibv_mr *mthca_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + uint64_t hca_va, int access); +int mthca_dereg_mr(struct verbs_mr *mr); + +struct ibv_cq *mthca_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector); +int mthca_resize_cq(struct ibv_cq *cq, int cqe); +int mthca_destroy_cq(struct ibv_cq *cq); +int mthca_poll_cq(struct ibv_cq *cq, int ne, struct ibv_wc *wc); +int mthca_tavor_arm_cq(struct ibv_cq *cq, int solicited); +int mthca_arbel_arm_cq(struct ibv_cq *cq, int solicited); +void 
mthca_arbel_cq_event(struct ibv_cq *cq); +void __mthca_cq_clean(struct mthca_cq *cq, uint32_t qpn, struct mthca_srq *srq); +void mthca_cq_clean(struct mthca_cq *cq, uint32_t qpn, struct mthca_srq *srq); +void mthca_cq_resize_copy_cqes(struct mthca_cq *cq, void *buf, int old_cqe); +int mthca_alloc_cq_buf(struct mthca_device *dev, struct mthca_buf *buf, int nent); + +struct ibv_srq *mthca_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *attr); +int mthca_modify_srq(struct ibv_srq *srq, + struct ibv_srq_attr *attr, + int mask); +int mthca_query_srq(struct ibv_srq *srq, + struct ibv_srq_attr *attr); +int mthca_destroy_srq(struct ibv_srq *srq); +int mthca_alloc_srq_buf(struct ibv_pd *pd, struct ibv_srq_attr *attr, + struct mthca_srq *srq); +void mthca_free_srq_wqe(struct mthca_srq *srq, int ind); +int mthca_tavor_post_srq_recv(struct ibv_srq *ibsrq, + struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); +int mthca_arbel_post_srq_recv(struct ibv_srq *ibsrq, + struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); + +struct ibv_qp *mthca_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr); +int mthca_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr); +int mthca_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask); +int mthca_destroy_qp(struct ibv_qp *qp); +void mthca_init_qp_indices(struct mthca_qp *qp); +int mthca_tavor_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr); +int mthca_tavor_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); +int mthca_arbel_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr); +int mthca_arbel_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); +int mthca_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap, + enum ibv_qp_type type, struct mthca_qp *qp); +struct mthca_qp *mthca_find_qp(struct mthca_context *ctx, uint32_t qpn); +int mthca_store_qp(struct mthca_context *ctx, uint32_t qpn, struct mthca_qp *qp); +void mthca_clear_qp(struct mthca_context *ctx, uint32_t qpn); +int mthca_free_err_wqe(struct mthca_qp *qp, int is_send, + int index, int *dbd, __be32 *new_wqe); +struct ibv_ah *mthca_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr); +int mthca_destroy_ah(struct ibv_ah *ah); +int mthca_alloc_av(struct mthca_pd *pd, struct ibv_ah_attr *attr, + struct mthca_ah *ah); +void mthca_free_av(struct mthca_ah *ah); + +#endif /* MTHCA_H */ diff --git a/providers/mthca/qp.c b/providers/mthca/qp.c new file mode 100644 index 0000000..f428903 --- /dev/null +++ b/providers/mthca/qp.c @@ -0,0 +1,956 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer.
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <config.h> + +#include <endian.h> +#include <stdlib.h> +#include <pthread.h> +#include <string.h> +#include <util/compiler.h> + +#include "mthca.h" +#include "doorbell.h" +#include "wqe.h" + +enum { + MTHCA_SEND_DOORBELL_FENCE = 1 << 5 +}; + +static const uint8_t mthca_opcode[] = { + [IBV_WR_SEND] = MTHCA_OPCODE_SEND, + [IBV_WR_SEND_WITH_IMM] = MTHCA_OPCODE_SEND_IMM, + [IBV_WR_RDMA_WRITE] = MTHCA_OPCODE_RDMA_WRITE, + [IBV_WR_RDMA_WRITE_WITH_IMM] = MTHCA_OPCODE_RDMA_WRITE_IMM, + [IBV_WR_RDMA_READ] = MTHCA_OPCODE_RDMA_READ, + [IBV_WR_ATOMIC_CMP_AND_SWP] = MTHCA_OPCODE_ATOMIC_CS, + [IBV_WR_ATOMIC_FETCH_AND_ADD] = MTHCA_OPCODE_ATOMIC_FA, +}; + +static void *get_recv_wqe(struct mthca_qp *qp, int n) +{ + return qp->buf.buf + (n << qp->rq.wqe_shift); +} + +static void *get_send_wqe(struct mthca_qp *qp, int n) +{ + return qp->buf.buf + qp->send_wqe_offset + (n << qp->sq.wqe_shift); +} + +void mthca_init_qp_indices(struct mthca_qp *qp) +{ + qp->sq.next_ind = 0; + qp->sq.last_comp = qp->sq.max - 1; + qp->sq.head = 0; + qp->sq.tail = 0; + qp->sq.last = get_send_wqe(qp, qp->sq.max - 1); + + qp->rq.next_ind = 0; + qp->rq.last_comp = qp->rq.max - 1; + qp->rq.head = 0; + qp->rq.tail = 0; + qp->rq.last = get_recv_wqe(qp, qp->rq.max - 1); +} + +static inline int wq_overflow(struct mthca_wq *wq, int nreq, struct mthca_cq *cq) +{ + unsigned cur; + + cur = wq->head - wq->tail; + if (cur + nreq < wq->max) + return 0; + + pthread_spin_lock(&cq->lock); + cur = wq->head - wq->tail; + pthread_spin_unlock(&cq->lock); + + return cur + nreq >= wq->max; +} + +int mthca_tavor_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr) +{ + struct mthca_qp *qp = to_mqp(ibqp); + void *wqe, *prev_wqe; + int ind; + int nreq; + int ret = 0; + int size; + int size0 = 0; + int i; + uint32_t uninitialized_var(f0); + uint32_t uninitialized_var(op0); + + pthread_spin_lock(&qp->sq.lock); + udma_to_device_barrier(); + + ind = qp->sq.next_ind; + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) { + ret = -1; + *bad_wr = wr; + goto out; + } + + wqe = get_send_wqe(qp, ind); + prev_wqe = qp->sq.last; + qp->sq.last = wqe; + + ((struct mthca_next_seg *) wqe)->nda_op = 0; + ((struct mthca_next_seg *) wqe)->ee_nds = 0; + ((struct mthca_next_seg *) wqe)->flags = + ((wr->send_flags & IBV_SEND_SIGNALED) ? + htobe32(MTHCA_NEXT_CQ_UPDATE) : 0) | + ((wr->send_flags & IBV_SEND_SOLICITED) ? 
+ htobe32(MTHCA_NEXT_SOLICIT) : 0) | + htobe32(1); + if (wr->opcode == IBV_WR_SEND_WITH_IMM || + wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM) + ((struct mthca_next_seg *) wqe)->imm = wr->imm_data; + + wqe += sizeof (struct mthca_next_seg); + size = sizeof (struct mthca_next_seg) / 16; + + switch (ibqp->qp_type) { + case IBV_QPT_RC: + switch (wr->opcode) { + case IBV_WR_ATOMIC_CMP_AND_SWP: + case IBV_WR_ATOMIC_FETCH_AND_ADD: + ((struct mthca_raddr_seg *) wqe)->raddr = + htobe64(wr->wr.atomic.remote_addr); + ((struct mthca_raddr_seg *) wqe)->rkey = + htobe32(wr->wr.atomic.rkey); + ((struct mthca_raddr_seg *) wqe)->reserved = 0; + + wqe += sizeof (struct mthca_raddr_seg); + + if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) { + ((struct mthca_atomic_seg *) wqe)->swap_add = + htobe64(wr->wr.atomic.swap); + ((struct mthca_atomic_seg *) wqe)->compare = + htobe64(wr->wr.atomic.compare_add); + } else { + ((struct mthca_atomic_seg *) wqe)->swap_add = + htobe64(wr->wr.atomic.compare_add); + ((struct mthca_atomic_seg *) wqe)->compare = 0; + } + + wqe += sizeof (struct mthca_atomic_seg); + size += (sizeof (struct mthca_raddr_seg) + + sizeof (struct mthca_atomic_seg)) / 16; + break; + + case IBV_WR_RDMA_WRITE: + case IBV_WR_RDMA_WRITE_WITH_IMM: + case IBV_WR_RDMA_READ: + ((struct mthca_raddr_seg *) wqe)->raddr = + htobe64(wr->wr.rdma.remote_addr); + ((struct mthca_raddr_seg *) wqe)->rkey = + htobe32(wr->wr.rdma.rkey); + ((struct mthca_raddr_seg *) wqe)->reserved = 0; + wqe += sizeof (struct mthca_raddr_seg); + size += sizeof (struct mthca_raddr_seg) / 16; + break; + + default: + /* No extra segments required for sends */ + break; + } + + break; + + case IBV_QPT_UC: + switch (wr->opcode) { + case IBV_WR_RDMA_WRITE: + case IBV_WR_RDMA_WRITE_WITH_IMM: + ((struct mthca_raddr_seg *) wqe)->raddr = + htobe64(wr->wr.rdma.remote_addr); + ((struct mthca_raddr_seg *) wqe)->rkey = + htobe32(wr->wr.rdma.rkey); + ((struct mthca_raddr_seg *) wqe)->reserved = 0; + wqe += sizeof (struct mthca_raddr_seg); + size += sizeof (struct mthca_raddr_seg) / 16; + break; + + default: + /* No extra segments required for sends */ + break; + } + + break; + + case IBV_QPT_UD: + ((struct mthca_tavor_ud_seg *) wqe)->lkey = + htobe32(to_mah(wr->wr.ud.ah)->key); + ((struct mthca_tavor_ud_seg *) wqe)->av_addr = + htobe64((uintptr_t) to_mah(wr->wr.ud.ah)->av); + ((struct mthca_tavor_ud_seg *) wqe)->dqpn = + htobe32(wr->wr.ud.remote_qpn); + ((struct mthca_tavor_ud_seg *) wqe)->qkey = + htobe32(wr->wr.ud.remote_qkey); + + wqe += sizeof (struct mthca_tavor_ud_seg); + size += sizeof (struct mthca_tavor_ud_seg) / 16; + break; + + default: + break; + } + + if (wr->num_sge > qp->sq.max_gs) { + ret = -1; + *bad_wr = wr; + goto out; + } + + if (wr->send_flags & IBV_SEND_INLINE) { + if (wr->num_sge) { + struct mthca_inline_seg *seg = wqe; + int s = 0; + + wqe += sizeof *seg; + for (i = 0; i < wr->num_sge; ++i) { + struct ibv_sge *sge = &wr->sg_list[i]; + + s += sge->length; + + if (s > qp->max_inline_data) { + ret = -1; + *bad_wr = wr; + goto out; + } + + memcpy(wqe, (void *) (intptr_t) sge->addr, + sge->length); + wqe += sge->length; + } + + seg->byte_count = htobe32(MTHCA_INLINE_SEG | s); + size += align(s + sizeof *seg, 16) / 16; + } + } else { + struct mthca_data_seg *seg; + + for (i = 0; i < wr->num_sge; ++i) { + seg = wqe; + seg->byte_count = htobe32(wr->sg_list[i].length); + seg->lkey = htobe32(wr->sg_list[i].lkey); + seg->addr = htobe64(wr->sg_list[i].addr); + wqe += sizeof *seg; + } + + size += wr->num_sge * (sizeof *seg / 16); + } + + 
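/* + * Send wr_ids live after the rq.max receive slots in the + * shared qp->wrid array (see mthca_alloc_qp_buf). + */ +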
qp->wrid[ind + qp->rq.max] = wr->wr_id; + + if (wr->opcode >= sizeof mthca_opcode / sizeof mthca_opcode[0]) { + ret = -1; + *bad_wr = wr; + goto out; + } + + ((struct mthca_next_seg *) prev_wqe)->nda_op = + htobe32(((ind << qp->sq.wqe_shift) + + qp->send_wqe_offset) | + mthca_opcode[wr->opcode]); + /* + * Make sure that nda_op is written before setting ee_nds. + */ + udma_ordering_write_barrier(); + ((struct mthca_next_seg *) prev_wqe)->ee_nds = + htobe32((size0 ? 0 : MTHCA_NEXT_DBD) | size | + ((wr->send_flags & IBV_SEND_FENCE) ? + MTHCA_NEXT_FENCE : 0)); + + if (!size0) { + size0 = size; + op0 = mthca_opcode[wr->opcode]; + f0 = wr->send_flags & IBV_SEND_FENCE ? + MTHCA_SEND_DOORBELL_FENCE : 0; + } + + ++ind; + if (ind >= qp->sq.max) + ind -= qp->sq.max; + } + +out: + if (nreq) { + uint32_t doorbell[2]; + + doorbell[0] = ((qp->sq.next_ind << qp->sq.wqe_shift) + + qp->send_wqe_offset) | + f0 | op0; + doorbell[1] = (ibqp->qp_num << 8) | size0; + + udma_to_device_barrier(); + mthca_write64(doorbell, to_mctx(ibqp->context)->uar + + MTHCA_SEND_DOORBELL); + } + + qp->sq.next_ind = ind; + qp->sq.head += nreq; + + pthread_spin_unlock(&qp->sq.lock); + return ret; +} + +int mthca_tavor_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct mthca_qp *qp = to_mqp(ibqp); + uint32_t doorbell[2]; + int ret = 0; + int nreq; + int i; + int size; + int size0 = 0; + int ind; + void *wqe; + void *prev_wqe; + + pthread_spin_lock(&qp->rq.lock); + + ind = qp->rq.next_ind; + + for (nreq = 0; wr; wr = wr->next) { + if (wq_overflow(&qp->rq, nreq, to_mcq(qp->ibv_qp.recv_cq))) { + ret = -1; + *bad_wr = wr; + goto out; + } + + wqe = get_recv_wqe(qp, ind); + prev_wqe = qp->rq.last; + qp->rq.last = wqe; + + ((struct mthca_next_seg *) wqe)->ee_nds = + htobe32(MTHCA_NEXT_DBD); + ((struct mthca_next_seg *) wqe)->flags = + htobe32(MTHCA_NEXT_CQ_UPDATE); + + wqe += sizeof (struct mthca_next_seg); + size = sizeof (struct mthca_next_seg) / 16; + + if (wr->num_sge > qp->rq.max_gs) { + ret = -1; + *bad_wr = wr; + goto out; + } + + for (i = 0; i < wr->num_sge; ++i) { + ((struct mthca_data_seg *) wqe)->byte_count = + htobe32(wr->sg_list[i].length); + ((struct mthca_data_seg *) wqe)->lkey = + htobe32(wr->sg_list[i].lkey); + ((struct mthca_data_seg *) wqe)->addr = + htobe64(wr->sg_list[i].addr); + wqe += sizeof (struct mthca_data_seg); + size += sizeof (struct mthca_data_seg) / 16; + } + + qp->wrid[ind] = wr->wr_id; + + ((struct mthca_next_seg *) prev_wqe)->ee_nds = + htobe32(MTHCA_NEXT_DBD | size); + + if (!size0) + size0 = size; + + ++ind; + if (ind >= qp->rq.max) + ind -= qp->rq.max; + + ++nreq; + if (nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB) { + nreq = 0; + + doorbell[0] = + (qp->rq.next_ind << qp->rq.wqe_shift) | size0; + doorbell[1] = ibqp->qp_num << 8; + + /* + * Make sure that descriptors are written + * before doorbell is rung. + */ + udma_to_device_barrier(); + + mthca_write64(doorbell, to_mctx(ibqp->context)->uar + + MTHCA_RECV_DOORBELL); + + qp->rq.next_ind = ind; + qp->rq.head += MTHCA_TAVOR_MAX_WQES_PER_RECV_DB; + size0 = 0; + } + } + +out: + if (nreq) { + doorbell[0] = (qp->rq.next_ind << qp->rq.wqe_shift) | size0; + doorbell[1] = (ibqp->qp_num << 8) | nreq; + + /* + * Make sure that descriptors are written before + * doorbell is rung. 
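+ * + * doorbell[0] carries the offset of the first new WQE and its + * size; doorbell[1] carries the QP number and the number of + * WQEs being posted.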
+ */ + udma_to_device_barrier(); + + mthca_write64(doorbell, to_mctx(ibqp->context)->uar + + MTHCA_RECV_DOORBELL); + } + + qp->rq.next_ind = ind; + qp->rq.head += nreq; + + pthread_spin_unlock(&qp->rq.lock); + return ret; +} + +int mthca_arbel_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr) +{ + struct mthca_qp *qp = to_mqp(ibqp); + uint32_t doorbell[2]; + void *wqe, *prev_wqe; + int ind; + int nreq; + int ret = 0; + int size; + int size0 = 0; + int i; + uint32_t uninitialized_var(f0); + uint32_t uninitialized_var(op0); + + pthread_spin_lock(&qp->sq.lock); + + /* XXX check that state is OK to post send */ + + ind = qp->sq.head & (qp->sq.max - 1); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (nreq == MTHCA_ARBEL_MAX_WQES_PER_SEND_DB) { + nreq = 0; + + doorbell[0] = (MTHCA_ARBEL_MAX_WQES_PER_SEND_DB << 24) | + ((qp->sq.head & 0xffff) << 8) | f0 | op0; + doorbell[1] = (ibqp->qp_num << 8) | size0; + + qp->sq.head += MTHCA_ARBEL_MAX_WQES_PER_SEND_DB; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + udma_to_device_barrier(); + *qp->sq.db = htobe32(qp->sq.head & 0xffff); + + /* + * Make sure doorbell record is written before we + * write MMIO send doorbell. + */ + mmio_ordered_writes_hack(); + mthca_write64(doorbell, to_mctx(ibqp->context)->uar + + MTHCA_SEND_DOORBELL); + + size0 = 0; + } + + if (wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) { + ret = -1; + *bad_wr = wr; + goto out; + } + + wqe = get_send_wqe(qp, ind); + prev_wqe = qp->sq.last; + qp->sq.last = wqe; + + ((struct mthca_next_seg *) wqe)->flags = + ((wr->send_flags & IBV_SEND_SIGNALED) ? + htobe32(MTHCA_NEXT_CQ_UPDATE) : 0) | + ((wr->send_flags & IBV_SEND_SOLICITED) ? + htobe32(MTHCA_NEXT_SOLICIT) : 0) | + htobe32(1); + if (wr->opcode == IBV_WR_SEND_WITH_IMM || + wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM) + ((struct mthca_next_seg *) wqe)->imm = wr->imm_data; + + wqe += sizeof (struct mthca_next_seg); + size = sizeof (struct mthca_next_seg) / 16; + + switch (ibqp->qp_type) { + case IBV_QPT_RC: + switch (wr->opcode) { + case IBV_WR_ATOMIC_CMP_AND_SWP: + case IBV_WR_ATOMIC_FETCH_AND_ADD: + ((struct mthca_raddr_seg *) wqe)->raddr = + htobe64(wr->wr.atomic.remote_addr); + ((struct mthca_raddr_seg *) wqe)->rkey = + htobe32(wr->wr.atomic.rkey); + ((struct mthca_raddr_seg *) wqe)->reserved = 0; + + wqe += sizeof (struct mthca_raddr_seg); + + if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) { + ((struct mthca_atomic_seg *) wqe)->swap_add = + htobe64(wr->wr.atomic.swap); + ((struct mthca_atomic_seg *) wqe)->compare = + htobe64(wr->wr.atomic.compare_add); + } else { + ((struct mthca_atomic_seg *) wqe)->swap_add = + htobe64(wr->wr.atomic.compare_add); + ((struct mthca_atomic_seg *) wqe)->compare = 0; + } + + wqe += sizeof (struct mthca_atomic_seg); + size += (sizeof (struct mthca_raddr_seg) + + sizeof (struct mthca_atomic_seg)) / 16; + break; + + case IBV_WR_RDMA_WRITE: + case IBV_WR_RDMA_WRITE_WITH_IMM: + case IBV_WR_RDMA_READ: + ((struct mthca_raddr_seg *) wqe)->raddr = + htobe64(wr->wr.rdma.remote_addr); + ((struct mthca_raddr_seg *) wqe)->rkey = + htobe32(wr->wr.rdma.rkey); + ((struct mthca_raddr_seg *) wqe)->reserved = 0; + wqe += sizeof (struct mthca_raddr_seg); + size += sizeof (struct mthca_raddr_seg) / 16; + break; + + default: + /* No extra segments required for sends */ + break; + } + + break; + + case IBV_QPT_UC: + switch (wr->opcode) { + case IBV_WR_RDMA_WRITE: + case IBV_WR_RDMA_WRITE_WITH_IMM: + ((struct mthca_raddr_seg *) wqe)->raddr = + 
htobe64(wr->wr.rdma.remote_addr); + ((struct mthca_raddr_seg *) wqe)->rkey = + htobe32(wr->wr.rdma.rkey); + ((struct mthca_raddr_seg *) wqe)->reserved = 0; + wqe += sizeof (struct mthca_raddr_seg); + size += sizeof (struct mthca_raddr_seg) / 16; + break; + + default: + /* No extra segments required for sends */ + break; + } + + break; + + case IBV_QPT_UD: + memcpy(((struct mthca_arbel_ud_seg *) wqe)->av, + to_mah(wr->wr.ud.ah)->av, sizeof (struct mthca_av)); + ((struct mthca_arbel_ud_seg *) wqe)->dqpn = + htobe32(wr->wr.ud.remote_qpn); + ((struct mthca_arbel_ud_seg *) wqe)->qkey = + htobe32(wr->wr.ud.remote_qkey); + + wqe += sizeof (struct mthca_arbel_ud_seg); + size += sizeof (struct mthca_arbel_ud_seg) / 16; + break; + + default: + break; + } + + if (wr->num_sge > qp->sq.max_gs) { + ret = -1; + *bad_wr = wr; + goto out; + } + + if (wr->send_flags & IBV_SEND_INLINE) { + if (wr->num_sge) { + struct mthca_inline_seg *seg = wqe; + int s = 0; + + wqe += sizeof *seg; + for (i = 0; i < wr->num_sge; ++i) { + struct ibv_sge *sge = &wr->sg_list[i]; + + s += sge->length; + + if (s > qp->max_inline_data) { + ret = -1; + *bad_wr = wr; + goto out; + } + + memcpy(wqe, (void *) (uintptr_t) sge->addr, + sge->length); + wqe += sge->length; + } + + seg->byte_count = htobe32(MTHCA_INLINE_SEG | s); + size += align(s + sizeof *seg, 16) / 16; + } + } else { + struct mthca_data_seg *seg; + + for (i = 0; i < wr->num_sge; ++i) { + seg = wqe; + seg->byte_count = htobe32(wr->sg_list[i].length); + seg->lkey = htobe32(wr->sg_list[i].lkey); + seg->addr = htobe64(wr->sg_list[i].addr); + wqe += sizeof *seg; + } + + size += wr->num_sge * (sizeof *seg / 16); + } + + qp->wrid[ind + qp->rq.max] = wr->wr_id; + + if (wr->opcode >= sizeof mthca_opcode / sizeof mthca_opcode[0]) { + ret = -1; + *bad_wr = wr; + goto out; + } + + ((struct mthca_next_seg *) prev_wqe)->nda_op = + htobe32(((ind << qp->sq.wqe_shift) + + qp->send_wqe_offset) | + mthca_opcode[wr->opcode]); + udma_ordering_write_barrier(); + ((struct mthca_next_seg *) prev_wqe)->ee_nds = + htobe32(MTHCA_NEXT_DBD | size | + ((wr->send_flags & IBV_SEND_FENCE) ? + MTHCA_NEXT_FENCE : 0)); + + if (!size0) { + size0 = size; + op0 = mthca_opcode[wr->opcode]; + f0 = wr->send_flags & IBV_SEND_FENCE ? + MTHCA_SEND_DOORBELL_FENCE : 0; + } + + ++ind; + if (ind >= qp->sq.max) + ind -= qp->sq.max; + } + +out: + if (nreq) { + doorbell[0] = + (nreq << 24) | ((qp->sq.head & 0xffff) << 8) | f0 | op0; + doorbell[1] = (ibqp->qp_num << 8) | size0; + + qp->sq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + udma_to_device_barrier(); + *qp->sq.db = htobe32(qp->sq.head & 0xffff); + + /* + * Make sure doorbell record is written before we + * write MMIO send doorbell. 
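+	 * If the MMIO write overtook the doorbell record update, the
+	 * HCA could read a stale send queue head from memory.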
+ */ + mmio_ordered_writes_hack(); + mthca_write64(doorbell, to_mctx(ibqp->context)->uar + + MTHCA_SEND_DOORBELL); + } + + pthread_spin_unlock(&qp->sq.lock); + return ret; +} + +int mthca_arbel_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct mthca_qp *qp = to_mqp(ibqp); + int ret = 0; + int nreq; + int ind; + int i; + void *wqe; + + pthread_spin_lock(&qp->rq.lock); + + /* XXX check that state is OK to post receive */ + + ind = qp->rq.head & (qp->rq.max - 1); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (wq_overflow(&qp->rq, nreq, to_mcq(qp->ibv_qp.recv_cq))) { + ret = -1; + *bad_wr = wr; + goto out; + } + + wqe = get_recv_wqe(qp, ind); + + ((struct mthca_next_seg *) wqe)->flags = 0; + + wqe += sizeof (struct mthca_next_seg); + + if (wr->num_sge > qp->rq.max_gs) { + ret = -1; + *bad_wr = wr; + goto out; + } + + for (i = 0; i < wr->num_sge; ++i) { + ((struct mthca_data_seg *) wqe)->byte_count = + htobe32(wr->sg_list[i].length); + ((struct mthca_data_seg *) wqe)->lkey = + htobe32(wr->sg_list[i].lkey); + ((struct mthca_data_seg *) wqe)->addr = + htobe64(wr->sg_list[i].addr); + wqe += sizeof (struct mthca_data_seg); + } + + if (i < qp->rq.max_gs) { + ((struct mthca_data_seg *) wqe)->byte_count = 0; + ((struct mthca_data_seg *) wqe)->lkey = htobe32(MTHCA_INVAL_LKEY); + ((struct mthca_data_seg *) wqe)->addr = 0; + } + + qp->wrid[ind] = wr->wr_id; + + ++ind; + if (ind >= qp->rq.max) + ind -= qp->rq.max; + } +out: + if (nreq) { + qp->rq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + udma_to_device_barrier(); + *qp->rq.db = htobe32(qp->rq.head & 0xffff); + } + + pthread_spin_unlock(&qp->rq.lock); + return ret; +} + +int mthca_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap, + enum ibv_qp_type type, struct mthca_qp *qp) +{ + int size; + int max_sq_sge; + struct mthca_next_seg *next; + int i; + + qp->rq.max_gs = cap->max_recv_sge; + qp->sq.max_gs = cap->max_send_sge; + max_sq_sge = align(cap->max_inline_data + sizeof (struct mthca_inline_seg), + sizeof (struct mthca_data_seg)) / sizeof (struct mthca_data_seg); + if (max_sq_sge < cap->max_send_sge) + max_sq_sge = cap->max_send_sge; + + qp->wrid = malloc((qp->rq.max + qp->sq.max) * sizeof (uint64_t)); + if (!qp->wrid) + return -1; + + size = sizeof (struct mthca_next_seg) + + qp->rq.max_gs * sizeof (struct mthca_data_seg); + + for (qp->rq.wqe_shift = 6; 1 << qp->rq.wqe_shift < size; + qp->rq.wqe_shift++) + ; /* nothing */ + + size = max_sq_sge * sizeof (struct mthca_data_seg); + switch (type) { + case IBV_QPT_UD: + size += mthca_is_memfree(pd->context) ? + sizeof (struct mthca_arbel_ud_seg) : + sizeof (struct mthca_tavor_ud_seg); + break; + + case IBV_QPT_UC: + size += sizeof (struct mthca_raddr_seg); + break; + + case IBV_QPT_RC: + size += sizeof (struct mthca_raddr_seg); + /* + * An atomic op will require an atomic segment, a + * remote address segment and one scatter entry. 
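+		 * Reserve room for that worst case so any RC opcode
+		 * fits in a single send WQE.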
+ */ + if (size < (sizeof (struct mthca_atomic_seg) + + sizeof (struct mthca_raddr_seg) + + sizeof (struct mthca_data_seg))) + size = (sizeof (struct mthca_atomic_seg) + + sizeof (struct mthca_raddr_seg) + + sizeof (struct mthca_data_seg)); + break; + + default: + break; + } + + /* Make sure that we have enough space for a bind request */ + if (size < sizeof (struct mthca_bind_seg)) + size = sizeof (struct mthca_bind_seg); + + size += sizeof (struct mthca_next_seg); + + for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size; + qp->sq.wqe_shift++) + ; /* nothing */ + + qp->send_wqe_offset = align(qp->rq.max << qp->rq.wqe_shift, + 1 << qp->sq.wqe_shift); + + qp->buf_size = qp->send_wqe_offset + (qp->sq.max << qp->sq.wqe_shift); + + if (mthca_alloc_buf(&qp->buf, + align(qp->buf_size, to_mdev(pd->context->device)->page_size), + to_mdev(pd->context->device)->page_size)) { + free(qp->wrid); + return -1; + } + + memset(qp->buf.buf, 0, qp->buf_size); + + if (mthca_is_memfree(pd->context)) { + struct mthca_data_seg *scatter; + __be32 sz; + + sz = htobe32((sizeof (struct mthca_next_seg) + + qp->rq.max_gs * sizeof (struct mthca_data_seg)) / 16); + + for (i = 0; i < qp->rq.max; ++i) { + next = get_recv_wqe(qp, i); + next->nda_op = htobe32(((i + 1) & (qp->rq.max - 1)) << + qp->rq.wqe_shift); + next->ee_nds = sz; + + for (scatter = (void *) (next + 1); + (void *) scatter < (void *) next + (1 << qp->rq.wqe_shift); + ++scatter) + scatter->lkey = htobe32(MTHCA_INVAL_LKEY); + } + + for (i = 0; i < qp->sq.max; ++i) { + next = get_send_wqe(qp, i); + next->nda_op = htobe32((((i + 1) & (qp->sq.max - 1)) << + qp->sq.wqe_shift) + + qp->send_wqe_offset); + } + } else { + for (i = 0; i < qp->rq.max; ++i) { + next = get_recv_wqe(qp, i); + next->nda_op = htobe32((((i + 1) % qp->rq.max) << + qp->rq.wqe_shift) | 1); + } + } + + qp->sq.last = get_send_wqe(qp, qp->sq.max - 1); + qp->rq.last = get_recv_wqe(qp, qp->rq.max - 1); + + return 0; +} + +struct mthca_qp *mthca_find_qp(struct mthca_context *ctx, uint32_t qpn) +{ + int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift; + + if (ctx->qp_table[tind].refcnt) + return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask]; + else + return NULL; +} + +int mthca_store_qp(struct mthca_context *ctx, uint32_t qpn, struct mthca_qp *qp) +{ + int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift; + + if (!ctx->qp_table[tind].refcnt) { + ctx->qp_table[tind].table = calloc(ctx->qp_table_mask + 1, + sizeof (struct mthca_qp *)); + if (!ctx->qp_table[tind].table) + return -1; + } + + ++ctx->qp_table[tind].refcnt; + ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = qp; + return 0; +} + +void mthca_clear_qp(struct mthca_context *ctx, uint32_t qpn) +{ + int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift; + + if (!--ctx->qp_table[tind].refcnt) + free(ctx->qp_table[tind].table); + else + ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL; +} + +int mthca_free_err_wqe(struct mthca_qp *qp, int is_send, + int index, int *dbd, __be32 *new_wqe) +{ + struct mthca_next_seg *next; + + /* + * For SRQs, all receive WQEs generate a CQE, so we're always + * at the end of the doorbell chain. 
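+	 * There is no next WQE to chain to, so new_wqe is simply
+	 * cleared.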
+ */ + if (qp->ibv_qp.srq && !is_send) { + *new_wqe = 0; + return 0; + } + + if (is_send) + next = get_send_wqe(qp, index); + else + next = get_recv_wqe(qp, index); + + *dbd = !!(next->ee_nds & htobe32(MTHCA_NEXT_DBD)); + if (next->ee_nds & htobe32(0x3f)) + *new_wqe = (next->nda_op & htobe32(~0x3f)) | + (next->ee_nds & htobe32(0x3f)); + else + *new_wqe = 0; + + return 0; +} + diff --git a/providers/mthca/srq.c b/providers/mthca/srq.c new file mode 100644 index 0000000..ad68961 --- /dev/null +++ b/providers/mthca/srq.c @@ -0,0 +1,312 @@ +/* + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <config.h> + +#include <endian.h> +#include <stdlib.h> +#include <pthread.h> +#include <string.h> + +#include "mthca.h" +#include "doorbell.h" +#include "wqe.h" + +static void *get_wqe(struct mthca_srq *srq, int n) +{ + return srq->buf.buf + (n << srq->wqe_shift); +} + +/* + * Return a pointer to the location within a WQE that we're using as a + * link when the WQE is in the free list. We use the imm field at an + * offset of 12 bytes because in the Tavor case, posting a WQE may + * overwrite the next segment of the previous WQE, but a receive WQE + * will never touch the imm field. This avoids corrupting our free + * list if the previous WQE has already completed and been put on the + * free list when we post the next WQE. 
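+ * Links are stored as WQE indices; -1 marks the tail of the free
+ * list.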
+ */ +static inline int *wqe_to_link(void *wqe) +{ + return (int *) (wqe + 12); +} + +void mthca_free_srq_wqe(struct mthca_srq *srq, int ind) +{ + struct mthca_next_seg *last_free; + + pthread_spin_lock(&srq->lock); + + last_free = get_wqe(srq, srq->last_free); + *wqe_to_link(last_free) = ind; + last_free->nda_op = htobe32((ind << srq->wqe_shift) | 1); + *wqe_to_link(get_wqe(srq, ind)) = -1; + srq->last_free = ind; + + pthread_spin_unlock(&srq->lock); +} + +int mthca_tavor_post_srq_recv(struct ibv_srq *ibsrq, + struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct mthca_srq *srq = to_msrq(ibsrq); + uint32_t doorbell[2]; + int err = 0; + int first_ind; + int ind; + int next_ind; + int nreq; + int i; + void *wqe; + void *prev_wqe; + + pthread_spin_lock(&srq->lock); + + first_ind = srq->first_free; + + for (nreq = 0; wr; wr = wr->next) { + ind = srq->first_free; + wqe = get_wqe(srq, ind); + next_ind = *wqe_to_link(wqe); + + if (next_ind < 0) { + err = -1; + *bad_wr = wr; + break; + } + + prev_wqe = srq->last; + srq->last = wqe; + + ((struct mthca_next_seg *) wqe)->ee_nds = 0; + /* flags field will always remain 0 */ + + wqe += sizeof (struct mthca_next_seg); + + if (wr->num_sge > srq->max_gs) { + err = -1; + *bad_wr = wr; + srq->last = prev_wqe; + break; + } + + for (i = 0; i < wr->num_sge; ++i) { + ((struct mthca_data_seg *) wqe)->byte_count = + htobe32(wr->sg_list[i].length); + ((struct mthca_data_seg *) wqe)->lkey = + htobe32(wr->sg_list[i].lkey); + ((struct mthca_data_seg *) wqe)->addr = + htobe64(wr->sg_list[i].addr); + wqe += sizeof (struct mthca_data_seg); + } + + if (i < srq->max_gs) { + ((struct mthca_data_seg *) wqe)->byte_count = 0; + ((struct mthca_data_seg *) wqe)->lkey = htobe32(MTHCA_INVAL_LKEY); + ((struct mthca_data_seg *) wqe)->addr = 0; + } + + ((struct mthca_next_seg *) prev_wqe)->ee_nds = + htobe32(MTHCA_NEXT_DBD); + + srq->wrid[ind] = wr->wr_id; + srq->first_free = next_ind; + + if (++nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB) { + nreq = 0; + + doorbell[0] = first_ind << srq->wqe_shift; + doorbell[1] = srq->srqn << 8; + + /* + * Make sure that descriptors are written + * before doorbell is rung. + */ + udma_to_device_barrier(); + + mthca_write64(doorbell, to_mctx(ibsrq->context)->uar + + MTHCA_RECV_DOORBELL); + + first_ind = srq->first_free; + } + } + + if (nreq) { + doorbell[0] = first_ind << srq->wqe_shift; + doorbell[1] = (srq->srqn << 8) | nreq; + + /* + * Make sure that descriptors are written before + * doorbell is rung. 
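+		 * (The loop above also rings a doorbell every
+		 * MTHCA_TAVOR_MAX_WQES_PER_RECV_DB requests, since a
+		 * single Tavor doorbell covers only a limited number
+		 * of WQEs.)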
+ */ + udma_to_device_barrier(); + + mthca_write64(doorbell, to_mctx(ibsrq->context)->uar + + MTHCA_RECV_DOORBELL); + } + + pthread_spin_unlock(&srq->lock); + return err; +} + +int mthca_arbel_post_srq_recv(struct ibv_srq *ibsrq, + struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct mthca_srq *srq = to_msrq(ibsrq); + int err = 0; + int ind; + int next_ind; + int nreq; + int i; + void *wqe; + + pthread_spin_lock(&srq->lock); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + ind = srq->first_free; + wqe = get_wqe(srq, ind); + next_ind = *wqe_to_link(wqe); + + if (next_ind < 0) { + err = -1; + *bad_wr = wr; + break; + } + + ((struct mthca_next_seg *) wqe)->ee_nds = 0; + /* flags field will always remain 0 */ + + wqe += sizeof (struct mthca_next_seg); + + if (wr->num_sge > srq->max_gs) { + err = -1; + *bad_wr = wr; + break; + } + + for (i = 0; i < wr->num_sge; ++i) { + ((struct mthca_data_seg *) wqe)->byte_count = + htobe32(wr->sg_list[i].length); + ((struct mthca_data_seg *) wqe)->lkey = + htobe32(wr->sg_list[i].lkey); + ((struct mthca_data_seg *) wqe)->addr = + htobe64(wr->sg_list[i].addr); + wqe += sizeof (struct mthca_data_seg); + } + + if (i < srq->max_gs) { + ((struct mthca_data_seg *) wqe)->byte_count = 0; + ((struct mthca_data_seg *) wqe)->lkey = htobe32(MTHCA_INVAL_LKEY); + ((struct mthca_data_seg *) wqe)->addr = 0; + } + + srq->wrid[ind] = wr->wr_id; + srq->first_free = next_ind; + } + + if (nreq) { + srq->counter += nreq; + + /* + * Make sure that descriptors are written before + * we write doorbell record. + */ + udma_ordering_write_barrier(); + *srq->db = htobe32(srq->counter); + } + + pthread_spin_unlock(&srq->lock); + return err; +} + +int mthca_alloc_srq_buf(struct ibv_pd *pd, struct ibv_srq_attr *attr, + struct mthca_srq *srq) +{ + struct mthca_data_seg *scatter; + void *wqe; + int size; + int i; + + srq->wrid = malloc(srq->max * sizeof (uint64_t)); + if (!srq->wrid) + return -1; + + size = sizeof (struct mthca_next_seg) + + srq->max_gs * sizeof (struct mthca_data_seg); + + for (srq->wqe_shift = 6; 1 << srq->wqe_shift < size; ++srq->wqe_shift) + ; /* nothing */ + + srq->buf_size = srq->max << srq->wqe_shift; + + if (mthca_alloc_buf(&srq->buf, + align(srq->buf_size, to_mdev(pd->context->device)->page_size), + to_mdev(pd->context->device)->page_size)) { + free(srq->wrid); + return -1; + } + + memset(srq->buf.buf, 0, srq->buf_size); + + /* + * Now initialize the SRQ buffer so that all of the WQEs are + * linked into the list of free WQEs. In addition, set the + * scatter list L_Keys to the sentry value of 0x100. + */ + + for (i = 0; i < srq->max; ++i) { + struct mthca_next_seg *next; + + next = wqe = get_wqe(srq, i); + + if (i < srq->max - 1) { + *wqe_to_link(wqe) = i + 1; + next->nda_op = htobe32(((i + 1) << srq->wqe_shift) | 1); + } else { + *wqe_to_link(wqe) = -1; + next->nda_op = 0; + } + + for (scatter = wqe + sizeof (struct mthca_next_seg); + (void *) scatter < wqe + (1 << srq->wqe_shift); + ++scatter) + scatter->lkey = htobe32(MTHCA_INVAL_LKEY); + } + + srq->first_free = 0; + srq->last_free = srq->max - 1; + srq->last = get_wqe(srq, srq->max - 1); + + return 0; +} diff --git a/providers/mthca/verbs.c b/providers/mthca/verbs.c new file mode 100644 index 0000000..99e5ec6 --- /dev/null +++ b/providers/mthca/verbs.c @@ -0,0 +1,731 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <config.h> + +#include <stdlib.h> +#include <stdio.h> +#include <strings.h> +#include <pthread.h> +#include <errno.h> + +#include "mthca.h" +#include "mthca-abi.h" + +int mthca_query_device(struct ibv_context *context, struct ibv_device_attr *attr) +{ + struct ibv_query_device cmd; + uint64_t raw_fw_ver; + unsigned major, minor, sub_minor; + int ret; + + ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, &cmd, sizeof cmd); + if (ret) + return ret; + + major = (raw_fw_ver >> 32) & 0xffff; + minor = (raw_fw_ver >> 16) & 0xffff; + sub_minor = raw_fw_ver & 0xffff; + + snprintf(attr->fw_ver, sizeof attr->fw_ver, + "%d.%d.%d", major, minor, sub_minor); + + return 0; +} + +int mthca_query_port(struct ibv_context *context, uint8_t port, + struct ibv_port_attr *attr) +{ + struct ibv_query_port cmd; + + return ibv_cmd_query_port(context, port, attr, &cmd, sizeof cmd); +} + +struct ibv_pd *mthca_alloc_pd(struct ibv_context *context) +{ + struct ibv_alloc_pd cmd; + struct umthca_alloc_pd_resp resp; + struct mthca_pd *pd; + + pd = malloc(sizeof *pd); + if (!pd) + return NULL; + + if (!mthca_is_memfree(context)) { + pd->ah_list = NULL; + if (pthread_mutex_init(&pd->ah_mutex, NULL)) { + free(pd); + return NULL; + } + } + + if (ibv_cmd_alloc_pd(context, &pd->ibv_pd, &cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp)) { + free(pd); + return NULL; + } + + pd->pdn = resp.pdn; + + return &pd->ibv_pd; +} + +int mthca_free_pd(struct ibv_pd *pd) +{ + int ret; + + ret = ibv_cmd_dealloc_pd(pd); + if (ret) + return ret; + + free(to_mpd(pd)); + return 0; +} + +static struct ibv_mr *__mthca_reg_mr(struct ibv_pd *pd, void *addr, + size_t length, uint64_t hca_va, + int access, + int dma_sync) +{ + struct verbs_mr *vmr; + struct umthca_reg_mr cmd; + struct ib_uverbs_reg_mr_resp resp; + int ret; + + /* + * Old kernels just ignore the extra data we pass in with the + * reg_mr command structure, so there's no need to add an ABI + * version check here (and indeed the kernel ABI was not + * incremented due to this change). + */ + cmd.mr_attrs = dma_sync ? 
MTHCA_MR_DMASYNC : 0; + cmd.reserved = 0; + + vmr = malloc(sizeof(*vmr)); + if (!vmr) + return NULL; + + ret = ibv_cmd_reg_mr(pd, addr, length, hca_va, access, vmr, + &cmd.ibv_cmd, sizeof(cmd), &resp, sizeof(resp)); + if (ret) { + free(vmr); + return NULL; + } + + return &vmr->ibv_mr; +} + +struct ibv_mr *mthca_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + uint64_t hca_va, int access) +{ + return __mthca_reg_mr(pd, addr, length, hca_va, access, 0); +} + +int mthca_dereg_mr(struct verbs_mr *vmr) +{ + int ret; + + ret = ibv_cmd_dereg_mr(vmr); + if (ret) + return ret; + + free(vmr); + return 0; +} + +static int align_cq_size(int cqe) +{ + int nent; + + for (nent = 1; nent <= cqe; nent <<= 1) + ; /* nothing */ + + return nent; +} + +struct ibv_cq *mthca_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector) +{ + struct umthca_create_cq cmd; + struct umthca_create_cq_resp resp; + struct mthca_cq *cq; + int ret; + + /* Sanity check CQ size before proceeding */ + if (cqe > 131072) + return NULL; + + cq = malloc(sizeof *cq); + if (!cq) + return NULL; + + cq->cons_index = 0; + + if (pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE)) + goto err; + + cqe = align_cq_size(cqe); + if (mthca_alloc_cq_buf(to_mdev(context->device), &cq->buf, cqe)) + goto err; + + cq->mr = __mthca_reg_mr(to_mctx(context)->pd, cq->buf.buf, + cqe * MTHCA_CQ_ENTRY_SIZE, + 0, IBV_ACCESS_LOCAL_WRITE, 1); + if (!cq->mr) + goto err_buf; + + cq->mr->context = context; + + if (mthca_is_memfree(context)) { + cq->arm_sn = 1; + cq->set_ci_db_index = mthca_alloc_db(to_mctx(context)->db_tab, + MTHCA_DB_TYPE_CQ_SET_CI, + &cq->set_ci_db); + if (cq->set_ci_db_index < 0) + goto err_unreg; + + cq->arm_db_index = mthca_alloc_db(to_mctx(context)->db_tab, + MTHCA_DB_TYPE_CQ_ARM, + &cq->arm_db); + if (cq->arm_db_index < 0) + goto err_set_db; + + cmd.arm_db_page = db_align(cq->arm_db); + cmd.set_db_page = db_align(cq->set_ci_db); + cmd.arm_db_index = cq->arm_db_index; + cmd.set_db_index = cq->set_ci_db_index; + } else { + cmd.arm_db_page = cmd.set_db_page = + cmd.arm_db_index = cmd.set_db_index = 0; + } + + cmd.lkey = cq->mr->lkey; + cmd.pdn = to_mpd(to_mctx(context)->pd)->pdn; + ret = ibv_cmd_create_cq(context, cqe - 1, channel, comp_vector, + &cq->ibv_cq, &cmd.ibv_cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp); + if (ret) + goto err_arm_db; + + cq->cqn = resp.cqn; + + if (mthca_is_memfree(context)) { + mthca_set_db_qn(cq->set_ci_db, MTHCA_DB_TYPE_CQ_SET_CI, cq->cqn); + mthca_set_db_qn(cq->arm_db, MTHCA_DB_TYPE_CQ_ARM, cq->cqn); + } + + return &cq->ibv_cq; + +err_arm_db: + if (mthca_is_memfree(context)) + mthca_free_db(to_mctx(context)->db_tab, MTHCA_DB_TYPE_CQ_ARM, + cq->arm_db_index); + +err_set_db: + if (mthca_is_memfree(context)) + mthca_free_db(to_mctx(context)->db_tab, MTHCA_DB_TYPE_CQ_SET_CI, + cq->set_ci_db_index); + +err_unreg: + mthca_dereg_mr(verbs_get_mr(cq->mr)); + +err_buf: + mthca_free_buf(&cq->buf); + +err: + free(cq); + + return NULL; +} + +int mthca_resize_cq(struct ibv_cq *ibcq, int cqe) +{ + struct mthca_cq *cq = to_mcq(ibcq); + struct umthca_resize_cq cmd; + struct ibv_mr *mr; + struct mthca_buf buf; + struct ib_uverbs_resize_cq_resp resp; + int old_cqe; + int ret; + + /* Sanity check CQ size before proceeding */ + if (cqe > 131072) + return EINVAL; + + pthread_spin_lock(&cq->lock); + + cqe = align_cq_size(cqe); + if (cqe == ibcq->cqe + 1) { + ret = 0; + goto out; + } + + ret = mthca_alloc_cq_buf(to_mdev(ibcq->context->device), &buf, cqe); + if (ret) + goto out; + 
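+	/*
+	 * Sketch of the sequence below (illustrative): the new buffer
+	 * is registered before the resize command is issued, and the
+	 * old buffer is torn down only after the unpolled CQEs have
+	 * been copied across:
+	 *
+	 *   alloc new buf -> reg_mr(new) -> ibv_cmd_resize_cq()
+	 *     -> copy old CQEs -> dereg_mr(old) -> free old buf
+	 *
+	 * so a failure at any step leaves the original CQ intact.
+	 */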
+ mr = __mthca_reg_mr(to_mctx(ibcq->context)->pd, buf.buf, + cqe * MTHCA_CQ_ENTRY_SIZE, + 0, IBV_ACCESS_LOCAL_WRITE, 1); + if (!mr) { + mthca_free_buf(&buf); + ret = ENOMEM; + goto out; + } + + mr->context = ibcq->context; + + old_cqe = ibcq->cqe; + + cmd.lkey = mr->lkey; + ret = ibv_cmd_resize_cq(ibcq, cqe - 1, &cmd.ibv_cmd, sizeof cmd, &resp, + sizeof resp); + if (ret) { + mthca_dereg_mr(verbs_get_mr(mr)); + mthca_free_buf(&buf); + goto out; + } + + mthca_cq_resize_copy_cqes(cq, buf.buf, old_cqe); + + mthca_dereg_mr(verbs_get_mr(cq->mr)); + mthca_free_buf(&cq->buf); + + cq->buf = buf; + cq->mr = mr; + +out: + pthread_spin_unlock(&cq->lock); + return ret; +} + +int mthca_destroy_cq(struct ibv_cq *cq) +{ + int ret; + + ret = ibv_cmd_destroy_cq(cq); + if (ret) + return ret; + + if (mthca_is_memfree(cq->context)) { + mthca_free_db(to_mctx(cq->context)->db_tab, MTHCA_DB_TYPE_CQ_SET_CI, + to_mcq(cq)->set_ci_db_index); + mthca_free_db(to_mctx(cq->context)->db_tab, MTHCA_DB_TYPE_CQ_ARM, + to_mcq(cq)->arm_db_index); + } + + mthca_dereg_mr(verbs_get_mr(to_mcq(cq)->mr)); + mthca_free_buf(&to_mcq(cq)->buf); + free(to_mcq(cq)); + + return 0; +} + +static int align_queue_size(struct ibv_context *context, int size, int spare) +{ + int ret; + + /* + * If someone asks for a 0-sized queue, presumably they're not + * going to use it. So don't mess with their size. + */ + if (!size) + return 0; + + if (mthca_is_memfree(context)) { + for (ret = 1; ret < size + spare; ret <<= 1) + ; /* nothing */ + + return ret; + } else + return size + spare; +} + +struct ibv_srq *mthca_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *attr) +{ + struct umthca_create_srq cmd; + struct umthca_create_srq_resp resp; + struct mthca_srq *srq; + int ret; + + /* Sanity check SRQ size before proceeding */ + if (attr->attr.max_wr > 1 << 16 || attr->attr.max_sge > 64) + return NULL; + + srq = malloc(sizeof *srq); + if (!srq) + return NULL; + + if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE)) + goto err; + + srq->max = align_queue_size(pd->context, attr->attr.max_wr, 1); + srq->max_gs = attr->attr.max_sge; + srq->counter = 0; + + if (mthca_alloc_srq_buf(pd, &attr->attr, srq)) + goto err; + + srq->mr = __mthca_reg_mr(pd, srq->buf.buf, srq->buf_size, 0, 0, 0); + if (!srq->mr) + goto err_free; + + srq->mr->context = pd->context; + + if (mthca_is_memfree(pd->context)) { + srq->db_index = mthca_alloc_db(to_mctx(pd->context)->db_tab, + MTHCA_DB_TYPE_SRQ, &srq->db); + if (srq->db_index < 0) + goto err_unreg; + + cmd.db_page = db_align(srq->db); + cmd.db_index = srq->db_index; + } else { + cmd.db_page = cmd.db_index = 0; + } + + cmd.lkey = srq->mr->lkey; + + ret = ibv_cmd_create_srq(pd, &srq->ibv_srq, attr, + &cmd.ibv_cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp); + if (ret) + goto err_db; + + srq->srqn = resp.srqn; + + if (mthca_is_memfree(pd->context)) + mthca_set_db_qn(srq->db, MTHCA_DB_TYPE_SRQ, srq->srqn); + + return &srq->ibv_srq; + +err_db: + if (mthca_is_memfree(pd->context)) + mthca_free_db(to_mctx(pd->context)->db_tab, MTHCA_DB_TYPE_SRQ, + srq->db_index); + +err_unreg: + mthca_dereg_mr(verbs_get_mr(srq->mr)); + +err_free: + free(srq->wrid); + mthca_free_buf(&srq->buf); + +err: + free(srq); + + return NULL; +} + +int mthca_modify_srq(struct ibv_srq *srq, + struct ibv_srq_attr *attr, + int attr_mask) +{ + struct ibv_modify_srq cmd; + + return ibv_cmd_modify_srq(srq, attr, attr_mask, &cmd, sizeof cmd); +} + +int mthca_query_srq(struct ibv_srq *srq, + struct ibv_srq_attr *attr) +{ + struct ibv_query_srq cmd; + + 
return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd); +} + +int mthca_destroy_srq(struct ibv_srq *srq) +{ + int ret; + + ret = ibv_cmd_destroy_srq(srq); + if (ret) + return ret; + + if (mthca_is_memfree(srq->context)) + mthca_free_db(to_mctx(srq->context)->db_tab, MTHCA_DB_TYPE_SRQ, + to_msrq(srq)->db_index); + + mthca_dereg_mr(verbs_get_mr(to_msrq(srq)->mr)); + + mthca_free_buf(&to_msrq(srq)->buf); + free(to_msrq(srq)->wrid); + free(to_msrq(srq)); + + return 0; +} + +struct ibv_qp *mthca_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr) +{ + struct umthca_create_qp cmd; + struct ib_uverbs_create_qp_resp resp; + struct mthca_qp *qp; + int ret; + + /* Sanity check QP size before proceeding */ + if (attr->cap.max_send_wr > 65536 || + attr->cap.max_recv_wr > 65536 || + attr->cap.max_send_sge > 64 || + attr->cap.max_recv_sge > 64 || + attr->cap.max_inline_data > 1024) + return NULL; + + qp = malloc(sizeof *qp); + if (!qp) + return NULL; + + qp->sq.max = align_queue_size(pd->context, attr->cap.max_send_wr, 0); + qp->rq.max = align_queue_size(pd->context, attr->cap.max_recv_wr, 0); + + if (mthca_alloc_qp_buf(pd, &attr->cap, attr->qp_type, qp)) + goto err; + + mthca_init_qp_indices(qp); + + if (pthread_spin_init(&qp->sq.lock, PTHREAD_PROCESS_PRIVATE) || + pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE)) + goto err_free; + + qp->mr = __mthca_reg_mr(pd, qp->buf.buf, qp->buf_size, 0, 0, 0); + if (!qp->mr) + goto err_free; + + qp->mr->context = pd->context; + + cmd.lkey = qp->mr->lkey; + cmd.reserved = 0; + + if (mthca_is_memfree(pd->context)) { + qp->sq.db_index = mthca_alloc_db(to_mctx(pd->context)->db_tab, + MTHCA_DB_TYPE_SQ, + &qp->sq.db); + if (qp->sq.db_index < 0) + goto err_unreg; + + qp->rq.db_index = mthca_alloc_db(to_mctx(pd->context)->db_tab, + MTHCA_DB_TYPE_RQ, + &qp->rq.db); + if (qp->rq.db_index < 0) + goto err_sq_db; + + cmd.sq_db_page = db_align(qp->sq.db); + cmd.rq_db_page = db_align(qp->rq.db); + cmd.sq_db_index = qp->sq.db_index; + cmd.rq_db_index = qp->rq.db_index; + } else { + cmd.sq_db_page = cmd.rq_db_page = + cmd.sq_db_index = cmd.rq_db_index = 0; + } + + pthread_mutex_lock(&to_mctx(pd->context)->qp_table_mutex); + ret = ibv_cmd_create_qp(pd, &qp->ibv_qp, attr, &cmd.ibv_cmd, sizeof cmd, + &resp, sizeof resp); + if (ret) + goto err_rq_db; + + if (mthca_is_memfree(pd->context)) { + mthca_set_db_qn(qp->sq.db, MTHCA_DB_TYPE_SQ, qp->ibv_qp.qp_num); + mthca_set_db_qn(qp->rq.db, MTHCA_DB_TYPE_RQ, qp->ibv_qp.qp_num); + } + + ret = mthca_store_qp(to_mctx(pd->context), qp->ibv_qp.qp_num, qp); + if (ret) + goto err_destroy; + pthread_mutex_unlock(&to_mctx(pd->context)->qp_table_mutex); + + qp->sq.max = attr->cap.max_send_wr; + qp->rq.max = attr->cap.max_recv_wr; + qp->sq.max_gs = attr->cap.max_send_sge; + qp->rq.max_gs = attr->cap.max_recv_sge; + qp->max_inline_data = attr->cap.max_inline_data; + + return &qp->ibv_qp; + +err_destroy: + ibv_cmd_destroy_qp(&qp->ibv_qp); + +err_rq_db: + pthread_mutex_unlock(&to_mctx(pd->context)->qp_table_mutex); + if (mthca_is_memfree(pd->context)) + mthca_free_db(to_mctx(pd->context)->db_tab, MTHCA_DB_TYPE_RQ, + qp->rq.db_index); + +err_sq_db: + if (mthca_is_memfree(pd->context)) + mthca_free_db(to_mctx(pd->context)->db_tab, MTHCA_DB_TYPE_SQ, + qp->sq.db_index); + +err_unreg: + mthca_dereg_mr(verbs_get_mr(qp->mr)); + +err_free: + free(qp->wrid); + mthca_free_buf(&qp->buf); + +err: + free(qp); + + return NULL; +} + +int mthca_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr) +{ 
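+	/* Thin pass-through: the kernel fills attr and init_attr from
+	 * its QP context. */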
+ struct ibv_query_qp cmd; + + return ibv_cmd_query_qp(qp, attr, attr_mask, init_attr, &cmd, sizeof cmd); +} + +int mthca_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask) +{ + struct ibv_modify_qp cmd = {}; + int ret; + + ret = ibv_cmd_modify_qp(qp, attr, attr_mask, &cmd, sizeof cmd); + + if (!ret && + (attr_mask & IBV_QP_STATE) && + attr->qp_state == IBV_QPS_RESET) { + mthca_cq_clean(to_mcq(qp->recv_cq), qp->qp_num, + qp->srq ? to_msrq(qp->srq) : NULL); + if (qp->send_cq != qp->recv_cq) + mthca_cq_clean(to_mcq(qp->send_cq), qp->qp_num, NULL); + + mthca_init_qp_indices(to_mqp(qp)); + + if (mthca_is_memfree(qp->context)) { + *to_mqp(qp)->sq.db = 0; + *to_mqp(qp)->rq.db = 0; + } + } + + return ret; +} + +static void mthca_lock_cqs(struct ibv_qp *qp) +{ + struct mthca_cq *send_cq = to_mcq(qp->send_cq); + struct mthca_cq *recv_cq = to_mcq(qp->recv_cq); + + if (send_cq == recv_cq) + pthread_spin_lock(&send_cq->lock); + else if (send_cq->cqn < recv_cq->cqn) { + pthread_spin_lock(&send_cq->lock); + pthread_spin_lock(&recv_cq->lock); + } else { + pthread_spin_lock(&recv_cq->lock); + pthread_spin_lock(&send_cq->lock); + } +} + +static void mthca_unlock_cqs(struct ibv_qp *qp) +{ + struct mthca_cq *send_cq = to_mcq(qp->send_cq); + struct mthca_cq *recv_cq = to_mcq(qp->recv_cq); + + if (send_cq == recv_cq) + pthread_spin_unlock(&send_cq->lock); + else if (send_cq->cqn < recv_cq->cqn) { + pthread_spin_unlock(&recv_cq->lock); + pthread_spin_unlock(&send_cq->lock); + } else { + pthread_spin_unlock(&send_cq->lock); + pthread_spin_unlock(&recv_cq->lock); + } +} + +int mthca_destroy_qp(struct ibv_qp *qp) +{ + int ret; + + pthread_mutex_lock(&to_mctx(qp->context)->qp_table_mutex); + ret = ibv_cmd_destroy_qp(qp); + if (ret) { + pthread_mutex_unlock(&to_mctx(qp->context)->qp_table_mutex); + return ret; + } + + mthca_lock_cqs(qp); + + __mthca_cq_clean(to_mcq(qp->recv_cq), qp->qp_num, + qp->srq ? to_msrq(qp->srq) : NULL); + if (qp->send_cq != qp->recv_cq) + __mthca_cq_clean(to_mcq(qp->send_cq), qp->qp_num, NULL); + + mthca_clear_qp(to_mctx(qp->context), qp->qp_num); + + mthca_unlock_cqs(qp); + pthread_mutex_unlock(&to_mctx(qp->context)->qp_table_mutex); + + if (mthca_is_memfree(qp->context)) { + mthca_free_db(to_mctx(qp->context)->db_tab, MTHCA_DB_TYPE_RQ, + to_mqp(qp)->rq.db_index); + mthca_free_db(to_mctx(qp->context)->db_tab, MTHCA_DB_TYPE_SQ, + to_mqp(qp)->sq.db_index); + } + + mthca_dereg_mr(verbs_get_mr(to_mqp(qp)->mr)); + mthca_free_buf(&to_mqp(qp)->buf); + free(to_mqp(qp)->wrid); + free(to_mqp(qp)); + + return 0; +} + +struct ibv_ah *mthca_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) +{ + struct mthca_ah *ah; + + ah = malloc(sizeof *ah); + if (!ah) + return NULL; + + if (mthca_alloc_av(to_mpd(pd), attr, ah)) { + free(ah); + return NULL; + } + + return &ah->ibv_ah; +} + +int mthca_destroy_ah(struct ibv_ah *ah) +{ + mthca_free_av(to_mah(ah)); + free(to_mah(ah)); + + return 0; +} diff --git a/providers/mthca/wqe.h b/providers/mthca/wqe.h new file mode 100644 index 0000000..e56ed9c --- /dev/null +++ b/providers/mthca/wqe.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef WQE_H +#define WQE_H + +#include <stdint.h> + +enum { + MTHCA_SEND_DOORBELL = 0x10, + MTHCA_RECV_DOORBELL = 0x18 +}; + +enum { + MTHCA_NEXT_DBD = 1 << 7, + MTHCA_NEXT_FENCE = 1 << 6, + MTHCA_NEXT_CQ_UPDATE = 1 << 3, + MTHCA_NEXT_EVENT_GEN = 1 << 2, + MTHCA_NEXT_SOLICIT = 1 << 1, +}; + +enum { + MTHCA_INLINE_SEG = 1 << 31 +}; + +enum { + MTHCA_INVAL_LKEY = 0x100, + MTHCA_TAVOR_MAX_WQES_PER_RECV_DB = 256, + MTHCA_ARBEL_MAX_WQES_PER_SEND_DB = 255 +}; + +struct mthca_next_seg { + __be32 nda_op; /* [31:6] next WQE [4:0] next opcode */ + __be32 ee_nds; /* [31:8] next EE [7] DBD [6] F [5:0] next WQE size */ + __be32 flags; /* [3] CQ [2] Event [1] Solicit */ + __be32 imm; /* immediate data */ +}; + +struct mthca_tavor_ud_seg { + __be32 reserved1; + __be32 lkey; + __be64 av_addr; + __be32 reserved2[4]; + __be32 dqpn; + __be32 qkey; + __be32 reserved3[2]; +}; + +struct mthca_arbel_ud_seg { + __be32 av[8]; + __be32 dqpn; + __be32 qkey; + __be32 reserved[2]; +}; + +struct mthca_bind_seg { + __be32 flags; /* [31] Atomic [30] rem write [29] rem read */ + __be32 reserved; + __be32 new_rkey; + __be32 lkey; + __be64 addr; + __be64 length; +}; + +struct mthca_raddr_seg { + __be64 raddr; + __be32 rkey; + __be32 reserved; +}; + +struct mthca_atomic_seg { + __be64 swap_add; + __be64 compare; +}; + +struct mthca_data_seg { + __be32 byte_count; + __be32 lkey; + __be64 addr; +}; + +struct mthca_inline_seg { + __be32 byte_count; +}; + +#endif /* WQE_H */ diff --git a/providers/ocrdma/CMakeLists.txt b/providers/ocrdma/CMakeLists.txt new file mode 100644 index 0000000..08623ad --- /dev/null +++ b/providers/ocrdma/CMakeLists.txt @@ -0,0 +1,4 @@ +rdma_provider(ocrdma + ocrdma_main.c + ocrdma_verbs.c + ) diff --git a/providers/ocrdma/Changelog b/providers/ocrdma/Changelog new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/providers/ocrdma/Changelog diff --git a/providers/ocrdma/ocrdma_abi.h b/providers/ocrdma/ocrdma_abi.h new file mode 100644 index 0000000..6956a4f --- /dev/null +++ b/providers/ocrdma/ocrdma_abi.h @@ -0,0 +1,265 @@ +/* + * Copyright (C) 2008-2013 Emulex. All rights reserved. + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __OCRDMA_ABI_H__ +#define __OCRDMA_ABI_H__ + +#include <stdint.h> +#include <infiniband/kern-abi.h> +#include <rdma/ocrdma-abi.h> +#include <kernel-abi/ocrdma-abi.h> + +#define OCRDMA_ABI_VERSION 2 + +DECLARE_DRV_CMD(uocrdma_get_context, IB_USER_VERBS_CMD_GET_CONTEXT, + empty, ocrdma_alloc_ucontext_resp); +DECLARE_DRV_CMD(uocrdma_alloc_pd, IB_USER_VERBS_CMD_ALLOC_PD, + ocrdma_alloc_pd_ureq, ocrdma_alloc_pd_uresp); +DECLARE_DRV_CMD(uocrdma_create_cq, IB_USER_VERBS_CMD_CREATE_CQ, + ocrdma_create_cq_ureq, ocrdma_create_cq_uresp); +DECLARE_DRV_CMD(uocrdma_reg_mr, IB_USER_VERBS_CMD_REG_MR, + empty, empty); +DECLARE_DRV_CMD(uocrdma_create_qp, IB_USER_VERBS_CMD_CREATE_QP, + ocrdma_create_qp_ureq, ocrdma_create_qp_uresp); +DECLARE_DRV_CMD(uocrdma_create_srq, IB_USER_VERBS_CMD_CREATE_SRQ, + empty, ocrdma_create_srq_uresp); + +#define Bit(_b) (1 << (_b)) + +#define OCRDMA_MAX_QP 2048 + +enum { + OCRDMA_DB_RQ_OFFSET = 0xE0, + OCRDMA_DB_SQ_OFFSET = 0x60, + OCRDMA_DB_SRQ_OFFSET = OCRDMA_DB_RQ_OFFSET, + OCRDMA_DB_CQ_OFFSET = 0x120 +}; + +#define OCRDMA_DB_CQ_RING_ID_MASK 0x3FF /* bits 0 - 9 */ +#define OCRDMA_DB_CQ_RING_ID_EXT_MASK 0x0C00 /* bits 10-11 of qid placing at 12-11 */ +#define OCRDMA_DB_CQ_RING_ID_EXT_MASK_SHIFT 0x1 /* qid #2 msbits placing at 12-11 */ +#define OCRDMA_DB_CQ_NUM_POPPED_SHIFT (16) /* bits 16 - 28 */ +/* Rearm bit */ +#define OCRDMA_DB_CQ_REARM_SHIFT (29) /* bit 29 */ + +/* solicited bit */ +#define OCRDMA_DB_CQ_SOLICIT_SHIFT (31) /* bit 31 */ + +enum OCRDMA_CQE_STATUS { + OCRDMA_CQE_SUCCESS = 0, + OCRDMA_CQE_LOC_LEN_ERR = 1, + OCRDMA_CQE_LOC_QP_OP_ERR = 2, + OCRDMA_CQE_LOC_EEC_OP_ERR = 3, + OCRDMA_CQE_LOC_PROT_ERR = 4, + OCRDMA_CQE_WR_FLUSH_ERR = 5, + OCRDMA_CQE_MW_BIND_ERR = 6, + OCRDMA_CQE_BAD_RESP_ERR = 7, + OCRDMA_CQE_LOC_ACCESS_ERR = 8, + OCRDMA_CQE_REM_INV_REQ_ERR = 9, + OCRDMA_CQE_REM_ACCESS_ERR = 0xa, + OCRDMA_CQE_REM_OP_ERR = 0xb, + OCRDMA_CQE_RETRY_EXC_ERR = 0xc, + OCRDMA_CQE_RNR_RETRY_EXC_ERR = 0xd, + OCRDMA_CQE_LOC_RDD_VIOL_ERR = 0xe, + 
OCRDMA_CQE_REM_INV_RD_REQ_ERR = 0xf, + OCRDMA_CQE_REM_ABORT_ERR = 0x10, + OCRDMA_CQE_INV_EECN_ERR = 0x11, + OCRDMA_CQE_INV_EEC_STATE_ERR = 0x12, + OCRDMA_CQE_FATAL_ERR = 0x13, + OCRDMA_CQE_RESP_TIMEOUT_ERR = 0x14, + OCRDMA_CQE_GENERAL_ERR +}; + +enum { + /* w0 */ + OCRDMA_CQE_WQEIDX_SHIFT = 0, + OCRDMA_CQE_WQEIDX_MASK = 0xFFFF, + + /* w1 */ + OCRDMA_CQE_UD_XFER_LEN_SHIFT = 16, + OCRDMA_CQE_PKEY_SHIFT = 0, + OCRDMA_CQE_PKEY_MASK = 0xFFFF, + + /* w2 */ + OCRDMA_CQE_QPN_SHIFT = 0, + OCRDMA_CQE_QPN_MASK = 0x0000FFFF, + + OCRDMA_CQE_BUFTAG_SHIFT = 16, + OCRDMA_CQE_BUFTAG_MASK = 0xFFFF << OCRDMA_CQE_BUFTAG_SHIFT, + + /* w3 */ + OCRDMA_CQE_UD_STATUS_SHIFT = 24, + OCRDMA_CQE_UD_STATUS_MASK = 0x7 << OCRDMA_CQE_UD_STATUS_SHIFT, + OCRDMA_CQE_STATUS_SHIFT = 16, + OCRDMA_CQE_STATUS_MASK = (0xFF << OCRDMA_CQE_STATUS_SHIFT), + OCRDMA_CQE_VALID = Bit(31), + OCRDMA_CQE_INVALIDATE = Bit(30), + OCRDMA_CQE_QTYPE = Bit(29), + OCRDMA_CQE_IMM = Bit(28), + OCRDMA_CQE_WRITE_IMM = Bit(27), + OCRDMA_CQE_QTYPE_SQ = 0, + OCRDMA_CQE_QTYPE_RQ = 1, + OCRDMA_CQE_SRCQP_MASK = 0xFFFFFF +}; + +struct ocrdma_cqe { + union { + /* w0 to w2 */ + struct { + __le32 wqeidx; + __le32 bytes_xfered; + __le32 qpn; + } wq; + struct { + __le32 lkey_immdt; + __le32 rxlen; + __le32 buftag_qpn; + } rq; + struct { + __le32 lkey_immdt; + __le32 rxlen_pkey; + __le32 buftag_qpn; + } ud; + struct { + __le32 word_0; + __le32 word_1; + __le32 qpn; + } cmn; + }; + __le32 flags_status_srcqpn; /* w3 */ +} __attribute__ ((packed)); + +struct ocrdma_sge { + uint32_t addr_hi; + uint32_t addr_lo; + uint32_t lrkey; + uint32_t len; +} __attribute__ ((packed)); + +enum { + OCRDMA_WQE_OPCODE_SHIFT = 0, + OCRDMA_WQE_OPCODE_MASK = 0x0000001F, + OCRDMA_WQE_FLAGS_SHIFT = 5, + OCRDMA_WQE_TYPE_SHIFT = 16, + OCRDMA_WQE_TYPE_MASK = 0x00030000, + OCRDMA_WQE_SIZE_SHIFT = 18, + OCRDMA_WQE_SIZE_MASK = 0xFF, + OCRDMA_WQE_NXT_WQE_SIZE_SHIFT = 25, + OCRDMA_WQE_LKEY_FLAGS_SHIFT = 0, + OCRDMA_WQE_LKEY_FLAGS_MASK = 0xF +}; + +enum { + OCRDMA_FLAG_SIG = 0x1, + OCRDMA_FLAG_INV = 0x2, + OCRDMA_FLAG_FENCE_L = 0x4, + OCRDMA_FLAG_FENCE_R = 0x8, + OCRDMA_FLAG_SOLICIT = 0x10, + OCRDMA_FLAG_IMM = 0x20, + OCRDMA_FLAG_AH_VLAN_PR = 0x40, + + /* Stag flags */ + OCRDMA_LKEY_FLAG_LOCAL_WR = 0x1, + OCRDMA_LKEY_FLAG_REMOTE_RD = 0x2, + OCRDMA_LKEY_FLAG_REMOTE_WR = 0x4, + OCRDMA_LKEY_FLAG_VATO = 0x8 +}; + +enum { + OCRDMA_TYPE_INLINE = 0x0, + OCRDMA_TYPE_LKEY = 0x1 +}; + +#define OCRDMA_CQE_QTYPE_RQ 1 +#define OCRDMA_CQE_QTYPE_SQ 0 + +enum OCRDMA_WQE_OPCODE { + OCRDMA_WRITE = 0x06, + OCRDMA_READ = 0x0C, + OCRDMA_RESV0 = 0x02, + OCRDMA_SEND = 0x00, + OCRDMA_BIND_MW = 0x08, + OCRDMA_RESV1 = 0x0A, + OCRDMA_LKEY_INV = 0x15, +}; + +#define OCRDMA_WQE_STRIDE 8 +#define OCRDMA_WQE_ALIGN_BYTES 16 +/* header WQE for all the SQ and RQ operations */ +struct ocrdma_hdr_wqe { + uint32_t cw; + union { + uint32_t rsvd_tag; + uint32_t rsvd_stag_flags; + }; + union { + uint32_t immdt; + uint32_t lkey; + }; + uint32_t total_len; +} __attribute__ ((packed)); + +struct ocrdma_hdr_wqe_le { + __le32 cw; + union { + __le32 rsvd_tag; + __le32 rsvd_stag_flags; + }; + union { + __le32 immdt; + __le32 lkey; + }; + __le32 total_len; +} __attribute__ ((packed)); + +struct ocrdma_ewqe_atomic { + uint32_t ra_hi; + uint32_t ra_lo; + uint32_t rkey; + uint32_t rlen; + uint32_t swap_add_hi; + uint32_t swap_add_lo; + uint32_t compare_hi; + uint32_t compare_lo; + struct ocrdma_sge sge; +} __attribute__ ((packed)); + +struct ocrdma_ewqe_ud_hdr { + uint32_t rsvd_dest_qpn; + uint32_t qkey; + uint32_t rsvd_ahid; + uint32_t hdr_type; 
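+	/* hdr_type presumably selects the L3 header (RoCE v1 GRH vs.
+	 * v2 IPv4/IPv6) for the destination, matching the AH's
+	 * OCRDMA_AH_L3_TYPE_* encoding in ocrdma_main.h. */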
+} __attribute__ ((packed)); + +#endif /* __OCRDMA_ABI_H__ */ diff --git a/providers/ocrdma/ocrdma_main.c b/providers/ocrdma/ocrdma_main.c new file mode 100644 index 0000000..f7ed629 --- /dev/null +++ b/providers/ocrdma/ocrdma_main.c @@ -0,0 +1,201 @@ +/* + * Copyright (C) 2008-2013 Emulex. All rights reserved. + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <config.h> + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <sys/mman.h> +#include <pthread.h> + +#include "ocrdma_main.h" +#include "ocrdma_abi.h" +#include <ccan/list.h> + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + +static void ocrdma_free_context(struct ibv_context *ibctx); + +#define PCI_VENDOR_ID_EMULEX 0x10DF +#define PCI_DEVICE_ID_EMULEX_GEN1 0xe220 +#define PCI_DEVICE_ID_EMULEX_GEN2 0x720 +#define PCI_DEVICE_ID_EMULEX_GEN2_VF 0x728 + +#define UCNA(v, d) \ + VERBS_PCI_MATCH(PCI_VENDOR_ID_##v, PCI_DEVICE_ID_EMULEX_##d, NULL) +static const struct verbs_match_ent ucna_table[] = { + VERBS_DRIVER_ID(RDMA_DRIVER_OCRDMA), + UCNA(EMULEX, GEN1), + UCNA(EMULEX, GEN2), + UCNA(EMULEX, GEN2_VF), + {} +}; + +static const struct verbs_context_ops ocrdma_ctx_ops = { + .query_device = ocrdma_query_device, + .query_port = ocrdma_query_port, + .alloc_pd = ocrdma_alloc_pd, + .dealloc_pd = ocrdma_free_pd, + .reg_mr = ocrdma_reg_mr, + .dereg_mr = ocrdma_dereg_mr, + .create_cq = ocrdma_create_cq, + .poll_cq = ocrdma_poll_cq, + .req_notify_cq = ocrdma_arm_cq, + .resize_cq = ocrdma_resize_cq, + .destroy_cq = ocrdma_destroy_cq, + + .create_qp = ocrdma_create_qp, + .query_qp = ocrdma_query_qp, + .modify_qp = ocrdma_modify_qp, + .destroy_qp = ocrdma_destroy_qp, + .post_send = ocrdma_post_send, + .post_recv = ocrdma_post_recv, + .create_ah = ocrdma_create_ah, + .destroy_ah = ocrdma_destroy_ah, + + .create_srq = ocrdma_create_srq, + .modify_srq = ocrdma_modify_srq, + .query_srq = ocrdma_query_srq, + .destroy_srq = ocrdma_destroy_srq, + .post_srq_recv = ocrdma_post_srq_recv, + .attach_mcast = ocrdma_attach_mcast, + .detach_mcast = ocrdma_detach_mcast, + .free_context = ocrdma_free_context, +}; + +static void ocrdma_uninit_device(struct verbs_device *verbs_device) +{ + struct ocrdma_device *dev = get_ocrdma_dev(&verbs_device->device); + + free(dev); +} + +/* + * ocrdma_alloc_context + */ +static struct verbs_context *ocrdma_alloc_context(struct ibv_device *ibdev, + int cmd_fd, + void *private_data) +{ + struct ocrdma_devctx *ctx; + struct uocrdma_get_context cmd; + struct uocrdma_get_context_resp resp; + + ctx = verbs_init_and_alloc_context(ibdev, cmd_fd, ctx, ibv_ctx, + RDMA_DRIVER_OCRDMA); + if (!ctx) + return NULL; + + if (ibv_cmd_get_context(&ctx->ibv_ctx, + (struct ibv_get_context *)&cmd, sizeof cmd, + &resp.ibv_resp, sizeof(resp))) + goto cmd_err; + + verbs_set_ops(&ctx->ibv_ctx, &ocrdma_ctx_ops); + + get_ocrdma_dev(ibdev)->id = resp.dev_id; + get_ocrdma_dev(ibdev)->max_inline_data = resp.max_inline_data; + get_ocrdma_dev(ibdev)->wqe_size = resp.wqe_size; + get_ocrdma_dev(ibdev)->rqe_size = resp.rqe_size; + memcpy(get_ocrdma_dev(ibdev)->fw_ver, resp.fw_ver, sizeof(resp.fw_ver)); + get_ocrdma_dev(ibdev)->dpp_wqe_size = resp.dpp_wqe_size; + + ctx->ah_tbl = + mmap(NULL, resp.ah_tbl_len, PROT_READ | PROT_WRITE, MAP_SHARED, + cmd_fd, resp.ah_tbl_page); + + if (ctx->ah_tbl == MAP_FAILED) + goto cmd_err; + ctx->ah_tbl_len = resp.ah_tbl_len; + ocrdma_init_ahid_tbl(ctx); + + return &ctx->ibv_ctx; + +cmd_err: + ocrdma_err("%s: Failed to allocate context for device.\n", __func__); + verbs_uninit_context(&ctx->ibv_ctx); + free(ctx); + return NULL; +} + +/* + * ocrdma_free_context + */ +static void ocrdma_free_context(struct ibv_context *ibctx) +{ + struct ocrdma_devctx *ctx = get_ocrdma_ctx(ibctx); + + if (ctx->ah_tbl) + munmap((void *)ctx->ah_tbl, ctx->ah_tbl_len); + + verbs_uninit_context(&ctx->ibv_ctx); + 
free(ctx); +} + +static struct verbs_device * +ocrdma_device_alloc(struct verbs_sysfs_dev *sysfs_dev) +{ + struct ocrdma_device *dev; + + dev = calloc(1, sizeof(*dev)); + if (!dev) + return NULL; + + dev->qp_tbl = malloc(OCRDMA_MAX_QP * sizeof(struct ocrdma_qp *)); + if (!dev->qp_tbl) + goto qp_err; + bzero(dev->qp_tbl, OCRDMA_MAX_QP * sizeof(struct ocrdma_qp *)); + pthread_mutex_init(&dev->dev_lock, NULL); + pthread_spin_init(&dev->flush_q_lock, PTHREAD_PROCESS_PRIVATE); + return &dev->ibv_dev; +qp_err: + free(dev); + return NULL; +} + +static const struct verbs_device_ops ocrdma_dev_ops = { + .name = "ocrdma", + .match_min_abi_version = OCRDMA_ABI_VERSION, + .match_max_abi_version = OCRDMA_ABI_VERSION, + .match_table = ucna_table, + .alloc_device = ocrdma_device_alloc, + .uninit_device = ocrdma_uninit_device, + .alloc_context = ocrdma_alloc_context, +}; +PROVIDER_DRIVER(ocrdma, ocrdma_dev_ops); diff --git a/providers/ocrdma/ocrdma_main.h b/providers/ocrdma/ocrdma_main.h new file mode 100644 index 0000000..aadefd9 --- /dev/null +++ b/providers/ocrdma/ocrdma_main.h @@ -0,0 +1,306 @@ +/* + * Copyright (C) 2008-2013 Emulex. All rights reserved. + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __OCRDMA_MAIN_H__ +#define __OCRDMA_MAIN_H__ + +#include <inttypes.h> +#include <stddef.h> +#include <endian.h> + +#include <infiniband/driver.h> +#include <util/udma_barrier.h> + +#include <ccan/list.h> + +#define ocrdma_err(format, arg...) 
printf(format, ##arg) + +#define OCRDMA_DPP_PAGE_SIZE (4096) + +#define ROUND_UP_X(_val, _x) \ + (((unsigned long)(_val) + ((_x)-1)) & (long)~((_x)-1)) + +struct ocrdma_qp; + +struct ocrdma_device { + struct verbs_device ibv_dev; + struct ocrdma_qp **qp_tbl; + pthread_mutex_t dev_lock; + pthread_spinlock_t flush_q_lock; + int id; + int gen; + uint32_t wqe_size; + uint32_t rqe_size; + uint32_t dpp_wqe_size; + uint32_t max_inline_data; + uint8_t fw_ver[32]; +}; + +struct ocrdma_devctx { + struct verbs_context ibv_ctx; + uint32_t *ah_tbl; + uint32_t ah_tbl_len; + pthread_mutex_t tbl_lock; +}; + +struct ocrdma_pd { + struct ibv_pd ibv_pd; + struct ocrdma_device *dev; + struct ocrdma_devctx *uctx; + void *dpp_va; +}; + +struct ocrdma_mr { + struct verbs_mr vmr; +}; + +struct ocrdma_cq { + struct ibv_cq ibv_cq; + struct ocrdma_device *dev; + uint16_t cq_id; + uint16_t cq_dbid; + uint16_t getp; + pthread_spinlock_t cq_lock; + uint32_t max_hw_cqe; + uint32_t cq_mem_size; + struct ocrdma_cqe *va; + void *db_va; + + uint32_t db_size; + + uint32_t phase; + int phase_change; + + uint8_t deferred_arm; + uint8_t deferred_sol; + uint8_t first_arm; + struct list_head sq_head; + struct list_head rq_head; +}; + +enum { + OCRDMA_DPP_WQE_INDEX_MASK = 0xFFFF, + OCRDMA_DPP_CQE_VALID_BIT_SHIFT = 31, + OCRDMA_DPP_CQE_VALID_BIT_MASK = 1 << 31 +}; + +struct ocrdma_dpp_cqe { + uint32_t wqe_idx_valid; +}; + +enum { + OCRDMA_PD_MAX_DPP_ENABLED_QP = 16 +}; + +struct ocrdma_qp_hwq_info { + uint8_t *va; /* virtual address */ + uint32_t max_sges; + uint32_t free_cnt; + + uint32_t head, tail; + uint32_t entry_size; + uint32_t max_cnt; + uint32_t max_wqe_idx; + uint32_t len; + uint16_t dbid; /* qid, where to ring the doorbell. */ +}; + +struct ocrdma_srq { + struct ibv_srq ibv_srq; + struct ocrdma_device *dev; + void *db_va; + uint32_t db_size; + pthread_spinlock_t q_lock; + + struct ocrdma_qp_hwq_info rq; + uint32_t max_rq_sges; + uint32_t id; + uint64_t *rqe_wr_id_tbl; + uint32_t *idx_bit_fields; + uint32_t bit_fields_len; + uint32_t db_shift; +}; + +enum { + OCRDMA_CREATE_QP_REQ_DPP_CREDIT_LIMIT = 1 +}; + +enum ocrdma_qp_state { + OCRDMA_QPS_RST = 0, + OCRDMA_QPS_INIT = 1, + OCRDMA_QPS_RTR = 2, + OCRDMA_QPS_RTS = 3, + OCRDMA_QPS_SQE = 4, + OCRDMA_QPS_SQ_DRAINING = 5, + OCRDMA_QPS_ERR = 6, + OCRDMA_QPS_SQD = 7 +}; + +struct ocrdma_qp { + struct ibv_qp ibv_qp; + struct ocrdma_device *dev; + pthread_spinlock_t q_lock; + + struct ocrdma_qp_hwq_info sq; + struct ocrdma_cq *sq_cq; + struct { + uint64_t wrid; + uint16_t dpp_wqe_idx; + uint16_t dpp_wqe; + uint8_t signaled; + uint8_t rsvd[3]; + } *wqe_wr_id_tbl; + struct ocrdma_qp_hwq_info dpp_q; + int dpp_enabled; + + struct ocrdma_qp_hwq_info rq; + struct ocrdma_cq *rq_cq; + uint64_t *rqe_wr_id_tbl; + void *db_va; + void *db_sq_va; + void *db_rq_va; + uint32_t max_inline_data; + + struct ocrdma_srq *srq; + struct ocrdma_cq *dpp_cq; + + uint32_t db_size; + uint32_t max_ord; + uint32_t max_ird; + uint32_t dpp_prev_indx; + + enum ibv_qp_type qp_type; + enum ocrdma_qp_state state; + struct list_node sq_entry; + struct list_node rq_entry; + uint16_t id; + uint16_t rsvd; + uint32_t db_shift; + int signaled; /* signaled QP */ +}; + +enum { + OCRDMA_AH_ID_MASK = 0x3FF, + OCRDMA_AH_VLAN_VALID_MASK = 0x01, + OCRDMA_AH_VLAN_VALID_SHIFT = 0x1F, + OCRDMA_AH_L3_TYPE_MASK = 0x03, + OCRDMA_AH_L3_TYPE_SHIFT = 0x1D +}; + +struct ocrdma_ah { + struct ibv_ah ibv_ah; + struct ocrdma_pd *pd; + uint16_t id; + uint8_t isvlan; + uint8_t hdr_type; +}; + +#define get_ocrdma_xxx(xxx, type) \ + 
container_of(ib##xxx, struct ocrdma_##type, ibv_##xxx) + +static inline struct ocrdma_devctx *get_ocrdma_ctx(struct ibv_context *ibctx) +{ + return container_of(ibctx, struct ocrdma_devctx, ibv_ctx.context); +} + +static inline struct ocrdma_device *get_ocrdma_dev(struct ibv_device *ibdev) +{ + return container_of(ibdev, struct ocrdma_device, ibv_dev.device); +} + +static inline struct ocrdma_qp *get_ocrdma_qp(struct ibv_qp *ibqp) +{ + return get_ocrdma_xxx(qp, qp); +} + +static inline struct ocrdma_srq *get_ocrdma_srq(struct ibv_srq *ibsrq) +{ + return get_ocrdma_xxx(srq, srq); +} + +static inline struct ocrdma_pd *get_ocrdma_pd(struct ibv_pd *ibpd) +{ + return get_ocrdma_xxx(pd, pd); +} + +static inline struct ocrdma_cq *get_ocrdma_cq(struct ibv_cq *ibcq) +{ + return get_ocrdma_xxx(cq, cq); +} + +static inline struct ocrdma_ah *get_ocrdma_ah(struct ibv_ah *ibah) +{ + return get_ocrdma_xxx(ah, ah); +} + +void ocrdma_init_ahid_tbl(struct ocrdma_devctx *ctx); +int ocrdma_query_device(struct ibv_context *, struct ibv_device_attr *); +int ocrdma_query_port(struct ibv_context *, uint8_t, struct ibv_port_attr *); +struct ibv_pd *ocrdma_alloc_pd(struct ibv_context *); +int ocrdma_free_pd(struct ibv_pd *); +struct ibv_mr *ocrdma_reg_mr(struct ibv_pd *pd, void *addr, size_t len, + uint64_t hca_va, int access); +int ocrdma_dereg_mr(struct verbs_mr *vmr); + +struct ibv_cq *ocrdma_create_cq(struct ibv_context *, int, + struct ibv_comp_channel *, int); +int ocrdma_resize_cq(struct ibv_cq *, int); +int ocrdma_destroy_cq(struct ibv_cq *); +int ocrdma_poll_cq(struct ibv_cq *, int, struct ibv_wc *); +int ocrdma_arm_cq(struct ibv_cq *, int); + +struct ibv_qp *ocrdma_create_qp(struct ibv_pd *, struct ibv_qp_init_attr *); +int ocrdma_modify_qp(struct ibv_qp *, struct ibv_qp_attr *, + int ibv_qp_attr_mask); +int ocrdma_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, + struct ibv_qp_init_attr *init_attr); +int ocrdma_destroy_qp(struct ibv_qp *); +int ocrdma_post_send(struct ibv_qp *, struct ibv_send_wr *, + struct ibv_send_wr **); +int ocrdma_post_recv(struct ibv_qp *, struct ibv_recv_wr *, + struct ibv_recv_wr **); + +struct ibv_srq *ocrdma_create_srq(struct ibv_pd *, struct ibv_srq_init_attr *); +int ocrdma_modify_srq(struct ibv_srq *, struct ibv_srq_attr *, int); +int ocrdma_destroy_srq(struct ibv_srq *); +int ocrdma_query_srq(struct ibv_srq *ibsrq, struct ibv_srq_attr *attr); +int ocrdma_post_srq_recv(struct ibv_srq *, struct ibv_recv_wr *, + struct ibv_recv_wr **); +struct ibv_ah *ocrdma_create_ah(struct ibv_pd *, struct ibv_ah_attr *); +int ocrdma_destroy_ah(struct ibv_ah *); +int ocrdma_attach_mcast(struct ibv_qp *, const union ibv_gid *, uint16_t); +int ocrdma_detach_mcast(struct ibv_qp *, const union ibv_gid *, uint16_t); +void ocrdma_async_event(struct ibv_async_event *event); + +#endif /* __OCRDMA_MAIN_H__ */ diff --git a/providers/ocrdma/ocrdma_verbs.c b/providers/ocrdma/ocrdma_verbs.c new file mode 100644 index 0000000..4ae35be --- /dev/null +++ b/providers/ocrdma/ocrdma_verbs.c @@ -0,0 +1,2163 @@ +/* + * Copyright (C) 2008-2013 Emulex. All rights reserved. + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <config.h> + +#include <assert.h> +#include <endian.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <signal.h> +#include <errno.h> +#include <pthread.h> +#include <sys/mman.h> +#include <unistd.h> +#include <endian.h> + +#include "ocrdma_main.h" +#include "ocrdma_abi.h" +#include <ccan/list.h> +#include <util/compiler.h> + +static void ocrdma_ring_cq_db(struct ocrdma_cq *cq, uint32_t armed, + int solicited, uint32_t num_cqe); + +static inline void ocrdma_swap_cpu_to_le(void *dst, uint32_t len) +{ + int i = 0; + __le32 *src_ptr = dst; + uint32_t *dst_ptr = dst; + for (; i < (len / 4); i++) + *dst_ptr++ = le32toh(*src_ptr++); +} + +/* + * ocrdma_query_device + */ +int ocrdma_query_device(struct ibv_context *context, + struct ibv_device_attr *attr) +{ + struct ibv_query_device cmd; + uint64_t fw_ver; + struct ocrdma_device *dev = get_ocrdma_dev(context->device); + int status; + + bzero(attr, sizeof *attr); + status = ibv_cmd_query_device(context, attr, &fw_ver, &cmd, sizeof cmd); + memcpy(attr->fw_ver, dev->fw_ver, sizeof(dev->fw_ver)); + return status; +} + +/* + * ocrdma_query_port + */ +int ocrdma_query_port(struct ibv_context *context, uint8_t port, + struct ibv_port_attr *attr) +{ + struct ibv_query_port cmd; + int status; + status = ibv_cmd_query_port(context, port, attr, &cmd, sizeof cmd); + return status; +} + +#define OCRDMA_INVALID_AH_IDX 0xffffffff +void ocrdma_init_ahid_tbl(struct ocrdma_devctx *ctx) +{ + int i; + + pthread_mutex_init(&ctx->tbl_lock, NULL); + for (i = 0; i < (ctx->ah_tbl_len / sizeof(uint32_t)); i++) + ctx->ah_tbl[i] = OCRDMA_INVALID_AH_IDX; +} + +static int ocrdma_alloc_ah_tbl_id(struct ocrdma_devctx *ctx) +{ + int i; + int status = -EINVAL; + pthread_mutex_lock(&ctx->tbl_lock); + + for (i = 0; i < (ctx->ah_tbl_len / sizeof(uint32_t)); i++) { + if (ctx->ah_tbl[i] == OCRDMA_INVALID_AH_IDX) { + ctx->ah_tbl[i] = ctx->ah_tbl_len; + status = i; + break; + } + } + 
pthread_mutex_unlock(&ctx->tbl_lock); + return status; +} + +static void ocrdma_free_ah_tbl_id(struct ocrdma_devctx *ctx, int idx) +{ + pthread_mutex_lock(&ctx->tbl_lock); + ctx->ah_tbl[idx] = OCRDMA_INVALID_AH_IDX; + pthread_mutex_unlock(&ctx->tbl_lock); +} + +/* + * ocrdma_alloc_pd + */ +struct ibv_pd *ocrdma_alloc_pd(struct ibv_context *context) +{ + struct uocrdma_alloc_pd cmd; + struct uocrdma_alloc_pd_resp resp; + struct ocrdma_pd *pd; + uint64_t map_address = 0; + + pd = malloc(sizeof *pd); + if (!pd) + return NULL; + bzero(pd, sizeof *pd); + memset(&cmd, 0, sizeof(cmd)); + + if (ibv_cmd_alloc_pd(context, &pd->ibv_pd, &cmd.ibv_cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp))) { + free(pd); + return NULL; + } + pd->dev = get_ocrdma_dev(context->device); + pd->uctx = get_ocrdma_ctx(context); + + if (resp.dpp_enabled) { + map_address = ((uint64_t) resp.dpp_page_addr_hi << 32) | + resp.dpp_page_addr_lo; + pd->dpp_va = mmap(NULL, OCRDMA_DPP_PAGE_SIZE, PROT_WRITE, + MAP_SHARED, context->cmd_fd, map_address); + if (pd->dpp_va == MAP_FAILED) { + ocrdma_free_pd(&pd->ibv_pd); + return NULL; + } + } + return &pd->ibv_pd; +} + +/* + * ocrdma_free_pd + */ +int ocrdma_free_pd(struct ibv_pd *ibpd) +{ + int status; + struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); + + status = ibv_cmd_dealloc_pd(ibpd); + if (status) + return status; + + if (pd->dpp_va) + munmap((void *)pd->dpp_va, OCRDMA_DPP_PAGE_SIZE); + free(pd); + return 0; +} + +/* + * ocrdma_reg_mr + */ +struct ibv_mr *ocrdma_reg_mr(struct ibv_pd *pd, void *addr, size_t len, + uint64_t hca_va, int access) +{ + struct ocrdma_mr *mr; + struct ibv_reg_mr cmd; + struct uocrdma_reg_mr_resp resp; + + mr = malloc(sizeof *mr); + if (!mr) + return NULL; + bzero(mr, sizeof *mr); + + if (ibv_cmd_reg_mr(pd, addr, len, hca_va, access, &mr->vmr, &cmd, + sizeof(cmd), &resp.ibv_resp, sizeof(resp))) { + free(mr); + return NULL; + } + return &mr->vmr.ibv_mr; +} + +/* + * ocrdma_dereg_mr + */ +int ocrdma_dereg_mr(struct verbs_mr *vmr) +{ + int status; + status = ibv_cmd_dereg_mr(vmr); + if (status) + return status; + free(vmr); + return 0; +} + +/* + * ocrdma_create_cq + */ +static struct ibv_cq *ocrdma_create_cq_common(struct ibv_context *context, + int cqe, + struct ibv_comp_channel *channel, + int comp_vector, int dpp_cq) +{ + int status; + struct uocrdma_create_cq cmd; + struct uocrdma_create_cq_resp resp; + struct ocrdma_cq *cq; + struct ocrdma_device *dev = get_ocrdma_dev(context->device); + void *map_addr; + + cq = malloc(sizeof *cq); + if (!cq) + return NULL; + + bzero(cq, sizeof *cq); + cmd.dpp_cq = dpp_cq; + status = ibv_cmd_create_cq(context, cqe, channel, comp_vector, + &cq->ibv_cq, &cmd.ibv_cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp); + if (status) + goto cq_err1; + + pthread_spin_init(&cq->cq_lock, PTHREAD_PROCESS_PRIVATE); + cq->dev = dev; + cq->cq_id = resp.cq_id; + cq->cq_dbid = resp.cq_id; + cq->cq_mem_size = resp.page_size; + cq->max_hw_cqe = resp.max_hw_cqe; + cq->phase_change = resp.phase_change; + cq->va = mmap(NULL, resp.page_size, PROT_READ | PROT_WRITE, + MAP_SHARED, context->cmd_fd, resp.page_addr[0]); + if (cq->va == MAP_FAILED) + goto cq_err2; + + map_addr = mmap(NULL, resp.db_page_size, PROT_WRITE, + MAP_SHARED, context->cmd_fd, resp.db_page_addr); + if (map_addr == MAP_FAILED) + goto cq_err2; + cq->db_va = map_addr; + cq->db_size = resp.db_page_size; + cq->phase = OCRDMA_CQE_VALID; + cq->first_arm = 1; + if (!dpp_cq) { + ocrdma_ring_cq_db(cq, 0, 0, 0); + } + cq->ibv_cq.cqe = cqe; + list_head_init(&cq->sq_head); + 
list_head_init(&cq->rq_head); + return &cq->ibv_cq; +cq_err2: + (void)ibv_cmd_destroy_cq(&cq->ibv_cq); +cq_err1: + free(cq); + return NULL; +} + +struct ibv_cq *ocrdma_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector) +{ + return ocrdma_create_cq_common(context, cqe, channel, comp_vector, 0); +} + +#ifdef DPP_CQ_SUPPORT +static struct ocrdma_cq *ocrdma_create_dpp_cq(struct ibv_context *context, + int cqe) +{ + struct ibv_cq *ibcq; + ibcq = ocrdma_create_cq_common(context, cqe, 0, 0, 1); + if (ibcq) + return get_ocrdma_cq(ibcq); + return NULL; +} +#endif + +/* + * ocrdma_resize_cq + */ +int ocrdma_resize_cq(struct ibv_cq *ibcq, int new_entries) +{ + int status; + struct ibv_resize_cq cmd; + struct ib_uverbs_resize_cq_resp resp; + status = ibv_cmd_resize_cq(ibcq, new_entries, + &cmd, sizeof cmd, &resp, sizeof resp); + if (status == 0) + ibcq->cqe = new_entries; + return status; +} + +/* + * ocrdma_destroy_cq + */ +int ocrdma_destroy_cq(struct ibv_cq *ibv_cq) +{ + struct ocrdma_cq *cq = get_ocrdma_cq(ibv_cq); + int status; + + status = ibv_cmd_destroy_cq(ibv_cq); + if (status) + return status; + + if (cq->db_va) + munmap((void *)cq->db_va, cq->db_size); + if (cq->va) + munmap((void*)cq->va, cq->cq_mem_size); + + free(cq); + return 0; +} + +static void ocrdma_add_qpn_map(struct ocrdma_device *dev, struct ocrdma_qp *qp) +{ + pthread_mutex_lock(&dev->dev_lock); + dev->qp_tbl[qp->id] = qp; + pthread_mutex_unlock(&dev->dev_lock); +} + +static void _ocrdma_del_qpn_map(struct ocrdma_device *dev, struct ocrdma_qp *qp) +{ + dev->qp_tbl[qp->id] = NULL; +} + +struct ibv_srq *ocrdma_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *init_attr) +{ + int status = 0; + struct ocrdma_srq *srq; + struct uocrdma_create_srq cmd; + struct uocrdma_create_srq_resp resp; + void *map_addr; + + srq = calloc(1, sizeof *srq); + if (!srq) + return NULL; + + pthread_spin_init(&srq->q_lock, PTHREAD_PROCESS_PRIVATE); + status = ibv_cmd_create_srq(pd, &srq->ibv_srq, init_attr, &cmd.ibv_cmd, + sizeof cmd, &resp.ibv_resp, sizeof resp); + if (status) + goto cmd_err; + + srq->dev = get_ocrdma_pd(pd)->dev; + srq->rq.dbid = resp.rq_dbid; + srq->rq.max_sges = init_attr->attr.max_sge; + srq->rq.max_cnt = resp.num_rqe_allocated; + srq->rq.max_wqe_idx = resp.num_rqe_allocated - 1; + srq->rq.entry_size = srq->dev->rqe_size; + srq->rqe_wr_id_tbl = calloc(srq->rq.max_cnt, sizeof(uint64_t)); + if (srq->rqe_wr_id_tbl == NULL) + goto map_err; + + srq->bit_fields_len = + (srq->rq.max_cnt / 32) + (srq->rq.max_cnt % 32 ? 
1 : 0); + srq->idx_bit_fields = malloc(srq->bit_fields_len * sizeof(uint32_t)); + if (srq->idx_bit_fields == NULL) + goto map_err; + memset(srq->idx_bit_fields, 0xff, + srq->bit_fields_len * sizeof(uint32_t)); + + if (resp.num_rq_pages > 1) + goto map_err; + + map_addr = mmap(NULL, resp.rq_page_size, PROT_READ | PROT_WRITE, + MAP_SHARED, pd->context->cmd_fd, resp.rq_page_addr[0]); + if (map_addr == MAP_FAILED) + goto map_err; + srq->rq.len = resp.rq_page_size; + srq->rq.va = map_addr; + + map_addr = mmap(NULL, resp.db_page_size, PROT_WRITE, + MAP_SHARED, pd->context->cmd_fd, resp.db_page_addr); + if (map_addr == MAP_FAILED) + goto map_err; + srq->db_va = (uint8_t *) map_addr + resp.db_rq_offset; + srq->db_shift = resp.db_shift; + srq->db_size = resp.db_page_size; + return &srq->ibv_srq; + +map_err: + ocrdma_destroy_srq(&srq->ibv_srq); + return NULL; + +cmd_err: + pthread_spin_destroy(&srq->q_lock); + free(srq); + return NULL; +} + +int ocrdma_modify_srq(struct ibv_srq *ibsrq, + struct ibv_srq_attr *attr, int attr_mask) +{ + struct ibv_modify_srq cmd; + + return ibv_cmd_modify_srq(ibsrq, attr, attr_mask, &cmd, sizeof cmd); +} + +int ocrdma_query_srq(struct ibv_srq *ibsrq, struct ibv_srq_attr *attr) +{ + struct ibv_query_srq cmd; + + return ibv_cmd_query_srq(ibsrq, attr, &cmd, sizeof cmd); +} + +int ocrdma_destroy_srq(struct ibv_srq *ibsrq) +{ + int status; + struct ocrdma_srq *srq; + srq = get_ocrdma_srq(ibsrq); + + status = ibv_cmd_destroy_srq(ibsrq); + if (status) + return status; + + if (srq->idx_bit_fields) + free(srq->idx_bit_fields); + if (srq->rqe_wr_id_tbl) + free(srq->rqe_wr_id_tbl); + if (srq->db_va) { + munmap((void *)srq->db_va, srq->db_size); + srq->db_va = NULL; + } + if (srq->rq.va) { + munmap(srq->rq.va, srq->rq.len); + srq->rq.va = NULL; + } + pthread_spin_destroy(&srq->q_lock); + free(srq); + return status; +} + +/* + * ocrdma_create_qp + */ +struct ibv_qp *ocrdma_create_qp(struct ibv_pd *pd, + struct ibv_qp_init_attr *attrs) +{ + int status = 0; + struct uocrdma_create_qp cmd; + struct uocrdma_create_qp_resp resp; + struct ocrdma_qp *qp; + void *map_addr; +#ifdef DPP_CQ_SUPPORT + struct ocrdma_dpp_cqe *dpp_cqe = NULL; +#endif + + qp = calloc(1, sizeof *qp); + if (!qp) + return NULL; + memset(&cmd, 0, sizeof(cmd)); + + qp->qp_type = attrs->qp_type; + pthread_spin_init(&qp->q_lock, PTHREAD_PROCESS_PRIVATE); + +#ifdef DPP_CQ_SUPPORT + if (attrs->cap.max_inline_data) { + qp->dpp_cq = ocrdma_create_dpp_cq(pd->context, + OCRDMA_CREATE_QP_REQ_DPP_CREDIT_LIMIT); + if (qp->dpp_cq) { + cmd.enable_dpp_cq = 1; + cmd.dpp_cq_id = qp->dpp_cq->cq_id; + /* Write invalid index for the first entry */ + dpp_cqe = (struct ocrdma_dpp_cqe *)qp->dpp_cq->va; + dpp_cqe->wqe_idx_valid = 0xFFFF; + qp->dpp_prev_indx = 0xFFFF; + } + } +#endif + status = ibv_cmd_create_qp(pd, &qp->ibv_qp, attrs, &cmd.ibv_cmd, + sizeof cmd, &resp.ibv_resp, sizeof resp); + if (status) + goto mbx_err; + + qp->dev = get_ocrdma_dev(pd->context->device); + qp->id = resp.qp_id; + + ocrdma_add_qpn_map(qp->dev, qp); + + qp->sq.dbid = resp.sq_dbid; + + qp->sq.max_sges = attrs->cap.max_send_sge; + qp->max_inline_data = attrs->cap.max_inline_data; + + qp->signaled = attrs->sq_sig_all; + + qp->sq.max_cnt = resp.num_wqe_allocated; + qp->sq.max_wqe_idx = resp.num_wqe_allocated - 1; + qp->sq.entry_size = qp->dev->wqe_size; + if (attrs->srq) + qp->srq = get_ocrdma_srq(attrs->srq); + else { + qp->rq.dbid = resp.rq_dbid; + qp->rq.max_sges = attrs->cap.max_recv_sge; + qp->rq.max_cnt = resp.num_rqe_allocated; + qp->rq.max_wqe_idx = 
resp.num_rqe_allocated - 1; + qp->rq.entry_size = qp->dev->rqe_size; + qp->rqe_wr_id_tbl = calloc(qp->rq.max_cnt, sizeof(uint64_t)); + if (qp->rqe_wr_id_tbl == NULL) + goto map_err; + } + + qp->sq_cq = get_ocrdma_cq(attrs->send_cq); + qp->rq_cq = get_ocrdma_cq(attrs->recv_cq); + + qp->wqe_wr_id_tbl = calloc(qp->sq.max_cnt, sizeof(*qp->wqe_wr_id_tbl)); + if (qp->wqe_wr_id_tbl == NULL) + goto map_err; + + /* currently we support only one virtual page */ + if ((resp.num_sq_pages > 1) || (!attrs->srq && resp.num_rq_pages > 1)) + goto map_err; + + map_addr = mmap(NULL, resp.sq_page_size, PROT_READ | PROT_WRITE, + MAP_SHARED, pd->context->cmd_fd, resp.sq_page_addr[0]); + if (map_addr == MAP_FAILED) + goto map_err; + qp->sq.va = map_addr; + qp->sq.len = resp.sq_page_size; + qp->db_shift = resp.db_shift; + + if (!attrs->srq) { + map_addr = mmap(NULL, resp.rq_page_size, PROT_READ | PROT_WRITE, + MAP_SHARED, pd->context->cmd_fd, + resp.rq_page_addr[0]); + if (map_addr == MAP_FAILED) + goto map_err; + + qp->rq.len = resp.rq_page_size; + qp->rq.va = map_addr; + } + + map_addr = mmap(NULL, resp.db_page_size, PROT_WRITE, + MAP_SHARED, pd->context->cmd_fd, resp.db_page_addr); + if (map_addr == MAP_FAILED) + goto map_err; + + qp->db_va = map_addr; + qp->db_sq_va = (uint8_t *) map_addr + resp.db_sq_offset; + qp->db_rq_va = (uint8_t *) map_addr + resp.db_rq_offset; + + qp->db_size = resp.db_page_size; + + if (resp.dpp_credit) { + struct ocrdma_pd *opd = get_ocrdma_pd(pd); + map_addr = (uint8_t *) opd->dpp_va + + (resp.dpp_offset * qp->dev->wqe_size); + qp->dpp_q.max_cnt = 1; /* DPP is posted at the same offset */ + qp->dpp_q.free_cnt = resp.dpp_credit; + qp->dpp_q.va = map_addr; + qp->dpp_q.head = qp->dpp_q.tail = 0; + qp->dpp_q.entry_size = qp->dev->dpp_wqe_size; + qp->dpp_q.len = resp.dpp_credit * qp->dev->dpp_wqe_size; + qp->dpp_enabled = 1; + } else { + if (qp->dpp_cq) { + ocrdma_destroy_cq(&qp->dpp_cq->ibv_cq); + qp->dpp_cq = NULL; + } + } + qp->state = OCRDMA_QPS_RST; + list_node_init(&qp->sq_entry); + list_node_init(&qp->rq_entry); + return &qp->ibv_qp; + +map_err: + ocrdma_destroy_qp(&qp->ibv_qp); + return NULL; +mbx_err: + pthread_spin_destroy(&qp->q_lock); + free(qp); + return NULL; +} + +static enum ocrdma_qp_state get_ocrdma_qp_state(enum ibv_qp_state qps) +{ + switch (qps) { + case IBV_QPS_RESET: + return OCRDMA_QPS_RST; + case IBV_QPS_INIT: + return OCRDMA_QPS_INIT; + case IBV_QPS_RTR: + return OCRDMA_QPS_RTR; + case IBV_QPS_RTS: + return OCRDMA_QPS_RTS; + case IBV_QPS_SQD: + return OCRDMA_QPS_SQD; + case IBV_QPS_SQE: + return OCRDMA_QPS_SQE; + case IBV_QPS_ERR: + return OCRDMA_QPS_ERR; + case IBV_QPS_UNKNOWN: + break; + default: + break; + }; + return OCRDMA_QPS_ERR; +} + +static int ocrdma_is_qp_in_sq_flushlist(struct ocrdma_cq *cq, + struct ocrdma_qp *qp) +{ + struct ocrdma_qp *list_qp; + struct ocrdma_qp *list_qp_tmp; + int found = 0; + list_for_each_safe(&cq->sq_head, list_qp, list_qp_tmp, sq_entry) { + if (qp == list_qp) { + found = 1; + break; + } + } + return found; +} + +static int ocrdma_is_qp_in_rq_flushlist(struct ocrdma_cq *cq, + struct ocrdma_qp *qp) +{ + struct ocrdma_qp *list_qp; + struct ocrdma_qp *list_qp_tmp; + int found = 0; + list_for_each_safe(&cq->rq_head, list_qp, list_qp_tmp, rq_entry) { + if (qp == list_qp) { + found = 1; + break; + } + } + return found; +} + +static void ocrdma_init_hwq_ptr(struct ocrdma_qp *qp) +{ + qp->sq.head = qp->sq.tail = 0; + qp->rq.head = qp->rq.tail = 0; + qp->dpp_q.head = qp->dpp_q.tail = 0; + qp->dpp_q.free_cnt = qp->dpp_q.max_cnt; +} + 
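+/*
+ * Note on flush handling: when a QP transitions to the error state,
+ * ocrdma_flush_qp() links it onto the sq_head/rq_head lists of its
+ * send and receive CQs under dev->flush_q_lock; ocrdma_poll_cq()
+ * later uses these lists to synthesize IBV_WC_WR_FLUSH_ERR
+ * completions for WQEs still pending in software.  The helpers
+ * below add a QP to and remove it from those flush lists.
+ */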
+static void ocrdma_del_flush_qp(struct ocrdma_qp *qp)
+{
+	int found = 0;
+	struct ocrdma_device *dev = qp->dev;
+	/* sync with any active CQ poll */
+
+	pthread_spin_lock(&dev->flush_q_lock);
+	found = ocrdma_is_qp_in_sq_flushlist(qp->sq_cq, qp);
+	if (found)
+		list_del(&qp->sq_entry);
+	if (!qp->srq) {
+		found = ocrdma_is_qp_in_rq_flushlist(qp->rq_cq, qp);
+		if (found)
+			list_del(&qp->rq_entry);
+	}
+	pthread_spin_unlock(&dev->flush_q_lock);
+}
+
+static void ocrdma_flush_qp(struct ocrdma_qp *qp)
+{
+	int found;
+
+	pthread_spin_lock(&qp->dev->flush_q_lock);
+	found = ocrdma_is_qp_in_sq_flushlist(qp->sq_cq, qp);
+	if (!found)
+		list_add_tail(&qp->sq_cq->sq_head, &qp->sq_entry);
+	if (!qp->srq) {
+		found = ocrdma_is_qp_in_rq_flushlist(qp->rq_cq, qp);
+		if (!found)
+			list_add_tail(&qp->rq_cq->rq_head, &qp->rq_entry);
+	}
+	pthread_spin_unlock(&qp->dev->flush_q_lock);
+}
+
+static int ocrdma_qp_state_machine(struct ocrdma_qp *qp,
+				   enum ibv_qp_state new_ib_state)
+{
+	int status = 0;
+	enum ocrdma_qp_state new_state;
+	new_state = get_ocrdma_qp_state(new_ib_state);
+
+	pthread_spin_lock(&qp->q_lock);
+
+	if (new_state == qp->state) {
+		pthread_spin_unlock(&qp->q_lock);
+		return 1;
+	}
+
+	switch (qp->state) {
+	case OCRDMA_QPS_RST:
+		switch (new_state) {
+		case OCRDMA_QPS_RST:
+			break;
+		case OCRDMA_QPS_INIT:
+			/* init pointers to place wqe/rqe at start of hw q */
+			ocrdma_init_hwq_ptr(qp);
+			/* detach qp from the CQ flush list */
+			ocrdma_del_flush_qp(qp);
+			break;
+		default:
+			status = EINVAL;
+			break;
+		};
+		break;
+	case OCRDMA_QPS_INIT:
+		/* qps: INIT->XXX */
+		switch (new_state) {
+		case OCRDMA_QPS_INIT:
+			break;
+		case OCRDMA_QPS_RTR:
+			break;
+		case OCRDMA_QPS_ERR:
+			ocrdma_flush_qp(qp);
+			break;
+		default:
+			/* invalid state change. */
+			status = EINVAL;
+			break;
+		};
+		break;
+	case OCRDMA_QPS_RTR:
+		/* qps: RTR->XXX */
+		switch (new_state) {
+		case OCRDMA_QPS_RTS:
+			break;
+		case OCRDMA_QPS_ERR:
+			ocrdma_flush_qp(qp);
+			break;
+		default:
+			/* invalid state change. */
+			status = EINVAL;
+			break;
+		};
+		break;
+	case OCRDMA_QPS_RTS:
+		/* qps: RTS->XXX */
+		switch (new_state) {
+		case OCRDMA_QPS_SQD:
+		case OCRDMA_QPS_SQE:
+			break;
+		case OCRDMA_QPS_ERR:
+			ocrdma_flush_qp(qp);
+			break;
+		default:
+			/* invalid state change. */
+			status = EINVAL;
+			break;
+		};
+		break;
+	case OCRDMA_QPS_SQD:
+		/* qps: SQD->XXX */
+		switch (new_state) {
+		case OCRDMA_QPS_RTS:
+		case OCRDMA_QPS_SQE:
+		case OCRDMA_QPS_ERR:
+			break;
+		default:
+			/* invalid state change. */
+			status = EINVAL;
+			break;
+		};
+		break;
+	case OCRDMA_QPS_SQE:
+		switch (new_state) {
+		case OCRDMA_QPS_RTS:
+		case OCRDMA_QPS_ERR:
+			break;
+		default:
+			/* invalid state change.
*/ + status = EINVAL; + break; + }; + break; + case OCRDMA_QPS_ERR: + /* qps: ERR->XXX */ + switch (new_state) { + case OCRDMA_QPS_RST: + break; + default: + status = EINVAL; + break; + }; + break; + default: + status = EINVAL; + break; + }; + if (!status) + qp->state = new_state; + + pthread_spin_unlock(&qp->q_lock); + return status; +} + +/* + * ocrdma_modify_qp + */ +int ocrdma_modify_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, + int attr_mask) +{ + struct ibv_modify_qp cmd = {}; + struct ocrdma_qp *qp = get_ocrdma_qp(ibqp); + int status; + + status = ibv_cmd_modify_qp(ibqp, attr, attr_mask, &cmd, sizeof cmd); + if ((!status) && (attr_mask & IBV_QP_STATE)) + ocrdma_qp_state_machine(qp, attr->qp_state); + return status; +} + +/* + * ocrdma_query_qp + */ +int ocrdma_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, + int attr_mask, struct ibv_qp_init_attr *init_attr) +{ + struct ibv_query_qp cmd; + struct ocrdma_qp *qp = get_ocrdma_qp(ibqp); + int status; + + status = ibv_cmd_query_qp(ibqp, attr, attr_mask, + init_attr, &cmd, sizeof(cmd)); + + if (!status) + ocrdma_qp_state_machine(qp, attr->qp_state); + + return status; +} + +static void ocrdma_srq_toggle_bit(struct ocrdma_srq *srq, int idx) +{ + int i = idx / 32; + unsigned int mask = (1 << (idx % 32)); + + if (srq->idx_bit_fields[i] & mask) { + srq->idx_bit_fields[i] &= ~mask; + } else { + srq->idx_bit_fields[i] |= mask; + } +} + +static int ocrdma_srq_get_idx(struct ocrdma_srq *srq) +{ + int row = 0; + int indx = 0; + + for (row = 0; row < srq->bit_fields_len; row++) { + if (srq->idx_bit_fields[row]) { + indx = ffs(srq->idx_bit_fields[row]); + indx = (row * 32) + (indx - 1); + if (indx >= srq->rq.max_cnt) + assert(0); + ocrdma_srq_toggle_bit(srq, indx); + break; + } + } + if (row == srq->bit_fields_len) + assert(0); + return indx + 1; /* Use the index from 1 */ +} + +static int ocrdma_dppq_credits(struct ocrdma_qp_hwq_info *q) +{ + return ((q->max_wqe_idx - q->head) + q->tail) % q->free_cnt; +} + +static int ocrdma_hwq_free_cnt(struct ocrdma_qp_hwq_info *q) +{ + return ((q->max_wqe_idx - q->head) + q->tail) % q->max_cnt; +} + +static int is_hw_sq_empty(struct ocrdma_qp *qp) +{ + return ((qp->sq.tail == qp->sq.head) ? 1 : 0); +} + +static inline int is_hw_rq_empty(struct ocrdma_qp *qp) +{ + return ((qp->rq.head == qp->rq.tail) ? 1 : 0); +} + +static inline void *ocrdma_hwq_head(struct ocrdma_qp_hwq_info *q) +{ + return q->va + (q->head * q->entry_size); +} + +/*static inline void *ocrdma_wq_tail(struct ocrdma_qp_hwq_info *q) +{ + return q->va + (q->tail * q->entry_size); +} +*/ + +static inline void *ocrdma_hwq_head_from_idx(struct ocrdma_qp_hwq_info *q, + uint32_t idx) +{ + return q->va + (idx * q->entry_size); +} + +static void ocrdma_hwq_inc_head(struct ocrdma_qp_hwq_info *q) +{ + q->head = (q->head + 1) & q->max_wqe_idx; +} + +static void ocrdma_hwq_inc_tail(struct ocrdma_qp_hwq_info *q) +{ + q->tail = (q->tail + 1) & q->max_wqe_idx; +} + +static inline void ocrdma_hwq_inc_tail_by_idx(struct ocrdma_qp_hwq_info *q, + int idx) +{ + q->tail = (idx + 1) & q->max_wqe_idx; +} + +static int is_cqe_valid(struct ocrdma_cq *cq, struct ocrdma_cqe *cqe) +{ + int cqe_valid; + cqe_valid = le32toh(cqe->flags_status_srcqpn) & OCRDMA_CQE_VALID; + return (cqe_valid == cq->phase); +} + +static int is_cqe_for_sq(struct ocrdma_cqe *cqe) +{ + return (le32toh(cqe->flags_status_srcqpn) & + OCRDMA_CQE_QTYPE) ? 0 : 1; +} + +static int is_cqe_imm(struct ocrdma_cqe *cqe) +{ + return (le32toh(cqe->flags_status_srcqpn) & + OCRDMA_CQE_IMM) ? 
1 : 0;
+}
+
+static int is_cqe_wr_imm(struct ocrdma_cqe *cqe)
+{
+	return (le32toh(cqe->flags_status_srcqpn) &
+		OCRDMA_CQE_WRITE_IMM) ? 1 : 0;
+}
+
+static inline void ocrdma_srq_inc_tail(struct ocrdma_qp *qp,
+				       struct ocrdma_cqe *cqe)
+{
+	int wqe_idx;
+
+	wqe_idx = (le32toh(cqe->rq.buftag_qpn) >>
+		   OCRDMA_CQE_BUFTAG_SHIFT) & qp->srq->rq.max_wqe_idx;
+
+	if (wqe_idx < 1)
+		assert(0);
+
+	pthread_spin_lock(&qp->srq->q_lock);
+	ocrdma_hwq_inc_tail(&qp->srq->rq);
+	ocrdma_srq_toggle_bit(qp->srq, wqe_idx - 1);
+	pthread_spin_unlock(&qp->srq->q_lock);
+}
+
+static void ocrdma_discard_cqes(struct ocrdma_qp *qp, struct ocrdma_cq *cq)
+{
+	int discard_cnt = 0;
+	uint32_t cur_getp, stop_getp;
+	struct ocrdma_cqe *cqe;
+	uint32_t qpn = 0;
+	int wqe_idx;
+
+	pthread_spin_lock(&cq->cq_lock);
+
+	/* traverse the CQEs in the hw CQ, find the CQEs that match the
+	 * given qp and mark them discarded (by clearing their qpn).
+	 * The doorbell is rung in poll_cq(), since we do not complete
+	 * cqes out of order.
+	 */
+	cur_getp = cq->getp;
+	/* find up to where we reap the cq. */
+	stop_getp = cur_getp;
+	do {
+		if (is_hw_sq_empty(qp) && (!qp->srq && is_hw_rq_empty(qp)))
+			break;
+
+		cqe = cq->va + cur_getp;
+		/* exit when (a) no valid cqe remains, (b) the full hw cq
+		 * has been read, or (c) the qp's sq/rq becomes empty.
+		 */
+		qpn = le32toh(cqe->cmn.qpn) & OCRDMA_CQE_QPN_MASK;
+		/* skip previously discarded cqes and cqes that belong
+		 * to a different qp.
+		 */
+		if ((qpn == 0) || (qpn != qp->id))
+			goto skip_cqe;
+
+		/* mark cqe discarded so that it is not picked up later
+		 * in the poll_cq().
+		 */
+		if (is_cqe_for_sq(cqe)) {
+			wqe_idx = (le32toh(cqe->wq.wqeidx) &
+				   OCRDMA_CQE_WQEIDX_MASK) & qp->sq.max_wqe_idx;
+			ocrdma_hwq_inc_tail_by_idx(&qp->sq, wqe_idx);
+		} else {
+			if (qp->srq)
+				ocrdma_srq_inc_tail(qp, cqe);
+			else
+				ocrdma_hwq_inc_tail(&qp->rq);
+		}
+
+		discard_cnt += 1;
+		/* discard by marking qp_id = 0 */
+		cqe->cmn.qpn = 0;
+skip_cqe:
+		cur_getp = (cur_getp + 1) % cq->max_hw_cqe;
+
+	} while (cur_getp != stop_getp);
+	pthread_spin_unlock(&cq->cq_lock);
+}
+
+/*
+ * ocrdma_destroy_qp
+ */
+int ocrdma_destroy_qp(struct ibv_qp *ibqp)
+{
+	int status = 0;
+	struct ocrdma_qp *qp;
+	struct ocrdma_device *dev;
+
+	qp = get_ocrdma_qp(ibqp);
+	dev = qp->dev;
+	/*
+	 * acquire CQ lock while destroy is in progress, in order to
+	 * protect against processing in-flight CQEs for this QP.
+	 */
+	pthread_spin_lock(&qp->sq_cq->cq_lock);
+
+	if (qp->rq_cq && (qp->rq_cq != qp->sq_cq))
+		pthread_spin_lock(&qp->rq_cq->cq_lock);
+
+	_ocrdma_del_qpn_map(qp->dev, qp);
+
+	if (qp->rq_cq && (qp->rq_cq != qp->sq_cq))
+		pthread_spin_unlock(&qp->rq_cq->cq_lock);
+
+	pthread_spin_unlock(&qp->sq_cq->cq_lock);
+
+	if (qp->db_va)
+		munmap((void *)qp->db_va, qp->db_size);
+	if (qp->rq.va)
+		munmap(qp->rq.va, qp->rq.len);
+	if (qp->sq.va)
+		munmap(qp->sq.va, qp->sq.len);
+
+	/* ensure that CQEs for a newly created QP (whose id may be the
+	 * same as that of the QP just destroyed) are not discarded
+	 * until the old QP's CQEs are discarded.
+ */ + pthread_mutex_lock(&dev->dev_lock); + status = ibv_cmd_destroy_qp(ibqp); + + ocrdma_discard_cqes(qp, qp->sq_cq); + ocrdma_discard_cqes(qp, qp->rq_cq); + pthread_mutex_unlock(&dev->dev_lock); + + ocrdma_del_flush_qp(qp); + + pthread_spin_destroy(&qp->q_lock); + if (qp->rqe_wr_id_tbl) + free(qp->rqe_wr_id_tbl); + if (qp->wqe_wr_id_tbl) + free(qp->wqe_wr_id_tbl); + if (qp->dpp_cq) + ocrdma_destroy_cq(&qp->dpp_cq->ibv_cq); + free(qp); + + return status; +} + +static void ocrdma_ring_sq_db(struct ocrdma_qp *qp) +{ + __le32 db_val = htole32((qp->sq.dbid | (1 << 16))); + + udma_to_device_barrier(); + *(__le32 *) (((uint8_t *) qp->db_sq_va)) = db_val; +} + +static void ocrdma_ring_rq_db(struct ocrdma_qp *qp) +{ + __le32 db_val = htole32((qp->rq.dbid | (1 << qp->db_shift))); + + udma_to_device_barrier(); + *(__le32 *) ((uint8_t *) qp->db_rq_va) = db_val; +} + +static void ocrdma_ring_srq_db(struct ocrdma_srq *srq) +{ + __le32 db_val = htole32(srq->rq.dbid | (1 << srq->db_shift)); + + udma_to_device_barrier(); + *(__le32 *) (srq->db_va) = db_val; +} + +static void ocrdma_ring_cq_db(struct ocrdma_cq *cq, uint32_t armed, + int solicited, uint32_t num_cqe) +{ + uint32_t val; + + val = cq->cq_dbid & OCRDMA_DB_CQ_RING_ID_MASK; + val |= ((cq->cq_dbid & OCRDMA_DB_CQ_RING_ID_EXT_MASK) << + OCRDMA_DB_CQ_RING_ID_EXT_MASK_SHIFT); + + if (armed) + val |= (1 << OCRDMA_DB_CQ_REARM_SHIFT); + if (solicited) + val |= (1 << OCRDMA_DB_CQ_SOLICIT_SHIFT); + val |= (num_cqe << OCRDMA_DB_CQ_NUM_POPPED_SHIFT); + + udma_to_device_barrier(); + *(__le32 *) ((uint8_t *) (cq->db_va) + OCRDMA_DB_CQ_OFFSET) = + htole32(val); +} + +static void ocrdma_build_ud_hdr(struct ocrdma_qp *qp, + struct ocrdma_hdr_wqe *hdr, + struct ibv_send_wr *wr) +{ + struct ocrdma_ewqe_ud_hdr *ud_hdr = + (struct ocrdma_ewqe_ud_hdr *)(hdr + 1); + struct ocrdma_ah *ah = get_ocrdma_ah(wr->wr.ud.ah); + + ud_hdr->rsvd_dest_qpn = wr->wr.ud.remote_qpn; + ud_hdr->qkey = wr->wr.ud.remote_qkey; + ud_hdr->rsvd_ahid = ah->id; + if (ah->isvlan) + hdr->cw |= (OCRDMA_FLAG_AH_VLAN_PR << + OCRDMA_WQE_FLAGS_SHIFT); + ud_hdr->hdr_type = ah->hdr_type; +} + +static void ocrdma_build_sges(struct ocrdma_hdr_wqe *hdr, + struct ocrdma_sge *sge, int num_sge, + struct ibv_sge *sg_list) +{ + int i; + for (i = 0; i < num_sge; i++) { + sge[i].lrkey = sg_list[i].lkey; + sge[i].addr_lo = sg_list[i].addr; + sge[i].addr_hi = sg_list[i].addr >> 32; + sge[i].len = sg_list[i].length; + hdr->total_len += sg_list[i].length; + } + if (num_sge == 0) + memset(sge, 0, sizeof(*sge)); +} + + +static inline uint32_t ocrdma_sglist_len(struct ibv_sge *sg_list, int num_sge) +{ + uint32_t total_len = 0, i; + + for (i = 0; i < num_sge; i++) + total_len += sg_list[i].length; + return total_len; +} + +static inline int ocrdma_build_inline_sges(struct ocrdma_qp *qp, + struct ocrdma_hdr_wqe *hdr, + struct ocrdma_sge *sge, + struct ibv_send_wr *wr, + uint32_t wqe_size) +{ + int i; + char *dpp_addr; + + if (wr->send_flags & IBV_SEND_INLINE && qp->qp_type != IBV_QPT_UD) { + hdr->total_len = ocrdma_sglist_len(wr->sg_list, wr->num_sge); + if (hdr->total_len > qp->max_inline_data) { + ocrdma_err + ("%s() supported_len=0x%x, unsupported len req=0x%x\n", + __func__, qp->max_inline_data, hdr->total_len); + return EINVAL; + } + + dpp_addr = (char *)sge; + for (i = 0; i < wr->num_sge; i++) { + memcpy(dpp_addr, + (void *)(unsigned long)wr->sg_list[i].addr, + wr->sg_list[i].length); + dpp_addr += wr->sg_list[i].length; + } + + wqe_size += ROUND_UP_X(hdr->total_len, OCRDMA_WQE_ALIGN_BYTES); + if (0 == 
hdr->total_len) + wqe_size += sizeof(struct ocrdma_sge); + hdr->cw |= (OCRDMA_TYPE_INLINE << OCRDMA_WQE_TYPE_SHIFT); + } else { + ocrdma_build_sges(hdr, sge, wr->num_sge, wr->sg_list); + if (wr->num_sge) + wqe_size += (wr->num_sge * sizeof(struct ocrdma_sge)); + else + wqe_size += sizeof(struct ocrdma_sge); + hdr->cw |= (OCRDMA_TYPE_LKEY << OCRDMA_WQE_TYPE_SHIFT); + } + hdr->cw |= ((wqe_size / OCRDMA_WQE_STRIDE) << OCRDMA_WQE_SIZE_SHIFT); + return 0; +} + +static int ocrdma_build_send(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, + struct ibv_send_wr *wr) +{ + int status; + struct ocrdma_sge *sge; + uint32_t wqe_size = sizeof(*hdr); + + if (qp->qp_type == IBV_QPT_UD) { + wqe_size += sizeof(struct ocrdma_ewqe_ud_hdr); + ocrdma_build_ud_hdr(qp, hdr, wr); + sge = (struct ocrdma_sge *)(hdr + 2); + } else + sge = (struct ocrdma_sge *)(hdr + 1); + + status = ocrdma_build_inline_sges(qp, hdr, sge, wr, wqe_size); + + return status; +} + +static int ocrdma_build_write(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, + struct ibv_send_wr *wr) +{ + int status; + struct ocrdma_sge *ext_rw = (struct ocrdma_sge *)(hdr + 1); + struct ocrdma_sge *sge = ext_rw + 1; + uint32_t wqe_size = sizeof(*hdr) + sizeof(*ext_rw); + + status = ocrdma_build_inline_sges(qp, hdr, sge, wr, wqe_size); + if (status) + return status; + + ext_rw->addr_lo = wr->wr.rdma.remote_addr; + ext_rw->addr_hi = (wr->wr.rdma.remote_addr >> 32); + ext_rw->lrkey = wr->wr.rdma.rkey; + ext_rw->len = hdr->total_len; + + return 0; +} + +static void ocrdma_build_read(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, + struct ibv_send_wr *wr) +{ + struct ocrdma_sge *ext_rw = (struct ocrdma_sge *)(hdr + 1); + struct ocrdma_sge *sge = ext_rw + 1; + uint32_t wqe_size = ((wr->num_sge + 1) * sizeof(*sge)) + sizeof(*hdr); + + hdr->cw |= (OCRDMA_TYPE_LKEY << OCRDMA_WQE_TYPE_SHIFT); + hdr->cw |= ((wqe_size / OCRDMA_WQE_STRIDE) << OCRDMA_WQE_SIZE_SHIFT); + hdr->cw |= (OCRDMA_READ << OCRDMA_WQE_OPCODE_SHIFT); + + ocrdma_build_sges(hdr, sge, wr->num_sge, wr->sg_list); + + ext_rw->addr_lo = wr->wr.rdma.remote_addr; + ext_rw->addr_hi = (wr->wr.rdma.remote_addr >> 32); + ext_rw->lrkey = wr->wr.rdma.rkey; + ext_rw->len = hdr->total_len; + +} + +/* Dpp cq is single entry cq, we just need to read + * wqe index from first 16 bits at 0th cqe index. 
+ */
+static void ocrdma_poll_dpp_cq(struct ocrdma_qp *qp)
+{
+	struct ocrdma_cq *cq = qp->dpp_cq;
+	struct ocrdma_dpp_cqe *cqe;
+	int idx = 0;
+	cqe = ((struct ocrdma_dpp_cqe *)cq->va);
+	idx = cqe->wqe_idx_valid & OCRDMA_DPP_WQE_INDEX_MASK;
+
+	if (idx != qp->dpp_prev_indx) {
+		ocrdma_hwq_inc_tail_by_idx(&qp->dpp_q, idx);
+		qp->dpp_prev_indx = idx;
+	}
+}
+
+static uint32_t ocrdma_get_hdr_len(struct ocrdma_qp *qp,
+				   struct ocrdma_hdr_wqe *hdr)
+{
+	uint32_t hdr_sz = sizeof(*hdr);
+	if (qp->qp_type == IBV_QPT_UD)
+		hdr_sz += sizeof(struct ocrdma_ewqe_ud_hdr);
+	if (hdr->cw & (OCRDMA_WRITE << OCRDMA_WQE_OPCODE_SHIFT))
+		hdr_sz += sizeof(struct ocrdma_sge);
+	return hdr_sz / sizeof(uint32_t);
+}
+
+static void ocrdma_build_dpp_wqe(void *va, struct ocrdma_hdr_wqe *wqe,
+				 uint32_t hdr_len)
+{
+	uint32_t pyld_len = (wqe->cw >> OCRDMA_WQE_SIZE_SHIFT) * 2;
+	uint32_t i = 0;
+
+	mmio_wc_start();
+
+	/* convert WQE header to LE format */
+	for (; i < hdr_len; i++)
+		*((__le32 *) va + i) =
+			htole32(*((uint32_t *) wqe + i));
+	/* conversion of the payload data is done by the HW */
+	for (; i < pyld_len; i++)
+		*((uint32_t *) va + i) = (*((uint32_t *) wqe + i));
+
+	mmio_flush_writes();
+}
+
+static void ocrdma_post_dpp_wqe(struct ocrdma_qp *qp,
+				struct ocrdma_hdr_wqe *hdr)
+{
+	if (qp->dpp_cq && ocrdma_dppq_credits(&qp->dpp_q) == 0)
+		ocrdma_poll_dpp_cq(qp);
+	if (!qp->dpp_cq || ocrdma_dppq_credits(&qp->dpp_q)) {
+		ocrdma_build_dpp_wqe(qp->dpp_q.va, hdr,
+				     ocrdma_get_hdr_len(qp, hdr));
+		qp->wqe_wr_id_tbl[qp->sq.head].dpp_wqe = 1;
+		qp->wqe_wr_id_tbl[qp->sq.head].dpp_wqe_idx = qp->dpp_q.head;
+		/* if the dpp cq is not enabled, we can post the wqe as
+		 * soon as we receive it; the adapter takes care of flow
+		 * control.
+		 */
+		if (qp->dpp_cq)
+			ocrdma_hwq_inc_head(&qp->dpp_q);
+	} else
+		qp->wqe_wr_id_tbl[qp->sq.head].dpp_wqe = 0;
+}
+
+/*
+ * ocrdma_post_send
+ */
+int ocrdma_post_send(struct ibv_qp *ib_qp, struct ibv_send_wr *wr,
+		     struct ibv_send_wr **bad_wr)
+{
+	int status = 0;
+	struct ocrdma_qp *qp;
+	struct ocrdma_hdr_wqe *hdr;
+
+	qp = get_ocrdma_qp(ib_qp);
+
+	pthread_spin_lock(&qp->q_lock);
+	if (qp->state != OCRDMA_QPS_RTS && qp->state != OCRDMA_QPS_SQD) {
+		pthread_spin_unlock(&qp->q_lock);
+		*bad_wr = wr;
+		return EINVAL;
+	}
+
+	while (wr) {
+
+		if (qp->qp_type == IBV_QPT_UD && (wr->opcode != IBV_WR_SEND &&
+		    wr->opcode != IBV_WR_SEND_WITH_IMM)) {
+			*bad_wr = wr;
+			status = EINVAL;
+			break;
+		}
+
+		if (ocrdma_hwq_free_cnt(&qp->sq) == 0 ||
+		    wr->num_sge > qp->sq.max_sges) {
+			*bad_wr = wr;
+			status = ENOMEM;
+			break;
+		}
+		hdr = ocrdma_hwq_head(&qp->sq);
+		hdr->cw = 0;
+		hdr->total_len = 0;
+		if (wr->send_flags & IBV_SEND_SIGNALED || qp->signaled)
+			hdr->cw = (OCRDMA_FLAG_SIG << OCRDMA_WQE_FLAGS_SHIFT);
+		if (wr->send_flags & IBV_SEND_FENCE)
+			hdr->cw |=
+			    (OCRDMA_FLAG_FENCE_L << OCRDMA_WQE_FLAGS_SHIFT);
+		if (wr->send_flags & IBV_SEND_SOLICITED)
+			hdr->cw |=
+			    (OCRDMA_FLAG_SOLICIT << OCRDMA_WQE_FLAGS_SHIFT);
+
+		qp->wqe_wr_id_tbl[qp->sq.head].wrid = wr->wr_id;
+		switch (wr->opcode) {
+		case IBV_WR_SEND_WITH_IMM:
+			hdr->cw |= (OCRDMA_FLAG_IMM << OCRDMA_WQE_FLAGS_SHIFT);
+			hdr->immdt = be32toh(wr->imm_data);
+			SWITCH_FALLTHROUGH;
+		case IBV_WR_SEND:
+			hdr->cw |= (OCRDMA_SEND << OCRDMA_WQE_OPCODE_SHIFT);
+			status = ocrdma_build_send(qp, hdr, wr);
+			break;
+		case IBV_WR_RDMA_WRITE_WITH_IMM:
+			hdr->cw |= (OCRDMA_FLAG_IMM << OCRDMA_WQE_FLAGS_SHIFT);
+			hdr->immdt = be32toh(wr->imm_data);
+			SWITCH_FALLTHROUGH;
+		case IBV_WR_RDMA_WRITE:
+			hdr->cw |= (OCRDMA_WRITE << OCRDMA_WQE_OPCODE_SHIFT);
+			status =
ocrdma_build_write(qp, hdr, wr); + break; + case IBV_WR_RDMA_READ: + ocrdma_build_read(qp, hdr, wr); + break; + default: + status = EINVAL; + break; + } + if (status) { + *bad_wr = wr; + break; + } + if (wr->send_flags & IBV_SEND_SIGNALED || qp->signaled) + qp->wqe_wr_id_tbl[qp->sq.head].signaled = 1; + else + qp->wqe_wr_id_tbl[qp->sq.head].signaled = 0; + + if (qp->dpp_enabled && (wr->send_flags & IBV_SEND_INLINE)) + ocrdma_post_dpp_wqe(qp, hdr); + + ocrdma_swap_cpu_to_le(hdr, ((hdr->cw >> OCRDMA_WQE_SIZE_SHIFT) & + OCRDMA_WQE_SIZE_MASK) * + OCRDMA_WQE_STRIDE); + + ocrdma_ring_sq_db(qp); + + /* update pointer, counter for next wr */ + ocrdma_hwq_inc_head(&qp->sq); + wr = wr->next; + } + pthread_spin_unlock(&qp->q_lock); + + return status; +} + +static void ocrdma_build_rqe(struct ocrdma_hdr_wqe *rqe, struct ibv_recv_wr *wr, + uint16_t tag) +{ + struct ocrdma_sge *sge; + uint32_t wqe_size; + + if (wr->num_sge) + wqe_size = (wr->num_sge * sizeof(*sge)) + sizeof(*rqe); + else + wqe_size = sizeof(*sge) + sizeof(*rqe); + + rqe->cw = ((wqe_size / OCRDMA_WQE_STRIDE) << OCRDMA_WQE_SIZE_SHIFT); + rqe->cw |= (OCRDMA_FLAG_SIG << OCRDMA_WQE_FLAGS_SHIFT); + rqe->cw |= (OCRDMA_TYPE_LKEY << OCRDMA_WQE_TYPE_SHIFT); + rqe->total_len = 0; + rqe->rsvd_tag = tag; + sge = (struct ocrdma_sge *)(rqe + 1); + ocrdma_build_sges(rqe, sge, wr->num_sge, wr->sg_list); + ocrdma_swap_cpu_to_le(rqe, wqe_size); +} + +/* + * ocrdma_post_recv + */ +int ocrdma_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + int status = 0; + struct ocrdma_qp *qp; + struct ocrdma_hdr_wqe *rqe; + + qp = get_ocrdma_qp(ibqp); + + pthread_spin_lock(&qp->q_lock); + if (qp->state == OCRDMA_QPS_RST || qp->state == OCRDMA_QPS_ERR) { + pthread_spin_unlock(&qp->q_lock); + *bad_wr = wr; + return EINVAL; + } + + while (wr) { + if (ocrdma_hwq_free_cnt(&qp->rq) == 0 || + wr->num_sge > qp->rq.max_sges) { + status = ENOMEM; + *bad_wr = wr; + break; + } + rqe = ocrdma_hwq_head(&qp->rq); + ocrdma_build_rqe(rqe, wr, 0); + qp->rqe_wr_id_tbl[qp->rq.head] = wr->wr_id; + ocrdma_ring_rq_db(qp); + + /* update pointer, counter for next wr */ + ocrdma_hwq_inc_head(&qp->rq); + wr = wr->next; + } + pthread_spin_unlock(&qp->q_lock); + + return status; +} + +static enum ibv_wc_status ocrdma_to_ibwc_err(uint16_t status) +{ + enum ibv_wc_status ibwc_status = IBV_WC_GENERAL_ERR; + switch (status) { + case OCRDMA_CQE_GENERAL_ERR: + ibwc_status = IBV_WC_GENERAL_ERR; + break; + case OCRDMA_CQE_LOC_LEN_ERR: + ibwc_status = IBV_WC_LOC_LEN_ERR; + break; + case OCRDMA_CQE_LOC_QP_OP_ERR: + ibwc_status = IBV_WC_LOC_QP_OP_ERR; + break; + case OCRDMA_CQE_LOC_EEC_OP_ERR: + ibwc_status = IBV_WC_LOC_EEC_OP_ERR; + break; + case OCRDMA_CQE_LOC_PROT_ERR: + ibwc_status = IBV_WC_LOC_PROT_ERR; + break; + case OCRDMA_CQE_WR_FLUSH_ERR: + ibwc_status = IBV_WC_WR_FLUSH_ERR; + break; + case OCRDMA_CQE_BAD_RESP_ERR: + ibwc_status = IBV_WC_BAD_RESP_ERR; + break; + case OCRDMA_CQE_LOC_ACCESS_ERR: + ibwc_status = IBV_WC_LOC_ACCESS_ERR; + break; + case OCRDMA_CQE_REM_INV_REQ_ERR: + ibwc_status = IBV_WC_REM_INV_REQ_ERR; + break; + case OCRDMA_CQE_REM_ACCESS_ERR: + ibwc_status = IBV_WC_REM_ACCESS_ERR; + break; + case OCRDMA_CQE_REM_OP_ERR: + ibwc_status = IBV_WC_REM_OP_ERR; + break; + case OCRDMA_CQE_RETRY_EXC_ERR: + ibwc_status = IBV_WC_RETRY_EXC_ERR; + break; + case OCRDMA_CQE_RNR_RETRY_EXC_ERR: + ibwc_status = IBV_WC_RNR_RETRY_EXC_ERR; + break; + case OCRDMA_CQE_LOC_RDD_VIOL_ERR: + ibwc_status = IBV_WC_LOC_RDD_VIOL_ERR; + break; + case 
OCRDMA_CQE_REM_INV_RD_REQ_ERR:
+		ibwc_status = IBV_WC_REM_INV_RD_REQ_ERR;
+		break;
+	case OCRDMA_CQE_REM_ABORT_ERR:
+		ibwc_status = IBV_WC_REM_ABORT_ERR;
+		break;
+	case OCRDMA_CQE_INV_EECN_ERR:
+		ibwc_status = IBV_WC_INV_EECN_ERR;
+		break;
+	case OCRDMA_CQE_INV_EEC_STATE_ERR:
+		ibwc_status = IBV_WC_INV_EEC_STATE_ERR;
+		break;
+	case OCRDMA_CQE_FATAL_ERR:
+		ibwc_status = IBV_WC_FATAL_ERR;
+		break;
+	case OCRDMA_CQE_RESP_TIMEOUT_ERR:
+		ibwc_status = IBV_WC_RESP_TIMEOUT_ERR;
+		break;
+	default:
+		ibwc_status = IBV_WC_GENERAL_ERR;
+		break;
+	};
+	return ibwc_status;
+}
+
+static void ocrdma_update_wc(struct ocrdma_qp *qp, struct ibv_wc *ibwc,
+			     uint32_t wqe_idx)
+{
+	struct ocrdma_hdr_wqe_le *hdr;
+	struct ocrdma_sge *rw;
+	int opcode;
+
+	hdr = ocrdma_hwq_head_from_idx(&qp->sq, wqe_idx);
+
+	ibwc->wr_id = qp->wqe_wr_id_tbl[wqe_idx].wrid;
+
+	/* Undo the hdr->cw swap */
+	opcode = le32toh(hdr->cw) & OCRDMA_WQE_OPCODE_MASK;
+	switch (opcode) {
+	case OCRDMA_WRITE:
+		ibwc->opcode = IBV_WC_RDMA_WRITE;
+		break;
+	case OCRDMA_READ:
+		rw = (struct ocrdma_sge *)(hdr + 1);
+		ibwc->opcode = IBV_WC_RDMA_READ;
+		ibwc->byte_len = rw->len;
+		break;
+	case OCRDMA_SEND:
+		ibwc->opcode = IBV_WC_SEND;
+		break;
+	default:
+		ibwc->status = IBV_WC_GENERAL_ERR;
+		ocrdma_err("%s() invalid opcode received = 0x%x\n",
+			   __func__, le32toh(hdr->cw) & OCRDMA_WQE_OPCODE_MASK);
+		break;
+	};
+}
+
+static void ocrdma_set_cqe_status_flushed(struct ocrdma_qp *qp,
+					  struct ocrdma_cqe *cqe)
+{
+	if (is_cqe_for_sq(cqe)) {
+		cqe->flags_status_srcqpn =
+		    htole32(le32toh(cqe->flags_status_srcqpn)
+			    & ~OCRDMA_CQE_STATUS_MASK);
+		cqe->flags_status_srcqpn =
+		    htole32(le32toh(cqe->flags_status_srcqpn)
+			    | (OCRDMA_CQE_WR_FLUSH_ERR <<
+			       OCRDMA_CQE_STATUS_SHIFT));
+	} else {
+		if (qp->qp_type == IBV_QPT_UD) {
+			cqe->flags_status_srcqpn =
+			    htole32(le32toh(cqe->flags_status_srcqpn) &
+				    ~OCRDMA_CQE_UD_STATUS_MASK);
+			cqe->flags_status_srcqpn =
+			    htole32(le32toh(cqe->flags_status_srcqpn) |
+				    (OCRDMA_CQE_WR_FLUSH_ERR <<
+				     OCRDMA_CQE_UD_STATUS_SHIFT));
+		} else {
+			cqe->flags_status_srcqpn =
+			    htole32(le32toh(cqe->flags_status_srcqpn) &
+				    ~OCRDMA_CQE_STATUS_MASK);
+			cqe->flags_status_srcqpn =
+			    htole32(le32toh(cqe->flags_status_srcqpn) |
+				    (OCRDMA_CQE_WR_FLUSH_ERR <<
+				     OCRDMA_CQE_STATUS_SHIFT));
+		}
+	}
+}
+
+static int ocrdma_update_err_cqe(struct ibv_wc *ibwc, struct ocrdma_cqe *cqe,
+				 struct ocrdma_qp *qp, int status)
+{
+	int expand = 0;
+
+	ibwc->byte_len = 0;
+	ibwc->qp_num = qp->id;
+	ibwc->status = ocrdma_to_ibwc_err(status);
+
+	ocrdma_flush_qp(qp);
+	ocrdma_qp_state_machine(qp, IBV_QPS_ERR);
+
+	/* if a wqe/rqe is still pending for which a cqe must be
+	 * returned, trigger expanding it into a flushed cqe.
+ */
+	if (!is_hw_rq_empty(qp) || !is_hw_sq_empty(qp)) {
+		expand = 1;
+		ocrdma_set_cqe_status_flushed(qp, cqe);
+	}
+	return expand;
+}
+
+static int ocrdma_update_err_rcqe(struct ibv_wc *ibwc, struct ocrdma_cqe *cqe,
+				  struct ocrdma_qp *qp, int status)
+{
+	ibwc->opcode = IBV_WC_RECV;
+	ibwc->wr_id = qp->rqe_wr_id_tbl[qp->rq.tail];
+	ocrdma_hwq_inc_tail(&qp->rq);
+
+	return ocrdma_update_err_cqe(ibwc, cqe, qp, status);
+}
+
+static int ocrdma_update_err_scqe(struct ibv_wc *ibwc, struct ocrdma_cqe *cqe,
+				  struct ocrdma_qp *qp, int status)
+{
+	ocrdma_update_wc(qp, ibwc, qp->sq.tail);
+	ocrdma_hwq_inc_tail(&qp->sq);
+
+	return ocrdma_update_err_cqe(ibwc, cqe, qp, status);
+}
+
+static int ocrdma_poll_err_scqe(struct ocrdma_qp *qp,
+				struct ocrdma_cqe *cqe, struct ibv_wc *ibwc,
+				int *polled, int *stop)
+{
+	int expand;
+	int status = (le32toh(cqe->flags_status_srcqpn) &
+		      OCRDMA_CQE_STATUS_MASK) >> OCRDMA_CQE_STATUS_SHIFT;
+
+	/* the hw sq is empty but the rq is not; keep the cqe so that
+	 * we get the cq event again.
+	 */
+	if (is_hw_sq_empty(qp) && !is_hw_rq_empty(qp)) {
+		/* when the rq and sq share the same cq, it is safe to
+		 * return flush cqes for RQEs.
+		 */
+		if (!qp->srq && (qp->sq_cq == qp->rq_cq)) {
+			*polled = 1;
+			status = OCRDMA_CQE_WR_FLUSH_ERR;
+			expand = ocrdma_update_err_rcqe(ibwc, cqe, qp, status);
+		} else {
+			*polled = 0;
+			*stop = 1;
+			expand = 0;
+		}
+	} else if (is_hw_sq_empty(qp)) {
+		/* Do nothing */
+		expand = 0;
+		*polled = 0;
+		*stop = 0;
+	} else {
+		*polled = 1;
+		expand = ocrdma_update_err_scqe(ibwc, cqe, qp, status);
+	}
+	return expand;
+}
+
+static int ocrdma_poll_success_scqe(struct ocrdma_qp *qp,
+				    struct ocrdma_cqe *cqe,
+				    struct ibv_wc *ibwc, int *polled)
+{
+	int expand = 0;
+	int tail = qp->sq.tail;
+	uint32_t wqe_idx;
+
+	if (!qp->wqe_wr_id_tbl[tail].signaled) {
+		*polled = 0;	/* WC cannot be consumed yet */
+	} else {
+		ibwc->status = IBV_WC_SUCCESS;
+		ibwc->wc_flags = 0;
+		ibwc->qp_num = qp->id;
+		ocrdma_update_wc(qp, ibwc, tail);
+		*polled = 1;
+	}
+
+	wqe_idx = (le32toh(cqe->wq.wqeidx) &
+		   OCRDMA_CQE_WQEIDX_MASK) & qp->sq.max_wqe_idx;
+	if (tail != wqe_idx)	/* CQE cannot be consumed yet */
+		expand = 1;	/* Coalesced CQE */
+
+	ocrdma_hwq_inc_tail(&qp->sq);
+	return expand;
+}
+
+static int ocrdma_poll_scqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe,
+			    struct ibv_wc *ibwc, int *polled, int *stop)
+{
+	int status, expand;
+
+	status = (le32toh(cqe->flags_status_srcqpn) &
+		  OCRDMA_CQE_STATUS_MASK) >> OCRDMA_CQE_STATUS_SHIFT;
+
+	if (status == OCRDMA_CQE_SUCCESS)
+		expand = ocrdma_poll_success_scqe(qp, cqe, ibwc, polled);
+	else
+		expand = ocrdma_poll_err_scqe(qp, cqe, ibwc, polled, stop);
+	return expand;
+}
+
+static int ocrdma_update_ud_rcqe(struct ibv_wc *ibwc, struct ocrdma_cqe *cqe)
+{
+	int status;
+
+	status = (le32toh(cqe->flags_status_srcqpn) &
+		  OCRDMA_CQE_UD_STATUS_MASK) >> OCRDMA_CQE_UD_STATUS_SHIFT;
+	ibwc->src_qp = le32toh(cqe->flags_status_srcqpn) &
+	    OCRDMA_CQE_SRCQP_MASK;
+	ibwc->pkey_index = le32toh(cqe->ud.rxlen_pkey) &
+	    OCRDMA_CQE_PKEY_MASK;
+	ibwc->wc_flags = IBV_WC_GRH;
+	ibwc->byte_len = (le32toh(cqe->ud.rxlen_pkey) >>
+			  OCRDMA_CQE_UD_XFER_LEN_SHIFT);
+	return status;
+}
+
+static void ocrdma_update_free_srq_cqe(struct ibv_wc *ibwc,
+				       struct ocrdma_cqe *cqe,
+				       struct ocrdma_qp *qp)
+{
+	struct ocrdma_srq *srq = NULL;
+	uint32_t wqe_idx;
+
+	srq = get_ocrdma_srq(qp->ibv_qp.srq);
+#if !defined(SKH_A0_WORKAROUND) /* BUG 113416 */
+	wqe_idx = (le32toh(cqe->rq.buftag_qpn) >>
+	    OCRDMA_CQE_BUFTAG_SHIFT) &
srq->rq.max_wqe_idx;
+#else
+	wqe_idx = (le32toh(cqe->flags_status_srcqpn)) & 0xFFFF;
+#endif
+	if (wqe_idx < 1)
+		assert(0);
+	ibwc->wr_id = srq->rqe_wr_id_tbl[wqe_idx];
+
+	pthread_spin_lock(&srq->q_lock);
+	ocrdma_srq_toggle_bit(srq, wqe_idx - 1);
+	pthread_spin_unlock(&srq->q_lock);
+
+	ocrdma_hwq_inc_tail(&srq->rq);
+}
+
+static int ocrdma_poll_err_rcqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe,
+				struct ibv_wc *ibwc, int *polled, int *stop,
+				int status)
+{
+	int expand;
+
+	/* the hw rq is empty but the sq is not; keep the cqe so that
+	 * we get the cq event again.
+	 */
+	if (is_hw_rq_empty(qp) && !is_hw_sq_empty(qp)) {
+		if (!qp->srq && (qp->sq_cq == qp->rq_cq)) {
+			*polled = 1;
+			status = OCRDMA_CQE_WR_FLUSH_ERR;
+			expand = ocrdma_update_err_scqe(ibwc, cqe, qp, status);
+		} else {
+			*polled = 0;
+			*stop = 1;
+			expand = 0;
+		}
+	} else if (is_hw_rq_empty(qp)) {
+		/* Do nothing */
+		expand = 0;
+		*polled = 0;
+		*stop = 0;
+	} else {
+		*polled = 1;
+		expand = ocrdma_update_err_rcqe(ibwc, cqe, qp, status);
+	}
+	return expand;
+}
+
+static void ocrdma_poll_success_rcqe(struct ocrdma_qp *qp,
+				     struct ocrdma_cqe *cqe,
+				     struct ibv_wc *ibwc)
+{
+	ibwc->opcode = IBV_WC_RECV;
+	ibwc->qp_num = qp->id;
+	ibwc->status = IBV_WC_SUCCESS;
+
+	if (qp->qp_type == IBV_QPT_UD)
+		ocrdma_update_ud_rcqe(ibwc, cqe);
+	else
+		ibwc->byte_len = le32toh(cqe->rq.rxlen);
+
+	if (is_cqe_imm(cqe)) {
+		ibwc->imm_data = htobe32(le32toh(cqe->rq.lkey_immdt));
+		ibwc->wc_flags |= IBV_WC_WITH_IMM;
+	} else if (is_cqe_wr_imm(cqe)) {
+		ibwc->opcode = IBV_WC_RECV_RDMA_WITH_IMM;
+		ibwc->imm_data = htobe32(le32toh(cqe->rq.lkey_immdt));
+		ibwc->wc_flags |= IBV_WC_WITH_IMM;
+	}
+	if (qp->ibv_qp.srq)
+		ocrdma_update_free_srq_cqe(ibwc, cqe, qp);
+	else {
+		ibwc->wr_id = qp->rqe_wr_id_tbl[qp->rq.tail];
+		ocrdma_hwq_inc_tail(&qp->rq);
+	}
+}
+
+static int ocrdma_poll_rcqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe,
+			    struct ibv_wc *ibwc, int *polled, int *stop)
+{
+	int status;
+	int expand = 0;
+
+	ibwc->wc_flags = 0;
+	if (qp->qp_type == IBV_QPT_UD)
+		status = (le32toh(cqe->flags_status_srcqpn) &
+			  OCRDMA_CQE_UD_STATUS_MASK) >>
+			  OCRDMA_CQE_UD_STATUS_SHIFT;
+	else
+		status = (le32toh(cqe->flags_status_srcqpn) &
+			  OCRDMA_CQE_STATUS_MASK) >> OCRDMA_CQE_STATUS_SHIFT;
+
+	if (status == OCRDMA_CQE_SUCCESS) {
+		*polled = 1;
+		ocrdma_poll_success_rcqe(qp, cqe, ibwc);
+	} else {
+		expand = ocrdma_poll_err_rcqe(qp, cqe, ibwc, polled, stop,
+					      status);
+	}
+	return expand;
+}
+
+static void ocrdma_change_cq_phase(struct ocrdma_cq *cq,
+				   struct ocrdma_cqe *cqe, uint16_t cur_getp)
+{
+	if (cq->phase_change) {
+		if (cur_getp == 0)
+			cq->phase = (~cq->phase & OCRDMA_CQE_VALID);
+	} else
+		cqe->flags_status_srcqpn = 0;	/* clear valid bit */
+}
+
+static int ocrdma_poll_hwcq(struct ocrdma_cq *cq, int num_entries,
+			    struct ibv_wc *ibwc)
+{
+	uint16_t qpn = 0;
+	int i = 0;
+	int expand = 0;
+	int polled_hw_cqes = 0;
+	struct ocrdma_qp *qp = NULL;
+	struct ocrdma_device *dev = cq->dev;
+	struct ocrdma_cqe *cqe;
+	uint16_t cur_getp;
+	int polled = 0;
+	int stop = 0;
+
+	cur_getp = cq->getp;
+	while (num_entries) {
+		cqe = cq->va + cur_getp;
+		/* check whether the cqe is valid */
+		if (!is_cqe_valid(cq, cqe))
+			break;
+		qpn = (le32toh(cqe->cmn.qpn) & OCRDMA_CQE_QPN_MASK);
+		/* ignore discarded cqe */
+		if (qpn == 0)
+			goto skip_cqe;
+		qp = dev->qp_tbl[qpn];
+		if (qp == NULL) {
+			ocrdma_err("%s() cqe for invalid qpn= 0x%x received.\n",
+				   __func__, qpn);
+			goto skip_cqe;
+		}
+
+		if (is_cqe_for_sq(cqe)) {
+			expand =
ocrdma_poll_scqe(qp, cqe, ibwc, &polled, + &stop); + } else { + expand = ocrdma_poll_rcqe(qp, cqe, ibwc, &polled, + &stop); + } + if (expand) + goto expand_cqe; + if (stop) + goto stop_cqe; + /* clear qpn to avoid duplicate processing by discard_cqe() */ + cqe->cmn.qpn = 0; +skip_cqe: + polled_hw_cqes += 1; + cur_getp = (cur_getp + 1) % cq->max_hw_cqe; + ocrdma_change_cq_phase(cq, cqe, cur_getp); +expand_cqe: + if (polled) { + num_entries -= 1; + i += 1; + ibwc = ibwc + 1; + polled = 0; + } + } +stop_cqe: + cq->getp = cur_getp; + if (cq->deferred_arm || polled_hw_cqes) { + ocrdma_ring_cq_db(cq, cq->deferred_arm, + cq->deferred_sol, polled_hw_cqes); + cq->deferred_arm = 0; + cq->deferred_sol = 0; + } + + return i; +} + +static int ocrdma_add_err_cqe(struct ocrdma_cq *cq, int num_entries, + struct ocrdma_qp *qp, struct ibv_wc *ibwc) +{ + int err_cqes = 0; + + while (num_entries) { + if (is_hw_sq_empty(qp) && is_hw_rq_empty(qp)) + break; + if (!is_hw_sq_empty(qp) && qp->sq_cq == cq) { + ocrdma_update_wc(qp, ibwc, qp->sq.tail); + ocrdma_hwq_inc_tail(&qp->sq); + } else if (!is_hw_rq_empty(qp) && qp->rq_cq == cq) { + ibwc->wr_id = qp->rqe_wr_id_tbl[qp->rq.tail]; + ocrdma_hwq_inc_tail(&qp->rq); + } else + return err_cqes; + ibwc->byte_len = 0; + ibwc->status = IBV_WC_WR_FLUSH_ERR; + ibwc = ibwc + 1; + err_cqes += 1; + num_entries -= 1; + } + return err_cqes; +} + +/* + * ocrdma_poll_cq + */ +int ocrdma_poll_cq(struct ibv_cq *ibcq, int num_entries, struct ibv_wc *wc) +{ + struct ocrdma_cq *cq; + int cqes_to_poll = num_entries; + int num_os_cqe = 0, err_cqes = 0; + struct ocrdma_qp *qp; + struct ocrdma_qp *qp_tmp; + + cq = get_ocrdma_cq(ibcq); + pthread_spin_lock(&cq->cq_lock); + num_os_cqe = ocrdma_poll_hwcq(cq, num_entries, wc); + pthread_spin_unlock(&cq->cq_lock); + cqes_to_poll -= num_os_cqe; + + if (cqes_to_poll) { + wc = wc + num_os_cqe; + pthread_spin_lock(&cq->dev->flush_q_lock); + list_for_each_safe(&cq->sq_head, qp, qp_tmp, sq_entry) { + if (cqes_to_poll == 0) + break; + err_cqes = ocrdma_add_err_cqe(cq, cqes_to_poll, qp, wc); + cqes_to_poll -= err_cqes; + num_os_cqe += err_cqes; + wc = wc + err_cqes; + } + pthread_spin_unlock(&cq->dev->flush_q_lock); + } + return num_os_cqe; +} + +/* + * ocrdma_arm_cq + */ +int ocrdma_arm_cq(struct ibv_cq *ibcq, int solicited) +{ + struct ocrdma_cq *cq; + + cq = get_ocrdma_cq(ibcq); + pthread_spin_lock(&cq->cq_lock); + + if (cq->first_arm) { + ocrdma_ring_cq_db(cq, 1, solicited, 0); + cq->first_arm = 0; + } + + cq->deferred_arm = 1; + cq->deferred_sol = solicited; + + pthread_spin_unlock(&cq->cq_lock); + + return 0; +} + +/* + * ocrdma_post_srq_recv + */ +int ocrdma_post_srq_recv(struct ibv_srq *ibsrq, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + int status = 0; + uint16_t tag; + struct ocrdma_srq *srq; + struct ocrdma_hdr_wqe *rqe; + + srq = get_ocrdma_srq(ibsrq); + pthread_spin_lock(&srq->q_lock); + while (wr) { + if (ocrdma_hwq_free_cnt(&srq->rq) == 0 || + wr->num_sge > srq->rq.max_sges) { + status = ENOMEM; + *bad_wr = wr; + break; + } + rqe = ocrdma_hwq_head(&srq->rq); + tag = ocrdma_srq_get_idx(srq); + ocrdma_build_rqe(rqe, wr, tag); + srq->rqe_wr_id_tbl[tag] = wr->wr_id; + + ocrdma_ring_srq_db(srq); + + /* update pointer, counter for next wr */ + ocrdma_hwq_inc_head(&srq->rq); + wr = wr->next; + } + pthread_spin_unlock(&srq->q_lock); + return status; +} + +/* + * ocrdma_create_ah + */ +struct ibv_ah *ocrdma_create_ah(struct ibv_pd *ibpd, struct ibv_ah_attr *attr) +{ + int status; + int ahtbl_idx; + struct ocrdma_pd *pd; + struct 
ocrdma_ah *ah; + struct ib_uverbs_create_ah_resp resp; + + pd = get_ocrdma_pd(ibpd); + ah = malloc(sizeof *ah); + if (!ah) + return NULL; + bzero(ah, sizeof *ah); + ah->pd = pd; + + ahtbl_idx = ocrdma_alloc_ah_tbl_id(pd->uctx); + if (ahtbl_idx < 0) + goto tbl_err; + attr->dlid = ahtbl_idx; + memset(&resp, 0, sizeof(resp)); + status = ibv_cmd_create_ah(ibpd, &ah->ibv_ah, attr, &resp, sizeof(resp)); + if (status) + goto cmd_err; + + ah->id = pd->uctx->ah_tbl[ahtbl_idx] & OCRDMA_AH_ID_MASK; + ah->isvlan = (pd->uctx->ah_tbl[ahtbl_idx] >> + OCRDMA_AH_VLAN_VALID_SHIFT); + ah->hdr_type = ((pd->uctx->ah_tbl[ahtbl_idx] >> OCRDMA_AH_L3_TYPE_SHIFT) + & OCRDMA_AH_L3_TYPE_MASK); + + return &ah->ibv_ah; +cmd_err: + ocrdma_free_ah_tbl_id(pd->uctx, ahtbl_idx); +tbl_err: + free(ah); + return NULL; +} + +/* + * ocrdma_destroy_ah + */ +int ocrdma_destroy_ah(struct ibv_ah *ibah) +{ + int status; + struct ocrdma_ah *ah; + + ah = get_ocrdma_ah(ibah); + + status = ibv_cmd_destroy_ah(ibah); + ocrdma_free_ah_tbl_id(ah->pd->uctx, ah->id); + free(ah); + return status; +} + +/* + * ocrdma_attach_mcast + */ +int ocrdma_attach_mcast(struct ibv_qp *ibqp, const union ibv_gid *gid, + uint16_t lid) +{ + return ibv_cmd_attach_mcast(ibqp, gid, lid); +} + +/* + * ocrdma_detach_mcast + */ +int ocrdma_detach_mcast(struct ibv_qp *ibqp, const union ibv_gid *gid, + uint16_t lid) +{ + return ibv_cmd_detach_mcast(ibqp, gid, lid); +} diff --git a/providers/qedr/CMakeLists.txt b/providers/qedr/CMakeLists.txt new file mode 100644 index 0000000..8d4f3ce --- /dev/null +++ b/providers/qedr/CMakeLists.txt @@ -0,0 +1,5 @@ +rdma_provider(qedr + qelr_main.c + qelr_verbs.c + qelr_chain.c + ) diff --git a/providers/qedr/common_hsi.h b/providers/qedr/common_hsi.h new file mode 100644 index 0000000..791006b --- /dev/null +++ b/providers/qedr/common_hsi.h @@ -0,0 +1,1506 @@ +/* + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#ifndef __COMMON_HSI__
+#define __COMMON_HSI__
+
+#include <stdint.h>
+#include <linux/types.h>
+
+/********************************/
+/* PROTOCOL COMMON FW CONSTANTS */
+/********************************/
+
+/* Temporarily here - these should be added to the HSI automatically by the resource allocation tool. */
+#define T_TEST_AGG_INT_TEMP 6
+#define M_TEST_AGG_INT_TEMP 8
+#define U_TEST_AGG_INT_TEMP 6
+#define X_TEST_AGG_INT_TEMP 14
+#define Y_TEST_AGG_INT_TEMP 4
+#define P_TEST_AGG_INT_TEMP 4
+
+#define X_FINAL_CLEANUP_AGG_INT 1
+
+#define EVENT_RING_PAGE_SIZE_BYTES 4096
+
+#define NUM_OF_GLOBAL_QUEUES 128
+#define COMMON_QUEUE_ENTRY_MAX_BYTE_SIZE 64
+
+#define ISCSI_CDU_TASK_SEG_TYPE 0
+#define FCOE_CDU_TASK_SEG_TYPE 0
+#define RDMA_CDU_TASK_SEG_TYPE 1
+
+#define FW_ASSERT_GENERAL_ATTN_IDX 32
+
+#define MAX_PINNED_CCFC 32
+
+#define EAGLE_ENG1_WORKAROUND_NIG_FLOWCTRL_MODE 3
+
+/* Queue Zone sizes in bytes */
+#define TSTORM_QZONE_SIZE 8 /*tstorm_scsi_queue_zone*/
+#define MSTORM_QZONE_SIZE 16 /*mstorm_eth_queue_zone. Used only for RX producer of VFs in backward compatibility mode.*/
+#define USTORM_QZONE_SIZE 8 /*ustorm_eth_queue_zone*/
+#define XSTORM_QZONE_SIZE 8 /*xstorm_eth_queue_zone*/
+#define YSTORM_QZONE_SIZE 0
+#define PSTORM_QZONE_SIZE 0
+
+#define MSTORM_VF_ZONE_DEFAULT_SIZE_LOG 7 /*Log of mstorm default VF zone size.*/
+#define ETH_MAX_NUM_RX_QUEUES_PER_VF_DEFAULT 16 /*Maximum number of RX queues that can be allocated to a VF by default*/
+#define ETH_MAX_NUM_RX_QUEUES_PER_VF_DOUBLE 48 /*Maximum number of RX queues that can be allocated to a VF with doubled VF zone size. Up to 96 VFs supported in this mode*/
+#define ETH_MAX_NUM_RX_QUEUES_PER_VF_QUAD 112 /*Maximum number of RX queues that can be allocated to a VF with quad VF zone size. Up to 48 VFs supported in this mode*/
+
+
+/********************************/
+/* CORE (LIGHT L2) FW CONSTANTS */
+/********************************/
+
+#define CORE_LL2_MAX_RAMROD_PER_CON 8
+#define CORE_LL2_TX_BD_PAGE_SIZE_BYTES 4096
+#define CORE_LL2_RX_BD_PAGE_SIZE_BYTES 4096
+#define CORE_LL2_RX_CQE_PAGE_SIZE_BYTES 4096
+#define CORE_LL2_RX_NUM_NEXT_PAGE_BDS 1
+
+#define CORE_LL2_TX_MAX_BDS_PER_PACKET 12
+
+#define CORE_SPQE_PAGE_SIZE_BYTES 4096
+
+#define MAX_NUM_LL2_RX_QUEUES 32
+#define MAX_NUM_LL2_TX_STATS_COUNTERS 32
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Include the firmware version number only - do not add constants here, to avoid redundant compilations
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+#define FW_MAJOR_VERSION 8
+#define FW_MINOR_VERSION 10
+#define FW_REVISION_VERSION 9
+#define FW_ENGINEERING_VERSION 0
+
+/***********************/
+/* COMMON HW CONSTANTS */
+/***********************/
+
+/* PCI functions */
+#define MAX_NUM_PORTS_K2 (4)
+#define MAX_NUM_PORTS_BB (2)
+#define MAX_NUM_PORTS (MAX_NUM_PORTS_K2)
+
+#define MAX_NUM_PFS_K2 (16)
+#define MAX_NUM_PFS_BB (8)
+#define MAX_NUM_PFS (MAX_NUM_PFS_K2)
+#define MAX_NUM_OF_PFS_IN_CHIP (16) /* On both engines */
+
+#define MAX_NUM_VFS_K2 (192)
+#define MAX_NUM_VFS_BB (120)
+#define MAX_NUM_VFS (MAX_NUM_VFS_K2)
+
+#define MAX_NUM_FUNCTIONS_BB (MAX_NUM_PFS_BB + MAX_NUM_VFS_BB)
+#define MAX_NUM_FUNCTIONS_K2 (MAX_NUM_PFS_K2 + MAX_NUM_VFS_K2)
+#define MAX_NUM_FUNCTIONS (MAX_NUM_PFS + MAX_NUM_VFS)
+
+/* in both BB and K2, the VF number starts from 16.
so for arrays containing all */ +/* possible PFs and VFs - we need a constant for this size */ +#define MAX_FUNCTION_NUMBER_BB (MAX_NUM_PFS + MAX_NUM_VFS_BB) +#define MAX_FUNCTION_NUMBER_K2 (MAX_NUM_PFS + MAX_NUM_VFS_K2) +#define MAX_FUNCTION_NUMBER (MAX_NUM_PFS + MAX_NUM_VFS) + +#define MAX_NUM_VPORTS_K2 (208) +#define MAX_NUM_VPORTS_BB (160) +#define MAX_NUM_VPORTS (MAX_NUM_VPORTS_K2) + +#define MAX_NUM_L2_QUEUES_K2 (320) +#define MAX_NUM_L2_QUEUES_BB (256) +#define MAX_NUM_L2_QUEUES (MAX_NUM_L2_QUEUES_K2) + +/* Traffic classes in network-facing blocks (PBF, BTB, NIG, BRB, PRS and QM) */ +// 4-Port K2. +#define NUM_PHYS_TCS_4PORT_K2 (4) +#define NUM_OF_PHYS_TCS (8) + +#define NUM_TCS_4PORT_K2 (NUM_PHYS_TCS_4PORT_K2 + 1) +#define NUM_OF_TCS (NUM_OF_PHYS_TCS + 1) + +#define LB_TC (NUM_OF_PHYS_TCS) + +/* Num of possible traffic priority values */ +#define NUM_OF_PRIO (8) + +#define MAX_NUM_VOQS_K2 (NUM_TCS_4PORT_K2 * MAX_NUM_PORTS_K2) +#define MAX_NUM_VOQS_BB (NUM_OF_TCS * MAX_NUM_PORTS_BB) +#define MAX_NUM_VOQS (MAX_NUM_VOQS_K2) +#define MAX_PHYS_VOQS (NUM_OF_PHYS_TCS * MAX_NUM_PORTS_BB) + +/* CIDs */ +#define NUM_OF_CONNECTION_TYPES (8) +#define NUM_OF_LCIDS (320) +#define NUM_OF_LTIDS (320) + +/* Clock values */ +#define MASTER_CLK_FREQ_E4 (375e6) +#define STORM_CLK_FREQ_E4 (1000e6) +#define CLK25M_CLK_FREQ_E4 (25e6) + +/* Global PXP windows (GTT) */ +#define NUM_OF_GTT 19 +#define GTT_DWORD_SIZE_BITS 10 +#define GTT_BYTE_SIZE_BITS (GTT_DWORD_SIZE_BITS + 2) +#define GTT_DWORD_SIZE (1 << GTT_DWORD_SIZE_BITS) + +/* Tools Version */ +#define TOOLS_VERSION 10 +/*****************/ +/* CDU CONSTANTS */ +/*****************/ + +#define CDU_SEG_TYPE_OFFSET_REG_TYPE_SHIFT (17) +#define CDU_SEG_TYPE_OFFSET_REG_OFFSET_MASK (0x1ffff) + +#define CDU_VF_FL_SEG_TYPE_OFFSET_REG_TYPE_SHIFT (12) +#define CDU_VF_FL_SEG_TYPE_OFFSET_REG_OFFSET_MASK (0xfff) + + +/*****************/ +/* DQ CONSTANTS */ +/*****************/ + +/* DEMS */ +#define DQ_DEMS_LEGACY 0 +#define DQ_DEMS_TOE_MORE_TO_SEND 3 +#define DQ_DEMS_TOE_LOCAL_ADV_WND 4 +#define DQ_DEMS_ROCE_CQ_CONS 7 + +/* XCM agg val selection (HW) */ +#define DQ_XCM_AGG_VAL_SEL_WORD2 0 +#define DQ_XCM_AGG_VAL_SEL_WORD3 1 +#define DQ_XCM_AGG_VAL_SEL_WORD4 2 +#define DQ_XCM_AGG_VAL_SEL_WORD5 3 +#define DQ_XCM_AGG_VAL_SEL_REG3 4 +#define DQ_XCM_AGG_VAL_SEL_REG4 5 +#define DQ_XCM_AGG_VAL_SEL_REG5 6 +#define DQ_XCM_AGG_VAL_SEL_REG6 7 + +/* XCM agg val selection (FW) */ +#define DQ_XCM_CORE_TX_BD_CONS_CMD DQ_XCM_AGG_VAL_SEL_WORD3 +#define DQ_XCM_CORE_TX_BD_PROD_CMD DQ_XCM_AGG_VAL_SEL_WORD4 +#define DQ_XCM_CORE_SPQ_PROD_CMD DQ_XCM_AGG_VAL_SEL_WORD4 +#define DQ_XCM_ETH_EDPM_NUM_BDS_CMD DQ_XCM_AGG_VAL_SEL_WORD2 +#define DQ_XCM_ETH_TX_BD_CONS_CMD DQ_XCM_AGG_VAL_SEL_WORD3 +#define DQ_XCM_ETH_TX_BD_PROD_CMD DQ_XCM_AGG_VAL_SEL_WORD4 +#define DQ_XCM_ETH_GO_TO_BD_CONS_CMD DQ_XCM_AGG_VAL_SEL_WORD5 +#define DQ_XCM_FCOE_SQ_CONS_CMD DQ_XCM_AGG_VAL_SEL_WORD3 +#define DQ_XCM_FCOE_SQ_PROD_CMD DQ_XCM_AGG_VAL_SEL_WORD4 +#define DQ_XCM_FCOE_X_FERQ_PROD_CMD DQ_XCM_AGG_VAL_SEL_WORD5 +#define DQ_XCM_ISCSI_SQ_CONS_CMD DQ_XCM_AGG_VAL_SEL_WORD3 +#define DQ_XCM_ISCSI_SQ_PROD_CMD DQ_XCM_AGG_VAL_SEL_WORD4 +#define DQ_XCM_ISCSI_MORE_TO_SEND_SEQ_CMD DQ_XCM_AGG_VAL_SEL_REG3 +#define DQ_XCM_ISCSI_EXP_STAT_SN_CMD DQ_XCM_AGG_VAL_SEL_REG6 +#define DQ_XCM_ROCE_SQ_PROD_CMD DQ_XCM_AGG_VAL_SEL_WORD4 +#define DQ_XCM_TOE_TX_BD_PROD_CMD DQ_XCM_AGG_VAL_SEL_WORD4 +#define DQ_XCM_TOE_MORE_TO_SEND_SEQ_CMD DQ_XCM_AGG_VAL_SEL_REG3 +#define DQ_XCM_TOE_LOCAL_ADV_WND_SEQ_CMD DQ_XCM_AGG_VAL_SEL_REG4 + +/* UCM agg val 
selection (HW) */ +#define DQ_UCM_AGG_VAL_SEL_WORD0 0 +#define DQ_UCM_AGG_VAL_SEL_WORD1 1 +#define DQ_UCM_AGG_VAL_SEL_WORD2 2 +#define DQ_UCM_AGG_VAL_SEL_WORD3 3 +#define DQ_UCM_AGG_VAL_SEL_REG0 4 +#define DQ_UCM_AGG_VAL_SEL_REG1 5 +#define DQ_UCM_AGG_VAL_SEL_REG2 6 +#define DQ_UCM_AGG_VAL_SEL_REG3 7 + +/* UCM agg val selection (FW) */ +#define DQ_UCM_ETH_PMD_TX_CONS_CMD DQ_UCM_AGG_VAL_SEL_WORD2 +#define DQ_UCM_ETH_PMD_RX_CONS_CMD DQ_UCM_AGG_VAL_SEL_WORD3 +#define DQ_UCM_ROCE_CQ_CONS_CMD DQ_UCM_AGG_VAL_SEL_REG0 +#define DQ_UCM_ROCE_CQ_PROD_CMD DQ_UCM_AGG_VAL_SEL_REG2 + +/* TCM agg val selection (HW) */ +#define DQ_TCM_AGG_VAL_SEL_WORD0 0 +#define DQ_TCM_AGG_VAL_SEL_WORD1 1 +#define DQ_TCM_AGG_VAL_SEL_WORD2 2 +#define DQ_TCM_AGG_VAL_SEL_WORD3 3 +#define DQ_TCM_AGG_VAL_SEL_REG1 4 +#define DQ_TCM_AGG_VAL_SEL_REG2 5 +#define DQ_TCM_AGG_VAL_SEL_REG6 6 +#define DQ_TCM_AGG_VAL_SEL_REG9 7 + +/* TCM agg val selection (FW) */ +#define DQ_TCM_L2B_BD_PROD_CMD DQ_TCM_AGG_VAL_SEL_WORD1 +#define DQ_TCM_ROCE_RQ_PROD_CMD DQ_TCM_AGG_VAL_SEL_WORD0 + +/* XCM agg counter flag selection (HW) */ +#define DQ_XCM_AGG_FLG_SHIFT_BIT14 0 +#define DQ_XCM_AGG_FLG_SHIFT_BIT15 1 +#define DQ_XCM_AGG_FLG_SHIFT_CF12 2 +#define DQ_XCM_AGG_FLG_SHIFT_CF13 3 +#define DQ_XCM_AGG_FLG_SHIFT_CF18 4 +#define DQ_XCM_AGG_FLG_SHIFT_CF19 5 +#define DQ_XCM_AGG_FLG_SHIFT_CF22 6 +#define DQ_XCM_AGG_FLG_SHIFT_CF23 7 + +/* XCM agg counter flag selection (FW) */ +#define DQ_XCM_CORE_DQ_CF_CMD (1 << DQ_XCM_AGG_FLG_SHIFT_CF18) +#define DQ_XCM_CORE_TERMINATE_CMD (1 << DQ_XCM_AGG_FLG_SHIFT_CF19) +#define DQ_XCM_CORE_SLOW_PATH_CMD (1 << DQ_XCM_AGG_FLG_SHIFT_CF22) +#define DQ_XCM_ETH_DQ_CF_CMD (1 << DQ_XCM_AGG_FLG_SHIFT_CF18) +#define DQ_XCM_ETH_TERMINATE_CMD (1 << DQ_XCM_AGG_FLG_SHIFT_CF19) +#define DQ_XCM_ETH_SLOW_PATH_CMD (1 << DQ_XCM_AGG_FLG_SHIFT_CF22) +#define DQ_XCM_ETH_TPH_EN_CMD (1 << DQ_XCM_AGG_FLG_SHIFT_CF23) +#define DQ_XCM_FCOE_SLOW_PATH_CMD (1 << DQ_XCM_AGG_FLG_SHIFT_CF22) +#define DQ_XCM_ISCSI_DQ_FLUSH_CMD (1 << DQ_XCM_AGG_FLG_SHIFT_CF19) +#define DQ_XCM_ISCSI_SLOW_PATH_CMD (1 << DQ_XCM_AGG_FLG_SHIFT_CF22) +#define DQ_XCM_ISCSI_PROC_ONLY_CLEANUP_CMD (1 << DQ_XCM_AGG_FLG_SHIFT_CF23) +#define DQ_XCM_TOE_DQ_FLUSH_CMD (1 << DQ_XCM_AGG_FLG_SHIFT_CF19) +#define DQ_XCM_TOE_SLOW_PATH_CMD (1 << DQ_XCM_AGG_FLG_SHIFT_CF22) + +/* UCM agg counter flag selection (HW) */ +#define DQ_UCM_AGG_FLG_SHIFT_CF0 0 +#define DQ_UCM_AGG_FLG_SHIFT_CF1 1 +#define DQ_UCM_AGG_FLG_SHIFT_CF3 2 +#define DQ_UCM_AGG_FLG_SHIFT_CF4 3 +#define DQ_UCM_AGG_FLG_SHIFT_CF5 4 +#define DQ_UCM_AGG_FLG_SHIFT_CF6 5 +#define DQ_UCM_AGG_FLG_SHIFT_RULE0EN 6 +#define DQ_UCM_AGG_FLG_SHIFT_RULE1EN 7 + +/* UCM agg counter flag selection (FW) */ +#define DQ_UCM_ETH_PMD_TX_ARM_CMD (1 << DQ_UCM_AGG_FLG_SHIFT_CF4) +#define DQ_UCM_ETH_PMD_RX_ARM_CMD (1 << DQ_UCM_AGG_FLG_SHIFT_CF5) +#define DQ_UCM_ROCE_CQ_ARM_SE_CF_CMD (1 << DQ_UCM_AGG_FLG_SHIFT_CF4) +#define DQ_UCM_ROCE_CQ_ARM_CF_CMD (1 << DQ_UCM_AGG_FLG_SHIFT_CF5) +#define DQ_UCM_TOE_TIMER_STOP_ALL_CMD (1 << DQ_UCM_AGG_FLG_SHIFT_CF3) +#define DQ_UCM_TOE_SLOW_PATH_CF_CMD (1 << DQ_UCM_AGG_FLG_SHIFT_CF4) +#define DQ_UCM_TOE_DQ_CF_CMD (1 << DQ_UCM_AGG_FLG_SHIFT_CF5) + +/* TCM agg counter flag selection (HW) */ +#define DQ_TCM_AGG_FLG_SHIFT_CF0 0 +#define DQ_TCM_AGG_FLG_SHIFT_CF1 1 +#define DQ_TCM_AGG_FLG_SHIFT_CF2 2 +#define DQ_TCM_AGG_FLG_SHIFT_CF3 3 +#define DQ_TCM_AGG_FLG_SHIFT_CF4 4 +#define DQ_TCM_AGG_FLG_SHIFT_CF5 5 +#define DQ_TCM_AGG_FLG_SHIFT_CF6 6 +#define DQ_TCM_AGG_FLG_SHIFT_CF7 7 + +/* TCM agg counter flag selection (FW) */ +#define 
DQ_TCM_FCOE_FLUSH_Q0_CMD (1 << DQ_TCM_AGG_FLG_SHIFT_CF1) +#define DQ_TCM_FCOE_DUMMY_TIMER_CMD (1 << DQ_TCM_AGG_FLG_SHIFT_CF2) +#define DQ_TCM_FCOE_TIMER_STOP_ALL_CMD (1 << DQ_TCM_AGG_FLG_SHIFT_CF3) +#define DQ_TCM_ISCSI_FLUSH_Q0_CMD (1 << DQ_TCM_AGG_FLG_SHIFT_CF1) +#define DQ_TCM_ISCSI_TIMER_STOP_ALL_CMD (1 << DQ_TCM_AGG_FLG_SHIFT_CF3) +#define DQ_TCM_TOE_FLUSH_Q0_CMD (1 << DQ_TCM_AGG_FLG_SHIFT_CF1) +#define DQ_TCM_TOE_TIMER_STOP_ALL_CMD (1 << DQ_TCM_AGG_FLG_SHIFT_CF3) +#define DQ_TCM_IWARP_POST_RQ_CF_CMD (1 << DQ_TCM_AGG_FLG_SHIFT_CF1) + +/* PWM address mapping */ +#define DQ_PWM_OFFSET_DPM_BASE 0x0 +#define DQ_PWM_OFFSET_DPM_END 0x27 +#define DQ_PWM_OFFSET_XCM16_BASE 0x40 +#define DQ_PWM_OFFSET_XCM32_BASE 0x44 +#define DQ_PWM_OFFSET_UCM16_BASE 0x48 +#define DQ_PWM_OFFSET_UCM32_BASE 0x4C +#define DQ_PWM_OFFSET_UCM16_4 0x50 +#define DQ_PWM_OFFSET_TCM16_BASE 0x58 +#define DQ_PWM_OFFSET_TCM32_BASE 0x5C +#define DQ_PWM_OFFSET_XCM_FLAGS 0x68 +#define DQ_PWM_OFFSET_UCM_FLAGS 0x69 +#define DQ_PWM_OFFSET_TCM_FLAGS 0x6B + +#define DQ_PWM_OFFSET_XCM_RDMA_SQ_PROD (DQ_PWM_OFFSET_XCM16_BASE + 2) +#define DQ_PWM_OFFSET_UCM_RDMA_CQ_CONS_32BIT (DQ_PWM_OFFSET_UCM32_BASE) +#define DQ_PWM_OFFSET_UCM_RDMA_CQ_CONS_16BIT (DQ_PWM_OFFSET_UCM16_4) +#define DQ_PWM_OFFSET_UCM_RDMA_INT_TIMEOUT (DQ_PWM_OFFSET_UCM16_BASE + 2) +#define DQ_PWM_OFFSET_UCM_RDMA_ARM_FLAGS (DQ_PWM_OFFSET_UCM_FLAGS) +#define DQ_PWM_OFFSET_TCM_ROCE_RQ_PROD (DQ_PWM_OFFSET_TCM16_BASE + 1) +#define DQ_PWM_OFFSET_TCM_IWARP_RQ_PROD (DQ_PWM_OFFSET_TCM16_BASE + 3) + +#define DQ_REGION_SHIFT (12) + +/* DPM */ +#define DQ_DPM_WQE_BUFF_SIZE (320) + +// Conn type ranges +#define DQ_CONN_TYPE_RANGE_SHIFT (4) + +/*****************/ +/* QM CONSTANTS */ +/*****************/ + +/* number of TX queues in the QM */ +#define MAX_QM_TX_QUEUES_K2 512 +#define MAX_QM_TX_QUEUES_BB 448 +#define MAX_QM_TX_QUEUES MAX_QM_TX_QUEUES_K2 + +/* number of Other queues in the QM */ +#define MAX_QM_OTHER_QUEUES_BB 64 +#define MAX_QM_OTHER_QUEUES_K2 128 +#define MAX_QM_OTHER_QUEUES MAX_QM_OTHER_QUEUES_K2 + +/* number of queues in a PF queue group */ +#define QM_PF_QUEUE_GROUP_SIZE 8 + +/* the size of a single queue element in bytes */ +#define QM_PQ_ELEMENT_SIZE 4 + +/* base number of Tx PQs in the CM PQ representation. 
+ should be used when storing PQ IDs in CM PQ registers and context */ +#define CM_TX_PQ_BASE 0x200 + +/* number of global Vport/QCN rate limiters */ +#define MAX_QM_GLOBAL_RLS 256 + +/* QM registers data */ +#define QM_LINE_CRD_REG_WIDTH 16 +#define QM_LINE_CRD_REG_SIGN_BIT (1 << (QM_LINE_CRD_REG_WIDTH - 1)) +#define QM_BYTE_CRD_REG_WIDTH 24 +#define QM_BYTE_CRD_REG_SIGN_BIT (1 << (QM_BYTE_CRD_REG_WIDTH - 1)) +#define QM_WFQ_CRD_REG_WIDTH 32 +#define QM_WFQ_CRD_REG_SIGN_BIT (1 << (QM_WFQ_CRD_REG_WIDTH - 1)) +#define QM_RL_CRD_REG_WIDTH 32 +#define QM_RL_CRD_REG_SIGN_BIT (1 << (QM_RL_CRD_REG_WIDTH - 1)) + +/*****************/ +/* CAU CONSTANTS */ +/*****************/ + +#define CAU_FSM_ETH_RX 0 +#define CAU_FSM_ETH_TX 1 + +/* Number of Protocol Indices per Status Block */ +#define PIS_PER_SB 12 + + +#define CAU_HC_STOPPED_STATE 3 /* fsm is stopped or not valid for this sb */ +#define CAU_HC_DISABLE_STATE 4 /* fsm is working without interrupt coalescing for this sb*/ +#define CAU_HC_ENABLE_STATE 0 /* fsm is working with interrupt coalescing for this sb*/ + + +/*****************/ +/* IGU CONSTANTS */ +/*****************/ + +#define MAX_SB_PER_PATH_K2 (368) +#define MAX_SB_PER_PATH_BB (288) +#define MAX_TOT_SB_PER_PATH MAX_SB_PER_PATH_K2 + +#define MAX_SB_PER_PF_MIMD 129 +#define MAX_SB_PER_PF_SIMD 64 +#define MAX_SB_PER_VF 64 + +/* Memory addresses on the BAR for the IGU Sub Block */ +#define IGU_MEM_BASE 0x0000 + +#define IGU_MEM_MSIX_BASE 0x0000 +#define IGU_MEM_MSIX_UPPER 0x0101 +#define IGU_MEM_MSIX_RESERVED_UPPER 0x01ff + +#define IGU_MEM_PBA_MSIX_BASE 0x0200 +#define IGU_MEM_PBA_MSIX_UPPER 0x0202 +#define IGU_MEM_PBA_MSIX_RESERVED_UPPER 0x03ff + +#define IGU_CMD_INT_ACK_BASE 0x0400 +#define IGU_CMD_INT_ACK_UPPER (IGU_CMD_INT_ACK_BASE + MAX_TOT_SB_PER_PATH - 1) +#define IGU_CMD_INT_ACK_RESERVED_UPPER 0x05ff + +#define IGU_CMD_ATTN_BIT_UPD_UPPER 0x05f0 +#define IGU_CMD_ATTN_BIT_SET_UPPER 0x05f1 +#define IGU_CMD_ATTN_BIT_CLR_UPPER 0x05f2 + +#define IGU_REG_SISR_MDPC_WMASK_UPPER 0x05f3 +#define IGU_REG_SISR_MDPC_WMASK_LSB_UPPER 0x05f4 +#define IGU_REG_SISR_MDPC_WMASK_MSB_UPPER 0x05f5 +#define IGU_REG_SISR_MDPC_WOMASK_UPPER 0x05f6 + +#define IGU_CMD_PROD_UPD_BASE 0x0600 +#define IGU_CMD_PROD_UPD_UPPER (IGU_CMD_PROD_UPD_BASE + MAX_TOT_SB_PER_PATH - 1) +#define IGU_CMD_PROD_UPD_RESERVED_UPPER 0x07ff + +/*****************/ +/* PXP CONSTANTS */ +/*****************/ + +/* Bars for Blocks */ +#define PXP_BAR_GRC 0 +#define PXP_BAR_TSDM 0 +#define PXP_BAR_USDM 0 +#define PXP_BAR_XSDM 0 +#define PXP_BAR_MSDM 0 +#define PXP_BAR_YSDM 0 +#define PXP_BAR_PSDM 0 +#define PXP_BAR_IGU 0 +#define PXP_BAR_DQ 1 + +/* PTT and GTT */ +#define PXP_NUM_PF_WINDOWS 12 +#define PXP_PER_PF_ENTRY_SIZE 8 +#define PXP_NUM_GLOBAL_WINDOWS 243 +#define PXP_GLOBAL_ENTRY_SIZE 4 +#define PXP_ADMIN_WINDOW_ALLOWED_LENGTH 4 +#define PXP_PF_WINDOW_ADMIN_START 0 +#define PXP_PF_WINDOW_ADMIN_LENGTH 0x1000 +#define PXP_PF_WINDOW_ADMIN_END (PXP_PF_WINDOW_ADMIN_START + PXP_PF_WINDOW_ADMIN_LENGTH - 1) +#define PXP_PF_WINDOW_ADMIN_PER_PF_START 0 +#define PXP_PF_WINDOW_ADMIN_PER_PF_LENGTH (PXP_NUM_PF_WINDOWS * PXP_PER_PF_ENTRY_SIZE) +#define PXP_PF_WINDOW_ADMIN_PER_PF_END (PXP_PF_WINDOW_ADMIN_PER_PF_START + PXP_PF_WINDOW_ADMIN_PER_PF_LENGTH - 1) +#define PXP_PF_WINDOW_ADMIN_GLOBAL_START 0x200 +#define PXP_PF_WINDOW_ADMIN_GLOBAL_LENGTH (PXP_NUM_GLOBAL_WINDOWS * PXP_GLOBAL_ENTRY_SIZE) +#define PXP_PF_WINDOW_ADMIN_GLOBAL_END (PXP_PF_WINDOW_ADMIN_GLOBAL_START + PXP_PF_WINDOW_ADMIN_GLOBAL_LENGTH - 1) +#define PXP_PF_GLOBAL_PRETEND_ADDR 0x1f0 
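+
+/*
+ * For illustration (derived from the window constants above; not part of
+ * the original HSI): the per-PF admin region covers
+ * PXP_NUM_PF_WINDOWS * PXP_PER_PF_ENTRY_SIZE = 12 * 8 = 0x60 bytes at
+ * offset 0x0, while the global region covers
+ * PXP_NUM_GLOBAL_WINDOWS * PXP_GLOBAL_ENTRY_SIZE = 243 * 4 = 0x3cc bytes
+ * at offsets 0x200..0x5cb - both comfortably inside the 0x1000-byte
+ * admin window.
+ */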
+#define PXP_PF_ME_OPAQUE_MASK_ADDR 0xf4 +#define PXP_PF_ME_OPAQUE_ADDR 0x1f8 +#define PXP_PF_ME_CONCRETE_ADDR 0x1fc + +#define PXP_EXTERNAL_BAR_PF_WINDOW_START 0x1000 +#define PXP_EXTERNAL_BAR_PF_WINDOW_NUM PXP_NUM_PF_WINDOWS +#define PXP_EXTERNAL_BAR_PF_WINDOW_SINGLE_SIZE 0x1000 +#define PXP_EXTERNAL_BAR_PF_WINDOW_LENGTH (PXP_EXTERNAL_BAR_PF_WINDOW_NUM * PXP_EXTERNAL_BAR_PF_WINDOW_SINGLE_SIZE) +#define PXP_EXTERNAL_BAR_PF_WINDOW_END (PXP_EXTERNAL_BAR_PF_WINDOW_START + PXP_EXTERNAL_BAR_PF_WINDOW_LENGTH - 1) + +#define PXP_EXTERNAL_BAR_GLOBAL_WINDOW_START (PXP_EXTERNAL_BAR_PF_WINDOW_END + 1) +#define PXP_EXTERNAL_BAR_GLOBAL_WINDOW_NUM PXP_NUM_GLOBAL_WINDOWS +#define PXP_EXTERNAL_BAR_GLOBAL_WINDOW_SINGLE_SIZE 0x1000 +#define PXP_EXTERNAL_BAR_GLOBAL_WINDOW_LENGTH (PXP_EXTERNAL_BAR_GLOBAL_WINDOW_NUM * PXP_EXTERNAL_BAR_GLOBAL_WINDOW_SINGLE_SIZE) +#define PXP_EXTERNAL_BAR_GLOBAL_WINDOW_END (PXP_EXTERNAL_BAR_GLOBAL_WINDOW_START + PXP_EXTERNAL_BAR_GLOBAL_WINDOW_LENGTH - 1) + +/* PF BAR */ +//#define PXP_BAR0_START_GRC 0x1000 +//#define PXP_BAR0_GRC_LENGTH 0xBFF000 +#define PXP_BAR0_START_GRC 0x0000 +#define PXP_BAR0_GRC_LENGTH 0x1C00000 +#define PXP_BAR0_END_GRC (PXP_BAR0_START_GRC + PXP_BAR0_GRC_LENGTH - 1) + +#define PXP_BAR0_START_IGU 0x1C00000 +#define PXP_BAR0_IGU_LENGTH 0x10000 +#define PXP_BAR0_END_IGU (PXP_BAR0_START_IGU + PXP_BAR0_IGU_LENGTH - 1) + +#define PXP_BAR0_START_TSDM 0x1C80000 +#define PXP_BAR0_SDM_LENGTH 0x40000 +#define PXP_BAR0_SDM_RESERVED_LENGTH 0x40000 +#define PXP_BAR0_END_TSDM (PXP_BAR0_START_TSDM + PXP_BAR0_SDM_LENGTH - 1) + +#define PXP_BAR0_START_MSDM 0x1D00000 +#define PXP_BAR0_END_MSDM (PXP_BAR0_START_MSDM + PXP_BAR0_SDM_LENGTH - 1) + +#define PXP_BAR0_START_USDM 0x1D80000 +#define PXP_BAR0_END_USDM (PXP_BAR0_START_USDM + PXP_BAR0_SDM_LENGTH - 1) + +#define PXP_BAR0_START_XSDM 0x1E00000 +#define PXP_BAR0_END_XSDM (PXP_BAR0_START_XSDM + PXP_BAR0_SDM_LENGTH - 1) + +#define PXP_BAR0_START_YSDM 0x1E80000 +#define PXP_BAR0_END_YSDM (PXP_BAR0_START_YSDM + PXP_BAR0_SDM_LENGTH - 1) + +#define PXP_BAR0_START_PSDM 0x1F00000 +#define PXP_BAR0_END_PSDM (PXP_BAR0_START_PSDM + PXP_BAR0_SDM_LENGTH - 1) + +#define PXP_BAR0_FIRST_INVALID_ADDRESS (PXP_BAR0_END_PSDM + 1) + +/* VF BAR */ +#define PXP_VF_BAR0 0 + +#define PXP_VF_BAR0_START_GRC 0x3E00 +#define PXP_VF_BAR0_GRC_LENGTH 0x200 +#define PXP_VF_BAR0_END_GRC (PXP_VF_BAR0_START_GRC + PXP_VF_BAR0_GRC_LENGTH - 1) + +#define PXP_VF_BAR0_START_IGU 0 +#define PXP_VF_BAR0_IGU_LENGTH 0x3000 +#define PXP_VF_BAR0_END_IGU (PXP_VF_BAR0_START_IGU + PXP_VF_BAR0_IGU_LENGTH - 1) + +#define PXP_VF_BAR0_START_DQ 0x3000 +#define PXP_VF_BAR0_DQ_LENGTH 0x200 +#define PXP_VF_BAR0_DQ_OPAQUE_OFFSET 0 +#define PXP_VF_BAR0_ME_OPAQUE_ADDRESS (PXP_VF_BAR0_START_DQ + PXP_VF_BAR0_DQ_OPAQUE_OFFSET) +#define PXP_VF_BAR0_ME_CONCRETE_ADDRESS (PXP_VF_BAR0_ME_OPAQUE_ADDRESS + 4) +#define PXP_VF_BAR0_END_DQ (PXP_VF_BAR0_START_DQ + PXP_VF_BAR0_DQ_LENGTH - 1) + +#define PXP_VF_BAR0_START_TSDM_ZONE_B 0x3200 +#define PXP_VF_BAR0_SDM_LENGTH_ZONE_B 0x200 +#define PXP_VF_BAR0_END_TSDM_ZONE_B (PXP_VF_BAR0_START_TSDM_ZONE_B + PXP_VF_BAR0_SDM_LENGTH_ZONE_B - 1) + +#define PXP_VF_BAR0_START_MSDM_ZONE_B 0x3400 +#define PXP_VF_BAR0_END_MSDM_ZONE_B (PXP_VF_BAR0_START_MSDM_ZONE_B + PXP_VF_BAR0_SDM_LENGTH_ZONE_B - 1) + +#define PXP_VF_BAR0_START_USDM_ZONE_B 0x3600 +#define PXP_VF_BAR0_END_USDM_ZONE_B (PXP_VF_BAR0_START_USDM_ZONE_B + PXP_VF_BAR0_SDM_LENGTH_ZONE_B - 1) + +#define PXP_VF_BAR0_START_XSDM_ZONE_B 0x3800 +#define PXP_VF_BAR0_END_XSDM_ZONE_B 
(PXP_VF_BAR0_START_XSDM_ZONE_B + PXP_VF_BAR0_SDM_LENGTH_ZONE_B - 1)
+
+#define PXP_VF_BAR0_START_YSDM_ZONE_B 0x3a00
+#define PXP_VF_BAR0_END_YSDM_ZONE_B (PXP_VF_BAR0_START_YSDM_ZONE_B + PXP_VF_BAR0_SDM_LENGTH_ZONE_B - 1)
+
+#define PXP_VF_BAR0_START_PSDM_ZONE_B 0x3c00
+#define PXP_VF_BAR0_END_PSDM_ZONE_B (PXP_VF_BAR0_START_PSDM_ZONE_B + PXP_VF_BAR0_SDM_LENGTH_ZONE_B - 1)
+
+#define PXP_VF_BAR0_START_SDM_ZONE_A 0x4000
+#define PXP_VF_BAR0_END_SDM_ZONE_A 0x10000
+
+#define PXP_VF_BAR0_GRC_WINDOW_LENGTH 32
+
+#define PXP_ILT_PAGE_SIZE_NUM_BITS_MIN 12
+#define PXP_ILT_BLOCK_FACTOR_MULTIPLIER 1024
+
+// ILT Records
+#define PXP_NUM_ILT_RECORDS_BB 7600
+#define PXP_NUM_ILT_RECORDS_K2 11000
+#define MAX_NUM_ILT_RECORDS MAX(PXP_NUM_ILT_RECORDS_BB,PXP_NUM_ILT_RECORDS_K2)
+
+
+// Host Interface
+#define PXP_QUEUES_ZONE_MAX_NUM 320
+
+
+
+
+/*****************/
+/* PRM CONSTANTS */
+/*****************/
+#define PRM_DMA_PAD_BYTES_NUM 2
+/*****************/
+/* SDM CONSTANTS */
+/*****************/
+
+
+#define SDM_OP_GEN_TRIG_NONE 0
+#define SDM_OP_GEN_TRIG_WAKE_THREAD 1
+#define SDM_OP_GEN_TRIG_AGG_INT 2
+#define SDM_OP_GEN_TRIG_LOADER 4
+#define SDM_OP_GEN_TRIG_INDICATE_ERROR 6
+#define SDM_OP_GEN_TRIG_RELEASE_THREAD 7
+
+/////////////////////////////////////////////////////////////
+// Completion types
+/////////////////////////////////////////////////////////////
+
+#define SDM_COMP_TYPE_NONE 0
+#define SDM_COMP_TYPE_WAKE_THREAD 1
+#define SDM_COMP_TYPE_AGG_INT 2
+#define SDM_COMP_TYPE_CM 3 // Send direct message to local CM and/or remote CMs. Destinations are defined by vector in CompParams.
+#define SDM_COMP_TYPE_LOADER 4
+#define SDM_COMP_TYPE_PXP 5 // Send direct message to PXP (like "internal write" command) to write to remote Storm RAM via remote SDM
+#define SDM_COMP_TYPE_INDICATE_ERROR 6 // Indicate error per thread
+#define SDM_COMP_TYPE_RELEASE_THREAD 7
+#define SDM_COMP_TYPE_RAM 8 // Write to local RAM as a completion
+
+
+/******************/
+/* PBF CONSTANTS */
+/******************/
+
+/* Number of PBF command queue lines. Each line is 32B. */
+#define PBF_MAX_CMD_LINES 3328
+
+/* Number of BTB blocks. Each block is 256B. */
+#define BTB_MAX_BLOCKS 1440
+
+/*****************/
+/* PRS CONSTANTS */
+/*****************/
+
+#define PRS_GFT_CAM_LINES_NO_MATCH 31
+
+/*
+ * Async data KCQ CQE
+ */
+struct async_data
+{
+ __le32 cid /* Context ID of the connection */;
+ __le16 itid /* Task Id of the task (for an error that happened on a task) */;
+ uint8_t error_code /* error code - relevant only if the opcode indicates it is an error */;
+ uint8_t fw_debug_param /* internal fw debug parameter */;
+};
+
+
+/*
+ * Interrupt coalescing TimeSet
+ */
+struct coalescing_timeset
+{
+ uint8_t value;
+#define COALESCING_TIMESET_TIMESET_MASK 0x7F /* Interrupt coalescing TimeSet (timeout_ticks = TimeSet shl (TimerRes+1)) */
+#define COALESCING_TIMESET_TIMESET_SHIFT 0
+#define COALESCING_TIMESET_VALID_MASK 0x1 /* Only if this flag is set, timeset will take effect */
+#define COALESCING_TIMESET_VALID_SHIFT 7
+};
+
+
+struct common_queue_zone
+{
+ __le16 ring_drv_data_consumer;
+ __le16 reserved;
+};
+
+
+/*
+ * ETH Rx producers data
+ */
+struct eth_rx_prod_data
+{
+ __le16 bd_prod /* BD producer. */;
+ __le16 cqe_prod /* CQE producer. */;
+};
+
+
+struct regpair
+{
+ __le32 lo /* low word for reg-pair */;
+ __le32 hi /* high word for reg-pair */;
+};
+
+/*
+ * Event Ring VF-PF Channel data
+ */
+struct vf_pf_channel_eqe_data
+{
+ struct regpair msg_addr /* VF-PF message address */;
+};
+
+struct iscsi_eqe_data
+{
+ __le32 cid /* Context ID of the connection */;
+ __le16 conn_id /* Task Id of the task (for an error that happened on a task) */;
+ uint8_t error_code /* error code - relevant only if the opcode indicates it is an error */;
+ uint8_t error_pdu_opcode_reserved;
+#define ISCSI_EQE_DATA_ERROR_PDU_OPCODE_MASK 0x3F /* The opcode of the processed PDU on which the error happened - updated for specific error codes, by default 0xFF */
+#define ISCSI_EQE_DATA_ERROR_PDU_OPCODE_SHIFT 0
+#define ISCSI_EQE_DATA_ERROR_PDU_OPCODE_VALID_MASK 0x1 /* Indication for the driver whether the error_pdu_opcode field has a valid value */
+#define ISCSI_EQE_DATA_ERROR_PDU_OPCODE_VALID_SHIFT 6
+#define ISCSI_EQE_DATA_RESERVED0_MASK 0x1
+#define ISCSI_EQE_DATA_RESERVED0_SHIFT 7
+};
+
+/*
+ * Event Ring malicious VF data
+ */
+struct malicious_vf_eqe_data
+{
+ uint8_t vfId /* Malicious VF ID */;
+ uint8_t errId /* Malicious VF error */;
+ __le16 reserved[3];
+};
+
+/*
+ * Event Ring initial cleanup data
+ */
+struct initial_cleanup_eqe_data
+{
+ uint8_t vfId /* VF ID */;
+ uint8_t reserved[7];
+};
+
+/*
+ * Event Data Union
+ */
+union event_ring_data
+{
+ uint8_t bytes[8] /* Byte Array */;
+ struct vf_pf_channel_eqe_data vf_pf_channel /* VF-PF Channel data */;
+ struct iscsi_eqe_data iscsi_info /* Dedicated fields for iSCSI data */;
+ struct regpair roceHandle /* Dedicated field for RoCE affiliated asynchronous error */;
+ struct malicious_vf_eqe_data malicious_vf /* Malicious VF data */;
+ struct initial_cleanup_eqe_data vf_init_cleanup /* VF Initial Cleanup data */;
+ struct regpair iwarp_handle /* Host handle for the Async Completions */;
+};
+
+
+/*
+ * Event Ring Entry
+ */
+struct event_ring_entry
+{
+ uint8_t protocol_id /* Event Protocol ID */;
+ uint8_t opcode /* Event Opcode */;
+ __le16 reserved0 /* Reserved */;
+ __le16 echo /* Echo value from ramrod data on the host */;
+ uint8_t fw_return_code /* FW return code for SP ramrods */;
+ uint8_t flags;
+#define EVENT_RING_ENTRY_ASYNC_MASK 0x1 /* 0: synchronous EQE - a completion of an SP message. 1: asynchronous EQE */
+#define EVENT_RING_ENTRY_ASYNC_SHIFT 0
+#define EVENT_RING_ENTRY_RESERVED1_MASK 0x7F
+#define EVENT_RING_ENTRY_RESERVED1_SHIFT 1
+ union event_ring_data data;
+};
+
+
+
+
+
+/*
+ * Multi function mode
+ */
+enum mf_mode
+{
+ ERROR_MODE /* Unsupported mode */,
+ MF_OVLAN /* Multi function based on outer VLAN */,
+ MF_NPAR /* Multi function based on MAC address (NIC partitioning) */,
+ MAX_MF_MODE
+};
+
+
+/*
+ * Per-protocol connection types
+ */
+enum protocol_type
+{
+ PROTOCOLID_ISCSI /* iSCSI */,
+ PROTOCOLID_FCOE /* FCoE */,
+ PROTOCOLID_ROCE /* RoCE */,
+ PROTOCOLID_CORE /* Core (light L2, slow path core) */,
+ PROTOCOLID_ETH /* Ethernet */,
+ PROTOCOLID_IWARP /* iWARP */,
+ PROTOCOLID_TOE /* TOE */,
+ PROTOCOLID_PREROCE /* Pre (tapeout) RoCE */,
+ PROTOCOLID_COMMON /* ProtocolCommon */,
+ PROTOCOLID_TCP /* TCP */,
+ MAX_PROTOCOL_TYPE
+};
+
+
+/*
+ * Ustorm Queue Zone
+ */
+struct ustorm_eth_queue_zone
+{
+ struct coalescing_timeset int_coalescing_timeset /* Rx interrupt coalescing TimeSet */;
+ uint8_t reserved[3];
+};
+
+
+struct ustorm_queue_zone
+{
+ struct ustorm_eth_queue_zone eth;
+ struct common_queue_zone common;
+};
+
+
+
+/*
+ * status block structure
+ */
+struct cau_pi_entry
+{
+ __le32 prod;
+#define CAU_PI_ENTRY_PROD_VAL_MASK 0xFFFF /* A per-protocol index PROD value. */
+#define CAU_PI_ENTRY_PROD_VAL_SHIFT 0
+#define CAU_PI_ENTRY_PI_TIMESET_MASK 0x7F /* This value determines the TimeSet that the PI is associated with */
+#define CAU_PI_ENTRY_PI_TIMESET_SHIFT 16
+#define CAU_PI_ENTRY_FSM_SEL_MASK 0x1 /* Select the FSM within the SB */
+#define CAU_PI_ENTRY_FSM_SEL_SHIFT 23
+#define CAU_PI_ENTRY_RESERVED_MASK 0xFF /* Reserved */
+#define CAU_PI_ENTRY_RESERVED_SHIFT 24
+};
+
+
+/*
+ * status block structure
+ */
+struct cau_sb_entry
+{
+ __le32 data;
+#define CAU_SB_ENTRY_SB_PROD_MASK 0xFFFFFF /* The SB PROD index which is sent to the IGU. */
+#define CAU_SB_ENTRY_SB_PROD_SHIFT 0
+#define CAU_SB_ENTRY_STATE0_MASK 0xF /* RX state */
+#define CAU_SB_ENTRY_STATE0_SHIFT 24
+#define CAU_SB_ENTRY_STATE1_MASK 0xF /* TX state */
+#define CAU_SB_ENTRY_STATE1_SHIFT 28
+ __le32 params;
+#define CAU_SB_ENTRY_SB_TIMESET0_MASK 0x7F /* Indicates the RX TimeSet that this SB is associated with. */
+#define CAU_SB_ENTRY_SB_TIMESET0_SHIFT 0
+#define CAU_SB_ENTRY_SB_TIMESET1_MASK 0x7F /* Indicates the TX TimeSet that this SB is associated with. */
+#define CAU_SB_ENTRY_SB_TIMESET1_SHIFT 7
+#define CAU_SB_ENTRY_TIMER_RES0_MASK 0x3 /* This value will determine the RX FSM timer resolution in ticks */
+#define CAU_SB_ENTRY_TIMER_RES0_SHIFT 14
+#define CAU_SB_ENTRY_TIMER_RES1_MASK 0x3 /* This value will determine the TX FSM timer resolution in ticks */
+#define CAU_SB_ENTRY_TIMER_RES1_SHIFT 16
+#define CAU_SB_ENTRY_VF_NUMBER_MASK 0xFF
+#define CAU_SB_ENTRY_VF_NUMBER_SHIFT 18
+#define CAU_SB_ENTRY_VF_VALID_MASK 0x1
+#define CAU_SB_ENTRY_VF_VALID_SHIFT 26
+#define CAU_SB_ENTRY_PF_NUMBER_MASK 0xF
+#define CAU_SB_ENTRY_PF_NUMBER_SHIFT 27
+#define CAU_SB_ENTRY_TPH_MASK 0x1 /* If set then indicates that the TPH STAG is equal to the SB number. Otherwise the STAG will be equal to all ones.
*/ +#define CAU_SB_ENTRY_TPH_SHIFT 31 +}; + + +/* + * core doorbell data + */ +struct core_db_data +{ + uint8_t params; +#define CORE_DB_DATA_DEST_MASK 0x3 /* destination of doorbell (use enum db_dest) */ +#define CORE_DB_DATA_DEST_SHIFT 0 +#define CORE_DB_DATA_AGG_CMD_MASK 0x3 /* aggregative command to CM (use enum db_agg_cmd_sel) */ +#define CORE_DB_DATA_AGG_CMD_SHIFT 2 +#define CORE_DB_DATA_BYPASS_EN_MASK 0x1 /* enable QM bypass */ +#define CORE_DB_DATA_BYPASS_EN_SHIFT 4 +#define CORE_DB_DATA_RESERVED_MASK 0x1 +#define CORE_DB_DATA_RESERVED_SHIFT 5 +#define CORE_DB_DATA_AGG_VAL_SEL_MASK 0x3 /* aggregative value selection */ +#define CORE_DB_DATA_AGG_VAL_SEL_SHIFT 6 + uint8_t agg_flags /* bit for every DQ counter flags in CM context that DQ can increment */; + __le16 spq_prod; +}; + + +/* + * Enum of doorbell aggregative command selection + */ +enum db_agg_cmd_sel +{ + DB_AGG_CMD_NOP /* No operation */, + DB_AGG_CMD_SET /* Set the value */, + DB_AGG_CMD_ADD /* Add the value */, + DB_AGG_CMD_MAX /* Set max of current and new value */, + MAX_DB_AGG_CMD_SEL +}; + + +/* + * Enum of doorbell destination + */ +enum db_dest +{ + DB_DEST_XCM /* TX doorbell to XCM */, + DB_DEST_UCM /* RX doorbell to UCM */, + DB_DEST_TCM /* RX doorbell to TCM */, + DB_NUM_DESTINATIONS, + MAX_DB_DEST +}; + + +/* + * Enum of doorbell DPM types + */ +enum db_dpm_type +{ + DPM_LEGACY /* Legacy DPM- to Xstorm RAM */, + DPM_ROCE /* RoCE DPM- to NIG */, + DPM_L2_INLINE /* L2 DPM inline- to PBF, with packet data on doorbell */, + DPM_L2_BD /* L2 DPM with BD- to PBF, with TX BD data on doorbell */, + MAX_DB_DPM_TYPE +}; + + +/* + * Structure for doorbell data, in L2 DPM mode, for the first doorbell in a DPM burst + */ +struct db_l2_dpm_data +{ + __le16 icid /* internal CID */; + __le16 bd_prod /* bd producer value to update */; + __le32 params; +#define DB_L2_DPM_DATA_SIZE_MASK 0x3F /* Size in QWORD-s of the DPM burst */ +#define DB_L2_DPM_DATA_SIZE_SHIFT 0 +#define DB_L2_DPM_DATA_DPM_TYPE_MASK 0x3 /* Type of DPM transaction (DPM_L2_INLINE or DPM_L2_BD) (use enum db_dpm_type) */ +#define DB_L2_DPM_DATA_DPM_TYPE_SHIFT 6 +#define DB_L2_DPM_DATA_NUM_BDS_MASK 0xFF /* number of BD-s */ +#define DB_L2_DPM_DATA_NUM_BDS_SHIFT 8 +#define DB_L2_DPM_DATA_PKT_SIZE_MASK 0x7FF /* size of the packet to be transmitted in bytes */ +#define DB_L2_DPM_DATA_PKT_SIZE_SHIFT 16 +#define DB_L2_DPM_DATA_RESERVED0_MASK 0x1 +#define DB_L2_DPM_DATA_RESERVED0_SHIFT 27 +#define DB_L2_DPM_DATA_SGE_NUM_MASK 0x7 /* In DPM_L2_BD mode: the number of SGE-s */ +#define DB_L2_DPM_DATA_SGE_NUM_SHIFT 28 +#define DB_L2_DPM_DATA_RESERVED1_MASK 0x1 +#define DB_L2_DPM_DATA_RESERVED1_SHIFT 31 +}; + + +/* + * Structure for SGE in a DPM doorbell of type DPM_L2_BD + */ +struct db_l2_dpm_sge +{ + struct regpair addr /* Single continuous buffer */; + __le16 nbytes /* Number of bytes in this BD. 
*/;
+ __le16 bitfields;
+#define DB_L2_DPM_SGE_TPH_ST_INDEX_MASK 0x1FF /* The TPH STAG index value */
+#define DB_L2_DPM_SGE_TPH_ST_INDEX_SHIFT 0
+#define DB_L2_DPM_SGE_RESERVED0_MASK 0x3
+#define DB_L2_DPM_SGE_RESERVED0_SHIFT 9
+#define DB_L2_DPM_SGE_ST_VALID_MASK 0x1 /* Indicate if ST hint is requested or not */
+#define DB_L2_DPM_SGE_ST_VALID_SHIFT 11
+#define DB_L2_DPM_SGE_RESERVED1_MASK 0xF
+#define DB_L2_DPM_SGE_RESERVED1_SHIFT 12
+ __le32 reserved2;
+};
+
+
+/*
+ * Structure for doorbell address, in legacy mode
+ */
+struct db_legacy_addr
+{
+ __le32 addr;
+#define DB_LEGACY_ADDR_RESERVED0_MASK 0x3
+#define DB_LEGACY_ADDR_RESERVED0_SHIFT 0
+#define DB_LEGACY_ADDR_DEMS_MASK 0x7 /* doorbell extraction mode specifier - 0 if not used */
+#define DB_LEGACY_ADDR_DEMS_SHIFT 2
+#define DB_LEGACY_ADDR_ICID_MASK 0x7FFFFFF /* internal CID */
+#define DB_LEGACY_ADDR_ICID_SHIFT 5
+};
+
+
+/*
+ * Structure for doorbell address, in PWM mode
+ */
+struct db_pwm_addr
+{
+ __le32 addr;
+#define DB_PWM_ADDR_RESERVED0_MASK 0x7
+#define DB_PWM_ADDR_RESERVED0_SHIFT 0
+#define DB_PWM_ADDR_OFFSET_MASK 0x7F /* Offset in PWM address space */
+#define DB_PWM_ADDR_OFFSET_SHIFT 3
+#define DB_PWM_ADDR_WID_MASK 0x3 /* Window ID */
+#define DB_PWM_ADDR_WID_SHIFT 10
+#define DB_PWM_ADDR_DPI_MASK 0xFFFF /* Doorbell page ID */
+#define DB_PWM_ADDR_DPI_SHIFT 12
+#define DB_PWM_ADDR_RESERVED1_MASK 0xF
+#define DB_PWM_ADDR_RESERVED1_SHIFT 28
+};
+
+
+/*
+ * Parameters to RoCE firmware, passed in EDPM doorbell
+ */
+struct db_roce_dpm_params
+{
+ __le32 params;
+#define DB_ROCE_DPM_PARAMS_SIZE_MASK 0x3F /* Size in QWORD-s of the DPM burst */
+#define DB_ROCE_DPM_PARAMS_SIZE_SHIFT 0
+#define DB_ROCE_DPM_PARAMS_DPM_TYPE_MASK 0x3 /* Type of DPM transaction (DPM_ROCE) (use enum db_dpm_type) */
+#define DB_ROCE_DPM_PARAMS_DPM_TYPE_SHIFT 6
+#define DB_ROCE_DPM_PARAMS_OPCODE_MASK 0xFF /* opcode for RoCE operation */
+#define DB_ROCE_DPM_PARAMS_OPCODE_SHIFT 8
+#define DB_ROCE_DPM_PARAMS_WQE_SIZE_MASK 0x7FF /* the size of the WQE payload in bytes */
+#define DB_ROCE_DPM_PARAMS_WQE_SIZE_SHIFT 16
+#define DB_ROCE_DPM_PARAMS_RESERVED0_MASK 0x1
+#define DB_ROCE_DPM_PARAMS_RESERVED0_SHIFT 27
+#define DB_ROCE_DPM_PARAMS_COMPLETION_FLG_MASK 0x1 /* RoCE completion flag */
+#define DB_ROCE_DPM_PARAMS_COMPLETION_FLG_SHIFT 28
+#define DB_ROCE_DPM_PARAMS_S_FLG_MASK 0x1 /* RoCE S flag */
+#define DB_ROCE_DPM_PARAMS_S_FLG_SHIFT 29
+#define DB_ROCE_DPM_PARAMS_RESERVED1_MASK 0x3
+#define DB_ROCE_DPM_PARAMS_RESERVED1_SHIFT 30
+};
+
+/*
+ * Structure for doorbell data, in RoCE DPM mode, for the first doorbell in a DPM burst
+ */
+struct db_roce_dpm_data
+{
+ __le16 icid /* internal CID */;
+ __le16 prod_val /* aggregated value to update */;
+ struct db_roce_dpm_params params /* parameters passed to RoCE firmware */;
+};
+
+
+
+/*
+ * Igu interrupt command
+ */
+enum igu_int_cmd
+{
+ IGU_INT_ENABLE=0,
+ IGU_INT_DISABLE=1,
+ IGU_INT_NOP=2,
+ IGU_INT_NOP2=3,
+ MAX_IGU_INT_CMD
+};
+
+
+/*
+ * IGU producer or consumer update command
+ */
+struct igu_prod_cons_update
+{
+ __le32 sb_id_and_flags;
+#define IGU_PROD_CONS_UPDATE_SB_INDEX_MASK 0xFFFFFF
+#define IGU_PROD_CONS_UPDATE_SB_INDEX_SHIFT 0
+#define IGU_PROD_CONS_UPDATE_UPDATE_FLAG_MASK 0x1
+#define IGU_PROD_CONS_UPDATE_UPDATE_FLAG_SHIFT 24
+#define IGU_PROD_CONS_UPDATE_ENABLE_INT_MASK 0x3 /* interrupt enable/disable/nop (use enum igu_int_cmd) */
+#define IGU_PROD_CONS_UPDATE_ENABLE_INT_SHIFT 25
+#define IGU_PROD_CONS_UPDATE_SEGMENT_ACCESS_MASK 0x1 /* (use enum igu_seg_access) */
+#define IGU_PROD_CONS_UPDATE_SEGMENT_ACCESS_SHIFT 27
+#define IGU_PROD_CONS_UPDATE_TIMER_MASK_MASK 0x1
+#define IGU_PROD_CONS_UPDATE_TIMER_MASK_SHIFT 28
+#define IGU_PROD_CONS_UPDATE_RESERVED0_MASK 0x3
+#define IGU_PROD_CONS_UPDATE_RESERVED0_SHIFT 29
+#define IGU_PROD_CONS_UPDATE_COMMAND_TYPE_MASK 0x1 /* must always be cleared (use enum command_type_bit) */
+#define IGU_PROD_CONS_UPDATE_COMMAND_TYPE_SHIFT 31
+ __le32 reserved1;
+};
+
+
+/*
+ * Igu segments access for default status block only
+ */
+enum igu_seg_access
+{
+ IGU_SEG_ACCESS_REG=0,
+ IGU_SEG_ACCESS_ATTN=1,
+ MAX_IGU_SEG_ACCESS
+};
+
+
+/*
+ * Enumeration for L3 type field of parsing_and_err_flags_union. L3Type: 0 - unknown (not IP), 1 - IPv4, 2 - IPv6 (this field can be filled according to the last-ethertype)
+ */
+enum l3_type
+{
+ e_l3Type_unknown,
+ e_l3Type_ipv4,
+ e_l3Type_ipv6,
+ MAX_L3_TYPE
+};
+
+
+/*
+ * Enumeration for l4Protocol field of parsing_and_err_flags_union. L4-protocol: 0 - none, 1 - TCP, 2 - UDP. If the packet is an IPv4 fragment and it is not the first fragment, the protocol-type should be set to none.
+ */
+enum l4_protocol
+{
+ e_l4Protocol_none,
+ e_l4Protocol_tcp,
+ e_l4Protocol_udp,
+ MAX_L4_PROTOCOL
+};
+
+
+/*
+ * Parsing and error flags field.
+ */
+struct parsing_and_err_flags
+{
+ __le16 flags;
+#define PARSING_AND_ERR_FLAGS_L3TYPE_MASK 0x3 /* L3Type: 0 - unknown (not IP), 1 - IPv4, 2 - IPv6 (this field can be filled according to the last-ethertype) (use enum l3_type) */
+#define PARSING_AND_ERR_FLAGS_L3TYPE_SHIFT 0
+#define PARSING_AND_ERR_FLAGS_L4PROTOCOL_MASK 0x3 /* L4-protocol: 0 - none, 1 - TCP, 2 - UDP. If the packet is an IPv4 fragment and it is not the first fragment, the protocol-type should be set to none. (use enum l4_protocol) */
+#define PARSING_AND_ERR_FLAGS_L4PROTOCOL_SHIFT 2
+#define PARSING_AND_ERR_FLAGS_IPV4FRAG_MASK 0x1 /* Set if the packet is an IPv4 fragment. */
+#define PARSING_AND_ERR_FLAGS_IPV4FRAG_SHIFT 4
+#define PARSING_AND_ERR_FLAGS_TAG8021QEXIST_MASK 0x1 /* Set if VLAN tag exists. Invalid if the tunnel type is IP GRE or IP GENEVE. */
+#define PARSING_AND_ERR_FLAGS_TAG8021QEXIST_SHIFT 5
+#define PARSING_AND_ERR_FLAGS_L4CHKSMWASCALCULATED_MASK 0x1 /* Set if L4 checksum was calculated. */
+#define PARSING_AND_ERR_FLAGS_L4CHKSMWASCALCULATED_SHIFT 6
+#define PARSING_AND_ERR_FLAGS_TIMESYNCPKT_MASK 0x1 /* Set for PTP packet. */
+#define PARSING_AND_ERR_FLAGS_TIMESYNCPKT_SHIFT 7
+#define PARSING_AND_ERR_FLAGS_TIMESTAMPRECORDED_MASK 0x1 /* Set if PTP timestamp recorded. */
+#define PARSING_AND_ERR_FLAGS_TIMESTAMPRECORDED_SHIFT 8
+#define PARSING_AND_ERR_FLAGS_IPHDRERROR_MASK 0x1 /* Set if either version-mismatch or hdr-len-error or ipv4-cksm is set or ipv6 ver mismatch */
+#define PARSING_AND_ERR_FLAGS_IPHDRERROR_SHIFT 9
+#define PARSING_AND_ERR_FLAGS_L4CHKSMERROR_MASK 0x1 /* Set if L4 checksum validation failed. Valid only if L4 checksum was calculated. */
+#define PARSING_AND_ERR_FLAGS_L4CHKSMERROR_SHIFT 10
+#define PARSING_AND_ERR_FLAGS_TUNNELEXIST_MASK 0x1 /* Set if GRE/VXLAN/GENEVE tunnel detected. */
+#define PARSING_AND_ERR_FLAGS_TUNNELEXIST_SHIFT 11
+#define PARSING_AND_ERR_FLAGS_TUNNEL8021QTAGEXIST_MASK 0x1 /* Set if VLAN tag exists in tunnel header.
*/ +#define PARSING_AND_ERR_FLAGS_TUNNEL8021QTAGEXIST_SHIFT 12 +#define PARSING_AND_ERR_FLAGS_TUNNELIPHDRERROR_MASK 0x1 /* Set if either tunnel-ipv4-version-mismatch or tunnel-ipv4-hdr-len-error or tunnel-ipv4-cksm is set or tunneling ipv6 ver mismatch */ +#define PARSING_AND_ERR_FLAGS_TUNNELIPHDRERROR_SHIFT 13 +#define PARSING_AND_ERR_FLAGS_TUNNELL4CHKSMWASCALCULATED_MASK 0x1 /* Set if GRE or VXLAN/GENEVE UDP checksum was calculated. */ +#define PARSING_AND_ERR_FLAGS_TUNNELL4CHKSMWASCALCULATED_SHIFT 14 +#define PARSING_AND_ERR_FLAGS_TUNNELL4CHKSMERROR_MASK 0x1 /* Set if tunnel L4 checksum validation failed. Valid only if tunnel L4 checksum was calculated. */ +#define PARSING_AND_ERR_FLAGS_TUNNELL4CHKSMERROR_SHIFT 15 +}; + + +/* + * Pb context + */ +struct pb_context +{ + __le32 crc[4]; +}; + + +/* + * Concrete Function ID. + */ +struct pxp_concrete_fid +{ + __le16 fid; +#define PXP_CONCRETE_FID_PFID_MASK 0xF /* Parent PFID */ +#define PXP_CONCRETE_FID_PFID_SHIFT 0 +#define PXP_CONCRETE_FID_PORT_MASK 0x3 /* port number */ +#define PXP_CONCRETE_FID_PORT_SHIFT 4 +#define PXP_CONCRETE_FID_PATH_MASK 0x1 /* path number */ +#define PXP_CONCRETE_FID_PATH_SHIFT 6 +#define PXP_CONCRETE_FID_VFVALID_MASK 0x1 +#define PXP_CONCRETE_FID_VFVALID_SHIFT 7 +#define PXP_CONCRETE_FID_VFID_MASK 0xFF +#define PXP_CONCRETE_FID_VFID_SHIFT 8 +}; + + +/* + * Concrete Function ID. + */ +struct pxp_pretend_concrete_fid +{ + __le16 fid; +#define PXP_PRETEND_CONCRETE_FID_PFID_MASK 0xF /* Parent PFID */ +#define PXP_PRETEND_CONCRETE_FID_PFID_SHIFT 0 +#define PXP_PRETEND_CONCRETE_FID_RESERVED_MASK 0x7 /* port number. Only when part of ME register. */ +#define PXP_PRETEND_CONCRETE_FID_RESERVED_SHIFT 4 +#define PXP_PRETEND_CONCRETE_FID_VFVALID_MASK 0x1 +#define PXP_PRETEND_CONCRETE_FID_VFVALID_SHIFT 7 +#define PXP_PRETEND_CONCRETE_FID_VFID_MASK 0xFF +#define PXP_PRETEND_CONCRETE_FID_VFID_SHIFT 8 +}; + +/* + * Function ID. + */ +union pxp_pretend_fid +{ + struct pxp_pretend_concrete_fid concrete_fid; + __le16 opaque_fid; +}; + +/* + * Pxp Pretend Command Register. + */ +struct pxp_pretend_cmd +{ + union pxp_pretend_fid fid; + __le16 control; +#define PXP_PRETEND_CMD_PATH_MASK 0x1 +#define PXP_PRETEND_CMD_PATH_SHIFT 0 +#define PXP_PRETEND_CMD_USE_PORT_MASK 0x1 +#define PXP_PRETEND_CMD_USE_PORT_SHIFT 1 +#define PXP_PRETEND_CMD_PORT_MASK 0x3 +#define PXP_PRETEND_CMD_PORT_SHIFT 2 +#define PXP_PRETEND_CMD_RESERVED0_MASK 0xF +#define PXP_PRETEND_CMD_RESERVED0_SHIFT 4 +#define PXP_PRETEND_CMD_RESERVED1_MASK 0xF +#define PXP_PRETEND_CMD_RESERVED1_SHIFT 8 +#define PXP_PRETEND_CMD_PRETEND_PATH_MASK 0x1 /* is pretend mode? */ +#define PXP_PRETEND_CMD_PRETEND_PATH_SHIFT 12 +#define PXP_PRETEND_CMD_PRETEND_PORT_MASK 0x1 /* is pretend mode? */ +#define PXP_PRETEND_CMD_PRETEND_PORT_SHIFT 13 +#define PXP_PRETEND_CMD_PRETEND_FUNCTION_MASK 0x1 /* is pretend mode? */ +#define PXP_PRETEND_CMD_PRETEND_FUNCTION_SHIFT 14 +#define PXP_PRETEND_CMD_IS_CONCRETE_MASK 0x1 /* is fid concrete? */ +#define PXP_PRETEND_CMD_IS_CONCRETE_SHIFT 15 +}; + + + + +/* + * PTT Record in PXP Admin Window. + */ +struct pxp_ptt_entry +{ + __le32 offset; +#define PXP_PTT_ENTRY_OFFSET_MASK 0x7FFFFF +#define PXP_PTT_ENTRY_OFFSET_SHIFT 0 +#define PXP_PTT_ENTRY_RESERVED0_MASK 0x1FF +#define PXP_PTT_ENTRY_RESERVED0_SHIFT 23 + struct pxp_pretend_cmd pretend; +}; + + +/* + * VF Zone A Permission Register. 
+ */ +struct pxp_vf_zone_a_permission +{ + __le32 control; +#define PXP_VF_ZONE_A_PERMISSION_VFID_MASK 0xFF +#define PXP_VF_ZONE_A_PERMISSION_VFID_SHIFT 0 +#define PXP_VF_ZONE_A_PERMISSION_VALID_MASK 0x1 +#define PXP_VF_ZONE_A_PERMISSION_VALID_SHIFT 8 +#define PXP_VF_ZONE_A_PERMISSION_RESERVED0_MASK 0x7F +#define PXP_VF_ZONE_A_PERMISSION_RESERVED0_SHIFT 9 +#define PXP_VF_ZONE_A_PERMISSION_RESERVED1_MASK 0xFFFF +#define PXP_VF_ZONE_A_PERMISSION_RESERVED1_SHIFT 16 +}; + + +/* + * Rdif context + */ +struct rdif_task_context +{ + __le32 initialRefTag; + __le16 appTagValue; + __le16 appTagMask; + uint8_t flags0; +#define RDIF_TASK_CONTEXT_IGNOREAPPTAG_MASK 0x1 +#define RDIF_TASK_CONTEXT_IGNOREAPPTAG_SHIFT 0 +#define RDIF_TASK_CONTEXT_INITIALREFTAGVALID_MASK 0x1 +#define RDIF_TASK_CONTEXT_INITIALREFTAGVALID_SHIFT 1 +#define RDIF_TASK_CONTEXT_HOSTGUARDTYPE_MASK 0x1 /* 0 = IP checksum, 1 = CRC */ +#define RDIF_TASK_CONTEXT_HOSTGUARDTYPE_SHIFT 2 +#define RDIF_TASK_CONTEXT_SETERRORWITHEOP_MASK 0x1 +#define RDIF_TASK_CONTEXT_SETERRORWITHEOP_SHIFT 3 +#define RDIF_TASK_CONTEXT_PROTECTIONTYPE_MASK 0x3 /* 1/2/3 - Protection Type */ +#define RDIF_TASK_CONTEXT_PROTECTIONTYPE_SHIFT 4 +#define RDIF_TASK_CONTEXT_CRC_SEED_MASK 0x1 /* 0=0x0000, 1=0xffff */ +#define RDIF_TASK_CONTEXT_CRC_SEED_SHIFT 6 +#define RDIF_TASK_CONTEXT_KEEPREFTAGCONST_MASK 0x1 /* Keep reference tag constant */ +#define RDIF_TASK_CONTEXT_KEEPREFTAGCONST_SHIFT 7 + uint8_t partialDifData[7]; + __le16 partialCrcValue; + __le16 partialChecksumValue; + __le32 offsetInIO; + __le16 flags1; +#define RDIF_TASK_CONTEXT_VALIDATEGUARD_MASK 0x1 +#define RDIF_TASK_CONTEXT_VALIDATEGUARD_SHIFT 0 +#define RDIF_TASK_CONTEXT_VALIDATEAPPTAG_MASK 0x1 +#define RDIF_TASK_CONTEXT_VALIDATEAPPTAG_SHIFT 1 +#define RDIF_TASK_CONTEXT_VALIDATEREFTAG_MASK 0x1 +#define RDIF_TASK_CONTEXT_VALIDATEREFTAG_SHIFT 2 +#define RDIF_TASK_CONTEXT_FORWARDGUARD_MASK 0x1 +#define RDIF_TASK_CONTEXT_FORWARDGUARD_SHIFT 3 +#define RDIF_TASK_CONTEXT_FORWARDAPPTAG_MASK 0x1 +#define RDIF_TASK_CONTEXT_FORWARDAPPTAG_SHIFT 4 +#define RDIF_TASK_CONTEXT_FORWARDREFTAG_MASK 0x1 +#define RDIF_TASK_CONTEXT_FORWARDREFTAG_SHIFT 5 +#define RDIF_TASK_CONTEXT_INTERVALSIZE_MASK 0x7 /* 0=512B, 1=1KB, 2=2KB, 3=4KB, 4=8KB */ +#define RDIF_TASK_CONTEXT_INTERVALSIZE_SHIFT 6 +#define RDIF_TASK_CONTEXT_HOSTINTERFACE_MASK 0x3 /* 0=None, 1=DIF, 2=DIX */ +#define RDIF_TASK_CONTEXT_HOSTINTERFACE_SHIFT 9 +#define RDIF_TASK_CONTEXT_DIFBEFOREDATA_MASK 0x1 /* DIF tag right at the beginning of DIF interval */ +#define RDIF_TASK_CONTEXT_DIFBEFOREDATA_SHIFT 11 +#define RDIF_TASK_CONTEXT_RESERVED0_MASK 0x1 +#define RDIF_TASK_CONTEXT_RESERVED0_SHIFT 12 +#define RDIF_TASK_CONTEXT_NETWORKINTERFACE_MASK 0x1 /* 0=None, 1=DIF */ +#define RDIF_TASK_CONTEXT_NETWORKINTERFACE_SHIFT 13 +#define RDIF_TASK_CONTEXT_FORWARDAPPTAGWITHMASK_MASK 0x1 /* Forward application tag with mask */ +#define RDIF_TASK_CONTEXT_FORWARDAPPTAGWITHMASK_SHIFT 14 +#define RDIF_TASK_CONTEXT_FORWARDREFTAGWITHMASK_MASK 0x1 /* Forward reference tag with mask */ +#define RDIF_TASK_CONTEXT_FORWARDREFTAGWITHMASK_SHIFT 15 + __le16 state; +#define RDIF_TASK_CONTEXT_RECEIVEDDIFBYTESLEFT_MASK 0xF +#define RDIF_TASK_CONTEXT_RECEIVEDDIFBYTESLEFT_SHIFT 0 +#define RDIF_TASK_CONTEXT_TRANSMITEDDIFBYTESLEFT_MASK 0xF +#define RDIF_TASK_CONTEXT_TRANSMITEDDIFBYTESLEFT_SHIFT 4 +#define RDIF_TASK_CONTEXT_ERRORINIO_MASK 0x1 +#define RDIF_TASK_CONTEXT_ERRORINIO_SHIFT 8 +#define RDIF_TASK_CONTEXT_CHECKSUMOVERFLOW_MASK 0x1 +#define RDIF_TASK_CONTEXT_CHECKSUMOVERFLOW_SHIFT 9 +#define 
RDIF_TASK_CONTEXT_REFTAGMASK_MASK 0xF /* mask for refernce tag handling */ +#define RDIF_TASK_CONTEXT_REFTAGMASK_SHIFT 10 +#define RDIF_TASK_CONTEXT_RESERVED1_MASK 0x3 +#define RDIF_TASK_CONTEXT_RESERVED1_SHIFT 14 + __le32 reserved2; +}; + + + +/* + * RSS hash type + */ +enum rss_hash_type +{ + RSS_HASH_TYPE_DEFAULT=0, + RSS_HASH_TYPE_IPV4=1, + RSS_HASH_TYPE_TCP_IPV4=2, + RSS_HASH_TYPE_IPV6=3, + RSS_HASH_TYPE_TCP_IPV6=4, + RSS_HASH_TYPE_UDP_IPV4=5, + RSS_HASH_TYPE_UDP_IPV6=6, + MAX_RSS_HASH_TYPE +}; + + +/* + * status block structure + */ +struct status_block +{ + __le16 pi_array[PIS_PER_SB]; + __le32 sb_num; +#define STATUS_BLOCK_SB_NUM_MASK 0x1FF +#define STATUS_BLOCK_SB_NUM_SHIFT 0 +#define STATUS_BLOCK_ZERO_PAD_MASK 0x7F +#define STATUS_BLOCK_ZERO_PAD_SHIFT 9 +#define STATUS_BLOCK_ZERO_PAD2_MASK 0xFFFF +#define STATUS_BLOCK_ZERO_PAD2_SHIFT 16 + __le32 prod_index; +#define STATUS_BLOCK_PROD_INDEX_MASK 0xFFFFFF +#define STATUS_BLOCK_PROD_INDEX_SHIFT 0 +#define STATUS_BLOCK_ZERO_PAD3_MASK 0xFF +#define STATUS_BLOCK_ZERO_PAD3_SHIFT 24 +}; + + +/* + * Tdif context + */ +struct tdif_task_context +{ + __le32 initialRefTag; + __le16 appTagValue; + __le16 appTagMask; + __le16 partialCrcValueB; + __le16 partialChecksumValueB; + __le16 stateB; +#define TDIF_TASK_CONTEXT_RECEIVEDDIFBYTESLEFTB_MASK 0xF +#define TDIF_TASK_CONTEXT_RECEIVEDDIFBYTESLEFTB_SHIFT 0 +#define TDIF_TASK_CONTEXT_TRANSMITEDDIFBYTESLEFTB_MASK 0xF +#define TDIF_TASK_CONTEXT_TRANSMITEDDIFBYTESLEFTB_SHIFT 4 +#define TDIF_TASK_CONTEXT_ERRORINIOB_MASK 0x1 +#define TDIF_TASK_CONTEXT_ERRORINIOB_SHIFT 8 +#define TDIF_TASK_CONTEXT_CHECKSUMOVERFLOW_MASK 0x1 +#define TDIF_TASK_CONTEXT_CHECKSUMOVERFLOW_SHIFT 9 +#define TDIF_TASK_CONTEXT_RESERVED0_MASK 0x3F +#define TDIF_TASK_CONTEXT_RESERVED0_SHIFT 10 + uint8_t reserved1; + uint8_t flags0; +#define TDIF_TASK_CONTEXT_IGNOREAPPTAG_MASK 0x1 +#define TDIF_TASK_CONTEXT_IGNOREAPPTAG_SHIFT 0 +#define TDIF_TASK_CONTEXT_INITIALREFTAGVALID_MASK 0x1 +#define TDIF_TASK_CONTEXT_INITIALREFTAGVALID_SHIFT 1 +#define TDIF_TASK_CONTEXT_HOSTGUARDTYPE_MASK 0x1 /* 0 = IP checksum, 1 = CRC */ +#define TDIF_TASK_CONTEXT_HOSTGUARDTYPE_SHIFT 2 +#define TDIF_TASK_CONTEXT_SETERRORWITHEOP_MASK 0x1 +#define TDIF_TASK_CONTEXT_SETERRORWITHEOP_SHIFT 3 +#define TDIF_TASK_CONTEXT_PROTECTIONTYPE_MASK 0x3 /* 1/2/3 - Protection Type */ +#define TDIF_TASK_CONTEXT_PROTECTIONTYPE_SHIFT 4 +#define TDIF_TASK_CONTEXT_CRC_SEED_MASK 0x1 /* 0=0x0000, 1=0xffff */ +#define TDIF_TASK_CONTEXT_CRC_SEED_SHIFT 6 +#define TDIF_TASK_CONTEXT_RESERVED2_MASK 0x1 +#define TDIF_TASK_CONTEXT_RESERVED2_SHIFT 7 + __le32 flags1; +#define TDIF_TASK_CONTEXT_VALIDATEGUARD_MASK 0x1 +#define TDIF_TASK_CONTEXT_VALIDATEGUARD_SHIFT 0 +#define TDIF_TASK_CONTEXT_VALIDATEAPPTAG_MASK 0x1 +#define TDIF_TASK_CONTEXT_VALIDATEAPPTAG_SHIFT 1 +#define TDIF_TASK_CONTEXT_VALIDATEREFTAG_MASK 0x1 +#define TDIF_TASK_CONTEXT_VALIDATEREFTAG_SHIFT 2 +#define TDIF_TASK_CONTEXT_FORWARDGUARD_MASK 0x1 +#define TDIF_TASK_CONTEXT_FORWARDGUARD_SHIFT 3 +#define TDIF_TASK_CONTEXT_FORWARDAPPTAG_MASK 0x1 +#define TDIF_TASK_CONTEXT_FORWARDAPPTAG_SHIFT 4 +#define TDIF_TASK_CONTEXT_FORWARDREFTAG_MASK 0x1 +#define TDIF_TASK_CONTEXT_FORWARDREFTAG_SHIFT 5 +#define TDIF_TASK_CONTEXT_INTERVALSIZE_MASK 0x7 /* 0=512B, 1=1KB, 2=2KB, 3=4KB, 4=8KB */ +#define TDIF_TASK_CONTEXT_INTERVALSIZE_SHIFT 6 +#define TDIF_TASK_CONTEXT_HOSTINTERFACE_MASK 0x3 /* 0=None, 1=DIF, 2=DIX */ +#define TDIF_TASK_CONTEXT_HOSTINTERFACE_SHIFT 9 +#define TDIF_TASK_CONTEXT_DIFBEFOREDATA_MASK 0x1 /* DIF tag right at the 
beginning of DIF interval */ +#define TDIF_TASK_CONTEXT_DIFBEFOREDATA_SHIFT 11 +#define TDIF_TASK_CONTEXT_RESERVED3_MASK 0x1 /* reserved */ +#define TDIF_TASK_CONTEXT_RESERVED3_SHIFT 12 +#define TDIF_TASK_CONTEXT_NETWORKINTERFACE_MASK 0x1 /* 0=None, 1=DIF */ +#define TDIF_TASK_CONTEXT_NETWORKINTERFACE_SHIFT 13 +#define TDIF_TASK_CONTEXT_RECEIVEDDIFBYTESLEFTA_MASK 0xF +#define TDIF_TASK_CONTEXT_RECEIVEDDIFBYTESLEFTA_SHIFT 14 +#define TDIF_TASK_CONTEXT_TRANSMITEDDIFBYTESLEFTA_MASK 0xF +#define TDIF_TASK_CONTEXT_TRANSMITEDDIFBYTESLEFTA_SHIFT 18 +#define TDIF_TASK_CONTEXT_ERRORINIOA_MASK 0x1 +#define TDIF_TASK_CONTEXT_ERRORINIOA_SHIFT 22 +#define TDIF_TASK_CONTEXT_CHECKSUMOVERFLOWA_MASK 0x1 +#define TDIF_TASK_CONTEXT_CHECKSUMOVERFLOWA_SHIFT 23 +#define TDIF_TASK_CONTEXT_REFTAGMASK_MASK 0xF /* mask for refernce tag handling */ +#define TDIF_TASK_CONTEXT_REFTAGMASK_SHIFT 24 +#define TDIF_TASK_CONTEXT_FORWARDAPPTAGWITHMASK_MASK 0x1 /* Forward application tag with mask */ +#define TDIF_TASK_CONTEXT_FORWARDAPPTAGWITHMASK_SHIFT 28 +#define TDIF_TASK_CONTEXT_FORWARDREFTAGWITHMASK_MASK 0x1 /* Forward reference tag with mask */ +#define TDIF_TASK_CONTEXT_FORWARDREFTAGWITHMASK_SHIFT 29 +#define TDIF_TASK_CONTEXT_KEEPREFTAGCONST_MASK 0x1 /* Keep reference tag constant */ +#define TDIF_TASK_CONTEXT_KEEPREFTAGCONST_SHIFT 30 +#define TDIF_TASK_CONTEXT_RESERVED4_MASK 0x1 +#define TDIF_TASK_CONTEXT_RESERVED4_SHIFT 31 + __le32 offsetInIOB; + __le16 partialCrcValueA; + __le16 partialChecksumValueA; + __le32 offsetInIOA; + uint8_t partialDifDataA[8]; + uint8_t partialDifDataB[8]; +}; + + +/* + * Timers context + */ +struct timers_context +{ + __le32 logical_client_0; +#define TIMERS_CONTEXT_EXPIRATIONTIMELC0_MASK 0xFFFFFFF /* Expiration time of logical client 0 */ +#define TIMERS_CONTEXT_EXPIRATIONTIMELC0_SHIFT 0 +#define TIMERS_CONTEXT_VALIDLC0_MASK 0x1 /* Valid bit of logical client 0 */ +#define TIMERS_CONTEXT_VALIDLC0_SHIFT 28 +#define TIMERS_CONTEXT_ACTIVELC0_MASK 0x1 /* Active bit of logical client 0 */ +#define TIMERS_CONTEXT_ACTIVELC0_SHIFT 29 +#define TIMERS_CONTEXT_RESERVED0_MASK 0x3 +#define TIMERS_CONTEXT_RESERVED0_SHIFT 30 + __le32 logical_client_1; +#define TIMERS_CONTEXT_EXPIRATIONTIMELC1_MASK 0xFFFFFFF /* Expiration time of logical client 1 */ +#define TIMERS_CONTEXT_EXPIRATIONTIMELC1_SHIFT 0 +#define TIMERS_CONTEXT_VALIDLC1_MASK 0x1 /* Valid bit of logical client 1 */ +#define TIMERS_CONTEXT_VALIDLC1_SHIFT 28 +#define TIMERS_CONTEXT_ACTIVELC1_MASK 0x1 /* Active bit of logical client 1 */ +#define TIMERS_CONTEXT_ACTIVELC1_SHIFT 29 +#define TIMERS_CONTEXT_RESERVED1_MASK 0x3 +#define TIMERS_CONTEXT_RESERVED1_SHIFT 30 + __le32 logical_client_2; +#define TIMERS_CONTEXT_EXPIRATIONTIMELC2_MASK 0xFFFFFFF /* Expiration time of logical client 2 */ +#define TIMERS_CONTEXT_EXPIRATIONTIMELC2_SHIFT 0 +#define TIMERS_CONTEXT_VALIDLC2_MASK 0x1 /* Valid bit of logical client 2 */ +#define TIMERS_CONTEXT_VALIDLC2_SHIFT 28 +#define TIMERS_CONTEXT_ACTIVELC2_MASK 0x1 /* Active bit of logical client 2 */ +#define TIMERS_CONTEXT_ACTIVELC2_SHIFT 29 +#define TIMERS_CONTEXT_RESERVED2_MASK 0x3 +#define TIMERS_CONTEXT_RESERVED2_SHIFT 30 + __le32 host_expiration_fields; +#define TIMERS_CONTEXT_HOSTEXPRIRATIONVALUE_MASK 0xFFFFFFF /* Expiration time on host (closest one) */ +#define TIMERS_CONTEXT_HOSTEXPRIRATIONVALUE_SHIFT 0 +#define TIMERS_CONTEXT_HOSTEXPRIRATIONVALID_MASK 0x1 /* Valid bit of host expiration */ +#define TIMERS_CONTEXT_HOSTEXPRIRATIONVALID_SHIFT 28 +#define TIMERS_CONTEXT_RESERVED3_MASK 0x7 +#define 
TIMERS_CONTEXT_RESERVED3_SHIFT 29 +}; + + +/* + * Enum for next_protocol field of tunnel_parsing_flags + */ +enum tunnel_next_protocol +{ + e_unknown=0, + e_l2=1, + e_ipv4=2, + e_ipv6=3, + MAX_TUNNEL_NEXT_PROTOCOL +}; + +#endif /* __COMMON_HSI__ */ diff --git a/providers/qedr/qelr.h b/providers/qedr/qelr.h new file mode 100644 index 0000000..ac522cb --- /dev/null +++ b/providers/qedr/qelr.h @@ -0,0 +1,382 @@ +/* + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __QELR_H__ +#define __QELR_H__ + +#include <inttypes.h> +#include <stddef.h> +#include <endian.h> +#include <stdio.h> +#include <endian.h> +#include <ccan/minmax.h> + +#include <infiniband/driver.h> +#include <util/udma_barrier.h> + +#define writel(b, p) (*(uint32_t *)(p) = (b)) +#define writeq(b, p) (*(uint64_t *)(p) = (b)) + +#include "qelr_abi.h" +#include "qelr_hsi.h" +#include "qelr_chain.h" + +#define qelr_err(format, arg...) printf(format, ##arg) + +extern uint32_t qelr_dp_level; +extern uint32_t qelr_dp_module; + +enum DP_MODULE { + QELR_MSG_CQ = 0x10000, + QELR_MSG_RQ = 0x20000, + QELR_MSG_SQ = 0x40000, + QELR_MSG_QP = (QELR_MSG_SQ | QELR_MSG_RQ), + QELR_MSG_MR = 0x80000, + QELR_MSG_INIT = 0x100000, + QELR_MSG_SRQ = 0x200000, + /* to be added...up to 0x8000000 */ +}; + +enum DP_LEVEL { + QELR_LEVEL_VERBOSE = 0x0, + QELR_LEVEL_INFO = 0x1, + QELR_LEVEL_NOTICE = 0x2, + QELR_LEVEL_ERR = 0x3, +}; + +#define DP_ERR(fd, fmt, ...) \ +do { \ + fprintf(fd, "[%s:%d]" fmt, \ + __func__, __LINE__, \ + ##__VA_ARGS__); \ + fflush(fd); \ +} while (0) + +#define DP_NOTICE(fd, fmt, ...) \ +do { \ + if (qelr_dp_level <= QELR_LEVEL_NOTICE) {\ + fprintf(fd, "[%s:%d]" fmt, \ + __func__, __LINE__, \ + ##__VA_ARGS__); \ + fflush(fd); } \ +} while (0) + +#define DP_INFO(fd, fmt, ...) \ +do { \ + if (qelr_dp_level <= QELR_LEVEL_INFO) { \ + fprintf(fd, "[%s:%d]" fmt, \ + __func__, __LINE__, \ + ##__VA_ARGS__); fflush(fd); \ + } \ +} while (0) + +#define DP_VERBOSE(fd, module, fmt, ...) 
\ +do { \ + if ((qelr_dp_level <= QELR_LEVEL_VERBOSE) && \ + (qelr_dp_module & (module))) { \ + fprintf(fd, "[%s:%d]" fmt, \ + __func__, __LINE__, \ + ##__VA_ARGS__); fflush(fd); } \ +} while (0) + +struct qelr_buf { + void *addr; + size_t len; /* a 64 uint is used as s preparation + * for double layer pbl. + */ +}; + +#define IS_IWARP(_dev) (_dev->node_type == IBV_NODE_RNIC) +#define IS_ROCE(_dev) (_dev->node_type == IBV_NODE_CA) + +struct qelr_device { + struct verbs_device ibv_dev; +}; + +enum qelr_dpm_flags { + QELR_DPM_FLAGS_ENHANCED = (1 << 0), + QELR_DPM_FLAGS_LEGACY = (1 << 1), +}; + +struct qelr_devctx { + struct verbs_context ibv_ctx; + FILE *dbg_fp; + void *db_addr; + uint64_t db_pa; + struct qedr_user_db_rec db_rec_addr_dummy; + uint32_t db_size; + enum qelr_dpm_flags dpm_flags; + uint32_t kernel_page_size; + uint16_t ldpm_limit_size; + uint8_t edpm_trans_size; + + uint32_t max_send_wr; + uint32_t max_recv_wr; + uint32_t max_srq_wr; + uint32_t sges_per_send_wr; + uint32_t sges_per_recv_wr; + uint32_t sges_per_srq_wr; + int max_cqes; +}; + +struct qelr_pd { + struct ibv_pd ibv_pd; + uint32_t pd_id; +}; + +struct qelr_mr { + struct verbs_mr vmr; +}; + +union db_prod64 { + struct rdma_pwm_val32_data data; + uint64_t raw; +}; + +struct qelr_cq { + struct ibv_cq ibv_cq; /* must be first */ + + struct qelr_chain chain; + + void *db_addr; + union db_prod64 db; + /* Doorbell recovery entry address */ + void *db_rec_map; + struct qedr_user_db_rec *db_rec_addr; + + uint8_t chain_toggle; + union rdma_cqe *latest_cqe; + union rdma_cqe *toggle_cqe; + + uint8_t arm_flags; +}; + +enum qelr_qp_state { + QELR_QPS_RST, + QELR_QPS_INIT, + QELR_QPS_RTR, + QELR_QPS_RTS, + QELR_QPS_SQD, + QELR_QPS_ERR, + QELR_QPS_SQE +}; + +union db_prod32 { + struct rdma_pwm_val16_data data; + uint32_t raw; +}; + +struct qelr_qp_hwq_info { + /* WQE */ + struct qelr_chain chain; + uint8_t max_sges; + + /* WQ */ + uint16_t prod; + uint16_t wqe_cons; + uint16_t cons; + uint16_t max_wr; + + /* DB */ + void *db; /* Doorbell address */ + void *edpm_db; + union db_prod32 db_data; /* Doorbell data */ + /* Doorbell recovery entry address */ + void *db_rec_map; + struct qedr_user_db_rec *db_rec_addr; + void *iwarp_db2; + union db_prod32 iwarp_db2_data; + + uint16_t icid; +}; + +struct qelr_rdma_ext { + __be64 remote_va; + __be32 remote_key; + __be32 dma_length; +}; + +/* rdma extension, invalidate / immediate data + padding, inline data... 
*/ +#define QELR_MAX_DPM_PAYLOAD (sizeof(struct qelr_rdma_ext) + sizeof(uint64_t) +\ + ROCE_REQ_MAX_INLINE_DATA_SIZE) +struct qelr_dpm { + uint8_t is_edpm; + uint8_t is_ldpm; + union { + struct db_roce_dpm_data data; + uint64_t raw; + } msg; + + uint8_t payload[QELR_MAX_DPM_PAYLOAD]; + uint32_t payload_size; + uint32_t payload_offset; + + struct qelr_rdma_ext *rdma_ext; +}; + +struct qelr_srq_hwq_info { + uint32_t max_sges; + uint32_t max_wr; + struct qelr_chain chain; + uint32_t wqe_prod; /* WQE prod index in HW ring */ + uint32_t sge_prod; /* SGE prod index in HW ring */ + uint32_t wr_prod_cnt; /* wr producer count */ + uint32_t wr_cons_cnt; /* wr consumer count */ + uint32_t num_elems; + + void *virt_prod_pair_addr; /* producer pair virtual address */ +}; + +struct qelr_srq { + struct ibv_srq ibv_srq; + struct qelr_srq_hwq_info hw_srq; + uint16_t srq_id; + pthread_spinlock_t lock; +}; + +struct qelr_qp { + struct ibv_qp ibv_qp; + pthread_spinlock_t q_lock; + enum qelr_qp_state state; /* QP state */ + + struct qelr_qp_hwq_info sq; + struct qelr_qp_hwq_info rq; + struct { + uint64_t wr_id; + enum ibv_wc_opcode opcode; + uint32_t bytes_len; + uint8_t wqe_size; + uint8_t signaled; + } *wqe_wr_id; + + struct { + uint64_t wr_id; + uint8_t wqe_size; + } *rqe_wr_id; + + uint8_t prev_wqe_size; + uint32_t max_inline_data; + uint32_t qp_id; + int sq_sig_all; + int atomic_supported; + uint8_t edpm_disabled; + struct qelr_srq *srq; +}; + +static inline struct qelr_devctx *get_qelr_ctx(struct ibv_context *ibctx) +{ + return container_of(ibctx, struct qelr_devctx, ibv_ctx.context); +} + +static inline struct qelr_device *get_qelr_dev(struct ibv_device *ibdev) +{ + return container_of(ibdev, struct qelr_device, ibv_dev.device); +} + +static inline struct qelr_qp *get_qelr_qp(struct ibv_qp *ibqp) +{ + return container_of(ibqp, struct qelr_qp, ibv_qp); +} + +static inline struct qelr_pd *get_qelr_pd(struct ibv_pd *ibpd) +{ + return container_of(ibpd, struct qelr_pd, ibv_pd); +} + +static inline struct qelr_cq *get_qelr_cq(struct ibv_cq *ibcq) +{ + return container_of(ibcq, struct qelr_cq, ibv_cq); +} + +static inline struct qelr_srq *get_qelr_srq(struct ibv_srq *ibsrq) +{ + return container_of(ibsrq, struct qelr_srq, ibv_srq); +} + +#define SET_FIELD(value, name, flag) \ + do { \ + (value) &= ~(name ## _MASK << name ## _SHIFT); \ + (value) |= ((flag) << (name ## _SHIFT)); \ + } while (0) + +#define SET_FIELD2(value, name, flag) \ + ((value) |= ((flag) << (name ## _SHIFT))) + +#define GET_FIELD(value, name) \ + (((value) >> (name ## _SHIFT)) & name ## _MASK) + +#define ROCE_WQE_ELEM_SIZE sizeof(struct rdma_sq_sge) +#define RDMA_WQE_BYTES (16) + +#define QELR_RESP_IMM (RDMA_CQE_RESPONDER_IMM_FLG_MASK << \ + RDMA_CQE_RESPONDER_IMM_FLG_SHIFT) +#define QELR_RESP_INV (RDMA_CQE_RESPONDER_INV_FLG_MASK << \ + RDMA_CQE_RESPONDER_INV_FLG_SHIFT) +#define QELR_RESP_RDMA (RDMA_CQE_RESPONDER_RDMA_FLG_MASK << \ + RDMA_CQE_RESPONDER_RDMA_FLG_SHIFT) +#define QELR_RESP_RDMA_IMM (QELR_RESP_IMM | QELR_RESP_RDMA) + +#define TYPEPTR_ADDR_SET(type_ptr, field, vaddr) \ + do { \ + (type_ptr)->field.hi = htole32(U64_HI(vaddr)); \ + (type_ptr)->field.lo = htole32(U64_LO(vaddr)); \ + } while (0) + +#define RQ_SGE_SET(sge, vaddr, vlength, vflags) \ + do { \ + TYPEPTR_ADDR_SET(sge, addr, vaddr); \ + (sge)->length = htole32(vlength); \ + (sge)->flags = htole32(vflags); \ + } while (0) + +#define SRQ_HDR_SET(hdr, vwr_id, num_sge) \ + do { \ + TYPEPTR_ADDR_SET(hdr, wr_id, vwr_id); \ + (hdr)->num_sges = num_sge; \ + } while (0) + +#define 
SRQ_SGE_SET(sge, vaddr, vlength, vlkey) \ + do { \ + TYPEPTR_ADDR_SET(sge, addr, vaddr); \ + (sge)->length = htole32(vlength); \ + (sge)->l_key = htole32(vlkey); \ + } while (0) + +#define U64_HI(val) ((uint32_t)(((uint64_t)(uintptr_t)(val)) >> 32)) +#define U64_LO(val) ((uint32_t)(((uint64_t)(uintptr_t)(val)) & 0xffffffff)) +#define HILO_U64(hi, lo) ((uintptr_t)((((uint64_t)(hi)) << 32) + (lo))) + +#define QELR_MAX_RQ_WQE_SIZE (RDMA_MAX_SGE_PER_RQ_WQE) +#define QELR_MAX_SQ_WQE_SIZE (ROCE_REQ_MAX_SINGLE_SQ_WQE_SIZE / \ + ROCE_WQE_ELEM_SIZE) + +#endif /* __QELR_H__ */ diff --git a/providers/qedr/qelr_abi.h b/providers/qedr/qelr_abi.h new file mode 100644 index 0000000..345872c --- /dev/null +++ b/providers/qedr/qelr_abi.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __QELR_ABI_H__ +#define __QELR_ABI_H__ + +#include <infiniband/kern-abi.h> +#include <rdma/qedr-abi.h> +#include <kernel-abi/qedr-abi.h> + +#define QELR_ABI_VERSION (8) + +DECLARE_DRV_CMD(qelr_alloc_pd, IB_USER_VERBS_CMD_ALLOC_PD, + empty, qedr_alloc_pd_uresp); +DECLARE_DRV_CMD(qelr_create_cq, IB_USER_VERBS_CMD_CREATE_CQ, + qedr_create_cq_ureq, qedr_create_cq_uresp); +DECLARE_DRV_CMD(qelr_create_qp, IB_USER_VERBS_CMD_CREATE_QP, + qedr_create_qp_ureq, qedr_create_qp_uresp); +DECLARE_DRV_CMD(qelr_alloc_context, IB_USER_VERBS_CMD_GET_CONTEXT, + qedr_alloc_ucontext_req, qedr_alloc_ucontext_resp); +DECLARE_DRV_CMD(qelr_reg_mr, IB_USER_VERBS_CMD_REG_MR, + empty, empty); +DECLARE_DRV_CMD(qelr_create_srq, IB_USER_VERBS_CMD_CREATE_SRQ, + qedr_create_srq_ureq, qedr_create_srq_uresp); + +#endif /* __QELR_ABI_H__ */ diff --git a/providers/qedr/qelr_chain.c b/providers/qedr/qelr_chain.c new file mode 100644 index 0000000..26d0d0a --- /dev/null +++ b/providers/qedr/qelr_chain.c @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <stdio.h>
+#include <string.h>
+#include <endian.h>
+#include <errno.h>
+
+#include "qelr.h"
+
+void *qelr_chain_get_last_elem(struct qelr_chain *p_chain)
+{
+	void *p_virt_addr = NULL;
+	uint32_t size;
+
+	if (!p_chain->first_addr)
+		goto out;
+
+	size = p_chain->elem_size * (p_chain->n_elems - 1);
+	p_virt_addr = ((uint8_t *)p_chain->first_addr + size);
+out:
+	return p_virt_addr;
+}
+
+void qelr_chain_reset(struct qelr_chain *p_chain)
+{
+	p_chain->prod_idx = 0;
+	p_chain->cons_idx = 0;
+
+	p_chain->p_cons_elem = p_chain->first_addr;
+	p_chain->p_prod_elem = p_chain->first_addr;
+}
+
+#define QELR_ANON_FD	 (-1)	/* MAP_ANONYMOUS => file desc. = -1 */
+#define QELR_ANON_OFFSET (0)	/* MAP_ANONYMOUS => offset is don't-care */
+
+int qelr_chain_alloc(struct qelr_chain *chain, int chain_size, int page_size,
+		     uint16_t elem_size)
+{
+	int ret, a_chain_size;
+	void *addr;
+
+	/* allocate a page-aligned chain */
+	a_chain_size = (chain_size + page_size - 1) & ~(page_size - 1);
+	addr = mmap(NULL, a_chain_size, PROT_READ | PROT_WRITE,
+		    MAP_PRIVATE | MAP_ANONYMOUS, QELR_ANON_FD,
+		    QELR_ANON_OFFSET);
+	if (addr == MAP_FAILED)
+		return errno;
+
+	ret = ibv_dontfork_range(addr, a_chain_size);
+	if (ret) {
+		munmap(addr, a_chain_size);
+		return ret;
+	}
+
+	/* init chain */
+	memset(chain, 0, sizeof(*chain));
+	chain->first_addr = addr;
+	chain->size = a_chain_size;
+	chain->p_cons_elem = chain->first_addr;
+	chain->p_prod_elem = chain->first_addr;
+	chain->elem_size = elem_size;
+	chain->n_elems = chain->size / elem_size;
+	chain->last_addr = (void *)
+			   ((uint8_t *)addr + (elem_size * (chain->n_elems - 1)));
+
+	/* Note: since we are using MAP_ANONYMOUS the chain is zeroed for us */
+
+	return 0;
+}
+
+void qelr_chain_free(struct qelr_chain *chain)
+{
+	if (chain->size) {
+		ibv_dofork_range(chain->first_addr, chain->size);
+		munmap(chain->first_addr, chain->size);
+	}
+}
diff --git a/providers/qedr/qelr_chain.h b/providers/qedr/qelr_chain.h
new file mode 100644
index 0000000..5b6e324
--- /dev/null
+++ b/providers/qedr/qelr_chain.h
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2015-2016 QLogic Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __QELR_CHAIN_H__ +#define __QELR_CHAIN_H__ + +#include <stddef.h> +#include <stdint.h> + +struct qelr_chain { + void *first_addr; /* Address of first element in chain */ + void *last_addr; /* Address of last element in chain */ + + /* Point to next element to produce/consume */ + void *p_prod_elem; + void *p_cons_elem; + + uint32_t prod_idx; + uint32_t cons_idx; + + uint32_t n_elems; + uint32_t size; + uint16_t elem_size; +}; + +/* fast path functions are inline */ + +static inline uint32_t qelr_chain_get_cons_idx_u32(struct qelr_chain *p_chain) +{ + return p_chain->cons_idx; +} + +static inline void *qelr_chain_produce(struct qelr_chain *p_chain) +{ + void *p_ret = NULL; + + p_chain->prod_idx++; + + p_ret = p_chain->p_prod_elem; + + if (p_chain->p_prod_elem == p_chain->last_addr) + p_chain->p_prod_elem = p_chain->first_addr; + else + p_chain->p_prod_elem = (void *)(((uint8_t *)p_chain->p_prod_elem) + + p_chain->elem_size); + + return p_ret; +} + +static inline void *qelr_chain_produce_n(struct qelr_chain *p_chain, int n) +{ + void *p_ret = NULL; + int n_wrap; + + p_chain->prod_idx++; + p_ret = p_chain->p_prod_elem; + + n_wrap = p_chain->prod_idx % p_chain->n_elems; + if (n_wrap < n) + p_chain->p_prod_elem = (void *) + (((uint8_t *)p_chain->first_addr) + + (p_chain->elem_size * n_wrap)); + else + p_chain->p_prod_elem = (void *)(((uint8_t *)p_chain->p_prod_elem) + + (p_chain->elem_size * n)); + + return p_ret; +} + +static inline void *qelr_chain_consume(struct qelr_chain *p_chain) +{ + void *p_ret = NULL; + + p_chain->cons_idx++; + + p_ret = p_chain->p_cons_elem; + + if (p_chain->p_cons_elem == p_chain->last_addr) + p_chain->p_cons_elem = p_chain->first_addr; + else + p_chain->p_cons_elem = (void *) + (((uint8_t *)p_chain->p_cons_elem) + + p_chain->elem_size); + + return p_ret; +} + +static inline void *qelr_chain_consume_n(struct qelr_chain *p_chain, int n) +{ + void *p_ret = NULL; + int n_wrap; + + p_chain->cons_idx += n; + p_ret = p_chain->p_cons_elem; + + n_wrap = p_chain->cons_idx % p_chain->n_elems; + if (n_wrap < n) + p_chain->p_cons_elem = (void *) + (((uint8_t *)p_chain->first_addr) + + (p_chain->elem_size * n_wrap)); + else + p_chain->p_cons_elem = (void *)(((uint8_t *)p_chain->p_cons_elem) + + (p_chain->elem_size * n)); + + return p_ret; +} 
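+/*
+ * Usage sketch (illustrative, with a hypothetical caller): the chain is
+ * a fixed-size ring of elem_size slots; qelr_chain_produce() hands out
+ * the next free slot and qelr_chain_consume() retires the oldest one,
+ * and both wrap from last_addr back to first_addr.  Posting and later
+ * completing a single element looks roughly like:
+ *
+ *	void *elem = qelr_chain_produce(chain);	// claim a slot, prod_idx++
+ *
+ *	memset(elem, 0, chain->elem_size);	// build the WQE in place
+ *	// ... ring the doorbell; hardware processes the element ...
+ *	elem = qelr_chain_consume(chain);	// retire it, cons_idx++
+ */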
+ +static inline uint32_t qelr_chain_get_elem_left_u32(struct qelr_chain *p_chain) +{ + uint32_t used; + + used = (uint32_t)(((uint64_t)((uint64_t) ~0U) + 1 + + (uint64_t)(p_chain->prod_idx)) - + (uint64_t)p_chain->cons_idx); + + return p_chain->n_elems - used; +} + +static inline uint8_t qelr_chain_is_full(struct qelr_chain *p_chain) +{ + return qelr_chain_get_elem_left_u32(p_chain) == p_chain->n_elems; +} + +static inline void qelr_chain_set_prod( + struct qelr_chain *p_chain, + uint32_t prod_idx, + void *p_prod_elem) +{ + p_chain->prod_idx = prod_idx; + p_chain->p_prod_elem = p_prod_elem; +} + +void *qelr_chain_get_last_elem(struct qelr_chain *p_chain); +void qelr_chain_reset(struct qelr_chain *p_chain); +int qelr_chain_alloc(struct qelr_chain *chain, int chain_size, int page_size, + uint16_t elem_size); +void qelr_chain_free(struct qelr_chain *buf); + +#endif /* __QELR_CHAIN_H__ */ diff --git a/providers/qedr/qelr_hsi.h b/providers/qedr/qelr_hsi.h new file mode 100644 index 0000000..8eaf183 --- /dev/null +++ b/providers/qedr/qelr_hsi.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __QED_HSI_ROCE__ +#define __QED_HSI_ROCE__ +/********************************/ +/* Add include to common target */ +/********************************/ +#include "common_hsi.h" + +/************************************************************************/ +/* Add include to common roce target for both eCore and protocol roce driver */ +/************************************************************************/ +#include "roce_common.h" +/************************************************************************/ +/* Add include to qed hsi rdma target for both roce and iwarp qed driver */ +/************************************************************************/ +#include "qelr_hsi_rdma.h" + +/* Affiliated asynchronous events / errors enumeration */ +enum roce_async_events_type +{ + ROCE_ASYNC_EVENT_NONE, + ROCE_ASYNC_EVENT_COMM_EST, + ROCE_ASYNC_EVENT_SQ_DRAINED, + ROCE_ASYNC_EVENT_SRQ_LIMIT, + ROCE_ASYNC_EVENT_LAST_WQE_REACHED, + ROCE_ASYNC_EVENT_CQ_ERR, + ROCE_ASYNC_EVENT_LOCAL_INVALID_REQUEST_ERR, + ROCE_ASYNC_EVENT_LOCAL_CATASTROPHIC_ERR, + ROCE_ASYNC_EVENT_LOCAL_ACCESS_ERR, + ROCE_ASYNC_EVENT_QP_CATASTROPHIC_ERR, + ROCE_ASYNC_EVENT_CQ_OVERFLOW_ERR, + ROCE_ASYNC_EVENT_SRQ_EMPTY, + MAX_ROCE_ASYNC_EVENTS_TYPE +}; + +#endif /* __QED_HSI_ROCE__ */ diff --git a/providers/qedr/qelr_hsi_rdma.h b/providers/qedr/qelr_hsi_rdma.h new file mode 100644 index 0000000..ced75d4 --- /dev/null +++ b/providers/qedr/qelr_hsi_rdma.h @@ -0,0 +1,913 @@ +/* + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __QED_HSI_RDMA__ +#define __QED_HSI_RDMA__ + +#include "common_hsi.h" +#include "rdma_common.h" + +/* + * rdma completion notification queue element + */ +struct rdma_cnqe +{ + struct regpair cq_handle; +}; + + +struct rdma_cqe_responder +{ + struct regpair srq_wr_id; + struct regpair qp_handle; + __le32 imm_data_or_inv_r_Key /* immediate data in case imm_flg is set, or invalidated r_key in case inv_flg is set */; + __le32 length; + __le32 imm_data_hi /* High bytes of immediate data in case imm_flg is set in iWARP only */; + __le16 rq_cons /* Valid only when status is WORK_REQUEST_FLUSHED_ERR. Indicates an aggregative flush on all posted RQ WQEs until the reported rq_cons. 
*/; + uint8_t flags; +#define RDMA_CQE_RESPONDER_TOGGLE_BIT_MASK 0x1 /* indicates a valid completion written by FW. FW toggle this bit each time it finishes producing all PBL entries */ +#define RDMA_CQE_RESPONDER_TOGGLE_BIT_SHIFT 0 +#define RDMA_CQE_RESPONDER_TYPE_MASK 0x3 /* (use enum rdma_cqe_type) */ +#define RDMA_CQE_RESPONDER_TYPE_SHIFT 1 +#define RDMA_CQE_RESPONDER_INV_FLG_MASK 0x1 /* r_key invalidated indicator */ +#define RDMA_CQE_RESPONDER_INV_FLG_SHIFT 3 +#define RDMA_CQE_RESPONDER_IMM_FLG_MASK 0x1 /* immediate data indicator */ +#define RDMA_CQE_RESPONDER_IMM_FLG_SHIFT 4 +#define RDMA_CQE_RESPONDER_RDMA_FLG_MASK 0x1 /* 1=this CQE relates to an RDMA Write. 0=Send. */ +#define RDMA_CQE_RESPONDER_RDMA_FLG_SHIFT 5 +#define RDMA_CQE_RESPONDER_RESERVED2_MASK 0x3 +#define RDMA_CQE_RESPONDER_RESERVED2_SHIFT 6 + uint8_t status; +}; + +struct rdma_cqe_requester +{ + __le16 sq_cons; + __le16 reserved0; + __le32 reserved1; + struct regpair qp_handle; + struct regpair reserved2; + __le32 reserved3; + __le16 reserved4; + uint8_t flags; +#define RDMA_CQE_REQUESTER_TOGGLE_BIT_MASK 0x1 /* indicates a valid completion written by FW. FW toggle this bit each time it finishes producing all PBL entries */ +#define RDMA_CQE_REQUESTER_TOGGLE_BIT_SHIFT 0 +#define RDMA_CQE_REQUESTER_TYPE_MASK 0x3 /* (use enum rdma_cqe_type) */ +#define RDMA_CQE_REQUESTER_TYPE_SHIFT 1 +#define RDMA_CQE_REQUESTER_RESERVED5_MASK 0x1F +#define RDMA_CQE_REQUESTER_RESERVED5_SHIFT 3 + uint8_t status; +}; + +struct rdma_cqe_common +{ + struct regpair reserved0; + struct regpair qp_handle; + __le16 reserved1[7]; + uint8_t flags; +#define RDMA_CQE_COMMON_TOGGLE_BIT_MASK 0x1 /* indicates a valid completion written by FW. FW toggle this bit each time it finishes producing all PBL entries */ +#define RDMA_CQE_COMMON_TOGGLE_BIT_SHIFT 0 +#define RDMA_CQE_COMMON_TYPE_MASK 0x3 /* (use enum rdma_cqe_type) */ +#define RDMA_CQE_COMMON_TYPE_SHIFT 1 +#define RDMA_CQE_COMMON_RESERVED2_MASK 0x1F +#define RDMA_CQE_COMMON_RESERVED2_SHIFT 3 + uint8_t status; +}; + +/* + * rdma completion queue element + */ +union rdma_cqe +{ + struct rdma_cqe_responder resp; + struct rdma_cqe_requester req; + struct rdma_cqe_common cmn; +}; + + + + +/* + * CQE requester status enumeration + */ +enum rdma_cqe_requester_status_enum +{ + RDMA_CQE_REQ_STS_OK, + RDMA_CQE_REQ_STS_BAD_RESPONSE_ERR, + RDMA_CQE_REQ_STS_LOCAL_LENGTH_ERR, + RDMA_CQE_REQ_STS_LOCAL_QP_OPERATION_ERR, + RDMA_CQE_REQ_STS_LOCAL_PROTECTION_ERR, + RDMA_CQE_REQ_STS_MEMORY_MGT_OPERATION_ERR, + RDMA_CQE_REQ_STS_REMOTE_INVALID_REQUEST_ERR, + RDMA_CQE_REQ_STS_REMOTE_ACCESS_ERR, + RDMA_CQE_REQ_STS_REMOTE_OPERATION_ERR, + RDMA_CQE_REQ_STS_RNR_NAK_RETRY_CNT_ERR, + RDMA_CQE_REQ_STS_TRANSPORT_RETRY_CNT_ERR, + RDMA_CQE_REQ_STS_WORK_REQUEST_FLUSHED_ERR, + MAX_RDMA_CQE_REQUESTER_STATUS_ENUM +}; + + + +/* + * CQE responder status enumeration + */ +enum rdma_cqe_responder_status_enum +{ + RDMA_CQE_RESP_STS_OK, + RDMA_CQE_RESP_STS_LOCAL_ACCESS_ERR, + RDMA_CQE_RESP_STS_LOCAL_LENGTH_ERR, + RDMA_CQE_RESP_STS_LOCAL_QP_OPERATION_ERR, + RDMA_CQE_RESP_STS_LOCAL_PROTECTION_ERR, + RDMA_CQE_RESP_STS_MEMORY_MGT_OPERATION_ERR, + RDMA_CQE_RESP_STS_REMOTE_INVALID_REQUEST_ERR, + RDMA_CQE_RESP_STS_WORK_REQUEST_FLUSHED_ERR, + MAX_RDMA_CQE_RESPONDER_STATUS_ENUM +}; + + +/* + * CQE type enumeration + */ +enum rdma_cqe_type +{ + RDMA_CQE_TYPE_REQUESTER, + RDMA_CQE_TYPE_RESPONDER_RQ, + RDMA_CQE_TYPE_RESPONDER_SRQ, + RDMA_CQE_TYPE_INVALID, + MAX_RDMA_CQE_TYPE +}; + + +/* + * DIF Block size options + */ +enum rdma_dif_block_size +{ + 
RDMA_DIF_BLOCK_512=0, + RDMA_DIF_BLOCK_4096=1, + MAX_RDMA_DIF_BLOCK_SIZE +}; + + +/* + * DIF CRC initial value + */ +enum rdma_dif_crc_seed +{ + RDMA_DIF_CRC_SEED_0000=0, + RDMA_DIF_CRC_SEED_FFFF=1, + MAX_RDMA_DIF_CRC_SEED +}; + + +/* + * RDMA DIF Error Result Structure + */ +struct rdma_dif_error_result +{ + __le32 error_intervals /* Total number of error intervals in the IO. */; + __le32 dif_error_1st_interval /* Number of the first interval that contained error. Set to 0xFFFFFFFF if error occurred in the Runt Block. */; + uint8_t flags; +#define RDMA_DIF_ERROR_RESULT_DIF_ERROR_TYPE_CRC_MASK 0x1 /* CRC error occurred. */ +#define RDMA_DIF_ERROR_RESULT_DIF_ERROR_TYPE_CRC_SHIFT 0 +#define RDMA_DIF_ERROR_RESULT_DIF_ERROR_TYPE_APP_TAG_MASK 0x1 /* App Tag error occurred. */ +#define RDMA_DIF_ERROR_RESULT_DIF_ERROR_TYPE_APP_TAG_SHIFT 1 +#define RDMA_DIF_ERROR_RESULT_DIF_ERROR_TYPE_REF_TAG_MASK 0x1 /* Ref Tag error occurred. */ +#define RDMA_DIF_ERROR_RESULT_DIF_ERROR_TYPE_REF_TAG_SHIFT 2 +#define RDMA_DIF_ERROR_RESULT_RESERVED0_MASK 0xF +#define RDMA_DIF_ERROR_RESULT_RESERVED0_SHIFT 3 +#define RDMA_DIF_ERROR_RESULT_TOGGLE_BIT_MASK 0x1 /* Used to indicate the structure is valid. Toggles each time an invalidate region is performed. */ +#define RDMA_DIF_ERROR_RESULT_TOGGLE_BIT_SHIFT 7 + uint8_t reserved1[55] /* Pad to 64 bytes to ensure efficient word line writing. */; +}; + + +/* + * DIF IO direction + */ +enum rdma_dif_io_direction_flg +{ + RDMA_DIF_DIR_RX=0, + RDMA_DIF_DIR_TX=1, + MAX_RDMA_DIF_IO_DIRECTION_FLG +}; + + +/* + * RDMA DIF Runt Result Structure + */ +struct rdma_dif_runt_result +{ + __le16 guard_tag /* CRC result of received IO. */; + __le16 reserved[3]; +}; + + +/* + * memory window type enumeration + */ +enum rdma_mw_type +{ + RDMA_MW_TYPE_1, + RDMA_MW_TYPE_2A, + MAX_RDMA_MW_TYPE +}; + + +struct rdma_rq_sge +{ + struct regpair addr; + __le32 length; + __le32 flags; +#define RDMA_RQ_SGE_L_KEY_MASK 0x3FFFFFF /* key of memory relating to this RQ */ +#define RDMA_RQ_SGE_L_KEY_SHIFT 0 +#define RDMA_RQ_SGE_NUM_SGES_MASK 0x7 /* first SGE - number of SGEs in this RQ WQE. 
Other SGEs - should be set to 0 */ +#define RDMA_RQ_SGE_NUM_SGES_SHIFT 26 +#define RDMA_RQ_SGE_RESERVED0_MASK 0x7 +#define RDMA_RQ_SGE_RESERVED0_SHIFT 29 +}; + + +struct rdma_sq_atomic_wqe +{ + __le32 reserved1; + __le32 length /* Total data length (8 bytes for Atomic) */; + __le32 xrc_srq /* Valid only when XRC is set for the QP */; + uint8_t req_type /* Type of WQE */; + uint8_t flags; +#define RDMA_SQ_ATOMIC_WQE_COMP_FLG_MASK 0x1 /* If set, completion will be generated when the WQE is completed */ +#define RDMA_SQ_ATOMIC_WQE_COMP_FLG_SHIFT 0 +#define RDMA_SQ_ATOMIC_WQE_RD_FENCE_FLG_MASK 0x1 /* If set, all pending RDMA read or Atomic operations will be completed before start processing this WQE */ +#define RDMA_SQ_ATOMIC_WQE_RD_FENCE_FLG_SHIFT 1 +#define RDMA_SQ_ATOMIC_WQE_INV_FENCE_FLG_MASK 0x1 /* If set, all pending operations will be completed before start processing this WQE */ +#define RDMA_SQ_ATOMIC_WQE_INV_FENCE_FLG_SHIFT 2 +#define RDMA_SQ_ATOMIC_WQE_SE_FLG_MASK 0x1 /* Don't care for atomic wqe */ +#define RDMA_SQ_ATOMIC_WQE_SE_FLG_SHIFT 3 +#define RDMA_SQ_ATOMIC_WQE_INLINE_FLG_MASK 0x1 /* Should be 0 for atomic wqe */ +#define RDMA_SQ_ATOMIC_WQE_INLINE_FLG_SHIFT 4 +#define RDMA_SQ_ATOMIC_WQE_DIF_ON_HOST_FLG_MASK 0x1 /* Should be 0 for atomic wqe */ +#define RDMA_SQ_ATOMIC_WQE_DIF_ON_HOST_FLG_SHIFT 5 +#define RDMA_SQ_ATOMIC_WQE_RESERVED0_MASK 0x3 +#define RDMA_SQ_ATOMIC_WQE_RESERVED0_SHIFT 6 + uint8_t wqe_size /* Size of WQE in 16B chunks including SGE */; + uint8_t prev_wqe_size /* Previous WQE size in 16B chunks */; + struct regpair remote_va /* remote virtual address */; + __le32 r_key /* Remote key */; + __le32 reserved2; + struct regpair cmp_data /* Data to compare in case of ATOMIC_CMP_AND_SWAP */; + struct regpair swap_data /* Swap or add data */; +}; + + +/* + * First element (16 bytes) of atomic wqe + */ +struct rdma_sq_atomic_wqe_1st +{ + __le32 reserved1; + __le32 length /* Total data length (8 bytes for Atomic) */; + __le32 xrc_srq /* Valid only when XRC is set for the QP */; + uint8_t req_type /* Type of WQE */; + uint8_t flags; +#define RDMA_SQ_ATOMIC_WQE_1ST_COMP_FLG_MASK 0x1 /* If set, completion will be generated when the WQE is completed */ +#define RDMA_SQ_ATOMIC_WQE_1ST_COMP_FLG_SHIFT 0 +#define RDMA_SQ_ATOMIC_WQE_1ST_RD_FENCE_FLG_MASK 0x1 /* If set, all pending RDMA read or Atomic operations will be completed before start processing this WQE */ +#define RDMA_SQ_ATOMIC_WQE_1ST_RD_FENCE_FLG_SHIFT 1 +#define RDMA_SQ_ATOMIC_WQE_1ST_INV_FENCE_FLG_MASK 0x1 /* If set, all pending operations will be completed before start processing this WQE */ +#define RDMA_SQ_ATOMIC_WQE_1ST_INV_FENCE_FLG_SHIFT 2 +#define RDMA_SQ_ATOMIC_WQE_1ST_SE_FLG_MASK 0x1 /* Don't care for atomic wqe */ +#define RDMA_SQ_ATOMIC_WQE_1ST_SE_FLG_SHIFT 3 +#define RDMA_SQ_ATOMIC_WQE_1ST_INLINE_FLG_MASK 0x1 /* Should be 0 for atomic wqe */ +#define RDMA_SQ_ATOMIC_WQE_1ST_INLINE_FLG_SHIFT 4 +#define RDMA_SQ_ATOMIC_WQE_1ST_RESERVED0_MASK 0x7 +#define RDMA_SQ_ATOMIC_WQE_1ST_RESERVED0_SHIFT 5 + uint8_t wqe_size /* Size of WQE in 16B chunks including all SGEs. Set to number of SGEs + 1. 
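(the complete atomic WQE spans this first element plus the 2nd and 3rd elements defined below, i.e. wqe_size = 3)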
*/; + uint8_t prev_wqe_size /* Previous WQE size in 16B chunks */; +}; + + +/* + * Second element (16 bytes) of atomic wqe + */ +struct rdma_sq_atomic_wqe_2nd +{ + struct regpair remote_va /* remote virtual address */; + __le32 r_key /* Remote key */; + __le32 reserved2; +}; + + +/* + * Third element (16 bytes) of atomic wqe + */ +struct rdma_sq_atomic_wqe_3rd +{ + struct regpair cmp_data /* Data to compare in case of ATOMIC_CMP_AND_SWAP */; + struct regpair swap_data /* Swap or add data */; +}; + + +struct rdma_sq_bind_wqe +{ + struct regpair addr; + __le32 l_key; + uint8_t req_type /* Type of WQE */; + uint8_t flags; +#define RDMA_SQ_BIND_WQE_COMP_FLG_MASK 0x1 /* If set, completion will be generated when the WQE is completed */ +#define RDMA_SQ_BIND_WQE_COMP_FLG_SHIFT 0 +#define RDMA_SQ_BIND_WQE_RD_FENCE_FLG_MASK 0x1 /* If set, all pending RDMA read or Atomic operations will be completed before start processing this WQE */ +#define RDMA_SQ_BIND_WQE_RD_FENCE_FLG_SHIFT 1 +#define RDMA_SQ_BIND_WQE_INV_FENCE_FLG_MASK 0x1 /* If set, all pending operations will be completed before start processing this WQE */ +#define RDMA_SQ_BIND_WQE_INV_FENCE_FLG_SHIFT 2 +#define RDMA_SQ_BIND_WQE_SE_FLG_MASK 0x1 /* Don't care for bind wqe */ +#define RDMA_SQ_BIND_WQE_SE_FLG_SHIFT 3 +#define RDMA_SQ_BIND_WQE_INLINE_FLG_MASK 0x1 /* Should be 0 for bind wqe */ +#define RDMA_SQ_BIND_WQE_INLINE_FLG_SHIFT 4 +#define RDMA_SQ_BIND_WQE_RESERVED0_MASK 0x7 +#define RDMA_SQ_BIND_WQE_RESERVED0_SHIFT 5 + uint8_t wqe_size /* Size of WQE in 16B chunks */; + uint8_t prev_wqe_size /* Previous WQE size in 16B chunks */; + uint8_t bind_ctrl; +#define RDMA_SQ_BIND_WQE_ZERO_BASED_MASK 0x1 /* zero based indication */ +#define RDMA_SQ_BIND_WQE_ZERO_BASED_SHIFT 0 +#define RDMA_SQ_BIND_WQE_MW_TYPE_MASK 0x1 /* (use enum rdma_mw_type) */ +#define RDMA_SQ_BIND_WQE_MW_TYPE_SHIFT 1 +#define RDMA_SQ_BIND_WQE_RESERVED1_MASK 0x3F +#define RDMA_SQ_BIND_WQE_RESERVED1_SHIFT 2 + uint8_t access_ctrl; +#define RDMA_SQ_BIND_WQE_REMOTE_READ_MASK 0x1 +#define RDMA_SQ_BIND_WQE_REMOTE_READ_SHIFT 0 +#define RDMA_SQ_BIND_WQE_REMOTE_WRITE_MASK 0x1 +#define RDMA_SQ_BIND_WQE_REMOTE_WRITE_SHIFT 1 +#define RDMA_SQ_BIND_WQE_ENABLE_ATOMIC_MASK 0x1 +#define RDMA_SQ_BIND_WQE_ENABLE_ATOMIC_SHIFT 2 +#define RDMA_SQ_BIND_WQE_LOCAL_READ_MASK 0x1 +#define RDMA_SQ_BIND_WQE_LOCAL_READ_SHIFT 3 +#define RDMA_SQ_BIND_WQE_LOCAL_WRITE_MASK 0x1 +#define RDMA_SQ_BIND_WQE_LOCAL_WRITE_SHIFT 4 +#define RDMA_SQ_BIND_WQE_RESERVED2_MASK 0x7 +#define RDMA_SQ_BIND_WQE_RESERVED2_SHIFT 5 + uint8_t reserved3; + uint8_t length_hi /* upper 8 bits of the registered MW length */; + __le32 length_lo /* lower 32 bits of the registered MW length */; + __le32 parent_l_key /* l_key of the parent MR */; + __le32 reserved4; +}; + + +/* + * First element (16 bytes) of bind wqe + */ +struct rdma_sq_bind_wqe_1st +{ + struct regpair addr; + __le32 l_key; + uint8_t req_type /* Type of WQE */; + uint8_t flags; +#define RDMA_SQ_BIND_WQE_1ST_COMP_FLG_MASK 0x1 /* If set, completion will be generated when the WQE is completed */ +#define RDMA_SQ_BIND_WQE_1ST_COMP_FLG_SHIFT 0 +#define RDMA_SQ_BIND_WQE_1ST_RD_FENCE_FLG_MASK 0x1 /* If set, all pending RDMA read or Atomic operations will be completed before start processing this WQE */ +#define RDMA_SQ_BIND_WQE_1ST_RD_FENCE_FLG_SHIFT 1 +#define RDMA_SQ_BIND_WQE_1ST_INV_FENCE_FLG_MASK 0x1 /* If set, all pending operations will be completed before start processing this WQE */ +#define RDMA_SQ_BIND_WQE_1ST_INV_FENCE_FLG_SHIFT 2 +#define 
RDMA_SQ_BIND_WQE_1ST_SE_FLG_MASK 0x1 /* Don't care for bind wqe */ +#define RDMA_SQ_BIND_WQE_1ST_SE_FLG_SHIFT 3 +#define RDMA_SQ_BIND_WQE_1ST_INLINE_FLG_MASK 0x1 /* Should be 0 for bind wqe */ +#define RDMA_SQ_BIND_WQE_1ST_INLINE_FLG_SHIFT 4 +#define RDMA_SQ_BIND_WQE_1ST_RESERVED0_MASK 0x7 +#define RDMA_SQ_BIND_WQE_1ST_RESERVED0_SHIFT 5 + uint8_t wqe_size /* Size of WQE in 16B chunks */; + uint8_t prev_wqe_size /* Previous WQE size in 16B chunks */; +}; + + +/* + * Second element (16 bytes) of bind wqe + */ +struct rdma_sq_bind_wqe_2nd +{ + uint8_t bind_ctrl; +#define RDMA_SQ_BIND_WQE_2ND_ZERO_BASED_MASK 0x1 /* zero based indication */ +#define RDMA_SQ_BIND_WQE_2ND_ZERO_BASED_SHIFT 0 +#define RDMA_SQ_BIND_WQE_2ND_MW_TYPE_MASK 0x1 /* (use enum rdma_mw_type) */ +#define RDMA_SQ_BIND_WQE_2ND_MW_TYPE_SHIFT 1 +#define RDMA_SQ_BIND_WQE_2ND_RESERVED1_MASK 0x3F +#define RDMA_SQ_BIND_WQE_2ND_RESERVED1_SHIFT 2 + uint8_t access_ctrl; +#define RDMA_SQ_BIND_WQE_2ND_REMOTE_READ_MASK 0x1 +#define RDMA_SQ_BIND_WQE_2ND_REMOTE_READ_SHIFT 0 +#define RDMA_SQ_BIND_WQE_2ND_REMOTE_WRITE_MASK 0x1 +#define RDMA_SQ_BIND_WQE_2ND_REMOTE_WRITE_SHIFT 1 +#define RDMA_SQ_BIND_WQE_2ND_ENABLE_ATOMIC_MASK 0x1 +#define RDMA_SQ_BIND_WQE_2ND_ENABLE_ATOMIC_SHIFT 2 +#define RDMA_SQ_BIND_WQE_2ND_LOCAL_READ_MASK 0x1 +#define RDMA_SQ_BIND_WQE_2ND_LOCAL_READ_SHIFT 3 +#define RDMA_SQ_BIND_WQE_2ND_LOCAL_WRITE_MASK 0x1 +#define RDMA_SQ_BIND_WQE_2ND_LOCAL_WRITE_SHIFT 4 +#define RDMA_SQ_BIND_WQE_2ND_RESERVED2_MASK 0x7 +#define RDMA_SQ_BIND_WQE_2ND_RESERVED2_SHIFT 5 + uint8_t reserved3; + uint8_t length_hi /* upper 8 bits of the registered MW length */; + __le32 length_lo /* lower 32 bits of the registered MW length */; + __le32 parent_l_key /* l_key of the parent MR */; + __le32 reserved4; +}; + + +/* + * Structure with only the SQ WQE common fields. Size is of one SQ element (16B) + */ +struct rdma_sq_common_wqe +{ + __le32 reserved1[3]; + uint8_t req_type /* Type of WQE */; + uint8_t flags; +#define RDMA_SQ_COMMON_WQE_COMP_FLG_MASK 0x1 /* If set, completion will be generated when the WQE is completed */ +#define RDMA_SQ_COMMON_WQE_COMP_FLG_SHIFT 0 +#define RDMA_SQ_COMMON_WQE_RD_FENCE_FLG_MASK 0x1 /* If set, all pending RDMA read or Atomic operations will be completed before start processing this WQE */ +#define RDMA_SQ_COMMON_WQE_RD_FENCE_FLG_SHIFT 1 +#define RDMA_SQ_COMMON_WQE_INV_FENCE_FLG_MASK 0x1 /* If set, all pending operations will be completed before start processing this WQE */ +#define RDMA_SQ_COMMON_WQE_INV_FENCE_FLG_SHIFT 2 +#define RDMA_SQ_COMMON_WQE_SE_FLG_MASK 0x1 /* If set, signal the responder to generate a solicited event on this WQE (only relevant in SENDs and RDMA write with Imm) */ +#define RDMA_SQ_COMMON_WQE_SE_FLG_SHIFT 3 +#define RDMA_SQ_COMMON_WQE_INLINE_FLG_MASK 0x1 /* if set, indicates inline data is following this WQE instead of SGEs (only relevant in SENDs and RDMA writes) */ +#define RDMA_SQ_COMMON_WQE_INLINE_FLG_SHIFT 4 +#define RDMA_SQ_COMMON_WQE_RESERVED0_MASK 0x7 +#define RDMA_SQ_COMMON_WQE_RESERVED0_SHIFT 5 + uint8_t wqe_size /* Size of WQE in 16B chunks including all SGEs or inline data. In case there are SGEs: set to number of SGEs + 1. In case of inline data: set to the whole number of 16B which contain the inline data + 1. 
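For example, 20 bytes of inline data occupy two 16B chunks, giving wqe_size = 3.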
*/; + uint8_t prev_wqe_size /* Previous WQE size in 16B chunks */; +}; + + +struct rdma_sq_fmr_wqe +{ + struct regpair addr; + __le32 l_key; + uint8_t req_type /* Type of WQE */; + uint8_t flags; +#define RDMA_SQ_FMR_WQE_COMP_FLG_MASK 0x1 /* If set, completion will be generated when the WQE is completed */ +#define RDMA_SQ_FMR_WQE_COMP_FLG_SHIFT 0 +#define RDMA_SQ_FMR_WQE_RD_FENCE_FLG_MASK 0x1 /* If set, all pending RDMA read or Atomic operations will be completed before start processing this WQE */ +#define RDMA_SQ_FMR_WQE_RD_FENCE_FLG_SHIFT 1 +#define RDMA_SQ_FMR_WQE_INV_FENCE_FLG_MASK 0x1 /* If set, all pending operations will be completed before start processing this WQE */ +#define RDMA_SQ_FMR_WQE_INV_FENCE_FLG_SHIFT 2 +#define RDMA_SQ_FMR_WQE_SE_FLG_MASK 0x1 /* Don't care for FMR wqe */ +#define RDMA_SQ_FMR_WQE_SE_FLG_SHIFT 3 +#define RDMA_SQ_FMR_WQE_INLINE_FLG_MASK 0x1 /* Should be 0 for FMR wqe */ +#define RDMA_SQ_FMR_WQE_INLINE_FLG_SHIFT 4 +#define RDMA_SQ_FMR_WQE_DIF_ON_HOST_FLG_MASK 0x1 /* If set, indicated host memory of this WQE is DIF protected. */ +#define RDMA_SQ_FMR_WQE_DIF_ON_HOST_FLG_SHIFT 5 +#define RDMA_SQ_FMR_WQE_RESERVED0_MASK 0x3 +#define RDMA_SQ_FMR_WQE_RESERVED0_SHIFT 6 + uint8_t wqe_size /* Size of WQE in 16B chunks */; + uint8_t prev_wqe_size /* Previous WQE size in 16B chunks */; + uint8_t fmr_ctrl; +#define RDMA_SQ_FMR_WQE_PAGE_SIZE_LOG_MASK 0x1F /* 0 is 4k, 1 is 8k... */ +#define RDMA_SQ_FMR_WQE_PAGE_SIZE_LOG_SHIFT 0 +#define RDMA_SQ_FMR_WQE_ZERO_BASED_MASK 0x1 /* zero based indication */ +#define RDMA_SQ_FMR_WQE_ZERO_BASED_SHIFT 5 +#define RDMA_SQ_FMR_WQE_BIND_EN_MASK 0x1 /* indication whether bind is enabled for this MR */ +#define RDMA_SQ_FMR_WQE_BIND_EN_SHIFT 6 +#define RDMA_SQ_FMR_WQE_RESERVED1_MASK 0x1 +#define RDMA_SQ_FMR_WQE_RESERVED1_SHIFT 7 + uint8_t access_ctrl; +#define RDMA_SQ_FMR_WQE_REMOTE_READ_MASK 0x1 +#define RDMA_SQ_FMR_WQE_REMOTE_READ_SHIFT 0 +#define RDMA_SQ_FMR_WQE_REMOTE_WRITE_MASK 0x1 +#define RDMA_SQ_FMR_WQE_REMOTE_WRITE_SHIFT 1 +#define RDMA_SQ_FMR_WQE_ENABLE_ATOMIC_MASK 0x1 +#define RDMA_SQ_FMR_WQE_ENABLE_ATOMIC_SHIFT 2 +#define RDMA_SQ_FMR_WQE_LOCAL_READ_MASK 0x1 +#define RDMA_SQ_FMR_WQE_LOCAL_READ_SHIFT 3 +#define RDMA_SQ_FMR_WQE_LOCAL_WRITE_MASK 0x1 +#define RDMA_SQ_FMR_WQE_LOCAL_WRITE_SHIFT 4 +#define RDMA_SQ_FMR_WQE_RESERVED2_MASK 0x7 +#define RDMA_SQ_FMR_WQE_RESERVED2_SHIFT 5 + uint8_t reserved3; + uint8_t length_hi /* upper 8 bits of the registered MR length */; + __le32 length_lo /* lower 32 bits of the registered MR length. In case of DIF the length is specified including the DIF guards. */; + struct regpair pbl_addr /* Address of PBL */; + __le32 dif_base_ref_tag /* Ref tag of the first DIF Block. */; + __le16 dif_app_tag /* App tag of all DIF Blocks. */; + __le16 dif_app_tag_mask /* Bitmask for verifying dif_app_tag. */; + __le16 dif_runt_crc_value /* In TX IO, in case the runt_valid_flg is set, this value is used to validate the last Block in the IO. */; + __le16 dif_flags; +#define RDMA_SQ_FMR_WQE_DIF_IO_DIRECTION_FLG_MASK 0x1 /* 0=RX, 1=TX (use enum rdma_dif_io_direction_flg) */ +#define RDMA_SQ_FMR_WQE_DIF_IO_DIRECTION_FLG_SHIFT 0 +#define RDMA_SQ_FMR_WQE_DIF_BLOCK_SIZE_MASK 0x1 /* DIF block size. 0=512B 1=4096B (use enum rdma_dif_block_size) */ +#define RDMA_SQ_FMR_WQE_DIF_BLOCK_SIZE_SHIFT 1 +#define RDMA_SQ_FMR_WQE_DIF_RUNT_VALID_FLG_MASK 0x1 /* In TX IO, indicates the runt_value field is valid. In RX IO, indicates the calculated runt value is to be placed on host buffer. 
*/ +#define RDMA_SQ_FMR_WQE_DIF_RUNT_VALID_FLG_SHIFT 2 +#define RDMA_SQ_FMR_WQE_DIF_VALIDATE_CRC_GUARD_MASK 0x1 /* In TX IO, indicates CRC of each DIF guard tag is checked. */ +#define RDMA_SQ_FMR_WQE_DIF_VALIDATE_CRC_GUARD_SHIFT 3 +#define RDMA_SQ_FMR_WQE_DIF_VALIDATE_REF_TAG_MASK 0x1 /* In TX IO, indicates Ref tag of each DIF guard tag is checked. */ +#define RDMA_SQ_FMR_WQE_DIF_VALIDATE_REF_TAG_SHIFT 4 +#define RDMA_SQ_FMR_WQE_DIF_VALIDATE_APP_TAG_MASK 0x1 /* In TX IO, indicates App tag of each DIF guard tag is checked. */ +#define RDMA_SQ_FMR_WQE_DIF_VALIDATE_APP_TAG_SHIFT 5 +#define RDMA_SQ_FMR_WQE_DIF_CRC_SEED_MASK 0x1 /* DIF CRC Seed to use. 0=0x000 1=0xFFFF (use enum rdma_dif_crc_seed) */ +#define RDMA_SQ_FMR_WQE_DIF_CRC_SEED_SHIFT 6 +#define RDMA_SQ_FMR_WQE_RESERVED4_MASK 0x1FF +#define RDMA_SQ_FMR_WQE_RESERVED4_SHIFT 7 + __le32 Reserved5; +}; + + +/* + * First element (16 bytes) of fmr wqe + */ +struct rdma_sq_fmr_wqe_1st +{ + struct regpair addr; + __le32 l_key; + uint8_t req_type /* Type of WQE */; + uint8_t flags; +#define RDMA_SQ_FMR_WQE_1ST_COMP_FLG_MASK 0x1 /* If set, completion will be generated when the WQE is completed */ +#define RDMA_SQ_FMR_WQE_1ST_COMP_FLG_SHIFT 0 +#define RDMA_SQ_FMR_WQE_1ST_RD_FENCE_FLG_MASK 0x1 /* If set, all pending RDMA read or Atomic operations will be completed before start processing this WQE */ +#define RDMA_SQ_FMR_WQE_1ST_RD_FENCE_FLG_SHIFT 1 +#define RDMA_SQ_FMR_WQE_1ST_INV_FENCE_FLG_MASK 0x1 /* If set, all pending operations will be completed before start processing this WQE */ +#define RDMA_SQ_FMR_WQE_1ST_INV_FENCE_FLG_SHIFT 2 +#define RDMA_SQ_FMR_WQE_1ST_SE_FLG_MASK 0x1 /* Don't care for FMR wqe */ +#define RDMA_SQ_FMR_WQE_1ST_SE_FLG_SHIFT 3 +#define RDMA_SQ_FMR_WQE_1ST_INLINE_FLG_MASK 0x1 /* Should be 0 for FMR wqe */ +#define RDMA_SQ_FMR_WQE_1ST_INLINE_FLG_SHIFT 4 +#define RDMA_SQ_FMR_WQE_1ST_DIF_ON_HOST_FLG_MASK 0x1 /* If set, indicated host memory of this WQE is DIF protected. */ +#define RDMA_SQ_FMR_WQE_1ST_DIF_ON_HOST_FLG_SHIFT 5 +#define RDMA_SQ_FMR_WQE_1ST_RESERVED0_MASK 0x3 +#define RDMA_SQ_FMR_WQE_1ST_RESERVED0_SHIFT 6 + uint8_t wqe_size /* Size of WQE in 16B chunks */; + uint8_t prev_wqe_size /* Previous WQE size in 16B chunks */; +}; + + +/* + * Second element (16 bytes) of fmr wqe + */ +struct rdma_sq_fmr_wqe_2nd +{ + uint8_t fmr_ctrl; +#define RDMA_SQ_FMR_WQE_2ND_PAGE_SIZE_LOG_MASK 0x1F /* 0 is 4k, 1 is 8k... 
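(in general, page size = 4KB << value)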
*/ +#define RDMA_SQ_FMR_WQE_2ND_PAGE_SIZE_LOG_SHIFT 0 +#define RDMA_SQ_FMR_WQE_2ND_ZERO_BASED_MASK 0x1 /* zero based indication */ +#define RDMA_SQ_FMR_WQE_2ND_ZERO_BASED_SHIFT 5 +#define RDMA_SQ_FMR_WQE_2ND_BIND_EN_MASK 0x1 /* indication whether bind is enabled for this MR */ +#define RDMA_SQ_FMR_WQE_2ND_BIND_EN_SHIFT 6 +#define RDMA_SQ_FMR_WQE_2ND_RESERVED1_MASK 0x1 +#define RDMA_SQ_FMR_WQE_2ND_RESERVED1_SHIFT 7 + uint8_t access_ctrl; +#define RDMA_SQ_FMR_WQE_2ND_REMOTE_READ_MASK 0x1 +#define RDMA_SQ_FMR_WQE_2ND_REMOTE_READ_SHIFT 0 +#define RDMA_SQ_FMR_WQE_2ND_REMOTE_WRITE_MASK 0x1 +#define RDMA_SQ_FMR_WQE_2ND_REMOTE_WRITE_SHIFT 1 +#define RDMA_SQ_FMR_WQE_2ND_ENABLE_ATOMIC_MASK 0x1 +#define RDMA_SQ_FMR_WQE_2ND_ENABLE_ATOMIC_SHIFT 2 +#define RDMA_SQ_FMR_WQE_2ND_LOCAL_READ_MASK 0x1 +#define RDMA_SQ_FMR_WQE_2ND_LOCAL_READ_SHIFT 3 +#define RDMA_SQ_FMR_WQE_2ND_LOCAL_WRITE_MASK 0x1 +#define RDMA_SQ_FMR_WQE_2ND_LOCAL_WRITE_SHIFT 4 +#define RDMA_SQ_FMR_WQE_2ND_RESERVED2_MASK 0x7 +#define RDMA_SQ_FMR_WQE_2ND_RESERVED2_SHIFT 5 + uint8_t reserved3; + uint8_t length_hi /* upper 8 bits of the registered MR length */; + __le32 length_lo /* lower 32 bits of the registered MR length. In case of zero based MR, will hold FBO */; + struct regpair pbl_addr /* Address of PBL */; +}; + + +/* + * Third element (16 bytes) of fmr wqe + */ +struct rdma_sq_fmr_wqe_3rd +{ + __le32 dif_base_ref_tag /* Ref tag of the first DIF Block. */; + __le16 dif_app_tag /* App tag of all DIF Blocks. */; + __le16 dif_app_tag_mask /* Bitmask for verifying dif_app_tag. */; + __le16 dif_runt_crc_value /* In TX IO, in case the runt_valid_flg is set, this value is used to validate the last Block in the IO. */; + __le16 dif_flags; +#define RDMA_SQ_FMR_WQE_3RD_DIF_IO_DIRECTION_FLG_MASK 0x1 /* 0=RX, 1=TX (use enum rdma_dif_io_direction_flg) */ +#define RDMA_SQ_FMR_WQE_3RD_DIF_IO_DIRECTION_FLG_SHIFT 0 +#define RDMA_SQ_FMR_WQE_3RD_DIF_BLOCK_SIZE_MASK 0x1 /* DIF block size. 0=512B 1=4096B (use enum rdma_dif_block_size) */ +#define RDMA_SQ_FMR_WQE_3RD_DIF_BLOCK_SIZE_SHIFT 1 +#define RDMA_SQ_FMR_WQE_3RD_DIF_RUNT_VALID_FLG_MASK 0x1 /* In TX IO, indicates the runt_value field is valid. In RX IO, indicates the calculated runt value is to be placed on host buffer. */ +#define RDMA_SQ_FMR_WQE_3RD_DIF_RUNT_VALID_FLG_SHIFT 2 +#define RDMA_SQ_FMR_WQE_3RD_DIF_VALIDATE_CRC_GUARD_MASK 0x1 /* In TX IO, indicates CRC of each DIF guard tag is checked. */ +#define RDMA_SQ_FMR_WQE_3RD_DIF_VALIDATE_CRC_GUARD_SHIFT 3 +#define RDMA_SQ_FMR_WQE_3RD_DIF_VALIDATE_REF_TAG_MASK 0x1 /* In TX IO, indicates Ref tag of each DIF guard tag is checked. */ +#define RDMA_SQ_FMR_WQE_3RD_DIF_VALIDATE_REF_TAG_SHIFT 4 +#define RDMA_SQ_FMR_WQE_3RD_DIF_VALIDATE_APP_TAG_MASK 0x1 /* In TX IO, indicates App tag of each DIF guard tag is checked. */ +#define RDMA_SQ_FMR_WQE_3RD_DIF_VALIDATE_APP_TAG_SHIFT 5 +#define RDMA_SQ_FMR_WQE_3RD_DIF_CRC_SEED_MASK 0x1 /* DIF CRC Seed to use. 
0=0x000 1=0xFFFF (use enum rdma_dif_crc_seed) */ +#define RDMA_SQ_FMR_WQE_3RD_DIF_CRC_SEED_SHIFT 6 +#define RDMA_SQ_FMR_WQE_3RD_RESERVED4_MASK 0x1FF +#define RDMA_SQ_FMR_WQE_3RD_RESERVED4_SHIFT 7 + __le32 Reserved5; +}; + + +struct rdma_sq_local_inv_wqe +{ + struct regpair reserved; + __le32 inv_l_key /* The invalidate local key */; + uint8_t req_type /* Type of WQE */; + uint8_t flags; +#define RDMA_SQ_LOCAL_INV_WQE_COMP_FLG_MASK 0x1 /* If set, completion will be generated when the WQE is completed */ +#define RDMA_SQ_LOCAL_INV_WQE_COMP_FLG_SHIFT 0 +#define RDMA_SQ_LOCAL_INV_WQE_RD_FENCE_FLG_MASK 0x1 /* If set, all pending RDMA read or Atomic operations will be completed before start processing this WQE */ +#define RDMA_SQ_LOCAL_INV_WQE_RD_FENCE_FLG_SHIFT 1 +#define RDMA_SQ_LOCAL_INV_WQE_INV_FENCE_FLG_MASK 0x1 /* If set, all pending operations will be completed before start processing this WQE */ +#define RDMA_SQ_LOCAL_INV_WQE_INV_FENCE_FLG_SHIFT 2 +#define RDMA_SQ_LOCAL_INV_WQE_SE_FLG_MASK 0x1 /* Don't care for local invalidate wqe */ +#define RDMA_SQ_LOCAL_INV_WQE_SE_FLG_SHIFT 3 +#define RDMA_SQ_LOCAL_INV_WQE_INLINE_FLG_MASK 0x1 /* Should be 0 for local invalidate wqe */ +#define RDMA_SQ_LOCAL_INV_WQE_INLINE_FLG_SHIFT 4 +#define RDMA_SQ_LOCAL_INV_WQE_DIF_ON_HOST_FLG_MASK 0x1 /* If set, indicated host memory of this WQE is DIF protected. */ +#define RDMA_SQ_LOCAL_INV_WQE_DIF_ON_HOST_FLG_SHIFT 5 +#define RDMA_SQ_LOCAL_INV_WQE_RESERVED0_MASK 0x3 +#define RDMA_SQ_LOCAL_INV_WQE_RESERVED0_SHIFT 6 + uint8_t wqe_size /* Size of WQE in 16B chunks */; + uint8_t prev_wqe_size /* Previous WQE size in 16B chunks */; +}; + + +struct rdma_sq_rdma_wqe +{ + __le32 imm_data /* The immediate data in case of RDMA_WITH_IMM */; + __le32 length /* Total data length. If DIF on host is enabled, length does NOT include DIF guards. */; + __le32 xrc_srq /* Valid only when XRC is set for the QP */; + uint8_t req_type /* Type of WQE */; + uint8_t flags; +#define RDMA_SQ_RDMA_WQE_COMP_FLG_MASK 0x1 /* If set, completion will be generated when the WQE is completed */ +#define RDMA_SQ_RDMA_WQE_COMP_FLG_SHIFT 0 +#define RDMA_SQ_RDMA_WQE_RD_FENCE_FLG_MASK 0x1 /* If set, all pending RDMA read or Atomic operations will be completed before start processing this WQE */ +#define RDMA_SQ_RDMA_WQE_RD_FENCE_FLG_SHIFT 1 +#define RDMA_SQ_RDMA_WQE_INV_FENCE_FLG_MASK 0x1 /* If set, all pending operations will be completed before start processing this WQE */ +#define RDMA_SQ_RDMA_WQE_INV_FENCE_FLG_SHIFT 2 +#define RDMA_SQ_RDMA_WQE_SE_FLG_MASK 0x1 /* If set, signal the responder to generate a solicited event on this WQE */ +#define RDMA_SQ_RDMA_WQE_SE_FLG_SHIFT 3 +#define RDMA_SQ_RDMA_WQE_INLINE_FLG_MASK 0x1 /* if set, indicates inline data is following this WQE instead of SGEs. Applicable for RDMA_WR or RDMA_WR_WITH_IMM. Should be 0 for RDMA_RD */ +#define RDMA_SQ_RDMA_WQE_INLINE_FLG_SHIFT 4 +#define RDMA_SQ_RDMA_WQE_DIF_ON_HOST_FLG_MASK 0x1 /* If set, indicated host memory of this WQE is DIF protected. */ +#define RDMA_SQ_RDMA_WQE_DIF_ON_HOST_FLG_SHIFT 5 +#define RDMA_SQ_RDMA_WQE_RESERVED0_MASK 0x3 +#define RDMA_SQ_RDMA_WQE_RESERVED0_SHIFT 6 + uint8_t wqe_size /* Size of WQE in 16B chunks including all SGEs or inline data. In case there are SGEs: set to number of SGEs + 1. In case of inline data: set to the whole number of 16B which contain the inline data + 1. 
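For example, an RDMA_WR with two SGEs has wqe_size = 3.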
*/; + uint8_t prev_wqe_size /* Previous WQE size in 16B chunks */; + struct regpair remote_va /* Remote virtual address */; + __le32 r_key /* Remote key */; + uint8_t dif_flags; +#define RDMA_SQ_RDMA_WQE_DIF_BLOCK_SIZE_MASK 0x1 /* if dif_on_host_flg set: DIF block size. 0=512B 1=4096B (use enum rdma_dif_block_size) */ +#define RDMA_SQ_RDMA_WQE_DIF_BLOCK_SIZE_SHIFT 0 +#define RDMA_SQ_RDMA_WQE_DIF_FIRST_RDMA_IN_IO_FLG_MASK 0x1 /* if dif_on_host_flg set: WQE executes first RDMA on related IO. */ +#define RDMA_SQ_RDMA_WQE_DIF_FIRST_RDMA_IN_IO_FLG_SHIFT 1 +#define RDMA_SQ_RDMA_WQE_DIF_LAST_RDMA_IN_IO_FLG_MASK 0x1 /* if dif_on_host_flg set: WQE executes last RDMA on related IO. */ +#define RDMA_SQ_RDMA_WQE_DIF_LAST_RDMA_IN_IO_FLG_SHIFT 2 +#define RDMA_SQ_RDMA_WQE_RESERVED1_MASK 0x1F +#define RDMA_SQ_RDMA_WQE_RESERVED1_SHIFT 3 + uint8_t reserved2[3]; +}; + + +/* + * First element (16 bytes) of rdma wqe + */ +struct rdma_sq_rdma_wqe_1st +{ + __le32 imm_data /* The immediate data in case of RDMA_WITH_IMM */; + __le32 length /* Total data length */; + __le32 xrc_srq /* Valid only when XRC is set for the QP */; + uint8_t req_type /* Type of WQE */; + uint8_t flags; +#define RDMA_SQ_RDMA_WQE_1ST_COMP_FLG_MASK 0x1 /* If set, completion will be generated when the WQE is completed */ +#define RDMA_SQ_RDMA_WQE_1ST_COMP_FLG_SHIFT 0 +#define RDMA_SQ_RDMA_WQE_1ST_RD_FENCE_FLG_MASK 0x1 /* If set, all pending RDMA read or Atomic operations will be completed before start processing this WQE */ +#define RDMA_SQ_RDMA_WQE_1ST_RD_FENCE_FLG_SHIFT 1 +#define RDMA_SQ_RDMA_WQE_1ST_INV_FENCE_FLG_MASK 0x1 /* If set, all pending operations will be completed before start processing this WQE */ +#define RDMA_SQ_RDMA_WQE_1ST_INV_FENCE_FLG_SHIFT 2 +#define RDMA_SQ_RDMA_WQE_1ST_SE_FLG_MASK 0x1 /* If set, signal the responder to generate a solicited event on this WQE */ +#define RDMA_SQ_RDMA_WQE_1ST_SE_FLG_SHIFT 3 +#define RDMA_SQ_RDMA_WQE_1ST_INLINE_FLG_MASK 0x1 /* if set, indicates inline data is following this WQE instead of SGEs. Applicable for RDMA_WR or RDMA_WR_WITH_IMM. Should be 0 for RDMA_RD */ +#define RDMA_SQ_RDMA_WQE_1ST_INLINE_FLG_SHIFT 4 +#define RDMA_SQ_RDMA_WQE_1ST_DIF_ON_HOST_FLG_MASK 0x1 /* If set, indicated host memory of this WQE is DIF protected. */ +#define RDMA_SQ_RDMA_WQE_1ST_DIF_ON_HOST_FLG_SHIFT 5 +#define RDMA_SQ_RDMA_WQE_1ST_RESERVED0_MASK 0x3 +#define RDMA_SQ_RDMA_WQE_1ST_RESERVED0_SHIFT 6 + uint8_t wqe_size /* Size of WQE in 16B chunks including all SGEs or inline data. In case there are SGEs: set to number of SGEs + 1. In case of inline data: set to the whole number of 16B which contain the inline data + 1. */; + uint8_t prev_wqe_size /* Previous WQE size in 16B chunks */; +}; + + +/* + * Second element (16 bytes) of rdma wqe + */ +struct rdma_sq_rdma_wqe_2nd +{ + struct regpair remote_va /* Remote virtual address */; + __le32 r_key /* Remote key */; + uint8_t dif_flags; +#define RDMA_SQ_RDMA_WQE_2ND_DIF_BLOCK_SIZE_MASK 0x1 /* if dif_on_host_flg set: DIF block size. 0=512B 1=4096B (use enum rdma_dif_block_size) */ +#define RDMA_SQ_RDMA_WQE_2ND_DIF_BLOCK_SIZE_SHIFT 0 +#define RDMA_SQ_RDMA_WQE_2ND_DIF_FIRST_SEGMENT_FLG_MASK 0x1 /* if dif_on_host_flg set: WQE executes first DIF on related MR. */ +#define RDMA_SQ_RDMA_WQE_2ND_DIF_FIRST_SEGMENT_FLG_SHIFT 1 +#define RDMA_SQ_RDMA_WQE_2ND_DIF_LAST_SEGMENT_FLG_MASK 0x1 /* if dif_on_host_flg set: WQE executes last DIF on related MR. 
*/ +#define RDMA_SQ_RDMA_WQE_2ND_DIF_LAST_SEGMENT_FLG_SHIFT 2 +#define RDMA_SQ_RDMA_WQE_2ND_RESERVED1_MASK 0x1F +#define RDMA_SQ_RDMA_WQE_2ND_RESERVED1_SHIFT 3 + uint8_t reserved2[3]; +}; + + +/* + * SQ WQE req type enumeration + */ +enum rdma_sq_req_type +{ + RDMA_SQ_REQ_TYPE_SEND, + RDMA_SQ_REQ_TYPE_SEND_WITH_IMM, + RDMA_SQ_REQ_TYPE_SEND_WITH_INVALIDATE, + RDMA_SQ_REQ_TYPE_RDMA_WR, + RDMA_SQ_REQ_TYPE_RDMA_WR_WITH_IMM, + RDMA_SQ_REQ_TYPE_RDMA_RD, + RDMA_SQ_REQ_TYPE_ATOMIC_CMP_AND_SWAP, + RDMA_SQ_REQ_TYPE_ATOMIC_ADD, + RDMA_SQ_REQ_TYPE_LOCAL_INVALIDATE, + RDMA_SQ_REQ_TYPE_FAST_MR, + RDMA_SQ_REQ_TYPE_BIND, + RDMA_SQ_REQ_TYPE_INVALID, + MAX_RDMA_SQ_REQ_TYPE +}; + + +struct rdma_sq_send_wqe +{ + __le32 inv_key_or_imm_data /* the r_key to invalidate in case of SEND_WITH_INVALIDATE, or the immediate data in case of SEND_WITH_IMM */; + __le32 length /* Total data length */; + __le32 xrc_srq /* Valid only when XRC is set for the QP */; + uint8_t req_type /* Type of WQE */; + uint8_t flags; +#define RDMA_SQ_SEND_WQE_COMP_FLG_MASK 0x1 /* If set, completion will be generated when the WQE is completed */ +#define RDMA_SQ_SEND_WQE_COMP_FLG_SHIFT 0 +#define RDMA_SQ_SEND_WQE_RD_FENCE_FLG_MASK 0x1 /* If set, all pending RDMA read or Atomic operations will be completed before start processing this WQE */ +#define RDMA_SQ_SEND_WQE_RD_FENCE_FLG_SHIFT 1 +#define RDMA_SQ_SEND_WQE_INV_FENCE_FLG_MASK 0x1 /* If set, all pending operations will be completed before start processing this WQE */ +#define RDMA_SQ_SEND_WQE_INV_FENCE_FLG_SHIFT 2 +#define RDMA_SQ_SEND_WQE_SE_FLG_MASK 0x1 /* If set, signal the responder to generate a solicited event on this WQE */ +#define RDMA_SQ_SEND_WQE_SE_FLG_SHIFT 3 +#define RDMA_SQ_SEND_WQE_INLINE_FLG_MASK 0x1 /* if set, indicates inline data is following this WQE instead of SGEs */ +#define RDMA_SQ_SEND_WQE_INLINE_FLG_SHIFT 4 +#define RDMA_SQ_SEND_WQE_DIF_ON_HOST_FLG_MASK 0x1 /* Should be 0 for send wqe */ +#define RDMA_SQ_SEND_WQE_DIF_ON_HOST_FLG_SHIFT 5 +#define RDMA_SQ_SEND_WQE_RESERVED0_MASK 0x3 +#define RDMA_SQ_SEND_WQE_RESERVED0_SHIFT 6 + uint8_t wqe_size /* Size of WQE in 16B chunks including all SGEs or inline data. In case there are SGEs: set to number of SGEs + 1. In case of inline data: set to the whole number of 16B which contain the inline data + 1. 
*/; + uint8_t prev_wqe_size /* Previous WQE size in 16B chunks */; + __le32 reserved1[4]; +}; + + +struct rdma_sq_send_wqe_1st +{ + __le32 inv_key_or_imm_data /* the r_key to invalidate in case of SEND_WITH_INVALIDATE, or the immediate data in case of SEND_WITH_IMM */; + __le32 length /* Total data length */; + __le32 xrc_srq /* Valid only when XRC is set for the QP */; + uint8_t req_type /* Type of WQE */; + uint8_t flags; +#define RDMA_SQ_SEND_WQE_1ST_COMP_FLG_MASK 0x1 /* If set, completion will be generated when the WQE is completed */ +#define RDMA_SQ_SEND_WQE_1ST_COMP_FLG_SHIFT 0 +#define RDMA_SQ_SEND_WQE_1ST_RD_FENCE_FLG_MASK 0x1 /* If set, all pending RDMA read or Atomic operations will be completed before start processing this WQE */ +#define RDMA_SQ_SEND_WQE_1ST_RD_FENCE_FLG_SHIFT 1 +#define RDMA_SQ_SEND_WQE_1ST_INV_FENCE_FLG_MASK 0x1 /* If set, all pending operations will be completed before start processing this WQE */ +#define RDMA_SQ_SEND_WQE_1ST_INV_FENCE_FLG_SHIFT 2 +#define RDMA_SQ_SEND_WQE_1ST_SE_FLG_MASK 0x1 /* If set, signal the responder to generate a solicited event on this WQE */ +#define RDMA_SQ_SEND_WQE_1ST_SE_FLG_SHIFT 3 +#define RDMA_SQ_SEND_WQE_1ST_INLINE_FLG_MASK 0x1 /* if set, indicates inline data is following this WQE instead of SGEs */ +#define RDMA_SQ_SEND_WQE_1ST_INLINE_FLG_SHIFT 4 +#define RDMA_SQ_SEND_WQE_1ST_RESERVED0_MASK 0x7 +#define RDMA_SQ_SEND_WQE_1ST_RESERVED0_SHIFT 5 + uint8_t wqe_size /* Size of WQE in 16B chunks including all SGEs or inline data. In case there are SGEs: set to number of SGEs + 1. In case of inline data: set to the whole number of 16B which contain the inline data + 1. */; + uint8_t prev_wqe_size /* Previous WQE size in 16B chunks */; +}; + + +struct rdma_sq_send_wqe_2st +{ + __le32 reserved1[4]; +}; + + +struct rdma_sq_sge +{ + __le32 length /* Total length of the send. If DIF on host is enabled, SGE length includes the DIF guards. 
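(assuming standard 8-byte T10 DIF tuples over 512-byte blocks, a 4KB payload would be described by a 4096 + 8*8 = 4160 byte SGE)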
*/;
+	struct regpair addr;
+	__le32 l_key;
+};
+
+
+struct rdma_srq_wqe_header
+{
+	struct regpair wr_id;
+	uint8_t num_sges /* number of SGEs in WQE */;
+	uint8_t reserved2[7];
+};
+
+struct rdma_srq_sge
+{
+	struct regpair addr;
+	__le32 length;
+	__le32 l_key;
+};
+
+/*
+ * rdma srq ring element: a wqe header or an sge
+ */
+union rdma_srq_elm
+{
+	struct rdma_srq_wqe_header header;
+	struct rdma_srq_sge sge;
+};
+
+
+
+
+/*
+ * Rdma doorbell data for flags update
+ */
+struct rdma_pwm_flags_data
+{
+	__le16 icid /* internal CID */;
+	uint8_t agg_flags /* aggregative flags */;
+	uint8_t reserved;
+};
+
+
+/*
+ * Rdma doorbell data for SQ and RQ
+ */
+struct rdma_pwm_val16_data
+{
+	__le16 icid /* internal CID */;
+	__le16 value /* aggregated value to update */;
+};
+
+
+union rdma_pwm_val16_data_union
+{
+	struct rdma_pwm_val16_data as_struct /* Parameters field */;
+	__le32 as_dword;
+};
+
+
+/*
+ * Rdma doorbell data for CQ
+ */
+struct rdma_pwm_val32_data
+{
+	__le16 icid /* internal CID */;
+	uint8_t agg_flags /* bit for every DQ counter flags in CM context that DQ can increment */;
+	uint8_t params;
+#define RDMA_PWM_VAL32_DATA_AGG_CMD_MASK    0x3 /* aggregative command to CM (use enum db_agg_cmd_sel) */
+#define RDMA_PWM_VAL32_DATA_AGG_CMD_SHIFT   0
+#define RDMA_PWM_VAL32_DATA_BYPASS_EN_MASK  0x1 /* enable QM bypass */
+#define RDMA_PWM_VAL32_DATA_BYPASS_EN_SHIFT 2
+#define RDMA_PWM_VAL32_DATA_RESERVED_MASK   0x1F
+#define RDMA_PWM_VAL32_DATA_RESERVED_SHIFT  3
+	__le32 value /* aggregated value to update */;
+};
+
+
+union rdma_pwm_val32_data_union
+{
+	struct rdma_pwm_val32_data as_struct /* Parameters field */;
+	struct regpair as_repair;
+};
+
+#endif /* __QED_HSI_RDMA__ */
diff --git a/providers/qedr/qelr_main.c b/providers/qedr/qelr_main.c
new file mode 100644
index 0000000..e7045ca
--- /dev/null
+++ b/providers/qedr/qelr_main.c
@@ -0,0 +1,278 @@
+/*
+ * Copyright (c) 2015-2016 QLogic Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and /or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ + +#include <config.h> + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <sys/mman.h> +#include <pthread.h> + +#include "qelr.h" +#include "qelr_main.h" +#include "qelr_chain.h" + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + +static void qelr_free_context(struct ibv_context *ibctx); + +#define PCI_VENDOR_ID_QLOGIC (0x1077) +#define PCI_DEVICE_ID_QLOGIC_57980S (0x1629) +#define PCI_DEVICE_ID_QLOGIC_57980S_40 (0x1634) +#define PCI_DEVICE_ID_QLOGIC_57980S_10 (0x1666) +#define PCI_DEVICE_ID_QLOGIC_57980S_MF (0x1636) +#define PCI_DEVICE_ID_QLOGIC_57980S_100 (0x1644) +#define PCI_DEVICE_ID_QLOGIC_57980S_50 (0x1654) +#define PCI_DEVICE_ID_QLOGIC_57980S_25 (0x1656) +#define PCI_DEVICE_ID_QLOGIC_57980S_IOV (0x1664) +#define PCI_DEVICE_ID_QLOGIC_AH (0x8070) +#define PCI_DEVICE_ID_QLOGIC_AH_IOV (0x8090) + +uint32_t qelr_dp_level; +uint32_t qelr_dp_module; + +#define QHCA(d) \ + VERBS_PCI_MATCH(PCI_VENDOR_ID_QLOGIC, PCI_DEVICE_ID_QLOGIC_##d, NULL) +static const struct verbs_match_ent hca_table[] = { + VERBS_DRIVER_ID(RDMA_DRIVER_QEDR), + QHCA(57980S), + QHCA(57980S_40), + QHCA(57980S_10), + QHCA(57980S_MF), + QHCA(57980S_100), + QHCA(57980S_50), + QHCA(57980S_25), + QHCA(57980S_IOV), + QHCA(AH), + QHCA(AH_IOV), + {} +}; + +static const struct verbs_context_ops qelr_ctx_ops = { + .query_device = qelr_query_device, + .query_port = qelr_query_port, + .alloc_pd = qelr_alloc_pd, + .dealloc_pd = qelr_dealloc_pd, + .reg_mr = qelr_reg_mr, + .dereg_mr = qelr_dereg_mr, + .create_cq = qelr_create_cq, + .poll_cq = qelr_poll_cq, + .req_notify_cq = qelr_arm_cq, + .cq_event = qelr_cq_event, + .destroy_cq = qelr_destroy_cq, + .create_qp = qelr_create_qp, + .query_qp = qelr_query_qp, + .modify_qp = qelr_modify_qp, + .destroy_qp = qelr_destroy_qp, + .create_srq = qelr_create_srq, + .destroy_srq = qelr_destroy_srq, + .modify_srq = qelr_modify_srq, + .query_srq = qelr_query_srq, + .post_srq_recv = qelr_post_srq_recv, + .post_send = qelr_post_send, + .post_recv = qelr_post_recv, + .async_event = qelr_async_event, + .free_context = qelr_free_context, +}; + +static void qelr_uninit_device(struct verbs_device *verbs_device) +{ + struct qelr_device *dev = get_qelr_dev(&verbs_device->device); + + free(dev); +} + +static void qelr_open_debug_file(struct qelr_devctx *ctx) +{ + char *env; + + env = getenv("QELR_DEBUG_FILE"); + if (!env) { + ctx->dbg_fp = stderr; + DP_VERBOSE(ctx->dbg_fp, QELR_MSG_INIT, + "Debug file opened: stderr\n"); + return; + } + + ctx->dbg_fp = fopen(env, "aw+"); + if (!ctx->dbg_fp) { + fprintf(stderr, "Failed opening debug file %s, using stderr\n", + env); + ctx->dbg_fp = stderr; + DP_VERBOSE(ctx->dbg_fp, QELR_MSG_INIT, + "Debug file opened: stderr\n"); + return; + } + + DP_VERBOSE(ctx->dbg_fp, QELR_MSG_INIT, "Debug file opened: %s\n", env); +} + +static void qelr_close_debug_file(struct qelr_devctx *ctx) +{ + if (ctx->dbg_fp && ctx->dbg_fp != stderr) + fclose(ctx->dbg_fp); +} + +static void qelr_set_debug_mask(void) +{ + char *env; + + qelr_dp_level = QELR_LEVEL_NOTICE; + qelr_dp_module = 0; + + env = getenv("QELR_DP_LEVEL"); + if (env) + qelr_dp_level = atoi(env); + + env = getenv("QELR_DP_MODULE"); + if (env) + qelr_dp_module = atoi(env); +} + +static struct verbs_context *qelr_alloc_context(struct ibv_device *ibdev, + int cmd_fd, + void *private_data) +{ + struct qelr_devctx *ctx; + struct qelr_alloc_context cmd; + struct qelr_alloc_context_resp resp; + + ctx = verbs_init_and_alloc_context(ibdev, cmd_fd, ctx, 
ibv_ctx, + RDMA_DRIVER_QEDR); + if (!ctx) + return NULL; + + memset(&resp, 0, sizeof(resp)); + + qelr_open_debug_file(ctx); + qelr_set_debug_mask(); + + cmd.context_flags = QEDR_ALLOC_UCTX_DB_REC; + if (ibv_cmd_get_context(&ctx->ibv_ctx, &cmd.ibv_cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp))) + goto cmd_err; + + verbs_set_ops(&ctx->ibv_ctx, &qelr_ctx_ops); + + ctx->kernel_page_size = sysconf(_SC_PAGESIZE); + ctx->db_pa = resp.db_pa; + ctx->db_size = resp.db_size; + + /* Set dpm flags according to protocol */ + if (IS_ROCE(ibdev)) { + if (resp.dpm_flags & QEDR_DPM_TYPE_ROCE_ENHANCED) + ctx->dpm_flags = QELR_DPM_FLAGS_ENHANCED; + + if (resp.dpm_flags & QEDR_DPM_TYPE_ROCE_LEGACY) + ctx->dpm_flags |= QELR_DPM_FLAGS_LEGACY; + } else { + if (resp.dpm_flags & QEDR_DPM_TYPE_IWARP_LEGACY) + ctx->dpm_flags = QELR_DPM_FLAGS_LEGACY; + } + + /* Defaults set for backward-forward compatibility */ + if (resp.dpm_flags & QEDR_DPM_SIZES_SET) { + ctx->ldpm_limit_size = resp.ldpm_limit_size; + ctx->edpm_trans_size = resp.edpm_trans_size; + } else { + ctx->ldpm_limit_size = QEDR_LDPM_MAX_SIZE; + ctx->edpm_trans_size = QEDR_EDPM_TRANS_SIZE; + } + + ctx->max_send_wr = resp.max_send_wr; + ctx->max_recv_wr = resp.max_recv_wr; + ctx->max_srq_wr = resp.max_srq_wr; + ctx->sges_per_send_wr = resp.sges_per_send_wr; + ctx->sges_per_recv_wr = resp.sges_per_recv_wr; + ctx->sges_per_srq_wr = resp.sges_per_recv_wr; + ctx->max_cqes = resp.max_cqes; + + ctx->db_addr = mmap(NULL, ctx->db_size, PROT_WRITE, MAP_SHARED, + cmd_fd, ctx->db_pa); + + if (ctx->db_addr == MAP_FAILED) { + int errsv = errno; + + DP_ERR(ctx->dbg_fp, + "alloc context: doorbell mapping failed resp.db_pa = %llx resp.db_size=%d context->cmd_fd=%d errno=%d\n", + resp.db_pa, resp.db_size, cmd_fd, errsv); + goto cmd_err; + } + + return &ctx->ibv_ctx; + +cmd_err: + qelr_err("%s: Failed to allocate context for device.\n", __func__); + qelr_close_debug_file(ctx); + verbs_uninit_context(&ctx->ibv_ctx); + free(ctx); + return NULL; +} + +static void qelr_free_context(struct ibv_context *ibctx) +{ + struct qelr_devctx *ctx = get_qelr_ctx(ibctx); + + if (ctx->db_addr) + munmap(ctx->db_addr, ctx->db_size); + + qelr_close_debug_file(ctx); + verbs_uninit_context(&ctx->ibv_ctx); + free(ctx); +} + +static struct verbs_device *qelr_device_alloc(struct verbs_sysfs_dev *sysfs_dev) +{ + struct qelr_device *dev; + + dev = calloc(1, sizeof(*dev)); + if (!dev) + return NULL; + + return &dev->ibv_dev; +} + +static const struct verbs_device_ops qelr_dev_ops = { + .name = "qedr", + .match_min_abi_version = QELR_ABI_VERSION, + .match_max_abi_version = QELR_ABI_VERSION, + .match_table = hca_table, + .alloc_device = qelr_device_alloc, + .uninit_device = qelr_uninit_device, + .alloc_context = qelr_alloc_context, +}; +PROVIDER_DRIVER(qedr, qelr_dev_ops); diff --git a/providers/qedr/qelr_main.h b/providers/qedr/qelr_main.h new file mode 100644 index 0000000..fae8713 --- /dev/null +++ b/providers/qedr/qelr_main.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __QELR_MAIN_H__ +#define __QELR_MAIN_H__ + +#include <inttypes.h> +#include <stddef.h> +#include <endian.h> + +#include <infiniband/driver.h> +#include <util/udma_barrier.h> + +int qelr_query_device(struct ibv_context *, struct ibv_device_attr *); +int qelr_query_port(struct ibv_context *, uint8_t, struct ibv_port_attr *); + +struct ibv_pd *qelr_alloc_pd(struct ibv_context *); +int qelr_dealloc_pd(struct ibv_pd *); + +struct ibv_mr *qelr_reg_mr(struct ibv_pd *ibpd, void *addr, size_t len, + uint64_t hca_va, int access); +int qelr_dereg_mr(struct verbs_mr *vmr); + +struct ibv_cq *qelr_create_cq(struct ibv_context *, int, + struct ibv_comp_channel *, int); +int qelr_destroy_cq(struct ibv_cq *); +int qelr_poll_cq(struct ibv_cq *, int, struct ibv_wc *); +void qelr_cq_event(struct ibv_cq *); +int qelr_arm_cq(struct ibv_cq *, int); + +int qelr_query_srq(struct ibv_srq *ibv_srq, struct ibv_srq_attr *attr); +int qelr_modify_srq(struct ibv_srq *ibv_srq, struct ibv_srq_attr *attr, + int attr_mask); +struct ibv_srq *qelr_create_srq(struct ibv_pd *, struct ibv_srq_init_attr *); +int qelr_destroy_srq(struct ibv_srq *ibv_srq); +int qelr_post_srq_recv(struct ibv_srq *, struct ibv_recv_wr *, + struct ibv_recv_wr **bad_wr); + +struct ibv_qp *qelr_create_qp(struct ibv_pd *, struct ibv_qp_init_attr *); +int qelr_modify_qp(struct ibv_qp *, struct ibv_qp_attr *, + int ibv_qp_attr_mask); +int qelr_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, + struct ibv_qp_init_attr *init_attr); +int qelr_destroy_qp(struct ibv_qp *); + +int qelr_post_send(struct ibv_qp *, struct ibv_send_wr *, + struct ibv_send_wr **); +int qelr_post_recv(struct ibv_qp *, struct ibv_recv_wr *, + struct ibv_recv_wr **); + +void qelr_async_event(struct ibv_context *contex, + struct ibv_async_event *event); +#endif /* __QELR_MAIN_H__ */ diff --git a/providers/qedr/qelr_verbs.c b/providers/qedr/qelr_verbs.c new file mode 100644 index 0000000..45b6e78 --- /dev/null +++ b/providers/qedr/qelr_verbs.c @@ -0,0 +1,2511 @@ +/* + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <config.h> + +#include <assert.h> +#include <endian.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <signal.h> +#include <errno.h> +#include <pthread.h> +#include <sys/mman.h> +#include <unistd.h> +#include <stdbool.h> + +#include "qelr.h" +#include "qelr_chain.h" +#include "qelr_verbs.h" +#include <util/compiler.h> +#include <util/util.h> +#include <util/mmio.h> +#include <stdio.h> +#include <stdlib.h> + +#define QELR_SQE_ELEMENT_SIZE (sizeof(struct rdma_sq_sge)) +#define QELR_RQE_ELEMENT_SIZE (sizeof(struct rdma_rq_sge)) +#define QELR_CQE_SIZE (sizeof(union rdma_cqe)) + +static void qelr_inc_sw_cons_u16(struct qelr_qp_hwq_info *info) +{ + info->cons = (info->cons + 1) % info->max_wr; + info->wqe_cons++; +} + +static void qelr_inc_sw_prod_u16(struct qelr_qp_hwq_info *info) +{ + info->prod = (info->prod + 1) % info->max_wr; +} + +static inline int qelr_wq_is_full(struct qelr_qp_hwq_info *info) +{ + return (((info->prod + 1) % info->max_wr) == info->cons); +} + +int qelr_query_device(struct ibv_context *context, + struct ibv_device_attr *attr) +{ + struct ibv_query_device cmd; + uint64_t fw_ver; + unsigned int major, minor, revision, eng; + int status; + + bzero(attr, sizeof(*attr)); + status = ibv_cmd_query_device(context, attr, &fw_ver, &cmd, + sizeof(cmd)); + + major = (fw_ver >> 24) & 0xff; + minor = (fw_ver >> 16) & 0xff; + revision = (fw_ver >> 8) & 0xff; + eng = fw_ver & 0xff; + + snprintf(attr->fw_ver, sizeof(attr->fw_ver), + "%d.%d.%d.%d", major, minor, revision, eng); + + return status; +} + +int qelr_query_port(struct ibv_context *context, uint8_t port, + struct ibv_port_attr *attr) +{ + struct ibv_query_port cmd; + int status; + + status = ibv_cmd_query_port(context, port, attr, &cmd, sizeof(cmd)); + return status; +} + +struct ibv_pd *qelr_alloc_pd(struct ibv_context *context) +{ + struct qelr_alloc_pd cmd; + struct qelr_alloc_pd_resp resp; + struct qelr_pd *pd; + struct qelr_devctx *cxt = get_qelr_ctx(context); + + pd = malloc(sizeof(*pd)); + if (!pd) + return NULL; + + bzero(pd, sizeof(*pd)); + memset(&cmd, 0, sizeof(cmd)); + + if (ibv_cmd_alloc_pd(context, &pd->ibv_pd, &cmd.ibv_cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp))) { + free(pd); + return NULL; + } + + pd->pd_id = resp.pd_id; + 
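+	/* pd_id is the device-wide protection-domain index assigned by the + * qedr kernel driver; user space keeps it only for tracing. + */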
+ DP_VERBOSE(cxt->dbg_fp, QELR_MSG_INIT, "Allocated pd: %d\n", pd->pd_id); + + return &pd->ibv_pd; +} + +int qelr_dealloc_pd(struct ibv_pd *ibpd) +{ + int rc = 0; + struct qelr_pd *pd = get_qelr_pd(ibpd); + struct qelr_devctx *cxt = get_qelr_ctx(ibpd->context); + + DP_VERBOSE(cxt->dbg_fp, QELR_MSG_INIT, "Deallocated pd: %d\n", + pd->pd_id); + + rc = ibv_cmd_dealloc_pd(ibpd); + + if (rc) + return rc; + + free(pd); + + return rc; +} + +struct ibv_mr *qelr_reg_mr(struct ibv_pd *ibpd, void *addr, size_t len, + uint64_t hca_va, int access) +{ + struct qelr_mr *mr; + struct ibv_reg_mr cmd; + struct qelr_reg_mr_resp resp; + struct qelr_pd *pd = get_qelr_pd(ibpd); + struct qelr_devctx *cxt = get_qelr_ctx(ibpd->context); + + mr = malloc(sizeof(*mr)); + if (!mr) + return NULL; + + bzero(mr, sizeof(*mr)); + + if (ibv_cmd_reg_mr(ibpd, addr, len, hca_va, access, &mr->vmr, &cmd, + sizeof(cmd), &resp.ibv_resp, sizeof(resp))) { + free(mr); + return NULL; + } + + DP_VERBOSE(cxt->dbg_fp, QELR_MSG_MR, + "MR Register %p completed successfully pd_id=%d addr=%p len=%zu access=%d lkey=%x rkey=%x\n", + mr, pd->pd_id, addr, len, access, mr->vmr.ibv_mr.lkey, + mr->vmr.ibv_mr.rkey); + + return &mr->vmr.ibv_mr; +} + +int qelr_dereg_mr(struct verbs_mr *vmr) +{ + struct qelr_devctx *cxt = get_qelr_ctx(vmr->ibv_mr.context); + int rc; + + rc = ibv_cmd_dereg_mr(vmr); + if (rc) + return rc; + + DP_VERBOSE(cxt->dbg_fp, QELR_MSG_MR, + "MR DERegister %p completed successfully\n", vmr); + + free(vmr); + + return 0; +} + +static void consume_cqe(struct qelr_cq *cq) +{ + if (cq->latest_cqe == cq->toggle_cqe) + cq->chain_toggle ^= RDMA_CQE_REQUESTER_TOGGLE_BIT_MASK; + + cq->latest_cqe = qelr_chain_consume(&cq->chain); +} + +static inline int qelr_cq_entries(int entries) +{ + /* FW requires an extra entry */ + return entries + 1; +} + +struct ibv_cq *qelr_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector) +{ + struct qelr_devctx *cxt = get_qelr_ctx(context); + struct qelr_create_cq_resp resp = {}; + struct qelr_create_cq cmd; + struct qelr_cq *cq; + int chain_size; + int rc; + + DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ, + "create cq: context=%p, cqe=%d, channel=%p, comp_vector=%d\n", + context, cqe, channel, comp_vector); + + if (!cqe || cqe > cxt->max_cqes) { + DP_ERR(cxt->dbg_fp, + "create cq: failed. 
attempted to allocate %d cqes but valid range is 1...%d\n", + cqe, cxt->max_cqes); + return NULL; + } + + /* allocate CQ structure */ + cq = calloc(1, sizeof(*cq)); + if (!cq) + return NULL; + + /* allocate CQ buffer */ + chain_size = qelr_cq_entries(cqe) * QELR_CQE_SIZE; + rc = qelr_chain_alloc(&cq->chain, chain_size, cxt->kernel_page_size, + QELR_CQE_SIZE); + if (rc) + goto err_0; + + cmd.addr = (uintptr_t) cq->chain.first_addr; + cmd.len = cq->chain.size; + rc = ibv_cmd_create_cq(context, cqe, channel, comp_vector, + &cq->ibv_cq, &cmd.ibv_cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp)); + if (rc) { + DP_ERR(cxt->dbg_fp, "create cq: failed with rc = %d\n", rc); + goto err_1; + } + + /* map the doorbell and prepare its data */ + cq->db.data.icid = htole16(resp.icid); + cq->db.data.params = DB_AGG_CMD_SET << + RDMA_PWM_VAL32_DATA_AGG_CMD_SHIFT; + cq->db_addr = cxt->db_addr + resp.db_offset; + + if (resp.db_rec_addr) { + cq->db_rec_map = mmap(NULL, cxt->kernel_page_size, PROT_WRITE, + MAP_SHARED, context->cmd_fd, + resp.db_rec_addr); + if (cq->db_rec_map == MAP_FAILED) { + int errsv = errno; + + DP_ERR(cxt->dbg_fp, + "alloc context: doorbell rec mapping failed resp.db_rec_addr = %llx size=%d context->cmd_fd=%d errno=%d\n", + resp.db_rec_addr, cxt->kernel_page_size, + context->cmd_fd, errsv); + goto err_1; + } + cq->db_rec_addr = cq->db_rec_map; + } else { + /* Kernel doesn't support doorbell recovery. Point to dummy + * location instead + */ + cq->db_rec_addr = &cxt->db_rec_addr_dummy; + } + + /* point to the very last element, passing this we will toggle */ + cq->toggle_cqe = qelr_chain_get_last_elem(&cq->chain); + cq->chain_toggle = RDMA_CQE_REQUESTER_TOGGLE_BIT_MASK; + cq->latest_cqe = NULL; /* must be different from chain_toggle */ + consume_cqe(cq); + + DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ, + "create cq: successfully created %p\n", cq); + + return &cq->ibv_cq; + +err_1: + qelr_chain_free(&cq->chain); +err_0: + free(cq); + + return NULL; +} + +int qelr_destroy_cq(struct ibv_cq *ibv_cq) +{ + struct qelr_devctx *cxt = get_qelr_ctx(ibv_cq->context); + struct qelr_cq *cq = get_qelr_cq(ibv_cq); + int rc; + + DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ, "destroy cq: %p\n", cq); + + rc = ibv_cmd_destroy_cq(ibv_cq); + if (rc) { + DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ, + "destroy cq: failed to destroy %p, got %d.\n", cq, + rc); + return rc; + } + + qelr_chain_free(&cq->chain); + if (cq->db_rec_map) + munmap(cq->db_rec_map, cxt->kernel_page_size); + + DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ, + "destroy cq: successfully destroyed %p\n", cq); + + free(cq); + + return 0; +} + +int qelr_query_srq(struct ibv_srq *ibv_srq, struct ibv_srq_attr *attr) +{ + struct ibv_query_srq cmd; + + return ibv_cmd_query_srq(ibv_srq, attr, &cmd, sizeof(cmd)); +} + +int qelr_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr, + int attr_mask) +{ + struct ibv_modify_srq cmd; + + return ibv_cmd_modify_srq(srq, attr, attr_mask, &cmd, sizeof(cmd)); +} + +static void qelr_destroy_srq_buffers(struct ibv_srq *ibv_srq) +{ + struct qelr_srq *srq = get_qelr_srq(ibv_srq); + uint32_t *virt_prod_pair_addr; + uint32_t prod_size; + + qelr_chain_free(&srq->hw_srq.chain); + + virt_prod_pair_addr = srq->hw_srq.virt_prod_pair_addr; + prod_size = sizeof(struct rdma_srq_producers); + + ibv_dofork_range(virt_prod_pair_addr, prod_size); + munmap(virt_prod_pair_addr, prod_size); +} + +int qelr_destroy_srq(struct ibv_srq *ibv_srq) +{ + struct qelr_srq *srq = get_qelr_srq(ibv_srq); + int ret; + + ret = ibv_cmd_destroy_srq(ibv_srq); + if (ret)
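+		/* the kernel still owns the SRQ if the destroy failed, so keep the user buffers mapped */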
+ return ret; + + qelr_destroy_srq_buffers(ibv_srq); + free(srq); + + return 0; +} + +static void qelr_create_srq_configure_req(struct qelr_srq *srq, + struct qelr_create_srq *req) +{ + req->srq_addr = (uintptr_t)srq->hw_srq.chain.first_addr; + req->srq_len = srq->hw_srq.chain.size; + req->prod_pair_addr = (uintptr_t)srq->hw_srq.virt_prod_pair_addr; +} + +static int qelr_create_srq_buffers(struct qelr_devctx *cxt, + struct qelr_srq *srq, + struct ibv_srq_init_attr *attrs) +{ + uint32_t max_wr, max_sges; + int chain_size, prod_size; + void *addr; + int rc; + + max_wr = attrs->attr.max_wr; + if (!max_wr) + return -EINVAL; + + max_wr = min_t(uint32_t, max_wr, cxt->max_srq_wr); + max_sges = max_wr * (cxt->sges_per_srq_wr + 1); /* +1 for header */ + chain_size = max_sges * QELR_RQE_ELEMENT_SIZE; + + rc = qelr_chain_alloc(&srq->hw_srq.chain, chain_size, + cxt->kernel_page_size, QELR_RQE_ELEMENT_SIZE); + if (rc) { + DP_ERR(cxt->dbg_fp, + "create srq: failed to map srq, got %d", rc); + return rc; + } + + prod_size = sizeof(struct rdma_srq_producers); + addr = mmap(NULL, prod_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, + 0); + if (addr == MAP_FAILED) { + DP_ERR(cxt->dbg_fp, + "create srq: failed to map producer, got %d", errno); + qelr_chain_free(&srq->hw_srq.chain); + return errno; + } + + rc = ibv_dontfork_range(addr, prod_size); + if (rc) { + munmap(addr, prod_size); + qelr_chain_free(&srq->hw_srq.chain); + return rc; + } + + srq->hw_srq.virt_prod_pair_addr = addr; + srq->hw_srq.max_sges = cxt->sges_per_srq_wr; + srq->hw_srq.max_wr = max_wr; + + return 0; +} + +struct ibv_srq *qelr_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *init_attr) +{ + struct qelr_devctx *cxt = get_qelr_ctx(pd->context); + struct qelr_create_srq req; + struct qelr_create_srq_resp resp; + struct qelr_srq *srq; + int ret; + + srq = calloc(1, sizeof(*srq)); + if (!srq) + return NULL; + + ret = qelr_create_srq_buffers(cxt, srq, init_attr); + if (ret) { + free(srq); + return NULL; + } + + pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE); + qelr_create_srq_configure_req(srq, &req); + ret = ibv_cmd_create_srq(pd, &srq->ibv_srq, init_attr, &req.ibv_cmd, + sizeof(req), &resp.ibv_resp, sizeof(resp)); + if (ret) { + qelr_destroy_srq_buffers(&srq->ibv_srq); + free(srq); + return NULL; + } + + return &srq->ibv_srq; +} + +static void qelr_free_rq(struct qelr_qp *qp) +{ + free(qp->rqe_wr_id); +} + +static void qelr_free_sq(struct qelr_qp *qp) +{ + free(qp->wqe_wr_id); +} + +static void qelr_chain_free_sq(struct qelr_qp *qp) +{ + qelr_chain_free(&qp->sq.chain); +} + +static void qelr_chain_free_rq(struct qelr_qp *qp) +{ + qelr_chain_free(&qp->rq.chain); +} + +static inline int qelr_create_qp_buffers_sq(struct qelr_devctx *cxt, + struct qelr_qp *qp, + struct ibv_qp_init_attr *attrs) +{ + uint32_t max_send_wr, max_send_sges, max_send_buf; + int chain_size; + int rc; + + /* SQ */ + max_send_wr = attrs->cap.max_send_wr; + max_send_wr = max_t(uint32_t, max_send_wr, 1); + max_send_wr = min_t(uint32_t, max_send_wr, cxt->max_send_wr); + max_send_sges = max_send_wr * cxt->sges_per_send_wr; + max_send_buf = max_send_sges * QELR_SQE_ELEMENT_SIZE; + + chain_size = max_send_buf; + rc = qelr_chain_alloc(&qp->sq.chain, chain_size, cxt->kernel_page_size, + QELR_SQE_ELEMENT_SIZE); + if (rc) + DP_ERR(cxt->dbg_fp, "create qp: failed to map SQ chain, got %d", rc); + + qp->sq.max_wr = max_send_wr; + qp->sq.max_sges = cxt->sges_per_send_wr; + + return rc; +} + +static inline int qelr_create_qp_buffers_rq(struct 
qelr_devctx *cxt, + struct qelr_qp *qp, + struct ibv_qp_init_attr *attrs) +{ + uint32_t max_recv_wr, max_recv_sges, max_recv_buf; + int chain_size; + int rc; + + /* RQ */ + max_recv_wr = attrs->cap.max_recv_wr; + max_recv_wr = max_t(uint32_t, max_recv_wr, 1); + max_recv_wr = min_t(uint32_t, max_recv_wr, cxt->max_recv_wr); + max_recv_sges = max_recv_wr * cxt->sges_per_recv_wr; + max_recv_buf = max_recv_sges * QELR_RQE_ELEMENT_SIZE; + + chain_size = max_recv_buf; + rc = qelr_chain_alloc(&qp->rq.chain, chain_size, cxt->kernel_page_size, + QELR_RQE_ELEMENT_SIZE); + if (rc) + DP_ERR(cxt->dbg_fp, "create qp: failed to map RQ chain, got %d", rc); + + qp->rq.max_wr = max_recv_wr; + qp->rq.max_sges = cxt->sges_per_recv_wr; + + return rc; +} + +static inline int qelr_create_qp_buffers(struct qelr_devctx *cxt, + struct qelr_qp *qp, + struct ibv_qp_init_attr *attrs) +{ + int rc; + + rc = qelr_create_qp_buffers_sq(cxt, qp, attrs); + if (rc) + return rc; + + rc = qelr_create_qp_buffers_rq(cxt, qp, attrs); + if (rc) { + qelr_chain_free_sq(qp); + if (qp->sq.db_rec_map) + munmap(qp->sq.db_rec_map, cxt->kernel_page_size); + return rc; + } + + return 0; +} + +static inline int qelr_configure_qp_sq(struct qelr_devctx *cxt, + struct qelr_qp *qp, + struct ibv_qp_init_attr *attrs, + struct qelr_create_qp_resp *resp) +{ + qp->sq.icid = resp->sq_icid; + qp->sq.db_data.data.icid = htole16(resp->sq_icid); + qp->sq.prod = 0; + qp->sq.db = cxt->db_addr + resp->sq_db_offset; + qp->sq.edpm_db = cxt->db_addr; + if (resp->sq_db_rec_addr) { + qp->sq.db_rec_map = mmap(NULL, cxt->kernel_page_size, + PROT_WRITE, MAP_SHARED, + cxt->ibv_ctx.context.cmd_fd, + resp->sq_db_rec_addr); + + if (qp->sq.db_rec_map == MAP_FAILED) { + int errsv = errno; + + DP_ERR(cxt->dbg_fp, + "alloc context: doorbell rec mapping failed resp.db_rec_addr = %llx size=%d context->cmd_fd=%d errno=%d\n", + resp->sq_db_rec_addr, cxt->kernel_page_size, + cxt->ibv_ctx.context.cmd_fd, errsv); + return -ENOMEM; + } + qp->sq.db_rec_addr = qp->sq.db_rec_map; + } else { + /* Kernel doesn't support doorbell recovery. Point to dummy + * location instead + */ + qp->sq.db_rec_addr = &cxt->db_rec_addr_dummy; + } + + /* shadow SQ */ + qp->sq.max_wr++; /* prod/cons method requires N+1 elements */ + qp->wqe_wr_id = calloc(qp->sq.max_wr, sizeof(*qp->wqe_wr_id)); + if (!qp->wqe_wr_id) { + DP_ERR(cxt->dbg_fp, + "create qp: failed shadow SQ memory allocation\n"); + return -ENOMEM; + } + return 0; +} + +static inline int qelr_configure_qp_rq(struct qelr_devctx *cxt, + struct qelr_qp *qp, + struct ibv_qp_init_attr *attrs, + struct qelr_create_qp_resp *resp) +{ + /* RQ */ + qp->rq.icid = resp->rq_icid; + qp->rq.db_data.data.icid = htole16(resp->rq_icid); + qp->rq.db = cxt->db_addr + resp->rq_db_offset; + qp->rq.iwarp_db2 = cxt->db_addr + resp->rq_db2_offset; + qp->rq.iwarp_db2_data.data.icid = htole16(qp->rq.icid); + qp->rq.iwarp_db2_data.data.value = htole16(DQ_TCM_IWARP_POST_RQ_CF_CMD); + qp->rq.prod = 0; + + if (resp->rq_db_rec_addr) { + qp->rq.db_rec_map = mmap(NULL, cxt->kernel_page_size, + PROT_WRITE, MAP_SHARED, + cxt->ibv_ctx.context.cmd_fd, + resp->rq_db_rec_addr); + if (qp->rq.db_rec_map == MAP_FAILED) { + int errsv = errno; + + DP_ERR(cxt->dbg_fp, + "alloc context: doorbell rec mapping failed resp.db_rec_addr = %llx size=%d context->cmd_fd=%d errno=%d\n", + resp->rq_db_rec_addr, cxt->kernel_page_size, + cxt->ibv_ctx.context.cmd_fd, errsv); + return -ENOMEM; + } + qp->rq.db_rec_addr = qp->rq.db_rec_map; + } else { + /* Kernel doesn't support doorbell recovery. 
Point to dummy + * location instead + */ + qp->rq.db_rec_addr = &cxt->db_rec_addr_dummy; + } + + /* shadow RQ */ + qp->rq.max_wr++; /* prod/cons method requires N+1 elements */ + qp->rqe_wr_id = calloc(qp->rq.max_wr, sizeof(*qp->rqe_wr_id)); + if (!qp->rqe_wr_id) { + DP_ERR(cxt->dbg_fp, + "create qp: failed shadow RQ memory allocation\n"); + return -ENOMEM; + } + + return 0; +} + +static inline int qelr_configure_qp(struct qelr_devctx *cxt, struct qelr_qp *qp, + struct ibv_qp_init_attr *attrs, + struct qelr_create_qp_resp *resp) +{ + int rc; + + /* general */ + pthread_spin_init(&qp->q_lock, PTHREAD_PROCESS_PRIVATE); + qp->qp_id = resp->qp_id; + qp->state = QELR_QPS_RST; + qp->sq_sig_all = attrs->sq_sig_all; + qp->atomic_supported = resp->atomic_supported; + + rc = qelr_configure_qp_sq(cxt, qp, attrs, resp); + if (rc) + return rc; + rc = qelr_configure_qp_rq(cxt, qp, attrs, resp); + if (rc) + qelr_free_sq(qp); + + return rc; +} + +static inline void qelr_print_qp_init_attr( + struct qelr_devctx *cxt, + struct ibv_qp_init_attr *attr) +{ + DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP, + "create qp: send_cq=%p, recv_cq=%p, srq=%p, max_inline_data=%d, max_recv_sge=%d, max_recv_wr=%d, max_send_sge=%d, max_send_wr=%d, qp_type=%d, sq_sig_all=%d\n", + attr->send_cq, attr->recv_cq, attr->srq, + attr->cap.max_inline_data, attr->cap.max_recv_sge, + attr->cap.max_recv_wr, attr->cap.max_send_sge, + attr->cap.max_send_wr, attr->qp_type, attr->sq_sig_all); +} + +static inline void +qelr_create_qp_configure_sq_req(struct qelr_qp *qp, + struct qelr_create_qp *req) +{ + req->sq_addr = (uintptr_t)qp->sq.chain.first_addr; + req->sq_len = qp->sq.chain.size; +} + +static inline void +qelr_create_qp_configure_rq_req(struct qelr_qp *qp, + struct qelr_create_qp *req) +{ + req->rq_addr = (uintptr_t)qp->rq.chain.first_addr; + req->rq_len = qp->rq.chain.size; +} + +static inline void +qelr_create_qp_configure_req(struct qelr_qp *qp, + struct qelr_create_qp *req) +{ + memset(req, 0, sizeof(*req)); + req->qp_handle_hi = U64_HI(qp); + req->qp_handle_lo = U64_LO(qp); + qelr_create_qp_configure_sq_req(qp, req); + qelr_create_qp_configure_rq_req(qp, req); +} + +struct ibv_qp *qelr_create_qp(struct ibv_pd *pd, + struct ibv_qp_init_attr *attrs) +{ + struct qelr_devctx *cxt = get_qelr_ctx(pd->context); + struct qelr_create_qp_resp resp = {}; + struct qelr_create_qp req; + struct qelr_qp *qp; + int rc; + + qelr_print_qp_init_attr(cxt, attrs); + + qp = calloc(1, sizeof(*qp)); + if (!qp) + return NULL; + + if (attrs->srq) + qp->srq = get_qelr_srq(attrs->srq); + + rc = qelr_create_qp_buffers(cxt, qp, attrs); + if (rc) + goto err0; + + qelr_create_qp_configure_req(qp, &req); + + rc = ibv_cmd_create_qp(pd, &qp->ibv_qp, attrs, &req.ibv_cmd, + sizeof(req), &resp.ibv_resp, sizeof(resp)); + if (rc) { + DP_ERR(cxt->dbg_fp, + "create qp: failed on ibv_cmd_create_qp with %d\n", rc); + goto err1; + } + + rc = qelr_configure_qp(cxt, qp, attrs, &resp); + if (rc) + goto err2; + + DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP, + "create qp: successfully created %p. handle_hi=%x handle_lo=%x\n", + qp, req.qp_handle_hi, req.qp_handle_lo); + + return &qp->ibv_qp; + +err2: + rc = ibv_cmd_destroy_qp(&qp->ibv_qp); + if (rc) + DP_ERR(cxt->dbg_fp, "create qp: fatal fault. 
rc=%d\n", rc); +err1: + qelr_chain_free_sq(qp); + qelr_chain_free_rq(qp); +err0: + free(qp); + + return NULL; +} + +static void qelr_print_ah_attr(struct qelr_devctx *cxt, struct ibv_ah_attr *attr) +{ + DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP, + "grh.dgid=[%#" PRIx64 ":%#" PRIx64 "], grh.flow_label=%d, grh.sgid_index=%d, grh.hop_limit=%d, grh.traffic_class=%d, dlid=%d, sl=%d, src_path_bits=%d, static_rate = %d, port_num=%d\n", + be64toh(attr->grh.dgid.global.interface_id), + be64toh(attr->grh.dgid.global.subnet_prefix), + attr->grh.flow_label, attr->grh.hop_limit, + attr->grh.sgid_index, attr->grh.traffic_class, attr->dlid, + attr->sl, attr->src_path_bits, + attr->static_rate, attr->port_num); +} + +static void qelr_print_qp_attr(struct qelr_devctx *cxt, struct ibv_qp_attr *attr) +{ + DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP, + "\tqp_state=%d\tcur_qp_state=%d\tpath_mtu=%d\tpath_mig_state=%d\tqkey=%d\trq_psn=%d\tsq_psn=%d\tdest_qp_num=%d\tqp_access_flags=%d\tmax_inline_data=%d\tmax_recv_sge=%d\tmax_recv_wr=%d\tmax_send_sge=%d\tmax_send_wr=%d\tpkey_index=%d\talt_pkey_index=%d\ten_sqd_async_notify=%d\tsq_draining=%d\tmax_rd_atomic=%d\tmax_dest_rd_atomic=%d\tmin_rnr_timer=%d\tport_num=%d\ttimeout=%d\tretry_cnt=%d\trnr_retry=%d\talt_port_num=%d\talt_timeout=%d\n", + attr->qp_state, attr->cur_qp_state, attr->path_mtu, + attr->path_mig_state, attr->qkey, attr->rq_psn, attr->sq_psn, + attr->dest_qp_num, attr->qp_access_flags, + attr->cap.max_inline_data, attr->cap.max_recv_sge, + attr->cap.max_recv_wr, attr->cap.max_send_sge, + attr->cap.max_send_wr, attr->pkey_index, + attr->alt_pkey_index, attr->en_sqd_async_notify, + attr->sq_draining, attr->max_rd_atomic, + attr->max_dest_rd_atomic, attr->min_rnr_timer, + attr->port_num, attr->timeout, attr->retry_cnt, + attr->rnr_retry, attr->alt_port_num, attr->alt_timeout); + + qelr_print_ah_attr(cxt, &attr->ah_attr); + qelr_print_ah_attr(cxt, &attr->alt_ah_attr); +} + +int qelr_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, struct ibv_qp_init_attr *init_attr) +{ + struct ibv_query_qp cmd; + struct qelr_devctx *cxt = get_qelr_ctx(qp->context); + int rc; + + DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP, "QP Query %p, attr_mask=0x%x\n", + get_qelr_qp(qp), attr_mask); + + rc = ibv_cmd_query_qp(qp, attr, attr_mask, + init_attr, &cmd, sizeof(cmd)); + + qelr_print_qp_attr(cxt, attr); + + return rc; +} + +static enum qelr_qp_state get_qelr_qp_state(enum ibv_qp_state qps) +{ + switch (qps) { + case IBV_QPS_RESET: + return QELR_QPS_RST; + case IBV_QPS_INIT: + return QELR_QPS_INIT; + case IBV_QPS_RTR: + return QELR_QPS_RTR; + case IBV_QPS_RTS: + return QELR_QPS_RTS; + case IBV_QPS_SQD: + return QELR_QPS_SQD; + case IBV_QPS_SQE: + return QELR_QPS_SQE; + case IBV_QPS_ERR: + default: + return QELR_QPS_ERR; + }; +} + +static void qelr_reset_qp_hwq_info(struct qelr_qp_hwq_info *q) +{ + qelr_chain_reset(&q->chain); + q->prod = 0; + q->cons = 0; + q->wqe_cons = 0; + q->db_data.data.value = 0; +} + +static int qelr_update_qp_state(struct qelr_qp *qp, + enum ibv_qp_state new_ib_state) +{ + int status = 0; + enum qelr_qp_state new_state; + + /* iWARP states are updated implicitely by driver and don't have a + * real purpose in user-lib. 
+ */ + if (IS_IWARP(qp->ibv_qp.context->device)) + return 0; + + new_state = get_qelr_qp_state(new_ib_state); + + pthread_spin_lock(&qp->q_lock); + + if (new_state == qp->state) { + pthread_spin_unlock(&qp->q_lock); + return 0; + } + + switch (qp->state) { + case QELR_QPS_RST: + switch (new_state) { + case QELR_QPS_INIT: + qp->prev_wqe_size = 0; + qelr_reset_qp_hwq_info(&qp->sq); + qelr_reset_qp_hwq_info(&qp->rq); + break; + default: + status = -EINVAL; + break; + }; + break; + case QELR_QPS_INIT: + /* INIT->XXX */ + switch (new_state) { + case QELR_QPS_RTR: + /* Update doorbell (in case post_recv was done before + * move to RTR) + */ + if (IS_ROCE(qp->ibv_qp.context->device)) { + mmio_wc_start(); + writel(qp->rq.db_data.raw, qp->rq.db); + mmio_flush_writes(); + } + break; + case QELR_QPS_ERR: + break; + default: + /* invalid state change. */ + status = -EINVAL; + break; + }; + break; + case QELR_QPS_RTR: + /* RTR->XXX */ + switch (new_state) { + case QELR_QPS_RTS: + break; + case QELR_QPS_ERR: + break; + default: + /* invalid state change. */ + status = -EINVAL; + break; + }; + break; + case QELR_QPS_RTS: + /* RTS->XXX */ + switch (new_state) { + case QELR_QPS_SQD: + case QELR_QPS_SQE: + break; + case QELR_QPS_ERR: + break; + default: + /* invalid state change. */ + status = -EINVAL; + break; + }; + break; + case QELR_QPS_SQD: + /* SQD->XXX */ + switch (new_state) { + case QELR_QPS_RTS: + case QELR_QPS_SQE: + case QELR_QPS_ERR: + break; + default: + /* invalid state change. */ + status = -EINVAL; + break; + }; + break; + case QELR_QPS_SQE: + switch (new_state) { + case QELR_QPS_RTS: + case QELR_QPS_ERR: + break; + default: + /* invalid state change. */ + status = -EINVAL; + break; + }; + break; + case QELR_QPS_ERR: + /* ERR->XXX */ + switch (new_state) { + case QELR_QPS_RST: + break; + default: + status = -EINVAL; + break; + }; + break; + default: + status = -EINVAL; + break; + }; + if (!status) + qp->state = new_state; + + pthread_spin_unlock(&qp->q_lock); + + return status; +} + +int qelr_modify_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, + int attr_mask) +{ + struct ibv_modify_qp cmd = {}; + struct qelr_qp *qp = get_qelr_qp(ibqp); + struct qelr_devctx *cxt = get_qelr_ctx(ibqp->context); + union ibv_gid sgid, *p_dgid; + int rc; + + DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP, "QP Modify %p, attr_mask=0x%x\n", + qp, attr_mask); + + qelr_print_qp_attr(cxt, attr); + + rc = ibv_cmd_modify_qp(ibqp, attr, attr_mask, &cmd, sizeof(cmd)); + if (rc) { + DP_ERR(cxt->dbg_fp, "QP Modify: Failed command. rc=%d\n", rc); + return rc; + } + + if (attr_mask & IBV_QP_STATE) { + rc = qelr_update_qp_state(qp, attr->qp_state); + DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP, + "QP Modify state %d->%d, rc=%d\n", qp->state, + attr->qp_state, rc); + if (rc) { + DP_ERR(cxt->dbg_fp, + "QP Modify: Failed to update state. rc=%d\n", + rc); + + return rc; + } + } + + /* EDPM must be disabled if GIDs match */ + if (attr_mask & IBV_QP_AV) { + rc = ibv_query_gid(ibqp->context, attr->ah_attr.port_num, + attr->ah_attr.grh.sgid_index, &sgid); + + if (!rc) { + p_dgid = &attr->ah_attr.grh.dgid; + qp->edpm_disabled = !memcmp(&sgid, p_dgid, + sizeof(sgid)); + DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP, + "QP Modify: %p, edpm_disabled=%d\n", qp, + qp->edpm_disabled); + } else { + DP_ERR(cxt->dbg_fp, + "QP Modify: Failed querying GID. 
rc=%d\n", + rc); + } + } + + return 0; +} + +int qelr_destroy_qp(struct ibv_qp *ibqp) +{ + struct qelr_devctx *cxt = get_qelr_ctx(ibqp->context); + struct qelr_qp *qp = get_qelr_qp(ibqp); + int rc = 0; + + DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP, "destroy qp: %p\n", qp); + + rc = ibv_cmd_destroy_qp(ibqp); + if (rc) { + DP_ERR(cxt->dbg_fp, + "destroy qp: failed to destroy %p, got %d.\n", qp, rc); + return rc; + } + + qelr_free_sq(qp); + qelr_free_rq(qp); + qelr_chain_free_sq(qp); + qelr_chain_free_rq(qp); + if (qp->sq.db_rec_map) + munmap(qp->sq.db_rec_map, cxt->kernel_page_size); + if (qp->rq.db_rec_map) + munmap(qp->rq.db_rec_map, cxt->kernel_page_size); + + DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP, + "destroy cq: successfully destroyed %p\n", qp); + + free(qp); + + return 0; +} + +static int sge_data_len(struct ibv_sge *sg_list, int num_sge) +{ + int i, len = 0; + + for (i = 0; i < num_sge; i++) + len += sg_list[i].length; + return len; +} + +static void swap_wqe_data64(uint64_t *p) +{ + __be64 *bep=(__be64 *)p; + int i; + + for (i = 0; i < ROCE_WQE_ELEM_SIZE / sizeof(uint64_t); i++, p++, bep++) + *bep = htobe64(*p); +} + +static inline void qelr_init_dpm_info(struct qelr_devctx *cxt, + struct qelr_qp *qp, + struct ibv_send_wr *wr, + struct qelr_dpm *dpm, + int data_size) +{ + dpm->is_edpm = 0; + dpm->is_ldpm = 0; + + /* DPM only succeeds when transmit queues are empty */ + if (!qelr_chain_is_full(&qp->sq.chain)) + return; + + /* Check if edpm can be used */ + if (wr->send_flags & IBV_SEND_INLINE && !qp->edpm_disabled && + cxt->dpm_flags & QELR_DPM_FLAGS_ENHANCED) { + memset(dpm, 0, sizeof(*dpm)); + dpm->rdma_ext = (struct qelr_rdma_ext *)&dpm->payload; + dpm->is_edpm = 1; + return; + } + + /* Check if ldpm can be used - not inline and limited to ldpm_limit */ + if (cxt->dpm_flags & QELR_DPM_FLAGS_LEGACY && + !(wr->send_flags & IBV_SEND_INLINE) && + data_size <= cxt->ldpm_limit_size) { + memset(dpm, 0, sizeof(*dpm)); + dpm->is_ldpm = 1; + } +} + +#define QELR_IB_OPCODE_SEND_ONLY 0x04 +#define QELR_IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE 0x05 +#define QELR_IB_OPCODE_RDMA_WRITE_ONLY 0x0a +#define QELR_IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE 0x0b +#define QELR_IB_OPCODE_SEND_WITH_INV 0x17 +#define QELR_IS_IMM_OR_INV(opcode) \ + (((opcode) == QELR_IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE) || \ + ((opcode) == QELR_IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE) || \ + ((opcode) == QELR_IB_OPCODE_SEND_WITH_INV)) + +static inline void qelr_edpm_set_msg_data(struct qelr_qp *qp, + struct qelr_dpm *dpm, + uint8_t opcode, + uint16_t length, + uint8_t se, + uint8_t comp) +{ + uint32_t wqe_size, dpm_size, params; + + params = 0; + wqe_size = length + (QELR_IS_IMM_OR_INV(opcode) ? sizeof(uint32_t) : 0); + dpm_size = wqe_size + sizeof(struct db_roce_dpm_data); + + SET_FIELD(params, DB_ROCE_DPM_PARAMS_DPM_TYPE, DPM_ROCE); + SET_FIELD(params, DB_ROCE_DPM_PARAMS_OPCODE, opcode); + SET_FIELD(params, DB_ROCE_DPM_PARAMS_WQE_SIZE, wqe_size); + SET_FIELD(params, DB_ROCE_DPM_PARAMS_COMPLETION_FLG, comp ? 1 : 0); + SET_FIELD(params, DB_ROCE_DPM_PARAMS_S_FLG, se ? 
1 : 0); + SET_FIELD(params, DB_ROCE_DPM_PARAMS_SIZE, + (dpm_size + sizeof(uint64_t) - 1) / sizeof(uint64_t)); + + dpm->msg.data.params.params = htole32(params); +} + +static inline void qelr_edpm_set_inv_imm(struct qelr_qp *qp, + struct qelr_dpm *dpm, + __be32 data) +{ + memcpy(&dpm->payload[dpm->payload_offset], &data, sizeof(data)); + + dpm->payload_offset += sizeof(data); + dpm->payload_size += sizeof(data); +} + +static inline void qelr_edpm_set_rdma_ext(struct qelr_qp *qp, + struct qelr_dpm *dpm, + uint64_t remote_addr, + uint32_t rkey) +{ + dpm->rdma_ext->remote_va = htobe64(remote_addr); + dpm->rdma_ext->remote_key = htobe32(rkey); + dpm->payload_offset += sizeof(*dpm->rdma_ext); + dpm->payload_size += sizeof(*dpm->rdma_ext); +} + +static inline void qelr_edpm_set_payload(struct qelr_qp *qp, + struct qelr_dpm *dpm, char *buf, + uint32_t length) +{ + memcpy(&dpm->payload[dpm->payload_offset], buf, length); + + dpm->payload_offset += length; +} + +static void qelr_prepare_sq_inline_data(struct qelr_qp *qp, + struct qelr_dpm *dpm, + int data_size, + uint8_t *wqe_size, + struct ibv_send_wr *wr, + uint8_t *bits, uint8_t bit) +{ + int i; + uint32_t seg_siz; + char *seg_prt, *wqe; + + if (!data_size) + return; + + /* set the bit */ + *bits |= bit; + + seg_prt = NULL; + wqe = NULL; + seg_siz = 0; + + /* copy data inline */ + for (i = 0; i < wr->num_sge; i++) { + uint32_t len = wr->sg_list[i].length; + void *src = (void *)(uintptr_t)wr->sg_list[i].addr; + + if (dpm->is_edpm) + qelr_edpm_set_payload(qp, dpm, src, len); + + while (len > 0) { + uint32_t cur; + + /* new segment required */ + if (!seg_siz) { + wqe = (char *)qelr_chain_produce(&qp->sq.chain); + seg_prt = wqe; + seg_siz = sizeof(struct rdma_sq_common_wqe); + (*wqe_size)++; + } + + /* calculate currently allowed length */ + cur = min(len, seg_siz); + + memcpy(seg_prt, src, cur); + + /* update segment variables */ + seg_prt += cur; + seg_siz -= cur; + /* update sge variables */ + src += cur; + len -= cur; + + /* swap fully-completed segments */ + if (!seg_siz) + swap_wqe_data64((uint64_t *)wqe); + } + } + + /* swap last not completed segment */ + if (seg_siz) + swap_wqe_data64((uint64_t *)wqe); + + if (dpm->is_edpm) { + dpm->payload_size += data_size; + + if (wr->opcode == IBV_WR_RDMA_WRITE || + wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM) + dpm->rdma_ext->dma_length = htobe32(data_size); + } +} + +static void qelr_prepare_sq_sges(struct qelr_qp *qp, + struct qelr_dpm *dpm, + uint8_t *wqe_size, + struct ibv_send_wr *wr) +{ + int i; + + for (i = 0; i < wr->num_sge; i++) { + struct rdma_sq_sge *sge = qelr_chain_produce(&qp->sq.chain); + + TYPEPTR_ADDR_SET(sge, addr, wr->sg_list[i].addr); + sge->l_key = htole32(wr->sg_list[i].lkey); + sge->length = htole32(wr->sg_list[i].length); + + if (dpm->is_ldpm) { + memcpy(&dpm->payload[dpm->payload_size], sge, + sizeof(*sge)); + dpm->payload_size += sizeof(*sge); + } + } + + if (wqe_size) + *wqe_size += wr->num_sge; +} + +static uint32_t qelr_prepare_sq_rdma_data(struct qelr_qp *qp, + struct qelr_dpm *dpm, + int data_size, + uint8_t *p_wqe_size, + struct rdma_sq_rdma_wqe_1st *rwqe, + struct rdma_sq_rdma_wqe_2nd *rwqe2, + struct ibv_send_wr *wr, + bool is_imm) +{ + memset(rwqe2, 0, sizeof(*rwqe2)); + rwqe2->r_key = htole32(wr->wr.rdma.rkey); + TYPEPTR_ADDR_SET(rwqe2, remote_va, wr->wr.rdma.remote_addr); + rwqe->length = htole32(data_size); + + if (is_imm) + rwqe->imm_data = htole32(be32toh(wr->imm_data)); + + if (wr->send_flags & IBV_SEND_INLINE && + (wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM || + 
wr->opcode == IBV_WR_RDMA_WRITE)) { + uint8_t flags = 0; + + SET_FIELD2(flags, RDMA_SQ_RDMA_WQE_1ST_INLINE_FLG, 1); + qelr_prepare_sq_inline_data(qp, dpm, data_size, + p_wqe_size, wr, + &rwqe->flags, flags); + rwqe->wqe_size = *p_wqe_size; + } else { + if (dpm->is_ldpm) + dpm->payload_size = sizeof(*rwqe) + sizeof(*rwqe2); + qelr_prepare_sq_sges(qp, dpm, p_wqe_size, wr); + rwqe->wqe_size = *p_wqe_size; + + if (dpm->is_ldpm) { + memcpy(dpm->payload, rwqe, sizeof(*rwqe)); + memcpy(&dpm->payload[sizeof(*rwqe)], rwqe2, + sizeof(*rwqe2)); + } + } + + return data_size; +} + +static uint32_t qelr_prepare_sq_send_data(struct qelr_qp *qp, + struct qelr_dpm *dpm, + int data_size, + uint8_t *p_wqe_size, + struct rdma_sq_send_wqe_1st *swqe, + struct rdma_sq_send_wqe_2st *swqe2, + struct ibv_send_wr *wr, + bool is_imm) +{ + memset(swqe2, 0, sizeof(*swqe2)); + swqe->length = htole32(data_size); + + if (is_imm) + swqe->inv_key_or_imm_data = htole32(be32toh(wr->imm_data)); + + if (wr->send_flags & IBV_SEND_INLINE) { + uint8_t flags = 0; + + SET_FIELD2(flags, RDMA_SQ_SEND_WQE_INLINE_FLG, 1); + qelr_prepare_sq_inline_data(qp, dpm, data_size, + p_wqe_size, wr, + &swqe->flags, flags); + swqe->wqe_size = *p_wqe_size; + } else { + if (dpm->is_ldpm) + dpm->payload_size = sizeof(*swqe) + sizeof(*swqe2); + + qelr_prepare_sq_sges(qp, dpm, p_wqe_size, wr); + swqe->wqe_size = *p_wqe_size; + if (dpm->is_ldpm) { + memcpy(dpm->payload, swqe, sizeof(*swqe)); + memcpy(&dpm->payload[sizeof(*swqe)], swqe2, + sizeof(*swqe2)); + } + } + + return data_size; +} + +static void qelr_prepare_sq_atom_data(struct qelr_qp *qp, + struct qelr_dpm *dpm, + struct rdma_sq_atomic_wqe_1st *awqe1, + struct rdma_sq_atomic_wqe_2nd *awqe2, + struct rdma_sq_atomic_wqe_3rd *awqe3, + struct ibv_send_wr *wr) +{ + if (dpm->is_ldpm) { + memcpy(&dpm->payload[dpm->payload_size], awqe1, sizeof(*awqe1)); + dpm->payload_size += sizeof(*awqe1); + memcpy(&dpm->payload[dpm->payload_size], awqe2, sizeof(*awqe2)); + dpm->payload_size += sizeof(*awqe2); + memcpy(&dpm->payload[dpm->payload_size], awqe3, sizeof(*awqe3)); + dpm->payload_size += sizeof(*awqe3); + } + + qelr_prepare_sq_sges(qp, dpm, NULL, wr); +} + +static inline void qelr_ldpm_prepare_data(struct qelr_qp *qp, + struct qelr_dpm *dpm) +{ + uint32_t val, params; + + /* DPM size is given in 8 bytes so we round up */ + val = dpm->payload_size + sizeof(struct db_roce_dpm_data); + val = DIV_ROUND_UP(val, sizeof(uint64_t)); + + params = 0; + SET_FIELD(params, DB_ROCE_DPM_PARAMS_SIZE, val); + SET_FIELD(params, DB_ROCE_DPM_PARAMS_DPM_TYPE, DPM_LEGACY); + + dpm->msg.data.params.params = htole32(params); +} + +static enum ibv_wc_opcode qelr_ibv_to_wc_opcode(enum ibv_wr_opcode opcode) +{ + switch (opcode) { + case IBV_WR_RDMA_WRITE: + case IBV_WR_RDMA_WRITE_WITH_IMM: + return IBV_WC_RDMA_WRITE; + case IBV_WR_SEND_WITH_IMM: + case IBV_WR_SEND: + case IBV_WR_SEND_WITH_INV: + return IBV_WC_SEND; + case IBV_WR_RDMA_READ: + return IBV_WC_RDMA_READ; + case IBV_WR_ATOMIC_CMP_AND_SWP: + return IBV_WC_COMP_SWAP; + case IBV_WR_ATOMIC_FETCH_AND_ADD: + return IBV_WC_FETCH_ADD; + default: + return IBV_WC_SEND; + } +} + +static inline void doorbell_qp(struct qelr_qp *qp) +{ + mmio_wc_start(); + writel(qp->sq.db_data.raw, qp->sq.db); + /* copy value to doorbell recovery mechanism */ + qp->sq.db_rec_addr->db_data = qp->sq.db_data.raw; + mmio_flush_writes(); +} + +static inline void doorbell_dpm_qp(struct qelr_devctx *cxt, struct qelr_qp *qp, + struct qelr_dpm *dpm) +{ + uint32_t offset = 0; + uint64_t *payload = (uint64_t 
*)dpm->payload; + uint32_t num_dwords; + int bytes = 0; + void *db_addr; + + mmio_wc_start(); + + /* Write message header */ + dpm->msg.data.icid = qp->sq.db_data.data.icid; + dpm->msg.data.prod_val = qp->sq.db_data.data.value; + db_addr = qp->sq.edpm_db; + writeq(dpm->msg.raw, db_addr); + + /* Write message body */ + bytes += sizeof(uint64_t); + num_dwords = DIV_ROUND_UP(dpm->payload_size, sizeof(uint64_t)); + + db_addr += sizeof(dpm->msg.data); + + if (bytes == cxt->edpm_trans_size) { + mmio_flush_writes(); + bytes = 0; + } + + while (offset < num_dwords) { + /* endianness differs between the FW and the DORQ HW block */ + if (dpm->is_ldpm) + mmio_write64_be(db_addr, htobe64(payload[offset])); + else /* EDPM */ + mmio_write64(db_addr, payload[offset]); + + bytes += sizeof(uint64_t); + db_addr += sizeof(uint64_t); + + /* Writing to a wc bar. We need to flush the writes every + * edpm transaction size otherwise the CPU could optimize away + * the duplicate stores. + */ + if (bytes == cxt->edpm_trans_size) { + mmio_flush_writes(); + bytes = 0; + } + offset++; + } + + mmio_flush_writes(); +} + +static inline int qelr_can_post_send(struct qelr_devctx *cxt, + struct qelr_qp *qp, + struct ibv_send_wr *wr, + int data_size) +{ + /* Invalid WR */ + if (wr->num_sge > qp->sq.max_sges) { + DP_ERR(cxt->dbg_fp, + "error: WR is bad. Post send on QP %p failed\n", + qp); + return -EINVAL; + } + + /* WR overflow */ + if (qelr_wq_is_full(&qp->sq)) { + DP_ERR(cxt->dbg_fp, + "error: WQ is full. Post send on QP %p failed (this error appears only once)\n", + qp); + return -ENOMEM; + } + + /* WQE overflow */ + if (qelr_chain_get_elem_left_u32(&qp->sq.chain) < + QELR_MAX_SQ_WQE_SIZE) { + DP_ERR(cxt->dbg_fp, + "error: WQ PBL is full. Post send on QP %p failed (this error appears only once)\n", + qp); + return -ENOMEM; + } + + if ((wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP || + wr->opcode == IBV_WR_ATOMIC_FETCH_AND_ADD) && + !qp->atomic_supported) { + DP_ERR(cxt->dbg_fp, "Atomic not supported on this machine\n"); + return -EINVAL; + } + + if ((wr->send_flags & IBV_SEND_INLINE) && + (data_size > ROCE_REQ_MAX_INLINE_DATA_SIZE)) { + DP_ERR(cxt->dbg_fp, "Too much inline data in WR: %d\n", data_size); + return -EINVAL; + } + + return 0; +} + +static int __qelr_post_send(struct qelr_devctx *cxt, struct qelr_qp *qp, + struct ibv_send_wr *wr, int data_size, + int *normal_db_required) +{ + uint8_t se, comp, fence; + struct rdma_sq_common_wqe *wqe; + struct rdma_sq_send_wqe_1st *swqe; + struct rdma_sq_send_wqe_2st *swqe2; + struct rdma_sq_rdma_wqe_1st *rwqe; + struct rdma_sq_rdma_wqe_2nd *rwqe2; + struct rdma_sq_atomic_wqe_1st *awqe1; + struct rdma_sq_atomic_wqe_2nd *awqe2; + struct rdma_sq_atomic_wqe_3rd *awqe3; + struct qelr_dpm dpm; + uint32_t wqe_length; + uint8_t wqe_size; + uint16_t db_val; + int rc = 0; + + qelr_init_dpm_info(cxt, qp, wr, &dpm, data_size); + + wqe = qelr_chain_produce(&qp->sq.chain); + + comp = (!!(wr->send_flags & IBV_SEND_SIGNALED)) || + (!!qp->sq_sig_all); + qp->wqe_wr_id[qp->sq.prod].signaled = comp; + + /* common fields */ + wqe->flags = 0; + se = !!(wr->send_flags & IBV_SEND_SOLICITED); + fence = !!(wr->send_flags & IBV_SEND_FENCE); + SET_FIELD2(wqe->flags, RDMA_SQ_COMMON_WQE_SE_FLG, se); + SET_FIELD2(wqe->flags, RDMA_SQ_COMMON_WQE_COMP_FLG, comp); + SET_FIELD2(wqe->flags, RDMA_SQ_COMMON_WQE_RD_FENCE_FLG, fence); + wqe->prev_wqe_size = qp->prev_wqe_size; + + qp->wqe_wr_id[qp->sq.prod].opcode = qelr_ibv_to_wc_opcode(wr->opcode); + + switch (wr->opcode) { + case IBV_WR_SEND_WITH_IMM: + wqe->req_type =
RDMA_SQ_REQ_TYPE_SEND_WITH_IMM; + swqe = (struct rdma_sq_send_wqe_1st *)wqe; + + wqe_size = sizeof(struct rdma_sq_send_wqe) / RDMA_WQE_BYTES; + swqe2 = (struct rdma_sq_send_wqe_2st *)qelr_chain_produce(&qp->sq.chain); + + if (dpm.is_edpm) + qelr_edpm_set_inv_imm(qp, &dpm, wr->imm_data); + + wqe_length = qelr_prepare_sq_send_data(qp, &dpm, data_size, + &wqe_size, swqe, swqe2, + wr, 1 /* Imm */); + + if (dpm.is_edpm) + qelr_edpm_set_msg_data(qp, &dpm, + QELR_IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE, + wqe_length, se, comp); + else if (dpm.is_ldpm) + qelr_ldpm_prepare_data(qp, &dpm); + + qp->wqe_wr_id[qp->sq.prod].wqe_size = wqe_size; + qp->prev_wqe_size = wqe_size; + qp->wqe_wr_id[qp->sq.prod].bytes_len = wqe_length; + break; + + case IBV_WR_SEND: + wqe->req_type = RDMA_SQ_REQ_TYPE_SEND; + swqe = (struct rdma_sq_send_wqe_1st *)wqe; + + wqe_size = sizeof(struct rdma_sq_send_wqe) / RDMA_WQE_BYTES; + swqe2 = (struct rdma_sq_send_wqe_2st *)qelr_chain_produce(&qp->sq.chain); + wqe_length = qelr_prepare_sq_send_data(qp, &dpm, data_size, + &wqe_size, swqe, swqe2, + wr, 0); + if (dpm.is_edpm) + qelr_edpm_set_msg_data(qp, &dpm, + QELR_IB_OPCODE_SEND_ONLY, + wqe_length, se, comp); + else if (dpm.is_ldpm) + qelr_ldpm_prepare_data(qp, &dpm); + + qp->wqe_wr_id[qp->sq.prod].wqe_size = wqe_size; + qp->prev_wqe_size = wqe_size; + qp->wqe_wr_id[qp->sq.prod].bytes_len = wqe_length; + break; + + case IBV_WR_SEND_WITH_INV: + wqe->req_type = RDMA_SQ_REQ_TYPE_SEND_WITH_INVALIDATE; + swqe = (struct rdma_sq_send_wqe_1st *)wqe; + + wqe_size = sizeof(struct rdma_sq_send_wqe) / RDMA_WQE_BYTES; + swqe2 = qelr_chain_produce(&qp->sq.chain); + + if (dpm.is_edpm) + qelr_edpm_set_inv_imm(qp, &dpm, + htobe32(wr->invalidate_rkey)); + + swqe->inv_key_or_imm_data = htole32(wr->invalidate_rkey); + + wqe_length = qelr_prepare_sq_send_data(qp, &dpm, data_size, + &wqe_size, swqe, swqe2, + wr, 0); + + if (dpm.is_edpm) + qelr_edpm_set_msg_data(qp, &dpm, + QELR_IB_OPCODE_SEND_WITH_INV, + wqe_length, se, comp); + else if (dpm.is_ldpm) + qelr_ldpm_prepare_data(qp, &dpm); + + qp->wqe_wr_id[qp->sq.prod].wqe_size = wqe_size; + qp->prev_wqe_size = wqe_size; + qp->wqe_wr_id[qp->sq.prod].bytes_len = wqe_length; + + break; + + case IBV_WR_RDMA_WRITE_WITH_IMM: + wqe->req_type = RDMA_SQ_REQ_TYPE_RDMA_WR_WITH_IMM; + rwqe = (struct rdma_sq_rdma_wqe_1st *)wqe; + + wqe_size = sizeof(struct rdma_sq_rdma_wqe) / RDMA_WQE_BYTES; + rwqe2 = (struct rdma_sq_rdma_wqe_2nd *)qelr_chain_produce(&qp->sq.chain); + if (dpm.is_edpm) { + qelr_edpm_set_rdma_ext(qp, &dpm, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + qelr_edpm_set_inv_imm(qp, &dpm, wr->imm_data); + } + + wqe_length = qelr_prepare_sq_rdma_data(qp, &dpm, data_size, &wqe_size, + rwqe, rwqe2, wr, 1 /* Imm */); + if (dpm.is_edpm) + qelr_edpm_set_msg_data(qp, &dpm, + QELR_IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE, + wqe_length + sizeof(*dpm.rdma_ext), + se, comp); + else if (dpm.is_ldpm) + qelr_ldpm_prepare_data(qp, &dpm); + + qp->wqe_wr_id[qp->sq.prod].wqe_size = wqe_size; + qp->prev_wqe_size = wqe_size; + qp->wqe_wr_id[qp->sq.prod].bytes_len = wqe_length; + break; + + case IBV_WR_RDMA_WRITE: + wqe->req_type = RDMA_SQ_REQ_TYPE_RDMA_WR; + rwqe = (struct rdma_sq_rdma_wqe_1st *)wqe; + + wqe_size = sizeof(struct rdma_sq_rdma_wqe) / RDMA_WQE_BYTES; + rwqe2 = (struct rdma_sq_rdma_wqe_2nd *)qelr_chain_produce(&qp->sq.chain); + if (dpm.is_edpm) + qelr_edpm_set_rdma_ext(qp, &dpm, + wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + + wqe_length = qelr_prepare_sq_rdma_data(qp, &dpm, data_size, &wqe_size, + rwqe, rwqe2, wr, 
0); + if (dpm.is_edpm) + qelr_edpm_set_msg_data(qp, &dpm, + QELR_IB_OPCODE_RDMA_WRITE_ONLY, + wqe_length + + sizeof(*dpm.rdma_ext), + se, comp); + else if (dpm.is_ldpm) + qelr_ldpm_prepare_data(qp, &dpm); + + qp->wqe_wr_id[qp->sq.prod].wqe_size = wqe_size; + qp->prev_wqe_size = wqe_size; + qp->wqe_wr_id[qp->sq.prod].bytes_len = wqe_length; + break; + + case IBV_WR_RDMA_READ: + wqe->req_type = RDMA_SQ_REQ_TYPE_RDMA_RD; + rwqe = (struct rdma_sq_rdma_wqe_1st *)wqe; + + wqe_size = sizeof(struct rdma_sq_rdma_wqe) / RDMA_WQE_BYTES; + rwqe2 = (struct rdma_sq_rdma_wqe_2nd *)qelr_chain_produce(&qp->sq.chain); + wqe_length = qelr_prepare_sq_rdma_data(qp, &dpm, data_size, &wqe_size, + rwqe, rwqe2, wr, 0); + if (dpm.is_ldpm) + qelr_ldpm_prepare_data(qp, &dpm); + + qp->wqe_wr_id[qp->sq.prod].wqe_size = wqe_size; + qp->prev_wqe_size = wqe_size; + qp->wqe_wr_id[qp->sq.prod].bytes_len = wqe_length; + break; + + case IBV_WR_ATOMIC_CMP_AND_SWP: + case IBV_WR_ATOMIC_FETCH_AND_ADD: + awqe1 = (struct rdma_sq_atomic_wqe_1st *)wqe; + awqe1->wqe_size = 4; + + awqe2 = (struct rdma_sq_atomic_wqe_2nd *)qelr_chain_produce(&qp->sq.chain); + TYPEPTR_ADDR_SET(awqe2, remote_va, wr->wr.atomic.remote_addr); + awqe2->r_key = htole32(wr->wr.atomic.rkey); + + awqe3 = (struct rdma_sq_atomic_wqe_3rd *)qelr_chain_produce(&qp->sq.chain); + + if (wr->opcode == IBV_WR_ATOMIC_FETCH_AND_ADD) { + wqe->req_type = RDMA_SQ_REQ_TYPE_ATOMIC_ADD; + TYPEPTR_ADDR_SET(awqe3, swap_data, wr->wr.atomic.compare_add); + } else { + wqe->req_type = RDMA_SQ_REQ_TYPE_ATOMIC_CMP_AND_SWAP; + TYPEPTR_ADDR_SET(awqe3, swap_data, wr->wr.atomic.swap); + TYPEPTR_ADDR_SET(awqe3, cmp_data, wr->wr.atomic.compare_add); + } + + qelr_prepare_sq_atom_data(qp, &dpm, awqe1, awqe2, awqe3, wr); + if (dpm.is_ldpm) + qelr_ldpm_prepare_data(qp, &dpm); + qp->wqe_wr_id[qp->sq.prod].wqe_size = awqe1->wqe_size; + qp->prev_wqe_size = awqe1->wqe_size; + + break; + + default: + /* restore prod to its position before this WR was processed */ + qelr_chain_set_prod(&qp->sq.chain, + le16toh(qp->sq.db_data.data.value), + wqe); + + /* restore prev_wqe_size */ + qp->prev_wqe_size = wqe->prev_wqe_size; + + rc = -EINVAL; + DP_ERR(cxt->dbg_fp, + "Invalid opcode %d in work request on QP %p\n", + wr->opcode, qp); + break; + } + + if (rc) + return rc; + + qp->wqe_wr_id[qp->sq.prod].wr_id = wr->wr_id; + qelr_inc_sw_prod_u16(&qp->sq); + db_val = le16toh(qp->sq.db_data.data.value) + 1; + qp->sq.db_data.data.value = htole16(db_val); + + if (dpm.is_edpm || dpm.is_ldpm) { + doorbell_dpm_qp(cxt, qp, &dpm); + *normal_db_required = 0; + } else { + *normal_db_required = 1; + } + + return 0; +} + +int qelr_post_send(struct ibv_qp *ib_qp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr) +{ + struct qelr_devctx *cxt = get_qelr_ctx(ib_qp->context); + struct qelr_qp *qp = get_qelr_qp(ib_qp); + int doorbell_required = 0; + *bad_wr = NULL; + int rc = 0; + + pthread_spin_lock(&qp->q_lock); + + if (IS_ROCE(ib_qp->context->device) && + (qp->state != QELR_QPS_RTS && qp->state != QELR_QPS_ERR && + qp->state != QELR_QPS_SQD)) { + pthread_spin_unlock(&qp->q_lock); + *bad_wr = wr; + return -EINVAL; + } + + while (wr) { + int data_size = sge_data_len(wr->sg_list, wr->num_sge); + + rc = qelr_can_post_send(cxt, qp, wr, data_size); + if (rc) { + *bad_wr = wr; + break; + } + + rc = __qelr_post_send(cxt, qp, wr, data_size, &doorbell_required); + if (rc) { + *bad_wr = wr; + break; + } + + wr = wr->next; + } + + if (doorbell_required) + doorbell_qp(qp); + + pthread_spin_unlock(&qp->q_lock); + + return rc; +} + +static 
uint32_t qelr_srq_elem_left(struct qelr_srq_hwq_info *hw_srq) +{ + uint32_t used; + + /* Calculate number of elements used based on producer + * count and consumer count and subtract it from max + * work request supported so that we get elements left. + */ + used = (uint32_t)(((uint64_t)((uint64_t)~0U) + 1 + + (uint64_t)(hw_srq->wr_prod_cnt)) - + (uint64_t)hw_srq->wr_cons_cnt); + + return hw_srq->max_wr - used; +} + +int qelr_post_srq_recv(struct ibv_srq *ibsrq, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct qelr_devctx *cxt = get_qelr_ctx(ibsrq->context); + struct qelr_srq *srq = get_qelr_srq(ibsrq); + struct qelr_srq_hwq_info *hw_srq = &srq->hw_srq; + struct qelr_chain *chain; + int status = 0; + + pthread_spin_lock(&srq->lock); + + chain = &srq->hw_srq.chain; + while (wr) { + struct rdma_srq_wqe_header *hdr; + int i; + + if (!qelr_srq_elem_left(hw_srq) || + wr->num_sge > srq->hw_srq.max_sges) { + DP_ERR(cxt->dbg_fp, + "Can't post WR (%d,%d) || (%d > %d)\n", + hw_srq->wr_prod_cnt, hw_srq->wr_cons_cnt, + wr->num_sge, + srq->hw_srq.max_sges); + status = -ENOMEM; + *bad_wr = wr; + break; + } + + hdr = qelr_chain_produce(chain); + + SRQ_HDR_SET(hdr, wr->wr_id, wr->num_sge); + + hw_srq->wr_prod_cnt++; + hw_srq->wqe_prod++; + hw_srq->sge_prod++; + + DP_VERBOSE(cxt->dbg_fp, QELR_MSG_SRQ, + "SRQ WR: SGEs: %d with wr_id[%d] = %" PRIx64 "\n", + wr->num_sge, hw_srq->wqe_prod, wr->wr_id); + + for (i = 0; i < wr->num_sge; i++) { + struct rdma_srq_sge *srq_sge; + + srq_sge = qelr_chain_produce(chain); + SRQ_SGE_SET(srq_sge, wr->sg_list[i].addr, + wr->sg_list[i].length, wr->sg_list[i].lkey); + + DP_VERBOSE(cxt->dbg_fp, QELR_MSG_SRQ, + "[%d]: len %d key %x addr %x:%x\n", + i, srq_sge->length, srq_sge->l_key, + srq_sge->addr.hi, srq_sge->addr.lo); + hw_srq->sge_prod++; + } + + /* Make sure that descriptors are written before we update + * producers. 
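+		 * The write barrier that follows orders the WQE header and
+		 * SGE stores above ahead of the producer-index stores below,
+		 * so the device never fetches a descriptor whose contents
+		 * are not yet visible in host memory.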
+ */ + + udma_ordering_write_barrier(); + + struct rdma_srq_producers *virt_prod; + + virt_prod = srq->hw_srq.virt_prod_pair_addr; + virt_prod->sge_prod = htole32(hw_srq->sge_prod); + virt_prod->wqe_prod = htole32(hw_srq->wqe_prod); + + wr = wr->next; + } + + DP_VERBOSE(cxt->dbg_fp, QELR_MSG_SRQ, + "POST: Elements in SRQ: %d\n", + qelr_chain_get_elem_left_u32(chain)); + pthread_spin_unlock(&srq->lock); + + return status; +} + +int qelr_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + int status = 0; + struct qelr_qp *qp = get_qelr_qp(ibqp); + struct qelr_devctx *cxt = get_qelr_ctx(ibqp->context); + uint16_t db_val; + uint8_t iwarp = IS_IWARP(ibqp->context->device); + + if (unlikely(qp->srq)) { + DP_ERR(cxt->dbg_fp, + "QP is associated with SRQ, cannot post RQ buffers\n"); + *bad_wr = wr; + return -EINVAL; + } + + pthread_spin_lock(&qp->q_lock); + + if (!iwarp && qp->state == QELR_QPS_RST) { + pthread_spin_unlock(&qp->q_lock); + *bad_wr = wr; + return -EINVAL; + } + + while (wr) { + int i; + + if (qelr_chain_get_elem_left_u32(&qp->rq.chain) < + QELR_MAX_RQ_WQE_SIZE || wr->num_sge > qp->rq.max_sges) { + DP_ERR(cxt->dbg_fp, + "Can't post WR (%d < %d) || (%d > %d)\n", + qelr_chain_get_elem_left_u32(&qp->rq.chain), + QELR_MAX_RQ_WQE_SIZE, wr->num_sge, + qp->rq.max_sges); + status = -ENOMEM; + *bad_wr = wr; + break; + } + for (i = 0; i < wr->num_sge; i++) { + uint32_t flags = 0; + struct rdma_rq_sge *rqe; + + /* first one must include the number of SGE in the + * list + */ + if (!i) + SET_FIELD(flags, RDMA_RQ_SGE_NUM_SGES, + wr->num_sge); + + SET_FIELD(flags, RDMA_RQ_SGE_L_KEY, + wr->sg_list[i].lkey); + rqe = qelr_chain_produce(&qp->rq.chain); + RQ_SGE_SET(rqe, wr->sg_list[i].addr, + wr->sg_list[i].length, flags); + } + /* Special case of no sges. FW requires between 1-4 sges... + * in this case we need to post 1 sge with length zero. this is + * because rdma write with immediate consumes an RQ. 
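+		 * For example, an incoming RDMA WRITE with immediate data
+		 * completes one RQ WQE even though it places no data in the
+		 * posted receive buffers, so a valid (zero-length) SGE must
+		 * still be present for the FW to consume.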
+ */ + if (!wr->num_sge) { + uint32_t flags = 0; + struct rdma_rq_sge *rqe; + + /* first one must include the number of SGE in the + * list + */ + SET_FIELD(flags, RDMA_RQ_SGE_L_KEY, 0); + SET_FIELD(flags, RDMA_RQ_SGE_NUM_SGES, 1); + + rqe = qelr_chain_produce(&qp->rq.chain); + RQ_SGE_SET(rqe, 0, 0, flags); + i = 1; + } + + qp->rqe_wr_id[qp->rq.prod].wr_id = wr->wr_id; + qp->rqe_wr_id[qp->rq.prod].wqe_size = i; + + qelr_inc_sw_prod_u16(&qp->rq); + + mmio_wc_start(); + + db_val = le16toh(qp->rq.db_data.data.value) + 1; + qp->rq.db_data.data.value = htole16(db_val); + + writel(qp->rq.db_data.raw, qp->rq.db); + /* copy value to doorbell recovery mechanism */ + qp->rq.db_rec_addr->db_data = qp->rq.db_data.raw; + mmio_flush_writes(); + + if (iwarp) { + writel(qp->rq.iwarp_db2_data.raw, qp->rq.iwarp_db2); + mmio_flush_writes(); + } + wr = wr->next; + } + + pthread_spin_unlock(&qp->q_lock); + + return status; +} + +static int is_valid_cqe(struct qelr_cq *cq, union rdma_cqe *cqe) +{ + struct rdma_cqe_requester *resp_cqe = &cqe->req; + + return (resp_cqe->flags & RDMA_CQE_REQUESTER_TOGGLE_BIT_MASK) == + cq->chain_toggle; +} + +static enum rdma_cqe_type cqe_get_type(union rdma_cqe *cqe) +{ + struct rdma_cqe_requester *resp_cqe = &cqe->req; + + return GET_FIELD(resp_cqe->flags, RDMA_CQE_REQUESTER_TYPE); +} + +static struct qelr_qp *cqe_get_qp(union rdma_cqe *cqe) +{ + struct regpair *qph = &cqe->req.qp_handle; + + return (struct qelr_qp *)HILO_U64(le32toh(qph->hi), le32toh(qph->lo)); +} + +static int process_req(struct qelr_qp *qp, struct qelr_cq *cq, int num_entries, + struct ibv_wc *wc, uint16_t hw_cons, + enum ibv_wc_status status, int force) +{ + struct qelr_devctx *cxt = get_qelr_ctx(qp->ibv_qp.context); + uint16_t cnt = 0; + + while (num_entries && qp->sq.wqe_cons != hw_cons) { + if (!qp->wqe_wr_id[qp->sq.cons].signaled && !force) { + /* skip WC */ + goto next_cqe; + } + + /* fill WC */ + wc->status = status; + wc->wc_flags = 0; + wc->qp_num = qp->qp_id; + + /* common section */ + wc->wr_id = qp->wqe_wr_id[qp->sq.cons].wr_id; + wc->opcode = qp->wqe_wr_id[qp->sq.cons].opcode; + + switch (wc->opcode) { + case IBV_WC_RDMA_WRITE: + wc->byte_len = qp->wqe_wr_id[qp->sq.cons].bytes_len; + DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ, + "POLL REQ CQ: IBV_WC_RDMA_WRITE byte_len=%d\n", + qp->wqe_wr_id[qp->sq.cons].bytes_len); + break; + case IBV_WC_COMP_SWAP: + case IBV_WC_FETCH_ADD: + wc->byte_len = 8; + break; + case IBV_WC_RDMA_READ: + case IBV_WC_SEND: + case IBV_WC_BIND_MW: + wc->byte_len = qp->wqe_wr_id[qp->sq.cons].bytes_len; + DP_VERBOSE(cxt->dbg_fp, QELR_MSG_CQ, + "POLL REQ CQ: IBV_WC_RDMA_READ / IBV_WC_SEND\n"); + break; + default: + break; + } + + num_entries--; + wc++; + cnt++; +next_cqe: + while (qp->wqe_wr_id[qp->sq.cons].wqe_size--) + qelr_chain_consume(&qp->sq.chain); + qelr_inc_sw_cons_u16(&qp->sq); + } + + return cnt; +} + +static int qelr_poll_cq_req(struct qelr_qp *qp, struct qelr_cq *cq, + int num_entries, struct ibv_wc *wc, + struct rdma_cqe_requester *req) +{ + struct qelr_devctx *cxt = get_qelr_ctx(qp->ibv_qp.context); + uint16_t sq_cons = le16toh(req->sq_cons); + int cnt = 0; + + switch (req->status) { + case RDMA_CQE_REQ_STS_OK: + cnt = process_req(qp, cq, num_entries, wc, sq_cons, + IBV_WC_SUCCESS, 0); + break; + case RDMA_CQE_REQ_STS_WORK_REQUEST_FLUSHED_ERR: + DP_ERR(cxt->dbg_fp, + "Error: POLL CQ with ROCE_CQE_REQ_STS_WORK_REQUEST_FLUSHED_ERR. 
QP icid=0x%x\n", + qp->sq.icid); + cnt = process_req(qp, cq, num_entries, wc, sq_cons, + IBV_WC_WR_FLUSH_ERR, 1); + break; + default: /* other errors case */ + /* process all WQE before the consumer */ + qp->state = QELR_QPS_ERR; + cnt = process_req(qp, cq, num_entries, wc, sq_cons - 1, + IBV_WC_SUCCESS, 0); + wc += cnt; + /* if we have extra WC fill it with actual error info */ + if (cnt < num_entries) { + enum ibv_wc_status wc_status; + + switch (req->status) { + case RDMA_CQE_REQ_STS_BAD_RESPONSE_ERR: + DP_ERR(cxt->dbg_fp, + "Error: POLL CQ with RDMA_CQE_REQ_STS_BAD_RESPONSE_ERR. QP icid=0x%x\n", + qp->sq.icid); + wc_status = IBV_WC_BAD_RESP_ERR; + break; + case RDMA_CQE_REQ_STS_LOCAL_LENGTH_ERR: + DP_ERR(cxt->dbg_fp, + "Error: POLL CQ with RDMA_CQE_REQ_STS_LOCAL_LENGTH_ERR. QP icid=0x%x\n", + qp->sq.icid); + wc_status = IBV_WC_LOC_LEN_ERR; + break; + case RDMA_CQE_REQ_STS_LOCAL_QP_OPERATION_ERR: + DP_ERR(cxt->dbg_fp, + "Error: POLL CQ with RDMA_CQE_REQ_STS_LOCAL_QP_OPERATION_ERR. QP icid=0x%x\n", + qp->sq.icid); + wc_status = IBV_WC_LOC_QP_OP_ERR; + break; + case RDMA_CQE_REQ_STS_LOCAL_PROTECTION_ERR: + DP_ERR(cxt->dbg_fp, + "Error: POLL CQ with RDMA_CQE_REQ_STS_LOCAL_PROTECTION_ERR. QP icid=0x%x\n", + qp->sq.icid); + wc_status = IBV_WC_LOC_PROT_ERR; + break; + case RDMA_CQE_REQ_STS_MEMORY_MGT_OPERATION_ERR: + DP_ERR(cxt->dbg_fp, + "Error: POLL CQ with RDMA_CQE_REQ_STS_MEMORY_MGT_OPERATION_ERR. QP icid=0x%x\n", + qp->sq.icid); + wc_status = IBV_WC_MW_BIND_ERR; + break; + case RDMA_CQE_REQ_STS_REMOTE_INVALID_REQUEST_ERR: + DP_ERR(cxt->dbg_fp, + "Error: POLL CQ with RDMA_CQE_REQ_STS_REMOTE_INVALID_REQUEST_ERR. QP icid=0x%x\n", + qp->sq.icid); + wc_status = IBV_WC_REM_INV_REQ_ERR; + break; + case RDMA_CQE_REQ_STS_REMOTE_ACCESS_ERR: + DP_ERR(cxt->dbg_fp, + "Error: POLL CQ with RDMA_CQE_REQ_STS_REMOTE_ACCESS_ERR. QP icid=0x%x\n", + qp->sq.icid); + wc_status = IBV_WC_REM_ACCESS_ERR; + break; + case RDMA_CQE_REQ_STS_REMOTE_OPERATION_ERR: + DP_ERR(cxt->dbg_fp, + "Error: POLL CQ with RDMA_CQE_REQ_STS_REMOTE_OPERATION_ERR. QP icid=0x%x\n", + qp->sq.icid); + wc_status = IBV_WC_REM_OP_ERR; + break; + case RDMA_CQE_REQ_STS_RNR_NAK_RETRY_CNT_ERR: + DP_ERR(cxt->dbg_fp, + "Error: POLL CQ with RDMA_CQE_REQ_STS_RNR_NAK_RETRY_CNT_ERR. QP icid=0x%x\n", + qp->sq.icid); + wc_status = IBV_WC_RNR_RETRY_EXC_ERR; + break; + case RDMA_CQE_REQ_STS_TRANSPORT_RETRY_CNT_ERR: + DP_ERR(cxt->dbg_fp, + "RDMA_CQE_REQ_STS_TRANSPORT_RETRY_CNT_ERR. QP icid=0x%x\n", + qp->sq.icid); + wc_status = IBV_WC_RETRY_EXC_ERR; + break; + default: + DP_ERR(cxt->dbg_fp, + "IBV_WC_GENERAL_ERR. 
QP icid=0x%x\n", + qp->sq.icid); + wc_status = IBV_WC_GENERAL_ERR; + } + + cnt += process_req(qp, cq, 1, wc, sq_cons, wc_status, + 1 /* force use of WC */); + } + } + + return cnt; +} + +static void __process_resp_one(struct qelr_qp *qp, struct qelr_cq *cq, + struct ibv_wc *wc, + struct rdma_cqe_responder *resp, uint64_t wr_id) +{ + struct qelr_devctx *cxt = get_qelr_ctx(qp->ibv_qp.context); + enum ibv_wc_status wc_status = IBV_WC_SUCCESS; + uint8_t flags; + + wc->opcode = IBV_WC_RECV; + wc->wr_id = wr_id; + wc->wc_flags = 0; + switch (resp->status) { + case RDMA_CQE_RESP_STS_LOCAL_ACCESS_ERR: + wc_status = IBV_WC_LOC_ACCESS_ERR; + break; + case RDMA_CQE_RESP_STS_LOCAL_LENGTH_ERR: + wc_status = IBV_WC_LOC_LEN_ERR; + break; + case RDMA_CQE_RESP_STS_LOCAL_QP_OPERATION_ERR: + wc_status = IBV_WC_LOC_QP_OP_ERR; + break; + case RDMA_CQE_RESP_STS_LOCAL_PROTECTION_ERR: + wc_status = IBV_WC_LOC_PROT_ERR; + break; + case RDMA_CQE_RESP_STS_MEMORY_MGT_OPERATION_ERR: + wc_status = IBV_WC_MW_BIND_ERR; + break; + case RDMA_CQE_RESP_STS_REMOTE_INVALID_REQUEST_ERR: + wc_status = IBV_WC_REM_INV_RD_REQ_ERR; + break; + case RDMA_CQE_RESP_STS_OK: + wc_status = IBV_WC_SUCCESS; + wc->byte_len = le32toh(resp->length); + + flags = resp->flags & QELR_RESP_RDMA_IMM; + + switch (flags) { + case QELR_RESP_RDMA_IMM: + /* update opcode */ + wc->opcode = IBV_WC_RECV_RDMA_WITH_IMM; + SWITCH_FALLTHROUGH; + case QELR_RESP_IMM: + wc->imm_data = htobe32(le32toh(resp->imm_data_or_inv_r_Key)); + wc->wc_flags |= IBV_WC_WITH_IMM; + break; + case QELR_RESP_INV: + wc->invalidated_rkey = le32toh(resp->imm_data_or_inv_r_Key); + wc->wc_flags |= IBV_WC_WITH_INV; + break; + case QELR_RESP_RDMA: + DP_ERR(cxt->dbg_fp, "Invalid flags detected\n"); + break; + default: + /* valid configuration, but nothing to do here */ + break; + } + + break; + default: + wc->status = IBV_WC_GENERAL_ERR; + DP_ERR(cxt->dbg_fp, "Invalid CQE status detected\n"); + } + + /* fill WC */ + wc->status = wc_status; + wc->qp_num = qp->qp_id; +} + +static int process_resp_one_srq(struct qelr_qp *qp, struct qelr_cq *cq, + struct ibv_wc *wc, + struct rdma_cqe_responder *resp) +{ + struct qelr_srq_hwq_info *hw_srq = &qp->srq->hw_srq; + uint64_t wr_id; + + wr_id = (((uint64_t)(le32toh(resp->srq_wr_id.hi))) << 32) + + le32toh(resp->srq_wr_id.lo); + + if (resp->status == RDMA_CQE_RESP_STS_WORK_REQUEST_FLUSHED_ERR) { + wc->byte_len = 0; + wc->status = IBV_WC_WR_FLUSH_ERR; + wc->qp_num = qp->qp_id; + wc->wr_id = wr_id; + } else { + __process_resp_one(qp, cq, wc, resp, wr_id); + } + + hw_srq->wr_cons_cnt++; + + return 1; +} + +static int process_resp_one(struct qelr_qp *qp, struct qelr_cq *cq, + struct ibv_wc *wc, struct rdma_cqe_responder *resp) +{ + uint64_t wr_id = qp->rqe_wr_id[qp->rq.cons].wr_id; + + __process_resp_one(qp, cq, wc, resp, wr_id); + + while (qp->rqe_wr_id[qp->rq.cons].wqe_size--) + qelr_chain_consume(&qp->rq.chain); + + qelr_inc_sw_cons_u16(&qp->rq); + + return 1; +} + +static int process_resp_flush(struct qelr_qp *qp, struct qelr_cq *cq, + int num_entries, struct ibv_wc *wc, + uint16_t hw_cons) +{ + uint16_t cnt = 0; + + while (num_entries && qp->rq.wqe_cons != hw_cons) { + /* fill WC */ + wc->status = IBV_WC_WR_FLUSH_ERR; + wc->qp_num = qp->qp_id; + wc->byte_len = 0; + wc->wr_id = qp->rqe_wr_id[qp->rq.cons].wr_id; + num_entries--; + wc++; + cnt++; + while (qp->rqe_wr_id[qp->rq.cons].wqe_size--) + qelr_chain_consume(&qp->rq.chain); + qelr_inc_sw_cons_u16(&qp->rq); + } + + return cnt; +} + +/* return latest CQE (needs processing) */ +static union rdma_cqe 
*get_cqe(struct qelr_cq *cq) +{ + return cq->latest_cqe; +} + +static void try_consume_req_cqe(struct qelr_cq *cq, struct qelr_qp *qp, + struct rdma_cqe_requester *req, int *update) +{ + uint16_t sq_cons = le16toh(req->sq_cons); + + if (sq_cons == qp->sq.wqe_cons) { + consume_cqe(cq); + *update |= 1; + } +} + +/* used with flush only, when resp->rq_cons is valid */ +static void try_consume_resp_cqe(struct qelr_cq *cq, struct qelr_qp *qp, + uint16_t rq_cons, int *update) +{ + if (rq_cons == qp->rq.wqe_cons) { + consume_cqe(cq); + *update |= 1; + } +} + +static int qelr_poll_cq_resp_srq(struct qelr_qp *qp, struct qelr_cq *cq, + int num_entries, struct ibv_wc *wc, + struct rdma_cqe_responder *resp, int *update) +{ + int cnt; + + cnt = process_resp_one_srq(qp, cq, wc, resp); + consume_cqe(cq); + *update |= 1; + + return cnt; +} + +static int qelr_poll_cq_resp(struct qelr_qp *qp, struct qelr_cq *cq, + int num_entries, struct ibv_wc *wc, + struct rdma_cqe_responder *resp, int *update) +{ + uint16_t rq_cons = le16toh(resp->rq_cons); + int cnt; + + if (resp->status == RDMA_CQE_RESP_STS_WORK_REQUEST_FLUSHED_ERR) { + cnt = process_resp_flush(qp, cq, num_entries, wc, rq_cons); + try_consume_resp_cqe(cq, qp, rq_cons, update); + } else { + cnt = process_resp_one(qp, cq, wc, resp); + consume_cqe(cq); + *update |= 1; + } + + return cnt; +} + +static void doorbell_cq(struct qelr_cq *cq, uint32_t cons, uint8_t flags) +{ + mmio_wc_start(); + cq->db.data.agg_flags = flags; + cq->db.data.value = htole32(cons); + + writeq(cq->db.raw, cq->db_addr); + /* copy value to doorbell recovery mechanism */ + cq->db_rec_addr->db_data = cq->db.raw; + mmio_flush_writes(); +} + +int qelr_poll_cq(struct ibv_cq *ibcq, int num_entries, struct ibv_wc *wc) +{ + struct qelr_cq *cq = get_qelr_cq(ibcq); + int done = 0; + union rdma_cqe *cqe = get_cqe(cq); + int update = 0; + uint32_t db_cons; + + while (num_entries && is_valid_cqe(cq, cqe)) { + int cnt = 0; + struct qelr_qp *qp; + + /* prevent speculative reads of any field of CQE */ + udma_from_device_barrier(); + + qp = cqe_get_qp(cqe); + if (!qp) { + DP_ERR(stderr, + "Error: CQE QP pointer is NULL. CQE=%p\n", cqe); + break; + } + + switch (cqe_get_type(cqe)) { + case RDMA_CQE_TYPE_REQUESTER: + cnt = qelr_poll_cq_req(qp, cq, num_entries, wc, + &cqe->req); + try_consume_req_cqe(cq, qp, &cqe->req, &update); + break; + case RDMA_CQE_TYPE_RESPONDER_RQ: + cnt = qelr_poll_cq_resp(qp, cq, num_entries, wc, + &cqe->resp, &update); + break; + case RDMA_CQE_TYPE_RESPONDER_SRQ: + cnt = qelr_poll_cq_resp_srq(qp, cq, num_entries, wc, + &cqe->resp, &update); + break; + case RDMA_CQE_TYPE_INVALID: + default: + printf("Error: invalid CQE type = %d\n", + cqe_get_type(cqe)); + } + num_entries -= cnt; + wc += cnt; + done += cnt; + + cqe = get_cqe(cq); + } + + db_cons = qelr_chain_get_cons_idx_u32(&cq->chain) - 1; + if (update) { + /* doorbell notifies about latest VALID entry, + * but chain already point to the next INVALID one + */ + doorbell_cq(cq, db_cons, cq->arm_flags); + } + + return done; +} + +void qelr_cq_event(struct ibv_cq *ibcq) +{ + /* Trigger received, can reset arm flags */ + struct qelr_cq *cq = get_qelr_cq(ibcq); + + cq->arm_flags = 0; +} + +int qelr_arm_cq(struct ibv_cq *ibcq, int solicited) +{ + struct qelr_cq *cq = get_qelr_cq(ibcq); + uint32_t db_cons; + + db_cons = qelr_chain_get_cons_idx_u32(&cq->chain) - 1; + cq->arm_flags = solicited ? 
DQ_UCM_ROCE_CQ_ARM_SE_CF_CMD : + DQ_UCM_ROCE_CQ_ARM_CF_CMD; + + doorbell_cq(cq, db_cons, cq->arm_flags); + + return 0; +} + +void qelr_async_event(struct ibv_context *context, + struct ibv_async_event *event) +{ + struct qelr_cq *cq = NULL; + struct qelr_qp *qp = NULL; + + switch (event->event_type) { + case IBV_EVENT_CQ_ERR: + cq = get_qelr_cq(event->element.cq); + break; + case IBV_EVENT_QP_FATAL: + case IBV_EVENT_QP_REQ_ERR: + case IBV_EVENT_QP_ACCESS_ERR: + case IBV_EVENT_PATH_MIG_ERR:{ + qp = get_qelr_qp(event->element.qp); + break; + } + case IBV_EVENT_SQ_DRAINED: + case IBV_EVENT_PATH_MIG: + case IBV_EVENT_COMM_EST: + case IBV_EVENT_QP_LAST_WQE_REACHED: + break; + case IBV_EVENT_SRQ_LIMIT_REACHED: + case IBV_EVENT_SRQ_ERR: + return; + case IBV_EVENT_PORT_ACTIVE: + case IBV_EVENT_PORT_ERR: + break; + default: + break; + } + + fprintf(stderr, "qelr_async_event not implemented yet cq=%p qp=%p\n", + cq, qp); +} diff --git a/providers/qedr/qelr_verbs.h b/providers/qedr/qelr_verbs.h new file mode 100644 index 0000000..d0eacbf --- /dev/null +++ b/providers/qedr/qelr_verbs.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __QELR_VERBS_H__ +#define __QELR_VERBS_H__ + +#include <inttypes.h> +#include <stddef.h> +#include <endian.h> + +#include <infiniband/driver.h> +#include <util/udma_barrier.h> + +int qelr_query_device(struct ibv_context *context, + struct ibv_device_attr *attr); +int qelr_query_port(struct ibv_context *context, uint8_t port, + struct ibv_port_attr *attr); + +struct ibv_pd *qelr_alloc_pd(struct ibv_context *context); +int qelr_dealloc_pd(struct ibv_pd *ibpd); + +struct ibv_mr *qelr_reg_mr(struct ibv_pd *ibpd, void *addr, size_t len, + uint64_t hca_va, int access); +int qelr_dereg_mr(struct verbs_mr *mr); + +struct ibv_cq *qelr_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector); +int qelr_arm_cq(struct ibv_cq *ibcq, int solicited); +int qelr_poll_cq(struct ibv_cq *ibcq, int num_entries, struct ibv_wc *wc); +void qelr_cq_event(struct ibv_cq *ibcq); +int qelr_destroy_cq(struct ibv_cq *); + +struct ibv_qp *qelr_create_qp(struct ibv_pd *pd, + struct ibv_qp_init_attr *attrs); +int qelr_modify_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, + int attr_mask); +int qelr_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, struct ibv_qp_init_attr *init_attr); +int qelr_destroy_qp(struct ibv_qp *ibqp); + +int qelr_post_send(struct ibv_qp *ib_qp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr); +int qelr_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); + +int qelr_query_srq(struct ibv_srq *ibv_srq, struct ibv_srq_attr *attr); +int qelr_modify_srq(struct ibv_srq *ibv_srq, struct ibv_srq_attr *attr, + int attr_mask); +struct ibv_srq *qelr_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *init_attr); +int qelr_destroy_srq(struct ibv_srq *ibv_srq); +int qelr_post_srq_recv(struct ibv_srq *ibsrq, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); + +void qelr_async_event(struct ibv_context *context, + struct ibv_async_event *event); +#endif /* __QELR_VERBS_H__ */ diff --git a/providers/qedr/rdma_common.h b/providers/qedr/rdma_common.h new file mode 100644 index 0000000..f2d76bb --- /dev/null +++ b/providers/qedr/rdma_common.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __RDMA_COMMON__ +#define __RDMA_COMMON__ + +#include <linux/types.h> + +/************************/ +/* RDMA FW CONSTANTS */ +/************************/ + +#define RDMA_RESERVED_LKEY (0) //Reserved lkey +#define RDMA_RING_PAGE_SIZE (0x1000) //4KB pages + +#define RDMA_MAX_SGE_PER_SQ_WQE (4) //max number of SGEs in a single request +#define RDMA_MAX_SGE_PER_RQ_WQE (4) //max number of SGEs in a single request + +#define RDMA_MAX_DATA_SIZE_IN_WQE (0x7FFFFFFF) //max size of data in single request + +#define RDMA_REQ_RD_ATOMIC_ELM_SIZE (0x50) +#define RDMA_RESP_RD_ATOMIC_ELM_SIZE (0x20) + +#define RDMA_MAX_CQS (64*1024) +#define RDMA_MAX_TIDS (128*1024-1) +#define RDMA_MAX_PDS (64*1024) +#define RDMA_MAX_SRQS (32*1024) + +#define RDMA_NUM_STATISTIC_COUNTERS MAX_NUM_VPORTS +#define RDMA_NUM_STATISTIC_COUNTERS_K2 MAX_NUM_VPORTS_K2 +#define RDMA_NUM_STATISTIC_COUNTERS_BB MAX_NUM_VPORTS_BB + +#define RDMA_TASK_TYPE (PROTOCOLID_ROCE) + + +struct rdma_srq_id +{ + __le16 srq_idx /* SRQ index */; + __le16 opaque_fid; +}; + + +struct rdma_srq_producers +{ + __le32 sge_prod /* Current produced sge in SRQ */; + __le32 wqe_prod /* Current produced WQE to SRQ */; +}; + +#endif /* __RDMA_COMMON__ */ diff --git a/providers/qedr/roce_common.h b/providers/qedr/roce_common.h new file mode 100644 index 0000000..b01c2ad --- /dev/null +++ b/providers/qedr/roce_common.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __ROCE_COMMON__ +#define __ROCE_COMMON__ +/************************************************************************/ +/* Add include to common rdma target for both eCore and protocol rdma driver */ +/************************************************************************/ +#include "rdma_common.h" +/************************/ +/* ROCE FW CONSTANTS */ +/************************/ + +#define ROCE_REQ_MAX_INLINE_DATA_SIZE (256) //max size of inline data in single request +#define ROCE_REQ_MAX_SINGLE_SQ_WQE_SIZE (288) //Maximum size of single SQ WQE (rdma wqe and inline data) + +#define ROCE_MAX_QPS (32*1024) +#define ROCE_DCQCN_NP_MAX_QPS (64) /* notification point max QPs*/ +#define ROCE_DCQCN_RP_MAX_QPS (64) /* reaction point max QPs*/ + +#endif /* __ROCE_COMMON__ */ diff --git a/providers/rxe/CMakeLists.txt b/providers/rxe/CMakeLists.txt new file mode 100644 index 0000000..d8f3265 --- /dev/null +++ b/providers/rxe/CMakeLists.txt @@ -0,0 +1,3 @@ +rdma_provider(rxe + rxe.c + ) diff --git a/providers/rxe/man/CMakeLists.txt b/providers/rxe/man/CMakeLists.txt new file mode 100644 index 0000000..53d78db --- /dev/null +++ b/providers/rxe/man/CMakeLists.txt @@ -0,0 +1,3 @@ +rdma_man_pages( + rxe.7 +) diff --git a/providers/rxe/man/rxe.7 b/providers/rxe/man/rxe.7 new file mode 100644 index 0000000..474ffff --- /dev/null +++ b/providers/rxe/man/rxe.7 @@ -0,0 +1,97 @@ +.\" -*- nroff -*- +.\" +.TH RXE 7 2011-06-29 1.0.0 +.SH "NAME" +rxe \- Software RDMA over Ethernet +.SH "SYNOPSIS" +\fBmodprobe rdma_rxe\fR +.br +This is usually performed by a configuration utility (see \fBrdma link\fR(8).) + +.SH "DESCRIPTION" +The rdma_rxe kernel module provides a software implementation of the RoCEv2 +protocol. The RoCEv2 protocol is an RDMA transport protocol that exists on +top of UDP/IPv4 or UDP/IPv6. The InfiniBand (IB) Base Transport Header (BTH) +is encapsulated in the UDP packet. + +Once a RXE instance has been created, communicating via RXE is the same as communicating via any OFED compatible Infiniband HCA, albeit in some cases with addressing implications. + +In particular, while the use of a GRH header is optional within IB subnets, it is mandatory with RoCE. Verbs applications written over IB verbs should work seamlessly, but they require provisioning of GRH information when creating address vectors. The library and driver are modified to provide for mapping from GID to MAC addresses required by the hardware. + +.SH "FILES" +.TP +\fB/sys/class/infiniband/rxe[0,1,...]\fR +Directory that holds RDMA device information. The format is the same as other RDMA devices. + +.TP +\fB/sys/module/rdma_rxe_net/parameters/mtu\fR +Write only file used to configure RoCE and Ethernet MTU values. + +.TP +\fB/sys/module/rdma_rxe/parameters/max_ucontext\fR +Read/Write file that sets a limit on the number of UCs allowed per RXE device. + +.TP +\fB/sys/module/rdma_rxe/parameters/max_qp\fR +Read/Write file that sets a limit on the number of QPs allowed per RXE device. + +.TP +\fB/sys/module/rdma_rxe/parameters/max_qp_wr\fR +Read/Write file that sets a limit on the number of WRs per QP allowed per RXE device. + +.TP +\fB/sys/module/rdma_rxe/parameters/max_mr\fR +Read/Write file that sets a limit on the number of MRs allowed per RXE device. + +.TP +\fB/sys/module/rdma_rxe/parameters/max_fmr\fR +Read/Write file that sets a limit on the number of FMRs allowed per RXE device. + +.TP +\fB/sys/module/rdma_rxe/parameters/max_cq\fR +Read/Write file that sets a limit on the number of CQs allowed per RXE device. 
+
+.TP
+\fB/sys/module/rdma_rxe/parameters/max_log_cqe\fR
+Read/Write file that sets a limit on the log base 2 of the number of CQEs per CQ allowed per RXE device.
+
+.TP
+\fB/sys/module/rdma_rxe/parameters/max_inline_data\fR
+Read/Write file that sets a limit on the maximum amount of inline data per WR allowed per RXE device.
+
+The above configuration parameters only affect a new RXE instance when it is created, not afterwards.
+
+.TP
+\fB/sys/module/rdma_rxe/parameters/crc_disable\fR
+Read/Write file that controls the disabling of ICRC computation. Set to a nonzero value for TRUE. Zero for FALSE.
+
+.TP
+\fB/sys/module/rdma_rxe/parameters/fast_comp|req|resp|arb\fR
+Read/Write file that enables calling kernel tasklets as subroutines to reduce latency.
+
+.TP
+\fB/sys/module/rdma_rxe/parameters/nsec_per_packet|kbyte\fR
+Read/Write file that controls static rate pacing for output packets. If set to nonzero values, the minimum delay to the next packet is set to nsec_per_kbyte * (size of the current packet in KBytes) or nsec_per_packet, whichever is less. For example, with nsec_per_kbyte set to 1000, a 4 KByte packet is delayed at least 4000 nsec, unless nsec_per_packet is set and smaller.
+
+.TP
+\fB/sys/module/rdma_rxe/parameters/max_packet_per_ack\fR
+Read/Write file that controls the issuing of acks by the responder during a long message. If set, additional acks will be generated every max_pkt_per_ack packets.
+
+.TP
+\fB/sys/module/rdma_rxe/parameters/max_skb_per_qp\fR
+Read/Write file that controls the number of skbs (packets) that a requester can queue for sending internally.
+
+.TP
+\fB/sys/module/rdma_rxe/parameters/max_req_comp_gap\fR
+Read/Write file that controls the maximum gap between the PSN of request packets sent and ack packets received.
+
+.TP
+\fB/sys/module/rdma_rxe/parameters/default_mtu\fR
+Read/Write file that controls the default MTU used for UD packets.
+
+.SH "SEE ALSO"
+.BR rdma (8),
+.BR verbs (7)
+
+.SH "AUTHORS"
+Written by John Groves, Frank Zago and Bob Pearson at System Fabric Works.
diff --git a/providers/rxe/rxe-abi.h b/providers/rxe/rxe-abi.h
new file mode 100644
index 0000000..b4680a2
--- /dev/null
+++ b/providers/rxe/rxe-abi.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2009 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *	- Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. 
+ * + */ + +#ifndef RXE_ABI_H +#define RXE_ABI_H + +#include <infiniband/kern-abi.h> +#include <rdma/rdma_user_rxe.h> +#include <kernel-abi/rdma_user_rxe.h> + +DECLARE_DRV_CMD(urxe_create_cq, IB_USER_VERBS_CMD_CREATE_CQ, + empty, rxe_create_cq_resp); +DECLARE_DRV_CMD(urxe_create_qp, IB_USER_VERBS_CMD_CREATE_QP, + empty, rxe_create_qp_resp); +DECLARE_DRV_CMD(urxe_create_srq, IB_USER_VERBS_CMD_CREATE_SRQ, + empty, rxe_create_srq_resp); +DECLARE_DRV_CMD(urxe_modify_srq, IB_USER_VERBS_CMD_MODIFY_SRQ, + rxe_modify_srq_cmd, empty); +DECLARE_DRV_CMD(urxe_resize_cq, IB_USER_VERBS_CMD_RESIZE_CQ, + empty, rxe_resize_cq_resp); + +#endif /* RXE_ABI_H */ diff --git a/providers/rxe/rxe.c b/providers/rxe/rxe.c new file mode 100644 index 0000000..3af58bf --- /dev/null +++ b/providers/rxe/rxe.c @@ -0,0 +1,933 @@ +/* + * Copyright (c) 2009 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2009 System Fabric Works, Inc. All rights reserved. + * Copyright (C) 2006-2007 QLogic Corporation, All rights reserved. + * Copyright (c) 2005. PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <config.h> + +#include <endian.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <pthread.h> +#include <netinet/in.h> +#include <sys/mman.h> +#include <errno.h> + +#include <endian.h> +#include <pthread.h> +#include <stddef.h> + +#include <infiniband/driver.h> +#include <infiniband/verbs.h> + +#include "rxe_queue.h" +#include "rxe-abi.h" +#include "rxe.h" + +static void rxe_free_context(struct ibv_context *ibctx); + +static const struct verbs_match_ent hca_table[] = { + VERBS_DRIVER_ID(RDMA_DRIVER_RXE), + VERBS_NAME_MATCH("rxe", NULL), + {}, +}; + +static int rxe_query_device(struct ibv_context *context, + struct ibv_device_attr *attr) +{ + struct ibv_query_device cmd; + uint64_t raw_fw_ver; + unsigned major, minor, sub_minor; + int ret; + + ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, + &cmd, sizeof cmd); + if (ret) + return ret; + + major = (raw_fw_ver >> 32) & 0xffff; + minor = (raw_fw_ver >> 16) & 0xffff; + sub_minor = raw_fw_ver & 0xffff; + + snprintf(attr->fw_ver, sizeof attr->fw_ver, + "%d.%d.%d", major, minor, sub_minor); + + return 0; +} + +static int rxe_query_port(struct ibv_context *context, uint8_t port, + struct ibv_port_attr *attr) +{ + struct ibv_query_port cmd; + + return ibv_cmd_query_port(context, port, attr, &cmd, sizeof cmd); +} + +static struct ibv_pd *rxe_alloc_pd(struct ibv_context *context) +{ + struct ibv_alloc_pd cmd; + struct ib_uverbs_alloc_pd_resp resp; + struct ibv_pd *pd; + + pd = malloc(sizeof *pd); + if (!pd) + return NULL; + + if (ibv_cmd_alloc_pd(context, pd, &cmd, sizeof cmd, &resp, sizeof resp)) { + free(pd); + return NULL; + } + + return pd; +} + +static int rxe_dealloc_pd(struct ibv_pd *pd) +{ + int ret; + + ret = ibv_cmd_dealloc_pd(pd); + if (!ret) + free(pd); + + return ret; +} + +static struct ibv_mr *rxe_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + uint64_t hca_va, int access) +{ + struct verbs_mr *vmr; + struct ibv_reg_mr cmd; + struct ib_uverbs_reg_mr_resp resp; + int ret; + + vmr = malloc(sizeof(*vmr)); + if (!vmr) + return NULL; + + ret = ibv_cmd_reg_mr(pd, addr, length, hca_va, access, vmr, &cmd, + sizeof(cmd), &resp, sizeof(resp)); + if (ret) { + free(vmr); + return NULL; + } + + return &vmr->ibv_mr; +} + +static int rxe_dereg_mr(struct verbs_mr *vmr) +{ + int ret; + + ret = ibv_cmd_dereg_mr(vmr); + if (ret) + return ret; + + free(vmr); + return 0; +} + +static struct ibv_cq *rxe_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector) +{ + struct rxe_cq *cq; + struct urxe_create_cq_resp resp; + int ret; + + cq = malloc(sizeof *cq); + if (!cq) { + return NULL; + } + + ret = ibv_cmd_create_cq(context, cqe, channel, comp_vector, + &cq->ibv_cq, NULL, 0, + &resp.ibv_resp, sizeof resp); + if (ret) { + free(cq); + return NULL; + } + + cq->queue = mmap(NULL, resp.mi.size, PROT_READ | PROT_WRITE, MAP_SHARED, + context->cmd_fd, resp.mi.offset); + if ((void *)cq->queue == MAP_FAILED) { + ibv_cmd_destroy_cq(&cq->ibv_cq); + free(cq); + return NULL; + } + + cq->mmap_info = resp.mi; + pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE); + + return &cq->ibv_cq; +} + +static int rxe_resize_cq(struct ibv_cq *ibcq, int cqe) +{ + struct rxe_cq *cq = to_rcq(ibcq); + struct ibv_resize_cq cmd; + struct urxe_resize_cq_resp resp; + int ret; + + pthread_spin_lock(&cq->lock); + + ret = ibv_cmd_resize_cq(ibcq, cqe, &cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp); + if (ret) { + pthread_spin_unlock(&cq->lock); + return ret; + } + + 
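/* Release the old mapping before remapping; the kernel has
+	 * already swapped in the resized queue buffer, so the old
+	 * mapping is stale at this point.
+	 */
+	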
munmap(cq->queue, cq->mmap_info.size); + + cq->queue = mmap(NULL, resp.mi.size, + PROT_READ | PROT_WRITE, MAP_SHARED, + ibcq->context->cmd_fd, resp.mi.offset); + + ret = errno; + pthread_spin_unlock(&cq->lock); + + if ((void *)cq->queue == MAP_FAILED) { + cq->queue = NULL; + cq->mmap_info.size = 0; + return ret; + } + + cq->mmap_info = resp.mi; + + return 0; +} + +static int rxe_destroy_cq(struct ibv_cq *ibcq) +{ + struct rxe_cq *cq = to_rcq(ibcq); + int ret; + + ret = ibv_cmd_destroy_cq(ibcq); + if (ret) + return ret; + + if (cq->mmap_info.size) + munmap(cq->queue, cq->mmap_info.size); + free(cq); + + return 0; +} + +static int rxe_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc) +{ + struct rxe_cq *cq = to_rcq(ibcq); + struct rxe_queue *q; + int npolled; + uint8_t *src; + + pthread_spin_lock(&cq->lock); + q = cq->queue; + + for (npolled = 0; npolled < ne; ++npolled, ++wc) { + if (queue_empty(q)) + break; + + atomic_thread_fence(memory_order_acquire); + src = consumer_addr(q); + memcpy(wc, src, sizeof(*wc)); + advance_consumer(q); + } + + pthread_spin_unlock(&cq->lock); + return npolled; +} + +static struct ibv_srq *rxe_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *attr) +{ + struct rxe_srq *srq; + struct ibv_create_srq cmd; + struct urxe_create_srq_resp resp; + int ret; + + srq = malloc(sizeof *srq); + if (srq == NULL) { + return NULL; + } + + ret = ibv_cmd_create_srq(pd, &srq->ibv_srq, attr, &cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp); + if (ret) { + free(srq); + return NULL; + } + + srq->rq.queue = mmap(NULL, resp.mi.size, + PROT_READ | PROT_WRITE, MAP_SHARED, + pd->context->cmd_fd, resp.mi.offset); + if ((void *)srq->rq.queue == MAP_FAILED) { + ibv_cmd_destroy_srq(&srq->ibv_srq); + free(srq); + return NULL; + } + + srq->mmap_info = resp.mi; + srq->rq.max_sge = attr->attr.max_sge; + pthread_spin_init(&srq->rq.lock, PTHREAD_PROCESS_PRIVATE); + + return &srq->ibv_srq; +} + +static int rxe_modify_srq(struct ibv_srq *ibsrq, + struct ibv_srq_attr *attr, int attr_mask) +{ + struct rxe_srq *srq = to_rsrq(ibsrq); + struct urxe_modify_srq cmd; + int rc = 0; + struct mminfo mi; + + mi.offset = 0; + mi.size = 0; + + if (attr_mask & IBV_SRQ_MAX_WR) + pthread_spin_lock(&srq->rq.lock); + + cmd.mmap_info_addr = (__u64)(uintptr_t) & mi; + rc = ibv_cmd_modify_srq(ibsrq, attr, attr_mask, + &cmd.ibv_cmd, sizeof cmd); + if (rc) + goto out; + + if (attr_mask & IBV_SRQ_MAX_WR) { + (void)munmap(srq->rq.queue, srq->mmap_info.size); + srq->rq.queue = mmap(NULL, mi.size, + PROT_READ | PROT_WRITE, MAP_SHARED, + ibsrq->context->cmd_fd, mi.offset); + + if ((void *)srq->rq.queue == MAP_FAILED) { + rc = errno; + srq->rq.queue = NULL; + srq->mmap_info.size = 0; + goto out; + } + + srq->mmap_info = mi; + } + +out: + if (attr_mask & IBV_SRQ_MAX_WR) + pthread_spin_unlock(&srq->rq.lock); + return rc; +} + +static int rxe_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr) +{ + struct ibv_query_srq cmd; + + return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd); +} + +static int rxe_destroy_srq(struct ibv_srq *ibvsrq) +{ + int ret; + struct rxe_srq *srq = to_rsrq(ibvsrq); + struct rxe_queue *q = srq->rq.queue; + + ret = ibv_cmd_destroy_srq(ibvsrq); + if (!ret) { + if (srq->mmap_info.size) + munmap(q, srq->mmap_info.size); + free(srq); + } + + return ret; +} + +static int rxe_post_one_recv(struct rxe_wq *rq, struct ibv_recv_wr *recv_wr) +{ + int i; + struct rxe_recv_wqe *wqe; + struct rxe_queue *q = rq->queue; + int length = 0; + int rc = 0; + + if (queue_full(q)) { + rc = -ENOMEM; + goto out; 
+ } + + if (recv_wr->num_sge > rq->max_sge) { + rc = -EINVAL; + goto out; + } + + wqe = (struct rxe_recv_wqe *)producer_addr(q); + + wqe->wr_id = recv_wr->wr_id; + wqe->num_sge = recv_wr->num_sge; + + memcpy(wqe->dma.sge, recv_wr->sg_list, + wqe->num_sge*sizeof(*wqe->dma.sge)); + + for (i = 0; i < wqe->num_sge; i++) { + length += wqe->dma.sge[i].length; + } + + wqe->dma.length = length; + wqe->dma.resid = length; + wqe->dma.cur_sge = 0; + wqe->dma.num_sge = wqe->num_sge; + wqe->dma.sge_offset = 0; + + advance_producer(q); + +out: + return rc; +} + +static int rxe_post_srq_recv(struct ibv_srq *ibvsrq, + struct ibv_recv_wr *recv_wr, + struct ibv_recv_wr **bad_recv_wr) +{ + struct rxe_srq *srq = to_rsrq(ibvsrq); + int rc = 0; + + pthread_spin_lock(&srq->rq.lock); + + while (recv_wr) { + rc = rxe_post_one_recv(&srq->rq, recv_wr); + if (rc) { + *bad_recv_wr = recv_wr; + break; + } + + recv_wr = recv_wr->next; + } + + pthread_spin_unlock(&srq->rq.lock); + + return rc; +} + +static struct ibv_qp *rxe_create_qp(struct ibv_pd *pd, + struct ibv_qp_init_attr *attr) +{ + struct ibv_create_qp cmd; + struct urxe_create_qp_resp resp; + struct rxe_qp *qp; + int ret; + + qp = malloc(sizeof *qp); + if (!qp) { + return NULL; + } + + ret = ibv_cmd_create_qp(pd, &qp->ibv_qp, attr, &cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp); + if (ret) { + free(qp); + return NULL; + } + + if (attr->srq) { + qp->rq.max_sge = 0; + qp->rq.queue = NULL; + qp->rq_mmap_info.size = 0; + } else { + qp->rq.max_sge = attr->cap.max_recv_sge; + qp->rq.queue = mmap(NULL, resp.rq_mi.size, PROT_READ | PROT_WRITE, + MAP_SHARED, + pd->context->cmd_fd, resp.rq_mi.offset); + if ((void *)qp->rq.queue == MAP_FAILED) { + ibv_cmd_destroy_qp(&qp->ibv_qp); + free(qp); + return NULL; + } + + qp->rq_mmap_info = resp.rq_mi; + pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE); + } + + qp->sq.max_sge = attr->cap.max_send_sge; + qp->sq.max_inline = attr->cap.max_inline_data; + qp->sq.queue = mmap(NULL, resp.sq_mi.size, PROT_READ | PROT_WRITE, + MAP_SHARED, + pd->context->cmd_fd, resp.sq_mi.offset); + if ((void *)qp->sq.queue == MAP_FAILED) { + if (qp->rq_mmap_info.size) + munmap(qp->rq.queue, qp->rq_mmap_info.size); + ibv_cmd_destroy_qp(&qp->ibv_qp); + free(qp); + return NULL; + } + + qp->sq_mmap_info = resp.sq_mi; + pthread_spin_init(&qp->sq.lock, PTHREAD_PROCESS_PRIVATE); + + return &qp->ibv_qp; +} + +static int rxe_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr) +{ + struct ibv_query_qp cmd; + + return ibv_cmd_query_qp(qp, attr, attr_mask, init_attr, + &cmd, sizeof cmd); +} + +static int rxe_modify_qp(struct ibv_qp *ibvqp, + struct ibv_qp_attr *attr, + int attr_mask) +{ + struct ibv_modify_qp cmd = {}; + + return ibv_cmd_modify_qp(ibvqp, attr, attr_mask, &cmd, sizeof cmd); +} + +static int rxe_destroy_qp(struct ibv_qp *ibv_qp) +{ + int ret; + struct rxe_qp *qp = to_rqp(ibv_qp); + + ret = ibv_cmd_destroy_qp(ibv_qp); + if (!ret) { + if (qp->rq_mmap_info.size) + munmap(qp->rq.queue, qp->rq_mmap_info.size); + if (qp->sq_mmap_info.size) + munmap(qp->sq.queue, qp->sq_mmap_info.size); + + free(qp); + } + + return ret; +} + +/* basic sanity checks for send work request */ +static int validate_send_wr(struct rxe_wq *sq, struct ibv_send_wr *ibwr, + unsigned int length) +{ + enum ibv_wr_opcode opcode = ibwr->opcode; + + if (ibwr->num_sge > sq->max_sge) + return -EINVAL; + + if ((opcode == IBV_WR_ATOMIC_CMP_AND_SWP) + || (opcode == IBV_WR_ATOMIC_FETCH_AND_ADD)) + if (length < 8 || 
ibwr->wr.atomic.remote_addr & 0x7) + return -EINVAL; + + if ((ibwr->send_flags & IBV_SEND_INLINE) && (length > sq->max_inline)) + return -EINVAL; + + return 0; +} + +static void convert_send_wr(struct rxe_send_wr *kwr, struct ibv_send_wr *uwr) +{ + memset(kwr, 0, sizeof(*kwr)); + + kwr->wr_id = uwr->wr_id; + kwr->num_sge = uwr->num_sge; + kwr->opcode = uwr->opcode; + kwr->send_flags = uwr->send_flags; + kwr->ex.imm_data = uwr->imm_data; + + switch(uwr->opcode) { + case IBV_WR_RDMA_WRITE: + case IBV_WR_RDMA_WRITE_WITH_IMM: + case IBV_WR_RDMA_READ: + kwr->wr.rdma.remote_addr = uwr->wr.rdma.remote_addr; + kwr->wr.rdma.rkey = uwr->wr.rdma.rkey; + break; + + case IBV_WR_SEND: + case IBV_WR_SEND_WITH_IMM: + kwr->wr.ud.remote_qpn = uwr->wr.ud.remote_qpn; + kwr->wr.ud.remote_qkey = uwr->wr.ud.remote_qkey; + break; + + case IBV_WR_ATOMIC_CMP_AND_SWP: + case IBV_WR_ATOMIC_FETCH_AND_ADD: + kwr->wr.atomic.remote_addr = uwr->wr.atomic.remote_addr; + kwr->wr.atomic.compare_add = uwr->wr.atomic.compare_add; + kwr->wr.atomic.swap = uwr->wr.atomic.swap; + kwr->wr.atomic.rkey = uwr->wr.atomic.rkey; + break; + + case IBV_WR_LOCAL_INV: + case IBV_WR_BIND_MW: + case IBV_WR_SEND_WITH_INV: + case IBV_WR_TSO: + case IBV_WR_DRIVER1: + break; + } +} + +static int init_send_wqe(struct rxe_qp *qp, struct rxe_wq *sq, + struct ibv_send_wr *ibwr, unsigned int length, + struct rxe_send_wqe *wqe) +{ + int num_sge = ibwr->num_sge; + int i; + unsigned int opcode = ibwr->opcode; + + convert_send_wr(&wqe->wr, ibwr); + + if (qp_type(qp) == IBV_QPT_UD) + memcpy(&wqe->av, &to_rah(ibwr->wr.ud.ah)->av, + sizeof(struct rxe_av)); + + if (ibwr->send_flags & IBV_SEND_INLINE) { + uint8_t *inline_data = wqe->dma.inline_data; + + for (i = 0; i < num_sge; i++) { + memcpy(inline_data, + (uint8_t *)(long)ibwr->sg_list[i].addr, + ibwr->sg_list[i].length); + inline_data += ibwr->sg_list[i].length; + } + } else + memcpy(wqe->dma.sge, ibwr->sg_list, + num_sge*sizeof(struct ibv_sge)); + + if ((opcode == IBV_WR_ATOMIC_CMP_AND_SWP) + || (opcode == IBV_WR_ATOMIC_FETCH_AND_ADD)) + wqe->iova = ibwr->wr.atomic.remote_addr; + else + wqe->iova = ibwr->wr.rdma.remote_addr; + wqe->dma.length = length; + wqe->dma.resid = length; + wqe->dma.num_sge = num_sge; + wqe->dma.cur_sge = 0; + wqe->dma.sge_offset = 0; + wqe->state = 0; + wqe->ssn = qp->ssn++; + + return 0; +} + +static int post_one_send(struct rxe_qp *qp, struct rxe_wq *sq, + struct ibv_send_wr *ibwr) +{ + int err; + struct rxe_send_wqe *wqe; + unsigned int length = 0; + int i; + + for (i = 0; i < ibwr->num_sge; i++) + length += ibwr->sg_list[i].length; + + err = validate_send_wr(sq, ibwr, length); + if (err) { + printf("validate send failed\n"); + return err; + } + + wqe = (struct rxe_send_wqe *)producer_addr(sq->queue); + + err = init_send_wqe(qp, sq, ibwr, length, wqe); + if (err) + return err; + + if (queue_full(sq->queue)) + return -ENOMEM; + + advance_producer(sq->queue); + + return 0; +} + +/* send a null post send as a doorbell */ +static int post_send_db(struct ibv_qp *ibqp) +{ + struct ibv_post_send cmd; + struct ib_uverbs_post_send_resp resp; + + cmd.hdr.command = IB_USER_VERBS_CMD_POST_SEND; + cmd.hdr.in_words = sizeof(cmd) / 4; + cmd.hdr.out_words = sizeof(resp) / 4; + cmd.response = (uintptr_t)&resp; + cmd.qp_handle = ibqp->handle; + cmd.wr_count = 0; + cmd.sge_count = 0; + cmd.wqe_size = sizeof(struct ibv_send_wr); + + if (write(ibqp->context->cmd_fd, &cmd, sizeof(cmd)) != sizeof(cmd)) + return errno; + + return 0; +} + +/* this API does not make a distinction between + restartable 
and non-restartable errors */ +static int rxe_post_send(struct ibv_qp *ibqp, + struct ibv_send_wr *wr_list, + struct ibv_send_wr **bad_wr) +{ + int rc = 0; + int err; + struct rxe_qp *qp = to_rqp(ibqp); + struct rxe_wq *sq = &qp->sq; + + if (!bad_wr) + return EINVAL; + + *bad_wr = NULL; + + if (!sq || !wr_list || !sq->queue) + return EINVAL; + + pthread_spin_lock(&sq->lock); + + while (wr_list) { + rc = post_one_send(qp, sq, wr_list); + if (rc) { + *bad_wr = wr_list; + break; + } + + wr_list = wr_list->next; + } + + pthread_spin_unlock(&sq->lock); + + err = post_send_db(ibqp); + return err ? err : rc; +} + +static int rxe_post_recv(struct ibv_qp *ibqp, + struct ibv_recv_wr *recv_wr, + struct ibv_recv_wr **bad_wr) +{ + int rc = 0; + struct rxe_qp *qp = to_rqp(ibqp); + struct rxe_wq *rq = &qp->rq; + + if (!bad_wr) + return EINVAL; + + *bad_wr = NULL; + + if (!rq || !recv_wr || !rq->queue) + return EINVAL; + + pthread_spin_lock(&rq->lock); + + while (recv_wr) { + rc = rxe_post_one_recv(rq, recv_wr); + if (rc) { + *bad_wr = recv_wr; + break; + } + + recv_wr = recv_wr->next; + } + + pthread_spin_unlock(&rq->lock); + + return rc; +} + +static inline int ipv6_addr_v4mapped(const struct in6_addr *a) +{ + return IN6_IS_ADDR_V4MAPPED(a); +} + +typedef typeof(((struct rxe_av *)0)->sgid_addr) sockaddr_union_t; + +static inline int rdma_gid2ip(sockaddr_union_t *out, union ibv_gid *gid) +{ + if (ipv6_addr_v4mapped((struct in6_addr *)gid)) { + memset(&out->_sockaddr_in, 0, sizeof(out->_sockaddr_in)); + memcpy(&out->_sockaddr_in.sin_addr.s_addr, gid->raw + 12, 4); + } else { + memset(&out->_sockaddr_in6, 0, sizeof(out->_sockaddr_in6)); + out->_sockaddr_in6.sin6_family = AF_INET6; + memcpy(&out->_sockaddr_in6.sin6_addr.s6_addr, gid->raw, 16); + } + return 0; +} + +static struct ibv_ah *rxe_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) +{ + int err; + struct rxe_ah *ah; + struct rxe_av *av; + union ibv_gid sgid; + struct ib_uverbs_create_ah_resp resp; + + err = ibv_query_gid(pd->context, attr->port_num, attr->grh.sgid_index, + &sgid); + if (err) { + fprintf(stderr, "rxe: Failed to query sgid.\n"); + return NULL; + } + + ah = malloc(sizeof *ah); + if (ah == NULL) + return NULL; + + av = &ah->av; + av->port_num = attr->port_num; + memcpy(&av->grh, &attr->grh, sizeof(attr->grh)); + av->network_type = + ipv6_addr_v4mapped((struct in6_addr *)attr->grh.dgid.raw) ? 
+ RDMA_NETWORK_IPV4 : RDMA_NETWORK_IPV6; + + rdma_gid2ip(&av->sgid_addr, &sgid); + rdma_gid2ip(&av->dgid_addr, &attr->grh.dgid); + if (ibv_resolve_eth_l2_from_gid(pd->context, attr, av->dmac, NULL)) { + free(ah); + return NULL; + } + + memset(&resp, 0, sizeof(resp)); + if (ibv_cmd_create_ah(pd, &ah->ibv_ah, attr, &resp, sizeof(resp))) { + free(ah); + return NULL; + } + + return &ah->ibv_ah; +} + +static int rxe_destroy_ah(struct ibv_ah *ibah) +{ + int ret; + struct rxe_ah *ah = to_rah(ibah); + + ret = ibv_cmd_destroy_ah(&ah->ibv_ah); + if (ret) + return ret; + + free(ah); + return 0; +} + +static const struct verbs_context_ops rxe_ctx_ops = { + .query_device = rxe_query_device, + .query_port = rxe_query_port, + .alloc_pd = rxe_alloc_pd, + .dealloc_pd = rxe_dealloc_pd, + .reg_mr = rxe_reg_mr, + .dereg_mr = rxe_dereg_mr, + .create_cq = rxe_create_cq, + .poll_cq = rxe_poll_cq, + .req_notify_cq = ibv_cmd_req_notify_cq, + .resize_cq = rxe_resize_cq, + .destroy_cq = rxe_destroy_cq, + .create_srq = rxe_create_srq, + .modify_srq = rxe_modify_srq, + .query_srq = rxe_query_srq, + .destroy_srq = rxe_destroy_srq, + .post_srq_recv = rxe_post_srq_recv, + .create_qp = rxe_create_qp, + .query_qp = rxe_query_qp, + .modify_qp = rxe_modify_qp, + .destroy_qp = rxe_destroy_qp, + .post_send = rxe_post_send, + .post_recv = rxe_post_recv, + .create_ah = rxe_create_ah, + .destroy_ah = rxe_destroy_ah, + .attach_mcast = ibv_cmd_attach_mcast, + .detach_mcast = ibv_cmd_detach_mcast, + .free_context = rxe_free_context, +}; + +static struct verbs_context *rxe_alloc_context(struct ibv_device *ibdev, + int cmd_fd, + void *private_data) +{ + struct rxe_context *context; + struct ibv_get_context cmd; + struct ib_uverbs_get_context_resp resp; + + context = verbs_init_and_alloc_context(ibdev, cmd_fd, context, ibv_ctx, + RDMA_DRIVER_RXE); + if (!context) + return NULL; + + if (ibv_cmd_get_context(&context->ibv_ctx, &cmd, + sizeof cmd, &resp, sizeof resp)) + goto out; + + verbs_set_ops(&context->ibv_ctx, &rxe_ctx_ops); + + return &context->ibv_ctx; + +out: + verbs_uninit_context(&context->ibv_ctx); + free(context); + return NULL; +} + +static void rxe_free_context(struct ibv_context *ibctx) +{ + struct rxe_context *context = to_rctx(ibctx); + + verbs_uninit_context(&context->ibv_ctx); + free(context); +} + +static void rxe_uninit_device(struct verbs_device *verbs_device) +{ + struct rxe_device *dev = to_rdev(&verbs_device->device); + + free(dev); +} + +static struct verbs_device *rxe_device_alloc(struct verbs_sysfs_dev *sysfs_dev) +{ + struct rxe_device *dev; + dev = calloc(1, sizeof(*dev)); + if (!dev) + return NULL; + + dev->abi_version = sysfs_dev->abi_ver; + + return &dev->ibv_dev; +} + +static const struct verbs_device_ops rxe_dev_ops = { + .name = "rxe", + /* + * For 64 bit machines ABI version 1 and 2 are the same. Otherwise 32 + * bit machines require ABI version 2 which guarentees the user and + * kernel use the same ABI. + */ + .match_min_abi_version = sizeof(void *) == 8?1:2, + .match_max_abi_version = 2, + .match_table = hca_table, + .alloc_device = rxe_device_alloc, + .uninit_device = rxe_uninit_device, + .alloc_context = rxe_alloc_context, +}; +PROVIDER_DRIVER(rxe, rxe_dev_ops); diff --git a/providers/rxe/rxe.h b/providers/rxe/rxe.h new file mode 100644 index 0000000..96f4ee9 --- /dev/null +++ b/providers/rxe/rxe.h @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2009 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2009 System Fabric Works, Inc. All rights reserved. 
+ * Copyright (c) 2006-2007 QLogic Corp. All rights reserved. + * Copyright (c) 2005. PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RXE_H +#define RXE_H + +#include <infiniband/driver.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <rdma/rdma_user_rxe.h> /* struct rxe_av */ +#include "rxe-abi.h" + +enum rdma_network_type { + RDMA_NETWORK_IB, + RDMA_NETWORK_IPV4, + RDMA_NETWORK_IPV6 +}; + +struct rxe_device { + struct verbs_device ibv_dev; + int abi_version; +}; + +struct rxe_context { + struct verbs_context ibv_ctx; +}; + +struct rxe_cq { + struct ibv_cq ibv_cq; + struct mminfo mmap_info; + struct rxe_queue *queue; + pthread_spinlock_t lock; +}; + +struct rxe_ah { + struct ibv_ah ibv_ah; + struct rxe_av av; +}; + +struct rxe_wq { + struct rxe_queue *queue; + pthread_spinlock_t lock; + unsigned int max_sge; + unsigned int max_inline; +}; + +struct rxe_qp { + struct ibv_qp ibv_qp; + struct mminfo rq_mmap_info; + struct rxe_wq rq; + struct mminfo sq_mmap_info; + struct rxe_wq sq; + unsigned int ssn; +}; + +#define qp_type(qp) ((qp)->ibv_qp.qp_type) + +struct rxe_srq { + struct ibv_srq ibv_srq; + struct mminfo mmap_info; + struct rxe_wq rq; + uint32_t srq_num; +}; + +#define to_rxxx(xxx, type) container_of(ib##xxx, struct rxe_##type, ibv_##xxx) + +static inline struct rxe_context *to_rctx(struct ibv_context *ibctx) +{ + return container_of(ibctx, struct rxe_context, ibv_ctx.context); +} + +static inline struct rxe_device *to_rdev(struct ibv_device *ibdev) +{ + return container_of(ibdev, struct rxe_device, ibv_dev.device); +} + +static inline struct rxe_cq *to_rcq(struct ibv_cq *ibcq) +{ + return to_rxxx(cq, cq); +} + +static inline struct rxe_qp *to_rqp(struct ibv_qp *ibqp) +{ + return to_rxxx(qp, qp); +} + +static inline struct rxe_srq *to_rsrq(struct ibv_srq *ibsrq) +{ + return to_rxxx(srq, srq); +} + +static inline struct rxe_ah *to_rah(struct ibv_ah *ibah) +{ + return to_rxxx(ah, ah); +} + +#endif /* RXE_H */ diff --git a/providers/rxe/rxe_queue.h b/providers/rxe/rxe_queue.h new file mode 100644 index 0000000..5c57b3e --- /dev/null +++ b/providers/rxe/rxe_queue.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2009 Mellanox Technologies Ltd. All rights reserved. 
+ * Copyright (c) 2009 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *	- Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+/* implements a simple circular buffer with sizes a power of 2 */
+
+#ifndef H_RXE_PCQ
+#define H_RXE_PCQ
+
+#include <stdint.h>
+#include <stdatomic.h>
+
+/* MUST MATCH kernel struct rxe_pqc in rxe_queue.h */
+struct rxe_queue {
+	uint32_t		log2_elem_size;
+	uint32_t		index_mask;
+	uint32_t		pad_1[30];
+	_Atomic(uint32_t)	producer_index;
+	uint32_t		pad_2[31];
+	_Atomic(uint32_t)	consumer_index;
+	uint32_t		pad_3[31];
+	uint8_t			data[0];
+};
+
+static inline int next_index(struct rxe_queue *q, int index)
+{
+	return (index + 1) & q->index_mask;
+}
+
+static inline int queue_empty(struct rxe_queue *q)
+{
+	/* Must hold consumer_index lock */
+	return ((atomic_load(&q->producer_index) -
+		 atomic_load_explicit(&q->consumer_index,
+				      memory_order_relaxed)) &
+		q->index_mask) == 0;
+}
+
+static inline int queue_full(struct rxe_queue *q)
+{
+	/* Must hold producer_index lock */
+	return ((atomic_load_explicit(&q->producer_index,
+				      memory_order_relaxed) +
+		 1 - atomic_load(&q->consumer_index)) &
+		q->index_mask) == 0;
+}
+
+static inline void advance_producer(struct rxe_queue *q)
+{
+	/* Must hold producer_index lock */
+	atomic_thread_fence(memory_order_release);
+	atomic_store(
+		&q->producer_index,
+		(atomic_load_explicit(&q->producer_index, memory_order_relaxed) +
+		 1) &
+			q->index_mask);
+}
+
+static inline void advance_consumer(struct rxe_queue *q)
+{
+	/* Must hold consumer_index lock */
+	atomic_store(
+		&q->consumer_index,
+		(atomic_load_explicit(&q->consumer_index, memory_order_relaxed) +
+		 1) &
+			q->index_mask);
+}
+
+static inline void *producer_addr(struct rxe_queue *q)
+{
+	/* Must hold producer_index lock */
+	return q->data + ((atomic_load_explicit(&q->producer_index,
+						memory_order_relaxed) &
+			   q->index_mask)
+			  << q->log2_elem_size);
+}
+
+static inline void *consumer_addr(struct rxe_queue *q)
+{
+	/* Must hold consumer_index lock */
+	return q->data + ((atomic_load_explicit(&q->consumer_index,
+						memory_order_relaxed) &
+			   q->index_mask)
+			  << q->log2_elem_size);
+}
+
+static inline void *addr_from_index(struct rxe_queue *q, unsigned int index)
+{
+	return q->data + ((index & q->index_mask)
+			  << 
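+/*
+ * Editorial sketch, not part of the original patch: how callers are
+ * expected to drive this queue (the real callers live in rxe.c and take
+ * the per-queue spinlock first; fill_wqe() is a hypothetical helper):
+ *
+ *	pthread_spin_lock(&lock);
+ *	if (!queue_full(q)) {
+ *		fill_wqe(producer_addr(q));
+ *		advance_producer(q);	// release fence, then bump
+ *	}
+ *	pthread_spin_unlock(&lock);
+ *
+ * Since index_mask is 2^n - 1 the & replaces a modulo, and queue_full()
+ * leaves one slot unused so that full and empty stay distinguishable.
+ */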
q->log2_elem_size); +} + +static inline unsigned int index_from_addr(const struct rxe_queue *q, const void *addr) +{ + return (((uint8_t *)addr - q->data) >> q->log2_elem_size) & q->index_mask; +} + +#endif /* H_RXE_PCQ */ diff --git a/providers/siw/CMakeLists.txt b/providers/siw/CMakeLists.txt new file mode 100644 index 0000000..40a620e --- /dev/null +++ b/providers/siw/CMakeLists.txt @@ -0,0 +1,3 @@ +rdma_provider(siw + siw.c +) diff --git a/providers/siw/siw.c b/providers/siw/siw.c new file mode 100644 index 0000000..9530833 --- /dev/null +++ b/providers/siw/siw.c @@ -0,0 +1,926 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause + +// Authors: Bernard Metzler <bmt@zurich.ibm.com> +// Copyright (c) 2008-2019, IBM Corporation + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <sys/mman.h> +#include <net/if.h> +#include <pthread.h> +#include <stdatomic.h> +#include <assert.h> + +#include "siw_abi.h" +#include "siw.h" + +static const int siw_debug; +static void siw_free_context(struct ibv_context *ibv_ctx); + +static int siw_query_device(struct ibv_context *ctx, + struct ibv_device_attr *attr) +{ + struct ibv_query_device cmd; + uint64_t raw_fw_ver; + unsigned int major, minor, sub_minor; + int rv; + + memset(&cmd, 0, sizeof(cmd)); + + rv = ibv_cmd_query_device(ctx, attr, &raw_fw_ver, &cmd, sizeof(cmd)); + if (rv) + return rv; + + major = (raw_fw_ver >> 32) & 0xffff; + minor = (raw_fw_ver >> 16) & 0xffff; + sub_minor = raw_fw_ver & 0xffff; + + snprintf(attr->fw_ver, sizeof(attr->fw_ver), "%d.%d.%d", major, minor, + sub_minor); + + return 0; +} + +static int siw_query_port(struct ibv_context *ctx, uint8_t port, + struct ibv_port_attr *attr) +{ + struct ibv_query_port cmd; + + memset(&cmd, 0, sizeof(cmd)); + + return ibv_cmd_query_port(ctx, port, attr, &cmd, sizeof(cmd)); +} + +static int siw_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, struct ibv_qp_init_attr *init_attr) +{ + struct ibv_query_qp cmd; + + memset(&cmd, 0, sizeof(cmd)); + + return ibv_cmd_query_qp(qp, attr, attr_mask, init_attr, &cmd, + sizeof(cmd)); +} + +static struct ibv_pd *siw_alloc_pd(struct ibv_context *ctx) +{ + struct ibv_alloc_pd cmd; + struct ib_uverbs_alloc_pd_resp resp; + struct ibv_pd *pd; + + memset(&cmd, 0, sizeof(cmd)); + + pd = calloc(1, sizeof(*pd)); + if (!pd) + return NULL; + + if (ibv_cmd_alloc_pd(ctx, pd, &cmd, sizeof(cmd), &resp, sizeof(resp))) { + free(pd); + return NULL; + } + return pd; +} + +static int siw_free_pd(struct ibv_pd *pd) +{ + int rv; + + rv = ibv_cmd_dealloc_pd(pd); + if (rv) + return rv; + + free(pd); + return 0; +} + +static struct ibv_mr *siw_reg_mr(struct ibv_pd *pd, void *addr, size_t len, + uint64_t hca_va, int access) +{ + struct siw_cmd_reg_mr cmd = {}; + struct siw_cmd_reg_mr_resp resp = {}; + struct siw_mr *mr; + int rv; + + mr = calloc(1, sizeof(*mr)); + if (!mr) + return NULL; + + rv = ibv_cmd_reg_mr(pd, addr, len, hca_va, access, + &mr->base_mr, &cmd.ibv_cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp)); + if (rv) { + free(mr); + return NULL; + } + return &mr->base_mr.ibv_mr; +} + +static int siw_dereg_mr(struct verbs_mr *base_mr) +{ + struct siw_mr *mr = mr_base2siw(base_mr); + int rv; + + rv = ibv_cmd_dereg_mr(base_mr); + if (rv) + return rv; + + free(mr); + return 0; +} + +static struct ibv_cq *siw_create_cq(struct ibv_context *ctx, int num_cqe, + struct ibv_comp_channel *channel, + int comp_vector) +{ + struct siw_cmd_create_cq cmd = {}; + struct siw_cmd_create_cq_resp resp = 
{}; + struct siw_cq *cq; + int cq_size, rv; + + cq = calloc(1, sizeof(*cq)); + if (!cq) + return NULL; + + rv = ibv_cmd_create_cq(ctx, num_cqe, channel, comp_vector, &cq->base_cq, + &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, + sizeof(resp)); + if (rv) { + if (siw_debug) + printf("libsiw: CQ creation failed: %d\n", rv); + free(cq); + return NULL; + } + if (resp.cq_key == SIW_INVAL_UOBJ_KEY) { + if (siw_debug) + printf("libsiw: prepare CQ mapping failed\n"); + goto fail; + } + pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE); + cq->id = resp.cq_id; + cq->num_cqe = resp.num_cqe; + + cq_size = resp.num_cqe * sizeof(struct siw_cqe) + + sizeof(struct siw_cq_ctrl); + + cq->queue = mmap(NULL, cq_size, PROT_READ | PROT_WRITE, + MAP_SHARED, ctx->cmd_fd, resp.cq_key); + + if (cq->queue == MAP_FAILED) { + if (siw_debug) + printf("libsiw: CQ mapping failed: %d", errno); + goto fail; + } + cq->ctrl = (struct siw_cq_ctrl *)&cq->queue[cq->num_cqe]; + cq->ctrl->flags = SIW_NOTIFY_NOT; + + return &cq->base_cq; +fail: + ibv_cmd_destroy_cq(&cq->base_cq); + free(cq); + + return NULL; +} + +static int siw_resize_cq(struct ibv_cq *base_cq, int num_cqe) +{ + return -EOPNOTSUPP; +} + +static int siw_destroy_cq(struct ibv_cq *base_cq) +{ + struct siw_cq *cq = cq_base2siw(base_cq); + int rv; + + assert(pthread_spin_trylock(&cq->lock)); + + if (cq->queue) + munmap(cq->queue, cq->num_cqe * sizeof(struct siw_cqe) + + sizeof(struct siw_cq_ctrl)); + + rv = ibv_cmd_destroy_cq(base_cq); + if (rv) { + pthread_spin_unlock(&cq->lock); + return rv; + } + pthread_spin_destroy(&cq->lock); + + free(cq); + + return 0; +} + +static struct ibv_srq *siw_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *attr) +{ + struct siw_cmd_create_srq cmd = {}; + struct siw_cmd_create_srq_resp resp = {}; + struct ibv_context *ctx = pd->context; + struct siw_srq *srq; + int rv, rq_size; + + srq = calloc(1, sizeof(*srq)); + if (!srq) + return NULL; + + rv = ibv_cmd_create_srq(pd, &srq->base_srq, attr, &cmd.ibv_cmd, + sizeof(cmd), &resp.ibv_resp, sizeof(resp)); + if (rv) { + if (siw_debug) + printf("libsiw: creating SRQ failed\n"); + free(srq); + return NULL; + } + if (resp.srq_key == SIW_INVAL_UOBJ_KEY) { + if (siw_debug) + printf("libsiw: prepare SRQ mapping failed\n"); + goto fail; + } + pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE); + rq_size = resp.num_rqe * sizeof(struct siw_rqe); + srq->num_rqe = resp.num_rqe; + + srq->recvq = mmap(NULL, rq_size, PROT_READ | PROT_WRITE, + MAP_SHARED, ctx->cmd_fd, resp.srq_key); + + if (srq->recvq == MAP_FAILED) { + if (siw_debug) + printf("libsiw: SRQ mapping failed: %d", errno); + goto fail; + } + return &srq->base_srq; +fail: + ibv_cmd_destroy_srq(&srq->base_srq); + free(srq); + + return NULL; +} + +static int siw_modify_srq(struct ibv_srq *base_srq, struct ibv_srq_attr *attr, + int attr_mask) +{ + struct ibv_modify_srq cmd = {}; + struct siw_srq *srq = srq_base2siw(base_srq); + int rv; + + pthread_spin_lock(&srq->lock); + rv = ibv_cmd_modify_srq(base_srq, attr, attr_mask, &cmd, sizeof(cmd)); + pthread_spin_unlock(&srq->lock); + + return rv; +} + +static int siw_destroy_srq(struct ibv_srq *base_srq) +{ + struct siw_srq *srq = srq_base2siw(base_srq); + int rv; + + assert(pthread_spin_trylock(&srq->lock)); + + rv = ibv_cmd_destroy_srq(base_srq); + if (rv) { + pthread_spin_unlock(&srq->lock); + return rv; + } + if (srq->recvq) + munmap(srq->recvq, srq->num_rqe * sizeof(struct siw_rqe)); + + pthread_spin_destroy(&srq->lock); + + free(srq); + + return 0; +} + +static struct ibv_qp 
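+/*
+ * Editorial note (summary, not original commentary): the CQ and SRQ
+ * paths above and the QP path below share one mapping handshake. The
+ * kernel returns an opaque key in each create response (cq_key,
+ * srq_key, sq_key/rq_key), or SIW_INVAL_UOBJ_KEY when no mapping was
+ * prepared, and user space passes that key as the mmap() offset on the
+ * command fd to map the queue it shares with the kernel, e.g.:
+ *
+ *	cq->queue = mmap(NULL, cq_size, PROT_READ | PROT_WRITE,
+ *			 MAP_SHARED, ctx->cmd_fd, resp.cq_key);
+ */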
*siw_create_qp(struct ibv_pd *pd,
+				    struct ibv_qp_init_attr *attr)
+{
+	struct siw_cmd_create_qp cmd = {};
+	struct siw_cmd_create_qp_resp resp = {};
+	struct siw_qp *qp;
+	struct ibv_context *base_ctx = pd->context;
+	int sq_size, rq_size, rv;
+
+	memset(&cmd, 0, sizeof(cmd));
+	memset(&resp, 0, sizeof(resp));
+
+	qp = calloc(1, sizeof(*qp));
+	if (!qp)
+		return NULL;
+
+	rv = ibv_cmd_create_qp(pd, &qp->base_qp, attr, &cmd.ibv_cmd,
+			       sizeof(cmd), &resp.ibv_resp, sizeof(resp));
+
+	if (rv) {
+		if (siw_debug)
+			printf("libsiw: QP creation failed\n");
+		free(qp);
+		return NULL;
+	}
+	if (resp.sq_key == SIW_INVAL_UOBJ_KEY ||
+	    resp.rq_key == SIW_INVAL_UOBJ_KEY) {
+		if (siw_debug)
+			printf("libsiw: prepare QP mapping failed\n");
+		goto fail;
+	}
+	qp->id = resp.qp_id;
+	qp->num_sqe = resp.num_sqe;
+	qp->num_rqe = resp.num_rqe;
+	qp->sq_sig_all = attr->sq_sig_all;
+
+	/* Init doorbell request structure */
+	qp->db_req.hdr.command = IB_USER_VERBS_CMD_POST_SEND;
+	qp->db_req.hdr.in_words = sizeof(qp->db_req) / 4;
+	qp->db_req.hdr.out_words = sizeof(qp->db_resp) / 4;
+	qp->db_req.response = (uintptr_t)&qp->db_resp;
+	qp->db_req.wr_count = 0;
+	qp->db_req.sge_count = 0;
+	qp->db_req.wqe_size = sizeof(struct ibv_send_wr);
+
+	pthread_spin_init(&qp->sq_lock, PTHREAD_PROCESS_PRIVATE);
+	pthread_spin_init(&qp->rq_lock, PTHREAD_PROCESS_PRIVATE);
+
+	sq_size = resp.num_sqe * sizeof(struct siw_sqe);
+
+	qp->sendq = mmap(NULL, sq_size, PROT_READ | PROT_WRITE,
+			 MAP_SHARED, base_ctx->cmd_fd, resp.sq_key);
+
+	if (qp->sendq == MAP_FAILED) {
+		if (siw_debug)
+			printf("libsiw: SQ mapping failed: %d\n", errno);
+
+		qp->sendq = NULL;
+		goto fail;
+	}
+	if (attr->srq) {
+		qp->srq = srq_base2siw(attr->srq);
+	} else {
+		rq_size = resp.num_rqe * sizeof(struct siw_rqe);
+
+		qp->recvq = mmap(NULL, rq_size, PROT_READ | PROT_WRITE,
+				 MAP_SHARED, base_ctx->cmd_fd, resp.rq_key);
+
+		if (qp->recvq == MAP_FAILED) {
+			if (siw_debug)
+				printf("libsiw: RQ mapping failed: %d\n",
+				       errno);
+			qp->recvq = NULL;
+			goto fail;
+		}
+	}
+	qp->db_req.qp_handle = qp->base_qp.handle;
+
+	return &qp->base_qp;
+fail:
+	ibv_cmd_destroy_qp(&qp->base_qp);
+
+	if (qp->sendq)
+		munmap(qp->sendq, qp->num_sqe * sizeof(struct siw_sqe));
+	if (qp->recvq)
+		munmap(qp->recvq, qp->num_rqe * sizeof(struct siw_rqe));
+
+	free(qp);
+
+	return NULL;
+}
+
+static int siw_modify_qp(struct ibv_qp *base_qp, struct ibv_qp_attr *attr,
+			 int attr_mask)
+{
+	struct ibv_modify_qp cmd;
+	struct siw_qp *qp = qp_base2siw(base_qp);
+	int rv;
+
+	memset(&cmd, 0, sizeof(cmd));
+
+	pthread_spin_lock(&qp->sq_lock);
+	pthread_spin_lock(&qp->rq_lock);
+
+	rv = ibv_cmd_modify_qp(base_qp, attr, attr_mask, &cmd, sizeof(cmd));
+
+	pthread_spin_unlock(&qp->rq_lock);
+	pthread_spin_unlock(&qp->sq_lock);
+
+	return rv;
+}
+
+static int siw_destroy_qp(struct ibv_qp *base_qp)
+{
+	struct siw_qp *qp = qp_base2siw(base_qp);
+	int rv;
+
+	assert(pthread_spin_trylock(&qp->sq_lock));
+	assert(pthread_spin_trylock(&qp->rq_lock));
+
+	if (qp->sendq)
+		munmap(qp->sendq, qp->num_sqe * sizeof(struct siw_sqe));
+	if (qp->recvq)
+		munmap(qp->recvq, qp->num_rqe * sizeof(struct siw_rqe));
+
+	rv = ibv_cmd_destroy_qp(base_qp);
+	if (rv) {
+		pthread_spin_unlock(&qp->rq_lock);
+		pthread_spin_unlock(&qp->sq_lock);
+		return rv;
+	}
+	pthread_spin_destroy(&qp->rq_lock);
+	pthread_spin_destroy(&qp->sq_lock);
+
+	free(qp);
+
+	return 0;
+}
+
+static struct ibv_ah *siw_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr)
+{
+	return NULL;
+}
+
+static int siw_destroy_ah(struct ibv_ah *ah)
+{
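+	/*
+	 * Editorial note: siw implements iWARP (RDMA over TCP), which
+	 * has no UD transport, so address handles are inherently
+	 * unsupported; siw_create_ah() above fails with NULL and the
+	 * destroy path reports the missing capability below.
+	 */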
return -EOPNOTSUPP; +} + +static void siw_async_event(struct ibv_context *ctx, + struct ibv_async_event *event) +{ + struct ibv_qp *base_qp = event->element.qp; + struct ibv_cq *base_cq = event->element.cq; + + switch (event->event_type) { + case IBV_EVENT_CQ_ERR: + printf("libsiw: CQ[%d] event: error\n", + cq_base2siw(base_cq)->id); + break; + + case IBV_EVENT_QP_FATAL: + printf("libsiw: QP[%d] event: fatal error\n", + qp_base2siw(base_qp)->id); + break; + + case IBV_EVENT_QP_REQ_ERR: + printf("libsiw: QP[%d] event: request error\n", + qp_base2siw(base_qp)->id); + break; + + case IBV_EVENT_QP_ACCESS_ERR: + printf("libsiw: QP[%d] event: access error\n", + qp_base2siw(base_qp)->id); + break; + + case IBV_EVENT_SQ_DRAINED: + case IBV_EVENT_COMM_EST: + case IBV_EVENT_QP_LAST_WQE_REACHED: + break; + + default: + break; + } +} + +static int siw_notify_cq(struct ibv_cq *ibcq, int solicited) +{ + struct siw_cq *cq = cq_base2siw(ibcq); + int rv = 0; + + if (solicited) + atomic_store((_Atomic(uint32_t) *)&cq->ctrl->flags, + SIW_NOTIFY_SOLICITED); + else + atomic_store((_Atomic(uint32_t) *)&cq->ctrl->flags, + SIW_NOTIFY_SOLICITED | SIW_NOTIFY_NEXT_COMPLETION); + return rv; +} + +static const struct { + enum ibv_wr_opcode base; + enum siw_opcode siw; +} map_send_opcode[IBV_WR_DRIVER1 + 1] = { + { IBV_WR_RDMA_WRITE, SIW_OP_WRITE}, + { IBV_WR_RDMA_WRITE_WITH_IMM, SIW_NUM_OPCODES + 1 }, + { IBV_WR_SEND, SIW_OP_SEND }, + { IBV_WR_SEND_WITH_IMM, SIW_NUM_OPCODES + 1 }, + { IBV_WR_RDMA_READ, SIW_OP_READ }, + { IBV_WR_ATOMIC_CMP_AND_SWP, SIW_NUM_OPCODES + 1 }, + { IBV_WR_ATOMIC_FETCH_AND_ADD, SIW_NUM_OPCODES + 1 }, + { IBV_WR_LOCAL_INV, SIW_NUM_OPCODES + 1 }, + { IBV_WR_BIND_MW, SIW_NUM_OPCODES + 1 }, + { IBV_WR_SEND_WITH_INV, SIW_OP_SEND_REMOTE_INV }, + { IBV_WR_TSO, SIW_NUM_OPCODES + 1 }, + { IBV_WR_DRIVER1, SIW_NUM_OPCODES + 1 } +}; + +static inline uint16_t map_send_flags(int ibv_flags) +{ + uint16_t flags = SIW_WQE_VALID; + + if (ibv_flags & IBV_SEND_SIGNALED) + flags |= SIW_WQE_SIGNALLED; + if (ibv_flags & IBV_SEND_SOLICITED) + flags |= SIW_WQE_SOLICITED; + if (ibv_flags & IBV_SEND_INLINE) + flags |= SIW_WQE_INLINE; + if (ibv_flags & IBV_SEND_FENCE) + flags |= SIW_WQE_READ_FENCE; + + return flags; +} + +static inline int push_send_wqe(struct ibv_send_wr *base_wr, + struct siw_sqe *siw_sqe, int sig_all) +{ + uint32_t flags = map_send_flags(base_wr->send_flags); + atomic_ushort *fp = (atomic_ushort *)&siw_sqe->flags; + + siw_sqe->id = base_wr->wr_id; + siw_sqe->num_sge = base_wr->num_sge; + siw_sqe->raddr = base_wr->wr.rdma.remote_addr; + siw_sqe->rkey = base_wr->wr.rdma.rkey; + + siw_sqe->opcode = map_send_opcode[base_wr->opcode].siw; + if (siw_sqe->opcode > SIW_NUM_OPCODES) { + if (siw_debug) + printf("libsiw: opcode %d unsupported\n", + base_wr->opcode); + return -EINVAL; + } + if (sig_all) + flags |= SIW_WQE_SIGNALLED; + + if (flags & SIW_WQE_INLINE) { + char *data = (char *)&siw_sqe->sge[1]; + int bytes = 0, i = 0; + + /* Allow more than SIW_MAX_SGE, since content copied here */ + while (i < base_wr->num_sge) { + bytes += base_wr->sg_list[i].length; + if (bytes > (int)SIW_MAX_INLINE) { + if (siw_debug) + printf("libsiw: inline data: %d:%d\n", + bytes, (int)SIW_MAX_INLINE); + return -EINVAL; + } + memcpy(data, + (void *)(uintptr_t)base_wr->sg_list[i].addr, + base_wr->sg_list[i].length); + data += base_wr->sg_list[i++].length; + } + siw_sqe->sge[0].length = bytes; + + } else { + if (siw_sqe->num_sge > SIW_MAX_SGE) + return -EINVAL; + + /* this assumes same layout of siw and base SGE */ + 
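+		/*
+		 * Editorial aside, not in the original patch: if that
+		 * layout assumption ever needs guarding, a compile-time
+		 * check along these lines would do:
+		 *
+		 *	_Static_assert(sizeof(struct siw_sge) ==
+		 *		       sizeof(struct ibv_sge),
+		 *		       "siw_sge must mirror ibv_sge");
+		 */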
memcpy(siw_sqe->sge, base_wr->sg_list, + siw_sqe->num_sge * sizeof(struct ibv_sge)); + } + atomic_store(fp, flags); + + return 0; +} + +static int siw_post_send(struct ibv_qp *base_qp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr) +{ + struct siw_qp *qp = qp_base2siw(base_qp); + uint32_t sq_put; + atomic_ushort *fp; + int new_sqe = 0, rv = 0; + + *bad_wr = NULL; + + pthread_spin_lock(&qp->sq_lock); + + sq_put = qp->sq_put; + + /* + * Push all current work requests into mmapped SQ + */ + while (wr) { + uint32_t idx = sq_put % qp->num_sqe; + struct siw_sqe *sqe = &qp->sendq[idx]; + uint16_t sqe_flags; + + fp = (atomic_ushort *)&sqe->flags; + sqe_flags = atomic_load(fp); + + if (!(sqe_flags & SIW_WQE_VALID)) { + rv = push_send_wqe(wr, sqe, qp->sq_sig_all); + if (rv) { + *bad_wr = wr; + break; + } + new_sqe++; + } else { + if (siw_debug) + printf("libsiw: QP[%d]: SQ overflow, idx %d\n", + qp->id, idx); + rv = -ENOMEM; + *bad_wr = wr; + break; + } + sq_put++; + wr = wr->next; + } + if (new_sqe) { + /* + * If last WQE pushed before position where current post_send + * started is idle, we assume SQ is not being actively + * processed. Only then, the doorbell call will be issued. + * This may significantly reduce unnecessary doorbell calls + * on a busy SQ. We also always ring the doorbell, if the + * complete SQ was re-written during current post_send. + */ + if (new_sqe < qp->num_sqe) { + uint32_t old_idx = (qp->sq_put - 1) % qp->num_sqe; + struct siw_sqe *old_sqe = &qp->sendq[old_idx]; + + fp = (atomic_ushort *)&old_sqe->flags; + if (!(atomic_load(fp) & SIW_WQE_VALID)) + rv = siw_db(qp); + } else { + rv = siw_db(qp); + } + if (rv) + *bad_wr = wr; + + qp->sq_put = sq_put; + } + pthread_spin_unlock(&qp->sq_lock); + + return rv; +} + +static inline int push_recv_wqe(struct ibv_recv_wr *base_wr, + struct siw_rqe *siw_rqe) +{ + atomic_ushort *fp = (atomic_ushort *)&siw_rqe->flags; + + siw_rqe->id = base_wr->wr_id; + siw_rqe->num_sge = base_wr->num_sge; + + if (base_wr->num_sge == 1) { + siw_rqe->sge[0].laddr = base_wr->sg_list[0].addr; + siw_rqe->sge[0].length = base_wr->sg_list[0].length; + siw_rqe->sge[0].lkey = base_wr->sg_list[0].lkey; + } else if (base_wr->num_sge && base_wr->num_sge <= SIW_MAX_SGE) + /* this assumes same layout of siw and base SGE */ + memcpy(siw_rqe->sge, base_wr->sg_list, + sizeof(struct ibv_sge) * base_wr->num_sge); + else + return -EINVAL; + + atomic_store(fp, SIW_WQE_VALID); + + return 0; +} + +static int siw_post_recv(struct ibv_qp *base_qp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct siw_qp *qp = qp_base2siw(base_qp); + uint32_t rq_put; + int rv = 0; + + pthread_spin_lock(&qp->rq_lock); + + rq_put = qp->rq_put; + + while (wr) { + int idx = rq_put % qp->num_rqe; + struct siw_rqe *rqe = &qp->recvq[idx]; + atomic_ushort *fp = (atomic_ushort *)&rqe->flags; + uint16_t rqe_flags = atomic_load(fp); + + if (!(rqe_flags & SIW_WQE_VALID)) { + if (push_recv_wqe(wr, rqe)) { + *bad_wr = wr; + rv = -EINVAL; + break; + } + } else { + if (siw_debug) + printf("libsiw: QP[%d]: RQ overflow, idx %d\n", + qp->id, idx); + rv = -ENOMEM; + *bad_wr = wr; + break; + } + rq_put++; + wr = wr->next; + } + qp->rq_put = rq_put; + + pthread_spin_unlock(&qp->rq_lock); + + return rv; +} + +static int siw_post_srq_recv(struct ibv_srq *base_srq, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct siw_srq *srq = srq_base2siw(base_srq); + uint32_t srq_put; + int rv = 0; + + pthread_spin_lock(&srq->lock); + + srq_put = srq->rq_put; + + while (wr) { + int idx = 
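+		    /*
+		     * Editorial note: siw rings are indexed with a plain
+		     * modulo of the queue depth (put % num) rather than a
+		     * power-of-two mask, because the kernel reports the
+		     * exact depth (resp.num_sqe/num_rqe) without rounding
+		     * it up at create time.
+		     */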
srq_put % srq->num_rqe; + struct siw_rqe *rqe = &srq->recvq[idx]; + atomic_ushort *fp = (atomic_ushort *)&rqe->flags; + uint16_t rqe_flags = atomic_load(fp); + + if (!(rqe_flags & SIW_WQE_VALID)) { + if (push_recv_wqe(wr, rqe)) { + *bad_wr = wr; + rv = -EINVAL; + break; + } + } else { + if (siw_debug) + printf("libsiw: SRQ[%p]: SRQ overflow\n", srq); + rv = -ENOMEM; + *bad_wr = wr; + break; + } + srq_put++; + wr = wr->next; + } + srq->rq_put = srq_put; + + pthread_spin_unlock(&srq->lock); + + return rv; +} + +static const struct { + enum siw_opcode siw; + enum ibv_wc_opcode base; +} map_cqe_opcode[SIW_NUM_OPCODES] = { + { SIW_OP_WRITE, IBV_WC_RDMA_WRITE }, + { SIW_OP_READ, IBV_WC_RDMA_READ }, + { SIW_OP_READ_LOCAL_INV, IBV_WC_RDMA_READ }, + { SIW_OP_SEND, IBV_WC_SEND }, + { SIW_OP_SEND_WITH_IMM, IBV_WC_SEND }, + { SIW_OP_SEND_REMOTE_INV, IBV_WC_SEND }, + { SIW_OP_FETCH_AND_ADD, IBV_WC_FETCH_ADD }, + { SIW_OP_COMP_AND_SWAP, IBV_WC_COMP_SWAP }, + { SIW_OP_RECEIVE, IBV_WC_RECV } +}; + +static const struct { + enum siw_wc_status siw; + enum ibv_wc_status base; +} map_cqe_status[SIW_NUM_WC_STATUS] = { + { SIW_WC_SUCCESS, IBV_WC_SUCCESS }, + { SIW_WC_LOC_LEN_ERR, IBV_WC_LOC_LEN_ERR }, + { SIW_WC_LOC_PROT_ERR, IBV_WC_LOC_PROT_ERR }, + { SIW_WC_LOC_QP_OP_ERR, IBV_WC_LOC_QP_OP_ERR }, + { SIW_WC_WR_FLUSH_ERR, IBV_WC_WR_FLUSH_ERR }, + { SIW_WC_BAD_RESP_ERR, IBV_WC_BAD_RESP_ERR }, + { SIW_WC_LOC_ACCESS_ERR, IBV_WC_LOC_ACCESS_ERR }, + { SIW_WC_REM_ACCESS_ERR, IBV_WC_REM_ACCESS_ERR }, + { SIW_WC_REM_INV_REQ_ERR, IBV_WC_REM_INV_REQ_ERR }, + { SIW_WC_GENERAL_ERR, IBV_WC_GENERAL_ERR } +}; + +static inline void copy_cqe(struct siw_cqe *cqe, struct ibv_wc *wc) +{ + wc->wr_id = cqe->id; + wc->byte_len = cqe->bytes; + + /* No immediate data supported yet */ + wc->wc_flags = 0; + wc->imm_data = 0; + + wc->vendor_err = 0; + wc->opcode = map_cqe_opcode[cqe->opcode].base; + wc->status = map_cqe_status[cqe->status].base; + wc->qp_num = (uint32_t)cqe->qp_id; +} + +static int siw_poll_cq(struct ibv_cq *ibcq, int num_entries, struct ibv_wc *wc) +{ + struct siw_cq *cq = cq_base2siw(ibcq); + int new = 0; + + pthread_spin_lock(&cq->lock); + + for (; num_entries--; wc++) { + struct siw_cqe *cqe = &cq->queue[cq->cq_get % cq->num_cqe]; + atomic_uchar *fp = (atomic_uchar *)&cqe->flags; + + if (atomic_load(fp) & SIW_WQE_VALID) { + copy_cqe(cqe, wc); + atomic_store(fp, 0); + cq->cq_get++; + new++; + } else + break; + } + pthread_spin_unlock(&cq->lock); + + return new; +} + +static const struct verbs_context_ops siw_context_ops = { + .alloc_pd = siw_alloc_pd, + .async_event = siw_async_event, + .create_ah = siw_create_ah, + .create_cq = siw_create_cq, + .create_qp = siw_create_qp, + .create_srq = siw_create_srq, + .dealloc_pd = siw_free_pd, + .dereg_mr = siw_dereg_mr, + .destroy_ah = siw_destroy_ah, + .destroy_cq = siw_destroy_cq, + .destroy_qp = siw_destroy_qp, + .destroy_srq = siw_destroy_srq, + .free_context = siw_free_context, + .modify_qp = siw_modify_qp, + .modify_srq = siw_modify_srq, + .poll_cq = siw_poll_cq, + .post_recv = siw_post_recv, + .post_send = siw_post_send, + .post_srq_recv = siw_post_srq_recv, + .query_device = siw_query_device, + .query_port = siw_query_port, + .query_qp = siw_query_qp, + .reg_mr = siw_reg_mr, + .req_notify_cq = siw_notify_cq, + .resize_cq = siw_resize_cq, +}; + +static struct verbs_context *siw_alloc_context(struct ibv_device *base_dev, + int fd, void *pdata) +{ + struct siw_context *ctx; + struct ibv_get_context cmd = {}; + struct siw_cmd_alloc_context_resp resp = {}; + + ctx = 
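+	/*
+	 * Editorial summary (not original text): every provider in this
+	 * patch brings a context up the same way, with
+	 * verbs_uninit_context() plus free() on the failure path:
+	 *
+	 *	ctx = verbs_init_and_alloc_context(...);   // embed and init
+	 *	ibv_cmd_get_context(...);                  // kernel handshake
+	 *	verbs_set_ops(&ctx->base_ctx, &ops);       // install ops
+	 */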
verbs_init_and_alloc_context(base_dev, fd, ctx, base_ctx, + RDMA_DRIVER_SIW); + if (!ctx) + return NULL; + + if (ibv_cmd_get_context(&ctx->base_ctx, &cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp))) { + verbs_uninit_context(&ctx->base_ctx); + free(ctx); + + return NULL; + } + verbs_set_ops(&ctx->base_ctx, &siw_context_ops); + ctx->dev_id = resp.dev_id; + + return &ctx->base_ctx; +} + +static void siw_free_context(struct ibv_context *ibv_ctx) +{ + struct siw_context *ctx = ctx_ibv2siw(ibv_ctx); + + verbs_uninit_context(&ctx->base_ctx); + free(ctx); +} + +static struct verbs_device *siw_device_alloc(struct verbs_sysfs_dev *unused) +{ + struct siw_device *dev; + + dev = calloc(1, sizeof(*dev)); + if (!dev) + return NULL; + + return &dev->base_dev; +} + +static void siw_device_free(struct verbs_device *vdev) +{ + struct siw_device *dev = + container_of(vdev, struct siw_device, base_dev); + free(dev); +} + +static const struct verbs_match_ent rnic_table[] = { + VERBS_DRIVER_ID(RDMA_DRIVER_SIW), + {}, +}; + +static const struct verbs_device_ops siw_dev_ops = { + .name = "siw", + .match_min_abi_version = SIW_ABI_VERSION, + .match_max_abi_version = SIW_ABI_VERSION, + .match_table = rnic_table, + .alloc_device = siw_device_alloc, + .uninit_device = siw_device_free, + .alloc_context = siw_alloc_context, +}; + +PROVIDER_DRIVER(siw, siw_dev_ops); diff --git a/providers/siw/siw.h b/providers/siw/siw.h new file mode 100644 index 0000000..9b83935 --- /dev/null +++ b/providers/siw/siw.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ + +/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#ifndef _SIW_H +#define _SIW_H + +#include <pthread.h> +#include <inttypes.h> +#include <stddef.h> + +#include <infiniband/driver.h> +#include <infiniband/kern-abi.h> + +struct siw_device { + struct verbs_device base_dev; +}; + +struct siw_srq { + struct ibv_srq base_srq; + struct siw_rqe *recvq; + uint32_t rq_put; + uint32_t num_rqe; + pthread_spinlock_t lock; +}; + +struct siw_mr { + struct verbs_mr base_mr; +}; + +struct siw_qp { + struct ibv_qp base_qp; + struct siw_device *siw_dev; + + uint32_t id; + + pthread_spinlock_t sq_lock; + pthread_spinlock_t rq_lock; + + struct ibv_post_send db_req; + struct ib_uverbs_post_send_resp db_resp; + + uint32_t num_sqe; + uint32_t sq_put; + int sq_sig_all; + struct siw_sqe *sendq; + + uint32_t num_rqe; + uint32_t rq_put; + struct siw_rqe *recvq; + struct siw_srq *srq; +}; + +struct siw_cq { + struct ibv_cq base_cq; + struct siw_device *siw_dev; + uint32_t id; + + /* Points to kernel shared control + * object at the end of CQE array + */ + struct siw_cq_ctrl *ctrl; + + int num_cqe; + uint32_t cq_get; + struct siw_cqe *queue; + pthread_spinlock_t lock; +}; + +struct siw_context { + struct verbs_context base_ctx; + uint32_t dev_id; +}; + +static inline struct siw_context *ctx_ibv2siw(struct ibv_context *base) +{ + return container_of(base, struct siw_context, base_ctx.context); +} + +static inline struct siw_qp *qp_base2siw(struct ibv_qp *base) +{ + return container_of(base, struct siw_qp, base_qp); +} + +static inline struct siw_cq *cq_base2siw(struct ibv_cq *base) +{ + return container_of(base, struct siw_cq, base_cq); +} + +static inline struct siw_mr *mr_base2siw(struct verbs_mr *base) +{ + return container_of(base, struct siw_mr, base_mr); +} + +static inline struct siw_srq *srq_base2siw(struct ibv_srq *base) +{ + return container_of(base, struct siw_srq, base_srq); +} + +static inline int 
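+/*
+ * Editorial note (derived from siw.c, not original commentary): siw has
+ * no hardware doorbell page, so "ringing the doorbell" is a plain
+ * write() of the pre-built ibv_post_send command (qp->db_req, set up
+ * with wr_count == 0 in siw_create_qp()) to the uverbs command fd; the
+ * kernel takes it as a hint to (re)start SQ processing. siw_db() below
+ * returns 0 when the whole command was written.
+ */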
siw_db(struct siw_qp *qp) +{ + int rv = write(qp->base_qp.context->cmd_fd, &qp->db_req, + sizeof(qp->db_req)); + + return rv == sizeof(qp->db_req) ? 0 : rv; +} + +#endif /* _SIW_H */ diff --git a/providers/siw/siw_abi.h b/providers/siw/siw_abi.h new file mode 100644 index 0000000..c112b8c --- /dev/null +++ b/providers/siw/siw_abi.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ + +/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#ifndef _SIW_ABI_H +#define _SIW_ABI_H + +#include <infiniband/kern-abi.h> +#include <rdma/siw-abi.h> +#include <kernel-abi/siw-abi.h> + +DECLARE_DRV_CMD(siw_cmd_alloc_context, IB_USER_VERBS_CMD_GET_CONTEXT, + empty, siw_uresp_alloc_ctx); +DECLARE_DRV_CMD(siw_cmd_create_cq, IB_USER_VERBS_CMD_CREATE_CQ, + empty, siw_uresp_create_cq); +DECLARE_DRV_CMD(siw_cmd_create_srq, IB_USER_VERBS_CMD_CREATE_SRQ, + empty, siw_uresp_create_srq); +DECLARE_DRV_CMD(siw_cmd_create_qp, IB_USER_VERBS_CMD_CREATE_QP, + empty, siw_uresp_create_qp); +DECLARE_DRV_CMD(siw_cmd_reg_mr, IB_USER_VERBS_CMD_REG_MR, + siw_ureq_reg_mr, siw_uresp_reg_mr); + +#endif /* _SIW_ABI_H */ diff --git a/providers/vmw_pvrdma/CMakeLists.txt b/providers/vmw_pvrdma/CMakeLists.txt new file mode 100644 index 0000000..f146295 --- /dev/null +++ b/providers/vmw_pvrdma/CMakeLists.txt @@ -0,0 +1,6 @@ +rdma_provider(vmw_pvrdma + cq.c + pvrdma_main.c + qp.c + verbs.c +) diff --git a/providers/vmw_pvrdma/cq.c b/providers/vmw_pvrdma/cq.c new file mode 100644 index 0000000..2c8739b --- /dev/null +++ b/providers/vmw_pvrdma/cq.c @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2012-2016 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of EITHER the GNU General Public License + * version 2 as published by the Free Software Foundation or the BSD + * 2-Clause License. This program is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License version 2 for more details at + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html. + * + * You should have received a copy of the GNU General Public License + * along with this program available in the file COPYING in the main + * directory of this source tree. + * + * The BSD 2-Clause License + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <util/udma_barrier.h> + +#include "pvrdma.h" + +enum { + CQ_OK = 0, + CQ_EMPTY = -1, + CQ_POLL_ERR = -2, +}; + +enum { + PVRDMA_CQE_IS_SEND_MASK = 0x40, + PVRDMA_CQE_OPCODE_MASK = 0x1f, +}; + +int pvrdma_alloc_cq_buf(struct pvrdma_device *dev, struct pvrdma_cq *cq, + struct pvrdma_buf *buf, int entries) +{ + if (pvrdma_alloc_buf(buf, cq->offset + + entries * (sizeof(struct pvrdma_cqe)), + dev->page_size)) + return -1; + memset(buf->buf, 0, buf->length); + + return 0; +} + +static struct pvrdma_cqe *get_cqe(struct pvrdma_cq *cq, int entry) +{ + return cq->buf.buf + cq->offset + + entry * (sizeof(struct pvrdma_cqe)); +} + +static int pvrdma_poll_one(struct pvrdma_cq *cq, + struct pvrdma_qp **cur_qp, + struct ibv_wc *wc) +{ + struct pvrdma_context *ctx = to_vctx(cq->ibv_cq.context); + int has_data; + unsigned int head; + int tried = 0; + struct pvrdma_cqe *cqe; + +retry: + has_data = pvrdma_idx_ring_has_data(&cq->ring_state->rx, + cq->cqe_cnt, &head); + if (has_data == 0) { + unsigned int val; + + if (tried) + return CQ_EMPTY; + + /* Pass down POLL to give physical HCA a chance to poll. */ + val = cq->cqn | PVRDMA_UAR_CQ_POLL; + pvrdma_write_uar_cq(ctx->uar, val); + + tried = 1; + goto retry; + } else if (has_data == -1) { + return CQ_POLL_ERR; + } + + cqe = get_cqe(cq, head); + if (!cqe) + return CQ_EMPTY; + + udma_from_device_barrier(); + + if (ctx->qp_tbl[cqe->qp & 0xFFFF]) + *cur_qp = (struct pvrdma_qp *)ctx->qp_tbl[cqe->qp & 0xFFFF]; + else + return CQ_POLL_ERR; + + wc->opcode = pvrdma_wc_opcode_to_ibv(cqe->opcode); + wc->status = pvrdma_wc_status_to_ibv(cqe->status); + wc->wr_id = cqe->wr_id; + wc->qp_num = (*cur_qp)->ibv_qp.qp_num; + wc->byte_len = cqe->byte_len; + wc->imm_data = cqe->imm_data; + wc->src_qp = cqe->src_qp; + wc->wc_flags = cqe->wc_flags; + wc->pkey_index = cqe->pkey_index; + wc->slid = cqe->slid; + wc->sl = cqe->sl; + wc->dlid_path_bits = cqe->dlid_path_bits; + wc->vendor_err = 0; + + /* Update shared ring state. */ + pvrdma_idx_ring_inc(&(cq->ring_state->rx.cons_head), cq->cqe_cnt); + + return CQ_OK; +} + +int pvrdma_poll_cq(struct ibv_cq *ibcq, int num_entries, struct ibv_wc *wc) +{ + struct pvrdma_cq *cq = to_vcq(ibcq); + struct pvrdma_qp *qp; + int npolled = 0; + + if (num_entries < 1 || wc == NULL) + return 0; + + pthread_spin_lock(&cq->lock); + + for (npolled = 0; npolled < num_entries; ++npolled) { + if (pvrdma_poll_one(cq, &qp, wc + npolled) != CQ_OK) + break; + } + + pthread_spin_unlock(&cq->lock); + + return npolled; +} + +void pvrdma_cq_clean_int(struct pvrdma_cq *cq, uint32_t qp_handle) +{ + /* Flush CQEs from specified QP */ + int has_data; + unsigned int head; + + /* Lock held */ + has_data = pvrdma_idx_ring_has_data(&cq->ring_state->rx, + cq->cqe_cnt, &head); + + if (unlikely(has_data > 0)) { + int items; + int curr; + int tail = pvrdma_idx(&cq->ring_state->rx.prod_tail, + cq->cqe_cnt); + struct pvrdma_cqe *cqe; + struct pvrdma_cqe *curr_cqe; + + items = (tail > head) ? 
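+			/*
+			 * Editorial worked example: this picks the CQE
+			 * count for either ring orientation. With
+			 * cqe_cnt == 8, head == 6 and a wrapped
+			 * tail == 2, the second arm gives
+			 * 8 - 6 + 2 == 4 pending entries (slots 6, 7,
+			 * 0 and 1).
+			 */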
(tail - head) : + (cq->cqe_cnt - head + tail); + curr = --tail; + while (items-- > 0) { + if (curr < 0) + curr = cq->cqe_cnt - 1; + if (tail < 0) + tail = cq->cqe_cnt - 1; + curr_cqe = get_cqe(cq, curr); + udma_from_device_barrier(); + if ((curr_cqe->qp & 0xFFFF) != qp_handle) { + if (curr != tail) { + cqe = get_cqe(cq, tail); + udma_from_device_barrier(); + *cqe = *curr_cqe; + } + tail--; + } else { + pvrdma_idx_ring_inc( + &cq->ring_state->rx.cons_head, + cq->cqe_cnt); + } + curr--; + } + } +} + +void pvrdma_cq_clean(struct pvrdma_cq *cq, uint32_t qp_handle) +{ + pthread_spin_lock(&cq->lock); + pvrdma_cq_clean_int(cq, qp_handle); + pthread_spin_unlock(&cq->lock); +} + +struct ibv_cq *pvrdma_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector) +{ + struct pvrdma_device *dev = to_vdev(context->device); + struct user_pvrdma_create_cq cmd; + struct user_pvrdma_create_cq_resp resp; + struct pvrdma_cq *cq; + int ret; + + if (cqe < 1) + return NULL; + + cq = malloc(sizeof(*cq)); + if (!cq) + return NULL; + + /* Extra page for shared ring state */ + cq->offset = dev->page_size; + + if (pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE)) + goto err; + + cqe = align_next_power2(cqe); + + if (pvrdma_alloc_cq_buf(dev, cq, &cq->buf, cqe)) + goto err; + + cq->ring_state = cq->buf.buf; + + cmd.buf_addr = (uintptr_t) cq->buf.buf; + cmd.buf_size = cq->buf.length; + ret = ibv_cmd_create_cq(context, cqe, channel, comp_vector, + &cq->ibv_cq, &cmd.ibv_cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp)); + if (ret) + goto err_buf; + + cq->cqn = resp.cqn; + cq->cqe_cnt = cq->ibv_cq.cqe; + + return &cq->ibv_cq; + +err_buf: + pvrdma_free_buf(&cq->buf); +err: + free(cq); + + return NULL; +} + +int pvrdma_destroy_cq(struct ibv_cq *cq) +{ + int ret; + + ret = ibv_cmd_destroy_cq(cq); + if (ret) + return ret; + + pvrdma_free_buf(&to_vcq(cq)->buf); + free(to_vcq(cq)); + + return 0; +} + +int pvrdma_req_notify_cq(struct ibv_cq *ibcq, int solicited) +{ + struct pvrdma_context *ctx = to_vctx(ibcq->context); + struct pvrdma_cq *cq = to_vcq(ibcq); + unsigned int val = cq->cqn; + + val |= solicited ? PVRDMA_UAR_CQ_ARM_SOL : PVRDMA_UAR_CQ_ARM; + pvrdma_write_uar_cq(ctx->uar, val); + + return 0; +} diff --git a/providers/vmw_pvrdma/pvrdma-abi.h b/providers/vmw_pvrdma/pvrdma-abi.h new file mode 100644 index 0000000..1a4c3c8 --- /dev/null +++ b/providers/vmw_pvrdma/pvrdma-abi.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2012-2016 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of EITHER the GNU General Public License + * version 2 as published by the Free Software Foundation or the BSD + * 2-Clause License. This program is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License version 2 for more details at + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html. + * + * You should have received a copy of the GNU General Public License + * along with this program available in the file COPYING in the main + * directory of this source tree. 
+ * + * The BSD 2-Clause License + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __PVRDMA_ABI_FIX_H__ +#define __PVRDMA_ABI_FIX_H__ + +#include <infiniband/kern-abi.h> +#include <rdma/vmw_pvrdma-abi.h> +#include <kernel-abi/vmw_pvrdma-abi.h> + +DECLARE_DRV_CMD(user_pvrdma_alloc_pd, IB_USER_VERBS_CMD_ALLOC_PD, + empty, pvrdma_alloc_pd_resp); +DECLARE_DRV_CMD(user_pvrdma_create_cq, IB_USER_VERBS_CMD_CREATE_CQ, + pvrdma_create_cq, pvrdma_create_cq_resp); +DECLARE_DRV_CMD(user_pvrdma_create_qp, IB_USER_VERBS_CMD_CREATE_QP, + pvrdma_create_qp, pvrdma_create_qp_resp); +DECLARE_DRV_CMD(user_pvrdma_create_srq, IB_USER_VERBS_CMD_CREATE_SRQ, + pvrdma_create_srq, pvrdma_create_srq_resp); +DECLARE_DRV_CMD(user_pvrdma_alloc_ucontext, IB_USER_VERBS_CMD_GET_CONTEXT, + empty, pvrdma_alloc_ucontext_resp); + +#endif /* __PVRDMA_ABI_FIX_H__ */ diff --git a/providers/vmw_pvrdma/pvrdma.h b/providers/vmw_pvrdma/pvrdma.h new file mode 100644 index 0000000..0db6577 --- /dev/null +++ b/providers/vmw_pvrdma/pvrdma.h @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2012-2016 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of EITHER the GNU General Public License + * version 2 as published by the Free Software Foundation or the BSD + * 2-Clause License. This program is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License version 2 for more details at + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html. + * + * You should have received a copy of the GNU General Public License + * along with this program available in the file COPYING in the main + * directory of this source tree. + * + * The BSD 2-Clause License + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __PVRDMA_H__ +#define __PVRDMA_H__ + +#include <config.h> +#include <endian.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <pthread.h> +#include <unistd.h> +#include <sys/mman.h> +#include <infiniband/driver.h> +#include <ccan/minmax.h> +#include <util/compiler.h> + +#include "pvrdma-abi.h" +#include "pvrdma_ring.h" + +#define PFX "pvrdma: " + +enum { + PVRDMA_OPCODE_NOP = 0x00, + PVRDMA_OPCODE_SEND_INVAL = 0x01, + PVRDMA_OPCODE_RDMA_WRITE = 0x08, + PVRDMA_OPCODE_RDMA_WRITE_IMM = 0x09, + PVRDMA_OPCODE_SEND = 0x0a, + PVRDMA_OPCODE_SEND_IMM = 0x0b, + PVRDMA_OPCODE_LSO = 0x0e, + PVRDMA_OPCODE_RDMA_READ = 0x10, + PVRDMA_OPCODE_ATOMIC_CS = 0x11, + PVRDMA_OPCODE_ATOMIC_FA = 0x12, + PVRDMA_OPCODE_ATOMIC_MASK_CS = 0x14, + PVRDMA_OPCODE_ATOMIC_MASK_FA = 0x15, + PVRDMA_OPCODE_BIND_MW = 0x18, + PVRDMA_OPCODE_FMR = 0x19, + PVRDMA_OPCODE_LOCAL_INVAL = 0x1b, + PVRDMA_OPCODE_CONFIG_CMD = 0x1f, + + PVRDMA_RECV_OPCODE_RDMA_WRITE_IMM = 0x00, + PVRDMA_RECV_OPCODE_SEND = 0x01, + PVRDMA_RECV_OPCODE_SEND_IMM = 0x02, + PVRDMA_RECV_OPCODE_SEND_INVAL = 0x03, + + PVRDMA_CQE_OPCODE_ERROR = 0x1e, + PVRDMA_CQE_OPCODE_RESIZE = 0x16, +}; + +enum { + PVRDMA_WQE_CTRL_FENCE = 1 << 6, + PVRDMA_WQE_CTRL_CQ_UPDATE = 3 << 2, + PVRDMA_WQE_CTRL_SOLICIT = 1 << 1, +}; + +struct pvrdma_device { + struct verbs_device ibv_dev; + int page_size; + int abi_version; +}; + +struct pvrdma_context { + struct verbs_context ibv_ctx; + void *uar; + pthread_spinlock_t uar_lock; + int max_qp_wr; + int max_sge; + int max_cqe; + struct pvrdma_qp **qp_tbl; +}; + +struct pvrdma_buf { + void *buf; + size_t length; +}; + +struct pvrdma_pd { + struct ibv_pd ibv_pd; + uint32_t pdn; +}; + +struct pvrdma_cq { + struct ibv_cq ibv_cq; + struct pvrdma_buf buf; + struct pvrdma_buf resize_buf; + pthread_spinlock_t lock; + struct pvrdma_ring_state *ring_state; + uint32_t cqe_cnt; + uint32_t offset; + uint32_t cqn; +}; + +struct pvrdma_srq { + struct ibv_srq ibv_srq; + struct pvrdma_buf buf; + pthread_spinlock_t lock; + uint64_t *wrid; + uint32_t srqn; + int wqe_cnt; + int wqe_size; + int max_gs; + int wqe_shift; + struct pvrdma_ring_state *ring_state; + uint16_t counter; + int offset; +}; + +struct pvrdma_wq { + uint64_t *wrid; + pthread_spinlock_t lock; + int wqe_cnt; + int wqe_size; + struct pvrdma_ring *ring_state; + int max_gs; + int wqe_shift; + int offset; +}; + +struct pvrdma_qp { + struct ibv_qp ibv_qp; + struct pvrdma_buf rbuf; + struct pvrdma_buf sbuf; + int max_inline_data; + int buf_size; + 
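+	/*
+	 * Editorial note (not original text): the sq/rq rings below are
+	 * indexed via pvrdma_ring.h, which lets indices run over
+	 * [0, 2 * cnt) so a spare "generation" bit separates full from
+	 * empty. Worked example with cnt == 4: prod_tail == 6 and
+	 * cons_head == 2 both mask to slot 2, but 6 == (2 ^ 4), i.e. the
+	 * same slot on opposite generations: the ring is full, whereas
+	 * prod_tail == cons_head would mean empty.
+	 */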
__be32 sq_signal_bits; + int sq_spare_wqes; + struct pvrdma_wq sq; + struct pvrdma_wq rq; + int is_srq; + uint32_t qp_handle; +}; + +struct pvrdma_ah { + struct ibv_ah ibv_ah; + struct pvrdma_av av; +}; + +static inline unsigned long align(unsigned long val, unsigned long align) +{ + return (val + align - 1) & ~(align - 1); +} + +static inline int align_next_power2(int size) +{ + int val = 1; + + while (val < size) + val <<= 1; + + return val; +} + +static inline struct pvrdma_device *to_vdev(struct ibv_device *ibdev) +{ + return container_of(ibdev, struct pvrdma_device, ibv_dev.device); +} + +static inline struct pvrdma_context *to_vctx(struct ibv_context *ibctx) +{ + return container_of(ibctx, struct pvrdma_context, ibv_ctx.context); +} + +static inline struct pvrdma_pd *to_vpd(struct ibv_pd *ibpd) +{ + return container_of(ibpd, struct pvrdma_pd, ibv_pd); +} + +static inline struct pvrdma_cq *to_vcq(struct ibv_cq *ibcq) +{ + return container_of(ibcq, struct pvrdma_cq, ibv_cq); +} + +static inline struct pvrdma_srq *to_vsrq(struct ibv_srq *ibsrq) +{ + return container_of(ibsrq, struct pvrdma_srq, ibv_srq); +} + +static inline struct pvrdma_qp *to_vqp(struct ibv_qp *ibqp) +{ + return container_of(ibqp, struct pvrdma_qp, ibv_qp); +} + +static inline struct pvrdma_ah *to_vah(struct ibv_ah *ibah) +{ + return container_of(ibah, struct pvrdma_ah, ibv_ah); +} + +static inline void pvrdma_write_uar_qp(void *uar, unsigned value) +{ + *(__le32 *)(uar + PVRDMA_UAR_QP_OFFSET) = htole32(value); +} + +static inline void pvrdma_write_uar_cq(void *uar, unsigned value) +{ + *(__le32 *)(uar + PVRDMA_UAR_CQ_OFFSET) = htole32(value); +} + +static inline void pvrdma_write_uar_srq(void *uar, unsigned int value) +{ + *(__le32 *)(uar + PVRDMA_UAR_SRQ_OFFSET) = htole32(value); +} + +static inline int ibv_send_flags_to_pvrdma(int flags) +{ + return flags; +} + +static inline enum pvrdma_wr_opcode ibv_wr_opcode_to_pvrdma( + enum ibv_wr_opcode op) +{ + return (enum pvrdma_wr_opcode)op; +} + +static inline enum ibv_wc_status pvrdma_wc_status_to_ibv( + enum pvrdma_wc_status status) +{ + return (enum ibv_wc_status)status; +} + +static inline enum ibv_wc_opcode pvrdma_wc_opcode_to_ibv( + enum pvrdma_wc_opcode op) +{ + return (enum ibv_wc_opcode)op; +} + +static inline int pvrdma_wc_flags_to_ibv(int flags) +{ + return flags; +} + +int pvrdma_alloc_buf(struct pvrdma_buf *buf, size_t size, int page_size); +void pvrdma_free_buf(struct pvrdma_buf *buf); + +int pvrdma_query_device(struct ibv_context *context, + struct ibv_device_attr *attr); +int pvrdma_query_port(struct ibv_context *context, uint8_t port, + struct ibv_port_attr *attr); + +struct ibv_pd *pvrdma_alloc_pd(struct ibv_context *context); +int pvrdma_free_pd(struct ibv_pd *pd); + +struct ibv_mr *pvrdma_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + uint64_t hca_va, int access); +int pvrdma_dereg_mr(struct verbs_mr *mr); + +struct ibv_cq *pvrdma_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector); +int pvrdma_alloc_cq_buf(struct pvrdma_device *dev, struct pvrdma_cq *cq, + struct pvrdma_buf *buf, int nent); +int pvrdma_destroy_cq(struct ibv_cq *cq); +int pvrdma_req_notify_cq(struct ibv_cq *cq, int solicited); +int pvrdma_poll_cq(struct ibv_cq *cq, int ne, struct ibv_wc *wc); +void pvrdma_cq_event(struct ibv_cq *cq); +void pvrdma_cq_clean_int(struct pvrdma_cq *cq, uint32_t qp_handle); +void pvrdma_cq_clean(struct pvrdma_cq *cq, uint32_t qp_handle); +int pvrdma_get_outstanding_cqes(struct pvrdma_cq *cq); +void 
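+/*
+ * Editorial sketch of the UAR doorbell protocol (restated from cq.c and
+ * the helpers above, not original commentary): the UAR page has a fixed
+ * offset per object class, and the 32-bit value written is the object
+ * number OR'ed with a request flag, e.g.:
+ *
+ *	pvrdma_write_uar_cq(ctx->uar, cq->cqn | PVRDMA_UAR_CQ_ARM_SOL);
+ *	pvrdma_write_uar_cq(ctx->uar, cq->cqn | PVRDMA_UAR_CQ_POLL);
+ */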
pvrdma_cq_resize_copy_cqes(struct pvrdma_cq *cq, void *buf, + int new_cqe); + +struct ibv_qp *pvrdma_create_qp(struct ibv_pd *pd, + struct ibv_qp_init_attr *attr); +int pvrdma_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, struct ibv_qp_init_attr *init_attr); +int pvrdma_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask); +int pvrdma_destroy_qp(struct ibv_qp *qp); +void pvrdma_init_qp_indices(struct pvrdma_qp *qp); +void pvrdma_qp_init_sq_ownership(struct pvrdma_qp *qp); +int pvrdma_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr); +int pvrdma_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); +void pvrdma_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type, + struct pvrdma_qp *qp); +int pvrdma_alloc_qp_buf(struct pvrdma_device *dev, struct ibv_qp_cap *cap, + enum ibv_qp_type type, struct pvrdma_qp *qp); +void pvrdma_set_sq_sizes(struct pvrdma_qp *qp, struct ibv_qp_cap *cap, + enum ibv_qp_type type); +struct pvrdma_qp *pvrdma_find_qp(struct pvrdma_context *ctx, + uint32_t qpn); +int pvrdma_store_qp(struct pvrdma_context *ctx, uint32_t qpn, + struct pvrdma_qp *qp); +void pvrdma_clear_qp(struct pvrdma_context *ctx, uint32_t qpn); + +struct ibv_srq *pvrdma_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *attr); +int pvrdma_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr, + int attr_mask); +int pvrdma_query_srq(struct ibv_srq *srq, + struct ibv_srq_attr *attr); +int pvrdma_destroy_srq(struct ibv_srq *srq); +int pvrdma_alloc_srq_buf(struct pvrdma_device *dev, + struct ibv_srq_attr *attr, + struct pvrdma_srq *srq); +int pvrdma_post_srq_recv(struct ibv_srq *ibsrq, + struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); +void pvrdma_init_srq_queue(struct pvrdma_srq *srq); + +struct ibv_ah *pvrdma_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr); +int pvrdma_destroy_ah(struct ibv_ah *ah); + +int pvrdma_alloc_av(struct pvrdma_pd *pd, struct ibv_ah_attr *attr, + struct pvrdma_ah *ah); +void pvrdma_free_av(struct pvrdma_ah *ah); + +#endif /* __PVRDMA_H__ */ diff --git a/providers/vmw_pvrdma/pvrdma_main.c b/providers/vmw_pvrdma/pvrdma_main.c new file mode 100644 index 0000000..14a67c1 --- /dev/null +++ b/providers/vmw_pvrdma/pvrdma_main.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2012-2016 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of EITHER the GNU General Public License + * version 2 as published by the Free Software Foundation or the BSD + * 2-Clause License. This program is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License version 2 for more details at + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html. + * + * You should have received a copy of the GNU General Public License + * along with this program available in the file COPYING in the main + * directory of this source tree. + * + * The BSD 2-Clause License + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "pvrdma.h" + +static void pvrdma_free_context(struct ibv_context *ibctx); + +/* + * VMware PVRDMA vendor id and PCI device id. + */ +#define PCI_VENDOR_ID_VMWARE 0x15AD +#define PCI_DEVICE_ID_VMWARE_PVRDMA 0x0820 + +static const struct verbs_context_ops pvrdma_ctx_ops = { + .free_context = pvrdma_free_context, + .query_device = pvrdma_query_device, + .query_port = pvrdma_query_port, + .alloc_pd = pvrdma_alloc_pd, + .dealloc_pd = pvrdma_free_pd, + + .reg_mr = pvrdma_reg_mr, + .dereg_mr = pvrdma_dereg_mr, + .create_cq = pvrdma_create_cq, + .poll_cq = pvrdma_poll_cq, + .req_notify_cq = pvrdma_req_notify_cq, + .destroy_cq = pvrdma_destroy_cq, + + .create_qp = pvrdma_create_qp, + .query_qp = pvrdma_query_qp, + .modify_qp = pvrdma_modify_qp, + .destroy_qp = pvrdma_destroy_qp, + + .create_srq = pvrdma_create_srq, + .modify_srq = pvrdma_modify_srq, + .query_srq = pvrdma_query_srq, + .destroy_srq = pvrdma_destroy_srq, + .post_srq_recv = pvrdma_post_srq_recv, + + .post_send = pvrdma_post_send, + .post_recv = pvrdma_post_recv, + .create_ah = pvrdma_create_ah, + .destroy_ah = pvrdma_destroy_ah, +}; + +int pvrdma_alloc_buf(struct pvrdma_buf *buf, size_t size, int page_size) +{ + int ret; + + buf->length = align(size, page_size); + buf->buf = mmap(NULL, buf->length, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (buf->buf == MAP_FAILED) + return errno; + + ret = ibv_dontfork_range(buf->buf, size); + if (ret) + munmap(buf->buf, buf->length); + + return ret; +} + +void pvrdma_free_buf(struct pvrdma_buf *buf) +{ + ibv_dofork_range(buf->buf, buf->length); + munmap(buf->buf, buf->length); +} + +static int pvrdma_init_context_shared(struct pvrdma_context *context, + struct ibv_device *ibdev, + int cmd_fd) +{ + struct ibv_get_context cmd; + struct user_pvrdma_alloc_ucontext_resp resp; + + context->ibv_ctx.context.cmd_fd = cmd_fd; + if (ibv_cmd_get_context(&context->ibv_ctx, &cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp))) + return errno; + + context->qp_tbl = calloc(resp.qp_tab_size & 0xFFFF, + sizeof(struct pvrdma_qp *)); + if (!context->qp_tbl) + return -ENOMEM; + + context->uar = mmap(NULL, to_vdev(ibdev)->page_size, PROT_WRITE, + MAP_SHARED, cmd_fd, 0); + if (context->uar == MAP_FAILED) { + free(context->qp_tbl); + return errno; + } + + pthread_spin_init(&context->uar_lock, PTHREAD_PROCESS_PRIVATE); + + verbs_set_ops(&context->ibv_ctx, &pvrdma_ctx_ops); + + return 0; +} + +static void pvrdma_free_context_shared(struct pvrdma_context *context, + struct 
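+/*
+ * Editorial note on pvrdma_alloc_buf() above (not original text): queue
+ * buffers are anonymous mmap()s handed to the device, so
+ * ibv_dontfork_range() excludes them from copy-on-write across fork()
+ * (via MADV_DONTFORK, when fork protection is enabled); otherwise DMA
+ * after a fork could land in the wrong process's copy of the pages.
+ * pvrdma_free_buf() undoes both steps.
+ */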
pvrdma_device *dev) +{ + munmap(context->uar, dev->page_size); + free(context->qp_tbl); +} + +static struct verbs_context *pvrdma_alloc_context(struct ibv_device *ibdev, + int cmd_fd, + void *private_data) +{ + struct pvrdma_context *context; + + context = verbs_init_and_alloc_context(ibdev, cmd_fd, context, ibv_ctx, + RDMA_DRIVER_VMW_PVRDMA); + if (!context) + return NULL; + + if (pvrdma_init_context_shared(context, ibdev, cmd_fd)) { + verbs_uninit_context(&context->ibv_ctx); + free(context); + return NULL; + } + + return &context->ibv_ctx; +} + +static void pvrdma_free_context(struct ibv_context *ibctx) +{ + struct pvrdma_context *context = to_vctx(ibctx); + + pvrdma_free_context_shared(context, to_vdev(ibctx->device)); + verbs_uninit_context(&context->ibv_ctx); + free(context); +} + +static void pvrdma_uninit_device(struct verbs_device *verbs_device) +{ + struct pvrdma_device *dev = to_vdev(&verbs_device->device); + + free(dev); +} + +static struct verbs_device * +pvrdma_device_alloc(struct verbs_sysfs_dev *sysfs_dev) +{ + struct pvrdma_device *dev; + + dev = calloc(1, sizeof(*dev)); + if (!dev) + return NULL; + + dev->abi_version = sysfs_dev->abi_ver; + dev->page_size = sysconf(_SC_PAGESIZE); + + return &dev->ibv_dev; +} + +static const struct verbs_match_ent hca_table[] = { + VERBS_DRIVER_ID(RDMA_DRIVER_VMW_PVRDMA), + VERBS_PCI_MATCH(PCI_VENDOR_ID_VMWARE, PCI_DEVICE_ID_VMWARE_PVRDMA, + NULL), + {} +}; + +static const struct verbs_device_ops pvrdma_dev_ops = { + .name = "pvrdma", + .match_min_abi_version = PVRDMA_UVERBS_ABI_VERSION, + .match_max_abi_version = PVRDMA_UVERBS_ABI_VERSION, + .match_table = hca_table, + .alloc_device = pvrdma_device_alloc, + .uninit_device = pvrdma_uninit_device, + .alloc_context = pvrdma_alloc_context, +}; +PROVIDER_DRIVER(vmw_pvrdma, pvrdma_dev_ops); diff --git a/providers/vmw_pvrdma/pvrdma_ring.h b/providers/vmw_pvrdma/pvrdma_ring.h new file mode 100644 index 0000000..565b45c --- /dev/null +++ b/providers/vmw_pvrdma/pvrdma_ring.h @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2012-2016 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of EITHER the GNU General Public License + * version 2 as published by the Free Software Foundation or the BSD + * 2-Clause License. This program is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License version 2 for more details at + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html. + * + * You should have received a copy of the GNU General Public License + * along with this program in the file COPYING. If not, write to the + * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + * The BSD 2-Clause License + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __PVRDMA_RING_H__ +#define __PVRDMA_RING_H__ + +#include <linux/types.h> + +#define PVRDMA_INVALID_IDX -1 /* Invalid index. */ + +/* + * Rings are shared with the device, so read/write access must be atomic. + * PVRDMA is x86 only, and since 32-bit access is atomic on x86, using + * regular uint32_t is safe. + */ + +struct pvrdma_ring { + uint32_t prod_tail; /* Producer tail. */ + uint32_t cons_head; /* Consumer head. */ +}; + +struct pvrdma_ring_state { + struct pvrdma_ring tx; /* Tx ring. */ + struct pvrdma_ring rx; /* Rx ring. */ +}; + +static inline int pvrdma_idx_valid(uint32_t idx, uint32_t max_elems) +{ + /* Generates fewer instructions than a less-than. */ + return (idx & ~((max_elems << 1) - 1)) == 0; +} + +static inline int32_t pvrdma_idx(uint32_t *var, uint32_t max_elems) +{ + const uint32_t idx = *var; + + if (pvrdma_idx_valid(idx, max_elems)) + return idx & (max_elems - 1); + return PVRDMA_INVALID_IDX; +} + +static inline void pvrdma_idx_ring_inc(uint32_t *var, uint32_t max_elems) +{ + uint32_t idx = (*var) + 1; /* Increment. */ + + idx &= (max_elems << 1) - 1; /* Modulo size, flip gen. */ + *var = idx; +} + +static inline int32_t pvrdma_idx_ring_has_space(const struct pvrdma_ring *r, + uint32_t max_elems, + uint32_t *out_tail) +{ + const uint32_t tail = r->prod_tail; + const uint32_t head = r->cons_head; + + if (pvrdma_idx_valid(tail, max_elems) && + pvrdma_idx_valid(head, max_elems)) { + *out_tail = tail & (max_elems - 1); + return tail != (head ^ max_elems); + } + return PVRDMA_INVALID_IDX; +} + +static inline int32_t pvrdma_idx_ring_has_data(const struct pvrdma_ring *r, + uint32_t max_elems, + uint32_t *out_head) +{ + const uint32_t tail = r->prod_tail; + const uint32_t head = r->cons_head; + + if (pvrdma_idx_valid(tail, max_elems) && + pvrdma_idx_valid(head, max_elems)) { + *out_head = head & (max_elems - 1); + return tail != head; + } + return PVRDMA_INVALID_IDX; +} + +static inline int32_t pvrdma_idx_ring_is_valid_idx(const struct pvrdma_ring *r, + uint32_t max_elems, + uint32_t *idx) +{ + const uint32_t tail = r->prod_tail; + const uint32_t head = r->cons_head; + + if (pvrdma_idx_valid(tail, max_elems) && + pvrdma_idx_valid(head, max_elems) && + pvrdma_idx_valid(*idx, max_elems)) { + if (tail > head && (*idx < tail && *idx >= head)) + return 1; + else if (head > tail && (*idx >= head || *idx < tail)) + return 1; + } + return 0; +} + +#endif /* __PVRDMA_RING_H__ */ diff --git a/providers/vmw_pvrdma/qp.c b/providers/vmw_pvrdma/qp.c new file mode 100644 index 0000000..3082529 --- /dev/null +++ b/providers/vmw_pvrdma/qp.c @@ -0,0 +1,731 @@ +/* + * Copyright (c) 2012-2017 VMware, Inc. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of EITHER the GNU General Public License + * version 2 as published by the Free Software Foundation or the BSD + * 2-Clause License. This program is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License version 2 for more details at + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html. + * + * You should have received a copy of the GNU General Public License + * along with this program available in the file COPYING in the main + * directory of this source tree. + * + * The BSD 2-Clause License + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <util/udma_barrier.h> + +#include "pvrdma.h" + +int pvrdma_alloc_qp_buf(struct pvrdma_device *dev, struct ibv_qp_cap *cap, + enum ibv_qp_type type, struct pvrdma_qp *qp) +{ + qp->sq.wrid = calloc(qp->sq.wqe_cnt, sizeof(uint64_t)); + if (!qp->sq.wrid) + return -1; + + /* Align page size for sq */ + qp->sbuf.length = align(qp->sq.offset + + qp->sq.wqe_cnt * qp->sq.wqe_size, + dev->page_size); + + if (pvrdma_alloc_buf(&qp->sbuf, qp->sbuf.length, dev->page_size)) { + free(qp->sq.wrid); + return -1; + } + + memset(qp->sbuf.buf, 0, qp->sbuf.length); + + if (!qp->is_srq) { + qp->rq.wrid = calloc(qp->rq.wqe_cnt, sizeof(uint64_t)); + if (!qp->rq.wrid) { + pvrdma_free_buf(&qp->sbuf); + free(qp->sq.wrid); + return -1; + } + + /* Align page size for rq */ + qp->rbuf.length = align(qp->rq.offset + + qp->rq.wqe_cnt * qp->rq.wqe_size, + dev->page_size); + + if (pvrdma_alloc_buf(&qp->rbuf, qp->rbuf.length, + dev->page_size)) { + free(qp->sq.wrid); + free(qp->rq.wrid); + pvrdma_free_buf(&qp->sbuf); + return -1; + } + memset(qp->rbuf.buf, 0, qp->rbuf.length); + } else { + qp->rbuf.buf = NULL; + qp->rbuf.length = 0; + } + + qp->buf_size = qp->rbuf.length + qp->sbuf.length; + + return 0; +} + +void pvrdma_init_srq_queue(struct pvrdma_srq *srq) +{ + srq->ring_state->rx.cons_head = 0; + srq->ring_state->rx.prod_tail = 0; +} + +struct ibv_srq *pvrdma_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *attr) +{ + struct pvrdma_device *dev = to_vdev(pd->context->device); + struct user_pvrdma_create_srq cmd; + struct user_pvrdma_create_srq_resp resp; + struct pvrdma_srq *srq; + int ret; + + attr->attr.max_wr = align_next_power2(max_t(uint32_t, 1U, attr->attr.max_wr)); + attr->attr.max_sge = max_t(uint32_t, 1U, attr->attr.max_sge); + + srq = malloc(sizeof(*srq)); + if (!srq) + return NULL; + + if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE)) + goto err; + + srq->wqe_cnt = attr->attr.max_wr; + srq->max_gs = attr->attr.max_sge; + srq->wqe_size = align_next_power2(sizeof(struct pvrdma_rq_wqe_hdr) + + sizeof(struct ibv_sge) * + srq->max_gs); + /* Page reserved for queue metadata */ + srq->offset = dev->page_size; + + if (pvrdma_alloc_srq_buf(dev, &attr->attr, srq)) + goto err_spinlock; + + srq->ring_state = srq->buf.buf; + pvrdma_init_srq_queue(srq); + + memset(&cmd, 0, sizeof(cmd)); + cmd.buf_addr = (uintptr_t) srq->buf.buf; + cmd.buf_size = srq->buf.length; + + ret = ibv_cmd_create_srq(pd, &srq->ibv_srq, attr, + &cmd.ibv_cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp)); + + if (ret) + goto err_free; + + srq->srqn = resp.srqn; + + return &srq->ibv_srq; + +err_free: + free(srq->wrid); + pvrdma_free_buf(&srq->buf); +err_spinlock: + pthread_spin_destroy(&srq->lock); +err: + free(srq); + + return NULL; +} + +int pvrdma_modify_srq(struct ibv_srq *srq, + struct ibv_srq_attr *attr, + int attr_mask) +{ + struct ibv_modify_srq cmd; + + return ibv_cmd_modify_srq(srq, attr, attr_mask, &cmd, sizeof(cmd)); +} + +int pvrdma_query_srq(struct ibv_srq *srq, + struct ibv_srq_attr *attr) +{ + struct ibv_query_srq cmd; + + return ibv_cmd_query_srq(srq, attr, &cmd, sizeof(cmd)); +} + +int pvrdma_destroy_srq(struct ibv_srq *ibsrq) +{ + struct pvrdma_srq *srq = to_vsrq(ibsrq); + int ret; + + ret = ibv_cmd_destroy_srq(ibsrq); + if (ret) + return ret; + + pthread_spin_destroy(&srq->lock); + pvrdma_free_buf(&srq->buf); + free(srq->wrid); + free(srq); + + return 0; +} + +static void pvrdma_init_qp_queue(struct pvrdma_qp *qp) +{ + qp->sq.ring_state->cons_head = 0; + qp->sq.ring_state->prod_tail = 0; + if 
(qp->rq.ring_state) { + qp->rq.ring_state->cons_head = 0; + qp->rq.ring_state->prod_tail = 0; + } +} + +struct ibv_qp *pvrdma_create_qp(struct ibv_pd *pd, + struct ibv_qp_init_attr *attr) +{ + struct pvrdma_device *dev = to_vdev(pd->context->device); + struct user_pvrdma_create_qp cmd; + struct user_pvrdma_create_qp_resp resp = {}; + struct pvrdma_qp *qp; + int is_srq = !!(attr->srq); + + attr->cap.max_send_sge = max_t(uint32_t, 1U, attr->cap.max_send_sge); + attr->cap.max_send_wr = + align_next_power2(max_t(uint32_t, 1U, attr->cap.max_send_wr)); + + if (!is_srq) { + attr->cap.max_recv_sge = max_t(uint32_t, 1U, attr->cap.max_recv_sge); + attr->cap.max_recv_wr = + align_next_power2(max_t(uint32_t, 1U, attr->cap.max_recv_wr)); + } else { + attr->cap.max_recv_sge = 0; + attr->cap.max_recv_wr = 0; + } + + qp = calloc(1, sizeof(*qp)); + if (!qp) + return NULL; + + qp->is_srq = is_srq; + + qp->sq.max_gs = attr->cap.max_send_sge; + qp->sq.wqe_cnt = attr->cap.max_send_wr; + /* Extra page for shared ring state */ + qp->sq.offset = dev->page_size; + qp->sq.wqe_size = align_next_power2(sizeof(struct pvrdma_sq_wqe_hdr) + + sizeof(struct ibv_sge) * + qp->sq.max_gs); + + if (!is_srq) { + qp->rq.max_gs = attr->cap.max_recv_sge; + qp->rq.wqe_cnt = attr->cap.max_recv_wr; + qp->rq.offset = 0; + qp->rq.wqe_size = align_next_power2(sizeof(struct pvrdma_rq_wqe_hdr) + + sizeof(struct ibv_sge) * + qp->rq.max_gs); + } else { + qp->rq.max_gs = 0; + qp->rq.wqe_cnt = 0; + qp->rq.offset = 0; + qp->rq.wqe_size = 0; + } + + /* Allocate [rq][sq] memory */ + if (pvrdma_alloc_qp_buf(dev, &attr->cap, attr->qp_type, qp)) + goto err; + + qp->sq.ring_state = qp->sbuf.buf; + if (pthread_spin_init(&qp->sq.lock, PTHREAD_PROCESS_PRIVATE)) + goto err_free; + + if (!is_srq) { + qp->rq.ring_state = (struct pvrdma_ring *)&qp->sq.ring_state[1]; + if (pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE)) + goto err_free; + } else { + qp->rq.ring_state = NULL; + } + + pvrdma_init_qp_queue(qp); + + memset(&cmd, 0, sizeof(cmd)); + cmd.sbuf_addr = (uintptr_t)qp->sbuf.buf; + cmd.sbuf_size = qp->sbuf.length; + cmd.rbuf_addr = (uintptr_t)qp->rbuf.buf; + cmd.rbuf_size = qp->rbuf.length; + cmd.qp_addr = (uintptr_t) qp; + + if (ibv_cmd_create_qp(pd, &qp->ibv_qp, attr, &cmd.ibv_cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp))) + goto err_free; + + if (resp.drv_payload.qp_handle != 0) + qp->qp_handle = resp.drv_payload.qp_handle; + else + qp->qp_handle = qp->ibv_qp.qp_num; + + to_vctx(pd->context)->qp_tbl[qp->qp_handle & 0xFFFF] = qp; + + /* If set, each WR submitted to the SQ generate a completion entry */ + if (attr->sq_sig_all) + qp->sq_signal_bits = htobe32(PVRDMA_WQE_CTRL_CQ_UPDATE); + else + qp->sq_signal_bits = 0; + + return &qp->ibv_qp; + +err_free: + if (qp->sq.wqe_cnt) + free(qp->sq.wrid); + if (qp->rq.wqe_cnt) + free(qp->rq.wrid); + pvrdma_free_buf(&qp->rbuf); + pvrdma_free_buf(&qp->sbuf); +err: + free(qp); + + return NULL; +} + +int pvrdma_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr) +{ + struct ibv_query_qp cmd; + struct pvrdma_qp *qp = to_vqp(ibqp); + int ret; + + ret = ibv_cmd_query_qp(ibqp, attr, attr_mask, init_attr, + &cmd, sizeof(cmd)); + if (ret) + return ret; + + /* Passing back */ + init_attr->cap.max_send_wr = qp->sq.wqe_cnt; + init_attr->cap.max_send_sge = qp->sq.max_gs; + init_attr->cap.max_inline_data = qp->max_inline_data; + + attr->cap = init_attr->cap; + + return 0; +} + +int pvrdma_modify_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, + int 
attr_mask) +{ + struct ibv_modify_qp cmd; + struct pvrdma_qp *qp = to_vqp(ibqp); + int ret; + + /* Sanity check */ + if (!attr_mask) + return 0; + + ret = ibv_cmd_modify_qp(ibqp, attr, attr_mask, &cmd, sizeof(cmd)); + + if (!ret && + (attr_mask & IBV_QP_STATE) && + attr->qp_state == IBV_QPS_RESET) { + pvrdma_cq_clean(to_vcq(ibqp->recv_cq), qp->qp_handle); + if (ibqp->send_cq != ibqp->recv_cq) + pvrdma_cq_clean(to_vcq(ibqp->send_cq), qp->qp_handle); + pvrdma_init_qp_queue(qp); + } + + return ret; +} + +static void pvrdma_lock_cqs(struct ibv_qp *qp) +{ + struct pvrdma_cq *send_cq = to_vcq(qp->send_cq); + struct pvrdma_cq *recv_cq = to_vcq(qp->recv_cq); + + if (send_cq == recv_cq) { + pthread_spin_lock(&send_cq->lock); + } else if (send_cq->cqn < recv_cq->cqn) { + pthread_spin_lock(&send_cq->lock); + pthread_spin_lock(&recv_cq->lock); + } else { + pthread_spin_lock(&recv_cq->lock); + pthread_spin_lock(&send_cq->lock); + } +} + +static void pvrdma_unlock_cqs(struct ibv_qp *qp) +{ + struct pvrdma_cq *send_cq = to_vcq(qp->send_cq); + struct pvrdma_cq *recv_cq = to_vcq(qp->recv_cq); + + if (send_cq == recv_cq) { + pthread_spin_unlock(&send_cq->lock); + } else if (send_cq->cqn < recv_cq->cqn) { + pthread_spin_unlock(&recv_cq->lock); + pthread_spin_unlock(&send_cq->lock); + } else { + pthread_spin_unlock(&send_cq->lock); + pthread_spin_unlock(&recv_cq->lock); + } +} + +int pvrdma_destroy_qp(struct ibv_qp *ibqp) +{ + struct pvrdma_context *ctx = to_vctx(ibqp->context); + struct pvrdma_qp *qp = to_vqp(ibqp); + int ret; + + ret = ibv_cmd_destroy_qp(ibqp); + if (ret) { + return ret; + } + + pvrdma_lock_cqs(ibqp); + /* Dump cqs */ + pvrdma_cq_clean_int(to_vcq(ibqp->recv_cq), qp->qp_handle); + + if (ibqp->send_cq != ibqp->recv_cq) + pvrdma_cq_clean_int(to_vcq(ibqp->send_cq), qp->qp_handle); + pvrdma_unlock_cqs(ibqp); + + free(qp->sq.wrid); + free(qp->rq.wrid); + pvrdma_free_buf(&qp->rbuf); + pvrdma_free_buf(&qp->sbuf); + ctx->qp_tbl[qp->qp_handle & 0xFFFF] = NULL; + free(qp); + + return 0; +} + +static void *get_srq_wqe(struct pvrdma_srq *srq, int n) +{ + return srq->buf.buf + srq->offset + (n * srq->wqe_size); +} + +static void *get_rq_wqe(struct pvrdma_qp *qp, int n) +{ + return qp->rbuf.buf + qp->rq.offset + (n * qp->rq.wqe_size); +} + +static void *get_sq_wqe(struct pvrdma_qp *qp, int n) +{ + return qp->sbuf.buf + qp->sq.offset + (n * qp->sq.wqe_size); +} + +int pvrdma_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr) +{ + struct pvrdma_context *ctx = to_vctx(ibqp->context); + struct pvrdma_qp *qp = to_vqp(ibqp); + int ind; + int nreq = 0; + struct pvrdma_sq_wqe_hdr *wqe_hdr; + struct ibv_sge *sge; + int ret = 0; + int i; + + /* + * In states lower than RTS, we can fail immediately. In other states, + * just post and let the device figure it out. 
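+	 *
+	 * The ring bookkeeping below comes from pvrdma_ring.h: indices run
+	 * over [0, 2 * wqe_cnt), so prod_tail == cons_head means the ring
+	 * is empty while prod_tail == (cons_head ^ wqe_cnt) means it is
+	 * full. pvrdma_idx() masks a stored index down to a slot number
+	 * and returns PVRDMA_INVALID_IDX if the stored value is out of
+	 * range.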
+ */ + if (ibqp->state < IBV_QPS_RTS) { + *bad_wr = wr; + return EINVAL; + } + + pthread_spin_lock(&qp->sq.lock); + + ind = pvrdma_idx(&(qp->sq.ring_state->prod_tail), qp->sq.wqe_cnt); + if (ind < 0) { + pthread_spin_unlock(&qp->sq.lock); + *bad_wr = wr; + return EINVAL; + } + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + unsigned int tail; + + if (pvrdma_idx_ring_has_space(qp->sq.ring_state, + qp->sq.wqe_cnt, &tail) <= 0) { + ret = ENOMEM; + *bad_wr = wr; + goto out; + } + + if (wr->num_sge > qp->sq.max_gs) { + ret = EINVAL; + *bad_wr = wr; + goto out; + } + + wqe_hdr = (struct pvrdma_sq_wqe_hdr *)get_sq_wqe(qp, ind); + wqe_hdr->wr_id = wr->wr_id; + wqe_hdr->num_sge = wr->num_sge; + wqe_hdr->opcode = ibv_wr_opcode_to_pvrdma(wr->opcode); + wqe_hdr->send_flags = ibv_send_flags_to_pvrdma(wr->send_flags); + if (wr->opcode == IBV_WR_SEND_WITH_IMM || + wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM) + wqe_hdr->ex.imm_data = wr->imm_data; + + switch (ibqp->qp_type) { + case IBV_QPT_UD: + wqe_hdr->wr.ud.remote_qpn = wr->wr.ud.remote_qpn; + wqe_hdr->wr.ud.remote_qkey = wr->wr.ud.remote_qkey; + wqe_hdr->wr.ud.av = to_vah(wr->wr.ud.ah)->av; + break; + case IBV_QPT_RC: + switch (wr->opcode) { + case IBV_WR_RDMA_READ: + case IBV_WR_RDMA_WRITE: + case IBV_WR_RDMA_WRITE_WITH_IMM: + wqe_hdr->wr.rdma.remote_addr = + wr->wr.rdma.remote_addr; + wqe_hdr->wr.rdma.rkey = wr->wr.rdma.rkey; + break; + case IBV_WR_ATOMIC_CMP_AND_SWP: + case IBV_WR_ATOMIC_FETCH_AND_ADD: + wqe_hdr->wr.atomic.remote_addr = wr->wr.atomic.remote_addr; + wqe_hdr->wr.atomic.rkey = wr->wr.atomic.rkey; + wqe_hdr->wr.atomic.compare_add = wr->wr.atomic.compare_add; + if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) + wqe_hdr->wr.atomic.swap = wr->wr.atomic.swap; + break; + default: + /* No extra segments required for sends */ + break; + } + break; + default: + fprintf(stderr, PFX "invalid post send opcode\n"); + ret = EINVAL; + *bad_wr = wr; + goto out; + } + + /* Write each segment */ + sge = (struct ibv_sge *)&wqe_hdr[1]; + for (i = 0; i < wr->num_sge; i++) { + sge->addr = wr->sg_list[i].addr; + sge->length = wr->sg_list[i].length; + sge->lkey = wr->sg_list[i].lkey; + sge++; + } + + udma_to_device_barrier(); + pvrdma_idx_ring_inc(&(qp->sq.ring_state->prod_tail), + qp->sq.wqe_cnt); + + qp->sq.wrid[ind] = wr->wr_id; + ++ind; + if (ind >= qp->sq.wqe_cnt) + ind = 0; + } + +out: + if (nreq) { + udma_to_device_barrier(); + pvrdma_write_uar_qp(ctx->uar, + PVRDMA_UAR_QP_SEND | qp->qp_handle); + } + + pthread_spin_unlock(&qp->sq.lock); + + return ret; +} + +int pvrdma_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct pvrdma_context *ctx = to_vctx(ibqp->context); + struct pvrdma_qp *qp = to_vqp(ibqp); + struct pvrdma_rq_wqe_hdr *wqe_hdr; + struct ibv_sge *sge; + int nreq; + int ind; + int i; + int ret = 0; + + if (qp->is_srq) + return EINVAL; + + if (!wr || !bad_wr) + return EINVAL; + + /* + * In the RESET state, we can fail immediately. For other states, + * just post and let the device figure it out. 
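+	 *
+	 * If a WR in the middle of the list fails validation, the WRs
+	 * already written to the ring stay posted (the doorbell is still
+	 * rung for them) and *bad_wr is set to the first WR that was not
+	 * consumed.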
+ */ + if (ibqp->state == IBV_QPS_RESET) { + *bad_wr = wr; + return EINVAL; + } + + pthread_spin_lock(&qp->rq.lock); + + ind = pvrdma_idx(&(qp->rq.ring_state->prod_tail), qp->rq.wqe_cnt); + if (ind < 0) { + pthread_spin_unlock(&qp->rq.lock); + *bad_wr = wr; + return EINVAL; + } + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + unsigned int tail; + + if (pvrdma_idx_ring_has_space(qp->rq.ring_state, + qp->rq.wqe_cnt, &tail) <= 0) { + ret = ENOMEM; + *bad_wr = wr; + goto out; + } + + if (wr->num_sge > qp->rq.max_gs) { + ret = EINVAL; + *bad_wr = wr; + goto out; + } + + /* Fetch wqe */ + wqe_hdr = (struct pvrdma_rq_wqe_hdr *)get_rq_wqe(qp, ind); + wqe_hdr->wr_id = wr->wr_id; + wqe_hdr->num_sge = wr->num_sge; + + sge = (struct ibv_sge *)(wqe_hdr + 1); + for (i = 0; i < wr->num_sge; ++i) { + sge->addr = (uint64_t)wr->sg_list[i].addr; + sge->length = wr->sg_list[i].length; + sge->lkey = wr->sg_list[i].lkey; + sge++; + } + + pvrdma_idx_ring_inc(&qp->rq.ring_state->prod_tail, + qp->rq.wqe_cnt); + + qp->rq.wrid[ind] = wr->wr_id; + ind = (ind + 1) & (qp->rq.wqe_cnt - 1); + } + +out: + if (nreq) + pvrdma_write_uar_qp(ctx->uar, + PVRDMA_UAR_QP_RECV | qp->qp_handle); + + pthread_spin_unlock(&qp->rq.lock); + return ret; +} + +int pvrdma_post_srq_recv(struct ibv_srq *ibsrq, + struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct pvrdma_context *ctx = to_vctx(ibsrq->context); + struct pvrdma_srq *srq = to_vsrq(ibsrq); + struct pvrdma_rq_wqe_hdr *wqe_hdr; + struct ibv_sge *sge; + int nreq; + int ind; + int i; + int ret = 0; + + if (!wr || !bad_wr) + return EINVAL; + + pthread_spin_lock(&srq->lock); + + ind = pvrdma_idx(&(srq->ring_state->rx.prod_tail), srq->wqe_cnt); + if (ind < 0) { + pthread_spin_unlock(&srq->lock); + *bad_wr = wr; + return EINVAL; + } + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + unsigned int tail; + + if (pvrdma_idx_ring_has_space(&srq->ring_state->rx, + srq->wqe_cnt, &tail) <= 0) { + ret = ENOMEM; + *bad_wr = wr; + break; + } + + if (wr->num_sge > srq->max_gs) { + ret = EINVAL; + *bad_wr = wr; + break; + } + + /* Fetch wqe */ + wqe_hdr = (struct pvrdma_rq_wqe_hdr *)get_srq_wqe(srq, ind); + wqe_hdr->wr_id = wr->wr_id; + wqe_hdr->num_sge = wr->num_sge; + + sge = (struct ibv_sge *)(wqe_hdr + 1); + for (i = 0; i < wr->num_sge; ++i) { + sge->addr = (uint64_t)wr->sg_list[i].addr; + sge->length = wr->sg_list[i].length; + sge->lkey = wr->sg_list[i].lkey; + sge++; + } + + pvrdma_idx_ring_inc(&srq->ring_state->rx.prod_tail, + srq->wqe_cnt); + + srq->wrid[ind] = wr->wr_id; + ind = (ind + 1) & (srq->wqe_cnt - 1); + } + + if (nreq) + pvrdma_write_uar_srq(ctx->uar, + PVRDMA_UAR_SRQ_RECV | srq->srqn); + + pthread_spin_unlock(&srq->lock); + + return ret; +} + +int pvrdma_alloc_srq_buf(struct pvrdma_device *dev, + struct ibv_srq_attr *attr, + struct pvrdma_srq *srq) +{ + srq->wrid = calloc(srq->wqe_cnt, sizeof(uint64_t)); + if (!srq->wrid) + return -1; + + srq->buf.length = align(srq->offset, dev->page_size); + srq->buf.length += 2 * align(srq->wqe_cnt * srq->wqe_size, dev->page_size); + + if (pvrdma_alloc_buf(&srq->buf, srq->buf.length, dev->page_size)) { + free(srq->wrid); + return -1; + } + + memset(srq->buf.buf, 0, srq->buf.length); + + return 0; +} diff --git a/providers/vmw_pvrdma/verbs.c b/providers/vmw_pvrdma/verbs.c new file mode 100644 index 0000000..e8423c0 --- /dev/null +++ b/providers/vmw_pvrdma/verbs.c @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2012-2016 VMware, Inc. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of EITHER the GNU General Public License + * version 2 as published by the Free Software Foundation or the BSD + * 2-Clause License. This program is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED + * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License version 2 for more details at + * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html. + * + * You should have received a copy of the GNU General Public License + * along with this program available in the file COPYING in the main + * directory of this source tree. + * + * The BSD 2-Clause License + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <arpa/inet.h> +#include "pvrdma.h" + +int pvrdma_query_device(struct ibv_context *context, + struct ibv_device_attr *attr) +{ + struct ibv_query_device cmd; + uint64_t raw_fw_ver; + unsigned major, minor, sub_minor; + int ret; + + ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, + &cmd, sizeof(cmd)); + if (ret) + return ret; + + major = (raw_fw_ver >> 32) & 0xffff; + minor = (raw_fw_ver >> 16) & 0xffff; + sub_minor = raw_fw_ver & 0xffff; + + snprintf(attr->fw_ver, sizeof(attr->fw_ver), + "%d.%d.%03d", major, minor, sub_minor); + + return 0; +} + +int pvrdma_query_port(struct ibv_context *context, uint8_t port, + struct ibv_port_attr *attr) +{ + struct ibv_query_port cmd; + + return ibv_cmd_query_port(context, port, attr, &cmd, sizeof(cmd)); +} + +struct ibv_pd *pvrdma_alloc_pd(struct ibv_context *context) +{ + struct ibv_alloc_pd cmd; + struct user_pvrdma_alloc_pd_resp resp; + struct pvrdma_pd *pd; + + pd = malloc(sizeof(*pd)); + if (!pd) + return NULL; + + if (ibv_cmd_alloc_pd(context, &pd->ibv_pd, &cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp))) { + free(pd); + return NULL; + } + + pd->pdn = resp.pdn; + + return &pd->ibv_pd; +} + +int pvrdma_free_pd(struct ibv_pd *pd) +{ + int ret; + + ret = ibv_cmd_dealloc_pd(pd); + if (ret) + return ret; + + free(to_vpd(pd)); + + return 0; +} + +struct ibv_mr *pvrdma_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + uint64_t hca_va, int access) +{ + struct verbs_mr *vmr; + struct ibv_reg_mr cmd; + struct ib_uverbs_reg_mr_resp resp; + int ret; + + vmr = malloc(sizeof(*vmr)); + if (!vmr) + return NULL; + + ret = ibv_cmd_reg_mr(pd, addr, length, hca_va, access, vmr, &cmd, + sizeof(cmd), &resp, sizeof(resp)); + if (ret) { + free(vmr); + return NULL; + } + + return &vmr->ibv_mr; +} + +int pvrdma_dereg_mr(struct verbs_mr *vmr) +{ + int ret; + + ret = ibv_cmd_dereg_mr(vmr); + if (ret) + return ret; + + free(vmr); + + return 0; +} + +static int is_multicast_gid(const union ibv_gid *gid) +{ + return gid->raw[0] == 0xff; +} + +static int is_link_local_gid(const union ibv_gid *gid) +{ + return gid->global.subnet_prefix == htobe64(0xfe80000000000000ULL); +} + +static int is_ipv6_addr_v4mapped(const struct in6_addr *a) +{ + return IN6_IS_ADDR_V4MAPPED(&a->s6_addr32) || + /* IPv4 encoded multicast addresses */ + (a->s6_addr32[0] == htobe32(0xff0e0000) && + ((a->s6_addr32[1] | + (a->s6_addr32[2] ^ htobe32(0x0000ffff))) == 0UL)); +} + +static int set_mac_from_gid(const union ibv_gid *gid, + __u8 mac[6]) +{ + if (is_link_local_gid(gid)) { + /* + * The MAC is embedded in GID[8-10,13-15] with the + * 7th most significant bit inverted. 
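+	 * This undoes the modified EUI-64 mapping used for link-local
+	 * addresses: bytes 11-12 of the GID hold the 0xFF, 0xFE filler,
+	 * and the XOR below flips the universal/local bit (0x02 in the
+	 * first octet) back.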
+ */ + memcpy(mac, gid->raw + 8, 3); + memcpy(mac + 3, gid->raw + 13, 3); + mac[0] ^= 2; + + return 0; + } + + return 1; +} + +struct ibv_ah *pvrdma_create_ah(struct ibv_pd *pd, + struct ibv_ah_attr *attr) +{ + struct pvrdma_ah *ah; + struct pvrdma_av *av; + struct ibv_port_attr port_attr; + + if (!attr->is_global) + return NULL; + + if (ibv_query_port(pd->context, attr->port_num, &port_attr)) + return NULL; + + if (port_attr.link_layer == IBV_LINK_LAYER_UNSPECIFIED || + port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) + return NULL; + + if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET && + (!is_link_local_gid(&attr->grh.dgid) && + !is_multicast_gid(&attr->grh.dgid) && + !is_ipv6_addr_v4mapped((struct in6_addr *)attr->grh.dgid.raw))) + return NULL; + + ah = calloc(1, sizeof(*ah)); + if (!ah) + return NULL; + + av = &ah->av; + av->port_pd = to_vpd(pd)->pdn | (attr->port_num << 24); + av->src_path_bits = attr->src_path_bits; + av->src_path_bits |= 0x80; + av->gid_index = attr->grh.sgid_index; + av->hop_limit = attr->grh.hop_limit; + av->sl_tclass_flowlabel = (attr->grh.traffic_class << 20) | + attr->grh.flow_label; + memcpy(av->dgid, attr->grh.dgid.raw, 16); + + if (port_attr.port_cap_flags & IBV_PORT_IP_BASED_GIDS) { + if (!ibv_resolve_eth_l2_from_gid(pd->context, attr, + av->dmac, NULL)) + return &ah->ibv_ah; + } else { + if (!set_mac_from_gid(&attr->grh.dgid, av->dmac)) + return &ah->ibv_ah; + } + + free(ah); + return NULL; +} + +int pvrdma_destroy_ah(struct ibv_ah *ah) +{ + free(to_vah(ah)); + + return 0; +} diff --git a/pyverbs/CMakeLists.txt b/pyverbs/CMakeLists.txt new file mode 100755 index 0000000..8603e9d --- /dev/null +++ b/pyverbs/CMakeLists.txt @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019, Mellanox Technologies. All rights reserved. See COPYING file + +rdma_cython_module(pyverbs "" + addr.pyx + base.pyx + cm_enums.pyx + cmid.pyx + cq.pyx + device.pyx + enums.pyx + mem_alloc.pyx + mr.pyx + pd.pyx + qp.pyx + wr.pyx + xrcd.pyx + srq.pyx + ) + +rdma_python_module(pyverbs + __init__.py + pyverbs_error.py + utils.py + ) + +# mlx5 provider is not built without coherent DMA, e.g. ARM32 build. +if (HAVE_COHERENT_DMA) +add_subdirectory(providers/mlx5) +endif() diff --git a/pyverbs/__init__.pxd b/pyverbs/__init__.pxd new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/pyverbs/__init__.pxd diff --git a/pyverbs/__init__.py b/pyverbs/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/pyverbs/__init__.py diff --git a/pyverbs/addr.pxd b/pyverbs/addr.pxd new file mode 100644 index 0000000..e7322e8 --- /dev/null +++ b/pyverbs/addr.pxd @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2018, Mellanox Technologies. All rights reserved. See COPYING file + +#cython: language_level=3 + +from .base cimport PyverbsObject, PyverbsCM +from pyverbs cimport libibverbs as v + + +cdef class GID(PyverbsObject): + cdef v.ibv_gid gid + +cdef class GRH(PyverbsObject): + cdef v.ibv_grh grh + +cdef class GlobalRoute(PyverbsObject): + cdef v.ibv_global_route gr + +cdef class AHAttr(PyverbsObject): + cdef v.ibv_ah_attr ah_attr + +cdef class AH(PyverbsCM): + cdef v.ibv_ah *ah + cdef object pd + cpdef close(self) diff --git a/pyverbs/addr.pyx b/pyverbs/addr.pyx new file mode 100644 index 0000000..c36b1b4 --- /dev/null +++ b/pyverbs/addr.pyx @@ -0,0 +1,414 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2018, Mellanox Technologies. All rights reserved. 
See COPYING file + +from libc.stdint cimport uint8_t + +from .pyverbs_error import PyverbsUserError, PyverbsRDMAError +from pyverbs.utils import gid_str_to_array, gid_str +from pyverbs.base import PyverbsRDMAErrno +cimport pyverbs.libibverbs as v +from pyverbs.pd cimport PD +from pyverbs.cq cimport WC + +cdef extern from 'endian.h': + unsigned long be64toh(unsigned long host_64bits) + + +cdef class GID(PyverbsObject): + """ + GID class represents ibv_gid. It enables user to query for GIDs values. + """ + def __init__(self, val=None): + super().__init__() + if val is not None: + vals = gid_str_to_array(val) + + for i in range(16): + self.gid.raw[i] = <uint8_t>int(vals[i],16) + + @property + def gid(self): + """ + Expose the inner GID + :return: A GID string in an 8 words format: + 'xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx' + """ + return self.__str__() + @gid.setter + def gid(self, val): + """ + Sets the inner GID + :param val: A GID string in an 8 words format: + 'xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx' + :return: None + """ + self._set_gid(val) + + def _set_gid(self, val): + vals = gid_str_to_array(val) + + for i in range(16): + self.gid.raw[i] = <uint8_t>int(vals[i],16) + + def __str__(self): + return gid_str(self.gid._global.subnet_prefix, + self.gid._global.interface_id) + + +cdef class GRH(PyverbsObject): + """ + Represents ibv_grh struct. Used when creating or initializing an + Address Handle from a Work Completion. + """ + def __init__(self, GID sgid=None, GID dgid=None, version_tclass_flow=0, + paylen=0, next_hdr=0, hop_limit=1): + """ + Initializes a GRH object + :param sgid: Source GID + :param dgid: Destination GID + :param version_tclass_flow: A 32b big endian used to communicate + service level e.g. across subnets + :param paylen: A 16b big endian that is the packet length in bytes, + starting from the first byte after the GRH up to and + including the last byte of the ICRC + :param next_hdr: An 8b unsigned integer specifying the next header + For non-raw packets: 0x1B + For raw packets: According to IETF RFC 1700 + :param hop_limit: An 8b unsigned integer specifying the number of hops + (i.e. 
routers) that the packet is permitted to take + prior to being discarded + :return: A GRH object + """ + super().__init__() + self.grh.dgid = dgid.gid + self.grh.sgid = sgid.gid + self.grh.version_tclass_flow = version_tclass_flow + self.grh.paylen = paylen + self.grh.next_hdr = next_hdr + self.grh.hop_limit = hop_limit + + @property + def dgid(self): + return gid_str(self.grh.dgid._global.subnet_prefix, + self.grh.dgid._global.interface_id) + @dgid.setter + def dgid(self, val): + vals = gid_str_to_array(val) + for i in range(16): + self.grh.dgid.raw[i] = <uint8_t>int(vals[i],16) + + @property + def sgid(self): + return gid_str(self.grh.sgid._global.subnet_prefix, + self.grh.sgid._global.interface_id) + @sgid.setter + def sgid(self, val): + vals = gid_str_to_array(val) + for i in range(16): + self.grh.sgid.raw[i] = <uint8_t>int(vals[i],16) + + @property + def version_tclass_flow(self): + return self.grh.version_tclass_flow + + @version_tclass_flow.setter + def version_tclass_flow(self, val): + self.grh.version_tclass_flow = val + + @property + def paylen(self): + return self.grh.paylen + @paylen.setter + def paylen(self, val): + self.grh.paylen = val + + @property + def next_hdr(self): + return self.grh.next_hdr + @next_hdr.setter + def next_hdr(self, val): + self.grh.next_hdr = val + + @property + def hop_limit(self): + return self.grh.hop_limit + @hop_limit.setter + def hop_limit(self, val): + self.grh.hop_limit = val + + def __str__(self): + print_format = '{:22}: {:<20}\n' + return print_format.format('DGID', self.dgid) +\ + print_format.format('SGID', self.sgid) +\ + print_format.format('version tclass flow', self.version_tclass_flow) +\ + print_format.format('paylen', self.paylen) +\ + print_format.format('next header', self.next_hdr) +\ + print_format.format('hop limit', self.hop_limit) + + +cdef class GlobalRoute(PyverbsObject): + """ + Represents ibv_global_route. Used in Address Handle creation and describes + the values to be used in the GRH of the packets that will be sent using + this Address Handle. + """ + def __init__(self, GID dgid=None, flow_label=0, sgid_index=0, hop_limit=1, + traffic_class=0): + """ + Initializes a GlobalRoute object with given parameters. + :param dgid: Destination GID + :param flow_label: A 20b value. If non-zero, gives a hint to switches + and routers that this sequence of packets must be + delivered in order + :param sgid_index: An index in the port's GID table that identifies the + originator of the packet + :param hop_limit: An 8b unsigned integer specifying the number of hops + (i.e. 
routers) that the packet is permitted to take + prior to being discarded + :param traffic_class: An 8b unsigned integer specifying the required + delivery priority for routers + :return: A GlobalRoute object + """ + super().__init__() + self.gr.dgid=dgid.gid + self.gr.flow_label = flow_label + self.gr.sgid_index = sgid_index + self.gr.hop_limit = hop_limit + self.gr.traffic_class = traffic_class + + @property + def dgid(self): + return gid_str(self.gr.dgid._global.subnet_prefix, + self.gr.dgid._global.interface_id) + @dgid.setter + def dgid(self, val): + vals = gid_str_to_array(val) + for i in range(16): + self.gr.dgid.raw[i] = <uint8_t>int(vals[i],16) + + @property + def flow_label(self): + return self.gr.flow_label + @flow_label.setter + def flow_label(self, val): + self.gr.flow_label = val + + @property + def sgid_index(self): + return self.gr.sgid_index + @sgid_index.setter + def sgid_index(self, val): + self.gr.sgid_index = val + + @property + def hop_limit(self): + return self.gr.hop_limit + @hop_limit.setter + def hop_limit(self, val): + self.gr.hop_limit = val + + @property + def traffic_class(self): + return self.gr.traffic_class + @traffic_class.setter + def traffic_class(self, val): + self.gr.traffic_class = val + + def __str__(self): + print_format = '{:22}: {:<20}\n' + return print_format.format('DGID', self.dgid) +\ + print_format.format('flow label', self.flow_label) +\ + print_format.format('sgid index', self.sgid_index) +\ + print_format.format('hop limit', self.hop_limit) +\ + print_format.format('traffic class', self.traffic_class) + + +cdef class AHAttr(PyverbsObject): + """ Represents ibv_ah_attr struct """ + def __init__(self, dlid=0, sl=0, src_path_bits=0, static_rate=0, + is_global=0, port_num=1, GlobalRoute gr=None): + """ + Initializes an AHAttr object. + :param dlid: Destination LID, a 16b unsigned integer + :param sl: Service level, an 8b unsigned integer + :param src_path_bits: When LMC (LID mask count) is used in the port, + packets are being sent with the port's base LID, + bitwise ORed with the value of the src_path_bits. + An 8b unsigned integer + :param static_rate: An 8b unsigned integer limiting the rate of packets + that are being sent to the subnet + :param is_global: If non-zero, GRH information exists in the Address + Handle + :param port_num: The local physical port from which the packets will be + sent + :param grh: Attributes of a global routing header. Will only be used if + is_global is non zero. 
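+        A minimal construction sketch (illustrative values, assuming the
+        destination GID is reachable via index 0 of the local port's GID
+        table):
+            gr = GlobalRoute(dgid=GID('fe80:0000:0000:0000:0000:0000:0000:0001'),
+                             sgid_index=0)
+            ah_attr = AHAttr(is_global=1, gr=gr, port_num=1)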
+ :return: An AHAttr object + """ + super().__init__() + self.ah_attr.port_num = port_num + self.ah_attr.sl = sl + self.ah_attr.src_path_bits = src_path_bits + self.ah_attr.dlid = dlid + self.ah_attr.static_rate = static_rate + self.ah_attr.is_global = is_global + # Do not set GRH fields for a non-global AH + if is_global: + if gr is None: + raise PyverbsUserError('Global AH Attr is created but gr parameter is None') + self.ah_attr.grh.dgid = gr.gr.dgid + self.ah_attr.grh.flow_label = gr.flow_label + self.ah_attr.grh.sgid_index = gr.sgid_index + self.ah_attr.grh.hop_limit = gr.hop_limit + self.ah_attr.grh.traffic_class = gr.traffic_class + + @property + def port_num(self): + return self.ah_attr.port_num + @port_num.setter + def port_num(self, val): + self.ah_attr.port_num = val + + @property + def sl(self): + return self.ah_attr.sl + @sl.setter + def sl(self, val): + self.ah_attr.sl = val + + @property + def src_path_bits(self): + return self.ah_attr.src_path_bits + @src_path_bits.setter + def src_path_bits(self, val): + self.ah_attr.src_path_bits = val + + @property + def dlid(self): + return self.ah_attr.dlid + @dlid.setter + def dlid(self, val): + self.ah_attr.dlid = val + + @property + def static_rate(self): + return self.ah_attr.static_rate + @static_rate.setter + def static_rate(self, val): + self.ah_attr.static_rate = val + + @property + def is_global(self): + return self.ah_attr.is_global + @is_global.setter + def is_global(self, val): + self.ah_attr.is_global = val + + @property + def dgid(self): + if self.ah_attr.is_global: + return gid_str(self.ah_attr.grh.dgid._global.subnet_prefix, + self.ah_attr.grh.dgid._global.interface_id) + @dgid.setter + def dgid(self, val): + if self.ah_attr.is_global: + vals = gid_str_to_array(val) + for i in range(16): + self.ah_attr.grh.dgid.raw[i] = <uint8_t>int(vals[i],16) + + @property + def flow_label(self): + if self.ah_attr.is_global: + return self.ah_attr.grh.flow_label + @flow_label.setter + def flow_label(self, val): + self.ah_attr.grh.flow_label = val + + @property + def sgid_index(self): + if self.ah_attr.is_global: + return self.ah_attr.grh.sgid_index + @sgid_index.setter + def sgid_index(self, val): + self.ah_attr.grh.sgid_index = val + + @property + def hop_limit(self): + if self.ah_attr.is_global: + return self.ah_attr.grh.hop_limit + @hop_limit.setter + def hop_limit(self, val): + self.ah_attr.grh.hop_limit = val + + @property + def traffic_class(self): + if self.ah_attr.is_global: + return self.ah_attr.grh.traffic_class + @traffic_class.setter + def traffic_class(self, val): + self.ah_attr.grh.traffic_class = val + + def __str__(self): + print_format = ' {:22}: {:<20}\n' + if self.is_global: + global_format = print_format.format('dgid', self.dgid) +\ + print_format.format('flow label', self.flow_label) +\ + print_format.format('sgid index', self.sgid_index) +\ + print_format.format('hop limit', self.hop_limit) +\ + print_format.format('traffic_class', self.traffic_class) + else: + global_format = '' + return print_format.format('port num', self.port_num) +\ + print_format.format('sl', self.sl) +\ + print_format.format('source path bits', self.src_path_bits) +\ + print_format.format('dlid', self.dlid) +\ + print_format.format('static rate', self.static_rate) +\ + print_format.format('is global', self.is_global) + global_format + + +cdef class AH(PyverbsCM): + def __init__(self, PD pd, **kwargs): + """ + Initializes an AH object with the given values. 
+ Two creation methods are supported: + - Creation via AHAttr object (calls ibv_create_ah) + - Creation via a WC object (calls ibv_create_ah_from_wc) + :param pd: PD object this AH belongs to + :param kwargs: Arguments: + * *attr* (AHAttr) + An AHAttr object (represents ibv_ah_attr struct) + * *wc* + A WC object to use for AH initialization + * *grh* + A GRH object to use for AH initialization (when using wc) + * *port_num* + Port number to be used for this AH (when using wc) + :return: An AH object on success + """ + super().__init__() + if len(kwargs) == 1: + # Create AH via ib_create_ah + ah_attr = <AHAttr>kwargs['attr'] + self.ah = v.ibv_create_ah(pd.pd, &ah_attr.ah_attr) + else: + # Create AH from WC + wc = <WC>kwargs['wc'] + grh = <GRH>kwargs['grh'] + port_num = kwargs['port_num'] + self.ah = v.ibv_create_ah_from_wc(pd.pd, &wc.wc, &grh.grh, port_num) + if self.ah == NULL: + raise PyverbsRDMAErrno('Failed to create AH') + pd.add_ref(self) + self.pd = pd + + def __dealloc__(self): + self.close() + + cpdef close(self): + if self.ah != NULL: + self.logger.debug('Closing AH') + rc = v.ibv_destroy_ah(self.ah) + if rc: + raise PyverbsRDMAError('Failed to destroy AH', rc) + self.ah = NULL + self.pd = None diff --git a/pyverbs/base.pxd b/pyverbs/base.pxd new file mode 100644 index 0000000..efa6323 --- /dev/null +++ b/pyverbs/base.pxd @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019, Mellanox Technologies. All rights reserved. + +#cython: language_level=3 + +cdef class PyverbsObject(object): + cdef object __weakref__ + cdef object logger + +cdef class PyverbsCM(PyverbsObject): + cpdef close(self) + +cdef close_weakrefs(iterables) diff --git a/pyverbs/base.pyx b/pyverbs/base.pyx new file mode 100644 index 0000000..790ba41 --- /dev/null +++ b/pyverbs/base.pyx @@ -0,0 +1,70 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019, Mellanox Technologies. All rights reserved. + +from libc.errno cimport errno +import logging + +from pyverbs.pyverbs_error import PyverbsRDMAError +cimport pyverbs.libibverbs as v + + +def inc_rkey(rkey): + return v.ibv_inc_rkey(rkey) + + +cpdef PyverbsRDMAErrno(str msg): + return PyverbsRDMAError(msg, errno) + + +LOG_LEVEL=logging.INFO +LOG_FORMAT='[%(levelname)s] %(asctime)s %(filename)s:%(lineno)s: %(message)s' +logging.basicConfig(format=LOG_FORMAT, level=LOG_LEVEL, datefmt='%d %b %Y %H:%M:%S') + + +cdef close_weakrefs(iterables): + """ + For each iterable element of iterables, pop each element and + call its close() method. This method is used when an object is being + closed while other objects still hold C references to it; the object + holds weakrefs to such other object, and closes them before trying to + teardown the C resources. + :param iterables: an array of WeakSets + :return: None + """ + # None elements can be present if an object's close() was called more + # than once (e.g. GC and by another object) + for it in iterables: + if it is None: + continue + while True: + try: + tmp = it.pop() + tmp.close() + except KeyError: # popping an empty set + break + + +cdef class PyverbsObject(object): + + def __init__(self): + self.logger = logging.getLogger(self.__class__.__name__) + + def set_log_level(self, val): + self.logger.setLevel(val) + + +cdef class PyverbsCM(PyverbsObject): + """ + This is a base class for pyverbs' context manager objects. It includes + __enter__ and __exit__ functions. + close() is also declared but it should be overridden by each inheriting + class. 
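+    Inheriting objects can therefore be used in a with-statement, e.g.
+    (illustrative):
+        with PD(ctx) as pd:
+            ...  # pd.close() is invoked on exit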
+ """ + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + return self.close() + + cpdef close(self): + pass diff --git a/pyverbs/cm_enums.pyx b/pyverbs/cm_enums.pyx new file mode 120000 index 0000000..bdab2b5 --- /dev/null +++ b/pyverbs/cm_enums.pyx @@ -0,0 +1 @@ +librdmacm_enums.pxd \ No newline at end of file diff --git a/pyverbs/cmid.pxd b/pyverbs/cmid.pxd new file mode 100755 index 0000000..df3c920 --- /dev/null +++ b/pyverbs/cmid.pxd @@ -0,0 +1,34 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019, Mellanox Technologies. All rights reserved. See COPYING file + +#cython: language_level=3 + +from pyverbs.base cimport PyverbsObject, PyverbsCM +cimport pyverbs.librdmacm as cm + + +cdef class CMID(PyverbsCM): + cdef cm.rdma_cm_id *id + cdef object event_channel + cdef object ctx + cdef object pd + cpdef close(self) + + +cdef class CMEventChannel(PyverbsObject): + cdef cm.rdma_event_channel *event_channel + cpdef close(self) + + +cdef class CMEvent(PyverbsObject): + cdef cm.rdma_cm_event *event + cpdef close(self) + + +cdef class AddrInfo(PyverbsObject): + cdef cm.rdma_addrinfo *addr_info + cpdef close(self) + + +cdef class ConnParam(PyverbsObject): + cdef cm.rdma_conn_param conn_param diff --git a/pyverbs/cmid.pyx b/pyverbs/cmid.pyx new file mode 100755 index 0000000..66d7326 --- /dev/null +++ b/pyverbs/cmid.pyx @@ -0,0 +1,541 @@ +from libc.string cimport memset + +from pyverbs.pyverbs_error import PyverbsUserError +from pyverbs.qp cimport QPInitAttr, QPAttr +from pyverbs.base import PyverbsRDMAErrno +cimport pyverbs.libibverbs_enums as e +cimport pyverbs.librdmacm_enums as ce +from pyverbs.device cimport Context +cimport pyverbs.libibverbs as v +cimport pyverbs.librdmacm as cm +from pyverbs.pd cimport PD +from pyverbs.mr cimport MR +from pyverbs.cq cimport WC + + +cdef class ConnParam(PyverbsObject): + + def __init__(self, resources=1, depth=1, flow_control=0, retry=5, + rnr_retry=5, srq=0, qp_num=0): + """ + Initialize a ConnParam object over an underlying rdma_conn_param + C object which contains connection parameters. There are a few types of + port spaces in RDMACM: RDMA_PS_TCP, RDMA_PS_UDP, RDMA_PS_IB and + RDMA_PS_IPOIB. RDMA_PS_TCP resembles RC QP connection, which provides + reliable, connection-oriented QP communication. This object applies only + to RDMA_PS_TCP port space. + :param resources: Max outstanding RDMA read and atomic ops that local + side will accept from the remote side. + :param depth: Max outstanding RDMA read and atomic ops that local side + will have to the remote side. + :param flow_control: Specifies if hardware flow control is available. + :param retry: Max number of times that a send, RDMA or atomic op from + the remote peer should be retried. + :param rnr_retry: The maximum number of times that a send operation from + the remote peer should be retried on a connection + after receiving a receiver not ready (RNR) error. + :param srq: Specifies if the QP using shared receive queue, ignored if + the QP created by CMID. + :param qp_num: Specifies the QP number, ignored if the QP created by + CMID. 
+ :return: ConnParam object + """ + super().__init__() + memset(&self.conn_param, 0, sizeof(cm.rdma_conn_param)) + self.conn_param.responder_resources = resources + self.conn_param.initiator_depth = depth + self.conn_param.flow_control = flow_control + self.conn_param.retry_count = retry + self.conn_param.rnr_retry_count = rnr_retry + self.conn_param.srq = srq + self.conn_param.qp_num = qp_num + + @property + def qpn(self): + return self.conn_param.qp_num + @qpn.setter + def qpn(self, val): + self.conn_param.qp_num = val + + def __str__(self): + print_format = '{:<4}: {:<4}\n' + return '{}: {}\n'.format('Connection parameters', "") +\ + print_format.format('responder resources', self.conn_param.responder_resources) +\ + print_format.format('initiator depth', self.conn_param.initiator_depth) +\ + print_format.format('flow control', self.conn_param.flow_control) +\ + print_format.format('retry count', self.conn_param.retry_count) +\ + print_format.format('rnr retry count', self.conn_param.rnr_retry_count) +\ + print_format.format('srq', self.conn_param.srq) +\ + print_format.format('qp number', self.conn_param.qp_num) + + +cdef class AddrInfo(PyverbsObject): + + def __init__(self, src=None, dst=None, service=None, port_space=0, + flags=0): + """ + Initialize an AddrInfo object over an underlying rdma_addrinfo C object. + :param src: Name, dotted-decimal IPv4 or IPv6 hex address to bind to. + :param dst: Name, dotted-decimal IPv4 or IPv6 hex address to connect to. + :param service: The service name or port number of the address. + :param port_space: RDMA port space used (RDMA_PS_UDP or RDMA_PS_TCP). + :param flags: Hint flags which control the operation. + :return: An AddrInfo object which contains information needed to + establish communication. + """ + cdef char* srvc = NULL + cdef char* src_addr = NULL + cdef char* dst_addr = NULL + cdef cm.rdma_addrinfo hints + cdef cm.rdma_addrinfo *hints_ptr = NULL + cdef cm.rdma_addrinfo *res = NULL + + super().__init__() + if src is not None: + if isinstance(src, str): + src = src.encode('utf-8') + src_addr = <char*>src + if dst is not None: + if isinstance(dst, str): + dst = dst.encode('utf-8') + dst_addr = <char*>dst + if service is not None: + if isinstance(service, str): + service = service.encode('utf-8') + srvc = <char*>service + + hints_ptr = &hints + memset(hints_ptr, 0, sizeof(cm.rdma_addrinfo)) + hints.ai_port_space = port_space + hints.ai_flags = flags + if flags & ce.RAI_PASSIVE: + ret = cm.rdma_getaddrinfo(src_addr, srvc, hints_ptr, + &self.addr_info) + else: + if src: + hints.ai_flags |= ce.RAI_PASSIVE + ret = cm.rdma_getaddrinfo(src_addr, NULL, hints_ptr, &res) + if ret != 0: + raise PyverbsRDMAErrno('Failed to get Address Info') + hints.ai_src_addr = <cm.sockaddr*>res.ai_src_addr + hints.ai_src_len = res.ai_src_len + hints.ai_flags &= ~ce.RAI_PASSIVE + ret = cm.rdma_getaddrinfo(dst_addr, srvc, hints_ptr, + &self.addr_info) + if src: + cm.rdma_freeaddrinfo(res) + if ret != 0: + raise PyverbsRDMAErrno('Failed to get Address Info') + + def __dealloc__(self): + self.close() + + cpdef close(self): + if self.addr_info != NULL: + self.logger.debug('Closing AddrInfo') + cm.rdma_freeaddrinfo(self.addr_info) + self.addr_info = NULL + + +cdef class CMEvent(PyverbsObject): + + def __init__(self, CMEventChannel channel): + """ + Initialize a CMEvent object over an underlying rdma_cm_event C object + :param channel: Event Channel on which this event has been received + :return: CMEvent object + """ + super().__init__() + ret = 
cm.rdma_get_cm_event(channel.event_channel, &self.event) + if ret != 0: + raise PyverbsRDMAErrno('Failed to create CMEvent') + self.logger.debug('Created a CMEvent') + + def __dealloc__(self): + self.close() + + cpdef close(self): + if self.event != NULL: + self.logger.debug('Closing CMEvent') + self.ack_cm_event() + self.event = NULL + + @property + def event_type(self): + return self.event.event + + def ack_cm_event(self): + """ + Free a communication event. This call frees the event structure and any + memory that it references. + :return: None + """ + ret = cm.rdma_ack_cm_event(self.event) + if ret != 0: + raise PyverbsRDMAErrno('Failed to Acknowledge Event - {}' + .format(self.event_str())) + self.event = NULL + + def event_str(self): + if self.event == NULL: + return '' + return (<bytes>cm.rdma_event_str(self.event_type)).decode() + + +cdef class CMEventChannel(PyverbsObject): + + def __init__(self): + """ + Initialize a CMEventChannel object over an underlying rdma_event_channel + C object. + :return: EventChannel object + """ + super().__init__() + self.event_channel = cm.rdma_create_event_channel() + if self.event_channel == NULL: + raise PyverbsRDMAErrno('Failed to create CMEventChannel') + self.logger.debug('Created a CMEventChannel') + + def __dealloc__(self): + self.close() + + cpdef close(self): + if self.event_channel != NULL: + self.logger.debug('Closing CMEventChannel') + cm.rdma_destroy_event_channel(self.event_channel) + self.event_channel = NULL + + +cdef class CMID(PyverbsCM): + + def __init__(self, object creator=None, QPInitAttr qp_init_attr=None, + PD pd=None, port_space=ce.RDMA_PS_TCP, CMID listen_id=None): + """ + Initialize a CMID object over an underlying rdma_cm_id C object. + This is the main RDMA CM object which provides most of the rdmacm API. + Currently only synchronous RDMA_PS_TCP communication supported. + Notes: User-specific context, currently not supported. + :param creator: For synchronous communication we need AddrInfo object in + order to establish connection. We allow creator to be + None for inner usage, see get_request method. + :param qp_init_attr: Optional initial QP attributes of CMID + associated QP. + :param pd: Optional parameter, a PD to be associated with this CMID. + :param port_space: RDMA port space. + :param listen_id: When passive side establishes a connection, it creates + a new CMID. listen_id is used to initialize the new + CMID. + :return: CMID object for synchronous communication. 
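+        A minimal synchronous active-side sketch (illustrative address and
+        port; QPCap and QPInitAttr are imported from pyverbs.qp):
+            cap = QPCap(max_send_wr=1, max_recv_wr=1)
+            qp_init = QPInitAttr(qp_type=e.IBV_QPT_RC, cap=cap)
+            addr = AddrInfo(dst='192.168.0.1', service='7471',
+                            port_space=ce.RDMA_PS_TCP)
+            cid = CMID(creator=addr, qp_init_attr=qp_init)
+            cid.connect()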
+ """ + cdef v.ibv_qp_init_attr *init + cdef v.ibv_pd *in_pd = NULL + + super().__init__() + self.pd = None + self.ctx = None + self.event_channel = None + if creator is None: + return + elif isinstance(creator, AddrInfo): + init = NULL if qp_init_attr is None else &qp_init_attr.attr + if pd is not None: + in_pd = pd.pd + self.pd = pd + ret = cm.rdma_create_ep(&self.id, (<AddrInfo>creator).addr_info, + in_pd, init) + if ret != 0: + raise PyverbsRDMAErrno('Failed to create CM ID') + if not (<AddrInfo>creator).addr_info.ai_flags & ce.RAI_PASSIVE: + self.ctx = Context(cmid=self) + if self.pd is None: + self.pd = PD(self) + elif isinstance(creator, CMEventChannel): + self.event_channel = <CMEventChannel>creator + ret = cm.rdma_create_id((<CMEventChannel>creator).event_channel, + &self.id, NULL, port_space) + if ret != 0: + raise PyverbsRDMAErrno('Failed to create CM ID') + elif isinstance(creator, CMEvent): + if listen_id is None: + raise PyverbsUserError('listen ID not provided') + self.id = (<CMEvent>creator).event.id + self.event_channel = listen_id.event_channel + self.ctx = listen_id.ctx + self.pd = listen_id.pd + else: + raise PyverbsRDMAErrno('Cannot create CM ID from {obj}' + .format(obj=type(creator))) + + @property + def event_channel(self): + return self.event_channel + + @property + def context(self): + return self.ctx + + @property + def pd(self): + return self.pd + + def __dealloc__(self): + self.close() + + cpdef close(self): + if self.id != NULL: + self.logger.debug('Closing CMID') + if self.event_channel is None: + cm.rdma_destroy_ep(self.id) + else: + if self.id.qp != NULL: + cm.rdma_destroy_qp(self.id) + ret = cm.rdma_destroy_id(self.id) + if ret != 0: + raise PyverbsRDMAErrno('Failed to close CMID') + if self.ctx: + (<Context>self.ctx).context = NULL + if self.pd: + (<PD>self.pd).pd = NULL + self.id = NULL + + def get_request(self): + """ + Retrieves the next pending connection request event. The call may only + be used on listening CMIDs operating synchronously. If the call is + successful, a new CMID representing the connection request will be + returned to the user. The new CMID will reference event information + associated with the request until the user calls reject, accept, or + close on the newly created identifier. + :return: New CMID representing the connection request. + """ + to_conn = CMID() + ret = cm.rdma_get_request(self.id, &to_conn.id) + if ret != 0: + raise PyverbsRDMAErrno('Failed to get request, no connection established') + self.ctx = Context(cmid=to_conn) + self.pd = PD(to_conn) + return to_conn + + def bind_addr(self, AddrInfo lai not None): + """ + Associate a source address with a CMID. If binding to a specific local + address, the CMID will also be bound to a local RDMA device. + :param lai: Local address information + :return: None + """ + ret = cm.rdma_bind_addr(self.id, lai.addr_info.ai_src_addr) + if ret != 0: + raise PyverbsRDMAErrno('Failed to Bind ID') + # After bind address, cm_id contains ibv_context. + # Now we can create Context object. + if self.ctx is None: + self.ctx = Context(cmid=self) + if self.pd is None: + self.pd = PD(self) + + def resolve_addr(self, AddrInfo rai not None, timeout_ms=2000): + """ + Resolve destination and optional source addresses from IP addresses to + an RDMA address. If successful, the specified rdma_cm_id will be bound + to a local device. + :param rai: Remote address information. 
:param timeout_ms: Time to wait for resolution to complete [msec]
+        :return: None
+        """
+        ret = cm.rdma_resolve_addr(self.id, rai.addr_info.ai_src_addr,
+                                   rai.addr_info.ai_dst_addr, timeout_ms)
+        if ret != 0:
+            raise PyverbsRDMAErrno('Failed to Resolve Address')
+
+    def resolve_route(self, timeout_ms=2000):
+        """
+        Resolve an RDMA route to the destination address in order to establish
+        a connection. The destination must already have been resolved by
+        calling resolve_addr. Thus this function is called on the client side
+        after resolve_addr but before calling connect.
+        :param timeout_ms: Time to wait for resolution to complete
+        :return: None
+        """
+        ret = cm.rdma_resolve_route(self.id, timeout_ms)
+        if ret != 0:
+            raise PyverbsRDMAErrno('Failed to Resolve Route')
+        # After the route is resolved, cm_id contains an ibv_context.
+        # Now we can create a Context object.
+        if self.ctx is None:
+            self.ctx = Context(cmid=self)
+        if self.pd is None:
+            self.pd = PD(self)
+
+    def listen(self, backlog=0):
+        """
+        Listen for incoming connection requests or datagram service lookup.
+        The listen is restricted to the locally bound source address.
+        :param backlog: The backlog of incoming connection requests
+        :return: None
+        """
+        ret = cm.rdma_listen(self.id, backlog)
+        if ret != 0:
+            raise PyverbsRDMAErrno('Listen Failed')
+
+    def connect(self, ConnParam param=None):
+        """
+        Initiates an active connection request to a remote destination.
+        :param param: Optional connection parameters
+        :return: None
+        """
+        cdef cm.rdma_conn_param *conn = &param.conn_param if param else NULL
+        ret = cm.rdma_connect(self.id, conn)
+        if ret != 0:
+            raise PyverbsRDMAErrno('Failed to Connect')
+
+    def disconnect(self):
+        """
+        Disconnects a connection and transitions any associated QP to the
+        error state.
+        :return: None
+        """
+        ret = cm.rdma_disconnect(self.id)
+        if ret != 0:
+            raise PyverbsRDMAErrno('Failed to Disconnect')
+
+    def accept(self, ConnParam param=None):
+        """
+        Called from the listening side to accept a connection or datagram
+        service lookup request.
+        :param param: Optional connection parameters
+        :return: None
+        """
+        cdef cm.rdma_conn_param *conn = &param.conn_param if param else NULL
+        ret = cm.rdma_accept(self.id, conn)
+        if ret != 0:
+            raise PyverbsRDMAErrno('Failed to Accept Connection')
+
+    def establish(self):
+        """
+        Complete an active connection request.
+        If a QP has not been created on the CMID, this method should be
+        called by the active side to complete the connection, after getting
+        the connect response event. This will trigger a connection established
+        event on the passive side.
+        This method should not be used on a CMID on which a QP has been
+        created.
+        """
+        ret = cm.rdma_establish(self.id)
+        if ret != 0:
+            raise PyverbsRDMAErrno('Failed to Complete an active connection request')
+
+    def create_qp(self, QPInitAttr qp_init not None):
+        """
+        Create a QP, which is associated with the CMID.
+        If CMID and qp_init don't hold any CQs, new CQs will be created and
+        associated with the CMID.
+        If only qp_init provides CQs, they will not be associated with the
+        CMID.
+        If both provide CQs, they have to be the same CQs.
+        :param qp_init: QP init attributes
+        """
+        ret = cm.rdma_create_qp(self.id, (<PD>self.pd).pd, &qp_init.attr)
+        if ret != 0:
+            raise PyverbsRDMAErrno('Failed to Create QP')
+
+    def query_qp(self, attr_mask):
+        """
+        Query QP using ibv_query_qp.
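+        Example (sketch; e is the cimported pyverbs.libibverbs_enums and the
+        CMID is assumed to already have a QP):
+            qp_attr, qp_init_attr = cmid.query_qp(e.IBV_QP_STATE)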
:param attr_mask: Which attributes to query (use ibv_qp_attr_mask enum)
+        :return: A (QPAttr, QPInitAttr) tuple, containing the relevant QP info
+        """
+        attr = QPAttr()
+        init_attr = QPInitAttr()
+        rc = v.ibv_query_qp(self.id.qp, &attr.attr, attr_mask, &init_attr.attr)
+        if rc != 0:
+            raise PyverbsRDMAErrno('Failed to query QP')
+        return attr, init_attr
+
+    def init_qp_attr(self, qp_state):
+        """
+        Initialize a QPAttr object used for state transitions of an external
+        QP (a QP which was not created using CMID).
+        When connecting external QPs using CMIDs, both sides must call this
+        method before QP state transition to RTR/RTS in order to obtain the
+        relevant QP attributes from the CMID.
+        :param qp_state: The QP's destination state
+        :return: A (QPAttr, attr_mask) tuple, where attr_mask defines which
+                 attributes of QPAttr are valid
+        """
+        cdef int attr_mask
+        qp_attr = QPAttr()
+        qp_attr.qp_state = qp_state
+
+        rc = cm.rdma_init_qp_attr(self.id, &qp_attr.attr, &attr_mask)
+        if rc != 0:
+            raise PyverbsRDMAErrno('Failed to get QP attributes')
+        return qp_attr, attr_mask
+
+    def reg_msgs(self, size):
+        """
+        Registers a memory region for sending or receiving messages or for
+        RDMA operations. The registered memory may then be posted to a CMID
+        using the post_send or post_recv methods.
+        :param size: The total length of the memory to register
+        :return: registered MR
+        """
+        return MR(self.pd, size, e.IBV_ACCESS_LOCAL_WRITE)
+
+    def post_recv(self, MR mr not None):
+        """
+        Posts a recv_wr via the QP associated with the CMID.
+        The context param of the rdma_post_recv C function is currently not
+        supported.
+        :param mr: A valid MR object.
+        :return: None
+        """
+        ret = cm.rdma_post_recv(self.id, NULL, mr.buf, mr.mr.length, mr.mr)
+        if ret != 0:
+            raise PyverbsRDMAErrno('Failed to Post Receive')
+
+    def post_send(self, MR mr not None, flags=v.IBV_SEND_SIGNALED):
+        """
+        Posts a message via the QP associated with the CMID.
+        The context param of the rdma_post_send C function is currently not
+        supported.
+        :param mr: A valid MR object which contains the message to send.
+        :param flags: Flags for the send work request.
+        :return: None
+        """
+        ret = cm.rdma_post_send(self.id, NULL, mr.buf, mr.mr.length, mr.mr,
+                                flags)
+        if ret != 0:
+            raise PyverbsRDMAErrno('Failed to Post Send')
+
+    def get_recv_comp(self):
+        """
+        Polls the receive CQ associated with the CMID for a work completion.
+        :return: The retrieved WC or None if there are no completions
+        """
+        cdef v.ibv_wc wc
+        ret = cm.rdma_get_recv_comp(self.id, &wc)
+        if ret < 0:
+            raise PyverbsRDMAErrno('Failed to retrieve receive completion')
+        elif ret == 0:
+            return None
+        return WC(wr_id=wc.wr_id, status=wc.status, opcode=wc.opcode,
+                  vendor_err=wc.vendor_err, byte_len=wc.byte_len,
+                  qp_num=wc.qp_num, src_qp=wc.src_qp,
+                  imm_data=wc.imm_data, wc_flags=wc.wc_flags,
+                  pkey_index=wc.pkey_index, slid=wc.slid, sl=wc.sl,
+                  dlid_path_bits=wc.dlid_path_bits)
+
+    def get_send_comp(self):
+        """
+        Polls the send CQ associated with the CMID for a work completion.
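+        Example (a busy-wait sketch; get_send_comp returns None while no
+        completion is available):
+            wc = None
+            while wc is None:
+                wc = cmid.get_send_comp()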
:return: The retrieved WC or None if there are no completions
+        """
+        cdef v.ibv_wc wc
+        ret = cm.rdma_get_send_comp(self.id, &wc)
+        if ret < 0:
+            raise PyverbsRDMAErrno('Failed to retrieve send completion')
+        elif ret == 0:
+            return None
+        return WC(wr_id=wc.wr_id, status=wc.status, opcode=wc.opcode,
+                  vendor_err=wc.vendor_err, byte_len=wc.byte_len,
+                  qp_num=wc.qp_num, src_qp=wc.src_qp,
+                  imm_data=wc.imm_data, wc_flags=wc.wc_flags,
+                  pkey_index=wc.pkey_index, slid=wc.slid, sl=wc.sl,
+                  dlid_path_bits=wc.dlid_path_bits)
diff --git a/pyverbs/cq.pxd b/pyverbs/cq.pxd
new file mode 100644
index 0000000..a523087
--- /dev/null
+++ b/pyverbs/cq.pxd
@@ -0,0 +1,47 @@
+# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
+# Copyright (c) 2019, Mellanox Technologies. All rights reserved.
+
+#cython: language_level=3
+
+from pyverbs.base cimport PyverbsObject, PyverbsCM
+cimport pyverbs.libibverbs as v
+
+cdef class CompChannel(PyverbsCM):
+    cdef v.ibv_comp_channel *cc
+    cpdef close(self)
+    cdef object context
+    cdef add_ref(self, obj)
+    cdef object cqs
+
+cdef class CQ(PyverbsCM):
+    cdef v.ibv_cq *cq
+    cpdef close(self)
+    cdef object context
+    cdef add_ref(self, obj)
+    cdef object qps
+    cdef object srqs
+    cdef object channel
+    cdef object num_events
+
+cdef class CqInitAttrEx(PyverbsObject):
+    cdef v.ibv_cq_init_attr_ex attr
+    cdef object channel
+    cdef object parent_domain
+
+cdef class CQEX(PyverbsCM):
+    cdef v.ibv_cq_ex *cq
+    cdef v.ibv_cq *ibv_cq
+    cpdef close(self)
+    cdef object context
+    cdef add_ref(self, obj)
+    cdef object qps
+    cdef object srqs
+
+cdef class WC(PyverbsObject):
+    cdef v.ibv_wc wc
+
+cdef class PollCqAttr(PyverbsObject):
+    cdef v.ibv_poll_cq_attr attr
+
+cdef class WcTmInfo(PyverbsObject):
+    cdef v.ibv_wc_tm_info info
diff --git a/pyverbs/cq.pyx b/pyverbs/cq.pyx
new file mode 100755
index 0000000..7eef890
--- /dev/null
+++ b/pyverbs/cq.pyx
@@ -0,0 +1,631 @@
+# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
+# Copyright (c) 2019, Mellanox Technologies. All rights reserved.
+import weakref
+
+from pyverbs.pyverbs_error import PyverbsError, PyverbsRDMAError
+from pyverbs.base import PyverbsRDMAErrno
+from pyverbs.pd cimport PD, ParentDomain
+from pyverbs.base cimport close_weakrefs
+cimport pyverbs.libibverbs_enums as e
+from pyverbs.device cimport Context
+from pyverbs.srq cimport SRQ
+from pyverbs.qp cimport QP
+
+cdef class CompChannel(PyverbsCM):
+    """
+    A completion channel is a file descriptor used to deliver completion
+    notifications to a userspace process. When a completion event is generated
+    for a CQ, the event is delivered via the completion channel attached to
+    the CQ.
+    """
+    def __init__(self, Context context not None):
+        """
+        Initializes a completion channel object on the given device.
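+        A typical notification-flow sketch (assuming ctx is an opened
+        Context):
+            cc = CompChannel(ctx)
+            cq = CQ(ctx, 100, channel=cc)
+            cq.req_notify()
+            cc.get_cq_event(cq)  # blocks until cq generates an event
+            cq.ack_events(1)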
:param context: The device's context to use
+        :return: A CompChannel object on success
+        """
+        super().__init__()
+        self.cc = v.ibv_create_comp_channel(context.context)
+        if self.cc == NULL:
+            raise PyverbsRDMAErrno('Failed to create a completion channel')
+        self.context = context
+        context.add_ref(self)
+        self.cqs = weakref.WeakSet()
+        self.logger.debug('Created a Completion Channel')
+
+    def __dealloc__(self):
+        self.close()
+
+    cpdef close(self):
+        if self.cc != NULL:
+            self.logger.debug('Closing completion channel')
+            close_weakrefs([self.cqs])
+            rc = v.ibv_destroy_comp_channel(self.cc)
+            if rc != 0:
+                raise PyverbsRDMAError('Failed to destroy a completion channel',
+                                       rc)
+            self.cc = NULL
+
+    def get_cq_event(self, CQ expected_cq):
+        """
+        Waits for the next completion event in the completion event channel.
+        :param expected_cq: The CQ that is expected to get the event
+        :return: None
+        """
+        cdef v.ibv_cq *cq
+        cdef void *ctx
+
+        rc = v.ibv_get_cq_event(self.cc, &cq, &ctx)
+        if rc != 0:
+            raise PyverbsRDMAErrno('Failed to get CQ event')
+        if cq != expected_cq.cq:
+            raise PyverbsRDMAErrno('Received event on an unexpected CQ')
+        expected_cq.num_events += 1
+
+    cdef add_ref(self, obj):
+        if isinstance(obj, CQ) or isinstance(obj, CQEX):
+            self.cqs.add(obj)
+
+
+cdef class CQ(PyverbsCM):
+    """
+    A Completion Queue is the notification mechanism for work request
+    completions. A CQ can have 0 or more associated QPs.
+    """
+    def __init__(self, Context context not None, cqe, cq_context=None,
+                 CompChannel channel=None, comp_vector=0):
+        """
+        Initializes a CQ object with the given parameters.
+        :param context: The device's context on which to open the CQ
+        :param cqe: CQ's capacity
+        :param cq_context: User context's pointer
+        :param channel: If set, will be used to return completion events
+        :param comp_vector: Will be used for signaling completion events.
+                            Must be non-negative and smaller than the
+                            context's num_comp_vectors
+        :return: The newly created CQ
+        """
+        super().__init__()
+        if channel is not None:
+            self.cq = v.ibv_create_cq(context.context, cqe, <void*>cq_context,
+                                      channel.cc, comp_vector)
+            channel.add_ref(self)
+            self.channel = channel
+        else:
+            self.cq = v.ibv_create_cq(context.context, cqe, <void*>cq_context,
+                                      NULL, comp_vector)
+            self.channel = None
+        if self.cq == NULL:
+            raise PyverbsRDMAErrno('Failed to create a CQ')
+        self.context = context
+        context.add_ref(self)
+        self.qps = weakref.WeakSet()
+        self.srqs = weakref.WeakSet()
+        self.num_events = 0
+        self.logger.debug('Created a CQ')
+
+    cdef add_ref(self, obj):
+        if isinstance(obj, QP):
+            self.qps.add(obj)
+        elif isinstance(obj, SRQ):
+            self.srqs.add(obj)
+        else:
+            raise PyverbsError('Unrecognized object type')
+
+    def __dealloc__(self):
+        self.close()
+
+    cpdef close(self):
+        if self.cq != NULL:
+            self.logger.debug('Closing CQ')
+            close_weakrefs([self.qps, self.srqs])
+            if self.num_events:
+                self.ack_events(self.num_events)
+            rc = v.ibv_destroy_cq(self.cq)
+            if rc != 0:
+                raise PyverbsRDMAError('Failed to close CQ', rc)
+            self.cq = NULL
+            self.context = None
+            self.channel = None
+
+    def poll(self, num_entries=1):
+        """
+        Polls the CQ for completions.
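+        Example (sketch; e refers to pyverbs.libibverbs_enums):
+            npolled, wcs = cq.poll(num_entries=4)
+            for wc in wcs:
+                assert wc.status == e.IBV_WC_SUCCESS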
:param num_entries: The maximum number of completions to poll
+        :return: (npolled, wcs): The number of polled completions and a list
+                 of the polled completions
+        """
+        cdef v.ibv_wc wc
+        wcs = []
+        npolled = 0
+
+        while npolled < num_entries:
+            rc = v.ibv_poll_cq(self.cq, 1, &wc)
+            if rc < 0:
+                raise PyverbsRDMAErrno('Failed to poll CQ')
+            if rc == 0:
+                break
+            npolled += 1
+            wcs.append(WC(wr_id=wc.wr_id, status=wc.status, opcode=wc.opcode,
+                          vendor_err=wc.vendor_err, byte_len=wc.byte_len,
+                          qp_num=wc.qp_num, src_qp=wc.src_qp,
+                          imm_data=wc.imm_data, wc_flags=wc.wc_flags,
+                          pkey_index=wc.pkey_index, slid=wc.slid, sl=wc.sl,
+                          dlid_path_bits=wc.dlid_path_bits))
+        return npolled, wcs
+
+    def req_notify(self, solicited_only = False):
+        """
+        Request completion notification on the completion queue.
+        :param solicited_only: If non-zero, notifications will be created only
+                               for incoming send / RDMA write WRs with
+                               immediate data that have the solicited bit set
+                               in their send flags.
+        :return: None
+        """
+        rc = v.ibv_req_notify_cq(self.cq, solicited_only)
+        if rc != 0:
+            raise PyverbsRDMAErrno('Request notify CQ returned {rc}'.
+                                   format(rc=rc))
+
+    def ack_events(self, num_events):
+        """
+        Get and acknowledge CQ events
+        :param num_events: Number of events to acknowledge
+        :return: None
+        """
+        v.ibv_ack_cq_events(self.cq, num_events)
+        self.num_events -= num_events
+
+    def __str__(self):
+        print_format = '{:22}: {:<20}\n'
+        return 'CQ\n' +\
+               print_format.format('Handle', self.cq.handle) +\
+               print_format.format('CQEs', self.cq.cqe)
+
+    @property
+    def comp_channel(self):
+        return self.channel
+
+
+cdef class CqInitAttrEx(PyverbsObject):
+    def __init__(self, cqe = 100, CompChannel channel = None, comp_vector = 0,
+                 wc_flags = 0, comp_mask = 0, flags = 0, PD parent_domain = None):
+        """
+        Initializes a CqInitAttrEx object with the given parameters.
+        :param cqe: CQ's capacity
+        :param channel: If set, will be used to return completion events
+        :param comp_vector: Will be used for signaling completion events.
+                            Must be non-negative and smaller than the
+                            context's num_comp_vectors
+        :param wc_flags: The wc_flags that should be returned in
+                         ibv_poll_cq_ex. Or'ed bits of enum ibv_wc_flags_ex.
+        :param comp_mask: Compatibility mask (extended verb)
+        :param flags: Create CQ attr flags - one or more flags from the
+                      ibv_create_cq_attr_flags enum
+        :param parent_domain: If set, will be used to custom-allocate the CQ
+                              buffers.
+ :return: + """ + super().__init__() + self.attr.cqe = cqe + self.attr.cq_context = NULL + self.attr.channel = NULL if channel is None else channel.cc + self.attr.comp_vector = comp_vector + self.attr.wc_flags = wc_flags + self.attr.comp_mask = comp_mask + self.attr.flags = flags + self.attr.parent_domain = NULL if parent_domain is None else parent_domain.pd + self.channel = channel + self.parent_domain = parent_domain + + @property + def cqe(self): + return self.attr.cqe + @cqe.setter + def cqe(self, val): + self.attr.cqe = val + + # Setter-only properties require the older syntax + property cq_context: + def __set__(self, val): + self.attr.cq_context = <void*>val + + @property + def parent_domain(self): + return self.parent_domain + @parent_domain.setter + def parent_domain(self, PD val): + self.parent_domain = val + self.attr.parent_domain = val.pd + + @property + def comp_channel(self): + return self.channel + @comp_channel.setter + def comp_channel(self, CompChannel val): + self.channel = val + self.attr.channel = val.cc + + @property + def comp_vector(self): + return self.attr.comp_vector + @comp_vector.setter + def comp_vector(self, val): + self.attr.comp_vector = val + + @property + def wc_flags(self): + return self.attr.wc_flags + @wc_flags.setter + def wc_flags(self, val): + self.attr.wc_flags = val + + @property + def comp_mask(self): + return self.attr.comp_mask + @comp_mask.setter + def comp_mask(self, val): + self.attr.comp_mask = val + + @property + def flags(self): + return self.attr.flags + @flags.setter + def flags(self, val): + self.attr.flags = val + + def __str__(self): + print_format = '{:22}: {:<20}\n' + return print_format.format('Number of CQEs', self.cqe) +\ + print_format.format('WC flags', create_wc_flags_to_str(self.wc_flags)) +\ + print_format.format('comp mask', self.comp_mask) +\ + print_format.format('flags', self.flags) + + +cdef class CQEX(PyverbsCM): + def __init__(self, Context context not None, CqInitAttrEx init_attr): + """ + Initializes a CQEX object on the given device's context with the given + attributes. + :param context: The device's context on which to open the CQ + :param init_attr: Initial attributes that describe the CQ + :return: The newly created CQEX on success + """ + super().__init__() + self.qps = weakref.WeakSet() + self.srqs = weakref.WeakSet() + if self.cq != NULL: + # Leave CQ initialization to the provider + return + if init_attr is None: + init_attr = CqInitAttrEx() + self.cq = v.ibv_create_cq_ex(context.context, &init_attr.attr) + if init_attr.comp_channel: + init_attr.comp_channel.add_ref(self) + if init_attr.parent_domain: + (<ParentDomain>init_attr.parent_domain).add_ref(self) + if self.cq == NULL: + raise PyverbsRDMAErrno('Failed to create extended CQ') + self.ibv_cq = v.ibv_cq_ex_to_cq(self.cq) + self.context = context + context.add_ref(self) + + cdef add_ref(self, obj): + if isinstance(obj, QP): + self.qps.add(obj) + elif isinstance(obj, SRQ): + self.srqs.add(obj) + else: + raise PyverbsError('Unrecognized object type') + + def __dealloc__(self): + self.close() + + cpdef close(self): + if self.cq != NULL: + self.logger.debug('Closing CQEx') + close_weakrefs([self.srqs, self.qps]) + rc = v.ibv_destroy_cq(<v.ibv_cq*>self.cq) + if rc != 0: + raise PyverbsRDMAError('Failed to destroy CQEX', rc) + self.cq = NULL + self.context = None + + def start_poll(self, PollCqAttr attr): + """ + Start polling a batch of work completions. 
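+        A polling-loop sketch (a non-zero return, ENOENT, means no completion
+        is available; end_poll is only called after a successful start_poll):
+            if cq_ex.start_poll(None) == 0:
+                while True:
+                    process(cq_ex.wr_id, cq_ex.status)  # process() is a
+                                                        # hypothetical handler
+                    if cq_ex.poll_next() != 0:
+                        break
+                cq_ex.end_poll()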
+ :param attr: For easy future extensions + :return: 0 on success, ENOENT when no completions are available + """ + if attr is None: + attr = PollCqAttr() + return v.ibv_start_poll(self.cq, &attr.attr) + + def poll_next(self): + """ + Get the next work completion. + :return: 0 on success, ENOENT when no completions are available + """ + return v.ibv_next_poll(self.cq) + + def end_poll(self): + """ + Indicates the end of polling batch of work completions + :return: None + """ + return v.ibv_end_poll(self.cq) + + def read_opcode(self): + return v.ibv_wc_read_opcode(self.cq) + def read_vendor_err(self): + return v.ibv_wc_read_vendor_err(self.cq) + def read_byte_len(self): + return v.ibv_wc_read_byte_len(self.cq) + def read_imm_data(self): + return v.ibv_wc_read_imm_data(self.cq) + def read_qp_num(self): + return v.ibv_wc_read_qp_num(self.cq) + def read_src_qp(self): + return v.ibv_wc_read_src_qp(self.cq) + def read_wc_flags(self): + return v.ibv_wc_read_wc_flags(self.cq) + def read_slid(self): + return v.ibv_wc_read_slid(self.cq) + def read_sl(self): + return v.ibv_wc_read_sl(self.cq) + def read_dlid_path_bits(self): + return v.ibv_wc_read_dlid_path_bits(self.cq) + def read_timestamp(self): + return v.ibv_wc_read_completion_ts(self.cq) + def read_cvlan(self): + return v.ibv_wc_read_cvlan(self.cq) + def read_flow_tag(self): + return v.ibv_wc_read_flow_tag(self.cq) + def read_tm_info(self): + info = WcTmInfo() + v.ibv_wc_read_tm_info(self.cq, &info.info) + return info + def read_completion_wallclock_ns(self): + return v.ibv_wc_read_completion_wallclock_ns(self.cq) + + @property + def status(self): + return self.cq.status + @status.setter + def status(self, val): + self.cq.status = val + + @property + def wr_id(self): + return self.cq.wr_id + @wr_id.setter + def wr_id(self, val): + self.cq.wr_id = val + + def __str__(self): + print_format = '{:<22}: {:<20}\n' + return 'Extended CQ:\n' +\ + print_format.format('Handle', self.cq.handle) +\ + print_format.format('CQEs', self.cq.cqe) + + +cdef class WC(PyverbsObject): + def __init__(self, wr_id=0, status=0, opcode=0, vendor_err=0, byte_len=0, + qp_num=0, src_qp=0, imm_data=0, wc_flags=0, pkey_index=0, + slid=0, sl=0, dlid_path_bits=0): + super().__init__() + self.wc.wr_id = wr_id + self.wc.status = status + self.wc.opcode = opcode + self.wc.vendor_err = vendor_err + self.wc.byte_len = byte_len + self.wc.qp_num = qp_num + self.wc.src_qp = src_qp + self.wc.wc_flags = wc_flags + self.wc.pkey_index = pkey_index + self.wc.slid = slid + self.wc.imm_data = imm_data + self.wc.sl = sl + self.wc.dlid_path_bits = dlid_path_bits + + @property + def wr_id(self): + return self.wc.wr_id + @wr_id.setter + def wr_id(self, val): + self.wc.wr_id = val + + @property + def status(self): + return self.wc.status + @status.setter + def status(self, val): + self.wc.status = val + + @property + def opcode(self): + return self.wc.opcode + @opcode.setter + def opcode(self, val): + self.wc.opcode = val + + @property + def vendor_err(self): + return self.wc.vendor_err + @vendor_err.setter + def vendor_err(self, val): + self.wc.vendor_err = val + + @property + def byte_len(self): + return self.wc.byte_len + @byte_len.setter + def byte_len(self, val): + self.wc.byte_len = val + + @property + def qp_num(self): + return self.wc.qp_num + @qp_num.setter + def qp_num(self, val): + self.wc.qp_num = val + + @property + def src_qp(self): + return self.wc.src_qp + @src_qp.setter + def src_qp(self, val): + self.wc.src_qp = val + + @property + def wc_flags(self): + return self.wc.wc_flags + 
@wc_flags.setter + def wc_flags(self, val): + self.wc.wc_flags = val + + @property + def pkey_index(self): + return self.wc.pkey_index + @pkey_index.setter + def pkey_index(self, val): + self.wc.pkey_index = val + + @property + def slid(self): + return self.wc.slid + @slid.setter + def slid(self, val): + self.wc.slid = val + + @property + def sl(self): + return self.wc.sl + @sl.setter + def sl(self, val): + self.wc.sl = val + + @property + def imm_data(self): + return self.wc.imm_data + @imm_data.setter + def imm_data(self, val): + self.wc.imm_data = val + + @property + def dlid_path_bits(self): + return self.wc.dlid_path_bits + @dlid_path_bits.setter + def dlid_path_bits(self, val): + self.wc.dlid_path_bits = val + + def __str__(self): + print_format = '{:22}: {:<20}\n' + return print_format.format('WR ID', self.wr_id) +\ + print_format.format('status', cqe_status_to_str(self.status)) +\ + print_format.format('opcode', cqe_opcode_to_str(self.opcode)) +\ + print_format.format('vendor error', self.vendor_err) +\ + print_format.format('byte length', self.byte_len) +\ + print_format.format('QP num', self.qp_num) +\ + print_format.format('source QP', self.src_qp) +\ + print_format.format('WC flags', cqe_flags_to_str(self.wc_flags)) +\ + print_format.format('pkey index', self.pkey_index) +\ + print_format.format('slid', self.slid) +\ + print_format.format('sl', self.sl) +\ + print_format.format('imm_data', self.imm_data) +\ + print_format.format('dlid path bits', self.dlid_path_bits) + + +cdef class PollCqAttr(PyverbsObject): + @property + def comp_mask(self): + return self.attr.comp_mask + @comp_mask.setter + def comp_mask(self, val): + self.attr.comp_mask = val + + +cdef class WcTmInfo(PyverbsObject): + @property + def tag(self): + return self.info.tag + @tag.setter + def tag(self, val): + self.info.tag = val + + @property + def priv(self): + return self.info.priv + @priv.setter + def priv(self, val): + self.info.priv = val + + +def cqe_status_to_str(status): + try: + return {e.IBV_WC_SUCCESS: "success", + e.IBV_WC_LOC_LEN_ERR: "local length error", + e.IBV_WC_LOC_QP_OP_ERR: "local QP op error", + e.IBV_WC_LOC_EEC_OP_ERR: "local EEC op error", + e.IBV_WC_LOC_PROT_ERR: "local protection error", + e.IBV_WC_WR_FLUSH_ERR: "WR flush error", + e.IBV_WC_MW_BIND_ERR: "memory window bind error", + e.IBV_WC_BAD_RESP_ERR: "bad response error", + e.IBV_WC_LOC_ACCESS_ERR: "local access error", + e.IBV_WC_REM_INV_REQ_ERR: "remote invalidate request error", + e.IBV_WC_REM_ACCESS_ERR: "remote access error", + e.IBV_WC_REM_OP_ERR: "remote op error", + e.IBV_WC_RETRY_EXC_ERR: "retry exceeded error", + e.IBV_WC_RNR_RETRY_EXC_ERR: "RNR retry exceeded", + e.IBV_WC_LOC_RDD_VIOL_ERR: "local RDD violation error", + e.IBV_WC_REM_INV_RD_REQ_ERR: "remote invalidate RD request error", + e.IBV_WC_REM_ABORT_ERR: "remote abort error", + e.IBV_WC_INV_EECN_ERR: "invalidate EECN error", + e.IBV_WC_INV_EEC_STATE_ERR: "invalidate EEC state error", + e.IBV_WC_FATAL_ERR: "WC fatal error", + e.IBV_WC_RESP_TIMEOUT_ERR: "response timeout error", + e.IBV_WC_GENERAL_ERR: "general error"}[status] + except KeyError: + return "Unknown CQE status" + +def cqe_opcode_to_str(opcode): + try: + return {0x0: "Send", 0x1:"RDMA write", 0x2: "RDMA read", + 0x3: "Compare and swap", 0x4: "Fetch and add", + 0x5: "Bind Memory window", 0x6: "Local invalidate", + 0x7: "TSO", 0x80: "Receive", + 0x81: "Receive RDMA with immediate", + 0x82: "Tag matching - add", 0x83: "Tag matching - delete", + 0x84: "Tag matching - sync", 0x85: "Tag matching - receive", + 
0x86: "Tag matching - no tag"}[opcode]
+    except KeyError:
+        return "Unknown CQE opcode {op}".format(op=opcode)
+
+def flags_to_str(flags, dictionary):
+    flags_str = ""
+    for f in dictionary:
+        if flags & f:
+            flags_str += dictionary[f]
+            flags_str += " "
+    return flags_str
+
+
+def cqe_flags_to_str(flags):
+    cqe_flags = {1: "GRH", 2: "With immediate", 4: "IP csum OK",
+                 8: "With invalidate", 16: "TM sync request", 32: "TM match",
+                 64: "TM data valid"}
+    return flags_to_str(flags, cqe_flags)
+
+def create_wc_flags_to_str(flags):
+    cqe_flags = {e.IBV_WC_EX_WITH_BYTE_LEN: 'IBV_WC_EX_WITH_BYTE_LEN',
+                 e.IBV_WC_EX_WITH_IMM: 'IBV_WC_EX_WITH_IMM',
+                 e.IBV_WC_EX_WITH_QP_NUM: 'IBV_WC_EX_WITH_QP_NUM',
+                 e.IBV_WC_EX_WITH_SRC_QP: 'IBV_WC_EX_WITH_SRC_QP',
+                 e.IBV_WC_EX_WITH_SLID: 'IBV_WC_EX_WITH_SLID',
+                 e.IBV_WC_EX_WITH_SL: 'IBV_WC_EX_WITH_SL',
+                 e.IBV_WC_EX_WITH_DLID_PATH_BITS: 'IBV_WC_EX_WITH_DLID_PATH_BITS',
+                 e.IBV_WC_EX_WITH_COMPLETION_TIMESTAMP: 'IBV_WC_EX_WITH_COMPLETION_TIMESTAMP',
+                 e.IBV_WC_EX_WITH_CVLAN: 'IBV_WC_EX_WITH_CVLAN',
+                 e.IBV_WC_EX_WITH_FLOW_TAG: 'IBV_WC_EX_WITH_FLOW_TAG',
+                 e.IBV_WC_EX_WITH_COMPLETION_TIMESTAMP_WALLCLOCK: 'IBV_WC_EX_WITH_COMPLETION_TIMESTAMP_WALLCLOCK'}
+    return flags_to_str(flags, cqe_flags)
diff --git a/pyverbs/device.pxd b/pyverbs/device.pxd
new file mode 100755
index 0000000..99edf4b
--- /dev/null
+++ b/pyverbs/device.pxd
@@ -0,0 +1,68 @@
+# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
+# Copyright (c) 2018, Mellanox Technologies. All rights reserved. See COPYING file
+
+#cython: language_level=3
+
+from .base cimport PyverbsObject, PyverbsCM
+cimport pyverbs.libibverbs as v
+
+
+cdef class Context(PyverbsCM):
+    cdef v.ibv_context *context
+    cdef v.ibv_device *device
+    cdef object name
+    cdef add_ref(self, obj)
+    cdef object pds
+    cdef object dms
+    cdef object ccs
+    cdef object cqs
+    cdef object qps
+    cdef object xrcds
+    cdef object vars
+
+cdef class DeviceAttr(PyverbsObject):
+    cdef v.ibv_device_attr dev_attr
+
+cdef class QueryDeviceExInput(PyverbsObject):
+    cdef v.ibv_query_device_ex_input input
+
+cdef class ODPCaps(PyverbsObject):
+    cdef v.ibv_odp_caps odp_caps
+    cdef object xrc_odp_caps
+
+cdef class RSSCaps(PyverbsObject):
+    cdef v.ibv_rss_caps rss_caps
+
+cdef class PacketPacingCaps(PyverbsObject):
+    cdef v.ibv_packet_pacing_caps packet_pacing_caps
+
+cdef class PCIAtomicCaps(PyverbsObject):
+    cdef v.ibv_pci_atomic_caps caps
+
+cdef class TMCaps(PyverbsObject):
+    cdef v.ibv_tm_caps tm_caps
+
+cdef class CQModerationCaps(PyverbsObject):
+    cdef v.ibv_cq_moderation_caps cq_mod_caps
+
+cdef class TSOCaps(PyverbsObject):
+    cdef v.ibv_tso_caps tso_caps
+
+cdef class DeviceAttrEx(PyverbsObject):
+    cdef v.ibv_device_attr_ex dev_attr
+
+cdef class AllocDmAttr(PyverbsObject):
+    cdef v.ibv_alloc_dm_attr alloc_dm_attr
+
+cdef class DM(PyverbsCM):
+    cdef v.ibv_dm *dm
+    cdef object dm_mrs
+    cdef object context
+    cdef add_ref(self, obj)
+
+cdef class PortAttr(PyverbsObject):
+    cdef v.ibv_port_attr attr
+
+cdef class VAR(PyverbsObject):
+    cdef object context
+    cpdef close(self)
diff --git a/pyverbs/device.pyx b/pyverbs/device.pyx
new file mode 100755
index 0000000..e939c0b
--- /dev/null
+++ b/pyverbs/device.pyx
@@ -0,0 +1,993 @@
+# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
+# Copyright (c) 2018, Mellanox Technologies. All rights reserved. See COPYING file
+
+"""
+Device module introduces the Context and DeviceAttr classes.
+It allows the user to open an IB device (using Context(name=<name>)) and query
+it, which returns a DeviceAttr object.
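+For example (a usage sketch; 'mlx5_0' is only an illustrative device name):
+    ctx = Context(name='mlx5_0')
+    print(ctx.query_device())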
+""" +import weakref + +from .pyverbs_error import PyverbsRDMAError, PyverbsError +from pyverbs.cq cimport CQEX, CQ, CompChannel +from .pyverbs_error import PyverbsUserError +from pyverbs.base import PyverbsRDMAErrno +from pyverbs.base cimport close_weakrefs +cimport pyverbs.libibverbs_enums as e +cimport pyverbs.libibverbs as v +from pyverbs.cmid cimport CMID +from pyverbs.xrcd cimport XRCD +from pyverbs.addr cimport GID +from pyverbs.mr import DMMR +from pyverbs.pd cimport PD +from pyverbs.qp cimport QP +from libc.stdlib cimport free, malloc +from libc.string cimport memset +from libc.stdint cimport uint64_t + +cdef extern from 'endian.h': + unsigned long be64toh(unsigned long host_64bits); + + +class Device(PyverbsObject): + """ + Device class represents the C ibv_device. It stores device's properties. + It is not a part of objects creation order - there's no need for the user + to create it for such purposes. + """ + def __init__(self, name, guid, node_type, transport_type): + self._node_type = node_type + self._transport_type = transport_type + self._name = name + self._guid = guid + + @property + def name(self): + return self._name + + @property + def node_type(self): + return self._node_type + + @property + def transport_type(self): + return self._transport_type + + @property + def guid(self): + return self._guid + + def __str__(self): + return 'Device {dev}, node type {ntype}, transport type {ttype},' \ + ' guid {guid}'.format(dev=self.name.decode(), + ntype=translate_node_type(self.node_type), + ttype=translate_transport_type(self.transport_type), + guid=guid_to_hex(self.guid)) + + +cdef class Context(PyverbsCM): + """ + Context class represents the C ibv_context. + """ + def __init__(self, **kwargs): + """ + Initializes a Context object. The function searches the IB devices list + for a device with the name provided by the user. If such a device is + found, it is opened (unless provider attributes were given). + In case of cmid argument, CMID object already holds an ibv_context + initiated pointer, hence all we have to do is assign this pointer to + Context's object pointer. + :param kwargs: Arguments: + * *name* + The device's name + * *attr* + Provider-specific attributes. If not None, it means that the + device will be opened by the provider and __init__ will return + after locating the requested device. + * *cmid* + A CMID object. If not None, it means that the device was already + opened by a CMID class, and only a pointer assignment is missing. 
+ :return: None + """ + cdef int count + cdef v.ibv_device **dev_list + cdef CMID cmid + + super().__init__() + self.pds = weakref.WeakSet() + self.dms = weakref.WeakSet() + self.ccs = weakref.WeakSet() + self.cqs = weakref.WeakSet() + self.qps = weakref.WeakSet() + self.xrcds = weakref.WeakSet() + self.vars = weakref.WeakSet() + + self.name = kwargs.get('name') + provider_attr = kwargs.get('attr') + cmid = kwargs.get('cmid') + if cmid is not None: + self.context = cmid.id.verbs + cmid.ctx = self + return + + if self.name is None: + raise PyverbsUserError('Device name must be provided') + dev_list = v.ibv_get_device_list(&count) + if dev_list == NULL: + raise PyverbsRDMAError('Failed to get devices list') + try: + for i in range(count): + if dev_list[i].name.decode() == self.name: + if provider_attr is not None: + # A provider opens its own context, we're just + # setting its IB device + self.device = dev_list[i] + return + self.context = v.ibv_open_device(dev_list[i]) + if self.context == NULL: + raise PyverbsRDMAErrno('Failed to open device {dev}'. + format(dev=self.name)) + self.logger.debug('Context: opened device {dev}'. + format(dev=self.name)) + break + else: + raise PyverbsRDMAError('Failed to find device {dev}'. + format(dev=self.name)) + finally: + v.ibv_free_device_list(dev_list) + + def __dealloc__(self): + """ + Closes the inner IB device. + :return: None + """ + self.close() + + cpdef close(self): + if self.context != NULL: + self.logger.debug('Closing Context') + close_weakrefs([self.qps, self.ccs, self.cqs, self.dms, self.pds, + self.xrcds, self.vars]) + rc = v.ibv_close_device(self.context) + if rc != 0: + raise PyverbsRDMAErrno('Failed to close device {dev}'. + format(dev=self.device.name)) + self.context = NULL + + @property + def num_comp_vectors(self): + return self.context.num_comp_vectors + + def query_device(self): + """ + Queries the device's attributes. + :return: A DeviceAttr object which holds the device's attributes as + reported by the hardware. + """ + dev_attr = DeviceAttr() + rc = v.ibv_query_device(self.context, &dev_attr.dev_attr) + if rc != 0: + raise PyverbsRDMAError('Failed to query device {name}'. + format(name=self.name), rc) + return dev_attr + + def query_device_ex(self, QueryDeviceExInput ex_input = None): + """ + Queries the device's extended attributes. + :param ex_input: An extensible input struct for possible future + extensions + :return: DeviceAttrEx object + """ + dev_attr_ex = DeviceAttrEx() + rc = v.ibv_query_device_ex(self.context, + &ex_input.input if ex_input is not None else NULL, + &dev_attr_ex.dev_attr) + if rc != 0: + raise PyverbsRDMAError('Failed to query EX device {name}'. + format(name=self.name), rc) + return dev_attr_ex + + def query_gid(self, unsigned int port_num, int index): + gid = GID() + rc = v.ibv_query_gid(self.context, port_num, index, &gid.gid) + if rc != 0: + raise PyverbsRDMAError('Failed to query gid {idx} of port {port}'. + format(idx=index, port=port_num)) + return gid + + def query_gid_type(self, unsigned int port_num, unsigned int index): + cdef v.ibv_gid_type gid_type + rc = v.ibv_query_gid_type(self.context, port_num, index, &gid_type) + if rc != 0: + raise PyverbsRDMAErrno('Failed to query gid type of port {p} and gid index {g}' + .format(p=port_num, g=index)) + return gid_type + + def query_port(self, unsigned int port_num): + """ + Query port <port_num> of the device and returns its attributes. 
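+        Example (sketch; IB port numbering starts at 1):
+            port_attr = ctx.query_port(1)
+            print(port_attr)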
+ :param port_num: Port number to query + :return: PortAttr object on success + """ + port_attrs = PortAttr() + rc = v.ibv_query_port(self.context, port_num, &port_attrs.attr) + if rc != 0: + raise PyverbsRDMAError('Failed to query port {p}'. + format(p=port_num), rc) + return port_attrs + + cdef add_ref(self, obj): + if isinstance(obj, PD): + self.pds.add(obj) + elif isinstance(obj, DM): + self.dms.add(obj) + elif isinstance(obj, CompChannel): + self.ccs.add(obj) + elif isinstance(obj, CQ) or isinstance(obj, CQEX): + self.cqs.add(obj) + elif isinstance(obj, QP): + self.qps.add(obj) + elif isinstance(obj, XRCD): + self.xrcds.add(obj) + elif isinstance(obj, VAR): + self.vars.add(obj) + else: + raise PyverbsError('Unrecognized object type') + + @property + def cmd_fd(self): + return self.context.cmd_fd + + +cdef class DeviceAttr(PyverbsObject): + """ + DeviceAttr represents ibv_device_attr C class. It exposes the same + properties (read only) and also provides an __str__() function for + readability. + """ + @property + def fw_version(self): + return self.dev_attr.fw_ver.decode() + @property + def node_guid(self): + return self.dev_attr.node_guid + @property + def sys_image_guid(self): + return self.dev_attr.sys_image_guid + @property + def max_mr_size(self): + return self.dev_attr.max_mr_size + @property + def page_size_cap(self): + return self.dev_attr.page_size_cap + @property + def vendor_id(self): + return self.dev_attr.vendor_id + @property + def vendor_part_id(self): + return self.dev_attr.vendor_part_id + @property + def hw_ver(self): + return self.dev_attr.hw_ver + @property + def max_qp(self): + return self.dev_attr.max_qp + @property + def max_qp_wr(self): + return self.dev_attr.max_qp_wr + @property + def device_cap_flags(self): + return self.dev_attr.device_cap_flags + @property + def max_sge(self): + return self.dev_attr.max_sge + @property + def max_sge_rd(self): + return self.dev_attr.max_sge_rd + @property + def max_cq(self): + return self.dev_attr.max_cq + @property + def max_cqe(self): + return self.dev_attr.max_cqe + @property + def max_mr(self): + return self.dev_attr.max_mr + @property + def max_pd(self): + return self.dev_attr.max_pd + @property + def max_qp_rd_atom(self): + return self.dev_attr.max_qp_rd_atom + @property + def max_ee_rd_atom(self): + return self.dev_attr.max_ee_rd_atom + @property + def max_res_rd_atom(self): + return self.dev_attr.max_res_rd_atom + @property + def max_qp_init_rd_atom(self): + return self.dev_attr.max_qp_init_rd_atom + @property + def max_ee_init_rd_atom(self): + return self.dev_attr.max_ee_init_rd_atom + @property + def atomic_caps(self): + return self.dev_attr.atomic_cap + @property + def max_ee(self): + return self.dev_attr.max_ee + @property + def max_rdd(self): + return self.dev_attr.max_rdd + @property + def max_mw(self): + return self.dev_attr.max_mw + @property + def max_raw_ipv6_qps(self): + return self.dev_attr.max_raw_ipv6_qp + @property + def max_raw_ethy_qp(self): + return self.dev_attr.max_raw_ethy_qp + @property + def max_mcast_grp(self): + return self.dev_attr.max_mcast_grp + @property + def max_mcast_qp_attach(self): + return self.dev_attr.max_mcast_qp_attach + @property + def max_ah(self): + return self.dev_attr.max_ah + @property + def max_fmr(self): + return self.dev_attr.max_fmr + @property + def max_map_per_fmr(self): + return self.dev_attr.max_map_per_fmr + @property + def max_srq(self): + return self.dev_attr.max_srq + @property + def max_srq_wr(self): + return self.dev_attr.max_srq_wr + @property + def 
max_srq_sge(self):
+        return self.dev_attr.max_srq_sge
+    @property
+    def max_pkeys(self):
+        return self.dev_attr.max_pkeys
+    @property
+    def local_ca_ack_delay(self):
+        return self.dev_attr.local_ca_ack_delay
+    @property
+    def phys_port_cnt(self):
+        return self.dev_attr.phys_port_cnt
+
+    def __str__(self):
+        print_format = '{:<22}: {:<20}\n'
+        return print_format.format('FW version', self.fw_version) +\
+            print_format.format('Node guid', guid_format(self.node_guid)) +\
+            print_format.format('Sys image GUID', guid_format(self.sys_image_guid)) +\
+            print_format.format('Max MR size', hex(self.max_mr_size).replace('L', '')) +\
+            print_format.format('Page size cap', hex(self.page_size_cap).replace('L', '')) +\
+            print_format.format('Vendor ID', hex(self.vendor_id)) +\
+            print_format.format('Vendor part ID', self.vendor_part_id) +\
+            print_format.format('HW version', self.hw_ver) +\
+            print_format.format('Max QP', self.max_qp) +\
+            print_format.format('Max QP WR', self.max_qp_wr) +\
+            print_format.format('Device cap flags',
+                                translate_device_caps(self.device_cap_flags)) +\
+            print_format.format('Max SGE', self.max_sge) +\
+            print_format.format('Max SGE RD', self.max_sge_rd) +\
+            print_format.format('MAX CQ', self.max_cq) +\
+            print_format.format('Max CQE', self.max_cqe) +\
+            print_format.format('Max MR', self.max_mr) +\
+            print_format.format('Max PD', self.max_pd) +\
+            print_format.format('Max QP RD atom', self.max_qp_rd_atom) +\
+            print_format.format('Max EE RD atom', self.max_ee_rd_atom) +\
+            print_format.format('Max res RD atom', self.max_res_rd_atom) +\
+            print_format.format('Max QP init RD atom', self.max_qp_init_rd_atom) +\
+            print_format.format('Max EE init RD atom', self.max_ee_init_rd_atom) +\
+            print_format.format('Atomic caps', self.atomic_caps) +\
+            print_format.format('Max EE', self.max_ee) +\
+            print_format.format('Max RDD', self.max_rdd) +\
+            print_format.format('Max MW', self.max_mw) +\
+            print_format.format('Max raw IPv6 QPs', self.max_raw_ipv6_qps) +\
+            print_format.format('Max raw ethy QP', self.max_raw_ethy_qp) +\
+            print_format.format('Max mcast group', self.max_mcast_grp) +\
+            print_format.format('Max mcast QP attach', self.max_mcast_qp_attach) +\
+            print_format.format('Max AH', self.max_ah) +\
+            print_format.format('Max FMR', self.max_fmr) +\
+            print_format.format('Max map per FMR', self.max_map_per_fmr) +\
+            print_format.format('Max SRQ', self.max_srq) +\
+            print_format.format('Max SRQ WR', self.max_srq_wr) +\
+            print_format.format('Max SRQ SGE', self.max_srq_sge) +\
+            print_format.format('Max PKeys', self.max_pkeys) +\
+            print_format.format('local CA ack delay', self.local_ca_ack_delay) +\
+            print_format.format('Phys port count', self.phys_port_cnt)
+
+
+cdef class QueryDeviceExInput(PyverbsObject):
+    def __init__(self, comp_mask):
+        super().__init__()
+        self.input.comp_mask = comp_mask
+
+
+cdef class ODPCaps(PyverbsObject):
+    @property
+    def general_caps(self):
+        return self.odp_caps.general_caps
+    @property
+    def rc_odp_caps(self):
+        return self.odp_caps.per_transport_caps.rc_odp_caps
+    @property
+    def uc_odp_caps(self):
+        return self.odp_caps.per_transport_caps.uc_odp_caps
+    @property
+    def ud_odp_caps(self):
+        return self.odp_caps.per_transport_caps.ud_odp_caps
+    @property
+    def xrc_odp_caps(self):
+        return self.xrc_odp_caps
+    @xrc_odp_caps.setter
+    def xrc_odp_caps(self, val):
+        self.xrc_odp_caps = val
+
+    def __str__(self):
+        general_caps = {e.IBV_ODP_SUPPORT: 'IBV_ODP_SUPPORT',
+                        e.IBV_ODP_SUPPORT_IMPLICIT: 'IBV_ODP_SUPPORT_IMPLICIT'}
+
+        l = 
{e.IBV_ODP_SUPPORT_SEND: 'IBV_ODP_SUPPORT_SEND', + e.IBV_ODP_SUPPORT_RECV: 'IBV_ODP_SUPPORT_RECV', + e.IBV_ODP_SUPPORT_WRITE: 'IBV_ODP_SUPPORT_WRITE', + e.IBV_ODP_SUPPORT_READ: 'IBV_ODP_SUPPORT_READ', + e.IBV_ODP_SUPPORT_ATOMIC: 'IBV_ODP_SUPPORT_ATOMIC', + e.IBV_ODP_SUPPORT_SRQ_RECV: 'IBV_ODP_SUPPORT_SRQ_RECV'} + + print_format = '{}: {}\n' + return print_format.format('ODP General caps', str_from_flags(self.general_caps, general_caps)) +\ + print_format.format('RC ODP caps', str_from_flags(self.rc_odp_caps, l)) +\ + print_format.format('UD ODP caps', str_from_flags(self.ud_odp_caps, l)) +\ + print_format.format('UC ODP caps', str_from_flags(self.uc_odp_caps, l)) +\ + print_format.format('XRC ODP caps', str_from_flags(self.xrc_odp_caps, l)) + + +cdef class PCIAtomicCaps(PyverbsObject): + @property + def fetch_add(self): + return self.caps.fetch_add + @property + def swap(self): + return self.caps.swap + @property + def compare_swap(self): + return self.caps.compare_swap + + +cdef class TSOCaps(PyverbsObject): + @property + def max_tso(self): + return self.tso_caps.max_tso + @property + def supported_qpts(self): + return self.tso_caps.supported_qpts + + +cdef class RSSCaps(PyverbsObject): + @property + def supported_qpts(self): + return self.rss_caps.supported_qpts + @property + def max_rwq_indirection_tables(self): + return self.rss_caps.max_rwq_indirection_tables + @property + def rx_hash_fields_mask(self): + return self.rss_caps.rx_hash_fields_mask + @property + def rx_hash_function(self): + return self.rss_caps.rx_hash_function + @property + def max_rwq_indirection_table_size(self): + return self.rss_caps.max_rwq_indirection_table_size + + +cdef class PacketPacingCaps(PyverbsObject): + @property + def qp_rate_limit_min(self): + return self.packet_pacing_caps.qp_rate_limit_min + @property + def qp_rate_limit_max(self): + return self.packet_pacing_caps.qp_rate_limit_max + @property + def supported_qpts(self): + return self.packet_pacing_caps.supported_qpts + + +cdef class TMCaps(PyverbsObject): + @property + def max_rndv_hdr_size(self): + return self.tm_caps.max_rndv_hdr_size + @property + def max_num_tags(self): + return self.tm_caps.max_num_tags + @property + def flags(self): + return self.tm_caps.flags + @property + def max_ops(self): + return self.tm_caps.max_ops + @property + def max_sge(self): + return self.tm_caps.max_sge + + +cdef class CQModerationCaps(PyverbsObject): + @property + def max_cq_count(self): + return self.cq_mod_caps.max_cq_count + @property + def max_cq_period(self): + return self.cq_mod_caps.max_cq_period + + +cdef class DeviceAttrEx(PyverbsObject): + @property + def orig_attr(self): + attr = DeviceAttr() + attr.dev_attr = self.dev_attr.orig_attr + return attr + @property + def comp_mask(self): + return self.dev_attr.comp_mask + @comp_mask.setter + def comp_mask(self, val): + self.dev_attr.comp_mask = val + @property + def odp_caps(self): + caps = ODPCaps() + caps.odp_caps = self.dev_attr.odp_caps + caps.xrc_odp_caps = self.dev_attr.xrc_odp_caps + return caps + @property + def completion_timestamp_mask(self): + return self.dev_attr.completion_timestamp_mask + @property + def hca_core_clock(self): + return self.dev_attr.hca_core_clock + @property + def device_cap_flags_ex(self): + return self.dev_attr.device_cap_flags_ex + @property + def tso_caps(self): + caps = TSOCaps() + caps.tso_caps = self.dev_attr.tso_caps + return caps + @property + def pci_atomic_caps(self): + caps = PCIAtomicCaps() + caps.caps = self.dev_attr.pci_atomic_caps + return caps + @property + 
def rss_caps(self):
+        caps = RSSCaps()
+        caps.rss_caps = self.dev_attr.rss_caps
+        return caps
+    @property
+    def max_wq_type_rq(self):
+        return self.dev_attr.max_wq_type_rq
+    @property
+    def packet_pacing_caps(self):
+        caps = PacketPacingCaps()
+        caps.packet_pacing_caps = self.dev_attr.packet_pacing_caps
+        return caps
+    @property
+    def raw_packet_caps(self):
+        return self.dev_attr.raw_packet_caps
+    @property
+    def tm_caps(self):
+        caps = TMCaps()
+        caps.tm_caps = self.dev_attr.tm_caps
+        return caps
+    @property
+    def cq_mod_caps(self):
+        caps = CQModerationCaps()
+        caps.cq_mod_caps = self.dev_attr.cq_mod_caps
+        return caps
+    @property
+    def max_dm_size(self):
+        return self.dev_attr.max_dm_size
+
+
+cdef class AllocDmAttr(PyverbsObject):
+    def __init__(self, length, log_align_req = 0, comp_mask = 0):
+        """
+        Creates an AllocDmAttr object with the given parameters. This object
+        can then be used to create a DM object.
+        :param length: Length of the future device memory
+        :param log_align_req: log2 of the address alignment requirement
+        :param comp_mask: Compatibility mask
+        :return: An AllocDmAttr object
+        """
+        super().__init__()
+        self.alloc_dm_attr.length = length
+        self.alloc_dm_attr.log_align_req = log_align_req
+        self.alloc_dm_attr.comp_mask = comp_mask
+
+    @property
+    def length(self):
+        return self.alloc_dm_attr.length
+
+    @length.setter
+    def length(self, val):
+        self.alloc_dm_attr.length = val
+
+    @property
+    def log_align_req(self):
+        return self.alloc_dm_attr.log_align_req
+
+    @log_align_req.setter
+    def log_align_req(self, val):
+        self.alloc_dm_attr.log_align_req = val
+
+    @property
+    def comp_mask(self):
+        return self.alloc_dm_attr.comp_mask
+
+    @comp_mask.setter
+    def comp_mask(self, val):
+        self.alloc_dm_attr.comp_mask = val
+
+
+cdef class DM(PyverbsCM):
+    def __init__(self, Context context, AllocDmAttr dm_attr not None):
+        """
+        Allocates device (direct) memory.
+        :param context: The context of the device on which to allocate memory
+        :param dm_attr: Attributes that define the DM
+        :return: A DM object on success
+        """
+        super().__init__()
+        self.dm_mrs = weakref.WeakSet()
+        device_attr = context.query_device_ex()
+        if device_attr.max_dm_size <= 0:
+            raise PyverbsUserError('Device doesn\'t support dm allocation')
+        self.dm = v.ibv_alloc_dm(<v.ibv_context*>context.context,
+                                 &dm_attr.alloc_dm_attr)
+        if self.dm == NULL:
+            raise PyverbsRDMAErrno('Failed to allocate device memory of size '
+                                   '{size}. Max available size {max}.'
+ .format(size=dm_attr.length, + max=device_attr.max_dm_size)) + self.context = context + context.add_ref(self) + + def __dealloc__(self): + self.close() + + cpdef close(self): + if self.dm != NULL: + self.logger.debug('Closing DM') + close_weakrefs([self.dm_mrs]) + rc = v.ibv_free_dm(self.dm) + if rc != 0: + raise PyverbsRDMAError('Failed to free dm', rc) + self.dm = NULL + self.context = None + + cdef add_ref(self, obj): + if isinstance(obj, DMMR): + self.dm_mrs.add(obj) + + def copy_to_dm(self, dm_offset, data, length): + rc = v.ibv_memcpy_to_dm(<v.ibv_dm *>self.dm, <uint64_t>dm_offset, + <char *>data, <size_t>length) + if rc != 0: + raise PyverbsRDMAError('Failed to copy to dm', rc) + + def copy_from_dm(self, dm_offset, length): + cdef char *data =<char*>malloc(length) + memset(data, 0, length) + rc = v.ibv_memcpy_from_dm(<void *>data, <v.ibv_dm *>self.dm, + <uint64_t>dm_offset, <size_t>length) + if rc != 0: + raise PyverbsRDMAError('Failed to copy from dm', rc) + res = data[:length] + free(data) + return res + + +cdef class PortAttr(PyverbsObject): + @property + def state(self): + return self.attr.state + @property + def max_mtu(self): + return self.attr.max_mtu + @property + def active_mtu(self): + return self.attr.active_mtu + @property + def gid_tbl_len(self): + return self.attr.gid_tbl_len + @property + def port_cap_flags(self): + return self.attr.port_cap_flags + @property + def max_msg_sz(self): + return self.attr.max_msg_sz + @property + def bad_pkey_cntr(self): + return self.attr.bad_pkey_cntr + @property + def qkey_viol_cntr(self): + return self.attr.qkey_viol_cntr + @property + def pkey_tbl_len(self): + return self.attr.pkey_tbl_len + @property + def lid(self): + return self.attr.lid + @property + def sm_lid(self): + return self.attr.sm_lid + @property + def lmc(self): + return self.attr.lmc + @property + def max_vl_num(self): + return self.attr.max_vl_num + @property + def sm_sl(self): + return self.attr.sm_sl + @property + def subnet_timeout(self): + return self.attr.subnet_timeout + @property + def init_type_reply(self): + return self.attr.init_type_reply + @property + def active_width(self): + return self.attr.active_width + @property + def active_speed(self): + return self.attr.active_speed + @property + def phys_state(self): + return self.attr.phys_state + @property + def link_layer(self): + return self.attr.link_layer + @property + def flags(self): + return self.attr.flags + @property + def port_cap_flags2(self): + return self.attr.port_cap_flags2 + + def __str__(self): + print_format = '{:<24}: {:<20}\n' + return print_format.format('Port state', port_state_to_str(self.attr.state)) +\ + print_format.format('Max MTU', translate_mtu(self.attr.max_mtu)) +\ + print_format.format('Active MTU', translate_mtu(self.attr.active_mtu)) +\ + print_format.format('SM lid', self.attr.sm_lid) +\ + print_format.format('Port lid', self.attr.lid) +\ + print_format.format('lmc', hex(self.attr.lmc)) +\ + print_format.format('Link layer', translate_link_layer(self.attr.link_layer)) +\ + print_format.format('Max message size', hex(self.attr.max_msg_sz)) +\ + print_format.format('Port cap flags', translate_port_cap_flags(self.attr.port_cap_flags)) +\ + print_format.format('Port cap flags 2', translate_port_cap_flags2(self.attr.port_cap_flags2)) +\ + print_format.format('max VL num', self.attr.max_vl_num) +\ + print_format.format('Bad Pkey counter', self.attr.bad_pkey_cntr) +\ + print_format.format('Qkey violations counter', self.attr.qkey_viol_cntr) +\ + print_format.format('GID table len', 
self.attr.gid_tbl_len) +\ + print_format.format('Pkey table len', self.attr.pkey_tbl_len) +\ + print_format.format('SM sl', self.attr.sm_sl) +\ + print_format.format('Subnet timeout', self.attr.subnet_timeout) +\ + print_format.format('Init type reply', self.attr.init_type_reply) +\ + print_format.format('Active width', width_to_str(self.attr.active_width)) +\ + print_format.format('Active speed', speed_to_str(self.attr.active_speed)) +\ + print_format.format('Phys state', phys_state_to_str(self.attr.phys_state)) +\ + print_format.format('Flags', self.attr.flags) + + +def guid_format(num): + """ + Get GUID representation of the given number, including change of endianness. + :param num: Number to change to GUID format. + :return: GUID-formatted string. + """ + num = be64toh(num) + hex_str = "%016x" % (num) + hex_array = [hex_str[i:i+2] for i in range(0, len(hex_str), 2)] + hex_array = [''.join(x) for x in zip(hex_array[0::2], hex_array[1::2])] + return ':'.join(hex_array) + + +def translate_transport_type(transport_type): + l = {0: 'IB', 1: 'IWARP', 2: 'USNIC', 3: 'USNIC UDP'} + try: + return l[transport_type] + except KeyError: + return 'Unknown' + + +def translate_node_type(node_type): + l = {1: 'CA', 2: 'Switch', 3: 'Router', 4: 'RNIC', 5: 'USNIC', + 6: 'USNIC UDP'} + try: + return l[node_type] + except KeyError: + return 'Unknown' + + +def guid_to_hex(node_guid): + return hex(node_guid).replace('L', '').replace('0x', '') + + +def port_state_to_str(port_state): + l = {0: 'NOP', 1: 'Down', 2: 'Init', 3: 'Armed', 4: 'Active', 5: 'Defer'} + try: + return '{s} ({n})'.format(s=l[port_state], n=port_state) + except KeyError: + return 'Invalid state ({s})'.format(s=port_state) + + +def translate_mtu(mtu): + l = {1: 256, 2: 512, 3: 1024, 4: 2048, 5: 4096} + try: + return '{s} ({n})'.format(s=l[mtu], n=mtu) + except KeyError: + return 'Invalid MTU ({m})'.format(m=mtu) + + +def translate_link_layer(ll): + l = {0: 'Unspecified', 1:'InfiniBand', 2:'Ethernet'} + try: + return l[ll] + except KeyError: + return 'Invalid link layer ({ll})'.format(ll=ll) + + +def translate_port_cap_flags(flags): + l = {e.IBV_PORT_SM: 'IBV_PORT_SM', + e.IBV_PORT_NOTICE_SUP: 'IBV_PORT_NOTICE_SUP', + e.IBV_PORT_TRAP_SUP: 'IBV_PORT_TRAP_SUP', + e.IBV_PORT_OPT_IPD_SUP: 'IBV_PORT_OPT_IPD_SUP', + e.IBV_PORT_AUTO_MIGR_SUP: 'IBV_PORT_AUTO_MIGR_SUP', + e.IBV_PORT_SL_MAP_SUP: 'IBV_PORT_SL_MAP_SUP', + e.IBV_PORT_MKEY_NVRAM: 'IBV_PORT_MKEY_NVRAM', + e.IBV_PORT_PKEY_NVRAM: 'IBV_PORT_PKEY_NVRAM', + e.IBV_PORT_LED_INFO_SUP: 'IBV_PORT_LED_INFO_SUP', + e.IBV_PORT_SYS_IMAGE_GUID_SUP: 'IBV_PORT_SYS_IMAGE_GUID_SUP', + e.IBV_PORT_PKEY_SW_EXT_PORT_TRAP_SUP: 'IBV_PORT_PKEY_SW_EXT_PORT_TRAP_SUP', + e.IBV_PORT_EXTENDED_SPEEDS_SUP: 'IBV_PORT_EXTENDED_SPEEDS_SUP', + e.IBV_PORT_CAP_MASK2_SUP: 'IBV_PORT_CAP_MASK2_SUP', + e.IBV_PORT_CM_SUP: 'IBV_PORT_CM_SUP', + e.IBV_PORT_SNMP_TUNNEL_SUP: 'IBV_PORT_SNMP_TUNNEL_SUP', + e.IBV_PORT_REINIT_SUP: 'IBV_PORT_REINIT_SUP', + e.IBV_PORT_DEVICE_MGMT_SUP: 'IBV_PORT_DEVICE_MGMT_SUP', + e.IBV_PORT_VENDOR_CLASS_SUP: 'IBV_PORT_VENDOR_CLASS_SUP', + e.IBV_PORT_DR_NOTICE_SUP: 'IBV_PORT_DR_NOTICE_SUP', + e.IBV_PORT_CAP_MASK_NOTICE_SUP: 'IBV_PORT_CAP_MASK_NOTICE_SUP', + e.IBV_PORT_BOOT_MGMT_SUP: 'IBV_PORT_BOOT_MGMT_SUP', + e.IBV_PORT_LINK_LATENCY_SUP: 'IBV_PORT_LINK_LATENCY_SUP', + e.IBV_PORT_CLIENT_REG_SUP: 'IBV_PORT_CLIENT_REG_SUP', + e.IBV_PORT_IP_BASED_GIDS: 'IBV_PORT_IP_BASED_GIDS'} + return str_from_flags(flags, l) + + +def translate_port_cap_flags2(flags): + l = {e.IBV_PORT_SET_NODE_DESC_SUP: 
'IBV_PORT_SET_NODE_DESC_SUP',
+         e.IBV_PORT_INFO_EXT_SUP: 'IBV_PORT_INFO_EXT_SUP',
+         e.IBV_PORT_VIRT_SUP: 'IBV_PORT_VIRT_SUP',
+         e.IBV_PORT_SWITCH_PORT_STATE_TABLE_SUP: 'IBV_PORT_SWITCH_PORT_STATE_TABLE_SUP',
+         e.IBV_PORT_LINK_WIDTH_2X_SUP: 'IBV_PORT_LINK_WIDTH_2X_SUP',
+         e.IBV_PORT_LINK_SPEED_HDR_SUP: 'IBV_PORT_LINK_SPEED_HDR_SUP'}
+    return str_from_flags(flags, l)
+
+
+def translate_device_caps(flags):
+    l = {e.IBV_DEVICE_RESIZE_MAX_WR: 'IBV_DEVICE_RESIZE_MAX_WR',
+         e.IBV_DEVICE_BAD_PKEY_CNTR: 'IBV_DEVICE_BAD_PKEY_CNTR',
+         e.IBV_DEVICE_BAD_QKEY_CNTR: 'IBV_DEVICE_BAD_QKEY_CNTR',
+         e.IBV_DEVICE_RAW_MULTI: 'IBV_DEVICE_RAW_MULTI',
+         e.IBV_DEVICE_AUTO_PATH_MIG: 'IBV_DEVICE_AUTO_PATH_MIG',
+         e.IBV_DEVICE_CHANGE_PHY_PORT: 'IBV_DEVICE_CHANGE_PHY_PORT',
+         e.IBV_DEVICE_UD_AV_PORT_ENFORCE: 'IBV_DEVICE_UD_AV_PORT_ENFORCE',
+         e.IBV_DEVICE_CURR_QP_STATE_MOD: 'IBV_DEVICE_CURR_QP_STATE_MOD',
+         e.IBV_DEVICE_SHUTDOWN_PORT: 'IBV_DEVICE_SHUTDOWN_PORT',
+         e.IBV_DEVICE_INIT_TYPE: 'IBV_DEVICE_INIT_TYPE',
+         e.IBV_DEVICE_PORT_ACTIVE_EVENT: 'IBV_DEVICE_PORT_ACTIVE_EVENT',
+         e.IBV_DEVICE_SYS_IMAGE_GUID: 'IBV_DEVICE_SYS_IMAGE_GUID',
+         e.IBV_DEVICE_RC_RNR_NAK_GEN: 'IBV_DEVICE_RC_RNR_NAK_GEN',
+         e.IBV_DEVICE_SRQ_RESIZE: 'IBV_DEVICE_SRQ_RESIZE',
+         e.IBV_DEVICE_N_NOTIFY_CQ: 'IBV_DEVICE_N_NOTIFY_CQ',
+         e.IBV_DEVICE_MEM_WINDOW: 'IBV_DEVICE_MEM_WINDOW',
+         e.IBV_DEVICE_UD_IP_CSUM: 'IBV_DEVICE_UD_IP_CSUM',
+         e.IBV_DEVICE_XRC: 'IBV_DEVICE_XRC',
+         e.IBV_DEVICE_MEM_MGT_EXTENSIONS: 'IBV_DEVICE_MEM_MGT_EXTENSIONS',
+         e.IBV_DEVICE_MEM_WINDOW_TYPE_2A: 'IBV_DEVICE_MEM_WINDOW_TYPE_2A',
+         e.IBV_DEVICE_MEM_WINDOW_TYPE_2B: 'IBV_DEVICE_MEM_WINDOW_TYPE_2B',
+         e.IBV_DEVICE_RC_IP_CSUM: 'IBV_DEVICE_RC_IP_CSUM',
+         e.IBV_DEVICE_RAW_IP_CSUM: 'IBV_DEVICE_RAW_IP_CSUM',
+         e.IBV_DEVICE_MANAGED_FLOW_STEERING: 'IBV_DEVICE_MANAGED_FLOW_STEERING'}
+    return str_from_flags(flags, l)
+
+
+def str_from_flags(flags, dictionary):
+    str_flags = "\n  "
+    for bit in dictionary:
+        if flags & bit:
+            str_flags += dictionary[bit]
+            str_flags += '\n  '
+    return str_flags
+
+
+def phys_state_to_str(phys):
+    l = {1: 'Sleep', 2: 'Polling', 3: 'Disabled',
+         4: 'Port configuration training', 5: 'Link up',
+         6: 'Link error recovery', 7: 'Phy test'}
+    try:
+        return '{s} ({n})'.format(s=l[phys], n=phys)
+    except KeyError:
+        return 'Invalid physical state'
+
+
+def width_to_str(width):
+    l = {1: '1X', 2: '4X', 4: '8X', 8: '12X', 16: '2X'}
+    try:
+        return '{s} ({n})'.format(s=l[width], n=width)
+    except KeyError:
+        return 'Invalid width'
+
+
+def speed_to_str(speed):
+    l = {0: '0.0 Gbps', 1: '2.5 Gbps', 2: '5.0 Gbps', 4: '10.0 Gbps',
+         8: '10.0 Gbps', 16: '14.0 Gbps', 32: '25.0 Gbps', 64: '50.0 Gbps'}
+    try:
+        return '{s} ({n})'.format(s=l[speed], n=speed)
+    except KeyError:
+        return 'Invalid speed'
+
+
+def get_device_list():
+    """
+    :return: A list of the IB devices on the current node. Each list element
+             contains a Device with:
+                 device name
+                 device node type
+                 device transport type
+                 device guid
+    """
+    cdef int count = 0
+    cdef v.ibv_device **dev_list
+    dev_list = v.ibv_get_device_list(&count)
+    if dev_list == NULL:
+        raise PyverbsRDMAError('Failed to get devices list')
+    devices = []
+    try:
+        for i in range(count):
+            name = dev_list[i].name
+            node = dev_list[i].node_type
+            transport = dev_list[i].transport_type
+            guid = be64toh(v.ibv_get_device_guid(dev_list[i]))
+            devices.append(Device(name, guid, node, transport))
+    finally:
+        v.ibv_free_device_list(dev_list)
+    return devices
+
+
+cdef class VAR(PyverbsObject):
+    """
+    This is an abstract class of Virtio Access Region (VAR).
+ Each device specific VAR implementation should inherit this class + and initialize it according to the device attributes. + """ + def __init__(self, Context context not None, **kwargs): + self.context = context + + def __dealloc__(self): + self.close() + + cpdef close(self): + pass diff --git a/pyverbs/enums.pyx b/pyverbs/enums.pyx new file mode 120000 index 0000000..cbf6607 --- /dev/null +++ b/pyverbs/enums.pyx @@ -0,0 +1 @@ +libibverbs_enums.pxd \ No newline at end of file diff --git a/pyverbs/examples/ib_devices.py b/pyverbs/examples/ib_devices.py new file mode 100755 index 0000000..2888437 --- /dev/null +++ b/pyverbs/examples/ib_devices.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2018, Mellanox Technologies. All rights reserved. See COPYING file + +from pyverbs import device as d +import sys + + +lst = d.get_device_list() +dev = 'Device' +node = 'Node Type' +trans = 'Transport Type' +guid = 'Node GUID' +print_format = '{:^20}{:^20}{:^20}{:^20}' +print (print_format.format(dev, node, trans, guid)) +print (print_format.format('-'*len(dev), '-'*len(node), '-'*len(trans), + '-'*len(guid))) +for i in lst: + print (print_format.format(i.name.decode(), d.translate_node_type(i.node_type), + d.translate_transport_type(i.transport_type), + d.guid_to_hex(i.guid))) diff --git a/pyverbs/libibverbs.pxd b/pyverbs/libibverbs.pxd new file mode 100755 index 0000000..6ffa303 --- /dev/null +++ b/pyverbs/libibverbs.pxd @@ -0,0 +1,601 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2018, Mellanox Technologies. All rights reserved. See COPYING file + +include 'libibverbs_enums.pxd' +from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t + +cdef extern from 'infiniband/verbs.h': + + cdef struct anon: + unsigned long subnet_prefix + unsigned long interface_id + + cdef union ibv_gid: + anon _global "global" + uint8_t raw[16] + + cdef struct ibv_device: + char *name + int node_type + int transport_type + + cdef struct ibv_context: + ibv_device *device + int num_comp_vectors + int cmd_fd + + cdef struct ibv_device_attr: + char *fw_ver + unsigned long node_guid + unsigned long sys_image_guid + unsigned long max_mr_size + unsigned long page_size_cap + unsigned int vendor_id + unsigned int vendor_part_id + unsigned int hw_ver + unsigned int max_qp + unsigned int max_qp_wr + unsigned int device_cap_flags + unsigned int max_sge + unsigned int max_sge_rd + unsigned int max_cq + unsigned int max_cqe + unsigned int max_mr + unsigned int max_pd + unsigned int max_qp_rd_atom + unsigned int max_ee_rd_atom + unsigned int max_res_rd_atom + unsigned int max_qp_init_rd_atom + unsigned int max_ee_init_rd_atom + ibv_atomic_cap atomic_cap + unsigned int max_ee + unsigned int max_rdd + unsigned int max_mw + unsigned int max_raw_ipv6_qp + unsigned int max_raw_ethy_qp + unsigned int max_mcast_grp + unsigned int max_mcast_qp_attach + unsigned int max_total_mcast_qp_attach + unsigned int max_ah + unsigned int max_fmr + unsigned int max_map_per_fmr + unsigned int max_srq + unsigned int max_srq_wr + unsigned int max_srq_sge + unsigned int max_pkeys + unsigned int local_ca_ack_delay + unsigned int phys_port_cnt + + struct ibv_pd: + ibv_context *context + unsigned int handle + + cdef struct ibv_mr: + ibv_context *context + ibv_pd *pd + void *addr + size_t length + unsigned int handle + unsigned int lkey + unsigned int rkey + + cdef struct ibv_query_device_ex_input: + unsigned int comp_mask + + cdef struct per_transport_caps: + 
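+        # Per-transport on-demand paging capability masks; the bit values
+        # are taken from the ibv_odp_transport_cap_bits enum.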
uint32_t rc_odp_caps + uint32_t uc_odp_caps + uint32_t ud_odp_caps + + cdef struct ibv_odp_caps: + uint64_t general_caps + per_transport_caps per_transport_caps + + cdef struct ibv_tso_caps: + unsigned int max_tso + unsigned int supported_qpts + + cdef struct ibv_rss_caps: + unsigned int supported_qpts + unsigned int max_rwq_indirection_tables + unsigned int max_rwq_indirection_table_size + unsigned long rx_hash_fields_mask + unsigned int rx_hash_function + + cdef struct ibv_packet_pacing_caps: + unsigned int qp_rate_limit_min + unsigned int qp_rate_limit_max + unsigned int supported_qpts + + cdef struct ibv_tm_caps: + unsigned int max_rndv_hdr_size + unsigned int max_num_tags + unsigned int flags + unsigned int max_ops + unsigned int max_sge + + cdef struct ibv_cq_moderation_caps: + unsigned int max_cq_count + unsigned int max_cq_period + + cdef struct ibv_pci_atomic_caps: + uint16_t fetch_add + uint16_t swap + uint16_t compare_swap + + cdef struct ibv_device_attr_ex: + ibv_device_attr orig_attr + unsigned int comp_mask + ibv_odp_caps odp_caps + unsigned long completion_timestamp_mask + unsigned long hca_core_clock + unsigned long device_cap_flags_ex + ibv_tso_caps tso_caps + ibv_rss_caps rss_caps + unsigned int max_wq_type_rq + ibv_packet_pacing_caps packet_pacing_caps + unsigned int raw_packet_caps + ibv_tm_caps tm_caps + ibv_cq_moderation_caps cq_mod_caps + unsigned long max_dm_size + ibv_pci_atomic_caps pci_atomic_caps + uint32_t xrc_odp_caps + + cdef struct ibv_mw: + ibv_context *context + ibv_pd *pd + unsigned int rkey + unsigned int handle + ibv_mw_type mw_type + + cdef struct ibv_alloc_dm_attr: + size_t length + unsigned int log_align_req + unsigned int comp_mask + + cdef struct ibv_dm: + ibv_context *context + unsigned int comp_mask + + cdef struct ibv_port_attr: + ibv_port_state state + ibv_mtu max_mtu + ibv_mtu active_mtu + int gid_tbl_len + unsigned int port_cap_flags + unsigned int max_msg_sz + unsigned int bad_pkey_cntr + unsigned int qkey_viol_cntr + unsigned short pkey_tbl_len + unsigned short lid + unsigned short sm_lid + unsigned char lmc + unsigned char max_vl_num + unsigned char sm_sl + unsigned char subnet_timeout + unsigned char init_type_reply + unsigned char active_width + unsigned char active_speed + unsigned char phys_state + unsigned char link_layer + unsigned char flags + unsigned short port_cap_flags2 + + cdef struct ibv_comp_channel: + ibv_context *context + unsigned int fd + unsigned int refcnt + + cdef struct ibv_cq: + ibv_context *context + ibv_comp_channel *channel + void *cq_context + int handle + int cqe + + cdef struct ibv_wc: + unsigned long wr_id + ibv_wc_status status + ibv_wc_opcode opcode + unsigned int vendor_err + unsigned int byte_len + unsigned int qp_num + unsigned int imm_data + unsigned int src_qp + int wc_flags + unsigned int pkey_index + unsigned int slid + unsigned int sl + unsigned int dlid_path_bits + + cdef struct ibv_cq_init_attr_ex: + unsigned int cqe + void *cq_context + ibv_comp_channel *channel + unsigned int comp_vector + unsigned long wc_flags + unsigned int comp_mask + unsigned int flags + ibv_pd *parent_domain + + cdef struct ibv_cq_ex: + ibv_context *context + ibv_comp_channel *channel + void *cq_context + unsigned int handle + int cqe + unsigned int comp_events_completed + unsigned int async_events_completed + unsigned int comp_mask + ibv_wc_status status + unsigned long wr_id + + cdef struct ibv_poll_cq_attr: + unsigned int comp_mask + + cdef struct ibv_wc_tm_info: + unsigned long tag + unsigned int priv + + cdef struct 
ibv_grh: + unsigned int version_tclass_flow + unsigned short paylen + unsigned char next_hdr + unsigned char hop_limit + ibv_gid sgid + ibv_gid dgid + + cdef struct ibv_global_route: + ibv_gid dgid + unsigned int flow_label + unsigned char sgid_index + unsigned char hop_limit + unsigned char traffic_class + + cdef struct ibv_ah_attr: + ibv_global_route grh + unsigned short dlid + unsigned char sl + unsigned char src_path_bits + unsigned char static_rate + unsigned char is_global + unsigned char port_num + + cdef struct ibv_ah: + ibv_context *context + ibv_pd *pd + unsigned int handle + + cdef struct ibv_sge: + unsigned long addr + unsigned int length + unsigned int lkey + + cdef struct ibv_recv_wr: + unsigned long wr_id + ibv_recv_wr *next + ibv_sge *sg_list + int num_sge + + cdef struct rdma: + unsigned long remote_addr + unsigned int rkey + + cdef struct atomic: + unsigned long remote_addr + unsigned long compare_add + unsigned long swap + unsigned int rkey + + cdef struct ud: + ibv_ah *ah + unsigned int remote_qpn + unsigned int remote_qkey + + cdef union wr: + rdma rdma + atomic atomic + ud ud + + cdef struct ibv_mw_bind_info: + ibv_mr *mr + unsigned long addr + unsigned long length + unsigned int mw_access_flags + + cdef struct bind_mw: + ibv_mw *mw + unsigned int rkey + ibv_mw_bind_info bind_info + + cdef struct tso: + void *hdr + unsigned short hdr_sz + unsigned short mss + + cdef union unnamed: + bind_mw bind_mw + tso tso + + cdef struct xrc: + unsigned int remote_srqn + + cdef union qp_type: + xrc xrc + + cdef struct ibv_send_wr: + unsigned long wr_id + ibv_send_wr *next + ibv_sge *sg_list + int num_sge + ibv_wr_opcode opcode + unsigned int send_flags + wr wr + qp_type qp_type + unnamed unnamed + + cdef struct ibv_qp_cap: + unsigned int max_send_wr + unsigned int max_recv_wr + unsigned int max_send_sge + unsigned int max_recv_sge + unsigned int max_inline_data + + cdef struct ibv_qp_init_attr: + void *qp_context + ibv_cq *send_cq + ibv_cq *recv_cq + ibv_srq *srq + ibv_qp_cap cap + ibv_qp_type qp_type + int sq_sig_all + + cdef struct ibv_xrcd_init_attr: + uint32_t comp_mask + int fd + int oflags + + cdef struct ibv_xrcd: + pass + + cdef struct ibv_srq_attr: + unsigned int max_wr + unsigned int max_sge + unsigned int srq_limit + + cdef struct ibv_srq_init_attr: + void *srq_context + ibv_srq_attr attr + + cdef struct ibv_srq_init_attr_ex: + void *srq_context + ibv_srq_attr attr + unsigned int comp_mask + ibv_srq_type srq_type + ibv_pd *pd + ibv_xrcd *xrcd + ibv_cq *cq + ibv_tm_caps tm_cap + + cdef struct ibv_srq: + ibv_context *context + void *srq_context + ibv_pd *pd + unsigned int handle + unsigned int events_completed + + cdef struct ibv_rwq_ind_table: + pass + + cdef struct ibv_rx_hash_conf: + pass + + cdef struct ibv_qp_init_attr_ex: + void *qp_context + ibv_cq *send_cq + ibv_cq *recv_cq + ibv_srq *srq + ibv_qp_cap cap + ibv_qp_type qp_type + int sq_sig_all + unsigned int comp_mask + ibv_pd *pd + ibv_xrcd *xrcd + unsigned int create_flags + unsigned short max_tso_header + ibv_rwq_ind_table *rwq_ind_tbl + ibv_rx_hash_conf rx_hash_conf + unsigned int source_qpn + unsigned long send_ops_flags + + cdef struct ibv_qp_attr: + ibv_qp_state qp_state + ibv_qp_state cur_qp_state + ibv_mtu path_mtu + ibv_mig_state path_mig_state + unsigned int qkey + unsigned int rq_psn + unsigned int sq_psn + unsigned int dest_qp_num + unsigned int qp_access_flags + ibv_qp_cap cap + ibv_ah_attr ah_attr + ibv_ah_attr alt_ah_attr + unsigned short pkey_index + unsigned short alt_pkey_index + unsigned char 
en_sqd_async_notify + unsigned char sq_draining + unsigned char max_rd_atomic + unsigned char max_dest_rd_atomic + unsigned char min_rnr_timer + unsigned char port_num + unsigned char timeout + unsigned char retry_cnt + unsigned char rnr_retry + unsigned char alt_port_num + unsigned char alt_timeout + unsigned int rate_limit + + cdef struct ibv_srq: + ibv_context *context + void *srq_context + ibv_pd *pd + unsigned int handle + unsigned int events_completed + + cdef struct ibv_data_buf: + void *addr + size_t length + + cdef struct ibv_qp: + ibv_context *context; + void *qp_context; + ibv_pd *pd; + ibv_cq *send_cq; + ibv_cq *recv_cq; + ibv_srq *srq; + unsigned int handle; + unsigned int qp_num; + ibv_qp_state state; + ibv_qp_type qp_type; + unsigned int events_completed; + + cdef struct ibv_parent_domain_init_attr: + ibv_pd *pd; + uint32_t comp_mask; + void *(*alloc)(ibv_pd *pd, void *pd_context, size_t size, + size_t alignment, uint64_t resource_type); + void (*free)(ibv_pd *pd, void *pd_context, void *ptr, + uint64_t resource_type); + void *pd_context; + + cdef struct ibv_qp_ex: + ibv_qp qp_base + uint64_t comp_mask + uint64_t wr_id + unsigned int wr_flags + + ibv_device **ibv_get_device_list(int *n) + void ibv_free_device_list(ibv_device **list) + ibv_context *ibv_open_device(ibv_device *device) + int ibv_close_device(ibv_context *context) + int ibv_query_device(ibv_context *context, ibv_device_attr *device_attr) + int ibv_query_device_ex(ibv_context *context, + ibv_query_device_ex_input *input, + ibv_device_attr_ex *attr) + unsigned long ibv_get_device_guid(ibv_device *device) + int ibv_query_gid(ibv_context *context, unsigned int port_num, + int index, ibv_gid *gid) + ibv_pd *ibv_alloc_pd(ibv_context *context) + int ibv_dealloc_pd(ibv_pd *pd) + ibv_mr *ibv_reg_mr(ibv_pd *pd, void *addr, size_t length, int access) + int ibv_dereg_mr(ibv_mr *mr) + ibv_mw *ibv_alloc_mw(ibv_pd *pd, ibv_mw_type type) + int ibv_dealloc_mw(ibv_mw *mw) + ibv_dm *ibv_alloc_dm(ibv_context *context, ibv_alloc_dm_attr *attr) + int ibv_free_dm(ibv_dm *dm) + ibv_mr *ibv_reg_dm_mr(ibv_pd *pd, ibv_dm *dm, unsigned long dm_offset, + size_t length, unsigned int access) + int ibv_memcpy_to_dm(ibv_dm *dm, unsigned long dm_offset, void *host_addr, + size_t length) + int ibv_memcpy_from_dm(void *host_addr, ibv_dm *dm, unsigned long dm_offset, + size_t length) + int ibv_query_port(ibv_context *context, uint8_t port_num, + ibv_port_attr *port_attr) + ibv_comp_channel *ibv_create_comp_channel(ibv_context *context) + int ibv_destroy_comp_channel(ibv_comp_channel *channel) + int ibv_get_cq_event(ibv_comp_channel *channel, ibv_cq **cq, + void **cq_context) + int ibv_req_notify_cq(ibv_cq *cq, int solicited_only) + void ibv_ack_cq_events(ibv_cq *cq, int nevents) + ibv_cq *ibv_create_cq(ibv_context *context, int cqe, void *cq_context, + ibv_comp_channel *channel, int comp_vector) + int ibv_destroy_cq(ibv_cq *cq) + int ibv_poll_cq(ibv_cq *cq, int num_entries, ibv_wc *wc) + ibv_cq_ex *ibv_create_cq_ex(ibv_context *context, + ibv_cq_init_attr_ex *cq_attr) + ibv_cq *ibv_cq_ex_to_cq(ibv_cq_ex *cq) + int ibv_start_poll(ibv_cq_ex *cq, ibv_poll_cq_attr *attr) + int ibv_next_poll(ibv_cq_ex *cq) + void ibv_end_poll(ibv_cq_ex *cq) + ibv_wc_opcode ibv_wc_read_opcode(ibv_cq_ex *cq) + unsigned int ibv_wc_read_vendor_err(ibv_cq_ex *cq) + unsigned int ibv_wc_read_byte_len(ibv_cq_ex *cq) + unsigned int ibv_wc_read_imm_data(ibv_cq_ex *cq) + unsigned int ibv_wc_read_invalidated_rkey(ibv_cq_ex *cq) + unsigned int ibv_wc_read_qp_num(ibv_cq_ex *cq) + 
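+    # Note: the ibv_wc_read_* accessors are only valid while a completion is
+    # being polled, i.e. between ibv_start_poll()/ibv_next_poll() and
+    # ibv_end_poll().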
unsigned int ibv_wc_read_src_qp(ibv_cq_ex *cq) + unsigned int ibv_wc_read_wc_flags(ibv_cq_ex *cq) + unsigned int ibv_wc_read_slid(ibv_cq_ex *cq) + unsigned char ibv_wc_read_sl(ibv_cq_ex *cq) + unsigned char ibv_wc_read_dlid_path_bits(ibv_cq_ex *cq) + unsigned long ibv_wc_read_completion_ts(ibv_cq_ex *cq) + unsigned short ibv_wc_read_cvlan(ibv_cq_ex *cq) + unsigned int ibv_wc_read_flow_tag(ibv_cq_ex *cq) + void ibv_wc_read_tm_info(ibv_cq_ex *cq, ibv_wc_tm_info *tm_info) + unsigned long ibv_wc_read_completion_wallclock_ns(ibv_cq_ex *cq) + ibv_ah *ibv_create_ah(ibv_pd *pd, ibv_ah_attr *attr) + int ibv_init_ah_from_wc(ibv_context *context, uint8_t port_num, + ibv_wc *wc, ibv_grh *grh, ibv_ah_attr *ah_attr) + ibv_ah *ibv_create_ah_from_wc(ibv_pd *pd, ibv_wc *wc, ibv_grh *grh, + uint8_t port_num) + int ibv_destroy_ah(ibv_ah *ah) + ibv_qp *ibv_create_qp(ibv_pd *pd, ibv_qp_init_attr *qp_init_attr) + ibv_qp *ibv_create_qp_ex(ibv_context *context, + ibv_qp_init_attr_ex *qp_init_attr_ex) + int ibv_modify_qp(ibv_qp *qp, ibv_qp_attr *qp_attr, int comp_mask) + int ibv_query_qp(ibv_qp *qp, ibv_qp_attr *attr, int attr_mask, + ibv_qp_init_attr *init_attr) + int ibv_destroy_qp(ibv_qp *qp) + int ibv_post_recv(ibv_qp *qp, ibv_recv_wr *wr, ibv_recv_wr **bad_wr) + int ibv_post_send(ibv_qp *qp, ibv_send_wr *wr, ibv_send_wr **bad_wr) + ibv_xrcd *ibv_open_xrcd(ibv_context *context, + ibv_xrcd_init_attr *xrcd_init_attr) + int ibv_close_xrcd(ibv_xrcd *xrcd) + ibv_srq *ibv_create_srq(ibv_pd *pd, ibv_srq_init_attr *srq_init_attr) + ibv_srq *ibv_create_srq_ex(ibv_context *context, + ibv_srq_init_attr_ex *srq_init_attr) + int ibv_modify_srq(ibv_srq *srq, ibv_srq_attr *srq_attr, int srq_attr_mask) + int ibv_query_srq(ibv_srq *srq, ibv_srq_attr *srq_attr) + int ibv_get_srq_num(ibv_srq *srq, unsigned int *srq_num) + int ibv_destroy_srq(ibv_srq *srq) + int ibv_post_srq_recv(ibv_srq *srq, ibv_recv_wr *recv_wr, + ibv_recv_wr **bad_recv_wr) + ibv_pd *ibv_alloc_parent_domain(ibv_context *context, + ibv_parent_domain_init_attr *attr) + uint32_t ibv_inc_rkey(uint32_t rkey) + ibv_qp_ex *ibv_qp_to_qp_ex(ibv_qp *qp) + void ibv_wr_atomic_cmp_swp(ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr, uint64_t compare, + uint64_t swap) + void ibv_wr_atomic_fetch_add(ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr, uint64_t add) + void ibv_wr_bind_mw(ibv_qp_ex *qp, ibv_mw *mw, uint32_t rkey, + ibv_mw_bind_info *bind_info) + void ibv_wr_local_inv(ibv_qp_ex *qp, uint32_t invalidate_rkey) + void ibv_wr_rdma_read(ibv_qp_ex *qp, uint32_t rkey, uint64_t remote_addr) + void ibv_wr_rdma_write(ibv_qp_ex *qp, uint32_t rkey, uint64_t remote_addr) + void ibv_wr_rdma_write_imm(ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr, uint32_t imm_data) + void ibv_wr_send(ibv_qp_ex *qp) + void ibv_wr_send_imm(ibv_qp_ex *qp, uint32_t imm_data) + void ibv_wr_send_inv(ibv_qp_ex *qp, uint32_t invalidate_rkey) + void ibv_wr_send_tso(ibv_qp_ex *qp, void *hdr, uint16_t hdr_sz, + uint16_t mss) + void ibv_wr_set_ud_addr(ibv_qp_ex *qp, ibv_ah *ah, uint32_t remote_qpn, + uint32_t remote_qkey) + void ibv_wr_set_xrc_srqn(ibv_qp_ex *qp, uint32_t remote_srqn) + void ibv_wr_set_inline_data(ibv_qp_ex *qp, void *addr, size_t length) + void ibv_wr_set_inline_data_list(ibv_qp_ex *qp, size_t num_buf, + ibv_data_buf *buf_list) + void ibv_wr_set_sge(ibv_qp_ex *qp, uint32_t lkey, uint64_t addr, + uint32_t length) + void ibv_wr_set_sge_list(ibv_qp_ex *qp, size_t num_sge, ibv_sge *sg_list) + void ibv_wr_start(ibv_qp_ex *qp) + int ibv_wr_complete(ibv_qp_ex *qp) + void 
ibv_wr_abort(ibv_qp_ex *qp) + + +cdef extern from 'infiniband/driver.h': + int ibv_query_gid_type(ibv_context *context, uint8_t port_num, + unsigned int index, ibv_gid_type *type) diff --git a/pyverbs/libibverbs_enums.pxd b/pyverbs/libibverbs_enums.pxd new file mode 100755 index 0000000..7f61be2 --- /dev/null +++ b/pyverbs/libibverbs_enums.pxd @@ -0,0 +1,438 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2018, Mellanox Technologies. All rights reserved. + +#cython: language_level=3 + + +cdef extern from '<infiniband/verbs.h>': + + cpdef enum ibv_transport_type: + IBV_TRANSPORT_UNKNOWN + IBV_TRANSPORT_IB + IBV_TRANSPORT_IWARP + IBV_TRANSPORT_USNIC + IBV_TRANSPORT_USNIC_UDP + + cpdef enum ibv_node_type: + IBV_NODE_UNKNOWN + IBV_NODE_CA + IBV_NODE_SWITCH + IBV_NODE_ROUTER + IBV_NODE_RNIC + IBV_NODE_USNIC + IBV_NODE_USNIC_UDP + + cpdef enum: + IBV_LINK_LAYER_UNSPECIFIED + IBV_LINK_LAYER_INFINIBAND + IBV_LINK_LAYER_ETHERNET + + cpdef enum ibv_atomic_cap: + IBV_ATOMIC_NONE + IBV_ATOMIC_HCA + IBV_ATOMIC_GLOB + + cpdef enum ibv_port_state: + IBV_PORT_NOP + IBV_PORT_DOWN + IBV_PORT_INIT + IBV_PORT_ARMED + IBV_PORT_ACTIVE + IBV_PORT_ACTIVE_DEFER + + cpdef enum ibv_port_cap_flags: + IBV_PORT_SM + IBV_PORT_NOTICE_SUP + IBV_PORT_TRAP_SUP + IBV_PORT_OPT_IPD_SUP + IBV_PORT_AUTO_MIGR_SUP + IBV_PORT_SL_MAP_SUP + IBV_PORT_MKEY_NVRAM + IBV_PORT_PKEY_NVRAM + IBV_PORT_LED_INFO_SUP + IBV_PORT_SYS_IMAGE_GUID_SUP + IBV_PORT_PKEY_SW_EXT_PORT_TRAP_SUP + IBV_PORT_EXTENDED_SPEEDS_SUP + IBV_PORT_CAP_MASK2_SUP + IBV_PORT_CM_SUP + IBV_PORT_SNMP_TUNNEL_SUP + IBV_PORT_REINIT_SUP + IBV_PORT_DEVICE_MGMT_SUP + IBV_PORT_VENDOR_CLASS_SUP + IBV_PORT_DR_NOTICE_SUP + IBV_PORT_CAP_MASK_NOTICE_SUP + IBV_PORT_BOOT_MGMT_SUP + IBV_PORT_LINK_LATENCY_SUP + IBV_PORT_CLIENT_REG_SUP + IBV_PORT_IP_BASED_GIDS + + cpdef enum ibv_port_cap_flags2: + IBV_PORT_SET_NODE_DESC_SUP + IBV_PORT_INFO_EXT_SUP + IBV_PORT_VIRT_SUP + IBV_PORT_SWITCH_PORT_STATE_TABLE_SUP + IBV_PORT_LINK_WIDTH_2X_SUP + IBV_PORT_LINK_SPEED_HDR_SUP + + cpdef enum ibv_mtu: + IBV_MTU_256 + IBV_MTU_512 + IBV_MTU_1024 + IBV_MTU_2048 + IBV_MTU_4096 + + cpdef enum ibv_event_type: + IBV_EVENT_CQ_ERR + IBV_EVENT_QP_FATAL + IBV_EVENT_QP_REQ_ERR + IBV_EVENT_QP_ACCESS_ERR + IBV_EVENT_COMM_EST + IBV_EVENT_SQ_DRAINED + IBV_EVENT_PATH_MIG + IBV_EVENT_PATH_MIG_ERR + IBV_EVENT_DEVICE_FATAL + IBV_EVENT_PORT_ACTIVE + IBV_EVENT_PORT_ERR + IBV_EVENT_LID_CHANGE + IBV_EVENT_PKEY_CHANGE + IBV_EVENT_SM_CHANGE + IBV_EVENT_SRQ_ERR + IBV_EVENT_SRQ_LIMIT_REACHED + IBV_EVENT_QP_LAST_WQE_REACHED + IBV_EVENT_CLIENT_REREGISTER + IBV_EVENT_GID_CHANGE + IBV_EVENT_WQ_FATAL + + cpdef enum ibv_access_flags: + IBV_ACCESS_LOCAL_WRITE + IBV_ACCESS_REMOTE_WRITE + IBV_ACCESS_REMOTE_READ + IBV_ACCESS_REMOTE_ATOMIC + IBV_ACCESS_MW_BIND + IBV_ACCESS_ZERO_BASED + IBV_ACCESS_ON_DEMAND + IBV_ACCESS_HUGETLB + IBV_ACCESS_RELAXED_ORDERING + + cpdef enum ibv_wr_opcode: + IBV_WR_RDMA_WRITE + IBV_WR_RDMA_WRITE_WITH_IMM + IBV_WR_SEND + IBV_WR_SEND_WITH_IMM + IBV_WR_RDMA_READ + IBV_WR_ATOMIC_CMP_AND_SWP + IBV_WR_ATOMIC_FETCH_AND_ADD + IBV_WR_LOCAL_INV + IBV_WR_BIND_MW + IBV_WR_SEND_WITH_INV + IBV_WR_TSO + + cpdef enum ibv_send_flags: + IBV_SEND_FENCE + IBV_SEND_SIGNALED + IBV_SEND_SOLICITED + IBV_SEND_INLINE + IBV_SEND_IP_CSUM + + cpdef enum ibv_qp_type: + IBV_QPT_RC + IBV_QPT_UC + IBV_QPT_UD + IBV_QPT_RAW_PACKET + IBV_QPT_XRC_SEND + IBV_QPT_XRC_RECV + IBV_QPT_DRIVER + + cpdef enum ibv_qp_state: + IBV_QPS_RESET + IBV_QPS_INIT + IBV_QPS_RTR + IBV_QPS_RTS + IBV_QPS_SQD + IBV_QPS_SQE + IBV_QPS_ERR + 
IBV_QPS_UNKNOWN + + cpdef enum ibv_mw_type: + IBV_MW_TYPE_1 + IBV_MW_TYPE_2 + + cpdef enum ibv_wc_status: + IBV_WC_SUCCESS + IBV_WC_LOC_LEN_ERR + IBV_WC_LOC_QP_OP_ERR + IBV_WC_LOC_EEC_OP_ERR + IBV_WC_LOC_PROT_ERR + IBV_WC_WR_FLUSH_ERR + IBV_WC_MW_BIND_ERR + IBV_WC_BAD_RESP_ERR + IBV_WC_LOC_ACCESS_ERR + IBV_WC_REM_INV_REQ_ERR + IBV_WC_REM_ACCESS_ERR + IBV_WC_REM_OP_ERR + IBV_WC_RETRY_EXC_ERR + IBV_WC_RNR_RETRY_EXC_ERR + IBV_WC_LOC_RDD_VIOL_ERR + IBV_WC_REM_INV_RD_REQ_ERR + IBV_WC_REM_ABORT_ERR + IBV_WC_INV_EECN_ERR + IBV_WC_INV_EEC_STATE_ERR + IBV_WC_FATAL_ERR + IBV_WC_RESP_TIMEOUT_ERR + IBV_WC_GENERAL_ERR + + cpdef enum ibv_wc_opcode: + IBV_WC_SEND + IBV_WC_RDMA_WRITE + IBV_WC_RDMA_READ + IBV_WC_COMP_SWAP + IBV_WC_FETCH_ADD + IBV_WC_BIND_MW + IBV_WC_LOCAL_INV + IBV_WC_TSO + IBV_WC_RECV + IBV_WC_RECV_RDMA_WITH_IMM + + cpdef enum ibv_create_cq_wc_flags: + IBV_WC_EX_WITH_BYTE_LEN + IBV_WC_EX_WITH_IMM + IBV_WC_EX_WITH_QP_NUM + IBV_WC_EX_WITH_SRC_QP + IBV_WC_EX_WITH_SLID + IBV_WC_EX_WITH_SL + IBV_WC_EX_WITH_DLID_PATH_BITS + IBV_WC_EX_WITH_COMPLETION_TIMESTAMP + IBV_WC_EX_WITH_CVLAN + IBV_WC_EX_WITH_FLOW_TAG + IBV_WC_EX_WITH_COMPLETION_TIMESTAMP_WALLCLOCK + + cpdef enum ibv_wc_flags: + IBV_WC_GRH + IBV_WC_WITH_IMM + IBV_WC_IP_CSUM_OK + IBV_WC_WITH_INV + + cpdef enum ibv_srq_attr_mask: + IBV_SRQ_MAX_WR + IBV_SRQ_LIMIT + + cpdef enum ibv_srq_type: + IBV_SRQT_BASIC + IBV_SRQT_XRC + + cpdef enum ibv_srq_init_attr_mask: + IBV_SRQ_INIT_ATTR_TYPE + IBV_SRQ_INIT_ATTR_PD + IBV_SRQ_INIT_ATTR_XRCD + IBV_SRQ_INIT_ATTR_CQ + + cpdef enum ibv_mig_state: + IBV_MIG_MIGRATED + IBV_MIG_REARM + IBV_MIG_ARMED + + cpdef enum ibv_qp_init_attr_mask: + IBV_QP_INIT_ATTR_PD + IBV_QP_INIT_ATTR_XRCD + IBV_QP_INIT_ATTR_CREATE_FLAGS + IBV_QP_INIT_ATTR_MAX_TSO_HEADER + IBV_QP_INIT_ATTR_IND_TABLE + IBV_QP_INIT_ATTR_RX_HASH + IBV_QP_INIT_ATTR_SEND_OPS_FLAGS + + cpdef enum ibv_qp_create_flags: + IBV_QP_CREATE_BLOCK_SELF_MCAST_LB + IBV_QP_CREATE_SCATTER_FCS + IBV_QP_CREATE_CVLAN_STRIPPING + IBV_QP_CREATE_SOURCE_QPN + IBV_QP_CREATE_PCI_WRITE_END_PADDING + + cpdef enum ibv_qp_attr_mask: + IBV_QP_STATE + IBV_QP_CUR_STATE + IBV_QP_EN_SQD_ASYNC_NOTIFY + IBV_QP_ACCESS_FLAGS + IBV_QP_PKEY_INDEX + IBV_QP_PORT + IBV_QP_QKEY + IBV_QP_AV + IBV_QP_PATH_MTU + IBV_QP_TIMEOUT + IBV_QP_RETRY_CNT + IBV_QP_RNR_RETRY + IBV_QP_RQ_PSN + IBV_QP_MAX_QP_RD_ATOMIC + IBV_QP_ALT_PATH + IBV_QP_MIN_RNR_TIMER + IBV_QP_SQ_PSN + IBV_QP_MAX_DEST_RD_ATOMIC + IBV_QP_PATH_MIG_STATE + IBV_QP_CAP + IBV_QP_DEST_QPN + IBV_QP_RATE_LIMIT + + cpdef enum ibv_wq_type: + IBV_WQT_RQ + + cpdef enum ibv_wq_init_attr_mask: + IBV_WQ_INIT_ATTR_FLAGS + + cpdef enum ibv_wq_flags: + IBV_WQ_FLAGS_CVLAN_STRIPPING + IBV_WQ_FLAGS_SCATTER_FCS + IBV_WQ_FLAGS_DELAY_DROP + IBV_WQ_FLAGS_PCI_WRITE_END_PADDING + + cpdef enum ibv_wq_state: + IBV_WQS_RESET + IBV_WQS_RDY + IBV_WQS_ERR + IBV_WQS_UNKNOWN + + cpdef enum ibv_wq_attr_mask: + IBV_WQ_ATTR_STATE + IBV_WQ_ATTR_CURR_STATE + IBV_WQ_ATTR_FLAGS + + cpdef enum ibv_rx_hash_function_flags: + IBV_RX_HASH_FUNC_TOEPLITZ + + cpdef enum ibv_rx_hash_fields: + IBV_RX_HASH_SRC_IPV4 + IBV_RX_HASH_DST_IPV4 + IBV_RX_HASH_SRC_IPV6 + IBV_RX_HASH_DST_IPV6 + IBV_RX_HASH_SRC_PORT_TCP + IBV_RX_HASH_DST_PORT_TCP + IBV_RX_HASH_SRC_PORT_UDP + IBV_RX_HASH_DST_PORT_UDP + + cpdef enum ibv_flow_flags: + IBV_FLOW_ATTR_FLAGS_DONT_TRAP + IBV_FLOW_ATTR_FLAGS_EGRESS + + cpdef enum ibv_flow_attr_type: + IBV_FLOW_ATTR_NORMAL + IBV_FLOW_ATTR_ALL_DEFAULT + IBV_FLOW_ATTR_MC_DEFAULT + IBV_FLOW_ATTR_SNIFFER + + cpdef enum ibv_flow_spec_type: + IBV_FLOW_SPEC_ETH + IBV_FLOW_SPEC_IPV4 
+ IBV_FLOW_SPEC_IPV6 + IBV_FLOW_SPEC_IPV4_EXT + IBV_FLOW_SPEC_ESP + IBV_FLOW_SPEC_TCP + IBV_FLOW_SPEC_UDP + IBV_FLOW_SPEC_VXLAN_TUNNEL + IBV_FLOW_SPEC_GRE + IBV_FLOW_SPEC_MPLS + IBV_FLOW_SPEC_INNER + IBV_FLOW_SPEC_ACTION_TAG + IBV_FLOW_SPEC_ACTION_DROP + IBV_FLOW_SPEC_ACTION_HANDLE + IBV_FLOW_SPEC_ACTION_COUNT + + cpdef enum: + IBV_QPF_GRH_REQUIRED + + cpdef enum ibv_counter_description: + IBV_COUNTER_PACKETS + IBV_COUNTER_BYTES + + cpdef enum ibv_read_counters_flags: + IBV_READ_COUNTERS_ATTR_PREFER_CACHED + + cpdef enum ibv_cq_init_attr_mask: + IBV_CQ_INIT_ATTR_MASK_FLAGS + IBV_CQ_INIT_ATTR_MASK_PD + + cpdef enum ibv_create_cq_attr_flags: + IBV_CREATE_CQ_ATTR_SINGLE_THREADED + IBV_CREATE_CQ_ATTR_IGNORE_OVERRUN + + cpdef enum ibv_odp_general_caps: + IBV_ODP_SUPPORT + IBV_ODP_SUPPORT_IMPLICIT + + cpdef enum ibv_odp_transport_cap_bits: + IBV_ODP_SUPPORT_SEND + IBV_ODP_SUPPORT_RECV + IBV_ODP_SUPPORT_WRITE + IBV_ODP_SUPPORT_READ + IBV_ODP_SUPPORT_ATOMIC + IBV_ODP_SUPPORT_SRQ_RECV + + cpdef enum ibv_device_cap_flags: + IBV_DEVICE_RESIZE_MAX_WR + IBV_DEVICE_BAD_PKEY_CNTR + IBV_DEVICE_BAD_QKEY_CNTR + IBV_DEVICE_RAW_MULTI + IBV_DEVICE_AUTO_PATH_MIG + IBV_DEVICE_CHANGE_PHY_PORT + IBV_DEVICE_UD_AV_PORT_ENFORCE + IBV_DEVICE_CURR_QP_STATE_MOD + IBV_DEVICE_SHUTDOWN_PORT + IBV_DEVICE_INIT_TYPE + IBV_DEVICE_PORT_ACTIVE_EVENT + IBV_DEVICE_SYS_IMAGE_GUID + IBV_DEVICE_RC_RNR_NAK_GEN + IBV_DEVICE_SRQ_RESIZE + IBV_DEVICE_N_NOTIFY_CQ + IBV_DEVICE_MEM_WINDOW + IBV_DEVICE_UD_IP_CSUM + IBV_DEVICE_XRC + IBV_DEVICE_MEM_MGT_EXTENSIONS + IBV_DEVICE_MEM_WINDOW_TYPE_2A + IBV_DEVICE_MEM_WINDOW_TYPE_2B + IBV_DEVICE_RC_IP_CSUM + IBV_DEVICE_RAW_IP_CSUM + IBV_DEVICE_MANAGED_FLOW_STEERING + + cpdef enum ibv_raw_packet_caps: + IBV_RAW_PACKET_CAP_CVLAN_STRIPPING + IBV_RAW_PACKET_CAP_SCATTER_FCS + IBV_RAW_PACKET_CAP_IP_CSUM + IBV_RAW_PACKET_CAP_DELAY_DROP + + cpdef enum ibv_xrcd_init_attr_mask: + IBV_XRCD_INIT_ATTR_FD + IBV_XRCD_INIT_ATTR_OFLAGS + IBV_XRCD_INIT_ATTR_RESERVED + + cpdef enum: + IBV_WC_STANDARD_FLAGS + + cpdef enum ibv_qp_create_send_ops_flags: + IBV_QP_EX_WITH_RDMA_WRITE + IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM + IBV_QP_EX_WITH_SEND + IBV_QP_EX_WITH_SEND_WITH_IMM + IBV_QP_EX_WITH_RDMA_READ + IBV_QP_EX_WITH_ATOMIC_CMP_AND_SWP + IBV_QP_EX_WITH_ATOMIC_FETCH_AND_ADD + IBV_QP_EX_WITH_LOCAL_INV + IBV_QP_EX_WITH_BIND_MW + IBV_QP_EX_WITH_SEND_WITH_INV + IBV_QP_EX_WITH_TSO + + cdef unsigned long long IBV_DEVICE_RAW_SCATTER_FCS + cdef unsigned long long IBV_DEVICE_PCI_WRITE_END_PADDING + + cpdef enum ibv_parent_domain_init_attr_mask: + IBV_PARENT_DOMAIN_INIT_ATTR_ALLOCATORS + IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT + + cdef void *IBV_ALLOCATOR_USE_DEFAULT + + +_IBV_DEVICE_RAW_SCATTER_FCS = IBV_DEVICE_RAW_SCATTER_FCS +_IBV_DEVICE_PCI_WRITE_END_PADDING = IBV_DEVICE_PCI_WRITE_END_PADDING +_IBV_ALLOCATOR_USE_DEFAULT = <size_t>IBV_ALLOCATOR_USE_DEFAULT + + +cdef extern from '<infiniband/driver.h>': + cpdef enum ibv_gid_type: + IBV_GID_TYPE_IB_ROCE_V1 + IBV_GID_TYPE_ROCE_V2 diff --git a/pyverbs/librdmacm.pxd b/pyverbs/librdmacm.pxd new file mode 100755 index 0000000..03c0cdd --- /dev/null +++ b/pyverbs/librdmacm.pxd @@ -0,0 +1,127 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019, Mellanox Technologies. All rights reserved. 
See COPYING file + +include 'libibverbs.pxd' +include 'librdmacm_enums.pxd' +from libc.stdint cimport uint8_t, uint32_t + +cdef extern from '<rdma/rdma_cma.h>': + + cdef struct rdma_cm_id: + ibv_context *verbs + rdma_event_channel *channel + void *context + ibv_qp *qp + rdma_port_space ps + uint8_t port_num + rdma_cm_event *event + ibv_comp_channel *send_cq_channel + ibv_cq *send_cq + ibv_comp_channel *recv_cq_channel + ibv_cq *recv_cq + ibv_srq *srq + ibv_pd *pd + ibv_qp_type qp_type + + cdef struct rdma_event_channel: + int fd + + cdef struct rdma_conn_param: + const void *private_data + uint8_t private_data_len + uint8_t responder_resources + uint8_t initiator_depth + uint8_t flow_control + uint8_t retry_count + uint8_t rnr_retry_count + uint8_t srq + uint32_t qp_num + + cdef struct rdma_ud_param: + const void *private_data + uint8_t private_data_len + ibv_ah_attr ah_attr + uint32_t qp_num + uint32_t qkey + + cdef union param: + rdma_conn_param conn + rdma_ud_param ud + + cdef struct rdma_cm_event: + rdma_cm_id *id + rdma_cm_id *listen_id + rdma_cm_event_type event + int status + param param + + cdef struct rdma_addrinfo: + int ai_flags + int ai_family + int ai_qp_type + int ai_port_space + int ai_src_len + int ai_dst_len + sockaddr *ai_src_addr + sockaddr *ai_dst_addr + char *ai_src_canonname + char *ai_dst_canonname + size_t ai_route_len + void *ai_route + size_t ai_connect_len + void *ai_connect + rdma_addrinfo *ai_next + +# These non rdmacm structs defined in one of rdma_cma.h's included header files + cdef struct sockaddr: + unsigned short sa_family + char sa_data[14] + + cdef struct in_addr: + uint32_t s_addr + + cdef struct sockaddr_in: + short sin_family + unsigned short sin_port + in_addr sin_addr + char sin_zero[8] + + rdma_event_channel *rdma_create_event_channel() + void rdma_destroy_event_channel(rdma_event_channel *channel) + int rdma_get_cm_event(rdma_event_channel *channel, rdma_cm_event **event) + int rdma_ack_cm_event(rdma_cm_event *event) + char *rdma_event_str(rdma_cm_event_type event) + int rdma_create_ep(rdma_cm_id **id, rdma_addrinfo *res, + ibv_pd *pd, ibv_qp_init_attr *qp_init_attr) + void rdma_destroy_ep(rdma_cm_id *id) + int rdma_create_id(rdma_event_channel *channel, rdma_cm_id **id, + void *context, rdma_port_space ps) + int rdma_destroy_id(rdma_cm_id *id) + int rdma_get_request(rdma_cm_id *listen, rdma_cm_id **id) + int rdma_bind_addr(rdma_cm_id *id, sockaddr *addr) + int rdma_resolve_addr(rdma_cm_id *id, sockaddr *src_addr, + sockaddr *dst_addr, int timeout_ms) + int rdma_resolve_route(rdma_cm_id *id, int timeout_ms) + int rdma_connect(rdma_cm_id *id, rdma_conn_param *conn_param) + int rdma_disconnect(rdma_cm_id *id) + int rdma_listen(rdma_cm_id *id, int backlog) + int rdma_accept(rdma_cm_id *id, rdma_conn_param *conn_param) + int rdma_establish(rdma_cm_id *id) + int rdma_getaddrinfo(char *node, char *service, rdma_addrinfo *hints, + rdma_addrinfo **res) + void rdma_freeaddrinfo(rdma_addrinfo *res) + int rdma_init_qp_attr(rdma_cm_id *id, ibv_qp_attr *qp_attr, + int *qp_attr_mask) + int rdma_create_qp(rdma_cm_id *id, ibv_pd *pd, + ibv_qp_init_attr *qp_init_attr) + void rdma_destroy_qp(rdma_cm_id *id) + +cdef extern from '<rdma/rdma_verbs.h>': + int rdma_post_recv(rdma_cm_id *id, void *context, void *addr, + size_t length, ibv_mr *mr) + int rdma_post_send(rdma_cm_id *id, void *context, void *addr, + size_t length, ibv_mr *mr, int flags) + int rdma_get_send_comp(rdma_cm_id *id, ibv_wc *wc) + int rdma_get_recv_comp(rdma_cm_id *id, ibv_wc *wc) + ibv_mr 
*rdma_reg_msgs(rdma_cm_id *id, void *addr, size_t length)
+    int rdma_dereg_mr(ibv_mr *mr)
diff --git a/pyverbs/librdmacm_enums.pxd b/pyverbs/librdmacm_enums.pxd
new file mode 100755
index 0000000..a47e484
--- /dev/null
+++ b/pyverbs/librdmacm_enums.pxd
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
+# Copyright (c) 2019, Mellanox Technologies. All rights reserved.
+
+#cython: language_level=3
+
+
+cdef extern from '<rdma/rdma_cma.h>':
+
+    cpdef enum rdma_cm_event_type:
+        RDMA_CM_EVENT_ADDR_RESOLVED
+        RDMA_CM_EVENT_ADDR_ERROR
+        RDMA_CM_EVENT_ROUTE_RESOLVED
+        RDMA_CM_EVENT_ROUTE_ERROR
+        RDMA_CM_EVENT_CONNECT_REQUEST
+        RDMA_CM_EVENT_CONNECT_RESPONSE
+        RDMA_CM_EVENT_CONNECT_ERROR
+        RDMA_CM_EVENT_UNREACHABLE
+        RDMA_CM_EVENT_REJECTED
+        RDMA_CM_EVENT_ESTABLISHED
+        RDMA_CM_EVENT_DISCONNECTED
+        RDMA_CM_EVENT_DEVICE_REMOVAL
+        RDMA_CM_EVENT_MULTICAST_JOIN
+        RDMA_CM_EVENT_MULTICAST_ERROR
+        RDMA_CM_EVENT_ADDR_CHANGE
+        RDMA_CM_EVENT_TIMEWAIT_EXIT
+
+    cpdef enum rdma_port_space:
+        RDMA_PS_IPOIB
+        RDMA_PS_TCP
+        RDMA_PS_UDP
+        RDMA_PS_IB
+
+    # Hint flags which control the operation.
+    cpdef enum:
+        RAI_PASSIVE
+        RAI_NUMERICHOST
+        RAI_NOROUTE
+        RAI_FAMILY
diff --git a/pyverbs/mem_alloc.pyx b/pyverbs/mem_alloc.pyx
new file mode 100644
index 0000000..3be1031
--- /dev/null
+++ b/pyverbs/mem_alloc.pyx
@@ -0,0 +1,98 @@
+# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
+# Copyright (c) 2018, Mellanox Technologies. All rights reserved. See COPYING file
+
+#cython: language_level=3
+
+from posix.stdlib cimport posix_memalign as c_posix_memalign
+from libc.stdlib cimport malloc as c_malloc, free as c_free
+from posix.mman cimport mmap as c_mmap, munmap as c_munmap
+from libc.stdint cimport uintptr_t
+cimport posix.mman as mm
+
+cdef extern from 'sys/mman.h':
+    cdef void* MAP_FAILED
+
+
+def mmap(addr=0, length=100, prot=mm.PROT_READ | mm.PROT_WRITE,
+         flags=mm.MAP_PRIVATE | mm.MAP_ANONYMOUS, fd=0, offset=0):
+    """
+    Python wrapper for the mmap system call
+    :param addr: Requested address for the mapping (0 lets the kernel choose)
+    :param length: The length of the requested memory in bytes
+    :param prot: The desired memory protection of the mapping
+    :param flags: Flags that determine the type and visibility of the mapping
+    :param fd: File descriptor of the file to map
+    :param offset: Offset in the file at which the mapping starts
+    :return: The address of the mapped memory
+    """
+    # uintptr_t is guaranteed to be large enough to hold any pointer.
+    # In order to safely cast addr to void*, it is first cast to uintptr_t.
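+    # Note: POSIX recommends fd=-1 with MAP_ANONYMOUS; Linux ignores fd for
+    # anonymous mappings, so the fd=0 default is harmless here.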
+    ptr = c_mmap(<void*><uintptr_t>addr, length, prot, flags, fd, offset)
+    if <void *>ptr == MAP_FAILED:
+        raise MemoryError('Failed to mmap memory')
+    return <uintptr_t>ptr
+
+
+def munmap(addr, length):
+    """
+    Python wrapper for the munmap system call
+    :param addr: The address of the mapped memory to unmap
+    :param length: The length of this mapped memory
+    """
+    ret = c_munmap(<void*><uintptr_t>addr, length)
+    if ret:
+        raise MemoryError('Failed to munmap requested memory')
+
+
+def malloc(size):
+    """
+    Python wrapper for the stdlib malloc function
+    :param size: The size of the memory block in bytes
+    :return: The address of the allocated memory; raises MemoryError if the
+             allocation fails
+    """
+    ptr = c_malloc(size)
+    if not ptr:
+        raise MemoryError('Failed to allocate memory')
+    return <uintptr_t>ptr
+
+
+def posix_memalign(size, alignment=8):
+    """
+    Python wrapper for the stdlib posix_memalign function
+    :param size: The size of the memory block in bytes
+    :param alignment: Alignment of the allocated memory, must be a power of
+                      two
+    :return: The address of the allocated memory, which is a multiple of
+             alignment.
+    """
+    cdef void* ptr
+    ret = c_posix_memalign(&ptr, alignment, size)
+    if ret:
+        raise MemoryError('Failed to allocate memory ({err})'.format(err=ret))
+    return <uintptr_t>ptr
+
+
+def free(ptr):
+    """
+    Python wrapper for the stdlib free function
+    :param ptr: The address of a previously allocated memory block
+    """
+    c_free(<void*><uintptr_t>ptr)
+
+
+# protection bits for mmap/mprotect
+PROT_EXEC_ = mm.PROT_EXEC
+PROT_READ_ = mm.PROT_READ
+PROT_WRITE_ = mm.PROT_WRITE
+PROT_NONE_ = mm.PROT_NONE
+
+# flag bits for mmap
+MAP_PRIVATE_ = mm.MAP_PRIVATE
+MAP_SHARED_ = mm.MAP_SHARED
+MAP_FIXED_ = mm.MAP_FIXED
+MAP_ANONYMOUS_ = mm.MAP_ANONYMOUS
+MAP_STACK_ = mm.MAP_STACK
+MAP_LOCKED_ = mm.MAP_LOCKED
+MAP_HUGETLB_ = mm.MAP_HUGETLB
+MAP_POPULATE_ = mm.MAP_POPULATE
+MAP_NORESERVE_ = mm.MAP_NORESERVE
+MAP_GROWSDOWN_ = mm.MAP_GROWSDOWN
diff --git a/pyverbs/mr.pxd b/pyverbs/mr.pxd
new file mode 100644
index 0000000..82ae79f
--- /dev/null
+++ b/pyverbs/mr.pxd
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
+# Copyright (c) 2019, Mellanox Technologies. All rights reserved. See COPYING file
+
+#cython: language_level=3
+
+from pyverbs.base cimport PyverbsCM
+from . cimport libibverbs as v
+
+
+cdef class MR(PyverbsCM):
+    cdef object pd
+    cdef v.ibv_mr *mr
+    cdef int mmap_length
+    cdef object is_huge
+    cdef object is_user_addr
+    cdef void *buf
+    cpdef read(self, length, offset)
+
+cdef class MWBindInfo(PyverbsCM):
+    cdef v.ibv_mw_bind_info info
+    cdef object mr
+
+cdef class MW(PyverbsCM):
+    cdef object pd
+    cdef v.ibv_mw *mw
+
+cdef class DMMR(MR):
+    cdef object dm
diff --git a/pyverbs/mr.pyx b/pyverbs/mr.pyx
new file mode 100644
index 0000000..b7b2196
--- /dev/null
+++ b/pyverbs/mr.pyx
@@ -0,0 +1,237 @@
+# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
+# Copyright (c) 2019, Mellanox Technologies. All rights reserved. See COPYING file
+
+import resource
+import logging
+
+from posix.mman cimport mmap, munmap, MAP_PRIVATE, PROT_READ, PROT_WRITE, \
+    MAP_ANONYMOUS, MAP_HUGETLB
+from pyverbs.pyverbs_error import PyverbsError, PyverbsRDMAError
+from pyverbs.base import PyverbsRDMAErrno
+from posix.stdlib cimport posix_memalign
+from libc.string cimport memcpy, memset
+cimport pyverbs.libibverbs_enums as e
+from libc.stdint cimport uintptr_t
+from pyverbs.device cimport DM
+from libc.stdlib cimport free
+from .pd cimport PD
+
+cdef extern from 'sys/mman.h':
+    cdef void* MAP_FAILED
+
+HUGE_PAGE_SIZE = 0x200000
+
+
+cdef class MR(PyverbsCM):
+    """
+    MR class represents ibv_mr. Buffer allocation is done in the c'tor.
+    Freeing it is done in close().
+    """
+    def __init__(self, PD pd not None, length, access, address=None):
+        """
+        Allocate a user-level buffer of length <length> and register a Memory
+        Region of the given length and access flags.
+        :param pd: A PD object
+        :param length: Length in bytes
+        :param access: Access flags, see ibv_access_flags enum
+        :param address: Memory address to register (optional). If it's not
+                        provided, memory will be allocated in the class
+                        initialization.
+        :return: The newly created MR on success
+        """
+        super().__init__()
+        if self.mr != NULL:
+            return
+        self.is_huge = True if access & e.IBV_ACCESS_HUGETLB else False
+        # We want to enable registering an MR of size 0, but this fails with
+        # a buffer of size 0, so in this case let's increase the buffer size.
+        if length == 0:
+            length = 10
+        if address:
+            self.is_user_addr = True
+            # uintptr_t is guaranteed to be large enough to hold any pointer.
+            # In order to safely cast addr to void*, it is first cast to
+            # uintptr_t.
+            self.buf = <void*><uintptr_t>address
+        else:
+            if self.is_huge:
+                # Round up to a multiple of HUGE_PAGE_SIZE
+                self.mmap_length = length + \
+                    (HUGE_PAGE_SIZE - length % HUGE_PAGE_SIZE) \
+                    if length % HUGE_PAGE_SIZE else length
+                self.buf = mmap(NULL, self.mmap_length, PROT_READ | PROT_WRITE,
+                                MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+                                -1, 0)
+                if self.buf == MAP_FAILED:
+                    raise PyverbsError('Failed to allocate MR buffer of size '
+                                       '{l}'.format(l=length))
+            else:
+                rc = posix_memalign(&self.buf, resource.getpagesize(), length)
+                if rc:
+                    raise PyverbsError('Failed to allocate MR buffer of size '
+                                       '{l}'.format(l=length))
+            memset(self.buf, 0, length)
+        self.mr = v.ibv_reg_mr(<v.ibv_pd*>pd.pd, self.buf, length, access)
+        if self.mr == NULL:
+            raise PyverbsRDMAErrno('Failed to register a MR. length: {l}, '
+                                   'access flags: {a}'.format(l=length,
+                                                              a=access))
+        self.pd = pd
+        pd.add_ref(self)
+        self.logger.debug('Registered ibv_mr. Length: {l}, access flags {a}'.
+                          format(l=length, a=access))
+
+    def __dealloc__(self):
+        self.close()
+
+    cpdef close(self):
+        """
+        Closes the underlying C object of the MR and frees the allocated
+        memory. The MR may be deleted directly or indirectly by closing its
+        context, which leaves the Python MR object without the underlying C
+        object, so during destruction we need to check whether the C object
+        exists.
+        :return: None
+        """
+        if self.mr != NULL:
+            self.logger.debug('Closing MR')
+            rc = v.ibv_dereg_mr(self.mr)
+            if rc != 0:
+                raise PyverbsRDMAError('Failed to dereg MR', rc)
+            self.mr = NULL
+            self.pd = None
+            if not self.is_user_addr:
+                if self.is_huge:
+                    munmap(self.buf, self.mmap_length)
+                else:
+                    free(self.buf)
+            self.buf = NULL
+
+    def write(self, data, length):
+        """
+        Write user data to the MR's buffer using memcpy
+        :param data: User data to write
+        :param length: Length of the data to write
+        :return: None
+        """
+        # If data is a string, cast it to bytes as Python3 doesn't
+        # automatically convert it.
+        if isinstance(data, str):
+            data = data.encode()
+        memcpy(self.buf, <char *>data, length)
+
+    cpdef read(self, length, offset):
+        """
+        Reads data from the MR's buffer
+        :param length: Length of data to read
+        :param offset: Reading offset
+        :return: The data in the buffer at the requested offset
+        """
+        cdef char *data
+        cdef int off = offset  # we can't use offset in the next line, as it
+                               # is a Python object and not C
+        data = <char*>(self.buf + off)
+        return data[:length]
+
+    @property
+    def buf(self):
+        return <uintptr_t>self.buf
+
+    @property
+    def lkey(self):
+        return self.mr.lkey
+
+    @property
+    def rkey(self):
+        return self.mr.rkey
+
+    @property
+    def length(self):
+        return self.mr.length
+
+
+cdef class MWBindInfo(PyverbsCM):
+    def __init__(self, MR mr not None, addr, length, mw_access_flags):
+        super().__init__()
+        self.mr = mr
+        self.info.mr = mr.mr
+        self.info.addr = addr
+        self.info.length = length
+        self.info.mw_access_flags = mw_access_flags
+
+
+cdef class MW(PyverbsCM):
+    def __init__(self, PD pd not None, v.ibv_mw_type mw_type):
+        """
+        Initializes a memory window object of the given type
+        :param pd: A PD object
+        :param mw_type: Type of the memory window, see ibv_mw_type enum
+        :return: None
+        """
+        super().__init__()
+        self.mw = NULL
+        self.mw = v.ibv_alloc_mw(pd.pd, mw_type)
+        if self.mw == NULL:
+            raise PyverbsRDMAErrno('Failed to allocate MW')
+        self.pd = pd
+        pd.add_ref(self)
+        self.logger.debug('Allocated memory window of type {t}'.
+                          format(t=mwtype2str(mw_type)))
+
+    def __dealloc__(self):
+        self.close()
+
+    cpdef close(self):
+        """
+        Closes the underlying C MW object.
+        The MW may be deleted directly or by deleting its PD, which leaves
+        the Python object without the underlying MW, so we need to check
+        that the underlying MW wasn't already deallocated.
+        :return: None
+        """
+        if self.mw is not NULL:
+            self.logger.debug('Closing MW')
+            rc = v.ibv_dealloc_mw(self.mw)
+            if rc != 0:
+                raise PyverbsRDMAError('Failed to dealloc MW', rc)
+            self.mw = NULL
+            self.pd = None
+
+
+cdef class DMMR(MR):
+    def __init__(self, PD pd not None, length, access, DM dm, offset):
+        """
+        Initializes a DMMR (Device Memory Memory Region) of the given length
+        and access flags using the given PD and DM objects.
+        :param pd: A PD object
+        :param length: Length in bytes
+        :param access: Access flags, see ibv_access_flags enum
+        :param dm: A DM (device memory) object to be used for this DMMR
+        :param offset: Byte offset from the beginning of the allocated device
+                       memory buffer
+        :return: The newly created DMMR
+        """
+        # Initialize the logger here as the parent's __init__ is called after
+        # the DMMR is allocated. Allocation can fail, which would lead to
+        # exceptions during the object's teardown.
+ self.logger = logging.getLogger(self.__class__.__name__) + self.mr = v.ibv_reg_dm_mr(pd.pd, dm.dm, offset, length, access) + if self.mr == NULL: + raise PyverbsRDMAErrno('Failed to register a device MR. length: {len}, access flags: {flags}'. + format(len=length, flags=access,)) + super().__init__(pd, length, access) + self.pd = pd + self.dm = dm + pd.add_ref(self) + dm.add_ref(self) + self.logger.debug('Registered device ibv_mr. Length: {len}, access flags {flags}'. + format(len=length, flags=access)) + + def write(self, data, length): + return self.dm.copy_to_dm(0, data, length) + + cpdef read(self, length, offset): + return self.dm.copy_from_dm(offset, length) + + +def mwtype2str(mw_type): + mw_types = {1:'IBV_MW_TYPE_1', 2:'IBV_MW_TYPE_2'} + try: + return mw_types[mw_type] + except KeyError: + return 'Unknown MW type ({t})'.format(t=mw_type) diff --git a/pyverbs/pd.pxd b/pyverbs/pd.pxd new file mode 100644 index 0000000..ae4324a --- /dev/null +++ b/pyverbs/pd.pxd @@ -0,0 +1,37 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019, Mellanox Technologies. All rights reserved. + +#cython: language_level=3 + +from pyverbs.base cimport PyverbsObject +from pyverbs.device cimport Context +cimport pyverbs.libibverbs as v +from .base cimport PyverbsCM + + +cdef class PD(PyverbsCM): + cdef v.ibv_pd *pd + cdef Context ctx + cdef add_ref(self, obj) + cdef object srqs + cdef object mrs + cdef object mws + cdef object ahs + cdef object qps + cdef object parent_domains + +cdef class ParentDomainInitAttr(PyverbsObject): + cdef v.ibv_parent_domain_init_attr init_attr + cdef object pd + cdef object alloc + cdef object dealloc + +cdef class ParentDomain(PD): + cdef add_ref(self, obj) + cdef object protection_domain + cdef object cqs + +cdef class ParentDomainContext(PyverbsObject): + cdef object p_alloc + cdef object p_free + cdef object pd diff --git a/pyverbs/pd.pyx b/pyverbs/pd.pyx new file mode 100755 index 0000000..9e8395f --- /dev/null +++ b/pyverbs/pd.pyx @@ -0,0 +1,209 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019, Mellanox Technologies. All rights reserved. +import weakref +import logging + +from pyverbs.pyverbs_error import PyverbsUserError, PyverbsError, \ + PyverbsRDMAError +from pyverbs.base import PyverbsRDMAErrno +from pyverbs.base cimport close_weakrefs +from pyverbs.device cimport Context +from libc.stdint cimport uintptr_t +from pyverbs.cmid cimport CMID +from .mr cimport MR, MW, DMMR +from pyverbs.srq cimport SRQ +from pyverbs.addr cimport AH +from pyverbs.cq cimport CQEX +from pyverbs.qp cimport QP + + +cdef class PD(PyverbsCM): + def __init__(self, object creator not None): + """ + Initializes a PD object. A reference for the creating Context is kept + so that Python's GC will destroy the objects in the right order. 
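+        Example (a minimal sketch; the device name is hypothetical):
+            ctx = Context(name='mlx5_0')
+            pd = PD(ctx)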
+ :param creator: The Context/CMID object creating the PD + """ + super().__init__() + if issubclass(type(creator), Context): + # Check if the ibv_pd* was initialized by an inheriting class + if self.pd == NULL: + self.pd = v.ibv_alloc_pd((<Context>creator).context) + if self.pd == NULL: + raise PyverbsRDMAErrno('Failed to allocate PD') + self.ctx = creator + elif issubclass(type(creator), CMID): + cmid = <CMID>creator + self.pd = cmid.id.pd + self.ctx = cmid.ctx + cmid.pd = self + else: + raise PyverbsUserError('Cannot create PD from {type}' + .format(type=type(creator))) + self.ctx.add_ref(self) + self.logger.debug('PD: Allocated ibv_pd') + self.srqs = weakref.WeakSet() + self.mrs = weakref.WeakSet() + self.mws = weakref.WeakSet() + self.ahs = weakref.WeakSet() + self.qps = weakref.WeakSet() + self.parent_domains = weakref.WeakSet() + + def __dealloc__(self): + """ + Closes the inner PD. + :return: None + """ + self.close() + + cpdef close(self): + """ + Closes the underlying C object of the PD. + PD may be deleted directly or indirectly by closing its context, which + leaves the Python PD object without the underlying C object, so during + destruction, need to check whether or not the C object exists. + :return: None + """ + if self.pd != NULL: + self.logger.debug('Closing PD') + close_weakrefs([self.parent_domains, self.qps, self.ahs, self.mws, + self.mrs, self.srqs]) + rc = v.ibv_dealloc_pd(self.pd) + if rc != 0: + raise PyverbsRDMAError('Failed to dealloc PD', rc) + self.pd = NULL + self.ctx = None + + cdef add_ref(self, obj): + if isinstance(obj, MR) or isinstance(obj, DMMR): + self.mrs.add(obj) + elif isinstance(obj, MW): + self.mws.add(obj) + elif isinstance(obj, AH): + self.ahs.add(obj) + elif isinstance(obj, QP): + self.qps.add(obj) + elif isinstance(obj, SRQ): + self.srqs.add(obj) + elif isinstance(obj, ParentDomain): + self.parent_domains.add(obj) + else: + raise PyverbsError('Unrecognized object type') + + +cdef void *pd_alloc(v.ibv_pd *pd, void *pd_context, size_t size, + size_t alignment, v.uint64_t resource_type): + """ + Parent Domain allocator wrapper. This function is used to wrap a + user-defined Python alloc function which should be a part of pd_context. + :param pd: Parent domain + :param pd_context: User-specific context of type ParentDomainContext + :param size: Size of the requested buffer + :param alignment: Alignment of the requested buffer + :param resource_type: Vendor-specific resource type + :return: Pointer to the allocated buffer, or NULL to designate an error. + It may also return IBV_ALLOCATOR_USE_DEFAULT asking the callee to + allocate the buffer using the default allocator. + + """ + cdef ParentDomainContext pd_ctx + pd_ctx = <object>pd_context + ptr = <uintptr_t>pd_ctx.p_alloc(pd_ctx.pd, pd_ctx, size, alignment, + resource_type) + return <void*>ptr + + +cdef void pd_free(v.ibv_pd *pd, void *pd_context, void *ptr, + v.uint64_t resource_type): + """ + Parent Domain deallocator wrapper. This function is used to wrap a + user-defined Python free function which should be part of pd_context. 
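+    A matching pair of Python callbacks may look roughly as follows (an
+    illustrative sketch built on pyverbs.mem_alloc; the names are
+    hypothetical):
+        def my_alloc(pd, pd_ctx, size, alignment, resource_type):
+            return posix_memalign(size, alignment)
+        def my_free(pd, pd_ctx, ptr, resource_type):
+            free(ptr)
+        pd_ctx = ParentDomainContext(pd, my_alloc, my_free)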
+ :param pd: Parent domain + :param pd_context: User-specific context of type ParentDomainContext + :param ptr: Pointer to the buffer to be freed + :param resource_type: Vendor-specific resource type + """ + cdef ParentDomainContext pd_ctx + pd_ctx = <object>pd_context + pd_ctx.p_free(pd_ctx.pd, pd_ctx, <uintptr_t>ptr, resource_type) + + +cdef class ParentDomainContext(PyverbsObject): + def __init__(self, PD pd, alloc_func, free_func): + """ + Initializes ParentDomainContext object which is used as a pd_context. + It contains the relevant fields in order to allow the user to write + alloc and free functions in Python + :param pd: PD object that represents the ibv_pd which is passed to the + creation of the Parent Domain + :param alloc_func: Python alloc function + :param free_func: Python free function + """ + super().__init__() + self.pd = pd + self.p_alloc = alloc_func + self.p_free = free_func + + +cdef class ParentDomainInitAttr(PyverbsObject): + def __init__(self, PD pd not None, ParentDomainContext pd_context=None): + """ + Represents ibv_parent_domain_init_attr C struct + :param pd: PD to initialize the ParentDomain with + :param pd_context: ParentDomainContext object including the alloc and + free Python callbacks + """ + super().__init__() + self.pd = pd + self.init_attr.pd = <v.ibv_pd*>pd.pd + if pd_context: + self.init_attr.alloc = pd_alloc + self.init_attr.free = pd_free + self.init_attr.pd_context = <void*>pd_context + # The only way to use Python callbacks is to pass the (Python) + # functions through pd_context. Hence, we must set PD_CONTEXT + # in the comp mask. + self.init_attr.comp_mask = v.IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT | \ + v.IBV_PARENT_DOMAIN_INIT_ATTR_ALLOCATORS + + @property + def comp_mask(self): + return self.init_attr.comp_mask + + +cdef class ParentDomain(PD): + def __init__(self, Context context not None, ParentDomainInitAttr attr not None): + """ + Initializes ParentDomain object which represents a parent domain of + ibv_pd C struct type + :param context: Device context + :param attr: Attribute of type ParentDomainInitAttr to initialize the + ParentDomain with + """ + # Initialize the logger here as the parent's __init__ is called after + # the PD is allocated. Allocation can fail, which will lead to exceptions + # thrown during object's teardown. 
+ self.logger = logging.getLogger(self.__class__.__name__) + (<PD>attr.pd).add_ref(self) + self.protection_domain = attr.pd + self.pd = v.ibv_alloc_parent_domain(context.context, &attr.init_attr) + if self.pd == NULL: + raise PyverbsRDMAErrno('Failed to allocate Parent Domain') + super().__init__(context) + self.cqs = weakref.WeakSet() + self.logger.debug('Allocated ParentDomain') + + def __dealloc__(self): + self.close() + + cpdef close(self): + if self.pd != NULL: + self.logger.debug('Closing ParentDomain') + close_weakrefs([self.cqs]) + super(ParentDomain, self).close() + + cdef add_ref(self, obj): + if isinstance(obj, CQEX): + self.cqs.add(obj) + else: + PD.add_ref(self, obj) diff --git a/pyverbs/providers/__init__.pxd b/pyverbs/providers/__init__.pxd new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/pyverbs/providers/__init__.pxd diff --git a/pyverbs/providers/__init__.py b/pyverbs/providers/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/pyverbs/providers/__init__.py diff --git a/pyverbs/providers/mlx5/CMakeLists.txt b/pyverbs/providers/mlx5/CMakeLists.txt new file mode 100644 index 0000000..d9b0849 --- /dev/null +++ b/pyverbs/providers/mlx5/CMakeLists.txt @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019, Mellanox Technologies. All rights reserved. See COPYING file + +rdma_cython_module(pyverbs/providers/mlx5 mlx5 + mlx5dv.pyx + mlx5_enums.pyx +) diff --git a/pyverbs/providers/mlx5/__init__.pxd b/pyverbs/providers/mlx5/__init__.pxd new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/pyverbs/providers/mlx5/__init__.pxd diff --git a/pyverbs/providers/mlx5/__init__.py b/pyverbs/providers/mlx5/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/pyverbs/providers/mlx5/__init__.py diff --git a/pyverbs/providers/mlx5/libmlx5.pxd b/pyverbs/providers/mlx5/libmlx5.pxd new file mode 100644 index 0000000..b346326 --- /dev/null +++ b/pyverbs/providers/mlx5/libmlx5.pxd @@ -0,0 +1,87 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. 
See COPYING file + +include 'mlx5dv_enums.pxd' + +from libc.stdint cimport uint16_t, uint32_t, uint64_t +from libcpp cimport bool + +cimport pyverbs.libibverbs as v + + +cdef extern from 'infiniband/mlx5dv.h': + + cdef struct mlx5dv_context_attr: + unsigned int flags + unsigned long comp_mask + + cdef struct mlx5dv_cqe_comp_caps: + unsigned int max_num + unsigned int supported_format + + cdef struct mlx5dv_sw_parsing_caps: + unsigned int sw_parsing_offloads + unsigned int supported_qpts + + cdef struct mlx5dv_striding_rq_caps: + unsigned int min_single_stride_log_num_of_bytes + unsigned int max_single_stride_log_num_of_bytes + unsigned int min_single_wqe_log_num_of_strides + unsigned int max_single_wqe_log_num_of_strides + unsigned int supported_qpts + + cdef struct mlx5dv_context: + unsigned char version + unsigned long flags + unsigned long comp_mask + mlx5dv_cqe_comp_caps cqe_comp_caps + mlx5dv_sw_parsing_caps sw_parsing_caps + mlx5dv_striding_rq_caps striding_rq_caps + unsigned int tunnel_offloads_caps + unsigned int max_dynamic_bfregs + unsigned long max_clock_info_update_nsec + unsigned int flow_action_flags + unsigned int dc_odp_caps + + cdef struct mlx5dv_dc_init_attr: + mlx5dv_dc_type dc_type + unsigned long dct_access_key + + cdef struct mlx5dv_qp_init_attr: + unsigned long comp_mask + unsigned int create_flags + mlx5dv_dc_init_attr dc_init_attr + unsigned long send_ops_flags + + cdef struct mlx5dv_cq_init_attr: + unsigned long comp_mask + unsigned char cqe_comp_res_format + unsigned int flags + unsigned short cqe_size + + cdef struct mlx5dv_var: + uint32_t page_id + uint32_t length + long mmap_off + uint64_t comp_mask + + cdef struct mlx5dv_pp: + uint16_t index + + bool mlx5dv_is_supported(v.ibv_device *device) + v.ibv_context* mlx5dv_open_device(v.ibv_device *device, + mlx5dv_context_attr *attr) + int mlx5dv_query_device(v.ibv_context *ctx, mlx5dv_context *attrs_out) + + v.ibv_qp *mlx5dv_create_qp(v.ibv_context *context, + v.ibv_qp_init_attr_ex *qp_attr, + mlx5dv_qp_init_attr *mlx5_qp_attr) + v.ibv_cq_ex *mlx5dv_create_cq(v.ibv_context *context, + v.ibv_cq_init_attr_ex *cq_attr, + mlx5dv_cq_init_attr *mlx5_cq_attr) + + mlx5dv_var *mlx5dv_alloc_var(v.ibv_context *context, uint32_t flags) + void mlx5dv_free_var(mlx5dv_var *dv_var) + mlx5dv_pp *mlx5dv_pp_alloc(v.ibv_context *context, size_t pp_context_sz, + const void *pp_context, uint32_t flags) + void mlx5dv_pp_free(mlx5dv_pp *pp) diff --git a/pyverbs/providers/mlx5/mlx5_enums.pyx b/pyverbs/providers/mlx5/mlx5_enums.pyx new file mode 120000 index 0000000..ba0e916 --- /dev/null +++ b/pyverbs/providers/mlx5/mlx5_enums.pyx @@ -0,0 +1 @@ +mlx5dv_enums.pxd \ No newline at end of file diff --git a/pyverbs/providers/mlx5/mlx5dv.pxd b/pyverbs/providers/mlx5/mlx5dv.pxd new file mode 100644 index 0000000..23af002 --- /dev/null +++ b/pyverbs/providers/mlx5/mlx5dv.pxd @@ -0,0 +1,45 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. 
See COPYING file + +#cython: language_level=3 + +cimport pyverbs.providers.mlx5.libmlx5 as dv +from pyverbs.device cimport Context, VAR +from pyverbs.base cimport PyverbsObject +from pyverbs.cq cimport CQEX +from pyverbs.qp cimport QP + + +cdef class Mlx5Context(Context): + cdef object pps + cpdef close(self) + +cdef class Mlx5DVContextAttr(PyverbsObject): + cdef dv.mlx5dv_context_attr attr + +cdef class Mlx5DVContext(PyverbsObject): + cdef dv.mlx5dv_context dv + +cdef class Mlx5DVDCInitAttr(PyverbsObject): + cdef dv.mlx5dv_dc_init_attr attr + +cdef class Mlx5DVQPInitAttr(PyverbsObject): + cdef dv.mlx5dv_qp_init_attr attr + +cdef class Mlx5QP(QP): + cdef object dc_type + +cdef class Mlx5DVCQInitAttr(PyverbsObject): + cdef dv.mlx5dv_cq_init_attr attr + +cdef class Mlx5CQ(CQEX): + pass + +cdef class Mlx5VAR(VAR): + cdef dv.mlx5dv_var *var + cpdef close(self) + +cdef class Mlx5PP(PyverbsObject): + cdef dv.mlx5dv_pp *pp + cdef object context + cpdef close(self) diff --git a/pyverbs/providers/mlx5/mlx5dv.pyx b/pyverbs/providers/mlx5/mlx5dv.pyx new file mode 100644 index 0000000..7ea6fbb --- /dev/null +++ b/pyverbs/providers/mlx5/mlx5dv.pyx @@ -0,0 +1,647 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. See COPYING file + +import logging + +from pyverbs.pyverbs_error import PyverbsUserError +cimport pyverbs.providers.mlx5.mlx5dv_enums as dve +cimport pyverbs.providers.mlx5.libmlx5 as dv +from pyverbs.base import PyverbsRDMAErrno +from pyverbs.base cimport close_weakrefs +cimport pyverbs.libibverbs_enums as e +from pyverbs.qp cimport QPInitAttrEx +from pyverbs.cq cimport CqInitAttrEx +cimport pyverbs.libibverbs as v +from pyverbs.pd cimport PD +import weakref + + +cdef class Mlx5DVContextAttr(PyverbsObject): + """ + Represent mlx5dv_context_attr struct. This class is used to open an mlx5 + device. + """ + def __init__(self, flags=0, comp_mask=0): + super().__init__() + self.attr.flags = flags + self.attr.comp_mask = comp_mask + + def __str__(self): + print_format = '{:20}: {:<20}\n' + return print_format.format('flags', self.attr.flags) +\ + print_format.format('comp_mask', self.attr.comp_mask) + + @property + def flags(self): + return self.attr.flags + @flags.setter + def flags(self, val): + self.attr.flags = val + + @property + def comp_mask(self): + return self.attr.comp_mask + @comp_mask.setter + def comp_mask(self, val): + self.attr.comp_mask = val + + +cdef class Mlx5Context(Context): + """ + Represent mlx5 context, which extends Context. + """ + def __init__(self, Mlx5DVContextAttr attr not None, name=''): + """ + Open an mlx5 device using the given attributes + :param name: The RDMA device's name (used by parent class) + :param attr: mlx5-specific device attributes + :return: None + """ + super().__init__(name=name, attr=attr) + if not dv.mlx5dv_is_supported(self.device): + raise PyverbsUserError('This is not an MLX5 device') + self.pps = weakref.WeakSet() + self.context = dv.mlx5dv_open_device(self.device, &attr.attr) + if self.context == NULL: + raise PyverbsRDMAErrno('Failed to open mlx5 context on {dev}' + .format(dev=self.name)) + + def query_mlx5_device(self, comp_mask=-1): + """ + Queries the provider for device-specific attributes. + :param comp_mask: Which attributes to query. Default value is -1. If + not changed by user, pyverbs will pass a bitwise OR + of all available enum entries. + :return: A Mlx5DVContext containing the attributes. 
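+        Illustrative usage (an editorial sketch, not from the upstream
+        docs; assumes a present mlx5 device named 'mlx5_0'):
+            ctx = Mlx5Context(Mlx5DVContextAttr(), name='mlx5_0')
+            dv_attr = ctx.query_mlx5_device()
+            print(dv_attr)  # dumps CQE compression, SWP, striding RQ caps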
+ """ + dv_attr = Mlx5DVContext() + if comp_mask == -1: + dv_attr.comp_mask = \ + dve.MLX5DV_CONTEXT_MASK_CQE_COMPRESION |\ + dve.MLX5DV_CONTEXT_MASK_SWP |\ + dve.MLX5DV_CONTEXT_MASK_STRIDING_RQ |\ + dve.MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS |\ + dve.MLX5DV_CONTEXT_MASK_DYN_BFREGS |\ + dve.MLX5DV_CONTEXT_MASK_CLOCK_INFO_UPDATE |\ + dve.MLX5DV_CONTEXT_MASK_FLOW_ACTION_FLAGS + else: + dv_attr.comp_mask = comp_mask + rc = dv.mlx5dv_query_device(self.context, &dv_attr.dv) + if rc != 0: + raise PyverbsRDMAErrno('Failed to query mlx5 device {name}, got {rc}'. + format(name=self.name, rc=rc)) + return dv_attr + + cdef add_ref(self, obj): + if isinstance(obj, Mlx5PP): + self.pps.add(obj) + else: + super().add_ref(obj) + + def __dealloc__(self): + self.close() + + cpdef close(self): + if self.context != NULL: + close_weakrefs([self.pps]) + super(Mlx5Context, self).close() + + +cdef class Mlx5DVContext(PyverbsObject): + """ + Represents mlx5dv_context struct, which exposes mlx5-specific capabilities, + reported by mlx5dv_query_device. + """ + @property + def version(self): + return self.dv.version + + @property + def flags(self): + return self.dv.flags + + @property + def comp_mask(self): + return self.dv.comp_mask + @comp_mask.setter + def comp_mask(self, val): + self.dv.comp_mask = val + + @property + def cqe_comp_caps(self): + return self.dv.cqe_comp_caps + + @property + def sw_parsing_caps(self): + return self.dv.sw_parsing_caps + + @property + def striding_rq_caps(self): + return self.dv.striding_rq_caps + + @property + def tunnel_offload_caps(self): + return self.dv.tunnel_offloads_caps + + @property + def max_dynamic_bfregs(self): + return self.dv.max_dynamic_bfregs + + @property + def max_clock_info_update_nsec(self): + return self.dv.max_clock_info_update_nsec + + @property + def flow_action_flags(self): + return self.dv.flow_action_flags + + @property + def dc_odp_caps(self): + return self.dv.dc_odp_caps + + def __str__(self): + print_format = '{:20}: {:<20}\n' + ident_format = ' {:20}: {:<20}\n' + cqe = 'CQE compression caps:\n' +\ + ident_format.format('max num', + self.dv.cqe_comp_caps.max_num) +\ + ident_format.format('supported formats', + cqe_comp_to_str(self.dv.cqe_comp_caps.supported_format)) + swp = 'SW parsing caps:\n' +\ + ident_format.format('SW parsing offloads', + swp_to_str(self.dv.sw_parsing_caps.sw_parsing_offloads)) +\ + ident_format.format('supported QP types', + qpts_to_str(self.dv.sw_parsing_caps.supported_qpts)) + strd = 'Striding RQ caps:\n' +\ + ident_format.format('min single stride log num of bytes', + self.dv.striding_rq_caps.min_single_stride_log_num_of_bytes) +\ + ident_format.format('max single stride log num of bytes', + self.dv.striding_rq_caps.max_single_stride_log_num_of_bytes) +\ + ident_format.format('min single wqe log num of strides', + self.dv.striding_rq_caps.min_single_wqe_log_num_of_strides) +\ + ident_format.format('max single wqe log num of strides', + self.dv.striding_rq_caps.max_single_wqe_log_num_of_strides) +\ + ident_format.format('supported QP types', + qpts_to_str(self.dv.striding_rq_caps.supported_qpts)) + return print_format.format('Version', self.dv.version) +\ + print_format.format('Flags', + context_flags_to_str(self.dv.flags)) +\ + print_format.format('comp mask', + context_comp_mask_to_str(self.dv.comp_mask)) +\ + cqe + swp + strd +\ + print_format.format('Tunnel offloads caps', + tunnel_offloads_to_str(self.dv.tunnel_offloads_caps)) +\ + print_format.format('Max dynamic BF registers', + self.dv.max_dynamic_bfregs) +\ + 
print_format.format('Max clock info update [nsec]', + self.dv.max_clock_info_update_nsec) +\ + print_format.format('Flow action flags', + self.dv.flow_action_flags) +\ + print_format.format('DC ODP caps', self.dv.dc_odp_caps) + + +cdef class Mlx5DVDCInitAttr(PyverbsObject): + """ + Represents mlx5dv_dc_init_attr struct, which defines initial attributes + for DC QP creation. + """ + def __init__(self, dc_type=dve.MLX5DV_DCTYPE_DCI, dct_access_key=0): + """ + Initializes an Mlx5DVDCInitAttr object with the given DC type and DCT + access key. + :param dc_type: Which DC QP to create (DCI/DCT). + :param dct_access_key: Access key to be used by the DCT + :return: An initialized object + """ + super().__init__() + self.attr.dc_type = dc_type + self.attr.dct_access_key = dct_access_key + + def __str__(self): + print_format = '{:20}: {:<20}\n' + return print_format.format('DC type', dc_type_to_str(self.attr.dc_type)) +\ + print_format.format('DCT access key', self.attr.dct_access_key) + + @property + def dc_type(self): + return self.attr.dc_type + @dc_type.setter + def dc_type(self, val): + self.attr.dc_type = val + + @property + def dct_access_key(self): + return self.attr.dct_access_key + @dct_access_key.setter + def dct_access_key(self, val): + self.attr.dct_access_key = val + + +cdef class Mlx5DVQPInitAttr(PyverbsObject): + """ + Represents mlx5dv_qp_init_attr struct, initial attributes used for mlx5 QP + creation. + """ + def __init__(self, comp_mask=0, create_flags=0, + Mlx5DVDCInitAttr dc_init_attr=None, send_ops_flags=0): + """ + Initializes an Mlx5DVQPInitAttr object with the given user data. + :param comp_mask: A bitmask specifying which fields are valid + :param create_flags: A bitwise OR of mlx5dv_qp_create_flags + :param dc_init_attr: Mlx5DVDCInitAttr object + :param send_ops_flags: A bitwise OR of mlx5dv_qp_create_send_ops_flags + :return: An initialized Mlx5DVQPInitAttr object + """ + super().__init__() + self.attr.comp_mask = comp_mask + self.attr.create_flags = create_flags + self.attr.send_ops_flags = send_ops_flags + if dc_init_attr is not None: + self.attr.dc_init_attr.dc_type = dc_init_attr.dc_type + self.attr.dc_init_attr.dct_access_key = dc_init_attr.dct_access_key + + def __str__(self): + print_format = '{:20}: {:<20}\n' + return print_format.format('Comp mask', + qp_comp_mask_to_str(self.attr.comp_mask)) +\ + print_format.format('Create flags', + qp_create_flags_to_str(self.attr.create_flags)) +\ + 'DC init attr:\n' +\ + print_format.format(' DC type', + dc_type_to_str(self.attr.dc_init_attr.dc_type)) +\ + print_format.format(' DCT access key', + self.attr.dc_init_attr.dct_access_key) +\ + print_format.format('Send ops flags', + send_ops_flags_to_str(self.attr.send_ops_flags)) + + @property + def comp_mask(self): + return self.attr.comp_mask + @comp_mask.setter + def comp_mask(self, val): + self.attr.comp_mask = val + + @property + def create_flags(self): + return self.attr.create_flags + @create_flags.setter + def create_flags(self, val): + self.attr.create_flags = val + + @property + def send_ops_flags(self): + return self.attr.send_ops_flags + @send_ops_flags.setter + def send_ops_flags(self, val): + self.attr.send_ops_flags = val + + @property + def dc_type(self): + return self.attr.dc_init_attr.dc_type + @dc_type.setter + def dc_type(self, val): + self.attr.dc_init_attr.dc_type = val + + @property + def dct_access_key(self): + return self.attr.dc_init_attr.dct_access_key + @dct_access_key.setter + def dct_access_key(self, val): + self.attr.dc_init_attr.dct_access_key =
val + + +cdef class Mlx5QP(QP): + def __init__(self, Mlx5Context context, QPInitAttrEx init_attr, + Mlx5DVQPInitAttr dv_init_attr): + """ + Initializes an mlx5 QP according to the user-provided data. + :param context: mlx5 Context object + :param init_attr: QPInitAttrEx object + :param dv_init_attr: Mlx5DVQPInitAttr object + :return: An initialized Mlx5QP + """ + cdef PD pd + + # Initialize the logger here as the parent's __init__ is called after + # the QP is allocated. Allocation can fail, which will lead to exceptions + # thrown during object's teardown. + self.logger = logging.getLogger(self.__class__.__name__) + self.dc_type = dv_init_attr.dc_type if dv_init_attr else 0 + if init_attr.pd is not None: + pd = <PD>init_attr.pd + pd.add_ref(self) + self.qp = \ + dv.mlx5dv_create_qp(context.context, + &init_attr.attr, + &dv_init_attr.attr if dv_init_attr is not None + else NULL) + if self.qp == NULL: + raise PyverbsRDMAErrno('Failed to create MLX5 QP.\nQPInitAttrEx ' + 'attributes:\n{}\nMLX5DVQPInitAttr:\n{}'. + format(init_attr, dv_init_attr)) + super().__init__(context, init_attr) + + def _get_comp_mask(self, dst): + masks = {dve.MLX5DV_DCTYPE_DCT: {'INIT': e.IBV_QP_PKEY_INDEX | + e.IBV_QP_PORT | e.IBV_QP_ACCESS_FLAGS, + 'RTR': e.IBV_QP_AV |\ + e.IBV_QP_PATH_MTU |\ + e.IBV_QP_MIN_RNR_TIMER}, + dve.MLX5DV_DCTYPE_DCI: {'INIT': e.IBV_QP_PKEY_INDEX |\ + e.IBV_QP_PORT, + 'RTR': e.IBV_QP_PATH_MTU, + 'RTS': e.IBV_QP_TIMEOUT |\ + e.IBV_QP_RETRY_CNT |\ + e.IBV_QP_RNR_RETRY | e.IBV_QP_SQ_PSN |\ + e.IBV_QP_MAX_QP_RD_ATOMIC}} + if self.dc_type == 0: + return super()._get_comp_mask(dst) + return masks[self.dc_type][dst] | e.IBV_QP_STATE + + +cdef class Mlx5DVCQInitAttr(PyverbsObject): + """ + Represents mlx5dv_cq_init_attr struct, initial attributes used for mlx5 CQ + creation. + """ + def __init__(self, comp_mask=0, cqe_comp_res_format=0, flags=0, cqe_size=0): + """ + Initializes an Mlx5CQInitAttr object with zeroes as default values. + :param comp_mask: Marks which of the following fields should be + considered. Use mlx5dv_cq_init_attr_mask enum. + :param cqe_comp_res_format: The various CQE response formats of the + responder side. Use + mlx5dv_cqe_comp_res_format enum. + :param flags: A bitwise OR of the various values described in + mlx5dv_cq_init_attr_flags. + :param cqe_size: Configure the CQE size to be 64 or 128 bytes, other + values will cause the CQ creation process to fail. + Valid when MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE is set. 
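+                         (An illustrative note, not from the upstream docs:
+                         passing comp_mask=dve.MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE
+                         together with cqe_size=128 requests 128B CQEs.)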
+ :return: None + """ + super().__init__() + self.attr.comp_mask = comp_mask + self.attr.cqe_comp_res_format = cqe_comp_res_format + self.attr.flags = flags + self.attr.cqe_size = cqe_size + + @property + def comp_mask(self): + return self.attr.comp_mask + @comp_mask.setter + def comp_mask(self, val): + self.attr.comp_mask = val + + @property + def cqe_comp_res_format(self): + return self.attr.cqe_comp_res_format + @cqe_comp_res_format.setter + def cqe_comp_res_format(self, val): + self.attr.cqe_comp_res_format = val + + @property + def flags(self): + return self.attr.flags + @flags.setter + def flags(self, val): + self.attr.flags = val + + @property + def cqe_size(self): + return self.attr.cqe_size + @cqe_size.setter + def cqe_size(self, val): + self.attr.cqe_size = val + + def __str__(self): + print_format = '{:22}: {:<20}\n' + flags = {dve.MLX5DV_CQ_INIT_ATTR_FLAGS_CQE_PAD: + "MLX5DV_CQ_INIT_ATTR_FLAGS_CQE_PAD"} + mask = {dve.MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE: + "MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE", + dve.MLX5DV_CQ_INIT_ATTR_MASK_FLAGS: + "MLX5DV_CQ_INIT_ATTR_MASK_FLAGS", + dve.MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE: + "MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE"} + fmt = {dve.MLX5DV_CQE_RES_FORMAT_HASH: "MLX5DV_CQE_RES_FORMAT_HASH", + dve.MLX5DV_CQE_RES_FORMAT_CSUM: "MLX5DV_CQE_RES_FORMAT_CSUM", + dve.MLX5DV_CQE_RES_FORMAT_CSUM_STRIDX: + "MLX5DV_CQE_RES_FORMAT_CSUM_STRIDX"} + + return 'Mlx5DVCQInitAttr:\n' +\ + print_format.format('comp_mask', bitmask_to_str(self.comp_mask, + mask)) +\ + print_format.format('CQE compression format', + bitmask_to_str(self.cqe_comp_res_format, + fmt)) +\ + print_format.format('flags', bitmask_to_str(self.flags, + flags)) + \ + print_format.format('CQE size', self.cqe_size) + + +cdef class Mlx5CQ(CQEX): + def __init__(self, Mlx5Context context, CqInitAttrEx init_attr, + Mlx5DVCQInitAttr dv_init_attr): + # Initialize the logger here as the parent's __init__ is called after + # the CQ is allocated. Allocation can fail, which will lead to exceptions + # thrown during object's teardown. + self.logger = logging.getLogger(self.__class__.__name__) + self.cq = \ + dv.mlx5dv_create_cq(context.context, &init_attr.attr, + &dv_init_attr.attr if dv_init_attr is not None + else NULL) + if self.cq == NULL: + raise PyverbsRDMAErrno('Failed to create MLX5 CQ.\nCQInitAttrEx:\n' + '{}\nMLX5DVCQInitAttr:\n{}'.
+ format(init_attr, dv_init_attr)) + self.ibv_cq = v.ibv_cq_ex_to_cq(self.cq) + self.context = context + context.add_ref(self) + super().__init__(context, init_attr) + + def __str__(self): + print_format = '{:<22}: {:<20}\n' + return 'Mlx5 CQ:\n' +\ + print_format.format('Handle', self.cq.handle) +\ + print_format.format('CQEs', self.cq.cqe) + + +def qpts_to_str(qp_types): + numeric_types = qp_types + qpts_str = '' + qpts = {e.IBV_QPT_RC: 'RC', e.IBV_QPT_UC: 'UC', e.IBV_QPT_UD: 'UD', + e.IBV_QPT_RAW_PACKET: 'Raw Packet', e.IBV_QPT_XRC_SEND: 'XRC Send', + e.IBV_QPT_XRC_RECV: 'XRC Recv', e.IBV_QPT_DRIVER: 'Driver QPT'} + for t in qpts.keys(): + if (1 << t) & qp_types: + qpts_str += qpts[t] + ', ' + qp_types -= t + if qp_types == 0: + break + return qpts_str[:-2] + ' ({})'.format(numeric_types) + + +def bitmask_to_str(bits, values): + numeric_bits = bits + res = '' + for t in values.keys(): + if t & bits: + res += values[t] + ', ' + bits -= t + if bits == 0: + break + return res[:-2] + ' ({})'.format(numeric_bits) # Remove last comma and space + + +def context_comp_mask_to_str(mask): + l = {dve.MLX5DV_CONTEXT_MASK_CQE_COMPRESION: 'CQE compression', + dve.MLX5DV_CONTEXT_MASK_SWP: 'SW parsing', + dve.MLX5DV_CONTEXT_MASK_STRIDING_RQ: 'Striding RQ', + dve.MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS: 'Tunnel offloads', + dve.MLX5DV_CONTEXT_MASK_DYN_BFREGS: 'Dynamic BF regs', + dve.MLX5DV_CONTEXT_MASK_CLOCK_INFO_UPDATE: 'Clock info update', + dve.MLX5DV_CONTEXT_MASK_FLOW_ACTION_FLAGS: 'Flow action flags'} + return bitmask_to_str(mask, l) + + +def context_flags_to_str(flags): + l = {dve.MLX5DV_CONTEXT_FLAGS_CQE_V1: 'CQE v1', + dve.MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED: 'Multi packet WQE allowed', + dve.MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW: 'Enhanced multi packet WQE', + dve.MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP: 'Support CQE 128B compression', + dve.MLX5DV_CONTEXT_FLAGS_CQE_128B_PAD: 'Support CQE 128B padding', + dve.MLX5DV_CONTEXT_FLAGS_PACKET_BASED_CREDIT_MODE: + 'Support packet based credit mode (in RC QP)'} + return bitmask_to_str(flags, l) + + +def swp_to_str(swps): + l = {dve.MLX5DV_SW_PARSING: 'SW Parsing', + dve.MLX5DV_SW_PARSING_CSUM: 'SW Parsing CSUM', + dve.MLX5DV_SW_PARSING_LSO: 'SW Parsing LSO'} + return bitmask_to_str(swps, l) + + +def cqe_comp_to_str(cqe): + l = {dve.MLX5DV_CQE_RES_FORMAT_HASH: 'with hash', + dve.MLX5DV_CQE_RES_FORMAT_CSUM: 'with RX checksum CSUM', + dve.MLX5DV_CQE_RES_FORMAT_CSUM_STRIDX: 'with stride index'} + return bitmask_to_str(cqe, l) + + +def tunnel_offloads_to_str(tun): + l = {dve.MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN: 'VXLAN', + dve.MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE: 'GRE', + dve.MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GENEVE: 'Geneve', + dve.MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE:\ + 'Ctrl word + MPLS over GRE', + dve.MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP:\ + 'Ctrl word + MPLS over UDP'} + return bitmask_to_str(tun, l) + + +def dc_type_to_str(dctype): + l = {dve.MLX5DV_DCTYPE_DCT: 'DCT', dve.MLX5DV_DCTYPE_DCI: 'DCI'} + try: + return l[dctype] + except KeyError: + return 'Unknown DC type ({dc})'.format(dc=dctype) + + +def qp_comp_mask_to_str(flags): + l = {dve.MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS: 'Create flags', + dve.MLX5DV_QP_INIT_ATTR_MASK_DC: 'DC', + dve.MLX5DV_QP_INIT_ATTR_MASK_SEND_OPS_FLAGS: 'Send ops flags'} + return bitmask_to_str(flags, l) + + +def qp_create_flags_to_str(flags): + l = {dve.MLX5DV_QP_CREATE_TUNNEL_OFFLOADS: 'Tunnel offloads', + dve.MLX5DV_QP_CREATE_TIR_ALLOW_SELF_LOOPBACK_UC: + 'Allow UC self 
loopback', + dve.MLX5DV_QP_CREATE_TIR_ALLOW_SELF_LOOPBACK_MC: + 'Allow MC self loopback', + dve.MLX5DV_QP_CREATE_DISABLE_SCATTER_TO_CQE: 'Disable scatter to CQE', + dve.MLX5DV_QP_CREATE_ALLOW_SCATTER_TO_CQE: 'Allow scatter to CQE', + dve.MLX5DV_QP_CREATE_PACKET_BASED_CREDIT_MODE: + 'Packet based credit mode'} + return bitmask_to_str(flags, l) + + +def send_ops_flags_to_str(flags): + l = {dve.MLX5DV_QP_EX_WITH_MR_INTERLEAVED: 'With MR interleaved', + dve.MLX5DV_QP_EX_WITH_MR_LIST: 'With MR list'} + return bitmask_to_str(flags, l) + + +cdef class Mlx5VAR(VAR): + def __init__(self, Context context not None, flags=0): + self.var = dv.mlx5dv_alloc_var(context.context, flags) + if self.var == NULL: + raise PyverbsRDMAErrno('Failed to allocate VAR') + context.add_ref(self) + + def __dealloc__(self): + self.close() + + cpdef close(self): + if self.var != NULL: + dv.mlx5dv_free_var(self.var) + self.var = NULL + + def __str__(self): + print_format = '{:20}: {:<20}\n' + return print_format.format('page id', self.var.page_id) +\ + print_format.format('length', self.var.length) +\ + print_format.format('mmap offset', self.var.mmap_off) +\ + print_format.format('compatibility mask', self.var.comp_mask) + + @property + def page_id(self): + return self.var.page_id + + @property + def length(self): + return self.var.length + + @property + def mmap_off(self): + return self.var.mmap_off + + @property + def comp_mask(self): + return self.var.comp_mask + + +cdef class Mlx5PP(PyverbsObject): + """ + Represents mlx5dv_pp, packet pacing struct. + """ + def __init__(self, Context context not None, pp_context, flags=0): + """ + Initializes a Mlx5PP object. + :param context: DevX context + :param pp_context: Bytes of packet pacing context according to the + device specs. Must be bytes type or implements + __bytes__ method + :param flags: Packet pacing allocation flags + """ + self.context = context + pp_ctx_bytes = bytes(pp_context) + self.pp = dv.mlx5dv_pp_alloc(context.context, len(pp_ctx_bytes), + <char*>pp_ctx_bytes, flags) + if self.pp == NULL: + raise PyverbsRDMAErrno('Failed to allocate packet pacing entry') + (<Mlx5Context>context).add_ref(self) + + def __dealloc__(self): + self.close() + + cpdef close(self): + if self.pp != NULL: + dv.mlx5dv_pp_free(self.pp) + self.pp = NULL + + @property + def index(self): + return self.pp.index diff --git a/pyverbs/providers/mlx5/mlx5dv_enums.pxd b/pyverbs/providers/mlx5/mlx5dv_enums.pxd new file mode 100644 index 0000000..2c12ddb --- /dev/null +++ b/pyverbs/providers/mlx5/mlx5dv_enums.pxd @@ -0,0 +1,93 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. 
See COPYING file + +#cython: language_level=3 + +cdef extern from 'infiniband/mlx5dv.h': + + cpdef enum mlx5dv_context_attr_flags: + MLX5DV_CONTEXT_FLAGS_DEVX + + cpdef enum mlx5dv_context_comp_mask: + MLX5DV_CONTEXT_MASK_CQE_COMPRESION = 1 << 0 + MLX5DV_CONTEXT_MASK_SWP = 1 << 1 + MLX5DV_CONTEXT_MASK_STRIDING_RQ = 1 << 2 + MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS = 1 << 3 + MLX5DV_CONTEXT_MASK_DYN_BFREGS = 1 << 4 + MLX5DV_CONTEXT_MASK_CLOCK_INFO_UPDATE = 1 << 5 + MLX5DV_CONTEXT_MASK_FLOW_ACTION_FLAGS = 1 << 6 + + cpdef enum mlx5dv_context_flags: + MLX5DV_CONTEXT_FLAGS_CQE_V1 = 1 << 0 + MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED = 1 << 2 + MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW = 1 << 3 + MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP = 1 << 4 + MLX5DV_CONTEXT_FLAGS_CQE_128B_PAD = 1 << 5 + MLX5DV_CONTEXT_FLAGS_PACKET_BASED_CREDIT_MODE = 1 << 6 + + cpdef enum mlx5dv_sw_parsing_offloads: + MLX5DV_SW_PARSING = 1 << 0 + MLX5DV_SW_PARSING_CSUM = 1 << 1 + MLX5DV_SW_PARSING_LSO = 1 << 2 + + cpdef enum mlx5dv_cqe_comp_res_format: + MLX5DV_CQE_RES_FORMAT_HASH = 1 << 0 + MLX5DV_CQE_RES_FORMAT_CSUM = 1 << 1 + MLX5DV_CQE_RES_FORMAT_CSUM_STRIDX = 1 << 2 + + cpdef enum mlx5dv_tunnel_offloads: + MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN = 1 << 0 + MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE = 1 << 1 + MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GENEVE = 1 << 2 + MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE = 1 << 3 + MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP = 1 << 4 + + cpdef enum mlx5dv_flow_action_cap_flags: + MLX5DV_FLOW_ACTION_FLAGS_ESP_AES_GCM = 1 << 0 + MLX5DV_FLOW_ACTION_FLAGS_ESP_AES_GCM_REQ_METADATA = 1 << 1 + MLX5DV_FLOW_ACTION_FLAGS_ESP_AES_GCM_SPI_STEERING = 1 << 2 + MLX5DV_FLOW_ACTION_FLAGS_ESP_AES_GCM_FULL_OFFLOAD = 1 << 3 + MLX5DV_FLOW_ACTION_FLAGS_ESP_AES_GCM_TX_IV_IS_ESN = 1 << 4 + + cpdef enum mlx5dv_qp_init_attr_mask: + MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS = 1 << 0 + MLX5DV_QP_INIT_ATTR_MASK_DC = 1 << 1 + MLX5DV_QP_INIT_ATTR_MASK_SEND_OPS_FLAGS = 1 << 2 + + cpdef enum mlx5dv_qp_create_flags: + MLX5DV_QP_CREATE_TUNNEL_OFFLOADS = 1 << 0 + MLX5DV_QP_CREATE_TIR_ALLOW_SELF_LOOPBACK_UC = 1 << 1 + MLX5DV_QP_CREATE_TIR_ALLOW_SELF_LOOPBACK_MC = 1 << 2 + MLX5DV_QP_CREATE_DISABLE_SCATTER_TO_CQE = 1 << 3 + MLX5DV_QP_CREATE_ALLOW_SCATTER_TO_CQE = 1 << 4 + MLX5DV_QP_CREATE_PACKET_BASED_CREDIT_MODE = 1 << 5 + + cpdef enum mlx5dv_dc_type: + MLX5DV_DCTYPE_DCT = 1 + MLX5DV_DCTYPE_DCI = 2 + + cpdef enum mlx5dv_qp_create_send_ops_flags: + MLX5DV_QP_EX_WITH_MR_INTERLEAVED = 1 << 0 + MLX5DV_QP_EX_WITH_MR_LIST = 1 << 1 + + cpdef enum mlx5dv_cq_init_attr_mask: + MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE = 1 << 0 + MLX5DV_CQ_INIT_ATTR_MASK_FLAGS = 1 << 1 + MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE = 1 << 2 + + cpdef enum mlx5dv_cq_init_attr_flags: + MLX5DV_CQ_INIT_ATTR_FLAGS_CQE_PAD = 1 << 0 + MLX5DV_CQ_INIT_ATTR_FLAGS_RESERVED = 1 << 1 + + cpdef unsigned long long MLX5DV_RES_TYPE_QP + cpdef unsigned long long MLX5DV_RES_TYPE_RWQ + cpdef unsigned long long MLX5DV_RES_TYPE_DBR + cpdef unsigned long long MLX5DV_RES_TYPE_SRQ + cpdef unsigned long long MLX5DV_PP_ALLOC_FLAGS_DEDICATED_INDEX + + +_MLX5DV_RES_TYPE_QP = MLX5DV_RES_TYPE_QP +_MLX5DV_RES_TYPE_RWQ = MLX5DV_RES_TYPE_RWQ +_MLX5DV_RES_TYPE_DBR = MLX5DV_RES_TYPE_DBR +_MLX5DV_RES_TYPE_SRQ = MLX5DV_RES_TYPE_SRQ +_MLX5DV_PP_ALLOC_FLAGS_DEDICATED_INDEX = MLX5DV_PP_ALLOC_FLAGS_DEDICATED_INDEX diff --git a/pyverbs/pyverbs_error.py b/pyverbs/pyverbs_error.py new file mode 100644 index 0000000..3f096f7 --- /dev/null +++ b/pyverbs/pyverbs_error.py @@ -0,0 +1,47 @@ +# 
SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2018, Mellanox Technologies. All rights reserved. +import os + + +class PyverbsError(Exception): + """ + Base exception class for Pyverbs. Inherited by PyverbsRDMAError (for errors + returned by rdma-core) and PyverbsUserError (for user-related errors + found by Pyverbs, e.g. non-existing device name). + """ + def __init__(self, msg, error_code = -1): + """ + Initializes a PyverbsError instance + :param msg: The exception's message + :param error_code: errno value + """ + if error_code != -1: + msg = '{msg}. Errno: {err}, {err_str}'.\ + format(msg=msg, err=error_code, err_str=os.strerror(error_code)) + super(PyverbsError, self).__init__(msg) + +class PyverbsRDMAError(PyverbsError): + """ + This exception is raised when an rdma-core function returns an error. + """ + def __init__(self, msg, error_code = -1): + super(PyverbsRDMAError, self).__init__(msg, error_code) + self._error_code = error_code + + @property + def error_code(self): + return self._error_code + + +class PyverbsUserError(PyverbsError): + """ + This exception is raised when Pyverbs encounters an error resulting from + user's action or input. + """ + def __init__(self, msg): + """ + Initializes a PyverbsUserError instance + :param msg: The exception's message + """ + super(PyverbsUserError, self).__init__(msg) + diff --git a/pyverbs/qp.pxd b/pyverbs/qp.pxd new file mode 100644 index 0000000..209a243 --- /dev/null +++ b/pyverbs/qp.pxd @@ -0,0 +1,45 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. + +#cython: language_level=3 + +from pyverbs.base cimport PyverbsObject, PyverbsCM +cimport pyverbs.libibverbs as v + +cdef class QPCap(PyverbsObject): + cdef v.ibv_qp_cap cap + +cdef class QPInitAttr(PyverbsObject): + cdef v.ibv_qp_init_attr attr + cdef object scq + cdef object rcq + cdef object srq + +cdef class QPInitAttrEx(PyverbsObject): + cdef v.ibv_qp_init_attr_ex attr + cdef object scq + cdef object rcq + cdef object _pd + cdef object xrcd + cdef object srq + +cdef class QPAttr(PyverbsObject): + cdef v.ibv_qp_attr attr + +cdef class QP(PyverbsCM): + cdef v.ibv_qp *qp + cdef int type + cdef int state + cdef object pd + cdef object context + cdef object xrcd + cpdef close(self) + cdef update_cqs(self, init_attr) + cdef object scq + cdef object rcq + +cdef class DataBuffer(PyverbsCM): + cdef v.ibv_data_buf data + +cdef class QPEx(QP): + cdef v.ibv_qp_ex *qp_ex diff --git a/pyverbs/qp.pyx b/pyverbs/qp.pyx new file mode 100755 index 0000000..95ef554 --- /dev/null +++ b/pyverbs/qp.pyx @@ -0,0 +1,1323 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. 
+ +from libc.stdlib cimport malloc, free +from libc.string cimport memcpy + +from pyverbs.utils import gid_str, qp_type_to_str, qp_state_to_str, mtu_to_str +from pyverbs.pyverbs_error import PyverbsUserError, PyverbsError, \ + PyverbsRDMAError +from pyverbs.utils import access_flags_to_str, mig_state_to_str +from pyverbs.base import PyverbsRDMAErrno +from pyverbs.wr cimport RecvWR, SendWR, SGE +from pyverbs.addr cimport AHAttr, GID, AH +from pyverbs.mr cimport MW, MWBindInfo +cimport pyverbs.libibverbs_enums as e +from pyverbs.addr cimport GlobalRoute +from pyverbs.device cimport Context +from cpython.ref cimport PyObject +from pyverbs.cq cimport CQ, CQEX +cimport pyverbs.libibverbs as v +from pyverbs.xrcd cimport XRCD +from pyverbs.srq cimport SRQ +from pyverbs.pd cimport PD + +cdef extern from 'Python.h': + void* PyLong_AsVoidPtr(object) +cdef extern from 'endian.h': + unsigned long htobe32(unsigned long host_32bits) + + +cdef class QPCap(PyverbsObject): + def __init__(self, max_send_wr=1, max_recv_wr=10, max_send_sge=1, + max_recv_sge=1, max_inline_data=0): + """ + Initializes a QPCap object with user-provided or default values. + :param max_send_wr: max number of outstanding WRs in the SQ + :param max_recv_wr: max number of outstanding WRs in the RQ + :param max_send_sge: Requested max number of scatter-gather elements in + a WR in the SQ + :param max_recv_sge: Requested max number of scatter-gather elements in + a WR in the RQ + :param max_inline_data: max number of data (bytes) that can be posted + inline to the SQ, otherwise 0 + :return: None + """ + super().__init__() + self.cap.max_send_wr = max_send_wr + self.cap.max_recv_wr = max_recv_wr + self.cap.max_send_sge = max_send_sge + self.cap.max_recv_sge = max_recv_sge + self.cap.max_inline_data = max_inline_data + + @property + def max_send_wr(self): + return self.cap.max_send_wr + @max_send_wr.setter + def max_send_wr(self, val): + self.cap.max_send_wr = val + + @property + def max_recv_wr(self): + return self.cap.max_recv_wr + @max_recv_wr.setter + def max_recv_wr(self, val): + self.cap.max_recv_wr = val + + @property + def max_send_sge(self): + return self.cap.max_send_sge + @max_send_sge.setter + def max_send_sge(self, val): + self.cap.max_send_sge = val + + @property + def max_recv_sge(self): + return self.cap.max_recv_sge + @max_recv_sge.setter + def max_recv_sge(self, val): + self.cap.max_recv_sge = val + + @property + def max_inline_data(self): + return self.cap.max_inline_data + @max_inline_data.setter + def max_inline_data(self, val): + self.cap.max_inline_data = val + + def __str__(self): + print_format = '{:20}: {:<20}\n' + return print_format.format('max send wrs', self.cap.max_send_wr) +\ + print_format.format('max recv wrs', self.cap.max_recv_wr) +\ + print_format.format('max send sges', self.cap.max_send_sge) +\ + print_format.format('max recv sges', self.cap.max_recv_sge) +\ + print_format.format('max inline data', self.cap.max_inline_data) + + +cdef class QPInitAttr(PyverbsObject): + def __init__(self, qp_type=e.IBV_QPT_UD, qp_context=None, + PyverbsObject scq=None, PyverbsObject rcq=None, + SRQ srq=None, QPCap cap=None, sq_sig_all=1): + """ + Initializes a QPInitAttr object representing an ibv_qp_init_attr struct.
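+        A minimal usage sketch (editorial illustration; assumes cq is a CQ
+        created earlier on the same context):
+            cap = QPCap(max_send_wr=8, max_recv_wr=8)
+            init_attr = QPInitAttr(qp_type=e.IBV_QPT_UD, scq=cq, rcq=cq,
+                                   cap=cap, sq_sig_all=1)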
+ :param qp_type: The desired QP type (see enum ibv_qp_type) + :param qp_context: Associated QP context + :param scq: Send CQ to be used for this QP + :param rcq: Receive CQ to be used for this QP + :param srq: Shared receive queue to be used as RQ in QP + :param cap: A QPCap object + :param sq_sig_all: If set, each send WR will generate a completion + entry + :return: A QpInitAttr object + """ + super().__init__() + _copy_caps(cap, self) + self.attr.qp_context = <void*>qp_context + if scq is not None: + if type(scq) is CQ: + self.attr.send_cq = (<CQ>scq).cq + elif type(scq) is CQEX: + self.attr.send_cq = (<CQEX>scq).ibv_cq + else: + raise PyverbsUserError('Expected CQ/CQEX, got {t}'.\ + format(t=type(scq))) + self.scq = scq + + if rcq is not None: + if type(rcq) is CQ: + self.attr.recv_cq = (<CQ>rcq).cq + elif type(rcq) is CQEX: + self.attr.recv_cq = (<CQEX>rcq).ibv_cq + else: + raise PyverbsUserError('Expected CQ/CQEX, got {t}'.\ + format(t=type(rcq))) + self.rcq = rcq + self.attr.qp_type = qp_type + self.attr.sq_sig_all = sq_sig_all + self.srq = srq + self.attr.srq = srq.srq if srq else NULL + + @property + def send_cq(self): + return self.scq + @send_cq.setter + def send_cq(self, val): + if type(val) is CQ: + self.attr.send_cq = (<CQ>val).cq + elif type(val) is CQEX: + self.attr.send_cq = (<CQEX>val).ibv_cq + self.scq = val + + @property + def srq(self): + return self.srq + @srq.setter + def srq(self, SRQ val): + self.attr.srq = <v.ibv_srq*>val.srq + self.srq = val + + @property + def recv_cq(self): + return self.rcq + @recv_cq.setter + def recv_cq(self, val): + if type(val) is CQ: + self.attr.recv_cq = (<CQ>val).cq + elif type(val) is CQEX: + self.attr.recv_cq = (<CQEX>val).ibv_cq + self.rcq = val + + @property + def cap(self): + return QPCap(max_send_wr=self.attr.cap.max_send_wr, + max_recv_wr=self.attr.cap.max_recv_wr, + max_send_sge=self.attr.cap.max_send_sge, + max_recv_sge=self.attr.cap.max_recv_sge, + max_inline_data=self.attr.cap.max_inline_data) + @cap.setter + def cap(self, val): + _copy_caps(val, self) + + @property + def qp_type(self): + return self.attr.qp_type + @qp_type.setter + def qp_type(self, val): + self.attr.qp_type = val + + @property + def sq_sig_all(self): + return self.attr.sq_sig_all + @sq_sig_all.setter + def sq_sig_all(self, val): + self.attr.sq_sig_all = val + + @property + def max_send_wr(self): + return self.attr.cap.max_send_wr + @max_send_wr.setter + def max_send_wr(self, val): + self.attr.cap.max_send_wr = val + + @property + def max_recv_wr(self): + return self.attr.cap.max_recv_wr + @max_recv_wr.setter + def max_recv_wr(self, val): + self.attr.cap.max_recv_wr = val + + @property + def max_send_sge(self): + return self.attr.cap.max_send_sge + @max_send_sge.setter + def max_send_sge(self, val): + self.attr.cap.max_send_sge = val + + @property + def max_recv_sge(self): + return self.attr.cap.max_recv_sge + @max_recv_sge.setter + def max_recv_sge(self, val): + self.attr.cap.max_recv_sge = val + + @property + def max_inline_data(self): + return self.attr.cap.max_inline_data + @max_inline_data.setter + def max_inline_data(self, val): + self.attr.cap.max_inline_data = val + + def __str__(self): + print_format = '{:20}: {:<20}\n' + ident_format = ' {:20}: {:<20}\n' + return print_format.format('QP type', qp_type_to_str(self.qp_type)) +\ + print_format.format('SQ sig. 
all', self.sq_sig_all) +\ + 'QP caps:\n' +\ + ident_format.format('max send WR', self.attr.cap.max_send_wr) +\ + ident_format.format('max recv WR', self.attr.cap.max_recv_wr) +\ + ident_format.format('max send SGE', + self.attr.cap.max_send_sge) +\ + ident_format.format('max recv SGE', + self.attr.cap.max_recv_sge) +\ + ident_format.format('max inline data', + self.attr.cap.max_inline_data) + + +cdef class QPInitAttrEx(PyverbsObject): + def __init__(self, qp_type=e.IBV_QPT_UD, qp_context=None, + PyverbsObject scq=None, PyverbsObject rcq=None, + SRQ srq=None, QPCap cap=None, sq_sig_all=0, comp_mask=0, + PD pd=None, XRCD xrcd=None, create_flags=0, + max_tso_header=0, source_qpn=0, object hash_conf=None, + object ind_table=None, send_ops_flags=0): + """ + Initialize a QPInitAttrEx object with user-defined or default values. + :param qp_type: QP type to be created + :param qp_context: Associated user context + :param scq: Send CQ to be used for this QP + :param rcq: Recv CQ to be used for this QP + :param srq: Shared receive queue to be used as RQ in QP + :param cap: A QPCap object + :param sq_sig_all: If set, each send WR will generate a completion + entry + :param comp_mask: bit mask to determine which of the following fields + are valid + :param pd: A PD object to be associated with this QP + :param xrcd: XRC domain to be used for XRC QPs + :param create_flags: Creation flags for this QP + :param max_tso_header: Maximum TSO header size + :param source_qpn: Source QP number (requires IBV_QP_CREATE_SOURCE_QPN + set in create_flags) + :param hash_conf: Not yet supported + :param ind_table: Not yet supported + :param send_ops_flags: Send opcodes to be supported by the extended QP. + Use ibv_qp_create_send_ops_flags enum + :return: An initialized QPInitAttrEx object + """ + super().__init__() + _copy_caps(cap, self) + if scq is not None: + if type(scq) is CQ: + self.attr.send_cq = (<CQ>scq).cq + elif type(scq) is CQEX: + self.attr.send_cq = (<CQEX>scq).ibv_cq + else: + raise PyverbsUserError('Expected CQ/CQEX, got {t}'.\ + format(t=type(scq))) + self.scq = scq + + if rcq is not None: + if type(rcq) is CQ: + self.attr.recv_cq = (<CQ>rcq).cq + elif type(rcq) is CQEX: + self.attr.recv_cq = (<CQEX>rcq).ibv_cq + else: + raise PyverbsUserError('Expected CQ/CQEX, got {t}'.\ + format(t=type(rcq))) + self.rcq = rcq + + self.srq = srq + self.attr.srq = srq.srq if srq else NULL + self.xrcd = xrcd + self.attr.xrcd = xrcd.xrcd if xrcd else NULL + self.attr.rwq_ind_tbl = NULL # Until RSS support is added + self.attr.qp_type = qp_type + self.attr.sq_sig_all = sq_sig_all + unsupp_flags = e.IBV_QP_INIT_ATTR_IND_TABLE | e.IBV_QP_INIT_ATTR_RX_HASH + if comp_mask & unsupp_flags: + raise PyverbsUserError('RSS is not yet supported in pyverbs') + self.attr.comp_mask = comp_mask + if pd is not None: + self._pd = pd + self.attr.pd = pd.pd + self.attr.create_flags = create_flags + self.attr.max_tso_header = max_tso_header + self.attr.source_qpn = source_qpn + self.attr.send_ops_flags = send_ops_flags + + @property + def send_cq(self): + return self.scq + @send_cq.setter + def send_cq(self, val): + if type(val) is CQ: + self.attr.send_cq = (<CQ>val).cq + elif type(val) is CQEX: + self.attr.send_cq = (<CQEX>val).ibv_cq + self.scq = val + + @property + def recv_cq(self): + return self.rcq + @recv_cq.setter + def recv_cq(self, val): + if type(val) is CQ: + self.attr.recv_cq = (<CQ>val).cq + elif type(val) is CQEX: + self.attr.recv_cq = (<CQEX>val).ibv_cq + self.rcq = val + + @property + def cap(self): + return 
QPCap(max_send_wr=self.attr.cap.max_send_wr, + max_recv_wr=self.attr.cap.max_recv_wr, + max_send_sge=self.attr.cap.max_send_sge, + max_recv_sge=self.attr.cap.max_recv_sge, + max_inline_data=self.attr.cap.max_inline_data) + @cap.setter + def cap(self, val): + _copy_caps(val, self) + + @property + def qp_type(self): + return self.attr.qp_type + @qp_type.setter + def qp_type(self, val): + self.attr.qp_type = val + + @property + def sq_sig_all(self): + return self.attr.sq_sig_all + @sq_sig_all.setter + def sq_sig_all(self, val): + self.attr.sq_sig_all = val + + @property + def comp_mask(self): + return self.attr.comp_mask + @comp_mask.setter + def comp_mask(self, val): + self.attr.comp_mask = val + + @property + def pd(self): + return self._pd + @pd.setter + def pd(self, PD val): + self.attr.pd = <v.ibv_pd*>val.pd + self._pd = val + + @property + def xrcd(self): + return self.xrcd + @xrcd.setter + def xrcd(self, XRCD val): + self.attr.xrcd = <v.ibv_xrcd*>val.xrcd + self.xrcd = val + + @property + def srq(self): + return self.srq + @srq.setter + def srq(self, SRQ val): + self.attr.srq = <v.ibv_srq*>val.srq + self.srq = val + + @property + def create_flags(self): + return self.attr.create_flags + @create_flags.setter + def create_flags(self, val): + self.attr.create_flags = val + + @property + def max_tso_header(self): + return self.attr.max_tso_header + @max_tso_header.setter + def max_tso_header(self, val): + self.attr.max_tso_header = val + + @property + def source_qpn(self): + return self.attr.source_qpn + @source_qpn.setter + def source_qpn(self, val): + self.attr.source_qpn = val + + @property + def max_send_wr(self): + return self.attr.cap.max_send_wr + @max_send_wr.setter + def max_send_wr(self, val): + self.attr.cap.max_send_wr = val + + @property + def max_recv_wr(self): + return self.attr.cap.max_recv_wr + @max_recv_wr.setter + def max_recv_wr(self, val): + self.attr.cap.max_recv_wr = val + + @property + def max_send_sge(self): + return self.attr.cap.max_send_sge + @max_send_sge.setter + def max_send_sge(self, val): + self.attr.cap.max_send_sge = val + + @property + def max_recv_sge(self): + return self.attr.cap.max_recv_sge + @max_recv_sge.setter + def max_recv_sge(self, val): + self.attr.cap.max_recv_sge = val + + @property + def max_inline_data(self): + return self.attr.cap.max_inline_data + @max_inline_data.setter + def max_inline_data(self, val): + self.attr.cap.max_inline_data = val + + def mask_to_str(self, mask): + comp_masks = {1: 'PD', 2: 'XRCD', 4: 'Create Flags', + 8: 'Max TSO header', 16: 'Indirection Table', + 32: 'RX hash'} + mask_str = '' + for f in comp_masks: + if mask & f: + mask_str += comp_masks[f] + mask_str += ' ' + return mask_str + + def flags_to_str(self, flags): + create_flags = {1: 'Block self mcast loopback', 2: 'Scatter FCS', + 4: 'CVLAN stripping', 8: 'Source QPN', + 16: 'PCI write end padding'} + create_str = '' + for f in create_flags: + if flags & f: + create_str += create_flags[f] + create_str += ' ' + return create_str + + def __str__(self): + print_format = '{:20}: {:<20}\n' + return print_format.format('QP type', qp_type_to_str(self.qp_type)) +\ + print_format.format('SQ sig. 
all', self.sq_sig_all) +\ + 'QP caps:\n' +\ + print_format.format(' max send WR', + self.attr.cap.max_send_wr) +\ + print_format.format(' max recv WR', + self.attr.cap.max_recv_wr) +\ + print_format.format(' max send SGE', + self.attr.cap.max_send_sge) +\ + print_format.format(' max recv SGE', + self.attr.cap.max_recv_sge) +\ + print_format.format(' max inline data', + self.attr.cap.max_inline_data) +\ + print_format.format('comp mask', + self.mask_to_str(self.attr.comp_mask)) +\ + print_format.format('create flags', + self.flags_to_str(self.attr.create_flags)) +\ + print_format.format('max TSO header', + self.attr.max_tso_header) +\ + print_format.format('Source QPN', self.attr.source_qpn) + + +cdef class QPAttr(PyverbsObject): + def __init__(self, qp_state=e.IBV_QPS_INIT, cur_qp_state=e.IBV_QPS_RESET, + port_num=1, path_mtu=e.IBV_MTU_1024): + """ + Initializes a QPAttr object which represents an ibv_qp_attr struct. It + can be used to modify a QP. + This function initializes default values for reset-to-init transition. + :param qp_state: Desired QP state + :param cur_qp_state: Current QP state + :return: An initialized QPAttr object + """ + super().__init__() + self.attr.qp_state = qp_state + self.attr.cur_qp_state = cur_qp_state + self.attr.port_num = port_num + self.attr.path_mtu = path_mtu + + @property + def qp_state(self): + return self.attr.qp_state + @qp_state.setter + def qp_state(self, val): + self.attr.qp_state = val + + @property + def cur_qp_state(self): + return self.attr.cur_qp_state + @cur_qp_state.setter + def cur_qp_state(self, val): + self.attr.cur_qp_state = val + + @property + def path_mtu(self): + return self.attr.path_mtu + @path_mtu.setter + def path_mtu(self, val): + self.attr.path_mtu = val + + @property + def path_mig_state(self): + return self.attr.path_mig_state + @path_mig_state.setter + def path_mig_state(self, val): + self.attr.path_mig_state = val + + @property + def qkey(self): + return self.attr.qkey + @qkey.setter + def qkey(self, val): + self.attr.qkey = val + + @property + def rq_psn(self): + return self.attr.rq_psn + @rq_psn.setter + def rq_psn(self, val): + self.attr.rq_psn = val + + @property + def sq_psn(self): + return self.attr.sq_psn + @sq_psn.setter + def sq_psn(self, val): + self.attr.sq_psn = val + + @property + def dest_qp_num(self): + return self.attr.dest_qp_num + @dest_qp_num.setter + def dest_qp_num(self, val): + self.attr.dest_qp_num = val + + @property + def qp_access_flags(self): + return self.attr.qp_access_flags + @qp_access_flags.setter + def qp_access_flags(self, val): + self.attr.qp_access_flags = val + + @property + def cap(self): + return QPCap(max_send_wr=self.attr.cap.max_send_wr, + max_recv_wr=self.attr.cap.max_recv_wr, + max_send_sge=self.attr.cap.max_send_sge, + max_recv_sge=self.attr.cap.max_recv_sge, + max_inline_data=self.attr.cap.max_inline_data) + @cap.setter + def cap(self, val): + _copy_caps(val, self) + + @property + def ah_attr(self): + if self.attr.ah_attr.is_global: + gid = gid_str(self.attr.ah_attr.grh.dgid._global.subnet_prefix, + self.attr.ah_attr.grh.dgid._global.interface_id) + g = GID(gid) + gr = GlobalRoute(flow_label=self.attr.ah_attr.grh.flow_label, + sgid_index=self.attr.ah_attr.grh.sgid_index, + hop_limit=self.attr.ah_attr.grh.hop_limit, dgid=g, + traffic_class=self.attr.ah_attr.grh.traffic_class) + else: + gr = None + ah = AHAttr(dlid=self.attr.ah_attr.dlid, sl=self.attr.ah_attr.sl, + port_num=self.attr.ah_attr.port_num, + src_path_bits=self.attr.ah_attr.src_path_bits, +
static_rate=self.attr.ah_attr.static_rate, + is_global=self.attr.ah_attr.is_global, gr=gr) + return ah + + @ah_attr.setter + def ah_attr(self, val): + self._copy_ah(val) + + @property + def alt_ah_attr(self): + if self.attr.alt_ah_attr.is_global: + gid = gid_str(self.attr.alt_ah_attr.grh.dgid._global.subnet_prefix, + self.attr.alt_ah_attr.grh.dgid._global.interface_id) + g = GID(gid) + gr = GlobalRoute(flow_label=self.attr.alt_ah_attr.grh.flow_label, + sgid_index=self.attr.alt_ah_attr.grh.sgid_index, + hop_limit=self.attr.alt_ah_attr.grh.hop_limit, + dgid=g, + traffic_class=self.attr.alt_ah_attr.grh.traffic_class) + else: + gr = None + ah = AHAttr(dlid=self.attr.alt_ah_attr.dlid, + port_num=self.attr.ah_attr.port_num, + sl=self.attr.alt_ah_attr.sl, + src_path_bits=self.attr.alt_ah_attr.src_path_bits, + static_rate=self.attr.alt_ah_attr.static_rate, + is_global=self.attr.alt_ah_attr.is_global, gr=gr) + return ah + + @alt_ah_attr.setter + def alt_ah_attr(self, val): + self._copy_ah(val, True) + + def _copy_ah(self, AHAttr ah_attr, is_alt=False): + if ah_attr is None: + return + if not is_alt: + for i in range(16): + self.attr.ah_attr.grh.dgid.raw[i] = \ + ah_attr.ah_attr.grh.dgid.raw[i] + self.attr.ah_attr.grh.flow_label = ah_attr.ah_attr.grh.flow_label + self.attr.ah_attr.grh.sgid_index = ah_attr.ah_attr.grh.sgid_index + self.attr.ah_attr.grh.hop_limit = ah_attr.ah_attr.grh.hop_limit + self.attr.ah_attr.grh.traffic_class = \ + ah_attr.ah_attr.grh.traffic_class + self.attr.ah_attr.dlid = ah_attr.ah_attr.dlid + self.attr.ah_attr.sl = ah_attr.ah_attr.sl + self.attr.ah_attr.src_path_bits = ah_attr.ah_attr.src_path_bits + self.attr.ah_attr.static_rate = ah_attr.ah_attr.static_rate + self.attr.ah_attr.is_global = ah_attr.ah_attr.is_global + self.attr.ah_attr.port_num = ah_attr.ah_attr.port_num + else: + for i in range(16): + self.attr.alt_ah_attr.grh.dgid.raw[i] = \ + ah_attr.ah_attr.grh.dgid.raw[i] + self.attr.alt_ah_attr.grh.flow_label = \ + ah_attr.ah_attr.grh.flow_label + self.attr.alt_ah_attr.grh.sgid_index = \ + ah_attr.ah_attr.grh.sgid_index + self.attr.alt_ah_attr.grh.hop_limit = ah_attr.ah_attr.grh.hop_limit + self.attr.alt_ah_attr.grh.traffic_class = \ + ah_attr.ah_attr.grh.traffic_class + self.attr.alt_ah_attr.dlid = ah_attr.ah_attr.dlid + self.attr.alt_ah_attr.sl = ah_attr.ah_attr.sl + self.attr.alt_ah_attr.src_path_bits = ah_attr.ah_attr.src_path_bits + self.attr.alt_ah_attr.static_rate = ah_attr.ah_attr.static_rate + self.attr.alt_ah_attr.is_global = ah_attr.ah_attr.is_global + self.attr.alt_ah_attr.port_num = ah_attr.ah_attr.port_num + + @property + def pkey_index(self): + return self.attr.pkey_index + @pkey_index.setter + def pkey_index(self, val): + self.attr.pkey_index = val + + @property + def alt_pkey_index(self): + return self.attr.alt_pkey_index + @alt_pkey_index.setter + def alt_pkey_index(self, val): + self.attr.alt_pkey_index = val + + @property + def en_sqd_async_notify(self): + return self.attr.en_sqd_async_notify + @en_sqd_async_notify.setter + def en_sqd_async_notify(self, val): + self.attr.en_sqd_async_notify = val + + @property + def sq_draining(self): + return self.attr.sq_draining + @sq_draining.setter + def sq_draining(self, val): + self.attr.sq_draining = val + + @property + def max_rd_atomic(self): + return self.attr.max_rd_atomic + @max_rd_atomic.setter + def max_rd_atomic(self, val): + self.attr.max_rd_atomic = val + + @property + def max_dest_rd_atomic(self): + return self.attr.max_dest_rd_atomic + @max_dest_rd_atomic.setter + def max_dest_rd_atomic(self, 
val): + self.attr.max_dest_rd_atomic = val + + @property + def min_rnr_timer(self): + return self.attr.min_rnr_timer + @min_rnr_timer.setter + def min_rnr_timer(self, val): + self.attr.min_rnr_timer = val + + @property + def port_num(self): + return self.attr.port_num + @port_num.setter + def port_num(self, val): + self.attr.port_num = val + + @property + def timeout(self): + return self.attr.timeout + @timeout.setter + def timeout(self, val): + self.attr.timeout = val + + @property + def retry_cnt(self): + return self.attr.retry_cnt + @retry_cnt.setter + def retry_cnt(self, val): + self.attr.retry_cnt = val + + @property + def rnr_retry(self): + return self.attr.rnr_retry + @rnr_retry.setter + def rnr_retry(self, val): + self.attr.rnr_retry = val + + @property + def alt_port_num(self): + return self.attr.alt_port_num + @alt_port_num.setter + def alt_port_num(self, val): + self.attr.alt_port_num = val + + @property + def alt_timeout(self): + return self.attr.alt_timeout + @alt_timeout.setter + def alt_timeout(self, val): + self.attr.alt_timeout = val + + @property + def rate_limit(self): + return self.attr.rate_limit + @rate_limit.setter + def rate_limit(self, val): + self.attr.rate_limit = val + + def __str__(self): + print_format = '{:22}: {:<20}\n' + ah_format = ' {:22}: {:<20}\n' + ident_format = ' {:22}: {:<20}\n' + if self.attr.ah_attr.is_global: + global_ah = ah_format.format('dgid', + gid_str(self.attr.ah_attr.grh.dgid._global.subnet_prefix, + self.attr.ah_attr.grh.dgid._global.interface_id)) +\ + ah_format.format('flow label', + self.attr.ah_attr.grh.flow_label) +\ + ah_format.format('sgid index', + self.attr.ah_attr.grh.sgid_index) +\ + ah_format.format('hop limit', + self.attr.ah_attr.grh.hop_limit) +\ + ah_format.format('traffic_class', + self.attr.ah_attr.grh.traffic_class) + else: + global_ah = '' + if self.attr.alt_ah_attr.is_global: + alt_global_ah = ah_format.format('dgid', + gid_str(self.attr.alt_ah_attr.grh.dgid._global.subnet_prefix, + self.attr.alt_ah_attr.grh.dgid._global.interface_id)) +\ + ah_format.format('flow label', + self.attr.alt_ah_attr.grh.flow_label) +\ + ah_format.format('sgid index', + self.attr.alt_ah_attr.grh.sgid_index) +\ + ah_format.format('hop limit', + self.attr.alt_ah_attr.grh.hop_limit) +\ + ah_format.format('traffic_class', + self.attr.alt_ah_attr.grh.traffic_class) + else: + alt_global_ah = '' + return print_format.format('QP state', + qp_state_to_str(self.attr.qp_state)) +\ + print_format.format('QP current state', + qp_state_to_str(self.attr.cur_qp_state)) +\ + print_format.format('Path MTU', + mtu_to_str(self.attr.path_mtu)) +\ + print_format.format('Path mig. 
state', + mig_state_to_str(self.attr.path_mig_state)) +\ + print_format.format('QKey', self.attr.qkey) +\ + print_format.format('RQ PSN', self.attr.rq_psn) +\ + print_format.format('SQ PSN', self.attr.sq_psn) +\ + print_format.format('Dest QP number', self.attr.dest_qp_num) +\ + print_format.format('QP access flags', + access_flags_to_str(self.attr.qp_access_flags)) +\ + 'QP caps:\n' +\ + ident_format.format('max send WR', + self.attr.cap.max_send_wr) +\ + ident_format.format('max recv WR', + self.attr.cap.max_recv_wr) +\ + ident_format.format('max send SGE', + self.attr.cap.max_send_sge) +\ + ident_format.format('max recv SGE', + self.attr.cap.max_recv_sge) +\ + ident_format.format('max inline data', + self.attr.cap.max_inline_data) +\ + 'AH Attr:\n' +\ + ident_format.format('port num', self.attr.ah_attr.port_num) +\ + ident_format.format('sl', self.attr.ah_attr.sl) +\ + ident_format.format('source path bits', + self.attr.ah_attr.src_path_bits) +\ + ident_format.format('dlid', self.attr.ah_attr.dlid) +\ + ident_format.format('port num', self.attr.ah_attr.port_num) +\ + ident_format.format('static rate', + self.attr.ah_attr.static_rate) +\ + ident_format.format('is global', + self.attr.ah_attr.is_global) +\ + global_ah +\ + 'Alt. AH Attr:\n' +\ + ident_format.format('port num', self.attr.alt_ah_attr.port_num) +\ + ident_format.format('sl', self.attr.alt_ah_attr.sl) +\ + ident_format.format('source path bits', + self.attr.alt_ah_attr.src_path_bits) +\ + ident_format.format('dlid', self.attr.alt_ah_attr.dlid) +\ + ident_format.format('port num', self.attr.alt_ah_attr.port_num) +\ + ident_format.format('static rate', + self.attr.alt_ah_attr.static_rate) +\ + ident_format.format('is global', + self.attr.alt_ah_attr.is_global) +\ + alt_global_ah +\ + print_format.format('PKey index', self.attr.pkey_index) +\ + print_format.format('Alt. PKey index', + self.attr.alt_pkey_index) +\ + print_format.format('En. SQD async notify', + self.attr.en_sqd_async_notify) +\ + print_format.format('SQ draining', self.attr.sq_draining) +\ + print_format.format('Max RD atomic', self.attr.max_rd_atomic) +\ + print_format.format('Max dest. RD atomic', + self.attr.max_dest_rd_atomic) +\ + print_format.format('Min RNR timer', self.attr.min_rnr_timer) +\ + print_format.format('Port number', self.attr.port_num) +\ + print_format.format('Timeout', self.attr.timeout) +\ + print_format.format('Retry counter', self.attr.retry_cnt) +\ + print_format.format('RNR retry', self.attr.rnr_retry) +\ + print_format.format('Alt. port number', + self.attr.alt_port_num) +\ + print_format.format('Alt. timeout', self.attr.alt_timeout) +\ + print_format.format('Rate limit', self.attr.rate_limit) + + +cdef class QP(PyverbsCM): + def __init__(self, object creator not None, object init_attr not None, + QPAttr qp_attr=None): + """ + Initializes a QP object and performs state transitions according to + user request. + A C ibv_qp object will be created using the provided init_attr. + If a qp_attr object is provided, pyverbs will consider this a hint to + transit the QP's state as far as possible towards RTS: + - In case of UD and Raw Packet QP types, if a qp_attr is provided the + QP will be returned in RTS state. + - In case of connected QPs (RC, UC), remote QPN is needed for INIT2RTR + transition, so if a qp_attr is provided, the QP will be returned in + INIT state. + :param creator: The object creating the QP. Can be of type PD so + ibv_create_qp will be used or of type Context, so + ibv_create_qp_ex will be used. 
+ :param init_attr: QP initial attributes of type QPInitAttr (when + created using PD) or QPInitAttrEx (when created + using Context). + :param qp_attr: Optional QPAttr object. Will be used for QP state + transitions after creation. + :return: An initialized QP object + """ + cdef PD pd + cdef Context ctx + super().__init__() + self.update_cqs(init_attr) + # QP initialization was not done by the provider, we should do it here + if self.qp == NULL: + # In order to use cdef'd methods, a proper casting must be done, + # let's infer the type. + if issubclass(type(creator), Context): + self._create_qp_ex(creator, init_attr) + if self.qp == NULL: + raise PyverbsRDMAErrno('Failed to create QP') + ctx = <Context>creator + self.context = ctx + ctx.add_ref(self) + if init_attr.pd is not None: + pd = <PD>init_attr.pd + pd.add_ref(self) + self.pd = pd + if init_attr.xrcd is not None: + xrcd = <XRCD>init_attr.xrcd + xrcd.add_ref(self) + self.xrcd = xrcd + + else: + self._create_qp(creator, init_attr) + if self.qp == NULL: + raise PyverbsRDMAErrno('Failed to create QP') + pd = <PD>creator + self.pd = pd + pd.add_ref(self) + self.context = None + + if qp_attr is not None: + funcs = {e.IBV_QPT_RC: self.to_init, e.IBV_QPT_UC: self.to_init, + e.IBV_QPT_UD: self.to_rts, + e.IBV_QPT_XRC_RECV: self.to_init, + e.IBV_QPT_XRC_SEND: self.to_init, + e.IBV_QPT_RAW_PACKET: self.to_rts} + funcs[self.qp.qp_type](qp_attr) + + cdef update_cqs(self, init_attr): + cdef CQ cq + cdef CQEX cqex + if init_attr.send_cq is not None: + if type(init_attr.send_cq) == CQ: + cq = <CQ>init_attr.send_cq + cq.add_ref(self) + self.scq = cq + else: + cqex = <CQEX>init_attr.send_cq + cqex.add_ref(self) + self.scq = cqex + if init_attr.send_cq != init_attr.recv_cq and init_attr.recv_cq is not None: + if type(init_attr.recv_cq) == CQ: + cq = <CQ>init_attr.recv_cq + cq.add_ref(self) + self.rcq = cq + else: + cqex = <CQEX>init_attr.recv_cq + cqex.add_ref(self) + self.rcq = cqex + + def _create_qp(self, PD pd, QPInitAttr attr): + self.qp = v.ibv_create_qp(pd.pd, &attr.attr) + + def _create_qp_ex(self, Context ctx, QPInitAttrEx attr): + self.qp = v.ibv_create_qp_ex(ctx.context, &attr.attr) + + def __dealloc__(self): + self.close() + + cpdef close(self): + if self.qp != NULL: + self.logger.debug('Closing QP') + rc = v.ibv_destroy_qp(self.qp) + if rc: + raise PyverbsRDMAError('Failed to destroy QP', rc) + self.qp = NULL + self.pd = None + self.context = None + self.scq = None + self.rcq = None + + def _get_comp_mask(self, dst): + masks = {e.IBV_QPT_RC: {'INIT': e.IBV_QP_PKEY_INDEX | e.IBV_QP_PORT |\ + e.IBV_QP_ACCESS_FLAGS, 'RTR': e.IBV_QP_AV |\ + e.IBV_QP_PATH_MTU | e.IBV_QP_DEST_QPN |\ + e.IBV_QP_RQ_PSN |\ + e.IBV_QP_MAX_DEST_RD_ATOMIC |\ + e.IBV_QP_MIN_RNR_TIMER, + 'RTS': e.IBV_QP_TIMEOUT |\ + e.IBV_QP_RETRY_CNT | e.IBV_QP_RNR_RETRY |\ + e.IBV_QP_SQ_PSN | e.IBV_QP_MAX_QP_RD_ATOMIC}, + e.IBV_QPT_UC: {'INIT': e.IBV_QP_PKEY_INDEX | e.IBV_QP_PORT |\ + e.IBV_QP_ACCESS_FLAGS, 'RTR': e.IBV_QP_AV |\ + e.IBV_QP_PATH_MTU | e.IBV_QP_DEST_QPN |\ + e.IBV_QP_RQ_PSN, 'RTS': e.IBV_QP_SQ_PSN}, + e.IBV_QPT_UD: {'INIT': e.IBV_QP_PKEY_INDEX | e.IBV_QP_PORT |\ + e.IBV_QP_QKEY, 'RTR': 0, + 'RTS': e.IBV_QP_SQ_PSN}, + e.IBV_QPT_RAW_PACKET: {'INIT': e.IBV_QP_PORT, 'RTR': 0, + 'RTS': 0}, + e.IBV_QPT_XRC_RECV: {'INIT': e.IBV_QP_PKEY_INDEX |\ + e.IBV_QP_PORT | e.IBV_QP_ACCESS_FLAGS, + 'RTR': e.IBV_QP_AV | e.IBV_QP_PATH_MTU |\ + e.IBV_QP_DEST_QPN | e.IBV_QP_RQ_PSN | \ + e.IBV_QP_MAX_DEST_RD_ATOMIC |\ + e.IBV_QP_MIN_RNR_TIMER, + 'RTS': e.IBV_QP_TIMEOUT | e.IBV_QP_SQ_PSN }, 
+ e.IBV_QPT_XRC_SEND: {'INIT': e.IBV_QP_PKEY_INDEX |\ + e.IBV_QP_PORT | e.IBV_QP_ACCESS_FLAGS, + 'RTR': e.IBV_QP_AV | e.IBV_QP_PATH_MTU |\ + e.IBV_QP_DEST_QPN | e.IBV_QP_RQ_PSN, + 'RTS': e.IBV_QP_TIMEOUT |\ + e.IBV_QP_RETRY_CNT | e.IBV_QP_RNR_RETRY |\ + e.IBV_QP_SQ_PSN | e.IBV_QP_MAX_QP_RD_ATOMIC}} + + return masks[self.qp.qp_type][dst] | e.IBV_QP_STATE + + def to_init(self, QPAttr qp_attr): + """ + Modify the current QP's state to INIT. If the current state doesn't + support transition to INIT, an exception will be raised. + The comp mask provided to the kernel includes the needed bits for 2INIT + transition for this QP type. + :param qp_attr: QPAttr object containing the needed attributes for + 2INIT transition + :return: None + """ + mask = self._get_comp_mask('INIT') + qp_attr.qp_state = e.IBV_QPS_INIT + rc = v.ibv_modify_qp(self.qp, &qp_attr.attr, mask) + if rc != 0: + raise PyverbsRDMAError('Failed to modify QP state to init', rc) + + def to_rtr(self, QPAttr qp_attr): + """ + Modify the current QP's state to RTR. It assumes that its current + state is INIT or RESET, in which case it will attempt a transition to + INIT prior to transition to RTR. As a result, if current state doesn't + support transition to INIT, an exception will be raised. + The comp mask provided to the kernel includes the needed bits for 2RTR + transition for this QP type. + :param qp_attr: QPAttr object containing the needed attributes for + 2RTR transition. + :return: None + """ + if self.qp_state != e.IBV_QPS_INIT: #assume reset + self.to_init(qp_attr) + mask = self._get_comp_mask('RTR') + qp_attr.qp_state = e.IBV_QPS_RTR + rc = v.ibv_modify_qp(self.qp, &qp_attr.attr, mask) + if rc != 0: + raise PyverbsRDMAError('Failed to modify QP state to RTR', rc) + + def to_rts(self, QPAttr qp_attr): + """ + Modify the current QP's state to RTS. It assumes that its current + state is either RTR, INIT or RESET. If current state is not RTR, to_rtr() + will be called. + The comp mask provided to the kernel includes the needed bits for 2RTS + transition for this QP type. + :param qp_attr: QPAttr object containing the needed attributes for + 2RTS transition. + :return: None + """ + if self.qp_state != e.IBV_QPS_RTR: #assume reset/init + self.to_rtr(qp_attr) + mask = self._get_comp_mask('RTS') + qp_attr.qp_state = e.IBV_QPS_RTS + rc = v.ibv_modify_qp(self.qp, &qp_attr.attr, mask) + if rc != 0: + raise PyverbsRDMAError('Failed to modify QP state to RTS', rc) + + def query(self, attr_mask): + """ + Query the QP + :param attr_mask: The minimum list of attributes to retrieve. Some + devices may return additional attributes as well + (see enum ibv_qp_attr_mask) + :return: (QPAttr, QPInitAttr) tuple containing the QP requested + attributes + """ + attr = QPAttr() + init_attr = QPInitAttr() + rc = v.ibv_query_qp(self.qp, &attr.attr, attr_mask, &init_attr.attr) + if rc != 0: + raise PyverbsRDMAError('Failed to query QP', rc) + return attr, init_attr + + def modify(self, QPAttr qp_attr not None, comp_mask): + """ + Modify the QP + :param qp_attr: A QPAttr object with updated values to be applied to + the QP + :param comp_mask: A bitmask specifying which QP attributes should be + modified (see enum ibv_qp_attr_mask) + :return: None + """ + rc = v.ibv_modify_qp(self.qp, &qp_attr.attr, comp_mask) + if rc != 0: + raise PyverbsRDMAError('Failed to modify QP', rc) + + def post_recv(self, RecvWR wr not None, RecvWR bad_wr=None): + """ + Post a receive WR on the QP. 
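+        A minimal sketch (hypothetical names, assuming a registered MR ``mr``
+        exposing its buffer address as ``mr.buf``):
+            sge = SGE(mr.buf, 16, mr.lkey)
+            qp.post_recv(RecvWR(num_sge=1, sg=[sge]))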
+ :param wr: The work request to post + :param bad_wr: A RecvWR object to hold the bad WR if it is available in + case of a failure + :return: None + """ + cdef v.ibv_recv_wr *my_bad_wr + # In order to provide a pointer to a pointer, use a temporary cdef'ed + # variable. + rc = v.ibv_post_recv(self.qp, &wr.recv_wr, &my_bad_wr) + if rc != 0: + if (bad_wr): + memcpy(&bad_wr.recv_wr, my_bad_wr, sizeof(bad_wr.recv_wr)) + raise PyverbsRDMAError('Failed to post recv', rc) + + def post_send(self, SendWR wr not None, SendWR bad_wr=None): + """ + Post a send WR on the QP. + :param wr: The work request to post + :param bad_wr: A SendWR object to hold the bad WR if it is available in + case of a failure + :return: None + """ + # In order to provide a pointer to a pointer, use a temporary cdef'ed + # variable. + cdef v.ibv_send_wr *my_bad_wr + rc = v.ibv_post_send(self.qp, &wr.send_wr, &my_bad_wr) + if rc != 0: + if (bad_wr): + memcpy(&bad_wr.send_wr, my_bad_wr, sizeof(bad_wr.send_wr)) + raise PyverbsRDMAError('Failed to post send', rc) + + @property + def qp_type(self): + return self.qp.qp_type + + @property + def qp_state(self): + return self.qp.state + + @property + def qp_num(self): + return self.qp.qp_num + + def __str__(self): + print_format = '{:22}: {:<20}\n' + return print_format.format('QP type', qp_type_to_str(self.qp_type)) +\ + print_format.format(' number', self.qp_num) +\ + print_format.format(' state', qp_state_to_str(self.qp_state)) + + +cdef class DataBuffer(PyverbsCM): + def __init__(self, addr, length): + super().__init__() + self.data.addr = PyLong_AsVoidPtr(addr) + self.data.length = length + + +cdef class QPEx(QP): + def __init__(self, object creator not None, object init_attr not None, + QPAttr qp_attr=None): + """ + Initializes a QPEx object. Since this is an extension of a QP, QP + creation is done in the parent class. The extended QP is retrieved by + casting the ibv_qp to ibv_qp_ex. 
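+        :param creator: A PD or Context object, as for QP
+        :param init_attr: QPInitAttr (when creating with a PD) or
+                          QPInitAttrEx (when creating with a Context)
+        :param qp_attr: Optional QPAttr object, used for state transitions
+                        after creation, as for QP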
+ :return: An initialized QPEx object + """ + super().__init__(creator, init_attr, qp_attr) + self.qp_ex = v.ibv_qp_to_qp_ex(self.qp) + if self.qp_ex == NULL: + raise PyverbsRDMAErrno('Failed to create extended QP') + + @property + def comp_mask(self): + return self.qp_ex.comp_mask + @comp_mask.setter + def comp_mask(self, val): + self.qp_ex.comp_mask = val + + @property + def wr_id(self): + return self.qp_ex.wr_id + @wr_id.setter + def wr_id(self, val): + self.qp_ex.wr_id = val + + @property + def wr_flags(self): + return self.qp_ex.wr_flags + @wr_flags.setter + def wr_flags(self, val): + self.qp_ex.wr_flags = val + + def wr_atomic_cmp_swp(self, rkey, remote_addr, compare, swap): + v.ibv_wr_atomic_cmp_swp(self.qp_ex, rkey, remote_addr, compare, swap) + + def wr_atomic_fetch_add(self, rkey, remote_addr, add): + v.ibv_wr_atomic_fetch_add(self.qp_ex, rkey, remote_addr, add) + + def wr_bind_mw(self, MW mw, rkey, MWBindInfo bind_info): + cdef v.ibv_mw_bind_info *info + info = &bind_info.info + v.ibv_wr_bind_mw(self.qp_ex, <v.ibv_mw*>mw.mw, rkey, + <v.ibv_mw_bind_info*>info) + + def wr_local_inv(self, invalidate_rkey): + v.ibv_wr_local_inv(self.qp_ex, invalidate_rkey) + + def wr_rdma_read(self, rkey, remote_addr): + v.ibv_wr_rdma_read(self.qp_ex, rkey, remote_addr) + + def wr_rdma_write(self, rkey, remote_addr): + v.ibv_wr_rdma_write(self.qp_ex, rkey, remote_addr) + + def wr_rdma_write_imm(self, rkey, remote_addr, data): + cdef unsigned int imm_data = htobe32(data) + v.ibv_wr_rdma_write_imm(self.qp_ex, rkey, remote_addr, imm_data) + + def wr_send(self): + v.ibv_wr_send(self.qp_ex) + + def wr_send_imm(self, data): + cdef unsigned int imm_data = htobe32(data) + return v.ibv_wr_send_imm(self.qp_ex, imm_data) + + def wr_send_inv(self, invalidate_rkey): + v.ibv_wr_send_inv(self.qp_ex, invalidate_rkey) + + def wr_send_tso(self, hdr, hdr_sz, mss): + ptr = PyLong_AsVoidPtr(hdr) + v.ibv_wr_send_tso(self.qp_ex, ptr, hdr_sz, mss) + + def wr_set_ud_addr(self, AH ah, remote_qpn, remote_rkey): + v.ibv_wr_set_ud_addr(self.qp_ex, ah.ah, remote_qpn, remote_rkey) + + def wr_set_xrc_srqn(self, remote_srqn): + v.ibv_wr_set_xrc_srqn(self.qp_ex, remote_srqn) + + def wr_set_inline_data(self, addr, length): + ptr = PyLong_AsVoidPtr(addr) + v.ibv_wr_set_inline_data(self.qp_ex, ptr, length) + + def wr_set_inline_data_list(self, num_buf, buf_list): + cdef v.ibv_data_buf *data = NULL + data = <v.ibv_data_buf*>malloc(num_buf * sizeof(v.ibv_data_buf)) + if data == NULL: + raise PyverbsError('Failed to allocate data buffer') + for i in range(num_buf): + data_buf = <DataBuffer>buf_list[i] + data[i].addr = data_buf.data.addr + data[i].length = data_buf.data.length + v.ibv_wr_set_inline_data_list(self.qp_ex, num_buf, data) + free(data) + + def wr_set_sge(self, SGE sge not None): + v.ibv_wr_set_sge(self.qp_ex, sge.lkey, sge.addr, sge.length) + + def wr_set_sge_list(self, num_sge, sg_list): + cdef v.ibv_sge *sge = NULL + sge = <v.ibv_sge*>malloc(num_sge * sizeof(v.ibv_sge)) + if sge == NULL: + raise PyverbsError('Failed to allocate SGE buffer') + for i in range(num_sge): + sge[i].addr = sg_list[i].addr + sge[i].length = sg_list[i].length + sge[i].lkey = sg_list[i].lkey + v.ibv_wr_set_sge_list(self.qp_ex, num_sge, sge) + free(sge) + + def wr_start(self): + v.ibv_wr_start(self.qp_ex) + + def wr_complete(self): + rc = v.ibv_wr_complete(self.qp_ex) + if rc != 0: + raise PyverbsRDMAErrno('ibv_wr_complete failed , returned {}'. 
+ format(rc)) + + def wr_abort(self): + v.ibv_wr_abort(self.qp_ex) + + +def _copy_caps(QPCap src, dst): + """ + Copy the QPCaps values of src into the inner ibv_qp_cap struct of dst. + Since both ibv_qp_init_attr and ibv_qp_attr have an inner ibv_qp_cap inner + struct, they can both be used. + :param src: A QPCap object + :param dst: A QPInitAttr / QPInitAttrEx / QPAttr object + :return: None + """ + # we're assigning to C structs here, we must have type-specific objects in + # order to do that. Instead of having this function smaller but in 3 + # classes, it appears here once. + cdef QPInitAttr qia + cdef QPInitAttrEx qiae + cdef QPAttr qa + if src is None: + return + if type(dst) == QPInitAttr: + qia = <QPInitAttr>dst + qia.attr.cap.max_send_wr = src.cap.max_send_wr + qia.attr.cap.max_recv_wr = src.cap.max_recv_wr + qia.attr.cap.max_send_sge = src.cap.max_send_sge + qia.attr.cap.max_recv_sge = src.cap.max_recv_sge + qia.attr.cap.max_inline_data = src.cap.max_inline_data + elif type(dst) == QPInitAttrEx: + qiae = <QPInitAttrEx>dst + qiae.attr.cap.max_send_wr = src.cap.max_send_wr + qiae.attr.cap.max_recv_wr = src.cap.max_recv_wr + qiae.attr.cap.max_send_sge = src.cap.max_send_sge + qiae.attr.cap.max_recv_sge = src.cap.max_recv_sge + qiae.attr.cap.max_inline_data = src.cap.max_inline_data + else: + qa = <QPAttr>dst + qa.attr.cap.max_send_wr = src.cap.max_send_wr + qa.attr.cap.max_recv_wr = src.cap.max_recv_wr + qa.attr.cap.max_send_sge = src.cap.max_send_sge + qa.attr.cap.max_recv_sge = src.cap.max_recv_sge + qa.attr.cap.max_inline_data = src.cap.max_inline_data diff --git a/pyverbs/srq.pxd b/pyverbs/srq.pxd new file mode 100755 index 0000000..a7b7b34 --- /dev/null +++ b/pyverbs/srq.pxd @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. + +#cython: language_level=3 + +from pyverbs.base cimport PyverbsObject, PyverbsCM +from . 
cimport libibverbs as v + +cdef class SrqAttr(PyverbsObject): + cdef v.ibv_srq_attr attr + +cdef class SrqInitAttr(PyverbsObject): + cdef v.ibv_srq_init_attr attr + +cdef class SrqInitAttrEx(PyverbsObject): + cdef v.ibv_srq_init_attr_ex attr + cdef object _cq + cdef object _pd + cdef object _xrcd + +cdef class SRQ(PyverbsCM): + cdef v.ibv_srq *srq + cdef object cq + cpdef close(self) diff --git a/pyverbs/srq.pyx b/pyverbs/srq.pyx new file mode 100755 index 0000000..826579a --- /dev/null +++ b/pyverbs/srq.pyx @@ -0,0 +1,196 @@ +from pyverbs.pyverbs_error import PyverbsRDMAError +from pyverbs.base import PyverbsRDMAErrno +from pyverbs.device cimport Context +from pyverbs.cq cimport CQEX, CQ +from pyverbs.xrcd cimport XRCD +from pyverbs.wr cimport RecvWR +from pyverbs.pd cimport PD +from libc.errno cimport errno +from libc.string cimport memcpy + + +cdef class SrqAttr(PyverbsObject): + def __init__(self, max_wr=100, max_sge=1, srq_limit=0): + super().__init__() + self.attr.max_wr = max_wr + self.attr.max_sge = max_sge + self.attr.srq_limit = srq_limit + + @property + def max_wr(self): + return self.attr.max_wr + @max_wr.setter + def max_wr(self, val): + self.attr.max_wr = val + + @property + def max_sge(self): + return self.attr.max_sge + @max_sge.setter + def max_sge(self, val): + self.attr.max_sge = val + + @property + def srq_limit(self): + return self.attr.srq_limit + @srq_limit.setter + def srq_limit(self, val): + self.attr.srq_limit = val + + +cdef class SrqInitAttr(PyverbsObject): + def __init__(self, SrqAttr attr = None): + super().__init__() + if attr is not None: + self.attr.attr.max_wr = attr.max_wr + self.attr.attr.max_sge = attr.max_sge + self.attr.attr.srq_limit = attr.srq_limit + + @property + def max_wr(self): + return self.attr.attr.max_wr + + @property + def max_sge(self): + return self.attr.attr.max_sge + + @property + def srq_limit(self): + return self.attr.attr.srq_limit + + +cdef class SrqInitAttrEx(PyverbsObject): + def __init__(self, max_wr=100, max_sge=1, srq_limit=0): + super().__init__() + self.attr.attr.max_wr = max_wr + self.attr.attr.max_sge = max_sge + self.attr.attr.srq_limit = srq_limit + self._cq = None + self._pd = None + self._xrcd = None + + @property + def max_wr(self): + return self.attr.attr.max_wr + + @property + def max_sge(self): + return self.attr.attr.max_sge + + @property + def srq_limit(self): + return self.attr.attr.srq_limit + + @property + def comp_mask(self): + return self.attr.comp_mask + @comp_mask.setter + def comp_mask(self, val): + self.attr.comp_mask = val + + @property + def srq_type(self): + return self.attr.srq_type + @srq_type.setter + def srq_type(self, val): + self.attr.srq_type = val + + @property + def pd(self): + return self._pd + @pd.setter + def pd(self, PD val): + self._pd = val + self.attr.pd = val.pd + + @property + def xrcd(self): + return self._xrcd + @xrcd.setter + def xrcd(self, XRCD val): + self._xrcd = val + self.attr.xrcd = val.xrcd + + @property + def cq(self): + return self._cq + @cq.setter + def cq(self, val): + if type(val) == CQ: + self.attr.cq = (<CQ>val).cq + self._cq = val + else: + self.attr.cq = (<CQEX>val).ibv_cq + self._cq = val + + +cdef class SRQ(PyverbsCM): + def __init__(self, object creator not None, object attr not None): + super().__init__() + self.srq = NULL + self.cq = None + if isinstance(creator, PD): + self._create_srq(creator, attr) + elif type(creator) == Context: + self._create_srq_ex(creator, attr) + else: + raise PyverbsRDMAError('Srq needs either Context or PD for creation') + if 
self.srq == NULL:
+            raise PyverbsRDMAErrno('Failed to create SRQ (errno is {err})'.
+                                   format(err=errno))
+        self.logger.debug('SRQ Created')
+
+    def __dealloc__(self):
+        self.close()
+
+    cpdef close(self):
+        if self.srq != NULL:
+            self.logger.debug('Closing SRQ')
+            rc = v.ibv_destroy_srq(self.srq)
+            if rc != 0:
+                raise PyverbsRDMAError('Failed to destroy SRQ', rc)
+            self.srq = NULL
+            self.cq = None
+
+    def _create_srq(self, PD pd, SrqInitAttr init_attr):
+        self.srq = v.ibv_create_srq(pd.pd, &init_attr.attr)
+
+    def _create_srq_ex(self, Context context, SrqInitAttrEx init_attr_ex):
+        self.srq = v.ibv_create_srq_ex(context.context, &init_attr_ex.attr)
+        if init_attr_ex.cq:
+            cq = <CQ>init_attr_ex.cq
+            cq.add_ref(self)
+            self.cq = cq
+        if init_attr_ex.xrcd:
+            xrcd = <XRCD>init_attr_ex.xrcd
+            xrcd.add_ref(self)
+        if init_attr_ex.pd:
+            pd = <PD>init_attr_ex.pd
+            pd.add_ref(self)
+
+    def get_srq_num(self):
+        cdef unsigned int srqn
+        rc = v.ibv_get_srq_num(self.srq, &srqn)
+        if rc != 0:
+            raise PyverbsRDMAError('Failed to retrieve SRQ number', rc)
+        return srqn
+
+    def modify(self, SrqAttr attr, comp_mask):
+        rc = v.ibv_modify_srq(self.srq, &attr.attr, comp_mask)
+        if rc != 0:
+            raise PyverbsRDMAError('Failed to modify SRQ', rc)
+
+    def query(self):
+        attr = SrqAttr()
+        rc = v.ibv_query_srq(self.srq, &attr.attr)
+        if rc != 0:
+            raise PyverbsRDMAError('Failed to query SRQ', rc)
+        return attr
+
+    def post_recv(self, RecvWR wr not None, RecvWR bad_wr=None):
+        cdef v.ibv_recv_wr *my_bad_wr
+        rc = v.ibv_post_srq_recv(self.srq, &wr.recv_wr, &my_bad_wr)
+        if rc != 0:
+            if bad_wr:
+                memcpy(&bad_wr.recv_wr, my_bad_wr, sizeof(bad_wr.recv_wr))
+            raise PyverbsRDMAError('Failed to post receive to SRQ.', rc)
diff --git a/pyverbs/utils.py b/pyverbs/utils.py
new file mode 100644
index 0000000..a59d627
--- /dev/null
+++ b/pyverbs/utils.py
@@ -0,0 +1,81 @@
+# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
+# Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. See COPYING file
+
+import struct
+
+from pyverbs.pyverbs_error import PyverbsUserError
+
+be64toh = lambda num: struct.unpack('Q', struct.pack('!Q', num))[0]
+
+def gid_str(subnet_prefix, interface_id):
+    hex_values = '%016x%016x' % (be64toh(subnet_prefix), be64toh(interface_id))
+    return ':'.join([hex_values[0:4], hex_values[4:8], hex_values[8:12],
+                     hex_values[12:16], hex_values[16:20], hex_values[20:24],
+                     hex_values[24:28], hex_values[28:32]])
+
+
+def gid_str_to_array(val):
+    """
+    Splits a GID into an array of u8 hex strings that can easily be assigned
+    to a GID's raw array.
+    :param val: GID value in 8-word format
+                'xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx'
+    :return: A list of 16 two-character hex strings
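+             For example (illustrative value only), calling
+             gid_str_to_array('fe80:0000:0000:0000:0000:0000:0000:0001')
+             returns 16 two-character strings:
+             ['fe', '80', '00', ..., '00', '01']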
+ """ + val = val.split(':') + if len(val) != 8: + raise PyverbsUserError('Invalid GID value ({val})'.format(val=val)) + if any([len(v) != 4 for v in val]): + raise PyverbsUserError('Invalid GID value ({val})'.format(val=val)) + val_int = int(''.join(val), 16) + vals = [] + for i in range(8): + vals.append(val[i][0:2]) + vals.append(val[i][2:4]) + return vals + + +def qp_type_to_str(qp_type): + types = {2: 'RC', 3: 'UC', 4: 'UD', 8: 'Raw Packet', 9: 'XRCD_SEND', + 10: 'XRCD_RECV', 0xff:'Driver QP'} + try: + return types[qp_type] + except KeyError: + return 'Unknown ({qpt})'.format(qpt=qp_type) + + +def qp_state_to_str(qp_state): + states = {0: 'Reset', 1: 'Init', 2: 'RTR', 3: 'RTS', 4: 'SQD', + 5: 'SQE', 6: 'Error', 7: 'Unknown'} + try: + return states[qp_state] + except KeyError: + return 'Unknown ({qps})'.format(qps=qp_state_to_str) + + +def mtu_to_str(mtu): + mtus = {1: 256, 2: 512, 3: 1024, 4: 2048, 5: 4096} + try: + return mtus[mtu] + except KeyError: + return 0 + + +def access_flags_to_str(flags): + access_flags = {1: 'Local write', 2: 'Remote write', 4: 'Remote read', + 8: 'Remote atomic', 16: 'MW bind', 32: 'Zero based', + 64: 'On demand'} + access_str = '' + for f in access_flags: + if flags & f: + access_str += access_flags[f] + access_str += ' ' + return access_str + + +def mig_state_to_str(mig): + mig_states = {0: 'Migrated', 1: 'Re-arm', 2: 'Armed'} + try: + return mig_states[mig] + except KeyError: + return 'Unknown ({m})'.format(m=mig) diff --git a/pyverbs/wr.pxd b/pyverbs/wr.pxd new file mode 100644 index 0000000..ea054ff --- /dev/null +++ b/pyverbs/wr.pxd @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. See COPYING file + +#cython: language_level=3 + +from .base cimport PyverbsCM +from pyverbs cimport libibverbs as v + + +cdef class SGE(PyverbsCM): + cdef v.ibv_sge *sge + cpdef read(self, length, offset) + +cdef class RecvWR(PyverbsCM): + cdef v.ibv_recv_wr recv_wr + +cdef class SendWR(PyverbsCM): + cdef v.ibv_send_wr send_wr + cdef object ah diff --git a/pyverbs/wr.pyx b/pyverbs/wr.pyx new file mode 100644 index 0000000..37028c5 --- /dev/null +++ b/pyverbs/wr.pyx @@ -0,0 +1,305 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019 Mellanox Technologies Inc. All rights reserved. See COPYING file + +from pyverbs.pyverbs_error import PyverbsUserError, PyverbsError +from pyverbs.base import PyverbsRDMAErrno +cimport pyverbs.libibverbs_enums as e +from pyverbs.addr cimport AH +from libc.stdlib cimport free, malloc +from libc.string cimport memcpy + + +cdef class SGE(PyverbsCM): + """ + Represents ibv_sge struct. It has a read function to allow users to keep + track of data. Write function is not provided as a scatter-gather element + can be using a MR or a DMMR. In case direct (device's) memory is used, + write can't be done using memcpy that relies on CPU-specific optimizations. + A SGE has no way to tell which memory it is using. + """ + def __init__(self, addr, length, lkey): + """ + Initializes a SGE object. 
+        :param addr: The address to be used for read/write
+        :param length: Available buffer size
+        :param lkey: Local key of the used MR/DMMR
+        :return: An SGE object
+        """
+        super().__init__()
+        self.sge = <v.ibv_sge*>malloc(sizeof(v.ibv_sge))
+        if self.sge == NULL:
+            raise PyverbsError('Failed to allocate an SGE')
+        self.sge.addr = addr
+        self.sge.length = length
+        self.sge.lkey = lkey
+
+    def __dealloc__(self):
+        self.close()
+
+    cpdef close(self):
+        free(self.sge)
+
+    cpdef read(self, length, offset):
+        """
+        Reads <length> bytes of data starting at <offset> bytes from the
+        SGE's address.
+        :param length: How many bytes to read
+        :param offset: Offset from the SGE's address in bytes
+        :return: The data written at the SGE's address + offset
+        """
+        cdef char *sg_data
+        cdef int off = offset
+        sg_data = <char*>(self.sge.addr + off)
+        return sg_data[:length]
+
+    def __str__(self):
+        print_format = '{:22}: {:<20}\n'
+        return print_format.format('Address', hex(self.sge.addr)) +\
+               print_format.format('Length', self.sge.length) +\
+               print_format.format('Key', hex(self.sge.lkey))
+
+    @property
+    def addr(self):
+        return self.sge.addr
+    @addr.setter
+    def addr(self, val):
+        self.sge.addr = val
+
+    @property
+    def length(self):
+        return self.sge.length
+    @length.setter
+    def length(self, val):
+        self.sge.length = val
+
+    @property
+    def lkey(self):
+        return self.sge.lkey
+    @lkey.setter
+    def lkey(self, val):
+        self.sge.lkey = val
+
+
+cdef class RecvWR(PyverbsCM):
+    def __init__(self, wr_id=0, num_sge=0, sg=None,
+                 RecvWR next_wr=None):
+        """
+        Initializes a RecvWR object.
+        :param wr_id: A user-defined WR ID
+        :param num_sge: Size of the scatter-gather array
+        :param sg: A scatter-gather array
+        :param next_wr: The next WR in the list
+        :return: A RecvWR object
+        """
+        super().__init__()
+        cdef v.ibv_sge *dst
+        if num_sge < 1 or sg is None:
+            raise PyverbsUserError('A WR needs at least one SGE')
+        self.recv_wr.sg_list = <v.ibv_sge*>malloc(num_sge * sizeof(v.ibv_sge))
+        if self.recv_wr.sg_list == NULL:
+            raise PyverbsRDMAErrno('Failed to malloc SG buffer')
+        dst = self.recv_wr.sg_list
+        copy_sg_array(dst, sg, num_sge)
+        self.recv_wr.num_sge = num_sge
+        self.recv_wr.wr_id = wr_id
+        if next_wr is not None:
+            self.recv_wr.next = &next_wr.recv_wr
+
+    def __dealloc__(self):
+        self.close()
+
+    cpdef close(self):
+        free(self.recv_wr.sg_list)
+
+    def __str__(self):
+        print_format = '{:22}: {:<20}\n'
+        return print_format.format('WR ID', self.recv_wr.wr_id) +\
+               print_format.format('Num SGE', self.recv_wr.num_sge)
+
+    @property
+    def next_wr(self):
+        if self.recv_wr.next == NULL:
+            return None
+        val = RecvWR()
+        val.recv_wr = self.recv_wr.next[0]
+        return val
+    @next_wr.setter
+    def next_wr(self, RecvWR val not None):
+        self.recv_wr.next = &val.recv_wr
+
+    @property
+    def wr_id(self):
+        return self.recv_wr.wr_id
+    @wr_id.setter
+    def wr_id(self, val):
+        self.recv_wr.wr_id = val
+
+    @property
+    def num_sge(self):
+        return self.recv_wr.num_sge
+    @num_sge.setter
+    def num_sge(self, val):
+        self.recv_wr.num_sge = val
+
+
+cdef class SendWR(PyverbsCM):
+    def __init__(self, wr_id=0, opcode=e.IBV_WR_SEND, num_sge=0, sg=None,
+                 send_flags=e.IBV_SEND_SIGNALED, SendWR next_wr=None):
+        """
+        Initializes a SendWR object with user-provided or default values.
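+        A short sketch (hypothetical names, assuming a ready QP ``qp`` and a
+        registered MR ``mr``):
+            sge = SGE(mr.buf, 8, mr.lkey)
+            qp.post_send(SendWR(opcode=e.IBV_WR_SEND, num_sge=1, sg=[sge]))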
+        :param wr_id: A user-defined WR ID
+        :param opcode: The WR's opcode
+        :param num_sge: Number of scatter-gather elements in the WR
+        :param sg: An SGE element, head of the scatter-gather list
+        :param send_flags: Send flags as defined in the ibv_send_flags enum
+        :param next_wr: The next WR in the list
+        :return: An initialized SendWR object
+        """
+        cdef v.ibv_sge *dst
+
+        super().__init__()
+        if num_sge < 1 or sg is None:
+            raise PyverbsUserError('A WR needs at least one SGE')
+        self.send_wr.sg_list = <v.ibv_sge*>malloc(num_sge * sizeof(v.ibv_sge))
+        if self.send_wr.sg_list == NULL:
+            raise PyverbsRDMAErrno('Failed to malloc SG buffer')
+        dst = self.send_wr.sg_list
+        copy_sg_array(dst, sg, num_sge)
+        self.send_wr.num_sge = num_sge
+        self.send_wr.wr_id = wr_id
+        if next_wr is not None:
+            self.send_wr.next = &next_wr.send_wr
+        self.send_wr.opcode = opcode
+        self.send_wr.send_flags = send_flags
+        self.ah = None
+
+    def __dealloc__(self):
+        self.close()
+
+    cpdef close(self):
+        free(self.send_wr.sg_list)
+
+    def __str__(self):
+        print_format = '{:22}: {:<20}\n'
+        return print_format.format('WR ID', self.send_wr.wr_id) +\
+               print_format.format('Num SGE', self.send_wr.num_sge) +\
+               print_format.format('Opcode', self.send_wr.opcode) +\
+               print_format.format('Send flags',
+                                   send_flags_to_str(self.send_wr.send_flags))
+
+    @property
+    def next_wr(self):
+        if self.send_wr.next == NULL:
+            return None
+        val = SendWR()
+        val.send_wr = self.send_wr.next[0]
+        return val
+    @next_wr.setter
+    def next_wr(self, SendWR val not None):
+        self.send_wr.next = &val.send_wr
+
+    @property
+    def wr_id(self):
+        return self.send_wr.wr_id
+    @wr_id.setter
+    def wr_id(self, val):
+        self.send_wr.wr_id = val
+
+    @property
+    def num_sge(self):
+        return self.send_wr.num_sge
+    @num_sge.setter
+    def num_sge(self, val):
+        self.send_wr.num_sge = val
+
+    @property
+    def opcode(self):
+        return self.send_wr.opcode
+    @opcode.setter
+    def opcode(self, val):
+        self.send_wr.opcode = val
+
+    @property
+    def send_flags(self):
+        return self.send_wr.send_flags
+    @send_flags.setter
+    def send_flags(self, val):
+        self.send_wr.send_flags = val
+
+    property sg_list:
+        def __set__(self, SGE val not None):
+            self.send_wr.sg_list = val.sge
+
+    def set_wr_ud(self, AH ah not None, rqpn, rqkey):
+        """
+        Set the members of the ud struct in the send_wr's wr union.
+        :param ah: An address handle object
+        :param rqpn: The remote QP number
+        :param rqkey: The remote QKey, authorizing access to the destination QP
+        :return: None
+        """
+        self.ah = ah
+        self.send_wr.wr.ud.ah = ah.ah
+        self.send_wr.wr.ud.remote_qpn = rqpn
+        self.send_wr.wr.ud.remote_qkey = rqkey
+
+    def set_wr_rdma(self, rkey, addr):
+        """
+        Set the members of the rdma struct in the send_wr's wr union, used for
+        RDMA extended transport header creation.
+        :param rkey: Key to access the specified memory address.
+        :param addr: Start address of the buffer
+        :return: None
+        """
+        self.send_wr.wr.rdma.remote_addr = addr
+        self.send_wr.wr.rdma.rkey = rkey
+
+    def set_wr_atomic(self, rkey, addr, compare_add, swap=0):
+        """
+        Set the members of the atomic struct in the send_wr's wr union, used
+        for the atomic extended transport header.
+        :param rkey: Key to access the specified memory address.
+        :param addr: Start address of the buffer
+        :param compare_add: The data operand used in the compare portion of the
+                            compare and swap operation
+        :param swap: The data operand used in atomic operations:
+                     - In compare and swap this field is swapped into the
+                       addressed buffer
+                     - In fetch and add this field is added to the contents of
+                       the addressed buffer
+        :return: None
+        """
+        self.send_wr.wr.atomic.remote_addr = addr
+        self.send_wr.wr.atomic.rkey = rkey
+        self.send_wr.wr.atomic.compare_add = compare_add
+        self.send_wr.wr.atomic.swap = swap
+
+    def set_qp_type_xrc(self, remote_srqn):
+        """
+        Set the members of the xrc struct in the send_wr's qp_type union, used
+        for the XRC extended transport header.
+        :param remote_srqn: The XRC SRQ number to be used by the responder for
+                            this packet
+        :return: None
+        """
+        self.send_wr.qp_type.xrc.remote_srqn = remote_srqn
+
+def send_flags_to_str(flags):
+    send_flags = {e.IBV_SEND_FENCE: 'IBV_SEND_FENCE',
+                  e.IBV_SEND_SIGNALED: 'IBV_SEND_SIGNALED',
+                  e.IBV_SEND_SOLICITED: 'IBV_SEND_SOLICITED',
+                  e.IBV_SEND_INLINE: 'IBV_SEND_INLINE',
+                  e.IBV_SEND_IP_CSUM: 'IBV_SEND_IP_CSUM'}
+    flags_str = ''
+    for f in send_flags:
+        if flags & f:
+            flags_str += send_flags[f]
+            flags_str += ' '
+    return flags_str
+
+
+cdef copy_sg_array(v.ibv_sge *dst, sg, num_sge):
+    cdef v.ibv_sge *src
+    for i in range(num_sge):
+        src = (<SGE>sg[i]).sge
+        memcpy(dst, src, sizeof(v.ibv_sge))
+        dst += 1
diff --git a/pyverbs/xrcd.pxd b/pyverbs/xrcd.pxd
new file mode 100755
index 0000000..3897c28
--- /dev/null
+++ b/pyverbs/xrcd.pxd
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
+# Copyright (c) 2019, Mellanox Technologies. All rights reserved.
+
+#cython: language_level=3
+
+from pyverbs.base cimport PyverbsCM, PyverbsObject
+from pyverbs.device cimport Context
+cimport pyverbs.libibverbs as v
+
+
+cdef class XRCDInitAttr(PyverbsObject):
+    cdef v.ibv_xrcd_init_attr attr
+
+
+cdef class XRCD(PyverbsCM):
+    cdef v.ibv_xrcd *xrcd
+    cdef Context ctx
+    cdef add_ref(self, obj)
+    cdef object srqs
+    cdef object qps
diff --git a/pyverbs/xrcd.pyx b/pyverbs/xrcd.pyx
new file mode 100755
index 0000000..450276f
--- /dev/null
+++ b/pyverbs/xrcd.pyx
@@ -0,0 +1,91 @@
+# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
+# Copyright (c) 2019, Mellanox Technologies. All rights reserved.
+import weakref
+
+from pyverbs.pyverbs_error import PyverbsError, PyverbsRDMAError
+from pyverbs.base import PyverbsRDMAErrno
+from pyverbs.base cimport close_weakrefs
+from pyverbs.device cimport Context
+from pyverbs.srq cimport SRQ
+from pyverbs.qp cimport QP
+
+
+cdef class XRCDInitAttr(PyverbsObject):
+    def __init__(self, comp_mask, oflags, fd):
+        super().__init__()
+        self.attr.fd = fd
+        self.attr.comp_mask = comp_mask
+        self.attr.oflags = oflags
+
+    @property
+    def fd(self):
+        return self.attr.fd
+    @fd.setter
+    def fd(self, val):
+        self.attr.fd = val
+
+    @property
+    def comp_mask(self):
+        return self.attr.comp_mask
+    @comp_mask.setter
+    def comp_mask(self, val):
+        self.attr.comp_mask = val
+
+    @property
+    def oflags(self):
+        return self.attr.oflags
+    @oflags.setter
+    def oflags(self, val):
+        self.attr.oflags = val
+
+
+cdef class XRCD(PyverbsCM):
+    def __init__(self, Context context not None, XRCDInitAttr init_attr not None):
+        """
+        Initializes an XRCD object.
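+        A typical sketch (hypothetical path; assumes ``import os`` and
+        ``pyverbs.libibverbs_enums`` imported as ``e``, with an open
+        Context ``ctx``):
+            fd = os.open('/tmp/xrc_domain', os.O_RDONLY | os.O_CREAT)
+            attr = XRCDInitAttr(e.IBV_XRCD_INIT_ATTR_FD |
+                                e.IBV_XRCD_INIT_ATTR_OFLAGS, os.O_CREAT, fd)
+            xrcd = XRCD(ctx, attr)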
+ :param context: The Context object creating the XRCD + :return: The newly created XRCD on success + """ + super().__init__() + self.xrcd = v.ibv_open_xrcd(<v.ibv_context*> context.context, + &init_attr.attr) + if self.xrcd == NULL: + raise PyverbsRDMAErrno('Failed to allocate XRCD') + self.ctx = context + context.add_ref(self) + self.logger.debug('XRCD: Allocated ibv_xrcd') + self.srqs = weakref.WeakSet() + self.qps = weakref.WeakSet() + + def __dealloc__(self): + """ + Closes the inner XRCD. + :return: None + """ + self.close() + + cpdef close(self): + """ + Closes the underlying C object of the XRCD. + :return: None + """ + # XRCD may be deleted directly or indirectly by closing its context, + # which leaves the Python XRCD object without the underlying C object, + # so during destruction, need to check whether or not the C object + # exists. + if self.xrcd != NULL: + self.logger.debug('Closing XRCD') + close_weakrefs([self.qps, self.srqs]) + rc = v.ibv_close_xrcd(self.xrcd) + if rc != 0: + raise PyverbsRDMAError('Failed to dealloc XRCD', rc) + self.xrcd = NULL + self.ctx = None + + cdef add_ref(self, obj): + if isinstance(obj, QP): + self.qps.add(obj) + elif isinstance(obj, SRQ): + self.srqs.add(obj) + else: + raise PyverbsError('Unrecognized object type') diff --git a/rdma-ndd/CMakeLists.txt b/rdma-ndd/CMakeLists.txt new file mode 100644 index 0000000..e8419a6 --- /dev/null +++ b/rdma-ndd/CMakeLists.txt @@ -0,0 +1,27 @@ +# COPYRIGHT (c) 2016 Intel Corporation. +# Licensed under BSD (MIT variant) or GPLv2. See COPYING. + +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") + +rdma_sbin_executable(rdma-ndd + rdma-ndd.c + ) + +target_link_libraries(rdma-ndd LINK_PRIVATE + ${SYSTEMD_LIBRARIES} + ${UDEV_LIBRARIES} + ) + +# FIXME Autogenerate from the .rst +rdma_man_pages( + rdma-ndd.8.in + ) + +install(FILES "rdma-ndd.rules" + RENAME "60-rdma-ndd.rules" + DESTINATION "${CMAKE_INSTALL_UDEV_RULESDIR}") + +rdma_subst_install(FILES "rdma-ndd.service.in" + DESTINATION "${CMAKE_INSTALL_SYSTEMD_SERVICEDIR}" + RENAME "rdma-ndd.service") + diff --git a/rdma-ndd/rdma-ndd.8.in b/rdma-ndd/rdma-ndd.8.in new file mode 100644 index 0000000..d6c75d3 --- /dev/null +++ b/rdma-ndd/rdma-ndd.8.in @@ -0,0 +1,91 @@ +.\" Man page generated from reStructuredText. +. +.TH RDMA-NDD 8 "@BUILD_DATE@" "" "OpenIB Diagnostics" +.SH NAME +RDMA-NDD \- RDMA device Node Description update daemon +. +.nr rst2man-indent-level 0 +. +.de1 rstReportMargin +\\$1 \\n[an-margin] +level \\n[rst2man-indent-level] +level margin: \\n[rst2man-indent\\n[rst2man-indent-level]] +- +\\n[rst2man-indent0] +\\n[rst2man-indent1] +\\n[rst2man-indent2] +.. +.de1 INDENT +.\" .rstReportMargin pre: +. RS \\$1 +. nr rst2man-indent\\n[rst2man-indent-level] \\n[an-margin] +. nr rst2man-indent-level +1 +.\" .rstReportMargin post: +.. +.de UNINDENT +. RE +.\" indent \\n[an-margin] +.\" old: \\n[rst2man-indent\\n[rst2man-indent-level]] +.nr rst2man-indent-level -1 +.\" new: \\n[rst2man-indent\\n[rst2man-indent-level]] +.in \\n[rst2man-indent\\n[rst2man-indent-level]]u +.. +.SH SYNOPSIS +.sp +rdma\-ndd <options> +.SH DESCRIPTION +.sp +rdma\-ndd is a system daemon which watches for rdma device changes and/or +hostname changes and updates the Node Description of the rdma devices based on +those changes. +.SH DETAILS +.sp +Full operation of this daemon requires kernels which support polling of the +procfs hostname file as well as libudev. 
+.sp
+If your system does not support either of these features, the daemon will set
+the Node Descriptions at startup and then sleep forever.
+.SS Node Description configuration
+.sp
+The daemon uses the environment variable RDMA_NDD_ND_FORMAT to set the node
+description. The following wildcards can be specified for more dynamic
+control.
+.sp
+%h \-\- replace with the current hostname (not including domain)
+.sp
+%d \-\- replace with the device name (for example mlx4_0, qib0, etc.)
+.sp
+If not specified, the default is "%h %d".
+.sp
+NOTE: At startup, and on new device detection, the Node Description is always
+written to ensure the SM and rdma\-ndd are in sync. Subsequent events will only
+write the Node Description on a device if it has changed.
+.SS Using systemd
+.sp
+Setting the environment variable for the daemon is normally done via a
+systemd drop\-in unit. For example, the following could be added to a file named
+/etc/systemd/system/rdma\-ndd.service.d/nd\-format.conf to use only the
+hostname as your node description.
+.sp
+[Service]
+Environment="RDMA_NDD_ND_FORMAT=%%h"
+.sp
+NOTE: Systemd requires an extra \(aq%\(aq.
+.SH OPTIONS
+.sp
+\fB\-f, \-\-foreground\fP
+Run in the foreground instead of as a daemon
+.sp
+\fB\-d, \-\-debugging\fP
+Log additional debugging information to syslog
+.sp
+\fB\-\-systemd\fP
+Enable systemd integration.
+.SH AUTHOR
+.INDENT 0.0
+.TP
+.B Ira Weiny
+< \fI\%ira.weiny@intel.com\fP >
+.UNINDENT
+.\" Generated by docutils manpage writer.
+.
diff --git a/rdma-ndd/rdma-ndd.8.in.rst b/rdma-ndd/rdma-ndd.8.in.rst
new file mode 100644
index 0000000..767f388
--- /dev/null
+++ b/rdma-ndd/rdma-ndd.8.in.rst
@@ -0,0 +1,85 @@
+========
+RDMA-NDD
+========
+
+------------------------------------------
+RDMA device Node Description update daemon
+------------------------------------------
+
+:Date: @BUILD_DATE@
+:Manual section: 8
+:Manual group: OpenIB Diagnostics
+
+
+SYNOPSIS
+========
+
+rdma-ndd <options>
+
+DESCRIPTION
+===========
+
+rdma-ndd is a system daemon which watches for rdma device changes and/or
+hostname changes and updates the Node Description of the rdma devices based on
+those changes.
+
+
+DETAILS
+=======
+
+Full operation of this daemon requires kernels which support polling of the
+procfs hostname file as well as libudev.
+
+If your system does not support either of these features, the daemon will set
+the Node Descriptions at startup and then sleep forever.
+
+
+Node Description configuration
+------------------------------
+
+The daemon uses the environment variable RDMA_NDD_ND_FORMAT to set the node
+description. The following wildcards can be specified for more dynamic
+control.
+
+%h -- replace with the current hostname (not including domain)
+
+%d -- replace with the device name (for example mlx4_0, qib0, etc.)
+
+If not specified, the default is "%h %d".
+
+NOTE: At startup, and on new device detection, the Node Description is always
+written to ensure the SM and rdma-ndd are in sync. Subsequent events will only
+write the Node Description on a device if it has changed.
+
+Using systemd
+-------------
+
+Setting the environment variable for the daemon is normally done via a
+systemd drop-in unit. For example, the following could be added to a file named
+/etc/systemd/system/rdma-ndd.service.d/nd-format.conf to use only the
+hostname as your node description.
+
+[Service]
+Environment="RDMA_NDD_ND_FORMAT=%%h"
+
+NOTE: Systemd requires an extra '%'.
+ + +OPTIONS +======= + +**-f, --foreground** +Run in the foreground instead of as a daemon + +**-d, --debugging** +Log additional debugging information to syslog + +**--systemd** +Enable systemd integration. + + +AUTHOR +====== + +Ira Weiny + < ira.weiny@intel.com > diff --git a/rdma-ndd/rdma-ndd.c b/rdma-ndd/rdma-ndd.c new file mode 100644 index 0000000..8a98270 --- /dev/null +++ b/rdma-ndd/rdma-ndd.c @@ -0,0 +1,348 @@ +/* + * Copyright (c) 2014,2016 Intel Corporation. All Rights Reserved + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <config.h> + +#include <poll.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <assert.h> +#include <string.h> +#include <limits.h> +#include <stdio.h> +#include <syslog.h> +#include <dirent.h> +#include <errno.h> +#include <unistd.h> +#include <getopt.h> +#include <stdlib.h> +#include <stdbool.h> + +#include <systemd/sd-daemon.h> +#include <libudev.h> + +static struct udev *g_udev; +static struct udev_monitor *g_mon; + +#define SYS_HOSTNAME "/proc/sys/kernel/hostname" +#define SYS_INFINIBAND "/sys/class/infiniband" +#define DEFAULT_ND_FORMAT "%h %d" + +static char *g_nd_format = NULL; +static bool debugging; + +static void newline_to_null(char *str) +{ + char *term = index(str, '\n'); + if (term) + *term = '\0'; +} + +static void strip_domain(char *str) +{ + char *term = index(str, '.'); + if (term) + *term = '\0'; +} + +static __attribute__((format(printf, 1, 2))) void dbg_log(const char *fmt, ...) 
+{ + va_list ap; + + if (!debugging) + return; + + va_start(ap, fmt); + vsyslog(LOG_DEBUG, fmt, ap); + va_end(ap); +} + +static void build_node_desc(char *dest, size_t len, + const char *device, const char *hostname) +{ + char *end = dest + len-1; + const char *field; + char *src = g_nd_format; + + while (*src && (dest < end)) { + if (*src != '%') { + *dest++ = *src++; + } else { + src++; + switch (*src) { + case 'h': + field = hostname; + while (*field && (*field != '.') && (dest < end)) + *dest++ = *field++; + break; + case 'd': + field = device; + while (*field && (dest < end)) + *dest++ = *field++; + break; + } + src++; + } + } + *dest = 0; +} + +static int update_node_desc(const char *device, const char *hostname, int force) +{ + int rc; + char nd[128]; + char new_nd[64]; + char nd_file[PATH_MAX]; + FILE *f; + + snprintf(nd_file, sizeof(nd_file), SYS_INFINIBAND "/%s/node_desc", + device); + nd_file[sizeof(nd_file)-1] = '\0'; + + f = fopen(nd_file, "r+"); + if (!f) + return -EIO; + + if (!fgets(nd, sizeof(nd), f)) { + syslog(LOG_ERR, "Failed to read %s\n", nd_file); + rc = -EIO; + goto error; + } + newline_to_null(nd); + + build_node_desc(new_nd, sizeof(new_nd), device, hostname); + + if (!force && strncmp(new_nd, nd, sizeof(new_nd)) == 0) { + dbg_log("%s: no change (%s)\n", device, new_nd); + } else { + dbg_log("%s: change (%s) -> (%s)\n", device, nd, new_nd); + rewind(f); + fprintf(f, "%s", new_nd); + } + + rc = 0; +error: + fclose(f); + return rc; +} + +static void set_rdma_node_desc(const char *hostname, int force) +{ + DIR *class_dir; + struct dirent *dent; + + class_dir = opendir(SYS_INFINIBAND); + if (!class_dir) { + syslog(LOG_ERR, "Failed to open " SYS_INFINIBAND); + return; + } + + while ((dent = readdir(class_dir))) { + if (dent->d_name[0] == '.') + continue; + + if (update_node_desc(dent->d_name, hostname, force)) + syslog(LOG_DEBUG, "set Node Description failed on %s\n", + dent->d_name); + } + + closedir(class_dir); +} + +static void read_hostname(int fd, char *name, size_t len) +{ + memset(name, 0, len); + if (read(fd, name, len-1) >= 0) { + newline_to_null(name); + strip_domain(name); + } else { + syslog(LOG_ERR, "Read %s Failed\n", SYS_HOSTNAME); + } + lseek(fd, 0, SEEK_SET); +} + +static void setup_udev(void) +{ + g_udev = udev_new(); + if (!g_udev) { + syslog(LOG_ERR, "udev_new failed\n"); + return; + } +} + +static int get_udev_fd(void) +{ + g_mon = udev_monitor_new_from_netlink(g_udev, "udev"); + if (!g_mon) { + syslog(LOG_ERR, "udev monitoring failed\n"); + return -1; + } + + udev_monitor_filter_add_match_subsystem_devtype(g_mon, "infiniband", NULL); + udev_monitor_enable_receiving(g_mon); + return udev_monitor_get_fd(g_mon); +} + +static void process_udev_event(int ud_fd, const char *hostname) +{ + struct udev_device *dev; + + dev = udev_monitor_receive_device(g_mon); + if (dev) { + const char *device = udev_device_get_sysname(dev); + const char *action = udev_device_get_action(dev); + + dbg_log("Device event: %s, %s, %s\n", + udev_device_get_subsystem(dev), device, action); + + if (device && action && + (!strncmp(action, "add", sizeof("add")) || + !strncmp(action, "move", sizeof("add")))) + if (update_node_desc(device, hostname, 1)) + syslog(LOG_DEBUG, "set Node Description failed on %s\n", + device); + + udev_device_unref(dev); + } +} + +static void monitor(bool systemd) +{ + char hostname[128]; + int hn_fd; + struct pollfd fds[2]; + int numfds = 1; + int ud_fd; + + hn_fd = open(SYS_HOSTNAME, O_RDONLY); + if (hn_fd < 0) { + syslog(LOG_ERR, "Open %s Failed 
exiting\n", + SYS_HOSTNAME); + exit(EXIT_FAILURE); + } + + read_hostname(hn_fd, hostname, sizeof(hostname)); + set_rdma_node_desc((const char *)hostname, 1); + + fds[0].fd = hn_fd; + fds[0].events = 0; + + ud_fd = get_udev_fd(); + if (ud_fd >= 0) + numfds = 2; + + fds[1].fd = ud_fd; + fds[1].events = POLLIN; + + if (systemd) + sd_notify(0, "READY=1"); + + while (1) { + if (poll(fds, numfds, -1) <= 0) { + syslog(LOG_ERR, "Poll %s failed; exiting\n", SYS_HOSTNAME); + exit(EXIT_FAILURE); + } + + if (fds[0].revents != 0) { + read_hostname(hn_fd, hostname, sizeof(hostname)); + dbg_log("Hostname event: %s\n", hostname); + set_rdma_node_desc((const char *)hostname, 0); + } + + if (fds[1].revents != 0) + process_udev_event(ud_fd, hostname); + } +} + +int main(int argc, char *argv[]) +{ + bool foreground = false; + bool systemd = false; + + openlog(NULL, LOG_NDELAY | LOG_CONS | LOG_PID, LOG_DAEMON); + + while (1) { + static const struct option long_opts[] = { + { "foreground", 0, NULL, 'f' }, + { "systemd", 0, NULL, 's' }, + { "help", 0, NULL, 'h' }, + { "debug", 0, NULL, 'd' }, + { } + }; + + int c = getopt_long(argc, argv, "fh", long_opts, NULL); + if (c == -1) + break; + + switch (c) { + case 'f': + foreground = true; + break; + case 's': + systemd = true; + break; + case 'd': + debugging = true; + break; + case 'h': + printf("rdma-ndd [options]\n"); + printf(" See 'man rdma-ndd' for details\n"); + return 0; + default: + break; + + } + } + + if (!foreground && !systemd) { + if (daemon(0, 0) != 0) { + syslog(LOG_ERR, "Failed to daemonize\n"); + return EXIT_FAILURE; + } + } + + setup_udev(); + + g_nd_format = getenv("RDMA_NDD_ND_FORMAT"); + if (g_nd_format && strncmp("", g_nd_format, strlen(g_nd_format)) != 0) + g_nd_format = strdup(g_nd_format); + else + g_nd_format = strdup(DEFAULT_ND_FORMAT); + + dbg_log("Node Descriptor format (%s)\n", g_nd_format); + + monitor(systemd); + + return 0; +} diff --git a/rdma-ndd/rdma-ndd.rules b/rdma-ndd/rdma-ndd.rules new file mode 100644 index 0000000..ef7768e --- /dev/null +++ b/rdma-ndd/rdma-ndd.rules @@ -0,0 +1,3 @@ +# If an InfiniBand/RDMA device is installed with a writable node_description +# sysfs then start rdma-ndd to keep it up to date +SUBSYSTEM=="infiniband", TAG+="systemd", ATTRS{node_desc}=="*", ENV{SYSTEMD_WANTS}+="rdma-ndd.service" diff --git a/rdma-ndd/rdma-ndd.service.in b/rdma-ndd/rdma-ndd.service.in new file mode 100644 index 0000000..a63399a --- /dev/null +++ b/rdma-ndd/rdma-ndd.service.in @@ -0,0 +1,24 @@ +[Unit] +Description=RDMA Node Description Daemon +Documentation=man:rdma-ndd +StopWhenUnneeded=yes +# rdma-ndd is a kernel support program and needs to run as early as possible, +# before the network link is brought up, and before an external manager tries +# to read the local node description. 
+DefaultDependencies=no +Before=sysinit.target +# Do not execute concurrently with an ongoing shutdown (required for DefaultDependencies=no) +Conflicts=shutdown.target +Before=shutdown.target +# Networking, particularly link up, should not happen until ndd is ready +Wants=network-pre.target +Before=network-pre.target +# rdma-hw is not ready until ndd is running +Before=rdma-hw.target + +[Service] +Type=notify +Restart=always +ExecStart=@CMAKE_INSTALL_FULL_SBINDIR@/rdma-ndd --systemd + +# rdma-ndd is automatically wanted by udev when an RDMA device with a node description is present diff --git a/redhat/rdma-core.spec b/redhat/rdma-core.spec new file mode 100644 index 0000000..dcd6ad3 --- /dev/null +++ b/redhat/rdma-core.spec @@ -0,0 +1,663 @@ +Name: rdma-core +Version: 29.0 +Release: 1%{?dist} +Summary: RDMA core userspace libraries and daemons + +# Almost everything is licensed under the OFA dual GPLv2, 2 Clause BSD license +# providers/ipathverbs/ Dual licensed using a BSD license with an extra patent clause +# providers/rxe/ Incorporates code from ipathverbs and contains the patent clause +# providers/hfi1verbs Uses the 3 Clause BSD license +License: GPLv2 or BSD +Url: https://github.com/linux-rdma/rdma-core +Source: rdma-core-%{version}.tgz +# Do not build static libs by default. +%define with_static %{?_with_static: 1} %{?!_with_static: 0} + +# 32-bit arm is missing required arch-specific memory barriers, +ExcludeArch: %{arm} + +BuildRequires: binutils +BuildRequires: cmake >= 2.8.11 +BuildRequires: gcc +BuildRequires: libudev-devel +BuildRequires: pkgconfig +BuildRequires: pkgconfig(libnl-3.0) +BuildRequires: pkgconfig(libnl-route-3.0) +BuildRequires: /usr/bin/rst2man +BuildRequires: valgrind-devel +BuildRequires: systemd +BuildRequires: systemd-devel +%if 0%{?fedora} >= 32 +%define with_pyverbs %{?_with_pyverbs: 0} %{?!_with_pyverbs: 1} +%else +%define with_pyverbs %{?_with_pyverbs: 1} %{?!_with_pyverbs: 0} +%endif +%if %{with_pyverbs} +BuildRequires: python3-devel +BuildRequires: python3-Cython +%else +%if 0%{?rhel} >= 8 || 0%{?fedora} >= 30 +BuildRequires: python3 +%else +BuildRequires: python +%endif +%endif + +%if 0%{?rhel} >= 8 || 0%{?fedora} >= 30 || %{with_pyverbs} +BuildRequires: python3-docutils +%else +BuildRequires: python-docutils +%endif + +%if 0%{?fedora} >= 21 || 0%{?rhel} >= 8 +BuildRequires: perl-generators +%endif + +Requires: dracut, kmod, systemd, pciutils +# Red Hat/Fedora previously shipped redhat/ as a stand-alone +# package called 'rdma', which we're supplanting here. +Provides: rdma = %{version}-%{release} +Obsoletes: rdma < %{version}-%{release} +Conflicts: infiniband-diags <= 1.6.7 + +# Since we recommend developers use Ninja, so should packagers, for consistency. +%define CMAKE_FLAGS %{nil} +%if 0%{?fedora} >= 23 || 0%{?rhel} >= 8 +# Ninja was introduced in FC23 +BuildRequires: ninja-build +%define CMAKE_FLAGS -GNinja +%define make_jobs ninja-build -v %{?_smp_mflags} +%define cmake_install DESTDIR=%{buildroot} ninja-build install +%else +# Fallback to make otherwise +BuildRequires: make +%define make_jobs make VERBOSE=1 %{?_smp_mflags} +%define cmake_install DESTDIR=%{buildroot} make install +%endif + +%if 0%{?fedora} >= 25 || 0%{?rhel} >= 8 +# pandoc was introduced in FC25, Centos8 +BuildRequires: pandoc +%endif + +%description +RDMA core userspace infrastructure and documentation, including initialization +scripts, kernel driver-specific modprobe override configs, IPoIB network +scripts, dracut rules, and the rdma-ndd utility. 
+ +%package devel +Summary: RDMA core development libraries and headers +Requires: %{name}%{?_isa} = %{version}-%{release} +Requires: libibverbs%{?_isa} = %{version}-%{release} +Provides: libibverbs-devel = %{version}-%{release} +Obsoletes: libibverbs-devel < %{version}-%{release} +Requires: libibumad%{?_isa} = %{version}-%{release} +Provides: libibumad-devel = %{version}-%{release} +Obsoletes: libibumad-devel < %{version}-%{release} +Requires: librdmacm%{?_isa} = %{version}-%{release} +Provides: librdmacm-devel = %{version}-%{release} +Obsoletes: librdmacm-devel < %{version}-%{release} +Requires: ibacm%{?_isa} = %{version}-%{release} +Provides: ibacm-devel = %{version}-%{release} +Obsoletes: ibacm-devel < %{version}-%{release} +Requires: infiniband-diags%{?_isa} = %{version}-%{release} +Provides: infiniband-diags-devel = %{version}-%{release} +Obsoletes: infiniband-diags-devel < %{version}-%{release} +Provides: libibmad-devel = %{version}-%{release} +Obsoletes: libibmad-devel < %{version}-%{release} +%if %{with_static} +# Since our pkg-config files include private references to these packages they +# need to have their .pc files installed too, even for dynamic linking, or +# pkg-config breaks. +BuildRequires: pkgconfig(libnl-3.0) +BuildRequires: pkgconfig(libnl-route-3.0) +%endif + +%description devel +RDMA core development libraries and headers. + +%package -n infiniband-diags +Summary: InfiniBand Diagnostic Tools +Provides: perl(IBswcountlimits) +Provides: libibmad = %{version}-%{release} +Obsoletes: libibmad < %{version}-%{release} +Obsoletes: openib-diags < 1.3 + +%description -n infiniband-diags +This package provides IB diagnostic programs and scripts needed to diagnose an +IB subnet. infiniband-diags now also provides libibmad. libibmad provides +low layer IB functions for use by the IB diagnostic and management +programs. These include MAD, SA, SMP, and other basic IB functions. + +%package -n infiniband-diags-compat +Summary: OpenFabrics Alliance InfiniBand Diagnostic Tools + +%description -n infiniband-diags-compat +Deprecated scripts and utilities which provide duplicated functionality, most +often at a reduced performance. These are maintained for the time being for +compatibility reasons. 
+ +%package -n libibverbs +Summary: A library and drivers for direct userspace use of RDMA (InfiniBand/iWARP/RoCE) hardware +Requires(post): /sbin/ldconfig +Requires(postun): /sbin/ldconfig +Requires: %{name}%{?_isa} = %{version}-%{release} +Provides: libcxgb4 = %{version}-%{release} +Obsoletes: libcxgb4 < %{version}-%{release} +Provides: libefa = %{version}-%{release} +Obsoletes: libefa < %{version}-%{release} +Provides: libhfi1 = %{version}-%{release} +Obsoletes: libhfi1 < %{version}-%{release} +Provides: libi40iw = %{version}-%{release} +Obsoletes: libi40iw < %{version}-%{release} +Provides: libipathverbs = %{version}-%{release} +Obsoletes: libipathverbs < %{version}-%{release} +Provides: libmlx4 = %{version}-%{release} +Obsoletes: libmlx4 < %{version}-%{release} +Provides: libmlx5 = %{version}-%{release} +Obsoletes: libmlx5 < %{version}-%{release} +Provides: libmthca = %{version}-%{release} +Obsoletes: libmthca < %{version}-%{release} +Provides: libocrdma = %{version}-%{release} +Obsoletes: libocrdma < %{version}-%{release} +Provides: librxe = %{version}-%{release} +Obsoletes: librxe < %{version}-%{release} + +%description -n libibverbs +libibverbs is a library that allows userspace processes to use RDMA +"verbs" as described in the InfiniBand Architecture Specification and +the RDMA Protocol Verbs Specification. This includes direct hardware +access from userspace to InfiniBand/iWARP adapters (kernel bypass) for +fast path operations. + +Device-specific plug-in ibverbs userspace drivers are included: + +- libcxgb4: Chelsio T4 iWARP HCA +- libefa: Amazon Elastic Fabric Adapter +- libhfi1: Intel Omni-Path HFI +- libhns: HiSilicon Hip06 SoC +- libi40iw: Intel Ethernet Connection X722 RDMA +- libipathverbs: QLogic InfiniPath HCA +- libmlx4: Mellanox ConnectX-3 InfiniBand HCA +- libmlx5: Mellanox Connect-IB/X-4+ InfiniBand HCA +- libmthca: Mellanox InfiniBand HCA +- libocrdma: Emulex OneConnect RDMA/RoCE Device +- libqedr: QLogic QL4xxx RoCE HCA +- librxe: A software implementation of the RoCE protocol +- libsiw: A software implementation of the iWarp protocol +- libvmw_pvrdma: VMware paravirtual RDMA device + +%package -n libibverbs-utils +Summary: Examples for the libibverbs library +Requires: libibverbs%{?_isa} = %{version}-%{release} + +%description -n libibverbs-utils +Useful libibverbs example programs such as ibv_devinfo, which +displays information about RDMA devices. + +%package -n ibacm +Summary: InfiniBand Communication Manager Assistant +Requires(post): systemd-units +Requires(preun): systemd-units +Requires(postun): systemd-units +Requires: %{name}%{?_isa} = %{version}-%{release} + +%description -n ibacm +The ibacm daemon helps reduce the load of managing path record lookups on +large InfiniBand fabrics by providing a user space implementation of what +is functionally similar to an ARP cache. The use of ibacm, when properly +configured, can reduce the SA packet load of a large IB cluster from O(n^2) +to O(n). The ibacm daemon is started and normally runs in the background, +user applications need not know about this daemon as long as their app +uses librdmacm to handle connection bring up/tear down. The librdmacm +library knows how to talk directly to the ibacm daemon to retrieve data. 
+ +%package -n iwpmd +Summary: iWarp Port Mapper userspace daemon +Requires(post): systemd-units +Requires(preun): systemd-units +Requires(postun): systemd-units +Requires: %{name}%{?_isa} = %{version}-%{release} + +%description -n iwpmd +iwpmd provides a userspace service for iWarp drivers to claim +tcp ports through the standard socket interface. + +%package -n libibumad +Summary: OpenFabrics Alliance InfiniBand umad (userspace management datagram) library +Requires: %{name}%{?_isa} = %{version}-%{release} + +%description -n libibumad +libibumad provides the userspace management datagram (umad) library +functions, which sit on top of the umad modules in the kernel. These +are used by the IB diagnostic and management tools, including OpenSM. + +%package -n librdmacm +Summary: Userspace RDMA Connection Manager +Requires: %{name}%{?_isa} = %{version}-%{release} + +%description -n librdmacm +librdmacm provides a userspace RDMA Communication Management API. + +%package -n librdmacm-utils +Summary: Examples for the librdmacm library +Requires: librdmacm%{?_isa} = %{version}-%{release} + +%description -n librdmacm-utils +Example test programs for the librdmacm library. + +%package -n srp_daemon +Summary: Tools for using the InfiniBand SRP protocol devices +Obsoletes: srptools <= 1.0.3 +Provides: srptools = %{version}-%{release} +Obsoletes: openib-srptools <= 0.0.6 +Requires(post): systemd-units +Requires(preun): systemd-units +Requires(postun): systemd-units +Requires: %{name}%{?_isa} = %{version}-%{release} + +%description -n srp_daemon +In conjunction with the kernel ib_srp driver, srp_daemon allows you to +discover and use SCSI devices via the SCSI RDMA Protocol over InfiniBand. + +%if %{with_pyverbs} +%package -n python3-pyverbs +Summary: Python3 API over IB verbs +%{?python_provide:%python_provide python3-pyverbs} + +%description -n python3-pyverbs +Pyverbs is a Cython-based Python API over libibverbs, providing an +easy, object-oriented access to IB verbs. +%endif + +%prep +%setup + +%build + +# New RPM defines _rundir, usually as /run +%if 0%{?_rundir:1} +%else +%define _rundir /var/run +%endif + +%{!?EXTRA_CMAKE_FLAGS: %define EXTRA_CMAKE_FLAGS %{nil}} + +# Pass all of the rpm paths directly to GNUInstallDirs and our other defines. 
+%cmake %{CMAKE_FLAGS} \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_BINDIR:PATH=%{_bindir} \ + -DCMAKE_INSTALL_SBINDIR:PATH=%{_sbindir} \ + -DCMAKE_INSTALL_LIBDIR:PATH=%{_libdir} \ + -DCMAKE_INSTALL_LIBEXECDIR:PATH=%{_libexecdir} \ + -DCMAKE_INSTALL_LOCALSTATEDIR:PATH=%{_localstatedir} \ + -DCMAKE_INSTALL_SHAREDSTATEDIR:PATH=%{_sharedstatedir} \ + -DCMAKE_INSTALL_INCLUDEDIR:PATH=%{_includedir} \ + -DCMAKE_INSTALL_INFODIR:PATH=%{_infodir} \ + -DCMAKE_INSTALL_MANDIR:PATH=%{_mandir} \ + -DCMAKE_INSTALL_SYSCONFDIR:PATH=%{_sysconfdir} \ + -DCMAKE_INSTALL_SYSTEMD_SERVICEDIR:PATH=%{_unitdir} \ + -DCMAKE_INSTALL_INITDDIR:PATH=%{_initrddir} \ + -DCMAKE_INSTALL_RUNDIR:PATH=%{_rundir} \ + -DCMAKE_INSTALL_DOCDIR:PATH=%{_docdir}/%{name}-%{version} \ + -DCMAKE_INSTALL_UDEV_RULESDIR:PATH=%{_udevrulesdir} \ + -DCMAKE_INSTALL_PERLDIR:PATH=%{perl_vendorlib} \ + -DENABLE_IBDIAGS_COMPAT:BOOL=True \ +%if %{with_static} + -DENABLE_STATIC=1 \ +%endif + %{EXTRA_CMAKE_FLAGS} \ +%if %{defined __python3} + -DPYTHON_EXECUTABLE:PATH=%{__python3} \ + -DCMAKE_INSTALL_PYTHON_ARCH_LIB:PATH=%{python3_sitearch} \ +%endif +%if %{with_pyverbs} + -DNO_PYVERBS=0 +%else + -DNO_PYVERBS=1 +%endif +%make_jobs + +%install +%cmake_install + +mkdir -p %{buildroot}/%{_sysconfdir}/rdma + +# Red Hat specific glue +%global dracutlibdir %{_prefix}/lib/dracut +%global sysmodprobedir %{_prefix}/lib/modprobe.d +mkdir -p %{buildroot}%{_sysconfdir}/udev/rules.d +mkdir -p %{buildroot}%{_libexecdir} +mkdir -p %{buildroot}%{_udevrulesdir} +mkdir -p %{buildroot}%{dracutlibdir}/modules.d/05rdma +mkdir -p %{buildroot}%{sysmodprobedir} +install -D -m0644 redhat/rdma.conf %{buildroot}/%{_sysconfdir}/rdma/rdma.conf +install -D -m0644 redhat/rdma.sriov-vfs %{buildroot}/%{_sysconfdir}/rdma/sriov-vfs +install -D -m0644 redhat/rdma.mlx4.conf %{buildroot}/%{_sysconfdir}/rdma/mlx4.conf +install -D -m0644 redhat/rdma.service %{buildroot}%{_unitdir}/rdma.service +install -D -m0755 redhat/rdma.modules-setup.sh %{buildroot}%{dracutlibdir}/modules.d/05rdma/module-setup.sh +install -D -m0644 redhat/rdma.udev-rules %{buildroot}%{_udevrulesdir}/98-rdma.rules +install -D -m0644 redhat/rdma.mlx4.sys.modprobe %{buildroot}%{sysmodprobedir}/libmlx4.conf +install -D -m0755 redhat/rdma.kernel-init %{buildroot}%{_libexecdir}/rdma-init-kernel +install -D -m0755 redhat/rdma.sriov-init %{buildroot}%{_libexecdir}/rdma-set-sriov-vf +install -D -m0755 redhat/rdma.mlx4-setup.sh %{buildroot}%{_libexecdir}/mlx4-setup.sh + +# ibacm +bin/ib_acme -D . -O +install -D -m0644 ibacm_opts.cfg %{buildroot}%{_sysconfdir}/rdma/ + +# Delete the package's init.d scripts +rm -rf %{buildroot}/%{_initrddir}/ +rm -rf %{buildroot}/%{_sbindir}/srp_daemon.sh + +%post -n rdma-core +# we ship udev rules, so trigger an update. 
+/sbin/udevadm trigger --subsystem-match=infiniband --action=change || true +/sbin/udevadm trigger --subsystem-match=net --action=change || true +/sbin/udevadm trigger --subsystem-match=infiniband_mad --action=change || true + +%post -n infiniband-diags -p /sbin/ldconfig +%postun -n infiniband-diags -p /sbin/ldconfig + +%post -n libibverbs -p /sbin/ldconfig +%postun -n libibverbs -p /sbin/ldconfig + +%post -n libibumad -p /sbin/ldconfig +%postun -n libibumad -p /sbin/ldconfig + +%post -n librdmacm -p /sbin/ldconfig +%postun -n librdmacm -p /sbin/ldconfig + +%post -n ibacm +%systemd_post ibacm.service +%preun -n ibacm +%systemd_preun ibacm.service +%postun -n ibacm +%systemd_postun_with_restart ibacm.service + +%post -n srp_daemon +%systemd_post srp_daemon.service +%preun -n srp_daemon +%systemd_preun srp_daemon.service +%postun -n srp_daemon +%systemd_postun_with_restart srp_daemon.service + +%post -n iwpmd +%systemd_post iwpmd.service +%preun -n iwpmd +%systemd_preun iwpmd.service +%postun -n iwpmd +%systemd_postun_with_restart iwpmd.service + +%files +%dir %{_sysconfdir}/rdma +%dir %{_docdir}/%{name}-%{version} +%doc %{_docdir}/%{name}-%{version}/README.md +%doc %{_docdir}/%{name}-%{version}/rxe.md +%doc %{_docdir}/%{name}-%{version}/udev.md +%doc %{_docdir}/%{name}-%{version}/tag_matching.md +%config(noreplace) %{_sysconfdir}/rdma/mlx4.conf +%config(noreplace) %{_sysconfdir}/rdma/modules/infiniband.conf +%config(noreplace) %{_sysconfdir}/rdma/modules/iwarp.conf +%config(noreplace) %{_sysconfdir}/rdma/modules/opa.conf +%config(noreplace) %{_sysconfdir}/rdma/modules/rdma.conf +%config(noreplace) %{_sysconfdir}/rdma/modules/roce.conf +%config(noreplace) %{_sysconfdir}/rdma/rdma.conf +%config(noreplace) %{_sysconfdir}/rdma/sriov-vfs +%config(noreplace) %{_sysconfdir}/udev/rules.d/* +%config(noreplace) %{_sysconfdir}/modprobe.d/mlx4.conf +%config(noreplace) %{_sysconfdir}/modprobe.d/truescale.conf +%{_unitdir}/rdma-hw.target +%{_unitdir}/rdma-load-modules@.service +%{_unitdir}/rdma.service +%dir %{dracutlibdir}/modules.d/05rdma +%{dracutlibdir}/modules.d/05rdma/module-setup.sh +%{_udevrulesdir}/../rdma_rename +%{_udevrulesdir}/60-rdma-ndd.rules +%{_udevrulesdir}/60-rdma-persistent-naming.rules +%{_udevrulesdir}/75-rdma-description.rules +%{_udevrulesdir}/90-rdma-hw-modules.rules +%{_udevrulesdir}/90-rdma-ulp-modules.rules +%{_udevrulesdir}/90-rdma-umad.rules +%{_udevrulesdir}/98-rdma.rules +%{sysmodprobedir}/libmlx4.conf +%{_libexecdir}/rdma-init-kernel +%{_libexecdir}/rdma-set-sriov-vf +%{_libexecdir}/mlx4-setup.sh +%{_libexecdir}/truescale-serdes.cmds +%{_sbindir}/rdma-ndd +%{_unitdir}/rdma-ndd.service +%{_mandir}/man7/rxe* +%{_mandir}/man8/rdma-ndd.* +%license COPYING.* + +%files devel +%doc %{_docdir}/%{name}-%{version}/MAINTAINERS +%dir %{_includedir}/infiniband +%dir %{_includedir}/rdma +%{_includedir}/infiniband/* +%{_includedir}/rdma/* +%if %{with_static} +%{_libdir}/lib*.a +%endif +%{_libdir}/lib*.so +%{_libdir}/pkgconfig/*.pc +%{_mandir}/man3/efadv* +%{_mandir}/man3/ibv_* +%{_mandir}/man3/rdma* +%{_mandir}/man3/umad* +%{_mandir}/man3/*_to_ibv_rate.* +%{_mandir}/man7/rdma_cm.* +%{_mandir}/man3/mlx5dv* +%{_mandir}/man3/mlx4dv* +%{_mandir}/man7/efadv* +%{_mandir}/man7/mlx5dv* +%{_mandir}/man7/mlx4dv* +%{_mandir}/man3/ibnd_* + +%files -n infiniband-diags-compat +%{_sbindir}/ibcheckerrs +%{_mandir}/man8/ibcheckerrs* +%{_sbindir}/ibchecknet +%{_mandir}/man8/ibchecknet* +%{_sbindir}/ibchecknode +%{_mandir}/man8/ibchecknode* +%{_sbindir}/ibcheckport +%{_mandir}/man8/ibcheckport.* 
+%{_sbindir}/ibcheckportwidth +%{_mandir}/man8/ibcheckportwidth* +%{_sbindir}/ibcheckportstate +%{_mandir}/man8/ibcheckportstate* +%{_sbindir}/ibcheckwidth +%{_mandir}/man8/ibcheckwidth* +%{_sbindir}/ibcheckstate +%{_mandir}/man8/ibcheckstate* +%{_sbindir}/ibcheckerrors +%{_mandir}/man8/ibcheckerrors* +%{_sbindir}/ibdatacounts +%{_mandir}/man8/ibdatacounts* +%{_sbindir}/ibdatacounters +%{_mandir}/man8/ibdatacounters* +%{_sbindir}/ibdiscover.pl +%{_mandir}/man8/ibdiscover* +%{_sbindir}/ibswportwatch.pl +%{_mandir}/man8/ibswportwatch* +%{_sbindir}/ibqueryerrors.pl +%{_sbindir}/iblinkinfo.pl +%{_sbindir}/ibprintca.pl +%{_mandir}/man8/ibprintca* +%{_sbindir}/ibprintswitch.pl +%{_mandir}/man8/ibprintswitch* +%{_sbindir}/ibprintrt.pl +%{_mandir}/man8/ibprintrt* +%{_sbindir}/set_nodedesc.sh + +%files -n infiniband-diags +%{_sbindir}/ibaddr +%{_mandir}/man8/ibaddr* +%{_sbindir}/ibnetdiscover +%{_mandir}/man8/ibnetdiscover* +%{_sbindir}/ibping +%{_mandir}/man8/ibping* +%{_sbindir}/ibportstate +%{_mandir}/man8/ibportstate* +%{_sbindir}/ibroute +%{_mandir}/man8/ibroute.* +%{_sbindir}/ibstat +%{_mandir}/man8/ibstat.* +%{_sbindir}/ibsysstat +%{_mandir}/man8/ibsysstat* +%{_sbindir}/ibtracert +%{_mandir}/man8/ibtracert* +%{_sbindir}/perfquery +%{_mandir}/man8/perfquery* +%{_sbindir}/sminfo +%{_mandir}/man8/sminfo* +%{_sbindir}/smpdump +%{_mandir}/man8/smpdump* +%{_sbindir}/smpquery +%{_mandir}/man8/smpquery* +%{_sbindir}/saquery +%{_mandir}/man8/saquery* +%{_sbindir}/vendstat +%{_mandir}/man8/vendstat* +%{_sbindir}/iblinkinfo +%{_mandir}/man8/iblinkinfo* +%{_sbindir}/ibqueryerrors +%{_mandir}/man8/ibqueryerrors* +%{_sbindir}/ibcacheedit +%{_mandir}/man8/ibcacheedit* +%{_sbindir}/ibccquery +%{_mandir}/man8/ibccquery* +%{_sbindir}/ibccconfig +%{_mandir}/man8/ibccconfig* +%{_sbindir}/dump_fts +%{_mandir}/man8/dump_fts* +%{_sbindir}/ibhosts +%{_mandir}/man8/ibhosts* +%{_sbindir}/ibswitches +%{_mandir}/man8/ibswitches* +%{_sbindir}/ibnodes +%{_mandir}/man8/ibnodes* +%{_sbindir}/ibrouters +%{_mandir}/man8/ibrouters* +%{_sbindir}/ibfindnodesusing.pl +%{_mandir}/man8/ibfindnodesusing* +%{_sbindir}/ibidsverify.pl +%{_mandir}/man8/ibidsverify* +%{_sbindir}/check_lft_balance.pl +%{_mandir}/man8/check_lft_balance* +%{_sbindir}/dump_lfts.sh +%{_mandir}/man8/dump_lfts* +%{_sbindir}/dump_mfts.sh +%{_mandir}/man8/dump_mfts* +%{_sbindir}/ibclearerrors +%{_mandir}/man8/ibclearerrors* +%{_sbindir}/ibclearcounters +%{_mandir}/man8/ibclearcounters* +%{_sbindir}/ibstatus +%{_mandir}/man8/ibstatus* +%{_mandir}/man8/infiniband-diags* +%{_libdir}/libibmad*.so.* +%{_libdir}/libibnetdisc*.so.* +%{perl_vendorlib}/IBswcountlimits.pm +%config(noreplace) %{_sysconfdir}/infiniband-diags/error_thresholds +%config(noreplace) %{_sysconfdir}/infiniband-diags/ibdiag.conf + +%files -n libibverbs +%dir %{_sysconfdir}/libibverbs.d +%dir %{_libdir}/libibverbs +%{_libdir}/libefa.so.* +%{_libdir}/libibverbs*.so.* +%{_libdir}/libibverbs/*.so +%{_libdir}/libmlx5.so.* +%{_libdir}/libmlx4.so.* +%config(noreplace) %{_sysconfdir}/libibverbs.d/*.driver +%doc %{_docdir}/%{name}-%{version}/libibverbs.md + +%files -n libibverbs-utils +%{_bindir}/ibv_* +%{_mandir}/man1/ibv_* + +%files -n ibacm +%config(noreplace) %{_sysconfdir}/rdma/ibacm_opts.cfg +%{_bindir}/ib_acme +%{_sbindir}/ibacm +%{_mandir}/man1/ib_acme.* +%{_mandir}/man7/ibacm.* +%{_mandir}/man7/ibacm_prov.* +%{_mandir}/man8/ibacm.* +%{_unitdir}/ibacm.service +%{_unitdir}/ibacm.socket +%dir %{_libdir}/ibacm +%{_libdir}/ibacm/* +%doc %{_docdir}/%{name}-%{version}/ibacm.md + +%files -n iwpmd 
+%{_sbindir}/iwpmd
+%{_unitdir}/iwpmd.service
+%config(noreplace) %{_sysconfdir}/rdma/modules/iwpmd.conf
+%config(noreplace) %{_sysconfdir}/iwpmd.conf
+%{_udevrulesdir}/90-iwpmd.rules
+%{_mandir}/man8/iwpmd.*
+%{_mandir}/man5/iwpmd.*
+
+%files -n libibumad
+%{_libdir}/libibumad*.so.*
+
+%files -n librdmacm
+%{_libdir}/librdmacm*.so.*
+%dir %{_libdir}/rsocket
+%{_libdir}/rsocket/*.so*
+%doc %{_docdir}/%{name}-%{version}/librdmacm.md
+%{_mandir}/man7/rsocket.*
+
+%files -n librdmacm-utils
+%{_bindir}/cmtime
+%{_bindir}/mckey
+%{_bindir}/rcopy
+%{_bindir}/rdma_client
+%{_bindir}/rdma_server
+%{_bindir}/rdma_xclient
+%{_bindir}/rdma_xserver
+%{_bindir}/riostream
+%{_bindir}/rping
+%{_bindir}/rstream
+%{_bindir}/ucmatose
+%{_bindir}/udaddy
+%{_bindir}/udpong
+%{_mandir}/man1/cmtime.*
+%{_mandir}/man1/mckey.*
+%{_mandir}/man1/rcopy.*
+%{_mandir}/man1/rdma_client.*
+%{_mandir}/man1/rdma_server.*
+%{_mandir}/man1/rdma_xclient.*
+%{_mandir}/man1/rdma_xserver.*
+%{_mandir}/man1/riostream.*
+%{_mandir}/man1/rping.*
+%{_mandir}/man1/rstream.*
+%{_mandir}/man1/ucmatose.*
+%{_mandir}/man1/udaddy.*
+%{_mandir}/man1/udpong.*
+
+%files -n srp_daemon
+%config(noreplace) %{_sysconfdir}/srp_daemon.conf
+%config(noreplace) %{_sysconfdir}/rdma/modules/srp_daemon.conf
+%{_libexecdir}/srp_daemon/start_on_all_ports
+%{_unitdir}/srp_daemon.service
+%{_unitdir}/srp_daemon_port@.service
+%{_sbindir}/ibsrpdm
+%{_sbindir}/srp_daemon
+%{_sbindir}/run_srp_daemon
+%{_udevrulesdir}/60-srp_daemon.rules
+%{_mandir}/man5/srp_daemon.service.5*
+%{_mandir}/man5/srp_daemon_port@.service.5*
+%{_mandir}/man8/ibsrpdm.8*
+%{_mandir}/man8/srp_daemon.8*
+%doc %{_docdir}/%{name}-%{version}/ibsrpdm.md
+
+%if %{with_pyverbs}
+%files -n python3-pyverbs
+%{python3_sitearch}/pyverbs
+%{_docdir}/%{name}-%{version}/tests/*.py
+%endif
diff --git a/redhat/rdma.conf b/redhat/rdma.conf
new file mode 100644
index 0000000..f5b74b2
--- /dev/null
+++ b/redhat/rdma.conf
@@ -0,0 +1,18 @@
+# Load IPoIB
+IPOIB_LOAD=yes
+# Load SRP (SCSI RDMA Protocol initiator support) module
+SRP_LOAD=yes
+# Load SRPT (SCSI RDMA Protocol target support) module
+SRPT_LOAD=yes
+# Load iSER (iSCSI over RDMA initiator support) module
+ISER_LOAD=yes
+# Load iSERT (iSCSI over RDMA target support) module
+ISERT_LOAD=yes
+# Load RDS (Reliable Datagram Service) network protocol
+RDS_LOAD=no
+# Load NFSoRDMA client transport module
+XPRTRDMA_LOAD=yes
+# Load NFSoRDMA server transport module
+SVCRDMA_LOAD=no
+# Load Tech Preview device driver modules
+TECH_PREVIEW_LOAD=no
diff --git a/redhat/rdma.kernel-init b/redhat/rdma.kernel-init
new file mode 100644
index 0000000..c7444a1
--- /dev/null
+++ b/redhat/rdma.kernel-init
@@ -0,0 +1,229 @@
+#!/bin/bash
+#
+# Bring up the kernel RDMA stack
+#
+# This is usually run automatically by systemd after a hardware activation
+# event in udev has triggered a start of the rdma.service unit
+#
+
+shopt -s nullglob
+
+CONFIG=/etc/rdma/rdma.conf
+
+LOAD_ULP_MODULES=""
+LOAD_CORE_USER_MODULES="ib_umad ib_uverbs ib_ucm rdma_ucm"
+LOAD_CORE_CM_MODULES="iw_cm ib_cm rdma_cm"
+LOAD_CORE_MODULES="ib_core"
+LOAD_TECH_PREVIEW_DRIVERS="no"
+
+if [ -f $CONFIG ]; then
+	. $CONFIG
+
+	if [ "${RDS_LOAD}" == "yes" ]; then
+		IPOIB_LOAD=yes
+	fi
+
+	if [ "${IPOIB_LOAD}" == "yes" ]; then
+		LOAD_ULP_MODULES="ib_ipoib"
+	fi
+
+	if [ "${RDS_LOAD}" == "yes" -a -f /lib/modules/`uname -r`/kernel/net/rds/rds.ko ]; then
+		LOAD_ULP_MODULES="$LOAD_ULP_MODULES rds"
+		if [ -f /lib/modules/`uname -r`/kernel/net/rds/rds_tcp.ko ]; then
+			LOAD_ULP_MODULES="$LOAD_ULP_MODULES rds_tcp"
+		fi
+		if [ -f /lib/modules/`uname -r`/kernel/net/rds/rds_rdma.ko ]; then
+			LOAD_ULP_MODULES="$LOAD_ULP_MODULES rds_rdma"
+		fi
+	fi
+
+	if [ "${SRP_LOAD}" == "yes" ]; then
+		LOAD_ULP_MODULES="$LOAD_ULP_MODULES ib_srp"
+	fi
+
+	if [ "${SRPT_LOAD}" == "yes" ]; then
+		LOAD_ULP_MODULES="$LOAD_ULP_MODULES ib_srpt"
+	fi
+
+	if [ "${ISER_LOAD}" == "yes" ]; then
+		LOAD_ULP_MODULES="$LOAD_ULP_MODULES ib_iser"
+	fi
+
+	if [ "${ISERT_LOAD}" == "yes" ]; then
+		LOAD_ULP_MODULES="$LOAD_ULP_MODULES ib_isert"
+	fi
+
+	if [ "${XPRTRDMA_LOAD}" == "yes" ]; then
+		LOAD_ULP_MODULES="$LOAD_ULP_MODULES xprtrdma"
+	fi
+
+	if [ "${SVCRDMA_LOAD}" == "yes" ]; then
+		LOAD_ULP_MODULES="$LOAD_ULP_MODULES svcrdma"
+	fi
+	if [ "${TECH_PREVIEW_LOAD}" == "yes" ]; then
+		LOAD_TECH_PREVIEW_DRIVERS="$TECH_PREVIEW_LOAD"
+	fi
+else
+	LOAD_ULP_MODULES="ib_ipoib"
+fi
+
+# Return 0 if module $1 is loaded, 1 otherwise
+is_loaded()
+{
+	/sbin/lsmod | grep -w "$1" > /dev/null 2>&1
+	return $?
+}
+
+load_modules()
+{
+	local RC=0
+
+	for module in $*; do
+		if ! /sbin/modinfo $module > /dev/null 2>&1; then
+			# do not attempt to load modules which do not exist
+			continue
+		fi
+		if ! is_loaded $module; then
+			/sbin/modprobe $module
+			res=$?
+			RC=$[ $RC + $res ]
+			if [ $res -ne 0 ]; then
+				echo
+				echo "Failed to load module $module"
+			fi
+		fi
+	done
+	return $RC
+}
+
+load_hardware_modules()
+{
+	local -i RC=0
+
+	# We match both class NETWORK and class INFINIBAND devices since our
+	# iWARP hardware is listed under class NETWORK. The side effect of
+	# this is that we might cause a non-iWARP network driver to be loaded.
+	udevadm trigger --subsystem-match=pci --attr-nomatch=driver --attr-match=class=0x020000 --attr-match=class=0x0c0600
+	udevadm settle
+	if [ -r /proc/device-tree ]; then
+		if [ -n "`ls /proc/device-tree | grep lhca`" ]; then
+			if ! is_loaded ib_ehca; then
+				load_modules ib_ehca
+				RC+=$?
+			fi
+		fi
+	fi
+	if is_loaded mlx4_core && ! is_loaded mlx4_ib; then
+		load_modules mlx4_ib
+		RC+=$?
+	fi
+	if is_loaded mlx4_core && ! is_loaded mlx4_en; then
+		load_modules mlx4_en
+		RC+=$?
+	fi
+	if is_loaded mlx5_core && ! is_loaded mlx5_ib; then
+		load_modules mlx5_ib
+		RC+=$?
+	fi
+	if is_loaded cxgb4 && ! is_loaded iw_cxgb4; then
+		load_modules iw_cxgb4
+		RC+=$?
+	fi
+	if is_loaded be2net && ! is_loaded ocrdma; then
+		load_modules ocrdma
+		RC+=$?
+	fi
+	if is_loaded enic && ! is_loaded usnic_verbs; then
+		load_modules usnic_verbs
+		RC+=$?
+	fi
+	if [ "${LOAD_TECH_PREVIEW_DRIVERS}" == "yes" ]; then
+		if is_loaded i40e && ! is_loaded i40iw; then
+			load_modules i40iw
+			RC+=$?
+		fi
+	fi
+	return $RC
+}
+
+errata_58()
+{
+	# Check AMD chipset issue Errata #58
+	if test -x /sbin/lspci && test -x /sbin/setpci; then
+		if ( /sbin/lspci -nd 1022:1100 | grep "1100" > /dev/null ) &&
+		   ( /sbin/lspci -nd 1022:7450 | grep "7450" > /dev/null ) &&
+		   ( /sbin/lspci -nd 15b3:5a46 | grep "5a46" > /dev/null ); then
+			CURVAL=`/sbin/setpci -d 1022:1100 69`
+			for val in $CURVAL
+			do
+				if [ "${val}" != "c0" ]; then
+					/sbin/setpci -d 1022:1100 69=c0
+					if [ $? -eq 0 ]; then
+						break
+					else
+						echo "Failed to apply AMD-8131 Errata #58 workaround"
+					fi
+				fi
+			done
+		fi
+	fi
+}
+
+errata_56()
+{
+	# Check AMD chipset issue Errata #56
+	if test -x /sbin/lspci && test -x /sbin/setpci; then
+		if ( /sbin/lspci -nd 1022:1100 | grep "1100" > /dev/null ) &&
+		   ( /sbin/lspci -nd 1022:7450 | grep "7450" > /dev/null ) &&
+		   ( /sbin/lspci -nd 15b3:5a46 | grep "5a46" > /dev/null ); then
+			bus=""
+			# Look for AMD-8131 devices
+			for dev in `/sbin/setpci -v -f -d 1022:7450 19 | cut -d':' -f1,2`
+			do
+				bus=`/sbin/setpci -s $dev 19`
+				rev=`/sbin/setpci -s $dev 8`
+				# Look for a Tavor attached to the secondary bus of this device
+				for device in `/sbin/setpci -f -s $bus: -d 15b3:5a46 19`
+				do
+					if [ $rev -lt 13 ]; then
+						/sbin/setpci -d 15b3:5a44 72=14
+						if [ $? -eq 0 ]; then
+							break
+						else
+							echo
+							echo "Failed to apply AMD-8131 Errata #56 workaround"
+						fi
+					else
+						continue
+					fi
+					# If more than one device is on the bus, then issue
+					# a warning
+					num=`/sbin/setpci -f -s $bus: 0 | wc -l | sed 's/\ *//g'`
+					if [ $num -gt 1 ]; then
+						echo "Warning: your current PCI-X configuration might be incorrect."
+						echo "See AMD-8131 Errata 56 for more details."
+					fi
+				done
+			done
+		fi
+	fi
+}
+
+
+load_hardware_modules
+RC=$[ $RC + $? ]
+load_modules $LOAD_CORE_MODULES
+RC=$[ $RC + $? ]
+load_modules $LOAD_CORE_CM_MODULES
+RC=$[ $RC + $? ]
+load_modules $LOAD_CORE_USER_MODULES
+RC=$[ $RC + $? ]
+load_modules $LOAD_ULP_MODULES
+RC=$[ $RC + $? ]
+
+errata_58
+errata_56
+
+/usr/libexec/rdma-set-sriov-vf
+
+exit $RC
diff --git a/redhat/rdma.mlx4-setup.sh b/redhat/rdma.mlx4-setup.sh
new file mode 100644
index 0000000..ff8caf5
--- /dev/null
+++ b/redhat/rdma.mlx4-setup.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+dir="/sys/bus/pci/drivers/mlx4_core"
+[ ! -d $dir ] && exit 1
+pushd $dir >/dev/null
+
+function set_dual_port() {
+	device=$1
+	port1=$2
+	port2=$3
+	pushd $device >/dev/null
+	cur_p1=`cat mlx4_port1`
+	cur_p2=`cat mlx4_port2`
+
+	# special case the "eth eth" mode as we need port2 to
+	# actually switch to eth before the driver will let us
+	# switch port1 to eth as well
+	if [ "$port1" == "eth" ]; then
+		if [ "$port2" != "eth" ]; then
+			echo "In order for port1 to be eth, port2 must also be eth"
+			popd >/dev/null
+			return
+		fi
+		if [ "$cur_p2" != "eth" -a "$cur_p2" != "auto (eth)" ]; then
+			tries=0
+			echo "$port2" > mlx4_port2 2>/dev/null
+			sleep .25
+			cur_p2=`cat mlx4_port2`
+			while [ "$cur_p2" != "eth" -a "$cur_p2" != "auto (eth)" -a $tries -lt 10 ]; do
+				sleep .25
+				let tries++
+				cur_p2=`cat mlx4_port2`
+			done
+			if [ "$cur_p2" != "eth" -a "$cur_p2" != "auto (eth)" ]; then
+				echo "Failed to set port2 to eth mode"
+				popd >/dev/null
+				return
+			fi
+		fi
+		if [ "$cur_p1" != "eth" -a "$cur_p1" != "auto (eth)" ]; then
+			tries=0
+			echo "$port1" > mlx4_port1 2>/dev/null
+			sleep .25
+			cur_p1=`cat mlx4_port1`
+			while [ "$cur_p1" != "eth" -a "$cur_p1" != "auto (eth)" -a $tries -lt 10 ]; do
+				sleep .25
+				let tries++
+				cur_p1=`cat mlx4_port1`
+			done
+			if [ "$cur_p1" != "eth" -a "$cur_p1" != "auto (eth)" ]; then
+				echo "Failed to set port1 to eth mode"
+			fi
+		fi
+		popd >/dev/null
+		return
+	fi
+
+	# our mode is not eth <anything> as that is covered above
+	# so we should be able to successfully set the ports in
+	# port1 then port2 order
+	if [ "$cur_p1" != "$port1" -o "$cur_p2" != "$port2" ]; then
+		# Try setting the ports in order first
+		echo "$port1" > mlx4_port1 2>/dev/null ; sleep .1
+		echo "$port2" > mlx4_port2 2>/dev/null ; sleep .1
+		cur_p1=`cat mlx4_port1`
+		cur_p2=`cat mlx4_port2`
+	fi
+
+	if [ "$cur_p1" != "$port1" -o "$cur_p2" != "$port2" ]; then
+		# Try reverse order this time
+		echo "$port2" > mlx4_port2 2>/dev/null ; sleep .1
+		echo "$port1" > mlx4_port1 2>/dev/null ; sleep .1
+		cur_p1=`cat mlx4_port1`
+		cur_p2=`cat mlx4_port2`
+	fi
+
+	if [ "$cur_p1" != "$port1" -o "$cur_p2" != "$port2" ]; then
+		echo "Error setting port type on mlx4 device $device"
+	fi
+
+	popd >/dev/null
+	return
+}
+
+
+while read device port1 port2 ; do
+	[ -d "$device" ] || continue
+	[ -z "$port1" ] && continue
+	[ -f "$device/mlx4_port2" -a -z "$port2" ] && continue
+	[ -f "$device/mlx4_port2" ] && set_dual_port $device $port1 $port2 || echo "$port1" > "$device/mlx4_port1"
+done
+popd >/dev/null 2>&1
diff --git a/redhat/rdma.mlx4.conf b/redhat/rdma.mlx4.conf
new file mode 100644
index 0000000..71207cc
--- /dev/null
+++ b/redhat/rdma.mlx4.conf
@@ -0,0 +1,27 @@
+# Config file for mlx4 hardware port settings
+# This file is read when the mlx4_core module is loaded and used to
+# set the port types for any hardware found. If a card is not listed
+# in this file, then its port types are left alone.
+#
+# Format:
+# <pci_device_of_card> <port1_type> [port2_type]
+#
+# @port1 and @port2:
+# One of auto, ib, or eth. No checking is performed to make sure that
+# combinations are valid. Invalid inputs will result in the driver
+# not setting the port to the type requested. port1 is required at
+# all times; port2 is required for dual port cards.
+#
+# Example:
+# 0000:0b:00.0 eth eth
+#
+# You can find the right pci device to use for any given card by loading
+# the mlx4_core module, then going to /sys/bus/pci/drivers/mlx4_core and
+# seeing what possible PCI devices are listed there. The possible values
+# for ports are: ib, eth, and auto. However, not all cards support all
+# types, so if you get messages from the kernel that your selected port
+# type isn't supported, there's nothing this script can do about it. Also,
+# some cards don't support using different types on the two ports (i.e.,
+# both ports must be either eth or ib). Again, we can't set what the kernel
+# or hardware won't support.
+#
diff --git a/redhat/rdma.mlx4.sys.modprobe b/redhat/rdma.mlx4.sys.modprobe
new file mode 100644
index 0000000..781562c
--- /dev/null
+++ b/redhat/rdma.mlx4.sys.modprobe
@@ -0,0 +1,5 @@
+# WARNING! - This file is overwritten any time the rdma rpm package is
+# updated. Please do not make any changes to this file. Instead, make
+# changes to the mlx4.conf file. Its contents are preserved if they
+# have been changed from the default values.
+install mlx4_core /sbin/modprobe --ignore-install mlx4_core $CMDLINE_OPTS && (if [ -f /usr/libexec/mlx4-setup.sh -a -f /etc/rdma/mlx4.conf ]; then /usr/libexec/mlx4-setup.sh < /etc/rdma/mlx4.conf; fi; /sbin/modprobe mlx4_en; if /sbin/modinfo mlx4_ib > /dev/null 2>&1; then /sbin/modprobe mlx4_ib; fi) diff --git a/redhat/rdma.modules-setup.sh b/redhat/rdma.modules-setup.sh new file mode 100644 index 0000000..803fc60 --- /dev/null +++ b/redhat/rdma.modules-setup.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +check() { + [ -n "$hostonly" -a -c /sys/class/infiniband_verbs/uverbs0 ] && return 0 + [ -n "$hostonly" ] && return 255 + return 0 +} + +depends() { + return 0 +} + +install() { + inst /etc/rdma/rdma.conf + inst /etc/rdma/mlx4.conf + inst /etc/rdma/sriov-vfs + inst /usr/libexec/rdma-init-kernel + inst /usr/libexec/mlx4-setup.sh + inst /usr/libexec/rdma-set-sriov-vf + inst /usr/lib/modprobe.d/libmlx4.conf + inst_multiple lspci setpci awk sleep + inst_multiple -o /etc/modprobe.d/mlx4.conf + inst_rules 98-rdma.rules 70-persistent-ipoib.rules +} + +installkernel() { + hostonly='' instmods =drivers/infiniband =drivers/net/ethernet/mellanox =drivers/net/ethernet/chelsio =drivers/net/ethernet/cisco =drivers/net/ethernet/emulex =drivers/target + hostonly='' instmods crc-t10dif crct10dif_common +} diff --git a/redhat/rdma.service b/redhat/rdma.service new file mode 100644 index 0000000..514ef58 --- /dev/null +++ b/redhat/rdma.service @@ -0,0 +1,15 @@ +[Unit] +Description=Initialize the iWARP/InfiniBand/RDMA stack in the kernel +Documentation=file:/etc/rdma/rdma.conf +RefuseManualStop=true +DefaultDependencies=false +Conflicts=emergency.target emergency.service +Before=network.target remote-fs-pre.target + +[Service] +Type=oneshot +RemainAfterExit=yes +ExecStart=/usr/libexec/rdma-init-kernel + +[Install] +WantedBy=sysinit.target diff --git a/redhat/rdma.sriov-init b/redhat/rdma.sriov-init new file mode 100644 index 0000000..0d7cbc6 --- /dev/null +++ b/redhat/rdma.sriov-init @@ -0,0 +1,137 @@ +#!/bin/bash +# +# Initialize SRIOV virtual devices +# +# This is usually run automatically by systemd after a hardware activation +# event in udev has triggered a start of the rdma.service unit +port=1 + +function __get_parent_pci_dev() +{ + pushd /sys/bus/pci/devices/$pci_dev >/dev/null 2>&1 + ppci_dev=`ls -l physfn | cut -f 2 -d '/'` + popd >/dev/null 2>&1 +} + +function __get_parent_ib_dev() +{ + ib_dev=`ls -l | awk '/'$ppci_dev'/ { print $9 }'` +} + +function __get_parent_net_dev() +{ + for netdev in /sys/bus/pci/devices/$ppci_dev/net/* ; do + if [ "$port" -eq `cat $netdev/dev_port` ]; then + netdev=`basename $netdev` + break + fi + done +} + +function __get_vf_num() +{ + pushd /sys/bus/pci/devices/$ppci_dev >/dev/null 2>&1 + vf=`ls -l virtfn* | awk '/'$pci_dev'/ { print $9 }' | sed -e 's/virtfn//'` + popd >/dev/null 2>&1 +} + +function __en_sriov_set_vf() +{ + pci_dev=$1 + shift + [ "$1" = "port" ] && port=$2 && shift 2 + # We find our parent device by the netdev registered port number, + # however, the netdev port numbers start at 0 while the port + # numbers on the card start at 1, so we subtract 1 from our + # configured port number to get the netdev number + let port-- + # Now we need to fill in the necessary information to pass to the ip + # command + __get_parent_pci_dev + __get_parent_net_dev + __get_vf_num + # The rest is easy. 
+	# Either the user passed valid arguments as options,
+	# or they didn't
+	ip link set dev $netdev vf $vf $*
+}
+
+function __ib_sriov_set_vf()
+{
+	pci_dev=$1
+	shift
+	[ "$1" = "port" ] && port=$2 && shift 2
+	guid=""
+	__get_parent_pci_dev
+	__get_parent_ib_dev
+	[ -f $ib_dev/iov/$pci_dev/ports/$port/gid_idx/0 ] || return
+	while [ -n "$1" ]; do
+		case $1 in
+		guid)
+			guid=$2
+			shift 2
+			;;
+		pkey)
+			shift 1
+			break
+			;;
+		*)
+			echo "Unknown option in $src"
+			shift
+			;;
+		esac
+	done
+	if [ -n "$guid" ]; then
+		guid_idx=`cat "$ib_dev/iov/$pci_dev/ports/$port/gid_idx/0"`
+		echo "$guid" > "$ib_dev/iov/ports/$port/admin_guids/$guid_idx"
+	fi
+	i=0
+	while [ -n "$1" ]; do
+		for pkey in $ib_dev/iov/ports/$port/pkeys/*; do
+			if [ `cat $pkey` = "$1" ]; then
+				echo `basename $pkey` > $ib_dev/iov/$pci_dev/ports/$port/pkey_idx/$i
+				let i++
+				break
+			fi
+		done
+		shift
+	done
+}
+
+# "return" is invalid outside a function in an executed script, so exit
+# cleanly when there is nothing to do
+[ -d /sys/class/infiniband ] || exit 0
+pushd /sys/class/infiniband >/dev/null 2>&1
+
+if [ -z "$*" ]; then
+	src=/etc/rdma/sriov-vfs
+	[ -f "$src" ] || exit 0
+	grep -v "^#" $src | while read -a args; do
+		# When we use read -a to read into an array, the index starts at
+		# 0, unlike below where the arg count starts at 1
+		port=1
+		next_arg=1
+		[ "${args[$next_arg]}" = "port" ] && next_arg=3
+		case ${args[$next_arg]} in
+		guid|pkey)
+			__ib_sriov_set_vf ${args[*]}
+			;;
+		mac|vlan|rate|spoofchk|enable)
+			__en_sriov_set_vf ${args[*]}
+			;;
+		*)
+			;;
+		esac
+	done
+else
+	[ "$2" = "port" ] && next_arg=$4 || next_arg=$2
+	case $next_arg in
+	guid|pkey)
+		__ib_sriov_set_vf $*
+		;;
+	mac|vlan|rate|spoofchk|enable)
+		__en_sriov_set_vf $*
+		;;
+	*)
+		;;
+	esac
+fi
+
+popd >/dev/null 2>&1
diff --git a/redhat/rdma.sriov-vfs b/redhat/rdma.sriov-vfs
new file mode 100644
index 0000000..ef3e6c0
--- /dev/null
+++ b/redhat/rdma.sriov-vfs
@@ -0,0 +1,41 @@
+# All lines in this file that start with a # are comments,
+# all other lines will be processed without argument checks
+# Format of this file is one sriov vf setting per line with
+# arguments as follows:
+# vf [port #] [ethernet settings | infiniband settings]
+#
+# @vf - PCI address of device to configure as found in
+#       /sys/bus/pci/devices/
+#
+# [port @port] - Optional: the port number we are setting on
+#                the device. We always assume port 1 unless told
+#                otherwise.
+#
+# Ethernet settings:
+#   mac <mac address> [additional options]
+#   @mac - mac address to assign to vf...this is currently required by
+#          the ip program if you wish to be able to set any of the other
+#          settings. If you don't set anything on a vf, it will get a
+#          random mac address and you may use static IP addressing to
+#          have a consistent IP address in spite of the random mac
+#   @* - additional arguments are passed to ip link without any
+#        further processing/checking, additional options that could
+#        be passed as of the time of writing this are:
+#        [ vlan VLANID [ qos VLAN-QOS ] ]
+#        [ rate TXRATE ]
+#        [ spoofchk { on | off} ]
+#        [ state { auto | enable | disable} ]
+#
+# InfiniBand settings:
+#   [guid <guid>] [pkey <space separated list of pkeys>]
+#   @guid - 64-bit GUID value to assign to vf. Omit this option to
+#           use a subnet manager assigned GUID.
+# @pkey - one or more pkeys to assign to this guest, must be last +# item on line +# +# Examples: +# +# 0000:44:00.1 guid 05011403007bcba1 pkey 0xffff 0x8002 +# 0000:44:00.1 port 2 mac aa:bb:cc:dd:ee:f0 spoofchk on +# 0000:44:00.2 port 1 pkey 0x7fff 0x0002 +# 0000:44:00.2 port 2 mac aa:bb:cc:dd:ee:f1 vlan 10 spoofchk on state enable diff --git a/redhat/rdma.udev-rules b/redhat/rdma.udev-rules new file mode 100644 index 0000000..2005048 --- /dev/null +++ b/redhat/rdma.udev-rules @@ -0,0 +1,13 @@ +# We list all the various kernel modules that drive hardware in the +# InfiniBand stack (and a few in the network stack that might not actually +# be RDMA capable, but we don't know that at this time and it's safe to +# enable the IB stack, so do so unilaterally) and on load of any of that +# hardware, we trigger the rdma.service load in systemd + +SUBSYSTEM=="module", KERNEL=="cxgb*", ACTION=="add", TAG+="systemd", ENV{SYSTEMD_WANTS}="rdma.service" +SUBSYSTEM=="module", KERNEL=="ib_*", ACTION=="add", TAG+="systemd", ENV{SYSTEMD_WANTS}="rdma.service" +SUBSYSTEM=="module", KERNEL=="mlx*", ACTION=="add", TAG+="systemd", ENV{SYSTEMD_WANTS}="rdma.service" +SUBSYSTEM=="module", KERNEL=="iw_*", ACTION=="add", TAG+="systemd", ENV{SYSTEMD_WANTS}="rdma.service" +SUBSYSTEM=="module", KERNEL=="be2net", ACTION=="add", TAG+="systemd", ENV{SYSTEMD_WANTS}="rdma.service" +SUBSYSTEM=="module", KERNEL=="enic", ACTION=="add", TAG+="systemd", ENV{SYSTEMD_WANTS}="rdma.service" +SUBSYSTEM=="module", KERNEL=="efa", ACTION=="add", TAG+="systemd", ENV{SYSTEMD_WANTS}="rdma.service" diff --git a/srp_daemon/CMakeLists.txt b/srp_daemon/CMakeLists.txt new file mode 100644 index 0000000..b253872 --- /dev/null +++ b/srp_daemon/CMakeLists.txt @@ -0,0 +1,62 @@ +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${NO_STRICT_ALIASING_FLAGS}") + +rdma_man_pages( + ibsrpdm.8 + srp_daemon.8.in + srp_daemon.service.5 + srp_daemon_port@.service.5 + ) + +rdma_sbin_executable(srp_daemon + srp_daemon.c + srp_handle_traps.c + srp_sync.c + ) +target_link_libraries(srp_daemon LINK_PRIVATE + ibverbs + ibumad + ${RT_LIBRARIES} + ${CMAKE_THREAD_LIBS_INIT} + ) + +rdma_install_symlink(srp_daemon "${CMAKE_INSTALL_SBINDIR}/ibsrpdm") +# FIXME: Why? +rdma_install_symlink(srp_daemon "${CMAKE_INSTALL_SBINDIR}/run_srp_daemon") +rdma_subst_install(FILES "srp_daemon.sh.in" + DESTINATION "${CMAKE_INSTALL_SBINDIR}" + RENAME "srp_daemon.sh" + PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ OWNER_EXECUTE GROUP_EXECUTE WORLD_EXECUTE) + +install(FILES start_on_all_ports + DESTINATION "${CMAKE_INSTALL_LIBEXECDIR}/srp_daemon" + PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ OWNER_EXECUTE GROUP_EXECUTE WORLD_EXECUTE) + +rdma_subst_install(FILES srp_daemon.service.in + DESTINATION "${CMAKE_INSTALL_SYSTEMD_SERVICEDIR}" + RENAME srp_daemon.service + PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ) + +rdma_subst_install(FILES srp_daemon_port@.service.in + DESTINATION "${CMAKE_INSTALL_SYSTEMD_SERVICEDIR}" + RENAME srp_daemon_port@.service + PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ) + +install(FILES srp_daemon.conf DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}") + +install(FILES "srp_daemon.rules" + RENAME "60-srp_daemon.rules" + DESTINATION "${CMAKE_INSTALL_UDEV_RULESDIR}") + +install(FILES modules-srp_daemon.conf + RENAME "srp_daemon.conf" + DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}/rdma/modules") + +# FIXME: The ib init.d file should really be included in rdma-core as well. 
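+# (The cache variables below are substituted into srpd.in by the
+# configure_file() call further down, assuming srpd.in uses the usual
+# @VAR@ placeholders, to produce the installed srpd init script.)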
+set(RDMA_SERVICE "openibd" CACHE STRING "init.d file service name to order srpd after") +# NOTE: These defaults are for CentOS, packagers should override. +set(SRP_DEFAULT_START "2 3 4 5" CACHE STRING "Default-Start service data for srpd") +set(SRP_DEFAULT_STOP "0 1 6" CACHE STRING "Default-Stop service data for srpd") +configure_file(srpd.in "${CMAKE_CURRENT_BINARY_DIR}/srpd") +install(FILES "${CMAKE_CURRENT_BINARY_DIR}/srpd" + DESTINATION "${CMAKE_INSTALL_INITDDIR}" + PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ OWNER_EXECUTE GROUP_EXECUTE WORLD_EXECUTE) diff --git a/srp_daemon/ibsrpdm.8 b/srp_daemon/ibsrpdm.8 new file mode 100644 index 0000000..4e8788c --- /dev/null +++ b/srp_daemon/ibsrpdm.8 @@ -0,0 +1,38 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH IBSRPDM 8 "August 30, 2005" "OpenFabrics" "USER COMMANDS" + +.SH NAME +ibsrpdm \- Discover SRP targets on an InfiniBand Fabric + +.SH SYNOPSIS +.B ibsrpdm [\fIOPTIONS\fB] + +.SH DESCRIPTION +.PP +List InfiniBand SCSI RDMA Protocol (SRP) targets on an IB fabric. + +.SH OPTIONS + +.PP +.TP +\fB\-c\fR +Generate output suitable for piping directly to a +/sys/class/infiniband_srp/srp\-<device>\-<port>/add_target file +.TP +\fB\-d\fR \fIDEVICE\fR +Use device file \fIDEVICE\fR (default /dev/infiniband/umad0) +.TP +\fB\-k\fR \fIP_KEY\fR +Use InfiniBand partition key \fIP_KEY\fR (default 0xffff) +.TP +\fB\-v\fR +Print more verbose output + +.SH SEE ALSO +.BR srp_daemon (1) + +.SH AUTHORS +.TP +Roland Dreier +.RI < roland@kernel.org > + diff --git a/srp_daemon/modules-srp_daemon.conf b/srp_daemon/modules-srp_daemon.conf new file mode 100644 index 0000000..dbe4343 --- /dev/null +++ b/srp_daemon/modules-srp_daemon.conf @@ -0,0 +1,2 @@ +# These modules are loaded by the system if srp_daemon is to be run +ib_srp diff --git a/srp_daemon/srp_daemon.8.in b/srp_daemon/srp_daemon.8.in new file mode 100644 index 0000000..b72a6b7 --- /dev/null +++ b/srp_daemon/srp_daemon.8.in @@ -0,0 +1,137 @@ +.\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md +.TH SRP_DAEMON 8 "September 5, 2006" "OpenFabrics" "USER COMMANDS" + +.SH NAME +srp_daemon \- Discovers SRP targets in an InfiniBand Fabric + +.SH SYNOPSIS +.B srp_daemon\fR [\fB-vVcaeon\fR] [\fB-d \fIumad-device\fR | \fB-i \fIinfiniband-device\fR [\fB-p \fIport-num\fR] | \fB-j \fIdev:port\fR] [\fB-t \fItimeout(ms)\fR] [\fB-r \fIretries\fR] [\fB-R \fIrescan-time\fR] [\fB-f \fIrules-file\fR] + + +.SH DESCRIPTION +.PP +Discovers and connects to InfiniBand SCSI RDMA Protocol (SRP) targets in an IB fabric. + +Each srp_daemon instance operates on one local port. Upon boot it performs a +full rescan of the fabric and then waits for an srp_daemon event. An +srp_daemon event can be a join of a new machine to the fabric, a change in the +capabilities of a machine, an SA change, or an expiration of a predefined +timeout. + +When a new machine joins the fabric, srp_daemon checks if it is an SRP +target. When there is a change of capabilities, srp_daemon checks if the +machine has turned into an SRP target. When there is an SA change or a timeout +expiration, srp_daemon performs a full rescan of the fabric. + +For each target srp_daemon finds, it checks if it should connect to this +target according to its rules (the default rules file is +@CMAKE_INSTALL_FULL_SYSCONFDIR@/srp_daemon.conf) and if it is already +connected to the local port. 
+If it should connect to this target and if it is
+not connected yet, srp_daemon can either print the target details or connect
+to it.
+
+.SH OPTIONS
+
+.PP
+.TP
+\fB\-v\fR
+Print more verbose output
+.TP
+\fB\-V\fR
+Print even more verbose output (debug mode)
+.TP
+\fB\-i\fR \fIinfiniband-device\fR
+Work on \fIinfiniband-device\fR. This option should not be used with -d nor
+with -j.
+.TP
+\fB\-p\fR \fIport-num\fR
+Work on port \fIport-num\fR (default 1). This option must be used with -i and
+should not be used with -d nor with -j.
+.TP
+\fB\-j\fR \fIdev:port\fR
+Work on port number \fIport\fR of InfiniBand device \fIdev\fR. This option
+should not be used with -d, -i nor with -p.
+.TP
+\fB\-d\fR \fIumad-device\fR
+Use device file \fIumad-device\fR (default /dev/infiniband/umad0). This option
+should not be used with -i, -p nor with -j.
+.TP
+\fB\-c\fR
+Generate output suitable for piping directly to a
+/sys/class/infiniband_srp/srp\-<device>\-<port>/add_target file.
+.TP
+\fB\-a\fR
+Print all the targets in the fabric, not only targets that are not connected
+through the local port. This is the same behavior as that of ibsrpdm.
+.TP
+\fB\-e\fR
+Execute the connection command, i.e., make the connection to the target.
+.TP
+\fB\-o\fR
+Perform only one rescan and exit, just like ibsrpdm.
+.TP
+\fB\-R\fR \fIrescan-time\fR
+Force a complete rescan every \fIrescan-time\fR seconds. If -R is not specified, no timeout rescans will be performed.
+.TP
+\fB\-T\fR \fIretry-timeout\fR
+Retry connecting to an existing target after \fIretry-timeout\fR seconds. If -T is not specified, a 5 second timeout is used. If retry-timeout is 0, srp_daemon will not try to reconnect. srp_daemon retries because of a rare scenario in which it may try to add a target just as that target is about to be removed, but has not been removed yet.
+.TP
+\fB\-f\fR \fIrules-file\fR
+Decide to which targets to connect according to the rules in \fIrules-file\fR.
+If \fB\-f\fR is not specified, the default rules file @CMAKE_INSTALL_FULL_SYSCONFDIR@/srp_daemon.conf is used.
+Each line in the \fIrules-file\fR is a rule that either allows or disallows a connection, according to
+the first character in the line (a or d, respectively). The rest of the line holds values for id_ext, ioc_guid, dgid, and
+service_id. Please see the EXAMPLES section for a sample file. srp_daemon decides whether to allow or disallow each target according to the first rule that matches the target. If no rule matches the target, the target is allowed and will be connected. An allow rule may also set attributes for the connection to the target. Supported attributes are max_cmd_per_lun and max_sect.
+.TP
+\fB\-t\fR \fItimeout\fR
+Use timeout of \fItimeout\fR msec for MAD responses (default: 5 sec).
+.TP
+\fB\-r\fR \fIretries\fR
+Perform \fIretries\fR retries on each send to MAD (default: 3 retries).
+.TP
+\fB\-n\fR
+New format: also use initiator_ext in the connection command.
+.TP
+\fB\--systemd\fR
+Enable systemd integration.
+
+.SH FILES
+@CMAKE_INSTALL_FULL_SYSCONFDIR@/srp_daemon.conf -
+Default rules configuration file that indicates to which targets to connect. Can be overridden using the \fB\-f\fR \fIrules-file\fR option.
+Each line in this file is a rule that either allows or disallows a connection, according to
+the first character in the line (a or d, respectively). The rest of the line holds values for id_ext, ioc_guid, dgid, and
+service_id. Please see the EXAMPLES section for a sample file. srp_daemon decides whether to allow or disallow each target according to the first rule that matches the target. If no rule matches the target, the target is allowed and will be connected. An allow rule may also set attributes for the connection to the target. Supported attributes are max_cmd_per_lun and max_sect.
+
+.SH EXAMPLES
+srp_daemon -e -i mthca0 -p 1 -R 60 (Connects to the targets accessible through port 1 of mthca0. Performs a complete rescan every minute)
+
+srp_daemon -o -c -a (Prints the connection commands for the targets in the fabric and exits - similar to ibsrpdm)
+
+srp_daemon -e -f rules.txt (Connects to the targets allowed in the rules file rules.txt)
+
+.nf
+An example of a rules configuration file (such as @CMAKE_INSTALL_FULL_SYSCONFDIR@/srp_daemon.conf)
+------------------------------------------------------------------------
+# Rules file example
+# This is a comment
+# disallow the following dgid
+d dgid=fe800000000000000002c90200402bd5
+# allow target with the following ioc_guid
+a ioc_guid=00a0b80200402bd7
+# allow target with the following id_ext and ioc_guid, and set max_cmd_per_lun to 31
+a id_ext=200500A0B81146A1,ioc_guid=00a0b80200402bef,max_cmd_per_lun=31
+# disallow all the rest
+d
+.fi
+
+
+.SH SEE ALSO
+.BR ibsrpdm (8)
+
+.SH AUTHORS
+.TP
+Roland Dreier
+.RI < rolandd@cisco.com >
+.TP
+Ishai Rabinovitz
+.RI < ishai@mellanox.co.il >
diff --git a/srp_daemon/srp_daemon.c b/srp_daemon/srp_daemon.c
new file mode 100644
index 0000000..f14d9f5
--- /dev/null
+++ b/srp_daemon/srp_daemon.c
@@ -0,0 +1,2498 @@
+/*
+ * srp_daemon - discover SRP targets over IB
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2006 Mellanox Technologies Ltd. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * + * $Author: ishai Rabinovitz [ishai@mellanox.co.il]$ + * Based on Roland Dreier's initial code [rdreier@cisco.com] + * + */ + +#define _GNU_SOURCE + +#include <assert.h> +#include <stdarg.h> +#include <stddef.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <ctype.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <fcntl.h> +#include <linux/types.h> +#include <endian.h> +#include <errno.h> +#include <getopt.h> +#include <dirent.h> +#include <pthread.h> +#include <string.h> +#include <signal.h> +#include <sys/syslog.h> +#include <infiniband/umad.h> +#include <infiniband/umad_types.h> +#include <infiniband/umad_sa.h> +#include "srp_ib_types.h" + +#include "srp_daemon.h" + +#define IBDEV_STR_SIZE 16 +#define IBPORT_STR_SIZE 16 +#define IGNORE(value) do { if (value) { } } while (0) +#define max_t(type, x, y) ({ \ + type __max1 = (x); \ + type __max2 = (y); \ + __max1 > __max2 ? __max1: __max2; }) + +#define get_data_ptr(mad) ((void *) ((mad).hdr.data)) + +enum log_dest { log_to_syslog, log_to_stderr }; + +static int get_lid(struct umad_resources *umad_res, union umad_gid *gid, + uint16_t *lid); + +static const int node_table_response_size = 1 << 18; +static const char *sysfs_path = "/sys"; +static enum log_dest s_log_dest = log_to_syslog; +static int wakeup_pipe[2] = { -1, -1 }; + + +void wake_up_main_loop(char ch) +{ + int res; + + assert(wakeup_pipe[1] >= 0); + res = write(wakeup_pipe[1], &ch, 1); + IGNORE(res); +} + +static void signal_handler(int signo) +{ + wake_up_main_loop(signo); +} + +/* + * Return either the received signal (SIGINT, SIGTERM, ...) or 0 if no signal + * has been received before the timeout has expired. + */ +static int get_received_signal(time_t tv_sec, suseconds_t tv_usec) +{ + int fd, ret, received_signal = 0; + fd_set rset; + struct timeval timeout; + char buf[16]; + + fd = wakeup_pipe[0]; + FD_ZERO(&rset); + FD_SET(fd, &rset); + timeout.tv_sec = tv_sec; + timeout.tv_usec = tv_usec; + ret = select(fd + 1, &rset, NULL, NULL, &timeout); + if (ret < 0) + assert(errno == EINTR); + while ((ret = read(fd, buf, sizeof(buf))) > 0) + received_signal = buf[ret - 1]; + + return received_signal; +} + +static int check_process_uniqueness(struct config_t *conf) +{ + char path[256]; + int fd; + + snprintf(path, sizeof(path), SRP_DEAMON_LOCK_PREFIX "_%s_%d", + conf->dev_name, conf->port_num); + + if ((fd = open(path, O_CREAT|O_RDWR, + S_IRUSR|S_IRGRP|S_IROTH|S_IWUSR)) < 0) { + pr_err("cannot open file \"%s\" (errno: %d).\n", path, errno); + return -1; + } + + fchmod(fd, S_IRUSR|S_IRGRP|S_IROTH|S_IWUSR|S_IWGRP|S_IWOTH); + if (0 != lockf(fd, F_TLOCK, 0)) { + pr_err("failed to lock %s (errno: %d). possibly another " + "srp_daemon is locking it\n", path, errno); + close(fd); + fd = -1; + } + + return fd; +} + +static int srpd_sys_read_string(const char *dir_name, const char *file_name, + char *str, int max_len) +{ + char path[256], *s; + int fd, r; + + snprintf(path, sizeof(path), "%s/%s", dir_name, file_name); + + if ((fd = open(path, O_RDONLY)) < 0) + return (errno > 0) ? -errno : errno; + + if ((r = read(fd, str, max_len)) < 0) { + int e = errno; + close(fd); + return (e > 0) ? -e : e; + } + + str[(r < max_len) ? 
+	    r : max_len - 1] = 0;
+
+	if ((s = strrchr(str, '\n')))
+		*s = 0;
+
+	close(fd);
+	return 0;
+}
+
+static int srpd_sys_read_gid(const char *dir_name, const char *file_name,
+			     uint8_t *gid)
+{
+	char buf[64], *str, *s;
+	__be16 *ugid = (__be16 *)gid;
+	int r, i;
+
+	if ((r = srpd_sys_read_string(dir_name, file_name, buf, sizeof(buf))) < 0)
+		return r;
+
+	for (s = buf, i = 0 ; i < 8; i++) {
+		if (!(str = strsep(&s, ": \t\n")))
+			return -EINVAL;
+		ugid[i] = htobe16(strtoul(str, NULL, 16) & 0xffff);
+	}
+
+	return 0;
+}
+
+static int srpd_sys_read_uint64(const char *dir_name, const char *file_name,
+				uint64_t *u)
+{
+	char buf[32];
+	int r;
+
+	if ((r = srpd_sys_read_string(dir_name, file_name, buf, sizeof(buf))) < 0)
+		return r;
+
+	*u = strtoull(buf, NULL, 0);
+
+	return 0;
+}
+
+
+
+
+static void usage(const char *argv0)
+{
+	fprintf(stderr, "Usage: %s [-vVcaeon] [-d <umad device> | -i <infiniband device> [-p <port_num>]] [-t <timeout (ms)>] [-r <retries>] [-R <rescan time>] [-f <rules file>]\n", argv0);
+	fprintf(stderr, "-v			Verbose\n");
+	fprintf(stderr, "-V			debug Verbose\n");
+	fprintf(stderr, "-c			prints connection Commands\n");
+	fprintf(stderr, "-a			show All - prints also targets that are already connected\n");
+	fprintf(stderr, "-e			Executes connection commands\n");
+	fprintf(stderr, "-o			runs only Once and stops\n");
+	fprintf(stderr, "-d <umad device>	use umad Device\n");
+	fprintf(stderr, "-i <infiniband device>	use InfiniBand device\n");
+	fprintf(stderr, "-p <port_num>		use Port num\n");
+	fprintf(stderr, "-j <dev>:<port_num>	use the IB dev / port_num combination\n");
+	fprintf(stderr, "-R <rescan time>	perform complete Rescan every <rescan time> seconds\n");
+	fprintf(stderr, "-T <retry timeout>	Retries to connect to existing target after Timeout of <retry timeout> seconds\n");
+	fprintf(stderr, "-l <tl_retry timeout>	Transport retry count before failing IO. Should be in range [2..7] (default 2)\n");
+	fprintf(stderr, "-f <rules file>	use rules File to set to which target(s) to connect (default: " SRP_DEAMON_CONFIG_FILE ")\n");
+	fprintf(stderr, "-t <timeout>		Timeout for mad response in milliseconds\n");
+	fprintf(stderr, "-r <retries>		number of send Retries for each mad\n");
+	fprintf(stderr, "-n			New connection command format - use also initiator extension\n");
+	fprintf(stderr, "--systemd		Enable systemd integration.\n");
+	fprintf(stderr, "\nExample: srp_daemon -e -n -i mthca0 -p 1 -R 60\n");
+}
+
+static int
+check_equal_uint64(char *dir_name, const char *attr, uint64_t val)
+{
+	uint64_t attr_value;
+
+	if (srpd_sys_read_uint64(dir_name, attr, &attr_value))
+		return 0;
+
+	return attr_value == val;
+}
+
+static int
+check_equal_uint16(char *dir_name, const char *attr, uint16_t val)
+{
+	uint64_t attr_value;
+
+	if (srpd_sys_read_uint64(dir_name, attr, &attr_value))
+		return 0;
+
+	return val == (attr_value & 0xffff);
+}
+
+static int recalc(struct resources *res);
+
+static void pr_cmd(char *target_str, int not_connected)
+{
+	int ret;
+
+	if (config->cmd)
+		printf("%s\n", target_str);
+
+	if (config->execute && not_connected) {
+		int fd = open(config->add_target_file, O_WRONLY);
+		if (fd < 0) {
+			pr_err("unable to open %s, maybe ib_srp is not loaded\n", config->add_target_file);
+			return;
+		}
+		ret = write(fd, target_str, strlen(target_str));
+		pr_debug("Adding target returned %d\n", ret);
+		close(fd);
+	}
+}
+
+void pr_debug(const char *fmt, ...)
+{ + va_list args; + + if (!config->debug_verbose) + return; + + va_start(args, fmt); + vprintf(fmt, args); + va_end(args); +} + +void pr_err(const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + switch (s_log_dest) { + case log_to_syslog: + vsyslog(LOG_DAEMON | LOG_ERR, fmt, args); + break; + case log_to_stderr: + vfprintf(stderr, fmt, args); + break; + } + va_end(args); +} + +static int check_not_equal_str(const char *dir_name, const char *attr, + const char *value) +{ + char attr_value[64]; + int len = strlen(value); + + if (len > sizeof(attr_value)) { + pr_err("string %s is too long\n", value); + return 1; + } + + if (srpd_sys_read_string(dir_name, attr, attr_value, + sizeof(attr_value))) + return 0; + if (strncmp(attr_value, value, len)) + return 1; + + return 0; +} + +static int check_not_equal_int(const char *dir_name, const char *attr, + int value) +{ + char attr_value[64]; + + if (srpd_sys_read_string(dir_name, attr, attr_value, + sizeof(attr_value))) + return 0; + if (value != atoi(attr_value)) + return 1; + + return 0; +} + +static int is_enabled_by_rules_file(struct target_details *target) +{ + int rule; + struct config_t *conf = config; + + if (NULL == conf->rules) { + pr_debug("Allowing SRP target with id_ext %s because not using a rules file\n", target->id_ext); + return 1; + } + + rule = -1; + do { + rule++; + if (conf->rules[rule].id_ext[0] != '\0' && + strtoull(target->id_ext, NULL, 16) != + strtoull(conf->rules[rule].id_ext, NULL, 16)) + continue; + + if (conf->rules[rule].ioc_guid[0] != '\0' && + be64toh(target->ioc_prof.guid) != + strtoull(conf->rules[rule].ioc_guid, NULL, 16)) + continue; + + if (conf->rules[rule].dgid[0] != '\0') { + char tmp = conf->rules[rule].dgid[16]; + conf->rules[rule].dgid[16] = '\0'; + if (strtoull(conf->rules[rule].dgid, NULL, 16) != + target->subnet_prefix) { + conf->rules[rule].dgid[16] = tmp; + continue; + } + conf->rules[rule].dgid[16] = tmp; + if (strtoull(&conf->rules[rule].dgid[16], NULL, 16) != + target->h_guid) + continue; + } + + if (conf->rules[rule].service_id[0] != '\0' && + strtoull(conf->rules[rule].service_id, NULL, 16) != + target->h_service_id) + continue; + + if (conf->rules[rule].pkey[0] != '\0' && + (uint16_t)strtoul(conf->rules[rule].pkey, NULL, 16) != + target->pkey) + continue; + + target->options = conf->rules[rule].options; + + pr_debug("SRP target with id_ext %s %s by rules file\n", + target->id_ext, + conf->rules[rule].allow ? 
"allowed" : "disallowed"); + return conf->rules[rule].allow; + + } while (1); +} + + +static bool use_imm_data(void) +{ + bool ret = false; + char flag = 0; + int cnt; + int fd = open("/sys/module/ib_srp/parameters/use_imm_data", O_RDONLY); + + if (fd < 0) + return false; + cnt = read(fd, &flag, 1); + if (cnt != 1) { + close(fd); + return false; + } + + if (!strncmp(&flag, "Y", 1)) + ret = true; + close(fd); + return ret; +} + +static bool imm_data_size_gt_send_size(unsigned int send_size) +{ + bool ret = false; + unsigned int srp_max_imm_data = 0; + FILE *fp = fopen("/sys/module/ib_srp/parameters/max_imm_data", "r"); + int cnt; + + if (fp == NULL) + return ret; + + cnt = fscanf(fp, "%d", &srp_max_imm_data); + if (cnt <= 0) { + fclose(fp); + return ret; + } + + if (srp_max_imm_data > send_size) + ret = true; + + fclose(fp); + return ret; +} + +static int add_non_exist_target(struct target_details *target) +{ + char scsi_host_dir[256]; + DIR *dir; + struct dirent *subdir; + char *subdir_name_ptr; + int prefix_len; + union umad_gid dgid_val; + char target_config_str[255]; + int len; + int not_connected = 1; + unsigned int send_size; + + pr_debug("Found an SRP target with id_ext %s - check if it is already connected\n", target->id_ext); + + strcpy(scsi_host_dir, "/sys/class/scsi_host/"); + dir=opendir(scsi_host_dir); + if (!dir) { + perror("opendir - /sys/class/scsi_host/"); + return -1; + } + prefix_len = strlen(scsi_host_dir); + subdir_name_ptr = scsi_host_dir + prefix_len; + + subdir = (void *) 1; /* Dummy value to enter the loop */ + while (subdir) { + subdir = readdir(dir); + + if (!subdir) + continue; + + if (subdir->d_name[0] == '.') + continue; + + strncpy(subdir_name_ptr, subdir->d_name, + sizeof(scsi_host_dir) - prefix_len); + if (!check_equal_uint64(scsi_host_dir, "id_ext", + strtoull(target->id_ext, NULL, 16))) + continue; + if (!check_equal_uint16(scsi_host_dir, "pkey", target->pkey) && + !config->execute) + continue; + + if (!check_equal_uint64(scsi_host_dir, "service_id", + target->h_service_id)) + continue; + if (!check_equal_uint64(scsi_host_dir, "ioc_guid", + be64toh(target->ioc_prof.guid))) + continue; + if (srpd_sys_read_gid(scsi_host_dir, "orig_dgid", + dgid_val.raw)) { + /* + * In case this is an old kernel that does not have + * orig_dgid in sysfs, use dgid instead (this is + * problematic when there is a dgid redirection + * by the CM) + */ + if (srpd_sys_read_gid(scsi_host_dir, "dgid", + dgid_val.raw)) + continue; + } + if (htobe64(target->subnet_prefix) != + dgid_val.global.subnet_prefix) + continue; + if (htobe64(target->h_guid) != dgid_val.global.interface_id) + continue; + + /* If there is no local_ib_device in the scsi host dir (old kernel module), assumes it is equal */ + if (check_not_equal_str(scsi_host_dir, "local_ib_device", config->dev_name)) + continue; + + /* If there is no local_ib_port in the scsi host dir (old kernel module), assumes it is equal */ + if (check_not_equal_int(scsi_host_dir, "local_ib_port", config->port_num)) + continue; + + /* there is a match - this target is already connected */ + + /* There is a rare possibility of a race in the following + scenario: + a. A link goes down, + b. ib_srp decide to remove the corresponding scsi_host. + c. Before removing it, the link returns + d. srp_daemon gets trap 64. + e. srp_daemon thinks that this target is still + connected (ib_srp has not removed it yet) so it + does not connect to it. + f. ib_srp continue to remove the scsi_host. 
+ As a result there is no connection to a target in the fabric + and there will not be a new trap. + + To solve this race we schedule here another call to check + if this target exist in the near future. + */ + + + + /* If there is a need to print all we will continue to pr_cmd. + not_connected is set to zero to make sure that this target + will be printed but not connected. + */ + if (config->all) { + not_connected = 0; + break; + } + + pr_debug("This target is already connected - skip\n"); + closedir(dir); + + return 0; + + } + + len = snprintf(target_config_str, sizeof(target_config_str), "id_ext=%s," + "ioc_guid=%016llx," + "dgid=%016llx%016llx," + "pkey=%04x," + "service_id=%016llx", + target->id_ext, + (unsigned long long) be64toh(target->ioc_prof.guid), + (unsigned long long) target->subnet_prefix, + (unsigned long long) target->h_guid, + target->pkey, + (unsigned long long) target->h_service_id); + if (len >= sizeof(target_config_str)) { + pr_err("Target config string is too long, ignoring target\n"); + closedir(dir); + return -1; + } + + if (target->ioc_prof.io_class != htobe16(SRP_REV16A_IB_IO_CLASS)) { + len += snprintf(target_config_str+len, + sizeof(target_config_str) - len, + ",io_class=%04hx", be16toh(target->ioc_prof.io_class)); + + if (len >= sizeof(target_config_str)) { + pr_err("Target config string is too long, ignoring target\n"); + closedir(dir); + return -1; + } + } + + if (config->print_initiator_ext) { + len += snprintf(target_config_str+len, + sizeof(target_config_str) - len, + ",initiator_ext=%016llx", + (unsigned long long) target->h_guid); + + if (len >= sizeof(target_config_str)) { + pr_err("Target config string is too long, ignoring target\n"); + closedir(dir); + return -1; + } + } + + if (config->execute && config->tl_retry_count) { + len += snprintf(target_config_str + len, + sizeof(target_config_str) - len, + ",tl_retry_count=%d", config->tl_retry_count); + + if (len >= sizeof(target_config_str)) { + pr_err("Target config string is too long, ignoring target\n"); + closedir(dir); + return -1; + } + } + + if (target->options) { + len += snprintf(target_config_str+len, + sizeof(target_config_str) - len, + "%s", + target->options); + + if (len >= sizeof(target_config_str)) { + pr_err("Target config string is too long, ignoring target\n"); + closedir(dir); + return -1; + } + } + + /* + * The SRP initiator stops parsing parameters if it encounters + * an unrecognized parameter. Rest parameters will be ignored. + * Append 'max_it_iu_size' in the very end of login string to + * avoid breaking SRP login. + */ + send_size = be32toh(target->ioc_prof.send_size); + if (use_imm_data() && imm_data_size_gt_send_size(send_size)) { + len += snprintf(target_config_str+len, + sizeof(target_config_str) - len, + ",max_it_iu_size=%d", send_size); + + if (len >= sizeof(target_config_str)) { + pr_err("Target config string is too long, ignoring target\n"); + closedir(dir); + return -1; + } + } + + target_config_str[len] = '\0'; + + pr_cmd(target_config_str, not_connected); + + closedir(dir); + + return 1; +} + +static int send_and_get(int portid, int agent, struct srp_ib_user_mad *out_mad, + struct srp_ib_user_mad *in_mad, int in_mad_size) +{ + struct umad_dm_packet *out_dm_mad = (void *) out_mad->hdr.data; + struct umad_dm_packet *in_dm_mad = (void *) in_mad->hdr.data; + int i, len; + int in_agent; + int ret; + static uint32_t tid; + uint32_t received_tid; + + for (i = 0; i < config->mad_retries; ++i) { + /* Skip tid 0 because OpenSM ignores it. 
*/ + if (++tid == 0) + ++tid; + out_dm_mad->mad_hdr.tid = htobe64(tid); + + ret = umad_send(portid, agent, out_mad, MAD_BLOCK_SIZE, + config->timeout, 0); + if (ret < 0) { + pr_err("umad_send to %u failed\n", + (uint16_t) be16toh(out_mad->hdr.addr.lid)); + return ret; + } + + do { +recv: + len = in_mad_size ? in_mad_size : MAD_BLOCK_SIZE; + in_agent = umad_recv(portid, (struct ib_user_mad *) in_mad, + &len, config->timeout); + if (in_agent < 0) { + pr_err("umad_recv from %u failed - %d\n", + (uint16_t) be16toh(out_mad->hdr.addr.lid), + in_agent); + return in_agent; + } + if (in_agent != agent) { + pr_debug("umad_recv returned different agent\n"); + goto recv; + } + + ret = umad_status(in_mad); + if (ret) { + pr_err( + "bad MAD status (%u) from lid %#x\n", + ret, be16toh(out_mad->hdr.addr.lid)); + return -ret; + } + + received_tid = be64toh(in_dm_mad->mad_hdr.tid); + if (tid != received_tid) + pr_debug("umad_recv returned different transaction id sent %d got %d\n", + tid, received_tid); + + } while ((int32_t)(tid - received_tid) > 0); + + if (len > 0) + return len; + } + + return -1; +} + +static void initialize_sysfs(void) +{ + char *env; + + env = getenv("SYSFS_PATH"); + if (env) { + int len; + char *dup; + + sysfs_path = dup = strndup(env, 256); + len = strlen(dup); + while (len > 0 && dup[len - 1] == '/') { + --len; + dup[len] = '\0'; + } + } +} + +static int translate_umad_to_ibdev_and_port(char *umad_dev, char **ibdev, + char **ibport) +{ + char *class_dev_path; + char *umad_dev_name; + int ret; + + *ibdev = NULL; + *ibport = NULL; + + umad_dev_name = rindex(umad_dev, '/'); + if (!umad_dev_name) { + pr_err("Couldn't find device name in '%s'\n", umad_dev); + return -1; + } + + ret = asprintf(&class_dev_path, "%s/class/infiniband_mad/%s", sysfs_path, + umad_dev_name); + + if (ret < 0) { + pr_err("out of memory\n"); + return -ENOMEM; + } + + *ibdev = malloc(IBDEV_STR_SIZE); + if (!*ibdev) { + pr_err("out of memory\n"); + ret = -ENOMEM; + goto end; + } + + if (srpd_sys_read_string(class_dev_path, "ibdev", *ibdev, + IBDEV_STR_SIZE) < 0) { + pr_err("Couldn't read ibdev attribute\n"); + ret = -1; + goto end; + } + + *ibport = malloc(IBPORT_STR_SIZE); + if (!*ibport) { + pr_err("out of memory\n"); + ret = -ENOMEM; + goto end; + } + if (srpd_sys_read_string(class_dev_path, "port", *ibport, IBPORT_STR_SIZE) < 0) { + pr_err("Couldn't read port attribute\n"); + ret = -1; + goto end; + } + + ret = 0; + +end: + if (ret) { + free(*ibport); + free(*ibdev); + *ibdev = NULL; + } + free(class_dev_path); + + return ret; +} + +static void init_srp_mad(struct srp_ib_user_mad *out_umad, int agent, + uint16_t h_dlid, uint16_t h_attr_id, uint32_t h_attr_mod) +{ + struct umad_dm_packet *out_mad; + + memset(out_umad, 0, sizeof *out_umad); + + out_umad->hdr.agent_id = agent; + out_umad->hdr.addr.qpn = htobe32(1); + out_umad->hdr.addr.qkey = htobe32(UMAD_QKEY); + out_umad->hdr.addr.lid = htobe16(h_dlid); + + out_mad = (void *) out_umad->hdr.data; + + out_mad->mad_hdr.base_version = UMAD_BASE_VERSION; + out_mad->mad_hdr.method = UMAD_METHOD_GET; + out_mad->mad_hdr.attr_id = htobe16(h_attr_id); + out_mad->mad_hdr.attr_mod = htobe32(h_attr_mod); +} + +static void init_srp_dm_mad(struct srp_ib_user_mad *out_mad, int agent, uint16_t h_dlid, + uint16_t h_attr_id, uint32_t h_attr_mod) +{ + struct umad_sa_packet *out_dm_mad = get_data_ptr(*out_mad); + + init_srp_mad(out_mad, agent, h_dlid, h_attr_id, h_attr_mod); + out_dm_mad->mad_hdr.mgmt_class = UMAD_CLASS_DEVICE_MGMT; + out_dm_mad->mad_hdr.class_version = 1; +} + 
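+/*
+ * Usage sketch (illustrative comment only): the init_srp_*_mad() helpers
+ * above are combined with send_and_get() roughly as follows; the variable
+ * names mirror the query functions below, and get_iou_info() is a complete
+ * instance of this pattern:
+ *
+ *	struct srp_ib_user_mad out_mad, in_mad;
+ *	struct umad_dm_packet *in_dm_mad;
+ *
+ *	init_srp_dm_mad(&out_mad, umad_res->agent, dlid,
+ *			SRP_DM_ATTR_IO_UNIT_INFO, 0);
+ *	if (send_and_get(umad_res->portid, umad_res->agent, &out_mad,
+ *			 &in_mad, 0) < 0)
+ *		return -1;
+ *	in_dm_mad = get_data_ptr(in_mad);
+ *	if (in_dm_mad->mad_hdr.status)
+ *		return -1;
+ *
+ * The MAD status must always be checked before the payload is used.
+ */
+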
+static void init_srp_sa_mad(struct srp_ib_user_mad *out_mad, int agent, uint16_t h_dlid, + uint16_t h_attr_id, uint32_t h_attr_mod) +{ + struct umad_sa_packet *out_sa_mad = get_data_ptr(*out_mad); + + init_srp_mad(out_mad, agent, h_dlid, h_attr_id, h_attr_mod); + out_sa_mad->mad_hdr.mgmt_class = UMAD_CLASS_SUBN_ADM; + out_sa_mad->mad_hdr.class_version = UMAD_SA_CLASS_VERSION; +} + +static int check_sm_cap(struct umad_resources *umad_res, int *mask_match) +{ + struct srp_ib_user_mad out_mad, in_mad; + struct umad_sa_packet *in_sa_mad; + struct umad_class_port_info *cpi; + int ret; + + in_sa_mad = get_data_ptr(in_mad); + + init_srp_sa_mad(&out_mad, umad_res->agent, umad_res->sm_lid, + UMAD_ATTR_CLASS_PORT_INFO, 0); + + ret = send_and_get(umad_res->portid, umad_res->agent, &out_mad, &in_mad, 0); + if (ret < 0) + return ret; + + cpi = (void *) in_sa_mad->data; + + *mask_match = !!(be16toh(cpi->cap_mask) & SRP_SM_SUPPORTS_MASK_MATCH); + + return 0; +} + +int pkey_index_to_pkey(struct umad_resources *umad_res, int pkey_index, + __be16 *pkey) +{ + if (ibv_query_pkey(umad_res->ib_ctx, config->port_num, pkey_index, + pkey) < 0) + return -1; + if (*pkey) + pr_debug("discover Targets for P_key %04x (index %d)\n", + *pkey, pkey_index); + return 0; +} + +static int pkey_to_pkey_index(struct umad_resources *umad_res, uint16_t h_pkey, + uint16_t *pkey_index) +{ + int res = ibv_get_pkey_index(umad_res->ib_ctx, config->port_num, + htobe16(h_pkey)); + if (res >= 0) + *pkey_index = res; + return res; +} + +static int set_class_port_info(struct umad_resources *umad_res, uint16_t dlid, uint16_t h_pkey) +{ + struct srp_ib_user_mad in_mad, out_mad; + struct umad_dm_packet *out_dm_mad, *in_dm_mad; + struct umad_class_port_info *cpi; + char val[64]; + int i; + + init_srp_dm_mad(&out_mad, umad_res->agent, dlid, UMAD_ATTR_CLASS_PORT_INFO, 0); + + if (pkey_to_pkey_index(umad_res, h_pkey, &out_mad.hdr.addr.pkey_index) + < 0) { + pr_err("set_class_port_info: Unable to find pkey_index for pkey %#x\n", h_pkey); + return -1; + } + + out_dm_mad = get_data_ptr(out_mad); + out_dm_mad->mad_hdr.method = UMAD_METHOD_SET; + + cpi = (void *) out_dm_mad->data; + + if (srpd_sys_read_string(umad_res->port_sysfs_path, "lid", val, sizeof val) < 0) { + pr_err("Couldn't read LID\n"); + return -1; + } + + cpi->trap_lid = htobe16(strtol(val, NULL, 0)); + + if (srpd_sys_read_string(umad_res->port_sysfs_path, "gids/0", val, sizeof val) < 0) { + pr_err("Couldn't read GID[0]\n"); + return -1; + } + + for (i = 0; i < 8; ++i) + cpi->trapgid.raw_be16[i] = htobe16(strtol(val + i * 5, NULL, 16)); + + if (send_and_get(umad_res->portid, umad_res->agent, &out_mad, &in_mad, 0) < 0) + return -1; + + in_dm_mad = get_data_ptr(in_mad); + if (in_dm_mad->mad_hdr.status) { + pr_err("Class Port Info set returned status 0x%04x\n", + be16toh(in_dm_mad->mad_hdr.status)); + return -1; + } + + return 0; +} + +static int get_iou_info(struct umad_resources *umad_res, uint16_t dlid, + uint16_t h_pkey, struct srp_dm_iou_info *iou_info) +{ + struct srp_ib_user_mad in_mad, out_mad; + struct umad_dm_packet *in_dm_mad; + + init_srp_dm_mad(&out_mad, umad_res->agent, dlid, SRP_DM_ATTR_IO_UNIT_INFO, 0); + if (pkey_to_pkey_index(umad_res, h_pkey, &out_mad.hdr.addr.pkey_index) + < 0) { + pr_err("get_iou_info: Unable to find pkey_index for pkey %#x\n", h_pkey); + return -1; + } + + if (send_and_get(umad_res->portid, umad_res->agent, &out_mad, &in_mad, 0) < 0) + return -1; + + in_dm_mad = get_data_ptr(in_mad); + if (in_dm_mad->mad_hdr.status) { + pr_err("IO Unit Info query 
returned status 0x%04x\n", + be16toh(in_dm_mad->mad_hdr.status)); + return -1; + } + + memcpy(iou_info, in_dm_mad->data, sizeof *iou_info); +/* + pr_debug("iou_info->max_controllers is %d\n", iou_info->max_controllers); +*/ + return 0; +} + +static int get_ioc_prof(struct umad_resources *umad_res, uint16_t h_dlid, uint16_t h_pkey, int ioc, + struct srp_dm_ioc_prof *ioc_prof) +{ + struct srp_ib_user_mad in_mad, out_mad; + struct umad_dm_packet *in_dm_mad; + + init_srp_dm_mad(&out_mad, umad_res->agent, h_dlid, SRP_DM_ATTR_IO_CONTROLLER_PROFILE, ioc); + + if (pkey_to_pkey_index(umad_res, h_pkey, &out_mad.hdr.addr.pkey_index) + < 0) { + pr_err("get_ioc_prof: Unable to find pkey_index for pkey %#x\n", + h_pkey); + return -1; + } + + if (send_and_get(umad_res->portid, umad_res->agent, &out_mad, &in_mad, 0) < 0) + return -1; + + in_dm_mad = get_data_ptr(in_mad); + if (in_dm_mad->mad_hdr.status) { + pr_err("IO Controller Profile query returned status 0x%04x for %d\n", + be16toh(in_dm_mad->mad_hdr.status), ioc); + return -1; + } + + memcpy(ioc_prof, in_dm_mad->data, sizeof *ioc_prof); + + return 0; +} + +static int get_svc_entries(struct umad_resources *umad_res, uint16_t dlid, uint16_t h_pkey, int ioc, + int start, int end, struct srp_dm_svc_entries *svc_entries) +{ + struct srp_ib_user_mad in_mad, out_mad; + struct umad_dm_packet *in_dm_mad; + + init_srp_dm_mad(&out_mad, umad_res->agent, dlid, SRP_DM_ATTR_SERVICE_ENTRIES, + (ioc << 16) | (end << 8) | start); + + if (pkey_to_pkey_index(umad_res, h_pkey, &out_mad.hdr.addr.pkey_index) + < 0) { + pr_err("get_svc_entries: Unable to find pkey_index for pkey %#x\n", + h_pkey); + return -1; + } + + if (send_and_get(umad_res->portid, umad_res->agent, &out_mad, &in_mad, 0) < 0) + return -1; + + in_dm_mad = get_data_ptr(in_mad); + if (in_dm_mad->mad_hdr.status) { + pr_err("Service Entries query returned status 0x%04x\n", + be16toh(in_dm_mad->mad_hdr.status)); + return -1; + } + + memcpy(svc_entries, in_dm_mad->data, sizeof *svc_entries); + + return 0; +} + +static int do_port(struct resources *res, uint16_t pkey, uint16_t dlid, + uint64_t subnet_prefix, uint64_t h_guid) +{ + struct umad_resources *umad_res = res->umad_res; + struct srp_dm_iou_info iou_info; + struct srp_dm_svc_entries svc_entries; + int i, j, k, ret; + + static const uint64_t topspin_oui = 0x0005ad0000000000ull; + static const uint64_t oui_mask = 0xffffff0000000000ull; + + struct target_details *target = (struct target_details *) + malloc(sizeof(struct target_details)); + + target->subnet_prefix = subnet_prefix; + target->h_guid = h_guid; + target->options = NULL; + + pr_debug("enter do_port\n"); + if ((target->h_guid & oui_mask) == topspin_oui && + set_class_port_info(umad_res, dlid, pkey)) + pr_err("Warning: set of ClassPortInfo failed\n"); + + ret = get_iou_info(umad_res, dlid, pkey, &iou_info); + if (ret < 0) { + pr_err("failed to get iou info for dlid %#x\n", dlid); + goto out; + } + + pr_human("IO Unit Info:\n"); + pr_human(" port LID: %04x\n", dlid); + pr_human(" port GID: %016llx%016llx\n", + (unsigned long long) target->subnet_prefix, + (unsigned long long) target->h_guid); + pr_human(" change ID: %04x\n", be16toh(iou_info.change_id)); + pr_human(" max controllers: 0x%02x\n", iou_info.max_controllers); + + if (config->verbose > 0) + for (i = 0; i < iou_info.max_controllers; ++i) { + pr_human(" controller[%3d]: ", i + 1); + switch ((iou_info.controller_list[i / 2] >> + (4 * (1 - i % 2))) & 0xf) { + case SRP_DM_NO_IOC: pr_human("not installed\n"); break; + case SRP_DM_IOC_PRESENT: 
pr_human("present\n"); break; + case SRP_DM_NO_SLOT: pr_human("no slot\n"); break; + default: pr_human("<unknown>\n"); break; + } + } + + for (i = 0; i < iou_info.max_controllers; ++i) { + if (((iou_info.controller_list[i / 2] >> (4 * (1 - i % 2))) & 0xf) == + SRP_DM_IOC_PRESENT) { + pr_human("\n"); + + if (get_ioc_prof(umad_res, dlid, pkey, i + 1, &target->ioc_prof)) + continue; + + pr_human(" controller[%3d]\n", i + 1); + + pr_human(" GUID: %016llx\n", + (unsigned long long) be64toh(target->ioc_prof.guid)); + pr_human(" vendor ID: %06x\n", be32toh(target->ioc_prof.vendor_id) >> 8); + pr_human(" device ID: %06x\n", be32toh(target->ioc_prof.device_id)); + pr_human(" IO class : %04hx\n", be16toh(target->ioc_prof.io_class)); + pr_human(" Maximum size of Send Messages in bytes: %d\n", + be32toh(target->ioc_prof.send_size)); + pr_human(" ID: %s\n", target->ioc_prof.id); + pr_human(" service entries: %d\n", target->ioc_prof.service_entries); + + for (j = 0; j < target->ioc_prof.service_entries; j += 4) { + int n; + + n = j + 3; + if (n >= target->ioc_prof.service_entries) + n = target->ioc_prof.service_entries - 1; + + if (get_svc_entries(umad_res, dlid, pkey, i + 1, + j, n, &svc_entries)) + continue; + + for (k = 0; k <= n - j; ++k) { + + if (sscanf(svc_entries.service[k].name, + "SRP.T10:%16s", + target->id_ext) != 1) + continue; + + pr_human(" service[%3d]: %016llx / %s\n", + j + k, + (unsigned long long) be64toh(svc_entries.service[k].id), + svc_entries.service[k].name); + + target->h_service_id = be64toh(svc_entries.service[k].id); + target->pkey = pkey; + if (is_enabled_by_rules_file(target)) { + if (!add_non_exist_target(target) && !config->once) { + target->retry_time = + time(NULL) + config->retry_timeout; + push_to_retry_list(res->sync_res, target); + } + } + } + } + } + } + + pr_human("\n"); + +out: + free(target); + return ret; +} + +int get_node(struct umad_resources *umad_res, uint16_t dlid, uint64_t *guid) +{ + struct srp_ib_user_mad out_mad, in_mad; + struct umad_sa_packet *out_sa_mad, *in_sa_mad; + struct srp_sa_node_rec *node; + + in_sa_mad = get_data_ptr(in_mad); + out_sa_mad = get_data_ptr(out_mad); + + init_srp_sa_mad(&out_mad, umad_res->agent, umad_res->sm_lid, + UMAD_SA_ATTR_NODE_REC, 0); + + out_sa_mad->comp_mask = htobe64(1); /* LID */ + node = (void *) out_sa_mad->data; + node->lid = htobe16(dlid); + + if (send_and_get(umad_res->portid, umad_res->agent, &out_mad, &in_mad, 0) < 0) + return -1; + + node = (void *) in_sa_mad->data; + *guid = be64toh(node->port_guid); + + return 0; +} + +static int get_port_info(struct umad_resources *umad_res, uint16_t dlid, + uint64_t *subnet_prefix, int *isdm) +{ + struct srp_ib_user_mad out_mad, in_mad; + struct umad_sa_packet *out_sa_mad, *in_sa_mad; + struct srp_sa_port_info_rec *port_info; + + in_sa_mad = get_data_ptr(in_mad); + out_sa_mad = get_data_ptr(out_mad); + + init_srp_sa_mad(&out_mad, umad_res->agent, umad_res->sm_lid, + UMAD_SA_ATTR_PORT_INFO_REC, 0); + + out_sa_mad->comp_mask = htobe64(1); /* LID */ + port_info = (void *) out_sa_mad->data; + port_info->endport_lid = htobe16(dlid); + + if (send_and_get(umad_res->portid, umad_res->agent, &out_mad, &in_mad, 0) < 0) + return -1; + + port_info = (void *) in_sa_mad->data; + *subnet_prefix = be64toh(port_info->subnet_prefix); + *isdm = !!(be32toh(port_info->capability_mask) & SRP_IS_DM); + + return 0; +} + +static int get_shared_pkeys(struct resources *res, + uint16_t dest_port_lid, + uint16_t *pkeys) +{ + struct umad_resources *umad_res = res->umad_res; + uint8_t *in_mad_buf; + 
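+	/* in_mad_buf is heap-allocated because the SA reply may be an RMPP
+	 * response larger than a single MAD block (see the malloc() below) */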
struct srp_ib_user_mad out_mad; + struct ib_user_mad *in_mad; + struct umad_sa_packet *out_sa_mad, *in_sa_mad; + struct ib_path_rec *path_rec; + ssize_t len; + int i, num_pkeys = 0; + __be16 pkey; + uint16_t local_port_lid = get_port_lid(res->ud_res->ib_ctx, + config->port_num, NULL); + + in_mad_buf = malloc(sizeof(struct ib_user_mad) + + node_table_response_size); + if (!in_mad_buf) + return -ENOMEM; + + in_mad = (void *)in_mad_buf; + in_sa_mad = (void *)in_mad->data; + out_sa_mad = get_data_ptr(out_mad); + + init_srp_sa_mad(&out_mad, umad_res->agent, umad_res->sm_lid, + UMAD_SA_ATTR_PATH_REC, 0); + + /** + * Due to OpenSM bug (issue #335016) SM won't return + * table of all shared P_Keys, it will return only the first + * shared P_Key, So we send path_rec over each P_Key in the P_Key + * table. SM will return path record if P_Key is shared or else None. + * Once SM bug will be fixed, this loop should be removed. + **/ + for (i = 0; ; i++) { + if (pkey_index_to_pkey(umad_res, i, &pkey)) + break; + if (!pkey) + continue; + + /* Mark components: DLID, SLID, PKEY */ + out_sa_mad->comp_mask = htobe64(1 << 4 | 1 << 5 | 1 << 13); + path_rec = (struct ib_path_rec *)out_sa_mad->data; + path_rec->slid = htobe16(local_port_lid); + path_rec->dlid = htobe16(dest_port_lid); + path_rec->pkey = pkey; + + len = send_and_get(umad_res->portid, umad_res->agent, &out_mad, + (struct srp_ib_user_mad *)in_mad, + node_table_response_size); + if (len < 0) + goto err; + + path_rec = (struct ib_path_rec *)in_sa_mad->data; + pkeys[num_pkeys++] = be16toh(path_rec->pkey); + } + + free(in_mad_buf); + return num_pkeys; +err: + free(in_mad_buf); + return -1; +} + +static int do_dm_port_list(struct resources *res) +{ + struct umad_resources *umad_res = res->umad_res; + uint8_t *in_mad_buf; + struct srp_ib_user_mad out_mad; + struct ib_user_mad *in_mad; + struct umad_sa_packet *out_sa_mad, *in_sa_mad; + struct srp_sa_port_info_rec *port_info; + ssize_t len; + int size; + int i, j,num_pkeys; + uint16_t pkeys[SRP_MAX_SHARED_PKEYS]; + uint64_t guid; + + in_mad_buf = malloc(sizeof(struct ib_user_mad) + + node_table_response_size); + if (!in_mad_buf) + return -ENOMEM; + + in_mad = (void *) in_mad_buf; + in_sa_mad = (void *) in_mad->data; + out_sa_mad = get_data_ptr(out_mad); + + init_srp_sa_mad(&out_mad, umad_res->agent, umad_res->sm_lid, + UMAD_SA_ATTR_PORT_INFO_REC, SRP_SM_CAP_MASK_MATCH_ATTR_MOD); + + out_sa_mad->mad_hdr.method = UMAD_SA_METHOD_GET_TABLE; + out_sa_mad->comp_mask = htobe64(1 << 7); /* Capability mask */ + out_sa_mad->rmpp_hdr.rmpp_version = UMAD_RMPP_VERSION; + out_sa_mad->rmpp_hdr.rmpp_type = 1; + port_info = (void *) out_sa_mad->data; + port_info->capability_mask = htobe32(SRP_IS_DM); /* IsDM */ + + len = send_and_get(umad_res->portid, umad_res->agent, &out_mad, + (struct srp_ib_user_mad *) in_mad, + node_table_response_size); + if (len < 0) { + free(in_mad_buf); + return len; + } + + size = ib_get_attr_size(in_sa_mad->attr_offset); + if (!size) { + if (config->verbose) { + printf("Query did not find any targets\n"); + } + free(in_mad_buf); + return 0; + } + + for (i = 0; (i + 1) * size <= len - MAD_RMPP_HDR_SIZE; ++i) { + port_info = (void *) in_sa_mad->data + i * size; + if (get_node(umad_res, be16toh(port_info->endport_lid), &guid)) + continue; + + num_pkeys = get_shared_pkeys(res, be16toh(port_info->endport_lid), + pkeys); + if (num_pkeys < 0) { + pr_err("failed to get shared P_Keys with LID %#x\n", + be16toh(port_info->endport_lid)); + free(in_mad_buf); + return num_pkeys; + } + + for (j = 0; j < 
num_pkeys; ++j) + do_port(res, pkeys[j], be16toh(port_info->endport_lid), + be64toh(port_info->subnet_prefix), guid); + } + + free(in_mad_buf); + return 0; +} + +void handle_port(struct resources *res, uint16_t pkey, uint16_t lid, uint64_t h_guid) +{ + struct umad_resources *umad_res = res->umad_res; + uint64_t subnet_prefix; + int isdm; + + pr_debug("enter handle_port for lid %#x\n", lid); + if (get_port_info(umad_res, lid, &subnet_prefix, &isdm)) + return; + + if (!isdm) + return; + + do_port(res, pkey, lid, subnet_prefix, h_guid); +} + + +static int do_full_port_list(struct resources *res) +{ + struct umad_resources *umad_res = res->umad_res; + uint8_t *in_mad_buf; + struct srp_ib_user_mad out_mad; + struct ib_user_mad *in_mad; + struct umad_sa_packet *out_sa_mad, *in_sa_mad; + struct srp_sa_node_rec *node; + ssize_t len; + int size; + int i, j, num_pkeys; + uint16_t pkeys[SRP_MAX_SHARED_PKEYS]; + + in_mad_buf = malloc(sizeof(struct ib_user_mad) + + node_table_response_size); + if (!in_mad_buf) + return -ENOMEM; + + in_mad = (void *) in_mad_buf; + in_sa_mad = (void *) in_mad->data; + out_sa_mad = get_data_ptr(out_mad); + + init_srp_sa_mad(&out_mad, umad_res->agent, umad_res->sm_lid, + UMAD_SA_ATTR_NODE_REC, 0); + + out_sa_mad->mad_hdr.method = UMAD_SA_METHOD_GET_TABLE; + out_sa_mad->comp_mask = 0; /* Get all end ports */ + out_sa_mad->rmpp_hdr.rmpp_version = UMAD_RMPP_VERSION; + out_sa_mad->rmpp_hdr.rmpp_type = 1; + + len = send_and_get(umad_res->portid, umad_res->agent, &out_mad, + (struct srp_ib_user_mad *) in_mad, + node_table_response_size); + if (len < 0) { + free(in_mad_buf); + return len; + } + + size = be16toh(in_sa_mad->attr_offset) * 8; + + for (i = 0; (i + 1) * size <= len - MAD_RMPP_HDR_SIZE; ++i) { + node = (void *) in_sa_mad->data + i * size; + + num_pkeys = get_shared_pkeys(res, be16toh(node->lid), + pkeys); + if (num_pkeys < 0) { + pr_err("failed to get shared P_Keys with LID %#x\n", + be16toh(node->lid)); + free(in_mad_buf); + return num_pkeys; + } + + for (j = 0; j < num_pkeys; ++j) + (void) handle_port(res, pkeys[j], be16toh(node->lid), + be64toh(node->port_guid)); + } + + free(in_mad_buf); + return 0; +} + +struct config_t *config; + +static void print_config(struct config_t *conf) +{ + printf(" configuration report\n"); + printf(" ------------------------------------------------\n"); + printf(" Current pid : %u\n", getpid()); + printf(" Device name : \"%s\"\n", conf->dev_name); + printf(" IB port : %u\n", conf->port_num); + printf(" Mad Retries : %d\n", conf->mad_retries); + printf(" Number of outstanding WR : %u\n", conf->num_of_oust); + printf(" Mad timeout (msec) : %u\n", conf->timeout); + printf(" Prints add target command : %d\n", conf->cmd); + printf(" Executes add target command : %d\n", conf->execute); + printf(" Print also connected targets : %d\n", conf->all); + printf(" Report current targets and stop : %d\n", conf->once); + if (conf->rules_file) + printf(" Reads rules from : %s\n", conf->rules_file); + if (conf->print_initiator_ext) + printf(" Print initiator_ext\n"); + else + printf(" Do not print initiator_ext\n"); + if (conf->recalc_time) + printf(" Performs full target rescan every %d seconds\n", conf->recalc_time); + else + printf(" No full target rescan\n"); + if (conf->retry_timeout) + printf(" Retries to connect to existing target after %d seconds\n", conf->retry_timeout); + else + printf(" Do not retry to connect to existing targets\n"); + printf(" ------------------------------------------------\n"); +} + +static char *copy_till_comma(char *d, 
char *s, int len, int base) +{ + int i=0; + + while (strchr(", \t\n", *s) == NULL) { + if (i == len) + return NULL; + if ((base == 16 && isxdigit(*s)) || (base == 10 && isdigit(*s))) { + *d=*s; + ++d; + ++s; + ++i; + } else + return NULL; + } + *d='\0'; + + if (*s == '\n') + return s; + + ++s; + return s; +} + +static char *parse_main_option(struct rule *rule, char *ptr) +{ + struct option_info { + const char *name; + size_t offset; + size_t len; + int base; + }; +#define OPTION_INFO(n, base) { #n "=", offsetof(struct rule, n), \ + sizeof(((struct rule *)NULL)->n), base} + static const struct option_info opt_info[] = { + OPTION_INFO(id_ext, 16), + OPTION_INFO(ioc_guid, 16), + OPTION_INFO(dgid, 16), + OPTION_INFO(service_id, 16), + OPTION_INFO(pkey, 16), + }; + int i, optnamelen; + char *ptr2 = NULL; + + for (i = 0; i < sizeof(opt_info) / sizeof(opt_info[0]); i++) { + optnamelen = strlen(opt_info[i].name); + if (strncmp(ptr, opt_info[i].name, optnamelen) == 0) { + ptr2 = copy_till_comma((char *)rule + + opt_info[i].offset, + ptr + optnamelen, + opt_info[i].len - 1, + opt_info[i].base); + break; + } + } + + return ptr2; +} + +/* + * Return values: + * -1 if the output buffer is not large enough. + * 0 if an unsupported option has been encountered. + * > 0 if parsing succeeded. + */ +static int parse_other_option(struct rule *rule, char *ptr) +{ + static const char *const opt[] = { + "allow_ext_sg=", + "cmd_sg_entries=", + "comp_vector=", + "max_cmd_per_lun=", + "max_sect=", + "queue_size=", + "sg_tablesize=", + "tl_retry_count=", + }; + + char *ptr2 = NULL, *optr, option[17]; + int i, optnamelen, len, left; + + optr = rule->options; + left = sizeof(rule->options); + len = strlen(optr); + optr += len; + left -= len; + for (i = 0; i < sizeof(opt)/sizeof(opt[0]); ++i) { + optnamelen = strlen(opt[i]); + if (strncmp(ptr, opt[i], optnamelen) != 0) + continue; + ptr2 = copy_till_comma(option, ptr + optnamelen, + sizeof(option) - 1, 10); + if (!ptr2) + return -1; + len = snprintf(optr, left, ",%s%s", opt[i], option); + optr += len; + left -= len; + if (left <= 0) + return -1; + break; + } + return ptr2 ? 
ptr2 - ptr : 0; +} + +static int get_rules_file(struct config_t *conf) +{ + int line_number = 1, len, line_number_for_output, ret = -1; + char line[255]; + char *ptr, *ptr2; + struct rule *rule; + FILE *infile = fopen(conf->rules_file, "r"); + + if (infile == NULL) { + pr_debug("Could not find rules file %s, going with default\n", + conf->rules_file); + return 0; + } + + while (fgets(line, sizeof(line), infile) != NULL) { + if (line[0] != '#' && line[0] != '\n') + line_number++; + } + + if (fseek(infile, 0L, SEEK_SET) != 0) { + pr_err("internal error while seeking %s\n", conf->rules_file); + goto out; + } + + conf->rules = malloc(sizeof(struct rule) * line_number); + + rule = &conf->rules[0] - 1; + line_number_for_output = 0; + while (fgets(line, sizeof(line), infile) != NULL) { + line_number_for_output++; + if (line[0] == '#' || line[0] == '\n') + continue; + + rule++; + switch (line[0]) { + case 'a': + case 'A': + rule->allow = 1; + break; + case 'd': + case 'D': + rule->allow = 0; + break; + default: + pr_err("Bad syntax in rules file %s line %d:" + " line should start with 'a' or 'd'\n", + conf->rules_file, line_number_for_output); + goto out; + } + + rule->id_ext[0] = '\0'; + rule->ioc_guid[0] = '\0'; + rule->dgid[0] = '\0'; + rule->service_id[0] = '\0'; + rule->pkey[0] = '\0'; + rule->options[0] = '\0'; + + ptr = &line[1]; + while (*ptr == ' ' || *ptr == '\t') + ptr++; + + while (*ptr != '\n') { + ptr2 = parse_main_option(rule, ptr); + if (!ptr2 && rule->allow) { + len = parse_other_option(rule, ptr); + if (len < 0) { + pr_err("Buffer overflow triggered by" + " rules file %s line %d\n", + conf->rules_file, + line_number_for_output); + goto out; + } + ptr2 = len ? ptr + len : NULL; + } + + if (ptr2 == NULL) { + pr_err("Bad syntax in rules file %s line %d\n", + conf->rules_file, line_number_for_output); + goto out; + } + ptr = ptr2; + + while (*ptr == ' ' || *ptr == '\t') + ptr++; + } + } + rule++; + rule->id_ext[0] = '\0'; + rule->ioc_guid[0] = '\0'; + rule->dgid[0] = '\0'; + rule->service_id[0] = '\0'; + rule->pkey[0] = '\0'; + rule->options[0] = '\0'; + rule->allow = 1; + ret = 0; + +out: + fclose(infile); + + return ret; +} + +static int set_conf_dev_and_port(char *umad_dev, struct config_t *conf) +{ + int ret; + + if (umad_dev) { + char *ibport; + + ret = translate_umad_to_ibdev_and_port(umad_dev, + &conf->dev_name, + &ibport); + if (ret) { + pr_err("Fail to translate umad to ibdev and port\n"); + goto out; + } + conf->port_num = atoi(ibport); + if (conf->port_num == 0) { + pr_err("Bad port number %s\n", ibport); + ret = -1; + } + free(ibport); + } else { + umad_ca_t ca; + umad_port_t port; + + ret = umad_get_ca(NULL, &ca); + if (ret) { + pr_err("Failed to get default CA\n"); + goto out; + } + + ret = umad_get_port(ca.ca_name, 0, &port); + if (ret) { + pr_err("Failed to get default port for CA %s\n", + ca.ca_name); + umad_release_ca(&ca); + goto out; + } + conf->dev_name = strdup(ca.ca_name); + conf->port_num = port.portnum; + umad_release_port(&port); + umad_release_ca(&ca); + pr_debug("Using device %s port %d\n", conf->dev_name, + conf->port_num); + } +out: + return ret; +} + +static const struct option long_opts[] = { + { "systemd", 0, NULL, 'S' }, + {} +}; +static const char short_opts[] = "caveod:i:j:p:t:r:R:T:l:Vhnf:"; + +/* Check if the --systemd options was passed in very early so we can setup + * logging properly. 
+ */ +static bool is_systemd(int argc, char *argv[]) +{ + while (1) { + int c; + + c = getopt_long(argc, argv, short_opts, long_opts, NULL); + if (c == -1) + break; + if (c == 'S') + return true; + + } + return false; +} + +static int get_config(struct config_t *conf, int argc, char *argv[]) +{ + /* set defaults */ + char* umad_dev = NULL; + int ret; + + conf->port_num = 1; + conf->num_of_oust = 10; + conf->dev_name = NULL; + conf->cmd = 0; + conf->once = 0; + conf->execute = 0; + conf->all = 0; + conf->verbose = 0; + conf->debug_verbose = 0; + conf->timeout = 5000; + conf->mad_retries = 3; + conf->recalc_time = 0; + conf->retry_timeout = 20; + conf->add_target_file = NULL; + conf->print_initiator_ext = 0; + conf->rules_file = SRP_DEAMON_CONFIG_FILE; + conf->rules = NULL; + conf->tl_retry_count = 0; + + optind = 1; + while (1) { + int c; + + c = getopt_long(argc, argv, short_opts, long_opts, NULL); + if (c == -1) + break; + + switch (c) { + case 'd': + umad_dev = optarg; + break; + case 'i': + conf->dev_name = strdup(optarg); + if (!conf->dev_name) { + pr_err("Fail to alloc space for dev_name\n"); + return -ENOMEM; + } + break; + case 'p': + conf->port_num = atoi(optarg); + if (conf->port_num == 0) { + pr_err("Bad port number %s\n", optarg); + return -1; + } + break; + case 'j': { + char dev[32]; + int port_num; + + if (sscanf(optarg, "%31[^:]:%d", dev, &port_num) != 2) { + pr_err("Bad dev:port specification %s\n", + optarg); + return -1; + } + conf->dev_name = strdup(dev); + conf->port_num = port_num; + } + break; + case 'c': + ++conf->cmd; + break; + case 'o': + ++conf->once; + break; + case 'a': + ++conf->all; + break; + case 'e': + ++conf->execute; + break; + case 'v': + ++conf->verbose; + break; + case 'V': + ++conf->debug_verbose; + break; + case 'n': + ++conf->print_initiator_ext; + break; + case 't': + conf->timeout = atoi(optarg); + if (conf->timeout == 0) { + pr_err("Bad timeout - %s\n", optarg); + return -1; + } + break; + case 'r': + conf->mad_retries = atoi(optarg); + if (conf->mad_retries == 0) { + pr_err("Bad number of retries - %s\n", optarg); + return -1; + } + break; + case 'R': + conf->recalc_time = atoi(optarg); + if (conf->recalc_time == 0) { + pr_err("Bad Rescan time window - %s\n", optarg); + return -1; + } + break; + case 'T': + conf->retry_timeout = atoi(optarg); + if (conf->retry_timeout == 0 && strcmp(optarg, "0")) { + pr_err("Bad retry Timeout value- %s.\n", optarg); + return -1; + } + break; + case 'f': + conf->rules_file = optarg; + break; + case 'l': + conf->tl_retry_count = atoi(optarg); + if (conf->tl_retry_count < 2 || + conf->tl_retry_count > 7) { + pr_err("Bad tl_retry_count argument (%d), " + "must be 2 <= tl_retry_count <= 7\n", + conf->tl_retry_count); + return -1; + } + break; + case 'S': + break; + case 'h': + default: + usage(argv[0]); + return -1; + } + } + + initialize_sysfs(); + + if (conf->dev_name == NULL) { + ret = set_conf_dev_and_port(umad_dev, conf); + if (ret) { + pr_err("Failed to build config\n"); + return ret; + } + } + ret = asprintf(&conf->add_target_file, + "%s/class/infiniband_srp/srp-%s-%d/add_target", sysfs_path, + conf->dev_name, conf->port_num); + if (ret < 0) { + pr_err("error while allocating add_target\n"); + return ret; + } + + if (get_rules_file(conf)) + return -1; + + return 0; +} + +static void free_config(struct config_t *conf) +{ + free(conf->dev_name); + free(conf->add_target_file); + free(conf->rules); + free(conf); +} + +static void umad_resources_init(struct umad_resources *umad_res) +{ + umad_res->portid = -1; + 
umad_res->agent = -1;
+	umad_res->port_sysfs_path = NULL;
+}
+
+static void umad_resources_destroy(struct umad_resources *umad_res)
+{
+	if (umad_res->port_sysfs_path)
+		free(umad_res->port_sysfs_path);
+
+	if (umad_res->portid >= 0) {
+		if (umad_res->agent >= 0)
+			umad_unregister(umad_res->portid, umad_res->agent);
+		umad_close_port(umad_res->portid);
+	}
+
+	umad_done();
+}
+
+static int umad_resources_create(struct umad_resources *umad_res)
+{
+	int ret;
+
+	ret = asprintf(&umad_res->port_sysfs_path, "%s/class/infiniband/%s/ports/%d",
+		       sysfs_path, config->dev_name, config->port_num);
+
+	if (ret < 0) {
+		umad_res->port_sysfs_path = NULL;
+		return -ENOMEM;
+	}
+
+	umad_res->portid = umad_open_port(config->dev_name, config->port_num);
+	if (umad_res->portid < 0) {
+		pr_err("umad_open_port failed for device %s port %d\n",
+		       config->dev_name, config->port_num);
+		return -ENXIO;
+	}
+
+	umad_res->agent = umad_register(umad_res->portid, UMAD_CLASS_SUBN_ADM,
+					UMAD_SA_CLASS_VERSION,
+					UMAD_RMPP_VERSION, NULL);
+	if (umad_res->agent < 0) {
+		pr_err("umad_register failed\n");
+		return umad_res->agent;
+	}
+
+	return 0;
+}
+
+static void *run_thread_retry_to_connect(void *res_in)
+{
+	struct resources *res = (struct resources *)res_in;
+	struct target_details *target;
+	time_t sleep_time;
+
+	pthread_mutex_lock(&res->sync_res->retry_mutex);
+	while (!res->sync_res->stop_threads) {
+		if (retry_list_is_empty(res->sync_res))
+			pthread_cond_wait(&res->sync_res->retry_cond,
+					  &res->sync_res->retry_mutex);
+		while (!res->sync_res->stop_threads &&
+		       (target = pop_from_retry_list(res->sync_res)) != NULL) {
+			pthread_mutex_unlock(&res->sync_res->retry_mutex);
+			sleep_time = target->retry_time - time(NULL);
+
+			if (sleep_time > 0)
+				srp_sleep(sleep_time, 0);
+
+			add_non_exist_target(target);
+			free(target);
+			pthread_mutex_lock(&res->sync_res->retry_mutex);
+		}
+	}
+	/* empty retry_list */
+	while ((target = pop_from_retry_list(res->sync_res)))
+		free(target);
+	pthread_mutex_unlock(&res->sync_res->retry_mutex);
+
+	pr_debug("retry_to_connect thread ended\n");
+
+	pthread_exit(NULL);
+}
+
+static void free_res(struct resources *res)
+{
+	void *status;
+
+	if (!res)
+		return;
+
+	if (res->sync_res) {
+		pthread_mutex_lock(&res->sync_res->retry_mutex);
+		res->sync_res->stop_threads = 1;
+		pthread_cond_signal(&res->sync_res->retry_cond);
+		pthread_mutex_unlock(&res->sync_res->retry_mutex);
+	}
+
+	if (res->ud_res)
+		modify_qp_to_err(res->ud_res->qp);
+
+	if (res->reconnect_thread) {
+		pthread_kill(res->reconnect_thread, SIGINT);
+		pthread_join(res->reconnect_thread, &status);
+	}
+	if (res->async_ev_thread) {
+		pthread_kill(res->async_ev_thread, SIGINT);
+		pthread_join(res->async_ev_thread, &status);
+	}
+	if (res->trap_thread) {
+		pthread_kill(res->trap_thread, SIGINT);
+		pthread_join(res->trap_thread, &status);
+	}
+	if (res->sync_res)
+		sync_resources_cleanup(res->sync_res);
+	if (res->ud_res)
+		ud_resources_destroy(res->ud_res);
+	if (res->umad_res)
+		umad_resources_destroy(res->umad_res);
+	free(res);
+}
+
+static struct resources *alloc_res(void)
+{
+	struct all_resources {
+		struct resources	res;
+		struct ud_resources	ud_res;
+		struct umad_resources	umad_res;
+		struct sync_resources	sync_res;
+	};
+
+	struct all_resources *res;
+	int ret;
+
+	res = calloc(1, sizeof(*res));
+	if (!res)
+		goto err;
+
+	umad_resources_init(&res->umad_res);
+	ret = umad_resources_create(&res->umad_res);
+	if (ret)
+		goto err;
+	res->res.umad_res = &res->umad_res;
+
+	
ud_resources_init(&res->ud_res); + ret = ud_resources_create(&res->ud_res); + if (ret) + goto err; + res->res.ud_res = &res->ud_res; + res->umad_res.ib_ctx = res->ud_res.ib_ctx; + + ret = sync_resources_init(&res->sync_res); + if (ret) + goto err; + res->res.sync_res = &res->sync_res; + + if (!config->once) { + ret = pthread_create(&res->res.trap_thread, NULL, + run_thread_get_trap_notices, &res->res); + if (ret) + goto err; + + ret = pthread_create(&res->res.async_ev_thread, NULL, + run_thread_listen_to_events, &res->res); + if (ret) + goto err; + } + + if (config->retry_timeout && !config->once) { + ret = pthread_create(&res->res.reconnect_thread, NULL, + run_thread_retry_to_connect, &res->res); + if (ret) + goto err; + } + + return &res->res; +err: + if (res) + free_res(&res->res); + return NULL; +} + +/* *c = *a - *b. See also the BSD macro timersub(). */ +static void ts_sub(const struct timespec *a, const struct timespec *b, + struct timespec *res) +{ + res->tv_sec = a->tv_sec - b->tv_sec; + res->tv_nsec = a->tv_nsec - b->tv_nsec; + if (res->tv_nsec < 0) { + res->tv_sec--; + res->tv_nsec += 1000 * 1000 * 1000; + } +} + +static void cleanup_wakeup_fd(void) +{ + struct sigaction sa = {}; + + sigemptyset(&sa.sa_mask); + sa.sa_handler = SIG_DFL; + sigaction(SIGINT, &sa, NULL); + sigaction(SIGTERM, &sa, NULL); + sigaction(SRP_CATAS_ERR, &sa, NULL); + + close(wakeup_pipe[1]); + close(wakeup_pipe[0]); + wakeup_pipe[0] = -1; + wakeup_pipe[1] = -1; +} + +static int setup_wakeup_fd(void) +{ + struct sigaction sa = {}; + int ret; + + ret = pipe2(wakeup_pipe, O_NONBLOCK | O_CLOEXEC); + if (ret < 0) { + pr_err("could not create pipe\n"); + return -1; + } + + sigemptyset(&sa.sa_mask); + sa.sa_handler = signal_handler; + sigaction(SIGINT, &sa, NULL); + sigaction(SIGTERM, &sa, NULL); + sigaction(SRP_CATAS_ERR, &sa, NULL); + return 0; +} + +static int ibsrpdm(int argc, char *argv[]) +{ + char* umad_dev = NULL; + struct resources *res; + int ret; + + s_log_dest = log_to_stderr; + + config = calloc(1, sizeof(*config)); + config->num_of_oust = 10; + config->timeout = 5000; + config->mad_retries = 3; + config->all = 1; + config->once = 1; + + while (1) { + int c; + + c = getopt(argc, argv, "cd:h:v"); + if (c == -1) + break; + + switch (c) { + case 'c': + ++config->cmd; + break; + case 'd': + umad_dev = optarg; + break; + case 'v': + ++config->debug_verbose; + break; + case 'h': + default: + fprintf(stderr, + "Usage: %s [-vc] [-d <umad device>]\n", + argv[0]); + return 1; + } + } + + initialize_sysfs(); + + ret = set_conf_dev_and_port(umad_dev, config); + if (ret) { + pr_err("Failed to build config\n"); + goto out; + } + + ret = umad_init(); + if (ret != 0) + goto out; + + res = alloc_res(); + if (!res) { + ret = 1; + pr_err("Resource allocation failed\n"); + goto umad_done; + } + ret = recalc(res); + if (ret) + pr_err("Querying SRP targets failed\n"); + + free_res(res); +umad_done: + umad_done(); +out: + free_config(config); + + return ret; +} + +int main(int argc, char *argv[]) +{ + int ret; + struct resources *res; + uint16_t lid, sm_lid; + uint16_t pkey; + union umad_gid gid; + struct target_details *target; + int subscribed; + int lockfd = -1; + int received_signal = 0; + bool systemd; + +#ifndef __CHECKER__ + /* + * Hide these checks for sparse because these checks fail with + * older versions of sparse. 
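+	 * The wire-layout BUILD_ASSERTs placed after the #endif below are
+	 * still checked in every build, including sparse runs.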
+ */ + BUILD_ASSERT(sizeof(struct ib_path_rec) == 64); + BUILD_ASSERT(sizeof(struct ib_inform_info) == 36); + BUILD_ASSERT(sizeof(struct ib_mad_notice_attr) == 80); + BUILD_ASSERT(offsetof(struct ib_mad_notice_attr, generic.trap_num) == + 4); + BUILD_ASSERT(offsetof(struct ib_mad_notice_attr, vend.dev_id) == 4); + BUILD_ASSERT(offsetof(struct ib_mad_notice_attr, ntc_64_67.gid) == 16); + BUILD_ASSERT(offsetof(struct ib_mad_notice_attr, + ntc_144.new_cap_mask) == 16); +#endif + BUILD_ASSERT(sizeof(struct srp_sa_node_rec) == 108); + BUILD_ASSERT(sizeof(struct srp_sa_port_info_rec) == 58); + BUILD_ASSERT(sizeof(struct srp_dm_iou_info) == 132); + BUILD_ASSERT(sizeof(struct srp_dm_ioc_prof) == 128); + + if (strcmp(argv[0] + max_t(int, 0, strlen(argv[0]) - strlen("ibsrpdm")), + "ibsrpdm") == 0) { + ret = ibsrpdm(argc, argv); + goto out; + } + + systemd = is_systemd(argc, argv); + + if (systemd) + openlog(NULL, LOG_NDELAY | LOG_CONS | LOG_PID, LOG_DAEMON); + else + openlog("srp_daemon", LOG_PID, LOG_DAEMON); + + config = calloc(1, sizeof(*config)); + if (!config) { + pr_err("out of memory\n"); + ret = ENOMEM; + goto close_log; + } + + if (get_config(config, argc, argv)) { + ret = EINVAL; + goto free_config; + } + + if (config->verbose) + print_config(config); + + if (!config->once) { + lockfd = check_process_uniqueness(config); + if (lockfd < 0) { + ret = EPERM; + goto free_config; + } + } + + ret = setup_wakeup_fd(); + if (ret) + goto cleanup_wakeup; + +catas_start: + subscribed = 0; + + ret = umad_init(); + if (ret < 0) { + pr_err("umad_init failed\n"); + goto close_lockfd; + } + + res = alloc_res(); + if (!res && received_signal == SRP_CATAS_ERR) + pr_err("Device has not yet recovered from catas error\n"); + if (!res) + goto clean_umad; + + /* + * alloc_res() fails while the HCA is recovering from a catastrophic + * error. Clear 'received_signal' after alloc_res() has succeeded to + * finish the alloc_res() retry loop. + */ + if (received_signal == SRP_CATAS_ERR) { + pr_err("Device recovered from catastrophic error\n"); + received_signal = 0; + } + + if (config->once) { + ret = recalc(res); + goto free_res; + } + + while (received_signal == 0) { + pthread_mutex_lock(&res->sync_res->mutex); + if (__rescan_scheduled(res->sync_res)) { + uint16_t port_lid; + + pthread_mutex_unlock(&res->sync_res->mutex); + + pr_debug("Starting a recalculation\n"); + port_lid = get_port_lid(res->ud_res->ib_ctx, + config->port_num, &sm_lid); + if (port_lid > 0 && port_lid < 0xc000 && + (port_lid != res->ud_res->port_attr.lid || + sm_lid != res->ud_res->port_attr.sm_lid)) { + + if (res->ud_res->ah) { + ibv_destroy_ah(res->ud_res->ah); + res->ud_res->ah = NULL; + } + ret = create_ah(res->ud_res); + if (ret) { + received_signal = get_received_signal(10, 0); + goto kill_threads; + } + } + + if (res->ud_res->ah) { + if (register_to_traps(res, 1)) + pr_err("Fail to register to traps, maybe there " + "is no SM running on fabric or IB port is down\n"); + else + subscribed = 1; + } + + clear_traps_list(res->sync_res); + schedule_rescan(res->sync_res, config->recalc_time ? 
+ config->recalc_time : -1); + + /* empty retry_list */ + pthread_mutex_lock(&res->sync_res->retry_mutex); + while ((target = pop_from_retry_list(res->sync_res))) + free(target); + pthread_mutex_unlock(&res->sync_res->retry_mutex); + + recalc(res); + } else if (pop_from_list(res->sync_res, &lid, &gid, &pkey)) { + pthread_mutex_unlock(&res->sync_res->mutex); + if (lid) { + uint64_t guid; + ret = get_node(res->umad_res, lid, &guid); + if (ret) + /* unexpected error - do a full rescan */ + schedule_rescan(res->sync_res, 0); + else + handle_port(res, pkey, lid, guid); + } else { + ret = get_lid(res->umad_res, &gid, &lid); + if (ret < 0) + /* unexpected error - do a full rescan */ + schedule_rescan(res->sync_res, 0); + else { + pr_debug("lid is %#x\n", lid); + + srp_sleep(0, 100); + handle_port(res, pkey, lid, + be64toh(ib_gid_get_guid(&gid))); + } + } + } else { + static const struct timespec zero; + struct timespec now, delta; + struct timespec recalc = { + .tv_sec = config->recalc_time + }; + struct timeval timeout; + + clock_gettime(CLOCK_MONOTONIC, &now); + ts_sub(&res->sync_res->next_recalc_time, &now, &delta); + pthread_mutex_unlock(&res->sync_res->mutex); + + if (ts_cmp(&zero, &delta, <=) && + ts_cmp(&delta, &recalc, <)) + recalc = delta; + timeout.tv_sec = recalc.tv_sec; + timeout.tv_usec = recalc.tv_nsec / 1000 + 1; + + received_signal = get_received_signal(timeout.tv_sec, + timeout.tv_usec) ? : + received_signal; + } + } + + ret = 0; + +kill_threads: + switch (received_signal) { + case SIGINT: + pr_err("Got SIGINT\n"); + break; + case SIGTERM: + pr_err("Got SIGTERM\n"); + break; + case SRP_CATAS_ERR: + pr_err("Got SIG SRP_CATAS_ERR\n"); + break; + case 0: + break; + default: + pr_err("Got SIG???\n"); + break; + } + + if (subscribed && received_signal != SRP_CATAS_ERR) { + pr_err("Deregistering traps ...\n"); + register_to_traps(res, 0); + pr_err("Finished trap deregistration.\n"); + } +free_res: + free_res(res); + /* Discard the SIGINT triggered by the free_res() implementation. */ + get_received_signal(0, 0); +clean_umad: + umad_done(); + if (received_signal == SRP_CATAS_ERR) { + /* + * Device got a catastrophic error. Let's wait a grace + * period and try to probe the device by attempting to + * allocate IB resources. Once it recovers, we will + * start all over again. + */ + received_signal = get_received_signal(10, 0) ? : + received_signal; + if (received_signal == SRP_CATAS_ERR) + goto catas_start; + } +close_lockfd: + if (lockfd >= 0) + close(lockfd); +cleanup_wakeup: + cleanup_wakeup_fd(); +free_config: + free_config(config); +close_log: + closelog(); +out: + exit(ret ? 
1 : 0); +} + +static int recalc(struct resources *res) +{ + struct umad_resources *umad_res = res->umad_res; + int mask_match; + char val[7]; + int ret; + + ret = srpd_sys_read_string(umad_res->port_sysfs_path, "sm_lid", val, sizeof val); + if (ret < 0) { + pr_err("Couldn't read SM LID\n"); + return ret; + } + + umad_res->sm_lid = strtol(val, NULL, 0); + if (umad_res->sm_lid == 0) { + pr_err("SM LID is 0, maybe no SM is running\n"); + return -1; + } + + ret = check_sm_cap(umad_res, &mask_match); + if (ret < 0) + return ret; + + if (mask_match) { + pr_debug("Advanced SM, performing a capability query\n"); + ret = do_dm_port_list(res); + } else { + pr_debug("Old SM, performing a full node query\n"); + ret = do_full_port_list(res); + } + + return ret; +} + +static int get_lid(struct umad_resources *umad_res, union umad_gid *gid, + uint16_t *lid) +{ + struct srp_ib_user_mad out_mad, in_mad; + struct umad_sa_packet *in_sa_mad = get_data_ptr(in_mad); + struct umad_sa_packet *out_sa_mad = get_data_ptr(out_mad); + struct ib_path_rec *path_rec = (struct ib_path_rec *) out_sa_mad->data; + + memset(&in_mad, 0, sizeof(in_mad)); + init_srp_sa_mad(&out_mad, umad_res->agent, umad_res->sm_lid, + UMAD_SA_ATTR_PATH_REC, 0); + + out_sa_mad->comp_mask = htobe64( 4 | 8 | 64 | 512 | 4096 ); + + path_rec->sgid = *gid; + path_rec->dgid = *gid; + path_rec->reversible_numpath = 1; + path_rec->hop_flow_raw = htobe32(1 << 31); /* rawtraffic=1 hoplimit = 0 */ + + if (send_and_get(umad_res->portid, umad_res->agent, &out_mad, &in_mad, 0) < 0) + return -1; + + path_rec = (struct ib_path_rec *) in_sa_mad->data; + + *lid = be16toh(path_rec->dlid); + + return 0; +} diff --git a/srp_daemon/srp_daemon.conf b/srp_daemon/srp_daemon.conf new file mode 100644 index 0000000..8a3abe5 --- /dev/null +++ b/srp_daemon/srp_daemon.conf @@ -0,0 +1,19 @@ +## This is an example rules configuration file for srp_daemon. +## +#This is a comment +## disallow the following dgid +#d dgid=fe800000000000000002c90200402bd5 +## allow target with the following ioc_guid +#a ioc_guid=00a0b80200402bd7 +## allow target with the following pkey +#a pkey=ffff +## allow target with the following id_ext and ioc_guid +#a id_ext=200500A0B81146A1,ioc_guid=00a0b80200402bef +## disallow all the rest +#d +## +## Here is another example: +## +## Allow all targets and set queue size to 128. +# a queue_size=128,max_cmd_per_lun=128 + diff --git a/srp_daemon/srp_daemon.h b/srp_daemon/srp_daemon.h new file mode 100644 index 0000000..b753cec --- /dev/null +++ b/srp_daemon/srp_daemon.h @@ -0,0 +1,326 @@ +/* + * srp_daemon - discover SRP targets over IB + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2006 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef SRP_DM_H +#define SRP_DM_H + +#include <stdint.h> +#include <signal.h> +#include <endian.h> +#include <util/util.h> +#include <infiniband/verbs.h> +#include <infiniband/umad.h> +#include <linux/types.h> /* __be16, __be32 and __be64 */ +#include <ccan/build_assert.h> + +#include "config.h" +#include "srp_ib_types.h" + +#define SRP_CATAS_ERR SIGUSR1 + +enum { + SRP_DM_ATTR_IO_UNIT_INFO = 0x0010, + SRP_DM_ATTR_IO_CONTROLLER_PROFILE = 0x0011, + SRP_DM_ATTR_SERVICE_ENTRIES = 0x0012 +}; + +enum { + SRP_DM_NO_IOC = 0x0, + SRP_DM_IOC_PRESENT = 0x1, + SRP_DM_NO_SLOT = 0xf +}; + +enum { + SRP_SM_SUPPORTS_MASK_MATCH = 1 << 13, + SRP_IS_DM = 1 << 19, + SRP_SM_CAP_MASK_MATCH_ATTR_MOD = 1 << 31, +}; + +enum { + SRP_REV10_IB_IO_CLASS = 0xff00, + SRP_REV16A_IB_IO_CLASS = 0x0100 +}; + +struct srp_sa_node_rec { + __be16 lid; + __be16 reserved; + uint8_t base_version; + uint8_t class_version; + uint8_t type; + uint8_t num_ports; + __be64 sys_guid __attribute__((packed)); + __be64 node_guid __attribute__((packed)); + __be64 port_guid __attribute__((packed)); + __be16 partition_cap; + __be16 device_id; + __be32 revision; + __be32 port_num_vendor_id; + uint8_t desc[64]; +}; + +struct srp_sa_port_info_rec { + __be16 endport_lid; + uint8_t port_num; + uint8_t reserved; + __be64 m_key __attribute__((packed)); + __be64 subnet_prefix __attribute__((packed)); + __be16 base_lid; + __be16 master_sm_base_lid; + __be32 capability_mask __attribute__((packed)); + __be16 diag_code; + __be16 m_key_lease_period; + uint8_t local_port_num; + uint8_t link_width_enabled; + uint8_t link_width_supported; + uint8_t link_width_active; + uint8_t state_info1; + uint8_t state_info2; + uint8_t mkey_lmc; + uint8_t link_speed; + uint8_t mtu_smsl; + uint8_t vl_cap; + uint8_t vl_high_limit; + uint8_t vl_arb_high_cap; + uint8_t vl_arb_low_cap; + uint8_t mtu_cap; + uint8_t vl_stall_life; + uint8_t vl_enforce; + __be16 m_key_violations; + __be16 p_key_violations; + __be16 q_key_violations; + uint8_t guid_cap; + uint8_t subnet_timeout; + uint8_t resp_time_value; + uint8_t error_threshold; +}; + +struct srp_dm_iou_info { + __be16 change_id; + uint8_t max_controllers; + uint8_t diagid_optionrom; + uint8_t controller_list[128]; +}; + +struct srp_dm_ioc_prof { + __be64 guid; + __be32 vendor_id; + __be32 device_id; + __be16 device_version; + __be16 reserved1; + __be32 subsys_vendor_id; + __be32 subsys_device_id; + __be16 io_class; + __be16 io_subclass; + __be16 protocol; + __be16 protocol_version; + __be32 reserved2; + __be16 send_queue_depth; + uint8_t reserved3; + uint8_t rdma_read_depth; + __be32 send_size; + __be32 rdma_size; + uint8_t cap_mask; + uint8_t reserved4; + uint8_t service_entries; + uint8_t reserved5[9]; + char id[64]; +}; + +struct srp_dm_svc_entries { + struct { + char name[40]; + __be64 id; + } service[4]; 
+}; + +enum { + SEND_SIZE = 256, + GRH_SIZE = 40, + RECV_BUF_SIZE = SEND_SIZE + GRH_SIZE, +}; + +struct rule { + int allow; + char id_ext[17], ioc_guid[17], dgid[33], service_id[17], pkey[10], options[128]; +}; + +#define SRP_MAX_SHARED_PKEYS 127 +#define MAX_ID_EXT_STRING_LENGTH 17 + +struct target_details { + uint16_t pkey; + char id_ext[MAX_ID_EXT_STRING_LENGTH]; + struct srp_dm_ioc_prof ioc_prof; + uint64_t subnet_prefix; + uint64_t h_guid; + uint64_t h_service_id; + time_t retry_time; + char *options; + struct target_details *next; +}; + +struct config_t { + char *dev_name; + int port_num; + char *add_target_file; + int mad_retries; + int num_of_oust; + int cmd; + int once; + int execute; + int all; + int verbose; + int debug_verbose; + int timeout; + int recalc_time; + int print_initiator_ext; + const char *rules_file; + struct rule *rules; + int retry_timeout; + int tl_retry_count; +}; + +extern struct config_t *config; + +struct ud_resources { + struct ibv_device **dev_list; + struct ibv_context *ib_ctx; + struct ibv_pd *pd; + struct ibv_cq *send_cq; + struct ibv_cq *recv_cq; + struct ibv_qp *qp; + struct ibv_mr *mr; + struct ibv_ah *ah; + char *recv_buf; + char *send_buf; + struct ibv_device_attr device_attr; + struct ibv_port_attr port_attr; + int cq_size; + struct ibv_comp_channel *channel; + pthread_mutex_t *mad_buffer_mutex; + struct umad_sa_packet *mad_buffer; +}; + +struct umad_resources { + struct ibv_context *ib_ctx; + int portid; + int agent; + char *port_sysfs_path; + uint16_t sm_lid; +}; + +enum { + SIZE_OF_TASKS_LIST = 5, +}; + +struct sync_resources { + int stop_threads; + int next_task; + struct timespec next_recalc_time; + struct { + uint16_t lid; + uint16_t pkey; + union umad_gid gid; + } tasks[SIZE_OF_TASKS_LIST]; + pthread_mutex_t mutex; + struct target_details *retry_tasks_head; + struct target_details *retry_tasks_tail; + pthread_mutex_t retry_mutex; + pthread_cond_t retry_cond; +}; + +struct resources { + struct ud_resources *ud_res; + struct umad_resources *umad_res; + struct sync_resources *sync_res; + pthread_t trap_thread; + pthread_t async_ev_thread; + pthread_t reconnect_thread; + pthread_t timer_thread; +}; + +struct srp_ib_user_mad { + struct ib_user_mad hdr; + char filler[MAD_BLOCK_SIZE]; +}; + +#include <valgrind/drd.h> + +#define pr_human(arg...) \ + do { \ + if (!config->cmd && !config->execute) \ + printf(arg); \ + } while (0) + +void pr_debug(const char *fmt, ...) __attribute__((format(printf, 1, 2))); +void pr_err(const char *fmt, ...) 
__attribute__((format(printf, 1, 2))); + +int pkey_index_to_pkey(struct umad_resources *umad_res, int pkey_index, + __be16 *pkey); +void handle_port(struct resources *res, uint16_t pkey, uint16_t lid, uint64_t h_guid); +void ud_resources_init(struct ud_resources *res); +int ud_resources_create(struct ud_resources *res); +int ud_resources_destroy(struct ud_resources *res); +int wait_for_recalc(struct resources *res_in); +int trap_main(struct resources *res); +void *run_thread_get_trap_notices(void *res_in); +void *run_thread_listen_to_events(void *res_in); +int get_node(struct umad_resources *umad_res, uint16_t dlid, uint64_t *guid); +int create_trap_resources(struct ud_resources *ud_res); +int register_to_traps(struct resources *res, int subscribe); +uint16_t get_port_lid(struct ibv_context *ib_ctx, int port_num, + uint16_t *sm_lid); +int create_ah(struct ud_resources *ud_res); +void push_gid_to_list(struct sync_resources *res, union umad_gid *gid, + uint16_t pkey); +void push_lid_to_list(struct sync_resources *res, uint16_t lid, uint16_t pkey); +struct target_details *pop_from_retry_list(struct sync_resources *res); +void push_to_retry_list(struct sync_resources *res, + struct target_details *target); +int retry_list_is_empty(struct sync_resources *res); +void clear_traps_list(struct sync_resources *res); +int pop_from_list(struct sync_resources *res, uint16_t *lid, + union umad_gid *gid, uint16_t *pkey); +int sync_resources_init(struct sync_resources *res); +void sync_resources_cleanup(struct sync_resources *res); +int modify_qp_to_err(struct ibv_qp *qp); +void srp_sleep(time_t sec, time_t usec); +void wake_up_main_loop(char ch); +void __schedule_rescan(struct sync_resources *res, int when); +void schedule_rescan(struct sync_resources *res, int when); +int __rescan_scheduled(struct sync_resources *res); +int rescan_scheduled(struct sync_resources *res); + +#endif /* SRP_DM_H */ diff --git a/srp_daemon/srp_daemon.rules b/srp_daemon/srp_daemon.rules new file mode 100644 index 0000000..b6411dc --- /dev/null +++ b/srp_daemon/srp_daemon.rules @@ -0,0 +1 @@ +SUBSYSTEM=="infiniband_mad", KERNEL=="*umad*", PROGRAM=="/bin/systemctl show srp_daemon -p ActiveState", RESULT=="ActiveState=active", ENV{SYSTEMD_WANTS}+="srp_daemon_port@$attr{ibdev}:$attr{port}.service" diff --git a/srp_daemon/srp_daemon.service.5 b/srp_daemon/srp_daemon.service.5 new file mode 100644 index 0000000..a6b25d6 --- /dev/null +++ b/srp_daemon/srp_daemon.service.5 @@ -0,0 +1,30 @@ +'\" t +.TH "SRP_DAEMON\&.SERVICE" "5" "" "srp_daemon" "srp_daemon.service" +.\" ----------------------------------------------------------------- +.\" * set default formatting +.\" ----------------------------------------------------------------- +.\" disable hyphenation +.nh +.\" disable justification (adjust text to left margin only) +.ad l +.\" ----------------------------------------------------------------- +.\" * MAIN CONTENT STARTS HERE * +.\" ----------------------------------------------------------------- +.SH "NAME" +srp_daemon.service \- srp_daemon systemd service that controls all ports +.SH "SYNOPSIS" +.PP +srp_daemon\&.service +.SH "DESCRIPTION" +.PP +The srp_daemon\&.service controls whether or not any srp_daemon processes are +running. Although no srp_daemon processes are controlled directly by the +srp_daemon\&.service, this service controls whether or not any +srp_daemon_port@\&.service are allowed to be active. Each +srp_daemon_port@\&.service controls one srp_daemon process. 
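+.PP
+For example, because each srp_daemon_port@\&.service declares
+\fBBindsTo=srp_daemon\&.service\fR, stopping this service also stops every
+running port instance:
+.PP
+.nf
+.RS
+# systemctl stop srp_daemon\&.service
+.RE
+.fi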
+ +.SH "SEE ALSO" +.PP +\fBsrp_daemon\fR(1), +\fBsrp_daemon_port@.service\fR(5), +\fBsystemctl\fR(1) diff --git a/srp_daemon/srp_daemon.service.in b/srp_daemon/srp_daemon.service.in new file mode 100644 index 0000000..188b7e1 --- /dev/null +++ b/srp_daemon/srp_daemon.service.in @@ -0,0 +1,19 @@ +[Unit] +Description=Daemon that discovers and logs in to SRP target systems +Documentation=man:srp_daemon file:/etc/srp_daemon.conf +DefaultDependencies=false +Conflicts=emergency.target emergency.service +Before=remote-fs-pre.target + +[Service] +Type=oneshot +RemainAfterExit=yes +ExecStart=@CMAKE_INSTALL_FULL_LIBEXECDIR@/srp_daemon/start_on_all_ports +MemoryDenyWriteExecute=yes +PrivateTmp=yes +ProtectHome=yes +ProtectKernelModules=yes +RestrictRealtime=yes + +[Install] +WantedBy=remote-fs-pre.target diff --git a/srp_daemon/srp_daemon.sh.in b/srp_daemon/srp_daemon.sh.in new file mode 100755 index 0000000..75e8a31 --- /dev/null +++ b/srp_daemon/srp_daemon.sh.in @@ -0,0 +1,84 @@ +#!/bin/bash +# +# Copyright (c) 2006 Mellanox Technologies. All rights reserved. +# +# This Software is licensed under one of the following licenses: +# +# 1) under the terms of the "Common Public License 1.0" a copy of which is +# available from the Open Source Initiative, see +# http://www.opensource.org/licenses/cpl.php. +# +# 2) under the terms of the "The BSD License" a copy of which is +# available from the Open Source Initiative, see +# http://www.opensource.org/licenses/bsd-license.php. +# +# 3) under the terms of the "GNU General Public License (GPL) Version 2" a +# copy of which is available from the Open Source Initiative, see +# http://www.opensource.org/licenses/gpl-license.php. +# +# Licensee has the right to choose one of the above licenses. +# +# Redistributions of source code must retain the above copyright +# notice and one of the license notices. +# +# Redistributions in binary form must reproduce both the above copyright +# notice, one of the license notices in the documentation +# and/or other materials provided with the distribution. +# +# $Id$ +# + +shopt -s nullglob + +prog=@CMAKE_INSTALL_FULL_SBINDIR@/srp_daemon +params=("$@") +ibdir="/sys/class/infiniband" +rescan_interval=60 +pids=() +pidfile="@CMAKE_INSTALL_FULL_RUNDIR@/srp_daemon.sh.pid" +mypid=$$ + +trap_handler() +{ + if [ "${#pids[@]}" ]; then + kill -15 "${pids[@]}" > /dev/null 2>&1 + wait "${pids[@]}" + fi + logger -i -t "$(basename "$0")" "killing $prog." + /bin/rm -f "$pidfile" + exit 0 +} + +# Check if there is another copy running of srp_daemon.sh +if [ -f "$pidfile" ]; then + if [ -e "/proc/$(cat "$pidfile" 2>/dev/null)/status" ]; then + echo "$(basename "$0") is already running. Exiting." + exit 1 + else + /bin/rm -f "$pidfile" + fi +fi + +if ! echo $mypid > "$pidfile"; then + echo "Creating $pidfile for pid $mypid failed" + exit 1 +fi + +trap 'trap_handler' 2 15 + +while [ ! -d ${ibdir} ] +do + sleep 30 +done + +for d in ${ibdir}_mad/umad*; do + hca_id="$(<"$d/ibdev")" + port="$(<"$d/port")" + add_target="${ibdir}_srp/srp-${hca_id}-${port}/add_target" + if [ -e "${add_target}" ]; then + ${prog} -e -c -n -i "${hca_id}" -p "${port}" -R "${rescan_interval}" "${params[@]}" >/dev/null 2>&1 & + pids+=($!) 
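+		# collected so that trap_handler can signal and reap the children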
+	fi
+done
+
+wait
diff --git a/srp_daemon/srp_daemon_port@.service.5 b/srp_daemon/srp_daemon_port@.service.5
new file mode 100644
index 0000000..9136f1a
--- /dev/null
+++ b/srp_daemon/srp_daemon_port@.service.5
@@ -0,0 +1,49 @@
+'\" t
+.TH "SRP_DAEMON_PORT@\&.SERVICE" "5" "" "srp_daemon" "srp_daemon_port@.service"
+.\" -----------------------------------------------------------------
+.\" * set default formatting
+.\" -----------------------------------------------------------------
+.\" disable hyphenation
+.nh
+.\" disable justification (adjust text to left margin only)
+.ad l
+.\" -----------------------------------------------------------------
+.\" * MAIN CONTENT STARTS HERE *
+.\" -----------------------------------------------------------------
+.SH "NAME"
+srp_daemon_port@.service \- srp_daemon_port@ systemd service that controls a
+single port
+.SH "SYNOPSIS"
+.PP
+srp_daemon_port@\&.service
+.SH "DESCRIPTION"
+.PP
+The srp_daemon_port@\&.service controls whether or not an srp_daemon process
+is monitoring the RDMA port specified as the template argument. The format for
+the RDMA port name is \fIdev:port\fR where \fIdev\fR is the name of an RDMA
+device and \fIport\fR is a port number starting from one. Starting an instance
+of this template will start an srp_daemon process. Stopping an instance of
+this template will stop the srp_daemon process for the specified port.
+srp_daemon can be prevented from starting on a certain port by masking the
+corresponding systemd service, e.g. \fBsystemctl mask
+srp_daemon_port@mlx4_0:1\fR.
+
+A list of all RDMA device and port number pairs can be obtained e.g. as follows:
+.PP
+.nf
+.RS
+$ (cd /sys/class/infiniband >&/dev/null && for p in */ports/*; do
+    [ -e "$p" ] && echo "${p/\\/ports\\//:}"; done)
+mlx4_0:1
+mlx4_0:2
+mlx4_1:1
+mlx4_1:2
+.RE
+.fi
+.PP
+
+.SH "SEE ALSO"
+.PP
+\fBsrp_daemon\fR(1),
+\fBsrp_daemon.service\fR(5),
+\fBsystemctl\fR(1)
diff --git a/srp_daemon/srp_daemon_port@.service.in b/srp_daemon/srp_daemon_port@.service.in
new file mode 100644
index 0000000..3d5a11e
--- /dev/null
+++ b/srp_daemon/srp_daemon_port@.service.in
@@ -0,0 +1,42 @@
+[Unit]
+Description=SRP daemon that monitors port %i
+Documentation=man:srp_daemon file:/etc/rdma/rdma.conf file:/etc/srp_daemon.conf
+# srp_daemon is required to mount filesystems, and could run before sysinit.target
+DefaultDependencies=false
+Before=remote-fs-pre.target
+# Do not execute concurrently with an ongoing shutdown (required for DefaultDependencies=no)
+Conflicts=shutdown.target
+Before=shutdown.target
+# Ensure required kernel modules are loaded before starting
+Requires=rdma-load-modules@srp_daemon.service
+After=rdma-load-modules@srp_daemon.service
+# Complete setting up low level RDMA hardware
+After=rdma-hw.target
+# Only run while the RDMA udev device is in an active state, and shut down if
+# it becomes unplugged.
+After=sys-subsystem-rdma-devices-%i-umad.device
+BindsTo=sys-subsystem-rdma-devices-%i-umad.device
+# Allow srp_daemon to act as a leader for all of the port services for
+# stop/start/reset
+After=srp_daemon.service
+BindsTo=srp_daemon.service
+
+[Service]
+Type=simple
+ExecStart=@CMAKE_INSTALL_FULL_SBINDIR@/srp_daemon --systemd -e -c -n -j %I -R 60
+MemoryDenyWriteExecute=yes
+PrivateNetwork=yes
+PrivateTmp=yes
+ProtectControlGroups=yes
+ProtectHome=yes
+ProtectKernelModules=yes
+ProtectSystem=full
+RestrictRealtime=yes
+SystemCallFilter=~@clock @cpu-emulation @debug @keyring @module @mount @obsolete @raw-io
+
+[Install]
+# Instances of this template unit file are started automatically by udev or by
+# srp_daemon.service as devices are discovered. However, if the user manually
+# enables a template unit then it will be installed with remote-fs-pre. Note
+# that systemd will defer starting the unit until the rdma .device appears.
+WantedBy=remote-fs-pre.target
diff --git a/srp_daemon/srp_handle_traps.c b/srp_daemon/srp_handle_traps.c
new file mode 100644
index 0000000..2279b2c
--- /dev/null
+++ b/srp_daemon/srp_handle_traps.c
@@ -0,0 +1,894 @@
+/*
+ * Copyright (c) 2006 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * + * $Author: ishai Rabinovitz [ishai@mellanox.co.il]$ + * + */ +#include <assert.h> +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/time.h> +#include <endian.h> +#include <time.h> +#include <errno.h> +#include <string.h> +#include <infiniband/verbs.h> +#include <infiniband/umad_sa.h> +#include <infiniband/umad_sm.h> + +#include "srp_ib_types.h" + +#include "srp_daemon.h" + +void srp_sleep(time_t sec, time_t usec) +{ + struct timespec req, rem; + + if (usec > 1000) { + sec += usec / 1000; + usec = usec % 1000; + } + req.tv_sec = sec; + req.tv_nsec = usec * 1000000; + + nanosleep(&req, &rem); +} + +/***************************************************************************** +* Function: ud_resources_init +*****************************************************************************/ +void +ud_resources_init(struct ud_resources *res) +{ + res->dev_list = NULL; + res->ib_ctx = NULL; + res->send_cq = NULL; + res->recv_cq = NULL; + res->channel = NULL; + res->qp = NULL; + res->pd = NULL; + res->mr = NULL; + res->ah = NULL; + res->send_buf = NULL; + res->recv_buf = NULL; +} + + +/***************************************************************************** +* Function: modify_qp_to_rts +*****************************************************************************/ +static int modify_qp_to_rts(struct ibv_qp *qp) +{ + struct ibv_qp_attr attr; + int flags; + int rc; + + /* RESET -> INIT */ + memset(&attr, 0, sizeof(struct ibv_qp_attr)); + + attr.qp_state = IBV_QPS_INIT; + attr.port_num = config->port_num; + attr.pkey_index = 0; + attr.qkey = UMAD_QKEY; + + flags = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_QKEY; + + rc = ibv_modify_qp(qp, &attr, flags); + if (rc) { + pr_err("failed to modify QP state to INIT\n"); + return rc; + } + + /* INIT -> RTR */ + memset(&attr, 0, sizeof(attr)); + + attr.qp_state = IBV_QPS_RTR; + + flags = IBV_QP_STATE; + + rc = ibv_modify_qp(qp, &attr, flags); + if (rc) { + pr_err("failed to modify QP state to RTR\n"); + return rc; + } + + /* RTR -> RTS */ + /* memset(&attr, 0, sizeof(attr)); */ + + attr.qp_state = IBV_QPS_RTS; + attr.sq_psn = 0; + + flags = IBV_QP_STATE | IBV_QP_SQ_PSN; + + rc = ibv_modify_qp(qp, &attr, flags); + if (rc) { + pr_err("failed to modify QP state to RTS\n"); + return rc; + } + + return 0; +} + +int modify_qp_to_err(struct ibv_qp *qp) +{ + static struct ibv_qp_attr attr = { + .qp_state = IBV_QPS_ERR, + }; + + return ibv_modify_qp(qp, &attr, IBV_QP_STATE); +} + +/***************************************************************************** +* Function: fill_rq_entry +*****************************************************************************/ +static int fill_rq_entry(struct ud_resources *res, int cur_receive) +{ + struct ibv_recv_wr rr; + struct ibv_sge sg; + struct ibv_recv_wr *_bad_wr = NULL; + struct ibv_recv_wr **bad_wr = &_bad_wr; + int ret; + + memset(&rr, 0, sizeof(rr)); + + sg.length = RECV_BUF_SIZE; + sg.lkey = res->mr->lkey; + + rr.next = NULL; + rr.sg_list = &sg; + rr.num_sge = 1; + + sg.addr = (((unsigned long)res->recv_buf) + RECV_BUF_SIZE * cur_receive); + rr.wr_id = cur_receive; + + ret = ibv_post_recv(res->qp, &rr, bad_wr); + if (ret < 0) { + pr_err("failed to post RR\n"); + return ret; + } + return 0; +} + +/***************************************************************************** +* Function: fill_rq +*****************************************************************************/ +static int fill_rq(struct ud_resources *res) +{ + int cur_receive; + int ret; + + for 
(cur_receive=0; cur_receive<config->num_of_oust; ++cur_receive) { + ret = fill_rq_entry(res, cur_receive); + if (ret < 0) { + pr_err("failed to fill_rq_entry\n"); + return ret; + } + } + + return 0; +} + +/***************************************************************************** +* Function: ud_resources_create +*****************************************************************************/ +int ud_resources_create(struct ud_resources *res) +{ + struct ibv_device *ib_dev = NULL; + size_t size; + int i; + int cq_size; + int num_devices; + + /* get device names in the system */ + res->dev_list = ibv_get_device_list(&num_devices); + if (!res->dev_list) { + pr_err("failed to get IB devices list\n"); + return -1; + } + + for (i = 0; i < num_devices; i ++) { + if (!strcmp(ibv_get_device_name(res->dev_list[i]), config->dev_name)) { + ib_dev = res->dev_list[i]; + break; + } + } + + if (!ib_dev) { + pr_err("IB device %s wasn't found\n", config->dev_name); + return -ENXIO; + } + + pr_debug("Device %s was found\n", config->dev_name); + + /* get device handle */ + res->ib_ctx = ibv_open_device(ib_dev); + if (!res->ib_ctx) { + pr_err("failed to open device %s\n", config->dev_name); + return -ENXIO; + } + + res->channel = ibv_create_comp_channel(res->ib_ctx); + if (!res->channel) { + pr_err("failed to create completion channel \n"); + return -ENXIO; + } + + res->pd = ibv_alloc_pd(res->ib_ctx); + if (!res->pd) { + pr_err("ibv_alloc_pd failed\n"); + return -1; + } + + cq_size = config->num_of_oust; + res->recv_cq = ibv_create_cq(res->ib_ctx, cq_size, NULL, res->channel, 0); + if (!res->recv_cq) { + pr_err("failed to create CQ with %u entries\n", cq_size); + return -1; + } + pr_debug("CQ was created with %u CQEs\n", cq_size); + + if (ibv_req_notify_cq(res->recv_cq, 0)) { + pr_err("Couldn't request CQ notification\n"); + return -1; + } + + + res->send_cq = ibv_create_cq(res->ib_ctx, 1, NULL, NULL, 0); + if (!res->send_cq) { + pr_err("failed to create CQ with %u entries\n", 1); + return -1; + } + pr_debug("CQ was created with %u CQEs\n", 1); + + size = cq_size * RECV_BUF_SIZE + SEND_SIZE; + res->recv_buf = malloc(size); + if (!res->recv_buf) { + pr_err("failed to malloc %zu bytes to memory buffer\n", size); + return -ENOMEM; + } + + memset(res->recv_buf, 0, size); + + res->send_buf = res->recv_buf + cq_size * RECV_BUF_SIZE; + + res->mr = ibv_reg_mr(res->pd, res->recv_buf, size, IBV_ACCESS_LOCAL_WRITE); + if (!res->mr) { + pr_err("ibv_reg_mr failed\n"); + return -1; + } + pr_debug("MR was created with addr=%p, lkey=0x%x,\n", res->recv_buf, res->mr->lkey); + + { + struct ibv_qp_init_attr attr = { + .send_cq = res->send_cq, + .recv_cq = res->recv_cq, + .cap = { + .max_send_wr = 1, + .max_recv_wr = config->num_of_oust, + .max_send_sge = 1, + .max_recv_sge = 1 + }, + .qp_type = IBV_QPT_UD, + .sq_sig_all = 1, + }; + + res->qp = ibv_create_qp(res->pd, &attr); + if (!res->qp) { + pr_err("failed to create QP\n"); + return -1; + } + pr_debug("QP was created, QP number=0x%x\n", res->qp->qp_num); + } + + /* modify the QP to RTS (connect the QPs) */ + if (modify_qp_to_rts(res->qp)) { + pr_err("failed to modify QP state from RESET to RTS\n"); + return -1; + } + + pr_debug("QPs were modified to RTS\n"); + + if (fill_rq(res)) + return -1; + + res->mad_buffer = malloc(sizeof(struct umad_sa_packet)); + if (!res->mad_buffer) { + pr_err("Could not alloc mad_buffer, abort\n"); + return -1; + } + + res->mad_buffer_mutex = malloc(sizeof(pthread_mutex_t)); + if (!res->mad_buffer_mutex) { + pr_err("Could not alloc 
mad_buffer_mutex, abort\n"); + return -1; + } + + if (pthread_mutex_init(res->mad_buffer_mutex, NULL)) { + pr_err("Could not init mad_buffer_mutex, abort\n"); + return -1; + } + + return 0; +} + +uint16_t get_port_lid(struct ibv_context *ib_ctx, int port_num, + uint16_t *sm_lid) +{ + struct ibv_port_attr port_attr; + int ret; + + ret = ibv_query_port(ib_ctx, port_num, &port_attr); + + if (!ret) { + if (sm_lid) + *sm_lid = port_attr.sm_lid; + return port_attr.lid; + } + + return 0; +} + +int create_ah(struct ud_resources *ud_res) +{ + struct ibv_ah_attr ah_attr; + + assert(!ud_res->ah); + + /* create the UD AV */ + memset(&ah_attr, 0, sizeof(ah_attr)); + + if (ibv_query_port(ud_res->ib_ctx, config->port_num, &ud_res->port_attr)) { + pr_err("ibv_query_port on port %u failed\n", config->port_num); + return -1; + } + + ah_attr.dlid = ud_res->port_attr.sm_lid; + ah_attr.port_num = config->port_num; + + ud_res->ah = ibv_create_ah(ud_res->pd, &ah_attr); + if (!ud_res->ah) { + pr_err("failed to create UD AV\n"); + return -1; + } + + return 0; +} + +/***************************************************************************** +* Function: ud_resources_destroy +*****************************************************************************/ +int ud_resources_destroy(struct ud_resources *res) +{ + int test_result = 0; + + if (res->qp) { + if (ibv_destroy_qp(res->qp)) { + pr_err("failed to destroy QP\n"); + test_result = 1; + } + } + + if (res->mr) { + if (ibv_dereg_mr(res->mr)) { + pr_err("ibv_dereg_mr failed\n"); + test_result = 1; + } + } + + if (res->send_cq) { + if (ibv_destroy_cq(res->send_cq)) { + pr_err("ibv_destroy_cq of CQ failed\n"); + test_result = 1; + } + } + + if (res->recv_cq) { + if (ibv_destroy_cq(res->recv_cq)) { + pr_err("ibv_destroy_cq of CQ failed\n"); + test_result = 1; + } + } + + if (res->channel) { + if (ibv_destroy_comp_channel(res->channel)) { + pr_err("ibv_destroy_comp_channel failed\n"); + test_result = 1; + } + } + + if (res->ah) { + if (ibv_destroy_ah(res->ah)) { + pr_err("ibv_destroy_ah failed\n"); + test_result = 1; + } + } + + if (res->pd) { + if (ibv_dealloc_pd(res->pd)) { + pr_err("ibv_dealloc_pd failed\n"); + test_result = 1; + } + } + + if (res->ib_ctx) { + if (ibv_close_device(res->ib_ctx)) { + pr_err("ibv_close_device failed\n"); + test_result = 1; + } + } + + if (res->dev_list) + ibv_free_device_list(res->dev_list); + + if (res->recv_buf) + free(res->recv_buf); + + if (res->mad_buffer) + free(res->mad_buffer); + + if (res->mad_buffer_mutex) + free(res->mad_buffer_mutex); + + return test_result; +} + +static void fill_send_request(struct ud_resources *res, struct ibv_send_wr *psr, + struct ibv_sge *psg, struct umad_hdr *mad_hdr) +{ + static int wr_id=0; + + assert(res->ah); + + memset(psr, 0, sizeof(*psr)); + + psr->next = NULL; + psr->wr_id = wr_id++; + psr->sg_list = psg; + psr->num_sge = 1; + psr->opcode = IBV_WR_SEND; +// psr->send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE; + psr->send_flags = IBV_SEND_SIGNALED; + psr->wr.ud.ah = res->ah; + psr->wr.ud.remote_qpn = 1; + psr->wr.ud.remote_qkey = UMAD_QKEY; + + psg->addr = (uintptr_t) mad_hdr; + psg->length = SEND_SIZE; + psg->lkey = res->mr->lkey; +} + +static int stop_threads(struct sync_resources *sync_res) +{ + int result; + + pthread_mutex_lock(&sync_res->retry_mutex); + result = sync_res->stop_threads; + pthread_mutex_unlock(&sync_res->retry_mutex); + + return result; +} + +/***************************************************************************** + * Function: poll_cq_once + * Poll a CQ once. 
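+ * Used for both the send CQ (polled synchronously with a NULL channel)
+ * and the receive CQ (armed through its completion channel by poll_cq
+ * below). Completion errors are only logged while the daemon threads
+ * are not shutting down.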
+ * Returns the number of completion polled (0 or 1). + * Returns a negative value on error. + *****************************************************************************/ +static int poll_cq_once(struct sync_resources *sync_res, struct ibv_cq *cq, + struct ibv_wc *wc) +{ + int ret; + + ret = ibv_poll_cq(cq, 1, wc); + if (ret < 0) { + pr_err("poll CQ failed\n"); + return ret; + } + + if (ret > 0 && wc->status != IBV_WC_SUCCESS) { + if (!stop_threads(sync_res)) + pr_err("got bad completion with status: 0x%x\n", + wc->status); + return -ret; + } + + return ret; +} + + +static int poll_cq(struct sync_resources *sync_res, struct ibv_cq *cq, + struct ibv_wc *wc, struct ibv_comp_channel *channel) +{ + int ret; + struct ibv_cq *ev_cq; + void *ev_ctx; + + if (channel) { + /* There may be extra completions that + * were associated to the previous event. + * Only poll for the first one. If there are more than one, + * they will be handled by later call to poll_cq */ + ret = poll_cq_once(sync_res, cq, wc); + /* return directly if there was an error or + * 1 completion polled */ + if (ret) + return ret; + + if (ibv_get_cq_event(channel, &ev_cq, &ev_ctx)) { + pr_err("Failed to get cq_event\n"); + return -1; + } + + ibv_ack_cq_events(ev_cq, 1); + + if (ev_cq != cq) { + pr_debug("CQ event for unknown CQ %p\n", ev_cq); + return -1; + } + + if (ibv_req_notify_cq(cq, 0)) { + pr_err("Couldn't request CQ notification\n"); + return -1; + } + + } + + do { + ret = poll_cq_once(sync_res, cq, wc); + if (ret < 0) + return ret; + + if (ret == 0 && channel) { + pr_err("Weird poll returned no cqe after CQ event\n"); + return -1; + } + } while (ret == 0); + + return 0; +} + +/***************************************************************************** +* Function: register_to_trap +*****************************************************************************/ +static int register_to_trap(struct sync_resources *sync_res, + struct ud_resources *res, int dest_lid, + int trap_num, int subscribe) +{ + struct ibv_send_wr sr; + struct ibv_wc wc; + struct ibv_sge sg; + struct ibv_send_wr *_bad_wr = NULL; + struct ibv_send_wr **bad_wr = &_bad_wr; + int counter; + int rc; + int ret; + long long unsigned comp_mask = 0; + + struct umad_hdr *mad_hdr = (struct umad_hdr *) (res->send_buf); + struct umad_sa_packet *p_sa_mad = (struct umad_sa_packet *) (res->send_buf); + struct ib_inform_info *data = (struct ib_inform_info *) (p_sa_mad->data); + static uint64_t trans_id = 0x0000FFFF; + + if (subscribe) + pr_debug("Registering to trap:%d (sm in %#x)\n", trap_num, dest_lid); + else + pr_debug("Deregistering from trap:%d (sm in %#x)\n", trap_num, dest_lid); + + memset(res->send_buf, 0, SEND_SIZE); + + fill_send_request(res, &sr, &sg, mad_hdr); + + umad_init_new(mad_hdr, /* Mad Header */ + UMAD_CLASS_SUBN_ADM, /* Management Class */ + UMAD_SA_CLASS_VERSION, /* Class Version */ + UMAD_METHOD_SET, /* Method */ + 0, /* Transaction ID - will be set before the send in the loop*/ + htobe16(UMAD_ATTR_INFORM_INFO), /* Attribute ID */ + 0 ); /* Attribute Modifier */ + + + data->lid_range_begin = htobe16(0xFFFF); + data->is_generic = 1; + data->subscribe = subscribe; + if (trap_num == UMAD_SM_GID_IN_SERVICE_TRAP) + data->trap_type = htobe16(3); /* SM */ + else if (trap_num == UMAD_SM_LOCAL_CHANGES_TRAP) + data->trap_type = htobe16(4); /* Informational */ + data->g_or_v.generic.trap_num = htobe16(trap_num); + data->g_or_v.generic.node_type_msb = 0; + if (trap_num == UMAD_SM_GID_IN_SERVICE_TRAP) + /* Class Manager */ + 
data->g_or_v.generic.node_type_lsb = htobe16(4); + else if (trap_num == UMAD_SM_LOCAL_CHANGES_TRAP) + /* Channel Adapter */ + data->g_or_v.generic.node_type_lsb = htobe16(1); + + comp_mask |= SRP_INFORMINFO_LID_COMP | + SRP_INFORMINFO_ISGENERIC_COMP | + SRP_INFORMINFO_SUBSCRIBE_COMP | + SRP_INFORMINFO_TRAPTYPE_COMP | + SRP_INFORMINFO_TRAPNUM_COMP | + SRP_INFORMINFO_PRODUCER_COMP; + + if (!data->subscribe) { + data->g_or_v.generic.qpn_resp_time_val = htobe32(res->qp->qp_num << 8); + comp_mask |= SRP_INFORMINFO_QPN_COMP; + } + + p_sa_mad->comp_mask = htobe64(comp_mask); + pr_debug("comp_mask: %llx\n", comp_mask); + + for (counter = 3, rc = 0; counter > 0 && rc == 0; counter--) { + pthread_mutex_lock(res->mad_buffer_mutex); + res->mad_buffer->mad_hdr.base_version = 0; // flag that the buffer is empty + pthread_mutex_unlock(res->mad_buffer_mutex); + mad_hdr->tid = htobe64(trans_id); + trans_id++; + + ret = ibv_post_send(res->qp, &sr, bad_wr); + if (ret) { + pr_err("failed to post SR\n"); + return ret; + } + + ret = poll_cq(sync_res, res->send_cq, &wc, NULL); + if (ret < 0) + return ret; + + /* sleep and check for response from SA */ + do { + srp_sleep(1, 0); + pthread_mutex_lock(res->mad_buffer_mutex); + if (res->mad_buffer->mad_hdr.base_version == 0) + rc = 0; + else if (res->mad_buffer->mad_hdr.tid == mad_hdr->tid) + rc = 1; + else { + res->mad_buffer->mad_hdr.base_version = 0; + rc = 2; + } + pthread_mutex_unlock(res->mad_buffer_mutex); + } while (rc == 2); // while old response. + } + + if (counter == 0) { + pr_err("No response to inform info registration\n"); + return -EAGAIN; + } + + return 0; +} + + +/***************************************************************************** +* Function: response_to_trap +*****************************************************************************/ +static int response_to_trap(struct sync_resources *sync_res, + struct ud_resources *res, + struct umad_sa_packet *mad_buffer) +{ + struct ibv_send_wr sr; + struct ibv_sge sg; + struct ibv_send_wr *_bad_wr = NULL; + struct ibv_send_wr **bad_wr = &_bad_wr; + int ret; + struct ibv_wc wc; + + struct umad_sa_packet *response_buffer = (struct umad_sa_packet *) (res->send_buf); + + memcpy(response_buffer, mad_buffer, sizeof(struct umad_sa_packet)); + response_buffer->mad_hdr.method = UMAD_METHOD_REPORT_RESP; + + fill_send_request(res, &sr, &sg, (struct umad_hdr *) response_buffer); + ret = ibv_post_send(res->qp, &sr, bad_wr); + if (ret < 0) { + pr_err("failed to post response\n"); + return ret; + } + ret = poll_cq(sync_res, res->send_cq, &wc, NULL); + + return ret; +} + + +/***************************************************************************** +* Function: get_trap_notices +*****************************************************************************/ +static int get_trap_notices(struct resources *res) +{ + struct ibv_wc wc; + int cur_receive = 0; + int ret = 0; + int pkey_index; + __be16 pkey; + char *buffer; + struct umad_sa_packet *mad_buffer; + struct ib_mad_notice_attr *notice_buffer; + int trap_num; + + while (!stop_threads(res->sync_res)) { + + ret = poll_cq(res->sync_res, res->ud_res->recv_cq, &wc, + res->ud_res->channel); + if (ret < 0) + continue; + + pr_debug("get_trap_notices: Got CQE wc.wr_id=%lld\n", (long long int) wc.wr_id); + cur_receive = wc.wr_id; + buffer = res->ud_res->recv_buf + RECV_BUF_SIZE * cur_receive; + mad_buffer = (struct umad_sa_packet *) (buffer + GRH_SIZE); + + if ((mad_buffer->mad_hdr.mgmt_class == UMAD_CLASS_SUBN_ADM) && + (mad_buffer->mad_hdr.method == 
UMAD_METHOD_GET_RESP) && + (be16toh(mad_buffer->mad_hdr.attr_id) == UMAD_ATTR_INFORM_INFO)) { + /* this is probably a response to register to trap */ + pthread_mutex_lock(res->ud_res->mad_buffer_mutex); + *res->ud_res->mad_buffer = *mad_buffer; + pthread_mutex_unlock(res->ud_res->mad_buffer_mutex); + } else if ((mad_buffer->mad_hdr.mgmt_class == UMAD_CLASS_SUBN_ADM) && + (mad_buffer->mad_hdr.method == UMAD_METHOD_REPORT) && + (be16toh(mad_buffer->mad_hdr.attr_id) == UMAD_ATTR_NOTICE)) + { /* this is a trap notice */ + pkey_index = wc.pkey_index; + ret = pkey_index_to_pkey(res->umad_res, pkey_index, &pkey); + if (ret) { + pr_err("get_trap_notices: Got Bad pkey_index (%d)\n", + pkey_index); + wake_up_main_loop(0); + break; + } + + notice_buffer = (struct ib_mad_notice_attr *) (mad_buffer->data); + trap_num = be16toh(notice_buffer->generic.trap_num); + response_to_trap(res->sync_res, res->ud_res, mad_buffer); + if (trap_num == UMAD_SM_GID_IN_SERVICE_TRAP) + push_gid_to_list(res->sync_res, + ¬ice_buffer->ntc_64_67.gid, + be16toh(pkey)); + else if (trap_num == UMAD_SM_LOCAL_CHANGES_TRAP) { + if (be32toh(notice_buffer->ntc_144.new_cap_mask) & SRP_IS_DM) + push_lid_to_list(res->sync_res, + be16toh(notice_buffer->ntc_144.lid), + be16toh(pkey)); + } else { + pr_err("Unhandled trap_num %d\n", trap_num); + } + } + + ret = fill_rq_entry(res->ud_res, cur_receive); + if (ret < 0) { + wake_up_main_loop(0); + break; + } + } + return ret; +} + +void *run_thread_get_trap_notices(void *res_in) +{ + int ret; + + ret = get_trap_notices((struct resources *)res_in); + + pr_debug("get_trap_notices thread ended\n"); + + pthread_exit((void *)(long)ret); +} + + +/***************************************************************************** +* Function: register_to_traps +*****************************************************************************/ +int register_to_traps(struct resources *res, int subscribe) +{ + int rc; + int trap_numbers[] = {UMAD_SM_GID_IN_SERVICE_TRAP, UMAD_SM_LOCAL_CHANGES_TRAP}; + int i; + + for (i=0; i < sizeof(trap_numbers) / sizeof(*trap_numbers); ++i) { + rc = register_to_trap(res->sync_res, res->ud_res, + res->ud_res->port_attr.sm_lid, + trap_numbers[i], subscribe); + if (rc != 0) + return rc; + } + + return 0; + +} + +void *run_thread_listen_to_events(void *res_in) +{ + struct resources *res = (struct resources *)res_in; + struct ibv_async_event event; + + while (!stop_threads(res->sync_res)) { + if (ibv_get_async_event(res->ud_res->ib_ctx, &event)) { + if (errno != EINTR) + pr_err("ibv_get_async_event failed (errno = %d)\n", + errno); + break; + } + + pr_debug("event_type %d, port %d\n", + event.event_type, event.element.port_num); + + switch (event.event_type) { + case IBV_EVENT_PORT_ACTIVE: + case IBV_EVENT_SM_CHANGE: + case IBV_EVENT_LID_CHANGE: + case IBV_EVENT_CLIENT_REREGISTER: + case IBV_EVENT_PKEY_CHANGE: + if (event.element.port_num == config->port_num) { + pthread_mutex_lock(&res->sync_res->mutex); + __schedule_rescan(res->sync_res, 0); + wake_up_main_loop(0); + pthread_mutex_unlock(&res->sync_res->mutex); + } + break; + + case IBV_EVENT_DEVICE_FATAL: + case IBV_EVENT_CQ_ERR: + case IBV_EVENT_QP_FATAL: + /* clean and restart */ + pr_err("Critical event %d, raising catastrophic " + "error signal\n", event.event_type); + raise(SRP_CATAS_ERR); + break; + + /* + + case IBV_EVENT_PORT_ERR: + case IBV_EVENT_QP_REQ_ERR: + case IBV_EVENT_QP_ACCESS_ERR: + case IBV_EVENT_COMM_EST: + case IBV_EVENT_SQ_DRAINED: + case IBV_EVENT_PATH_MIG: + case IBV_EVENT_PATH_MIG_ERR: + case 
IBV_EVENT_SRQ_ERR: + case IBV_EVENT_SRQ_LIMIT_REACHED: + case IBV_EVENT_QP_LAST_WQE_REACHED: + + */ + + + default: + break; + } + + ibv_ack_async_event(&event); + + } + + return NULL; +} + diff --git a/srp_daemon/srp_ib_types.h b/srp_daemon/srp_ib_types.h new file mode 100644 index 0000000..a5bd9dc --- /dev/null +++ b/srp_daemon/srp_ib_types.h @@ -0,0 +1,250 @@ +/* + * srp-ib_types - discover SRP targets over IB + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2006 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef SRP_IB_TYPES_H +#define SRP_IB_TYPES_H + +#include <endian.h> +#include <stdint.h> +#include <linux/types.h> /* __be16, __be32 and __be64 */ +#include <infiniband/umad.h> /* union umad_gid */ +#include <infiniband/umad_types.h> + +#define SRP_INFORMINFO_LID_COMP (1 << 1) +#define SRP_INFORMINFO_ISGENERIC_COMP (1 << 4) +#define SRP_INFORMINFO_SUBSCRIBE_COMP (1 << 5) +#define SRP_INFORMINFO_TRAPTYPE_COMP (1 << 6) +#define SRP_INFORMINFO_TRAPNUM_COMP (1 << 7) +#define SRP_INFORMINFO_QPN_COMP (1 << 8) +#define SRP_INFORMINFO_PRODUCER_COMP (1 << 12) + +#define PACK_SUFFIX4 __attribute__((aligned(4))) __attribute__((packed)) +#define PACK_SUFFIX __attribute__((packed)) + +/****d* IBA Base: Constants/MAD_BLOCK_SIZE +* NAME +* MAD_BLOCK_SIZE +* +* DESCRIPTION +* Size of a non-RMPP MAD datagram. +* +* SOURCE +*/ +#define MAD_BLOCK_SIZE 256 + +static inline uint32_t ib_get_attr_size(const __be16 attr_offset) +{ + return( ((uint32_t)be16toh( attr_offset )) << 3 ); +} + +/************************************************************ +* NAME +* MAD_RMPP_HDR_SIZE +* +* DESCRIPTION +* Size of an RMPP header, including the common MAD header. +* +* SOURCE +*/ +enum { + MAD_RMPP_HDR_SIZE = 36, +}; + +/****s* IBA Base: Types/struct ib_path_rec +* NAME +* struct ib_path_rec +* +* DESCRIPTION +* Path records encapsulate the properties of a given +* route between two end-points on a subnet. +* +* SYNOPSIS +* +* NOTES +* The role of this data structure is identical to the role of struct +* ibv_path_record in libibverbs/sa.h. 
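+* Like that structure it follows the 64-byte SA PathRecord wire layout
+* (the field sizes below add up to 64 bytes), so SA MAD payloads can be
+* read in place.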
+*/ +struct ib_path_rec { + uint8_t resv0[8]; + union umad_gid dgid; + union umad_gid sgid; + __be16 dlid; + __be16 slid; + __be32 hop_flow_raw; + uint8_t tclass; + uint8_t reversible_numpath; /* reversible-7:7 num path-6:0 */ + __be16 pkey; + __be16 sl; + uint8_t mtu; + uint8_t rate; + uint8_t pkt_life; + uint8_t preference; + uint8_t resv2[6]; +}; + + +/****f* IBA Base: Types/umad_init_new +* NAME +* umad_init_new +* +* DESCRIPTION +* Initialize UMAD common header. +* +* SYNOPSIS +*/ +static inline void +umad_init_new(struct umad_hdr* const p_mad, + const uint8_t mgmt_class, + const uint8_t class_ver, + const uint8_t method, + const __be64 trans_id, + const __be16 attr_id, + const __be32 attr_mod) +{ + p_mad->base_version = 1; + p_mad->mgmt_class = mgmt_class; + p_mad->class_version = class_ver; + p_mad->method = method; + p_mad->status = 0; + p_mad->class_specific = 0; + p_mad->tid = trans_id; + p_mad->attr_id = attr_id; + p_mad->resv = 0; + p_mad->attr_mod = attr_mod; +} + + +struct ib_inform_info +{ + union umad_gid gid; + __be16 lid_range_begin; + __be16 lid_range_end; + __be16 reserved1; + uint8_t is_generic; + uint8_t subscribe; + __be16 trap_type; + union _inform_g_or_v + { + struct _inform_generic + { + __be16 trap_num; + __be32 qpn_resp_time_val; + uint8_t reserved2; + uint8_t node_type_msb; + __be16 node_type_lsb; + } PACK_SUFFIX generic; + + struct _inform_vend + { + __be16 dev_id; + __be32 qpn_resp_time_val; + uint8_t reserved2; + uint8_t vendor_id_msb; + __be16 vendor_id_lsb; + } PACK_SUFFIX vend; + + } PACK_SUFFIX g_or_v; + +} PACK_SUFFIX4; + +struct ib_mad_notice_attr // Total Size calc Accumulated +{ + union + { + uint8_t generic_type; // 1 1 + + struct _notice_generic + { + uint8_t generic_type; + uint8_t prod_type_msb; + __be16 prod_type_lsb; + __be16 trap_num; + } generic; + + struct _notice_vend + { + uint8_t generic_type; + uint8_t vend_id_msb; + __be16 vend_id_lsb; + __be16 dev_id; + } vend; + }; + + __be16 issuer_lid; // 2 8 + + union // 54 64 + { + __be16 toggle_count; // 2 10 + struct _raw_data + { + __be16 toggle_count; + uint8_t details[54]; + } raw_data; + + struct _ntc_64_67 + { + __be16 toggle_count; + uint8_t res[6]; + union umad_gid gid; // the Node or Multicast Group that came in/out + } ntc_64_67; + + struct _ntc_144 { + __be16 toggle_count; + __be16 pad1; + __be16 lid; // lid where capability mask changed + __be16 pad2; + __be32 new_cap_mask; // new capability mask + } ntc_144; + }; + + union umad_gid issuer_gid; // 16 80 + +}; + +/****f* IBA Base: Types/ib_gid_get_guid +* NAME +* ib_gid_get_guid +* +* DESCRIPTION +* Gets the guid from a GID. +* +* SYNOPSIS +*/ +static inline __be64 ib_gid_get_guid(const union umad_gid *const p_gid) +{ + return p_gid->global.interface_id; +} + +#endif diff --git a/srp_daemon/srp_sync.c b/srp_daemon/srp_sync.c new file mode 100644 index 0000000..036fbe5 --- /dev/null +++ b/srp_daemon/srp_sync.c @@ -0,0 +1,272 @@ +/* + * srp_sync - discover SRP targets over IB + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2006 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Author: ishai Rabinovitz [ishai@mellanox.co.il]$ + */ + +#include <pthread.h> + +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> + +#include "srp_daemon.h" + +/* + * Schedule a rescan at now + when if when >= 0 or disable rescanning if + * when < 0. + */ +void __schedule_rescan(struct sync_resources *res, int when) +{ + struct timespec *ts = &res->next_recalc_time; + + clock_gettime(CLOCK_MONOTONIC, ts); + ts->tv_sec = when >= 0 ? ts->tv_sec + when : LONG_MAX; +} + +void schedule_rescan(struct sync_resources *res, int when) +{ + pthread_mutex_lock(&res->mutex); + __schedule_rescan(res, when); + pthread_mutex_unlock(&res->mutex); +} + +int __rescan_scheduled(struct sync_resources *res) +{ + struct timespec now; + + clock_gettime(CLOCK_MONOTONIC, &now); + return ts_cmp(&res->next_recalc_time, &now, <=); +} + +int rescan_scheduled(struct sync_resources *res) +{ + int ret; + + pthread_mutex_lock(&res->mutex); + ret = __rescan_scheduled(res); + pthread_mutex_unlock(&res->mutex); + + return ret; +} + +int sync_resources_init(struct sync_resources *res) +{ + int ret; + + res->stop_threads = 0; + __schedule_rescan(res, 0); + res->next_task = 0; + ret = pthread_mutex_init(&res->mutex, NULL); + if (ret < 0) { + pr_err("could not initialize mutex\n"); + return ret; + } + + res->retry_tasks_head = NULL; + ret = pthread_mutex_init(&res->retry_mutex, NULL); + if (ret < 0) { + pr_err("could not initialize mutex\n"); + return ret; + } + ret = pthread_cond_init(&res->retry_cond, NULL); + if (ret < 0) + pr_err("could not initialize cond\n"); + + return ret; +} + +void sync_resources_cleanup(struct sync_resources *res) +{ + pthread_cond_destroy(&res->retry_cond); + pthread_mutex_destroy(&res->retry_mutex); + pthread_mutex_destroy(&res->mutex); +} + +void push_gid_to_list(struct sync_resources *res, union umad_gid *gid, + uint16_t pkey) +{ + int i; + + /* If there is going to be a recalc soon - do nothing */ + if (rescan_scheduled(res)) + return; + + pthread_mutex_lock(&res->mutex); + + /* check if the gid is already in the list */ + + for (i=0; i < res->next_task; ++i) + if (!memcmp(&res->tasks[i].gid, gid, 16) && + res->tasks[i].pkey == pkey) { + pr_debug("gid is already in task list\n"); + 
pthread_mutex_unlock(&res->mutex); + return; + } + + if (res->next_task == SIZE_OF_TASKS_LIST) { + /* if the list is full, lets do a full rescan */ + + __schedule_rescan(res, 0); + res->next_task = 0; + } else { + /* otherwise enter to the next entry */ + + res->tasks[res->next_task].gid = *gid; + res->tasks[res->next_task].lid = 0; + res->tasks[res->next_task].pkey = pkey; + ++res->next_task; + } + + wake_up_main_loop(0); + pthread_mutex_unlock(&res->mutex); +} + +void push_lid_to_list(struct sync_resources *res, uint16_t lid, uint16_t pkey) +{ + int i; + + /* If there is going to be a recalc soon - do nothing */ + if (rescan_scheduled(res)) + return; + + pthread_mutex_lock(&res->mutex); + + + /* check if the lid is already in the list */ + + for (i=0; i < res->next_task; ++i) + if (res->tasks[i].lid == lid && res->tasks[i].pkey == pkey) { + pr_debug("lid %#x is already in task list\n", lid); + pthread_mutex_unlock(&res->mutex); + return; + } + + if (res->next_task == SIZE_OF_TASKS_LIST) { + /* if the list is full, lets do a full rescan */ + + __schedule_rescan(res, 0); + res->next_task = 0; + } else { + /* otherwise enter to the next entry */ + + res->tasks[res->next_task].lid = lid; + res->tasks[res->next_task].pkey = pkey; + memset(&res->tasks[res->next_task].gid, 0, 16); + ++res->next_task; + } + + wake_up_main_loop(0); + pthread_mutex_unlock(&res->mutex); +} + +void clear_traps_list(struct sync_resources *res) +{ + pthread_mutex_lock(&res->mutex); + res->next_task = 0; + pthread_mutex_unlock(&res->mutex); +} + + +/* assumes that res->mutex is locked !!! */ +int pop_from_list(struct sync_resources *res, uint16_t *lid, + union umad_gid *gid, uint16_t *pkey) +{ + int ret=0; + int i; + + if (res->next_task) { + *lid = res->tasks[0].lid; + *pkey = res->tasks[0].pkey; + *gid = res->tasks[0].gid; + /* push the rest down */ + for (i=1; i < res->next_task; ++i) + res->tasks[i-1] = res->tasks[i]; + ret = 1; + --res->next_task; + } + + return ret; +} + + +/* assumes that res->retry_mutex is locked !!! */ +struct target_details *pop_from_retry_list(struct sync_resources *res) +{ + struct target_details *ret = res->retry_tasks_head; + + if (ret) + res->retry_tasks_head = ret->next; + else + res->retry_tasks_tail = NULL; + + return ret; +} + +void push_to_retry_list(struct sync_resources *res, + struct target_details *orig_target) +{ + struct target_details *target; + + /* If there is going to be a recalc soon - do nothing */ + if (rescan_scheduled(res)) + return; + + target = malloc(sizeof(struct target_details)); + memcpy(target, orig_target, sizeof(struct target_details)); + + pthread_mutex_lock(&res->retry_mutex); + + if (!res->retry_tasks_head) + res->retry_tasks_head = target; + + if (res->retry_tasks_tail) + res->retry_tasks_tail->next = target; + + res->retry_tasks_tail = target; + + target->next = NULL; + + pthread_cond_signal(&res->retry_cond); + pthread_mutex_unlock(&res->retry_mutex); +} + +/* assumes that res->retry_mutex is locked !!! 
 */
+int retry_list_is_empty(struct sync_resources *res)
+{
+	return res->retry_tasks_head == NULL;
+}
diff --git a/srp_daemon/srpd.in b/srp_daemon/srpd.in
new file mode 100755
index 0000000..7e2316f
--- /dev/null
+++ b/srp_daemon/srpd.in
@@ -0,0 +1,163 @@
+#!/bin/bash
+# Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md
+#
+# Manage the SRP client daemon (srp_daemon)
+#
+# chkconfig: - 25 75
+# description: Starts/Stops InfiniBand SRP client service
+# config: @CMAKE_INSTALL_FULL_SYSCONFDIR@/srp_daemon.conf
+#
+### BEGIN INIT INFO
+# Provides: srpd
+# Required-Start: $syslog @RDMA_SERVICE@
+# Required-Stop: $syslog @RDMA_SERVICE@
+# Default-Start: @SRP_DEFAULT_START@
+# Default-Stop: @SRP_DEFAULT_STOP@
+# Should-Start:
+# Should-Stop:
+# Short-Description: Starts and stops the InfiniBand SRP client service
+# Description: The InfiniBand SRP client service attaches to SRP devices
+#	on the InfiniBand fabric and makes them appear as local disks
+#	to the system. This service starts the client daemon that's
+#	responsible for initiating and maintaining the connections to
+#	remote devices.
+### END INIT INFO
+
+if [ -e /etc/rdma/rdma.conf ]; then
+	# RHEL / Fedora.
+	RDMA_CONFIG=/etc/rdma/rdma.conf
+else
+	# OFED
+	RDMA_CONFIG=/etc/infiniband/openib.conf
+fi
+if [ -f $RDMA_CONFIG ]; then
+	. $RDMA_CONFIG
+fi
+pidfile=@CMAKE_INSTALL_FULL_RUNDIR@/srp_daemon.sh.pid
+prog=@CMAKE_INSTALL_FULL_SBINDIR@/srp_daemon.sh
+
+checkpid() {
+	[ -e "/proc/$1" ]
+}
+
+stop_srp_daemon() {
+	if ! running; then
+		return 1
+	fi
+
+	local pid=`cat $pidfile`
+	kill $pid
+	# timeout 30 seconds for termination
+	for i in `seq 300`; do
+		if ! checkpid $pid; then
+			return 0
+		fi
+		sleep 0.1
+	done
+	kill -9 $pid
+	# If srp_daemon executables didn't finish by now
+	# force kill
+	pkill -9 srp_daemon
+
+	return 0
+}
+
+# if the ib_srp module is loaded or built into the kernel return 0 otherwise
+# return 1.
+is_srp_mod_loaded() {
+	[ -e /sys/module/ib_srp ]
+}
+
+running() {
+	[ -f $pidfile ] && checkpid "$(cat $pidfile)"
+}
+
+start() {
+	if ! is_srp_mod_loaded; then
+		echo "SRP kernel module is not loaded, unable to start SRP daemon"
+		return 6
+	fi
+	if running; then
+		echo "Already started"
+		return 0
+	fi
+
+	echo -n "Starting SRP daemon service"
+
+	if [ "$SRP_DEFAULT_TL_RETRY_COUNT" ]; then
+		params=$params"-l $SRP_DEFAULT_TL_RETRY_COUNT "
+	fi
+
+	setsid $prog $params </dev/null >&/dev/null &
+	RC=$?
+	[ $RC -eq 0 ] && echo || echo " ...failed"
+	return $RC
+}
+
+stop() {
+	echo -n "Stopping SRP daemon service"
+
+	stop_srp_daemon
+	RC=$?
+	for ((i=0;i<5;i++)); do
+		if ! running; then
+			rm -f $pidfile
+			break
+		fi
+		sleep 1
+	done
+	[ $RC -eq 0 ] && echo || echo " ...failed"
+	return $RC
+}
+
+status() {
+	local ret
+
+	if [ ! -f $pidfile ]; then
+		ret=3 # program not running
+	else
+		checkpid "$(cat $pidfile)"
+		ret=$? # 1: pid file exists and not running / 0: running
+	fi
+	if [ $ret -eq 0 ] ; then
+		echo "$prog is running... pid=$(cat $pidfile)"
+	else
+		echo "$prog is not running."
+	fi
+	return $ret
+}
+
+restart() {
+	stop
+	start
+}
+
+condrestart() {
+	[ -f $pidfile ] && restart || return 0
+}
+
+usage() {
+	echo
+	echo "Usage: `basename $0` {start|stop|restart|condrestart|try-restart|force-reload|status}"
+	echo
+	return 2
+}
+
+case $1 in
+	start|stop|restart|condrestart|try-restart|force-reload)
+		[ `id -u` != "0" ] && exit 4 ;;
+esac
+
+case $1 in
+	start) start; RC=$? ;;
+	stop) stop; RC=$? ;;
+	restart) restart; RC=$?
;; + reload) RC=3 ;; + condrestart) condrestart; RC=$? ;; + try-restart) condrestart; RC=$? ;; + force-reload) condrestart; RC=$? ;; + status) status; RC=$? ;; + *) usage; RC=$? ;; +esac + +exit $RC diff --git a/srp_daemon/start_on_all_ports b/srp_daemon/start_on_all_ports new file mode 100644 index 0000000..0a7e72e --- /dev/null +++ b/srp_daemon/start_on_all_ports @@ -0,0 +1,7 @@ +#!/bin/bash + +for p in /sys/class/infiniband/*/ports/*; do + [ -e "$p" ] || continue + p=${p#/sys/class/infiniband/} + nohup /bin/systemctl start "srp_daemon_port@${p/\/ports\//:}" </dev/null >&/dev/null & +done diff --git a/suse/module-setup.sh b/suse/module-setup.sh new file mode 100644 index 0000000..52b7747 --- /dev/null +++ b/suse/module-setup.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +check() { + [ -n "$hostonly" -a -c /sys/class/infiniband_verbs/uverbs0 ] && return 0 + [ -n "$hostonly" ] && return 255 + return 0 +} + +depends() { + return 0 +} + +install() { + inst /etc/rdma/mlx4.conf + inst /etc/rdma/modules/infiniband.conf + inst /etc/rdma/modules/iwarp.conf + inst /etc/rdma/modules/opa.conf + inst /etc/rdma/modules/rdma.conf + inst /etc/rdma/modules/roce.conf + inst /usr/lib/mlx4-setup.sh + inst_multiple lspci setpci awk sleep + inst_rules 60-rdma-persistent-naming.rules 70-persistent-ipoib.rules 75-rdma-description.rules 90-rdma-hw-modules.rules 90-rdma-ulp-modules.rules + inst_multiple -o \ + $systemdsystemunitdir/rdma-hw.target \ + $systemdsystemunitdir/rdma-load-modules@.service +} + +installkernel() { + hostonly='' instmods =drivers/infiniband =drivers/net/ethernet/mellanox =drivers/net/ethernet/chelsio =drivers/net/ethernet/cisco =drivers/net/ethernet/emulex =drivers/target + hostonly='' instmods crc-t10dif crct10dif_common +} diff --git a/suse/rdma-core.spec b/suse/rdma-core.spec new file mode 100644 index 0000000..019e1db --- /dev/null +++ b/suse/rdma-core.spec @@ -0,0 +1,862 @@ +# +# spec file for package rdma-core +# +# Copyright (c) 2019 SUSE LINUX GmbH, Nuernberg, Germany. +# +# All modifications and additions to the file contributed by third parties +# remain the property of their copyright owners, unless otherwise agreed +# upon. The license for this file, and modifications and additions to the +# file, is the same license as for the pristine package itself (unless the +# license for the pristine package is not an Open Source License, in which +# case the license is the MIT License). An "Open Source License" is a +# license that conforms to the Open Source Definition (Version 1.9) +# published by the Open Source Initiative. + +# Please submit bugfixes or comments via https://bugs.opensuse.org/ +# + + +%bcond_without systemd +# Do not build static libs by default. 
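+# ("rpmbuild --with static" / "--with pyverbs" define _with_static /
+# _with_pyverbs respectively, flipping the conditionals below to 1.)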
+%define with_static %{?_with_static: 1} %{?!_with_static: 0}
+%define with_pyverbs %{?_with_pyverbs: 1} %{?!_with_pyverbs: 0}
+
+%define git_ver %{nil}
+Name:           rdma-core
+Version:        29.0
+Release:        0
+Summary:        RDMA core userspace libraries and daemons
+License:        GPL-2.0-only OR BSD-2-Clause
+Group:          Productivity/Networking/Other
+
+%define efa_so_major 1
+%define verbs_so_major 1
+%define rdmacm_so_major 1
+%define umad_so_major 3
+%define mlx4_so_major 1
+%define mlx5_so_major 1
+%define ibnetdisc_major 5
+%define mad_major 5
+
+%define efa_lname libefa%{efa_so_major}
+%define verbs_lname libibverbs%{verbs_so_major}
+%define rdmacm_lname librdmacm%{rdmacm_so_major}
+%define umad_lname libibumad%{umad_so_major}
+%define mlx4_lname libmlx4-%{mlx4_so_major}
+%define mlx5_lname libmlx5-%{mlx5_so_major}
+
+%ifnarch s390 %arm
+%define dma_coherent 1
+%endif
+
+# Almost everything is licensed under the OFA dual GPLv2, 2 Clause BSD license
+# providers/ipathverbs/ Dual licensed using a BSD license with an extra patent clause
+# providers/rxe/ Incorporates code from ipathverbs and contains the patent clause
+# providers/hfi1verbs Uses the 3 Clause BSD license
+Url:            https://github.com/linux-rdma/rdma-core
+Source:         rdma-core-%{version}%{git_ver}.tar.gz
+Source1:        baselibs.conf
+BuildRequires:  binutils
+BuildRequires:  cmake >= 2.8.11
+BuildRequires:  gcc
+BuildRequires:  pandoc
+BuildRequires:  pkgconfig
+BuildRequires:  python3-base
+BuildRequires:  python3-docutils
+BuildRequires:  pkgconfig(libsystemd)
+BuildRequires:  pkgconfig(libudev)
+BuildRequires:  pkgconfig(systemd)
+BuildRequires:  pkgconfig(udev)
+%if %{with_pyverbs}
+BuildRequires:  python3-Cython
+BuildRequires:  python3-devel
+%endif
+%ifnarch s390 s390x
+%if 0%{?suse_version} >= 1550
+BuildRequires:  valgrind-client-headers
+%else
+BuildRequires:  valgrind-devel
+%endif
+%endif
+BuildRequires:  systemd-rpm-macros
+BuildRequires:  pkgconfig(libnl-3.0)
+BuildRequires:  pkgconfig(libnl-route-3.0)
+BuildRequires:  pkgconfig(systemd)
+Requires:       kmod
+Requires:       systemd
+Requires:       udev
+
+# SUSE previously shipped rdma as a stand-alone
+# package which we're supplanting here.
+
+Provides:       rdma = %{version}
+Obsoletes:      rdma < %{version}
+Provides:       ofed = %{version}
+Obsoletes:      ofed < %{version}
+
+# Trickery to handle both SUSE OpenBuild System and Manual build
+# In OBS, rdma-core must use curl-mini instead of curl to avoid
+# a build dependency loop:
+# rdma-core -> cmake -> curl -> ... -> boost -> rdma-core
+# Thus we force a BuildRequires to curl-mini which has no impact
+# as it is not used during the build.
+# However curl-mini is not a published RPM. This would prevent any build
+# outside of OBS. Thus we add a bcond to allow manual builds.
+# To force a build without the use of curl-mini, --without=curlmini
+# should be passed to rpmbuild
+%bcond_without curlmini
+%if 0%{?suse_version} >= 1330
+%if %{with curlmini}
+BuildRequires:  curl-mini
+%endif
+%endif
+
+# Tumbleweed's cmake RPM macro adds -Wl,--no-undefined to the module flags
+# which is totally inappropriate and breaks building 'ENABLE_EXPORTS' style
+# module libraries (eg ibacmp).
+#%%define CMAKE_FLAGS -DCMAKE_MODULE_LINKER_FLAGS=""
+
+# Since we recommend developers use Ninja, so should packagers, for consistency.
+%define CMAKE_FLAGS %{nil} +%if 0%{?suse_version} >= 1300 +BuildRequires: ninja +%define CMAKE_FLAGS -GNinja +%define make_jobs ninja -v %{?_smp_mflags} +%define cmake_install DESTDIR=%{buildroot} ninja install +%else +# Fallback to make otherwise +BuildRequires: make +%define make_jobs make VERBOSE=1 %{?_smp_mflags} +%define cmake_install DESTDIR=%{buildroot} make install +%endif + +%description +RDMA core userspace infrastructure and documentation, including initialization +scripts, kernel driver-specific modprobe override configs, IPoIB network +scripts, dracut rules, and the rdma-ndd utility. + +%package devel +Summary: RDMA core development libraries and headers +Group: Development/Libraries/C and C++ +Requires: %{name}%{?_isa} = %{version}-%{release} + +Requires: %{rdmacm_lname} = %{version}-%{release} +Requires: %{umad_lname} = %{version}-%{release} +Requires: %{verbs_lname} = %{version}-%{release} +%if 0%{?dma_coherent} +Requires: %{efa_lname} = %{version}-%{release} +Requires: %{mlx4_lname} = %{version}-%{release} +Requires: %{mlx5_lname} = %{version}-%{release} +%endif +Requires: rsocket = %{version}-%{release} + +Provides: libibverbs-devel = %{version}-%{release} +Obsoletes: libibverbs-devel < %{version}-%{release} + +Provides: libibumad-devel = %{version}-%{release} +Obsoletes: libibumad-devel < %{version}-%{release} +Provides: librdmacm-devel = %{version}-%{release} + +Obsoletes: librdmacm-devel < %{version}-%{release} +#Requires: ibacm = %%{version}-%%{release} +Provides: ibacm-devel = %{version}-%{release} +Obsoletes: ibacm-devel < %{version}-%{release} +%if %{with_static} +# Since our pkg-config files include private references to these packages they +# need to have their .pc files installed too, even for dynamic linking, or +# pkg-config breaks. +BuildRequires: pkgconfig(libnl-3.0) +BuildRequires: pkgconfig(libnl-route-3.0) +%endif + +Requires: infiniband-diags = %{version}-%{release} +Provides: infiniband-diags-devel = %{version}-%{release} +Obsoletes: infiniband-diags-devel < %{version}-%{release} +Provides: libibmad-devel = %{version}-%{release} +Obsoletes: libibmad-devel < %{version} + +%description devel +RDMA core development libraries and headers. + +%package -n libibverbs +Summary: Library & drivers for direct userspace use of InfiniBand/iWARP/RoCE hardware +Group: System/Libraries +Requires: %{name}%{?_isa} = %{version}-%{release} +Obsoletes: libcxgb4-rdmav2 < %{version}-%{release} +Obsoletes: libefa-rdmav2 < %{version}-%{release} +Obsoletes: libhfi1verbs-rdmav2 < %{version}-%{release} +Obsoletes: libi40iw-rdmav2 < %{version}-%{release} +Obsoletes: libipathverbs-rdmav2 < %{version}-%{release} +Obsoletes: libmlx4-rdmav2 < %{version}-%{release} +Obsoletes: libmlx5-rdmav2 < %{version}-%{release} +Obsoletes: libmthca-rdmav2 < %{version}-%{release} +Obsoletes: libocrdma-rdmav2 < %{version}-%{release} +Obsoletes: librxe-rdmav2 < %{version}-%{release} +%if 0%{?dma_coherent} +Requires: %{efa_lname} = %{version}-%{release} +Requires: %{mlx4_lname} = %{version}-%{release} +Requires: %{mlx5_lname} = %{version}-%{release} +%endif +# Recommended packages for rxe +Recommends: iproute2 + +%description -n libibverbs +libibverbs is a library that allows userspace processes to use RDMA +"verbs" as described in the InfiniBand Architecture Specification and +the RDMA Protocol Verbs Specification. This includes direct hardware +access from userspace to InfiniBand/iWARP adapters (kernel bypass) for +fast path operations. 
+ +Device-specific plug-in ibverbs userspace drivers are included: + +- libcxgb4: Chelsio T4 iWARP HCA +- libefa: Amazon Elastic Fabric Adapter +- libhfi1: Intel Omni-Path HFI +- libhns: HiSilicon Hip06 SoC +- libi40iw: Intel Ethernet Connection X722 RDMA +- libipathverbs: QLogic InfiniPath HCA +- libmlx4: Mellanox ConnectX-3 InfiniBand HCA +- libmlx5: Mellanox Connect-IB/X-4+ InfiniBand HCA +- libmthca: Mellanox InfiniBand HCA +- libocrdma: Emulex OneConnect RDMA/RoCE Device +- libqedr: QLogic QL4xxx RoCE HCA +- librxe: A software implementation of the RoCE protocol +- libsiw: A software implementation of the iWarp protocol +- libvmw_pvrdma: VMware paravirtual RDMA device + +%package -n %verbs_lname +Summary: Ibverbs runtime library +Group: System/Libraries +Requires: libibverbs = %{version} + +%description -n %verbs_lname +This package contains the ibverbs runtime library. + +%package -n %efa_lname +Summary: EFA runtime library +Group: System/Libraries + +%description -n %efa_lname +This package contains the efa runtime library. + +%package -n %mlx4_lname +Summary: MLX4 runtime library +Group: System/Libraries + +%description -n %mlx4_lname +This package contains the mlx4 runtime library. + +%package -n %mlx5_lname +Summary: MLX5 runtime library +Group: System/Libraries + +%description -n %mlx5_lname +This package contains the mlx5 runtime library. + +%package -n libibnetdisc%{ibnetdisc_major} +Summary: Infiniband Net Discovery runtime library +Group: System/Libraries + +%description -n libibnetdisc%{ibnetdisc_major} +This package contains the Infiniband Net Discovery runtime library needed +mainly by infiniband-diags. + +%package -n libibverbs-utils +Summary: Examples for the libibverbs library +Group: Productivity/Networking/Other +Requires: libibverbs%{?_isa} = %{version} + +%description -n libibverbs-utils +Useful libibverbs example programs such as ibv_devinfo, which +displays information about RDMA devices. + +%package -n ibacm +Summary: InfiniBand Communication Manager Assistant +Group: Productivity/Networking/Other +%{?systemd_requires} +Requires: %{name}%{?_isa} = %{version} +Obsoletes: libibacmp1 < %{version} +Provides: libibacmp1 = %{version} + +%description -n ibacm +The ibacm daemon helps reduce the load of managing path record lookups on +large InfiniBand fabrics by providing a user space implementation of what +is functionally similar to an ARP cache. The use of ibacm, when properly +configured, can reduce the SA packet load of a large IB cluster from O(n^2) +to O(n). The ibacm daemon is started and normally runs in the background, +user applications need not know about this daemon as long as their app +uses librdmacm to handle connection bring up/tear down. The librdmacm +library knows how to talk directly to the ibacm daemon to retrieve data. + +%package -n infiniband-diags +Summary: InfiniBand Diagnostic Tools +Group: Productivity/Networking/Diagnostic +Requires: perl = %{perl_version} + +%description -n infiniband-diags +diags provides IB diagnostic programs and scripts needed to diagnose an +IB subnet. + +%package -n libibmad%{mad_major} +Summary: Libibmad runtime library +Group: System/Libraries + +%description -n libibmad%{mad_major} +Libibmad provides low layer IB functions for use by the IB diagnostic +and management programs. These include MAD, SA, SMP, and other basic IB +functions. This package contains the runtime library. 
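+
+# Quick smoke test for the library/tool split described above, assuming a
+# host with at least one RDMA device (output varies per system):
+#   ibstat          # infiniband-diags, built on libibumad
+#   ibv_devinfo -v  # libibverbs-utils, built on libibverbs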
+ +%package -n iwpmd +Summary: Userspace iWarp Port Mapper daemon +Group: Development/Libraries/C and C++ +Requires: %{name}%{?_isa} = %{version} +%{?systemd_requires} + +%description -n iwpmd +iwpmd provides a userspace service for iWarp drivers to claim +tcp ports through the standard socket interface. + +%package -n %umad_lname +Summary: OpenFabrics Alliance InfiniBand Userspace Management Datagram library +Group: System/Libraries + +%description -n %umad_lname +libibumad provides the userspace management datagram (umad) library +functions, which sit on top of the umad modules in the kernel. These +are used by the IB diagnostic and management tools, including OpenSM. + +%package -n %rdmacm_lname +Summary: Userspace RDMA Connection Manager +Group: System/Libraries +Requires: %{name} = %{version} +Provides: librdmacm = %{version} +Obsoletes: librdmacm < %{version} + +%description -n %rdmacm_lname +librdmacm provides a userspace RDMA Communication Management API. + +%package -n rsocket +Summary: Preloadable library to turn the socket API RDMA-aware +# Older librdmacm-tools used to provide rsocket +Group: System/Libraries +Conflicts: librdmacm-tools < 2 + +%description -n rsocket +Existing applications can make use of rsockets through the use this +preloadable library. See the documentation in the packaged rsocket(7) +manpage for details. + +%package -n librdmacm-utils +Summary: Examples for the librdmacm library +Group: Productivity/Networking/Other +Obsoletes: librdmacm-tools < %{version} +Provides: librdmacm-tools = %{version} + +%description -n librdmacm-utils +Example test programs for the librdmacm library. + +%package -n srp_daemon +Summary: Tools for using the InfiniBand SRP protocol devices +Group: Development/Libraries/C and C++ +Requires: %{name} = %{version} +Obsoletes: srptools <= 1.0.3 +Provides: srptools = %{version} +%{?systemd_requires} + +%description -n srp_daemon +In conjunction with the kernel ib_srp driver, srp_daemon allows you to +discover and use SCSI devices via the SCSI RDMA Protocol over InfiniBand. + +%package -n rdma-ndd +Summary: Daemon to manage RDMA Node Description +Group: System/Daemons +Requires: %{name} = %{version} +# The udev rules in rdma need to be aware of rdma-ndd: +Conflicts: rdma < 2.1 +%{?systemd_requires} + +%description -n rdma-ndd +rdma-ndd is a system daemon which watches for rdma device changes and/or +hostname changes and updates the Node Description of the rdma devices based +on those changes. + +%package -n python3-pyverbs +Summary: Python3 API over IB verbs +Group: Development/Languages/Python + +%description -n python3-pyverbs +Pyverbs is a Cython-based Python API over libibverbs, providing an +easy, object-oriented access to IB verbs. + +%prep +# Make sure LTO is disable as rdma-core fails to compile with LTO enabled +%define _lto_cflags %{nil} +%setup -q -n %{name}-%{version}%{git_ver} + +%build + +# New RPM defines _rundir, usually as /run +%if 0%{?_rundir:1} +%else +%define _rundir /var/run +%endif + +%{!?EXTRA_CMAKE_FLAGS: %define EXTRA_CMAKE_FLAGS %{nil}} + +# Pass all of the rpm paths directly to GNUInstallDirs and our other defines. 
+%cmake %{CMAKE_FLAGS} \ + -DCMAKE_MODULE_LINKER_FLAGS="-Wl,--as-needed -Wl,-z,now" \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_BINDIR:PATH=%{_bindir} \ + -DCMAKE_INSTALL_SBINDIR:PATH=%{_sbindir} \ + -DCMAKE_INSTALL_LIBDIR:PATH=%{_libdir} \ + -DCMAKE_INSTALL_LIBEXECDIR:PATH=%{_libexecdir} \ + -DCMAKE_INSTALL_LOCALSTATEDIR:PATH=%{_localstatedir} \ + -DCMAKE_INSTALL_SHAREDSTATEDIR:PATH=%{_sharedstatedir} \ + -DCMAKE_INSTALL_INCLUDEDIR:PATH=%{_includedir} \ + -DCMAKE_INSTALL_INFODIR:PATH=%{_infodir} \ + -DCMAKE_INSTALL_MANDIR:PATH=%{_mandir} \ + -DCMAKE_INSTALL_SYSCONFDIR:PATH=%{_sysconfdir} \ + -DCMAKE_INSTALL_SYSTEMD_SERVICEDIR:PATH=%{_unitdir} \ + -DCMAKE_INSTALL_SYSTEMD_BINDIR:PATH=%{_libexecdir}/systemd \ + -DCMAKE_INSTALL_INITDDIR:PATH=%{_initddir} \ + -DCMAKE_INSTALL_RUNDIR:PATH=%{_rundir} \ + -DCMAKE_INSTALL_DOCDIR:PATH=%{_docdir}/%{name}-%{version} \ + -DCMAKE_INSTALL_UDEV_RULESDIR:PATH=%{_udevrulesdir} \ + -DCMAKE_INSTALL_PERLDIR:PATH=%{perl_vendorlib} \ +%if %{with_static} + -DENABLE_STATIC=1 \ +%endif + %{EXTRA_CMAKE_FLAGS} \ +%if %{defined __python3} + -DPYTHON_EXECUTABLE:PATH=%{__python3} \ + -DCMAKE_INSTALL_PYTHON_ARCH_LIB:PATH=%{python3_sitearch} \ +%endif +%if %{with_pyverbs} + -DNO_PYVERBS=0 +%else + -DNO_PYVERBS=1 +%endif +%make_jobs + +%install +cd build +%cmake_install +cd .. +mkdir -p %{buildroot}/%{_sysconfdir}/rdma + +%global dracutlibdir %%{_libexecdir}/dracut/ +%global sysmodprobedir %%{_sysconfdir}/modprobe.d + +mkdir -p %{buildroot}%{_udevrulesdir} +mkdir -p %{buildroot}%{dracutlibdir}/modules.d/05rdma +mkdir -p %{buildroot}%{sysmodprobedir} +mkdir -p %{buildroot}%{_unitdir} + +# SRIOV service +install -D -m0644 redhat/rdma.sriov-vfs %{buildroot}/%{_sysconfdir}/rdma/sriov-vfs +install -D -m0755 redhat/rdma.sriov-init %{buildroot}%{_libexecdir}/rdma-set-sriov-vf +install -D -m0644 suse/rdma.sriov-rules %{buildroot}%{_udevrulesdir}/98-rdma-sriov.rules +install -D -m0644 suse/rdma.sriov-service %{buildroot}%{_unitdir}/rdma-sriov.service + +# Port type setup for mlx4 dual port cards +install -D -m0644 redhat/rdma.mlx4.conf %{buildroot}/%{_sysconfdir}/rdma/mlx4.conf +sed 's%/usr/libexec%/usr/lib%g' redhat/rdma.mlx4.sys.modprobe > %{buildroot}%{sysmodprobedir}/50-libmlx4.conf +chmod 0644 %{buildroot}%{sysmodprobedir}/50-libmlx4.conf +install -D -m0755 redhat/rdma.mlx4-setup.sh %{buildroot}%{_libexecdir}/mlx4-setup.sh + +# Dracut file for IB support during boot +install -D -m0644 suse/module-setup.sh %{buildroot}%{dracutlibdir}/modules.d/05rdma/module-setup.sh + +# ibacm +cd build +LD_LIBRARY_PATH=./lib bin/ib_acme -D . 
-O
+install -D -m0644 ibacm_opts.cfg %{buildroot}%{_sysconfdir}/rdma/
+
+for service in rdma rdma-ndd ibacm iwpmd srp_daemon; do ln -sf %{_sbindir}/service %{buildroot}%{_sbindir}/rc${service}; done
+
+# Delete the package's init.d scripts
+rm -rf %{buildroot}/%{_initddir}/
+rm -rf %{buildroot}/%{_sbindir}/srp_daemon.sh
+
+%post -n %verbs_lname -p /sbin/ldconfig
+%postun -n %verbs_lname -p /sbin/ldconfig
+
+%post -n %efa_lname -p /sbin/ldconfig
+%postun -n %efa_lname -p /sbin/ldconfig
+
+%post -n %mlx4_lname -p /sbin/ldconfig
+%postun -n %mlx4_lname -p /sbin/ldconfig
+
+%post -n %mlx5_lname -p /sbin/ldconfig
+%postun -n %mlx5_lname -p /sbin/ldconfig
+
+%post -n %umad_lname -p /sbin/ldconfig
+%postun -n %umad_lname -p /sbin/ldconfig
+
+%post -n %rdmacm_lname -p /sbin/ldconfig
+%postun -n %rdmacm_lname -p /sbin/ldconfig
+
+%post -n libibnetdisc%{ibnetdisc_major} -p /sbin/ldconfig
+%postun -n libibnetdisc%{ibnetdisc_major} -p /sbin/ldconfig
+
+%post -n libibmad%{mad_major} -p /sbin/ldconfig
+%postun -n libibmad%{mad_major} -p /sbin/ldconfig
+
+%post
+# we ship udev rules, so trigger an update.
+%{_bindir}/udevadm trigger --subsystem-match=infiniband --action=change || true
+%{_bindir}/udevadm trigger --subsystem-match=infiniband_mad --action=change || true
+
+#
+# ibacm
+#
+%pre -n ibacm
+%service_add_pre ibacm.service ibacm.socket
+
+%post -n ibacm
+%service_add_post ibacm.service ibacm.socket
+
+%preun -n ibacm
+%service_del_preun ibacm.service ibacm.socket
+
+%postun -n ibacm
+%service_del_postun ibacm.service ibacm.socket
+
+#
+# srp daemon
+#
+%pre -n srp_daemon
+%service_add_pre srp_daemon.service
+
+%post -n srp_daemon
+%service_add_post srp_daemon.service
+# we ship udev rules, so trigger an update.
+%{_bindir}/udevadm trigger --subsystem-match=infiniband_mad --action=change
+
+%preun -n srp_daemon
+%service_del_preun srp_daemon.service
+
+%postun -n srp_daemon
+%service_del_postun srp_daemon.service
+
+#
+# iwpmd
+#
+%pre -n iwpmd
+%service_add_pre iwpmd.service
+
+%post -n iwpmd
+%service_add_post iwpmd.service
+
+%preun -n iwpmd
+%service_del_preun iwpmd.service
+
+%postun -n iwpmd
+%service_del_postun iwpmd.service
+
+#
+# rdma-ndd
+#
+%pre -n rdma-ndd
+%service_add_pre rdma-ndd.service
+
+%preun -n rdma-ndd
+%service_del_preun rdma-ndd.service
+
+%post -n rdma-ndd
+%service_add_post rdma-ndd.service
+
+%postun -n rdma-ndd
+%service_del_postun rdma-ndd.service
+
+%files
+%defattr(-,root,root)
+%dir %{_sysconfdir}/rdma
+%dir %{_sysconfdir}/rdma/modules
+%dir %{_docdir}/%{name}-%{version}
+%dir %{_udevrulesdir}
+%dir %{_sysconfdir}/udev
+%dir %{_sysconfdir}/udev/rules.d
+%dir %{_sysconfdir}/modprobe.d
+%doc %{_docdir}/%{name}-%{version}/README.md
+%doc %{_docdir}/%{name}-%{version}/udev.md
+%config(noreplace) %{_sysconfdir}/rdma/mlx4.conf
+%config(noreplace) %{_sysconfdir}/rdma/modules/infiniband.conf
+%config(noreplace) %{_sysconfdir}/rdma/modules/iwarp.conf
+%config(noreplace) %{_sysconfdir}/rdma/modules/opa.conf
+%config(noreplace) %{_sysconfdir}/rdma/modules/rdma.conf
+%config(noreplace) %{_sysconfdir}/rdma/modules/roce.conf
+%config(noreplace) %{_sysconfdir}/rdma/sriov-vfs
+%if 0%{?dma_coherent}
+%config(noreplace) %{_sysconfdir}/modprobe.d/mlx4.conf
+%endif
+%config(noreplace) %{_sysconfdir}/modprobe.d/truescale.conf
+%config(noreplace) %{_sysconfdir}/udev/rules.d/70-persistent-ipoib.rules
+%{_unitdir}/rdma-hw.target
+%{_unitdir}/rdma-load-modules@.service
+%{_unitdir}/rdma-sriov.service
+%dir %{dracutlibdir}
+%dir %{dracutlibdir}/modules.d
+%dir 
%{dracutlibdir}/modules.d/05rdma +%{dracutlibdir}/modules.d/05rdma/module-setup.sh +%{_udevrulesdir}/../rdma_rename +%{_udevrulesdir}/60-rdma-persistent-naming.rules +%{_udevrulesdir}/75-rdma-description.rules +%{_udevrulesdir}/90-rdma-hw-modules.rules +%{_udevrulesdir}/90-rdma-ulp-modules.rules +%{_udevrulesdir}/90-rdma-umad.rules +%{_udevrulesdir}/98-rdma-sriov.rules +%{sysmodprobedir}/50-libmlx4.conf +%{_libexecdir}/rdma-set-sriov-vf +%{_libexecdir}/mlx4-setup.sh +%{_libexecdir}/truescale-serdes.cmds +%license COPYING.* +%{_sbindir}/rcrdma + +%files devel +%defattr(-,root,root) +%doc %{_docdir}/%{name}-%{version}/MAINTAINERS +%dir %{_includedir}/infiniband +%dir %{_includedir}/rdma +%{_includedir}/infiniband/* +%{_includedir}/rdma/* +%if %{with_static} +%{_libdir}/lib*.a +%endif +%{_libdir}/lib*.so +%{_libdir}/pkgconfig/*.pc +%{_mandir}/man3/ibnd_* +%{_mandir}/man3/ibv_* +%{_mandir}/man3/rdma* +%{_mandir}/man3/umad* +%{_mandir}/man3/*_to_ibv_rate.* +%{_mandir}/man7/rdma_cm.* +%if 0%{?dma_coherent} +%{_mandir}/man3/efadv* +%{_mandir}/man3/mlx5dv* +%{_mandir}/man3/mlx4dv* +%{_mandir}/man7/efadv* +%{_mandir}/man7/mlx5dv* +%{_mandir}/man7/mlx4dv* +%endif + +%files -n libibverbs +%defattr(-,root,root) +%dir %{_sysconfdir}/libibverbs.d +%dir %{_libdir}/libibverbs +%{_libdir}/libibverbs/*.so +%config(noreplace) %{_sysconfdir}/libibverbs.d/*.driver +%doc %{_docdir}/%{name}-%{version}/libibverbs.md +%doc %{_docdir}/%{name}-%{version}/rxe.md +%doc %{_docdir}/%{name}-%{version}/tag_matching.md +%{_mandir}/man7/rxe* + +%files -n libibnetdisc%{ibnetdisc_major} +%defattr(-, root, root) +%{_libdir}/libibnetdisc.so.* + +%files -n libibmad%{mad_major} +%defattr(-, root, root) +%{_libdir}/libibmad.so.* + +%files -n %verbs_lname +%defattr(-,root,root) +%{_libdir}/libibverbs*.so.* + +%if 0%{?dma_coherent} +%files -n %efa_lname +%defattr(-,root,root) +%{_libdir}/libefa*.so.* + +%files -n %mlx4_lname +%defattr(-,root,root) +%{_libdir}/libmlx4*.so.* + +%files -n %mlx5_lname +%defattr(-,root,root) +%{_libdir}/libmlx5*.so.* +%endif + +%files -n libibverbs-utils +%defattr(-,root,root) +%{_bindir}/ibv_* +%{_mandir}/man1/ibv_* + +%files -n ibacm +%defattr(-,root,root) +%config(noreplace) %{_sysconfdir}/rdma/ibacm_opts.cfg +%{_bindir}/ib_acme +%{_sbindir}/ibacm +%{_mandir}/man1/ib_acme.* +%{_mandir}/man7/ibacm.* +%{_mandir}/man7/ibacm_prov.* +%{_mandir}/man8/ibacm.* +%{_unitdir}/ibacm.service +%{_unitdir}/ibacm.socket +%dir %{_libdir}/ibacm +%{_libdir}/ibacm/* +%{_sbindir}/rcibacm +%doc %{_docdir}/%{name}-%{version}/ibacm.md + +%files -n infiniband-diags +%defattr(-, root, root) +%dir %{_sysconfdir}/infiniband-diags +%config(noreplace) %{_sysconfdir}/infiniband-diags/* +%{_sbindir}/ibaddr +%{_mandir}/man8/ibaddr* +%{_sbindir}/ibnetdiscover +%{_mandir}/man8/ibnetdiscover* +%{_sbindir}/ibping +%{_mandir}/man8/ibping* +%{_sbindir}/ibportstate +%{_mandir}/man8/ibportstate* +%{_sbindir}/ibroute +%{_mandir}/man8/ibroute.* +%{_sbindir}/ibstat +%{_mandir}/man8/ibstat.* +%{_sbindir}/ibsysstat +%{_mandir}/man8/ibsysstat* +%{_sbindir}/ibtracert +%{_mandir}/man8/ibtracert* +%{_sbindir}/perfquery +%{_mandir}/man8/perfquery* +%{_sbindir}/sminfo +%{_mandir}/man8/sminfo* +%{_sbindir}/smpdump +%{_mandir}/man8/smpdump* +%{_sbindir}/smpquery +%{_mandir}/man8/smpquery* +%{_sbindir}/saquery +%{_mandir}/man8/saquery* +%{_sbindir}/vendstat +%{_mandir}/man8/vendstat* +%{_sbindir}/iblinkinfo +%{_mandir}/man8/iblinkinfo* +%{_sbindir}/ibqueryerrors +%{_mandir}/man8/ibqueryerrors* +%{_sbindir}/ibcacheedit +%{_mandir}/man8/ibcacheedit* 
+%{_sbindir}/ibccquery +%{_mandir}/man8/ibccquery* +%{_sbindir}/ibccconfig +%{_mandir}/man8/ibccconfig* +%{_sbindir}/dump_fts +%{_mandir}/man8/dump_fts* +%{_sbindir}/ibhosts +%{_mandir}/man8/ibhosts* +%{_sbindir}/ibswitches +%{_mandir}/man8/ibswitches* +%{_sbindir}/ibnodes +%{_mandir}/man8/ibnodes* +%{_sbindir}/ibrouters +%{_mandir}/man8/ibrouters* +%{_sbindir}/ibfindnodesusing.pl +%{_mandir}/man8/ibfindnodesusing* +%{_sbindir}/ibidsverify.pl +%{_mandir}/man8/ibidsverify* +%{_sbindir}/check_lft_balance.pl +%{_mandir}/man8/check_lft_balance* +%{_sbindir}/dump_lfts.sh +%{_mandir}/man8/dump_lfts* +%{_sbindir}/dump_mfts.sh +%{_mandir}/man8/dump_mfts* +%{_sbindir}/ibstatus +%{_mandir}/man8/ibstatus* +%{_mandir}/man8/infiniband-diags* +%{perl_vendorlib}/IBswcountlimits.pm + +%files -n iwpmd +%defattr(-,root,root) +%dir %{_sysconfdir}/rdma +%dir %{_sysconfdir}/rdma/modules +%{_sbindir}/iwpmd +%{_sbindir}/rciwpmd +%{_unitdir}/iwpmd.service +%config(noreplace) %{_sysconfdir}/rdma/modules/iwpmd.conf +%config(noreplace) %{_sysconfdir}/iwpmd.conf +%{_udevrulesdir}/90-iwpmd.rules +%{_mandir}/man8/iwpmd.* +%{_mandir}/man5/iwpmd.* + +%files -n %umad_lname +%defattr(-,root,root) +%{_libdir}/libibumad*.so.* + +%files -n %rdmacm_lname +%defattr(-,root,root) +%{_libdir}/librdmacm*.so.* +%doc %{_docdir}/%{name}-%{version}/librdmacm.md + +%files -n rsocket +%defattr(-,root,root) +%dir %{_libdir}/rsocket +%{_libdir}/rsocket/*.so* +%{_mandir}/man7/rsocket.* + +%files -n librdmacm-utils +%defattr(-,root,root) +%{_bindir}/cmtime +%{_bindir}/mckey +%{_bindir}/rcopy +%{_bindir}/rdma_client +%{_bindir}/rdma_server +%{_bindir}/rdma_xclient +%{_bindir}/rdma_xserver +%{_bindir}/riostream +%{_bindir}/rping +%{_bindir}/rstream +%{_bindir}/ucmatose +%{_bindir}/udaddy +%{_bindir}/udpong +%{_mandir}/man1/cmtime.* +%{_mandir}/man1/mckey.* +%{_mandir}/man1/rcopy.* +%{_mandir}/man1/rdma_client.* +%{_mandir}/man1/rdma_server.* +%{_mandir}/man1/rdma_xclient.* +%{_mandir}/man1/rdma_xserver.* +%{_mandir}/man1/riostream.* +%{_mandir}/man1/rping.* +%{_mandir}/man1/rstream.* +%{_mandir}/man1/ucmatose.* +%{_mandir}/man1/udaddy.* +%{_mandir}/man1/udpong.* + +%files -n srp_daemon +%defattr(-,root,root) +%dir %{_libexecdir}/srp_daemon +%dir %{_sysconfdir}/rdma +%dir %{_sysconfdir}/rdma/modules +%config(noreplace) %{_sysconfdir}/srp_daemon.conf +%config(noreplace) %{_sysconfdir}/rdma/modules/srp_daemon.conf +%{_udevrulesdir}/60-srp_daemon.rules +%{_libexecdir}/srp_daemon/start_on_all_ports +%{_unitdir}/srp_daemon.service +%{_unitdir}/srp_daemon_port@.service +%{_sbindir}/ibsrpdm +%{_sbindir}/srp_daemon +%{_sbindir}/run_srp_daemon +%{_sbindir}/rcsrp_daemon +%{_mandir}/man5/srp_daemon.service.5* +%{_mandir}/man5/srp_daemon_port@.service.5* +%{_mandir}/man8/ibsrpdm.8* +%{_mandir}/man8/srp_daemon.8* +%doc %{_docdir}/%{name}-%{version}/ibsrpdm.md + +%files -n rdma-ndd +%defattr(-, root, root) +%{_sbindir}/rdma-ndd +%{_sbindir}/rcrdma-ndd +%{_unitdir}/rdma-ndd.service +%{_mandir}/man8/rdma-ndd.8* +%{_udevrulesdir}/60-rdma-ndd.rules + +%if %{with_pyverbs} +%files -n python3-pyverbs +%{python3_sitearch}/pyverbs +%dir %{_docdir}/%{name}-%{version}/tests/ +%{_docdir}/%{name}-%{version}/tests/*.py +%endif + +%changelog diff --git a/suse/rdma.sriov-rules b/suse/rdma.sriov-rules new file mode 100644 index 0000000..722ffce --- /dev/null +++ b/suse/rdma.sriov-rules @@ -0,0 +1,7 @@ +ACTION=="remove", GOTO="rdma_sriov_end" +SUBSYSTEM!="infiniband", GOTO="rdma_sriov_end" + +# Automatically load general RDMA ULP modules when RDMA hardware is installed 
+TAG+="systemd", ENV{ID_RDMA_INFINIBAND}=="1", ENV{SYSTEMD_WANTS}+="rdma-sriov.service" + +LABEL="rdma_sriov_end" diff --git a/suse/rdma.sriov-service b/suse/rdma.sriov-service new file mode 100644 index 0000000..2b701b8 --- /dev/null +++ b/suse/rdma.sriov-service @@ -0,0 +1,16 @@ +[Unit] +Description=Initialize SRIOV for RDMA devices +RefuseManualStop=true +DefaultDependencies=false +Conflicts=emergency.target emergency.service +# Partially support distro network setup scripts that try to run after hardware is configured. +# Run them after any SRIOV devices have been created +Wants=network-pre.target +Before=network-pre.target +# RDMA is not ready until all SRIOV devices are created +Before=rdma-hw.target + +[Service] +Type=oneshot +RemainAfterExit=yes +ExecStart=/usr/lib/rdma-set-sriov-vf diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt new file mode 100644 index 0000000..d90c89e --- /dev/null +++ b/tests/CMakeLists.txt @@ -0,0 +1,32 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019, Mellanox Technologies. All rights reserved. See COPYING file + +rdma_python_test(tests + __init__.py + base.py + rdmacm_utils.py + test_addr.py + test_cq.py + test_cq_events.py + test_cqex.py + test_device.py + test_mlx5_pp.py + test_mlx5_var.py + test_mr.py + test_odp.py + test_pd.py + test_parent_domain.py + test_qp.py + test_qpex.py + test_rdmacm.py + test_relaxed_ordering.py + utils.py + ) + +rdma_python_test(tests + run_tests.py + ) + +rdma_internal_binary( + run_tests.py + ) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..a746e71 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019 Mellanox Technologies, Inc . All rights reserved. See COPYING file + +import importlib +import os + +# Load every test as a module in the system so that unittest's loader can find it +def _load_tests(): + res = [] + for fn in sorted(os.listdir(os.path.dirname(__file__))): + if fn.endswith(".py") and fn.startswith("test_"): + m = importlib.import_module("." + os.path.basename(fn)[:-3], __name__) + res.append(m) + return res +__test_modules__ = _load_tests() + +# unittest -v prints names like 'tests.test_foo', but it always starts +# searching from the tests module, adding the name 'tests.test' lets the user +# specify the same test name from logging on the command line to trivially run +# a single test. +tests = importlib.import_module(".", __name__) + +def load_tests(loader, standard_tests, pattern): + """Implement the loadTestsFromModule protocol""" + for mod in __test_modules__: + standard_tests.addTests(loader.loadTestsFromModule(mod, pattern)) + return standard_tests diff --git a/tests/base.py b/tests/base.py new file mode 100755 index 0000000..ece2443 --- /dev/null +++ b/tests/base.py @@ -0,0 +1,535 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019 Mellanox Technologies, Inc . All rights reserved. 
See COPYING file
+
+import unittest
+import tempfile
+import random
+import errno
+import stat
+import os
+
+from pyverbs.pyverbs_error import PyverbsRDMAError, PyverbsUserError
+from pyverbs.qp import QPCap, QPInitAttrEx, QPInitAttr, QPAttr, QP
+from pyverbs.cmid import CMID, AddrInfo, CMEventChannel, ConnParam
+from pyverbs.addr import AHAttr, GlobalRoute
+from pyverbs.xrcd import XRCD, XRCDInitAttr
+from pyverbs.srq import SRQ, SrqInitAttrEx
+from pyverbs.device import Context
+import pyverbs.cm_enums as ce
+import pyverbs.device as d
+import pyverbs.enums as e
+from pyverbs.pd import PD
+from pyverbs.cq import CQ
+from pyverbs.mr import MR
+
+
+PATH_MTU = e.IBV_MTU_1024
+MAX_DEST_RD_ATOMIC = 1
+MAX_RD_ATOMIC = 1
+MIN_RNR_TIMER = 12
+RETRY_CNT = 7
+RNR_RETRY = 7
+TIMEOUT = 14
+# Devices that don't support RoCEv2 should be added here
+MLNX_VENDOR_ID = 0x02c9
+CX3_MLNX_PART_ID = 4099
+CX3Pro_MLNX_PART_ID = 4103
+# Dictionary: vendor_id -> array of part_ids of devices that lack RoCEv2 support
+ROCEV2_UNSUPPORTED_DEVS = {MLNX_VENDOR_ID: [CX3Pro_MLNX_PART_ID,
+                                            CX3_MLNX_PART_ID]}
+
+
+def has_roce_hw_bug(vendor_id, vendor_part_id):
+    return vendor_part_id in ROCEV2_UNSUPPORTED_DEVS.get(vendor_id, [])
+
+
+class PyverbsAPITestCase(unittest.TestCase):
+    def setUp(self):
+        """
+        Opens the devices and queries them
+        """
+        lst = d.get_device_list()
+        self.devices = []
+        if len(lst) == 0:
+            raise unittest.SkipTest('No IB devices found')
+        for dev in lst:
+            c = d.Context(name=dev.name.decode())
+            attr = c.query_device()
+            attr_ex = c.query_device_ex()
+            self.devices.append((c, attr, attr_ex))
+
+    def tearDown(self):
+        for tup in self.devices:
+            tup[0].close()
+
+
+class RDMATestCase(unittest.TestCase):
+    """
+    A base class for test cases which provides the option for user parameters.
+    These can be provided by manually adding the test case to the runner:
+    suite = unittest.TestSuite()
+    ... # Regular auto-detection of test cases, no parameters used.
+    # Now follows your manual addition of test cases, e.g.:
+    suite.addTest(RDMATestCase.parametrize(<TestCaseName>, dev_name='..',
+                                           ib_port=1, gid_index=3,
+                                           pkey_index=42))
+    """
+    ZERO_GID = '0000:0000:0000:0000'
+
+    def __init__(self, methodName='runTest', dev_name=None, ib_port=None,
+                 gid_index=None, pkey_index=None):
+        super(RDMATestCase, self).__init__(methodName)
+        self.dev_name = dev_name
+        self.ib_port = ib_port
+        self.gid_index = gid_index
+        self.pkey_index = pkey_index
+
+    @staticmethod
+    def parametrize(testcase_klass, dev_name=None, ib_port=None, gid_index=None,
+                    pkey_index=None):
+        """
+        Create a test suite containing all the tests from the given subclass
+        with the given dev_name, port, gid index and pkey_index.
+        """
+        loader = unittest.TestLoader()
+        names = loader.getTestCaseNames(testcase_klass)
+        suite = unittest.TestSuite()
+        for n in names:
+            suite.addTest(testcase_klass(n, dev_name=dev_name, ib_port=ib_port,
+                                         gid_index=gid_index,
+                                         pkey_index=pkey_index))
+        return suite
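A runner sketch for the parametrization described in the class docstring, pinning one device, port and GID instead of the random pick made in setUp(). The device name below is a placeholder; CqExTestCase is one of the RDMATestCase subclasses this patch adds in tests/test_cqex.py:

```python
import unittest

from tests.base import RDMATestCase
from tests.test_cqex import CqExTestCase  # an RDMATestCase subclass from this patch

suite = unittest.TestSuite()
# Pin the whole case to one device/port/GID; 'mlx5_0' is a placeholder name.
suite.addTest(RDMATestCase.parametrize(CqExTestCase, dev_name='mlx5_0',
                                       ib_port=1, gid_index=3))
unittest.TextTestRunner(verbosity=2).run(suite)
```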
+
+    def setUp(self):
+        """
+        Verify that the test case has dev_name, ib_port, gid_index and
+        pkey_index. If not provided by the user, a random valid combination
+        will be used.
+        """
+        if self.pkey_index is None:
+            # To avoid iterating the entire pkeys table, if a pkey index wasn't
+            # provided, use index 0 which is always valid
+            self.pkey_index = 0
+
+        self.args = []
+        if self.dev_name is not None:
+            ctx = d.Context(name=self.dev_name)
+            if self.ib_port is not None:
+                if self.gid_index is not None:
+                    # We have all we need, return
+                    return
+                else:
+                    # Add available GIDs of the given dev_name + port
+                    self._add_gids_per_port(ctx, self.dev_name, self.ib_port)
+            else:
+                # Add available GIDs for each port of the given dev_name
+                self._add_gids_per_device(ctx, self.dev_name)
+        else:
+            # Iterate available devices, add available GIDs for each of
+            # their ports
+            lst = d.get_device_list()
+            for dev in lst:
+                dev_name = dev.name.decode()
+                ctx = d.Context(name=dev_name)
+                self._add_gids_per_device(ctx, dev_name)
+
+        if not self.args:
+            raise unittest.SkipTest('No port is up, can\'t run traffic')
+        # Choose one combination and use it
+        args = random.choice(self.args)
+        self.dev_name = args[0]
+        self.ib_port = args[1]
+        self.gid_index = args[2]
+
+    def _add_gids_per_port(self, ctx, dev, port):
+        # Don't add ports which are not active
+        port_attrs = ctx.query_port(port)
+        if port_attrs.state != e.IBV_PORT_ACTIVE:
+            return
+        dev_attrs = ctx.query_device()
+        vendor_id = dev_attrs.vendor_id
+        vendor_pid = dev_attrs.vendor_part_id
+        for idx in range(port_attrs.gid_tbl_len):
+            gid = ctx.query_gid(port, idx)
+            # Avoid adding ZERO GIDs
+            if gid.gid[-19:] == self.ZERO_GID:
+                continue
+            # Avoid RoCEv2 GIDs on unsupported devices
+            if port_attrs.link_layer == e.IBV_LINK_LAYER_ETHERNET and \
+                    ctx.query_gid_type(port, idx) == e.IBV_GID_TYPE_ROCE_V2 and \
+                    has_roce_hw_bug(vendor_id, vendor_pid):
+                continue
+            self.args.append([dev, port, idx])
+
+    def _add_gids_per_device(self, ctx, dev):
+        port_count = ctx.query_device().phys_port_cnt
+        for port in range(port_count):
+            self._add_gids_per_port(ctx, dev, port+1)
+
+
+class CMResources:
+    """
+    CMResources class is a base aggregator object which contains basic
+    resources for RDMA CM communication.
+    """
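A usage sketch mirroring the sync_traffic() flow further below (the address is a placeholder): the same class builds either side of the connection, keyed off whether dst is given.

```python
from tests.base import CMResources

ADDR = '192.168.10.1'  # placeholder address on an RDMA-capable interface

# Passive side: only src is given, so is_server is True; it will listen()
# and create a child CMID per incoming request.
server = CMResources(src=ADDR)
# Active side: dst is given, so the CMID is created ready to connect().
client = CMResources(src=None, dst=ADDR, port='7471')
```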
+    def __init__(self, **kwargs):
+        """
+        :param kwargs: Arguments:
+            * *src* (str)
+                Local address to bind to (for the passive side)
+            * *dst* (str)
+                Destination address to connect to (for the active side)
+            * *port* (str)
+                Port number of the address
+            * *is_async* (bool)
+                A flag which indicates if this is an asynchronous RDMACM
+            * *with_ext_qp* (bool)
+                If set, an external RC QP will be created and used by RDMACM
+        """
+        src = kwargs.get('src')
+        dst = kwargs.get('dst')
+        self.is_server = True if dst is None else False
+        self.qp_init_attr = None
+        self.is_async = kwargs.get('is_async', False)
+        self.with_ext_qp = kwargs.get('with_ext_qp', False)
+        self.connected = False
+        # When the passive side (server) listens for incoming connection
+        # requests, it creates a new cmid for each new request, which is used
+        # to establish the connection with the remote side
+        self.child_id = None
+        self.msg_size = 1024
+        self.num_msgs = 100
+        self.channel = None
+        self.cq = None
+        self.qp = None
+        self.port = kwargs.get('port') if kwargs.get('port') else '7471'
+        self.mr = None
+        if self.is_server:
+            self.ai = AddrInfo(src, None, self.port, ce.RDMA_PS_TCP,
+                               ce.RAI_PASSIVE)
+        else:
+            self.ai = AddrInfo(src, dst, self.port, ce.RDMA_PS_TCP)
+        if self.is_async:
+            self.create_event_channel()
+            self.cmid = CMID(creator=self.channel)
+        else:
+            self.cmid = CMID(creator=self.ai,
+                             qp_init_attr=self.create_qp_init_attr())
+
+    def create_mr(self):
+        if self.is_server:
+            self.mr = self.child_id.reg_msgs(self.msg_size)
+        else:
+            self.mr = self.cmid.reg_msgs(self.msg_size)
+
+    def create_event_channel(self):
+        self.channel = CMEventChannel()
+
+    @staticmethod
+    def create_qp_init_attr(rcq=None, scq=None):
+        return QPInitAttr(qp_type=e.IBV_QPT_RC, rcq=rcq, scq=scq,
+                          cap=QPCap(max_recv_wr=1))
+
+    @staticmethod
+    def create_conn_param(qp_num=0):
+        return ConnParam(qp_num=qp_num)
+
+    def create_child_id(self, cm_event=None):
+        if not self.is_server:
+            raise PyverbsUserError('create_child_id can be used only in passive side')
+        if self.is_async:
+            self.child_id = CMID(creator=cm_event, listen_id=self.cmid)
+        else:
+            self.child_id = self.cmid.get_request()
+
+    def create_qp(self):
+        """
+        Create an rdmacm QP. If self.with_ext_qp is set, then an external CQ
+        and RC QP will be created and set in self.cq and self.qp,
+        respectively.
+        """
+        cmid = self.child_id if self.is_server else self.cmid
+        if not self.with_ext_qp:
+            cmid.create_qp(self.create_qp_init_attr())
+        else:
+            self.cq = CQ(cmid.context, self.num_msgs, None, None, 0)
+            init_attr = self.create_qp_init_attr(rcq=self.cq, scq=self.cq)
+            self.qp = QP(cmid.pd, init_attr, QPAttr())
+
+    def modify_ext_qp_to_rts(self):
+        cmid = self.child_id if self.is_server else self.cmid
+        attr, mask = cmid.init_qp_attr(e.IBV_QPS_INIT)
+        self.qp.modify(attr, mask)
+        attr, mask = cmid.init_qp_attr(e.IBV_QPS_RTR)
+        self.qp.modify(attr, mask)
+        attr, mask = cmid.init_qp_attr(e.IBV_QPS_RTS)
+        self.qp.modify(attr, mask)
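The INIT to RTR to RTS walk in modify_ext_qp_to_rts() above is the standard pattern for driving an externally created QP with rdmacm: the CMID supplies the attribute/mask pair for each transition. The same idea, as a condensed sketch over a state list (cmid and qp are an established CMID and an external RC QP):

```python
import pyverbs.enums as e

def ext_qp_to_rts(cmid, qp):
    # Walk the QP through INIT -> RTR -> RTS; the CMID already knows the
    # path/peer details, so it can fill in each state's attributes.
    for state in (e.IBV_QPS_INIT, e.IBV_QPS_RTR, e.IBV_QPS_RTS):
        attr, mask = cmid.init_qp_attr(state)
        qp.modify(attr, mask)
```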
+
+
+class BaseResources(object):
+    """
+    BaseResources class is a base aggregator object which contains basic
+    resources like Context and PD. It opens a context over the given device
+    and port and allocates a PD.
+    """
+    def __init__(self, dev_name, ib_port, gid_index):
+        """
+        Initializes a BaseResources object.
+        :param dev_name: Device name to be used (default: 'ibp0s8f0')
+        :param ib_port: IB port of the device to use (default: 1)
+        :param gid_index: Which GID index to use (default: 0)
+        """
+        self.ctx = Context(name=dev_name)
+        self.gid_index = gid_index
+        self.pd = PD(self.ctx)
+        self.ib_port = ib_port
+
+
+class TrafficResources(BaseResources):
+    """
+    Basic traffic class. It provides the basic RDMA resources and operations
+    needed for traffic.
+    """
+    def __init__(self, dev_name, ib_port, gid_index):
+        """
+        Initializes a TrafficResources object with the given values and creates
+        basic RDMA resources.
+        :param dev_name: Device name to be used
+        :param ib_port: IB port of the device to use
+        :param gid_index: Which GID index to use
+        """
+        super(TrafficResources, self).__init__(dev_name=dev_name,
+                                               ib_port=ib_port,
+                                               gid_index=gid_index)
+        self.psn = random.getrandbits(24)
+        self.msg_size = 1024
+        self.num_msgs = 1000
+        self.port_attr = None
+        self.mr = None
+        self.cq = None
+        self.qp = None
+        self.rqpn = 0
+        self.rpsn = 0
+        self.init_resources()
+
+    @property
+    def qpn(self):
+        return self.qp.qp_num
+
+    def init_resources(self):
+        """
+        Initializes a CQ, MR and an RC QP.
+        :return: None
+        """
+        self.port_attr = self.ctx.query_port(self.ib_port)
+        self.create_cq()
+        self.create_mr()
+        self.create_qp()
+
+    def create_cq(self):
+        """
+        Initializes self.cq with a CQ of depth <num_msgs> - defined by each
+        test.
+        :return: None
+        """
+        self.cq = CQ(self.ctx, self.num_msgs, None, None, 0)
+
+    def create_mr(self):
+        """
+        Initializes self.mr with an MR of length <msg_size> - defined by each
+        test.
+        :return: None
+        """
+        self.mr = MR(self.pd, self.msg_size, e.IBV_ACCESS_LOCAL_WRITE)
+
+    def create_qp(self):
+        """
+        Initializes self.qp with an RC QP.
+        :return: None
+        """
+        qp_caps = QPCap(max_recv_wr=self.num_msgs)
+        qp_init_attr = QPInitAttr(qp_type=e.IBV_QPT_RC, scq=self.cq,
+                                  rcq=self.cq, cap=qp_caps)
+        qp_attr = QPAttr(port_num=self.ib_port)
+        self.qp = QP(self.pd, qp_init_attr, qp_attr)
+
+    def pre_run(self, rpsn, rqpn):
+        """
+        Modify the QP's state to RTS and fill the receive queue with
+        <num_msgs> work requests.
+        This method is not implemented in this class.
+        :param rpsn: Remote PSN
+        :param rqpn: Remote QPN
+        :return: None
+        """
+        raise NotImplementedError()
+
+
+class RCResources(TrafficResources):
+
+    def to_rts(self):
+        """
+        Set the QP attributes' values to arbitrary values (same values used in
+        ibv_rc_pingpong).
+ :return: None + """ + attr = QPAttr(port_num=self.ib_port) + attr.dest_qp_num = self.rqpn + attr.path_mtu = PATH_MTU + attr.max_dest_rd_atomic = MAX_DEST_RD_ATOMIC + attr.min_rnr_timer = MIN_RNR_TIMER + attr.rq_psn = self.psn + attr.sq_psn = self.rpsn + attr.timeout = TIMEOUT + attr.retry_cnt = RETRY_CNT + attr.rnr_retry = RNR_RETRY + attr.max_rd_atomic = MAX_RD_ATOMIC + gr = GlobalRoute(dgid=self.ctx.query_gid(self.ib_port, self.gid_index), + sgid_index=self.gid_index) + ah_attr = AHAttr(port_num=self.ib_port, is_global=1, gr=gr, + dlid=self.port_attr.lid) + attr.ah_attr = ah_attr + self.qp.to_rts(attr) + + def pre_run(self, rpsn, rqpn): + """ + Configure Resources before running traffic + :param rpsn: Remote PSN (packet serial number) + :param rqpn: Remote QP number + :return: None + """ + self.rqpn = rqpn + self.rpsn = rpsn + self.to_rts() + + +class UDResources(TrafficResources): + UD_QKEY = 0x11111111 + UD_PKEY_INDEX = 0 + GRH_SIZE = 40 + + def create_mr(self): + self.mr = MR(self.pd, self.msg_size + self.GRH_SIZE, + e.IBV_ACCESS_LOCAL_WRITE) + + def create_qp(self): + qp_caps = QPCap(max_recv_wr=self.num_msgs) + qp_init_attr = QPInitAttr(qp_type=e.IBV_QPT_UD, cap=qp_caps, + scq=self.cq, rcq=self.cq) + qp_attr = QPAttr(port_num=self.ib_port) + qp_attr.qkey = self.UD_QKEY + qp_attr.pkey_index = self.UD_PKEY_INDEX + self.qp = QP(self.pd, qp_init_attr, qp_attr) + + def pre_run(self, rpsn, rqpn): + self.rqpn = rqpn + self.rpsn = rpsn + + +class XRCResources(TrafficResources): + def __init__(self, dev_name, ib_port, gid_index, qp_count=2): + self.temp_file = None + self.xrcd_fd = -1 + self.xrcd = None + self.srq = None + self.qp_count = qp_count + self.sqp_lst = [] + self.rqp_lst = [] + self.qps_num = [] + self.psns = [] + self.rqps_num = None + self.rpsns = None + super(XRCResources, self).__init__(dev_name, ib_port, gid_index) + + def close(self): + os.close(self.xrcd_fd) + self.temp_file.close() + + def create_qp(self): + """ + Initializes self.qp with an XRC SEND/RECV QP. + :return: None + """ + qp_attr = QPAttr(port_num=self.ib_port) + qp_attr.pkey_index = 0 + + for _ in range(self.qp_count): + attr_ex = QPInitAttrEx(qp_type=e.IBV_QPT_XRC_RECV, + comp_mask=e.IBV_QP_INIT_ATTR_XRCD, + xrcd=self.xrcd) + qp_attr.qp_access_flags = e.IBV_ACCESS_REMOTE_WRITE | \ + e.IBV_ACCESS_REMOTE_READ + recv_qp = QP(self.ctx, attr_ex, qp_attr) + self.rqp_lst.append(recv_qp) + + qp_caps = QPCap(max_send_wr=self.num_msgs, max_recv_sge=0, + max_recv_wr=0) + attr_ex = QPInitAttrEx(qp_type=e.IBV_QPT_XRC_SEND, sq_sig_all=1, + comp_mask=e.IBV_QP_INIT_ATTR_PD, + pd=self.pd, scq=self.cq, cap=qp_caps) + qp_attr.qp_access_flags = 0 + send_qp =QP(self.ctx, attr_ex, qp_attr) + self.sqp_lst.append(send_qp) + self.qps_num.append((recv_qp.qp_num, send_qp.qp_num)) + self.psns.append(random.getrandbits(24)) + + def create_xrcd(self): + """ + Initializes self.xrcd with an XRC Domain object. + :return: None + """ + self.temp_file = tempfile.NamedTemporaryFile() + self.xrcd_fd = os.open(self.temp_file.name, os.O_RDONLY | os.O_CREAT, + stat.S_IRUSR | stat.S_IRGRP) + init = XRCDInitAttr( + e.IBV_XRCD_INIT_ATTR_FD | e.IBV_XRCD_INIT_ATTR_OFLAGS, + os.O_CREAT, self.xrcd_fd) + try: + self.xrcd = XRCD(self.ctx, init) + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('Create XRCD is not supported') + raise ex + + def create_srq(self): + """ + Initializes self.srq with a Shared Receive QP object. 
+ :return: None + """ + srq_attr = SrqInitAttrEx(max_wr=self.qp_count*self.num_msgs) + srq_attr.srq_type = e.IBV_SRQT_XRC + srq_attr.pd = self.pd + srq_attr.xrcd = self.xrcd + srq_attr.cq = self.cq + srq_attr.comp_mask = e.IBV_SRQ_INIT_ATTR_TYPE | e.IBV_SRQ_INIT_ATTR_PD | \ + e.IBV_SRQ_INIT_ATTR_CQ | e.IBV_SRQ_INIT_ATTR_XRCD + self.srq = SRQ(self.ctx, srq_attr) + + def to_rts(self): + gid = self.ctx.query_gid(self.ib_port, self.gid_index) + gr = GlobalRoute(dgid=gid, sgid_index=self.gid_index) + ah_attr = AHAttr(port_num=self.ib_port, is_global=True, + gr=gr, dlid=self.port_attr.lid) + qp_attr = QPAttr() + qp_attr.path_mtu = PATH_MTU + qp_attr.timeout = TIMEOUT + qp_attr.retry_cnt = RETRY_CNT + qp_attr.rnr_retry = RNR_RETRY + qp_attr.min_rnr_timer = MIN_RNR_TIMER + qp_attr.ah_attr = ah_attr + for i in range(self.qp_count): + qp_attr.dest_qp_num = self.rqps_num[i][1] + qp_attr.rq_psn = self.psns[i] + qp_attr.sq_psn = self.rpsns[i] + self.rqp_lst[i].to_rts(qp_attr) + qp_attr.dest_qp_num = self.rqps_num[i][0] + self.sqp_lst[i].to_rts(qp_attr) + + def init_resources(self): + self.create_xrcd() + super(XRCResources, self).init_resources() + self.create_srq() + + def pre_run(self, rpsns, rqps_num): + self.rqps_num = rqps_num + self.rpsns = rpsns + self.to_rts() diff --git a/tests/rdmacm_utils.py b/tests/rdmacm_utils.py new file mode 100755 index 0000000..c71bab1 --- /dev/null +++ b/tests/rdmacm_utils.py @@ -0,0 +1,237 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. See COPYING file +""" +Provide some useful helper function for pyverbs rdmacm' tests. +""" +from tests.utils import validate, poll_cq, get_send_element, get_recv_wr +from pyverbs.pyverbs_error import PyverbsError +from tests.base import CMResources +from pyverbs.cmid import CMEvent +import pyverbs.cm_enums as ce +import os + +events_dict = {ce.RDMA_CM_EVENT_ADDR_ERROR: 'Resolve Address Error', + ce.RDMA_CM_EVENT_ROUTE_ERROR: 'Resolve Route Error', + ce.RDMA_CM_EVENT_CONNECT_ERROR: 'Connection Error', + ce.RDMA_CM_EVENT_UNREACHABLE: 'Node is Unreachable', + ce.RDMA_CM_EVENT_REJECTED: 'Connection Rejected', + ce.RDMA_CM_EVENT_DEVICE_REMOVAL: 'Device Removal', + ce.RDMA_CM_EVENT_MULTICAST_JOIN: 'Multicast Join', + ce.RDMA_CM_EVENT_MULTICAST_ERROR: 'Multicast Error', + ce.RDMA_CM_EVENT_ADDR_CHANGE: 'Address Change', + ce.RDMA_CM_EVENT_TIMEWAIT_EXIT: 'Time wait Exit'} + + +def _server_traffic_with_ext_qp(agr_obj, syncer): + recv_wr = get_recv_wr(agr_obj) + agr_obj.qp.post_recv(recv_wr) + syncer.wait() + for _ in range(agr_obj.num_msgs): + poll_cq(agr_obj.cq) + agr_obj.qp.post_recv(recv_wr) + msg_received = agr_obj.mr.read(agr_obj.msg_size, 0) + validate(msg_received, agr_obj.is_server, agr_obj.msg_size) + send_wr = get_send_element(agr_obj, agr_obj.is_server)[0] + agr_obj.qp.post_send(send_wr) + poll_cq(agr_obj.cq) + + +def server_traffic(agr_obj, syncer): + """ + RDMACM passive side traffic function which sends and receives a message, and + then validates the received message. This operation is executed + <agr_obj.num_msgs> times. If agr_obj.with_ext_qp is set, the traffic will + use the external QP (agr_obj.qp). 
+ :param agr_obj: Aggregation object which contains all necessary resources + :param syncer: multiprocessing.Barrier object for processes synchronization + :return: None + """ + if agr_obj.with_ext_qp: + return _server_traffic_with_ext_qp(agr_obj, syncer) + send_msg = agr_obj.msg_size * 's' + cmid = agr_obj.child_id + for _ in range(agr_obj.num_msgs): + cmid.post_recv(agr_obj.mr) + syncer.wait() + syncer.wait() + cmid.get_recv_comp() + msg_received = agr_obj.mr.read(agr_obj.msg_size, 0) + validate(msg_received, agr_obj.is_server, agr_obj.msg_size) + agr_obj.mr.write(send_msg, agr_obj.msg_size) + cmid.post_send(agr_obj.mr) + cmid.get_send_comp() + syncer.wait() + + +def _client_traffic_with_ext_qp(agr_obj, syncer): + recv_wr = get_recv_wr(agr_obj) + syncer.wait() + for _ in range(agr_obj.num_msgs): + send_wr = get_send_element(agr_obj, agr_obj.is_server)[0] + agr_obj.qp.post_send(send_wr) + poll_cq(agr_obj.cq) + agr_obj.qp.post_recv(recv_wr) + poll_cq(agr_obj.cq) + msg_received = agr_obj.mr.read(agr_obj.msg_size, 0) + validate(msg_received, agr_obj.is_server, agr_obj.msg_size) + + +def client_traffic(agr_obj, syncer): + """ + RDMACM active side traffic function which sends and receives a message, and + then validates the received message. This operation is executed + <agr_obj.num_msgs> times. If agr_obj.with_ext_qp is set, the traffic will + use the external QP (agr_obj.qp). + :param agr_obj: Aggregation object which contains all necessary resources + :param syncer: multiprocessing.Barrier object for processes synchronization + :return: None + """ + if agr_obj.with_ext_qp: + return _client_traffic_with_ext_qp(agr_obj, syncer) + send_msg = agr_obj.msg_size * 'c' + cmid = agr_obj.cmid + for _ in range(agr_obj.num_msgs): + agr_obj.mr.write(send_msg, agr_obj.msg_size) + syncer.wait() + cmid.post_send(agr_obj.mr) + cmid.get_send_comp() + syncer.wait() + cmid.post_recv(agr_obj.mr) + syncer.wait() + cmid.get_recv_comp() + msg_received = agr_obj.mr.read(agr_obj.msg_size, 0) + validate(msg_received, agr_obj.is_server, agr_obj.msg_size) + + +def event_handler(agr_obj): + """ + Handle and execute corresponding API for RDMACM events of asynchronous + communication + :param agr_obj: Aggregation object which contains all necessary resources + :return: None + """ + cm_event = CMEvent(agr_obj.cmid.event_channel) + if cm_event.event_type == ce.RDMA_CM_EVENT_ADDR_RESOLVED: + agr_obj.cmid.resolve_route() + elif cm_event.event_type == ce.RDMA_CM_EVENT_ROUTE_RESOLVED: + agr_obj.create_qp() + param = agr_obj.create_conn_param() + if agr_obj.with_ext_qp: + param.qpn = agr_obj.qp.qp_num + agr_obj.cmid.connect(param) + elif cm_event.event_type == ce.RDMA_CM_EVENT_CONNECT_REQUEST: + agr_obj.create_child_id(cm_event) + param = agr_obj.create_conn_param() + agr_obj.create_qp() + if agr_obj.with_ext_qp: + agr_obj.modify_ext_qp_to_rts() + param.qpn = agr_obj.qp.qp_num + agr_obj.child_id.accept(param) + elif cm_event.event_type == ce.RDMA_CM_EVENT_ESTABLISHED: + agr_obj.connected = True + elif cm_event.event_type == ce.RDMA_CM_EVENT_CONNECT_RESPONSE: + agr_obj.connected = True + if agr_obj.with_ext_qp: + agr_obj.modify_ext_qp_to_rts() + agr_obj.cmid.establish() + elif cm_event.event_type == ce.RDMA_CM_EVENT_DISCONNECTED: + if agr_obj.is_server: + agr_obj.child_id.disconnect() + agr_obj.connected = False + else: + agr_obj.cmid.disconnect() + agr_obj.connected = False + else: + if cm_event.event_type in events_dict: + raise PyverbsError('Unexpected event - {}'.format( + events_dict[cm_event.event_type])) + else: + raise 
PyverbsError('The event {} is not supported'.format( + cm_event.event_type)) + cm_event.ack_cm_event() + + +def sync_traffic(addr, syncer, notifier, is_server): + """ + RDMACM synchronous data and control path which first establish a connection + using RDMACM's synchronous API and then execute RDMACM synchronous traffic. + :param addr: Address to connect to and to bind to + :param syncer: multiprocessing.Barrier object for processes synchronization + :param notifier: Notify parent process about any exceptions or success + :param is_server: A flag which indicates if this is a server or client + :return: None + """ + try: + if is_server: + server = CMResources(src=addr) + server.cmid.listen() + syncer.wait() + server.create_child_id() + server.child_id.accept() + server.create_mr() + server_traffic(server, syncer) + server.child_id.disconnect() + else: + client = CMResources(dst=addr) + syncer.wait() + client.cmid.connect() + client.create_mr() + client_traffic(client, syncer) + client.cmid.disconnect() + except Exception as ex: + side = 'passive' if is_server else 'active' + notifier.put('Caught exception in {side} side process: pid {pid}\n' + .format(side=side, pid=os.getpid()) + + 'Exception message: {ex}'.format(ex=str(ex))) + else: + notifier.put(None) + + +def async_traffic_with_ext_qp(addr, syncer, notifier, is_server): + return async_traffic(addr, syncer, notifier, is_server, True) + + +def async_traffic(addr, syncer, notifier, is_server, with_ext_qp=False): + """ + RDMACM asynchronous data and control path function that first establishes a + connection using RDMACM events API and then executes RDMACM asynchronous + traffic. + :param addr: Address to connect to and to bind to + :param syncer: multiprocessing.Barrier object for processes synchronization + :param notifier: Notify parent process about any exceptions or success + :param is_server: A flag which indicates if this is a server or not + :param with_ext_qp: If set, an external RC QP will be created and used by + RDMACM (default: False) + :return: None + """ + try: + if is_server: + server = CMResources(src=addr, is_async=True, + with_ext_qp=with_ext_qp) + listen_id = server.cmid + listen_id.bind_addr(server.ai) + listen_id.listen() + syncer.wait() + while not server.connected: + event_handler(server) + server.create_mr() + server_traffic(server, syncer) + server.child_id.disconnect() + else: + client = CMResources(src=addr, dst=addr, is_async=True, + with_ext_qp=with_ext_qp) + id = client.cmid + id.resolve_addr(client.ai) + syncer.wait() + while not client.connected: + event_handler(client) + client.create_mr() + client_traffic(client, syncer) + event_handler(client) + except Exception as ex: + side = 'passive' if is_server else 'active' + notifier.put('Caught exception in {side} side process: pid {pid}\n' + .format(side=side, pid=os.getpid()) + + 'Exception message: {ex}'.format(ex=str(ex))) + else: + notifier.put(None) diff --git a/tests/run_tests.py b/tests/run_tests.py new file mode 100755 index 0000000..9e2e5d2 --- /dev/null +++ b/tests/run_tests.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2018, Mellanox Technologies. All rights reserved. 
See COPYING file + +import unittest +import os +from importlib.machinery import SourceFileLoader + + +module_path = os.path.join(os.path.dirname(__file__), '__init__.py') +tests = SourceFileLoader('tests', module_path).load_module() +unittest.main(module=tests) diff --git a/tests/test_addr.py b/tests/test_addr.py new file mode 100644 index 0000000..3789606 --- /dev/null +++ b/tests/test_addr.py @@ -0,0 +1,106 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. See COPYING file + +import unittest +import errno + +from pyverbs.pyverbs_error import PyverbsError, PyverbsRDMAError +from pyverbs.addr import GlobalRoute, AHAttr, AH +from tests.base import PyverbsAPITestCase +import pyverbs.enums as e +from pyverbs.pd import PD + + +class AHTest(PyverbsAPITestCase): + """ + Test various functionalities of the AH class. + """ + def test_create_ah(self): + """ + Test ibv_create_ah. + """ + done = 0 + for ctx, attr, attr_ex in self.devices: + pd = PD(ctx) + for port_num in range(1, 1 + attr.phys_port_cnt): + state = ctx.query_port(port_num).state + if state != e.IBV_PORT_ACTIVE and state != e.IBV_PORT_INIT: + continue + gr = get_global_route(ctx, port_num=port_num) + ah_attr = AHAttr(gr=gr, is_global=1, port_num=port_num) + try: + with AH(pd, attr=ah_attr): + done += 1 + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('Create AH is not supported') + raise ex + if done == 0: + raise unittest.SkipTest('No port is up, can\'t create AH') + # TODO: Test ibv_create_ah_from_wc once we have traffic + + def test_create_ah_roce(self): + """ + Verify that AH can't be created without GRH in RoCE + """ + done = 0 + for ctx, attr, attr_ex in self.devices: + pd = PD(ctx) + for port_num in range(1, 1 + attr.phys_port_cnt): + port_attr = ctx.query_port(port_num) + if port_attr.state != e.IBV_PORT_ACTIVE and \ + port_attr.state != e.IBV_PORT_INIT: + continue + if port_attr.link_layer == e.IBV_LINK_LAYER_INFINIBAND: + raise unittest.SkipTest('Can\'t run RoCE tests on IB link layer') + ah_attr = AHAttr(is_global=0, port_num=port_num) + try: + ah = AH(pd, attr=ah_attr) + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('Create AH is not supported') + assert 'Failed to create AH' in str(ex) + done +=1 + else: + raise PyverbsError('Created a non-global AH on RoCE') + if done == 0: + raise unittest.SkipTest('No port is up, can\'t create AH') + + def test_destroy_ah(self): + """ + Test ibv_destroy_ah. + """ + done = 0 + for ctx, attr, attr_ex in self.devices: + pd = PD(ctx) + for port_num in range(1, 1 + attr.phys_port_cnt): + state = ctx.query_port(port_num).state + if state != e.IBV_PORT_ACTIVE and state != e.IBV_PORT_INIT: + continue + gr = get_global_route(ctx) + ah_attr = AHAttr(gr=gr, is_global=1, port_num=port_num) + try: + with AH(pd, attr=ah_attr) as ah: + ah.close() + done += 1 + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('Create AH is not supported') + raise ex + if done == 0: + raise unittest.SkipTest('No port is up, can\'t create AH') + + +def get_global_route(ctx, gid_index=0, port_num=1): + """ + Queries the provided Context's gid <gid_index> and creates a GlobalRoute + object with sgid_index <gid_index> and the queried GID as dgid. + :param ctx: Context object to query + :param gid_index: GID index to query and use. 
Default: 0, as it's always + valid + :param port_num: Number of the port to query. Default: 1 + :return: GlobalRoute object + """ + gid = ctx.query_gid(port_num, gid_index) + gr = GlobalRoute(dgid=gid, sgid_index=gid_index) + return gr diff --git a/tests/test_cq.py b/tests/test_cq.py new file mode 100644 index 0000000..4a207fa --- /dev/null +++ b/tests/test_cq.py @@ -0,0 +1,234 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. See COPYING file +""" +Test module for pyverbs' cq module. +""" +import random + +from pyverbs.pyverbs_error import PyverbsError, PyverbsRDMAError +from pyverbs.cq import CompChannel, CQ, CqInitAttrEx, CQEX +from tests.base import PyverbsAPITestCase +import pyverbs.enums as e +import unittest +import errno + + +class CQTest(PyverbsAPITestCase): + """ + Test various functionalities of the CQ class. + """ + def test_create_cq(self): + """ + Test ibv_create_cq() + """ + for ctx, attr, attr_ex in self.devices: + for i in range(10): + cqes = get_num_cqes(attr) + comp_vector = int(ctx.num_comp_vectors * random.random()) + if random.choice([True, False]): + with CompChannel(ctx) as cc: + with CQ(ctx, cqes, None, cc, comp_vector): + pass + else: + with CQ(ctx, cqes, None, None, comp_vector): + pass + + def test_create_cq_bad_flow(self): + """ + Test ibv_create_cq() with a wrong comp_vector / cqe number + """ + for ctx, attr, attr_ex in self.devices: + for i in range(10): + cc = CompChannel(ctx) + cqes = 100 + comp_vector = ctx.num_comp_vectors + int(100 * + random.random()) + has_cc = random.choice([True, False]) + if not has_cc: + cc = None + try: + with CQ(ctx, cqes, None, cc, comp_vector): + pass + except PyverbsError as ex: + assert 'Failed to create a CQ' in ex.args[0] + assert 'Invalid argument' in ex.args[0] + else: + raise PyverbsError( + 'Created a CQ with comp_vector={n} while device\'s num_comp_vectors={nc}'. + format(n=comp_vector, nc=ctx.num_comp_vectors)) + max_cqe = ctx.query_device().max_cqe + cqes = random.randint(max_cqe + 1, max_cqe + 100) + try: + with CQ(ctx, cqes, None, cc, 0): + pass + except PyverbsError as ex: + assert 'Failed to create a CQ' in ex.args[0] + assert 'Invalid argument' in ex.args[0] + else: + raise PyverbsError( + 'Created a CQ with cqe={n} while device\'s max_cqe={nc}'. + format(n=cqes, nc=max_cqe)) + + def test_destroy_cq(self): + """ + Test ibv_destroy_cq() + """ + for ctx, attr, attr_ex in self.devices: + for i in range(10): + cqes = get_num_cqes(attr) + comp_vector = int(ctx.num_comp_vectors * random.random()) + if random.choice([True, False]): + with CompChannel(ctx) as cc: + cq = CQ(ctx, cqes, None, cc, comp_vector) + else: + cq = CQ(ctx, cqes, None, None, comp_vector) + cq.close() + + +class CCTest(PyverbsAPITestCase): + """ + Test various functionalities of the Completion Channel class. + """ + def test_create_comp_channel(self): + """ + Test ibv_create_comp_channel() + """ + for ctx, attr, attr_ex in self.devices: + with CompChannel(ctx): + pass + + def test_destroy_comp_channel(self): + """ + Test ibv_destroy_comp_channel() + """ + for ctx, attr, attr_ex in self.devices: + cc = CompChannel(ctx) + cc.close() + + +class CQEXTest(PyverbsAPITestCase): + """ + Test various functionalities of the CQEX class. 
+    """
+    def test_create_cq_ex(self):
+        """
+        Test ibv_create_cq_ex()
+        """
+        for ctx, attr, attr_ex in self.devices:
+            cqe = get_num_cqes(attr)
+            cq_init_attrs_ex = CqInitAttrEx(cqe=cqe, wc_flags=0, comp_mask=0, flags=0)
+            wc_flags = get_cq_flags_with_caps()
+            if attr_ex.raw_packet_caps & e.IBV_RAW_PACKET_CAP_CVLAN_STRIPPING == 0:
+                wc_flags.remove(e.IBV_WC_EX_WITH_CVLAN)
+            for f in wc_flags:
+                cq_init_attrs_ex.wc_flags = f
+                with CQEX(ctx, cq_init_attrs_ex):
+                    pass
+            # For the wc_flags that have no capability bit, we're not raising
+            # an exception for EOPNOTSUPP
+            wc_flags = get_cq_flags_with_no_caps()
+            for f in wc_flags:
+                cq_init_attrs_ex.wc_flags = f
+                try:
+                    with CQEX(ctx, cq_init_attrs_ex):
+                        pass
+                except PyverbsError as ex:
+                    assert 'Failed to create extended CQ' in ex.args[0]
+                    assert ' Errno: 95' in ex.args[0]
+            cq_init_attrs_ex.wc_flags = 0
+            cq_init_attrs_ex.comp_mask = e.IBV_CQ_INIT_ATTR_MASK_FLAGS
+            attr_flags = list(e.ibv_create_cq_attr_flags)
+            for f in attr_flags:
+                cq_init_attrs_ex.flags = f
+                try:
+                    with CQEX(ctx, cq_init_attrs_ex):
+                        pass
+                except PyverbsError as ex:
+                    assert 'Failed to create extended CQ' in ex.args[0]
+                    assert ' Errno: 95' in ex.args[0]
+
+    def test_create_cq_ex_bad_flow(self):
+        """
+        Test ibv_create_cq_ex() with wrong comp_vector / number of cqes
+        """
+        for ctx, attr, attr_ex in self.devices:
+            for i in range(10):
+                cq_attrs_ex = CqInitAttrEx(cqe=0, wc_flags=0, comp_mask=0, flags=0)
+                max_cqe = attr.max_cqe
+                cq_attrs_ex.cqe = max_cqe + 1 + int(100 * random.random())
+                try:
+                    CQEX(ctx, cq_attrs_ex)
+                except PyverbsRDMAError as ex:
+                    if ex.error_code == errno.EOPNOTSUPP:
+                        raise unittest.SkipTest('Create extended CQ is not supported')
+                    assert 'Failed to create extended CQ' in ex.args[0]
+                    assert ' Errno: 22' in ex.args[0]
+                else:
+                    raise PyverbsError(
+                        'Created a CQEX with {c} CQEs while device\'s max CQE={dc}'.
+                        format(c=cq_attrs_ex.cqe, dc=max_cqe))
+                comp_channel = random.randint(ctx.num_comp_vectors, 100)
+                cq_attrs_ex.comp_vector = comp_channel
+                cq_attrs_ex.cqe = get_num_cqes(attr)
+                try:
+                    CQEX(ctx, cq_attrs_ex)
+                except PyverbsRDMAError as ex:
+                    if ex.error_code == errno.EOPNOTSUPP:
+                        raise unittest.SkipTest('Create extended CQ is not supported')
+                    assert 'Failed to create extended CQ' in ex.args[0]
+                    assert ' Errno: 22' in ex.args[0]
+                else:
+                    raise PyverbsError(
+                        'Created a CQEX with comp_vector={c} while device\'s num_comp_vectors={dc}'.
+                        format(c=comp_channel, dc=ctx.num_comp_vectors))
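The pattern these tests exercise, requesting one optional completion field at a time via wc_flags, is also how an application probes for extra completion data. A standalone sketch under stated assumptions (first available device; it must support completion timestamps, otherwise creation fails with EOPNOTSUPP exactly as the bad-flow tests above expect):

```python
from pyverbs.cq import CqInitAttrEx, CQEX
import pyverbs.device as d
import pyverbs.enums as e

# Open the first device found (assumes at least one RDMA device is present).
ctx = d.Context(name=d.get_device_list()[0].name.decode())
# Ask the extended CQ to report raw completion timestamps.
cia = CqInitAttrEx(cqe=16, wc_flags=e.IBV_WC_EX_WITH_COMPLETION_TIMESTAMP)
with CQEX(ctx, cia) as cq:
    pass  # attach a QP to `cq`, post work, then read completions from it
```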
+
+    def test_destroy_cq_ex(self):
+        """
+        Test ibv_destroy_cq() for extended CQs
+        """
+        for ctx, attr, attr_ex in self.devices:
+            cqe = get_num_cqes(attr)
+            cq_init_attrs_ex = CqInitAttrEx(cqe=cqe, wc_flags=0, comp_mask=0, flags=0)
+            wc_flags = get_cq_flags_with_caps()
+            if attr_ex.raw_packet_caps & e.IBV_RAW_PACKET_CAP_CVLAN_STRIPPING == 0:
+                wc_flags.remove(e.IBV_WC_EX_WITH_CVLAN)
+            for f in wc_flags:
+                cq_init_attrs_ex.wc_flags = f
+                with CQEX(ctx, cq_init_attrs_ex) as cq:
+                    cq.close()
+            # For the wc_flags that have no capability bit, we're not raising
+            # an exception for EOPNOTSUPP
+            wc_flags = get_cq_flags_with_no_caps()
+            for f in wc_flags:
+                cq_init_attrs_ex.wc_flags = f
+                try:
+                    with CQEX(ctx, cq_init_attrs_ex) as cq:
+                        cq.close()
+                except PyverbsError as ex:
+                    assert 'Failed to create extended CQ' in ex.args[0]
+                    assert ' Errno: 95' in ex.args[0]
+            cq_init_attrs_ex.wc_flags = 0
+            cq_init_attrs_ex.comp_mask = e.IBV_CQ_INIT_ATTR_MASK_FLAGS
+            attr_flags = list(e.ibv_create_cq_attr_flags)
+            for f in attr_flags:
+                cq_init_attrs_ex.flags = f
+                try:
+                    with CQEX(ctx, cq_init_attrs_ex) as cq:
+                        cq.close()
+                except PyverbsError as ex:
+                    assert 'Failed to create extended CQ' in ex.args[0]
+                    assert ' Errno: 95' in ex.args[0]
+
+def get_num_cqes(attr):
+    max_cqe = attr.max_cqe
+    return int((max_cqe + 1) * random.random())
+
+
+def get_cq_flags_with_no_caps():
+    wc_flags = list(e.ibv_create_cq_wc_flags)
+    wc_flags.remove(e.IBV_WC_EX_WITH_CVLAN)
+    return wc_flags
+
+
+def get_cq_flags_with_caps():
+    return [e.IBV_WC_EX_WITH_CVLAN]
diff --git a/tests/test_cq_events.py b/tests/test_cq_events.py
new file mode 100644
index 0000000..bcb3f7d
--- /dev/null
+++ b/tests/test_cq_events.py
@@ -0,0 +1,45 @@
+from tests.base import RCResources, UDResources
+from tests.base import RDMATestCase
+from tests.utils import traffic
+
+from pyverbs.cq import CQ, CompChannel
+
+
+def create_cq_with_comp_channel(agr_obj):
+    agr_obj.comp_channel = CompChannel(agr_obj.ctx)
+    agr_obj.cq = CQ(agr_obj.ctx, agr_obj.num_msgs, None, agr_obj.comp_channel)
+    agr_obj.cq.req_notify()
+
+
+class CqEventsUD(UDResources):
+    def create_cq(self):
+        create_cq_with_comp_channel(self)
+
+
+class CqEventsRC(RCResources):
+    def create_cq(self):
+        create_cq_with_comp_channel(self)
+
+
+class CqEventsTestCase(RDMATestCase):
+    def setUp(self):
+        super().setUp()
+        self.iters = 100
+        self.qp_dict = {'ud': CqEventsUD, 'rc': CqEventsRC}
+
+    def create_players(self, qp_type):
+        client = self.qp_dict[qp_type](self.dev_name, self.ib_port,
+                                       self.gid_index)
+        server = self.qp_dict[qp_type](self.dev_name, self.ib_port,
+                                       self.gid_index)
+        client.pre_run(server.psn, server.qpn)
+        server.pre_run(client.psn, client.qpn)
+        return client, server
+
+    def test_cq_events_ud(self):
+        client, server = self.create_players('ud')
+        traffic(client, server, self.iters, self.gid_index, self.ib_port)
+
+    def test_cq_events_rc(self):
+        client, server = self.create_players('rc')
+        traffic(client, server, self.iters, self.gid_index, self.ib_port)
diff --git a/tests/test_cqex.py b/tests/test_cqex.py
new file mode 100644
index 0000000..63c6c00
--- /dev/null
+++ b/tests/test_cqex.py
@@ -0,0 +1,82 @@
+from pyverbs.pyverbs_error import PyverbsRDMAError
+from pyverbs.cq import CqInitAttrEx, CQEX
+import pyverbs.enums as e
+from pyverbs.mr import MR
+
+from tests.base import RCResources, UDResources, XRCResources, RDMATestCase
+import tests.utils as u
+import unittest
+import errno
+
+
+def create_ex_cq(res):
+    """
+    
Create an Extended CQ using res's context and assign it to res's cq member. + IBV_WC_STANDARD_FLAGS is used for WC flags to avoid support differences + between devices. + :param res: An instance of TrafficResources + """ + wc_flags = e.IBV_WC_STANDARD_FLAGS + cia = CqInitAttrEx(cqe=2000, wc_flags=wc_flags) + try: + res.cq = CQEX(res.ctx, cia) + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('Create Extended CQ is not supported') + raise ex + +class CqExUD(UDResources): + def create_cq(self): + create_ex_cq(self) + + def create_mr(self): + self.mr = MR(self.pd, self.msg_size + self.GRH_SIZE, + e.IBV_ACCESS_LOCAL_WRITE) + + +class CqExRC(RCResources): + def create_cq(self): + create_ex_cq(self) + + +class CqExXRC(XRCResources): + def create_cq(self): + create_ex_cq(self) + + +class CqExTestCase(RDMATestCase): + """ + Run traffic over the existing UD, RC and XRC infrastructure, but use + ibv_cq_ex instead of legacy ibv_cq + """ + def setUp(self): + super().setUp() + self.iters = 100 + self.qp_dict = {'ud': CqExUD, 'rc': CqExRC, 'xrc': CqExXRC} + + def create_players(self, qp_type): + client = self.qp_dict[qp_type](self.dev_name, self.ib_port, + self.gid_index) + server = self.qp_dict[qp_type](self.dev_name, self.ib_port, + self.gid_index) + if qp_type == 'xrc': + client.pre_run(server.psns, server.qps_num) + server.pre_run(client.psns, client.qps_num) + else: + client.pre_run(server.psn, server.qpn) + server.pre_run(client.psn, client.qpn) + return client, server + + def test_ud_traffic_cq_ex(self): + client, server = self.create_players('ud') + u.traffic(client, server, self.iters, self.gid_index, self.ib_port, + is_cq_ex=True) + + def test_rc_traffic_cq_ex(self): + client, server = self.create_players('rc') + u.traffic(client, server, self.iters, self.gid_index, self.ib_port, + is_cq_ex=True) + + def test_xrc_traffic_cq_ex(self): + client, server = self.create_players('xrc') + u.xrc_traffic(client, server, is_cq_ex=True) diff --git a/tests/test_device.py b/tests/test_device.py new file mode 100644 index 0000000..eb1e94f --- /dev/null +++ b/tests/test_device.py @@ -0,0 +1,271 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2018 Mellanox Technologies, Inc. All rights reserved. See COPYING file +""" +Test module for pyverbs' device module. +""" +import unittest +import resource +import random + +from pyverbs.pyverbs_error import PyverbsError, PyverbsRDMAError +from tests.base import PyverbsAPITestCase +import tests.utils as u +import pyverbs.device as d + +PAGE_SIZE = resource.getpagesize() + + +class DeviceTest(unittest.TestCase): + """ + Test various functionalities of the Device class. + """ + + def test_dev_list(self): + """ + Verify that it's possible to get IB devices list. 
+ """ + d.get_device_list() + + @staticmethod + def get_device_list(): + lst = d.get_device_list() + if len(lst) == 0: + raise unittest.SkipTest('No IB device found') + return lst + + def test_open_dev(self): + """ + Test ibv_open_device() + """ + for dev in self.get_device_list(): + d.Context(name=dev.name.decode()) + + def test_query_device(self): + """ + Test ibv_query_device() + """ + for dev in self.get_device_list(): + with d.Context(name=dev.name.decode()) as ctx: + attr = ctx.query_device() + self.verify_device_attr(attr) + + def test_query_gid(self): + """ + Test ibv_query_gid() + """ + for dev in self.get_device_list(): + with d.Context(name=dev.name.decode()) as ctx: + ctx.query_gid(port_num=1, index=0) + + @staticmethod + def verify_device_attr(attr): + """ + Helper method that verifies correctness of some members of DeviceAttr + object. + :param attr: A DeviceAttr object + :return: None + """ + assert attr.node_guid != 0 + assert attr.sys_image_guid != 0 + assert attr.max_mr_size > PAGE_SIZE + assert attr.page_size_cap >= PAGE_SIZE + assert attr.vendor_id != 0 + assert attr.vendor_part_id != 0 + assert attr.max_qp > 0 + assert attr.max_qp_wr > 0 + assert attr.max_sge > 0 + assert attr.max_sge_rd > 0 + assert attr.max_cq > 0 + assert attr.max_cqe > 0 + assert attr.max_mr > 0 + assert attr.max_pd > 0 + assert attr.max_pkeys > 0 + + def test_query_device_ex(self): + """ + Test ibv_query_device_ex() + """ + for dev in self.get_device_list(): + with d.Context(name=dev.name.decode()) as ctx: + attr_ex = ctx.query_device_ex() + self.verify_device_attr(attr_ex.orig_attr) + + @staticmethod + def verify_port_attr(attr): + """ + Helper method that verifies correctness of some members of PortAttr + object. + :param attr: A PortAttr object + :return: None + """ + assert 'Invalid' not in d.phys_state_to_str(attr.state) + assert 'Invalid' not in d.translate_mtu(attr.max_mtu) + assert 'Invalid' not in d.translate_mtu(attr.active_mtu) + assert 'Invalid' not in d.width_to_str(attr.active_width) + assert 'Invalid' not in d.speed_to_str(attr.active_speed) + assert 'Invalid' not in d.translate_link_layer(attr.link_layer) + assert attr.max_msg_sz > 0x1000 + + def test_query_port(self): + """ + Test ibv_query_port + """ + for dev in self.get_device_list(): + with d.Context(name=dev.name.decode()) as ctx: + num_ports = ctx.query_device().phys_port_cnt + for p in range(num_ports): + port_attr = ctx.query_port(p + 1) + self.verify_port_attr(port_attr) + + def test_query_port_bad_flow(self): + """ + Verify that querying non-existing ports fails as expected + """ + for dev in self.get_device_list(): + with d.Context(name=dev.name.decode()) as ctx: + num_ports = ctx.query_device().phys_port_cnt + try: + port = num_ports + random.randint(1, 10) + ctx.query_port(port) + except PyverbsRDMAError as e: + assert 'Failed to query port' in e.args[0] + assert 'Invalid argument' in e.args[0] + else: + raise PyverbsRDMAError( + 'Successfully queried non-existing port {p}'. \ + format(p=port)) + + +class DMTest(PyverbsAPITestCase): + """ + Test various functionalities of the DM class. 
+ """ + + def test_create_dm(self): + """ + test ibv_alloc_dm() + """ + for ctx, attr, attr_ex in self.devices: + if attr_ex.max_dm_size == 0: + return + dm_len = random.randrange(u.MIN_DM_SIZE, attr_ex.max_dm_size/2, + u.DM_ALIGNMENT) + dm_attrs = u.get_dm_attrs(dm_len) + with d.DM(ctx, dm_attrs): + pass + + def test_destroy_dm(self): + """ + test ibv_free_dm() + """ + for ctx, attr, attr_ex in self.devices: + if attr_ex.max_dm_size == 0: + return + dm_len = random.randrange(u.MIN_DM_SIZE, attr_ex.max_dm_size/2, + u.DM_ALIGNMENT) + dm_attrs = u.get_dm_attrs(dm_len) + dm = d.DM(ctx, dm_attrs) + dm.close() + + def test_create_dm_bad_flow(self): + """ + test ibv_alloc_dm() with an illegal size and comp mask + """ + for ctx, attr, attr_ex in self.devices: + if attr_ex.max_dm_size == 0: + return + dm_len = attr_ex.max_dm_size + 1 + dm_attrs = u.get_dm_attrs(dm_len) + try: + d.DM(ctx, dm_attrs) + except PyverbsRDMAError as e: + assert 'Failed to allocate device memory of size' in \ + e.args[0] + assert 'Max available size' in e.args[0] + else: + raise PyverbsError( + 'Created a DM with size larger than max reported') + dm_attrs.comp_mask = random.randint(1, 100) + try: + d.DM(ctx, dm_attrs) + except PyverbsRDMAError as e: + assert 'Failed to allocate device memory of size' in \ + e.args[0] + else: + raise PyverbsError( + 'Created a DM with illegal comp mask {c}'. \ + format(c=dm_attrs.comp_mask)) + + def test_destroy_dm_bad_flow(self): + """ + Test calling ibv_free_dm() twice + """ + for ctx, attr, attr_ex in self.devices: + if attr_ex.max_dm_size == 0: + return + dm_len = random.randrange(u.MIN_DM_SIZE, attr_ex.max_dm_size/2, + u.DM_ALIGNMENT) + dm_attrs = u.get_dm_attrs(dm_len) + dm = d.DM(ctx, dm_attrs) + dm.close() + dm.close() + + def test_dm_write(self): + """ + Test writing to the device memory + """ + for ctx, attr, attr_ex in self.devices: + if attr_ex.max_dm_size == 0: + return + dm_len = random.randrange(u.MIN_DM_SIZE, attr_ex.max_dm_size/2, + u.DM_ALIGNMENT) + dm_attrs = u.get_dm_attrs(dm_len) + with d.DM(ctx, dm_attrs) as dm: + data_length = random.randrange(4, dm_len, u.DM_ALIGNMENT) + data_offset = random.randrange(0, dm_len - data_length, + u.DM_ALIGNMENT) + data = 'a' * data_length + dm.copy_to_dm(data_offset, data.encode(), data_length) + + def test_dm_write_bad_flow(self): + """ + Test writing to the device memory with bad offset and length + """ + for ctx, attr, attr_ex in self.devices: + if attr_ex.max_dm_size == 0: + return + dm_len = random.randrange(u.MIN_DM_SIZE, attr_ex.max_dm_size/2, + u.DM_ALIGNMENT) + dm_attrs = u.get_dm_attrs(dm_len) + with d.DM(ctx, dm_attrs) as dm: + data_length = random.randrange(4, dm_len, u.DM_ALIGNMENT) + data_offset = random.randrange(0, dm_len - data_length, + u.DM_ALIGNMENT) + data_offset += 1 # offset needs to be a multiple of 4 + data = 'a' * data_length + try: + dm.copy_to_dm(data_offset, data.encode(), data_length) + except PyverbsRDMAError as e: + assert 'Failed to copy to dm' in e.args[0] + else: + raise PyverbsError( + 'Wrote to device memory with a bad offset') + + def test_dm_read(self): + """ + Test reading from the device memory + """ + for ctx, attr, attr_ex in self.devices: + if attr_ex.max_dm_size == 0: + return + dm_len = random.randrange(u.MIN_DM_SIZE, attr_ex.max_dm_size/2, + u.DM_ALIGNMENT) + dm_attrs = u.get_dm_attrs(dm_len) + with d.DM(ctx, dm_attrs) as dm: + data_length = random.randrange(4, dm_len, u.DM_ALIGNMENT) + data_offset = random.randrange(0, dm_len - data_length, + u.DM_ALIGNMENT) + data = 'a' * data_length + 
dm.copy_to_dm(data_offset, data.encode(), data_length) + read_str = dm.copy_from_dm(data_offset, data_length) + assert read_str.decode() == data diff --git a/tests/test_mlx5_pp.py b/tests/test_mlx5_pp.py new file mode 100644 index 0000000..0dce54a --- /dev/null +++ b/tests/test_mlx5_pp.py @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. See COPYING file + +""" +Test module for mlx5 packet pacing entry allocation. +""" + +from pyverbs.providers.mlx5.mlx5dv import Mlx5PP, Mlx5Context, Mlx5DVContextAttr +from pyverbs.pyverbs_error import PyverbsRDMAError, PyverbsUserError +import pyverbs.providers.mlx5.mlx5_enums as e +from tests.base import RDMATestCase +import unittest +import struct +import errno + + +class Mlx5PPRes: + def __init__(self, dev_name): + try: + mlx5dv_attr = Mlx5DVContextAttr(e.MLX5DV_CONTEXT_FLAGS_DEVX) + self.ctx = Mlx5Context(mlx5dv_attr, dev_name) + except PyverbsUserError as ex: + raise unittest.SkipTest('Could not open mlx5 context ({})' + .format(str(ex))) + except PyverbsRDMAError: + raise unittest.SkipTest('Opening mlx5 DevX context is not supported') + self.pps = [] + + +class Mlx5PPTestCase(RDMATestCase): + def setUp(self): + super().setUp() + self.pp_res = Mlx5PPRes(self.dev_name) + + def test_pp_alloc(self): + """ + Allocate two packet pacing entries with the same configuration. One of + the entries is allocated with a dedicated index. + Then verify that the indexes are different and free the entries. + """ + # An arbitrary valid rate limit value (in kbps) + rate_limit = struct.pack('>I', 100) + try: + self.pp_res.pps.append(Mlx5PP(self.pp_res.ctx, rate_limit)) + # Create a dedicated entry of the same previous configuration + # and verify that it has a different index + self.pp_res.pps.append(Mlx5PP(self.pp_res.ctx, rate_limit, + flags=e._MLX5DV_PP_ALLOC_FLAGS_DEDICATED_INDEX)) + self.assertNotEqual(self.pp_res.pps[0].index, self.pp_res.pps[1].index, + 'Dedicated PP index is not unique') + for pp in self.pp_res.pps: + pp.close() + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP or ex.error_code == errno.EPROTONOSUPPORT: + raise unittest.SkipTest('Packet pacing entry allocation is not supported') + raise ex diff --git a/tests/test_mlx5_var.py b/tests/test_mlx5_var.py new file mode 100644 index 0000000..2b85def --- /dev/null +++ b/tests/test_mlx5_var.py @@ -0,0 +1,39 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. See COPYING file + +""" +Test module for Mlx5 VAR allocation. 
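+
+The core flow exercised here, sketched (assuming the device supports VAR
+allocation):
+
+    var = Mlx5VAR(ctx)
+    var_map = mmap.mmap(fileno=ctx.cmd_fd, length=var.length,
+                        offset=var.mmap_off)
+    var_map.close()  # closing the mmap object unmaps the memory
+    var.close()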
+""" + +from pyverbs.pyverbs_error import PyverbsRDMAError +from pyverbs.providers.mlx5.mlx5dv import Mlx5VAR +from tests.base import BaseResources +from tests.base import RDMATestCase +import unittest +import errno +import mmap + + +class Mlx5VarRes(BaseResources): + def __init__(self, dev_name, ib_port=None, gid_index=None): + super().__init__(dev_name, ib_port, gid_index) + try: + self.var = Mlx5VAR(self.ctx) + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP or ex.error_code == errno.EPROTONOSUPPORT: + raise unittest.SkipTest('VAR allocation is not supported') + + +class Mlx5VarTestCase(RDMATestCase): + def setUp(self): + super().setUp() + self.var_res = Mlx5VarRes(self.dev_name) + + def test_var_map_unmap(self): + var_map = mmap.mmap(fileno=self.var_res.ctx.cmd_fd, + length=self.var_res.var.length, + offset=self.var_res.var.mmap_off) + # There is no munmap method in mmap Python module, but by closing the + # mmap instance the memory is unmapped. + var_map.close() + self.var_res.var.close() diff --git a/tests/test_mr.py b/tests/test_mr.py new file mode 100644 index 0000000..b54e99c --- /dev/null +++ b/tests/test_mr.py @@ -0,0 +1,268 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. See COPYING file +""" +Test module for pyverbs' mr module. +""" +import unittest +import random +import errno + +from pyverbs.pyverbs_error import PyverbsRDMAError, PyverbsError +from tests.base import PyverbsAPITestCase +from pyverbs.mr import MR, MW, DMMR +import pyverbs.device as d +from pyverbs.pd import PD +import pyverbs.enums as e +import tests.utils as u + +MAX_IO_LEN = 1048576 + + +class MRTest(PyverbsAPITestCase): + """ + Test various functionalities of the MR class. + """ + def test_reg_mr(self): + """ + Test ibv_reg_mr() + """ + for ctx, attr, attr_ex in self.devices: + with PD(ctx) as pd: + flags = u.get_access_flags(ctx) + for f in flags: + with MR(pd, u.get_mr_length(), f) as mr: + pass + + def test_dereg_mr(self): + """ + Test ibv_dereg_mr() + """ + for ctx, attr, attr_ex in self.devices: + with PD(ctx) as pd: + flags = u.get_access_flags(ctx) + for f in flags: + with MR(pd, u.get_mr_length(), f) as mr: + mr.close() + + def test_dereg_mr_twice(self): + """ + Verify that explicit call to MR's close() doesn't fail + """ + for ctx, attr, attr_ex in self.devices: + with PD(ctx) as pd: + flags = u.get_access_flags(ctx) + for f in flags: + with MR(pd, u.get_mr_length(), f) as mr: + # Pyverbs supports multiple destruction of objects, + # we are not expecting an exception here. 
mr.close()
+                        mr.close()
+
+    def test_reg_mr_bad_flags(self):
+        """
+        Verify that illegal flags combinations fail as expected
+        """
+        for ctx, attr, attr_ex in self.devices:
+            with PD(ctx) as pd:
+                for i in range(5):
+                    flags = random.sample([e.IBV_ACCESS_REMOTE_WRITE,
+                                           e.IBV_ACCESS_REMOTE_ATOMIC],
+                                          random.randint(1, 2))
+                    mr_flags = 0
+                    for flag in flags:
+                        mr_flags += flag.value
+                    try:
+                        MR(pd, u.get_mr_length(), mr_flags)
+                    except PyverbsRDMAError as err:
+                        assert 'Failed to register a MR' in err.args[0]
+                    else:
+                        raise PyverbsRDMAError('Registered a MR with illegal flags')
+
+    def test_write(self):
+        """
+        Test writing to MR's buffer
+        """
+        for ctx, attr, attr_ex in self.devices:
+            with PD(ctx) as pd:
+                for i in range(10):
+                    mr_len = u.get_mr_length()
+                    flags = u.get_access_flags(ctx)
+                    for f in flags:
+                        with MR(pd, mr_len, f) as mr:
+                            write_len = min(random.randint(1, MAX_IO_LEN),
+                                            mr_len)
+                            mr.write('a' * write_len, write_len)
+
+    def test_read(self):
+        """
+        Test reading from MR's buffer
+        """
+        for ctx, attr, attr_ex in self.devices:
+            with PD(ctx) as pd:
+                for i in range(10):
+                    mr_len = u.get_mr_length()
+                    flags = u.get_access_flags(ctx)
+                    for f in flags:
+                        with MR(pd, mr_len, f) as mr:
+                            write_len = min(random.randint(1, MAX_IO_LEN),
+                                            mr_len)
+                            write_str = 'a' * write_len
+                            mr.write(write_str, write_len)
+                            read_len = random.randint(1, write_len)
+                            offset = random.randint(0, write_len - read_len)
+                            read_str = mr.read(read_len, offset).decode()
+                            assert read_str in write_str
+
+    def test_lkey(self):
+        """
+        Test reading lkey property
+        """
+        for ctx, attr, attr_ex in self.devices:
+            with PD(ctx) as pd:
+                length = u.get_mr_length()
+                flags = u.get_access_flags(ctx)
+                for f in flags:
+                    with MR(pd, length, f) as mr:
+                        mr.lkey
+
+    def test_rkey(self):
+        """
+        Test reading rkey property
+        """
+        for ctx, attr, attr_ex in self.devices:
+            with PD(ctx) as pd:
+                length = u.get_mr_length()
+                flags = u.get_access_flags(ctx)
+                for f in flags:
+                    with MR(pd, length, f) as mr:
+                        mr.rkey
+
+    def test_buffer(self):
+        """
+        Test reading buf property
+        """
+        for ctx, attr, attr_ex in self.devices:
+            with PD(ctx) as pd:
+                length = u.get_mr_length()
+                flags = u.get_access_flags(ctx)
+                for f in flags:
+                    with MR(pd, length, f) as mr:
+                        mr.buf
+
+
+class MWTest(PyverbsAPITestCase):
+    """
+    Test various functionalities of the MW class.
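+    Allocation wraps ibv_alloc_mw(); a minimal sketch (skipped on devices
+    without MW support):
+
+        with MW(pd, e.IBV_MW_TYPE_1) as mw:
+            pass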
+ """ + def test_reg_mw_type1(self): + """ + Test ibv_alloc_mw() for type 1 MW + """ + for ctx, attr, attr_ex in self.devices: + with PD(ctx) as pd: + try: + with MW(pd, e.IBV_MW_TYPE_1): + pass + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('Create memory window of type 1 is not supported') + raise ex + + def test_reg_mw_type2(self): + """ + Test ibv_alloc_mw() for type 2 MW + """ + for ctx, attr, attr_ex in self.devices: + with PD(ctx) as pd: + try: + with MW(pd, e.IBV_MW_TYPE_2): + pass + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('Create memory window of type 2 is not supported') + raise ex + + def test_dereg_mw_type1(self): + """ + Test ibv_dealloc_mw() for type 1 MW + """ + for ctx, attr, attr_ex in self.devices: + with PD(ctx) as pd: + try: + with MW(pd, e.IBV_MW_TYPE_1) as mw: + mw.close() + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('Create memory window of type 1 is not supported') + raise ex + + def test_dereg_mw_type2(self): + """ + Test ibv_dealloc_mw() for type 2 MW + """ + for ctx, attr, attr_ex in self.devices: + with PD(ctx) as pd: + try: + with MW(pd, e.IBV_MW_TYPE_2) as mw: + mw.close() + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('Create memory window of type 2 is not supported') + raise ex + + def test_reg_mw_wrong_type(self): + """ + Verify that trying to create a MW of a wrong type fails + """ + for ctx, attr, attr_ex in self.devices: + with PD(ctx) as pd: + try: + mw_type = random.randint(3, 100) + MW(pd, mw_type) + except PyverbsRDMAError: + pass + else: + raise PyverbsError('Created a MW with type {t}'.\ + format(t=mw_type)) + + +class DMMRTest(PyverbsAPITestCase): + """ + Test various functionalities of the DMMR class. 
+ """ + def test_create_dm_mr(self): + """ + Test ibv_reg_dm_mr + """ + for ctx, attr, attr_ex in self.devices: + if attr_ex.max_dm_size == 0: + raise unittest.SkipTest('Device memory is not supported') + with PD(ctx) as pd: + for i in range(10): + dm_len = random.randrange(u.MIN_DM_SIZE, attr_ex.max_dm_size/2, + u.DM_ALIGNMENT) + dm_attrs = u.get_dm_attrs(dm_len) + with d.DM(ctx, dm_attrs) as dm: + dm_mr_len = random.randint(1, dm_len) + dm_mr_offset = random.randint(0, (dm_len - dm_mr_len)) + DMMR(pd, dm_mr_len, e.IBV_ACCESS_ZERO_BASED, dm=dm, + offset=dm_mr_offset) + + def test_destroy_dm_mr(self): + """ + Test freeing of dm_mr + """ + for ctx, attr, attr_ex in self.devices: + if attr_ex.max_dm_size == 0: + return + with PD(ctx) as pd: + for i in range(10): + dm_len = random.randrange(u.MIN_DM_SIZE, attr_ex.max_dm_size/2, + u.DM_ALIGNMENT) + dm_attrs = u.get_dm_attrs(dm_len) + with d.DM(ctx, dm_attrs) as dm: + dm_mr_len = random.randint(1, dm_len) + dm_mr_offset = random.randint(0, (dm_len - dm_mr_len)) + dm_mr = DMMR(pd, dm_mr_len, e.IBV_ACCESS_ZERO_BASED, + dm=dm, offset=dm_mr_offset) + dm_mr.close() diff --git a/tests/test_odp.py b/tests/test_odp.py new file mode 100755 index 0000000..0fa8d94 --- /dev/null +++ b/tests/test_odp.py @@ -0,0 +1,108 @@ +from pyverbs.mem_alloc import mmap, munmap, MAP_ANONYMOUS_, MAP_PRIVATE_, \ + MAP_HUGETLB_ +from tests.utils import requires_odp, requires_huge_pages, traffic, \ + xrc_traffic, create_custom_mr +from tests.base import RCResources, UDResources, XRCResources +from tests.base import RDMATestCase +from pyverbs.mr import MR +import pyverbs.enums as e + + +HUGE_PAGE_SIZE = 0x200000 + + +class OdpUD(UDResources): + @requires_odp('ud') + def create_mr(self): + self.mr = create_custom_mr(self, e.IBV_ACCESS_ON_DEMAND, + self.msg_size + self.GRH_SIZE) + + +class OdpRC(RCResources): + def __init__(self, dev_name, ib_port, gid_index, is_huge=False, + user_addr=None): + """ + Initialize an OdpRC object. + :param dev_name: Device name to be used + :param ib_port: IB port of the device to use + :param gid_index: Which GID index to use + :param is_huge: If True, use huge pages for MR registration + :param user_addr: The MR's buffer address. If None, the buffer will be + allocated by pyverbs. 
+ """ + self.is_huge = is_huge + self.user_addr = user_addr + super(OdpRC, self).__init__(dev_name=dev_name, ib_port=ib_port, + gid_index=gid_index) + + @requires_odp('rc') + def create_mr(self): + access = e.IBV_ACCESS_LOCAL_WRITE | e.IBV_ACCESS_ON_DEMAND + if self.is_huge: + access |= e.IBV_ACCESS_HUGETLB + self.mr = MR(self.pd, self.msg_size, access, address=self.user_addr) + + +class OdpXRC(XRCResources): + @requires_odp('xrc') + def create_mr(self): + self.mr = create_custom_mr(self, e.IBV_ACCESS_ON_DEMAND) + + +class OdpTestCase(RDMATestCase): + def setUp(self): + super(OdpTestCase, self).setUp() + self.iters = 100 + self.user_addr = None + self.qp_dict = {'rc': OdpRC, 'ud': OdpUD, 'xrc': OdpXRC} + + def create_players(self, qp_type, is_huge=False): + if qp_type == 'rc': + client = self.qp_dict[qp_type](self.dev_name, self.ib_port, + self.gid_index, is_huge=is_huge, + user_addr=self.user_addr) + server = self.qp_dict[qp_type](self.dev_name, self.ib_port, + self.gid_index, is_huge=is_huge, + user_addr=self.user_addr) + else: + client = self.qp_dict[qp_type](self.dev_name, self.ib_port, + self.gid_index) + server = self.qp_dict[qp_type](self.dev_name, self.ib_port, + self.gid_index) + if qp_type == 'xrc': + client.pre_run(server.psns, server.qps_num) + server.pre_run(client.psns, client.qps_num) + else: + client.pre_run(server.psn, server.qpn) + server.pre_run(client.psn, client.qpn) + return client, server + + def tearDown(self): + if self.user_addr: + munmap(self.user_addr, HUGE_PAGE_SIZE) + super(OdpTestCase, self).tearDown() + + def test_odp_rc_traffic(self): + client, server = self.create_players('rc') + traffic(client, server, self.iters, self.gid_index, self.ib_port) + + def test_odp_ud_traffic(self): + client, server = self.create_players('ud') + traffic(client, server, self.iters, self.gid_index, self.ib_port) + + def test_odp_xrc_traffic(self): + client, server = self.create_players('xrc') + xrc_traffic(client, server) + + @requires_huge_pages() + def test_odp_rc_huge_traffic(self): + client, server = self.create_players('rc', is_huge=True) + traffic(client, server, self.iters, self.gid_index, self.ib_port) + + @requires_huge_pages() + def test_odp_rc_huge_user_addr_traffic(self): + self.user_addr = mmap(length=HUGE_PAGE_SIZE, + flags=MAP_ANONYMOUS_| MAP_PRIVATE_| MAP_HUGETLB_) + client, server = self.create_players('rc', is_huge=True) + traffic(client, server, self.iters, self.gid_index, self.ib_port) + diff --git a/tests/test_parent_domain.py b/tests/test_parent_domain.py new file mode 100644 index 0000000..23f973b --- /dev/null +++ b/tests/test_parent_domain.py @@ -0,0 +1,87 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. See COPYING file +""" +Test module for Pyverbs' ParentDomain. 
+""" +from pyverbs.pd import ParentDomainInitAttr, ParentDomain, ParentDomainContext +from pyverbs.pyverbs_error import PyverbsRDMAError +from pyverbs.srq import SrqAttr, SrqInitAttr, SRQ +from pyverbs.cq import CqInitAttrEx, CQEX, CQ +from pyverbs.qp import QPInitAttr, QP +from tests.base import BaseResources +from tests.base import RDMATestCase +import pyverbs.mem_alloc as mem +import pyverbs.enums as e +import tests.utils as u +import unittest +import errno + + +class ParentDomainRes(BaseResources): + def __init__(self, dev_name, ib_port=None, gid_index=None): + super().__init__(dev_name=dev_name, ib_port=ib_port, + gid_index=gid_index) + # Parent Domain will be created according to the test + self.pd_ctx = None + self.parent_domain = None + + +class ParentDomainTestCase(RDMATestCase): + def setUp(self): + super().setUp() + self.pd_res = ParentDomainRes(self.dev_name) + + def _create_parent_domain_with_allocators(self, alloc_func, free_func): + if alloc_func and free_func: + self.pd_res.pd_ctx = ParentDomainContext(self.pd_res.pd, alloc_func, + free_func) + pd_attr = ParentDomainInitAttr(pd=self.pd_res.pd, + pd_context=self.pd_res.pd_ctx) + try: + self.pd_res.parent_domain = ParentDomain(self.pd_res.ctx, + attr=pd_attr) + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('Parent Domain is not supported on this device') + raise ex + + def _create_rdma_objects(self): + cq = CQ(self.pd_res.ctx, 100, None, None, 0) + dev_attr = self.pd_res.ctx.query_device() + qp_cap = u.random_qp_cap(dev_attr) + qia = QPInitAttr(scq=cq, rcq=cq, cap=qp_cap) + qia.qp_type = e.IBV_QPT_RC + QP(self.pd_res.parent_domain, qia) + srq_init_attr = SrqInitAttr(SrqAttr()) + SRQ(self.pd_res.parent_domain, srq_init_attr) + cq_init_attrs_ex = CqInitAttrEx(comp_mask=e.IBV_CQ_INIT_ATTR_MASK_PD, + parent_domain=self.pd_res.parent_domain) + CQEX(self.pd_res.ctx, cq_init_attrs_ex) + + def test_without_allocators(self): + self._create_parent_domain_with_allocators(None, None) + self._create_rdma_objects() + self.pd_res.parent_domain.close() + + def test_default_allocators(self): + def alloc_p_func(pd, context, size, alignment, resource_type): + return e._IBV_ALLOCATOR_USE_DEFAULT + + def free_p_func(pd, context, ptr, resource_type): + return e._IBV_ALLOCATOR_USE_DEFAULT + + self._create_parent_domain_with_allocators(alloc_p_func, free_p_func) + self._create_rdma_objects() + self.pd_res.parent_domain.close() + + def test_mem_align_allocators(self): + def alloc_p_func(pd, context, size, alignment, resource_type): + p = mem.posix_memalign(size, alignment) + return p + + def free_p_func(pd, context, ptr, resource_type): + mem.free(ptr) + + self._create_parent_domain_with_allocators(alloc_p_func, free_p_func) + self._create_rdma_objects() + self.pd_res.parent_domain.close() diff --git a/tests/test_pd.py b/tests/test_pd.py new file mode 100755 index 0000000..dc9893c --- /dev/null +++ b/tests/test_pd.py @@ -0,0 +1,50 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. See COPYING file +""" +Test module for pyverbs' pd module. +""" +import random + +from tests.base import PyverbsAPITestCase +from pyverbs.pd import PD + + +class PDTest(PyverbsAPITestCase): + """ + Test various functionalities of the PD class. 
+ """ + def test_alloc_pd(self): + """ + Test ibv_alloc_pd() + """ + for ctx, attr, attr_ex in self.devices: + with PD(ctx): + pass + + def test_dealloc_pd(self): + """ + Test ibv_dealloc_pd() + """ + for ctx, attr, attr_ex in self.devices: + with PD(ctx) as pd: + pd.close() + + def test_multiple_pd_creation(self): + """ + Test multiple creations and destructions of a PD object + """ + for ctx, attr, attr_ex in self.devices: + for i in range(random.randint(1, 200)): + with PD(ctx) as pd: + pd.close() + + def test_destroy_pd_twice(self): + """ + Test bad flow cases in destruction of a PD object + """ + for ctx, attr, attr_ex in self.devices: + with PD(ctx) as pd: + # Pyverbs supports multiple destruction of objects, we are + # not expecting an exception here. + pd.close() + pd.close() diff --git a/tests/test_qp.py b/tests/test_qp.py new file mode 100644 index 0000000..612fca3 --- /dev/null +++ b/tests/test_qp.py @@ -0,0 +1,357 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. See COPYING file +""" +Test module for pyverbs' qp module. +""" +import unittest +import random +import errno +import os + +from pyverbs.pyverbs_error import PyverbsRDMAError +from pyverbs.qp import QPInitAttr, QPAttr, QP +from tests.base import PyverbsAPITestCase +import pyverbs.enums as e +from pyverbs.pd import PD +from pyverbs.cq import CQ +import tests.utils as u + + +class QPTest(PyverbsAPITestCase): + """ + Test various functionalities of the QP class. + """ + + def test_create_qp_no_attr_connected(self): + """ + Test QP creation via ibv_create_qp without a QPAttr object proivded. + Checked QP types are RC and UC. + """ + for ctx, attr, attr_ex in self.devices: + with PD(ctx) as pd: + with CQ(ctx, 100, None, None, 0) as cq: + qia = get_qp_init_attr(cq, attr) + qia.qp_type = e.IBV_QPT_RC + with QP(pd, qia) as qp: + assert qp.qp_state == e.IBV_QPS_RESET, 'RC QP should have been in RESET' + qia.qp_type = e.IBV_QPT_UC + with QP(pd, qia) as qp: + assert qp.qp_state == e.IBV_QPS_RESET, 'UC QP should have been in RESET' + + + def test_create_qp_no_attr(self): + """ + Test QP creation via ibv_create_qp without a QPAttr object proivded. + Checked QP types are Raw Packet and UD. Raw Packet is skipped for + non-root users / Infiniband link layer. + """ + for ctx, attr, attr_ex in self.devices: + with PD(ctx) as pd: + with CQ(ctx, 100, None, None, 0) as cq: + for i in range(1, attr.phys_port_cnt + 1): + qia = get_qp_init_attr(cq, attr) + qia.qp_type = e.IBV_QPT_UD + with QP(pd, qia) as qp: + assert qp.qp_state == e.IBV_QPS_RESET, 'UD QP should have been in RESET' + if is_eth(ctx, i) and is_root(): + qia.qp_type = e.IBV_QPT_RAW_PACKET + with QP(pd, qia) as qp: + assert qp.qp_state == e.IBV_QPS_RESET, 'Raw Packet QP should have been in RESET' + + def test_create_qp_with_attr_connected(self): + """ + Test QP creation via ibv_create_qp without a QPAttr object proivded. + Checked QP types are RC and UC. + """ + for ctx, attr, attr_ex in self.devices: + with PD(ctx) as pd: + with CQ(ctx, 100, None, None, 0) as cq: + qia = get_qp_init_attr(cq, attr) + qia.qp_type = e.IBV_QPT_RC + with QP(pd, qia, QPAttr()) as qp: + assert qp.qp_state == e.IBV_QPS_INIT, 'RC QP should have been in INIT' + qia.qp_type = e.IBV_QPT_UC + with QP(pd, qia, QPAttr()) as qp: + assert qp.qp_state == e.IBV_QPS_INIT, 'UC QP should have been in INIT' + + def test_create_qp_with_attr(self): + """ + Test QP creation via ibv_create_qp with a QPAttr object proivded. 
+        Checked QP types are Raw Packet and UD. Raw Packet is skipped for
+        non-root users / Infiniband link layer.
+        """
+        for ctx, attr, attr_ex in self.devices:
+            with PD(ctx) as pd:
+                with CQ(ctx, 100, None, None, 0) as cq:
+                    for i in range(1, attr.phys_port_cnt + 1):
+                        qpts = [e.IBV_QPT_UD, e.IBV_QPT_RAW_PACKET] \
+                            if is_eth(ctx, i) else [e.IBV_QPT_UD]
+                        qia = get_qp_init_attr(cq, attr)
+                        qia.qp_type = e.IBV_QPT_UD
+                        with QP(pd, qia, QPAttr()) as qp:
+                            assert qp.qp_state == e.IBV_QPS_RTS, 'UD QP should have been in RTS'
+                        if is_eth(ctx, i) and is_root():
+                            qia.qp_type = e.IBV_QPT_RAW_PACKET
+                            with QP(pd, qia, QPAttr()) as qp:
+                                assert qp.qp_state == e.IBV_QPS_RTS, 'Raw Packet QP should have been in RTS'
+
+    def test_create_qp_ex_no_attr_connected(self):
+        """
+        Test QP creation via ibv_create_qp_ex without a QPAttr object provided.
+        Checked QP types are RC and UC.
+        """
+        for ctx, attr, attr_ex in self.devices:
+            with PD(ctx) as pd:
+                with CQ(ctx, 100, None, None, 0) as cq:
+                    qia = get_qp_init_attr_ex(cq, pd, attr, attr_ex, e.IBV_QPT_RC)
+                    try:
+                        with QP(ctx, qia) as qp:
+                            assert qp.qp_state == e.IBV_QPS_RESET, 'RC QP should have been in RESET'
+                    except PyverbsRDMAError as ex:
+                        if ex.error_code == errno.EOPNOTSUPP:
+                            raise unittest.SkipTest('Create QP with extended attrs is not supported')
+                        raise ex
+                    qia = get_qp_init_attr_ex(cq, pd, attr, attr_ex, e.IBV_QPT_UC)
+                    try:
+                        with QP(ctx, qia) as qp:
+                            assert qp.qp_state == e.IBV_QPS_RESET, 'UC QP should have been in RESET'
+                    except PyverbsRDMAError as ex:
+                        if ex.error_code == errno.EOPNOTSUPP:
+                            raise unittest.SkipTest('Create QP with extended attrs is not supported')
+                        raise ex
+
+    def test_create_qp_ex_no_attr(self):
+        """
+        Test QP creation via ibv_create_qp_ex without a QPAttr object provided.
+        Checked QP types are Raw Packet and UD. Raw Packet is skipped for
+        non-root users / Infiniband link layer.
+        """
+        for ctx, attr, attr_ex in self.devices:
+            with PD(ctx) as pd:
+                with CQ(ctx, 100, None, None, 0) as cq:
+                    for i in range(1, attr.phys_port_cnt + 1):
+                        qia = get_qp_init_attr_ex(cq, pd, attr, attr_ex,
+                                                  e.IBV_QPT_UD)
+                        try:
+                            with QP(ctx, qia) as qp:
+                                assert qp.qp_state == e.IBV_QPS_RESET, 'UD QP should have been in RESET'
+                        except PyverbsRDMAError as ex:
+                            if ex.error_code == errno.EOPNOTSUPP:
+                                raise unittest.SkipTest('Create QP with extended attrs is not supported')
+                            raise ex
+                        if is_eth(ctx, i) and is_root():
+                            qia = get_qp_init_attr_ex(cq, pd, attr, attr_ex,
+                                                      e.IBV_QPT_RAW_PACKET)
+                            try:
+                                with QP(ctx, qia) as qp:
+                                    assert qp.qp_state == e.IBV_QPS_RESET, 'Raw Packet QP should have been in RESET'
+                            except PyverbsRDMAError as ex:
+                                if ex.error_code == errno.EOPNOTSUPP:
+                                    raise unittest.SkipTest('Create QP with extended attrs is not supported')
+                                raise ex
+
+    def test_create_qp_ex_with_attr_connected(self):
+        """
+        Test QP creation via ibv_create_qp_ex with a QPAttr object provided.
+        Checked QP types are RC and UC.
+ """ + for ctx, attr, attr_ex in self.devices: + with PD(ctx) as pd: + with CQ(ctx, 100, None, None, 0) as cq: + qia = get_qp_init_attr_ex(cq, pd, attr, attr_ex, + e.IBV_QPT_RC) + try: + with QP(ctx, qia, QPAttr()) as qp: + assert qp.qp_state == e.IBV_QPS_INIT, 'RC QP should have been in INIT' + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('Create QP with extended attrs is not supported') + raise ex + qia = get_qp_init_attr_ex(cq, pd, attr, attr_ex, + e.IBV_QPT_UC) + try: + with QP(ctx, qia, QPAttr()) as qp: + assert qp.qp_state == e.IBV_QPS_INIT, 'UC QP should have been in INIT' + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('Create QP with extended attrs is not supported') + raise ex + + def test_create_qp_ex_with_attr(self): + """ + Test QP creation via ibv_create_qp_ex with a QPAttr object proivded. + Checked QP types are Raw Packet and UD. Raw Packet is skipped for + non-root users / Infiniband link layer. + """ + for ctx, attr, attr_ex in self.devices: + with PD(ctx) as pd: + with CQ(ctx, 100, None, None, 0) as cq: + for i in range(1, attr.phys_port_cnt + 1): + qia = get_qp_init_attr_ex(cq, pd, attr, attr_ex, + e.IBV_QPT_UD) + try: + with QP(ctx, qia, QPAttr()) as qp: + assert qp.qp_state == e.IBV_QPS_RTS, 'UD QP should have been in RTS' + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('Create QP with extended attrs is not supported') + raise ex + if is_eth(ctx, i) and is_root(): + qia = get_qp_init_attr_ex(cq, pd, attr, attr_ex, + e.IBV_QPT_RAW_PACKET) + try: + with QP(ctx, qia, QPAttr()) as qp: + assert qp.qp_state == e.IBV_QPS_RTS, 'Raw Packet QP should have been in RTS' + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('Create QP with extended attrs is not supported') + raise ex + + def test_query_qp(self): + """ + Queries a QP after creation. Verifies that its properties are as + expected. + """ + for ctx, attr, attr_ex in self.devices: + with PD(ctx) as pd: + with CQ(ctx, 100, None, None, 0) as cq: + for i in range(1, attr.phys_port_cnt + 1): + qpts = get_qp_types(ctx, i) + for qpt in qpts: + # Extended QP + qia = get_qp_init_attr_ex(cq, pd, attr, attr_ex, + qpt) + caps = qia.cap # Save them to verify values later + try: + qp = QP(ctx, qia) + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('Create QP with extended attrs is not supported') + raise ex + qp_attr, qp_init_attr = qp.query(e.IBV_QP_CUR_STATE | + e.IBV_QP_CAP) + verify_qp_attrs(caps, e.IBV_QPS_RESET, qp_init_attr, + qp_attr) + # Legacy QP + qia = get_qp_init_attr(cq, attr) + qia.qp_type = qpt + caps = qia.cap # Save them to verify values later + qp = QP(pd, qia) + qp_attr, qp_init_attr = qp.query(e.IBV_QP_CUR_STATE | + e.IBV_QP_CAP) + verify_qp_attrs(caps, e.IBV_QPS_RESET, qp_init_attr, + qp_attr) + + def test_modify_qp(self): + """ + Queries a QP after calling modify(). Verifies that its properties are + as expected. 
+ """ + for ctx, attr, attr_ex in self.devices: + with PD(ctx) as pd: + with CQ(ctx, 100, None, None, 0) as cq: + # Extended QP + qia = get_qp_init_attr_ex(cq, pd, attr, attr_ex, e.IBV_QPT_UD) + try: + qp = QP(ctx, qia) + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('Create QP with extended attrs is not supported') + raise ex + qa = QPAttr() + qa.qkey = 0x123 + qp.to_init(qa) + qp_attr, qp_iattr = qp.query(e.IBV_QP_QKEY) + assert qp_attr.qkey == qa.qkey, 'Extended QP, QKey is not as expected' + qp.to_rtr(qa) + qa.sq_psn = 0x45 + qp.to_rts(qa) + qp_attr, qp_iattr = qp.query(e.IBV_QP_SQ_PSN) + assert qp_attr.sq_psn == qa.sq_psn, 'Extended QP, SQ PSN is not as expected' + qa.qp_state = e.IBV_QPS_RESET + qp.modify(qa, e.IBV_QP_STATE) + assert qp.qp_state == e.IBV_QPS_RESET, 'Extended QP, QP state is not as expected' + # Legacy QP + qia = get_qp_init_attr(cq, attr) + qp = QP(pd, qia) + qa = QPAttr() + qa.qkey = 0x123 + qp.to_init(qa) + qp_attr, qp_iattr = qp.query(e.IBV_QP_QKEY) + assert qp_attr.qkey == qa.qkey, 'Legacy QP, QKey is not as expected' + qp.to_rtr(qa) + qa.sq_psn = 0x45 + qp.to_rts(qa) + qp_attr, qp_iattr = qp.query(e.IBV_QP_SQ_PSN) + assert qp_attr.sq_psn == qa.sq_psn, 'Legacy QP, SQ PSN is not as expected' + qa.qp_state = e.IBV_QPS_RESET + qp.modify(qa, e.IBV_QP_STATE) + assert qp.qp_state == e.IBV_QPS_RESET, 'Legacy QP, QP state is not as expected' + + +def get_qp_types(ctx, port_num): + """ + Returns a list of the commonly used QP types. Raw Packet QP will not be + included if link layer is not Ethernet or it current user is not root. + :param ctx: The device's Context, to query the port's link layer + :param port_num: Port number to query + :return: An array of QP types that can be created on this port + """ + qpts = [e.IBV_QPT_RC, e.IBV_QPT_UC, e.IBV_QPT_UD] + if is_eth(ctx, port_num) and is_root(): + qpts.append(e.IBV_QPT_RAW_PACKET) + return qpts + + +def verify_qp_attrs(orig_cap, state, init_attr, attr): + assert state == attr.cur_qp_state + assert orig_cap.max_send_wr <= init_attr.cap.max_send_wr + assert orig_cap.max_recv_wr <= init_attr.cap.max_recv_wr + assert orig_cap.max_send_sge <= init_attr.cap.max_send_sge + assert orig_cap.max_recv_sge <= init_attr.cap.max_recv_sge + assert orig_cap.max_inline_data <= init_attr.cap.max_inline_data + + +def get_qp_init_attr(cq, attr): + """ + Creates a QPInitAttr object with a QP type of the provided <qpts> array and + other random values. + :param cq: CQ to be used as send and receive CQ + :param attr: Device attributes for capability checks + :return: An initialized QPInitAttr object + """ + qp_cap = u.random_qp_cap(attr) + sig = random.randint(0, 1) + return QPInitAttr(scq=cq, rcq=cq, cap=qp_cap, sq_sig_all=sig) + + +def get_qp_init_attr_ex(cq, pd, attr, attr_ex, qpt): + """ + Creates a QPInitAttrEx object with a QP type of the provided <qpts> array + and other random values. + :param cq: CQ to be used as send and receive CQ + :param pd: A PD object to use + :param attr: Device attributes for capability checks + :param attr_ex: Extended device attributes for capability checks + :param qpt: QP type + :return: An initialized QPInitAttrEx object + """ + qia = u.random_qp_init_attr_ex(attr_ex, attr, qpt) + qia.send_cq = cq + qia.recv_cq = cq + qia.pd = pd # Only XRCD can be created without a PD + return qia + + +def is_eth(ctx, port_num): + """ + Querires the device's context's <port_num> port for its link layer. 
+ :param ctx: The Context to query + :param port_num: Which Context's port to query + :return: True if the port's link layer is Ethernet, else False + """ + return ctx.query_port(port_num).link_layer == e.IBV_LINK_LAYER_ETHERNET + + +def is_root(): + return os.geteuid() == 0 diff --git a/tests/test_qpex.py b/tests/test_qpex.py new file mode 100644 index 0000000..c6786c7 --- /dev/null +++ b/tests/test_qpex.py @@ -0,0 +1,300 @@ +import unittest +import random +import errno + +from pyverbs.qp import QPCap, QPInitAttrEx, QPAttr, QPEx, QP +from pyverbs.pyverbs_error import PyverbsRDMAError +from pyverbs.mr import MW, MWBindInfo +from pyverbs.base import inc_rkey +import pyverbs.enums as e + +from tests.base import UDResources, RCResources, RDMATestCase, XRCResources +import tests.utils as u + + +def create_qp_ex(agr_obj, qp_type, send_flags): + if qp_type == e.IBV_QPT_XRC_SEND: + cap = QPCap(max_send_wr=agr_obj.num_msgs, max_recv_wr=0, max_recv_sge=0, + max_send_sge=1) + else: + cap = QPCap(max_send_wr=agr_obj.num_msgs, max_recv_wr=agr_obj.num_msgs, + max_recv_sge=1, max_send_sge=1) + qia = QPInitAttrEx(cap=cap, qp_type=qp_type, scq=agr_obj.cq, + rcq=agr_obj.cq, pd=agr_obj.pd, send_ops_flags=send_flags, + comp_mask=e.IBV_QP_INIT_ATTR_PD | + e.IBV_QP_INIT_ATTR_SEND_OPS_FLAGS) + qp_attr = QPAttr(port_num=agr_obj.ib_port) + if qp_type == e.IBV_QPT_UD: + qp_attr.qkey = agr_obj.UD_QKEY + qp_attr.pkey_index = agr_obj.UD_PKEY_INDEX + if qp_type == e.IBV_QPT_RC: + qp_attr.qp_access_flags = e.IBV_ACCESS_REMOTE_WRITE | \ + e.IBV_ACCESS_REMOTE_READ | \ + e.IBV_ACCESS_REMOTE_ATOMIC + try: + # We don't have capability bits for this + qp = QPEx(agr_obj.ctx, qia, qp_attr) + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('Extended QP is not supported on this device') + raise ex + return qp + + +class QpExUDSend(UDResources): + def create_qp(self): + self.qp = create_qp_ex(self, e.IBV_QPT_UD, e.IBV_QP_EX_WITH_SEND) + + +class QpExRCSend(RCResources): + def create_qp(self): + self.qp = create_qp_ex(self, e.IBV_QPT_RC, e.IBV_QP_EX_WITH_SEND) + + +class QpExXRCSend(XRCResources): + def create_qp(self): + qp_attr = QPAttr(port_num=self.ib_port) + qp_attr.pkey_index = 0 + for _ in range(self.qp_count): + attr_ex = QPInitAttrEx(qp_type=e.IBV_QPT_XRC_RECV, + comp_mask=e.IBV_QP_INIT_ATTR_XRCD, + xrcd=self.xrcd) + qp_attr.qp_access_flags = e.IBV_ACCESS_REMOTE_WRITE | \ + e.IBV_ACCESS_REMOTE_READ + recv_qp = QP(self.ctx, attr_ex, qp_attr) + self.rqp_lst.append(recv_qp) + + send_qp = create_qp_ex(self, e.IBV_QPT_XRC_SEND, e.IBV_QP_EX_WITH_SEND) + self.sqp_lst.append(send_qp) + self.qps_num.append((recv_qp.qp_num, send_qp.qp_num)) + self.psns.append(random.getrandbits(24)) + + +class QpExUDSendImm(UDResources): + def create_qp(self): + self.qp = create_qp_ex(self, e.IBV_QPT_UD, e.IBV_QP_EX_WITH_SEND_WITH_IMM) + + +class QpExRCSendImm(RCResources): + def create_qp(self): + self.qp = create_qp_ex(self, e.IBV_QPT_RC, e.IBV_QP_EX_WITH_SEND_WITH_IMM) + + +class QpExXRCSendImm(XRCResources): + def create_qp(self): + qp_attr = QPAttr(port_num=self.ib_port) + qp_attr.pkey_index = 0 + for _ in range(self.qp_count): + attr_ex = QPInitAttrEx(qp_type=e.IBV_QPT_XRC_RECV, + comp_mask=e.IBV_QP_INIT_ATTR_XRCD, + xrcd=self.xrcd) + qp_attr.qp_access_flags = e.IBV_ACCESS_REMOTE_WRITE | \ + e.IBV_ACCESS_REMOTE_READ + recv_qp = QP(self.ctx, attr_ex, qp_attr) + self.rqp_lst.append(recv_qp) + + send_qp = create_qp_ex(self, e.IBV_QPT_XRC_SEND, + e.IBV_QP_EX_WITH_SEND_WITH_IMM) + 
self.sqp_lst.append(send_qp) + self.qps_num.append((recv_qp.qp_num, send_qp.qp_num)) + self.psns.append(random.getrandbits(24)) + + +class QpExRCRDMAWrite(RCResources): + def create_qp(self): + self.qp = create_qp_ex(self, e.IBV_QPT_RC, e.IBV_QP_EX_WITH_RDMA_WRITE) + + def create_mr(self): + self.mr = u.create_custom_mr(self, e.IBV_ACCESS_REMOTE_WRITE) + + +class QpExRCRDMAWriteImm(RCResources): + def create_qp(self): + self.qp = create_qp_ex(self, e.IBV_QPT_RC, + e.IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM) + + def create_mr(self): + self.mr = u.create_custom_mr(self, e.IBV_ACCESS_REMOTE_WRITE) + + +class QpExRCRDMARead(RCResources): + def create_qp(self): + self.qp = create_qp_ex(self, e.IBV_QPT_RC, e.IBV_QP_EX_WITH_RDMA_READ) + + def create_mr(self): + self.mr = u.create_custom_mr(self, e.IBV_ACCESS_REMOTE_READ) + + +class QpExRCAtomicCmpSwp(RCResources): + def create_qp(self): + self.qp = create_qp_ex(self, e.IBV_QPT_RC, + e.IBV_QP_EX_WITH_ATOMIC_CMP_AND_SWP) + self.mr = u.create_custom_mr(self, e.IBV_ACCESS_REMOTE_ATOMIC) + + +class QpExRCAtomicFetchAdd(RCResources): + def create_qp(self): + self.qp = create_qp_ex(self, e.IBV_QPT_RC, + e.IBV_QP_EX_WITH_ATOMIC_FETCH_AND_ADD) + self.mr = u.create_custom_mr(self, e.IBV_ACCESS_REMOTE_ATOMIC) + + +class QpExRCBindMw(RCResources): + def create_qp(self): + self.qp = create_qp_ex(self, e.IBV_QPT_RC, e.IBV_QP_EX_WITH_BIND_MW) + + def create_mr(self): + self.mr = u.create_custom_mr(self, e.IBV_ACCESS_REMOTE_WRITE) + + +class QpExTestCase(RDMATestCase): + """ Run traffic using the new post send API. """ + def setUp(self): + super().setUp() + self.iters = 100 + self.qp_dict = {'ud_send': QpExUDSend, 'rc_send': QpExRCSend, + 'xrc_send': QpExXRCSend, 'ud_send_imm': QpExUDSendImm, + 'rc_send_imm': QpExRCSendImm, + 'xrc_send_imm': QpExXRCSendImm, + 'rc_write': QpExRCRDMAWrite, + 'rc_write_imm': QpExRCRDMAWriteImm, + 'rc_read': QpExRCRDMARead, + 'rc_cmp_swp': QpExRCAtomicCmpSwp, + 'rc_fetch_add': QpExRCAtomicFetchAdd, + 'rc_bind_mw': QpExRCBindMw} + + def create_players(self, qp_type): + client = self.qp_dict[qp_type](self.dev_name, self.ib_port, + self.gid_index) + server = self.qp_dict[qp_type](self.dev_name, self.ib_port, + self.gid_index) + if 'xrc' in qp_type: + client.pre_run(server.psns, server.qps_num) + server.pre_run(client.psns, client.qps_num) + else: + client.pre_run(server.psn, server.qpn) + server.pre_run(client.psn, client.qpn) + return client, server + + def test_qp_ex_ud_send(self): + client, server = self.create_players('ud_send') + u.traffic(client, server, self.iters, self.gid_index, self.ib_port, + is_cq_ex=False, send_op=e.IBV_QP_EX_WITH_SEND) + + def test_qp_ex_rc_send(self): + client, server = self.create_players('rc_send') + u.traffic(client, server, self.iters, self.gid_index, self.ib_port, + is_cq_ex=False, send_op=e.IBV_QP_EX_WITH_SEND) + + def test_qp_ex_xrc_send(self): + client, server = self.create_players('xrc_send') + u.xrc_traffic(client, server, send_op=e.IBV_QP_EX_WITH_SEND) + + def test_qp_ex_ud_send_imm(self): + client, server = self.create_players('ud_send_imm') + u.traffic(client, server, self.iters, self.gid_index, self.ib_port, + is_cq_ex=False, send_op=e.IBV_QP_EX_WITH_SEND_WITH_IMM) + + def test_qp_ex_rc_send_imm(self): + client, server = self.create_players('rc_send_imm') + u.traffic(client, server, self.iters, self.gid_index, self.ib_port, + is_cq_ex=False, send_op=e.IBV_QP_EX_WITH_SEND_WITH_IMM) + + def test_qp_ex_xrc_send_imm(self): + client, server = self.create_players('xrc_send_imm') + u.xrc_traffic(client, 
server, send_op=e.IBV_QP_EX_WITH_SEND_WITH_IMM)
+
+    def test_qp_ex_rc_rdma_write(self):
+        client, server = self.create_players('rc_write')
+        client.rkey = server.mr.rkey
+        server.rkey = client.mr.rkey
+        client.raddr = server.mr.buf
+        server.raddr = client.mr.buf
+        u.rdma_traffic(client, server, self.iters, self.gid_index, self.ib_port,
+                       is_cq_ex=False, send_op=e.IBV_QP_EX_WITH_RDMA_WRITE)
+
+    def test_qp_ex_rc_rdma_write_imm(self):
+        client, server = self.create_players('rc_write_imm')
+        client.rkey = server.mr.rkey
+        server.rkey = client.mr.rkey
+        client.raddr = server.mr.buf
+        server.raddr = client.mr.buf
+        u.traffic(client, server, self.iters, self.gid_index, self.ib_port,
+                  is_cq_ex=False, send_op=e.IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM)
+
+    def test_qp_ex_rc_rdma_read(self):
+        client, server = self.create_players('rc_read')
+        client.rkey = server.mr.rkey
+        server.rkey = client.mr.rkey
+        client.raddr = server.mr.buf
+        server.raddr = client.mr.buf
+        server.mr.write('s' * server.msg_size, server.msg_size)
+        u.rdma_traffic(client, server, self.iters, self.gid_index, self.ib_port,
+                       is_cq_ex=False, send_op=e.IBV_QP_EX_WITH_RDMA_READ)
+
+    def test_qp_ex_rc_atomic_cmp_swp(self):
+        client, server = self.create_players('rc_cmp_swp')
+        client.msg_size = 8  # Atomics work on 64-bit operands
+        server.msg_size = 8
+        client.rkey = server.mr.rkey
+        server.rkey = client.mr.rkey
+        client.raddr = server.mr.buf
+        server.raddr = client.mr.buf
+        server.mr.write('s' * 8, 8)
+        u.rdma_traffic(client, server, self.iters, self.gid_index, self.ib_port,
+                       is_cq_ex=False, send_op=e.IBV_QP_EX_WITH_ATOMIC_CMP_AND_SWP)
+
+    def test_qp_ex_rc_atomic_fetch_add(self):
+        client, server = self.create_players('rc_fetch_add')
+        client.msg_size = 8  # Atomics work on 64-bit operands
+        server.msg_size = 8
+        client.rkey = server.mr.rkey
+        server.rkey = client.mr.rkey
+        client.raddr = server.mr.buf
+        server.raddr = client.mr.buf
+        server.mr.write('s' * 8, 8)
+        u.rdma_traffic(client, server, self.iters, self.gid_index, self.ib_port,
+                       is_cq_ex=False, send_op=e.IBV_QP_EX_WITH_ATOMIC_FETCH_AND_ADD)
+
+    def test_qp_ex_rc_bind_mw(self):
+        """
+        Verify bind memory window operation using the new post_send API.
+        Instead of checking through regular pingpong style traffic, we'll
+        do as follows:
+        - Register an MR with remote write access
+        - Bind a MW without remote write permission to the MR
+        - Verify that remote write fails
+        Since it's a unique flow, it's an integral part of that test rather
+        than a utility method.
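+        The bind itself uses the new post_send API, roughly:
+
+            server.qp.wr_start()
+            server.qp.wr_bind_mw(mw, new_rkey, bind_info)
+            server.qp.wr_complete()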
+ """ + client, server = self.create_players('rc_bind_mw') + client_sge = u.get_send_element(client, False)[1] + # Create a MW and bind it + server.qp.wr_start() + server.qp.wr_id = 0x123 + server.qp.wr_flags = e.IBV_SEND_SIGNALED + bind_info = MWBindInfo(server.mr, server.mr.buf, server.mr.length, + e.IBV_ACCESS_LOCAL_WRITE) + try: + mw = MW(server.pd, mw_type=e.IBV_MW_TYPE_2) + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('Memory Window allocation is not supported') + raise ex + new_key = inc_rkey(server.mr.rkey) + server.qp.wr_bind_mw(mw, new_key, bind_info) + server.qp.wr_complete() + u.poll_cq(server.cq) + # Verify that remote write fails + client.qp.wr_start() + client.qp.wr_id = 0x124 + client.qp.wr_flags = e.IBV_SEND_SIGNALED + client.qp.wr_rdma_write(new_key, server.mr.buf) + client.qp.wr_set_sge(client_sge) + client.qp.wr_complete() + try: + u.poll_cq(client.cq) + except PyverbsRDMAError as ex: + if ex.error_code != e.IBV_WC_REM_ACCESS_ERR: + raise ex + diff --git a/tests/test_rdmacm.py b/tests/test_rdmacm.py new file mode 100755 index 0000000..880f9a9 --- /dev/null +++ b/tests/test_rdmacm.py @@ -0,0 +1,87 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. See COPYING file + +from tests.rdmacm_utils import sync_traffic, async_traffic, \ + async_traffic_with_ext_qp +from pyverbs.pyverbs_error import PyverbsError +from tests.base import RDMATestCase +import multiprocessing as mp +import pyverbs.device as d +import subprocess +import unittest +import json + +NUM_OF_PROCESSES = 2 + + +class CMTestCase(RDMATestCase): + def setUp(self): + if self.dev_name is not None: + net_name = self.get_net_name(self.dev_name) + try: + self.ip_addr = self.get_ip_address(net_name) + except KeyError: + raise unittest.SkipTest('Device {} doesn\'t have net interface' + .format(self.dev_name)) + else: + dev_list = d.get_device_list() + for dev in dev_list: + net_name = self.get_net_name(dev.name.decode()) + try: + self.ip_addr = self.get_ip_address(net_name) + except IndexError: + continue + else: + self.dev_name = dev.name.decode() + break + if self.dev_name is None: + raise unittest.SkipTest('No devices with net interface') + super().setUp() + + @staticmethod + def get_net_name(dev): + out = subprocess.check_output(['ls', '/sys/class/infiniband/{}/device/net/' + .format(dev)]) + return out.decode().split('\n')[0] + + @staticmethod + def get_ip_address(ifname): + out = subprocess.check_output(['ip', '-j', 'addr', 'show', ifname]) + loaded_json = json.loads(out.decode()) + interface = loaded_json[0]['addr_info'][0]['local'] + if 'fe80::' in interface: + interface = interface + '%' + ifname + return interface + + @staticmethod + def two_nodes_rdmacm_traffic(ip_addr, traffic_func): + ctx = mp.get_context('fork') + syncer = ctx.Barrier(NUM_OF_PROCESSES, timeout=5) + notifier = ctx.Queue() + passive = ctx.Process(target=traffic_func, + args=[ip_addr, syncer, notifier, True]) + active = ctx.Process(target=traffic_func, + args=[ip_addr, syncer, notifier, False]) + passive.start() + active.start() + while notifier.empty(): + pass + + for _ in range(NUM_OF_PROCESSES): + res = notifier.get() + if res is not None: + passive.terminate() + active.terminate() + raise PyverbsError(res) + + passive.join() + active.join() + + def test_rdmacm_sync_traffic(self): + self.two_nodes_rdmacm_traffic(self.ip_addr, sync_traffic) + + def test_rdmacm_async_traffic(self): + 
self.two_nodes_rdmacm_traffic(self.ip_addr, async_traffic) + + def test_rdmacm_async_traffic_external_qp(self): + self.two_nodes_rdmacm_traffic(self.ip_addr, async_traffic_with_ext_qp) diff --git a/tests/test_relaxed_ordering.py b/tests/test_relaxed_ordering.py new file mode 100644 index 0000000..27af992 --- /dev/null +++ b/tests/test_relaxed_ordering.py @@ -0,0 +1,55 @@ +from tests.base import RCResources, UDResources, XRCResources +from tests.utils import traffic, xrc_traffic +from tests.base import RDMATestCase +from pyverbs.mr import MR +import pyverbs.enums as e + + +class RoUD(UDResources): + def create_mr(self): + self.mr = MR(self.pd, self.msg_size + self.GRH_SIZE, + e.IBV_ACCESS_LOCAL_WRITE | e.IBV_ACCESS_RELAXED_ORDERING) + + +class RoRC(RCResources): + def create_mr(self): + self.mr = MR(self.pd, self.msg_size, + e.IBV_ACCESS_LOCAL_WRITE | e.IBV_ACCESS_RELAXED_ORDERING) + + +class RoXRC(XRCResources): + def create_mr(self): + self.mr = MR(self.pd, self.msg_size, + e.IBV_ACCESS_LOCAL_WRITE | e.IBV_ACCESS_RELAXED_ORDERING) + + +class RoTestCase(RDMATestCase): + def setUp(self): + super(RoTestCase, self).setUp() + self.iters = 100 + self.qp_dict = {'rc': RoRC, 'ud': RoUD, 'xrc': RoXRC} + + def create_players(self, qp_type): + client = self.qp_dict[qp_type](self.dev_name, self.ib_port, + self.gid_index) + server = self.qp_dict[qp_type](self.dev_name, self.ib_port, + self.gid_index) + if qp_type == 'xrc': + client.pre_run(server.psns, server.qps_num) + server.pre_run(client.psns, client.qps_num) + else: + client.pre_run(server.psn, server.qpn) + server.pre_run(client.psn, client.qpn) + return client, server + + def test_ro_rc_traffic(self): + client, server = self.create_players('rc') + traffic(client, server, self.iters, self.gid_index, self.ib_port) + + def test_ro_ud_traffic(self): + client, server = self.create_players('ud') + traffic(client, server, self.iters, self.gid_index, self.ib_port) + + def test_ro_xrc_traffic(self): + client, server = self.create_players('xrc') + xrc_traffic(client, server) diff --git a/tests/utils.py b/tests/utils.py new file mode 100755 index 0000000..45bb735 --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,671 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. See COPYING file +""" +Provide some useful helper function for pyverbs' tests. +""" +from itertools import combinations as com +import unittest +import random +import socket +import os + +from pyverbs.pyverbs_error import PyverbsError, PyverbsRDMAError +from pyverbs.addr import AHAttr, AH, GlobalRoute +from pyverbs.wr import SGE, SendWR, RecvWR +from pyverbs.qp import QPCap, QPInitAttrEx +from pyverbs.base import PyverbsRDMAErrno +from pyverbs.mr import MW, MWBindInfo +from tests.base import XRCResources +from pyverbs.cq import PollCqAttr +import pyverbs.device as d +import pyverbs.enums as e +from pyverbs.mr import MR + +MAX_MR_SIZE = 4194304 +# Some HWs limit DM address and length alignment to 4 for read and write +# operations. Use a minimal length and alignment that respect that. +# For creation purposes use random alignments. As this is log2 of address +# alignment, no need for large numbers. +MIN_DM_SIZE = 4 +DM_ALIGNMENT = 4 +MIN_DM_LOG_ALIGN = 0 +MAX_DM_LOG_ALIGN = 6 +# Raw Packet QP supports TSO header, which creates a larger send WQE. +MAX_RAW_PACKET_SEND_WR = 2500 +GRH_SIZE = 40 +IMM_DATA = 1234 + + +def get_mr_length(): + """ + Provide a random value for MR length. 
We avoid large buffers as these allocations typically fail.
+    We use random.random() instead of randrange() or randint() due to
+    performance issues when generating very large pseudo random numbers.
+    :return: A random MR length
+    """
+    return int(MAX_MR_SIZE * random.random())
+
+
+def filter_illegal_access_flags(element):
+    """
+    Helper function to filter illegal access flags combinations
+    :param element: A list of access flags to check
+    :return: True if this list is legal, else False
+    """
+    # Remote write/atomic require local write permission
+    if e.IBV_ACCESS_REMOTE_ATOMIC in element or \
+       e.IBV_ACCESS_REMOTE_WRITE in element:
+        if e.IBV_ACCESS_LOCAL_WRITE not in element:
+            return False
+    return True
+
+
+def get_access_flags(ctx):
+    """
+    Provide an array of random legal access flags for an MR.
+    Since remote write and remote atomic require local write permission, if
+    one of them is randomly selected without local write, local write will be
+    added as well.
+    After verifying that the flags selection is legal, it is appended to an
+    array, assuming it wasn't previously appended.
+    :param ctx: Device Context to check capabilities
+    :return: A random legal value for MR flags
+    """
+    attr = ctx.query_device()
+    attr_ex = ctx.query_device_ex()
+    vals = list(e.ibv_access_flags)
+    if not attr_ex.odp_caps.general_caps & e.IBV_ODP_SUPPORT:
+        vals.remove(e.IBV_ACCESS_ON_DEMAND)
+    if not attr.device_cap_flags & e.IBV_DEVICE_MEM_WINDOW:
+        vals.remove(e.IBV_ACCESS_MW_BIND)
+    if not attr.atomic_caps & e.IBV_ATOMIC_HCA:
+        vals.remove(e.IBV_ACCESS_REMOTE_ATOMIC)
+    arr = []
+    for i in range(1, len(vals)):
+        tmp = list(com(vals, i))
+        tmp = filter(filter_illegal_access_flags, tmp)
+        for t in tmp:  # Iterate legal combinations and bitwise OR them
+            val = 0
+            for flag in t:
+                val += flag.value
+            arr.append(val)
+    return arr
+
+
+def get_dm_attrs(dm_len):
+    """
+    Initializes an AllocDmAttr member with the given length and random
+    alignment. It currently sets comp_mask = 0 since other comp_mask values
+    are not supported.
+    :param dm_len: Length of the device memory buffer
+    :return: An initialized AllocDmAttr object
+    """
+    align = random.randint(MIN_DM_LOG_ALIGN, MAX_DM_LOG_ALIGN)
+    return d.AllocDmAttr(dm_len, align, 0)
+
+
+def sample(coll):
+    """
+    Returns a random-length subset of the given collection.
+    :param coll: The collection to sample
+    :return: A subset of <coll>
+    """
+    return random.sample(coll, int((len(coll) + 1) * random.random()))
+
+
+def random_qp_cap(attr):
+    """
+    Initializes a QPCap object with valid values based on the device's
+    attributes.
+    It doesn't check the max WR limits since they're reported for smaller WR
+    sizes.
+    :return: A QPCap object
+    """
+    # We use significantly smaller values than those in device attributes.
+    # The attributes reported by the device don't take into account possible
+    # larger WQEs that include e.g. memory window.
+    send_wr = random.randint(1, int(attr.max_qp_wr / 8))
+    recv_wr = random.randint(1, int(attr.max_qp_wr / 8))
+    send_sge = random.randint(1, int(attr.max_sge / 2))
+    recv_sge = random.randint(1, int(attr.max_sge / 2))
+    inline = random.randint(0, 16)
+    return QPCap(send_wr, recv_wr, send_sge, recv_sge, inline)
+
+
+def random_qp_create_mask(qpt, attr_ex):
+    """
+    Select a random sublist of ibv_qp_init_attr_mask. Some of the options are
+    not yet supported by pyverbs and will not be returned. TSO support is
+    checked for the device and the QP type. If it doesn't exist, TSO will not
+    be set.
+    :param qpt: Current QP type
+    :param attr_ex: Extended device attributes for capability checks
+    :return: A sublist of ibv_qp_init_attr_mask
+    """
+    has_tso = attr_ex.tso_caps.max_tso > 0 and \
+        attr_ex.tso_caps.supported_qpts & 1 << qpt
+    supp_flags = [e.IBV_QP_INIT_ATTR_CREATE_FLAGS,
+                  e.IBV_QP_INIT_ATTR_MAX_TSO_HEADER]
+    # Either PD or XRCD flag is needed, XRCD is not supported yet
+    selected = sample(supp_flags)
+    selected.append(e.IBV_QP_INIT_ATTR_PD)
+    if e.IBV_QP_INIT_ATTR_MAX_TSO_HEADER in selected and not has_tso:
+        selected.remove(e.IBV_QP_INIT_ATTR_MAX_TSO_HEADER)
+    mask = 0
+    for s in selected:
+        mask += s.value
+    return mask
+
+
+def get_create_qp_flags_raw_packet(attr_ex):
+    """
+    Select random QP creation flags for Raw Packet QP. Filter out unsupported
+    flags prior to selection.
+    :param attr_ex: Device extended attributes to check capabilities
+    :return: A random combination of QP creation flags
+    """
+    has_fcs = attr_ex.device_cap_flags_ex & e._IBV_DEVICE_RAW_SCATTER_FCS
+    has_cvlan = attr_ex.raw_packet_caps & e.IBV_RAW_PACKET_CAP_CVLAN_STRIPPING
+    has_padding = attr_ex.device_cap_flags_ex & \
+        e._IBV_DEVICE_PCI_WRITE_END_PADDING
+    l = list(e.ibv_qp_create_flags)
+    l.remove(e.IBV_QP_CREATE_SOURCE_QPN)  # UD only
+    if not has_fcs:
+        l.remove(e.IBV_QP_CREATE_SCATTER_FCS)
+    if not has_cvlan:
+        l.remove(e.IBV_QP_CREATE_CVLAN_STRIPPING)
+    if not has_padding:
+        l.remove(e.IBV_QP_CREATE_PCI_WRITE_END_PADDING)
+    flags = sample(l)
+    val = 0
+    for i in flags:
+        val |= i.value
+    return val
+
+
+def random_qp_create_flags(qpt, attr_ex):
+    """
+    Select a random sublist of ibv_qp_create_flags according to the QP type.
+    :param qpt: Current QP type
+    :param attr_ex: Used for Raw Packet QP to check device capabilities
+    :return: A sublist of ibv_qp_create_flags
+    """
+    if qpt == e.IBV_QPT_RAW_PACKET:
+        return get_create_qp_flags_raw_packet(attr_ex)
+    elif qpt == e.IBV_QPT_UD:
+        # IBV_QP_CREATE_SOURCE_QPN is only supported by the mlx5 driver and
+        # is not checked in unittests.
+        return random.choice([0, 2])  # IBV_QP_CREATE_BLOCK_SELF_MCAST_LB
+    else:
+        return 0
+
+
+def random_qp_init_attr_ex(attr_ex, attr, qpt=None):
+    """
+    Create a random-valued QPInitAttrEx object with the given QP type.
+    QP type affects QP capabilities, so allow users to set it and still get
+    valid attributes.
+ :param attr_ex: Extended device attributes for capability checks
+ :param attr: Device attributes for capability checks
+ :param qpt: Requested QP type
+ :return: A valid initialized QPInitAttrEx object
+ """
+ max_tso = 0
+ if qpt is None:
+ qpt = random.choice([e.IBV_QPT_RC, e.IBV_QPT_UC, e.IBV_QPT_UD,
+ e.IBV_QPT_RAW_PACKET])
+ qp_cap = random_qp_cap(attr)
+ if qpt == e.IBV_QPT_RAW_PACKET and \
+ qp_cap.max_send_wr > MAX_RAW_PACKET_SEND_WR:
+ qp_cap.max_send_wr = MAX_RAW_PACKET_SEND_WR
+ sig = random.randint(0, 1)
+ mask = random_qp_create_mask(qpt, attr_ex)
+ if mask & e.IBV_QP_INIT_ATTR_CREATE_FLAGS:
+ cflags = random_qp_create_flags(qpt, attr_ex)
+ else:
+ cflags = 0
+ if mask & e.IBV_QP_INIT_ATTR_MAX_TSO_HEADER:
+ if qpt != e.IBV_QPT_RAW_PACKET:
+ mask -= e.IBV_QP_INIT_ATTR_MAX_TSO_HEADER
+ else:
+ max_tso = \
+ random.randint(16, int(attr_ex.tso_caps.max_tso / 800))
+ qia = QPInitAttrEx(qp_type=qpt, cap=qp_cap, sq_sig_all=sig, comp_mask=mask,
+ create_flags=cflags, max_tso_header=max_tso)
+ if mask & e.IBV_QP_INIT_ATTR_MAX_TSO_HEADER:
+ # TSO increases send WQE size, let's be on the safe side
+ qia.cap.max_send_sge = 2
+ return qia
+
+
+def wc_status_to_str(status):
+ try:
+ return \
+ {0: 'Success', 1: 'Local length error',
+ 2: 'Local QP operation error', 3: 'Local EEC operation error',
+ 4: 'Local protection error', 5: 'WR flush error',
+ 6: 'Memory window bind error', 7: 'Bad response error',
+ 8: 'Local access error', 9: 'Remote invalidate request error',
+ 10: 'Remote access error', 11: 'Remote operation error',
+ 12: 'Retry exceeded', 13: 'RNR retry exceeded',
+ 14: 'Local RDD violation error',
+ 15: 'Remote invalidate RD request error',
+ 16: 'Remote abort error', 17: 'Invalidate EECN error',
+ 18: 'Invalidate EEC state error', 19: 'Fatal error',
+ 20: 'Response timeout error', 21: 'General error'}[status]
+ except KeyError:
+ return 'Unknown WC status ({s})'.format(s=status)
+
+
+def create_custom_mr(agr_obj, additional_access_flags=0, size=None):
+ """
+ Creates a memory region using the aggregation object's PD.
+ If size is None, the agr_obj's message size is used to set the MR's size.
+ The access flags are local write and the additional_access_flags.
+ :param agr_obj: The aggregation object that creates the MR
+ :param additional_access_flags: Additional access flags to set in the MR
+ :param size: MR's length. If None, agr_obj.msg_size is used.
+ """
+ mr_length = size if size else agr_obj.msg_size
+ return MR(agr_obj.pd, mr_length,
+ e.IBV_ACCESS_LOCAL_WRITE | additional_access_flags)
+
+# Traffic helpers
+
+def get_send_element(agr_obj, is_server):
+ """
+ Creates a single SGE and a single Send WR for agr_obj's QP type. The content
+ of the message is either 's' for server side or 'c' for client side.
+ :param agr_obj: Aggregation object which contains all resources necessary
+ :param is_server: Indicates whether this is server or client side
+ :return: send wr and its SGE
+ """
+ mr = agr_obj.mr
+ qp_type = agr_obj.sqp_lst[0].qp_type if isinstance(agr_obj, XRCResources) \
+ else agr_obj.qp.qp_type
+ offset = GRH_SIZE if qp_type == e.IBV_QPT_UD else 0
+ msg = (agr_obj.msg_size + offset) * ('s' if is_server else 'c')
+ mr.write(msg, agr_obj.msg_size + offset)
+ sge = SGE(mr.buf + offset, agr_obj.msg_size, mr.lkey)
+ return SendWR(num_sge=1, sg=[sge]), sge
+
+
+def get_recv_wr(agr_obj):
+ """
+ Creates a single SGE Recv WR for agr_obj's QP type.
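+ For UD QPs the buffer is GRH_SIZE bytes longer, leaving room for the
+ Global Routing Header that precedes the received payload.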
+ :param agr_obj: Aggregation object which contains all resources necessary
+ :return: recv wr
+ """
+ qp_type = agr_obj.rqp_lst[0].qp_type if isinstance(agr_obj, XRCResources) \
+ else agr_obj.qp.qp_type
+ mr = agr_obj.mr
+ length = agr_obj.msg_size + GRH_SIZE if qp_type == e.IBV_QPT_UD \
+ else agr_obj.msg_size
+ recv_sge = SGE(mr.buf, length, mr.lkey)
+ return RecvWR(sg=[recv_sge], num_sge=1)
+
+
+def get_global_ah(agr_obj, gid_index, port):
+ gr = GlobalRoute(dgid=agr_obj.ctx.query_gid(port, gid_index),
+ sgid_index=gid_index)
+ ah_attr = AHAttr(port_num=port, is_global=1, gr=gr,
+ dlid=agr_obj.port_attr.lid)
+ return AH(agr_obj.pd, attr=ah_attr)
+
+
+def xrc_post_send(agr_obj, qp_num, send_object, gid_index, port, send_op=None):
+ agr_obj.qp = agr_obj.sqp_lst[qp_num]
+ if send_op:
+ post_send_ex(agr_obj, send_object, gid_index, port, send_op)
+ else:
+ post_send(agr_obj, send_object, gid_index, port)
+
+
+def post_send_ex(agr_obj, send_object, gid_index, port, send_op=None):
+ qp_type = agr_obj.qp.qp_type
+ agr_obj.qp.wr_start()
+ agr_obj.qp.wr_id = 0x123
+ agr_obj.qp.wr_flags = e.IBV_SEND_SIGNALED
+ if send_op == e.IBV_QP_EX_WITH_SEND:
+ agr_obj.qp.wr_send()
+ elif send_op == e.IBV_QP_EX_WITH_RDMA_WRITE:
+ agr_obj.qp.wr_rdma_write(agr_obj.rkey, agr_obj.raddr)
+ elif send_op == e.IBV_QP_EX_WITH_SEND_WITH_IMM:
+ agr_obj.qp.wr_send_imm(IMM_DATA)
+ elif send_op == e.IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM:
+ agr_obj.qp.wr_rdma_write_imm(agr_obj.rkey, agr_obj.raddr, IMM_DATA)
+ elif send_op == e.IBV_QP_EX_WITH_RDMA_READ:
+ agr_obj.qp.wr_rdma_read(agr_obj.rkey, agr_obj.raddr)
+ elif send_op == e.IBV_QP_EX_WITH_ATOMIC_CMP_AND_SWP:
+ # We're checking the returned value (remote's content), so cmp/swp
+ # values are of no importance.
+ agr_obj.qp.wr_atomic_cmp_swp(agr_obj.rkey, agr_obj.raddr, 42, 43)
+ elif send_op == e.IBV_QP_EX_WITH_ATOMIC_FETCH_AND_ADD:
+ agr_obj.qp.wr_atomic_fetch_add(agr_obj.rkey, agr_obj.raddr, 1)
+ elif send_op == e.IBV_QP_EX_WITH_BIND_MW:
+ bind_info = MWBindInfo(agr_obj.mr, agr_obj.mr.buf, agr_obj.mr.rkey,
+ e.IBV_ACCESS_REMOTE_WRITE)
+ mw = MW(agr_obj.pd, mw_type=e.IBV_MW_TYPE_2)
+ # A new rkey must be set in bind_info, so modify the MR's rkey
+ agr_obj.qp.wr_bind_mw(mw, agr_obj.mr.rkey + 12, bind_info)
+ agr_obj.qp.wr_send()
+ if qp_type == e.IBV_QPT_UD:
+ ah = get_global_ah(agr_obj, gid_index, port)
+ agr_obj.qp.wr_set_ud_addr(ah, agr_obj.rqpn, agr_obj.UD_QKEY)
+ if qp_type == e.IBV_QPT_XRC_SEND:
+ agr_obj.qp.wr_set_xrc_srqn(agr_obj.remote_srqn)
+ agr_obj.qp.wr_set_sge(send_object)
+ agr_obj.qp.wr_complete()
+
+
+def post_send(agr_obj, send_wr, gid_index, port):
+ """
+ Post a single send WR to the QP. Post_send's second parameter (send bad wr)
+ is ignored for simplicity. For UD traffic an address vector is added as
+ well.
+ :param agr_obj: Aggregation object which contains all resources necessary
+ :param send_wr: Send work request to post
+ :param gid_index: Local gid index
+ :param port: IB port number
+ :return: None
+ """
+ qp_type = agr_obj.qp.qp_type
+ if qp_type == e.IBV_QPT_UD:
+ ah = get_global_ah(agr_obj, gid_index, port)
+ send_wr.set_wr_ud(ah, agr_obj.rqpn, agr_obj.UD_QKEY)
+ agr_obj.qp.post_send(send_wr, None)
+
+
+def post_recv(qp, recv_wr, num_wqes=1):
+ """
+ Call the QP's post_recv() method <num_wqes> times. Post_recv's second
+ parameter (recv bad wr) is ignored for simplicity.
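+ Example (sketch): post_recv(agr_obj.qp, get_recv_wr(agr_obj), num_wqes=4)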
+ :param qp: QP which posts receive work request
+ :param recv_wr: Receive work request to post
+ :param num_wqes: Number of WQEs to post
+ :return: None
+ """
+ for _ in range(num_wqes):
+ qp.post_recv(recv_wr, None)
+
+
+def poll_cq(cq, count=1, data=None):
+ """
+ Poll <count> completions from the CQ.
+ Note: This function calls the blocking poll() method of the CQ
+ until <count> completions have been received. If the CQ has a
+ completion channel, a CQ event is consumed and notification is
+ re-armed before each poll.
+ :param cq: CQ to poll from
+ :param count: How many completions to poll
+ :param data: In case of a work request with immediate, the immediate data
+ to be compared after poll
+ :return: An array of work completions of length <count>
+ """
+ wcs = []
+ channel = cq.comp_channel
+ while count > 0:
+ if channel:
+ channel.get_cq_event(cq)
+ cq.req_notify()
+ nc, tmp_wcs = cq.poll(count)
+ for wc in tmp_wcs:
+ if wc.status != e.IBV_WC_SUCCESS:
+ raise PyverbsRDMAError('Completion status is {s}'.
+ format(s=wc_status_to_str(wc.status)),
+ wc.status)
+ if data:
+ if wc.wc_flags & e.IBV_WC_WITH_IMM == 0:
+ raise PyverbsRDMAError('Completion without immediate')
+ assert socket.ntohl(wc.imm_data) == data
+ count -= nc
+ wcs.extend(tmp_wcs)
+ return wcs
+
+
+def poll_cq_ex(cqex, count=1, data=None):
+ """
+ Poll <count> completions from the extended CQ.
+ :param cqex: CQEX to poll from
+ :param count: How many completions to poll
+ :param data: In case of a work request with immediate, the immediate data
+ to be compared after poll
+ :return: None
+ """
+ poll_attr = PollCqAttr()
+ ret = cqex.start_poll(poll_attr)
+ while ret == 2: # ENOENT
+ ret = cqex.start_poll(poll_attr)
+ if ret != 0:
+ raise PyverbsRDMAErrno('Failed to poll CQ')
+ count -= 1
+ if cqex.status != e.IBV_WC_SUCCESS:
+ raise PyverbsRDMAErrno('Completion status is {s}'.
+ format(s=cqex.status))
+ if data:
+ assert data == socket.ntohl(cqex.read_imm_data())
+ # Now poll the rest of the packets
+ while count > 0:
+ ret = cqex.poll_next()
+ while ret == 2:
+ ret = cqex.poll_next()
+ if ret != 0:
+ raise PyverbsRDMAErrno('Failed to poll CQ')
+ if cqex.status != e.IBV_WC_SUCCESS:
+ raise PyverbsRDMAErrno('Completion status is {s}'.
+ format(s=cqex.status))
+ if data:
+ assert data == socket.ntohl(cqex.read_imm_data())
+ count -= 1
+ cqex.end_poll()
+
+
+def validate(received_str, is_server, msg_size):
+ """
+ Validates the received buffer against the expected result.
+ The application should set client's send buffer to 'c's and the
+ server's send buffer to 's's.
+ If the expected buffer is different than the actual, an exception will
+ be raised.
+ :param received_str: The received buffer to check
+ :param is_server: Indicates whether this is the server (receiver) or
+ client side
+ :param msg_size: The message size of the received packet
+ :return: None
+ """
+ expected_str = msg_size * ('c' if is_server else 's')
+ received_str = received_str.decode()
+ if received_str[0:msg_size] != expected_str[0:msg_size]:
+ raise PyverbsError(
+ 'Data validation failure: expected {exp}, received {rcv}'.
+ format(exp=expected_str, rcv=received_str))
+
+
+def send(agr_obj, send_wr, gid_index, port, send_op=None):
+ if send_op:
+ return post_send_ex(agr_obj, send_wr, gid_index, port, send_op)
+ return post_send(agr_obj, send_wr, gid_index, port)
+
+
+def traffic(client, server, iters, gid_idx, port, is_cq_ex=False, send_op=None):
+ """
+ Runs basic traffic between two sides.
+ :param client: Client side; its base class is BaseTraffic
+ :param server: Server side; its base class is BaseTraffic
+ :param iters: Number of traffic iterations
+ :param gid_idx: Local gid index
+ :param port: IB port
+ :param is_cq_ex: If True, use poll_cq_ex() rather than poll_cq()
+ :param send_op: If not None, new post send API is assumed.
+ :return: None
+ """
+ poll = poll_cq_ex if is_cq_ex else poll_cq
+ if send_op == e.IBV_QP_EX_WITH_SEND_WITH_IMM or \
+ send_op == e.IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM:
+ imm_data = IMM_DATA
+ else:
+ imm_data = None
+ # Using the new post send API, we need the SGE, not the SendWR
+ send_element_idx = 1 if send_op else 0
+ s_recv_wr = get_recv_wr(server)
+ c_recv_wr = get_recv_wr(client)
+ post_recv(client.qp, c_recv_wr, client.num_msgs)
+ post_recv(server.qp, s_recv_wr, server.num_msgs)
+ read_offset = GRH_SIZE if client.qp.qp_type == e.IBV_QPT_UD else 0
+ for _ in range(iters):
+ c_send_wr = get_send_element(client, False)[send_element_idx]
+ send(client, c_send_wr, gid_idx, port, send_op)
+ poll(client.cq)
+ poll(server.cq, data=imm_data)
+ post_recv(server.qp, s_recv_wr)
+ msg_received = server.mr.read(server.msg_size, read_offset)
+ validate(msg_received, True, server.msg_size)
+ s_send_wr = get_send_element(server, True)[send_element_idx]
+ send(server, s_send_wr, gid_idx, port, send_op)
+ poll(server.cq)
+ poll(client.cq, data=imm_data)
+ post_recv(client.qp, c_recv_wr)
+ msg_received = client.mr.read(client.msg_size, read_offset)
+ validate(msg_received, False, client.msg_size)
+
+
+def rdma_traffic(client, server, iters, gid_idx, port, is_cq_ex=False, send_op=None):
+ """
+ Runs basic RDMA traffic between two sides. No receive WQEs are posted. For
+ RDMA send with immediate, use traffic().
+ :param client: Client side; its base class is BaseTraffic
+ :param server: Server side; its base class is BaseTraffic
+ :param iters: Number of traffic iterations
+ :param gid_idx: Local gid index
+ :param port: IB port
+ :param is_cq_ex: If True, use poll_cq_ex() rather than poll_cq()
+ :param send_op: If not None, new post send API is assumed.
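+ Example (sketch; when a send_op is given, the resources must use the
+ extended QP API):
+ rdma_traffic(client, server, 10, 0, 1,
+ send_op=e.IBV_QP_EX_WITH_RDMA_WRITE)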
+ :return: None
+ """
+ # Using the new post send API, we need the SGE, not the SendWR
+ send_element_idx = 1 if send_op else 0
+ same_side_check = (send_op == e.IBV_QP_EX_WITH_RDMA_READ or
+ send_op == e.IBV_QP_EX_WITH_ATOMIC_CMP_AND_SWP or
+ send_op == e.IBV_QP_EX_WITH_ATOMIC_FETCH_AND_ADD)
+ for _ in range(iters):
+ c_send_wr = get_send_element(client, False)[send_element_idx]
+ send(client, c_send_wr, gid_idx, port, send_op)
+ poll_cq(client.cq)
+ if same_side_check:
+ msg_received = client.mr.read(client.msg_size, 0)
+ else:
+ msg_received = server.mr.read(server.msg_size, 0)
+ validate(msg_received, False if same_side_check else True,
+ server.msg_size)
+ s_send_wr = get_send_element(server, True)[send_element_idx]
+ if same_side_check:
+ client.mr.write('c' * client.msg_size, client.msg_size)
+ send(server, s_send_wr, gid_idx, port, send_op)
+ poll_cq(server.cq)
+ if same_side_check:
+ msg_received = server.mr.read(client.msg_size, 0)
+ else:
+ msg_received = client.mr.read(server.msg_size, 0)
+ validate(msg_received, True if same_side_check else False,
+ client.msg_size)
+ if same_side_check:
+ server.mr.write('s' * server.msg_size, server.msg_size)
+
+
+def xrc_traffic(client, server, is_cq_ex=False, send_op=None):
+ """
+ Runs basic XRC traffic. This function assumes that the server and the
+ client have the same number of QPs and that server.send_qp[i] is
+ connected to client.recv_qp[i]. Each time server.send_qp[i] sends a
+ message, it arrives at client.srq because client.recv_qp[i] and
+ client.srq are under the same XRCD. The traffic flow in the opposite
+ direction is the same.
+ :param client: Aggregation object of the active side, should be an instance
+ of XRCResources class
+ :param server: Aggregation object of the passive side, should be an instance
+ of XRCResources class
+ :param is_cq_ex: If True, use poll_cq_ex() rather than poll_cq()
+ :param send_op: If not None, new post send API is assumed.
+ :return: None
+ """
+ poll = poll_cq_ex if is_cq_ex else poll_cq
+ server.remote_srqn = client.srq.get_srq_num()
+ client.remote_srqn = server.srq.get_srq_num()
+ s_recv_wr = get_recv_wr(server)
+ c_recv_wr = get_recv_wr(client)
+ post_recv(client.srq, c_recv_wr, client.qp_count*client.num_msgs)
+ post_recv(server.srq, s_recv_wr, server.qp_count*server.num_msgs)
+ # Using the new post send API, we need the SGE, not the SendWR
+ send_element_idx = 1 if send_op else 0
+ for _ in range(client.num_msgs):
+ for i in range(server.qp_count):
+ c_send_wr = get_send_element(client, False)[send_element_idx]
+ if send_op is None:
+ c_send_wr.set_qp_type_xrc(client.remote_srqn)
+ xrc_post_send(client, i, c_send_wr, 0, 0, send_op)
+ poll(client.cq)
+ poll(server.cq)
+ msg_received = server.mr.read(server.msg_size, 0)
+ validate(msg_received, True, server.msg_size)
+ s_send_wr = get_send_element(server, True)[send_element_idx]
+ if send_op is None:
+ s_send_wr.set_qp_type_xrc(server.remote_srqn)
+ xrc_post_send(server, i, s_send_wr, 0, 0, send_op)
+ poll(server.cq)
+ poll(client.cq)
+ msg_received = client.mr.read(client.msg_size, 0)
+ validate(msg_received, False, client.msg_size)
+
+
+# Decorators
+def requires_odp(qp_type):
+ def outer(func):
+ def inner(instance):
+ odp_supported(instance.ctx, qp_type)
+ return func(instance)
+ return inner
+ return outer
+
+
+def odp_supported(ctx, qp_type):
+ """
+ Check the device's ODP capabilities; only send/recv support is checked
+ so far.
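+ Typically used through the requires_odp decorator above, e.g. (sketch)
+ by decorating a test method with @requires_odp('rc').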
+ :param ctx: Device Context
+ :param qp_type: QP type ('rc', 'uc', 'ud' or 'xrc')
+ :return: None
+ """
+ odp_caps = ctx.query_device_ex().odp_caps
+ if odp_caps.general_caps == 0:
+ raise unittest.SkipTest('ODP is not supported - No ODP caps')
+ qp_odp_caps = getattr(odp_caps, '{}_odp_caps'.format(qp_type))
+ has_odp_send = qp_odp_caps & e.IBV_ODP_SUPPORT_SEND
+ has_odp_recv = qp_odp_caps & e.IBV_ODP_SUPPORT_SRQ_RECV if qp_type == 'xrc'\
+ else qp_odp_caps & e.IBV_ODP_SUPPORT_RECV
+ if has_odp_send == 0:
+ raise unittest.SkipTest('ODP is not supported - ODP send not supported')
+ if has_odp_recv == 0:
+ raise unittest.SkipTest('ODP is not supported - ODP recv not supported')
+
+
+def requires_huge_pages():
+ def outer(func):
+ def inner(instance):
+ huge_pages_supported()
+ return func(instance)
+ return inner
+ return outer
+
+
+def huge_pages_supported():
+ """
+ Check if huge pages are supported in the kernel.
+ :return: None
+ """
+ huge_path = '/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages'
+ if not os.path.isfile(huge_path):
+ raise unittest.SkipTest('Huge pages of size 2M are not supported on this platform')
+ with open(huge_path, 'r') as f:
+ if not int(f.read()):
+ raise unittest.SkipTest('There are no huge pages of size 2M allocated')
diff --git a/util/CMakeLists.txt b/util/CMakeLists.txt
new file mode 100644
index 0000000..e8646bf
--- /dev/null
+++ b/util/CMakeLists.txt
@@ -0,0 +1,32 @@
+publish_internal_headers(util
+ cl_qmap.h
+ compiler.h
+ node_name_map.h
+ rdma_nl.h
+ symver.h
+ util.h
+ )
+
+set(C_FILES
+ cl_map.c
+ node_name_map.c
+ open_cdev.c
+ rdma_nl.c
+ util.c
+ )
+
+if (HAVE_COHERENT_DMA)
+ publish_internal_headers(util
+ mmio.h
+ udma_barrier.h
+ )
+
+ set(C_FILES ${C_FILES}
+ mmio.c
+ )
+ set_source_files_properties(mmio.c PROPERTIES COMPILE_FLAGS "${SSE_FLAGS}")
+endif()
+
+add_library(rdma_util STATIC ${C_FILES})
+add_library(rdma_util_pic STATIC ${C_FILES})
+set_property(TARGET rdma_util_pic PROPERTY POSITION_INDEPENDENT_CODE TRUE)
diff --git a/util/cl_map.c b/util/cl_map.c
new file mode 100644
index 0000000..f48efec
--- /dev/null
+++ b/util/cl_map.c
@@ -0,0 +1,700 @@
+/*
+ * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2002-2005 Mellanox Technologies LTD. All rights reserved.
+ * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+/*
+ * Abstract:
+ * Implementation of quick map, a binary tree where the caller always
+ * provides all necessary storage.
+ *
+ */
+
+/*****************************************************************************
+*
+* Map
+*
+* Map is an associative array. By providing a key, the caller can retrieve
+* an object from the map. All objects in the map have an associated key,
+* as specified by the caller when the object was inserted into the map.
+* In addition to random access, the caller can traverse the map much like
+* a linked list, either forwards from the first object or backwards from
+* the last object. The objects in the map are always traversed in
+* order since the nodes are stored sorted.
+*
+* This implementation of Map uses a red black tree verified against
+* Cormen-Leiserson-Rivest text, McGraw-Hill Edition, fourteenth
+* printing, 1994.
+*
+*****************************************************************************/
+
+#include <util/cl_qmap.h>
+#include <string.h>
+
+static inline void __cl_primitive_insert(cl_list_item_t *const p_list_item,
+ cl_list_item_t *const p_new_item)
+{
+ /* CL_ASSERT that a non-null pointer is provided. */
+ assert(p_list_item);
+ /* CL_ASSERT that a non-null pointer is provided. */
+ assert(p_new_item);
+
+ p_new_item->p_next = p_list_item;
+ p_new_item->p_prev = p_list_item->p_prev;
+ p_list_item->p_prev = p_new_item;
+ p_new_item->p_prev->p_next = p_new_item;
+}
+
+static inline void __cl_primitive_remove(cl_list_item_t *const p_list_item)
+{
+ /* CL_ASSERT that a non-null pointer is provided. */
+ assert(p_list_item);
+
+ /* set the back pointer */
+ p_list_item->p_next->p_prev = p_list_item->p_prev;
+ /* set the next pointer */
+ p_list_item->p_prev->p_next = p_list_item->p_next;
+
+ /* if we're debugging, spruce up the pointers to help find bugs */
+#if defined( _DEBUG_ )
+ if (p_list_item != p_list_item->p_next) {
+ p_list_item->p_next = NULL;
+ p_list_item->p_prev = NULL;
+ }
+#endif /* defined( _DEBUG_ ) */
+}
+
+/******************************************************************************
+ IMPLEMENTATION OF QUICK MAP
+******************************************************************************/
+
+/*
+ * Get the root.
+ */
+static inline cl_map_item_t *__cl_map_root(const cl_qmap_t * const p_map)
+{
+ assert(p_map);
+ return (p_map->root.p_left);
+}
+
+/*
+ * Returns whether a given item is on the left of its parent.
+ */
+static bool __cl_map_is_left_child(const cl_map_item_t * const p_item)
+{
+ assert(p_item);
+ assert(p_item->p_up);
+ assert(p_item->p_up != p_item);
+
+ return (p_item->p_up->p_left == p_item);
+}
+
+/*
+ * Retrieve the pointer to the parent's pointer to an item.
+ */
+static cl_map_item_t **__cl_map_get_parent_ptr_to_item(cl_map_item_t *
+ const p_item)
+{
+ assert(p_item);
+ assert(p_item->p_up);
+ assert(p_item->p_up != p_item);
+
+ if (__cl_map_is_left_child(p_item))
+ return (&p_item->p_up->p_left);
+
+ assert(p_item->p_up->p_right == p_item);
+ return (&p_item->p_up->p_right);
+}
+
+/*
+ * Rotate a node to the left. This rotation affects the least number of links
+ * between nodes and brings the level of C up by one while increasing the depth
+ * of A by one. Note that the links to/from W, X, Y, and Z are not affected.
+ *
+ *       R                  R
+ *       |                  |
+ *       A                  C
+ *     /   \              /   \
+ *   W       C          A       Z
+ *          / \        / \
+ *         B   Z      W   B
+ *        / \        / \
+ *       X   Y      X   Y
+ */
+static void __cl_map_rot_left(cl_qmap_t * const p_map,
+ cl_map_item_t * const p_item)
+{
+ cl_map_item_t **pp_root;
+
+ assert(p_map);
+ assert(p_item);
+ assert(p_item->p_right != &p_map->nil);
+
+ pp_root = __cl_map_get_parent_ptr_to_item(p_item);
+
+ /* Point R to C instead of A. */
+ *pp_root = p_item->p_right;
+ /* Set C's parent to R. */
+ (*pp_root)->p_up = p_item->p_up;
+
+ /* Set A's right to B */
+ p_item->p_right = (*pp_root)->p_left;
+ /*
+ * Set B's parent to A. We trap for B being NIL since the
+ * caller may depend on NIL not changing.
+ */
+ if ((*pp_root)->p_left != &p_map->nil)
+ (*pp_root)->p_left->p_up = p_item;
+
+ /* Set C's left to A. */
+ (*pp_root)->p_left = p_item;
+ /* Set A's parent to C. */
+ p_item->p_up = *pp_root;
+}
+
+/*
+ * Rotate a node to the right. This rotation affects the least number of links
+ * between nodes and brings the level of A up by one while increasing the depth
+ * of C by one. Note that the links to/from W, X, Y, and Z are not affected.
+ *
+ *       R                  R
+ *       |                  |
+ *       C                  A
+ *     /   \              /   \
+ *   A       Z          W       C
+ *  / \                        / \
+ * W   B                      B   Z
+ *    / \                    / \
+ *   X   Y                  X   Y
+ */
+static void __cl_map_rot_right(cl_qmap_t * const p_map,
+ cl_map_item_t * const p_item)
+{
+ cl_map_item_t **pp_root;
+
+ assert(p_map);
+ assert(p_item);
+ assert(p_item->p_left != &p_map->nil);
+
+ /* Point R to A instead of C. */
+ pp_root = __cl_map_get_parent_ptr_to_item(p_item);
+ (*pp_root) = p_item->p_left;
+ /* Set A's parent to R. */
+ (*pp_root)->p_up = p_item->p_up;
+
+ /* Set C's left to B */
+ p_item->p_left = (*pp_root)->p_right;
+ /*
+ * Set B's parent to C. We trap for B being NIL since the
+ * caller may depend on NIL not changing.
+ */
+ if ((*pp_root)->p_right != &p_map->nil)
+ (*pp_root)->p_right->p_up = p_item;
+
+ /* Set A's right to C. */
+ (*pp_root)->p_right = p_item;
+ /* Set C's parent to A. */
+ p_item->p_up = *pp_root;
+}
+
+void cl_qmap_init(cl_qmap_t * const p_map)
+{
+ assert(p_map);
+
+ memset(p_map, 0, sizeof(cl_qmap_t));
+
+ /* special setup for the root node */
+ p_map->root.p_up = &p_map->root;
+ p_map->root.p_left = &p_map->nil;
+ p_map->root.p_right = &p_map->nil;
+ p_map->root.color = CL_MAP_BLACK;
+
+ /* Setup the node used as terminator for all leaves. */
+ p_map->nil.p_up = &p_map->nil;
+ p_map->nil.p_left = &p_map->nil;
+ p_map->nil.p_right = &p_map->nil;
+ p_map->nil.color = CL_MAP_BLACK;
+
+ cl_qmap_remove_all(p_map);
+}
+
+cl_map_item_t *cl_qmap_get(const cl_qmap_t * const p_map,
+ const uint64_t key)
+{
+ cl_map_item_t *p_item;
+
+ assert(p_map);
+
+ p_item = __cl_map_root(p_map);
+
+ while (p_item != &p_map->nil) {
+ if (key == p_item->key)
+ break; /* just right */
+
+ if (key < p_item->key)
+ p_item = p_item->p_left; /* too small */
+ else
+ p_item = p_item->p_right; /* too big */
+ }
+
+ return (p_item);
+}
+
+cl_map_item_t *cl_qmap_get_next(const cl_qmap_t * const p_map,
+ const uint64_t key)
+{
+ cl_map_item_t *p_item;
+ cl_map_item_t *p_item_found;
+
+ assert(p_map);
+
+ p_item = __cl_map_root(p_map);
+ p_item_found = (cl_map_item_t *) & p_map->nil;
+
+ while (p_item != &p_map->nil) {
+ if (key < p_item->key) {
+ p_item_found = p_item;
+ p_item = p_item->p_left;
+ } else {
+ p_item = p_item->p_right;
+ }
+ }
+
+ return (p_item_found);
+}
+
+void cl_qmap_apply_func(const cl_qmap_t * const p_map,
+ cl_pfn_qmap_apply_t pfn_func,
+ const void *const context)
+{
+ cl_map_item_t *p_map_item;
+
+ /* Note that context can have any arbitrary value.
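+ For example (illustrative), context may point to a counter that
+ pfn_func increments once per visited item.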
*/ + assert(p_map); + assert(pfn_func); + + p_map_item = cl_qmap_head(p_map); + while (p_map_item != cl_qmap_end(p_map)) { + pfn_func(p_map_item, (void *)context); + p_map_item = cl_qmap_next(p_map_item); + } +} + +/* + * Balance a tree starting at a given item back to the root. + */ +static void __cl_map_ins_bal(cl_qmap_t * const p_map, + cl_map_item_t * p_item) +{ + cl_map_item_t *p_grand_uncle; + + assert(p_map); + assert(p_item); + assert(p_item != &p_map->root); + + while (p_item->p_up->color == CL_MAP_RED) { + if (__cl_map_is_left_child(p_item->p_up)) { + p_grand_uncle = p_item->p_up->p_up->p_right; + assert(p_grand_uncle); + if (p_grand_uncle->color == CL_MAP_RED) { + p_grand_uncle->color = CL_MAP_BLACK; + p_item->p_up->color = CL_MAP_BLACK; + p_item->p_up->p_up->color = CL_MAP_RED; + p_item = p_item->p_up->p_up; + continue; + } + + if (!__cl_map_is_left_child(p_item)) { + p_item = p_item->p_up; + __cl_map_rot_left(p_map, p_item); + } + p_item->p_up->color = CL_MAP_BLACK; + p_item->p_up->p_up->color = CL_MAP_RED; + __cl_map_rot_right(p_map, p_item->p_up->p_up); + } else { + p_grand_uncle = p_item->p_up->p_up->p_left; + assert(p_grand_uncle); + if (p_grand_uncle->color == CL_MAP_RED) { + p_grand_uncle->color = CL_MAP_BLACK; + p_item->p_up->color = CL_MAP_BLACK; + p_item->p_up->p_up->color = CL_MAP_RED; + p_item = p_item->p_up->p_up; + continue; + } + + if (__cl_map_is_left_child(p_item)) { + p_item = p_item->p_up; + __cl_map_rot_right(p_map, p_item); + } + p_item->p_up->color = CL_MAP_BLACK; + p_item->p_up->p_up->color = CL_MAP_RED; + __cl_map_rot_left(p_map, p_item->p_up->p_up); + } + } +} + +cl_map_item_t *cl_qmap_insert(cl_qmap_t * const p_map, + const uint64_t key, + cl_map_item_t * const p_item) +{ + cl_map_item_t *p_insert_at, *p_comp_item; + + assert(p_map); + assert(p_item); + assert(p_map->root.p_up == &p_map->root); + assert(p_map->root.color != CL_MAP_RED); + assert(p_map->nil.color != CL_MAP_RED); + + p_item->p_left = &p_map->nil; + p_item->p_right = &p_map->nil; + p_item->key = key; + p_item->color = CL_MAP_RED; + + /* Find the insertion location. */ + p_insert_at = &p_map->root; + p_comp_item = __cl_map_root(p_map); + + while (p_comp_item != &p_map->nil) { + p_insert_at = p_comp_item; + + if (key == p_insert_at->key) + return (p_insert_at); + + /* Traverse the tree until the correct insertion point is found. */ + if (key < p_insert_at->key) + p_comp_item = p_insert_at->p_left; + else + p_comp_item = p_insert_at->p_right; + } + + assert(p_insert_at != &p_map->nil); + assert(p_comp_item == &p_map->nil); + /* Insert the item. */ + if (p_insert_at == &p_map->root) { + p_insert_at->p_left = p_item; + /* + * Primitive insert places the new item in front of + * the existing item. + */ + __cl_primitive_insert(&p_map->nil.pool_item.list_item, + &p_item->pool_item.list_item); + } else if (key < p_insert_at->key) { + p_insert_at->p_left = p_item; + /* + * Primitive insert places the new item in front of + * the existing item. + */ + __cl_primitive_insert(&p_insert_at->pool_item.list_item, + &p_item->pool_item.list_item); + } else { + p_insert_at->p_right = p_item; + /* + * Primitive insert places the new item in front of + * the existing item. + */ + __cl_primitive_insert(p_insert_at->pool_item.list_item.p_next, + &p_item->pool_item.list_item); + } + /* Increase the count. */ + p_map->count++; + + p_item->p_up = p_insert_at; + + /* + * We have added depth to this section of the tree. + * Rebalance as necessary as we retrace our path through the tree + * and update colors. 
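+ * (Rebalancing restores the red-black invariant that a red node never
+ * has a red parent; the loop in __cl_map_ins_bal above runs while that
+ * invariant is violated.)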
+ */
+ __cl_map_ins_bal(p_map, p_item);
+
+ __cl_map_root(p_map)->color = CL_MAP_BLACK;
+
+ /*
+ * Note that it is not necessary to re-color the nil node black because all
+ * red color assignments are made via the p_up pointer, and nil is never
+ * set as the value of a p_up pointer.
+ */
+
+#ifdef _DEBUG_
+ /* Set the pointer to the map in the map item for consistency checking. */
+ p_item->p_map = p_map;
+#endif
+
+ return (p_item);
+}
+
+static void __cl_map_del_bal(cl_qmap_t * const p_map,
+ cl_map_item_t * p_item)
+{
+ cl_map_item_t *p_uncle;
+
+ while ((p_item->color != CL_MAP_RED) && (p_item->p_up != &p_map->root)) {
+ if (__cl_map_is_left_child(p_item)) {
+ p_uncle = p_item->p_up->p_right;
+
+ if (p_uncle->color == CL_MAP_RED) {
+ p_uncle->color = CL_MAP_BLACK;
+ p_item->p_up->color = CL_MAP_RED;
+ __cl_map_rot_left(p_map, p_item->p_up);
+ p_uncle = p_item->p_up->p_right;
+ }
+
+ if (p_uncle->p_right->color != CL_MAP_RED) {
+ if (p_uncle->p_left->color != CL_MAP_RED) {
+ p_uncle->color = CL_MAP_RED;
+ p_item = p_item->p_up;
+ continue;
+ }
+
+ p_uncle->p_left->color = CL_MAP_BLACK;
+ p_uncle->color = CL_MAP_RED;
+ __cl_map_rot_right(p_map, p_uncle);
+ p_uncle = p_item->p_up->p_right;
+ }
+ p_uncle->color = p_item->p_up->color;
+ p_item->p_up->color = CL_MAP_BLACK;
+ p_uncle->p_right->color = CL_MAP_BLACK;
+ __cl_map_rot_left(p_map, p_item->p_up);
+ break;
+ } else {
+ p_uncle = p_item->p_up->p_left;
+
+ if (p_uncle->color == CL_MAP_RED) {
+ p_uncle->color = CL_MAP_BLACK;
+ p_item->p_up->color = CL_MAP_RED;
+ __cl_map_rot_right(p_map, p_item->p_up);
+ p_uncle = p_item->p_up->p_left;
+ }
+
+ if (p_uncle->p_left->color != CL_MAP_RED) {
+ if (p_uncle->p_right->color != CL_MAP_RED) {
+ p_uncle->color = CL_MAP_RED;
+ p_item = p_item->p_up;
+ continue;
+ }
+
+ p_uncle->p_right->color = CL_MAP_BLACK;
+ p_uncle->color = CL_MAP_RED;
+ __cl_map_rot_left(p_map, p_uncle);
+ p_uncle = p_item->p_up->p_left;
+ }
+ p_uncle->color = p_item->p_up->color;
+ p_item->p_up->color = CL_MAP_BLACK;
+ p_uncle->p_left->color = CL_MAP_BLACK;
+ __cl_map_rot_right(p_map, p_item->p_up);
+ break;
+ }
+ }
+ p_item->color = CL_MAP_BLACK;
+}
+
+void cl_qmap_remove_item(cl_qmap_t * const p_map,
+ cl_map_item_t * const p_item)
+{
+ cl_map_item_t *p_child, *p_del_item;
+
+ assert(p_map);
+ assert(p_item);
+
+ if (p_item == cl_qmap_end(p_map))
+ return;
+
+ if ((p_item->p_right == &p_map->nil) || (p_item->p_left == &p_map->nil)) {
+ /* The item being removed has children on at most one side. */
+ p_del_item = p_item;
+ } else {
+ /*
+ * The item being removed has children on both sides.
+ * We select the item that will replace it. After removing
+ * the substitute item and rebalancing, the tree will have the
+ * correct topology. Exchanging the substitute for the item
+ * will finalize the removal.
+ */
+ p_del_item = cl_qmap_next(p_item);
+ assert(p_del_item != &p_map->nil);
+ }
+
+ /* Remove the item from the list. */
+ __cl_primitive_remove(&p_item->pool_item.list_item);
+ /* Decrement the item count. */
+ p_map->count--;
+
+ /* Get the pointer to the new root's child, if any. */
+ if (p_del_item->p_left != &p_map->nil)
+ p_child = p_del_item->p_left;
+ else
+ p_child = p_del_item->p_right;
+
+ /*
+ * This assignment may modify the parent pointer of the nil node.
+ * This is inconsequential.
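+ * (When p_child is the nil node, __cl_map_del_bal below still relies
+ * on this p_up assignment to walk back up the tree.)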
+ */ + p_child->p_up = p_del_item->p_up; + (*__cl_map_get_parent_ptr_to_item(p_del_item)) = p_child; + + if (p_del_item->color != CL_MAP_RED) + __cl_map_del_bal(p_map, p_child); + + /* + * Note that the splicing done below does not need to occur before + * the tree is balanced, since the actual topology changes are made by the + * preceding code. The topology is preserved by the color assignment made + * below (reader should be reminded that p_del_item == p_item in some cases). + */ + if (p_del_item != p_item) { + /* + * Finalize the removal of the specified item by exchanging it with + * the substitute which we removed above. + */ + p_del_item->p_up = p_item->p_up; + p_del_item->p_left = p_item->p_left; + p_del_item->p_right = p_item->p_right; + (*__cl_map_get_parent_ptr_to_item(p_item)) = p_del_item; + p_item->p_right->p_up = p_del_item; + p_item->p_left->p_up = p_del_item; + p_del_item->color = p_item->color; + } + + assert(p_map->nil.color != CL_MAP_RED); + +#ifdef _DEBUG_ + /* Clear the pointer to the map since the item has been removed. */ + p_item->p_map = NULL; +#endif +} + +cl_map_item_t *cl_qmap_remove(cl_qmap_t * const p_map, const uint64_t key) +{ + cl_map_item_t *p_item; + + assert(p_map); + + /* Seek the node with the specified key */ + p_item = cl_qmap_get(p_map, key); + + cl_qmap_remove_item(p_map, p_item); + + return (p_item); +} + +void cl_qmap_merge(cl_qmap_t * const p_dest_map, + cl_qmap_t * const p_src_map) +{ + cl_map_item_t *p_item, *p_item2, *p_next; + + assert(p_dest_map); + assert(p_src_map); + + p_item = cl_qmap_head(p_src_map); + + while (p_item != cl_qmap_end(p_src_map)) { + p_next = cl_qmap_next(p_item); + + /* Remove the item from its current map. */ + cl_qmap_remove_item(p_src_map, p_item); + /* Insert the item into the destination map. */ + p_item2 = + cl_qmap_insert(p_dest_map, cl_qmap_key(p_item), p_item); + /* Check that the item was successfully inserted. */ + if (p_item2 != p_item) { + /* Put the item in back in the source map. */ + p_item2 = + cl_qmap_insert(p_src_map, cl_qmap_key(p_item), + p_item); + assert(p_item2 == p_item); + } + p_item = p_next; + } +} + +static void __cl_qmap_delta_move(cl_qmap_t * const p_dest, + cl_qmap_t * const p_src, + cl_map_item_t ** const pp_item) +{ + cl_map_item_t __attribute__((__unused__)) *p_temp; + cl_map_item_t *p_next; + + /* + * Get the next item so that we can ensure that pp_item points to + * a valid item upon return from the function. + */ + p_next = cl_qmap_next(*pp_item); + /* Move the old item from its current map the the old map. */ + cl_qmap_remove_item(p_src, *pp_item); + p_temp = cl_qmap_insert(p_dest, cl_qmap_key(*pp_item), *pp_item); + /* We should never have duplicates. */ + assert(p_temp == *pp_item); + /* Point pp_item to a valid item in the source map. */ + (*pp_item) = p_next; +} + +void cl_qmap_delta(cl_qmap_t * const p_map1, + cl_qmap_t * const p_map2, + cl_qmap_t * const p_new, cl_qmap_t * const p_old) +{ + cl_map_item_t *p_item1, *p_item2; + uint64_t key1, key2; + + assert(p_map1); + assert(p_map2); + assert(p_new); + assert(p_old); + assert(cl_is_qmap_empty(p_new)); + assert(cl_is_qmap_empty(p_old)); + + p_item1 = cl_qmap_head(p_map1); + p_item2 = cl_qmap_head(p_map2); + + while (p_item1 != cl_qmap_end(p_map1) && p_item2 != cl_qmap_end(p_map2)) { + key1 = cl_qmap_key(p_item1); + key2 = cl_qmap_key(p_item2); + if (key1 < key2) { + /* We found an old item. */ + __cl_qmap_delta_move(p_old, p_map1, &p_item1); + } else if (key1 > key2) { + /* We found a new item. 
*/ + __cl_qmap_delta_move(p_new, p_map2, &p_item2); + } else { + /* Move both forward since they have the same key. */ + p_item1 = cl_qmap_next(p_item1); + p_item2 = cl_qmap_next(p_item2); + } + } + + /* Process the remainder if the end of either source map was reached. */ + while (p_item2 != cl_qmap_end(p_map2)) + __cl_qmap_delta_move(p_new, p_map2, &p_item2); + + while (p_item1 != cl_qmap_end(p_map1)) + __cl_qmap_delta_move(p_old, p_map1, &p_item1); +} diff --git a/util/cl_qmap.h b/util/cl_qmap.h new file mode 100644 index 0000000..1a800f2 --- /dev/null +++ b/util/cl_qmap.h @@ -0,0 +1,970 @@ +/* + * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved. + * Copyright (c) 2002-2005 Mellanox Technologies LTD. All rights reserved. + * Copyright (c) 1996-2003 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +/* + * Abstract: + * Declaration of quick map, a binary tree where the caller always provides + * all necessary storage. + */ + +#ifndef _CL_QMAP_H_ +#define _CL_QMAP_H_ + +#include <stdbool.h> +#include <assert.h> +#include <inttypes.h> +#include <stdio.h> + +typedef struct _cl_list_item { + struct _cl_list_item *p_next; + struct _cl_list_item *p_prev; +} cl_list_item_t; + +typedef struct _cl_pool_item { + cl_list_item_t list_item; +} cl_pool_item_t; + +/****h* Component Library/Quick Map +* NAME +* Quick Map +* +* DESCRIPTION +* Quick map implements a binary tree that stores user provided cl_map_item_t +* structures. Each item stored in a quick map has a unique 64-bit key +* (duplicates are not allowed). Quick map provides the ability to +* efficiently search for an item given a key. +* +* Quick map does not allocate any memory, and can therefore not fail +* any operations due to insufficient memory. Quick map can thus be useful +* in minimizing the error paths in code. +* +* Quick map is not thread safe, and users must provide serialization when +* adding and removing items from the map. +* +* The quick map functions operate on a cl_qmap_t structure which should be +* treated as opaque and should be manipulated only through the provided +* functions. 
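+*
+* A minimal usage sketch (illustrative only, not part of this header;
+* struct my_obj and its payload field are hypothetical). The map stores
+* caller-provided items, so a user structure embeds a cl_map_item_t and
+* is recovered by casting, keeping the item as the first member:
+*
+*	struct my_obj {
+*		cl_map_item_t item;	/* must be first to allow the cast */
+*		int payload;
+*	};
+*
+*	cl_qmap_t map;
+*	struct my_obj obj;
+*
+*	cl_qmap_init(&map);
+*	cl_qmap_insert(&map, 42, &obj.item);
+*	cl_map_item_t *p = cl_qmap_get(&map, 42);
+*	if (p != cl_qmap_end(&map))
+*		((struct my_obj *)p)->payload = 1;
+*
+*	/* Items are kept sorted, so ordered traversal is a list walk: */
+*	for (p = cl_qmap_head(&map); p != cl_qmap_end(&map);
+*	     p = cl_qmap_next(p))
+*		printf("key %" PRIu64 "\n", cl_qmap_key(p));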
+*
+* SEE ALSO
+* Structures:
+* cl_qmap_t, cl_map_item_t, cl_map_obj_t
+*
+* Callbacks:
+* cl_pfn_qmap_apply_t
+*
+* Item Manipulation:
+* cl_qmap_set_obj, cl_qmap_obj, cl_qmap_key
+*
+* Initialization:
+* cl_qmap_init
+*
+* Iteration:
+* cl_qmap_end, cl_qmap_head, cl_qmap_tail, cl_qmap_next, cl_qmap_prev
+*
+* Manipulation:
+* cl_qmap_insert, cl_qmap_get, cl_qmap_remove_item, cl_qmap_remove,
+* cl_qmap_remove_all, cl_qmap_merge, cl_qmap_delta, cl_qmap_get_next
+*
+* Search:
+* cl_qmap_apply_func
+*
+* Attributes:
+* cl_qmap_count, cl_is_qmap_empty,
+*********/
+/****i* Component Library: Quick Map/cl_map_color_t
+* NAME
+* cl_map_color_t
+*
+* DESCRIPTION
+* The cl_map_color_t enumerated type is used to note the color of
+* nodes in a map.
+*
+* SYNOPSIS
+*/
+typedef enum _cl_map_color {
+	CL_MAP_RED,
+	CL_MAP_BLACK
+} cl_map_color_t;
+/*
+* VALUES
+* CL_MAP_RED
+* The node in the map is red.
+*
+* CL_MAP_BLACK
+* The node in the map is black.
+*
+* SEE ALSO
+* Quick Map, cl_map_item_t
+*********/
+
+/****s* Component Library: Quick Map/cl_map_item_t
+* NAME
+* cl_map_item_t
+*
+* DESCRIPTION
+* The cl_map_item_t structure is used by maps to store objects.
+*
+* The cl_map_item_t structure should be treated as opaque and should
+* be manipulated only through the provided functions.
+*
+* SYNOPSIS
+*/
+typedef struct _cl_map_item {
+	/* Must be first to allow casting. */
+	cl_pool_item_t pool_item;
+	struct _cl_map_item *p_left;
+	struct _cl_map_item *p_right;
+	struct _cl_map_item *p_up;
+	cl_map_color_t color;
+	uint64_t key;
+#ifdef _DEBUG_
+	struct _cl_qmap *p_map;
+#endif
+} cl_map_item_t;
+/*
+* FIELDS
+* pool_item
+* Used to store the item in a doubly linked list, allowing more
+* efficient map traversal.
+*
+* p_left
+* Pointer to the map item that is a child to the left of the node.
+*
+* p_right
+* Pointer to the map item that is a child to the right of the node.
+*
+* p_up
+* Pointer to the map item that is the parent of the node.
+*
+* color
+* Indicates whether a node is red or black in the map.
+*
+* key
+* Value that uniquely represents a node in a map. This value is
+* set by calling cl_qmap_insert and can be retrieved by calling
+* cl_qmap_key.
+*
+* NOTES
+* None of the fields of this structure should be manipulated by users, as
+* they are critical to the proper operation of the map in which they
+* are stored.
+*
+* To allow storing items in either a quick list, a quick pool, or a quick
+* map, the map implementation guarantees that the map item can be safely
+* cast to a pool item used for storing an object in a quick pool, or cast
+* to a list item used for storing an object in a quick list. This removes
+* the need to embed a map item, a list item, and a pool item in objects
+* that need to be stored in a quick list, a quick pool, and a quick map.
+*
+* SEE ALSO
+* Quick Map, cl_qmap_insert, cl_qmap_key, cl_pool_item_t, cl_list_item_t
+*********/
+
+/****s* Component Library: Quick Map/cl_map_obj_t
+* NAME
+* cl_map_obj_t
+*
+* DESCRIPTION
+* The cl_map_obj_t structure is used to store objects in maps.
+*
+* The cl_map_obj_t structure should be treated as opaque and should
+* be manipulated only through the provided functions.
+*
+* SYNOPSIS
+*/
+typedef struct _cl_map_obj {
+	cl_map_item_t item;
+	const void *p_object;
+} cl_map_obj_t;
+/*
+* FIELDS
+* item
+* Map item used internally by the map to store an object.
+*
+* p_object
+* User defined context. Users should not access this field directly.
+* Use cl_qmap_set_obj and cl_qmap_obj to set and retrieve the value
+* of this field.
+*
+* NOTES
+* None of the fields of this structure should be manipulated by users, as
+* they are critical to the proper operation of the map in which they
+* are stored.
+*
+* Use cl_qmap_set_obj and cl_qmap_obj to set and retrieve the object
+* stored in a map item, respectively.
+*
+* SEE ALSO
+* Quick Map, cl_qmap_set_obj, cl_qmap_obj, cl_map_item_t
+*********/
+
+/****s* Component Library: Quick Map/cl_qmap_t
+* NAME
+* cl_qmap_t
+*
+* DESCRIPTION
+* Quick map structure.
+*
+* The cl_qmap_t structure should be treated as opaque and should
+* be manipulated only through the provided functions.
+*
+* SYNOPSIS
+*/
+typedef struct _cl_qmap {
+	cl_map_item_t root;
+	cl_map_item_t nil;
+	size_t count;
+} cl_qmap_t;
+/*
+* PARAMETERS
+* root
+* Map item that serves as root of the map. The root is set up to
+* always have itself as parent. The left pointer is set to point
+* to the item at the root.
+*
+* nil
+* Map item that serves as terminator for all leaves, as well as
+* providing the list item used as quick list for storing map items
+* in a list for faster traversal.
+*
+* count
+* Number of items in the map.
+*
+* SEE ALSO
+* Quick Map
+*********/
+
+/****d* Component Library: Quick Map/cl_pfn_qmap_apply_t
+* NAME
+* cl_pfn_qmap_apply_t
+*
+* DESCRIPTION
+* The cl_pfn_qmap_apply_t function type defines the prototype for
+* functions used to iterate items in a quick map.
+*
+* SYNOPSIS
+*/
+typedef void
+ (*cl_pfn_qmap_apply_t) (cl_map_item_t * const p_map_item, void *context);
+/*
+* PARAMETERS
+* p_map_item
+* [in] Pointer to a cl_map_item_t structure.
+*
+* context
+* [in] Value passed to the callback function.
+*
+* RETURN VALUE
+* This function does not return a value.
+*
+* NOTES
+* This function type is provided as function prototype reference for the
+* function provided by users as a parameter to the cl_qmap_apply_func
+* function.
+*
+* SEE ALSO
+* Quick Map, cl_qmap_apply_func
+*********/
+
+/****f* Component Library: Quick Map/cl_qmap_count
+* NAME
+* cl_qmap_count
+*
+* DESCRIPTION
+* The cl_qmap_count function returns the number of items stored
+* in a quick map.
+*
+* SYNOPSIS
+*/
+static inline uint32_t cl_qmap_count(const cl_qmap_t * const p_map)
+{
+	assert(p_map);
+	return ((uint32_t) p_map->count);
+}
+
+/*
+* PARAMETERS
+* p_map
+* [in] Pointer to a cl_qmap_t structure whose item count to return.
+*
+* RETURN VALUE
+* Returns the number of items stored in the map.
+*
+* SEE ALSO
+* Quick Map, cl_is_qmap_empty
+*********/
+
+/****f* Component Library: Quick Map/cl_is_qmap_empty
+* NAME
+* cl_is_qmap_empty
+*
+* DESCRIPTION
+* The cl_is_qmap_empty function returns whether a quick map is empty.
+*
+* SYNOPSIS
+*/
+static inline bool cl_is_qmap_empty(const cl_qmap_t * const p_map)
+{
+	assert(p_map);
+
+	return (p_map->count == 0);
+}
+
+/*
+* PARAMETERS
+* p_map
+* [in] Pointer to a cl_qmap_t structure to test for emptiness.
+*
+* RETURN VALUES
+* TRUE if the quick map is empty.
+*
+* FALSE otherwise.
+*
+* SEE ALSO
+* Quick Map, cl_qmap_count, cl_qmap_remove_all
+*********/
+
+/****f* Component Library: Quick Map/cl_qmap_set_obj
+* NAME
+* cl_qmap_set_obj
+*
+* DESCRIPTION
+* The cl_qmap_set_obj function sets the object stored in a map object.
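+* For example (sketch), a map of heap-allocated objects can wrap each one
+* in a cl_map_obj_t, record the pointer with cl_qmap_set_obj, and fetch it
+* back later with cl_qmap_obj.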
+*
+* SYNOPSIS
+*/
+static inline void
+cl_qmap_set_obj(cl_map_obj_t * const p_map_obj,
+		const void *const p_object)
+{
+	assert(p_map_obj);
+	p_map_obj->p_object = p_object;
+}
+
+/*
+* PARAMETERS
+* p_map_obj
+* [in] Pointer to a map object structure whose object pointer
+* is to be set.
+*
+* p_object
+* [in] User defined context.
+*
+* RETURN VALUE
+* This function does not return a value.
+*
+* SEE ALSO
+* Quick Map, cl_qmap_obj
+*********/
+
+/****f* Component Library: Quick Map/cl_qmap_obj
+* NAME
+* cl_qmap_obj
+*
+* DESCRIPTION
+* The cl_qmap_obj function returns the object stored in a map object.
+*
+* SYNOPSIS
+*/
+static inline void *cl_qmap_obj(const cl_map_obj_t * const p_map_obj)
+{
+	assert(p_map_obj);
+	return ((void *)p_map_obj->p_object);
+}
+
+/*
+* PARAMETERS
+* p_map_obj
+* [in] Pointer to a map object structure whose object pointer to return.
+*
+* RETURN VALUE
+* Returns the value of the object pointer stored in the map object.
+*
+* SEE ALSO
+* Quick Map, cl_qmap_set_obj
+*********/
+
+/****f* Component Library: Quick Map/cl_qmap_key
+* NAME
+* cl_qmap_key
+*
+* DESCRIPTION
+* The cl_qmap_key function retrieves the key value of a map item.
+*
+* SYNOPSIS
+*/
+static inline uint64_t cl_qmap_key(const cl_map_item_t * const p_item)
+{
+	assert(p_item);
+	return (p_item->key);
+}
+
+/*
+* PARAMETERS
+* p_item
+* [in] Pointer to a map item whose key value to return.
+*
+* RETURN VALUE
+* Returns the 64-bit key value for the specified map item.
+*
+* NOTES
+* The key value is set in a call to cl_qmap_insert.
+*
+* SEE ALSO
+* Quick Map, cl_qmap_insert
+*********/
+
+/****f* Component Library: Quick Map/cl_qmap_init
+* NAME
+* cl_qmap_init
+*
+* DESCRIPTION
+* The cl_qmap_init function initializes a quick map for use.
+*
+* SYNOPSIS
+*/
+void cl_qmap_init(cl_qmap_t * const p_map);
+/*
+* PARAMETERS
+* p_map
+* [in] Pointer to a cl_qmap_t structure to initialize.
+*
+* RETURN VALUES
+* This function does not return a value.
+*
+* NOTES
+* Allows calling quick map manipulation functions.
+*
+* SEE ALSO
+* Quick Map, cl_qmap_insert, cl_qmap_remove
+*********/
+
+/****f* Component Library: Quick Map/cl_qmap_end
+* NAME
+* cl_qmap_end
+*
+* DESCRIPTION
+* The cl_qmap_end function returns the end of a quick map.
+*
+* SYNOPSIS
+*/
+static inline const cl_map_item_t *cl_qmap_end(const cl_qmap_t * const p_map)
+{
+	assert(p_map);
+	/* Nil is the end of the map. */
+	return (&p_map->nil);
+}
+
+/*
+* PARAMETERS
+* p_map
+* [in] Pointer to a cl_qmap_t structure whose end to return.
+*
+* RETURN VALUE
+* Pointer to the end of the map.
+*
+* NOTES
+* cl_qmap_end is useful for determining the validity of map items returned
+* by cl_qmap_head, cl_qmap_tail, cl_qmap_next, or cl_qmap_prev. If the
+* map item pointer returned by any of these functions compares to the end,
+* the end of the map was encountered.
+* When using cl_qmap_head or cl_qmap_tail, this condition indicates that
+* the map is empty.
+*
+* SEE ALSO
+* Quick Map, cl_qmap_head, cl_qmap_tail, cl_qmap_next, cl_qmap_prev
+*********/
+
+/****f* Component Library: Quick Map/cl_qmap_head
+* NAME
+* cl_qmap_head
+*
+* DESCRIPTION
+* The cl_qmap_head function returns the map item with the lowest key
+* value stored in a quick map.
+*
+* SYNOPSIS
+*/
+static inline cl_map_item_t *cl_qmap_head(const cl_qmap_t * const p_map)
+{
+	assert(p_map);
+	return ((cl_map_item_t *) p_map->nil.pool_item.list_item.p_next);
+}
+
+/*
+* PARAMETERS
+* p_map
+* [in] Pointer to a cl_qmap_t structure whose item with the lowest
+* key is returned.
+*
+* RETURN VALUES
+* Pointer to the map item with the lowest key in the quick map.
+*
+* Pointer to the map end if the quick map was empty.
+*
+* NOTES
+* cl_qmap_head does not remove the item from the map.
+*
+* SEE ALSO
+* Quick Map, cl_qmap_tail, cl_qmap_next, cl_qmap_prev, cl_qmap_end,
+* cl_map_item_t
+*********/
+
+/****f* Component Library: Quick Map/cl_qmap_tail
+* NAME
+* cl_qmap_tail
+*
+* DESCRIPTION
+* The cl_qmap_tail function returns the map item with the highest key
+* value stored in a quick map.
+*
+* SYNOPSIS
+*/
+static inline cl_map_item_t *cl_qmap_tail(const cl_qmap_t * const p_map)
+{
+	assert(p_map);
+	return ((cl_map_item_t *) p_map->nil.pool_item.list_item.p_prev);
+}
+
+/*
+* PARAMETERS
+* p_map
+* [in] Pointer to a cl_qmap_t structure whose item with the
+* highest key is returned.
+*
+* RETURN VALUES
+* Pointer to the map item with the highest key in the quick map.
+*
+* Pointer to the map end if the quick map was empty.
+*
+* NOTES
+* cl_qmap_tail does not remove the item from the map.
+*
+* SEE ALSO
+* Quick Map, cl_qmap_head, cl_qmap_next, cl_qmap_prev, cl_qmap_end,
+* cl_map_item_t
+*********/
+
+/****f* Component Library: Quick Map/cl_qmap_next
+* NAME
+* cl_qmap_next
+*
+* DESCRIPTION
+* The cl_qmap_next function returns the map item with the next higher
+* key value than a specified map item.
+*
+* SYNOPSIS
+*/
+static inline cl_map_item_t *cl_qmap_next(const cl_map_item_t * const p_item)
+{
+	assert(p_item);
+	return ((cl_map_item_t *) p_item->pool_item.list_item.p_next);
+}
+
+/*
+* PARAMETERS
+* p_item
+* [in] Pointer to a map item whose successor to return.
+*
+* RETURN VALUES
+* Pointer to the map item with the next higher key value in a quick map.
+*
+* Pointer to the map end if the specified item was the last item in
+* the quick map.
+*
+* SEE ALSO
+* Quick Map, cl_qmap_head, cl_qmap_tail, cl_qmap_prev, cl_qmap_end,
+* cl_map_item_t
+*********/
+
+/****f* Component Library: Quick Map/cl_qmap_prev
+* NAME
+* cl_qmap_prev
+*
+* DESCRIPTION
+* The cl_qmap_prev function returns the map item with the next lower
+* key value than a specified map item.
+*
+* SYNOPSIS
+*/
+static inline cl_map_item_t *cl_qmap_prev(const cl_map_item_t * const p_item)
+{
+	assert(p_item);
+	return ((cl_map_item_t *) p_item->pool_item.list_item.p_prev);
+}
+
+/*
+* PARAMETERS
+* p_item
+* [in] Pointer to a map item whose predecessor to return.
+*
+* RETURN VALUES
+* Pointer to the map item with the next lower key value in a quick map.
+*
+* Pointer to the map end if the specified item was the first item in
+* the quick map.
+*
+* SEE ALSO
+* Quick Map, cl_qmap_head, cl_qmap_tail, cl_qmap_next, cl_qmap_end,
+* cl_map_item_t
+*********/
+
+/****f* Component Library: Quick Map/cl_qmap_insert
+* NAME
+* cl_qmap_insert
+*
+* DESCRIPTION
+* The cl_qmap_insert function inserts a map item into a quick map.
+* NOTE: Only if such a key does not already exist in the map !!!!
+*
+* SYNOPSIS
+*/
+cl_map_item_t *cl_qmap_insert(cl_qmap_t * const p_map,
+			      const uint64_t key,
+			      cl_map_item_t * const p_item);
+/*
+* PARAMETERS
+* p_map
+* [in] Pointer to a cl_qmap_t structure into which to add the item.
+*
+* key
+* [in] Value to assign to the item.
+*
+* p_item
+* [in] Pointer to a cl_map_item_t structure to insert into the quick map.
+*
+* RETURN VALUE
+* Pointer to the item in the map with the specified key. If insertion
+* was successful, this is the pointer to the item.
If an item with the +* specified key already exists in the map, the pointer to that item is +* returned - but the new key is NOT inserted... +* +* NOTES +* Insertion operations may cause the quick map to rebalance. +* +* SEE ALSO +* Quick Map, cl_qmap_remove, cl_map_item_t +*********/ + +/****f* Component Library: Quick Map/cl_qmap_get +* NAME +* cl_qmap_get +* +* DESCRIPTION +* The cl_qmap_get function returns the map item associated with a key. +* +* SYNOPSIS +*/ +cl_map_item_t *cl_qmap_get(const cl_qmap_t * const p_map, + const uint64_t key); +/* +* PARAMETERS +* p_map +* [in] Pointer to a cl_qmap_t structure from which to retrieve the +* item with the specified key. +* +* key +* [in] Key value used to search for the desired map item. +* +* RETURN VALUES +* Pointer to the map item with the desired key value. +* +* Pointer to the map end if there was no item with the desired key value +* stored in the quick map. +* +* NOTES +* cl_qmap_get does not remove the item from the quick map. +* +* SEE ALSO +* Quick Map, cl_qmap_get_next, cl_qmap_remove +*********/ + +/****f* Component Library: Quick Map/cl_qmap_get_next +* NAME +* cl_qmap_get_next +* +* DESCRIPTION +* The cl_qmap_get_next function returns the first map item associated with a +* key > the key specified. +* +* SYNOPSIS +*/ +cl_map_item_t *cl_qmap_get_next(const cl_qmap_t * const p_map, + const uint64_t key); +/* +* PARAMETERS +* p_map +* [in] Pointer to a cl_qmap_t structure from which to retrieve the +* first item with a key > the specified key. +* +* key +* [in] Key value used to search for the desired map item. +* +* RETURN VALUES +* Pointer to the first map item with a key > the desired key value. +* +* Pointer to the map end if there was no item with a key > the desired key +* value stored in the quick map. +* +* NOTES +* cl_qmap_get_next does not remove the item from the quick map. +* +* SEE ALSO +* Quick Map, cl_qmap_get, cl_qmap_remove +*********/ + +/****f* Component Library: Quick Map/cl_qmap_remove_item +* NAME +* cl_qmap_remove_item +* +* DESCRIPTION +* The cl_qmap_remove_item function removes the specified map item +* from a quick map. +* +* SYNOPSIS +*/ +void +cl_qmap_remove_item(cl_qmap_t * const p_map, + cl_map_item_t * const p_item); +/* +* PARAMETERS +* p_map +* [in] Pointer to a cl_qmap_t structure from which to +* remove item. +* +* p_item +* [in] Pointer to a map item to remove from its quick map. +* +* RETURN VALUES +* This function does not return a value. +* +* In a debug build, cl_qmap_remove_item asserts that the item being removed +* is in the specified map. +* +* NOTES +* Removes the map item pointed to by p_item from its quick map. +* +* SEE ALSO +* Quick Map, cl_qmap_remove, cl_qmap_remove_all, cl_qmap_insert +*********/ + +/****f* Component Library: Quick Map/cl_qmap_remove +* NAME +* cl_qmap_remove +* +* DESCRIPTION +* The cl_qmap_remove function removes the map item with the specified key +* from a quick map. +* +* SYNOPSIS +*/ +cl_map_item_t *cl_qmap_remove(cl_qmap_t * const p_map, + const uint64_t key); +/* +* PARAMETERS +* p_map +* [in] Pointer to a cl_qmap_t structure from which to remove the item +* with the specified key. +* +* key +* [in] Key value used to search for the map item to remove. +* +* RETURN VALUES +* Pointer to the removed map item if it was found. +* +* Pointer to the map end if no item with the specified key exists in the +* quick map. 
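+*
+* NOTES
+* A remove-if-present idiom (illustrative; struct my_obj is the
+* hypothetical wrapper from the usage sketch in the overview above):
+*
+*	cl_map_item_t *p = cl_qmap_remove(&map, key);
+*	if (p != cl_qmap_end(&map))
+*		free_obj((struct my_obj *)p);	/* free_obj is hypothetical */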
+*
+* SEE ALSO
+*	Quick Map, cl_qmap_remove_item, cl_qmap_remove_all, cl_qmap_insert
+*********/
+
+/****f* Component Library: Quick Map/cl_qmap_remove_all
+* NAME
+*	cl_qmap_remove_all
+*
+* DESCRIPTION
+*	The cl_qmap_remove_all function removes all items in a quick map,
+*	leaving it empty.
+*
+* SYNOPSIS
+*/
+static inline void cl_qmap_remove_all(cl_qmap_t * const p_map)
+{
+	assert(p_map);
+
+	p_map->root.p_left = &p_map->nil;
+	p_map->nil.pool_item.list_item.p_next = &p_map->nil.pool_item.list_item;
+	p_map->nil.pool_item.list_item.p_prev = &p_map->nil.pool_item.list_item;
+	p_map->count = 0;
+}
+
+/*
+* PARAMETERS
+*	p_map
+*		[in] Pointer to a cl_qmap_t structure to empty.
+*
+* RETURN VALUES
+*	This function does not return a value.
+*
+* SEE ALSO
+*	Quick Map, cl_qmap_remove, cl_qmap_remove_item
+*********/
+
+/****f* Component Library: Quick Map/cl_qmap_merge
+* NAME
+*	cl_qmap_merge
+*
+* DESCRIPTION
+*	The cl_qmap_merge function moves all items from one map to another,
+*	excluding duplicates.
+*
+* SYNOPSIS
+*/
+void
+cl_qmap_merge(cl_qmap_t * const p_dest_map,
+	      cl_qmap_t * const p_src_map);
+/*
+* PARAMETERS
+*	p_dest_map
+*		[out] Pointer to a cl_qmap_t structure to which items should be added.
+*
+*	p_src_map
+*		[in/out] Pointer to a cl_qmap_t structure whose items to add
+*		to p_dest_map.
+*
+* RETURN VALUES
+*	This function does not return a value.
+*
+* NOTES
+*	Items are evaluated based on their keys only.
+*
+*	Upon return from cl_qmap_merge, the quick map referenced by p_src_map
+*	contains all duplicate items.
+*
+* SEE ALSO
+*	Quick Map, cl_qmap_delta
+*********/
+
+/****f* Component Library: Quick Map/cl_qmap_delta
+* NAME
+*	cl_qmap_delta
+*
+* DESCRIPTION
+*	The cl_qmap_delta function computes the differences between two maps.
+*
+* SYNOPSIS
+*/
+void
+cl_qmap_delta(cl_qmap_t * const p_map1,
+	      cl_qmap_t * const p_map2,
+	      cl_qmap_t * const p_new, cl_qmap_t * const p_old);
+/*
+* PARAMETERS
+*	p_map1
+*		[in/out] Pointer to the first of two cl_qmap_t structures whose
+*		differences to compute.
+*
+*	p_map2
+*		[in/out] Pointer to the second of two cl_qmap_t structures whose
+*		differences to compute.
+*
+*	p_new
+*		[out] Pointer to an empty cl_qmap_t structure that contains the
+*		items unique to p_map2 upon return from the function.
+*
+*	p_old
+*		[out] Pointer to an empty cl_qmap_t structure that contains the
+*		items unique to p_map1 upon return from the function.
+*
+* RETURN VALUES
+*	This function does not return a value.
+*
+* NOTES
+*	Items are evaluated based on their keys. Items that exist in both
+*	p_map1 and p_map2 remain in their respective maps. Items that
+*	exist only in p_map1 are moved to p_old. Likewise, items that exist
+*	only in p_map2 are moved to p_new. This function can be useful in
+*	evaluating changes between two maps.
+*
+*	Both maps pointed to by p_new and p_old must be empty on input. This
+*	requirement removes the possibility of failures.
+*
+* SEE ALSO
+*	Quick Map, cl_qmap_merge
+*********/
+
+/****f* Component Library: Quick Map/cl_qmap_apply_func
+* NAME
+*	cl_qmap_apply_func
+*
+* DESCRIPTION
+*	The cl_qmap_apply_func function executes a specified function
+*	for every item stored in a quick map.
+*
+* SYNOPSIS
+*/
+void
+cl_qmap_apply_func(const cl_qmap_t * const p_map,
+		   cl_pfn_qmap_apply_t pfn_func,
+		   const void *const context);
+/*
+* PARAMETERS
+*	p_map
+*		[in] Pointer to a cl_qmap_t structure.
+*
+*	pfn_func
+*		[in] Function invoked for every item in the quick map.
+* See the cl_pfn_qmap_apply_t function type declaration for +* details about the callback function. +* +* context +* [in] Value to pass to the callback functions to provide context. +* +* RETURN VALUE +* This function does not return a value. +* +* NOTES +* The function provided must not perform any map operations, as these +* would corrupt the quick map. +* +* SEE ALSO +* Quick Map, cl_pfn_qmap_apply_t +*********/ + +#endif /* _CL_QMAP_H_ */ diff --git a/util/compiler.h b/util/compiler.h new file mode 100644 index 0000000..dfce82f --- /dev/null +++ b/util/compiler.h @@ -0,0 +1,54 @@ +/* GPLv2 or OpenIB.org BSD (MIT) See COPYING file */ +#ifndef UTIL_COMPILER_H +#define UTIL_COMPILER_H + +/* Use to tag a variable that causes compiler warnings. Use as: + int uninitialized_var(sz) + + This is only enabled for old compilers. gcc 6.x and beyond have excellent + static flow analysis. If code solicits a warning from 6.x it is almost + certainly too complex for a human to understand. For some reason powerpc + uses a different scheme than gcc for flow analysis. +*/ +#if (__GNUC__ >= 6 && !defined(__powerpc__)) || defined(__clang__) +#define uninitialized_var(x) x +#else +#define uninitialized_var(x) x = x +#endif + +#ifndef likely +#ifdef __GNUC__ +#define likely(x) __builtin_expect(!!(x), 1) +#else +#define likely(x) (x) +#endif +#endif + +#ifndef unlikely +#ifdef __GNUC__ +#define unlikely(x) __builtin_expect(!!(x), 0) +#else +#define unlikely(x) (x) +#endif +#endif + +#ifdef HAVE_FUNC_ATTRIBUTE_ALWAYS_INLINE +#define ALWAYS_INLINE __attribute__((always_inline)) +#else +#define ALWAYS_INLINE +#endif + +/* Use to mark fall through on switch statements as desired. */ +#if __GNUC__ >= 7 +#define SWITCH_FALLTHROUGH __attribute__ ((fallthrough)) +#else +#define SWITCH_FALLTHROUGH +#endif + +#ifdef __CHECKER__ +# define __force __attribute__((force)) +#else +# define __force +#endif + +#endif diff --git a/util/mmio.c b/util/mmio.c new file mode 100644 index 0000000..b362a65 --- /dev/null +++ b/util/mmio.c @@ -0,0 +1,83 @@ +/* GPLv2 or OpenIB.org BSD (MIT) See COPYING file */ +#include <util/mmio.h> +#include <util/udma_barrier.h> +#include <config.h> + +#include <pthread.h> +#include <stdbool.h> + +#if SIZEOF_LONG != 8 + +static pthread_spinlock_t mmio_spinlock; + +static __attribute__((constructor)) void lock_constructor(void) +{ + pthread_spin_init(&mmio_spinlock, PTHREAD_PROCESS_PRIVATE); +} + +/* When the arch does not have a 64 bit store we provide an emulation that + does two stores in address ascending order while holding a global + spinlock. */ +static void pthread_mmio_write64_be(void *addr, __be64 val) +{ + __be32 first_dword = htobe32(be64toh(val) >> 32); + __be32 second_dword = htobe32(be64toh(val)); + + /* The WC spinlock, by definition, provides global ordering for all UC + and WC stores within the critical region. */ + mmio_wc_spinlock(&mmio_spinlock); + + mmio_write32_be(addr, first_dword); + mmio_write32_be(addr + 4, second_dword); + + mmio_wc_spinunlock(&mmio_spinlock); +} + +#if defined(__i386__) +#include <xmmintrin.h> +#include <cpuid.h> + +/* For ia32 we have historically emitted movlps SSE instructions to do the 64 + bit operations. 
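+
+   Unlike the spinlock fallback above, which issues two 4 byte stores, the
+   movlps store writes all 8 bytes with one instruction, so the device
+   observes a single 8 byte MemWr TLP.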
*/ +static void __attribute__((target("sse"))) +sse_mmio_write64_be(void *addr, __be64 val) +{ + __m128 tmp = {}; + tmp = _mm_loadl_pi(tmp, (__force __m64 *)&val); + _mm_storel_pi((__m64 *)addr,tmp); +} + +static bool have_sse(void) +{ + unsigned int ax,bx,cx,dx; + + if (!__get_cpuid(1,&ax,&bx,&cx,&dx)) + return false; + return dx & bit_SSE; +} + +#endif /* defined(__i386__) */ + +typedef void (*write64_fn_t)(void *, __be64); + +/* This uses the STT_GNU_IFUNC extension to have the dynamic linker select the + best above implementations at runtime. */ +#if HAVE_FUNC_ATTRIBUTE_IFUNC +void mmio_write64_be(void *addr, __be64 val) + __attribute__((ifunc("resolve_mmio_write64_be"))); +static write64_fn_t resolve_mmio_write64_be(void); +#else +__asm__(".type mmio_write64_be, %gnu_indirect_function"); +write64_fn_t resolve_mmio_write64_be(void) __asm__("mmio_write64_be"); +#endif + +write64_fn_t resolve_mmio_write64_be(void) +{ +#if defined(__i386__) + if (have_sse()) + return &sse_mmio_write64_be; +#endif + return &pthread_mmio_write64_be; +} + +#endif /* SIZEOF_LONG != 8 */ diff --git a/util/mmio.h b/util/mmio.h new file mode 100644 index 0000000..101af9d --- /dev/null +++ b/util/mmio.h @@ -0,0 +1,267 @@ +/* GPLv2 or OpenIB.org BSD (MIT) See COPYING file + + These accessors always map to PCI-E TLPs in predictable ways. Translation + to other buses should follow similar definitions. + + write32(mem, 1) + Produce a 4 byte MemWr TLP with bit 0 of DW byte offset 0 set + write32_be(mem, htobe32(1)) + Produce a 4 byte MemWr TLP with bit 0 of DW byte offset 3 set + write32_le(mem, htole32(1)) + Produce a 4 byte MemWr TLP with bit 0 of DW byte offset 0 set + + For ordering these accessors are similar to the Kernel's concept of + writel_relaxed(). When working with UC memory the following hold: + + 1) Strong ordering is required when talking to the same device (eg BAR), + and combining is not permitted: + + write32(mem, 1); + write32(mem + 4, 1); + write32(mem, 1); + + Must produce three TLPs, in order. + + 2) Ordering ignores all pthread locking: + + pthread_spin_lock(&lock); + write32(mem, global++); + pthread_spin_unlock(&lock); + + When run concurrently on all CPUs the device must observe all stores, + but the data value will not be strictly increasing. + + 3) Interaction with DMA is not ordered. Explicit use of a barrier from + udma_barriers is required: + + *dma_mem = 1; + udma_to_device_barrier(); + write32(mem, GO_DMA); + + 4) Access out of program order (eg speculation), either by the CPU or + compiler is not permitted: + + if (cond) + read32(); + + Must not issue a read TLP if cond is false. + + If these are used with WC memory then #1 and #4 do not apply, and all WC + accesses must be bracketed with mmio_wc_start() // mmio_flush_writes() +*/ + +#ifndef __UTIL_MMIO_H +#define __UTIL_MMIO_H + +#include <linux/types.h> +#include <stdatomic.h> +#include <stdint.h> +#include <stddef.h> +#include <endian.h> + +#include <config.h> +#include <util/compiler.h> + +/* The first step is to define the 'raw' accessors. To make this very safe + with sparse we define two versions of each, a le and a be - however the + code is always identical. +*/ +#ifdef __s390x__ +#include <unistd.h> +#include <sys/syscall.h> + +/* s390 requires a privileged instruction to access IO memory, these syscalls + perform that instruction using a memory buffer copy semantic. +*/ +static inline void s390_mmio_write(void *mmio_addr, const void *val, + size_t length) +{ + // FIXME: Check for error and call abort? 
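+	// The kernel copies 'length' bytes from the buffer at 'val' into
+	// the PCI MMIO space behind 'mmio_addr'.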
+	syscall(__NR_s390_pci_mmio_write, mmio_addr, val, length);
+}
+
+static inline void s390_mmio_read(const void *mmio_addr, void *val,
+				  size_t length)
+{
+	// FIXME: Check for error and call abort?
+	syscall(__NR_s390_pci_mmio_read, mmio_addr, val, length);
+}
+
+#define MAKE_WRITE(_NAME_, _SZ_) \
+	static inline void _NAME_##_be(void *addr, __be##_SZ_ value) \
+	{ \
+		s390_mmio_write(addr, &value, sizeof(value)); \
+	} \
+	static inline void _NAME_##_le(void *addr, __le##_SZ_ value) \
+	{ \
+		s390_mmio_write(addr, &value, sizeof(value)); \
+	}
+#define MAKE_READ(_NAME_, _SZ_) \
+	static inline __be##_SZ_ _NAME_##_be(const void *addr) \
+	{ \
+		__be##_SZ_ res; \
+		s390_mmio_read(addr, &res, sizeof(res)); \
+		return res; \
+	} \
+	static inline __le##_SZ_ _NAME_##_le(const void *addr) \
+	{ \
+		__le##_SZ_ res; \
+		s390_mmio_read(addr, &res, sizeof(res)); \
+		return res; \
+	}
+
+static inline void mmio_write8(void *addr, uint8_t value)
+{
+	s390_mmio_write(addr, &value, sizeof(value));
+}
+
+static inline uint8_t mmio_read8(const void *addr)
+{
+	uint8_t res;
+	s390_mmio_read(addr, &res, sizeof(res));
+	return res;
+}
+
+#else /* __s390x__ */
+
+#define MAKE_WRITE(_NAME_, _SZ_) \
+	static inline void _NAME_##_be(void *addr, __be##_SZ_ value) \
+	{ \
+		atomic_store_explicit((_Atomic(uint##_SZ_##_t) *)addr, \
+				      (__force uint##_SZ_##_t)value, \
+				      memory_order_relaxed); \
+	} \
+	static inline void _NAME_##_le(void *addr, __le##_SZ_ value) \
+	{ \
+		atomic_store_explicit((_Atomic(uint##_SZ_##_t) *)addr, \
+				      (__force uint##_SZ_##_t)value, \
+				      memory_order_relaxed); \
+	}
+#define MAKE_READ(_NAME_, _SZ_) \
+	static inline __be##_SZ_ _NAME_##_be(const void *addr) \
+	{ \
+		return (__force __be##_SZ_)atomic_load_explicit( \
+			(_Atomic(uint##_SZ_##_t) *)addr, memory_order_relaxed); \
+	} \
+	static inline __le##_SZ_ _NAME_##_le(const void *addr) \
+	{ \
+		return (__force __le##_SZ_)atomic_load_explicit( \
+			(_Atomic(uint##_SZ_##_t) *)addr, memory_order_relaxed); \
+	}
+
+static inline void mmio_write8(void *addr, uint8_t value)
+{
+	atomic_store_explicit((_Atomic(uint8_t) *)addr, value,
+			      memory_order_relaxed);
+}
+static inline uint8_t mmio_read8(const void *addr)
+{
+	return atomic_load_explicit((_Atomic(uint8_t) *)addr,
+				    memory_order_relaxed);
+}
+#endif /* __s390x__ */
+
+MAKE_WRITE(mmio_write16, 16)
+MAKE_WRITE(mmio_write32, 32)
+
+MAKE_READ(mmio_read16, 16)
+MAKE_READ(mmio_read32, 32)
+
+#if SIZEOF_LONG == 8
+MAKE_WRITE(mmio_write64, 64)
+MAKE_READ(mmio_read64, 64)
+#else
+void mmio_write64_be(void *addr, __be64 val);
+static inline void mmio_write64_le(void *addr, __le64 val)
+{
+	mmio_write64_be(addr, (__be64 __force)val);
+}
+
+/* There is no way to do read64 atomically; rather than provide some sketchy
+   implementation we leave these functions undefined. Users should not call
+   them if SIZEOF_LONG != 8, but should instead implement an appropriate
+   version.
+*/
+__be64 mmio_read64_be(const void *addr);
+__le64 mmio_read64_le(const void *addr);
+#endif /* SIZEOF_LONG == 8 */
+
+#undef MAKE_WRITE
+#undef MAKE_READ
+
+/* Now we can define the host endian versions of the operators; these just
+   add a call to htole.
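+
+   For example, the expansion of MAKE_WRITE(mmio_write32, 32) below yields:
+
+	static inline void mmio_write32(void *addr, uint32_t value)
+	{
+		mmio_write32_le(addr, htole32(value));
+	}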
+*/ +#define MAKE_WRITE(_NAME_, _SZ_) \ + static inline void _NAME_(void *addr, uint##_SZ_##_t value) \ + { \ + _NAME_##_le(addr, htole##_SZ_(value)); \ + } +#define MAKE_READ(_NAME_, _SZ_) \ + static inline uint##_SZ_##_t _NAME_(const void *addr) \ + { \ + return le##_SZ_##toh(_NAME_##_le(addr)); \ + } + +/* This strictly guarantees the order of TLP generation for the memory copy to + be in ascending address order. +*/ +#ifdef __s390x__ +static inline void mmio_memcpy_x64(void *dest, const void *src, size_t bytecnt) +{ + s390_mmio_write(dest, src, bytecnt); +} +#else + +/* Transfer is some multiple of 64 bytes */ +static inline void mmio_memcpy_x64(void *dest, const void *src, size_t bytecnt) +{ + uintptr_t *dst_p = dest; + + /* Caller must guarantee: + assert(bytecnt != 0); + assert((bytecnt % 64) == 0); + assert(((uintptr_t)dest) % __alignof__(*dst) == 0); + assert(((uintptr_t)src) % __alignof__(*dst) == 0); + */ + + /* Use the native word size for the copy */ + if (sizeof(*dst_p) == 8) { + const __be64 *src_p = src; + + do { + /* Do 64 bytes at a time */ + mmio_write64_be(dst_p++, *src_p++); + mmio_write64_be(dst_p++, *src_p++); + mmio_write64_be(dst_p++, *src_p++); + mmio_write64_be(dst_p++, *src_p++); + mmio_write64_be(dst_p++, *src_p++); + mmio_write64_be(dst_p++, *src_p++); + mmio_write64_be(dst_p++, *src_p++); + mmio_write64_be(dst_p++, *src_p++); + + bytecnt -= 8 * sizeof(*dst_p); + } while (bytecnt > 0); + } else if (sizeof(*dst_p) == 4) { + const __be32 *src_p = src; + + do { + mmio_write32_be(dst_p++, *src_p++); + mmio_write32_be(dst_p++, *src_p++); + bytecnt -= 2 * sizeof(*dst_p); + } while (bytecnt > 0); + } +} +#endif + +MAKE_WRITE(mmio_write16, 16) +MAKE_WRITE(mmio_write32, 32) +MAKE_WRITE(mmio_write64, 64) + +MAKE_READ(mmio_read16, 16) +MAKE_READ(mmio_read32, 32) +MAKE_READ(mmio_read64, 64) + +#undef MAKE_WRITE +#undef MAKE_READ + +#endif diff --git a/util/node_name_map.c b/util/node_name_map.c new file mode 100644 index 0000000..cd73bbc --- /dev/null +++ b/util/node_name_map.c @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2008 Voltaire, Inc. All rights reserved. + * Copyright (c) 2007 Lawrence Livermore National Lab + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include <config.h> + +#include <string.h> +#include <stdlib.h> +#include <sys/stat.h> +#include <unistd.h> +#include <ctype.h> +#include <errno.h> + +#include <ccan/minmax.h> + +#include <util/node_name_map.h> +#include <util/cl_qmap.h> + +#define PARSE_NODE_MAP_BUFLEN 256 + +typedef struct _name_map_item { + cl_map_item_t item; + uint64_t guid; + char *name; +} name_map_item_t; + +struct nn_map { + cl_qmap_t map; +}; + +static int map_name(void *cxt, uint64_t guid, char *p) +{ + cl_qmap_t *map = cxt; + name_map_item_t *item; + + p = strtok(p, "\"#"); + if (!p) + return 0; + + item = malloc(sizeof(*item)); + if (!item) + return -1; + item->guid = guid; + item->name = strdup(p); + cl_qmap_insert(map, item->guid, (cl_map_item_t *) item); + return 0; +} + +void close_node_name_map(nn_map_t * map) +{ + name_map_item_t *item = NULL; + + if (!map) + return; + + item = (name_map_item_t *) cl_qmap_head(&map->map); + while (item != (name_map_item_t *) cl_qmap_end(&map->map)) { + item = (name_map_item_t *) cl_qmap_remove(&map->map, item->guid); + free(item->name); + free(item); + item = (name_map_item_t *) cl_qmap_head(&map->map); + } + free(map); +} + +char *remap_node_name(nn_map_t * map, uint64_t target_guid, char *nodedesc) +{ + char *rc = NULL; + name_map_item_t *item = NULL; + + if (!map) + goto done; + + item = (name_map_item_t *) cl_qmap_get(&map->map, target_guid); + if (item != (name_map_item_t *) cl_qmap_end(&map->map)) + rc = strdup(item->name); + +done: + if (rc == NULL) + rc = strdup(clean_nodedesc(nodedesc)); + return (rc); +} + +char *clean_nodedesc(char *nodedesc) +{ + int i = 0; + + nodedesc[63] = '\0'; + while (nodedesc[i]) { + if (!isprint(nodedesc[i])) + nodedesc[i] = ' '; + i++; + } + + return (nodedesc); +} + +static int parse_node_map_wrap(const char *file_name, + int (*create) (void *, uint64_t, char *), + void *cxt, + char *linebuf, + unsigned int linebuflen) +{ + char line[PARSE_NODE_MAP_BUFLEN]; + FILE *f; + + if (!(f = fopen(file_name, "r"))) + return -1; + + while (fgets(line, sizeof(line), f)) { + uint64_t guid; + char *p, *e; + + p = line; + while (isspace(*p)) + p++; + if (*p == '\0' || *p == '\n' || *p == '#') + continue; + + guid = strtoull(p, &e, 0); + if (e == p || (!isspace(*e) && *e != '#' && *e != '\0')) { + fclose(f); + errno = EIO; + if (linebuf) { + memcpy(linebuf, line, + min_t(size_t, PARSE_NODE_MAP_BUFLEN, + linebuflen)); + e = strpbrk(linebuf, "\n"); + if (e) + *e = '\0'; + } + return -1; + } + + p = e; + while (isspace(*p)) + p++; + + e = strpbrk(p, "\n"); + if (e) + *e = '\0'; + + if (create(cxt, guid, p)) { + fclose(f); + return -1; + } + } + + fclose(f); + return 0; +} + +nn_map_t *open_node_name_map(const char *node_name_map) +{ + nn_map_t *map; + char linebuf[PARSE_NODE_MAP_BUFLEN + 1]; + + if (!node_name_map) { + struct stat buf; + node_name_map = IBDIAG_NODENAME_MAP_PATH; + if (stat(node_name_map, &buf)) + return NULL; + } + + map = malloc(sizeof(*map)); + if (!map) + return NULL; + cl_qmap_init(&map->map); + + memset(linebuf, '\0', PARSE_NODE_MAP_BUFLEN + 1); + if (parse_node_map_wrap(node_name_map, map_name, map, + linebuf, PARSE_NODE_MAP_BUFLEN)) { + if (errno == EIO) { + fprintf(stderr, + "WARNING failed to parse node name map " + "\"%s\"\n", + node_name_map); + fprintf(stderr, + "WARNING failed line: \"%s\"\n", + linebuf); + } + else + fprintf(stderr, + "WARNING failed to open node name map " + "\"%s\" (%s)\n", + node_name_map, strerror(errno)); + close_node_name_map(map); + return NULL; + } + + return map; +} diff --git 
a/util/node_name_map.h b/util/node_name_map.h
new file mode 100644
index 0000000..e78d274
--- /dev/null
+++ b/util/node_name_map.h
@@ -0,0 +1,19 @@
+/* Copyright (c) 2019 Mellanox Technologies. All rights reserved.
+ *
+ * Connect to opensm's cl_nodenamemap.h if it is available.
+ */
+#ifndef __LIBUTIL_NODE_NAME_MAP_H__
+#define __LIBUTIL_NODE_NAME_MAP_H__
+
+#include <stdint.h>
+
+struct nn_map;
+typedef struct nn_map nn_map_t;
+
+nn_map_t *open_node_name_map(const char *node_name_map);
+void close_node_name_map(nn_map_t *map);
+/* NOTE: parameter "nodedesc" may be modified here. */
+char *remap_node_name(nn_map_t *map, uint64_t target_guid, char *nodedesc);
+char *clean_nodedesc(char *nodedesc);
+
+#endif
diff --git a/util/open_cdev.c b/util/open_cdev.c
new file mode 100644
index 0000000..e7e1b35
--- /dev/null
+++ b/util/open_cdev.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2019, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  - Redistributions of source code must retain the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/timerfd.h>
+#include <sys/inotify.h>
+#include <sys/sysmacros.h>
+#include <poll.h>
+
+#include <util/util.h>
+
+#include <config.h>
+
+static int open_cdev_internal(const char *path, dev_t cdev)
+{
+	struct stat st;
+	int fd;
+
+	fd = open(path, O_RDWR | O_CLOEXEC);
+	if (fd == -1)
+		return -1;
+	if (fstat(fd, &st) || !S_ISCHR(st.st_mode) ||
+	    (cdev != 0 && st.st_rdev != cdev)) {
+		close(fd);
+		return -1;
+	}
+	return fd;
+}
+
+/*
+ * In case the cdev is not exactly where it should be, use this more
+ * elaborate approach to find it. This is designed to resolve a race with
+ * module autoloading where udev is concurrently creating the cdev as we are
+ * looking for it. udev has 5 seconds to create the link or we fail.
+ *
+ * Modern userspace and kernels create the /dev/infiniband/X synchronously via
+ * devtmpfs before returning from the netlink query, so they should never use
+ * this path.
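+ *
+ * The flow below: watch /dev/char/ with inotify, arm a 5 second timerfd,
+ * and retry open_cdev_internal() after every inotify event until the open
+ * succeeds or the timer fires.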
+ */ +static int open_cdev_robust(const char *devname_hint, dev_t cdev) +{ + struct itimerspec ts = { .it_value = { .tv_sec = 5 } }; + struct inotify_event buf[16]; + struct pollfd fds[2]; + char *devpath; + int res = -1; + int ifd; + int tfd; + + /* + * This assumes that udev is being used and is creating the /dev/char/ + * symlinks. + */ + if (asprintf(&devpath, "/dev/char/%u:%u", major(cdev), minor(cdev)) < 0) + return -1; + + /* Use inotify to speed up the resolution time. */ + ifd = inotify_init1(IN_CLOEXEC | IN_NONBLOCK); + if (ifd == -1) + goto err_mem; + if (inotify_add_watch(ifd, "/dev/char/", IN_CREATE) == -1) + goto err_inotify; + + /* Timerfd is simpler than working with relative time outs */ + tfd = timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC); + if (tfd == -1) + goto err_inotify; + if (timerfd_settime(tfd, 0, &ts, NULL) == -1) + goto out_timer; + + res = open_cdev_internal(devpath, cdev); + if (res != -1) + goto out_timer; + + fds[0].fd = ifd; + fds[0].events = POLLIN; + fds[1].fd = tfd; + fds[1].events = POLLIN; + while (poll(fds, 2, -1) > 0) { + res = open_cdev_internal(devpath, cdev); + if (res != -1) + goto out_timer; + + if (fds[0].revents) { + if (read(ifd, buf, sizeof(buf)) == -1) + goto out_timer; + } + if (fds[1].revents) + goto out_timer; + } + +out_timer: + close(tfd); +err_inotify: + close(ifd); +err_mem: + free(devpath); + return res; +} + +int open_cdev(const char *devname_hint, dev_t cdev) +{ + char *devpath; + int fd; + + if (asprintf(&devpath, RDMA_CDEV_DIR "/%s", devname_hint) < 0) + return -1; + fd = open_cdev_internal(devpath, cdev); + free(devpath); + if (fd == -1 && cdev != 0) + return open_cdev_robust(devname_hint, cdev); + return fd; +} diff --git a/util/rdma_nl.c b/util/rdma_nl.c new file mode 100644 index 0000000..065eff3 --- /dev/null +++ b/util/rdma_nl.c @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2019, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <util/rdma_nl.h> + +#include <stdbool.h> +#include <sys/sysmacros.h> + +struct nla_policy rdmanl_policy[RDMA_NLDEV_ATTR_MAX] = { + [RDMA_NLDEV_ATTR_CHARDEV] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_CHARDEV_ABI] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_NODE_GUID] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID] = { .type = NLA_U32 }, +#ifdef NLA_NUL_STRING + [RDMA_NLDEV_ATTR_CHARDEV_NAME] = { .type = NLA_NUL_STRING }, + [RDMA_NLDEV_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, + [RDMA_NLDEV_ATTR_DEV_PROTOCOL] = { .type = NLA_NUL_STRING }, + [RDMA_NLDEV_ATTR_FW_VERSION] = { .type = NLA_NUL_STRING }, +#endif /* NLA_NUL_STRING */ +}; + +static int rdmanl_saw_err_cb(struct sockaddr_nl *nla, struct nlmsgerr *nlerr, + void *arg) +{ + bool *failed = arg; + + *failed = true; + return 0; +} + +struct nl_sock *rdmanl_socket_alloc(void) +{ + struct nl_sock *nl; + + nl = nl_socket_alloc(); + if (!nl) + return NULL; + nl_socket_disable_auto_ack(nl); + nl_socket_disable_msg_peek(nl); + + if (nl_connect(nl, NETLINK_RDMA)) { + nl_socket_free(nl); + return NULL; + } + return nl; +} + +int rdmanl_get_devices(struct nl_sock *nl, nl_recvmsg_msg_cb_t cb_func, + void *data) +{ + bool failed = false; + int ret; + + if (nl_send_simple(nl, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), + NLM_F_DUMP, NULL, 0) < 0) + return -1; + + if (nl_socket_modify_err_cb(nl, NL_CB_CUSTOM, rdmanl_saw_err_cb, + &failed)) + return -1; + if (nl_socket_modify_cb(nl, NL_CB_VALID, NL_CB_CUSTOM, cb_func, data)) + return -1; + do { + ret = nl_recvmsgs_default(nl); + } while (ret > 0); + nl_socket_modify_err_cb(nl, NL_CB_CUSTOM, NULL, NULL); + + if (ret || failed) + return -1; + return 0; +} + +int rdmanl_get_chardev(struct nl_sock *nl, int ibidx, const char *name, + nl_recvmsg_msg_cb_t cb_func, void *data) + +{ + bool failed = false; + struct nl_msg *msg; + int ret; + + msg = nlmsg_alloc_simple( + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET_CHARDEV), 0); + if (!msg) + return -1; + if (ibidx != -1) + NLA_PUT_U32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, ibidx); + NLA_PUT_STRING(msg, RDMA_NLDEV_ATTR_CHARDEV_TYPE, name); + ret = nl_send_auto(nl, msg); + nlmsg_free(msg); + if (ret < 0) + return -1; + + if (nl_socket_modify_err_cb(nl, NL_CB_CUSTOM, rdmanl_saw_err_cb, + &failed)) + return -1; + if (nl_socket_modify_cb(nl, NL_CB_VALID, NL_CB_CUSTOM, cb_func, data)) + return -1; + do { + ret = nl_recvmsgs_default(nl); + } while (ret > 0); + nl_socket_modify_err_cb(nl, NL_CB_CUSTOM, NULL, NULL); + + if (ret || failed) + return -1; + return 0; + +nla_put_failure: + nlmsg_free(msg); + return -1; +} diff --git a/util/rdma_nl.h b/util/rdma_nl.h new file mode 100644 index 0000000..2af4641 --- /dev/null +++ b/util/rdma_nl.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2019, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ *
+ *  - Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef UTIL_RDMA_NL_H
+#define UTIL_RDMA_NL_H
+
+#include <rdma/rdma_netlink.h>
+#include <netlink/netlink.h>
+#include <netlink/msg.h>
+#include <netlink/attr.h>
+
+extern struct nla_policy rdmanl_policy[RDMA_NLDEV_ATTR_MAX];
+struct nl_sock *rdmanl_socket_alloc(void);
+int rdmanl_get_devices(struct nl_sock *nl, nl_recvmsg_msg_cb_t cb_func,
+		       void *data);
+int rdmanl_get_chardev(struct nl_sock *nl, int ibidx, const char *name,
+		       nl_recvmsg_msg_cb_t cb_func, void *data);
+
+#endif
diff --git a/util/symver.h b/util/symver.h
new file mode 100644
index 0000000..ae41305
--- /dev/null
+++ b/util/symver.h
@@ -0,0 +1,102 @@
+/* GPLv2 or OpenIB.org BSD (MIT) See COPYING file
+
+   These definitions help using the ELF symbol version feature, and must be
+   used in conjunction with the library's map file.
+ */
+
+#ifndef __UTIL_SYMVER_H
+#define __UTIL_SYMVER_H
+
+#include <config.h>
+#include <ccan/str.h>
+
+/*
+  These macros should only be used if the library is defining compatibility
+  symbols, eg:
+
+    213: 000000000000a650   315 FUNC   GLOBAL DEFAULT   13 ibv_get_device_list@IBVERBS_1.0
+    214: 000000000000b020   304 FUNC   GLOBAL DEFAULT   13 ibv_get_device_list@@IBVERBS_1.1
+
+  Symbols which have only a single implementation should use a normal extern
+  function and be placed in the correct stanza in the linker map file.
+
+  Follow this pattern to use this feature:
+   public.h:
+     struct ibv_device **ibv_get_device_list(int *num_devices);
+   foo.c:
+     // Implement the latest version
+     LATEST_SYMVER_FUNC(ibv_get_device_list, 1_1, "IBVERBS_1.1",
+			struct ibv_device **,
+			int *num_devices)
+     {
+       ...
+     }
+
+     // Implement the compat version
+     COMPAT_SYMVER_FUNC(ibv_get_device_list, 1_0, "IBVERBS_1.0",
+			struct ibv_device_1_0 **,
+			int *num_devices)
+     {
+       ...
+     }
+
+  As well as matching information in the map file.
+
+  These macros deal with the various kinds of ugliness in gcc surrounding
+  symbol versions:
+
+  - The internal name __public_1_x is synthesized by the macro
+  - A prototype for the internal name is created by the macro
+  - If statically linking the latest symbol expands into a normal function
+    definition
+  - If statically linking the compat symbols expand into unused static
+    functions that are discarded by the compiler.
+  - The prototype of the latest symbol is checked against the public
+    prototype (only when compiling statically)
+
+  The extra prototypes are included only to avoid -Wmissing-prototypes
+  warnings. See also Documentation/versioning.md
+*/
+
+#define _MAKE_SYMVER(_local_sym, _public_sym, _ver_str) \
+	asm(".symver " #_local_sym "," #_public_sym "@" _ver_str)
+#define _MAKE_SYMVER_FUNC(_public_sym, _uniq, _ver_str, _ret, ...)
\
+	_ret __##_public_sym##_##_uniq(__VA_ARGS__); \
+	_MAKE_SYMVER(__##_public_sym##_##_uniq, _public_sym, _ver_str); \
+	_ret __##_public_sym##_##_uniq(__VA_ARGS__)
+
+#if defined(HAVE_FULL_SYMBOL_VERSIONS) && !defined(_STATIC_LIBRARY_BUILD_)
+
+  // Produce all symbol versions for dynamic linking
+
+# define COMPAT_SYMVER_FUNC(_public_sym, _uniq, _ver_str, _ret, ...) \
+	_MAKE_SYMVER_FUNC(_public_sym, _uniq, _ver_str, _ret, __VA_ARGS__)
+# define LATEST_SYMVER_FUNC(_public_sym, _uniq, _ver_str, _ret, ...) \
+	_MAKE_SYMVER_FUNC(_public_sym, _uniq, "@" _ver_str, _ret, __VA_ARGS__)
+
+#elif defined(HAVE_LIMITED_SYMBOL_VERSIONS) && !defined(_STATIC_LIBRARY_BUILD_)
+
+  /* Produce only an implementation for the latest symbol and tag it with the
+   * correct symbol version. This supports dynamic linkers that do not
+   * understand symbol versions
+   */
+# define COMPAT_SYMVER_FUNC(_public_sym, _uniq, _ver_str, _ret, ...) \
+	static inline _ret __##_public_sym##_##_uniq(__VA_ARGS__)
+# define LATEST_SYMVER_FUNC(_public_sym, _uniq, _ver_str, _ret, ...) \
+	_MAKE_SYMVER_FUNC(_public_sym, _uniq, "@" _ver_str, _ret, __VA_ARGS__)
+
+#else
+
+  // Static linking, or linker does not support symbol versions
+#define COMPAT_SYMVER_FUNC(_public_sym, _uniq, _ver_str, _ret, ...) \
+	static inline __attribute__((unused)) \
+	_ret __##_public_sym##_##_uniq(__VA_ARGS__)
+#define LATEST_SYMVER_FUNC(_public_sym, _uniq, _ver_str, _ret, ...) \
+	static __attribute__((unused)) \
+	_ret __##_public_sym##_##_uniq(__VA_ARGS__) \
+	__attribute__((alias(stringify(_public_sym)))); \
+	extern _ret _public_sym(__VA_ARGS__)
+
+#endif
+
+#endif
diff --git a/util/udma_barrier.h b/util/udma_barrier.h
new file mode 100644
index 0000000..23acf23
--- /dev/null
+++ b/util/udma_barrier.h
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  - Redistributions of source code must retain the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __UTIL_UDMA_BARRIER_H
+#define __UTIL_UDMA_BARRIER_H
+
+#include <pthread.h>
+
+/* Barriers for DMA.
+
+   These barriers are explicitly only for use with user DMA operations. If you
+   are looking for barriers to use with cache-coherent multi-threaded
+   consistency then look in stdatomic.h.
If you need both kinds of synchronicity + for the same address then use an atomic operation followed by one + of these barriers. + + When reasoning about these barriers there are two objects: + - CPU attached address space (the CPU memory could be a range of things: + cached/uncached/non-temporal CPU DRAM, uncached MMIO space in another + device, pMEM). Generally speaking the ordering is only relative + to the local CPU's view of the system. Eg if the local CPU + is not guaranteed to see a write from another CPU then it is also + OK for the DMA device to also not see the write after the barrier. + - A DMA initiator on a bus. For instance a PCI-E device issuing + MemRd/MemWr TLPs. + + The ordering guarantee is always stated between those two streams. Eg what + happens if a MemRd TLP is sent in via PCI-E relative to a CPU WRITE to the + same memory location. + + The providers have a very regular and predictable use of these barriers, + to make things very clear each narrow use is given a name and the proper + name should be used in the provider as a form of documentation. +*/ + +/* Ensure that the device's view of memory matches the CPU's view of memory. + This should be placed before any MMIO store that could trigger the device + to begin doing DMA, such as a device doorbell ring. + + eg + *dma_buf = 1; + udma_to_device_barrier(); + mmio_write(DO_DMA_REG, dma_buf); + Must ensure that the device sees the '1'. + + This is required to fence writes created by the libibverbs user. Those + writes could be to any CPU mapped memory object with any cachability mode. + + NOTE: x86 has historically used a weaker semantic for this barrier, and + only fenced normal stores to normal memory. libibverbs users using other + memory types or non-temporal stores are required to use SFENCE in their own + code prior to calling verbs to start a DMA. +*/ +#if defined(__i386__) +#define udma_to_device_barrier() asm volatile("" ::: "memory") +#elif defined(__x86_64__) +#define udma_to_device_barrier() asm volatile("" ::: "memory") +#elif defined(__PPC64__) +#define udma_to_device_barrier() asm volatile("sync" ::: "memory") +#elif defined(__PPC__) +#define udma_to_device_barrier() asm volatile("sync" ::: "memory") +#elif defined(__ia64__) +#define udma_to_device_barrier() asm volatile("mf" ::: "memory") +#elif defined(__sparc_v9__) +#define udma_to_device_barrier() asm volatile("membar #StoreStore" ::: "memory") +#elif defined(__aarch64__) +#define udma_to_device_barrier() asm volatile("dsb st" ::: "memory"); +#elif defined(__sparc__) || defined(__s390x__) +#define udma_to_device_barrier() asm volatile("" ::: "memory") +#else +#error No architecture specific memory barrier defines found! +#endif + +/* Ensure that all ordered stores from the device are observable from the + CPU. This only makes sense after something that observes an ordered store + from the device - eg by reading a MMIO register or seeing that CPU memory is + updated. + + This guarantees that all reads that follow the barrier see the ordered + stores that preceded the observation. + + For instance, this would be used after testing a valid bit in a memory + that is a DMA target, to ensure that the following reads see the + data written before the MemWr TLP that set the valid bit. 
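+
+   A sketch of that pattern ('valid' and 'data' stand in for fields of a
+   hypothetical DMA-written completion structure):
+
+     while (!wc->valid)
+       ;
+     udma_from_device_barrier();
+     process(wc->data);
+
+   Without the barrier the read of 'data' may be satisfied before 'valid'
+   was observed as set.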
+*/
+#if defined(__i386__)
+#define udma_from_device_barrier() asm volatile("lock; addl $0,0(%%esp) " ::: "memory")
+#elif defined(__x86_64__)
+#define udma_from_device_barrier() asm volatile("lfence" ::: "memory")
+#elif defined(__PPC64__)
+#define udma_from_device_barrier() asm volatile("lwsync" ::: "memory")
+#elif defined(__PPC__)
+#define udma_from_device_barrier() asm volatile("sync" ::: "memory")
+#elif defined(__ia64__)
+#define udma_from_device_barrier() asm volatile("mf" ::: "memory")
+#elif defined(__sparc_v9__)
+#define udma_from_device_barrier() asm volatile("membar #LoadLoad" ::: "memory")
+#elif defined(__aarch64__)
+#define udma_from_device_barrier() asm volatile("dsb ld" ::: "memory");
+#elif defined(__sparc__) || defined(__s390x__)
+#define udma_from_device_barrier() asm volatile("" ::: "memory")
+#else
+#error No architecture specific memory barrier defines found!
+#endif
+
+/* Order writes to CPU memory so that a DMA device cannot view writes after
+   the barrier without also seeing all writes before the barrier. This does
+   not guarantee any writes are visible to DMA.
+
+   This would be used in cases where a DMA buffer might have a valid bit and
+   data; the barrier is placed after writing the data but before writing the
+   valid bit to ensure the DMA device cannot observe a set valid bit with
+   unwritten data.
+
+   Compared to udma_to_device_barrier() this barrier is not required to fence
+   anything but normal stores to normal malloc memory. Usage should be:
+
+   write_wqe
+      udma_to_device_barrier();    // Get user memory ready for DMA
+      wqe->addr = ...;
+      wqe->flags = ...;
+      udma_ordering_write_barrier();  // Guarantee WQE written in order
+      wqe->valid = 1;
+*/
+#define udma_ordering_write_barrier() udma_to_device_barrier()
+
+/* Promptly flush writes to MMIO Write Combining memory.
+   This should be used after a write to WC memory. This is both a barrier
+   and a hint to the CPU to flush any buffers to reduce latency to TLP
+   generation.
+
+   This is not required to have any effect on CPU memory.
+
+   If done while holding a lock then the ordering of MMIO writes across CPUs
+   must be guaranteed to follow the natural ordering implied by the lock.
+
+   This must also act as a barrier that prevents write combining, eg
+     *wc_mem = 1;
+     mmio_flush_writes();
+     *wc_mem = 2;
+   Must always produce two MemWr TLPs, '1' and '2'. Without the barrier
+   the CPU is allowed to produce a single TLP '2'.
+
+   Note that there is no order guarantee for writes to WC memory without
+   barriers.
+
+   This is intended to be used in conjunction with WC memory to generate large
+   PCI-E MemWr TLPs from the CPU.
+*/
+#if defined(__i386__)
+#define mmio_flush_writes() asm volatile("lock; addl $0,0(%%esp) " ::: "memory")
+#elif defined(__x86_64__)
+#define mmio_flush_writes() asm volatile("sfence" ::: "memory")
+#elif defined(__PPC64__)
+#define mmio_flush_writes() asm volatile("sync" ::: "memory")
+#elif defined(__PPC__)
+#define mmio_flush_writes() asm volatile("sync" ::: "memory")
+#elif defined(__ia64__)
+#define mmio_flush_writes() asm volatile("fwb" ::: "memory")
+#elif defined(__sparc_v9__)
+#define mmio_flush_writes() asm volatile("membar #StoreStore" ::: "memory")
+#elif defined(__aarch64__)
+#define mmio_flush_writes() asm volatile("dsb st" ::: "memory");
+#elif defined(__sparc__) || defined(__s390x__)
+#define mmio_flush_writes() asm volatile("" ::: "memory")
+#else
+#error No architecture specific memory barrier defines found!
+#endif
+
+/* Prevent WC writes from being re-ordered relative to other MMIO
+   writes. This should be used before a write to WC memory.
+
+   This must act as a barrier to prevent write re-ordering from different
+   memory types:
+     *mmio_mem = 1;
+     mmio_flush_writes();
+     *wc_mem = 2;
+   Must always produce a TLP '1' followed by '2'.
+
+   This barrier implies udma_to_device_barrier()
+
+   This is intended to be used in conjunction with WC memory to generate large
+   PCI-E MemWr TLPs from the CPU.
+*/
+#define mmio_wc_start() mmio_flush_writes()
+
+/* Keep MMIO writes in order.
+   Currently we lack writel macros that universally guarantee MMIO
+   writes happen in order, like the kernel does. Even worse many
+   providers haphazardly open code writes to MMIO memory omitting even
+   volatile.
+
+   Until this can be fixed with a proper writel macro, this barrier
+   is a stand in to indicate places where MMIO writes should be switched
+   to some future writel.
+*/
+#define mmio_ordered_writes_hack() mmio_flush_writes()
+
+/* Write Combining Spinlock primitive
+
+   Any access to a multi-value WC region must ensure that multiple cpus do
+   not write to the same values concurrently; these macros make that
+   straightforward and efficient if the chosen exclusion is a spinlock.
+
+   The spinlock guarantees that the WC writes issued within the critical
+   section are made visible as TLP to the device. The TLP must be seen by the
+   device strictly in the order that the spinlocks are acquired, and combining
+   WC writes between different sections is not permitted.
+
+   Use of these macros allows the fencing inside the spinlock to be combined
+   with the fencing required for DMA.
+ */
+static inline void mmio_wc_spinlock(pthread_spinlock_t *lock)
+{
+	pthread_spin_lock(lock);
+#if !defined(__i386__) && !defined(__x86_64__)
+	/* For x86 the serialization within the spin lock is enough to
+	 * strongly order WC and other memory types. */
+	mmio_wc_start();
+#endif
+}
+
+static inline void mmio_wc_spinunlock(pthread_spinlock_t *lock)
+{
+	/* It is possible that on x86 the atomic in the lock is strong enough
+	 * to force-flush the WC buffers quickly, and this SFENCE can be
+	 * omitted too. */
+	mmio_flush_writes();
+	pthread_spin_unlock(lock);
+}
+
+#endif
diff --git a/util/util.c b/util/util.c
new file mode 100644
index 0000000..8c5f8f1
--- /dev/null
+++ b/util/util.c
@@ -0,0 +1,22 @@
+/* GPLv2 or OpenIB.org BSD (MIT) See COPYING file */
+#include <util/util.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+int set_fd_nonblock(int fd, bool nonblock)
+{
+	int val;
+
+	val = fcntl(fd, F_GETFL);
+	if (val == -1)
+		return -1;
+
+	if (nonblock)
+		val |= O_NONBLOCK;
+	else
+		val &= ~(unsigned int)(O_NONBLOCK);
+
+	if (fcntl(fd, F_SETFL, val) == -1)
+		return -1;
+	return 0;
+}
diff --git a/util/util.h b/util/util.h
new file mode 100644
index 0000000..514302b
--- /dev/null
+++ b/util/util.h
@@ -0,0 +1,45 @@
+/* GPLv2 or OpenIB.org BSD (MIT) See COPYING file */
+#ifndef UTIL_UTIL_H
+#define UTIL_UTIL_H
+
+#include <ccan/ilog.h>
+#include <stdbool.h>
+#include <sys/types.h>
+#include <stdio.h>
+
+/* Return true if the snprintf succeeded, false if there was truncation or
+ * error */
+static inline bool __good_snprintf(size_t len, int rc)
+{
+	return (rc < len && rc >= 0);
+}
+
+#define check_snprintf(buf, len, fmt, ...) \
+	__good_snprintf(len, snprintf(buf, len, fmt, ##__VA_ARGS__))
+
+/* a CMP b. See also the BSD macro timercmp(). */
+#define ts_cmp(a, b, CMP) \
+	(((a)->tv_sec == (b)->tv_sec) ?
\ + ((a)->tv_nsec CMP (b)->tv_nsec) : \ + ((a)->tv_sec CMP (b)->tv_sec)) + +static inline unsigned long align(unsigned long val, unsigned long align) +{ + return (val + align - 1) & ~(align - 1); +} + +static inline uint64_t roundup_pow_of_two(uint64_t n) +{ + return n == 1 ? 1 : 1ULL << ilog64(n - 1); +} + +static inline unsigned long DIV_ROUND_UP(unsigned long n, unsigned long d) +{ + return (n + d - 1) / d; +} + +int set_fd_nonblock(int fd, bool nonblock); + +int open_cdev(const char *devname_hint, dev_t cdev); + +#endif